Diffstat (limited to 'third_party/eigen3/unsupported/Eigen/CXX11')
83 files changed, 34023 insertions, 0 deletions
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/Core b/third_party/eigen3/unsupported/Eigen/CXX11/Core new file mode 100644 index 0000000000..1b3690716c --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/Core @@ -0,0 +1,46 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler <christian@iwakd.de> +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_CORE_MODULE +#define EIGEN_CXX11_CORE_MODULE + +#include <Eigen/Core> + +#include <Eigen/src/Core/util/DisableStupidWarnings.h> + +/** \defgroup CXX11_Core_Module C++11 Core Module + * + * This module provides common core features for all modules that + * explicitly depend on C++11. Currently, this is only the Tensor + * module. Note that at this stage, you should not need to include + * this module directly. + * + * It also provides a limited fallback for compilers that don't support + * CXX11 yet, such as nvcc. + * + * \code + * #include <Eigen/CXX11/Core> + * \endcode + */ + +// Only a subset of cxx11 is allowed at Google, so we default to emulate the +// cxx11 functionality that we need. +#include "src/Core/util/FixedSizeVector.h" +#if 1 +#include <vector> +#include "src/Core/util/EmulateCXX11Meta.h" +#else +#include "src/Core/util/CXX11Workarounds.h" +#include "src/Core/util/CXX11Meta.h" +#endif +#include <Eigen/src/Core/util/ReenableStupidWarnings.h> + +#endif // EIGEN_CXX11_CORE_MODULE + diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/FixedPoint b/third_party/eigen3/unsupported/Eigen/CXX11/FixedPoint new file mode 100644 index 0000000000..35b55de46d --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/FixedPoint @@ -0,0 +1,51 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_FIXED_POINT_MODULE +#define EIGEN_CXX11_FIXED_POINT_MODULE + +#include <Eigen/Core> +#include <stdint.h> + +/** \defgroup CXX11_FixedPoint_Module Fixed Point Module + * + * This module provides common core features for all modules that + * explicitly depend on C++11. Currently, this is only the Tensor + * module. Note that at this stage, you should not need to include + * this module directly. + * + * It also provides a limited fallback for compilers that don't support + * CXX11 yet, such as nvcc. 
+ * + * \code + * #include <Eigen/CXX11/FixedPoint> + * \endcode + */ + +#include "src/FixedPoint/FixedPointTypes.h" + +// Use optimized implementations whenever available +#ifdef EIGEN_VECTORIZE_AVX2 +#define EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT +#include "src/Tensor/TensorContractionThreadPool.h" +#include "src/FixedPoint/PacketMathAVX2.h" +#include "src/FixedPoint/MatMatProductAVX2.h" +#include "src/FixedPoint/TypeCastingAVX2.h" + +#elif defined EIGEN_VECTORIZE_NEON +#define EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT +#include "src/FixedPoint/MatMatProductNEON.h" +#endif + +// Use the default implementation when no optimized code is available +#include "src/FixedPoint/MatMatProduct.h" +#include "src/FixedPoint/MatVecProduct.h" + + +#endif // EIGEN_CXX11_FIXED_POINT_MODULE diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks b/third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks new file mode 100644 index 0000000000..7741b68d8a --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks @@ -0,0 +1,35 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_NEURAL_NETWORKS_MODULE +#define EIGEN_CXX11_NEURAL_NETWORKS_MODULE + +#include "unsupported/Eigen/CXX11/Tensor" + +/** \defgroup CXX11_NeuralNetworks_Module Neural Networks Module + * + * This module provides an efficient implementation of the common primitives + * used by neural networks. + * The primitives are built on top of the tensor library. + * + * \code + * #include <Eigen/CXX11/NeuralNetworks> + * \endcode + */ + +#include "unsupported/Eigen/CXX11/src/NeuralNetworks/Activations.h" +#include "unsupported/Eigen/CXX11/src/NeuralNetworks/Attention.h" +#include "unsupported/Eigen/CXX11/src/NeuralNetworks/Pooling.h" +#include "unsupported/Eigen/CXX11/src/NeuralNetworks/SoftMax.h" +#include "unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardCuboidConvolutions.h" +#include "unsupported/Eigen/CXX11/src/NeuralNetworks/CuboidConvolution.h" +#include "unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardSpatialConvolutions.h" +#include "unsupported/Eigen/CXX11/src/NeuralNetworks/SpatialConvolutions.h" + +#endif // EIGEN_CXX11_NEURAL_NETWORKS_MODULE diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/Tensor b/third_party/eigen3/unsupported/Eigen/CXX11/Tensor new file mode 100644 index 0000000000..3904c72eef --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/Tensor @@ -0,0 +1,145 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler <christian@iwakd.de> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_MODULE +#define EIGEN_CXX11_TENSOR_MODULE + +#include "Eigen/src/Core/util/StaticAssert.h" +#include "unsupported/Eigen/CXX11/Core" + +#include "Eigen/src/Core/util/DisableStupidWarnings.h" + +/** \defgroup CXX11_Tensor_Module Tensor Module + * + * This module provides a Tensor class for storing arbitrarily indexed + * objects. 
+ * + * \code + * #include <Eigen/CXX11/Tensor> + * \endcode + */ + +#include <cstddef> +#include <cstring> +#include <stdint.h> + +#if __cplusplus > 199711 +#include <random> +#endif + +#ifdef EIGEN_USE_THREADS +#if defined(EIGEN_USE_CUSTOM_THREAD_POOL) +// Use the Eigen implementation of the ThreadPool class. We only need to +// include a few multithreading headers +#include <condition_variable> +#include <deque> +#include <mutex> +#include <thread> +#else +#include "tensorflow/core/platform/port.h" +#endif // EIGEN_USE_CUSTOM_THREAD_POOL + +#include <functional> + +#endif // EIGEN_USE_THREADS + +#ifdef EIGEN_USE_GPU +#include "tensorflow/core/platform/port.h" +#if !defined(__GCUDACC__) && !defined(__GCUDACC_HOST__) +#include <cuda.h> +#include <cufft.h> +#include <cuda_runtime.h> +#ifdef __CUDACC__ +#include <curand_kernel.h> +#endif // defined(__CUDACC__) +#else +#include "perftools/gputools/executor/gcuda.h" +#ifdef __CUDACC__ +#include "third_party/gpus/cuda/curand_device/curand_kernel.h" +#endif // defined(__CUDACC__) +#endif // __GCUDACC__ +#endif // EIGEN_USE_GPU + +#ifdef _WIN32 +#include <winbase.h> +#elif defined(__APPLE__) +#include <mach/mach_time.h> +#else +#include <time.h> +#endif + +#include "Eigen/Core" + +// Beware: the order of the include matters to some compilers. For example +// TensorIndexList.h should be included before TensorDimensions.h in order to +// use index lists to encode tensor dimensions when compiling with llvm. +#include "unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h" + +#include "unsupported/Eigen/CXX11/src/Tensor/TensorBase.h" + +#include "unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h" + +#include "unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionMappers.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" +#include "unsupported/Eigen/CXX11/src/NeuralNetworks/TensorConvolutionByFFT.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h" +#include 
"unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorTrueIndices.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h" + +#include "unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h" + +#include "unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h" +#include "unsupported/Eigen/CXX11/src/Tensor/Tensor.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorVarDim.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorMap.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorRef.h" + +#include "unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h" + +#include "unsupported/Eigen/CXX11/src/Tensor/TensorIO.h" + +#include "Eigen/src/Core/util/ReenableStupidWarnings.h" + +#endif // EIGEN_CXX11_TENSOR_MODULE diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/TensorSymmetry b/third_party/eigen3/unsupported/Eigen/CXX11/TensorSymmetry new file mode 100644 index 0000000000..027c6087f9 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/TensorSymmetry @@ -0,0 +1,40 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler <christian@iwakd.de> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSORSYMMETRY_MODULE +#define EIGEN_CXX11_TENSORSYMMETRY_MODULE + +#include <Eigen/CXX11/Tensor> + +#include <Eigen/src/Core/util/DisableStupidWarnings.h> + +/** \defgroup CXX11_TensorSymmetry_Module Tensor Symmetry Module + * + * This module provides a classes that allow for the definition of + * symmetries w.r.t. tensor indices. + * + * Including this module will implicitly include the Tensor module. + * + * \code + * #include <Eigen/TensorSymmetry> + * \endcode + */ + +#include "src/TensorSymmetry/util/TemplateGroupTheory.h" +#include "src/TensorSymmetry/Symmetry.h" +#include "src/TensorSymmetry/StaticSymmetry.h" +#include "src/TensorSymmetry/DynamicSymmetry.h" + +#include <Eigen/src/Core/util/ReenableStupidWarnings.h> + +#endif // EIGEN_CXX11_TENSORSYMMETRY_MODULE + +/* + * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; + */ diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h new file mode 100644 index 0000000000..ad6a9dda10 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h @@ -0,0 +1,508 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2013 Christian Seiler <christian@iwakd.de> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11META_H +#define EIGEN_CXX11META_H + +namespace Eigen { + +namespace internal { + +/** \internal + * \file CXX11/Core/util/CXX11Meta.h + * This file contains generic metaprogramming classes which are not specifically related to Eigen. + * This file expands upon Core/util/Meta.h and adds support for C++11 specific features. + */ + +template<typename... tt> +struct type_list { constexpr static int count = sizeof...(tt); }; + +template<typename t, typename... tt> +struct type_list<t, tt...> { constexpr static int count = sizeof...(tt) + 1; typedef t first_type; }; + +template<typename T, T... nn> +struct numeric_list { constexpr static std::size_t count = sizeof...(nn); }; + +template<typename T, T n, T... nn> +struct numeric_list<T, n, nn...> { constexpr static std::size_t count = sizeof...(nn) + 1; constexpr static T first_value = n; }; + +/* numeric list constructors + * + * equivalencies: + * constructor result + * typename gen_numeric_list<int, 5>::type numeric_list<int, 0,1,2,3,4> + * typename gen_numeric_list_reversed<int, 5>::type numeric_list<int, 4,3,2,1,0> + * typename gen_numeric_list_swapped_pair<int, 5,1,2>::type numeric_list<int, 0,2,1,3,4> + * typename gen_numeric_list_repeated<int, 0, 5>::type numeric_list<int, 0,0,0,0,0> + */ + +template<typename T, std::size_t n, T... ii> struct gen_numeric_list : gen_numeric_list<T, n-1, n-1, ii...> {}; +template<typename T, T... ii> struct gen_numeric_list<T, 0, ii...> { typedef numeric_list<T, ii...> type; }; + +template<typename T, std::size_t n, T... ii> struct gen_numeric_list_reversed : gen_numeric_list_reversed<T, n-1, ii..., n-1> {}; +template<typename T, T... ii> struct gen_numeric_list_reversed<T, 0, ii...> { typedef numeric_list<T, ii...> type; }; + +template<typename T, std::size_t n, T a, T b, T... ii> struct gen_numeric_list_swapped_pair : gen_numeric_list_swapped_pair<T, n-1, a, b, (n-1) == a ? b : ((n-1) == b ? a : (n-1)), ii...> {}; +template<typename T, T a, T b, T... ii> struct gen_numeric_list_swapped_pair<T, 0, a, b, ii...> { typedef numeric_list<T, ii...> type; }; + +template<typename T, std::size_t n, T V, T... nn> struct gen_numeric_list_repeated : gen_numeric_list_repeated<T, n-1, V, V, nn...> {}; +template<typename T, T V, T... nn> struct gen_numeric_list_repeated<T, 0, V, nn...> { typedef numeric_list<T, nn...> type; }; + +/* list manipulation: concatenate */ + +template<class a, class b> struct concat; + +template<typename... as, typename... bs> struct concat<type_list<as...>, type_list<bs...>> { typedef type_list<as..., bs...> type; }; +template<typename T, T... as, T... bs> struct concat<numeric_list<T, as...>, numeric_list<T, bs...> > { typedef numeric_list<T, as..., bs...> type; }; + +template<typename... p> struct mconcat; +template<typename a> struct mconcat<a> { typedef a type; }; +template<typename a, typename b> struct mconcat<a, b> : concat<a, b> {}; +template<typename a, typename b, typename... cs> struct mconcat<a, b, cs...> : concat<a, typename mconcat<b, cs...>::type> {}; + +/* list manipulation: extract slices */ + +template<int n, typename x> struct take; +template<int n, typename a, typename... 
as> struct take<n, type_list<a, as...>> : concat<type_list<a>, typename take<n-1, type_list<as...>>::type> {}; +template<int n> struct take<n, type_list<>> { typedef type_list<> type; }; +template<typename a, typename... as> struct take<0, type_list<a, as...>> { typedef type_list<> type; }; +template<> struct take<0, type_list<>> { typedef type_list<> type; }; + +template<typename T, int n, T a, T... as> struct take<n, numeric_list<T, a, as...>> : concat<numeric_list<T, a>, typename take<n-1, numeric_list<T, as...>>::type> {}; +template<typename T, int n> struct take<n, numeric_list<T>> { typedef numeric_list<T> type; }; +template<typename T, T a, T... as> struct take<0, numeric_list<T, a, as...>> { typedef numeric_list<T> type; }; +template<typename T> struct take<0, numeric_list<T>> { typedef numeric_list<T> type; }; + +template<typename T, int n, T... ii> struct h_skip_helper_numeric; +template<typename T, int n, T i, T... ii> struct h_skip_helper_numeric<T, n, i, ii...> : h_skip_helper_numeric<T, n-1, ii...> {}; +template<typename T, T i, T... ii> struct h_skip_helper_numeric<T, 0, i, ii...> { typedef numeric_list<T, i, ii...> type; }; +template<typename T, int n> struct h_skip_helper_numeric<T, n> { typedef numeric_list<T> type; }; +template<typename T> struct h_skip_helper_numeric<T, 0> { typedef numeric_list<T> type; }; + +template<int n, typename... tt> struct h_skip_helper_type; +template<int n, typename t, typename... tt> struct h_skip_helper_type<n, t, tt...> : h_skip_helper_type<n-1, tt...> {}; +template<typename t, typename... tt> struct h_skip_helper_type<0, t, tt...> { typedef type_list<t, tt...> type; }; +template<int n> struct h_skip_helper_type<n> { typedef type_list<> type; }; +template<> struct h_skip_helper_type<0> { typedef type_list<> type; }; + +template<int n> +struct h_skip { + template<typename T, T... ii> + constexpr static inline typename h_skip_helper_numeric<T, n, ii...>::type helper(numeric_list<T, ii...>) { return typename h_skip_helper_numeric<T, n, ii...>::type(); } + template<typename... tt> + constexpr static inline typename h_skip_helper_type<n, tt...>::type helper(type_list<tt...>) { return typename h_skip_helper_type<n, tt...>::type(); } +}; + +template<int n, typename a> struct skip { typedef decltype(h_skip<n>::helper(a())) type; }; + +template<int start, int count, typename a> struct slice : take<count, typename skip<start, a>::type> {}; + +/* list manipulation: retrieve single element from list */ + +template<int n, typename x> struct get; + +template<int n, typename a, typename... as> struct get<n, type_list<a, as...>> : get<n-1, type_list<as...>> {}; +template<typename a, typename... as> struct get<0, type_list<a, as...>> { typedef a type; }; +template<int n EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, as)> struct get<n, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(as)>> { static_assert((n - n) < 0, "meta-template get: The element to extract from a list must be smaller than the size of the list."); }; + +template<typename T, int n, T a, T... as> struct get<n, numeric_list<T, a, as...>> : get<n-1, numeric_list<T, as...>> {}; +template<typename T, T a, T... 
as> struct get<0, numeric_list<T, a, as...>> { constexpr static T value = a; }; +template<typename T, int n EIGEN_TPL_PP_SPEC_HACK_DEFC(T, as)> struct get<n, numeric_list<T EIGEN_TPL_PP_SPEC_HACK_USEC(as)>> { static_assert((n - n) < 0, "meta-template get: The element to extract from a list must be smaller than the size of the list."); }; + +/* always get type, regardless of dummy; good for parameter pack expansion */ + +template<typename T, T dummy, typename t> struct id_numeric { typedef t type; }; +template<typename dummy, typename t> struct id_type { typedef t type; }; + +/* equality checking, flagged version */ + +template<typename a, typename b> struct is_same_gf : is_same<a, b> { constexpr static int global_flags = 0; }; + +/* apply_op to list */ + +template< + bool from_left, // false + template<typename, typename> class op, + typename additional_param, + typename... values +> +struct h_apply_op_helper { typedef type_list<typename op<values, additional_param>::type...> type; }; +template< + template<typename, typename> class op, + typename additional_param, + typename... values +> +struct h_apply_op_helper<true, op, additional_param, values...> { typedef type_list<typename op<additional_param, values>::type...> type; }; + +template< + bool from_left, + template<typename, typename> class op, + typename additional_param +> +struct h_apply_op +{ + template<typename... values> + constexpr static typename h_apply_op_helper<from_left, op, additional_param, values...>::type helper(type_list<values...>) + { return typename h_apply_op_helper<from_left, op, additional_param, values...>::type(); } +}; + +template< + template<typename, typename> class op, + typename additional_param, + typename a +> +struct apply_op_from_left { typedef decltype(h_apply_op<true, op, additional_param>::helper(a())) type; }; + +template< + template<typename, typename> class op, + typename additional_param, + typename a +> +struct apply_op_from_right { typedef decltype(h_apply_op<false, op, additional_param>::helper(a())) type; }; + +/* see if an element is in a list */ + +template< + template<typename, typename> class test, + typename check_against, + typename h_list, + bool last_check_positive = false +> +struct contained_in_list; + +template< + template<typename, typename> class test, + typename check_against, + typename h_list +> +struct contained_in_list<test, check_against, h_list, true> +{ + constexpr static bool value = true; +}; + +template< + template<typename, typename> class test, + typename check_against, + typename a, + typename... 
as +> +struct contained_in_list<test, check_against, type_list<a, as...>, false> : contained_in_list<test, check_against, type_list<as...>, test<check_against, a>::value> {}; + +template< + template<typename, typename> class test, + typename check_against + EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, empty) +> +struct contained_in_list<test, check_against, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>, false> { constexpr static bool value = false; }; + +/* see if an element is in a list and check for global flags */ + +template< + template<typename, typename> class test, + typename check_against, + typename h_list, + int default_flags = 0, + bool last_check_positive = false, + int last_check_flags = default_flags +> +struct contained_in_list_gf; + +template< + template<typename, typename> class test, + typename check_against, + typename h_list, + int default_flags, + int last_check_flags +> +struct contained_in_list_gf<test, check_against, h_list, default_flags, true, last_check_flags> +{ + constexpr static bool value = true; + constexpr static int global_flags = last_check_flags; +}; + +template< + template<typename, typename> class test, + typename check_against, + typename a, + typename... as, + int default_flags, + int last_check_flags +> +struct contained_in_list_gf<test, check_against, type_list<a, as...>, default_flags, false, last_check_flags> : contained_in_list_gf<test, check_against, type_list<as...>, default_flags, test<check_against, a>::value, test<check_against, a>::global_flags> {}; + +template< + template<typename, typename> class test, + typename check_against + EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, empty), + int default_flags, + int last_check_flags +> +struct contained_in_list_gf<test, check_against, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>, default_flags, false, last_check_flags> { constexpr static bool value = false; constexpr static int global_flags = default_flags; }; + +/* generic reductions */ + +template< + typename Reducer, + typename... Ts +> struct reduce; + +template< + typename Reducer, + typename A, + typename... Ts +> struct reduce<Reducer, A, Ts...> +{ + constexpr static inline A run(A a, Ts...) { return a; } +}; + +template< + typename Reducer, + typename A, + typename B, + typename... Ts +> struct reduce<Reducer, A, B, Ts...> +{ + constexpr static inline auto run(A a, B b, Ts... 
ts) -> decltype(Reducer::run(a, reduce<Reducer, B, Ts...>::run(b, ts...))) { + return Reducer::run(a, reduce<Reducer, B, Ts...>::run(b, ts...)); + } +}; + +/* generic binary operations */ + +struct sum_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a + b) { return a + b; } }; +struct product_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a * b) { return a * b; } }; + +struct logical_and_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a && b) { return a && b; } }; +struct logical_or_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a || b) { return a || b; } }; + +struct equal_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a == b) { return a == b; } }; +struct not_equal_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a != b) { return a != b; } }; +struct lesser_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a < b) { return a < b; } }; +struct lesser_equal_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a <= b) { return a <= b; } }; +struct greater_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a > b) { return a > b; } }; +struct greater_equal_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a >= b) { return a >= b; } }; + +/* generic unary operations */ + +struct not_op { template<typename A> constexpr static inline auto run(A a) -> decltype(!a) { return !a; } }; +struct negation_op { template<typename A> constexpr static inline auto run(A a) -> decltype(-a) { return -a; } }; +struct greater_equal_zero_op { template<typename A> constexpr static inline auto run(A a) -> decltype(a >= 0) { return a >= 0; } }; + + +/* reductions for lists */ + +// using auto -> return value spec makes ICC 13.0 and 13.1 crash here, so we have to hack it +// together in front... (13.0 doesn't work with array_prod/array_reduce/... anyway, but 13.1 +// does... +template<typename... Ts> +constexpr inline decltype(reduce<product_op, Ts...>::run((*((Ts*)0))...)) arg_prod(Ts... ts) +{ + return reduce<product_op, Ts...>::run(ts...); +} + +template<typename... Ts> +constexpr inline decltype(reduce<sum_op, Ts...>::run((*((Ts*)0))...)) arg_sum(Ts... ts) +{ + return reduce<sum_op, Ts...>::run(ts...); +} + +/* reverse arrays */ + +template<typename Array, int... 
n> +constexpr inline Array h_array_reverse(Array arr, numeric_list<int, n...>) +{ + return {{array_get<sizeof...(n) - n - 1>(arr)...}}; +} + +template<typename T, std::size_t N> +constexpr inline std::array<T, N> array_reverse(std::array<T, N> arr) +{ + return h_array_reverse(arr, typename gen_numeric_list<int, N>::type()); +} + +/* generic array reductions */ + +// can't reuse standard reduce() interface above because Intel's Compiler +// *really* doesn't like it, so we just reimplement the stuff +// (start from N - 1 and work down to 0 because specialization for +// n == N - 1 also doesn't work in Intel's compiler, so it goes into +// an infinite loop) +template<typename Reducer, typename T, std::size_t N, std::size_t n = N - 1> +struct h_array_reduce { + constexpr static inline auto run(std::array<T, N> arr, T identity) -> decltype(Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr), array_get<n>(arr))) + { + return Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr), array_get<n>(arr)); + } +}; + +template<typename Reducer, typename T, std::size_t N> +struct h_array_reduce<Reducer, T, N, 0> +{ + constexpr static inline T run(std::array<T, N> arr, T identity) + { + return array_get<0>(arr); + } +}; + +template<typename Reducer, typename T, std::size_t N> +struct h_array_reduce<Reducer, T, 0> +{ + constexpr static inline T run(std::array<T, 0> arr, T identity) + { + return identity; + } +}; + +template<typename Reducer, typename T, std::size_t N> +constexpr inline auto array_reduce(std::array<T, N> arr, T identity) -> decltype(h_array_reduce<Reducer, T, N>::run(arr)) +{ + return h_array_reduce<Reducer, T, N>::run(arr, identity); +} + +/* standard array reductions */ + +template<typename T, std::size_t N> +constexpr inline auto array_sum(std::array<T, N> arr) -> decltype(array_reduce<sum_op, T, N>(arr)) +{ + return array_reduce<sum_op, T, N>(arr, 0); +} + +template<typename T, std::size_t N> +constexpr inline auto array_prod(std::array<T, N> arr) -> decltype(array_reduce<product_op, T, N>(arr)) +{ + return array_reduce<product_op, T, N>(arr, 1); +} + +/* zip an array */ + +template<typename Op, typename A, typename B, std::size_t N, int... n> +constexpr inline std::array<decltype(Op::run(A(), B())),N> h_array_zip(std::array<A, N> a, std::array<B, N> b, numeric_list<int, n...>) +{ + return std::array<decltype(Op::run(A(), B())),N>{{ Op::run(array_get<n>(a), array_get<n>(b))... }}; +} + +template<typename Op, typename A, typename B, std::size_t N> +constexpr inline std::array<decltype(Op::run(A(), B())),N> array_zip(std::array<A, N> a, std::array<B, N> b) +{ + return h_array_zip<Op>(a, b, typename gen_numeric_list<int, N>::type()); +} + +/* zip an array and reduce the result */ + +template<typename Reducer, typename Op, typename A, typename B, std::size_t N, int... 
n> +constexpr inline auto h_array_zip_and_reduce(std::array<A, N> a, std::array<B, N> b, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...)) +{ + return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...); +} + +template<typename Reducer, typename Op, typename A, typename B, std::size_t N> +constexpr inline auto array_zip_and_reduce(std::array<A, N> a, std::array<B, N> b) -> decltype(h_array_zip_and_reduce<Reducer, Op, A, B, N>(a, b, typename gen_numeric_list<int, N>::type())) +{ + return h_array_zip_and_reduce<Reducer, Op, A, B, N>(a, b, typename gen_numeric_list<int, N>::type()); +} + +/* apply stuff to an array */ + +template<typename Op, typename A, std::size_t N, int... n> +constexpr inline std::array<decltype(Op::run(A())),N> h_array_apply(std::array<A, N> a, numeric_list<int, n...>) +{ + return std::array<decltype(Op::run(A())),N>{{ Op::run(array_get<n>(a))... }}; +} + +template<typename Op, typename A, std::size_t N> +constexpr inline std::array<decltype(Op::run(A())),N> array_apply(std::array<A, N> a) +{ + return h_array_apply<Op>(a, typename gen_numeric_list<int, N>::type()); +} + +/* apply stuff to an array and reduce */ + +template<typename Reducer, typename Op, typename A, std::size_t N, int... n> +constexpr inline auto h_array_apply_and_reduce(std::array<A, N> arr, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...)) +{ + return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...); +} + +template<typename Reducer, typename Op, typename A, std::size_t N> +constexpr inline auto array_apply_and_reduce(std::array<A, N> a) -> decltype(h_array_apply_and_reduce<Reducer, Op, A, N>(a, typename gen_numeric_list<int, N>::type())) +{ + return h_array_apply_and_reduce<Reducer, Op, A, N>(a, typename gen_numeric_list<int, N>::type()); +} + +/* repeat a value n times (and make an array out of it + * usage: + * std::array<int, 16> = repeat<16>(42); + */ + +template<int n> +struct h_repeat +{ + template<typename t, int... ii> + constexpr static inline std::array<t, n> run(t v, numeric_list<int, ii...>) + { + return {{ typename id_numeric<int, ii, t>::type(v)... }}; + } +}; + +template<int n, typename t> +constexpr std::array<t, n> repeat(t v) { return h_repeat<n>::run(v, typename gen_numeric_list<int, n>::type()); } + +/* instantiate a class by a C-style array */ +template<class InstType, typename ArrType, std::size_t N, bool Reverse, typename... Ps> +struct h_instantiate_by_c_array; + +template<class InstType, typename ArrType, std::size_t N, typename... Ps> +struct h_instantiate_by_c_array<InstType, ArrType, N, false, Ps...> +{ + static InstType run(ArrType* arr, Ps... args) + { + return h_instantiate_by_c_array<InstType, ArrType, N - 1, false, Ps..., ArrType>::run(arr + 1, args..., arr[0]); + } +}; + +template<class InstType, typename ArrType, std::size_t N, typename... Ps> +struct h_instantiate_by_c_array<InstType, ArrType, N, true, Ps...> +{ + static InstType run(ArrType* arr, Ps... args) + { + return h_instantiate_by_c_array<InstType, ArrType, N - 1, false, ArrType, Ps...>::run(arr + 1, arr[0], args...); + } +}; + +template<class InstType, typename ArrType, typename... 
Ps> +struct h_instantiate_by_c_array<InstType, ArrType, 0, false, Ps...> +{ + static InstType run(ArrType* arr, Ps... args) + { + (void)arr; + return InstType(args...); + } +}; + +template<class InstType, typename ArrType, typename... Ps> +struct h_instantiate_by_c_array<InstType, ArrType, 0, true, Ps...> +{ + static InstType run(ArrType* arr, Ps... args) + { + (void)arr; + return InstType(args...); + } +}; + +template<class InstType, typename ArrType, std::size_t N, bool Reverse = false> +InstType instantiate_by_c_array(ArrType* arr) +{ + return h_instantiate_by_c_array<InstType, ArrType, N, Reverse>::run(arr); +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11META_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h new file mode 100644 index 0000000000..a590cf4e18 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h @@ -0,0 +1,116 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler <christian@iwakd.de> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11WORKAROUNDS_H +#define EIGEN_CXX11WORKAROUNDS_H + +/* COMPATIBILITY CHECKS + * (so users of compilers that are too old get some realistic error messages) + */ +#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1310) +#error Intel Compiler only supports required C++ features since version 13.1. +// note that most stuff in principle works with 13.0 but when combining +// some features, at some point 13.0 will just fail with an internal assertion +#elif defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)) +// G++ < 4.6 by default will continue processing the source files - even if we use #error to make +// it error out. For this reason, we use the pragma to make sure G++ aborts at the first error +// it sees. Unfortunately, that is still not our #error directive, but at least the output is +// short enough the user has a chance to see that the compiler version is not sufficient for +// the funky template mojo we use. +#pragma GCC diagnostic error "-Wfatal-errors" +#error GNU C++ Compiler (g++) only supports required C++ features since version 4.6. +#endif + +/* Check that the compiler at least claims to support C++11. It might not be sufficient + * because the compiler may not implement it correctly, but at least we'll know. + */ +#if __cplusplus <= 199711L +#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) +#pragma GCC diagnostic error "-Wfatal-errors" +#endif +#error This library needs at least a C++11 compliant compiler. If you use g++/clang, please enable the -std=c++11 compiler flag. (-std=c++0x on older versions.) 
+#endif + +namespace Eigen { + +// Use std::array as Eigen array +template <typename T, std::size_t N> using array = std::array<T, N>; + +namespace internal { + +/* std::get is only constexpr in C++14, not yet in C++11 + * - libstdc++ from version 4.7 onwards has it nevertheless, + * so use that + * - libstdc++ older versions: use _M_instance directly + * - libc++ all versions so far: use __elems_ directly + * - all other libs: use std::get to be portable, but + * this may not be constexpr + */ +#if defined(__GLIBCXX__) && __GLIBCXX__ < 20120322 +#define STD_GET_ARR_HACK a._M_instance[I] +#elif defined(_LIBCPP_VERSION) +#define STD_GET_ARR_HACK a.__elems_[I] +#else +#define STD_GET_ARR_HACK std::template get<I, T, N>(a) +#endif + +template<std::size_t I, class T, std::size_t N> constexpr inline T& array_get(std::array<T,N>& a) { return (T&) STD_GET_ARR_HACK; } +template<std::size_t I, class T, std::size_t N> constexpr inline T&& array_get(std::array<T,N>&& a) { return (T&&) STD_GET_ARR_HACK; } +template<std::size_t I, class T, std::size_t N> constexpr inline T const& array_get(std::array<T,N> const& a) { return (T const&) STD_GET_ARR_HACK; } + +template<std::size_t I, class T> constexpr inline T& array_get(std::vector<T>& a) { return a[I]; } +template<std::size_t I, class T> constexpr inline T&& array_get(std::vector<T>&& a) { return a[I]; } +template<std::size_t I, class T> constexpr inline T const& array_get(std::vector<T> const& a) { return a[I]; } + +#undef STD_GET_ARR_HACK + +template <typename T> struct array_size; +template<class T, std::size_t N> struct array_size<const std::array<T,N> > { + static const size_t value = N; +}; +template <typename T> struct array_size; +template<class T, std::size_t N> struct array_size<std::array<T,N> > { + static const size_t value = N; +}; + +/* Suppose you have a template of the form + * template<typename T> struct X; + * And you want to specialize it in such a way: + * template<typename S1, typename... SN> struct X<Foo<S1, SN...>> { ::: }; + * template<> struct X<Foo<>> { ::: }; + * This will work in Intel's compiler 13.0, but only to some extent in g++ 4.6, since + * g++ can only match templates called with parameter packs if the number of template + * arguments is not a fixed size (so inside the first specialization, referencing + * X<Foo<Sn...>> will fail in g++). On the other hand, g++ will accept the following: + * template<typename S...> struct X<Foo<S...>> { ::: }: + * as an additional (!) specialization, which will then only match the empty case. + * But Intel's compiler 13.0 won't accept that, it will only accept the empty syntax, + * so we have to create a workaround for this. + */ +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) +#define EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n) mt... n +#define EIGEN_TPL_PP_SPEC_HACK_DEFC(mt, n) , EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n) +#define EIGEN_TPL_PP_SPEC_HACK_USE(n) n... +#define EIGEN_TPL_PP_SPEC_HACK_USEC(n) , n... 
+#else +#define EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n) +#define EIGEN_TPL_PP_SPEC_HACK_DEFC(mt, n) +#define EIGEN_TPL_PP_SPEC_HACK_USE(n) +#define EIGEN_TPL_PP_SPEC_HACK_USEC(n) +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11WORKAROUNDS_H + +/* + * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; + */ diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h new file mode 100644 index 0000000000..a1e1dca8e1 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h @@ -0,0 +1,456 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_EMULATE_CXX11_META_H +#define EIGEN_EMULATE_CXX11_META_H + + + +namespace Eigen { + +// The array class is only available starting with cxx11. Emulate our own here +// if needed +template <typename T, size_t n> class array { + public: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& operator[] (size_t index) { return values[index]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { return values[index]; } + + static EIGEN_ALWAYS_INLINE std::size_t size() { return n; } + + T values[n]; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array() { } + explicit EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v) { + EIGEN_STATIC_ASSERT(n==1, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2) { + EIGEN_STATIC_ASSERT(n==2, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3) { + EIGEN_STATIC_ASSERT(n==3, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, + const T& v4) { + EIGEN_STATIC_ASSERT(n==4, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5) { + EIGEN_STATIC_ASSERT(n==5, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5, const T& v6) { + EIGEN_STATIC_ASSERT(n==6, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + values[5] = v6; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5, const T& v6, const T& v7) { + EIGEN_STATIC_ASSERT(n==7, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + values[5] = v6; + values[6] = v7; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array( + const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5, const T& v6, const T& v7, const T& v8) { + EIGEN_STATIC_ASSERT(n==8, 
YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + values[5] = v6; + values[6] = v7; + values[7] = v8; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + array(std::initializer_list<T> l) { + eigen_assert(l.size() == n); + internal::smart_copy(l.begin(), l.end(), values); + } +#endif +}; + +// Specialize array for zero size +template <typename T> class array<T, 0> { + public: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& operator[] (size_t index) { + eigen_assert(false && "Can't index a zero size array"); + return *static_cast<T*>(NULL); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { + eigen_assert(false && "Can't index a zero size array"); + return *static_cast<const T*>(NULL); + } + + static EIGEN_ALWAYS_INLINE std::size_t size() { return 0; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array() { } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + array(std::initializer_list<T> l) { + eigen_assert(l.size() == 0); + } +#endif +}; + +namespace internal { + +/** \internal + * \file CXX11/Core/util/EmulateCXX11Meta.h + * This file emulates a subset of the functionality provided by CXXMeta.h for + * compilers that don't yet support cxx11 such as nvcc. + */ + +struct empty_list { static const std::size_t count = 0; }; + +template<typename T, typename Tail=empty_list> struct type_list { + typedef T HeadType; + typedef Tail TailType; + static const T head; + static const Tail tail; + static const std::size_t count = 1 + Tail::count; +}; + +struct null_type { }; + +template<typename T1 = null_type, typename T2 = null_type, typename T3 = null_type, + typename T4 = null_type, typename T5 = null_type, typename T6 = null_type, + typename T7 = null_type, typename T8 = null_type> +struct make_type_list { + typedef typename make_type_list<T2, T3, T4, T5, T6, T7, T8>::type tailresult; + + typedef type_list<T1, tailresult> type; +}; + +template<> struct make_type_list<> { + typedef empty_list type; +}; + + +template <std::size_t index, class TList> struct get_type; + +template <class Head, class Tail> +struct get_type<0, type_list<Head, Tail> > +{ + typedef Head type; +}; + +template <std::size_t i, class Head, class Tail> +struct get_type<i, type_list<Head, Tail> > +{ + typedef typename get_type<i-1, Tail>::type type; +}; + + +/* numeric list */ +template <typename T, T n> +struct type2val { + typedef T type; + static const T value = n; +}; + + +template<typename T, size_t n, T V> struct gen_numeric_list_repeated; + +template<typename T, T V> struct gen_numeric_list_repeated<T, 1, V> { + typedef typename make_type_list<type2val<T, V> >::type type; +}; + +template<typename T, T V> struct gen_numeric_list_repeated<T, 2, V> { + typedef typename make_type_list<type2val<T, V>, type2val<T, V> >::type type; +}; + +template<typename T, T V> struct gen_numeric_list_repeated<T, 3, V> { + typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type; +}; + +template<typename T, T V> struct gen_numeric_list_repeated<T, 4, V> { + typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type; +}; + +template<typename T, T V> struct gen_numeric_list_repeated<T, 5, V> { + typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type; +}; + +template<typename T, T V> struct gen_numeric_list_repeated<T, 6, V> { + typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, 
V>, + type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type; +}; + +template<typename T, T V> struct gen_numeric_list_repeated<T, 7, V> { + typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>, + type2val<T, V>, type2val<T, V>, type2val<T, V>, + type2val<T, V> >::type type; +}; + +template<typename T, T V> struct gen_numeric_list_repeated<T, 8, V> { + typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>, + type2val<T, V>, type2val<T, V>, type2val<T, V>, + type2val<T, V>, type2val<T, V> >::type type; +}; + + +template <std::size_t index, class NList> struct get; + +template <std::size_t i> +struct get<i, empty_list> +{ + get() { eigen_assert(false && "index overflow"); } + typedef void type; + static const char value = '\0'; +}; + +template <std::size_t i, class Head> +struct get<i, type_list<Head, empty_list> > +{ + get() { eigen_assert(false && "index overflow"); } + typedef void type; + static const char value = '\0'; +}; + +template <class Head> +struct get<0, type_list<Head, empty_list> > +{ + typedef typename Head::type type; + static const type value = Head::value; +}; + +template <class Head, class Tail> +struct get<0, type_list<Head, Tail> > +{ + typedef typename Head::type type; + static const type value = Head::value; +}; + +template <std::size_t i, class Head, class Tail> +struct get<i, type_list<Head, Tail> > +{ + typedef typename Tail::HeadType::type type; + static const type value = get<i-1, Tail>::value; +}; + + +template <class NList> struct arg_prod { + static const typename NList::HeadType::type value = get<0, NList>::value * arg_prod<typename NList::TailType>::value; +}; +template <> struct arg_prod<empty_list> { + static const int value = 1; +}; + + +template<int n, typename t> +array<t, n> repeat(t v) { + array<t, n> array; + array.fill(v); + return array; +} + +template<std::size_t I, class Head, class Tail> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(type_list<Head, Tail>& a) { + return get<I, type_list<Head, Tail> >::value; +} +template<std::size_t I, class Head, class Tail> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(const type_list<Head, Tail>& a) { + return get<I, type_list<Head, Tail> >::value; +} + +template <class NList> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NList::HeadType::type array_prod(const NList& l) { + return arg_prod<NList>::value; +}; + +template<std::size_t n, typename t> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array<t, n>& a) { + t prod = 1; + for (size_t i = 0; i < n; ++i) { prod *= a[i]; } + return prod; +} + +template<typename t> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector<t>& a) { + t prod = 1; + for (size_t i = 0; i < a.size(); ++i) { prod *= a[i]; } + return prod; +} + +template<std::size_t I, class T, std::size_t N> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array<T,N>& a) { + return a[I]; +} +template<std::size_t I, class T, std::size_t N> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array<T,N>& a) { + return a[I]; +} + +template<std::size_t I, class T> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(std::vector<T>& a) { + return a[I]; +} +template<std::size_t I, class T> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const std::vector<T>& a) { + return a[I]; +} + +template <typename T> struct array_size; +template<class T, std::size_t N> struct array_size<array<T,N> > { + static const size_t value = N; +}; +template <typename T> struct 
array_size; +template<class T, std::size_t N> struct array_size<array<T,N>& > { + static const size_t value = N; +}; +template <typename T> struct array_size; +template<class T, std::size_t N> struct array_size<const array<T,N> > { + static const size_t value = N; +}; +template <typename T> struct array_size; +template<class T, std::size_t N> struct array_size<const array<T,N>& > { + static const size_t value = N; +}; + +struct sum_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a + b; } +}; +struct product_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a * b; } +}; + +struct logical_and_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a && b; } +}; +struct logical_or_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a || b; } +}; + +struct equal_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a == b; } +}; +struct not_equal_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a != b; } +}; +struct lesser_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a < b; } +}; +struct lesser_equal_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a <= b; } +}; + +struct greater_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a > b; } +}; +struct greater_equal_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a >= b; } +}; + +struct not_op { + template<typename A> static inline bool run(A a) { return !a; } +}; +struct negation_op { + template<typename A> static inline bool run(A a) { return -a; } +}; +struct greater_equal_zero_op { + template<typename A> static inline bool run(A a) { return a >= 0; } +}; + + +template<typename Reducer, typename Op, typename A, std::size_t N> +struct ArrayApplyAndReduce { + static inline bool run(const array<A, N>& a) { + EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE); + bool result = Reducer::run(Op::run(a[0]), Op::run(a[1])); + for (size_t i = 2; i < N; ++i) { + result = Reducer::run(result, Op::run(a[i])); + } + return result; + } +}; + +template<typename Reducer, typename Op, typename A> +struct ArrayApplyAndReduce<Reducer, Op, A, 1> { + static inline bool run(const array<A, 1>& a) { + return Op::run(a[0]); + } +}; + +template<typename Reducer, typename Op, typename A, std::size_t N> +inline bool array_apply_and_reduce(const array<A, N>& a) { + return ArrayApplyAndReduce<Reducer, Op, A, N>::run(a); +} + +template<typename Reducer, typename Op, typename A, typename B, std::size_t N> +struct ArrayZipAndReduce { + static inline bool run(const array<A, N>& a, const array<B, N>& b) { + EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE); + bool result = Reducer::run(Op::run(a[0], b[0]), Op::run(a[1], b[1])); + for (size_t i = 2; i < N; ++i) { + result = Reducer::run(result, Op::run(a[i], b[i])); + } + return result; + } +}; + +template<typename Reducer, typename Op, typename A, typename B> +struct ArrayZipAndReduce<Reducer, Op, A, B, 1> { + static inline bool run(const array<A, 1>& a, const array<B, 1>& b) { + return Op::run(a[0], b[0]); + } +}; + +template<typename Reducer, typename Op, typename A, typename B, std::size_t N> +inline bool array_zip_and_reduce(const array<A, N>& a, const array<B, N>& b) { + return ArrayZipAndReduce<Reducer, Op, A, B, N>::run(a, b); +} + +} // end namespace internal + +} // end namespace Eigen + 
+ + +#endif // EIGEN_EMULATE_CXX11_META_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Core/util/FixedSizeVector.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Core/util/FixedSizeVector.h new file mode 100644 index 0000000000..c68119aa03 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Core/util/FixedSizeVector.h @@ -0,0 +1,128 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_FIXEDSIZEVECTOR_H +#define EIGEN_FIXEDSIZEVECTOR_H + +namespace Eigen { + +/** \class FixedSizeVector + * \ingroup Core + * + * \brief The FixedSizeVector class. + * + * The %FixedSizeVector provides a subset of std::vector functionality. + * + * The goal is to provide basic std::vector operations when using + * std::vector is not an option (e.g. on GPU or when compiling using + * FMA/AVX, as this can cause either compilation failures or illegal + * instruction failures). + * + */ +template <typename T> +class FixedSizeVector { + public: + // Construct a new FixedSizeVector, reserve n elements. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit FixedSizeVector(size_t n) + : reserve_(n), size_(0), + data_(static_cast<T*>(internal::aligned_malloc(n * sizeof(T)))) { + for (size_t i = 0; i < n; ++i) { new (&data_[i]) T; } + } + + // Construct a new FixedSizeVector, reserve and resize to n. + // Copy the init value to all elements. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit FixedSizeVector(size_t n, const T& init) + : reserve_(n), size_(n), + data_(static_cast<T*>(internal::aligned_malloc(n * sizeof(T)))) { + for (size_t i = 0; i < n; ++i) { new (&data_[i]) T(init); } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + ~FixedSizeVector() { + for (size_t i = 0; i < size_; ++i) { + data_[i].~T(); + } + internal::aligned_free(data_); + } + + // Append new elements (up to reserved size). + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void push_back(const T& t) { + eigen_assert(size_ < reserve_); + data_[size_++] = t; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const T& operator[] (size_t i) const { + eigen_assert(i < size_); + return data_[i]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + T& operator[] (size_t i) { + eigen_assert(i < size_); + return data_[i]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + T& back() { + eigen_assert(size_ > 0); + return data_[size_ - 1]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const T& back() const { + eigen_assert(size_ > 0); + return data_[size_ - 1]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void pop_back() { + // NOTE: This does not destroy the value at the end the way + // std::vector's version of pop_back() does. That happens when + // the Vector is destroyed. 
+ eigen_assert(size_ > 0); + size_--; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + size_t size() const { return size_; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + bool empty() const { return size_ == 0; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + T* data() { return data_; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const T* data() const { return data_; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + T* begin() { return data_; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + T* end() { return data_ + size_; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const T* begin() const { return data_; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const T* end() const { return data_ + size_; } + + private: + size_t reserve_; + size_t size_; + T* data_; +}; + +} // namespace Eigen + +#endif // EIGEN_FIXEDSIZEVECTOR_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h new file mode 100644 index 0000000000..564729ce48 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h @@ -0,0 +1,341 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_FIXED_POINT_TYPES_H +#define EIGEN_CXX11_FIXED_POINT_TYPES_H + +#include <cmath> +#include <iostream> + +namespace Eigen { + +// The mantissa part of the fixed point representation. See +// go/tensorfixedpoint for details +struct QInt8; +struct QUInt8; +struct QInt16; +struct QUInt16; +struct QInt32; + +template <> +struct NumTraits<QInt8> : GenericNumTraits<int8_t> {}; +template <> +struct NumTraits<QUInt8> : GenericNumTraits<uint8_t> {}; +template <> +struct NumTraits<QInt16> : GenericNumTraits<int16_t> {}; +template <> +struct NumTraits<QUInt16> : GenericNumTraits<uint16_t> {}; +template <> +struct NumTraits<QInt32> : GenericNumTraits<int32_t> {}; + +namespace internal { +template <> +struct scalar_product_traits<QInt32, double> { + enum { + // Cost = NumTraits<T>::MulCost, + Defined = 1 + }; + typedef QInt32 ReturnType; +}; +} + +// Wrap the 8bit int into a QInt8 struct instead of using a typedef to prevent +// the compiler from silently type cast the mantissa into a bigger or a smaller +// representation. 
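// Editorial illustration (not part of the patch) of the silent-cast problem a
// bare typedef would leave open; QInt8Raw below is a hypothetical name used
// only for contrast:
//   typedef int8_t QInt8Raw;
//   QInt8Raw a = 100, b = 100;
//   QInt8Raw p = a * b;   // promoted to int (10000), then silently truncated
//                         // back to 8 bits when stored
// Wrapping the mantissa in the structs below means every narrowing has to go
// through the explicit, saturating QInt32 conversions defined further down.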
+struct QInt8 { + QInt8() {} + QInt8(const int8_t v) : value(v) {} + QInt8(const QInt32 v); + + operator int() const { return static_cast<int>(value); } + + int8_t value; +}; + +struct QUInt8 { + QUInt8() {} + QUInt8(const uint8_t v) : value(v) {} + QUInt8(const QInt32 v); + + operator int() const { return static_cast<int>(value); } + + uint8_t value; +}; + +struct QInt16 { + QInt16() {} + QInt16(const int16_t v) : value(v) {} + QInt16(const QInt32 v); + operator int() const { return static_cast<int>(value); } + + int16_t value; +}; + +struct QUInt16 { + QUInt16() {} + QUInt16(const uint16_t v) : value(v) {} + QUInt16(const QInt32 v); + operator int() const { return static_cast<int>(value); } + + uint16_t value; +}; + +struct QInt32 { + QInt32() {} + QInt32(const int8_t v) : value(v) {} + QInt32(const int32_t v) : value(v) {} + QInt32(const QInt8 v) : value(v.value) {} + QInt32(const float v) : value(static_cast<int32_t>(lrint(v))) {} +#ifdef EIGEN_MAKING_DOCS + // Workaround to fix build on PPC. + QInt32(unsigned long v) : value(v) {} +#endif + + operator float() const { return static_cast<float>(value); } + + int32_t value; +}; + +EIGEN_STRONG_INLINE QInt8::QInt8(const QInt32 v) + : value(v.value > 127 ? 127 : (v.value < -128 ? -128 : v.value)) {} +EIGEN_STRONG_INLINE QUInt8::QUInt8(const QInt32 v) + : value(v.value > 255 ? 255 : (v.value < 0 ? 0 : v.value)) {} +EIGEN_STRONG_INLINE QInt16::QInt16(const QInt32 v) + : value(v.value > 32767 ? 32767 : (v.value < -32768 ? -32768 : v.value)) {} +EIGEN_STRONG_INLINE QUInt16::QUInt16(const QInt32 v) + : value(v.value > 65535 ? 65535 : (v.value < 0 ? 0 : v.value)) {} + +// Basic widening 8-bit operations: This will be vectorized in future CLs. +EIGEN_STRONG_INLINE QInt32 operator*(const QInt8 a, const QInt8 b) { + return QInt32(static_cast<int32_t>(a.value) * static_cast<int32_t>(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator*(const QInt8 a, const QUInt8 b) { + return QInt32(static_cast<int32_t>(a.value) * static_cast<int32_t>(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator+(const QInt8 a, const QInt8 b) { + return QInt32(static_cast<int32_t>(a.value) + static_cast<int32_t>(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt8 a, const QInt8 b) { + return QInt32(static_cast<int32_t>(a.value) - static_cast<int32_t>(b.value)); +} + +// Basic widening 16-bit operations: This will be vectorized in future CLs. +EIGEN_STRONG_INLINE QInt32 operator*(const QInt16 a, const QInt16 b) { + return QInt32(static_cast<int32_t>(a.value) * static_cast<int32_t>(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator*(const QInt16 a, const QUInt16 b) { + return QInt32(static_cast<int32_t>(a.value) * static_cast<int32_t>(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator+(const QInt16 a, const QInt16 b) { + return QInt32(static_cast<int32_t>(a.value) + static_cast<int32_t>(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt16 a, const QInt16 b) { + return QInt32(static_cast<int32_t>(a.value) - static_cast<int32_t>(b.value)); +} + +// Mixed QInt32 op QInt8 operations. This will be vectorized in future CLs. 
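// Editorial sketch (not part of the patch) of what the widening and saturating
// pieces above give you in practice:
//   QInt8  a(100), b(100);
//   QInt32 p = a * b;      // widening product: 10000 is kept exactly
//   QInt8  q(p);           // narrowing goes through QInt8(QInt32): saturates to 127
//   QUInt8 r(QInt32(-5));  // likewise clamps to 0 instead of wrapping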
+EIGEN_STRONG_INLINE QInt32 operator+(const QInt32 a, const QInt8 b) { + return QInt32(a.value + static_cast<int32_t>(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator+(const QInt8 a, const QInt32 b) { + return QInt32(static_cast<int32_t>(a.value) + b.value); +} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a, const QInt8 b) { + return QInt32(a.value - static_cast<int32_t>(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt8 a, const QInt32 b) { + return QInt32(static_cast<int32_t>(a.value) - b.value); +} +EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const QInt8 b) { + return QInt32(a.value * static_cast<int32_t>(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator*(const QInt8 a, const QInt32 b) { + return QInt32(static_cast<int32_t>(a.value) * b.value); +} + +// Mixed QInt32 op QInt16 operations. This will be vectorized in future CLs. +EIGEN_STRONG_INLINE QInt32 operator+(const QInt32 a, const QInt16 b) { + return QInt32(a.value + static_cast<int32_t>(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator+(const QInt16 a, const QInt32 b) { + return QInt32(static_cast<int32_t>(a.value) + b.value); +} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a, const QInt16 b) { + return QInt32(a.value - static_cast<int32_t>(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt16 a, const QInt32 b) { + return QInt32(static_cast<int32_t>(a.value) - b.value); +} +EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const QInt16 b) { + return QInt32(a.value * static_cast<int32_t>(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator*(const QInt16 a, const QInt32 b) { + return QInt32(static_cast<int32_t>(a.value) * b.value); +} + +// Mixed QInt32 op QUInt8 operations. This will be vectorized in future CLs. +EIGEN_STRONG_INLINE QInt32 operator+(const QInt32 a, const QUInt8 b) { + return QInt32(a.value + static_cast<int32_t>(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator+(const QUInt8 a, const QInt32 b) { + return QInt32(static_cast<int32_t>(a.value) + b.value); +} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a, const QUInt8 b) { + return QInt32(a.value - static_cast<int32_t>(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator-(const QUInt8 a, const QInt32 b) { + return QInt32(static_cast<int32_t>(a.value) - b.value); +} +EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const QUInt8 b) { + return QInt32(a.value * static_cast<int32_t>(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator*(const QUInt8 a, const QInt32 b) { + return QInt32(static_cast<int32_t>(a.value) * b.value); +} + +// Mixed QInt32 op QUInt16 operations. This will be vectorized in future CLs. +EIGEN_STRONG_INLINE QInt32 operator+(const QInt32 a, const QUInt16 b) { + return QInt32(a.value + static_cast<int32_t>(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator+(const QUInt16 a, const QInt32 b) { + return QInt32(static_cast<int32_t>(a.value) + b.value); +} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a, const QUInt16 b) { + return QInt32(a.value - static_cast<int32_t>(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator-(const QUInt16 a, const QInt32 b) { + return QInt32(static_cast<int32_t>(a.value) - b.value); +} +EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const QUInt16 b) { + return QInt32(a.value * static_cast<int32_t>(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator*(const QUInt16 a, const QInt32 b) { + return QInt32(static_cast<int32_t>(a.value) * b.value); +} + +// Basic arithmetic operations on QInt32, which behaves like a int32_t. 
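// Editorial note (not part of the patch): unlike the narrowing constructors
// above, none of the QInt32 operators below saturate; they perform the plain
// int32_t arithmetic of the underlying mantissa, so e.g.
//   QInt32 x(0x7FFFFFFF);
//   QInt32 y = x + QInt32(1);   // overflows exactly as a raw int32_t addition would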
+EIGEN_STRONG_INLINE QInt32 operator+(const QInt32 a, const QInt32 b) { + return a.value + b.value; +} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a, const QInt32 b) { + return a.value - b.value; +} +EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const QInt32 b) { + return a.value * b.value; +} +EIGEN_STRONG_INLINE QInt32 operator/(const QInt32 a, const QInt32 b) { + return a.value / b.value; +} +EIGEN_STRONG_INLINE QInt32& operator+=(QInt32& a, const QInt32 b) { + a.value += b.value; + return a; +} +EIGEN_STRONG_INLINE QInt32& operator-=(QInt32& a, const QInt32 b) { + a.value -= b.value; + return a; +} +EIGEN_STRONG_INLINE QInt32& operator*=(QInt32& a, const QInt32 b) { + a.value *= b.value; + return a; +} +EIGEN_STRONG_INLINE QInt32& operator/=(QInt32& a, const QInt32 b) { + a.value /= b.value; + return a; +} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a) { + return -a.value; +} + +// Scaling QInt32 by double. We do the arithmetic in double because +// float only has 23 bits of mantissa, so casting QInt32 to float might reduce +// accuracy by discarding up to 7 (least significant) bits. +EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const double b) { + return static_cast<int32_t>(lrint(static_cast<double>(a.value) * b)); +} +EIGEN_STRONG_INLINE QInt32 operator*(const double a, const QInt32 b) { + return static_cast<int32_t>(lrint(a * static_cast<double>(b.value))); +} +EIGEN_STRONG_INLINE QInt32& operator*=(QInt32& a, const double b) { + a.value = static_cast<int32_t>(lrint(static_cast<double>(a.value) * b)); + return a; +} + +// Comparisons +EIGEN_STRONG_INLINE bool operator==(const QInt8 a, const QInt8 b) { + return a.value == b.value; +} +EIGEN_STRONG_INLINE bool operator==(const QUInt8 a, const QUInt8 b) { + return a.value == b.value; +} +EIGEN_STRONG_INLINE bool operator==(const QInt16 a, const QInt16 b) { + return a.value == b.value; +} +EIGEN_STRONG_INLINE bool operator==(const QUInt16 a, const QUInt16 b) { + return a.value == b.value; +} +EIGEN_STRONG_INLINE bool operator==(const QInt32 a, const QInt32 b) { + return a.value == b.value; +} + +EIGEN_STRONG_INLINE bool operator<(const QInt8 a, const QInt8 b) { + return a.value < b.value; +} +EIGEN_STRONG_INLINE bool operator<(const QUInt8 a, const QUInt8 b) { + return a.value < b.value; +} +EIGEN_STRONG_INLINE bool operator<(const QInt16 a, const QInt16 b) { + return a.value < b.value; +} +EIGEN_STRONG_INLINE bool operator<(const QUInt16 a, const QUInt16 b) { + return a.value < b.value; +} +EIGEN_STRONG_INLINE bool operator<(const QInt32 a, const QInt32 b) { + return a.value < b.value; +} + +EIGEN_STRONG_INLINE bool operator>(const QInt8 a, const QInt8 b) { + return a.value > b.value; +} +EIGEN_STRONG_INLINE bool operator>(const QUInt8 a, const QUInt8 b) { + return a.value > b.value; +} +EIGEN_STRONG_INLINE bool operator>(const QInt16 a, const QInt16 b) { + return a.value > b.value; +} +EIGEN_STRONG_INLINE bool operator>(const QUInt16 a, const QUInt16 b) { + return a.value > b.value; +} +EIGEN_STRONG_INLINE bool operator>(const QInt32 a, const QInt32 b) { + return a.value > b.value; +} + +EIGEN_STRONG_INLINE std::ostream& operator<<(std::ostream& os, QInt8 a) { + os << static_cast<int>(a.value); + return os; +} +EIGEN_STRONG_INLINE std::ostream& operator<<(std::ostream& os, QUInt8 a) { + os << static_cast<int>(a.value); + return os; +} +EIGEN_STRONG_INLINE std::ostream& operator<<(std::ostream& os, QInt16 a) { + os << static_cast<int>(a.value); + return os; +} +EIGEN_STRONG_INLINE std::ostream& 
operator<<(std::ostream& os, QUInt16 a) { + os << static_cast<int>(a.value); + return os; +} +EIGEN_STRONG_INLINE std::ostream& operator<<(std::ostream& os, QInt32 a) { + os << a.value; + return os; +} + +} // namespace Eigen + +#endif // EIGEN_CXX11_FIXED_POINT_TYPES_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h new file mode 100644 index 0000000000..4d0dca07df --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h @@ -0,0 +1,255 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_H +#define EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_H + + +namespace Eigen { +namespace internal { + +// Accumulate the product of 2 QInt8 inputs on 32 bits to prevent +// overflows +template<> struct scalar_product_traits<QInt8, QInt8> +{ + enum { + Defined = 1 + }; + typedef QInt32 ReturnType; +}; + +// Accumulate the product of QInt8 inputs with QUint8 inputs on 32 bits +// to prevent overflows +template<> struct scalar_product_traits<QInt8, QUInt8> +{ + enum { + Defined = 1 + }; + typedef QInt32 ReturnType; +}; + +// Description of the product implementation. It's pretty simple now since +// nothing is vectorized yet. +// This definition tackle the case where both lhs and rhs are encoded using +// signed 8bit integers +#ifndef EIGEN_USE_OPTIMIZED_INT8_INT8_MAT_MAT_PRODUCT + +template<bool _ConjLhs, bool _ConjRhs> +class gebp_traits<QInt8, QInt8, _ConjLhs, _ConjRhs> +{ +public: + typedef QInt8 LhsScalar; + typedef QInt8 RhsScalar; + typedef QInt32 ResScalar; + + enum { + // register block size along the M and N directions + // One for the current implementation + nr = 1, + mr = 1, + // Progress made at each iteration of the product loop + // also 1 for the current implementation + LhsProgress = 1, + RhsProgress = 1 + }; +}; + +// The signed 8bit Mat-Mat product itself. 
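// Editorial sketch (not part of the patch) of why ResScalar is QInt32: a single
// signed 8-bit product already needs more than 8 bits, and the kernel below
// accumulates `depth` of them into each output, e.g.
//   (-128) * (-128) = 16384          // outside int8_t range, and two such terms
//                                    // already exceed int16_t
//   depth = 1024  ->  |accumulator| <= 1024 * 16384 = 16,777,216
// which still fits comfortably in a signed 32-bit accumulator.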
+template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> +struct gebp_kernel<QInt8, QInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +{ + EIGEN_DONT_INLINE + void operator()(const DataMapper& res, const QInt8* blockA, const QInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> +EIGEN_DONT_INLINE +void gebp_kernel<QInt8, QInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +::operator()(const DataMapper& res, const QInt8* blockA, const QInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) +{ + EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + + eigen_assert(alpha.value == 1); + eigen_assert(strideA == -1); + eigen_assert(strideB == -1); + eigen_assert(offsetA == 0); + eigen_assert(offsetB == 0); + + eigen_assert(rows > 0); + eigen_assert(cols > 0); + eigen_assert(depth > 0); + eigen_assert(blockA); + eigen_assert(blockB); + + for (Index j = 0; j < cols; ++j) { + Index startB = j * depth; + + for (Index i = 0; i < rows; ++i) { + Index startA = i * depth; + + for (Index k = 0; k < depth; ++k) { + res(i, j) += blockA[startA + k] * blockB[startB + k]; + } + } + } +} +#endif + + +// This definition tackle the case where the lhs is encoded using signed 8bit +// integers and the rhs using unsigned 8bit integers. +#ifndef EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT +template<bool _ConjLhs, bool _ConjRhs> +class gebp_traits<QInt8, QUInt8, _ConjLhs, _ConjRhs> +{ +public: + typedef QInt8 LhsScalar; + typedef QUInt8 RhsScalar; + typedef QInt32 ResScalar; + + enum { + // register block size along the M and N directions + // One for the current implementation + nr = 1, + mr = 1, + // Progress made at each iteration of the product loop + // also 1 for the current implementation + LhsProgress = 1, + RhsProgress = 1 + }; +}; + +// Mat-Mat product of a signed 8bit lhs with an unsigned 8bit rhs +template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> +struct gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +{ + EIGEN_DONT_INLINE + void operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> +EIGEN_DONT_INLINE +void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +::operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) +{ + EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + + eigen_assert(alpha.value == 1); + eigen_assert(strideA == -1); + eigen_assert(strideB == -1); + eigen_assert(offsetA == 0); + eigen_assert(offsetB == 0); + + eigen_assert(rows > 0); + eigen_assert(cols > 0); + eigen_assert(depth > 0); + eigen_assert(blockA); + eigen_assert(blockB); + + for (Index j = 0; j < cols; ++j) { + Index startB = 
j * depth; + + for (Index i = 0; i < rows; ++i) { + Index startA = i * depth; + + for (Index k = 0; k < depth; ++k) { + res(i, j) += blockA[startA + k] * blockB[startB + k]; + } + } + } +} +#endif + + +// This definition tackle the case where the khs is encoded using unsigned 8bit +// integers and the rhs using signed 8bit integers. +#ifndef EIGEN_USE_OPTIMIZED_UINT8_INT8_MAT_MAT_PRODUCT +template<bool _ConjLhs, bool _ConjRhs> +class gebp_traits<QUInt8, QInt8, _ConjLhs, _ConjRhs> +{ +public: + typedef QUInt8 LhsScalar; + typedef QInt8 RhsScalar; + typedef QInt32 ResScalar; + + enum { + // register block size along the M and N directions + // One for the current implementation + nr = 1, + mr = 1, + // Progress made at each iteration of the product loop + // also 1 for the current implementation + LhsProgress = 1, + RhsProgress = 1 + }; +}; + + +// Mat-Mat product of an unsigned 8bit lhs with a signed 8bit rhs +template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> +struct gebp_kernel<QUInt8, QInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +{ + EIGEN_DONT_INLINE + void operator()(const DataMapper& res, const QUInt8* blockA, const QInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> +EIGEN_DONT_INLINE +void gebp_kernel<QUInt8, QInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +::operator()(const DataMapper& res, const QUInt8* blockA, const QInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) +{ + EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + + eigen_assert(alpha.value == 1); + eigen_assert(strideA == -1); + eigen_assert(strideB == -1); + eigen_assert(offsetA == 0); + eigen_assert(offsetB == 0); + + eigen_assert(rows > 0); + eigen_assert(cols > 0); + eigen_assert(depth > 0); + eigen_assert(blockA); + eigen_assert(blockB); + + for (Index j = 0; j < cols; ++j) { + Index startB = j * depth; + + for (Index i = 0; i < rows; ++i) { + Index startA = i * depth; + + for (Index k = 0; k < depth; ++k) { + res(i, j) += blockA[startA + k] * blockB[startB + k]; + } + } + } +} +#endif + +} // namespace internal +} // namespace Eigen + + + +#endif // EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h new file mode 100644 index 0000000000..d561b79fbd --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h @@ -0,0 +1,1743 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> +// Copyright (C) 2015 Matthew Sarett <msarett@google.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_AVX2_H +#define EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_AVX2_H + +namespace Eigen { +namespace internal { + +// AVX2 optimized implementation of Mat-Mat product. 
+// LHS is encoded using signed 8-bit integers. +// RHS is encoded using unsigned 8-bit integers. +#ifdef EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT + +// Define quantized traits +template<bool _ConjLhs, bool _ConjRhs> +class gebp_traits<QInt8, QUInt8, _ConjLhs, _ConjRhs> +{ +public: + typedef QInt8 LhsScalar; + typedef QUInt8 RhsScalar; + typedef QInt32 ResScalar; + + enum { + // Define register blocking scheme. + nr = 32, + mr = 32, + kr = 8, + // Ignore progress tracking per loop iteration. + LhsProgress = -1, + RhsProgress = -1 + }; +}; + +// Specialized blocking for quantized implementations. +// Used by TensorContractionThreadPool, inputs must have dimensions that are +// multiples of 32. +template<int KcFactor, typename Index> +struct ComputeGemmByColBlockingSizes<QInt8, QUInt8, KcFactor, Index> { + void operator()(Index& k, Index& m, Index& n, Index num_threads) + { + eigen_assert(m % 32 == 0); + eigen_assert(n % 32 == 0); + eigen_assert(k % 32 == 0); + if (!k || !m || !n) { + return; + } + n = (((n / num_threads) + 31) / 32) * 32; + } +}; + +// Specialized blocking for quantized implementations. +// Used by TensorContractionThreadPool, inputs must have dimensions that are +// multiples of 32. +template<int KcFactor, typename Index> +struct ComputeGemmByRowBlockingSizes<QInt8, QUInt8, KcFactor, Index> { + void operator()(Index& k, Index& m, Index& n, Index num_threads) + { + eigen_assert(m % 32 == 0); + eigen_assert(n % 32 == 0 || n == 1); + eigen_assert(k % 32 == 0); + if (!k || !m || !n) { + return; + } + // Special case to avoid breaking the unimplemented matrix-vector case + if (n == 1) { + n = 32; + } + m = (((m / num_threads) + 31) / 32) * 32; + } +}; + +// Specialized blocking for quantized implementations. +// Used by TensorContraction and GeneralMatrixMatrix, inputs are padded to +// multiples of 32. 
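// Editorial example (not part of the patch) of the rounding used by the
// blocking helpers above and by gemm_blocking_space below:
// ((x + 31) / 32) * 32 rounds x up to the next multiple of 32, so e.g.
//   rows = 100                        ->  m_mc = ((100 + 31) / 32) * 32 = 128
//   n = 256 columns, num_threads = 3  ->  (((256 / 3) + 31) / 32) * 32 = 96
// i.e. each thread is handed a 96-column panel, and the 256 columns are covered
// by three such panels, the last of which is only partially used.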
+template <int MaxRows, int MaxCols, int MaxDepth, int KcFactor> +class gemm_blocking_space<ColMajor, QInt8, QInt8, MaxRows, MaxCols, MaxDepth, + KcFactor, false> + : public level3_blocking<QInt8, QInt8> { + DenseIndex m_sizeA; + DenseIndex m_sizeB; + + public: + gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth, + DenseIndex /*num_threads*/, bool /*l3_blocking*/) { + this->m_mc = ((rows + 31) / 32) * 32; + this->m_nc = ((cols + 31) / 32) * 32; + this->m_kc = ((depth + 31) / 32) * 32; + m_sizeA = this->m_mc * this->m_kc; + m_sizeB = this->m_kc * this->m_nc; + } + void allocateA() { + if (this->m_blockA == 0) this->m_blockA = aligned_new<QInt8>(m_sizeA); + } + void allocateB() { + if (this->m_blockB == 0) this->m_blockB = aligned_new<QInt8>(m_sizeB); + } + void allocateAll() { + allocateA(); + allocateB(); + } + ~gemm_blocking_space() { + aligned_delete(this->m_blockA, m_sizeA); + aligned_delete(this->m_blockB, m_sizeB); + } +}; + + +template <int MaxRows, int MaxCols, int MaxDepth, int KcFactor> +class gemm_blocking_space<ColMajor, QInt8, QUInt8, MaxRows, MaxCols, MaxDepth, + KcFactor, false> + : public level3_blocking<QInt8, QUInt8> { + DenseIndex m_sizeA; + DenseIndex m_sizeB; + + public: + gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth, + DenseIndex /*num_threads*/, bool /*l3_blocking*/) { + this->m_mc = ((rows + 31) / 32) * 32; + this->m_nc = ((cols + 31) / 32) * 32; + this->m_kc = ((depth + 31) / 32) * 32; + m_sizeA = this->m_mc * this->m_kc; + m_sizeB = this->m_kc * this->m_nc; + } + void allocateA() { + if (this->m_blockA == 0) this->m_blockA = aligned_new<QInt8>(m_sizeA); + } + void allocateB() { + if (this->m_blockB == 0) this->m_blockB = aligned_new<QUInt8>(m_sizeB); + } + void allocateAll() { + allocateA(); + allocateB(); + } + ~gemm_blocking_space() { + aligned_delete(this->m_blockA, m_sizeA); + aligned_delete(this->m_blockB, m_sizeB); + } +}; + +// Alternate templates for any input sizes +template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false> +struct gemm_pack_lhs_any; +template <typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode> +struct gemm_pack_lhs_any<QInt8, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode> { + EIGEN_DONT_INLINE void operator() + (QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0, Index offset = 0); +}; + +template<typename Scalar, typename Index, typename DataMapper, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false> +struct gemm_pack_rhs_any; +template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode> +struct gemm_pack_rhs_any<QUInt8, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> { + EIGEN_DONT_INLINE void operator() + (QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0, Index offset = 0); +}; + +template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false> +struct gebp_kernel_any; +template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> +struct gebp_kernel_any<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +{ + typedef typename DataMapper::LinearMapper LinearMapper; + + EIGEN_DONT_INLINE + void operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, + Index rows, Index 
depth, Index cols, QInt32 alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +// Alternate implementations for any input sizes +template <typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode> +EIGEN_DONT_INLINE void gemm_pack_lhs_any<QInt8, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>:: +operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { + eigen_assert(stride == 0); + eigen_assert(offset == 0); + + // Get vector pointer + __m256i* blockA_256 = reinterpret_cast<__m256i*>(blockA); + + // Get even multiples of the dimensions + Index rows_32 = (rows / 32) * 32; + Index depth_8 = (depth / 8) * 8; + + // Get padding for when depth is not a multiple of 32 + int padding = 0; + if (depth % 32 != 0) { + int depth_32 = (depth / 32) * 32; + int extra_depth = depth - depth_32; + int extra_depth_8 = ((extra_depth + 7) / 8) * 8; + padding = 32 - extra_depth_8; + } + + // Pack rows in sets of 32 + for (Index m = 0; m < rows_32; m += 32) { + // Pack depth in sets of 8 + for (Index k = 0; k < depth_8; k += 8) { + // Load vectors + __m256i L_A = lhs.loadPacket(m, k); + __m256i L_B = lhs.loadPacket(m, k + 1); + + // Interleave 8-bit elements + __m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B); + __m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B); + + __m256i L_C = lhs.loadPacket(m, k + 2); + __m256i L_D = lhs.loadPacket(m, k + 3); + __m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D); + __m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D); + + // Interleave 16-bit elements + __m256i L_AD0_AD16 = _mm256_unpacklo_epi16(L_AB0_AB16, L_CD0_CD16); + __m256i L_AD4_AD20 = _mm256_unpackhi_epi16(L_AB0_AB16, L_CD0_CD16); + + // Use permute before we store to cross 128-bit lanes + __m256i L_AD0 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x20); + _mm256_store_si256(blockA_256++, L_AD0); + + // Complete packing for 32 x 8 block + __m256i L_AD16 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x31); + __m256i L_AD8_AD24 = _mm256_unpacklo_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD12_AD28 = _mm256_unpackhi_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD8 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x20); + _mm256_store_si256(blockA_256++, L_AD8); + _mm256_store_si256(blockA_256++, L_AD16); + __m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31); + _mm256_store_si256(blockA_256++, L_AD24); + __m256i L_E = lhs.loadPacket(m, k + 4); + __m256i L_F = lhs.loadPacket(m, k + 5); + __m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F); + __m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F); + __m256i L_G = lhs.loadPacket(m, k + 6); + __m256i L_H = lhs.loadPacket(m, k + 7); + __m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H); + __m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H); + __m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH4_EH20 = _mm256_unpackhi_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH0 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x20); + _mm256_store_si256(blockA_256++, L_EH0); + __m256i L_EH16 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x31); + __m256i L_EH8_EH24 = _mm256_unpacklo_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH12_EH28 = _mm256_unpackhi_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH8 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x20); + _mm256_store_si256(blockA_256++, L_EH8); + _mm256_store_si256(blockA_256++, L_EH16); + __m256i L_EH24 = 
_mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x31); + _mm256_store_si256(blockA_256++, L_EH24); + } + + // Finish the k dimension, padding with zeros + if (depth_8 < depth) { + __m256i L_A, L_B, L_C, L_D, L_E, L_F, L_G, L_H; + switch (depth - depth_8) { + case 1: + L_A = lhs.loadPacket(m, depth_8); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + break; + case 2: + L_A = lhs.loadPacket(m, depth_8); + L_B = lhs.loadPacket(m, depth_8 + 1); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + break; + case 3: + L_A = lhs.loadPacket(m, depth_8); + L_B = lhs.loadPacket(m, depth_8 + 1); + L_C = lhs.loadPacket(m, depth_8 + 2); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + break; + case 4: + L_A = lhs.loadPacket(m, depth_8); + L_B = lhs.loadPacket(m, depth_8 + 1); + L_C = lhs.loadPacket(m, depth_8 + 2); + L_D = lhs.loadPacket(m, depth_8 + 3); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + break; + case 5: + L_A = lhs.loadPacket(m, depth_8); + L_B = lhs.loadPacket(m, depth_8 + 1); + L_C = lhs.loadPacket(m, depth_8 + 2); + L_D = lhs.loadPacket(m, depth_8 + 3); + L_E = lhs.loadPacket(m, depth_8 + 4); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + break; + case 6: + L_A = lhs.loadPacket(m, depth_8); + L_B = lhs.loadPacket(m, depth_8 + 1); + L_C = lhs.loadPacket(m, depth_8 + 2); + L_D = lhs.loadPacket(m, depth_8 + 3); + L_E = lhs.loadPacket(m, depth_8 + 4); + L_F = lhs.loadPacket(m, depth_8 + 5); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + break; + case 7: + L_A = lhs.loadPacket(m, depth_8); + L_B = lhs.loadPacket(m, depth_8 + 1); + L_C = lhs.loadPacket(m, depth_8 + 2); + L_D = lhs.loadPacket(m, depth_8 + 3); + L_E = lhs.loadPacket(m, depth_8 + 4); + L_F = lhs.loadPacket(m, depth_8 + 5); + L_G = lhs.loadPacket(m, depth_8 + 6); + L_H = _mm256_setzero_si256(); + break; + } + + // Interleave 8-bit elements + __m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B); + __m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B); + + __m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D); + __m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D); + + // Interleave 16-bit elements + __m256i L_AD0_AD16 = _mm256_unpacklo_epi16(L_AB0_AB16, L_CD0_CD16); + __m256i L_AD4_AD20 = _mm256_unpackhi_epi16(L_AB0_AB16, L_CD0_CD16); + + // Use permute before we store to cross 128-bit lanes + __m256i L_AD0 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x20); + _mm256_store_si256(blockA_256++, L_AD0); + + // Complete packing + __m256i L_AD16 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x31); + __m256i L_AD8_AD24 = _mm256_unpacklo_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD12_AD28 = _mm256_unpackhi_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD8 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x20); + _mm256_store_si256(blockA_256++, L_AD8); + _mm256_store_si256(blockA_256++, L_AD16); + __m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31); + _mm256_store_si256(blockA_256++, L_AD24); + __m256i L_EF0_EF16 = 
_mm256_unpacklo_epi8(L_E, L_F); + __m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F); + __m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H); + __m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H); + __m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH4_EH20 = _mm256_unpackhi_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH0 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x20); + _mm256_store_si256(blockA_256++, L_EH0); + __m256i L_EH16 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x31); + __m256i L_EH8_EH24 = _mm256_unpacklo_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH12_EH28 = _mm256_unpackhi_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH8 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x20); + _mm256_store_si256(blockA_256++, L_EH8); + _mm256_store_si256(blockA_256++, L_EH16); + __m256i L_EH24 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x31); + _mm256_store_si256(blockA_256++, L_EH24); + } + blockA_256 += padding; + } + + // Finish the m dimension, padding with zeros + if (rows_32 < rows) { + // Pack depth in sets of 8 + for (Index k = 0; k < depth_8; k += 8) { + // Load vectors + __m256i L_A = _mm256_setzero_si256(); + __m256i L_B = _mm256_setzero_si256(); + __m256i L_C = _mm256_setzero_si256(); + __m256i L_D = _mm256_setzero_si256(); + __m256i L_E = _mm256_setzero_si256(); + __m256i L_F = _mm256_setzero_si256(); + __m256i L_G = _mm256_setzero_si256(); + __m256i L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + QInt8* ptr = (QInt8*) &L_A; + ptr[m] = lhs(rows_32 + m, k); + ptr = (QInt8*) &L_B; + ptr[m] = lhs(rows_32 + m, k + 1); + ptr = (QInt8*) &L_C; + ptr[m] = lhs(rows_32 + m, k + 2); + ptr = (QInt8*) &L_D; + ptr[m] = lhs(rows_32 + m, k + 3); + ptr = (QInt8*) &L_E; + ptr[m] = lhs(rows_32 + m, k + 4); + ptr = (QInt8*) &L_F; + ptr[m] = lhs(rows_32 + m, k + 5); + ptr = (QInt8*) &L_G; + ptr[m] = lhs(rows_32 + m, k + 6); + ptr = (QInt8*) &L_H; + ptr[m] = lhs(rows_32 + m, k + 7); + } + + // Interleave 8-bit elements + __m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B); + __m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B); + __m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D); + __m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D); + + // Interleave 16-bit elements + __m256i L_AD0_AD16 = _mm256_unpacklo_epi16(L_AB0_AB16, L_CD0_CD16); + __m256i L_AD4_AD20 = _mm256_unpackhi_epi16(L_AB0_AB16, L_CD0_CD16); + + // Use permute before we store to cross 128-bit lanes + __m256i L_AD0 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x20); + _mm256_store_si256(blockA_256++, L_AD0); + + // Complete packing for 32 x 8 block + __m256i L_AD16 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x31); + __m256i L_AD8_AD24 = _mm256_unpacklo_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD12_AD28 = _mm256_unpackhi_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD8 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x20); + _mm256_store_si256(blockA_256++, L_AD8); + _mm256_store_si256(blockA_256++, L_AD16); + __m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31); + _mm256_store_si256(blockA_256++, L_AD24); + __m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F); + __m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F); + __m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H); + __m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H); + __m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH4_EH20 = _mm256_unpackhi_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH0 = 
_mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x20); + _mm256_store_si256(blockA_256++, L_EH0); + __m256i L_EH16 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x31); + __m256i L_EH8_EH24 = _mm256_unpacklo_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH12_EH28 = _mm256_unpackhi_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH8 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x20); + _mm256_store_si256(blockA_256++, L_EH8); + _mm256_store_si256(blockA_256++, L_EH16); + __m256i L_EH24 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x31); + _mm256_store_si256(blockA_256++, L_EH24); + } + + // Finish the k dimension, padding with zeros + if (depth_8 < depth) { + __m256i L_A, L_B, L_C, L_D, L_E, L_F, L_G, L_H; + QInt8* ptr; + switch (depth - depth_8) { + case 1: + L_A = _mm256_setzero_si256(); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + QInt8* ptr = (QInt8*) &L_A; + ptr[m] = lhs(rows_32 + m, depth_8); + } + break; + case 2: + L_A = _mm256_setzero_si256(); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + ptr = (QInt8*) &L_A; + ptr[m] = lhs(rows_32 + m, depth_8); + ptr = (QInt8*) &L_B; + ptr[m] = lhs(rows_32 + m, depth_8 + 1); + } + break; + case 3: + L_A = _mm256_setzero_si256(); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + ptr = (QInt8*) &L_A; + ptr[m] = lhs(rows_32 + m, depth_8); + ptr = (QInt8*) &L_B; + ptr[m] = lhs(rows_32 + m, depth_8 + 1); + ptr = (QInt8*) &L_C; + ptr[m] = lhs(rows_32 + m, depth_8 + 2); + } + break; + case 4: + L_A = _mm256_setzero_si256(); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + ptr = (QInt8*) &L_A; + ptr[m] = lhs(rows_32 + m, depth_8); + ptr = (QInt8*) &L_B; + ptr[m] = lhs(rows_32 + m, depth_8 + 1); + ptr = (QInt8*) &L_C; + ptr[m] = lhs(rows_32 + m, depth_8 + 2); + ptr = (QInt8*) &L_D; + ptr[m] = lhs(rows_32 + m, depth_8 + 3); + } + break; + case 5: + L_A = _mm256_setzero_si256(); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + ptr = (QInt8*) &L_A; + ptr[m] = lhs(rows_32 + m, depth_8); + ptr = (QInt8*) &L_B; + ptr[m] = lhs(rows_32 + m, depth_8 + 1); + ptr = (QInt8*) &L_C; + ptr[m] = lhs(rows_32 + m, depth_8 + 2); + ptr = (QInt8*) &L_D; + ptr[m] = lhs(rows_32 + m, depth_8 + 3); + ptr = (QInt8*) &L_E; + ptr[m] = lhs(rows_32 + m, depth_8 + 4); + } + break; + case 6: + L_A = _mm256_setzero_si256(); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = 
_mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + ptr = (QInt8*) &L_A; + ptr[m] = lhs(rows_32 + m, depth_8); + ptr = (QInt8*) &L_B; + ptr[m] = lhs(rows_32 + m, depth_8 + 1); + ptr = (QInt8*) &L_C; + ptr[m] = lhs(rows_32 + m, depth_8 + 2); + ptr = (QInt8*) &L_D; + ptr[m] = lhs(rows_32 + m, depth_8 + 3); + ptr = (QInt8*) &L_E; + ptr[m] = lhs(rows_32 + m, depth_8 + 4); + ptr = (QInt8*) &L_F; + ptr[m] = lhs(rows_32 + m, depth_8 + 5); + } + break; + case 7: + L_A = _mm256_setzero_si256(); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + ptr = (QInt8*) &L_A; + ptr[m] = lhs(rows_32 + m, depth_8); + ptr = (QInt8*) &L_B; + ptr[m] = lhs(rows_32 + m, depth_8 + 1); + ptr = (QInt8*) &L_C; + ptr[m] = lhs(rows_32 + m, depth_8 + 2); + ptr = (QInt8*) &L_D; + ptr[m] = lhs(rows_32 + m, depth_8 + 3); + ptr = (QInt8*) &L_E; + ptr[m] = lhs(rows_32 + m, depth_8 + 4); + ptr = (QInt8*) &L_F; + ptr[m] = lhs(rows_32 + m, depth_8 + 5); + ptr = (QInt8*) &L_G; + ptr[m] = lhs(rows_32 + m, depth_8 + 6); + } + break; + } + + // Interleave 8-bit elements + __m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B); + __m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B); + __m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D); + __m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D); + + // Interleave 16-bit elements + __m256i L_AD0_AD16 = _mm256_unpacklo_epi16(L_AB0_AB16, L_CD0_CD16); + __m256i L_AD4_AD20 = _mm256_unpackhi_epi16(L_AB0_AB16, L_CD0_CD16); + + // Use permute before we store to cross 128-bit lanes + __m256i L_AD0 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x20); + _mm256_store_si256(blockA_256++, L_AD0); + + // Complete packing + __m256i L_AD16 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x31); + __m256i L_AD8_AD24 = _mm256_unpacklo_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD12_AD28 = _mm256_unpackhi_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD8 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x20); + _mm256_store_si256(blockA_256++, L_AD8); + _mm256_store_si256(blockA_256++, L_AD16); + __m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31); + _mm256_store_si256(blockA_256++, L_AD24); + __m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F); + __m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F); + __m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H); + __m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H); + __m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH4_EH20 = _mm256_unpackhi_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH0 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x20); + _mm256_store_si256(blockA_256++, L_EH0); + __m256i L_EH16 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x31); + __m256i L_EH8_EH24 = _mm256_unpacklo_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH12_EH28 = _mm256_unpackhi_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH8 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x20); + _mm256_store_si256(blockA_256++, L_EH8); + _mm256_store_si256(blockA_256++, L_EH16); + __m256i L_EH24 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x31); + _mm256_store_si256(blockA_256++, L_EH24); + } + } +} + +template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode> +EIGEN_DONT_INLINE void 
gemm_pack_rhs_any<QUInt8, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>:: +operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { + eigen_assert(stride == 0); + eigen_assert(offset == 0); + + // Get vector pointer + __m256i* blockB_256 = reinterpret_cast<__m256i*>(blockB); + + // Get even multiples of the dimensions + Index cols_32 = (cols / 32) * 32; + Index depth_32 = (depth / 32) * 32; + + // Perform a step of the packing for 4 columns + __m256i R_AB_L, R_AB_H, R_CD_L, R_CD_H, R_AD_0, R_AD_8, R_AD_16, R_AD_24; +#define PACK_STEP \ + R_AB_L = _mm256_unpacklo_epi64(R_A, R_B); \ + R_CD_L = _mm256_unpacklo_epi64(R_C, R_D); \ + R_AB_H = _mm256_unpackhi_epi64(R_A, R_B); \ + R_CD_H = _mm256_unpackhi_epi64(R_C, R_D); \ + R_AD_0 = _mm256_permute2x128_si256(R_AB_L, R_CD_L, 0x20); \ + R_AD_16 = _mm256_permute2x128_si256(R_AB_L, R_CD_L, 0x31); \ + R_AD_8 = _mm256_permute2x128_si256(R_AB_H, R_CD_H, 0x20); \ + R_AD_24 = _mm256_permute2x128_si256(R_AB_H, R_CD_H, 0x31); \ + _mm256_store_si256(blockB_256, R_AD_0); \ + _mm256_store_si256(blockB_256 + 8, R_AD_8); \ + _mm256_store_si256(blockB_256 + 16, R_AD_16); \ + _mm256_store_si256(blockB_256 + 24, R_AD_24); \ + blockB_256++; + + // Pack cols in sets of 32 + for (Index n = 0; n < cols_32; n += 32) { + // Pack depth in sets of 32 + for (Index k = 0; k < depth_32; k += 32) { + __m256i R_A = rhs.loadPacket(k, n); + __m256i R_B = rhs.loadPacket(k, n + 1); + __m256i R_C = rhs.loadPacket(k, n + 2); + __m256i R_D = rhs.loadPacket(k, n + 3); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 4); + R_B = rhs.loadPacket(k, n + 5); + R_C = rhs.loadPacket(k, n + 6); + R_D = rhs.loadPacket(k, n + 7); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 8); + R_B = rhs.loadPacket(k, n + 9); + R_C = rhs.loadPacket(k, n + 10); + R_D = rhs.loadPacket(k, n + 11); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 12); + R_B = rhs.loadPacket(k, n + 13); + R_C = rhs.loadPacket(k, n + 14); + R_D = rhs.loadPacket(k, n + 15); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 16); + R_B = rhs.loadPacket(k, n + 17); + R_C = rhs.loadPacket(k, n + 18); + R_D = rhs.loadPacket(k, n + 19); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 20); + R_B = rhs.loadPacket(k, n + 21); + R_C = rhs.loadPacket(k, n + 22); + R_D = rhs.loadPacket(k, n + 23); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 24); + R_B = rhs.loadPacket(k, n + 25); + R_C = rhs.loadPacket(k, n + 26); + R_D = rhs.loadPacket(k, n + 27); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 28); + R_B = rhs.loadPacket(k, n + 29); + R_C = rhs.loadPacket(k, n + 30); + R_D = rhs.loadPacket(k, n + 31); + PACK_STEP; + + blockB_256 += 24; + } + + if (depth_32 < depth) { + QUInt8* ptr; + __m256i R_A = _mm256_setzero_si256(); + __m256i R_B = _mm256_setzero_si256(); + __m256i R_C = _mm256_setzero_si256(); + __m256i R_D = _mm256_setzero_si256(); + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 1); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 2); + ptr = (QUInt8*) &R_D; + ptr[k - depth_32] = rhs(k, n + 3); + } + PACK_STEP; + + R_A = _mm256_setzero_si256(); + R_B = _mm256_setzero_si256(); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n + 4); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 5); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 6); + ptr = (QUInt8*) 
&R_D; + ptr[k - depth_32] = rhs(k, n + 7); + } + PACK_STEP; + + R_A = _mm256_setzero_si256(); + R_B = _mm256_setzero_si256(); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n + 8); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 9); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 10); + ptr = (QUInt8*) &R_D; + ptr[k - depth_32] = rhs(k, n + 11); + } + PACK_STEP; + + R_A = _mm256_setzero_si256(); + R_B = _mm256_setzero_si256(); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n + 12); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 13); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 14); + ptr = (QUInt8*) &R_D; + ptr[k - depth_32] = rhs(k, n + 15); + } + PACK_STEP; + + R_A = _mm256_setzero_si256(); + R_B = _mm256_setzero_si256(); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n + 16); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 17); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 18); + ptr = (QUInt8*) &R_D; + ptr[k - depth_32] = rhs(k, n + 19); + } + PACK_STEP; + + R_A = _mm256_setzero_si256(); + R_B = _mm256_setzero_si256(); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n + 20); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 21); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 22); + ptr = (QUInt8*) &R_D; + ptr[k - depth_32] = rhs(k, n + 23); + } + PACK_STEP; + + R_A = _mm256_setzero_si256(); + R_B = _mm256_setzero_si256(); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n + 24); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 25); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 26); + ptr = (QUInt8*) &R_D; + ptr[k - depth_32] = rhs(k, n + 27); + } + PACK_STEP; + + R_A = _mm256_setzero_si256(); + R_B = _mm256_setzero_si256(); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n + 28); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 29); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 30); + ptr = (QUInt8*) &R_D; + ptr[k - depth_32] = rhs(k, n + 31); + } + PACK_STEP; + blockB_256 += 24; + } + } + + // Finish packing cols + if (cols_32 < cols) { + // Pack depth in sets of 32 + for (Index k = 0; k < depth_32; k += 32) { + __m256i R_A, R_B, R_C, R_D; + Index n; + for (n = cols_32; n < cols; n += 4) { + switch (cols - n) { + case 1: + R_A = rhs.loadPacket(k, n); + R_B = _mm256_setzero_si256(); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + PACK_STEP; + break; + case 2: + R_A = rhs.loadPacket(k, n); + R_B = rhs.loadPacket(k, n + 1); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + PACK_STEP; + break; + case 3: + R_A = rhs.loadPacket(k, n); + R_B = rhs.loadPacket(k, n + 1); + R_C = rhs.loadPacket(k, n + 2); + R_D = _mm256_setzero_si256(); + PACK_STEP; + break; + default: + R_A = rhs.loadPacket(k, n); + R_B = rhs.loadPacket(k, n + 1); + R_C = rhs.loadPacket(k, n + 2); + R_D = 
rhs.loadPacket(k, n + 3); + PACK_STEP; + break; + } + } + + // Increment the block pointer. + // We must pad if cols is not a multiple of 32. + blockB_256 += 32 - (n - cols_32) / 4; + } + + if (depth_32 < depth) { + for (Index n = cols_32; n < cols; n += 4) { + QUInt8* ptr; + __m256i R_A = _mm256_setzero_si256(); + __m256i R_B = _mm256_setzero_si256(); + __m256i R_C = _mm256_setzero_si256(); + __m256i R_D = _mm256_setzero_si256(); + switch (cols - n) { + case 1: + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n); + } + PACK_STEP; + break; + case 2: + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 1); + } + PACK_STEP; + break; + case 3: + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 1); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 2); + } + PACK_STEP; + break; + default: + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 1); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 2); + ptr = (QUInt8*) &R_D; + ptr[k - depth_32] = rhs(k, n + 3); + } + PACK_STEP; + break; + } + } + } + } +#undef PACK_STEP +} + +template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> +EIGEN_DONT_INLINE +void gebp_kernel_any<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +::operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) +{ + EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(alpha.value == 1); + eigen_assert(strideA == -1); + eigen_assert(strideB == -1); + eigen_assert(offsetA == 0); + eigen_assert(offsetB == 0); + eigen_assert(rows > 0); + eigen_assert(cols > 0); + eigen_assert(depth > 0); + eigen_assert(blockA); + eigen_assert(blockB); + + Index rows_32 = ((rows + 31) / 32) * 32; + Index cols_32 = ((cols + 31) / 32) * 32; + Index depth_32 = ((depth + 31) / 32) * 32; + + // Create result block + ei_declare_aligned_stack_constructed_variable(QInt32, blockO, 32 * 32, 0); + memset(blockO, 0, 32 * 32 * sizeof(QInt32)); + + // Get vectorized pointers + __m256i* blockO_256 = reinterpret_cast<__m256i*>(blockO); + const __m256i* blockA_256 = reinterpret_cast<const __m256i*>(blockA); + const __m256i* blockB_256 = reinterpret_cast<const __m256i*>(blockB); + + // Loop over blocks of 32 columns + for (Index n = 0; n < cols_32; n += 32) { + // Reset index into blockA + Index indexL = 0; + // Loop over blocks of 32 rows + for (Index m = 0; m < rows_32; m += 32) { + // Reset index into blockB + Index indexR = n / 32 * depth_32; + // Loop over blocks of 8 on depth + for (Index k = 0; k < depth_32; k += 8) { + // Load inputs + __m256i L_AD0 = blockA_256[indexL++]; + __m256i L_AD8 = blockA_256[indexL++]; + __m256i L_AD16 = blockA_256[indexL++]; + __m256i L_AD24 = blockA_256[indexL++]; + __m256i L_EH0 = blockA_256[indexL++]; + __m256i L_EH8 = blockA_256[indexL++]; + __m256i L_EH16 = blockA_256[indexL++]; + __m256i L_EH24 = blockA_256[indexL++]; + __m256i R_AH0 = blockB_256[indexR++]; + __m256i R_AH4 = blockB_256[indexR++]; + __m256i 
R_AH8 = blockB_256[indexR++]; + __m256i R_AH12 = blockB_256[indexR++]; + __m256i R_AH16 = blockB_256[indexR++]; + __m256i R_AH20 = blockB_256[indexR++]; + __m256i R_AH24 = blockB_256[indexR++]; + __m256i R_AH28 = blockB_256[indexR++]; + + // This constant is used with madd to convert 16 bit to 32 bit + const __m256i ONE = _mm256_set1_epi32(0x00010001); + + // Declare variables used in COMPUTE_STEP + __m256i P_16_A, P_16_B, P_32_A, P_32_B, P_32; + +#define COMPUTE_STEP(R_INPUT_A, R_INPUT_B, OFFSET) \ + P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD0); \ + P_32_A = _mm256_madd_epi16(P_16_A, ONE); \ + P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH0); \ + P_32_B = _mm256_madd_epi16(P_16_B, ONE); \ + P_32 = _mm256_add_epi32(P_32_A, P_32_B); \ + _mm256_store_si256( \ + blockO_256 + 4 * OFFSET, \ + _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET), P_32)); \ + \ + P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD8); \ + P_32_A = _mm256_madd_epi16(P_16_A, ONE); \ + P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH8); \ + P_32_B = _mm256_madd_epi16(P_16_B, ONE); \ + P_32 = _mm256_add_epi32(P_32_A, P_32_B); \ + _mm256_store_si256( \ + blockO_256 + 4 * OFFSET + 1, \ + _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 1), P_32)); \ + \ + P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD16); \ + P_32_A = _mm256_madd_epi16(P_16_A, ONE); \ + P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH16); \ + P_32_B = _mm256_madd_epi16(P_16_B, ONE); \ + P_32 = _mm256_add_epi32(P_32_A, P_32_B); \ + _mm256_store_si256( \ + blockO_256 + 4 * OFFSET + 2, \ + _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 2), P_32)); \ + \ + P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD24); \ + P_32_A = _mm256_madd_epi16(P_16_A, ONE); \ + P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH24); \ + P_32_B = _mm256_madd_epi16(P_16_B, ONE); \ + P_32 = _mm256_add_epi32(P_32_A, P_32_B); \ + _mm256_store_si256( \ + blockO_256 + 4 * OFFSET + 3, \ + _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 3), P_32)); + + // Permute and shuffle to copy a single value across the entire vector + // Then compute the multiplication + __m256i R_AH0_ = _mm256_permute2x128_si256(R_AH0, R_AH0, 0x00); + __m256i R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + __m256i R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 0); + __m256i R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + __m256i R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 1); + R_AH0_ = _mm256_permute2x128_si256(R_AH0, R_AH0, 0x11); + __m256i R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + __m256i R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 2); + __m256i R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + __m256i R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 3); + + R_AH0_ = _mm256_permute2x128_si256(R_AH4, R_AH4, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 4); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 5); + R_AH0_ = _mm256_permute2x128_si256(R_AH4, R_AH4, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 6); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 7); + + R_AH0_ = _mm256_permute2x128_si256(R_AH8, R_AH8, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = 
_mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 8); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 9); + R_AH0_ = _mm256_permute2x128_si256(R_AH8, R_AH8, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 10); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 11); + + R_AH0_ = _mm256_permute2x128_si256(R_AH12, R_AH12, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 12); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 13); + R_AH0_ = _mm256_permute2x128_si256(R_AH12, R_AH12, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 14); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 15); + + R_AH0_ = _mm256_permute2x128_si256(R_AH16, R_AH16, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 16); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 17); + R_AH0_ = _mm256_permute2x128_si256(R_AH16, R_AH16, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 18); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 19); + + R_AH0_ = _mm256_permute2x128_si256(R_AH20, R_AH20, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 20); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 21); + R_AH0_ = _mm256_permute2x128_si256(R_AH20, R_AH20, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 22); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 23); + + R_AH0_ = _mm256_permute2x128_si256(R_AH24, R_AH24, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 24); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 25); + R_AH0_ = _mm256_permute2x128_si256(R_AH24, R_AH24, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 26); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 27); + + R_AH0_ = _mm256_permute2x128_si256(R_AH28, R_AH28, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 28); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 29); + R_AH0_ = _mm256_permute2x128_si256(R_AH28, R_AH28, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 30); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); 
+ R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 31); + +#undef COMPUTE_STEP + } + + // Transfer the results to the result matrix. + if (m + 32 <= rows && n + 32 <= cols) { + Index i = 0; + for (Index j = n; j < n + 32; j++) { + LinearMapper r0 = res.getLinearMapper(m, j); + LinearMapper r1 = res.getLinearMapper(m + 8, j); + LinearMapper r2 = res.getLinearMapper(m + 16, j); + LinearMapper r3 = res.getLinearMapper(m + 24, j); + r0.storePacket( + 0, _mm256_add_epi32(blockO_256[i++], r0.loadPacket(0))); + r1.storePacket( + 0, _mm256_add_epi32(blockO_256[i++], r1.loadPacket(0))); + r2.storePacket( + 0, _mm256_add_epi32(blockO_256[i++], r2.loadPacket(0))); + r3.storePacket( + 0, _mm256_add_epi32(blockO_256[i++], r3.loadPacket(0))); + } + } + else { + for (Index j = n; j < cols; j++) { + for (Index i = m; i < rows; i++) { + res(i, j) = blockO[(j - n) * 32 + (i - m)]; + } + } + } + + // Zero the result block so it can be reused + memset(blockO, 0, 32 * 32 * sizeof(QInt32)); + } + } +} + +// Below are the fully optimized versions that are correct only for sizes that +// are multiple of 32. It is about a 10% performance benefit to keep these +// implementations separate. + +// Arrange a block of the left input matrix in contiguous memory. +// +// Given column major input (A0 beside A1 in memory): +// A0 B0 C0 D0 E0 F0 G0 H0 ... +// A1 B1 C1 D1 E1 F1 G1 H1 ... +// A2 B2 C2 D2 E2 F2 G2 H2 ... +// A3 B3 C3 D3 E3 F3 G3 H3 ... +// A4 B4 C4 D4 E4 F4 G4 H4 ... +// A5 B5 C5 D5 E5 F5 G5 H5 ... +// A6 B6 C6 D6 E6 F6 G6 H6 ... +// A7 B7 C7 D7 E7 F7 G7 H7 ... +// A8 ... +// ... +// +// Packing yields output (A0 beside B0 in memory): +// A0 B0 C0 D0 +// A1 B1 C1 D1 +// A2 B2 C2 D2 +// A3 B3 C3 D3 +// A4 B4 C4 D4 +// A5 B5 C5 D5 +// A6 B6 C6 D6 +// A7 B7 C7 D7 +// ... +// A31 B31 C31 D31 +// E0 F0 G0 H0 +// E1 F1 G1 H1 +// E2 F2 G2 H2 +// E3 F3 G3 H3 +// E4 F4 G4 H4 +// E5 F5 G5 H5 +// E6 F6 G6 H6 +// E7 F7 G7 H7 +// ... +// +// Four elements of the same row are arranged contiguously because maddubs and +// madd both perform an adjacent addition in the kernel. 
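+//
+// To see why groups of four depth values are kept adjacent, note that the
+// COMPUTE_STEP macro in the kernels above reduces four u8*s8 products into a
+// single 32-bit accumulator lane with the maddubs/madd pair. A rough scalar
+// model of one such lane (a hypothetical helper for illustration only, with
+// QInt8/QUInt8 treated as plain int8_t/uint8_t and the 16-bit saturation of
+// the real _mm256_maddubs_epi16 ignored) is:
+//
+//   int32_t dot4(const uint8_t rhs[4], const int8_t lhs[4]) {
+//     int32_t p01 = rhs[0] * lhs[0] + rhs[1] * lhs[1];  // maddubs: pair 0/1
+//     int32_t p23 = rhs[2] * lhs[2] + rhs[3] * lhs[3];  // maddubs: pair 2/3
+//     return p01 + p23;  // madd with the 0x00010001 constant sums the pairs
+//   }
+//
+// The packing below therefore lays out the lhs so that each 4-byte group
+// holds one row at four consecutive depth indices.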
+template <typename Index, typename DataMapper, int Pack1, int Pack2, + bool Conjugate, bool PanelMode> +struct gemm_pack_lhs<QInt8, Index, DataMapper, Pack1, Pack2, ColMajor, + Conjugate, PanelMode> { + EIGEN_DONT_INLINE void operator()(QInt8* blockA, const DataMapper& lhs, + Index depth, Index rows, Index stride = 0, + Index offset = 0); +}; + +template <typename Index, typename DataMapper, int Pack1, int Pack2, + bool Conjugate, bool PanelMode> +EIGEN_DONT_INLINE void gemm_pack_lhs<QInt8, Index, DataMapper, Pack1, Pack2, + ColMajor, Conjugate, PanelMode>:: +operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, + Index stride, Index offset) { + eigen_assert(stride == 0); + eigen_assert(offset == 0); + + // Use alternate function for weird sizes + if (rows % 32 != 0 || depth % 32 != 0) { + gemm_pack_lhs_any<QInt8, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode> lhs_pack; + return lhs_pack(blockA, lhs, depth, rows, stride, offset); + } + + // Get vector pointer + __m256i* blockA_256 = reinterpret_cast<__m256i*>(blockA); + + // Pack rows in sets of 32 + for (Index m = 0; m < rows; m += 32) { + // Pack depth in sets of 8 + for (Index k = 0; k < depth; k += 8) { + // Load vectors + __m256i L_A = lhs.loadPacket(m, k); + __m256i L_B = lhs.loadPacket(m, k + 1); + + // Interleave 8-bit elements + __m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B); + __m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B); + + __m256i L_C = lhs.loadPacket(m, k + 2); + __m256i L_D = lhs.loadPacket(m, k + 3); + __m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D); + __m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D); + + // Interleave 16-bit elements + __m256i L_AD0_AD16 = _mm256_unpacklo_epi16(L_AB0_AB16, L_CD0_CD16); + __m256i L_AD4_AD20 = _mm256_unpackhi_epi16(L_AB0_AB16, L_CD0_CD16); + + // Use permute before we store to cross 128-bit lanes + __m256i L_AD0 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x20); + _mm256_store_si256(blockA_256++, L_AD0); + + // Complete packing for 32 x 8 block + __m256i L_AD16 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x31); + __m256i L_AD8_AD24 = _mm256_unpacklo_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD12_AD28 = _mm256_unpackhi_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD8 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x20); + _mm256_store_si256(blockA_256++, L_AD8); + _mm256_store_si256(blockA_256++, L_AD16); + __m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31); + _mm256_store_si256(blockA_256++, L_AD24); + __m256i L_E = lhs.loadPacket(m, k + 4); + __m256i L_F = lhs.loadPacket(m, k + 5); + __m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F); + __m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F); + __m256i L_G = lhs.loadPacket(m, k + 6); + __m256i L_H = lhs.loadPacket(m, k + 7); + __m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H); + __m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H); + __m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH4_EH20 = _mm256_unpackhi_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH0 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x20); + _mm256_store_si256(blockA_256++, L_EH0); + __m256i L_EH16 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x31); + __m256i L_EH8_EH24 = _mm256_unpacklo_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH12_EH28 = _mm256_unpackhi_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH8 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x20); + _mm256_store_si256(blockA_256++, L_EH8); + 
_mm256_store_si256(blockA_256++, L_EH16); + __m256i L_EH24 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x31); + _mm256_store_si256(blockA_256++, L_EH24); + } + } +} + +// Arrange a block of the right input matrix in contiguous memory. +// +// Given column major input (A0 beside A1 in memory): +// A0 B0 C0 D0 E0 F0 G0 H0 ... +// A1 B1 C1 D1 E1 F1 G1 H1 ... +// A2 B2 C2 D2 E2 F2 G2 H2 ... +// A3 B3 C3 D3 E3 F3 G3 H3 ... +// A4 B4 C4 D4 E4 F4 G4 H4 ... +// A5 B5 C5 D5 E5 F5 G5 H5 ... +// A6 B6 C6 D6 E6 F6 G6 H6 ... +// A7 B7 C7 D7 E7 F7 G7 H7 ... +// A8 ... +// ... +// +// Packing yields row major output (A0 beside A1 in memory): +// A0 A1 A2 A3 A4 A5 A6 A7 +// B0 B1 B2 B3 B4 B5 B6 B7 +// ... +// +// At least four elements of the same col are arranged contiguously because +// maddubs and madd both perform an adjacent addition in the kernel. We can +// save work by leaving 8 adjacent elements because kr = 8. +template <typename Index, typename DataMapper, int nr, bool Conjugate, + bool PanelMode> +struct gemm_pack_rhs<QUInt8, Index, DataMapper, nr, ColMajor, Conjugate, + PanelMode> { + EIGEN_DONT_INLINE void operator()(QUInt8* blockB, const DataMapper& rhs, + Index depth, Index cols, Index stride = 0, + Index offset = 0); +}; + +template <typename Index, typename DataMapper, int nr, bool Conjugate, + bool PanelMode> +EIGEN_DONT_INLINE void gemm_pack_rhs<QUInt8, Index, DataMapper, nr, ColMajor, + Conjugate, PanelMode>:: +operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, + Index stride, Index offset) { + eigen_assert(stride == 0); + eigen_assert(offset == 0); + + // Use alternate function for weird sizes + if (cols % 32 != 0 || depth % 32 != 0) { + gemm_pack_rhs_any<QUInt8, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> rhs_pack; + return rhs_pack(blockB, rhs, depth, cols, stride, offset); + } + + // Get vector pointer + __m256i* blockB_256 = reinterpret_cast<__m256i*>(blockB); + + // Perform a step of the packing for 4 columns + __m256i R_AB_L, R_AB_H, R_CD_L, R_CD_H, R_AD_0, R_AD_8, R_AD_16, R_AD_24; +#define PACK_STEP \ + R_AB_L = _mm256_unpacklo_epi64(R_A, R_B); \ + R_CD_L = _mm256_unpacklo_epi64(R_C, R_D); \ + R_AB_H = _mm256_unpackhi_epi64(R_A, R_B); \ + R_CD_H = _mm256_unpackhi_epi64(R_C, R_D); \ + R_AD_0 = _mm256_permute2x128_si256(R_AB_L, R_CD_L, 0x20); \ + R_AD_16 = _mm256_permute2x128_si256(R_AB_L, R_CD_L, 0x31); \ + R_AD_8 = _mm256_permute2x128_si256(R_AB_H, R_CD_H, 0x20); \ + R_AD_24 = _mm256_permute2x128_si256(R_AB_H, R_CD_H, 0x31); \ + _mm256_store_si256(blockB_256, R_AD_0); \ + _mm256_store_si256(blockB_256 + 8, R_AD_8); \ + _mm256_store_si256(blockB_256 + 16, R_AD_16); \ + _mm256_store_si256(blockB_256 + 24, R_AD_24); \ + blockB_256++; + + // Pack cols in sets of 32 + for (Index n = 0; n < cols; n += 32) { + // Pack depth in sets of 32 + for (Index k = 0; k < depth; k += 32) { + __m256i R_A = rhs.loadPacket(k, n); + __m256i R_B = rhs.loadPacket(k, n + 1); + __m256i R_C = rhs.loadPacket(k, n + 2); + __m256i R_D = rhs.loadPacket(k, n + 3); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 4); + R_B = rhs.loadPacket(k, n + 5); + R_C = rhs.loadPacket(k, n + 6); + R_D = rhs.loadPacket(k, n + 7); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 8); + R_B = rhs.loadPacket(k, n + 9); + R_C = rhs.loadPacket(k, n + 10); + R_D = rhs.loadPacket(k, n + 11); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 12); + R_B = rhs.loadPacket(k, n + 13); + R_C = rhs.loadPacket(k, n + 14); + R_D = rhs.loadPacket(k, n + 15); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 16); 
+ R_B = rhs.loadPacket(k, n + 17); + R_C = rhs.loadPacket(k, n + 18); + R_D = rhs.loadPacket(k, n + 19); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 20); + R_B = rhs.loadPacket(k, n + 21); + R_C = rhs.loadPacket(k, n + 22); + R_D = rhs.loadPacket(k, n + 23); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 24); + R_B = rhs.loadPacket(k, n + 25); + R_C = rhs.loadPacket(k, n + 26); + R_D = rhs.loadPacket(k, n + 27); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 28); + R_B = rhs.loadPacket(k, n + 29); + R_C = rhs.loadPacket(k, n + 30); + R_D = rhs.loadPacket(k, n + 31); + PACK_STEP; + + blockB_256 += 24; + } + } +#undef PACK_STEP +} + +// Perform the actual multiplication on packed inputs +template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> +struct gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +{ + typedef typename DataMapper::LinearMapper LinearMapper; + + EIGEN_DONT_INLINE + void operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> +EIGEN_DONT_INLINE +void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +::operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) +{ + EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(alpha.value == 1); + eigen_assert(strideA == -1); + eigen_assert(strideB == -1); + eigen_assert(offsetA == 0); + eigen_assert(offsetB == 0); + eigen_assert(rows > 0); + eigen_assert(cols > 0); + eigen_assert(depth > 0); + eigen_assert(blockA); + eigen_assert(blockB); + + // Use alternate function for weird sizes + if (rows % 32 != 0 || cols % 32 != 0 || depth % 32 != 0) { + gebp_kernel_any<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> gebp; + return gebp(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + } + + // Create result block + QInt32* blockO = aligned_new<QInt32>(32 * 32); + // Allocating the result block is about 5-10% faster than declaring stack + // space. It is unclear why this is the case. 
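+  // The scratch block holds 32 * 32 QInt32 values (4 KiB); it is reused for
+  // every 32x32 tile of the result and released with aligned_delete at the
+  // end of this function. The stack-based alternative would have been: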
+ // ei_declare_aligned_stack_constructed_variable(QInt32, blockO, 32 * 32, 0); + memset(blockO, 0, 32 * 32 * sizeof(QInt32)); + + // Get vectorized pointers + __m256i* blockO_256 = reinterpret_cast<__m256i*>(blockO); + const __m256i* blockA_256 = reinterpret_cast<const __m256i*>(blockA); + const __m256i* blockB_256 = reinterpret_cast<const __m256i*>(blockB); + + // Loop over blocks of 32 columns + for (Index n = 0; n < cols; n += 32) { + // Reset index into blockA + Index indexL = 0; + // Loop over blocks of 32 rows + for (Index m = 0; m < rows; m += 32) { + // Reset index into blockB + Index indexR = n / 32 * depth; + // Loop over blocks of 8 on depth + for (Index k = 0; k < depth; k += 8) { + // Load inputs + __m256i L_AD0 = blockA_256[indexL++]; + __m256i L_AD8 = blockA_256[indexL++]; + __m256i L_AD16 = blockA_256[indexL++]; + __m256i L_AD24 = blockA_256[indexL++]; + __m256i L_EH0 = blockA_256[indexL++]; + __m256i L_EH8 = blockA_256[indexL++]; + __m256i L_EH16 = blockA_256[indexL++]; + __m256i L_EH24 = blockA_256[indexL++]; + __m256i R_AH0 = blockB_256[indexR++]; + __m256i R_AH4 = blockB_256[indexR++]; + __m256i R_AH8 = blockB_256[indexR++]; + __m256i R_AH12 = blockB_256[indexR++]; + __m256i R_AH16 = blockB_256[indexR++]; + __m256i R_AH20 = blockB_256[indexR++]; + __m256i R_AH24 = blockB_256[indexR++]; + __m256i R_AH28 = blockB_256[indexR++]; + + // This constant is used with madd to convert 16 bit to 32 bit + const __m256i ONE = _mm256_set1_epi32(0x00010001); + + // Declare variables used in COMPUTE_STEP + __m256i P_16_A, P_16_B, P_32_A, P_32_B, P_32; + +#define COMPUTE_STEP(R_INPUT_A, R_INPUT_B, OFFSET) \ + P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD0); \ + P_32_A = _mm256_madd_epi16(P_16_A, ONE); \ + P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH0); \ + P_32_B = _mm256_madd_epi16(P_16_B, ONE); \ + P_32 = _mm256_add_epi32(P_32_A, P_32_B); \ + _mm256_store_si256( \ + blockO_256 + 4 * OFFSET, \ + _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET), P_32)); \ + \ + P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD8); \ + P_32_A = _mm256_madd_epi16(P_16_A, ONE); \ + P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH8); \ + P_32_B = _mm256_madd_epi16(P_16_B, ONE); \ + P_32 = _mm256_add_epi32(P_32_A, P_32_B); \ + _mm256_store_si256( \ + blockO_256 + 4 * OFFSET + 1, \ + _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 1), P_32)); \ + \ + P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD16); \ + P_32_A = _mm256_madd_epi16(P_16_A, ONE); \ + P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH16); \ + P_32_B = _mm256_madd_epi16(P_16_B, ONE); \ + P_32 = _mm256_add_epi32(P_32_A, P_32_B); \ + _mm256_store_si256( \ + blockO_256 + 4 * OFFSET + 2, \ + _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 2), P_32)); \ + \ + P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD24); \ + P_32_A = _mm256_madd_epi16(P_16_A, ONE); \ + P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH24); \ + P_32_B = _mm256_madd_epi16(P_16_B, ONE); \ + P_32 = _mm256_add_epi32(P_32_A, P_32_B); \ + _mm256_store_si256( \ + blockO_256 + 4 * OFFSET + 3, \ + _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 3), P_32)); + + // Permute and shuffle to copy a single value across the entire vector + // Then compute the multiplication + __m256i R_AH0_ = _mm256_permute2x128_si256(R_AH0, R_AH0, 0x00); + __m256i R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + __m256i R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 0); + __m256i R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + __m256i R_EH1 = 
_mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 1); + R_AH0_ = _mm256_permute2x128_si256(R_AH0, R_AH0, 0x11); + __m256i R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + __m256i R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 2); + __m256i R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + __m256i R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 3); + + R_AH0_ = _mm256_permute2x128_si256(R_AH4, R_AH4, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 4); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 5); + R_AH0_ = _mm256_permute2x128_si256(R_AH4, R_AH4, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 6); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 7); + + R_AH0_ = _mm256_permute2x128_si256(R_AH8, R_AH8, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 8); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 9); + R_AH0_ = _mm256_permute2x128_si256(R_AH8, R_AH8, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 10); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 11); + + R_AH0_ = _mm256_permute2x128_si256(R_AH12, R_AH12, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 12); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 13); + R_AH0_ = _mm256_permute2x128_si256(R_AH12, R_AH12, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 14); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 15); + + R_AH0_ = _mm256_permute2x128_si256(R_AH16, R_AH16, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 16); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 17); + R_AH0_ = _mm256_permute2x128_si256(R_AH16, R_AH16, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 18); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 19); + + R_AH0_ = _mm256_permute2x128_si256(R_AH20, R_AH20, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 20); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 21); + R_AH0_ = _mm256_permute2x128_si256(R_AH20, R_AH20, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 22); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 23); + + R_AH0_ = 
_mm256_permute2x128_si256(R_AH24, R_AH24, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 24); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 25); + R_AH0_ = _mm256_permute2x128_si256(R_AH24, R_AH24, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 26); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 27); + + R_AH0_ = _mm256_permute2x128_si256(R_AH28, R_AH28, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 28); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 29); + R_AH0_ = _mm256_permute2x128_si256(R_AH28, R_AH28, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 30); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 31); + +#undef COMPUTE_STEP + } + + // Transfer the results to the result matrix + Index i = 0; + for (Index j = n; j < n + 32; j++) { + LinearMapper r0 = res.getLinearMapper(m, j); + LinearMapper r1 = res.getLinearMapper(m + 8, j); + LinearMapper r2 = res.getLinearMapper(m + 16, j); + LinearMapper r3 = res.getLinearMapper(m + 24, j); + r0.storePacket( + 0, _mm256_add_epi32(blockO_256[i++], r0.loadPacket(0))); + r1.storePacket( + 0, _mm256_add_epi32(blockO_256[i++], r1.loadPacket(0))); + r2.storePacket( + 0, _mm256_add_epi32(blockO_256[i++], r2.loadPacket(0))); + r3.storePacket( + 0, _mm256_add_epi32(blockO_256[i++], r3.loadPacket(0))); + } + + // Zero the result block so it can be reused + memset(blockO, 0, 32 * 32 * sizeof(QInt32)); + } + } + aligned_delete(blockO, 32 * 32); +} + +#endif // EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_AVX2_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h new file mode 100644 index 0000000000..99894cafb5 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h @@ -0,0 +1,95 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> +// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_NEON_H +#define EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_NEON_H + + +namespace Eigen { +namespace internal { + + +// AVX2 optimized implementation of the case where the lhs is encoded using signed 8bit +// integers and the rhs using unsigned 8bit integers. 
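+//
+// Note that this header is the NEON variant of the fixed-point mat-mat
+// product: the gebp_kernel specialization below is a plain scalar reference
+// loop rather than a vectorized kernel. For every output coefficient (i, j)
+// it accumulates the dot product of row i of the packed lhs with column j of
+// the packed rhs over the depth dimension, i.e.
+// res(i, j) += sum_k blockA[i * depth + k] * blockB[j * depth + k].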
+#ifdef EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT + +template<bool _ConjLhs, bool _ConjRhs> +class gebp_traits<QInt8, QUInt8, _ConjLhs, _ConjRhs> +{ +public: + typedef QInt8 LhsScalar; + typedef QUInt8 RhsScalar; + typedef QInt32 ResScalar; + + enum { + // register block size along the M and N directions + // One for the current implementation + nr = 1, + mr = 1, + // Progress made at each iteration of the product loop + // also 1 for the current implementation + LhsProgress = 1, + RhsProgress = 1 + }; +}; + +// Mat-Mat product of a signed 8bit lhs with an unsigned 8bit rhs +template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> +struct gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +{ + EIGEN_DONT_INLINE + void operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> +EIGEN_DONT_INLINE +void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +::operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) +{ + EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + + eigen_assert(alpha.value == 1); + eigen_assert(strideA == -1); + eigen_assert(strideB == -1); + eigen_assert(offsetA == 0); + eigen_assert(offsetB == 0); + + eigen_assert(rows > 0); + eigen_assert(cols > 0); + eigen_assert(depth > 0); + eigen_assert(blockA); + eigen_assert(blockB); + + for (Index j = 0; j < cols; ++j) { + Index startB = j * depth; + + for (Index i = 0; i < rows; ++i) { + Index startA = i * depth; + + for (Index k = 0; k < depth; ++k) { + res(i, j) += blockA[startA + k] * blockB[startB + k]; + } + } + } +} +#endif + + +} // namespace internal +} // namespace Eigen + + + +#endif // EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_NEON_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h new file mode 100644 index 0000000000..18b5085b89 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h @@ -0,0 +1,123 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_CXX11_FIXED_POINT_MAT_VEC_PRODUCT_H +#define EIGEN_CXX11_FIXED_POINT_MAT_VEC_PRODUCT_H + + +namespace Eigen { +namespace internal { + +// Mat-Vec product +// Both lhs and rhs are encoded as 8bit signed integers +template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> +struct general_matrix_vector_product<Index,QInt8,LhsMapper,ColMajor,ConjugateLhs,QInt8,RhsMapper,ConjugateRhs,Version> +{ +EIGEN_DONT_INLINE static void run( + Index rows, Index cols, + const LhsMapper& lhs, + const RhsMapper& rhs, + QInt32* res, Index resIncr, + QInt8 alpha); +}; + +template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> +EIGEN_DONT_INLINE void general_matrix_vector_product<Index,QInt8,LhsMapper,ColMajor,ConjugateLhs,QInt8,RhsMapper,ConjugateRhs,Version>::run( + Index rows, Index cols, + const LhsMapper& lhs, + const RhsMapper& rhs, + QInt32* res, Index resIncr, + QInt8 alpha) +{ + eigen_assert(alpha.value == 1); + eigen_assert(resIncr == 1); + eigen_assert(rows > 0); + eigen_assert(cols > 0); + + for (Index i = 0; i < rows; ++i) { + for (Index j = 0; j < cols; ++j) { + res[i] += lhs(i, j) * rhs(j, 0); + } + } +} + + +// Mat-Vec product +// The lhs is encoded using 8bit signed integers, the rhs using 8bit unsigned integers +template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> +struct general_matrix_vector_product<Index,QInt8,LhsMapper,ColMajor,ConjugateLhs,QUInt8,RhsMapper,ConjugateRhs,Version> +{ +EIGEN_DONT_INLINE static void run( + Index rows, Index cols, + const LhsMapper& lhs, + const RhsMapper& rhs, + QInt32* res, Index resIncr, + QUInt8 alpha); +}; + +template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> +EIGEN_DONT_INLINE void general_matrix_vector_product<Index,QInt8,LhsMapper,ColMajor,ConjugateLhs,QUInt8,RhsMapper,ConjugateRhs,Version>::run( + Index rows, Index cols, + const LhsMapper& lhs, + const RhsMapper& rhs, + QInt32* res, Index resIncr, + QUInt8 alpha) +{ + eigen_assert(alpha.value == 1); + eigen_assert(resIncr == 1); + eigen_assert(rows > 0); + eigen_assert(cols > 0); + + for (Index i = 0; i < rows; ++i) { + for (Index j = 0; j < cols; ++j) { + res[i] += lhs(i, j) * rhs(j, 0); + } + } +} + + +// Mat-Vec product +// The lhs is encoded using bit unsigned integers, the rhs using 8bit signed integers +template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> +struct general_matrix_vector_product<Index,QUInt8,LhsMapper,ColMajor,ConjugateLhs,QInt8,RhsMapper,ConjugateRhs,Version> +{ +EIGEN_DONT_INLINE static void run( + Index rows, Index cols, + const LhsMapper& lhs, + const RhsMapper& rhs, + QInt32* res, Index resIncr, + QInt8 alpha); +}; + +template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> +EIGEN_DONT_INLINE void general_matrix_vector_product<Index,QUInt8,LhsMapper,ColMajor,ConjugateLhs,QInt8,RhsMapper,ConjugateRhs,Version>::run( + Index rows, Index cols, + const LhsMapper& lhs, + const RhsMapper& rhs, + QInt32* res, Index resIncr, + QInt8 alpha) +{ + eigen_assert(alpha.value == 1); + eigen_assert(resIncr == 1); + eigen_assert(rows > 0); + eigen_assert(cols > 0); + + for (Index i = 0; i < rows; ++i) { + for (Index j = 0; j < cols; ++j) { + res[i] += lhs(i, j) * rhs(j, 0); + } + } +} + +} // namespace internal +} // 
namespace Eigen + + + +#endif // EIGEN_CXX11_FIXED_POINT_MAT_VEC_PRODUCT_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h new file mode 100644 index 0000000000..cae1a0b06d --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h @@ -0,0 +1,409 @@ +#ifndef THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_ +#define THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_ + +namespace Eigen { +namespace internal { + +typedef struct Packet32q8i { + __m256i val; + operator __m256i() const { return val; } + Packet32q8i(); + Packet32q8i(__m256i val) : val(val) {} +} Packet32q8i; + +typedef struct Packet32q8u { + __m256i val; + operator __m256i() const { return val; } + Packet32q8u(); + Packet32q8u(__m256i val) : val(val) {} +} Packet32q8u; + +typedef struct Packet16q8i { + __m128i val; + operator __m128i() const { return val; } + Packet16q8i(); + Packet16q8i(__m128i val) : val(val) {} +} Packet16q8i; + +typedef struct Packet16q8u { + __m128i val; + operator __m128i() const { return val; } + Packet16q8u(); + Packet16q8u(__m128i val) : val(val) {} +} Packet16q8u; + +typedef struct Packet8q32i { + __m256i val; + operator __m256i() const { return val; } + Packet8q32i(); + Packet8q32i(__m256i val) : val(val) {} +} Packet8q32i; + +typedef struct Packet4q32i { + __m128i val; + operator __m128i() const { return val; } + Packet4q32i(); + Packet4q32i(__m128i val) : val(val) {} +} Packet4q32i; + +template <> +struct packet_traits<QInt8> : default_packet_traits { + typedef Packet32q8i type; + typedef Packet16q8i half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 32, + }; + enum { + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0 + }; +}; +template <> +struct packet_traits<QUInt8> : default_packet_traits { + typedef Packet32q8u type; + typedef Packet16q8u half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 32, + }; + enum { + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0 + }; +}; +template <> +struct packet_traits<QInt32> : default_packet_traits { + typedef Packet8q32i type; + typedef Packet4q32i half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + }; + enum { + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0 + }; +}; + +template <> +struct unpacket_traits<Packet32q8i> { + typedef QInt8 type; + typedef Packet16q8i half; + enum { size = 32 }; +}; +template <> +struct unpacket_traits<Packet32q8u> { + typedef QUInt8 type; + typedef Packet16q8u half; + enum { size = 32 }; +}; +template <> +struct unpacket_traits<Packet8q32i> { + typedef QInt32 type; + typedef Packet4q32i half; + enum { size = 8 }; +}; + +// Unaligned load +template <> +EIGEN_STRONG_INLINE Packet32q8i ploadu<Packet32q8i>(const QInt8* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256( + reinterpret_cast<const __m256i*>(from)); +} +template <> +EIGEN_STRONG_INLINE Packet32q8u ploadu<Packet32q8u>(const QUInt8* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256( + reinterpret_cast<const __m256i*>(from)); +} +template <> +EIGEN_STRONG_INLINE Packet8q32i ploadu<Packet8q32i>(const 
QInt32* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256( + reinterpret_cast<const __m256i*>(from)); +} + +// Aligned load +template <> +EIGEN_STRONG_INLINE Packet32q8i pload<Packet32q8i>(const QInt8* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256( + reinterpret_cast<const __m256i*>(from)); +} +template <> +EIGEN_STRONG_INLINE Packet32q8u pload<Packet32q8u>(const QUInt8* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256( + reinterpret_cast<const __m256i*>(from)); +} +template <> +EIGEN_STRONG_INLINE Packet8q32i pload<Packet8q32i>(const QInt32* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256( + reinterpret_cast<const __m256i*>(from)); +} + +// Unaligned store +template <> +EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet32q8i& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( + reinterpret_cast<__m256i*>(to), from.val); +} +template <> +EIGEN_STRONG_INLINE void pstoreu<QUInt8>(QUInt8* to, const Packet32q8u& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( + reinterpret_cast<__m256i*>(to), from.val); +} +template <> +EIGEN_STRONG_INLINE void pstoreu<QInt32>(QInt32* to, const Packet8q32i& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( + reinterpret_cast<__m256i*>(to), from.val); +} + +// Aligned store +template <> +EIGEN_STRONG_INLINE void pstore<QInt32>(QInt32* to, const Packet8q32i& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), + from.val); +} +template <> +EIGEN_STRONG_INLINE void pstore<QUInt8>(QUInt8* to, const Packet32q8u& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), + from.val); +} +template <> +EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet32q8i& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), + from.val); +} + +// Extract first element. +template <> +EIGEN_STRONG_INLINE QInt32 pfirst<Packet8q32i>(const Packet8q32i& a) { + return _mm_cvtsi128_si32(_mm256_castsi256_si128(a)); +} +template <> +EIGEN_STRONG_INLINE QUInt8 pfirst<Packet32q8u>(const Packet32q8u& a) { + return static_cast<uint8_t>(_mm256_extract_epi8(a.val, 0)); +} +template <> +EIGEN_STRONG_INLINE QInt8 pfirst<Packet32q8i>(const Packet32q8i& a) { + return _mm256_extract_epi8(a.val, 0); +} + +// Initialize to constant value. +template <> +EIGEN_STRONG_INLINE Packet32q8i pset1<Packet32q8i>(const QInt8& from) { + return _mm256_set1_epi8(from.value); +} +template <> +EIGEN_STRONG_INLINE Packet32q8u pset1<Packet32q8u>(const QUInt8& from) { + return _mm256_set1_epi8(static_cast<uint8_t>(from.value)); +} +template <> +EIGEN_STRONG_INLINE Packet8q32i pset1<Packet8q32i>(const QInt32& from) { + return _mm256_set1_epi32(from.value); +} + +// Basic arithmetic packet ops for QInt32. +template <> +EIGEN_STRONG_INLINE Packet8q32i padd<Packet8q32i>(const Packet8q32i& a, + const Packet8q32i& b) { + return _mm256_add_epi32(a.val, b.val); +} +template <> +EIGEN_STRONG_INLINE Packet8q32i psub<Packet8q32i>(const Packet8q32i& a, + const Packet8q32i& b) { + return _mm256_sub_epi32(a.val, b.val); +} +// Note: mullo truncates the result to 32 bits. +template <> +EIGEN_STRONG_INLINE Packet8q32i pmul<Packet8q32i>(const Packet8q32i& a, + const Packet8q32i& b) { + return _mm256_mullo_epi32(a.val, b.val); +} +template <> +EIGEN_STRONG_INLINE Packet8q32i pnegate<Packet8q32i>(const Packet8q32i& a) { + return _mm256_sub_epi32(_mm256_setzero_si256(), a.val); +} + +// Min and max. 
+template <> +EIGEN_STRONG_INLINE Packet8q32i pmin<Packet8q32i>(const Packet8q32i& a, + const Packet8q32i& b) { + return _mm256_min_epi32(a.val, b.val); +} +template <> +EIGEN_STRONG_INLINE Packet8q32i pmax<Packet8q32i>(const Packet8q32i& a, + const Packet8q32i& b) { + return _mm256_max_epi32(a.val, b.val); +} + +template <> +EIGEN_STRONG_INLINE Packet32q8u pmin<Packet32q8u>(const Packet32q8u& a, + const Packet32q8u& b) { + return _mm256_min_epu8(a.val, b.val); +} +template <> +EIGEN_STRONG_INLINE Packet32q8u pmax<Packet32q8u>(const Packet32q8u& a, + const Packet32q8u& b) { + return _mm256_max_epu8(a.val, b.val); +} + +template <> +EIGEN_STRONG_INLINE Packet32q8i pmin<Packet32q8i>(const Packet32q8i& a, + const Packet32q8i& b) { + return _mm256_min_epi8(a.val, b.val); +} +template <> +EIGEN_STRONG_INLINE Packet32q8i pmax<Packet32q8i>(const Packet32q8i& a, + const Packet32q8i& b) { + return _mm256_max_epi8(a.val, b.val); +} + +// Reductions. +template <> +EIGEN_STRONG_INLINE QInt32 predux_min<Packet8q32i>(const Packet8q32i& a) { + __m256i tmp = _mm256_min_epi32(a, _mm256_permute2f128_si256(a, a, 1)); + tmp = + _mm256_min_epi32(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + return pfirst<Packet8q32i>( + _mm256_min_epi32(tmp, _mm256_shuffle_epi32(tmp, 1))); +} +template <> +EIGEN_STRONG_INLINE QInt32 predux_max<Packet8q32i>(const Packet8q32i& a) { + __m256i tmp = _mm256_max_epi32(a, _mm256_permute2f128_si256(a, a, 1)); + tmp = + _mm256_max_epi32(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + return pfirst<Packet8q32i>( + _mm256_max_epi32(tmp, _mm256_shuffle_epi32(tmp, 1))); +} + +template <> +EIGEN_STRONG_INLINE QUInt8 predux_min<Packet32q8u>(const Packet32q8u& a) { + __m256i tmp = _mm256_min_epu8(a, _mm256_permute2f128_si256(a, a, 1)); + tmp = + _mm256_min_epu8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + tmp = _mm256_min_epu8(tmp, _mm256_shuffle_epi32(tmp, 1)); + tmp = _mm256_min_epu8(tmp, + _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + return std::min(static_cast<uint8_t>(_mm256_extract_epi8(tmp, 0)), + static_cast<uint8_t>(_mm256_extract_epi8(tmp, 1))); +} +template <> +EIGEN_STRONG_INLINE QUInt8 predux_max<Packet32q8u>(const Packet32q8u& a) { + __m256i tmp = _mm256_max_epu8(a, _mm256_permute2f128_si256(a, a, 1)); + tmp = + _mm256_max_epu8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + tmp = _mm256_max_epu8(tmp, _mm256_shuffle_epi32(tmp, 1)); + tmp = _mm256_max_epu8(tmp, + _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + return std::max(static_cast<uint8_t>(_mm256_extract_epi8(tmp, 0)), + static_cast<uint8_t>(_mm256_extract_epi8(tmp, 1))); +} + +template <> +EIGEN_STRONG_INLINE QInt8 predux_min<Packet32q8i>(const Packet32q8i& a) { + __m256i tmp = _mm256_min_epi8(a, _mm256_permute2f128_si256(a, a, 1)); + tmp = _mm256_min_epi8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + tmp = _mm256_min_epi8(tmp, _mm256_shuffle_epi32(tmp, 1)); + tmp = _mm256_min_epi8(tmp, _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + return std::min(_mm256_extract_epi8(tmp, 0), _mm256_extract_epi8(tmp, 1)); +} +template <> +EIGEN_STRONG_INLINE QInt8 predux_max<Packet32q8i>(const Packet32q8i& a) { + __m256i tmp = _mm256_max_epi8(a, _mm256_permute2f128_si256(a, a, 1)); + tmp = _mm256_max_epi8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + tmp = _mm256_max_epi8(tmp, _mm256_shuffle_epi32(tmp, 1)); + tmp = _mm256_max_epi8(tmp, _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + return 
std::max(_mm256_extract_epi8(tmp, 0), _mm256_extract_epi8(tmp, 1)); +} + +// Comparisons +template <> +EIGEN_STRONG_INLINE Packet8q32i peq<Packet8q32i>(const Packet8q32i& a, + const Packet8q32i& b) { + return _mm256_cmpeq_epi32(a.val, b.val); +} +template <> +EIGEN_STRONG_INLINE Packet32q8i peq<Packet32q8i>(const Packet32q8i& a, + const Packet32q8i& b) { + return _mm256_cmpeq_epi8(a.val, b.val); +} +template <> +EIGEN_STRONG_INLINE Packet32q8u peq<Packet32q8u>(const Packet32q8u& a, + const Packet32q8u& b) { + return _mm256_cmpeq_epi8(a.val, b.val); +} + +// Note: There are no instructions in AVX2 for unsigned lt/gt comparison. +// These are added in AVX-512. +template <> +EIGEN_STRONG_INLINE Packet8q32i ple<Packet8q32i>(const Packet8q32i& a, + const Packet8q32i& b) { + const __m256i gt = _mm256_cmpgt_epi32(a.val, b.val); + return _mm256_xor_si256(gt, gt); +} +template <> +EIGEN_STRONG_INLINE Packet32q8i ple<Packet32q8i>(const Packet32q8i& a, + const Packet32q8i& b) { + const __m256i gt = _mm256_cmpgt_epi8(a.val, b.val); + return _mm256_xor_si256(gt, gt); +} + +template <> +EIGEN_STRONG_INLINE Packet8q32i plt<Packet8q32i>(const Packet8q32i& a, + const Packet8q32i& b) { + return _mm256_cmpgt_epi32(b.val, a.val); +} +template <> +EIGEN_STRONG_INLINE Packet32q8i plt<Packet32q8i>(const Packet32q8i& a, + const Packet32q8i& b) { + return _mm256_cmpgt_epi8(b.val, a.val); +} + +// Vectorized scaling of Packet32q8i by float. +template <> +struct functor_traits<scalar_multiple2_op<QInt32, double>> { + enum { Cost = 4 * NumTraits<float>::MulCost, PacketAccess = true }; +}; + +template <> +EIGEN_STRONG_INLINE const Packet8q32i +scalar_multiple2_op<QInt32, double>::packetOp(const Packet8q32i& a) const { + __m256d scale = _mm256_set1_pd(m_other); + __m256d a_lo = _mm256_cvtepi32_pd(_mm256_castsi256_si128(a)); + __m128i result_lo = _mm256_cvtpd_epi32(_mm256_mul_pd(scale, a_lo)); + __m256d a_hi = _mm256_cvtepi32_pd(_mm256_extracti128_si256(a, 1)); + __m128i result_hi = _mm256_cvtpd_epi32(_mm256_mul_pd(scale, a_hi)); + return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, + 1); +} + +} // end namespace internal +} // end namespace Eigen + +#endif // THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_ diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h new file mode 100644 index 0000000000..045384d7fc --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h @@ -0,0 +1,66 @@ +#ifndef THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_ +#define THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_ + +namespace Eigen { +namespace internal { + +typedef __m256 Packet8f; + +template <> +struct type_casting_traits<QInt32, float> { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE Packet8f pcast<Packet8q32i>(const Packet8q32i& a) { + return _mm256_cvtepi32_ps(a.val); +} + +template <> +struct type_casting_traits<float, QInt32> { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE Packet8q32i pcast<Packet8f>(const Packet8f& a) { + return _mm256_cvtps_epi32(a); +} + +template <> +struct type_casting_traits<QInt32, QInt8> { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE Packet32q8i 
+pcast<Packet8q32i, Packet32q8i>(const Packet8q32i& a, const Packet8q32i& b, + const Packet8q32i& c, const Packet8q32i& d) { + __m256i converted = _mm256_packs_epi16(_mm256_packs_epi32(a.val, b.val), + _mm256_packs_epi32(c.val, d.val)); + // Since packs does not cross 128 bit lane boundaries, + // we have to permute to properly order the final result. + const __m256i permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + return _mm256_permutevar8x32_epi32(converted, permute_mask); +} + +template <> +struct type_casting_traits<QInt32, QUInt8> { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE Packet32q8u +pcast<Packet8q32i, Packet32q8u>(const Packet8q32i& a, const Packet8q32i& b, + const Packet8q32i& c, const Packet8q32i& d) { + const __m256i converted = _mm256_packus_epi16( + _mm256_packs_epi32(a.val, b.val), _mm256_packs_epi32(c.val, d.val)); + // Since packus does not cross 128 bit lane boundaries, + // we have to permute to properly order the final result. + const __m256i permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + return _mm256_permutevar8x32_epi32(converted, permute_mask); +} + +} // end namespace internal +} // end namespace Eigen + +#endif // THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_ diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Activations.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Activations.h new file mode 100644 index 0000000000..94d616f2b5 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Activations.h @@ -0,0 +1,116 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#ifndef EIGEN_CXX11_NEURAL_NETWORKS_ACTIVATIONS_H +#define EIGEN_CXX11_NEURAL_NETWORKS_ACTIVATIONS_H + +namespace Eigen { + +/** scalar_sigmoid_fast_derivative_op + * \ingroup CXX11_NeuralNetworks_Module + * \brief Template functor to compute the fast derivative of a sigmoid + * + * Input should be the backpropagated gradient. + * + * \sa class CwiseUnaryOp, Cwise::sigmoid_fast_derivative() + */ +template <typename T> +struct scalar_sigmoid_fast_derivative_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_fast_derivative_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& y) const { + const T one = T(1); + return (one - y) * y; + } + + template <typename Packet> + inline Packet packetOp(const Packet& y) const { + const Packet one = internal::pset1<Packet>(1); + return internal::pmul(internal::psub(one, y), y); + } +}; + +namespace internal { +template <typename T> +struct functor_traits<scalar_sigmoid_fast_derivative_op<T> > { + enum { + Cost = NumTraits<T>::AddCost * 2 + NumTraits<T>::MulCost, + PacketAccess = packet_traits<T>::HasAdd && packet_traits<T>::HasMul && + packet_traits<T>::HasNegate + }; +}; +} // namespace internal + +/** scalar_tanh_fast_derivative_op + * \ingroup CXX11_NeuralNetworks_Module + * \brief Template functor to compute the fast derivative of a tanh + * + * Input should be the backpropagated gradient. 
+ * + * \sa class CwiseUnaryOp, Cwise::tanh_fast_derivative() + */ +template <typename T> +struct scalar_tanh_fast_derivative_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_tanh_fast_derivative_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& y) const { + const T one = T(1); + return one - (y * y); + } + + template <typename Packet> + inline Packet packetOp(const Packet& y) const { + const Packet one = internal::pset1<Packet>(1); + return internal::psub(one, internal::pmul(y, y)); + } +}; + +namespace internal { +template <typename T> +struct functor_traits<scalar_tanh_fast_derivative_op<T> > { + enum { + Cost = NumTraits<T>::AddCost * 2 + NumTraits<T>::MulCost * 1, + PacketAccess = packet_traits<T>::HasAdd && packet_traits<T>::HasMul && + packet_traits<T>::HasNegate + }; +}; +} // namespace internal + +/** + * \ingroup CXX11_NeuralNetworks_Module + * \brief Template functor to clip the the magnitude of the first scalar. + * + * \sa class CwiseBinaryOp, MatrixBase::Clip + */ +template <typename Scalar> +struct scalar_clip_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_clip_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar + operator()(const Scalar& a, const Scalar& b) const { + return numext::mini(numext::maxi(a, -b), b); + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet + packetOp(const Packet& a, const Packet& b) const { + return internal::pmin(internal::pmax(a, internal::pnegate(b)), b); + } +}; + +namespace internal { +template <typename Scalar> +struct functor_traits<scalar_clip_op<Scalar> > { + enum { + Cost = NumTraits<Scalar>::AddCost * 3, + PacketAccess = packet_traits<Scalar>::HasMax && + packet_traits<Scalar>::HasMin && + packet_traits<Scalar>::HasNegate + }; +}; +} // namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_NEURAL_NETWORKS_ACTIVATIONS_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Attention.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Attention.h new file mode 100644 index 0000000000..d4bc7a3515 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Attention.h @@ -0,0 +1,209 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#ifndef EIGEN_CXX11_NEURAL_NETWORKS_ATTENTION_H +#define EIGEN_CXX11_NEURAL_NETWORKS_ATTENTION_H + +namespace Eigen { + +/** ExtractGlimpses + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Extract glimpses from an input tensor. + * + * The input parameter is expected to be a col-major tensor with a rank of 4 (depth, x, y, and batch). + * The width and height parameters specify the extension of the returned glimpses. + * The offsets parameter specifies the x, y locations of the center of the glimpses relative to the center of the input image. The vector is expected to contain one IndexPair for each image in the batch dimension. + * The normalized boolean indicates if incoming coordinates are normalized so that 0.0 and 1.0 correspond to the minimum and maximum of each height and width dimension. 
+ * The centered boolean indicates if incoming coordinates are centered relative to the image, in which case -1.0 and 1.0 correspond to minimum and maximum of each dimension while 0.0 corresponds to the center. + * + * The result can be assigned to a tensor of rank equal to that of the input. The result will be laid out in col-major order (depth, x, y, batch). + * The dimensions of the result will be equal to the dimensions of the input except for width and height which will be equal to the requested glimpse size. + */ +namespace { +template <typename Index> +struct GlimpseExtractionOp { + GlimpseExtractionOp(const Index width, const Index height, + const std::vector<IndexPair<float> >& offsets, + const bool normalized, + const bool centered, + const bool uniform_noise) : + width_(width), height_(height), offsets_(offsets), + normalized_(normalized), centered_(centered), uniform_noise_(uniform_noise) { } + + template <typename Input> + DSizes<Index, 4> dimensions(const Input& input) const { + typedef typename internal::traits<Input>::Index IndexType; + typedef TensorRef<Tensor<typename internal::traits<Input>::Scalar, 4, + internal::traits<Input>::Layout, IndexType> > Ref; + Ref in(input); + + DSizes<Index, 4> dims = in.dimensions(); + + dims[0] = in.dimension(0); + dims[1] = width_; + dims[2] = height_; + dims[3] = in.dimension(3); + return dims; + } + + template <typename Input, typename Output, typename Device> + EIGEN_DEVICE_FUNC + void eval(const Input& input, Output& output, const Device& device) const + { + typedef typename internal::traits<Input>::Index IndexType; + typedef TensorRef<Tensor<typename internal::traits<Input>::Scalar, 4, + internal::traits<Input>::Layout, IndexType> > Ref; + Ref in(input); + + const Index num_channels = in.dimension(0); + const Index input_width = in.dimension(1); + const Index input_height = in.dimension(2); + const Index batch_size = in.dimension(3); + eigen_assert(input_width > 0); + eigen_assert(input_height > 0); + + for (Index i = 0; i < batch_size; ++i) { + float x = offsets_[i].first, y = offsets_[i].second; + + // Un-normalize coordinates back to pixel space if normalized. + if (normalized_) { + x *= input_width; + y *= input_height; + } + // Un-center if coordinates are centered on the image center. + if (centered_) { + x /= 2.0f; + y /= 2.0f; + x += input_width / 2.0f; + y += input_height / 2.0f; + } + // Remove half of the glimpse window. 
+ x -= width_ / 2.0f; + y -= height_ / 2.0f; + + const Index offset_x = (Index) x; + const Index offset_y = (Index) y; + Index glimpse_width = width_; + Index glimpse_height = height_; + bool partial_overlap = false; + DSizes<Index, 3> slice_offset(0, offset_x, offset_y); + DSizes<Index, 3> slice_extent(num_channels, width_, height_); + DSizes<Index, 3> base_offset(0, 0, 0); + + if (offset_x < 0) { + slice_offset[1] = 0; + glimpse_width = (std::max<Index>)(0, width_ + offset_x); + slice_extent[1] = glimpse_width; + base_offset[1] = width_ - glimpse_width; + partial_overlap = true; + } else if (offset_x + width_ >= input_width) { + glimpse_width = (std::max<Index>)(0, input_width - offset_x); + slice_extent[1] = glimpse_width; + partial_overlap = true; + } + if (offset_y < 0) { + slice_offset[2] = 0; + glimpse_height = (std::max<Index>)(0, height_ + offset_y); + slice_extent[2] = glimpse_height; + base_offset[2] = height_ - glimpse_height; + partial_overlap = true; + } else if (offset_y + height_ >= input_height) { + glimpse_height = (std::max<Index>)(0, input_height - offset_y); + slice_extent[2] = glimpse_height; + partial_overlap = true; + } + slice_extent[1] = std::min<Index>(input_width, slice_extent[1]); + slice_extent[2] = std::min<Index>(input_height, slice_extent[2]); + + if (partial_overlap) { + if (uniform_noise_) { + // Initialize the glimpse with uniform noise. + typedef typename internal::remove_const< + typename internal::traits<Input>::Scalar>::type Scalar; + TensorFixedSize<Scalar, Sizes<> > mini; + mini.device(device) = input.template chip<3>(i).minimum(); + TensorFixedSize<float, Sizes<> > range; + range.device(device) = + (input.template chip<3>(i).maximum() - mini).template cast<float>(); + + DSizes<Index, 3> glimpse_size(num_channels, width_, height_); + TensorMap<Tensor<float, 3> > tmp(NULL, glimpse_size); + output.template chip<3>(i).device(device) = + mini.reshape(Sizes<1,1,1>()).broadcast(glimpse_size) + + (tmp.random() * range.reshape(Sizes<1,1,1>()).broadcast(glimpse_size)).template cast<Scalar>(); + } else { + // Initialize the glimpse with white noise: compute the mean and sigma + // of each channel, and use them to shape the gaussian. 
+ DSizes<Index, 2> glimpse_size(width_, height_); + DSizes<Index, 2> input_size(input_width, input_height); + typedef typename internal::remove_const< + typename internal::traits<Input>::Scalar>::type Scalar; + + for (int j = 0; j < num_channels; ++j) { + TensorFixedSize<Scalar, Sizes<> > mean; + mean.device(device) = input.template chip<3>(i).template chip<0>(j).template cast<float>().mean(); + TensorFixedSize<float, Sizes<> > sigma; + sigma.device(device) = + (input.template chip<3>(i).template chip<0>(j).template cast<float>() - mean.reshape(Sizes<1,1>()).broadcast(input_size)).square().mean().sqrt(); + TensorFixedSize<Scalar, Sizes<> > mini; + mini.device(device) = input.template chip<3>(i).template chip<0>(j).minimum(); + TensorFixedSize<float, Sizes<> > maxi; + maxi.device(device) = input.template chip<3>(i).template chip<0>(j).maximum(); + + TensorMap<Tensor<float, 2> > tmp(NULL, glimpse_size); + output.template chip<3>(i).template chip<0>(j).device(device) = + (mean.reshape(Sizes<1,1>()).broadcast(glimpse_size) + + (tmp.random(internal::NormalRandomGenerator<float>()) * sigma.reshape(Sizes<1,1>()).broadcast(glimpse_size)).template cast<Scalar>()).cwiseMin(maxi.reshape(Sizes<1,1>()).broadcast(glimpse_size)).cwiseMax(mini.reshape(Sizes<1,1>()).broadcast(glimpse_size)); + } + } + + // Copy the part of the glimpse that cover the input image if any. + if (glimpse_width == 0 || glimpse_height == 0) { + continue; + } + output.template chip<3>(i).slice(base_offset, slice_extent).device(device) = input.template chip<3>(i).slice(slice_offset, slice_extent); + } else { + output.template chip<3>(i).device(device) = input.template chip<3>(i).slice(slice_offset, slice_extent); + } + } + } + + private: + const Index width_; + const Index height_; + const std::vector<IndexPair<float> > offsets_; + const bool normalized_; + const bool centered_; + const bool uniform_noise_; +}; +} + + +template <typename Input> +EIGEN_ALWAYS_INLINE +static const TensorCustomUnaryOp<const GlimpseExtractionOp<typename internal::traits<Input>::Index>, const Input> +ExtractGlimpses(const Input& input, + const typename internal::traits<Input>::Index width, + const typename internal::traits<Input>::Index height, + const std::vector<IndexPair<float> >& offsets, + const bool normalized = true, const bool centered = true, + const bool uniform_noise = true) +{ + EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 4, YOU_MADE_A_PROGRAMMING_MISTAKE); + + typedef typename internal::traits<Input>::Index Index; + const GlimpseExtractionOp<Index> op(width, height, offsets, normalized, + centered, uniform_noise); + return input.customOp(op); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_NEURAL_NETWORKS_ATTENTION_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardCuboidConvolutions.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardCuboidConvolutions.h new file mode 100644 index 0000000000..12ce23444c --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardCuboidConvolutions.h @@ -0,0 +1,523 @@ +#ifndef EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_CUBOID_CONVOLUTIONS_H +#define EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_CUBOID_CONVOLUTIONS_H + +#include "Patch3d.h" + +namespace Eigen { + +/** CuboidConvolutionBackwardInput + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Computes the backprop for the input of a 3D convolution. 
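+ * In other words, given the gradient of the loss with respect to the output of the forward convolution, this returns the gradient with respect to its input.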
+ * + * The output_backward parameter is expected to be a tensor with a rank of 4 or more (channels, depth, height, width, and optionally others) + * The kernel parameter is expected to be a 5D tensor (filters, channels, kernel_depth, kernel_height, kernel_width) + * output_backward and kernel have to be in the same layout. + * + * The dimensions of the result will be filters, depth, height, width (and others if applicable). + * + * It is possible to swap the order of the depth, width and height dimensions provided that the same order is used in the input, the kernel, and the output. + * + * All dimension orders above are given for col-major, and should be reversed for row-major. + */ + +template <typename OutputBackward, typename Kernel> +EIGEN_ALWAYS_INLINE static const typename internal::conditional< + internal::traits<OutputBackward>::Layout == ColMajor, + TensorReshapingOp< + const DSizes<typename internal::traits<OutputBackward>::Index, + internal::traits<OutputBackward>::NumDimensions>, + const TensorContractionOp< + const array< IndexPair<typename internal::traits<OutputBackward>::Index>, 2>, + const TensorReshapingOp< + const DSizes< typename internal::traits<OutputBackward>::Index, 3>, + const TensorReverseOp<const array<bool, 5>, const Kernel> + >, + const TensorReshapingOp< + const DSizes< typename internal::traits<OutputBackward>::Index, 3>, + const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const OutputBackward> + > + > + >, + TensorReshapingOp< + const DSizes<typename internal::traits<OutputBackward>::Index, + internal::traits<OutputBackward>::NumDimensions>, + const TensorContractionOp< + const array< IndexPair<typename internal::traits<OutputBackward>::Index>, 2>, + const TensorReshapingOp< + const DSizes< typename internal::traits<OutputBackward>::Index, 3>, + const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const OutputBackward> + >, + const TensorReshapingOp< + const DSizes<typename internal::traits<OutputBackward>::Index, 3>, + const TensorReverseOp<const array<bool, 5>, const Kernel> + > + > + > +>::type +CuboidConvolutionBackwardInput( + const Kernel& kernel, const OutputBackward& output_backward, + typename internal::traits<OutputBackward>::Index inputPlanes, + typename internal::traits<OutputBackward>::Index inputRows, + typename internal::traits<OutputBackward>::Index inputCols, + const DenseIndex stridePlanes = 1, const DenseIndex strideRows = 1, + const DenseIndex strideCols = 1) { + typedef typename internal::traits<OutputBackward>::Index TensorIndex; + const TensorRef<const Tensor<typename internal::traits<Kernel>::Scalar, internal::traits<Kernel>::NumDimensions, internal::traits<Kernel>::Layout, TensorIndex> > kern(kernel); + const TensorRef<const Tensor<typename internal::traits<OutputBackward>::Scalar, internal::traits<OutputBackward>::NumDimensions, internal::traits<OutputBackward>::Layout, TensorIndex> > out(output_backward); + + EIGEN_STATIC_ASSERT(internal::traits<Kernel>::Layout == internal::traits<OutputBackward>::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE); + + static const bool isColMajor = (internal::traits<OutputBackward>::Layout == ColMajor); + + static const int NumDims = internal::traits<OutputBackward>::NumDimensions; + + // Number of filters to apply. This is the same as the output depth of the result + const TensorIndex kernelFilters = isColMajor ? kern.dimensions()[0] : kern.dimensions()[4]; + // Number of channels. This is the same as the input depth. + const TensorIndex kernelChannels = isColMajor ? 
kern.dimensions()[1] : kern.dimensions()[3]; + const TensorIndex kernelPlanes = isColMajor ? kern.dimensions()[2] : kern.dimensions()[2]; + const TensorIndex kernelRows = isColMajor ? kern.dimensions()[3] : kern.dimensions()[1]; + const TensorIndex kernelCols = isColMajor ? kern.dimensions()[4] : kern.dimensions()[0]; + + const TensorIndex outputPlanes = isColMajor ? out.dimensions()[1] : out.dimensions()[NumDims - 2]; + const TensorIndex outputRows = isColMajor ? out.dimensions()[2] : out.dimensions()[NumDims - 3]; + const TensorIndex outputCols = isColMajor ? out.dimensions()[3] : out.dimensions()[NumDims - 4]; + + TensorIndex forward_pad_z, forward_pad_y, forward_pad_x; + const TensorIndex size_z = ceil(inputPlanes / static_cast<float>(stridePlanes)); + const TensorIndex size_y = ceil(inputRows / static_cast<float>(strideRows)); + const TensorIndex size_x = ceil(inputCols / static_cast<float>(strideCols)); + + // Infer padding type. + if (size_z == outputPlanes && size_y == outputRows && size_x == outputCols) { + // SAME padding. + const TensorIndex dz = size_z * stridePlanes + kernelPlanes - 1 - inputPlanes; + const TensorIndex dy = size_y * strideRows + kernelRows - 1 - inputRows; + const TensorIndex dx = size_x * strideCols + kernelCols - 1 - inputCols; + + forward_pad_z = dz - dz / 2; + forward_pad_y = dy - dy / 2; + forward_pad_x = dx - dx / 2; + } else { + // VALID padding. + forward_pad_z = 0; + forward_pad_y = 0; + forward_pad_x = 0; + } + const TensorIndex padding_ztop = kernelPlanes - 1 - forward_pad_z; + const TensorIndex padding_top = kernelRows - 1 - forward_pad_y; + const TensorIndex padding_left = kernelCols - 1 - forward_pad_x; + + const TensorIndex padding_zbottom = inputPlanes + kernelPlanes - 1 - (outputPlanes - 1) * stridePlanes - 1 - padding_ztop; + const TensorIndex padding_bottom = inputRows + kernelRows - 1 - (outputRows - 1) * strideRows - 1 - padding_top; + const TensorIndex padding_right = inputCols + kernelCols - 1 - (outputCols - 1) * strideCols - 1 - padding_left; + + eigen_assert(padding_ztop >= 0); + eigen_assert(padding_zbottom >= 0); + eigen_assert(padding_top >= 0); + eigen_assert(padding_left >= 0); + eigen_assert(padding_bottom >= 0); + eigen_assert(padding_right >= 0); + + // The kernel has dimensions filters X channels X patch_planes X patch_rows X patch_cols. + // We need to reverse the kernel along the spatial dimensions. 
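+ // The gradient with respect to the input is a "full"-padded correlation of
+ // output_backward with the spatially flipped kernel; the reverse() below
+ // performs that flip.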
+ array<bool, 5> kernel_reverse; + if (isColMajor) { + kernel_reverse[0] = false; + kernel_reverse[1] = false; + kernel_reverse[2] = true; + kernel_reverse[3] = true; + kernel_reverse[4] = true; + } else { + kernel_reverse[0] = true; + kernel_reverse[1] = true; + kernel_reverse[2] = true; + kernel_reverse[3] = false; + kernel_reverse[4] = false; + } + + DSizes<TensorIndex, 3> kernel_dims; + if (isColMajor) { + kernel_dims[0] = kernelFilters; + kernel_dims[1] = kernelChannels; + kernel_dims[2] = kernelRows * kernelCols * kernelPlanes; + } else { + kernel_dims[0] = kernelRows * kernelCols * kernelPlanes; + kernel_dims[1] = kernelChannels; + kernel_dims[2] = kernelFilters; + } + + // The output_backward has dimensions out_depth X out_planes X out_rows X out_cols X OTHERS + // When we extract the image patches from output_backward, it will have dimensions: + // out_depth X (patch_planes * patch_rows * patch_cols) X (input_planes * input_rows * input_cols * OTHERS) + DSizes<TensorIndex, 3> pre_contract_dims; + if (isColMajor) { + pre_contract_dims[0] = kernelFilters; + pre_contract_dims[1] = kernelRows * kernelCols * kernelPlanes; + pre_contract_dims[2] = inputRows * inputCols * inputPlanes; + for (int i = 4; i < NumDims; ++i) { + pre_contract_dims[2] *= out.dimension(i); + } + } else { + pre_contract_dims[2] = kernelFilters; + pre_contract_dims[1] = kernelRows * kernelCols * kernelPlanes; + pre_contract_dims[0] = inputRows * inputCols * inputPlanes; + for (int i = 0; i < NumDims - 4; ++i) { + pre_contract_dims[0] *= out.dimension(i); + } + } + + // We will contract along dimensions (0, 2) in kernel and (0, 1) in + // output_backward, if this is col-major, and + // dimensions (0, 2) in kernel and (1, 2) in output_backward, if this row-major. + array<IndexPair<TensorIndex>, 2> contract_dims; + if (isColMajor) { + // col-major: kernel.contract(output.patches) + contract_dims[0] = IndexPair<TensorIndex>(0, 0); + contract_dims[1] = IndexPair<TensorIndex>(2, 1); + } else { + // row-major: output.patches.contract(kernel) + contract_dims[0] = IndexPair<TensorIndex>(1, 0); + contract_dims[1] = IndexPair<TensorIndex>(2, 2); + } + + // Post contraction, the dimensions of the input_backprop is + // channels X input_planes X input_rows X input_cols X OTHERS + DSizes<TensorIndex, NumDims> post_contract_dims; + if (isColMajor) { + post_contract_dims[0] = kernelChannels; + post_contract_dims[1] = inputPlanes; + post_contract_dims[2] = inputRows; + post_contract_dims[3] = inputCols; + for (int i = 4; i < NumDims; ++i) { + post_contract_dims[i] = out.dimension(i); + } + } else { + post_contract_dims[NumDims - 1] = kernelChannels; + post_contract_dims[NumDims - 2] = inputPlanes; + post_contract_dims[NumDims - 3] = inputRows; + post_contract_dims[NumDims - 4] = inputCols; + for (int i = 0; i < NumDims - 4; ++i) { + post_contract_dims[i] = out.dimension(i); + } + } + + DSizes<TensorIndex, NumDims> strides; + for (int i = 0; i < NumDims; i++) { + strides[i] = 1; + } + if (isColMajor) { + strides[1] = stridePlanes; + strides[2] = strideRows; + strides[3] = strideCols; + } else { + strides[NumDims - 2] = stridePlanes; + strides[NumDims - 3] = strideRows; + strides[NumDims - 4] = strideCols; + } + + return choose( + Cond<internal::traits<OutputBackward>::Layout == ColMajor>(), + kernel.reverse(kernel_reverse) + .reshape(kernel_dims) + .contract( + output_backward.extract_volume_patches(kernelPlanes, kernelRows, kernelCols, + 1, 1, 1, stridePlanes, strideRows, strideCols, + padding_ztop, padding_zbottom, + padding_top, 
padding_bottom, + padding_left, padding_right) + .reshape(pre_contract_dims), + contract_dims) + .reshape(post_contract_dims), + output_backward.extract_volume_patches(kernelPlanes, kernelRows, kernelCols, + 1, 1, 1, stridePlanes, strideRows, strideCols, + padding_ztop, padding_zbottom, + padding_top, padding_bottom, + padding_left, padding_right) + .reshape(pre_contract_dims) + .contract(kernel.reverse(kernel_reverse).reshape(kernel_dims), + contract_dims) + .reshape(post_contract_dims)); +} + + +/** CuboidConvolutionBackwardKernel + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Computes the backprop for the filter of a 3D convolution. + * + * The output_backward parameter is expected to be a tensor with a rank of 4 or more (channels, depth, height, width, and optionally others) + * The kernel parameter is expected to be a 4D tensor (filters, channels, kernel_depth, kernel_height, kernel_width) + * output_backward and kernel have to be in the same layout. + * + * The dimensions of the result will be filters, depth, height, width (and others if applicable). + * + * It is possible to swap the order of the depth, width and height dimensions provided that the same order is used in the input, the kernel, and the output. + * + * All dimension orders above are given for col-major, and should be reversed for row-major. + */ +template <typename OutputBackward, typename Input> +EIGEN_ALWAYS_INLINE static const typename internal::conditional< + internal::traits<OutputBackward>::Layout == ColMajor, + const TensorShufflingOp< + const array<typename internal::traits<OutputBackward>::Index, 5>, + const TensorReverseOp< + const array<bool, 5>, + const TensorReshapingOp< + const DSizes<typename internal::traits<OutputBackward>::Index, 5>, + const TensorContractionOp< + const array< IndexPair<typename internal::traits<Input>::Index>, 2>, + const TensorReshapingOp< + const DSizes<typename internal::traits<Input>::Index, 3>, + const Input>, + const TensorReshapingOp< + const DSizes< typename internal::traits<OutputBackward>::Index, 4>, + const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const OutputBackward> + > + > + > + > + >, + const TensorShufflingOp< + const array<typename internal::traits<OutputBackward>::Index, 5>, + const TensorReverseOp< + const array<bool, 5>, + const TensorReshapingOp< + const DSizes<typename internal::traits<OutputBackward>::Index, 5>, + const TensorContractionOp< + const array< IndexPair<typename internal::traits<Input>::Index>, 2>, + const TensorReshapingOp< + const DSizes< typename internal::traits<OutputBackward>::Index, 4>, + const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const OutputBackward> + >, + const TensorReshapingOp< + const DSizes<typename internal::traits<Input>::Index, 3>, + const Input + > + > + > + > + > +>::type +CuboidConvolutionBackwardKernel( + const Input& input, const OutputBackward& output_backward, + typename internal::traits<Input>::Index kernelPlanes, + typename internal::traits<Input>::Index kernelRows, + typename internal::traits<Input>::Index kernelCols, + const DenseIndex stridePlanes = 1, + const DenseIndex strideRows = 1, + const DenseIndex strideCols = 1) { + typedef typename internal::traits<Input>::Index TensorIndex; + TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input); + TensorRef<Tensor<typename internal::traits<OutputBackward>::Scalar, internal::traits<OutputBackward>::NumDimensions, 
internal::traits<OutputBackward>::Layout, TensorIndex> > out(output_backward); + + EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == internal::traits<OutputBackward>::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE); + + static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor); + + static const int NumDims = internal::traits<Input>::NumDimensions; + EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == internal::traits<OutputBackward>::NumDimensions, YOU_MADE_A_PROGRAMMING_MISTAKE); + + const TensorIndex inputPlanes = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2); + const TensorIndex inputRows = isColMajor ? in.dimension(2) : in.dimension(NumDims - 3); + const TensorIndex inputCols = isColMajor ? in.dimension(3) : in.dimension(NumDims - 4); + + const TensorIndex outputPlanes = isColMajor ? out.dimension(1) : out.dimension(NumDims - 2); + const TensorIndex outputRows = isColMajor ? out.dimension(2) : out.dimension(NumDims - 3); + const TensorIndex outputCols = isColMajor ? out.dimension(3) : out.dimension(NumDims - 4); + + const TensorIndex kernelFilters = isColMajor ? out.dimension(0) : out.dimension(NumDims - 1); + const TensorIndex kernelChannels = isColMajor ? in.dimension(0) : in.dimension(NumDims - 1); + + TensorIndex forward_pad_z, forward_pad_y, forward_pad_x; + const TensorIndex size_z = ceil(inputPlanes / static_cast<float>(stridePlanes)); + const TensorIndex size_y = ceil(inputRows / static_cast<float>(strideRows)); + const TensorIndex size_x = ceil(inputCols / static_cast<float>(strideCols)); + + // Infer padding type. + if (size_z == outputPlanes && size_y == outputRows && size_x == outputCols) { + // SAME padding. + const TensorIndex dz = size_z * stridePlanes + kernelPlanes - 1 - inputPlanes; + const TensorIndex dy = size_y * strideRows + kernelRows - 1 - inputRows; + const TensorIndex dx = size_x * strideCols + kernelCols - 1 - inputCols; + + forward_pad_z = dz - dz / 2; + forward_pad_y = dy - dy / 2; + forward_pad_x = dx - dx / 2; + } else { + // VALID padding. 
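+ // Since the forward pass used no padding here, the backward pass has to
+ // recreate the full (kernel - 1) border on each side (computed below) when
+ // extracting patches from output_backward.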
+ forward_pad_z = 0; + forward_pad_y = 0; + forward_pad_x = 0; + } + + const TensorIndex padding_ztop = kernelPlanes - 1 - forward_pad_z; + const TensorIndex padding_top = kernelRows - 1 - forward_pad_y; + const TensorIndex padding_left = kernelCols - 1 - forward_pad_x; + + const TensorIndex padding_zbottom = inputPlanes + kernelPlanes - 1 - (outputPlanes - 1) * stridePlanes - 1 - padding_ztop; + const TensorIndex padding_bottom = inputRows + kernelRows - 1 - (outputRows - 1) * strideRows - 1 - padding_top; + const TensorIndex padding_right = inputCols + kernelCols - 1 - (outputCols - 1) * strideCols - 1 - padding_left; + + eigen_assert(padding_ztop >= 0); + eigen_assert(padding_zbottom >= 0); + eigen_assert(padding_top >= 0); + eigen_assert(padding_left >= 0); + eigen_assert(padding_bottom >= 0); + eigen_assert(padding_right >= 0); + + // The output_backward has dimensions out_depth X out_plaens X out_rows X out_cols X OTHERS + // When we extract the image patches from output_backward (with input as the + // kernel), it will have dimensions + // (out_depth) X (input_planes * input_rows * input_cols) X (kernel_planes * kernel_rows * kernel_cols) X OTHERS + DSizes<TensorIndex, 4> pre_contract_dims; + if (isColMajor) { + pre_contract_dims[0] = kernelFilters; + pre_contract_dims[1] = inputRows * inputCols * inputPlanes; + pre_contract_dims[2] = kernelRows * kernelCols * kernelPlanes; + pre_contract_dims[3] = 1; + for (int i = 4; i < NumDims; ++i) { + pre_contract_dims[3] *= out.dimension(i); + } + } else { + pre_contract_dims[3] = kernelFilters; + pre_contract_dims[2] = inputRows * inputCols * inputPlanes; + pre_contract_dims[1] = kernelRows * kernelCols * kernelPlanes; + pre_contract_dims[0] = 1; + for (int i = 0; i < NumDims - 4; ++i) { + pre_contract_dims[0] *= out.dimension(i); + } + } + + // The input has dimensions in_depth X (input_planes * input_rows * input_cols) X OTHERS + DSizes<TensorIndex, 3> input_dims; + if (isColMajor) { + input_dims[0] = kernelChannels; + input_dims[1] = inputRows * inputCols * inputPlanes; + input_dims[2] = 1; + for (int i = 4; i < NumDims; ++i) { + input_dims[2] *= in.dimension(i); + } + eigen_assert(input_dims[2] == pre_contract_dims[3]); + } else { + input_dims[2] = kernelChannels; + input_dims[1] = inputRows * inputCols * inputPlanes; + input_dims[0] = 1; + for (int i = 0; i < NumDims - 4; ++i) { + input_dims[0] *= in.dimension(i); + } + eigen_assert(input_dims[0] == pre_contract_dims[0]); + } + + // We will contract along dimensions (1, 2) in in and (1, 3) in out, if + // this is col-major. + // For row-major, it's dimensions (0, 1) in in and (0, 2) in out. + array<IndexPair<TensorIndex>, 2> contract_dims; + if (isColMajor) { + // col-major: in.contract(output.patches) + contract_dims[0] = IndexPair<TensorIndex>(1, 1); + contract_dims[1] = IndexPair<TensorIndex>(2, 3); + } else { + // row-major: output.patches.contract(in) + contract_dims[0] = IndexPair<TensorIndex>(0, 0); + contract_dims[1] = IndexPair<TensorIndex>(2, 1); + } + + // After the contraction, the kernel will have dimension + // in_depth X out_depth X kernel_patches X kernel_rows X kernel_cols + // We will need to shuffle the first two dimensions and reverse the spatial dimensions. + // The end shape is: + // out_depth X in_shape X kernel_planes X kernel_rows X kernel_cols + + // This is the shape of the kernel *before* the shuffling. 
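+ // In col-major the contraction output is indexed (channels, filters, ...),
+ // hence the swap of the first two dimensions in kernel_shuffle below.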
+ DSizes<TensorIndex, 5> kernel_dims; + if (isColMajor) { + kernel_dims[0] = kernelChannels; + kernel_dims[1] = kernelFilters; + kernel_dims[2] = kernelPlanes; + kernel_dims[3] = kernelRows; + kernel_dims[4] = kernelCols; + } else { + kernel_dims[0] = kernelCols; + kernel_dims[1] = kernelRows; + kernel_dims[2] = kernelPlanes; + kernel_dims[3] = kernelFilters; + kernel_dims[4] = kernelChannels; + } + + // Flip filters and channels. + array<TensorIndex, 5> kernel_shuffle; + if (isColMajor) { + kernel_shuffle[0] = 1; + kernel_shuffle[1] = 0; + kernel_shuffle[2] = 2; + kernel_shuffle[3] = 3; + kernel_shuffle[4] = 4; + } else { + kernel_shuffle[0] = 0; + kernel_shuffle[1] = 1; + kernel_shuffle[2] = 2; + kernel_shuffle[3] = 4; + kernel_shuffle[4] = 3; + } + + // Reverse the spatial dimensions. + array<bool, 5> kernel_reverse; + if (isColMajor) { + kernel_reverse[0] = false; + kernel_reverse[1] = false; + kernel_reverse[2] = true; + kernel_reverse[3] = true; + kernel_reverse[4] = true; + } else { + kernel_reverse[0] = true; + kernel_reverse[1] = true; + kernel_reverse[2] = true; + kernel_reverse[3] = false; + kernel_reverse[4] = false; + } + + DSizes<TensorIndex, NumDims> strides; + for (int i = 0; i < NumDims; i++) { + strides[i] = 1; + } + if (isColMajor) { + strides[1] = stridePlanes; + strides[2] = strideRows; + strides[3] = strideCols; + } else { + strides[NumDims - 2] = stridePlanes; + strides[NumDims - 3] = strideRows; + strides[NumDims - 4] = strideCols; + } + return choose( + Cond<internal::traits<Input>::Layout == ColMajor>(), + input.reshape(input_dims) + .contract( + output_backward.extract_volume_patches( + inputPlanes, inputRows, inputCols, 1, + 1, 1, stridePlanes, strideRows, strideCols, + + padding_ztop, padding_zbottom, padding_top, + padding_bottom, padding_left, padding_right) + .reshape(pre_contract_dims), + contract_dims) + .reshape(kernel_dims) + .reverse(kernel_reverse) + .shuffle(kernel_shuffle), + output_backward.extract_volume_patches( + inputPlanes, inputRows, inputCols, 1, 1, 1, + stridePlanes, strideRows, strideCols, padding_ztop, + padding_zbottom, padding_top, padding_bottom, + padding_left, padding_right) + .reshape(pre_contract_dims) + .contract(input.reshape(input_dims), contract_dims) + .reshape(kernel_dims) + .reverse(kernel_reverse) + .shuffle(kernel_shuffle)); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_CUBOID_CONVOLUTIONS_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardSpatialConvolutions.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardSpatialConvolutions.h new file mode 100644 index 0000000000..188dc75bf6 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardSpatialConvolutions.h @@ -0,0 +1,351 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Ke Yang <yangke@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_SPATIAL_CONVOLUTIONS_H +#define EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_SPATIAL_CONVOLUTIONS_H + +namespace Eigen { + +/** SpatialConvolutionBackwardInput + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Computes the backprop for the input of a 2D convolution. 
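+ * Equivalently: given the gradient with respect to the output of a forward 2D convolution, this returns the gradient with respect to its input, computed as a convolution of output_backward with the spatially reversed kernel.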
+ * + * The output_backward parameter is expected to be a tensor with a rank of 3 or more (channels, height, width, and optionally others) + * The kernel parameter is expected to be a 4D tensor (filters, channels, kernel_height, kernel_width) + * The output_backward and the kernel must both be in col-major layout. The result will also be in col-major layout. + * + * If in_stride > 1, then applies convolution with holes (aka atrous convolution), sampling every in_stride input pixels. + * + * The result can be assigned to a tensor of rank equal to the rank of the output_backward. The dimensions of the result will be filters, height, width (and others if applicable). + * + * It is possible to swap the order of the width and height dimensions provided that the same order is used in the input, the kernel, and the output. + * + */ + +template <typename OutputBackward, typename Kernel> +EIGEN_ALWAYS_INLINE +static const typename internal::conditional< + internal::traits<OutputBackward>::Layout == ColMajor, + TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, internal::traits<OutputBackward>::NumDimensions>, const TensorContractionOp<const array<IndexPair<typename internal::traits<OutputBackward>::Index>, 2>, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 3>, const TensorReverseOp<const array<bool, 4>, const Kernel> >, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 3>, const TensorImagePatchOp<Dynamic, Dynamic, const OutputBackward> > > >, + TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, internal::traits<OutputBackward>::NumDimensions>, const TensorContractionOp<const array<IndexPair<typename internal::traits<OutputBackward>::Index>, 2>, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 3>, const TensorImagePatchOp<Dynamic, Dynamic, const OutputBackward> >, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 3>, const TensorReverseOp<const array<bool, 4>, const Kernel> > > > >::type +SpatialConvolutionBackwardInput(const Kernel& kernel, const OutputBackward& output_backward, typename internal::traits<OutputBackward>::Index inputRows, typename internal::traits<OutputBackward>::Index inputCols, const DenseIndex stride = 1, const DenseIndex in_stride = 1) { + + typedef typename internal::traits<OutputBackward>::Index TensorIndex; + TensorRef<Tensor<typename internal::traits<Kernel>::Scalar, internal::traits<Kernel>::NumDimensions, internal::traits<Kernel>::Layout, TensorIndex> > kern(kernel); + TensorRef<Tensor<typename internal::traits<OutputBackward>::Scalar, internal::traits<OutputBackward>::NumDimensions, internal::traits<OutputBackward>::Layout, TensorIndex> > out(output_backward); + + EIGEN_STATIC_ASSERT(internal::traits<Kernel>::Layout == internal::traits<OutputBackward>::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE); + + static const bool isColMajor = (internal::traits<OutputBackward>::Layout == ColMajor); + + static const int NumDims = internal::traits<OutputBackward>::NumDimensions; + + // Number of filters to apply. This is the same as the output depth of the result + const TensorIndex kernelFilters = isColMajor ? kern.dimensions()[0] : kern.dimensions()[3]; + // Number of channels. This is the same as the input depth. + const TensorIndex kernelChannels = isColMajor ? kern.dimensions()[1] : kern.dimensions()[2]; + const TensorIndex kernelRows = isColMajor ? 
kern.dimensions()[2] : kern.dimensions()[1]; + const TensorIndex kernelCols = isColMajor ? kern.dimensions()[3] : kern.dimensions()[0]; + + // This is the effective kernel size, taking into account the (in_stride - 1) zero-values + // inserted between consecutive kernel elements in atrous convolution + const TensorIndex kernelRowsEff = kernelRows + (kernelRows - 1) * (in_stride - 1); + const TensorIndex kernelColsEff = kernelCols + (kernelCols - 1) * (in_stride - 1); + + const TensorIndex outputRows = isColMajor ? output_backward.dimension(1) : output_backward.dimension(NumDims - 2); + const TensorIndex outputCols = isColMajor ? output_backward.dimension(2) : output_backward.dimension(NumDims - 3); + + // Computing the forward padding + const TensorIndex forward_pad_top = ((outputRows - 1) * stride + kernelRowsEff - inputRows) / 2; + const TensorIndex forward_pad_left = ((outputCols - 1) * stride + kernelColsEff - inputCols) / 2; + + const TensorIndex padding_top = kernelRowsEff - 1 - forward_pad_top; + const TensorIndex padding_left = kernelColsEff - 1 - forward_pad_left; + const TensorIndex padding_bottom = inputRows + kernelRowsEff - 1 - (outputRows - 1) * stride - 1 - padding_top; + const TensorIndex padding_right = inputCols + kernelColsEff - 1 - (outputCols - 1) * stride - 1 - padding_left; + + eigen_assert(padding_top >= 0); + eigen_assert(padding_left >= 0); + eigen_assert(padding_bottom >= 0); + eigen_assert(padding_right >= 0); + + // The kernel has dimensions filters X channels X patch_rows X patch_cols + // We need to reverse the kernel along dimensions corresponding to rows and + // cols. + // TODO(yangke): we can make things slightly faster by collapsing the dimensions + // where we don't reverse. Try that once we have a faster compiler. + array<bool, 4> kernel_reverse; + if (isColMajor) { + kernel_reverse[0] = false; + kernel_reverse[1] = false; + kernel_reverse[2] = true; + kernel_reverse[3] = true; + } else { + kernel_reverse[0] = true; + kernel_reverse[1] = true; + kernel_reverse[2] = false; + kernel_reverse[3] = false; + } + + DSizes<TensorIndex, 3> kernel_dims; + if (isColMajor) { + kernel_dims[0] = kernelFilters; + kernel_dims[1] = kernelChannels; + kernel_dims[2] = kernelRows * kernelCols; + } else { + kernel_dims[0] = kernelRows * kernelCols; + kernel_dims[1] = kernelChannels; + kernel_dims[2] = kernelFilters; + } + + // The output_backward has dimensions out_depth X out_rows X out_cols X OTHERS + // When we extract the image patches from output_backward, it will have dimensions + // out_depth X (patch_rows * patch_cols) X (input_rows * input_cols * OTHERS) + DSizes<TensorIndex, 3> pre_contract_dims; + if (isColMajor) { + pre_contract_dims[0] = kernelFilters; + pre_contract_dims[1] = kernelRows * kernelCols; + pre_contract_dims[2] = inputRows * inputCols; + for (int i = 3; i < NumDims; ++i) { + pre_contract_dims[2] *= out.dimension(i); + } + } else { + pre_contract_dims[2] = kernelFilters; + pre_contract_dims[1] = kernelRows * kernelCols; + pre_contract_dims[0] = inputRows * inputCols; + for (int i = 0; i < NumDims - 3; ++i) { + pre_contract_dims[0] *= out.dimension(i); + } + } + + // We will contract along dimensions (0, 2) in kernel and (0, 1) in + // output_backward, if this is col-major, and + // dimensions (0, 2) in kernel and (1, 2) in output_backward, if this row-major. 
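+ // In col-major terms the contraction below computes
+ //   input_backward(c, i) = sum_{f, p} reversed_kernel(f, c, p) * output_patches(f, p, i)
+ // where f indexes filters, p the kernel patch elements, c the input channels
+ // and i the input pixels (times any remaining dimensions).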
+ array<IndexPair<TensorIndex>, 2> contract_dims; + if (isColMajor) { + // col-major: kernel.contract(output.patches) + contract_dims[0] = IndexPair<TensorIndex>(0, 0); + contract_dims[1] = IndexPair<TensorIndex>(2, 1); + } else { + // row-major: output.patches.contract(kernel) + contract_dims[0] = IndexPair<TensorIndex>(1, 0); + contract_dims[1] = IndexPair<TensorIndex>(2, 2); + } + + // Post contraction, the dimensions of the input_backprop is + // channels X input_rows X input_cols X OTHERS + DSizes<TensorIndex, NumDims> post_contract_dims; + if (isColMajor) { + post_contract_dims[0] = kernelChannels; + post_contract_dims[1] = inputRows; + post_contract_dims[2] = inputCols; + for (int i = 3; i < NumDims; ++i) { + post_contract_dims[i] = out.dimension(i); + } + } else { + post_contract_dims[NumDims - 1] = kernelChannels; + post_contract_dims[NumDims - 2] = inputRows; + post_contract_dims[NumDims - 3] = inputCols; + for (int i = 0; i < NumDims - 3; ++i) { + post_contract_dims[i] = out.dimension(i); + } + } + + return choose(Cond<internal::traits<OutputBackward>::Layout == ColMajor>(), + kernel.reverse(kernel_reverse).reshape(kernel_dims).contract(output_backward.extract_image_patches(kernelRows, kernelCols, 1, 1, in_stride, in_stride, stride, stride, padding_top, padding_bottom, padding_left, padding_right, 0).reshape(pre_contract_dims), contract_dims).reshape(post_contract_dims), + output_backward.extract_image_patches(kernelRows, kernelCols, 1, 1, in_stride, in_stride, stride, stride, padding_top, padding_bottom, padding_left, padding_right, 0).reshape(pre_contract_dims).contract(kernel.reverse(kernel_reverse).reshape(kernel_dims), contract_dims).reshape(post_contract_dims)); +} + + +/** SpatialConvolutionBackwardKernel + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Computes the backprop for the filter of a 2D convolution. + * + * The output_backward parameter is expected to be a tensor with a rank of 3 or more (channels, height, width, and optionally others) + * The kernel parameter is expected to be a 4D tensor (filters, channels, kernel_height, kernel_width) + * The output_backward and the kernel must both be in col-major layout. The result will also be in col-major layout. + * + * If in_stride > 1, then applies convolution with holes (aka atrous convolution), sampling every in_stride input pixels. + * + * The result can be assigned to a tensor of rank equal to the rank of the output_backward. The dimensions of the result will be filters, height, width (and others if applicable). + * + * It is possible to swap the order of the width and height dimensions provided that the same order is used in the input, the kernel, and the output. + * + */ +// TODO(gpapan): Resolve a bug in TensorContractionInputMapper at SpatialConvolutions.h that yangke circumvented by using .reshape().reshape(). +// This can significantly accelerate SpatialConvolutionBackwardKernel. 
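+// A hedged usage sketch (the shapes below are illustrative assumptions, not
+// taken from this file; it assumes the NeuralNetworks module header is
+// included and a forward SAME convolution with a 5x5 kernel at stride 1):
+//
+//   Eigen::Tensor<float, 4> input(3, 32, 32, 8);            // (channels, rows, cols, batch)
+//   Eigen::Tensor<float, 4> output_backward(16, 32, 32, 8); // (filters, rows, cols, batch)
+//   input.setRandom(); output_backward.setRandom();
+//   Eigen::Tensor<float, 4> kernel_grad =
+//       Eigen::SpatialConvolutionBackwardKernel(input, output_backward, 5, 5);
+//   // kernel_grad has shape (16, 3, 5, 5): (filters, channels, rows, cols).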
+ +template <typename OutputBackward, typename Input> +EIGEN_ALWAYS_INLINE +static const typename internal::conditional< + internal::traits<OutputBackward>::Layout == ColMajor, + const TensorShufflingOp<const array<typename internal::traits<OutputBackward>::Index, 4>, const TensorReverseOp<const array<bool, 4>, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 4>, const TensorContractionOp<const array<IndexPair<typename internal::traits<Input>::Index>, 2>, const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 3>, const Input>, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 4>, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 4>, const TensorImagePatchOp<Dynamic, Dynamic, const OutputBackward> > > > > > >, + const TensorShufflingOp<const array<typename internal::traits<OutputBackward>::Index, 4>, const TensorReverseOp<const array<bool, 4>, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 4>, const TensorContractionOp<const array<IndexPair<typename internal::traits<Input>::Index>, 2>, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 4>, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 4>, const TensorImagePatchOp<Dynamic, Dynamic, const OutputBackward> > >, const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 3>, const Input> > > > > >::type +SpatialConvolutionBackwardKernel(const Input& input, const OutputBackward& output_backward, typename internal::traits<Input>::Index kernelRows, typename internal::traits<Input>::Index kernelCols, const DenseIndex stride = 1, const DenseIndex in_stride = 1) { + + typedef typename internal::traits<Input>::Index TensorIndex; + TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input); + TensorRef<Tensor<typename internal::traits<OutputBackward>::Scalar, internal::traits<OutputBackward>::NumDimensions, internal::traits<OutputBackward>::Layout, TensorIndex> > out(output_backward); + + EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == internal::traits<OutputBackward>::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE); + + // stride and in_stride cannot both be larger than 1 + eigen_assert(!(stride > 1 && in_stride > 1)); + + static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor); + + static const int NumDims = internal::traits<Input>::NumDimensions; + EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == internal::traits<OutputBackward>::NumDimensions, YOU_MADE_A_PROGRAMMING_MISTAKE); + + const TensorIndex inputRows = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2); + const TensorIndex inputCols = isColMajor ? in.dimension(2) : in.dimension(NumDims - 3); + + const TensorIndex outputRows = isColMajor ? output_backward.dimension(1) : output_backward.dimension(NumDims - 2); + const TensorIndex outputCols = isColMajor ? output_backward.dimension(2) : output_backward.dimension(NumDims - 3); + + // Number of filters to apply. This is the same as the output depth of the result + const TensorIndex kernelFilters = isColMajor ? out.dimensions()[0] : out.dimensions()[NumDims - 1]; + + // Number of channels. This is the same as the input depth. + const TensorIndex kernelChannels = isColMajor ? 
in.dimensions()[0] : in.dimensions()[NumDims - 1]; + + // This is the effective kernel size, taking into account the (in_stride - 1) zero-values + // inserted between consecutive kernel elements in atrous convolution + const TensorIndex kernelRowsEff = kernelRows + (kernelRows - 1) * (in_stride - 1); + const TensorIndex kernelColsEff = kernelCols + (kernelCols - 1) * (in_stride - 1); + + // Computing the forward padding + const TensorIndex forward_pad_top = ((outputRows - 1) * stride + kernelRowsEff - inputRows) / 2; + const TensorIndex forward_pad_left = ((outputCols - 1) * stride + kernelColsEff - inputCols) / 2; + + // TODO: factor out the padding computation. + const TensorIndex padding_top = kernelRowsEff - 1 - forward_pad_top; + const TensorIndex padding_left = kernelColsEff - 1 - forward_pad_left; + const TensorIndex padding_bottom = inputRows + kernelRowsEff - 1 - (outputRows - 1) * stride - 1 - padding_top; + const TensorIndex padding_right = inputCols + kernelColsEff - 1 - (outputCols - 1) * stride - 1 - padding_left; + + eigen_assert(padding_top >= 0); + eigen_assert(padding_left >= 0); + eigen_assert(padding_bottom >= 0); + eigen_assert(padding_right >= 0); + + // The output_backward has dimensions out_depth X out_rows X out_cols X OTHERS + // When we extract the image patches from output_backward (with input as the + // kernel), it will have dimensions + // (out_depth) X (input_rows * input_cols) X (kernel_rows * kernel_cols) X OTHERS + DSizes<TensorIndex, 4> pre_contract_dims; + if (isColMajor) { + pre_contract_dims[0] = kernelFilters; + pre_contract_dims[1] = inputRows * inputCols; + pre_contract_dims[2] = kernelRows * kernelCols; + pre_contract_dims[3] = 1; + for (int i = 3; i < NumDims; ++i) { + pre_contract_dims[3] *= out.dimension(i); + } + } else { + pre_contract_dims[3] = kernelFilters; + pre_contract_dims[2] = inputRows * inputCols; + pre_contract_dims[1] = kernelRows * kernelCols; + pre_contract_dims[0] = 1; + for (int i = 0; i < NumDims - 3; ++i) { + pre_contract_dims[0] *= out.dimension(i); + } + } + + // The input has dimensions in_depth X (input_rows * input_cols) X OTHERS + DSizes<TensorIndex, 3> input_dims; + if (isColMajor) { + input_dims[0] = kernelChannels; + input_dims[1] = inputRows * inputCols; + input_dims[2] = 1; + for (int i = 3; i < NumDims; ++i) { + input_dims[2] *= in.dimension(i); + } + eigen_assert(input_dims[2] == pre_contract_dims[3]); + } else { + input_dims[2] = kernelChannels; + input_dims[1] = inputRows * inputCols; + input_dims[0] = 1; + for (int i = 0; i < NumDims - 3; ++i) { + input_dims[0] *= in.dimension(i); + } + eigen_assert(input_dims[0] == pre_contract_dims[0]); + } + + // We will contract along dimensions (1, 2) in in and (1, 3) in out, if + // this is col-major. + // For row-major, it's dimensions (0, 1) in in and (0, 2) in out. + array<IndexPair<TensorIndex>, 2> contract_dims; + if (isColMajor) { + // col-major: in.contract(output.patches) + contract_dims[0] = IndexPair<TensorIndex>(1, 1); + contract_dims[1] = IndexPair<TensorIndex>(2, 3); + } else { + // row-major: output.patches.contract(in) + contract_dims[0] = IndexPair<TensorIndex>(0, 0); + contract_dims[1] = IndexPair<TensorIndex>(2, 1); + } + + // After the contraction, the kernel will have dimension + // in_depth X out_depth X kernel_rows X kernel_cols + // We will need to shuffle the first two dimensions and reverse the latter + // two dimensions. 
+ // The end shape is + // out_depth X in_shape X kernel_rows X kernel_cols + + // This is the shape of the kernel *before* the shuffling. + DSizes<TensorIndex, 4> kernel_dims; + if (isColMajor) { + kernel_dims[0] = kernelChannels; + kernel_dims[1] = kernelFilters; + kernel_dims[2] = kernelRows; + kernel_dims[3] = kernelCols; + } else { + kernel_dims[0] = kernelCols; + kernel_dims[1] = kernelRows; + kernel_dims[2] = kernelFilters; + kernel_dims[3] = kernelChannels; + } + + array<TensorIndex, 4> kernel_shuffle; + if (isColMajor) { + kernel_shuffle[0] = 1; + kernel_shuffle[1] = 0; + kernel_shuffle[2] = 2; + kernel_shuffle[3] = 3; + } else { + kernel_shuffle[0] = 0; + kernel_shuffle[1] = 1; + kernel_shuffle[2] = 3; + kernel_shuffle[3] = 2; + } + + array<bool, 4> kernel_reverse; + if (isColMajor) { + kernel_reverse[0] = false; + kernel_reverse[1] = false; + kernel_reverse[2] = true; + kernel_reverse[3] = true; + } else { + kernel_reverse[0] = true; + kernel_reverse[1] = true; + kernel_reverse[2] = false; + kernel_reverse[3] = false; + } + + return choose(Cond<internal::traits<Input>::Layout == ColMajor>(), + input.reshape(input_dims).contract(output_backward.extract_image_patches(inputRows, inputCols, in_stride, in_stride, 1, 1, stride, stride, padding_top, padding_bottom, padding_left, padding_right, 0).reshape(pre_contract_dims).reshape(pre_contract_dims), contract_dims).reshape(kernel_dims).reverse(kernel_reverse).shuffle(kernel_shuffle), + output_backward.extract_image_patches(inputRows, inputCols, in_stride, in_stride, 1, 1, stride, stride, padding_top, padding_bottom, padding_left, padding_right, 0).reshape(pre_contract_dims).reshape(pre_contract_dims).contract(input.reshape(input_dims), contract_dims).reshape(kernel_dims).reverse(kernel_reverse).shuffle(kernel_shuffle)); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_SPATIAL_CONVOLUTIONS_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/CuboidConvolution.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/CuboidConvolution.h new file mode 100644 index 0000000000..dfb9dcedba --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/CuboidConvolution.h @@ -0,0 +1,179 @@ +#ifndef EIGEN_CXX11_SRC_NEURAL_NETWORKS_CUBOID_CONVOLUTION_H +#define EIGEN_CXX11_SRC_NEURAL_NETWORKS_CUBOID_CONVOLUTION_H + +#include "Patch3d.h" + +namespace Eigen { + +/** CuboidConvolution + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies a 3D convolution over a multichannel input voxel block. + * + * The input parameter is expected to be a tensor with a rank of 4 or more (channels, depth, height, width, and optionally others). + * The kernel parameter is expected to be a 5D tensor (filters, channels, kernel_depth, kernel_height, kernel_width). + * The result can be assigned to a tensor of rank equal to the rank of the input. The dimensions of the result will be filters, depth, height, width (and others if applicable). + * + * The input and kernel have to be in the same layout, and both row-major and + * col-major are supported. The shapes given above are for col-major layout. + * For row-major, all dimensions should be reversed. + * + * It is possible to swap the order of the depth, width, and height dimensions provided that the same order is used in the input, the kernel, and the output. 
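+ *
+ * A minimal usage sketch (the shapes are illustrative assumptions, not taken
+ * from this file):
+ *
+ * \code
+ * Eigen::Tensor<float, 5> input(3, 10, 20, 30, 7); // (channels, depth, height, width, batch)
+ * Eigen::Tensor<float, 5> kernel(8, 3, 3, 3, 3);   // (filters, channels, kernel_depth, kernel_height, kernel_width)
+ * input.setRandom(); kernel.setRandom();
+ * Eigen::Tensor<float, 5> output = Eigen::CuboidConvolution(input, kernel);
+ * // With the default stride of 1 and PADDING_SAME, output has shape (8, 10, 20, 30, 7).
+ * \endcode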
+ */ +template <typename Input, typename Kernel> +EIGEN_ALWAYS_INLINE +static const typename internal::conditional < + internal::traits<Input>::Layout == ColMajor, + TensorReshapingOp< + const DSizes<typename internal::traits<Input>::Index, + internal::traits<Input>::NumDimensions>, + const TensorContractionOp< + const array<IndexPair<typename internal::traits<Input>::Index>, 1>, + const TensorReshapingOp< + const DSizes<typename internal::traits<Input>::Index, 2>, + const Kernel>, + const TensorReshapingOp< + const DSizes<typename internal::traits<Input>::Index, 2>, + const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, + const Input> > > >, + TensorReshapingOp< + const DSizes<typename internal::traits<Input>::Index, + internal::traits<Input>::NumDimensions>, + const TensorContractionOp< + const array<IndexPair<typename internal::traits<Input>::Index>, 1>, + const TensorReshapingOp< + const DSizes<typename internal::traits<Input>::Index, 2>, + const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, + const Input> > , + const TensorReshapingOp< + const DSizes<typename internal::traits<Input>::Index, 2>, + const Kernel> > > >::type +CuboidConvolution(const Input& input, const Kernel& kernel, + const DenseIndex stridePlanes = 1, + const DenseIndex strideRows = 1, + const DenseIndex strideCols = 1, + const PaddingType padding_type = PADDING_SAME) { + typedef typename internal::traits<Input>::Index TensorIndex; + TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input); + TensorRef<Tensor<typename internal::traits<Kernel>::Scalar, internal::traits<Kernel>::NumDimensions, internal::traits<Kernel>::Layout, TensorIndex> > kern(kernel); + + EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == internal::traits<Kernel>::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE); + static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor); + static const int NumDims = internal::traits<Input>::NumDimensions; + + // Number of filters to apply. This is the same as the output depth of the result. + const TensorIndex kernelFilters = isColMajor ? kern.dimensions()[0] : kern.dimensions()[4]; + const TensorIndex kernelChannels = isColMajor ? kern.dimensions()[1] : kern.dimensions()[3]; + + // Spatial size of the kernel. + const TensorIndex kernelDepth = isColMajor ? kern.dimensions()[2] : kern.dimensions()[2]; + const TensorIndex kernelRows = isColMajor ? kern.dimensions()[3] : kern.dimensions()[1]; + const TensorIndex kernelCols = isColMajor ? kern.dimensions()[4] : kern.dimensions()[0]; + + if (isColMajor) { + eigen_assert(kernelChannels == in.dimension(0)); + } else { + eigen_assert(kernelChannels == in.dimension(NumDims - 1)); + } + + const TensorIndex inputPlanes = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2); + const TensorIndex inputRows = isColMajor ? in.dimension(2) : in.dimension(NumDims - 3); + const TensorIndex inputCols = isColMajor ? 
in.dimension(3) : in.dimension(NumDims - 4); + + const float stride_planes_f = static_cast<float>(stridePlanes); + const float stride_rows_f = static_cast<float>(strideRows); + const float stride_cols_f = static_cast<float>(strideCols); + TensorIndex out_depth; + TensorIndex out_height; + TensorIndex out_width; + switch (padding_type) { + case PADDING_VALID: + out_depth = ceil((inputPlanes - kernelDepth + 1.f) / stride_planes_f); + out_height = ceil((inputRows - kernelRows + 1.f) / stride_rows_f); + out_width = ceil((inputCols - kernelCols + 1.f) / stride_cols_f); + break; + case PADDING_SAME: + out_depth = ceil(inputPlanes / stride_planes_f); + out_height = ceil(inputRows / stride_rows_f); + out_width = ceil(inputCols / stride_cols_f); + break; + default: + eigen_assert(false && "unexpected padding"); + } + + DSizes<TensorIndex, 2> kernel_dims; + if (isColMajor) { + kernel_dims[0] = kernelFilters; + kernel_dims[1] = kernelChannels * kernelDepth * kernelRows * kernelCols; + } else { + kernel_dims[0] = kernelChannels * kernelDepth * kernelRows * kernelCols; + kernel_dims[1] = kernelFilters; + } + + // Molds the output of the patch extraction result into a 2D tensor: + // - the first dimension (dims[0]): the patch values to be multiplied with the kernels + // - the second dimension (dims[1]): everything else + DSizes<TensorIndex, 2> pre_contract_dims; + if (isColMajor) { + pre_contract_dims[0] = kernelChannels * kernelDepth * kernelRows * kernelCols; + pre_contract_dims[1] = out_depth * out_height * out_width; + for (int i = 4; i < NumDims; ++i) { + pre_contract_dims[1] *= in.dimension(i); + } + } else { + pre_contract_dims[1] = kernelChannels * kernelDepth * kernelRows * kernelCols; + pre_contract_dims[0] = out_depth * out_height * out_width; + for (int i = 0; i < NumDims - 4; ++i) { + pre_contract_dims[0] *= in.dimension(i); + } + } + + array<IndexPair<TensorIndex>, 1> contract_dims; + contract_dims[0] = IndexPair<TensorIndex>(1, 0); + + // Molds the output of the contraction into the shape expected by the user + // (assuming ColMajor): + // - 1st dim: kernel filters + // - 2nd dim: output depth + // - 3nd dim: output height + // - 4rd dim: output width + // - 5th dim and beyond: everything else including batch size + DSizes<TensorIndex, NumDims> post_contract_dims; + if (isColMajor) { + post_contract_dims[0] = kernelFilters; + post_contract_dims[1] = out_depth; + post_contract_dims[2] = out_height; + post_contract_dims[3] = out_width; + for (int i = 4; i < NumDims; ++i) { + post_contract_dims[i] = in.dimension(i); + } + } else { + post_contract_dims[NumDims - 1] = kernelFilters; + post_contract_dims[NumDims - 2] = out_depth; + post_contract_dims[NumDims - 3] = out_height; + post_contract_dims[NumDims - 4] = out_width; + for (int i = 0; i < NumDims - 4; ++i) { + post_contract_dims[i] = in.dimension(i); + } + } + + return choose( + Cond<internal::traits<Input>::Layout == ColMajor>(), + kernel.reshape(kernel_dims) + .contract(input.extract_volume_patches( + kernelDepth, kernelRows, kernelCols, stridePlanes, + strideRows, strideCols, padding_type) + .reshape(pre_contract_dims), + contract_dims) + .reshape(post_contract_dims), + input.extract_volume_patches(kernelDepth, kernelRows, kernelCols, + stridePlanes, strideRows, strideCols, + padding_type) + .reshape(pre_contract_dims) + .contract(kernel.reshape(kernel_dims), contract_dims) + .reshape(post_contract_dims)); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_SRC_NEURAL_NETWORKS_CUBOID_CONVOLUTION_H diff --git 
a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Patch3d.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Patch3d.h new file mode 100644 index 0000000000..df60fe18a3 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Patch3d.h @@ -0,0 +1,233 @@ +#ifndef EIGEN_CXX11_SRC_NEURAL_NETWORKS_PATCH3D_H +#define EIGEN_CXX11_SRC_NEURAL_NETWORKS_PATCH3D_H + +#if not defined(__CUDACC__) +#include <type_traits> +#endif + +namespace Eigen { +namespace internal { + +/** Extract3DPatches + * \ingroup CXX11_NeuralNetworksModule + * + * \brief Extracts 3D patches from a multichannel input volume. + * + * The input parameter is expected to be a tensor with a rank of 4 or more + * (channels, depth, height, width, optional others in col-major, and the + * reverse order in row-major). + + * The return value will be a tensor of 3 more dimension than the input tensor. + * In col-major, the first 4 dimensions of the result are: channels, patch_depth, + * patch_height, patch_width. The next dimensions will identify the patch + * position on the 3D grid of extracted patches: z, y, x. The remaining + * dimensions, if any, will be the same as the 'other' dimensions of the input + * tensor. + */ + +template <typename Input> +EIGEN_ALWAYS_INLINE static const TensorStridingOp< + const array<typename internal::traits<Input>::Index, + internal::traits<Input>::NumDimensions + 3>, + const TensorReshapingOp< + const DSizes<typename internal::traits<Input>::Index, + internal::traits<Input>::NumDimensions + 3>, + const TensorPatchOp< + const DSizes<typename internal::traits<Input>::Index, + internal::traits<Input>::NumDimensions>, + const TensorPaddingOp< + const array<IndexPair<typename internal::traits<Input>::Index>, + internal::traits<Input>::NumDimensions>, + const Input> > > > +Extract3DPatches( + const Input& input, const DenseIndex patchPlanes, + const DenseIndex patchRows, const DenseIndex patchCols, + const DenseIndex stridePlanes, const DenseIndex strideRows, + const DenseIndex strideCols, + const DenseIndex paddingZTop, const DenseIndex paddingZBottom, + const DenseIndex paddingTop, const DenseIndex paddingBottom, + const DenseIndex paddingLeft, const DenseIndex paddingRight, + const typename internal::traits<Input>::Scalar padding_value = 0) { + + typedef typename internal::traits<Input>::Index TensorIndex; + TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input); + + EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions >= 4, YOU_MADE_A_PROGRAMMING_MISTAKE); + + static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor); + static const int NumDims = internal::traits<Input>::NumDimensions; + static const int ExtDims = NumDims + 3; + + // Tensor size after patch extraction. We add three dimensions to unpack the + // linear patch index into a 3D grid over which stride() can work. + DSizes<TensorIndex, ExtDims> pre_stride_dims; + + if (isColMajor) { + pre_stride_dims[0] = in.dimension(0); + pre_stride_dims[1] = patchPlanes; + pre_stride_dims[2] = patchRows; + pre_stride_dims[3] = patchCols; + } else { + pre_stride_dims[ExtDims - 1] = in.dimension(NumDims - 1); + pre_stride_dims[ExtDims - 4] = patchCols; + pre_stride_dims[ExtDims - 3] = patchRows; + pre_stride_dims[ExtDims - 2] = patchPlanes; + } + + const TensorIndex inputPlanes = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2); + const TensorIndex inputRows = isColMajor ? 
in.dimension(2) : in.dimension(NumDims - 3); + const TensorIndex inputCols = isColMajor ? in.dimension(3) : in.dimension(NumDims - 4); + + array<IndexPair<TensorIndex>, NumDims> paddings; + for (int i = 0; i < NumDims; ++i) { + paddings[i] = IndexPair<TensorIndex>(0, 0); + } + + paddings[isColMajor ? 1 : (NumDims - 2)] = IndexPair<TensorIndex>(paddingZTop, paddingZBottom); + paddings[isColMajor ? 2 : (NumDims - 3)] = IndexPair<TensorIndex>(paddingTop, paddingBottom); + paddings[isColMajor ? 3 : (NumDims - 4)] = IndexPair<TensorIndex>(paddingLeft, paddingRight); + + pre_stride_dims[isColMajor ? 4 : (ExtDims - 5)] = inputPlanes + paddingZBottom + paddingZTop - patchPlanes + 1; + pre_stride_dims[isColMajor ? 5 : (ExtDims - 6)] = inputRows + paddingTop + paddingBottom - patchRows + 1; + pre_stride_dims[isColMajor ? 6 : (ExtDims - 7)] = inputCols + paddingLeft + paddingRight - patchCols + 1; + + if (isColMajor) { + for (int i = 7; i < NumDims + 3; ++i) { + pre_stride_dims[i] = in.dimension(i - 3); + } + } else { + for (int i = 0; i < NumDims - 4; ++i) { + pre_stride_dims[i] = in.dimension(i); + } + } + + DSizes<TensorIndex, NumDims> patch_dims; + if (isColMajor) { + patch_dims[0] = in.dimension(0); + patch_dims[1] = patchPlanes; + patch_dims[2] = patchRows; + patch_dims[3] = patchCols; + for (int i = 4; i < NumDims; ++i) { + patch_dims[i] = 1; + } + } else { + patch_dims[NumDims - 1] = in.dimension(NumDims - 1); + patch_dims[NumDims - 4] = patchCols; + patch_dims[NumDims - 3] = patchRows; + patch_dims[NumDims - 2] = patchPlanes; + for (int i = 0; i < NumDims - 4; i++) { + patch_dims[i] = 1; + } + } + + array<TensorIndex, NumDims + 3> strides; + if (isColMajor) { + // No striding within the patches. + for (int i = 0; i < 4; ++i) { + strides[i] = 1; + } + // Apply striding in the spatial patch grid dimensions only. + strides[4] = stridePlanes; + strides[5] = strideRows; + strides[6] = strideCols; + // No striding in the remaining dimensions (batches, ...). + for (int i = 7; i < NumDims + 3; i++) { + strides[i] = 1; + } + } else { + // No striding within the patches. + for (int i = 1; i <= 4; ++i) { + strides[ExtDims - i] = 1; + } + // Apply striding in the spatial patch grid dimensions only. + strides[ExtDims - 7] = strideCols; + strides[ExtDims - 6] = strideRows; + strides[ExtDims - 5] = stridePlanes; + // No striding in the remaining dimensions (batches, ...). + for (int i = 0; i < NumDims - 4; i++) { + strides[i] = 1; + } + } + + // TODO(mjanusz): Consider getting rid of pad(), and stride() and extend + // extract_patches to take additional parameters for padding/striding, + // similarly to etract_image_patches. 
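+ // Pad the input, cut it into overlapping patches, unpack the linear patch
+ // index into a (z, y, x) grid via the reshape, and finally keep only every
+ // stridePlanes/strideRows/strideCols-th patch.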
+ return input.pad(paddings, padding_value).extract_patches(patch_dims).reshape(pre_stride_dims).stride(strides); +} + + +template <typename Input> +EIGEN_ALWAYS_INLINE static const TensorStridingOp< + const array<typename internal::traits<Input>::Index, + internal::traits<Input>::NumDimensions + 3>, + const TensorReshapingOp< + const DSizes<typename internal::traits<Input>::Index, + internal::traits<Input>::NumDimensions + 3>, + const TensorPatchOp< + const DSizes<typename internal::traits<Input>::Index, + internal::traits<Input>::NumDimensions>, + const TensorPaddingOp< + const array<IndexPair<typename internal::traits<Input>::Index>, + internal::traits<Input>::NumDimensions>, + const Input> > > > +Extract3DPatches( + const Input& input, const DenseIndex patchPlanes, + const DenseIndex patchRows, const DenseIndex patchCols, + const DenseIndex stridePlanes, const DenseIndex strideRows, + const DenseIndex strideCols, const PaddingType padding_type, + const typename internal::traits<Input>::Scalar padding_value = 0) { + typedef typename internal::traits<Input>::Index TensorIndex; + TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input); + + EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions >= 4, YOU_MADE_A_PROGRAMMING_MISTAKE); + + static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor); + static const int NumDims = internal::traits<Input>::NumDimensions; + + const TensorIndex inputPlanes = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2); + const TensorIndex inputRows = isColMajor ? in.dimension(2) : in.dimension(NumDims - 3); + const TensorIndex inputCols = isColMajor ? in.dimension(3) : in.dimension(NumDims - 4); + + switch (padding_type) { + case PADDING_VALID: + // No padding in any dimension. + return Extract3DPatches(input, patchPlanes, patchRows, patchCols, + stridePlanes, strideRows, strideCols, + 0, 0, 0, 0, 0, 0, padding_value); + case PADDING_SAME: + // The side of the tensor before striding should be just the expected + // output times the stride. + const TensorIndex size_z = ceil(inputPlanes / static_cast<float>(stridePlanes)) * stridePlanes; + const TensorIndex size_y = ceil(inputRows / static_cast<float>(strideRows)) * strideRows; + const TensorIndex size_x = ceil(inputCols / static_cast<float>(strideCols)) * strideCols; + + // The size of the patch space is going to be: padded_input_size - patch_size + 1. + // This has to match the expected size before striding (pre_stride_dims). + // The deltas below extend the input to the expected size. + const TensorIndex dz = size_z + patchPlanes - 1 - inputPlanes; + const TensorIndex dy = size_y + patchRows - 1 - inputRows; + const TensorIndex dx = size_x + patchCols - 1 - inputCols; + + return Extract3DPatches(input, patchPlanes, patchRows, patchCols, + stridePlanes, strideRows, strideCols, + dz - dz / 2, dz / 2, + dy - dy / 2, dy / 2, + dx - dx / 2, dx / 2, + padding_value); + } +} + +// TODO(mjanusz): Switch this to a 'using' alias once CUDA supports C++11. 
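+// Note (added): Extract3DPatchesType simply names the expression type returned
+// by Extract3DPatches above, so callers can store the unevaluated expression
+// without spelling out the nested TensorStridingOp / TensorReshapingOp /
+// TensorPatchOp / TensorPaddingOp types by hand.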
+template <typename Input> +struct Extract3DPatchesType { + typedef const TensorStridingOp< const array<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions + 3>, + const TensorReshapingOp< const DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions + 3>, + const TensorPatchOp< const DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>, + const TensorPaddingOp< const array< IndexPair<typename internal::traits<Input>::Index>, internal::traits<Input>::NumDimensions>, + const Input> > > > type; +}; + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_SRC_NEURAL_NETWORKS_PATCH3D_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Pooling.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Pooling.h new file mode 100644 index 0000000000..8dea22806c --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Pooling.h @@ -0,0 +1,442 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#ifndef EIGEN_CXX11_NEURAL_NETWORKS_POOLING_H +#define EIGEN_CXX11_NEURAL_NETWORKS_POOLING_H + +#include "Patch3d.h" + +namespace Eigen { + +/** SpatialMaxPooling + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies a max-pooling over a multichannel input image. + * + * The input parameter is expected to be a with a rank of 4 (channels, height, width, others in col-major, and the reverse of that in row-major). + * + * The result can be assigned to a tensor of rank equal to the rank of the input. The dimensions of the result will be channels, height, width, and others (in col-major, and the reverse of that if the input was row-major). + * + * The order of the width and height dimensions can be swapped if needed. 
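+ *
+ * A minimal usage sketch (the shapes below are purely illustrative): 3x3
+ * max-pooling with stride 2 and SAME padding over a col-major input of shape
+ * (channels, height, width, batch):
+ * \code
+ * Eigen::Tensor<float, 4> input(16, 64, 64, 32);
+ * Eigen::Tensor<float, 4> output = Eigen::SpatialMaxPooling(input, 3, 3, 2, 2, Eigen::PADDING_SAME);
+ * \endcode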
+ * +*/ +#if !defined(EIGEN_HAS_INDEX_LIST) +template <typename Input> +EIGEN_ALWAYS_INLINE +static const TensorReshapingOp<const Eigen::DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>, const TensorReductionOp<internal::MaxReducer<typename internal::remove_const<typename internal::traits<Input>::Scalar>::type>, const Eigen::array<int, 2>, const TensorImagePatchOp<Dynamic, Dynamic, const Input> > > +#else +template <typename Input> +EIGEN_ALWAYS_INLINE +static const TensorReshapingOp<const Eigen::DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>, const TensorReductionOp<internal::MaxReducer<typename internal::remove_const<typename internal::traits<Input>::Scalar>::type>, typename internal::conditional<internal::traits<Input>::Layout == ColMajor, const Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> >, const Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3> > >::type, const TensorImagePatchOp<Dynamic, Dynamic, const Input> > > +#endif +SpatialMaxPooling(const Input& input, DenseIndex patchRows, DenseIndex patchCols, + DenseIndex strideRows, DenseIndex strideCols, const PaddingType padding_type, + DenseIndex in_strideRows = 1, DenseIndex in_strideCols = 1) +{ + EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 4, YOU_MADE_A_PROGRAMMING_MISTAKE); + + typedef typename internal::traits<Input>::Index TensorIndex; + TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input); + + const DenseIndex patchRowsEff = patchRows + (patchRows - 1) * (in_strideRows - 1); + const DenseIndex patchColsEff = patchCols + (patchCols - 1) * (in_strideCols - 1); + + static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor); + static const int idxRows = isColMajor ? 1 : 2; + static const int idxCols = isColMajor ? 2 : 1; + + // Molds the output of the reduction into the shape expected by the user. + // (assuming col-major): + // - 1st dim: channels + // - 2nd dim: output height + // - 3rd dim: output width + // - 4th dim and beyond: everything else including batch size + Eigen::DSizes<TensorIndex, internal::traits<Input>::NumDimensions> post_reduce_dims; + post_reduce_dims[0] = in.dimension(0); + if (padding_type == PADDING_VALID) { + post_reduce_dims[idxRows] = numext::ceil((in.dimension(idxRows) - patchRowsEff + 1.f) / static_cast<float>(strideRows)); + post_reduce_dims[idxCols] = numext::ceil((in.dimension(idxCols) - patchColsEff + 1.f) / static_cast<float>(strideCols)); + } else { + post_reduce_dims[idxRows] = numext::ceil(in.dimension(idxRows) / static_cast<float>(strideRows)); + post_reduce_dims[idxCols] = numext::ceil(in.dimension(idxCols) / static_cast<float>(strideCols)); + } + post_reduce_dims[3] = in.dimension(3); + +#if !defined(EIGEN_HAS_INDEX_LIST) + // nvcc doesn't support cxx11 + Eigen::array<int, 2> reduction_dims; + if (isColMajor) { + reduction_dims[0] = 1; + reduction_dims[1] = 2; + } else { + reduction_dims[0] = 2; + reduction_dims[1] = 3; + } +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. 
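+  // Note (added): the IndexList encodes the two reduced dimensions (the patch
+  // rows and columns of the extracted image patches) as compile-time constants,
+  // so the reduction code can specialize on them instead of reading runtime
+  // values.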
+ typename internal::conditional<internal::traits<Input>::Layout == ColMajor, const Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> >, const Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3> > >::type reduction_dims; +#endif + + return input.extract_image_patches(patchRows, patchCols, strideRows, strideCols, in_strideRows, in_strideCols, padding_type, -Eigen::NumTraits<typename internal::remove_const<typename internal::traits<Input>::Scalar>::type>::highest()).maximum(reduction_dims).reshape(post_reduce_dims); +} + +/** CuboidMaxPooling + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies a max-pooling over a multichannel input volume. + * + * The input parameter is expected to be a tensor with a rank of 5 (channels, depth, height, width, others in col-major, and the reverse of that in row-major). + * + * The result can be assigned to a tensor of rank equal to the rank of the input. The dimensions of the result will be channels, depth, height, width, and others (in col-major, and the reverse of that if the input was row-major). + * + * The order of the depth, width and height dimensions can be swapped if needed. + * +*/ +#if !defined(EIGEN_HAS_INDEX_LIST) +template <typename Input> +EIGEN_ALWAYS_INLINE static const TensorReshapingOp< + const Eigen::DSizes<DenseIndex, internal::traits<Input>::NumDimensions>, + const TensorReductionOp< + internal::MaxReducer<float>, const Eigen::array<int, 1>, + const TensorReshapingOp< + const Eigen::DSizes<DenseIndex, 3>, + const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Input> > > > +#else +template <typename Input> +EIGEN_ALWAYS_INLINE static const TensorReshapingOp< + const Eigen::DSizes<DenseIndex, internal::traits<Input>::NumDimensions>, + const TensorReductionOp< + internal::MaxReducer<float>, + const Eigen::IndexList<Eigen::type2index<1> >, + const TensorReshapingOp< + const Eigen::DSizes<DenseIndex, 3>, + const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Input> > > > +#endif +CuboidMaxPooling(const Input& input, DenseIndex patchPlanes, + DenseIndex patchRows, DenseIndex patchCols, + DenseIndex stridePlanes, DenseIndex strideRows, + DenseIndex strideCols, const PaddingType padding_type) { + EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 5, YOU_MADE_A_PROGRAMMING_MISTAKE); + static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor); + + typedef typename internal::traits<Input>::Index TensorIndex; + TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input); + + static const int idxPlanes = isColMajor ? 1 : 3; + static const int idxRows = 2; + static const int idxCols = isColMajor ? 
3 : 1; + + // Molds the output of the reduction into the shape expected by the used + // (assuming col-major): + // - 1st dim: channels + // - 2nd dim: output depth + // - 3rd dim: output height + // - 4th dim: output width + // - 5th dim and beyond: everything else including batch size + Eigen::DSizes<DenseIndex, internal::traits<Input>::NumDimensions> post_reduce_dims; + post_reduce_dims[0] = in.dimension(0); + if (padding_type == PADDING_VALID) { + post_reduce_dims[idxPlanes] = numext::ceil((in.dimension(idxPlanes) - patchPlanes + 1.f) / static_cast<float>(stridePlanes)); + post_reduce_dims[idxRows] = numext::ceil((in.dimension(idxRows) - patchRows + 1.f) / static_cast<float>(strideRows)); + post_reduce_dims[idxCols] = numext::ceil((in.dimension(idxCols) - patchCols + 1.f) / static_cast<float>(strideCols)); + } else { + post_reduce_dims[idxPlanes] = numext::ceil(in.dimension(idxPlanes) / static_cast<float>(stridePlanes)); + post_reduce_dims[idxRows] = numext::ceil(in.dimension(idxRows) / static_cast<float>(strideRows)); + post_reduce_dims[idxCols] = numext::ceil(in.dimension(idxCols) / static_cast<float>(strideCols)); + } + post_reduce_dims[4] = in.dimension(4); + + Eigen::DSizes<DenseIndex, 3> pre_reduce_dims; + pre_reduce_dims[1] = patchRows * patchCols * patchPlanes; + if (isColMajor) { + pre_reduce_dims[0] = post_reduce_dims[0]; + pre_reduce_dims[2] = post_reduce_dims[1] * post_reduce_dims[2] * post_reduce_dims[3] * post_reduce_dims[4]; + } else { + pre_reduce_dims[0] = post_reduce_dims[0] * post_reduce_dims[1] * post_reduce_dims[2] * post_reduce_dims[3]; + pre_reduce_dims[2] = post_reduce_dims[4]; + } + +#if !defined(EIGEN_HAS_INDEX_LIST) + // nvcc doesn't support cxx11 + Eigen::array<int, 1> reduction_dims; + reduction_dims[0] = 1; +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. + Eigen::IndexList<Eigen::type2index<1> > reduction_dims; +#endif + return input.extract_volume_patches(patchPlanes, patchRows, patchCols, + stridePlanes, strideRows, strideCols, + padding_type, -Eigen::NumTraits<float>::highest()) + .reshape(pre_reduce_dims) + .maximum(reduction_dims) + .reshape(post_reduce_dims); +} + + +/** SpatialAvgPooling + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies an average pooling over a multichannel input image. + * + * The input parameter is expected to be a tensor with a rank of 4 (channels, height, width, others in col-major, and the reverse of that in row-major). + * + * The result can be assigned to a tensor of rank equal to the rank of the input. The dimensions of the result will be channels, height, width, and others (in col-major, and the reverse of that if the input was row-major). + * + * The order of the width and height dimensions can be swapped if needed. + * +*/ +namespace internal { + +template <typename T> struct AvgPoolMeanReducer +{ +#if (EIGEN_ARCH_i386 || EIGEN_ARCH_x86_64 || defined (EIGEN_USE_GPU) || defined(__CUDACC__) || defined(__CUDA_ARCH__)) + // We only support packet access for floats. 
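+  // Note (added): this reducer skips inputs equal to -NumTraits<T>::highest(),
+  // the sentinel value the pooling functions pass to the patch extraction for
+  // padded elements, so padding never contributes to the average. The masked
+  // compare/select code used for that in reducePacket below is only implemented
+  // for float packets on these platforms, hence the restriction on PacketAccess.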
+ static const bool PacketAccess = internal::is_same<T, float>::value; +#else + static const bool PacketAccess = false; +#endif + static const bool IsStateful = true; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE AvgPoolMeanReducer() : scalarCount_(0) { + typedef typename packet_traits<T>::type Packet; + packetCount_ = pset1<Packet>(0.0); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) { + if (t != -Eigen::NumTraits<T>::highest()) { + (*accum) = (*accum) + t; + scalarCount_++; + } + } + + +#if (!defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)) +#ifdef EIGEN_VECTORIZE_AVX +#define pequal(a,b) _mm256_cmp_ps(a,b,_CMP_EQ_UQ) +#define psel(a,b,false_mask) _mm256_blendv_ps(a,b,false_mask) +#else +#define pequal(a,b) _mm_cmpeq_ps(a,b) +#define psel(a,b,false_mask) _mm_or_ps(_mm_andnot_ps(false_mask, a), _mm_and_ps(false_mask, b)) +#endif + + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) { + reducePacketWithType(static_cast<T>(0), p, accum); + } + + template <typename Packet> + void reducePacketWithType(T, const Packet& p, Packet* accum) { + Packet skip_mask = pequal(p, pset1<Packet>(-Eigen::NumTraits<T>::highest())); + (*accum) = padd<Packet>(*accum, psel(p, pset1<Packet>(0), skip_mask)); + packetCount_ = padd<Packet>(packetCount_, psel(pset1<Packet>(1), pset1<Packet>(0), skip_mask)); + } + +#else +#define pequal(a,b) make_float4(a.x == b.x ? 1.f : 0, a.y == b.y ? 1.f : 0, a.z == b.z ? 1.f : 0, a.w == b.w ? 1.f : 0) +#define psel(a,b,c) make_float4(c.x ? b.x : a.x, c.y ? b.y : a.y, c.z ? b.z : a.z, c.w ? b.w : a.w) + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const float4& p, float4* accum) { + float4 skip_mask = pequal(p, pset1<float4>(-Eigen::NumTraits<float>::highest())); + (*accum) = padd<float4>(*accum, psel(p, pset1<float4>(0), skip_mask)); + packetCount_ = padd<float4>(packetCount_, psel(pset1<float4>(1), pset1<float4>(0), skip_mask)); + } + +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return static_cast<T>(0); + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1<Packet>(0); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + eigen_assert(scalarCount_ > 0); + return accum / scalarCount_; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return pdiv(vaccum, packetCount_); + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + return (saccum + predux(vaccum)) / (scalarCount_ + predux(packetCount_)); + } + + protected: + typedef typename packet_traits<T>::type Packet; + int scalarCount_; + Packet packetCount_; +}; + +} // namespace internal + +#if !defined(EIGEN_HAS_INDEX_LIST) +template <typename Input> +EIGEN_ALWAYS_INLINE +static const TensorReshapingOp<const Eigen::DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>, const TensorReductionOp<internal::AvgPoolMeanReducer<typename internal::remove_const<typename internal::traits<Input>::Scalar>::type>, const Eigen::array<int, 2>, const TensorImagePatchOp<Dynamic, Dynamic, const Input> > > +#else +template <typename Input> +EIGEN_ALWAYS_INLINE +static const TensorReshapingOp<const Eigen::DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>, const 
TensorReductionOp<internal::AvgPoolMeanReducer<typename internal::remove_const<typename internal::traits<Input>::Scalar>::type>, typename internal::conditional<internal::traits<Input>::Layout == ColMajor, const Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> >, const Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3> > >::type, const TensorImagePatchOp<Dynamic, Dynamic, const Input> > > +#endif +SpatialAvgPooling(const Input& input, DenseIndex patchRows, DenseIndex patchCols, + DenseIndex strideRows, DenseIndex strideCols, const PaddingType padding_type, + DenseIndex in_strideRows = 1, DenseIndex in_strideCols = 1) +{ + EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 4, YOU_MADE_A_PROGRAMMING_MISTAKE); + + typedef typename internal::traits<Input>::Index TensorIndex; + TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input); + + const DenseIndex patchRowsEff = patchRows + (patchRows - 1) * (in_strideRows - 1); + const DenseIndex patchColsEff = patchCols + (patchCols - 1) * (in_strideCols - 1); + + static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor); + static const int idxRows = isColMajor ? 1 : 2; + static const int idxCols = isColMajor ? 2 : 1; + + // Molds the output of the reduction into the shape expected by the user. + // (assuming col-major): + // - 1st dim: channels + // - 2nd dim: output height + // - 3rd dim: output width + // - 4th dim and beyond: everything else including batch size + Eigen::DSizes<TensorIndex, internal::traits<Input>::NumDimensions> post_reduce_dims; + post_reduce_dims[0] = in.dimension(0); + if (padding_type == PADDING_VALID) { + post_reduce_dims[idxRows] = numext::ceil((in.dimension(idxRows) - patchRowsEff + 1.f) / static_cast<float>(strideRows)); + post_reduce_dims[idxCols] = numext::ceil((in.dimension(idxCols) - patchColsEff + 1.f) / static_cast<float>(strideCols)); + } else { + post_reduce_dims[idxRows] = numext::ceil(in.dimension(idxRows) / static_cast<float>(strideRows)); + post_reduce_dims[idxCols] = numext::ceil(in.dimension(idxCols) / static_cast<float>(strideCols)); + } + post_reduce_dims[3] = in.dimension(3); + + typedef typename internal::remove_const<typename internal::traits<Input>::Scalar>::type CoeffReturnType; + internal::AvgPoolMeanReducer<CoeffReturnType> mean_with_nan; + +#if !defined(EIGEN_HAS_INDEX_LIST) + // nvcc doesn't support cxx11 + Eigen::array<int, 2> reduction_dims; + if (isColMajor) { + reduction_dims[0] = 1; + reduction_dims[1] = 2; + } else { + reduction_dims[0] = 2; + reduction_dims[1] = 3; + } +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. + typename internal::conditional<internal::traits<Input>::Layout == ColMajor, const Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> >, const Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3> > >::type reduction_dims; +#endif + return input.extract_image_patches(patchRows, patchCols, strideRows, strideCols, in_strideRows, in_strideCols, padding_type, -Eigen::NumTraits<typename internal::remove_const<typename internal::traits<Input>::Scalar>::type>::highest()).reduce(reduction_dims, mean_with_nan).reshape(post_reduce_dims); +} + + +/** CuboidAvgPooling + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies an average pooling over a multichannel input volume. 
+ * + * The input parameter is expected to be a tensor with a rank of 5 (channels, depth, height, width, others, and the reverse of that in row-major). + * + * The result can be assigned to a tensor of rank equal to the rank of the input. The dimensions of the result will be channels, depth, width, and others (in col-major, and the reverse of that if the input was row-major). + * + * The order of the depth, width and height dimensions can be swapped if needed. + * +*/ +#if !defined(EIGEN_HAS_INDEX_LIST) +template <typename Input> +EIGEN_ALWAYS_INLINE static const TensorReshapingOp< + const Eigen::DSizes<DenseIndex, internal::traits<Input>::NumDimensions>, + const TensorReductionOp< + internal::AvgPoolMeanReducer<float>, const Eigen::array<int, 1>, + const TensorReshapingOp< + const Eigen::DSizes<DenseIndex, 3>, + const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Input> > > > +#else +template <typename Input> +EIGEN_ALWAYS_INLINE static const TensorReshapingOp< + const Eigen::DSizes<DenseIndex, internal::traits<Input>::NumDimensions>, + const TensorReductionOp< + internal::AvgPoolMeanReducer<float>, + const Eigen::IndexList<Eigen::type2index<1> >, + const TensorReshapingOp< + const Eigen::DSizes<DenseIndex, 3>, + const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Input> > > > +#endif +CuboidAvgPooling(const Input& input, DenseIndex patchPlanes, + DenseIndex patchRows, DenseIndex patchCols, + DenseIndex stridePlanes, DenseIndex strideRows, + DenseIndex strideCols, const PaddingType padding_type) { + EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 5, YOU_MADE_A_PROGRAMMING_MISTAKE); + static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor); + + typedef typename internal::traits<Input>::Index TensorIndex; + TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input); + + static const int idxPlanes = isColMajor ? 1 : 3; + static const int idxRows = 2; + static const int idxCols = isColMajor ? 
3 : 1; + // Molds the output of the reduction into the shape expected by the used + // (assuming col-major): + // - 1st dim: channels + // - 2nd dim: outupt depth + // - 3rd dim: output height + // - 4th dim: output width + // - 5th dim and beyond: everything else including batch size + Eigen::DSizes<DenseIndex, internal::traits<Input>::NumDimensions> post_reduce_dims; + post_reduce_dims[0] = in.dimension(0); + if (padding_type == PADDING_VALID) { + post_reduce_dims[idxPlanes] = numext::ceil((in.dimension(idxPlanes) - patchPlanes + 1.f) / static_cast<float>(stridePlanes)); + post_reduce_dims[idxRows] = numext::ceil((in.dimension(idxRows) - patchRows + 1.f) / static_cast<float>(strideRows)); + post_reduce_dims[idxCols] = numext::ceil((in.dimension(idxCols) - patchCols + 1.f) / static_cast<float>(strideCols)); + } else { + post_reduce_dims[idxPlanes] = numext::ceil(in.dimension(idxPlanes) / static_cast<float>(stridePlanes)); + post_reduce_dims[idxRows] = numext::ceil(in.dimension(idxRows) / static_cast<float>(strideRows)); + post_reduce_dims[idxCols] = numext::ceil(in.dimension(idxCols) / static_cast<float>(strideCols)); + } + post_reduce_dims[4] = in.dimension(4); + + Eigen::DSizes<DenseIndex, 3> pre_reduce_dims; + pre_reduce_dims[1] = patchRows * patchCols * patchPlanes; + if (isColMajor) { + pre_reduce_dims[0] = post_reduce_dims[0]; + pre_reduce_dims[2] = post_reduce_dims[1] * post_reduce_dims[2] * post_reduce_dims[3] * post_reduce_dims[4]; + } else { + pre_reduce_dims[0] = post_reduce_dims[0] * post_reduce_dims[1] * post_reduce_dims[2] * post_reduce_dims[3]; + pre_reduce_dims[2] = post_reduce_dims[4]; + } + + typedef typename internal::remove_const<typename internal::traits<Input>::Scalar>::type CoeffReturnType; + internal::AvgPoolMeanReducer<CoeffReturnType> mean_with_nan; + +#if !defined(EIGEN_HAS_INDEX_LIST) + // nvcc doesn't support cxx11 + Eigen::array<int, 1> reduction_dims; + reduction_dims[0] = 1; +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. + Eigen::IndexList<Eigen::type2index<1> > reduction_dims; +#endif + return input.extract_volume_patches(patchPlanes, patchRows, patchCols, + stridePlanes, strideRows, strideCols, + padding_type, -Eigen::NumTraits<float>::highest()) + .reshape(pre_reduce_dims) + .reduce(reduction_dims, mean_with_nan) + .reshape(post_reduce_dims); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_NEURAL_NETWORKS_POOLING_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/SoftMax.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/SoftMax.h new file mode 100644 index 0000000000..223ae28ffd --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/SoftMax.h @@ -0,0 +1,82 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#ifndef EIGEN_CXX11_NEURAL_NETWORKS_SOFTMAX_H +#define EIGEN_CXX11_NEURAL_NETWORKS_SOFTMAX_H + +namespace Eigen { + +/** SoftMax + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies a softmax + * + * The input parameter is expected to be a col-major tensor with a rank of 2 (depth and other). + * + * The result can be assigned to a tensor of rank and dimensions equal to that of the input. 
The result will be laid out in col-major order. + * +*/ + +namespace { +struct SoftmaxOp { + SoftmaxOp(const float beta) : beta_(beta) { } + + template <typename Input> + typename Input::Dimensions dimensions(const Input& input) const { + return input.dimensions(); + } + + template <typename Input, typename Output, typename Device> + void eval(const Input& input, Output& output, const Device& device) const + { +#if !defined(EIGEN_HAS_INDEX_LIST) + // nvcc doesn't support cxx11 + Eigen::array<typename internal::traits<Input>::Index, 1> depth_dim; + depth_dim[0] = 0; + Eigen::array<typename internal::traits<Input>::Index, 2> bcast; + bcast[0] = dimensions(input)[0]; + bcast[1] = 1; + DSizes<typename internal::traits<Input>::Index, 2> dims2d; + dims2d[0] = 1; + dims2d[1] = dimensions(input)[1]; +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. + Eigen::IndexList<Eigen::type2index<0>> depth_dim; + Eigen::IndexList<int, Eigen::type2index<1>> bcast; + bcast.set(0, dimensions(input)[0]); + Eigen::IndexList<Eigen::type2index<1>, typename internal::traits<Input>::Index> dims2d; + dims2d.set(1, dimensions(input)[1]); +#endif + + output.device(device) = ((input - input.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)) * beta_).exp(); + output.device(device) = output / (output.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast)); + } + + private: + const float beta_; +}; +} + + +template <typename Input> +EIGEN_ALWAYS_INLINE +static const TensorCustomUnaryOp<const SoftmaxOp, const Input> +SoftMax(const Input& input, const float beta) +{ + EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 2, YOU_MADE_A_PROGRAMMING_MISTAKE); + + const SoftmaxOp op(beta); + return input.customOp(op); +} + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_NEURAL_NETWORKS_SOFTMAX_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/SpatialConvolutions.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/SpatialConvolutions.h new file mode 100644 index 0000000000..34a9fcf037 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/SpatialConvolutions.h @@ -0,0 +1,634 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#ifndef EIGEN_CXX11_NEURAL_NETWORKS_SPATIAL_CONVOLUTIONS_H +#define EIGEN_CXX11_NEURAL_NETWORKS_SPATIAL_CONVOLUTIONS_H + +namespace Eigen { + +namespace internal { + +// These optimizations require vector instructions +#ifdef EIGEN_VECTORIZE + +// TODO: Consolidate this part of the code with the image patch extraction code +// since they are both very similar. 
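+// Note (added): the specialization of TensorContractionInputMapper below feeds
+// the contraction kernels directly from a reshaped image-patch expression:
+// instead of materializing extract_image_patches() into a temporary, each
+// requested coefficient or packet is computed on the fly from the underlying
+// input tensor, with out-of-bounds (padded) positions returned as zeros.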
+template <typename NewDimension, DenseIndex Rows, DenseIndex Cols, typename ArgType, typename Device, + typename Scalar, typename Index, + typename nocontract_t, typename contract_t, + int Side, size_t packet_size, + bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment> +class TensorContractionInputMapper<Scalar, Index, Side, TensorEvaluator<const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> +{ + public: + typedef TensorContractionInputMapper<Scalar, Index, Side, TensorEvaluator<const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Self; + typedef Self SubMapper; + typedef Self VectorMapper; + typedef Self LinearMapper; + typedef typename packet_traits<Scalar>::type Packet; + + TensorContractionInputMapper(const TensorEvaluator<const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >, Device>& tensor, + const nocontract_t&, const nocontract_t&, + const contract_t&, const contract_t&, + const Index depth_offset = 0, const Index col_offset = 0) + : m_depth_offset(depth_offset), m_col_offset(col_offset), m_impl(tensor.impl().impl()) + { + if (internal::traits<ArgType>::Layout == ColMajor) { + m_patch_depth = tensor.impl().dimensions()[0]; + m_patch_rows = tensor.impl().dimensions()[1]; + m_patch_cols = tensor.impl().dimensions()[2]; + m_num_patches = tensor.impl().dimensions()[3]; + } else { + static const int NumDims = tensor.impl().dimensions().size(); + m_patch_depth = tensor.impl().dimensions()[NumDims - 1]; + m_patch_rows = tensor.impl().dimensions()[NumDims - 2]; + m_patch_cols = tensor.impl().dimensions()[NumDims - 3]; + m_num_patches = tensor.impl().dimensions()[NumDims - 4]; + } + m_patch_row_inflate_strides = tensor.impl().rowInflateStride(); + m_patch_col_inflate_strides = tensor.impl().colInflateStride(); + + m_colStride = m_patch_rows; + + m_outputRows = tensor.impl().outputRows(); + m_row_strides = tensor.impl().userRowStride(); + m_col_strides = tensor.impl().userColStride(); + + m_in_row_strides = tensor.impl().userInRowStride(); + m_in_col_strides = tensor.impl().userInColStride(); + + if (internal::traits<ArgType>::Layout == ColMajor) { + m_inputRows = tensor.impl().impl().dimensions()[1]; + m_inputCols = tensor.impl().impl().dimensions()[2]; + } else { + static const int NumDims = tensor.impl().impl().dimensions().size(); + m_inputRows = tensor.impl().impl().dimensions()[NumDims - 2]; + m_inputCols = tensor.impl().impl().dimensions()[NumDims - 3]; + } + + m_rowInputStride = m_patch_depth; + m_colInputStride = m_patch_depth * m_inputRows; + m_patchInputStride = m_patch_depth * m_inputRows * m_inputCols; + + m_rowPaddingTop = tensor.impl().rowPaddingTop(); + m_colPaddingLeft = tensor.impl().colPaddingLeft(); + + m_fastInputRowStride = internal::TensorIntDivisor<Index>(m_patch_row_inflate_strides); + m_fastInputColStride = internal::TensorIntDivisor<Index>(m_patch_col_inflate_strides); + m_fastNumPatches = internal::TensorIntDivisor<Index>(m_num_patches); + m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride); + m_fastOutputRows = internal::TensorIntDivisor<Index>(m_outputRows); + m_fastDimZero = internal::TensorIntDivisor<Index>(m_patch_depth); + + computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex, m_otherIndex); + } + + 
TensorContractionInputMapper(const TensorContractionInputMapper& base_mapper, + const Index depth_offset, + const Index col_offset) : m_depth_offset(depth_offset), m_col_offset(col_offset), m_impl(base_mapper.m_impl) { + m_patch_depth = base_mapper.m_patch_depth; + m_patch_rows = base_mapper.m_patch_rows; + m_patch_cols = base_mapper.m_patch_cols; + m_num_patches = base_mapper.m_num_patches; + m_patch_row_inflate_strides = base_mapper.m_patch_row_inflate_strides; + m_patch_col_inflate_strides = base_mapper.m_patch_col_inflate_strides; + + m_colStride = base_mapper.m_colStride; + + m_rowInputStride = base_mapper.m_rowInputStride; + m_colInputStride = base_mapper.m_colInputStride; + m_patchInputStride = base_mapper.m_patchInputStride; + + m_inputRows = base_mapper.m_inputRows; + m_inputCols = base_mapper.m_inputCols; + + m_outputRows = base_mapper.m_outputRows; + m_row_strides = base_mapper.m_row_strides; + m_col_strides = base_mapper.m_col_strides; + + m_in_row_strides = base_mapper.m_in_row_strides; + m_in_col_strides = base_mapper.m_in_col_strides; + + m_rowPaddingTop = base_mapper.m_rowPaddingTop; + m_colPaddingLeft = base_mapper.m_colPaddingLeft; + + m_fastInputRowStride = base_mapper.m_fastInputRowStride; + m_fastInputColStride = base_mapper.m_fastInputColStride; + m_fastNumPatches = base_mapper.m_fastNumPatches; + m_fastColStride = base_mapper.m_fastColStride; + m_fastOutputRows = base_mapper.m_fastOutputRows; + m_fastDimZero = base_mapper.m_fastDimZero; + + computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex, m_otherIndex); + } + + // If true, turns off some optimizations for loading packets since the image + // patches are "non-standard" such as there are non-trivial strides or + // inflations in the input. + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE bool nonStandardPatches() const { + return m_in_row_strides != 1 || m_in_col_strides != 1 || m_patch_row_inflate_strides != 1 || m_patch_col_inflate_strides != 1; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { + return SubMapper(*this, m_depth_offset + i, m_col_offset + j); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + return LinearMapper(*this, m_depth_offset + i, m_col_offset + j); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Scalar operator()(Index row) const { + return loadCoeff(row + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex); + } + + // Load the coefficient at the patchIndex location instead of the usual m_rowIndex, + // m_colIndex, m_otherIndex. This is currently only used by the gpu code. EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar operator()(Index row, Index patchIndex) const { + checkZeroOffsets(); + Index rowIndex, colIndex, otherIndex; + computeBaseIndices(patchIndex, rowIndex, colIndex, otherIndex); + return loadCoeff(row, rowIndex, colIndex, otherIndex); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet loadPacket(Index row) const { + return loadPacket(row + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex); + } + + // Load the packet at the patchIndex location instead of the usual m_rowIndex, + // m_colIndex, m_otherIndex. This is currently only used by the gpu code. 
+ EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet loadPacket(Index row, Index patchIndex) const { + checkZeroOffsets(); + Index rowIndex, colIndex, otherIndex; + computeBaseIndices(patchIndex, rowIndex, colIndex, otherIndex); + return loadPacket(row, rowIndex, colIndex, otherIndex); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchDepth() const { return m_patch_depth; } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchRows() const { return m_patch_rows; } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchCols() const { return m_patch_cols; } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE bool padRow(const Index row) const { + const Index r = m_rowIndex + row; + return r < 0 | r >= m_inputRows; + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE bool padCol(const Index col) const { + const Index c = m_colIndex + col; + return c < 0 | c >= m_inputCols; + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index baseIndex(const Index row, const Index col) const { + const Index r = m_rowIndex + row; + const Index c = m_colIndex + col; + return r * m_rowInputStride + c * m_colInputStride + m_otherIndex; + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet packetNoPadding(const Index depth, const Index baseIndex) const { + const Index inputIndex = depth + baseIndex; + return m_impl.template packet<Unaligned>(inputIndex); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index rowOffset() const { + const Index patchOffset = m_depth_offset / m_fastDimZero; + const Index colOffset = patchOffset / m_fastColStride; + return patchOffset-colOffset*m_colStride; + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index colOffset() const { + const Index patchOffset = m_depth_offset / m_fastDimZero; + const Index colOffset = patchOffset / m_fastColStride; + return colOffset; + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index depthOffset() const { + const Index patchOffset = m_depth_offset % m_patch_depth; + return patchOffset; + } + + private: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar loadCoeff(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const { + // Find the offset of the element wrt the location of the first element. + const Index patchOffset = patchId / m_fastDimZero; + + const Index colOffset = patchOffset / m_fastColStride; + const Index inputCol = colIndex + colOffset * m_in_col_strides; + const Index origInputCol = (m_patch_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); + const Index rowOffset = patchOffset - colOffset * m_colStride; + const Index inputRow = rowIndex + rowOffset * m_in_row_strides; + const Index origInputRow = (m_patch_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? 
(inputRow / m_fastInputRowStride) : 0); + if (origInputCol < 0 | origInputRow < 0 | origInputCol >= m_inputCols | origInputRow >= m_inputRows | + (inputCol != origInputCol * m_patch_col_inflate_strides) | (inputRow != origInputRow * m_patch_row_inflate_strides)) { + return Scalar(0); + } + const Index depth = patchId - patchOffset * m_patch_depth; + const Index inputIndex = depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex; + return m_impl.coeff(inputIndex); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet loadPacket(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const { + const Index packetSize = internal::unpacket_traits<Packet>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(patchId < m_patch_depth*m_patch_rows*m_patch_cols); + + if (nonStandardPatches()) { + return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex); + } + + if ((m_patch_depth % packetSize) == 0) { + // Find the offset of the element wrt the location of the first element. + const Index patchOffset = patchId / m_fastDimZero; + eigen_assert((patchId + packetSize - 1) / m_fastDimZero == patchOffset); + + const Index colOffset = patchOffset / m_fastColStride; + const Index inputCol = colIndex + colOffset; + const Index rowOffset = patchOffset - colOffset*m_colStride; + const Index inputRow = rowIndex + rowOffset; + if (inputCol < 0 | inputRow < 0 | inputCol >= m_inputCols | inputRow >= m_inputRows) { + // all zeros + return internal::pset1<Packet>(Scalar(0)); + } + // no padding + const Index depth = patchId - patchOffset * m_patch_depth; + const Index inputIndex = depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex; + return m_impl.template packet<Unaligned>(inputIndex); + } + else { + const Index patchOffsets[2] = {patchId / m_fastDimZero, (patchId + packetSize - 1) / m_fastDimZero}; + + const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, patchOffsets[1] / m_fastColStride}; + + const Index inputCols[2] = {colIndex + colOffsets[0], colIndex + colOffsets[1]}; + if (inputCols[0] >= m_inputCols | inputCols[1] < 0) { + // all zeros + return internal::pset1<Packet>(Scalar(0)); + } + + if (inputCols[0] == inputCols[1]) { + const Index rowOffsets[2] = {patchOffsets[0] - colOffsets[0]*m_colStride, patchOffsets[1] - colOffsets[1]*m_colStride}; + eigen_assert(rowOffsets[0] <= rowOffsets[1]); + const Index inputRows[2] = {rowIndex + rowOffsets[0], rowIndex + rowOffsets[1]}; + + if (inputRows[0] >= m_inputRows | inputRows[1] < 0) { + // all zeros + return internal::pset1<Packet>(Scalar(0)); + } + + if (inputRows[0] >= 0 & inputRows[1] < m_inputRows) { + // no padding + const Index depth = patchId - patchOffsets[0] * m_patch_depth; + const Index inputIndex = depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex; + return m_impl.template packet<Unaligned>(inputIndex); + } + } + } + return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet packetWithPossibleZero(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const + { + const int packetSize = internal::unpacket_traits<Packet>::size; + EIGEN_ALIGN_MAX typename internal::remove_const<Scalar>::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = loadCoeff(patchId+i, rowIndex, colIndex, otherIndex); + } + Packet rslt = internal::pload<Packet>(values); + return rslt; + } + + EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE void computeBaseIndices(Index patchIndex, Index& rowIndex, Index& colIndex, Index& otherIndex) const { + const int NumInputDims = array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + otherIndex = (NumInputDims == 3) ? 0 : patchIndex / m_fastNumPatches; + const Index patch2DIndex = (NumInputDims == 3) ? patchIndex : (patchIndex - otherIndex * m_num_patches); + otherIndex *= m_patchInputStride; + colIndex = patch2DIndex / m_fastOutputRows; + rowIndex = patch2DIndex - colIndex * m_outputRows; + colIndex = colIndex * m_col_strides - m_colPaddingLeft; + rowIndex = rowIndex * m_row_strides - m_rowPaddingTop; + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void checkZeroOffsets() const { + eigen_assert(m_col_offset == 0); + eigen_assert(m_depth_offset == 0); + eigen_assert(m_rowIndex == 0); + eigen_assert(m_colIndex == 0); + eigen_assert(m_otherIndex == 0); + } + + Index m_depth_offset; // First row in the input matrix + Index m_col_offset; // First col in the input matrix + + Index m_patch_depth; // patch depth, which is equal to the input depth + Index m_patch_rows; // number of rows in the patch + Index m_patch_cols; // number of colums in the patch + Index m_num_patches; // number of patches to extract. + Index m_patch_row_inflate_strides; // the strides for row inflation in the image patch + Index m_patch_col_inflate_strides; // the strides for col inflation in the image patch + // Fast representation of inflation strides. + internal::TensorIntDivisor<Index> m_fastInputRowStride; + internal::TensorIntDivisor<Index> m_fastInputColStride; + + Index m_otherStride; + Index m_colStride; + internal::TensorIntDivisor<Index> m_fastNumPatches; + internal::TensorIntDivisor<Index> m_fastColStride; + + Index m_rowInputStride; // row stride in the input tensor + Index m_colInputStride; // col stride in the input tensor + Index m_patchInputStride; // patch stride in the input tensor + + Index m_inputRows; // Number of rows in the input tensor + Index m_inputCols; // Number of cols in the input tensor + + Index m_outputRows; // Number of patch rows + + Index m_row_strides; // User specified row stride + Index m_col_strides; // User specified col stride + + Index m_in_row_strides; // User specified input row stride + Index m_in_col_strides; // User specified input col stride + + Index m_rowPaddingTop; // Row padding + Index m_colPaddingLeft; // Column padding + + internal::TensorIntDivisor<Index> m_fastOutputRows; + internal::TensorIntDivisor<Index> m_fastDimZero; + + Index m_rowIndex; // precomputed row index corresponding to the col offset + Index m_colIndex; // precomputed col index corresponding to the col offset + Index m_otherIndex; // precomputed other index corresponding to the col offset + + const TensorEvaluator<ArgType, Device> m_impl; +}; + + +template <typename NewDimension, DenseIndex Rows, DenseIndex Cols, typename ArgType, typename Device, + typename Scalar, typename Index, + typename nocontract_t, typename contract_t, + int Side, size_t packet_size, + bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, int nr> +struct gemm_pack_rhs<Scalar, Index, TensorContractionInputMapper<Scalar, Index, Side, TensorEvaluator<const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>, nr, ColMajor, false, false> { + + typedef TensorContractionInputMapper<Scalar, Index, Side, TensorEvaluator<const 
TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> DataMapper; + + static inline Index ceil_div(Index a, Index b) { + return (a + b - 1) / b; + } + + EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0) const { + eigen_assert(stride == 0); + eigen_assert(offset == 0); + + EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE); + typedef typename DataMapper::LinearMapper LinearMapper; + typedef typename packet_traits<Scalar>::type Packet; + + const Index packet_cols4 = (cols/4) * 4; + const Index peeled_k = (depth/packet_size) * packet_size; + + for(Index j2=0; j2<packet_cols4; j2+=4) + { + const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0); + const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1); + const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2); + const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3); + + Index k=0; + if((packet_size%4)==0 && !rhs.nonStandardPatches()) + { + const Index patch_depth = rhs.patchDepth(); + if ((patch_depth % packet_size) == 0) { + const Index patch_cols = rhs.patchCols(); + const Index patch_rows = rhs.patchRows(); + + const Index startCol = rhs.colOffset(); + const Index max_cols = std::min<Index>(ceil_div(peeled_k, patch_rows*patch_depth)+startCol, patch_cols); + + for (Index c = startCol; c < max_cols; ++c) { + eigen_assert(k < peeled_k); + const Index startRow = (c == startCol) ? rhs.rowOffset() : 0; + const Index max_rows = std::min<Index>(ceil_div(peeled_k-c*patch_rows*patch_depth, patch_depth)+startRow, patch_rows); + + const bool pad_col0 = dm0.padCol(c); + const bool pad_col1 = dm1.padCol(c); + const bool pad_col2 = dm2.padCol(c); + const bool pad_col3 = dm3.padCol(c); + for (Index r = startRow; r < max_rows; ++r) { + eigen_assert(k < peeled_k); + const bool pad0 = pad_col0 || dm0.padRow(r); + const bool pad1 = pad_col1 || dm1.padRow(r); + const bool pad2 = pad_col2 || dm2.padRow(r); + const bool pad3 = pad_col3 || dm3.padRow(r); + + const Index idx0 = dm0.baseIndex(r, c); + const Index idx1 = dm1.baseIndex(r, c); + const Index idx2 = dm2.baseIndex(r, c); + const Index idx3 = dm3.baseIndex(r, c); + + const Index startDepth = ((c == startCol) && (r == startRow)) ? rhs.depthOffset() : 0; + const Index max_depth = std::min<Index>(peeled_k-c*patch_rows*patch_depth-r*patch_depth+startDepth, patch_depth); + eigen_assert(max_depth % packet_size == 0); + for (Index d = startDepth; d < max_depth; d += packet_size) { + eigen_assert(k < peeled_k); + PacketBlock<Packet, 4> kernel; + kernel.packet[0] = pad0 ? pset1<Packet>(0) : dm0.packetNoPadding(d, idx0); + kernel.packet[1] = pad1 ? pset1<Packet>(0) : dm1.packetNoPadding(d, idx1); + kernel.packet[2] = pad2 ? pset1<Packet>(0) : dm2.packetNoPadding(d, idx2); + kernel.packet[3] = pad3 ? 
pset1<Packet>(0) : dm3.packetNoPadding(d, idx3); + ptranspose(kernel); + pstoreu(block+0*packet_size, kernel.packet[0]); + pstoreu(block+1*packet_size, kernel.packet[1]); + pstoreu(block+2*packet_size, kernel.packet[2]); + pstoreu(block+3*packet_size, kernel.packet[3]); + block+=4*packet_size; + k += packet_size; + } + } + } + } + + for(; k<peeled_k; k+=packet_size) { + PacketBlock<Packet, 4> kernel; + kernel.packet[0] = dm0.loadPacket(k); + kernel.packet[1] = dm1.loadPacket(k); + kernel.packet[2] = dm2.loadPacket(k); + kernel.packet[3] = dm3.loadPacket(k); + ptranspose(kernel); + pstoreu(block+0*packet_size, kernel.packet[0]); + pstoreu(block+1*packet_size, kernel.packet[1]); + pstoreu(block+2*packet_size, kernel.packet[2]); + pstoreu(block+3*packet_size, kernel.packet[3]); + block+=4*packet_size; + } + } + for(; k<depth; k++) + { + block[0] = dm0(k); + block[1] = dm1(k); + block[2] = dm2(k); + block[3] = dm3(k); + block += 4; + } + } + + // copy the remaining columns one at a time (nr==1) + for(Index j2=packet_cols4; j2<cols; ++j2) + { + const LinearMapper dm0 = rhs.getLinearMapper(0, j2); + for(Index k=0; k<depth; k++) + { + *block = dm0(k); + block += 1; + } + } + } +}; + +#endif // EIGEN_VECTORIZE +} // end namespace internal + + +/** SpatialConvolution + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies a 2D convolution over a multichannel input image. + * + * The input parameter is expected to be a tensor with a rank of 3 or more (channels, height, width, and optionally others) + * The kernel parameter is expected to be a 4D tensor (filters, channels, kernel_height, kernel_width) + * The input and the kernel must both be in col-major layout. The result will also be in col-major layout. + * + * If in_stride > 1, then applies convolution with holes (aka atrous convolution), sampling every in_stride input pixels. + * + * The result can be assigned to a tensor of rank equal to the rank of the input. The dimensions of the result will be filters, height, width (and others if applicable). + * + * It is possible to swap the order of the width and height dimensions provided that the same order is used in the input, the kernel, and the output. 
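+ *
+ * A minimal usage sketch (the shapes below are purely illustrative): applying
+ * 7 filters of size 3x3 with the default stride of 1 and SAME padding to a
+ * col-major input of shape (channels, height, width, batch):
+ * \code
+ * Eigen::Tensor<float, 4> input(4, 32, 32, 8);
+ * Eigen::Tensor<float, 4> kernel(7, 4, 3, 3);
+ * Eigen::Tensor<float, 4> output = Eigen::SpatialConvolution(input, kernel);
+ * \endcode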
+ * + */ +template <typename Input, typename Kernel> +EIGEN_ALWAYS_INLINE +static const typename internal::conditional< + internal::traits<Input>::Layout == ColMajor, + TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>, const TensorContractionOp<const array<IndexPair<typename internal::traits<Input>::Index>, 1>, const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, const Kernel>, const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, const TensorImagePatchOp<Dynamic, Dynamic, const Input> > > >, + TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>, const TensorContractionOp<const array<IndexPair<typename internal::traits<Input>::Index>, 1>, const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, const TensorImagePatchOp<Dynamic, Dynamic, const Input> >, const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, const Kernel> > > >::type +SpatialConvolution(const Input& input, const Kernel& kernel, const DenseIndex stride = 1, const PaddingType padding_type = PADDING_SAME, const DenseIndex in_stride = 1) { + + typedef typename internal::traits<Input>::Index TensorIndex; + TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input); + TensorRef<Tensor<typename internal::traits<Kernel>::Scalar, internal::traits<Kernel>::NumDimensions, internal::traits<Kernel>::Layout, TensorIndex> > kern(kernel); + + EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == internal::traits<Kernel>::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE); + static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor); + + static const int NumDims = internal::traits<Input>::NumDimensions; + + // Number of filters to apply. This is the same as the output depth of the result + const TensorIndex kernelFilters = isColMajor ? kern.dimensions()[0] : kern.dimensions()[3]; + // Number of channels. This is the same as the input depth. + const TensorIndex kernelChannels = isColMajor ? kern.dimensions()[1] : kern.dimensions()[2]; + const TensorIndex kernelRows = isColMajor ? kern.dimensions()[2] : kern.dimensions()[1]; + const TensorIndex kernelCols = isColMajor ? kern.dimensions()[3] : kern.dimensions()[0]; + + const DenseIndex kernelRowsEff = kernelRows + (kernelRows - 1) * (in_stride - 1); + const DenseIndex kernelColsEff = kernelCols + (kernelCols - 1) * (in_stride - 1); + + array<IndexPair<TensorIndex>, 1> contract_dims; + contract_dims[0] = IndexPair<TensorIndex>(1, 0); + + const TensorIndex InputRows = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2); + const TensorIndex InputCols = isColMajor ? 
in.dimension(2) : in.dimension(NumDims - 3); + + TensorIndex out_height; + TensorIndex out_width; + switch (padding_type) { + case PADDING_VALID: + out_height = numext::ceil((InputRows - kernelRowsEff + 1.f) / static_cast<float>(stride)); + out_width = numext::ceil((InputCols - kernelColsEff + 1.f) / static_cast<float>(stride)); + break; + case PADDING_SAME: + out_height = numext::ceil(InputRows / static_cast<float>(stride)); + out_width = numext::ceil(InputCols / static_cast<float>(stride)); + break; + default: + eigen_assert(false && "unexpected padding"); + } + + // Molds the output of the patch extraction code into a 2d tensor: + // - the first dimension (dims[0]): the patch values to be multiplied with the kernels + // - the second dimension (dims[1]): everything else + DSizes<TensorIndex, 2> pre_contract_dims; + if (isColMajor) { + pre_contract_dims[0] = kernelChannels * kernelRows * kernelCols; + pre_contract_dims[1] = out_height * out_width; + for (int i = 3; i < NumDims; ++i) { + pre_contract_dims[1] *= in.dimension(i); + } + } else { + pre_contract_dims[1] = kernelChannels * kernelRows * kernelCols; + pre_contract_dims[0] = out_height * out_width; + for (int i = 0; i < NumDims - 3; ++i) { + pre_contract_dims[0] *= in.dimension(i); + } + } + + // Molds the output of the contraction into the shape expected by the used + // (assuming this is ColMajor): + // - 1st dim: kernel filters + // - 2nd dim: output height + // - 3rd dim: output width + // - 4th dim and beyond: everything else including batch size + DSizes<TensorIndex, NumDims> post_contract_dims; + if (isColMajor) { + post_contract_dims[0] = kernelFilters; + post_contract_dims[1] = out_height; + post_contract_dims[2] = out_width; + for (int i = 3; i < NumDims; ++i) { + post_contract_dims[i] = in.dimension(i); + } + } else { + post_contract_dims[NumDims - 1] = kernelFilters; + post_contract_dims[NumDims - 2] = out_height; + post_contract_dims[NumDims - 3] = out_width; + for (int i = 0; i < NumDims - 3; ++i) { + post_contract_dims[i] = in.dimension(i); + } + } + + DSizes<TensorIndex, 2> kernel_dims; + if (isColMajor) { + kernel_dims[0] = kernelFilters; + kernel_dims[1] = kernelChannels * kernelRows * kernelCols; + } else { + kernel_dims[0] = kernelChannels * kernelRows * kernelCols; + kernel_dims[1] = kernelFilters; + } + // TODO(yangke): choose() is defined in TensorContraction.h -- consider + // moving it to somewhere more "common". + return choose(Cond<internal::traits<Input>::Layout == ColMajor>(), + kernel.reshape(kernel_dims).contract(input.extract_image_patches(kernelRows, kernelCols, stride, stride, in_stride, in_stride, padding_type).reshape(pre_contract_dims), contract_dims).reshape(post_contract_dims), + input.extract_image_patches(kernelRows, kernelCols, stride, stride, in_stride, in_stride, padding_type).reshape(pre_contract_dims).contract(kernel.reshape(kernel_dims), contract_dims).reshape(post_contract_dims)); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_NEURAL_NETWORKS_SPATIAL_CONVOLUTIONS_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/TensorConvolutionByFFT.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/TensorConvolutionByFFT.h new file mode 100644 index 0000000000..0e72173536 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/TensorConvolutionByFFT.h @@ -0,0 +1,289 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// Copyright (C) 2015 Jianwei Cui <thucjw@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTIONBYFFT_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTIONBYFFT_H + +namespace Eigen { + +/** \class TensorConvolutionByFFT + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor convolution class. + * + * + */ +namespace internal { + + +template<typename Dimensions, typename InputXprType, typename KernelXprType> +struct traits<TensorConvolutionByFFTOp<Dimensions, InputXprType, KernelXprType> > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename promote_storage_type<typename InputXprType::Scalar, + typename KernelXprType::Scalar>::ret Scalar; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind, + typename traits<KernelXprType>::StorageKind>::ret StorageKind; + typedef typename promote_index_type<typename traits<InputXprType>::Index, + typename traits<KernelXprType>::Index>::type Index; + typedef typename InputXprType::Nested LhsNested; + typedef typename KernelXprType::Nested RhsNested; + typedef typename remove_reference<LhsNested>::type _LhsNested; + typedef typename remove_reference<RhsNested>::type _RhsNested; + static const int NumDimensions = traits<InputXprType>::NumDimensions; + static const int Layout = traits<InputXprType>::Layout; + + enum { + Flags = 0, + }; +}; + +template<typename Dimensions, typename InputXprType, typename KernelXprType> +struct eval<TensorConvolutionByFFTOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense> +{ + typedef const TensorConvolutionByFFTOp<Dimensions, InputXprType, KernelXprType>& type; +}; + +template<typename Dimensions, typename InputXprType, typename KernelXprType> +struct nested<TensorConvolutionByFFTOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionByFFTOp<Dimensions, InputXprType, KernelXprType> >::type> +{ + typedef TensorConvolutionByFFTOp<Dimensions, InputXprType, KernelXprType> type; +}; + +} // end namespace internal + + + +template<typename Indices, typename InputXprType, typename KernelXprType> +class TensorConvolutionByFFTOp : public TensorBase<TensorConvolutionByFFTOp<Indices, InputXprType, KernelXprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorConvolutionByFFTOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorConvolutionByFFTOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType, + typename KernelXprType::CoeffReturnType>::ret CoeffReturnType; + typedef typename internal::promote_storage_type<typename InputXprType::PacketReturnType, + typename KernelXprType::PacketReturnType>::ret PacketReturnType; + typedef typename Eigen::internal::nested<TensorConvolutionByFFTOp>::type Nested; + typedef typename Eigen::internal::traits<TensorConvolutionByFFTOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorConvolutionByFFTOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionByFFTOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims) + : m_input_xpr(input), 
m_kernel_xpr(kernel), m_indices(dims) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Indices& indices() const { return m_indices; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const typename internal::remove_all<typename InputXprType::Nested>::type& + inputExpression() const { return m_input_xpr; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const typename internal::remove_all<typename KernelXprType::Nested>::type& + kernelExpression() const { return m_kernel_xpr; } + + protected: + typename InputXprType::Nested m_input_xpr; + typename KernelXprType::Nested m_kernel_xpr; + const Indices m_indices; +}; + + +template<typename Indices, typename InputArgType, typename KernelArgType, typename Device> +struct TensorEvaluator<const TensorConvolutionByFFTOp<Indices, InputArgType, KernelArgType>, Device> +{ + typedef TensorConvolutionByFFTOp<Indices, InputArgType, KernelArgType> XprType; + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + + static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value; + static const int NumKernelDims = internal::array_size<Indices>::value; + typedef typename XprType::Index Index; + typedef DSizes<Index, NumDims> Dimensions; + + enum { + IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & + TensorEvaluator<KernelArgType, Device>::IsAligned, + PacketAccess = false, + BlockAccess = false, + Layout = TensorEvaluator<InputArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device) + { + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_inputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1]; + } + } else { + m_inputStride[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1]; + } + } + + m_dimensions = m_inputImpl.dimensions(); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + if (i > 0) { + m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1]; + } else { + m_kernelStride[0] = 1; + } + m_indexStride[i] = m_inputStride[index]; + } + + m_outputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1]; + } + } else { + for (int i = NumKernelDims - 1; i >= 0; --i) { + const Index index = 
op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + if (i < NumKernelDims - 1) { + m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1]; + } else { + m_kernelStride[NumKernelDims - 1] = 1; + } + m_indexStride[i] = m_inputStride[index]; + } + + m_outputStride[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + m_inputImpl.evalSubExprsIfNeeded(NULL); + m_kernelImpl.evalSubExprsIfNeeded(NULL); + + typedef typename internal::traits<InputArgType>::Index TensorIndex; + + Tensor<Scalar, NumDims, Layout, TensorIndex> input(m_inputImpl.dimensions()); + for (int i = 0; i < m_inputImpl.dimensions().TotalSize(); ++i) { + input.data()[i] = m_inputImpl.coeff(i); + } + + Tensor<Scalar, NumDims, Layout, TensorIndex> kernel(m_kernelImpl.dimensions()); + for (int i = 0; i < m_kernelImpl.dimensions().TotalSize(); ++i) { + kernel.data()[i] = m_kernelImpl.coeff(i); + } + + array<std::pair<ptrdiff_t, ptrdiff_t>, NumDims> paddings; + for (int i = 0; i < NumDims; ++i) { + paddings[i] = std::make_pair(0, m_inputImpl.dimensions()[i] - m_kernelImpl.dimensions()[i]); + } + + Eigen::array<bool, NumKernelDims> reverse; + for (int i = 0; i < NumKernelDims; ++i) { + reverse[i] = true; + } + + Eigen::array<bool, NumDims> fft; + for (int i = 0; i < NumDims; ++i) { + fft[i] = i; + } + + Eigen::DSizes<TensorIndex, NumDims> slice_offsets; + for (int i = 0; i < NumDims; ++i) { + slice_offsets[i] = m_kernelImpl.dimensions()[i] - 1; + } + + Eigen::DSizes<TensorIndex, NumDims> slice_extents; + for (int i = 0; i < NumDims; ++i) { + slice_extents[i] = m_inputImpl.dimensions()[i] - m_kernelImpl.dimensions()[i] + 1; + } + + Tensor<Scalar, NumDims, Layout, TensorIndex> kernel_variant = kernel.reverse(reverse).pad(paddings); + Tensor<std::complex<Scalar>, NumDims, Layout, TensorIndex> kernel_fft = kernel_variant.template fft<Eigen::BothParts, FFT_FORWARD>(fft); + //Tensor<std::complex<Scalar>, NumDims, Layout|IndexType> kernel_fft = kernel.reverse(reverse).pad(paddings).template fft<2>(fft); + Tensor<std::complex<Scalar>, NumDims, Layout, TensorIndex> input_fft = input.template fft<Eigen::BothParts, FFT_FORWARD>(fft); + Tensor<std::complex<Scalar>, NumDims, Layout, TensorIndex> prod = (input_fft * kernel_fft).template fft<Eigen::BothParts, FFT_REVERSE>(fft); + Tensor<std::complex<Scalar>, NumDims, Layout, TensorIndex> tensor_result = prod.slice(slice_offsets, slice_extents); + + for (int i = 0; i < tensor_result.size(); ++i) { + data[i] = std::real(tensor_result.data()[i]); + } + return false; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_local_kernel) { + m_device.deallocate((void*)m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + + void evalTo(typename XprType::Scalar* buffer) { + evalSubExprsIfNeeded(NULL); + for (int i = 0; i < dimensions().TotalSize(); ++i) { + buffer[i] += coeff(i); + } + cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + CoeffReturnType result = CoeffReturnType(0); + return result; + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + private: + 
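// The members below cache the geometry needed by this evaluator: m_inputStride and
// m_outputStride hold per-dimension strides of the input and of the convolution result,
// m_kernelStride and m_indexStride map each kernel dimension onto the corresponding
// input stride, and m_kernel points to a contiguous copy of the kernel when one had to
// be materialized locally (m_local_kernel), in which case cleanup() returns that buffer
// to m_device.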
array<Index, NumDims> m_inputStride; + array<Index, NumDims> m_outputStride; + + array<Index, NumKernelDims> m_indexStride; + array<Index, NumKernelDims> m_kernelStride; + TensorEvaluator<InputArgType, Device> m_inputImpl; + TensorEvaluator<KernelArgType, Device> m_kernelImpl; + Dimensions m_dimensions; + + KernelArgType m_kernelArg; + const Scalar* m_kernel; + bool m_local_kernel; + const Device& m_device; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTIONBYFFT_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/Tensor.h new file mode 100644 index 0000000000..9db0d2698f --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -0,0 +1,461 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler <christian@iwakd.de> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_H + +namespace Eigen { + +/** \class Tensor + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor class. + * + * The %Tensor class is the work-horse for all \em dense tensors within Eigen. + * + * The %Tensor class encompasses only dynamic-size objects so far. + * + * The first two template parameters are required: + * \tparam Scalar_ \anchor tensor_tparam_scalar Numeric type, e.g. float, double, int or std::complex<float>. + * User defined scalar types are supported as well (see \ref user_defined_scalars "here"). + * \tparam NumIndices_ Number of indices (i.e. rank of the tensor) + * + * The remaining template parameters are optional -- in most cases you don't have to worry about them. + * \tparam Options_ \anchor tensor_tparam_options A combination of either \b #RowMajor or \b #ColMajor, and of either + * \b #AutoAlign or \b #DontAlign. + * The former controls \ref TopicStorageOrders "storage order", and defaults to column-major. The latter controls alignment, which is required + * for vectorization. It defaults to aligning tensors. Note that tensors currently do not support any operations that profit from vectorization. + * Support for such operations (i.e. adding two tensors etc.) is planned. + * + * You can access elements of tensors using normal subscripting: + * + * \code + * Eigen::Tensor<double, 4> t(10, 10, 10, 10); + * t(0, 1, 2, 3) = 42.0; + * \endcode + * + * This class can be extended with the help of the plugin mechanism described on the page + * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_TENSOR_PLUGIN. + * + * <i><b>Some notes:</b></i> + * + * <dl> + * <dt><b>Relation to other parts of Eigen:</b></dt> + * <dd>The midterm development goal for this class is to have a similar hierarchy as Eigen uses for matrices, so that + * taking blocks or using tensors in expressions is easily possible, including an interface with the vector/matrix code + * by providing .asMatrix() and .asVector() (or similar) methods for rank 2 and 1 tensors. However, currently, the %Tensor + * class does not provide any of these features and is only available as a stand-alone class that just allows for + * coefficient access.
Also, when fixed-size tensors are implemented, the number of template arguments is likely to + * change dramatically.</dd> + * </dl> + * + * \ref TopicStorageOrders + */ + +template<typename Scalar_, std::size_t NumIndices_, int Options_, typename IndexType_> +class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexType_> > +{ + public: + typedef Tensor<Scalar_, NumIndices_, Options_, IndexType_> Self; + typedef TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexType_> > Base; + typedef typename Eigen::internal::nested<Self>::type Nested; + typedef typename internal::traits<Self>::StorageKind StorageKind; + typedef typename internal::traits<Self>::Index Index; + typedef Scalar_ Scalar; + typedef typename internal::packet_traits<Scalar>::type Packet; + typedef typename NumTraits<Scalar>::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + typedef typename Base::PacketReturnType PacketReturnType; + + enum { + IsAligned = bool(EIGEN_ALIGN) & !(Options_ & DontAlign), + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + BlockAccess = false, + Layout = Options_ & RowMajor ? RowMajor : ColMajor, + CoordAccess = true, + }; + + static const int Options = Options_; + static const std::size_t NumIndices = NumIndices_; + typedef DSizes<Index, NumIndices_> Dimensions; + + protected: + TensorStorage<Scalar, Dimensions, Options_> m_storage; + + public: + // Metadata + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } + + // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + // work, because that uses base().coeffRef() - and we don't yet + // implement a similar class hierarchy + inline Self& base() { return *this; } + inline const Self& base() const { return *this; } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + EIGEN_DEVICE_FUNC inline const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeff(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}}); + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(const array<Index, NumIndices>& indices) const + { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff() const + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return m_storage.data()[0]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + inline Scalar& coeffRef(Index firstIndex, Index secondIndex, IndexTypes... 
otherIndices) + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}}); + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices) + { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef() + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE) + return m_storage.data()[0]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) + { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + inline const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return this->operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}}); + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const + { + return coeff(array<Index, 2>(i0, i1)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const + { + return coeff(array<Index, 3>(i0, i1, i2)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const + { + return coeff(array<Index, 4>(i0, i1, i2, i3)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const + { + return coeff(array<Index, 5>(i0, i1, i2, i3, i4)); + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const + { + eigen_assert(checkIndexRange(indices)); + return coeff(indices); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()() const + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeff(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return coeff(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const + { + // The bracket operator is only for vectors, use the parenthesis operator instead. + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeff(index); + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + inline Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}}); + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1) + { + return coeffRef(array<Index, 2>(i0, i1)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2) + { + return coeffRef(array<Index, 3>(i0, i1, i2)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) + { + return coeffRef(array<Index, 4>(i0, i1, i2, i3)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) + { + return coeffRef(array<Index, 5>(i0, i1, i2, i3, i4)); + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices) + { + eigen_assert(checkIndexRange(indices)); + return coeffRef(indices); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()() + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeffRef(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index index) + { + eigen_assert(index >= 0 && index < size()); + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator[](Index index) + { + // The bracket operator is only for vectors, use the parenthesis operator instead + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor() + : m_storage() + { + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor(const Self& other) + : m_storage(other.m_storage) + { + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + inline Tensor(Index firstDimension, IndexTypes... otherDimensions) + : m_storage(internal::array_prod(array<Index, NumIndices>{{firstDimension, otherDimensions...}}), array<Index, NumIndices>{{firstDimension, otherDimensions...}}) + { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. 
+ EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } +#else + inline explicit Tensor(Index dim1) + : m_storage(dim1, array<Index, 1>(dim1)) + { + EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + inline explicit Tensor(Index dim1, Index dim2) + : m_storage(dim1*dim2, array<Index, 2>(dim1, dim2)) + { + EIGEN_STATIC_ASSERT(2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + inline explicit Tensor(Index dim1, Index dim2, Index dim3) + : m_storage(dim1*dim2*dim3, array<Index, 3>(dim1, dim2, dim3)) + { + EIGEN_STATIC_ASSERT(3 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + inline explicit Tensor(Index dim1, Index dim2, Index dim3, Index dim4) + : m_storage(dim1*dim2*dim3*dim4, array<Index, 4>(dim1, dim2, dim3, dim4)) + { + EIGEN_STATIC_ASSERT(4 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + inline explicit Tensor(Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) + : m_storage(dim1*dim2*dim3*dim4*dim5, array<Index, 5>(dim1, dim2, dim3, dim4, dim5)) + { + EIGEN_STATIC_ASSERT(5 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } +#endif + + inline explicit Tensor(const array<Index, NumIndices>& dimensions) + : m_storage(internal::array_prod(dimensions), dimensions) + { + EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor(const TensorBase<OtherDerived, ReadOnlyAccessors>& other) + { + typedef TensorAssignOp<Tensor, const OtherDerived> Assign; + Assign assign(*this, other.derived()); + resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions()); + internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); + } + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor(const TensorBase<OtherDerived, WriteAccessors>& other) + { + typedef TensorAssignOp<Tensor, const OtherDerived> Assign; + Assign assign(*this, other.derived()); + resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions()); + internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other) + { + typedef TensorAssignOp<Tensor, const Tensor> Assign; + Assign assign(*this, other); + resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions()); + internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); + return *this; + } + template<typename Other> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor& operator=(const Other& other) + { + typedef TensorAssignOp<Tensor, const Other> Assign; + Assign assign(*this, other); + resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions()); + internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); + return *this; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> EIGEN_DEVICE_FUNC + void resize(Index firstDimension, IndexTypes... otherDimensions) + { + // The number of dimensions used to resize a tensor must be equal to the rank of the tensor.
+ EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + resize(array<Index, NumIndices>{firstDimension, otherDimensions...}); + } +#endif + + EIGEN_DEVICE_FUNC + void resize() + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + // Nothing to do: rank 0 tensors have fixed size + } + + EIGEN_DEVICE_FUNC + void resize(const array<Index, NumIndices>& dimensions) + { + Index size = Index(1); + for (size_t i = 0; i < NumIndices; i++) { + internal::check_rows_cols_for_overflow<Dynamic>::run(size, dimensions[i]); + size *= dimensions[i]; + } + #ifdef EIGEN_INITIALIZE_COEFFS + bool size_changed = size != this->size(); + m_storage.resize(size, dimensions); + if(size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + #else + m_storage.resize(size, dimensions); + #endif + } + + EIGEN_DEVICE_FUNC + void resize(const DSizes<Index, NumIndices>& dimensions) { + array<Index, NumIndices> dims; + for (int i = 0; i < NumIndices; ++i) { + dims[i] = dimensions[i]; + } + resize(dims); + } + +#ifndef EIGEN_EMULATE_CXX11_META_H + template <typename std::size_t... Indices> + EIGEN_DEVICE_FUNC + void resize(const Sizes<Indices...>& dimensions) { + array<Index, NumIndices> dims; + for (int i = 0; i < NumIndices; ++i) { + dims[i] = dimensions[i]; + } + resize(dims); + } +#else + template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> + EIGEN_DEVICE_FUNC + void resize(const Sizes<V1, V2, V3, V4, V5>& dimensions) { + array<Index, NumIndices> dims; + for (int i = 0; i < NumIndices; ++i) { + dims[i] = dimensions[i]; + } + resize(dims); + } +#endif + + protected: + + bool checkIndexRange(const array<Index, NumIndices>& indices) const + { + using internal::array_apply_and_reduce; + using internal::array_zip_and_reduce; + using internal::greater_equal_zero_op; + using internal::logical_and_op; + using internal::lesser_op; + + return + // check whether the indices are all >= 0 + array_apply_and_reduce<logical_and_op, greater_equal_zero_op>(indices) && + // check whether the indices fit in the dimensions + array_zip_and_reduce<logical_and_op, lesser_op>(indices, m_storage.dimensions()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index linearizedIndex(const array<Index, NumIndices>& indices) const + { + if (Options&RowMajor) { + return m_storage.dimensions().IndexOfRowMajor(indices); + } else { + return m_storage.dimensions().IndexOfColMajor(indices); + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h new file mode 100644 index 0000000000..ee3bf7fe34 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -0,0 +1,288 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Eugene Brevdo <ebrevdo@gmail.com> +// Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H +#define EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H + +namespace Eigen { +namespace internal { + +/** \class TensorIndexTuple + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor + Index Tuple class. 
+ * + * + */ +template<typename XprType> +struct traits<TensorIndexTupleOp<XprType> > : public traits<XprType> +{ + typedef traits<XprType> XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef Tuple<Index, typename XprTraits::Scalar> Scalar; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename XprType> +struct eval<TensorIndexTupleOp<XprType>, Eigen::Dense> +{ + typedef const TensorIndexTupleOp<XprType>& type; +}; + +template<typename XprType> +struct nested<TensorIndexTupleOp<XprType>, 1, + typename eval<TensorIndexTupleOp<XprType> >::type> +{ + typedef TensorIndexTupleOp<XprType> type; +}; + +} // end namespace internal + +template<typename XprType> +class TensorIndexTupleOp : public TensorBase<TensorIndexTupleOp<XprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorIndexTupleOp>::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename Eigen::internal::nested<TensorIndexTupleOp>::type Nested; + typedef typename Eigen::internal::traits<TensorIndexTupleOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorIndexTupleOp>::Index Index; + typedef Tuple<Index, typename XprType::CoeffReturnType> CoeffReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIndexTupleOp(const XprType& expr) + : m_xpr(expr) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; +}; + +// Eval as rvalue +template<typename ArgType, typename Device> +struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device> +{ + typedef TensorIndexTupleOp<ArgType> XprType; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + + typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; + static const int NumDims = internal::array_size<Dimensions>::value; + + enum { + IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false, + PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { + return m_impl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return CoeffReturnType(index, m_impl.coeff(index)); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + TensorEvaluator<ArgType, Device> m_impl; +}; + +namespace internal { + +/** \class TensorTupleIndex + * \ingroup CXX11_Tensor_Module + * + * \brief Converts to Tensor<Tuple<Index, Scalar> > and reduces to Tensor<Index>. 
+ * + */ +template<typename ReduceOp, typename Dims, typename XprType> +struct traits<TensorTupleReducerOp<ReduceOp, Dims, XprType> > : public traits<XprType> +{ + typedef traits<XprType> XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef Index Scalar; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename ReduceOp, typename Dims, typename XprType> +struct eval<TensorTupleReducerOp<ReduceOp, Dims, XprType>, Eigen::Dense> +{ + typedef const TensorTupleReducerOp<ReduceOp, Dims, XprType>& type; +}; + +template<typename ReduceOp, typename Dims, typename XprType> +struct nested<TensorTupleReducerOp<ReduceOp, Dims, XprType>, 1, + typename eval<TensorTupleReducerOp<ReduceOp, Dims, XprType> >::type> +{ + typedef TensorTupleReducerOp<ReduceOp, Dims, XprType> type; +}; + +} // end namespace internal + +template<typename ReduceOp, typename Dims, typename XprType> +class TensorTupleReducerOp : public TensorBase<TensorTupleReducerOp<ReduceOp, Dims, XprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorTupleReducerOp>::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename Eigen::internal::nested<TensorTupleReducerOp>::type Nested; + typedef typename Eigen::internal::traits<TensorTupleReducerOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorTupleReducerOp>::Index Index; + typedef Index CoeffReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTupleReducerOp(const XprType& expr, + const ReduceOp& reduce_op, + const int return_dim, + const Dims& reduce_dims) + : m_xpr(expr), m_reduce_op(reduce_op), m_return_dim(return_dim), m_reduce_dims(reduce_dims) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + const ReduceOp& reduce_op() const { return m_reduce_op; } + + EIGEN_DEVICE_FUNC + const Dims& reduce_dims() const { return m_reduce_dims; } + + EIGEN_DEVICE_FUNC + int return_dim() const { return m_return_dim; } + + protected: + typename XprType::Nested m_xpr; + const ReduceOp m_reduce_op; + const int m_return_dim; + const Dims m_reduce_dims; +}; + +// Eval as rvalue +template<typename ReduceOp, typename Dims, typename ArgType, typename Device> +struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Device> +{ + typedef TensorTupleReducerOp<ReduceOp, Dims, ArgType> XprType; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename TensorIndexTupleOp<ArgType>::CoeffReturnType TupleType; + typedef typename TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device>::Dimensions Dimensions; + typedef typename TensorEvaluator<const TensorIndexTupleOp<ArgType> , Device>::Dimensions InputDimensions; + static const int NumDims = internal::array_size<InputDimensions>::value; + typedef array<Index, NumDims> StrideDims; + + enum { + IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false, + PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false, + BlockAccess = false, + Layout = TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> 
>, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_orig_impl(op.expression(), device), + m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device), + m_return_dim(op.return_dim()), + m_strides(gen_strides(m_orig_impl.dimensions())), + m_stride_mod(gen_stride_mod(m_orig_impl.dimensions())), + m_stride_div(gen_stride_div()) { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { + return m_impl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + const TupleType v = m_impl.coeff(index); + return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div; + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + private: + EIGEN_DEVICE_FUNC StrideDims gen_strides(const InputDimensions& dims) { + StrideDims strides; + if (m_return_dim < 0) return strides; // Won't be using these. + eigen_assert(m_return_dim < NumDims && + "Asking to convert index to a dimension outside of the rank"); + + // Calculate m_stride_div and m_stride_mod, which are used to + // calculate the value of an index w.r.t. the m_return_dim. + if (Layout == static_cast<int>(ColMajor)) { + strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + strides[i] = strides[i-1] * dims[i-1]; + } + } else { + strides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + strides[i] = strides[i+1] * dims[i+1]; + } + } + return strides; + } + + EIGEN_DEVICE_FUNC Index gen_stride_mod(const InputDimensions& dims) { + if (Layout == static_cast<int>(ColMajor)) { + return (m_return_dim < NumDims - 1) ? m_strides[m_return_dim + 1] : dims.TotalSize(); + } else { + return (m_return_dim > 0) ? m_strides[m_return_dim - 1] : dims.TotalSize(); + } + } + + EIGEN_DEVICE_FUNC Index gen_stride_div() { + return m_strides[m_return_dim]; + } + + protected: + TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device> m_orig_impl; + TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device> m_impl; + const int m_return_dim; + const StrideDims m_strides; + const Index m_stride_mod; + const Index m_stride_div; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h new file mode 100644 index 0000000000..fdb943e713 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -0,0 +1,179 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H +#define EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H + +namespace Eigen { + +/** \class TensorAssign + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor assignment class. 
+ * + * This class represents the assignment of the values resulting from the evaluation of + * the rhs expression to the memory locations denoted by the lhs expression. + */ +namespace internal { +template<typename LhsXprType, typename RhsXprType> +struct traits<TensorAssignOp<LhsXprType, RhsXprType> > +{ + typedef typename LhsXprType::Scalar Scalar; + typedef typename traits<LhsXprType>::StorageKind StorageKind; + typedef typename promote_index_type<typename traits<LhsXprType>::Index, + typename traits<RhsXprType>::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference<LhsNested>::type _LhsNested; + typedef typename remove_reference<RhsNested>::type _RhsNested; + static const std::size_t NumDimensions = internal::traits<LhsXprType>::NumDimensions; + static const int Layout = internal::traits<LhsXprType>::Layout; + + enum { + Flags = 0, + }; +}; + +template<typename LhsXprType, typename RhsXprType> +struct eval<TensorAssignOp<LhsXprType, RhsXprType>, Eigen::Dense> +{ + typedef const TensorAssignOp<LhsXprType, RhsXprType>& type; +}; + +template<typename LhsXprType, typename RhsXprType> +struct nested<TensorAssignOp<LhsXprType, RhsXprType>, 1, typename eval<TensorAssignOp<LhsXprType, RhsXprType> >::type> +{ + typedef TensorAssignOp<LhsXprType, RhsXprType> type; +}; + +} // end namespace internal + + + +template<typename LhsXprType, typename RhsXprType> +class TensorAssignOp : public TensorBase<TensorAssignOp<LhsXprType, RhsXprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorAssignOp>::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename LhsXprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::traits<TensorAssignOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorAssignOp>::Index Index; + static const std::size_t NumDims = Eigen::internal::traits<TensorAssignOp>::NumDimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs) {} + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + typename internal::remove_all<typename LhsXprType::Nested>::type& + lhsExpression() const { return *((typename internal::remove_all<typename LhsXprType::Nested>::type*)&m_lhs_xpr); } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename RhsXprType::Nested>::type& + rhsExpression() const { return m_rhs_xpr; } + + protected: + typename internal::remove_all<typename LhsXprType::Nested>::type& m_lhs_xpr; + const typename internal::remove_all<typename RhsXprType::Nested>::type& m_rhs_xpr; +}; + + +template<typename LeftArgType, typename RightArgType, typename Device> +struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device> +{ + typedef TensorAssignOp<LeftArgType, RightArgType> XprType; + + enum { + IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & + TensorEvaluator<RightArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & + TensorEvaluator<RightArgType, Device>::PacketAccess, + BlockAccess = TensorEvaluator<LeftArgType, Device>::BlockAccess & + TensorEvaluator<RightArgType, Device>::BlockAccess, + Layout = TensorEvaluator<LeftArgType, Device>::Layout, + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + m_leftImpl(op.lhsExpression(), device), + m_rightImpl(op.rhsExpression(), device)
+ { + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions; + static const std::size_t NumDims = XprType::NumDims; + + typedef typename internal::TensorBlock< + Index, typename internal::remove_const<Scalar>::type, NumDims, Layout> + TensorBlock; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const + { + // TODO: use left impl instead if right impl dimensions are known at compile time. + return m_rightImpl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); + m_leftImpl.evalSubExprsIfNeeded(NULL); + // If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non + // null value), attempt to evaluate the rhs expression in place. Returns true iff in place + // evaluation isn't supported and the caller still needs to manually assign the values generated + // by the rhs to the lhs. + return m_rightImpl.evalSubExprsIfNeeded(m_leftImpl.data()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { + m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { + const int LhsStoreMode = TensorEvaluator<LeftArgType, Device>::IsAligned ? Aligned : Unaligned; + const int RhsLoadMode = TensorEvaluator<RightArgType, Device>::IsAligned ? Aligned : Unaligned; + m_leftImpl.template writePacket<LhsStoreMode>(i, m_rightImpl.template packet<RhsLoadMode>(i)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector<internal::TensorOpResourceRequirements>* resources) const { + m_leftImpl.getResourceRequirements(resources); + m_rightImpl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(TensorBlock* block) { + m_rightImpl.block(block); + m_leftImpl.writeBlock(*block); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_leftImpl.coeff(index); + } + template<int LoadMode> + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + { + return m_leftImpl.template packet<LoadMode>(index); + } + + private: + TensorEvaluator<LeftArgType, Device> m_leftImpl; + TensorEvaluator<RightArgType, Device> m_rightImpl; +}; + +} + + +#endif // EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h new file mode 100644 index 0000000000..35ebca151b --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -0,0 +1,934 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_BASE_H +#define EIGEN_CXX11_TENSOR_TENSOR_BASE_H + +// clang-format off + +namespace Eigen { + +/** \class TensorBase + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor base class. + * + * This class is the common parent of the Tensor and TensorMap classes, thus + * making it possible to use either class interchangeably in expressions. + */ + +template<typename Derived> +class TensorBase<Derived, ReadOnlyAccessors> +{ + public: + typedef internal::traits<Derived> DerivedTraits; + typedef typename DerivedTraits::Scalar Scalar; + typedef typename DerivedTraits::Index Index; + typedef typename internal::remove_const<Scalar>::type CoeffReturnType; + typedef typename internal::packet_traits<CoeffReturnType>::type PacketReturnType; + static const int NumDimensions = DerivedTraits::NumDimensions; + + // Generic nullary operation support. + template <typename CustomNullaryOp> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<CustomNullaryOp, const Derived> + nullaryExpr(const CustomNullaryOp& func) const { + return TensorCwiseNullaryOp<CustomNullaryOp, const Derived>(derived(), func); + } + + // Coefficient-wise nullary operators + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> + constant(const Scalar& value) const { + return nullaryExpr(internal::scalar_constant_op<Scalar>(value)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<internal::UniformRandomGenerator<Scalar>, const Derived> + random() const { + return nullaryExpr(internal::UniformRandomGenerator<Scalar>()); + } + template <typename RandomGenerator> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<RandomGenerator, const Derived> + random(const RandomGenerator& gen = RandomGenerator()) const { + return nullaryExpr(gen); + } + + // Tensor generation + template <typename Generator> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorGeneratorOp<Generator, const Derived> + generate(const Generator& generator) const { + return TensorGeneratorOp<Generator, const Derived>(derived(), generator); + } + + // Generic unary operation support.
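// Usage sketch (an illustrative aside, not part of the original patch): the nullary
// helpers above and the unary helpers below build lazy expressions that are only
// evaluated when assigned to a tensor. The tensor `t` and the functor `Halve` are
// hypothetical names used for illustration only.
Eigen::Tensor<float, 2> t(3, 4);
t = t.constant(2.0f);                              // nullary: fill every coefficient with 2
Eigen::Tensor<float, 2> s = t.square();            // unary: coefficient-wise x*x
struct Halve { float operator()(float x) const { return 0.5f * x; } };
Eigen::Tensor<float, 2> h = t.unaryExpr(Halve());  // custom coefficient-wise functor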
+ template <typename CustomUnaryOp> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<CustomUnaryOp, const Derived> + unaryExpr(const CustomUnaryOp& func) const { + return TensorCwiseUnaryOp<CustomUnaryOp, const Derived>(derived(), func); + } + + // Coefficient-wise unary operators + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const Derived> + operator-() const { + return unaryExpr(internal::scalar_opposite_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived> + sqrt() const { + return unaryExpr(internal::scalar_sqrt_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_rsqrt_op<Scalar>, const Derived> + rsqrt() const { + return unaryExpr(internal::scalar_rsqrt_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_square_op<Scalar>, const Derived> + square() const { + return unaryExpr(internal::scalar_square_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_cube_op<Scalar>, const Derived> + cube() const { + return unaryExpr(internal::scalar_cube_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived> + inverse() const { + return unaryExpr(internal::scalar_inverse_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_tanh_op<Scalar>, const Derived> + tanh() const { + return unaryExpr(internal::scalar_tanh_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sigmoid_op<Scalar>, const Derived> + sigmoid() const { + return unaryExpr(internal::scalar_sigmoid_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_exp_op<Scalar>, const Derived> + exp() const { + return unaryExpr(internal::scalar_exp_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log_op<Scalar>, const Derived> + log() const { + return unaryExpr(internal::scalar_log_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived> + abs() const { + return unaryExpr(internal::scalar_abs_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_pow_op<Scalar>, const Derived> + pow(Scalar exponent) const { + return unaryExpr(internal::scalar_pow_op<Scalar>(exponent)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived> + operator+ (Scalar rhs) const { + return unaryExpr(internal::scalar_add_op<Scalar>(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sub_op<Scalar>, const Derived> + operator- (Scalar rhs) const { + EIGEN_STATIC_ASSERT((std::numeric_limits<Scalar>::is_signed || internal::is_same<Scalar, const std::complex<float> >::value), YOU_MADE_A_PROGRAMMING_MISTAKE); + return unaryExpr(internal::scalar_sub_op<Scalar>(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Derived> + operator* (Scalar rhs) const { + return unaryExpr(internal::scalar_multiple_op<Scalar>(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const 
TensorCwiseUnaryOp<internal::scalar_quotient1_op<Scalar>, const Derived> + operator/ (Scalar rhs) const { + // EIGEN_STATIC_ASSERT(!std::numeric_limits<Scalar>::is_integer, YOU_MADE_A_PROGRAMMING_MISTAKE); + return unaryExpr(internal::scalar_quotient1_op<Scalar>(rhs)); + } + + template <typename Scale> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_multiple2_op<Scalar, Scale>, const Derived> + scale (Scale rhs) const { + return unaryExpr(internal::scalar_multiple2_op<Scalar, Scale>(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_mod_op<Scalar>, const Derived> + operator% (Scalar rhs) const { + EIGEN_STATIC_ASSERT(std::numeric_limits<Scalar>::is_integer, YOU_MADE_A_PROGRAMMING_MISTAKE_TRY_MOD); + return unaryExpr(internal::scalar_mod_op<Scalar>(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_fmod_op<Scalar>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > + mod(Scalar rhs) const { + EIGEN_STATIC_ASSERT(!std::numeric_limits<Scalar>::is_integer, YOU_MADE_A_PROGRAMMING_MISTAKE_FMOD_IS_NOT_FOR_INTEGERS); + return mod(constant(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > + cwiseMax(Scalar threshold) const { + return cwiseMax(constant(threshold)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > + cwiseMin(Scalar threshold) const { + return cwiseMin(constant(threshold)); + } + + template <typename NewType> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorConversionOp<NewType, const Derived> + cast() const { + return TensorConversionOp<NewType, const Derived>(derived()); + } + + // Generic binary operation support. + template <typename CustomBinaryOp, typename OtherDerived> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived> + binaryExpr(const OtherDerived& other, const CustomBinaryOp& func) const { + return TensorCwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived>(derived(), other, func); + } + + // Coefficient-wise binary operators. 
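// Usage sketch (an illustrative aside, not part of the original patch): the
// coefficient-wise binary operators below combine two tensors of identical
// dimensions, again as lazy expressions. The tensors `a` and `b` are hypothetical
// names used for illustration only.
Eigen::Tensor<float, 2> a(3, 4), b(3, 4);
a = a.constant(1.0f);
b = b.constant(2.0f);
Eigen::Tensor<float, 2> sum  = a + b;           // coefficient-wise sum
Eigen::Tensor<float, 2> peak = a.cwiseMax(b);   // coefficient-wise maximum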
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const Derived, const OtherDerived> + operator+(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_sum_op<Scalar>()); + } + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_difference_op<Scalar>, const Derived, const OtherDerived> + operator-(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_difference_op<Scalar>()); + } + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_product_op<Scalar>, const Derived, const OtherDerived> + operator*(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_product_op<Scalar>()); + } + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived> + operator/(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_quotient_op<Scalar>()); + } + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_fmod_op<Scalar>, const Derived, const OtherDerived> + mod(const OtherDerived& other) const { + EIGEN_STATIC_ASSERT(!std::numeric_limits<Scalar>::is_integer, YOU_MADE_A_PROGRAMMING_MISTAKE_FMOD_IS_NOT_FOR_INTEGERS); + return binaryExpr(other.derived(), internal::scalar_fmod_op<Scalar>()); + } + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const OtherDerived> + cwiseMax(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_max_op<Scalar>()); + } + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const OtherDerived> + cwiseMin(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_min_op<Scalar>()); + } + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_boolean_and_op, const Derived, const OtherDerived> + operator&&(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_boolean_and_op()); + } + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_boolean_or_op, const Derived, const OtherDerived> + operator||(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_boolean_or_op()); + } + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_boolean_xor_op, const Derived, const OtherDerived> + operator^(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_boolean_xor_op()); + } + + // Comparisons and tests. 
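// Usage sketch (an illustrative aside, not part of the original patch): the comparison
// operators below produce boolean-valued expressions, coefficient by coefficient, which
// can be stored in a bool tensor or fed to select(). Reusing the hypothetical float
// tensors `a` and `b` from the sketch above:
Eigen::Tensor<bool, 2> mask = (a < b);      // std::less<float> applied coefficient-wise
Eigen::Tensor<bool, 2> big  = (a >= 1.0f);  // comparison against a constant tensor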
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<std::less<Scalar>, const Derived, const OtherDerived> + operator<(const OtherDerived& other) const { + return binaryExpr(other.derived(), std::less<Scalar>()); + } + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<std::less_equal<Scalar>, const Derived, const OtherDerived> + operator<=(const OtherDerived& other) const { + return binaryExpr(other.derived(), std::less_equal<Scalar>()); + } + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<std::greater<Scalar>, const Derived, const OtherDerived> + operator>(const OtherDerived& other) const { + return binaryExpr(other.derived(), std::greater<Scalar>()); + } + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<std::greater_equal<Scalar>, const Derived, const OtherDerived> + operator>=(const OtherDerived& other) const { + return binaryExpr(other.derived(), std::greater_equal<Scalar>()); + } + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<std::equal_to<Scalar>, const Derived, const OtherDerived> + operator==(const OtherDerived& other) const { + return binaryExpr(other.derived(), std::equal_to<Scalar>()); + } + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<std::not_equal_to<Scalar>, const Derived, const OtherDerived> + operator!=(const OtherDerived& other) const { + return binaryExpr(other.derived(), std::not_equal_to<Scalar>()); + } + + // comparisons and tests for Scalars + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<std::less<Scalar>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > + operator<(Scalar threshold) const { + return operator<(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<std::less_equal<Scalar>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > + operator<=(Scalar threshold) const { + return operator<=(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<std::greater<Scalar>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > + operator>(Scalar threshold) const { + return operator>(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<std::greater_equal<Scalar>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > + operator>=(Scalar threshold) const { + return operator>=(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<std::equal_to<Scalar>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > + operator==(Scalar threshold) const { + return operator==(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<std::not_equal_to<Scalar>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > + operator!=(Scalar threshold) const { + return operator!=(constant(threshold)); + } + + // Coefficient-wise ternary operators. 
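+  // Illustrative sketch (hypothetical tensor x): select() chooses between two
+  // expressions coefficient-wise based on a boolean predicate, e.g. a simple
+  // ReLU-style clamp built from the comparison operators above:
+  //
+  //   Eigen::Tensor<float, 2> x(3, 4);
+  //   x.setRandom();
+  //   Eigen::Tensor<float, 2> relu = (x > 0.0f).select(x, x.constant(0.0f));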
+ template<typename ThenDerived, typename ElseDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSelectOp<const Derived, const ThenDerived, const ElseDerived> + select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) const { + return TensorSelectOp<const Derived, const ThenDerived, const ElseDerived>(derived(), thenTensor.derived(), elseTensor.derived()); + } + + // Contractions. + typedef Eigen::IndexPair<Index> DimensionPair; + + template<typename OtherDerived, typename Dimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorContractionOp<const Dimensions, const Derived, const OtherDerived> + contract(const OtherDerived& other, const Dimensions& dims) const { + return TensorContractionOp<const Dimensions, const Derived, const OtherDerived>(derived(), other.derived(), dims); + } + + // Convolutions. + template<typename KernelDerived, typename Dimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorConvolutionOp<const Dimensions, const Derived, const KernelDerived> + convolve(const KernelDerived& kernel, const Dimensions& dims) const { + return TensorConvolutionOp<const Dimensions, const Derived, const KernelDerived>(derived(), kernel.derived(), dims); + } + + // Convolutions by fft. + template<typename KernelDerived, typename Dimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorConvolutionByFFTOp<const Dimensions, const Derived, const KernelDerived> + convolvebyfft(const KernelDerived& kernel, const Dimensions& dims) const { + return TensorConvolutionByFFTOp<const Dimensions, const Derived, const KernelDerived>(derived(), kernel.derived(), dims); + } + + // Reductions. + template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp<internal::SumReducer<CoeffReturnType>, const Dims, const Derived> + sum(const Dims& dims) const { + return TensorReductionOp<internal::SumReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::SumReducer<CoeffReturnType>()); + } + + const TensorReductionOp<internal::SumReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived> + sum() const { + DimensionList<Index, NumDimensions> in_dims; + return TensorReductionOp<internal::SumReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::SumReducer<CoeffReturnType>()); + } + + template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const Dims, const Derived> + mean(const Dims& dims) const { + return TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::MeanReducer<CoeffReturnType>()); + } + + const TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived> + mean() const { + DimensionList<Index, NumDimensions> in_dims; + return TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MeanReducer<CoeffReturnType>()); + } + + template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const Dims, const Derived> + prod(const Dims& dims) const { + return TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::ProdReducer<CoeffReturnType>()); + } + + const TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const DimensionList<Index, 
NumDimensions>, const Derived> + prod() const { + DimensionList<Index, NumDimensions> in_dims; + return TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::ProdReducer<CoeffReturnType>()); + } + + template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const Dims, const Derived> + maximum(const Dims& dims) const { + return TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::MaxReducer<CoeffReturnType>()); + } + + const TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived> + maximum() const { + DimensionList<Index, NumDimensions> in_dims; + return TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MaxReducer<CoeffReturnType>()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorTupleReducerOp< + internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >, + const array<Index, NumDimensions>, const Derived> + argmax() const { + array<Index, NumDimensions> in_dims; + for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d; + return TensorTupleReducerOp< + internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >, + const array<Index, NumDimensions>, + const Derived>(derived(), internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >(), -1, in_dims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorTupleReducerOp< + internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >, + const array<Index, NumDimensions>, const Derived> + argmin() const { + array<Index, NumDimensions> in_dims; + for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d; + return TensorTupleReducerOp< + internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >, + const array<Index, NumDimensions>, + const Derived>(derived(), internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >(), -1, in_dims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorTupleReducerOp< + internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >, + const array<Index, 1>, const Derived> + argmax(const int return_dim) const { + array<Index, 1> in_dims; + in_dims[0] = return_dim; + return TensorTupleReducerOp< + internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >, + const array<Index, 1>, + const Derived>(derived(), internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >(), return_dim, in_dims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorTupleReducerOp< + internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >, + const array<Index, 1>, const Derived> + argmin(const int return_dim) const { + array<Index, 1> in_dims; + in_dims[0] = return_dim; + return TensorTupleReducerOp< + internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >, + const array<Index, 1>, + const Derived>(derived(), internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >(), return_dim, in_dims); + } + + template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp<internal::MinReducer<CoeffReturnType>, const Dims, const Derived> + minimum(const Dims& dims) const { + return TensorReductionOp<internal::MinReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::MinReducer<CoeffReturnType>()); + } + + const TensorReductionOp<internal::MinReducer<CoeffReturnType>, const 
DimensionList<Index, NumDimensions>, const Derived> + minimum() const { + DimensionList<Index, NumDimensions> in_dims; + return TensorReductionOp<internal::MinReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MinReducer<CoeffReturnType>()); + } + + // This does not short-circuit, so is potentially very inefficient. + template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp<internal::AndReducer, const Dims, const TensorConversionOp<bool, const Derived> > + all(const Dims& dims) const { + return cast<bool>().reduce(dims, internal::AndReducer()); + } + + // This does not short-circuit, so is potentially very inefficient. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp<internal::AndReducer, const DimensionList<Index, NumDimensions>, const TensorConversionOp<bool, const Derived> > + all() const { + DimensionList<Index, NumDimensions> in_dims; + return cast<bool>().reduce(in_dims, internal::AndReducer()); + } + + // This does not short-circuit, so is potentially very inefficient. + template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp<internal::OrReducer, const Dims, const TensorConversionOp<bool, const Derived> > + any(const Dims& dims) const { + return cast<bool>().reduce(dims, internal::OrReducer()); + } + + // This does not short-circuit, so is potentially very inefficient. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp<internal::OrReducer, const DimensionList<Index, NumDimensions>, const TensorConversionOp<bool, const Derived> > + any() const { + DimensionList<Index, NumDimensions> in_dims; + return cast<bool>().reduce(in_dims, internal::OrReducer()); + } + + template <typename Reducer, typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp<Reducer, const Dims, const Derived> + reduce(const Dims& dims, const Reducer& reducer) const { + return TensorReductionOp<Reducer, const Dims, const Derived>(derived(), dims, reducer); + } + + template <typename Broadcast> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorBroadcastingOp<const Broadcast, const Derived> + broadcast(const Broadcast& broadcast) const { + return TensorBroadcastingOp<const Broadcast, const Derived>(derived(), broadcast); + } + + template <int FFTDataType, int FFTDirection, typename FFT> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorFFTOp<const FFT, const Derived, FFTDataType, FFTDirection> + fft(const FFT& fft) const { + return TensorFFTOp<const FFT, const Derived, FFTDataType, FFTDirection>(derived(), fft); + } + + template <typename Axis, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorConcatenationOp<Axis, const Derived, const OtherDerived> + concatenate(const OtherDerived& other, Axis axis) const { + return TensorConcatenationOp<Axis, const Derived, const OtherDerived>(derived(), other.derived(), axis); + } + + template <typename PatchDims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPatchOp<const PatchDims, const Derived> + extract_patches(const PatchDims& patch_dims) const { + return TensorPatchOp<const PatchDims, const Derived>(derived(), patch_dims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived> + extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols, + const Index plane_stride = 1, const Index row_stride = 1, const Index col_stride = 1, + const PaddingType padding_type = 
PADDING_SAME, const Scalar padding_value = 0) const { + return TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, 1, 1, 1, padding_type, padding_value); + } + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived> + extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols, + const Index plane_stride, const Index row_stride, const Index col_stride, + const Index plane_inflate_stride, const Index row_inflate_stride, const Index col_inflate_stride, + const Index padding_top_z, const Index padding_bottom_z, + const Index padding_top, const Index padding_bottom, + const Index padding_left, const Index padding_right, const Scalar padding_value = 0) const { + return TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, plane_inflate_stride, row_inflate_stride, col_inflate_stride, padding_top_z, padding_bottom_z, padding_top, padding_bottom, padding_left, padding_right, padding_value); + } + + template <Index Rows, Index Cols> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp<Rows, Cols, const Derived> + extract_image_patches() const { + return TensorImagePatchOp<Rows, Cols, const Derived>(derived(), Rows, Cols, 1, 1, 1, 1, 1, 1, PADDING_SAME, 0); + } + + template <Index Rows, Index Cols> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp<Rows, Cols, const Derived> + extract_image_patches(const PaddingType padding_type) const { + return TensorImagePatchOp<Rows, Cols, const Derived>(derived(), Rows, Cols, 1, 1, 1, 1, 1, 1, padding_type, 0); + } + + template <Index Rows, Index Cols> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp<Rows, Cols, const Derived> + extract_image_patches(const Index stride, const PaddingType padding_type) const { + return TensorImagePatchOp<Rows, Cols, const Derived>(derived(), Rows, Cols, stride, stride, 1, 1, 1, 1, padding_type, 0); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp<Dynamic, Dynamic, const Derived> + extract_image_patches(const Index patch_rows, const Index patch_cols, + const Index row_stride = 1, const Index col_stride = 1) const { + return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride, + 1, 1, 1, 1, PADDING_SAME, 0); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp<Dynamic, Dynamic, const Derived> + extract_image_patches(const Index patch_rows, const Index patch_cols, + const Index row_stride, const Index col_stride, + const PaddingType padding_type) const { + return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride, + 1, 1, 1, 1, padding_type, 0); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp<Dynamic, Dynamic, const Derived> + extract_image_patches(const Index patch_rows, const Index patch_cols, + const Index row_stride, const Index col_stride, + const PaddingType padding_type, const Scalar padding_value) const { + return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride, + 1, 1, 1, 1, padding_type, padding_value); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp<Dynamic, Dynamic, const Derived> + extract_image_patches(const 
Index patch_rows, const Index patch_cols, + const Index row_stride, const Index col_stride, + const Index in_row_stride, const Index in_col_stride) const { + return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride, + in_row_stride, in_col_stride, 1, 1, PADDING_SAME, 0); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp<Dynamic, Dynamic, const Derived> + extract_image_patches(const Index patch_rows, const Index patch_cols, + const Index row_stride, const Index col_stride, + const Index in_row_stride, const Index in_col_stride, + const PaddingType padding_type) const { + return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride, + in_row_stride, in_col_stride, 1, 1, padding_type, 0); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp<Dynamic, Dynamic, const Derived> + extract_image_patches(const Index patch_rows, const Index patch_cols, + const Index row_stride, const Index col_stride, + const Index in_row_stride, const Index in_col_stride, + const PaddingType padding_type, const Scalar padding_value) const { + return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride, + in_row_stride, in_col_stride, 1, 1, padding_type, padding_value); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp<Dynamic, Dynamic, const Derived> + extract_image_patches(const Index patch_rows, const Index patch_cols, + const Index row_stride, const Index col_stride, + const Index in_row_stride, const Index in_col_stride, + const Index row_inflate_stride, const Index col_inflate_stride, + const PaddingType padding_type, const Scalar padding_value) const { + return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride, + in_row_stride, in_col_stride, row_inflate_stride, col_inflate_stride, + padding_type, padding_value); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp<Dynamic, Dynamic, const Derived> + extract_image_patches(const Index patch_rows, const Index patch_cols, + const Index row_stride, const Index col_stride, + const Index in_row_stride, const Index in_col_stride, + const Index row_inflate_stride, const Index col_inflate_stride, + const Index padding_top, const Index padding_bottom, + const Index padding_left,const Index padding_right, + const Scalar padding_value) const { + return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride, + in_row_stride, in_col_stride, row_inflate_stride, col_inflate_stride, + padding_top, padding_bottom, padding_left, padding_right, padding_value); + } + + // Morphing operators. 
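+  // Illustrative sketch (hypothetical shapes): the morphing operators declared
+  // below build lazily evaluated reshaped, sliced, or rank-reduced views of a
+  // tensor expression.
+  //
+  //   Eigen::Tensor<float, 3> t(2, 3, 4);
+  //   t.setRandom();
+  //   Eigen::Tensor<float, 2> flat  = t.reshape(Eigen::DSizes<Eigen::DenseIndex, 2>(6, 4));
+  //   Eigen::Tensor<float, 2> plane = t.chip(1, /*dim=*/2);   // rank reduced by one
+  //   Eigen::array<Eigen::DenseIndex, 3> offsets{{0, 1, 0}};
+  //   Eigen::array<Eigen::DenseIndex, 3> extents{{2, 2, 4}};
+  //   Eigen::Tensor<float, 3> window = t.slice(offsets, extents);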
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorLayoutSwapOp<const Derived> + swap_layout() const { + return TensorLayoutSwapOp<const Derived>(derived()); + } + template <typename NewDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReshapingOp<const NewDimensions, const Derived> + reshape(const NewDimensions& newDimensions) const { + return TensorReshapingOp<const NewDimensions, const Derived>(derived(), newDimensions); + } + template <typename StartIndices, typename Sizes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSlicingOp<const StartIndices, const Sizes, const Derived> + slice(const StartIndices& startIndices, const Sizes& sizes) const { + return TensorSlicingOp<const StartIndices, const Sizes, const Derived>(derived(), startIndices, sizes); + } + template <Index DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp<DimId, const Derived> + chip(const Index offset) const { + return TensorChippingOp<DimId, const Derived>(derived(), offset, DimId); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp<Dynamic, const Derived> + chip(const Index offset, const Index dim) const { + return TensorChippingOp<Dynamic, const Derived>(derived(), offset, dim); + } + template <typename ReverseDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReverseOp<const ReverseDimensions, const Derived> + reverse(const ReverseDimensions& rev) const { + return TensorReverseOp<const ReverseDimensions, const Derived>(derived(), rev); + } + template <typename PaddingDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPaddingOp<const PaddingDimensions, const Derived> + pad(const PaddingDimensions& padding) const { + return TensorPaddingOp<const PaddingDimensions, const Derived>(derived(), padding, Scalar(0)); + } + template <typename PaddingDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPaddingOp<const PaddingDimensions, const Derived> + pad (const PaddingDimensions& padding, const Scalar padding_value) const { + return TensorPaddingOp<const PaddingDimensions, const Derived>(derived(), padding, padding_value); + } + template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorShufflingOp<const Shuffle, const Derived> + shuffle(const Shuffle& shuffle) const { + return TensorShufflingOp<const Shuffle, const Derived>(derived(), shuffle); + } + template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorStridingOp<const Strides, const Derived> + stride(const Strides& strides) const { + return TensorStridingOp<const Strides, const Derived>(derived(), strides); + } + template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorInflationOp<const Strides, const Derived> + inflate(const Strides& strides) const { + return TensorInflationOp<const Strides, const Derived>(derived(), strides); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorTrueIndicesOp<const Derived> + true_indices(const Index& not_true_value = -1) const { + return TensorTrueIndicesOp<const Derived>(derived(), not_true_value); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorIndexTupleOp<const Derived> + index_tuples() const { + return TensorIndexTupleOp<const Derived>(derived()); + } + template <typename CustomUnaryFunc> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCustomUnaryOp<const CustomUnaryFunc, const Derived> customOp(const CustomUnaryFunc& op) const { + return TensorCustomUnaryOp<const CustomUnaryFunc, const Derived>(derived(), op); + } + template <typename 
OtherDerived, typename CustomBinaryFunc> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCustomBinaryOp<const CustomBinaryFunc, const Derived, const OtherDerived> customOp(const OtherDerived& other, const CustomBinaryFunc& op) const { + return TensorCustomBinaryOp<const CustomBinaryFunc, const Derived, const OtherDerived>(derived(), other, op); + } + + // Force the evaluation of the expression. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorForcedEvalOp<const Derived> eval() const { + return TensorForcedEvalOp<const Derived>(derived()); + } + + protected: + template <typename Scalar, std::size_t NumIndices, int Options, typename IndexType> friend class Tensor; + template <typename Scalar, int Option, typename IndexTypes> friend class TensorVarDim; + template <typename Scalar, typename Dimensions, int Option, typename IndexTypes> friend class TensorFixedSize; + template <typename OtherDerived, int AccessLevel> friend class TensorBase; + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); } +}; + +template<typename Derived> +class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyAccessors> { + public: + typedef internal::traits<Derived> DerivedTraits; + typedef typename DerivedTraits::Scalar Scalar; + typedef typename DerivedTraits::Index Index; + typedef Scalar CoeffReturnType; + typedef typename internal::packet_traits<Scalar>::type PacketReturnType; + static const int NumDimensions = DerivedTraits::NumDimensions; + + template <typename Scalar, std::size_t NumIndices, int Options, typename IndexType> friend class Tensor; + template <typename Scalar, int Options, typename IndexType> friend class TensorVarDim; + template <typename OtherDerived, int AccessLevel> friend class TensorBase; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setZero() { + return setConstant(Scalar(0)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setConstant(const Scalar& val) { + return derived() = this->constant(val); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setRandom() { + return derived() = this->random(); + } + template <typename RandomGenerator> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setRandom() { + return derived() = this->template random<RandomGenerator>(); + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setValues( + const typename internal::Initializer<Derived, NumDimensions>::InitList& vals) { + TensorEvaluator<Derived, DefaultDevice> eval(derived(), DefaultDevice()); + internal::initialize_tensor<Derived, NumDimensions>(eval, vals); + return derived(); + } +#endif // EIGEN_HAS_VARIADIC_TEMPLATES + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator+=(const OtherDerived& other) { + return derived() = derived() + other.derived(); + } + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator-=(const OtherDerived& other) { + return derived() = derived() - other.derived(); + } + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator*=(const OtherDerived& other) { + return derived() = derived() * other.derived(); + } + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator/=(const OtherDerived& other) { + return derived() = derived() / other.derived(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorLayoutSwapOp<const Derived> + swap_layout() const { + return 
TensorLayoutSwapOp<const Derived>(derived()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorLayoutSwapOp<Derived> + swap_layout() { + return TensorLayoutSwapOp<Derived>(derived()); + } + + template <typename Axis, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorConcatenationOp<const Axis, const Derived, const OtherDerived> + concatenate(const OtherDerived& other, const Axis& axis) const { + return TensorConcatenationOp<const Axis, const Derived, const OtherDerived>(derived(), other, axis); + } + template <typename Axis, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorConcatenationOp<const Axis, Derived, OtherDerived> + concatenate(const OtherDerived& other, const Axis& axis) { + return TensorConcatenationOp<const Axis, Derived, OtherDerived>(derived(), other, axis); + } + + template <typename NewDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReshapingOp<const NewDimensions, const Derived> + reshape(const NewDimensions& newDimensions) const { + return TensorReshapingOp<const NewDimensions, const Derived>(derived(), newDimensions); + } + template <typename NewDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReshapingOp<const NewDimensions, Derived> + reshape(const NewDimensions& newDimensions) { + return TensorReshapingOp<const NewDimensions, Derived>(derived(), newDimensions); + } + + template <typename StartIndices, typename Sizes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSlicingOp<const StartIndices, const Sizes, const Derived> + slice(const StartIndices& startIndices, const Sizes& sizes) const { + return TensorSlicingOp<const StartIndices, const Sizes, const Derived>(derived(), startIndices, sizes); + } + template <typename StartIndices, typename Sizes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorSlicingOp<const StartIndices, const Sizes, Derived> + slice(const StartIndices& startIndices, const Sizes& sizes) { + return TensorSlicingOp<const StartIndices, const Sizes, Derived>(derived(), startIndices, sizes); + } + + template <DenseIndex DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp<DimId, const Derived> + chip(const Index offset) const { + return TensorChippingOp<DimId, const Derived>(derived(), offset, DimId); + } + template <Index DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorChippingOp<DimId, Derived> + chip(const Index offset) { + return TensorChippingOp<DimId, Derived>(derived(), offset, DimId); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp<Dynamic, const Derived> + chip(const Index offset, const Index dim) const { + return TensorChippingOp<Dynamic, const Derived>(derived(), offset, dim); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorChippingOp<Dynamic, Derived> + chip(const Index offset, const Index dim) { + return TensorChippingOp<Dynamic, Derived>(derived(), offset, dim); + } + + template <typename ReverseDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReverseOp<const ReverseDimensions, const Derived> + reverse(const ReverseDimensions& rev) const { + return TensorReverseOp<const ReverseDimensions, const Derived>(derived(), rev); + } + template <typename ReverseDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReverseOp<const ReverseDimensions, Derived> + reverse(const ReverseDimensions& rev) { + return TensorReverseOp<const ReverseDimensions, Derived>(derived(), rev); + } + + template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorShufflingOp<const Shuffle, const Derived> + 
shuffle(const Shuffle& shuffle) const { + return TensorShufflingOp<const Shuffle, const Derived>(derived(), shuffle); + } + template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorShufflingOp<const Shuffle, Derived> + shuffle(const Shuffle& shuffle) { + return TensorShufflingOp<const Shuffle, Derived>(derived(), shuffle); + } + + template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorStridingOp<const Strides, const Derived> + stride(const Strides& strides) const { + return TensorStridingOp<const Strides, const Derived>(derived(), strides); + } + template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorStridingOp<const Strides, Derived> + stride(const Strides& strides) { + return TensorStridingOp<const Strides, Derived>(derived(), strides); + } + + // Select the device on which to evaluate the expression. + template <typename DeviceType> + TensorDevice<Derived, DeviceType> device(const DeviceType& device) { + return TensorDevice<Derived, DeviceType>(device, derived()); + } + + protected: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& derived() { return *static_cast<Derived*>(this); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_BASE_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h new file mode 100644 index 0000000000..ac428b169f --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -0,0 +1,627 @@ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H +#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H + +namespace Eigen { + +/** \class TensorBlock + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor block class. + * + * This class represents a tensor block specified by the index of the + * first block coefficient, and the size of the block in each dimension. + * + */ + +namespace internal { + +template <typename Index, typename Scalar, std::size_t NumDims, int Layout> +class TensorBlock { + public: + typedef DSizes<Index, NumDims> Dimensions; + + TensorBlock(const Index first_coeff_index, + const Dimensions& block_sizes, + const Dimensions& block_strides, + const Dimensions& tensor_strides, + Scalar* data) + : m_first_coeff_index(first_coeff_index), + m_block_sizes(block_sizes), + m_block_strides(block_strides), + m_tensor_strides(tensor_strides), + m_data(data) {} + + Index first_coeff_index() const { return m_first_coeff_index; } + + const Dimensions& block_sizes() const { return m_block_sizes; } + + const Dimensions& block_strides() const { return m_block_strides; } + + const Dimensions& tensor_strides() const { return m_tensor_strides; } + + Scalar* data() { return m_data; } + + const Scalar* data() const { return m_data; } + + private: + Index m_first_coeff_index; + Dimensions m_block_sizes; + Dimensions m_block_strides; + Dimensions m_tensor_strides; + Scalar* m_data; // Not owned. 
+}; + +template <typename Index, typename Scalar, bool Vectorizable> +struct TensorBlockCopyOp { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const Index num_coeff_to_copy, const Index dst_index, + const Index dst_stride, Scalar* EIGEN_RESTRICT dst_data, const Index src_index, + const Index src_stride, const Scalar* EIGEN_RESTRICT src_data) { + for (Index i = 0; i < num_coeff_to_copy; ++i) { + dst_data[dst_index + i * dst_stride] = + src_data[src_index + i * src_stride]; + } + } +}; + +// NOTE: Benchmarks run on an implementation of this that broke each of the +// loops in these conditionals into it's own template specialization (to +// avoid conditionals in the caller's loop) did not show an improvement. +template <typename Index, typename Scalar> +struct TensorBlockCopyOp<Index, Scalar, true> { + typedef typename packet_traits<Scalar>::type Packet; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const Index num_coeff_to_copy, const Index dst_index, + const Index dst_stride, Scalar* EIGEN_RESTRICT dst_data, + const Index src_index, const Index src_stride, + const Scalar* EIGEN_RESTRICT src_data) { + if (src_stride == 1) { + const Index packet_size = internal::unpacket_traits<Packet>::size; + const Index vectorized_size = + (num_coeff_to_copy / packet_size) * packet_size; + if (dst_stride == 1) { + // LINEAR + for (Index i = 0; i < vectorized_size; i += packet_size) { + Packet p = internal::ploadt<Packet, Unaligned>( + src_data + src_index + i); + internal::pstoret<Scalar, Packet, Unaligned>( + dst_data + dst_index + i, p); + } + for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) { + dst_data[dst_index + i] = src_data[src_index + i]; + } + } else { + // SCATTER + for (Index i = 0; i < vectorized_size; i += packet_size) { + Packet p = internal::ploadt<Packet, Unaligned>( + src_data + src_index + i); + internal::pscatter<Scalar, Packet>( + dst_data + dst_index + i * dst_stride, p, dst_stride); + } + for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) { + dst_data[dst_index + i * dst_stride] = src_data[src_index + i]; + } + } + } else { + if (dst_stride == 1) { + // GATHER + const Index packet_size = internal::unpacket_traits<Packet>::size; + const Index vectorized_size = + (num_coeff_to_copy / packet_size) * packet_size; + for (Index i = 0; i < vectorized_size; i += packet_size) { + Packet p = internal::pgather<Scalar, Packet>( + src_data + src_index + i * src_stride, src_stride); + internal::pstoret<Scalar, Packet, Unaligned>( + dst_data + dst_index + i, p); + } + for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) { + dst_data[dst_index + i] = src_data[src_index + i * src_stride]; + } + } else { + // RANDOM + for (Index i = 0; i < num_coeff_to_copy; ++i) { + dst_data[dst_index + i * dst_stride] = + src_data[src_index + i * src_stride]; + } + } + } + } +}; + +/** \class TensorBlockIO + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor block IO class. + * + * This class is responsible for copying data between a tensor and a tensor + * block. 
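+ *
+ * The BlockRead template parameter selects the copy direction:
+ * TensorBlockReader instantiates this class with BlockRead = true
+ * (tensor -> block) and TensorBlockWriter with BlockRead = false
+ * (block -> tensor).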
+ * + */ +template <typename Index, typename Scalar, std::size_t NumDims, int Layout, + bool Vectorizable, bool BlockRead> +class TensorBlockIO { + public: + typedef typename internal::TensorBlock<Index, Scalar, NumDims, Layout> + TensorBlock; + typedef typename internal::TensorBlockCopyOp<Index, Scalar, Vectorizable> + TensorBlockCopyOp; + + protected: + struct BlockIteratorState { + Index input_stride; + Index output_stride; + Index input_span; + Index output_span; + Index size; + Index count; + }; + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Copy( + const TensorBlock& block, Index first_coeff_index, + const array<Index, NumDims>& tensor_to_block_dim_map, + const array<Index, NumDims>& tensor_strides, const Scalar* src_data, + Scalar* dst_data) { + // Calculate strides and dimensions. + const Index block_dim_for_tensor_stride1_dim = + NumDims == 0 ? 1 : + tensor_to_block_dim_map[static_cast<int>(Layout) == + static_cast<int>(ColMajor) + ? 0 + : NumDims - 1]; + const size_t block_inner_dim_size = + NumDims == 0 ? 1 : + block.block_sizes()[block_dim_for_tensor_stride1_dim]; + const size_t block_outer_dim_size = + NumDims == 0 ? 1 : + block.block_sizes().TotalSize() / block_inner_dim_size; + + Index inputIndex; + Index outputIndex; + Index input_stride; + Index output_stride; + + // Setup strides to read/write along the tensor's stride1 dimension. + if (BlockRead) { + inputIndex = first_coeff_index; + outputIndex = 0; + input_stride = 1; + output_stride = NumDims == 0 ? 1 + : block.block_strides()[block_dim_for_tensor_stride1_dim]; + } else { + inputIndex = 0; + outputIndex = first_coeff_index; + input_stride = NumDims == 0 ? 1 + : block.block_strides()[block_dim_for_tensor_stride1_dim]; + output_stride = 1; + } + + const std::size_t at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1; + array<BlockIteratorState, at_least_1_dim> block_iter_state; + + // Initialize block iterator state. + for (int i = 0; i < static_cast<int>(NumDims) - 1; ++i) { + const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? i + 1 + : NumDims - i - 2; + block_iter_state[i].size = + block.block_sizes()[tensor_to_block_dim_map[dim]]; + if (BlockRead) { + block_iter_state[i].input_stride = tensor_strides[dim]; + block_iter_state[i].output_stride = + block.block_strides()[tensor_to_block_dim_map[dim]]; + } else { + block_iter_state[i].input_stride = + block.block_strides()[tensor_to_block_dim_map[dim]]; + block_iter_state[i].output_stride = tensor_strides[dim]; + } + block_iter_state[i].input_span = + block_iter_state[i].input_stride * (block_iter_state[i].size - 1); + block_iter_state[i].output_span = + block_iter_state[i].output_stride * (block_iter_state[i].size - 1); + block_iter_state[i].count = 0; + } + + // Iterate copying data from src to dst. + for (Index i = 0; i < block_outer_dim_size; ++i) { + TensorBlockCopyOp::Run(block_inner_dim_size, outputIndex, output_stride, + dst_data, inputIndex, input_stride, src_data); + // Update index. + for (int i = 0; i < static_cast<int>(NumDims) - 1; ++i) { + if (++block_iter_state[i].count < block_iter_state[i].size) { + inputIndex += block_iter_state[i].input_stride; + outputIndex += block_iter_state[i].output_stride; + break; + } + block_iter_state[i].count = 0; + inputIndex -= block_iter_state[i].input_span; + outputIndex -= block_iter_state[i].output_span; + } + } + } +}; + +/** \class TensorBlockReader + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor block reader class. + * + * This class is responsible for reading a tensor block. 
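+ *
+ * Typical flow (as used with the TensorBlockMapper defined below): the mapper
+ * enumerates blocks, GetBlockForIndex() describes one block over a scratch
+ * buffer, and TensorBlockReader::Run() then copies the corresponding
+ * coefficients from the source tensor into that buffer.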
+ * + */ + +template <typename Index, typename Scalar, std::size_t NumDims, int Layout, + bool Vectorizable> +class TensorBlockReader : public TensorBlockIO<Index, Scalar, NumDims, + Layout, Vectorizable, true> { + public: + typedef typename internal::TensorBlock<Index, Scalar, NumDims, Layout> + TensorBlock; + typedef TensorBlockIO<Index, Scalar, NumDims, Layout, Vectorizable, true> + Base; + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + TensorBlock* block, const Scalar* src_data) { + array<Index, NumDims> tensor_to_block_dim_map; + for (int i = 0; i < NumDims; ++i) { + tensor_to_block_dim_map[i] = i; + } + Base::Copy(*block, block->first_coeff_index(), tensor_to_block_dim_map, + block->tensor_strides(), src_data, block->data()); + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + TensorBlock* block, Index first_coeff_index, + const array<Index, NumDims>& tensor_to_block_dim_map, + const array<Index, NumDims>& tensor_strides, const Scalar* src_data) { + Base::Copy(*block, first_coeff_index, tensor_to_block_dim_map, + tensor_strides, src_data, block->data()); + } +}; + +/** \class TensorBlockWriter + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor block writer class. + * + * This class is responsible for writing a tensor block. + * + */ + +template <typename Index, typename Scalar, std::size_t NumDims, int Layout, + bool Vectorizable> +class TensorBlockWriter : public TensorBlockIO<Index, Scalar, NumDims, + Layout, Vectorizable, false> { + public: + typedef typename internal::TensorBlock<Index, Scalar, NumDims, Layout> + TensorBlock; + typedef TensorBlockIO<Index, Scalar, NumDims, Layout, Vectorizable, false> + Base; + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const TensorBlock& block, Scalar* dst_data) { + array<Index, NumDims> tensor_to_block_dim_map; + for (int i = 0; i < NumDims; ++i) { + tensor_to_block_dim_map[i] = i; + } + Base::Copy(block, block.first_coeff_index(), tensor_to_block_dim_map, + block.tensor_strides(), block.data(), dst_data); + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const TensorBlock& block, Index first_coeff_index, + const array<Index, NumDims>& tensor_to_block_dim_map, + const array<Index, NumDims>& tensor_strides, Scalar* dst_data) { + Base::Copy(block, first_coeff_index, tensor_to_block_dim_map, + tensor_strides, block.data(), dst_data); + } +}; + +enum TensorBlockShapeType { + kUniformAllDims, + kSkewedInnerDims, +}; + +struct TensorOpResourceRequirements { + TensorBlockShapeType block_shape; + std::size_t block_total_size; + // TODO(andydavis) Add 'target_num_threads' to support communication of + // thread-resource requirements. This will allow ops deep in the + // expression tree (like reductions) to communicate resources + // requirements based on local state (like the total number of reductions + // to be computed). + TensorOpResourceRequirements(internal::TensorBlockShapeType shape, + const std::size_t size) + : block_shape(shape), block_total_size(size) {} +}; + +/** \class TensorBlockMapper + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor block mapper class. + * + * This class is responsible for iterating over the blocks of a tensor. 
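+ *
+ * Block dimensions are chosen so that a block holds at most max_coeff_count
+ * coefficients, either spread evenly over all dimensions (kUniformAllDims)
+ * or skewed towards the innermost dimensions (kSkewedInnerDims).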
+ * + */ + +template <typename Index, typename Scalar, std::size_t NumDims, int Layout> +class TensorBlockMapper { + public: + typedef typename internal::TensorBlock<Index, Scalar, NumDims, Layout> + TensorBlock; + + TensorBlockMapper(const Eigen::DSizes<Index, NumDims>& dims, + const TensorBlockShapeType block_shape, + const size_t max_coeff_count) + : m_dimensions(dims), m_block_dim_sizes(dims), m_total_block_count(1) { + if (m_block_dim_sizes.TotalSize() > max_coeff_count) { + if (block_shape == kUniformAllDims) { + // Tensor will not fit within 'max_coeff_count' budget: calculate tensor + // block dimension sizes based on "square" dimension size target. + const size_t dim_size_target = + std::pow(static_cast<float>(max_coeff_count), + 1.0 / static_cast<float>(m_block_dim_sizes.rank())); + for (size_t i = 0; i < m_block_dim_sizes.rank(); ++i) { + // TODO(andydavis) Adjust the inner most 'm_block_dim_size' to make it + // a multiple of the packet size. Note that reducing 'm_block_dim_size' + // in this manner can increase the number of blocks, and so will + // amplify any per-block overhead. + m_block_dim_sizes[i] = + numext::mini(dim_size_target, static_cast<size_t>(m_dimensions[i])); + } + // Add any un-allocated coefficients to inner dimension(s). + Index total_size = m_block_dim_sizes.TotalSize(); + for (int i = 0; i < NumDims; ++i) { + const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? i : NumDims - i - 1; + if (m_block_dim_sizes[dim] < m_dimensions[dim]) { + const Index total_size_other_dims = total_size / + m_block_dim_sizes[dim]; + const Index alloc_avail = max_coeff_count / total_size_other_dims; + if (alloc_avail == m_block_dim_sizes[dim]) { + // Insufficient excess coefficients to allocate. + break; + } + m_block_dim_sizes[dim] = numext::mini(m_dimensions[dim], alloc_avail); + total_size = total_size_other_dims * m_block_dim_sizes[dim]; + } + } + } else { + eigen_assert(block_shape == kSkewedInnerDims); + Index coeff_to_allocate = max_coeff_count; + for (int i = 0; i < NumDims; ++i) { + const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? i : NumDims - i - 1; + m_block_dim_sizes[dim] = numext::mini(coeff_to_allocate, + m_dimensions[dim]); + coeff_to_allocate /= numext::maxi(static_cast<Index>(1), + m_block_dim_sizes[dim]); + } + } + } + + // Calculate block counts by dimension and total block count. + DSizes<Index, NumDims> block_count; + for (size_t i = 0; i < block_count.rank(); ++i) { + block_count[i] = + (m_dimensions[i] + m_block_dim_sizes[i] - 1) / m_block_dim_sizes[i]; + } + m_total_block_count = array_prod(block_count); + + // Calculate block strides (used for enumerating blocks). 
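+    // Worked example (hypothetical sizes): for a ColMajor tensor with
+    // dimensions {4, 6} and block sizes {2, 3}, block_count is {2, 2}, so the
+    // loop below produces block strides {1, 2} and tensor strides {1, 4};
+    // block_index 3 then decodes to block coordinates {1, 1} in
+    // GetBlockForIndex().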
+ if (NumDims > 0) { + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_block_strides[0] = 1; + m_tensor_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_block_strides[i] = m_block_strides[i - 1] * block_count[i - 1]; + m_tensor_strides[i] = m_tensor_strides[i - 1] * m_dimensions[i - 1]; + } + } else { + m_block_strides[NumDims - 1] = 1; + m_tensor_strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_block_strides[i] = m_block_strides[i + 1] * block_count[i + 1]; + m_tensor_strides[i] = m_tensor_strides[i + 1] * m_dimensions[i + 1]; + } + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + GetBlockForIndex(Index block_index, Scalar* data) const { + Index first_coeff_index = 0; + DSizes<Index, NumDims> coords; + DSizes<Index, NumDims> sizes; + DSizes<Index, NumDims> strides; + if (NumDims > 0) { + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = block_index / m_block_strides[i]; + coords[i] = idx * m_block_dim_sizes[i]; + sizes[i] = + numext::mini((m_dimensions[i] - coords[i]), m_block_dim_sizes[i]); + block_index -= idx * m_block_strides[i]; + first_coeff_index += coords[i] * m_tensor_strides[i]; + } + coords[0] = block_index * m_block_dim_sizes[0]; + sizes[0] = + numext::mini((m_dimensions[0] - coords[0]), m_block_dim_sizes[0]); + first_coeff_index += coords[0] * m_tensor_strides[0]; + + strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + strides[i] = strides[i - 1] * sizes[i - 1]; + } + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = block_index / m_block_strides[i]; + coords[i] = idx * m_block_dim_sizes[i]; + sizes[i] = + numext::mini((m_dimensions[i] - coords[i]), m_block_dim_sizes[i]); + block_index -= idx * m_block_strides[i]; + first_coeff_index += coords[i] * m_tensor_strides[i]; + } + coords[NumDims - 1] = block_index * m_block_dim_sizes[NumDims - 1]; + sizes[NumDims - 1] = + numext::mini((m_dimensions[NumDims - 1] - coords[NumDims - 1]), + m_block_dim_sizes[NumDims - 1]); + first_coeff_index += coords[NumDims - 1] * m_tensor_strides[NumDims - 1]; + + strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * sizes[i + 1]; + } + } + } + + return TensorBlock(first_coeff_index, sizes, strides, m_tensor_strides, + data); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index total_block_count() const { + return m_total_block_count; + } + + private: + DSizes<Index, NumDims> m_dimensions; + DSizes<Index, NumDims> m_block_dim_sizes; + DSizes<Index, NumDims> m_block_strides; + DSizes<Index, NumDims> m_tensor_strides; + Index m_total_block_count; +}; + +/** \class TensorSliceBlockMapper + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor slice block mapper class. + * + * This class is responsible for iterating over the blocks of + * a slice of a tensor. Supports shuffling of the block strides + * for callers that want to reduce strides for dimensions to be + * processed together. 
+ * + */ + +template <typename Index, typename Scalar, std::size_t NumDims, int Layout> +class TensorSliceBlockMapper { + public: + typedef typename internal::TensorBlock<Index, Scalar, NumDims, Layout> + TensorBlock; + typedef DSizes<Index, NumDims> Dimensions; + + TensorSliceBlockMapper(const Dimensions& tensor_dims, + const Dimensions& tensor_slice_offsets, + const Dimensions& tensor_slice_extents, + const Dimensions& block_dim_sizes, + const Dimensions& block_stride_order) + : m_tensor_dimensions(tensor_dims), + m_tensor_slice_offsets(tensor_slice_offsets), + m_tensor_slice_extents(tensor_slice_extents), + m_block_dim_sizes(block_dim_sizes), + m_block_stride_order(block_stride_order), + m_total_block_count(1) { + // Calculate block counts by dimension and total block count. + DSizes<Index, NumDims> block_count; + for (size_t i = 0; i < block_count.rank(); ++i) { + block_count[i] = (m_tensor_slice_extents[i] + m_block_dim_sizes[i] - 1) / + m_block_dim_sizes[i]; + } + m_total_block_count = array_prod(block_count); + + // Calculate block strides (used for enumerating blocks). + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_block_strides[0] = 1; + m_tensor_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_block_strides[i] = m_block_strides[i - 1] * block_count[i - 1]; + m_tensor_strides[i] = m_tensor_strides[i - 1] * + m_tensor_dimensions[i - 1]; + } + } else { + m_block_strides[NumDims - 1] = 1; + m_tensor_strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_block_strides[i] = m_block_strides[i + 1] * block_count[i + 1]; + m_tensor_strides[i] = m_tensor_strides[i + 1] * + m_tensor_dimensions[i + 1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + GetBlockForIndex(Index block_index, Scalar* data) const { + Index first_coeff_index = 0; + DSizes<Index, NumDims> coords; + DSizes<Index, NumDims> sizes; + DSizes<Index, NumDims> strides; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = block_index / m_block_strides[i]; + coords[i] = m_tensor_slice_offsets[i] + idx * m_block_dim_sizes[i]; + sizes[i] = numext::mini(m_tensor_slice_offsets[i] + m_tensor_slice_extents[i] - coords[i], + m_block_dim_sizes[i]); + block_index -= idx * m_block_strides[i]; + first_coeff_index += coords[i] * m_tensor_strides[i]; + } + coords[0] = m_tensor_slice_offsets[0] + + block_index * m_block_dim_sizes[0]; + sizes[0] = numext::mini(m_tensor_slice_offsets[0] + m_tensor_slice_extents[0] - coords[0], + m_block_dim_sizes[0]); + first_coeff_index += coords[0] * m_tensor_strides[0]; + + Index prev_dim = m_block_stride_order[0]; + strides[prev_dim] = 1; + for (int i = 1; i < NumDims; ++i) { + const Index curr_dim = m_block_stride_order[i]; + strides[curr_dim] = strides[prev_dim] * sizes[prev_dim]; + prev_dim = curr_dim; + } + } else { + for (int i = 0; i < static_cast<int>(NumDims) - 1; ++i) { + const Index idx = block_index / m_block_strides[i]; + coords[i] = m_tensor_slice_offsets[i] + idx * m_block_dim_sizes[i]; + sizes[i] = numext::mini(m_tensor_slice_offsets[i] + m_tensor_slice_extents[i] - coords[i], + m_block_dim_sizes[i]); + block_index -= idx * m_block_strides[i]; + first_coeff_index += coords[i] * m_tensor_strides[i]; + } + coords[NumDims - 1] = m_tensor_slice_offsets[NumDims - 1] + + block_index * m_block_dim_sizes[NumDims - 1]; + sizes[NumDims - 1] = numext::mini( + m_tensor_slice_offsets[NumDims - 1] + m_tensor_slice_extents[NumDims - 1] - coords[NumDims - 1], + 
m_block_dim_sizes[NumDims - 1]); + first_coeff_index += coords[NumDims - 1] * m_tensor_strides[NumDims - 1]; + + Index prev_dim = m_block_stride_order[NumDims - 1]; + strides[prev_dim] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + const Index curr_dim = m_block_stride_order[i]; + strides[curr_dim] = strides[prev_dim] * sizes[prev_dim]; + prev_dim = curr_dim; + } + } + + return TensorBlock(first_coeff_index, sizes, strides, m_tensor_strides, + data); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index total_block_count() const { + return m_total_block_count; + } + + private: + Dimensions m_tensor_dimensions; + Dimensions m_tensor_slice_offsets; + Dimensions m_tensor_slice_extents; + Dimensions m_tensor_strides; + Dimensions m_block_dim_sizes; + Dimensions m_block_stride_order; + Dimensions m_block_strides; + Index m_total_block_count; +}; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h new file mode 100644 index 0000000000..7e6d00fad6 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -0,0 +1,352 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H +#define EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H + +namespace Eigen { + +/** \class TensorBroadcasting + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor broadcasting class. 
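+ *
+ * Repeats a tensor expression a fixed number of times along each dimension;
+ * for example (hypothetical sizes), broadcasting an input of dimensions
+ * {3, 1} with a broadcast factor of {1, 4} yields an expression of
+ * dimensions {3, 4}.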
+ * + * + */ +namespace internal { +template<typename Broadcast, typename XprType> +struct traits<TensorBroadcastingOp<Broadcast, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename Broadcast, typename XprType> +struct eval<TensorBroadcastingOp<Broadcast, XprType>, Eigen::Dense> +{ + typedef const TensorBroadcastingOp<Broadcast, XprType>& type; +}; + +template<typename Broadcast, typename XprType> +struct nested<TensorBroadcastingOp<Broadcast, XprType>, 1, typename eval<TensorBroadcastingOp<Broadcast, XprType> >::type> +{ + typedef TensorBroadcastingOp<Broadcast, XprType> type; +}; + +} // end namespace internal + + + +template<typename Broadcast, typename XprType> +class TensorBroadcastingOp : public TensorBase<TensorBroadcastingOp<Broadcast, XprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorBroadcastingOp>::type Nested; + typedef typename Eigen::internal::traits<TensorBroadcastingOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBroadcastingOp(const XprType& expr, const Broadcast& broadcast) + : m_xpr(expr), m_broadcast(broadcast) {} + + EIGEN_DEVICE_FUNC + const Broadcast& broadcast() const { return m_broadcast; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const Broadcast m_broadcast; +}; + + +// Eval as rvalue +template<typename Broadcast, typename ArgType, typename Device> +struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> +{ + typedef TensorBroadcastingOp<Broadcast, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions; + EIGEN_STATIC_ASSERT(NumDims == internal::array_size<Broadcast>::value, "Broadcast cannot change rank") + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + const Broadcast& broadcast = op.broadcast(); + for (int i = 0; i < NumDims; ++i) { + eigen_assert(input_dims[i] > 0); + m_dimensions[i] = input_dims[i] * broadcast[i]; + } + + 
if (NumDims > 0) { + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_inputStrides[0] = 1; + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + } else { + // NumDims is always > 0 here, but use max to avoid compiler warning + m_inputStrides[numext::maxi(0, NumDims-1)] = 1; + m_outputStrides[numext::maxi(0, NumDims-1)] = 1; + for (int i = NumDims-2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; + } + } + } + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const + { + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + return coeffColMajor(index); + } else { + return coeffRowMajor(index); + } + } + + // TODO: attempt to speed this up. The integer divisions and modulo are slow + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const + { + Index inputIndex = 0; + if (NumDims > 0) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq<Broadcast>()(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq<InputDimensions>()(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + if (internal::index_statically_eq<Broadcast>()(0, 1)) { + eigen_assert(index < m_impl.dimensions()[0]); + inputIndex += index; + } else { + if (internal::index_statically_eq<InputDimensions>()(0, 1)) { + eigen_assert(index % m_impl.dimensions()[0] == 0); + } else { + inputIndex += (index % m_impl.dimensions()[0]); + } + } + } + return m_impl.coeff(inputIndex); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const + { + Index inputIndex = 0; + if (NumDims > 0) { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq<Broadcast>()(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq<InputDimensions>()(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + if (internal::index_statically_eq<Broadcast>()(NumDims-1, 1)) { + eigen_assert(index < m_impl.dimensions()[NumDims-1]); + inputIndex += index; + } else { + if (internal::index_statically_eq<InputDimensions>()(NumDims-1, 1)) { + eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0); + } else { + inputIndex += (index % m_impl.dimensions()[NumDims-1]); + } + } + } + return m_impl.coeff(inputIndex); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE 
PacketReturnType packet(Index index) const
+  {
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      return packetColMajor<LoadMode>(index);
+    } else {
+      return packetRowMajor<LoadMode>(index);
+    }
+  }
+
+  // Ignore the LoadMode and always use unaligned loads since we can't guarantee
+  // the alignment at compile time.
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
+  {
+    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+
+    const Index originalIndex = index;
+
+    Index inputIndex = 0;
+    Index innermostLoc = 0;
+    if (NumDims > 0) {
+      for (int i = NumDims - 1; i > 0; --i) {
+        const Index idx = index / m_outputStrides[i];
+        if (internal::index_statically_eq<Broadcast>()(i, 1)) {
+          eigen_assert(idx < m_impl.dimensions()[i]);
+          inputIndex += idx * m_inputStrides[i];
+        } else {
+          if (internal::index_statically_eq<InputDimensions>()(i, 1)) {
+            eigen_assert(idx % m_impl.dimensions()[i] == 0);
+          } else {
+            inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
+          }
+        }
+        index -= idx * m_outputStrides[i];
+      }
+      if (internal::index_statically_eq<Broadcast>()(0, 1)) {
+        eigen_assert(index < m_impl.dimensions()[0]);
+        innermostLoc = index;
+      } else {
+        if (internal::index_statically_eq<InputDimensions>()(0, 1)) {
+          eigen_assert(index % m_impl.dimensions()[0] == 0);
+          innermostLoc = 0;
+        } else {
+          innermostLoc = index % m_impl.dimensions()[0];
+        }
+      }
+      inputIndex += innermostLoc;
+    }
+
+    // TODO: this could be extended to the second dimension if we're not
+    // broadcasting along the first dimension, and so on.
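+    // Fast path below: when the whole packet fits inside the innermost input
+    // dimension a single unaligned packet load is enough; otherwise the
+    // remaining coefficients are gathered one by one via coeffColMajor().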
+    if (innermostLoc + packetSize <= m_impl.dimensions()[0]) {
+      return m_impl.template packet<Unaligned>(inputIndex);
+    } else {
+      EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+      values[0] = m_impl.coeff(inputIndex);
+      for (int i = 1; i < packetSize; ++i) {
+        values[i] = coeffColMajor(originalIndex+i);
+      }
+      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+      return rslt;
+    }
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
+  {
+    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+
+    const Index originalIndex = index;
+
+    Index inputIndex = 0;
+    for (int i = 0; i < NumDims - 1; ++i) {
+      const Index idx = index / m_outputStrides[i];
+      if (internal::index_statically_eq<Broadcast>()(i, 1)) {
+        eigen_assert(idx < m_impl.dimensions()[i]);
+        inputIndex += idx * m_inputStrides[i];
+      } else {
+        if (internal::index_statically_eq<InputDimensions>()(i, 1)) {
+          eigen_assert(idx % m_impl.dimensions()[i] == 0);
+        } else {
+          inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
+        }
+      }
+      index -= idx * m_outputStrides[i];
+    }
+    Index innermostLoc;
+    if (internal::index_statically_eq<Broadcast>()(NumDims-1, 1)) {
+      eigen_assert(index < m_impl.dimensions()[NumDims-1]);
+      innermostLoc = index;
+    } else {
+      if (internal::index_statically_eq<InputDimensions>()(NumDims-1, 1)) {
+        eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0);
+        innermostLoc = 0;
+      } else {
+        innermostLoc = index % m_impl.dimensions()[NumDims-1];
+      }
+    }
+    inputIndex += innermostLoc;
+
+    // TODO: this could be extended to the second dimension if we're not
+    // broadcasting along the first dimension, and so on.
+    if (innermostLoc + packetSize <= m_impl.dimensions()[NumDims-1]) {
+      return m_impl.template packet<Unaligned>(inputIndex);
+    } else {
+      EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+      values[0] = m_impl.coeff(inputIndex);
+      for (int i = 1; i < packetSize; ++i) {
+        values[i] = coeffRowMajor(originalIndex+i);
+      }
+      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+      return rslt;
+    }
+  }
+
+
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+
+ protected:
+  Dimensions m_dimensions;
+  array<Index, NumDims> m_outputStrides;
+  array<Index, NumDims> m_inputStrides;
+  TensorEvaluator<ArgType, Device> m_impl;
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
new file mode 100644
index 0000000000..36c436a613
--- /dev/null
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -0,0 +1,510 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H +#define EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H + +namespace Eigen { + +/** \class TensorKChippingReshaping + * \ingroup CXX11_Tensor_Module + * + * \brief A chip is a thin slice, corresponding to a column or a row in a 2-d tensor. + * + * + */ + +namespace internal { +template<DenseIndex DimId, typename XprType> +struct traits<TensorChippingOp<DimId, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions - 1; + static const int Layout = XprTraits::Layout; +}; + +template<DenseIndex DimId, typename XprType> +struct eval<TensorChippingOp<DimId, XprType>, Eigen::Dense> +{ + typedef const TensorChippingOp<DimId, XprType>& type; +}; + +template<DenseIndex DimId, typename XprType> +struct nested<TensorChippingOp<DimId, XprType>, 1, typename eval<TensorChippingOp<DimId, XprType> >::type> +{ + typedef TensorChippingOp<DimId, XprType> type; +}; + +template <DenseIndex DimId> +struct DimensionId +{ + DimensionId(DenseIndex dim) { + eigen_assert(dim == DimId); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { + return DimId; + } +}; +template <> +struct DimensionId<Dynamic> +{ + DimensionId(DenseIndex dim) : actual_dim(dim) { + eigen_assert(dim >= 0); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { + return actual_dim; + } + private: + const DenseIndex actual_dim; +}; + + +} // end namespace internal + + + +template<DenseIndex DimId, typename XprType> +class TensorChippingOp : public TensorBase<TensorChippingOp<DimId, XprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorChippingOp>::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested<TensorChippingOp>::type Nested; + typedef typename Eigen::internal::traits<TensorChippingOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorChippingOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim) + : m_xpr(expr), m_offset(offset), m_dim(dim) { + } + + EIGEN_DEVICE_FUNC + const Index offset() const { return m_offset; } + EIGEN_DEVICE_FUNC + const Index dim() const { return m_dim.actualDim(); } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorChippingOp& operator = (const TensorChippingOp& other) + { + typedef TensorAssignOp<TensorChippingOp, const TensorChippingOp> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); + return *this; + } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorChippingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp<TensorChippingOp, const OtherDerived> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + const Index m_offset; + const internal::DimensionId<DimId> m_dim; +}; 
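+
+// A minimal usage sketch (the tensor names and sizes are illustrative
+// assumptions only): chipping fixes one index of one dimension and yields an
+// expression of rank reduced by one; chips of an lvalue tensor can also be
+// assigned to, as the lvalue evaluator further down implements.
+//
+//   Eigen::Tensor<float, 3> t(2, 3, 5);
+//   t.setRandom();
+//   Eigen::Tensor<float, 2> slice = t.chip(1, 2);  // 2 x 3 slice at offset 1 of dim 2
+//   t.chip(0, 2) = slice;                          // chips are writable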
+ + +// Eval as rvalue +template<DenseIndex DimId, typename ArgType, typename Device> +struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> +{ + typedef TensorChippingOp<DimId, ArgType> XprType; + static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + static const int NumDims = NumInputDims-1; + typedef typename XprType::Index Index; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const<Scalar>::type ScalarNonConst; + + enum { + // Alignment can't be guaranteed at compile time since it depends on the + // slice offsets. + IsAligned = false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + typedef internal::TensorBlock<Index, ScalarNonConst, NumInputDims, Layout> + InputTensorBlock; + typedef internal::TensorBlock<Index, ScalarNonConst, NumDims, Layout> + OutputTensorBlock; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device) + { + EIGEN_STATIC_ASSERT(NumInputDims >= 1, YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(NumInputDims > m_dim.actualDim()); + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + eigen_assert(op.offset() < input_dims[m_dim.actualDim()]); + + int j = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (i != m_dim.actualDim()) { + m_dimensions[j] = input_dims[i]; + ++j; + } + } + + m_stride = 1; + m_inputStride = 1; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = 0; i < m_dim.actualDim(); ++i) { + m_stride *= input_dims[i]; + m_inputStride *= input_dims[i]; + } + } else { + for (int i = NumInputDims-1; i > m_dim.actualDim(); --i) { + m_stride *= input_dims[i]; + m_inputStride *= input_dims[i]; + } + } + m_inputStride *= input_dims[m_dim.actualDim()]; + m_inputOffset = m_stride * op.offset(); + + if (BlockAccess) { + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_inputStrides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1]; + } + } else { + m_inputStrides[NumInputDims - 1] = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; + } + } + + m_block_total_size_max = numext::maxi(static_cast<std::size_t>(1), + device.lastLevelCacheSize() / + sizeof(Scalar)); + } + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(srcCoeff(index)); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, 
YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && + m_dim.actualDim() == 0) || + (static_cast<int>(Layout) == static_cast<int>(RowMajor) && + m_dim.actualDim() == NumInputDims - 1)) { + // m_stride is equal to 1, so let's avoid the integer division. + eigen_assert(m_stride == 1); + Index inputIndex = index * m_inputStride + m_inputOffset; + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = m_impl.coeff(inputIndex); + inputIndex += m_inputStride; + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && + m_dim.actualDim() == NumInputDims - 1) || + (static_cast<int>(Layout) == static_cast<int>(RowMajor) && + m_dim.actualDim() == 0)) { + // m_stride is aways greater than index, so let's avoid the integer division. + eigen_assert(m_stride > index); + return m_impl.template packet<LoadMode>(index + m_inputOffset); + } else { + const Index idx = index / m_stride; + const Index rem = index - idx * m_stride; + if (rem + packetSize <= m_stride) { + Index inputIndex = idx * m_inputStride + m_inputOffset + rem; + return m_impl.template packet<LoadMode>(inputIndex); + } else { + // Cross the stride boundary. Fallback to slow path. + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index); + ++index; + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector<internal::TensorOpResourceRequirements>* resources) const { + resources->push_back(internal::TensorOpResourceRequirements( + internal::kSkewedInnerDims, m_block_total_size_max)); + m_impl.getResourceRequirements(resources); + } + + // TODO(andydavis) Reduce the overhead of this function (experiment with + // using a fixed block size). + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + OutputTensorBlock* output_block) const { + // Calculate input block sizes. + const DSizes<Index, NumDims>& output_block_sizes = + output_block->block_sizes(); + const DSizes<Index, NumDims>& output_block_strides = + output_block->block_strides(); + const Index chip_dim = m_dim.actualDim(); + DSizes<Index, NumInputDims> input_block_sizes; + DSizes<Index, NumInputDims> input_block_strides; + for (Index i = 0; i < NumInputDims; ++i) { + if (i < chip_dim) { + input_block_sizes[i] = output_block_sizes[i]; + input_block_strides[i] = output_block_strides[i]; + } else if (i > chip_dim) { + input_block_sizes[i] = output_block_sizes[i - 1]; + input_block_strides[i] = output_block_strides[i - 1]; + } else { + input_block_sizes[i] = 1; + } + } + // Fix up input_block_stride for chip dimension. + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + if (chip_dim == 0) { + input_block_strides[chip_dim] = 1; + } else { + input_block_strides[chip_dim] = input_block_strides[chip_dim - 1] * + input_block_sizes[chip_dim - 1]; + } + } else { + if (chip_dim == NumInputDims - 1) { + input_block_strides[chip_dim] = 1; + } else { + input_block_strides[chip_dim] = input_block_strides[chip_dim + 1] * + input_block_sizes[chip_dim + 1]; + } + } + // Instantiate and read input block from input tensor. 
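+    // Because the chipped dimension is materialized as a size-1 dimension in
+    // the input block, the values read from the input tensor land in the
+    // output block's buffer in exactly the order the output expects.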
+ InputTensorBlock input_block(srcCoeff(output_block->first_coeff_index()), + input_block_sizes, + input_block_strides, + m_inputStrides, + output_block->data()); + m_impl.block(&input_block); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { + CoeffReturnType* result = const_cast<CoeffReturnType*>(m_impl.data()); + if (((static_cast<int>(Layout) == static_cast<int>(ColMajor) && + m_dim.actualDim() == NumDims) || + (static_cast<int>(Layout) == static_cast<int>(RowMajor) && + m_dim.actualDim() == 0)) && + result) { + return result + m_inputOffset; + } else { + return NULL; + } + } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const + { + Index inputIndex; + if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && + m_dim.actualDim() == 0) || + (static_cast<int>(Layout) == static_cast<int>(RowMajor) && + m_dim.actualDim() == NumInputDims - 1)) { + // m_stride is equal to 1, so let's avoid the integer division. + eigen_assert(m_stride == 1); + inputIndex = index * m_inputStride + m_inputOffset; + } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && + m_dim.actualDim() == NumInputDims - 1) || + (static_cast<int>(Layout) == static_cast<int>(RowMajor) && + m_dim.actualDim() == 0)) { + // m_stride is aways greater than index, so let's avoid the integer division. + eigen_assert(m_stride > index); + inputIndex = index + m_inputOffset; + } else { + const Index idx = index / m_stride; + inputIndex = idx * m_inputStride + m_inputOffset; + index -= idx * m_stride; + inputIndex += index; + } + return inputIndex; + } + + Dimensions m_dimensions; + Index m_stride; + Index m_inputOffset; + Index m_inputStride; + DSizes<Index, NumInputDims> m_inputStrides; + TensorEvaluator<ArgType, Device> m_impl; + const internal::DimensionId<DimId> m_dim; + const Device& m_device; + std::size_t m_block_total_size_max; +}; + + +// Eval as lvalue +template<DenseIndex DimId, typename ArgType, typename Device> +struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device> + : public TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> +{ + typedef TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> Base; + typedef TensorChippingOp<DimId, ArgType> XprType; + static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + static const int NumDims = NumInputDims-1; + typedef typename XprType::Index Index; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef typename internal::remove_const<Scalar>::type ScalarNonConst; + typedef internal::TensorBlock<Index, ScalarNonConst, NumInputDims, Layout> + InputTensorBlock; + typedef internal::TensorBlock<Index, ScalarNonConst, NumDims, Layout> + OutputTensorBlock; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, 
const PacketReturnType& x) + { + static const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + + if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && + this->m_dim.actualDim() == 0) || + (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && + this->m_dim.actualDim() == NumInputDims - 1)) { + // m_stride is equal to 1, so let's avoid the integer division. + eigen_assert(this->m_stride == 1); + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + internal::pstore<CoeffReturnType, PacketReturnType>(values, x); + Index inputIndex = index * this->m_inputStride + this->m_inputOffset; + for (int i = 0; i < packetSize; ++i) { + this->m_impl.coeffRef(inputIndex) = values[i]; + inputIndex += this->m_inputStride; + } + } else if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && + this->m_dim.actualDim() == NumInputDims - 1) || + (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && + this->m_dim.actualDim() == 0)) { + // m_stride is aways greater than index, so let's avoid the integer division. + eigen_assert(this->m_stride > index); + this->m_impl.template writePacket<StoreMode>(index + this->m_inputOffset, x); + } else { + const Index idx = index / this->m_stride; + const Index rem = index - idx * this->m_stride; + if (rem + packetSize <= this->m_stride) { + const Index inputIndex = idx * this->m_inputStride + this->m_inputOffset + rem; + this->m_impl.template writePacket<StoreMode>(inputIndex, x); + } else { + // Cross stride boundary. Fallback to slow path. + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + internal::pstore<CoeffReturnType, PacketReturnType>(values, x); + for (int i = 0; i < packetSize; ++i) { + this->coeffRef(index) = values[i]; + ++index; + } + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const OutputTensorBlock& output_block) { + // Calculate input block sizes. + const DSizes<Index, NumDims>& output_block_sizes = + output_block.block_sizes(); + const DSizes<Index, NumDims>& output_block_strides = + output_block.block_strides(); + const Index chip_dim = this->m_dim.actualDim(); + DSizes<Index, NumInputDims> input_block_sizes; + DSizes<Index, NumInputDims> input_block_strides; + for (Index i = 0; i < NumInputDims; ++i) { + if (i < chip_dim) { + input_block_sizes[i] = output_block_sizes[i]; + input_block_strides[i] = output_block_strides[i]; + } else if (i > chip_dim) { + input_block_sizes[i] = output_block_sizes[i - 1]; + input_block_strides[i] = output_block_strides[i - 1]; + } else { + input_block_sizes[i] = 1; + } + } + // Fix up input_block_stride for chip dimension. + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + if (chip_dim == 0) { + input_block_strides[chip_dim] = 1; + } else { + input_block_strides[chip_dim] = input_block_strides[chip_dim - 1] * + input_block_sizes[chip_dim - 1]; + } + } else { + if (chip_dim == NumInputDims - 1) { + input_block_strides[chip_dim] = 1; + } else { + input_block_strides[chip_dim] = input_block_strides[chip_dim - 1] * + input_block_sizes[chip_dim - 1]; + } + } + // Write input block. 
+ this->m_impl.writeBlock( + InputTensorBlock(this->srcCoeff(output_block.first_coeff_index()), + input_block_sizes, + input_block_strides, + this->m_inputStrides, + const_cast<ScalarNonConst*>(output_block.data()))); + } + +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h new file mode 100644 index 0000000000..54d9e5f2c8 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -0,0 +1,350 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H + +namespace Eigen { + +/** \class TensorConcatenationOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor concatenation class. + * + * + */ +namespace internal { +template<typename Axis, typename LhsXprType, typename RhsXprType> +struct traits<TensorConcatenationOp<Axis, LhsXprType, RhsXprType> > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename promote_storage_type<typename LhsXprType::Scalar, + typename RhsXprType::Scalar>::ret Scalar; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind, + typename traits<RhsXprType>::StorageKind>::ret StorageKind; + typedef typename promote_index_type<typename traits<LhsXprType>::Index, + typename traits<RhsXprType>::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference<LhsNested>::type _LhsNested; + typedef typename remove_reference<RhsNested>::type _RhsNested; + static const int NumDimensions = traits<LhsXprType>::NumDimensions; + static const int Layout = traits<LhsXprType>::Layout; + enum { Flags = 0 }; +}; + +template<typename Axis, typename LhsXprType, typename RhsXprType> +struct eval<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, Eigen::Dense> +{ + typedef const TensorConcatenationOp<Axis, LhsXprType, RhsXprType>& type; +}; + +template<typename Axis, typename LhsXprType, typename RhsXprType> +struct nested<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, 1, typename eval<TensorConcatenationOp<Axis, LhsXprType, RhsXprType> >::type> +{ + typedef TensorConcatenationOp<Axis, LhsXprType, RhsXprType> type; +}; + +} // end namespace internal + + +template<typename Axis, typename LhsXprType, typename RhsXprType> +class TensorConcatenationOp : public TensorBase<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, WriteAccessors> +{ + public: + typedef typename internal::traits<TensorConcatenationOp>::Scalar Scalar; + typedef typename internal::traits<TensorConcatenationOp>::Packet Packet; + typedef typename internal::traits<TensorConcatenationOp>::StorageKind StorageKind; + typedef typename internal::traits<TensorConcatenationOp>::Index Index; + typedef typename internal::nested<TensorConcatenationOp>::type Nested; + typedef typename internal::promote_storage_type<typename 
LhsXprType::CoeffReturnType, + typename RhsXprType::CoeffReturnType>::ret CoeffReturnType; + typedef typename internal::promote_storage_type<typename LhsXprType::PacketReturnType, + typename RhsXprType::PacketReturnType>::ret PacketReturnType; + typedef typename NumTraits<Scalar>::Real RealScalar; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConcatenationOp(const LhsXprType& lhs, const RhsXprType& rhs, Axis axis) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_axis(axis) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename LhsXprType::Nested>::type& + lhsExpression() const { return m_lhs_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename RhsXprType::Nested>::type& + rhsExpression() const { return m_rhs_xpr; } + + EIGEN_DEVICE_FUNC const Axis& axis() const { return m_axis; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorConcatenationOp& operator = (const TensorConcatenationOp& other) + { + typedef TensorAssignOp<TensorConcatenationOp, const TensorConcatenationOp> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run( + assign, DefaultDevice()); + return *this; + } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorConcatenationOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp<TensorConcatenationOp, const OtherDerived> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run( + assign, DefaultDevice()); + return *this; + } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const Axis m_axis; +}; + + +// Eval as rvalue +template<typename Axis, typename LeftArgType, typename RightArgType, typename Device> +struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device> +{ + typedef TensorConcatenationOp<Axis, LeftArgType, RightArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<LeftArgType, Device>::Dimensions>::value; + static const int RightNumDims = internal::array_size<typename TensorEvaluator<RightArgType, Device>::Dimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & + TensorEvaluator<RightArgType, Device>::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator<LeftArgType, Device>::Layout, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis()) + { + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout) || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(NumDims == RightNumDims, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(0 <= m_axis && m_axis < NumDims); + const Dimensions& lhs_dims = m_leftImpl.dimensions(); + const Dimensions& rhs_dims = m_rightImpl.dimensions(); + int i = 0; + for (; i < m_axis; ++i) { + eigen_assert(lhs_dims[i] > 0); + eigen_assert(lhs_dims[i] == rhs_dims[i]); + m_dimensions[i] = lhs_dims[i]; + } + eigen_assert(lhs_dims[i] > 0); // Now i == m_axis. 
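+    // Along the concatenation axis the output extent is the sum of the two
+    // input extents; every other dimension must match exactly.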
+ eigen_assert(rhs_dims[i] > 0); + m_dimensions[i] = lhs_dims[i] + rhs_dims[i]; + for (++i; i < NumDims; ++i) { + eigen_assert(lhs_dims[i] > 0); + eigen_assert(lhs_dims[i] == rhs_dims[i]); + m_dimensions[i] = lhs_dims[i]; + } + + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_leftStrides[0] = 1; + m_rightStrides[0] = 1; + m_outputStrides[0] = 1; + + for (int i = 1; i < NumDims; ++i) { + m_leftStrides[i] = m_leftStrides[i-1] * lhs_dims[i-1]; + m_rightStrides[i] = m_rightStrides[i-1] * rhs_dims[i-1]; + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + } else { + m_leftStrides[NumDims - 1] = 1; + m_rightStrides[NumDims - 1] = 1; + m_outputStrides[NumDims - 1] = 1; + + for (int i = NumDims - 2; i >= 0; --i) { + m_leftStrides[i] = m_leftStrides[i+1] * lhs_dims[i+1]; + m_rightStrides[i] = m_rightStrides[i+1] * rhs_dims[i+1]; + m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + // TODO(phli): Add short-circuit memcpy evaluation if underlying data are linear? + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) + { + m_leftImpl.evalSubExprsIfNeeded(NULL); + m_rightImpl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() + { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + } + + // TODO(phli): attempt to speed this up. The integer divisions and modulo are slow. + // See CL/76180724 comments for more ideas. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + // Collect dimension-wise indices (subs). + array<Index, NumDims> subs; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + subs[i] = index / m_outputStrides[i]; + index -= subs[i] * m_outputStrides[i]; + } + subs[0] = index; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + subs[i] = index / m_outputStrides[i]; + index -= subs[i] * m_outputStrides[i]; + } + subs[NumDims - 1] = index; + } + + const Dimensions& left_dims = m_leftImpl.dimensions(); + if (subs[m_axis] < left_dims[m_axis]) { + Index left_index; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + left_index = subs[0]; + for (int i = 1; i < NumDims; ++i) { + left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; + } + } else { + left_index = subs[NumDims - 1]; + for (int i = NumDims - 2; i >= 0; --i) { + left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; + } + } + return m_leftImpl.coeff(left_index); + } else { + subs[m_axis] -= left_dims[m_axis]; + const Dimensions& right_dims = m_rightImpl.dimensions(); + Index right_index; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + right_index = subs[0]; + for (int i = 1; i < NumDims; ++i) { + right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; + } + } else { + right_index = subs[NumDims - 1]; + for (int i = NumDims - 2; i >= 0; --i) { + right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; + } + } + return m_rightImpl.coeff(right_index); + } + } + + // TODO(phli): Add a real vectorization. 
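+  // The packet() path below is a scalar fallback: it gathers packetSize
+  // coefficients through coeff() and packs them into a single register, which
+  // is correct but not a true vectorization (hence the TODO above).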
+ template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + static const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); + + EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array<Index, NumDims> m_outputStrides; + array<Index, NumDims> m_leftStrides; + array<Index, NumDims> m_rightStrides; + TensorEvaluator<LeftArgType, Device> m_leftImpl; + TensorEvaluator<RightArgType, Device> m_rightImpl; + const Axis m_axis; +}; + +// Eval as lvalue +template<typename Axis, typename LeftArgType, typename RightArgType, typename Device> + struct TensorEvaluator<TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device> + : public TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device> +{ + typedef TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device> Base; + typedef TensorConcatenationOp<Axis, LeftArgType, RightArgType> XprType; + typedef typename Base::Dimensions Dimensions; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & + TensorEvaluator<RightArgType, Device>::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator<LeftArgType, Device>::Layout, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device) + : Base(op, device) + { + EIGEN_STATIC_ASSERT((static_cast<int>(Layout) == static_cast<int>(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + // Collect dimension-wise indices (subs). 
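+    // Only the ColMajor unrolling is needed here: this lvalue evaluator's
+    // constructor statically asserts that the layout is ColMajor.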
+ array<Index, Base::NumDims> subs; + for (int i = Base::NumDims - 1; i > 0; --i) { + subs[i] = index / this->m_outputStrides[i]; + index -= subs[i] * this->m_outputStrides[i]; + } + subs[0] = index; + + const Dimensions& left_dims = this->m_leftImpl.dimensions(); + if (subs[this->m_axis] < left_dims[this->m_axis]) { + Index left_index = subs[0]; + for (int i = 1; i < Base::NumDims; ++i) { + left_index += (subs[i] % left_dims[i]) * this->m_leftStrides[i]; + } + return this->m_leftImpl.coeffRef(left_index); + } else { + subs[this->m_axis] -= left_dims[this->m_axis]; + const Dimensions& right_dims = this->m_rightImpl.dimensions(); + Index right_index = subs[0]; + for (int i = 1; i < Base::NumDims; ++i) { + right_index += (subs[i] % right_dims[i]) * this->m_rightStrides[i]; + } + return this->m_rightImpl.coeffRef(right_index); + } + } + + template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + static const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + packetSize - 1 < this->dimensions().TotalSize()); + + EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; + internal::pstore<CoeffReturnType, PacketReturnType>(values, x); + for (int i = 0; i < packetSize; ++i) { + coeffRef(index+i) = values[i]; + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h new file mode 100644 index 0000000000..7fb384c65e --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -0,0 +1,635 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Eric Martin <eric@ericmart.in> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H + +namespace Eigen { + +/** \class TensorContraction + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor contraction class. + * + * + */ +namespace internal { +template<typename Dimensions, typename LhsXprType, typename RhsXprType> +struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename scalar_product_traits<typename LhsXprType::Scalar, typename RhsXprType::Scalar>::ReturnType Scalar; + + typedef typename scalar_product_traits<typename traits<LhsXprType>::StorageKind, + typename traits<RhsXprType>::StorageKind>::ReturnType StorageKind; + typedef typename promote_index_type<typename traits<LhsXprType>::Index, + typename traits<RhsXprType>::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference<LhsNested>::type _LhsNested; + typedef typename remove_reference<RhsNested>::type _RhsNested; + + // From NumDims below. 
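+  // A contraction removes every contracted pair of dimensions, so the result
+  // keeps the non-contracted dimensions of the two operands.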
+ static const int NumDimensions = traits<RhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value; + static const int Layout = traits<LhsXprType>::Layout; + + enum { + Flags = 0, + }; +}; + +template<typename Dimensions, typename LhsXprType, typename RhsXprType> +struct eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType>, Eigen::Dense> +{ + typedef const TensorContractionOp<Dimensions, LhsXprType, RhsXprType>& type; +}; + +template<typename Dimensions, typename LhsXprType, typename RhsXprType> +struct nested<TensorContractionOp<Dimensions, LhsXprType, RhsXprType>, 1, typename eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >::type> +{ + typedef TensorContractionOp<Dimensions, LhsXprType, RhsXprType> type; +}; + +template<typename Indices_, typename LeftArgType_, typename RightArgType_, typename Device_> +struct traits<TensorEvaluator<const TensorContractionOp<Indices_, LeftArgType_, RightArgType_>, Device_> > { + typedef Indices_ Indices; + typedef LeftArgType_ LeftArgType; + typedef RightArgType_ RightArgType; + typedef Device_ Device; + + // From NumDims below. + static const int NumDimensions = traits<LeftArgType_>::NumDimensions + traits<RightArgType_>::NumDimensions - 2 * array_size<Indices_>::value; +}; + +} // end namespace internal + +template<typename Indices, typename LhsXprType, typename RhsXprType> +class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXprType, RhsXprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorContractionOp>::Scalar Scalar; + typedef typename internal::scalar_product_traits<typename LhsXprType::CoeffReturnType, + typename RhsXprType::CoeffReturnType>::ReturnType CoeffReturnType; + typedef typename Eigen::internal::nested<TensorContractionOp>::type Nested; + typedef typename Eigen::internal::traits<TensorContractionOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorContractionOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp( + const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims) {} + + EIGEN_DEVICE_FUNC const Indices& indices() const { return m_indices; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename LhsXprType::Nested>::type& + lhsExpression() const { return m_lhs_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename RhsXprType::Nested>::type& + rhsExpression() const { return m_rhs_xpr; } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const Indices m_indices; +}; + + +template<typename Derived> +struct TensorContractionEvaluatorBase +{ + typedef typename internal::traits<Derived>::Indices Indices; + typedef typename internal::traits<Derived>::LeftArgType LeftArgType; + typedef typename internal::traits<Derived>::RightArgType RightArgType; + typedef typename internal::traits<Derived>::Device Device; + + typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType; + typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + + enum { + IsAligned = true, + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + BlockAccess = false, + Layout = TensorEvaluator<LeftArgType, 
Device>::Layout, + CoordAccess = false, // to be implemented + }; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef typename internal::conditional< + static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value; + static const int RDims = + internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value; + static const int ContractDims = internal::array_size<Indices>::value; + static const int NumDims = LDims + RDims - 2 * ContractDims; + + typedef array<Index, LDims> left_dim_mapper_t; + typedef array<Index, RDims> right_dim_mapper_t; + typedef array<Index, ContractDims> contract_t; + typedef array<Index, LDims - ContractDims> left_nocontract_t; + typedef array<Index, RDims - ContractDims> right_nocontract_t; + + typedef DSizes<Index, NumDims> Dimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorContractionEvaluatorBase(const XprType& op, const Device& device) + : m_leftImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), + op.lhsExpression(), op.rhsExpression()), device), + m_rightImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), + op.rhsExpression(), op.lhsExpression()), device), + m_device(device), + m_result(NULL) { + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == + static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + eigen_assert((contract_t::size > 0) && "Must contract on some indices"); + + + DSizes<Index, LDims> eval_left_dims; + DSizes<Index, RDims> eval_right_dims; + array<IndexPair<Index>, ContractDims> eval_op_indices; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + // For ColMajor, we keep using the existing dimensions + for (int i = 0; i < LDims; i++) { + eval_left_dims[i] = m_leftImpl.dimensions()[i]; + } + for (int i = 0; i < RDims; i++) { + eval_right_dims[i] = m_rightImpl.dimensions()[i]; + } + // We keep the pairs of contracting indices. + for (int i = 0; i < ContractDims; i++) { + eval_op_indices[i].first = op.indices()[i].first; + eval_op_indices[i].second = op.indices()[i].second; + } + } else { + // For RowMajor, we need to reverse the existing dimensions + for (int i = 0; i < LDims; i++) { + eval_left_dims[i] = m_leftImpl.dimensions()[LDims - i - 1]; + } + for (int i = 0; i < RDims; i++) { + eval_right_dims[i] = m_rightImpl.dimensions()[RDims - i - 1]; + } + // We need to flip all the pairs of contracting indices as well as + // reversing the dimensions. 
+ for (int i = 0; i < ContractDims; i++) { + eval_op_indices[i].first = LDims - 1 - op.indices()[ContractDims - 1 - i].second; + eval_op_indices[i].second = RDims - 1 - op.indices()[ContractDims - 1 - i].first; + } + } + + array<Index, LDims> lhs_strides; + if (LDims > 0) { + lhs_strides[0] = 1; + for (int i = 0; i < LDims-1; ++i) { + lhs_strides[i+1] = lhs_strides[i] * eval_left_dims[i]; + } + } + + array<Index, RDims> rhs_strides; + if (RDims > 0) { + rhs_strides[0] = 1; + for (int i = 0; i < RDims-1; ++i) { + rhs_strides[i+1] = rhs_strides[i] * eval_right_dims[i]; + } + } + + if (m_i_strides.size() > 0) m_i_strides[0] = 1; + if (m_j_strides.size() > 0) m_j_strides[0] = 1; + if (m_k_strides.size() > 0) m_k_strides[0] = 1; + + m_i_size = 1; + m_j_size = 1; + m_k_size = 1; + + // To compute the dimension, we simply concatenate the non-contracting + // dimensions of the left and then the right tensor. Additionally, I also + // want to compute the cumulative products of the left non-contracting + // dimensions, right non-contracting dimensions, and the contracting + // dimensions (in the order of the contraction) to aid in the later + // computation of tensor indices for matrix indices. + m_lhs_inner_dim_contiguous = true; + int dim_idx = 0; + int nocontract_idx = 0; + + for (int i = 0; i < LDims; i++) { + // find if we are contracting on index i of left tensor + bool contracting = false; + for (int j = 0; j < ContractDims; j++) { + if (eval_op_indices[j].first == i) { + contracting = true; + break; + } + } + if (!contracting) { + // add dimension size to output dimensions + m_dimensions[dim_idx] = eval_left_dims[i]; + m_left_nocontract_strides[nocontract_idx] = lhs_strides[i]; + if (dim_idx != i) { + m_lhs_inner_dim_contiguous = false; + } + if (nocontract_idx+1 < internal::array_size<left_nocontract_t>::value) { + m_i_strides[nocontract_idx+1] = + m_i_strides[nocontract_idx] * eval_left_dims[i]; + } else { + m_i_size = m_i_strides[nocontract_idx] * eval_left_dims[i]; + } + dim_idx++; + nocontract_idx++; + } + } + + nocontract_idx = 0; + for (int i = 0; i < RDims; i++) { + bool contracting = false; + // find if we are contracting on index i of right tensor + for (int j = 0; j < ContractDims; j++) { + if (eval_op_indices[j].second == i) { + contracting = true; + break; + } + } + if (!contracting) { + m_dimensions[dim_idx] = eval_right_dims[i]; + if (nocontract_idx+1 < internal::array_size<right_nocontract_t>::value) { + m_j_strides[nocontract_idx+1] = + m_j_strides[nocontract_idx] * eval_right_dims[i]; + } else { + m_j_size = m_j_strides[nocontract_idx] * eval_right_dims[i]; + } + m_right_nocontract_strides[nocontract_idx] = rhs_strides[i]; + dim_idx++; + nocontract_idx++; + } + } + + // now build contraction cumprod. We assumed above that non-contracting axes + // are represented in the same order in the matrix as they are in the tensor. + // This is not the case for contracting axes. As the contracting axes must be + // of the same size in each tensor, I'll only look at the first tensor here. 
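+    // For every contracting pair, record the stride of that axis in each
+    // operand, accumulate the cumulative product of the contracted sizes
+    // (m_k_strides / m_k_size), and detect whether the rhs contracting axes
+    // stay contiguous and in their original order; the input mappers use
+    // these flags to select faster code paths.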
+ m_rhs_inner_dim_contiguous = true; + m_rhs_inner_dim_reordered = false; + for (int i = 0; i < ContractDims; i++) { + Index left = eval_op_indices[i].first; + Index right = eval_op_indices[i].second; + + Index size = eval_left_dims[left]; + eigen_assert(size == eval_right_dims[right] && + "Contraction axes must be same size"); + + if (i+1 < internal::array_size<contract_t>::value) { + m_k_strides[i+1] = m_k_strides[i] * size; + } else { + m_k_size = m_k_strides[i] * size; + } + m_left_contracting_strides[i] = lhs_strides[left]; + m_right_contracting_strides[i] = rhs_strides[right]; + + if (i > 0 && right < eval_op_indices[i-1].second) { + m_rhs_inner_dim_reordered = true; + } + if (right != i) { + m_rhs_inner_dim_contiguous = false; + } + } + + // If the layout is RowMajor, we need to reverse the m_dimensions + if (static_cast<int>(Layout) == static_cast<int>(RowMajor)) { + for (int i = 0, j = NumDims - 1; i < j; i++, j--) { + numext::swap(m_dimensions[i], m_dimensions[j]); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + m_leftImpl.evalSubExprsIfNeeded(NULL); + m_rightImpl.evalSubExprsIfNeeded(NULL); + if (data) { + evalTo(data); + return false; + } else { + m_result = static_cast<Scalar *>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + evalTo(m_result); + return true; + } + } + + EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const { + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + static_cast<const Derived*>(this)->template evalProduct<true, true, true, Unaligned>(buffer); + } + else { + static_cast<const Derived*>(this)->template evalProduct<true, true, false, Unaligned>(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + static_cast<const Derived*>(this)->template evalProduct<true, false, true, Unaligned>(buffer); + } + else { + static_cast<const Derived*>(this)->template evalProduct<true, false, false, Unaligned>(buffer); + } + } + } + else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + static_cast<const Derived*>(this)->template evalProduct<false, true, true, Unaligned>(buffer); + } + else { + static_cast<const Derived*>(this)->template evalProduct<false, true, false, Unaligned>(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + static_cast<const Derived*>(this)->template evalProduct<false, false, true, Unaligned>(buffer); + } + else { + static_cast<const Derived*>(this)->template evalProduct<false, false, false, Unaligned>(buffer); + } + } + } + } + + template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> + void evalGemv(Scalar* buffer) const { + const Index rows = m_i_size; + const Index cols = m_k_size; + + typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar; + typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar; + typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator; + typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator; + const int lhs_packet_size = PacketType<LhsScalar, Device>::size; + const int rhs_packet_size = PacketType<RhsScalar, Device>::size; + typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, + LeftEvaluator, left_nocontract_t, + contract_t, lhs_packet_size, + 
lhs_inner_dim_contiguous, + false, Unaligned> LhsMapper; + + typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, + RightEvaluator, right_nocontract_t, + contract_t, rhs_packet_size, + rhs_inner_dim_contiguous, + rhs_inner_dim_reordered, Unaligned> RhsMapper; + + LhsMapper lhs(m_leftImpl, m_left_nocontract_strides, m_i_strides, + m_left_contracting_strides, m_k_strides); + RhsMapper rhs(m_rightImpl, m_right_nocontract_strides, m_j_strides, + m_right_contracting_strides, m_k_strides); + + const RhsScalar alpha(1); + const Index resIncr(1); + + // zero out the result buffer (which must be of size at least rows * sizeof(Scalar) + m_device.memset(buffer, 0, rows * sizeof(Scalar)); + + internal::general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,false,RhsScalar,RhsMapper,false>::run( + rows, cols, lhs, rhs, + buffer, resIncr, alpha); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + + if (m_result != NULL) { + m_device.deallocate(m_result); + m_result = NULL; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_result[index]; + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + return internal::ploadt<PacketReturnType, LoadMode>(m_result + index); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return m_result; } + + protected: + // Note: nvcc doesn't like implicit copy constructor. If this is needed anywhere, + // then we'll have to write an explicit copy constructor... + //TensorContractionEvaluatorBase(const TensorContractionEvaluatorBase&); + + TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&); + Dimensions m_dimensions; + + contract_t m_k_strides; + contract_t m_left_contracting_strides; + contract_t m_right_contracting_strides; + + bool m_lhs_inner_dim_contiguous; + bool m_rhs_inner_dim_contiguous; + bool m_rhs_inner_dim_reordered; + + left_nocontract_t m_i_strides; + right_nocontract_t m_j_strides; + left_nocontract_t m_left_nocontract_strides; + right_nocontract_t m_right_nocontract_strides; + + Index m_i_size; + Index m_j_size; + Index m_k_size; + + TensorEvaluator<EvalLeftArgType, Device> m_leftImpl; + TensorEvaluator<EvalRightArgType, Device> m_rightImpl; + const Device& m_device; + Scalar* m_result; +}; + + +// evaluator for default device +template<typename Indices, typename LeftArgType, typename RightArgType, typename Device> +struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> : + public TensorContractionEvaluatorBase< + TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> > { + typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self; + typedef TensorContractionEvaluatorBase<Self> Base; + + typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType; + typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + + enum { + Layout = TensorEvaluator<LeftArgType, Device>::Layout, + }; + + // Most of the code is assuming that both input tensors are ColMajor. 
If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef typename internal::conditional< + static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value; + static const int RDims = + internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value; + static const int ContractDims = internal::array_size<Indices>::value; + + typedef array<Index, LDims> left_dim_mapper_t; + typedef array<Index, RDims> right_dim_mapper_t; + + typedef array<Index, ContractDims> contract_t; + typedef array<Index, LDims - ContractDims> left_nocontract_t; + typedef array<Index, RDims - ContractDims> right_nocontract_t; + + static const int NumDims = LDims + RDims - 2 * ContractDims; + + // Could we use NumDimensions here? + typedef DSizes<Index, NumDims> Dimensions; + + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) { } + + template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> + void evalProduct(Scalar* buffer) const { + if (this->m_j_size == 1) { + this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer); + return; + } + + evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer); + } + + template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> + EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + // define mr, nr, and all of my data mapper types + typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar; + typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar; + typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits; + + const Index nr = Traits::nr; + const Index mr = Traits::mr; + + typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator; + typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator; + + const int lhs_packet_size = internal::packet_traits<LhsScalar>::size; + const int rhs_packet_size = internal::packet_traits<RhsScalar>::size; + + typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, + LeftEvaluator, left_nocontract_t, + contract_t, lhs_packet_size, + lhs_inner_dim_contiguous, + false, Unaligned> LhsMapper; + + typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, + RightEvaluator, right_nocontract_t, + contract_t, rhs_packet_size, + rhs_inner_dim_contiguous, + rhs_inner_dim_reordered, Unaligned> RhsMapper; + + typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper; + + // declare GEBP packing and kernel structs + // 
TODO: packing could be faster sometimes if we supported row major tensor mappers + internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, mr, Traits::LhsProgress, ColMajor> pack_lhs; + internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, nr, ColMajor> pack_rhs; + + // TODO: replace false, false with conjugate values? + internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, mr, nr, false, false> gebp; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + // TODO: refine arguments here (am I row or col major, etc) + typedef typename internal::gemm_blocking_space<ColMajor, LhsScalar, RhsScalar, Dynamic, Dynamic, Dynamic> BlockingType; + + // compute block sizes (which depend on number of threads) + + // last parameter is true to use L3 blocking, 2nd to last parameter is 1 to + // indicate 1 thread + BlockingType blocking(m, n, k, 1, true); + + const Index kc = blocking.kc(); + const Index mc = (std::min<Index>)(m, blocking.mc()); + const Index nc = (std::min<Index>)(n, blocking.nc()); + + // sizes of submatrices to live in cache. see Goto paper. + int sizeA = blocking.mc() * kc; + int sizeB = kc * blocking.nc(); + + // note: m_device.allocate should return 16 byte aligned pointers, but if blockA and blockB + // aren't 16 byte aligned segfaults will happen due to SIMD instructions + LhsScalar* blockA = static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar))); + RhsScalar* blockB = static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar))); + + for(Index i2=0; i2<m; i2+=mc) + { + const Index actual_mc = numext::mini(i2+mc,m)-i2; + for (Index k2 = 0; k2 < k; k2 += kc) { + // make sure we don't overshoot right edge of left matrix, then pack vertical panel + const Index actual_kc = numext::mini(k2 + kc, k) - k2; + pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc, 0, 0); + + // series of horizontal blocks + for (Index j2 = 0; j2 < n; j2 += nc) { + // make sure we don't overshoot right edge of right matrix, then pack block + const Index actual_nc = numext::mini(j2 + nc, n) - j2; + pack_rhs(blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc, 0, 0); + + // call gebp (matrix kernel) + // The parameters here are copied from Eigen's GEMM implementation + gebp(output.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, Scalar(1), -1, -1, 0, 0); + } + } + } + + this->m_device.deallocate(blockA); + this->m_device.deallocate(blockB); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h new file mode 100644 index 0000000000..f05746f298 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -0,0 +1,1387 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Eric Martin <eric@ericmart.in> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H + +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) + +namespace Eigen { + +template<typename Scalar, typename Index, typename LhsMapper, + typename RhsMapper, typename OutputMapper, bool needs_edge_check> +__device__ EIGEN_STRONG_INLINE void +EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, volatile Scalar* lhs_shmem, volatile Scalar* rhs_shmem, + const Index m_size, const Index n_size, const Index k_size) { + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + // declare and initialize 64 registers for output 8x8 block + + // prefetch registers + Scalar lhs_pf0; + Scalar lhs_pf1; + Scalar lhs_pf2; + Scalar lhs_pf3; + Scalar lhs_pf4; + Scalar lhs_pf5; + Scalar lhs_pf6; + Scalar lhs_pf7; + + Scalar rhs_pf0; + Scalar rhs_pf1; + Scalar rhs_pf2; + Scalar rhs_pf3; + Scalar rhs_pf4; + Scalar rhs_pf5; + Scalar rhs_pf6; + Scalar rhs_pf7; + + // shared memory is formatted + // (contract idx in block, nocontract idx in block, block idx) + // where block idx is column major. This transposition limits the number of + // bank conflicts when reading the LHS. The core idea is that since the contracting + // index is shared by both sides, then the contracting index should be in threadIdx.x. + + // On the LHS, we pad each row inside of each block with an extra element. This makes + // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts + // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks. + + // On the RHS we just add 8 padding elements to the end of each block. This gives no bank + // conflicts on writes and also none on reads. 
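+  // A brief illustrative check of the padding claim above (a sketch, assuming
+  // 32 four-byte shared-memory banks and a 4-byte Scalar): an LHS store below
+  // goes to index threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z, so within
+  // one warp (threadIdx.z fixed, x = 0..7, four consecutive values of y) the
+  // bank is (y * 8 + x * 9) % 32 up to the constant threadIdx.z offset, and
+  // those 32 values are all distinct:
+  //
+  //   for (int y = 0; y < 4; ++y)
+  //     for (int x = 0; x < 8; ++x)
+  //       printf("%d ", (y * 8 + x * 9) % 32);  // prints each of 0..31 exactly once
+  //
+  // which is why padding each 8-element row out to 9 removes the LHS write
+  // conflicts.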
+ + // storage indices + const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z; + const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x; + + const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0; + const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1; + const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2; + const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3; + const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4; + const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5; + const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6; + const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7; + + const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0; + const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1; + const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2; + const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3; + const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4; + const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5; + const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6; + const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7; + + // in the loading code, the following variables are important: + // threadIdx.x: the vertical position in an 8x8 block + // threadIdx.y: the vertical index of the 8x8 block in the grid + // threadIdx.z: the horizontal position in an 8x8 block + // k: the horizontal index of the 8x8 block in the grid + // + // The k parameter is implicit (it was the loop counter for a loop that went + // from 0 to <8, but now that loop is unrolled in the below code. + + const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y; + const Index lhs_vert = base_m + load_idx_vert; + +#define prefetchIntoRegisters(base_k) \ + { \ + lhs_pf0 = Scalar(0); \ + lhs_pf1 = Scalar(0); \ + lhs_pf2 = Scalar(0); \ + lhs_pf3 = Scalar(0); \ + lhs_pf4 = Scalar(0); \ + lhs_pf5 = Scalar(0); \ + lhs_pf6 = Scalar(0); \ + lhs_pf7 = Scalar(0); \ + \ + rhs_pf0 = Scalar(0); \ + rhs_pf1 = Scalar(0); \ + rhs_pf2 = Scalar(0); \ + rhs_pf3 = Scalar(0); \ + rhs_pf4 = Scalar(0); \ + rhs_pf5 = Scalar(0); \ + rhs_pf6 = Scalar(0); \ + rhs_pf7 = Scalar(0); \ + \ + if (!needs_edge_check || lhs_vert < m_size) { \ + const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8; \ + const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8; \ + const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8; \ + const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8; \ + const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8; \ + const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8; \ + const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8; \ + const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8; \ + \ + if (!needs_edge_check || lhs_horiz_7 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ + lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \ + } else if (lhs_horiz_6 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ + } else if (lhs_horiz_5 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + 
lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + } else if (lhs_horiz_4 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + } else if (lhs_horiz_3 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + } else if (lhs_horiz_2 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + } else if (lhs_horiz_1 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + } else if (lhs_horiz_0 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + } \ + } \ + \ + const Index rhs_vert = base_k + load_idx_vert; \ + if (!needs_edge_check || rhs_vert < k_size) { \ + const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8; \ + const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8; \ + const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8; \ + const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8; \ + const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8; \ + const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8; \ + const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8; \ + const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8; \ + \ + if (rhs_horiz_7 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ + rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \ + } else if (rhs_horiz_6 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ + } else if (rhs_horiz_5 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + } else if (rhs_horiz_4 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + } else if (rhs_horiz_3 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + } else if (rhs_horiz_2 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + } else if (rhs_horiz_1 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + } else if (rhs_horiz_0 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + } \ + } \ + } \ + +#define writeRegToShmem(_) \ + lhs_shmem[lhs_store_idx_0] = lhs_pf0; \ + 
rhs_shmem[rhs_store_idx_0] = rhs_pf0; \ + \ + lhs_shmem[lhs_store_idx_1] = lhs_pf1; \ + rhs_shmem[rhs_store_idx_1] = rhs_pf1; \ + \ + lhs_shmem[lhs_store_idx_2] = lhs_pf2; \ + rhs_shmem[rhs_store_idx_2] = rhs_pf2; \ + \ + lhs_shmem[lhs_store_idx_3] = lhs_pf3; \ + rhs_shmem[rhs_store_idx_3] = rhs_pf3; \ + \ + lhs_shmem[lhs_store_idx_4] = lhs_pf4; \ + rhs_shmem[rhs_store_idx_4] = rhs_pf4; \ + \ + lhs_shmem[lhs_store_idx_5] = lhs_pf5; \ + rhs_shmem[rhs_store_idx_5] = rhs_pf5; \ + \ + lhs_shmem[lhs_store_idx_6] = lhs_pf6; \ + rhs_shmem[rhs_store_idx_6] = rhs_pf6; \ + \ + lhs_shmem[lhs_store_idx_7] = lhs_pf7; \ + rhs_shmem[rhs_store_idx_7] = rhs_pf7; \ + + // declare and initialize result array +#define res(i, j) _res_##i##j +#define initResultRow(i) \ + Scalar res(i, 0) = Scalar(0); \ + Scalar res(i, 1) = Scalar(0); \ + Scalar res(i, 2) = Scalar(0); \ + Scalar res(i, 3) = Scalar(0); \ + Scalar res(i, 4) = Scalar(0); \ + Scalar res(i, 5) = Scalar(0); \ + Scalar res(i, 6) = Scalar(0); \ + Scalar res(i, 7) = Scalar(0); \ + + initResultRow(0); + initResultRow(1); + initResultRow(2); + initResultRow(3); + initResultRow(4); + initResultRow(5); + initResultRow(6); + initResultRow(7); +#undef initResultRow + + for (Index base_k = 0; base_k < k_size; base_k += 64) { + // wait for previous iteration to finish with shmem. Despite common sense, + // the code is a bit faster with this here then at bottom of loop + __syncthreads(); + + prefetchIntoRegisters(base_k); + writeRegToShmem(); + + #undef prefetchIntoRegisters + #undef writeRegToShmem + + // wait for shared mem packing to be done before starting computation + __syncthreads(); + + // compute 8x8 matrix product by outer product. This involves packing one column + // of LHS and one row of RHS into registers (takes 16 registers). 
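+  // Illustrative sketch of the unrolled computation that follows (using the
+  // res/lcol/rrow register names defined below): computePass(i) loads
+  // lcol(r) = lhs_element(r, i) and rrow(c) = rhs_element(i, c), so the eight
+  // passes are an unrolled form of the rank-1 updates
+  //
+  //   for (int i = 0; i < 8; ++i)      // one computePass per little-k index
+  //     for (int c = 0; c < 8; ++c)    // computeCol(c)
+  //       for (int r = 0; r < 8; ++r)
+  //         res(r, c) += lhs_element(r, i) * rhs_element(i, c);
+  //
+  // with every operand of the per-thread 8x8 accumulator kept in registers.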
+ +#define lcol(i) _lcol##i + Scalar lcol(0); + Scalar lcol(1); + Scalar lcol(2); + Scalar lcol(3); + Scalar lcol(4); + Scalar lcol(5); + Scalar lcol(6); + Scalar lcol(7); + +#define rrow(j) _rrow##j + Scalar rrow(0); + Scalar rrow(1); + Scalar rrow(2); + Scalar rrow(3); + Scalar rrow(4); + Scalar rrow(5); + Scalar rrow(6); + Scalar rrow(7); + + // Now x corresponds to k, y to m, and z to n + const volatile Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y]; + const volatile Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z]; + +#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))] +#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))] + +#define loadData(i, j) \ + lcol(0) = lhs_element(0, j); \ + rrow(0) = rhs_element(i, 0); \ + lcol(1) = lhs_element(1, j); \ + rrow(1) = rhs_element(i, 1); \ + lcol(2) = lhs_element(2, j); \ + rrow(2) = rhs_element(i, 2); \ + lcol(3) = lhs_element(3, j); \ + rrow(3) = rhs_element(i, 3); \ + lcol(4) = lhs_element(4, j); \ + rrow(4) = rhs_element(i, 4); \ + lcol(5) = lhs_element(5, j); \ + rrow(5) = rhs_element(i, 5); \ + lcol(6) = lhs_element(6, j); \ + rrow(6) = rhs_element(i, 6); \ + lcol(7) = lhs_element(7, j); \ + rrow(7) = rhs_element(i, 7); \ + +#define computeCol(j) \ + res(0, j) += lcol(0) * rrow(j); \ + res(1, j) += lcol(1) * rrow(j); \ + res(2, j) += lcol(2) * rrow(j); \ + res(3, j) += lcol(3) * rrow(j); \ + res(4, j) += lcol(4) * rrow(j); \ + res(5, j) += lcol(5) * rrow(j); \ + res(6, j) += lcol(6) * rrow(j); \ + res(7, j) += lcol(7) * rrow(j); \ + +#define computePass(i) \ + loadData(i, i); \ + \ + computeCol(0); \ + computeCol(1); \ + computeCol(2); \ + computeCol(3); \ + computeCol(4); \ + computeCol(5); \ + computeCol(6); \ + computeCol(7); \ + + computePass(0); + computePass(1); + computePass(2); + computePass(3); + computePass(4); + computePass(5); + computePass(6); + computePass(7); + +#undef lcol +#undef rrow +#undef lhs_element +#undef rhs_element +#undef loadData +#undef computeCol +#undef computePass + } // end loop over k + + // we've now iterated over all of the large (ie width 64) k blocks and + // accumulated results in registers. At this point thread (x, y, z) contains + // the sum across all big k blocks of the product of little k block of index (x, y) + // with block of index (y, z). To compute the final output, we need to reduce + // the 8 threads over y by summation. +#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask) + +#define reduceRow(i, mask) \ + shuffleInc(i, 0, mask); \ + shuffleInc(i, 1, mask); \ + shuffleInc(i, 2, mask); \ + shuffleInc(i, 3, mask); \ + shuffleInc(i, 4, mask); \ + shuffleInc(i, 5, mask); \ + shuffleInc(i, 6, mask); \ + shuffleInc(i, 7, mask); \ + +#define reduceMatrix(mask) \ + reduceRow(0, mask); \ + reduceRow(1, mask); \ + reduceRow(2, mask); \ + reduceRow(3, mask); \ + reduceRow(4, mask); \ + reduceRow(5, mask); \ + reduceRow(6, mask); \ + reduceRow(7, mask); \ + + // actually perform the reduction, now each thread of index (_, y, z) + // contains the correct values in its registers that belong in the output + // block + reduceMatrix(1); + reduceMatrix(2); + reduceMatrix(4); + +#undef shuffleInc +#undef reduceRow +#undef reduceMatrix + + // now we need to copy the 64 values into main memory. We can't split work + // among threads because all variables are in registers. 
There's 2 ways + // to do this: + // (1) have 1 thread do 64 writes from registers into global memory + // (2) have 1 thread do 64 writes into shared memory, and then 8 threads + // each do 8 writes into global memory. We can just overwrite the shared + // memory from the problem we just solved. + // (2) is slightly faster than (1) due to less branching and more ILP + + // TODO: won't yield much gain, but could just use currently unused shared mem + // and then we won't have to sync + // wait for shared mem to be out of use + __syncthreads(); + +#define writeResultShmem(i, j) \ + lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); \ + +#define writeRow(i) \ + writeResultShmem(i, 0); \ + writeResultShmem(i, 1); \ + writeResultShmem(i, 2); \ + writeResultShmem(i, 3); \ + writeResultShmem(i, 4); \ + writeResultShmem(i, 5); \ + writeResultShmem(i, 6); \ + writeResultShmem(i, 7); \ + + if (threadIdx.x == 0) { + writeRow(0); + writeRow(1); + writeRow(2); + writeRow(3); + writeRow(4); + writeRow(5); + writeRow(6); + writeRow(7); + } +#undef writeResultShmem +#undef writeRow + + const int max_i_write = (min)((int)((m_size - base_m - threadIdx.y + 7) / 8), 8); + const int max_j_write = (min)((int)((n_size - base_n - threadIdx.z + 7) / 8), 8); + + if (threadIdx.x < max_i_write) { + if (max_j_write == 8) { + // TODO: can i trade bank conflicts for coalesced writes? + Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0]; + Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1]; + Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2]; + Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3]; + Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4]; + Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5]; + Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6]; + Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7]; + + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7; + } else { +#pragma unroll 7 + for (int j = 0; j < max_j_write; j++) { + Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j]; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val; + } + } + } +#undef res +} + + +template<typename Scalar, typename Index, typename LhsMapper, + typename RhsMapper, typename OutputMapper> +__global__ void +__launch_bounds__(512) +EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ volatile Scalar lhs_shmem[72 * 64]; + __shared__ volatile Scalar rhs_shmem[72 * 64]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; 
+ + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size && base_n + 63 < n_size) { + EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } else { + EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } +} + + +template<typename Index, typename LhsMapper, + typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY, + bool CHECK_RHS_BOUNDARY> +__device__ EIGEN_STRONG_INLINE void +EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, float2 lhs_shmem2[][16], + float2 rhs_shmem2[][8], const Index m_size, + const Index n_size, const Index k_size, + const Index base_m, const Index base_n) { + typedef float Scalar; + + // prefetch registers + float4 lhs_pf0, rhs_pf0; + + float4 results[4]; + for (int i = 0; i < 4; i++) { + results[i].x = results[i].y = results[i].z = results[i].w = 0; + } + + +#define prefetch_lhs(reg, row, col) \ + if (!CHECK_LHS_BOUNDARY) { \ + if (col < k_size) { \ + reg =lhs.loadPacket(row, col); \ + } \ + } else { \ + if (col < k_size) { \ + if (row + 3 < m_size) { \ + reg =lhs.loadPacket(row, col); \ + } else if (row + 2 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + reg.z =lhs(row + 2, col); \ + } else if (row + 1 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + } else if (row < m_size) { \ + reg.x =lhs(row + 0, col); \ + } \ + } \ + } \ + + + Index lhs_vert = base_m+threadIdx.x*4; + + for (Index k = 0; k < k_size; k += 16) { + lhs_pf0 = internal::pset1<float4>(0); + rhs_pf0 = internal::pset1<float4>(0); + + Index lhs_horiz = threadIdx.y+k; + prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz) + + Index rhs_vert = k+(threadIdx.x%4)*4; + Index rhs_horiz0 = (threadIdx.x>>2)+threadIdx.y*4+base_n; + + if (!CHECK_RHS_BOUNDARY) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if (rhs_vert + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } else { + if (rhs_horiz0 < n_size) { + if ((rhs_vert + 3) < k_size) { + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + } else if ((rhs_vert + 2) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if ((rhs_vert + 1) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } + } + float x1, x2 ; + // the following can be a bitwise operation..... some day. + if((threadIdx.x%8) < 4) { + x1 = rhs_pf0.y; + x2 = rhs_pf0.w; + } else { + x1 = rhs_pf0.x; + x2 = rhs_pf0.z; + } + x1 = __shfl_xor(x1, 4); + x2 = __shfl_xor(x2, 4); + if((threadIdx.x%8) < 4) { + rhs_pf0.y = x1; + rhs_pf0.w = x2; + } else { + rhs_pf0.x = x1; + rhs_pf0.z = x2; + } + + // We have 64 features. + // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1. 
+ // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3. + // ... + // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63 + // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1 + // ... + rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2][threadIdx.x%8] = make_float2(rhs_pf0.x, rhs_pf0.y); + rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2+32][threadIdx.x%8] = make_float2(rhs_pf0.z, rhs_pf0.w); + + // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // ... + // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) + // ... + + lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y); + lhs_shmem2[threadIdx.y+16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w); + + +#define add_vals(fl1, fl2, fr1, fr2)\ + results[0].x += fl1.x * fr1.x;\ + results[0].y += fl1.y * fr1.x;\ + results[0].z += fl2.x * fr1.x;\ + results[0].w += fl2.y * fr1.x;\ +\ + results[1].x += fl1.x * fr1.y;\ + results[1].y += fl1.y * fr1.y;\ + results[1].z += fl2.x * fr1.y;\ + results[1].w += fl2.y * fr1.y;\ +\ + results[2].x += fl1.x * fr2.x;\ + results[2].y += fl1.y * fr2.x;\ + results[2].z += fl2.x * fr2.x;\ + results[2].w += fl2.y * fr2.x;\ +\ + results[3].x += fl1.x * fr2.y;\ + results[3].y += fl1.y * fr2.y;\ + results[3].z += fl2.x * fr2.y;\ + results[3].w += fl2.y * fr2.y;\ + + __syncthreads(); + + // Do the multiplies. + #pragma unroll + for (int koff = 0; koff < 16; koff ++) { + // 32 x threads. + float2 fl1 = lhs_shmem2[koff][threadIdx.x]; + float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x]; + + int start_feature = threadIdx.y * 4; + float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; + float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; + + add_vals(fl1, fl2, fr1, fr2) + } + __syncthreads(); + } + +#undef prefetch_lhs +#undef add_vals + + Index horiz_base = threadIdx.y*4+base_n; + if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (!CHECK_RHS_BOUNDARY) { + // CHECK LHS + if (lhs_vert + 3 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (lhs_vert + 2 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + } + } else if (lhs_vert + 1 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + } + } else if (lhs_vert < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + } + } + } else if (!CHECK_LHS_BOUNDARY) { + // CHECK RHS + /* + int ncols_rem = fminf(n_size- horiz_base, 4); + for (int i = 0; i < ncols_rem; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + 
output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + }*/ + for (int i = 0; i < 4; i++) { + if (horiz_base+i < n_size) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } else { + // CHECK both boundaries. + for (int i = 0; i < 4; i++) { + if (horiz_base+i < n_size) { + if (lhs_vert < m_size) + output(lhs_vert, horiz_base + i) = results[i].x; + if (lhs_vert + 1 < m_size) + output(lhs_vert + 1, horiz_base + i) = results[i].y; + if (lhs_vert + 2 < m_size) + output(lhs_vert + 2, horiz_base + i) = results[i].z; + if (lhs_vert + 3 < m_size) + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } +} + + +template<typename Index, typename LhsMapper, + typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY, + bool CHECK_RHS_BOUNDARY> +__device__ EIGEN_ALWAYS_INLINE void +EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, float2 lhs_shmem2[][32], + float2 rhs_shmem2[][8], const Index m_size, + const Index n_size, const Index k_size, + const Index base_m, const Index base_n) { + typedef float Scalar; + + // prefetch registers + float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3; + float4 rhs_pf0, rhs_pf1; + + float4 results[8]; + for (int i=0; i < 8; i++) { + results[i].x = results[i].y = results[i].z = results[i].w = 0; + } + + + Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32; + for (Index k = 0; k < k_size; k += 32) { + lhs_pf0 = internal::pset1<float4>(0); + lhs_pf1 = internal::pset1<float4>(0); + lhs_pf2 = internal::pset1<float4>(0); + lhs_pf3 = internal::pset1<float4>(0); + + rhs_pf0 = internal::pset1<float4>(0); + rhs_pf1 = internal::pset1<float4>(0); + + if (!CHECK_LHS_BOUNDARY) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + } + } else { + // just CHECK_LHS_BOUNDARY + if (lhs_vert + 3 < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + } + } else if (lhs_vert + 2 < m_size) { + if 
((threadIdx.y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); + lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24)); + lhs_pf3.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + } + } else if (lhs_vert + 1 < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); + lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + } + } else if (lhs_vert < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + 
lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + } + } + } + __syncthreads(); + Index rhs_vert = k+threadIdx.x*4; + Index rhs_horiz0 = threadIdx.y*2+base_n; + Index rhs_horiz1 = threadIdx.y*2+1+base_n; + if (!CHECK_RHS_BOUNDARY) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + } else if (rhs_vert + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + } + } else { + if (rhs_horiz1 < n_size) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + } else if (k+threadIdx.x*4 + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + } else if (k+threadIdx.x*4 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + } + } else if (rhs_horiz0 < n_size) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + } else if ((rhs_vert + 2) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if ((rhs_vert + 1) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } + } + __syncthreads(); + // Loaded. Do computation + // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1. + // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3. + // .. + // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63 + rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x); + // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1. + // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3. + // .. + rhs_shmem2[threadIdx.y+32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y); + // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1. + // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3. + rhs_shmem2[threadIdx.y+64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z); + // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1. + // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3. + rhs_shmem2[threadIdx.y+96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w); + + // LHS. 
+ // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) + // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) + // ... + // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) + // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) + + +#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\ + results[0].x += a_feat1.x * f1.x;\ + results[1].x += a_feat1.x * f1.y;\ + results[2].x += a_feat1.x * f2.x;\ + results[3].x += a_feat1.x * f2.y;\ + results[4].x += a_feat1.x * f3.x;\ + results[5].x += a_feat1.x * f3.y;\ + results[6].x += a_feat1.x * f4.x;\ + results[7].x += a_feat1.x * f4.y;\ +\ + results[0].y += a_feat1.y * f1.x;\ + results[1].y += a_feat1.y * f1.y;\ + results[2].y += a_feat1.y * f2.x;\ + results[3].y += a_feat1.y * f2.y;\ + results[4].y += a_feat1.y * f3.x;\ + results[5].y += a_feat1.y * f3.y;\ + results[6].y += a_feat1.y * f4.x;\ + results[7].y += a_feat1.y * f4.y;\ +\ + results[0].z += a_feat2.x * f1.x;\ + results[1].z += a_feat2.x * f1.y;\ + results[2].z += a_feat2.x * f2.x;\ + results[3].z += a_feat2.x * f2.y;\ + results[4].z += a_feat2.x * f3.x;\ + results[5].z += a_feat2.x * f3.y;\ + results[6].z += a_feat2.x * f4.x;\ + results[7].z += a_feat2.x * f4.y;\ +\ + results[0].w += a_feat2.y * f1.x;\ + results[1].w += a_feat2.y * f1.y;\ + results[2].w += a_feat2.y * f2.x;\ + results[3].w += a_feat2.y * f2.y;\ + results[4].w += a_feat2.y * f3.x;\ + results[5].w += a_feat2.y * f3.y;\ + results[6].w += a_feat2.y * f4.x;\ + results[7].w += a_feat2.y * f4.y;\ + + lhs_shmem2[threadIdx.y/4][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y); + lhs_shmem2[threadIdx.y/4+8][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y); + lhs_shmem2[threadIdx.y/4+16][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y); + lhs_shmem2[threadIdx.y/4+24][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y); + + lhs_shmem2[threadIdx.y/4 + 32][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w); + lhs_shmem2[threadIdx.y/4 + 40][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w); + lhs_shmem2[threadIdx.y/4 + 48][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w); + lhs_shmem2[threadIdx.y/4 + 56][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w); + + __syncthreads(); + + // Do the multiplies. + #pragma unroll + for (int koff = 0; koff < 32; koff ++) { + float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8]; + float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8]; + + // first feature is at (threadIdx.y/4) * 8 last is at start + 8. 
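+    // (Equivalently: this thread accumulates output features start_feature
+    //  .. start_feature + 7 for its four output rows; br1..br4 below fetch
+    //  those eight features from rhs_shmem2 two at a time.)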
+ int start_feature = (threadIdx.y / 4) * 8; + + float2 br1 = rhs_shmem2[start_feature/2 + (koff % 4) * 32][koff/4]; + float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4]; + float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4]; + float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4]; + + add_vals(a3, a4, br1, br2, br3, br4) + } + __syncthreads(); + } // end loop over k + + + __syncthreads(); + Index horiz_base = (threadIdx.y/4)*8+base_n; + if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { + #pragma unroll + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (!CHECK_RHS_BOUNDARY) { + if (lhs_vert + 3 < m_size) { + #pragma unroll + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (lhs_vert + 2 < m_size) { + #pragma unroll + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + } + } else if (lhs_vert + 1 < m_size) { + #pragma unroll + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + } + } else if (lhs_vert < m_size) { + #pragma unroll + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + } + } + } else if (!CHECK_LHS_BOUNDARY) { + // CHECK BOUNDARY_B + #pragma unroll + for (int i = 0; i < 8; i++) { + if (horiz_base + i < n_size) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } else { + // CHECK both boundaries. 
+ #pragma unroll + for (int i = 0; i < 8; i++) { + if (horiz_base + i < n_size) { + if (lhs_vert < m_size) + output(lhs_vert, horiz_base + i) = results[i].x; + if (lhs_vert + 1 < m_size) + output(lhs_vert + 1, horiz_base + i) = results[i].y; + if (lhs_vert + 2 < m_size) + output(lhs_vert + 2, horiz_base + i) = results[i].z; + if (lhs_vert + 3 < m_size) + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } +} + + +template<typename Index, typename LhsMapper, + typename RhsMapper, typename OutputMapper> +__global__ void +__launch_bounds__(256) +EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ float2 lhs_shmem[64*32]; + __shared__ float2 rhs_shmem[128*8]; + + typedef float2 LHS_MEM[64][32]; + typedef float2 RHS_MEM[128][8]; + + typedef float2 LHS_MEM16x16[32][16]; + typedef float2 RHS_MEM16x16[64][8]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 128 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + const bool check_rhs = (base_n + 63) >= n_size; + const bool check_lhs128 = (base_m + 127) >= m_size; + + if (!check_rhs) { + if (!check_lhs128) { + // >= 128 rows left + EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, false>( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, false>( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } + } else { + if (!check_lhs128) { + // >= 128 rows left + EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, true>( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, true>( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } + } +} + +template<typename Index, typename LhsMapper, + typename RhsMapper, typename OutputMapper> +__global__ void +__launch_bounds__(256) +EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ float2 lhs_shmem[32][16]; + __shared__ float2 rhs_shmem[64][8]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size) { + if (base_n + 63 < n_size) { + EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } + } else { + if (base_n + 63 < n_size) { + EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(lhs, rhs, output, 
lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } + } +} + + +template<typename Indices, typename LeftArgType, typename RightArgType> +struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> : + public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> > { + + typedef GpuDevice Device; + + typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self; + typedef TensorContractionEvaluatorBase<Self> Base; + + typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType; + typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType; + + enum { + Layout = TensorEvaluator<LeftArgType, Device>::Layout, + }; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef typename internal::conditional< + static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value; + static const int RDims = + internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value; + static const int ContractDims = internal::array_size<Indices>::value; + + typedef array<Index, LDims> left_dim_mapper_t; + typedef array<Index, RDims> right_dim_mapper_t; + + typedef array<Index, ContractDims> contract_t; + typedef array<Index, LDims - ContractDims> left_nocontract_t; + typedef array<Index, RDims - ContractDims> right_nocontract_t; + + static const int NumDims = LDims + RDims - 2 * ContractDims; + + typedef DSizes<Index, NumDims> Dimensions; + + // typedefs needed in evalTo + typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar; + typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar; + + typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator; + typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator; + + typedef typename LeftEvaluator::Dimensions LeftDimensions; + typedef typename RightEvaluator::Dimensions RightDimensions; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) {} + + // We need to redefine this method to make nvcc happy + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + this->m_leftImpl.evalSubExprsIfNeeded(NULL); + this->m_rightImpl.evalSubExprsIfNeeded(NULL); + if (data) { + evalTo(data); + return false; + } else { + this->m_result = static_cast<Scalar *>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar))); + evalTo(this->m_result); + return true; + } + } + + void evalTo(Scalar* buffer) const { + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped<true, true, true, Unaligned>(buffer); + } + else { + evalTyped<true, true, false, 
Unaligned>(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped<true, false, true, Unaligned>(buffer); + } + else { + evalTyped<true, false, false, Unaligned>(buffer); + } + } + } + else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped<false, true, true, Unaligned>(buffer); + } + else { + evalTyped<false, true, false, Unaligned>(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped<false, false, true, Unaligned>(buffer); + } + else { + evalTyped<false, false, false, Unaligned>(buffer); + } + } + } + } + + template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> + void evalTyped(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, + LeftEvaluator, left_nocontract_t, + contract_t, 4, + lhs_inner_dim_contiguous, + false, Unaligned> LhsMapper; + + typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, + RightEvaluator, right_nocontract_t, + contract_t, 4, + rhs_inner_dim_contiguous, + rhs_inner_dim_reordered, Unaligned> RhsMapper; + + typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper; + + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + setCudaSharedMemConfig(cudaSharedMemBankSizeEightByte); + if (internal::is_same<LhsScalar, float>::value && + internal::is_same<RhsScalar, float>::value) { + if (m < 768 || n < 768) { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(16, 16, 1); + LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k); + } else { + const Index m_blocks = (m + 127) / 128; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 32, 1); + LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k); + } + } else { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 8, 8); + LAUNCH_CUDA_KERNEL((EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k); + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_USE_GPU and __CUDACC__ +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMappers.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMappers.h new file mode 100644 index 0000000000..b5b09bf41e --- 
/dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMappers.h @@ -0,0 +1,383 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Eric Martin <eric@ericmart.in> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPERS_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPERS_H + +// NOTE: The file has strong column major bias/assumptions, which is pointed out +// in comments. As of right now, this code will only work the column major packing +// routines. + +/* + * A tensor contraction can be represented by a matrix multiplication. We don't + * want to actually reshape the tensor into a matrix (because this involves a + * full copy of the tensor), so the reshaping operation is implicit in a sense. + * This means we need a collection of methods take a matrix index and return + * the element of the tensor that would be at that index if we were to actually + * reshape the matrix. This file consists of these methods. + */ + +namespace Eigen { +namespace internal { + +enum { + Rhs = 0, + Lhs = 1, +}; + +/* + * Used to lookup the tensor index when working with the left and right + * arguments to a tensor contraction. + */ +template<typename Scalar, typename Index, int side, + typename Tensor, + typename nocontract_t, typename contract_t, + size_t packet_size, bool inner_dim_contiguous> +class SimpleTensorContractionMapper { + public: + EIGEN_DEVICE_FUNC + SimpleTensorContractionMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) : + m_tensor(tensor), + m_nocontract_strides(nocontract_strides), + m_ij_strides(ij_strides), + m_contract_strides(contract_strides), + m_k_strides(k_strides) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void prefetch(int i) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar operator()(Index row) const { + // column major assumption + return operator()(row, 0); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const { + return m_tensor.coeff(computeIndex(row, col)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const { + const bool left = (side == Lhs); + Index nocontract_val = left ? row : col; + Index linidx = 0; + for (int i = array_size<nocontract_t>::value - 1; i > 0; i--) { + const Index idx = nocontract_val / m_ij_strides[i]; + linidx += idx * m_nocontract_strides[i]; + nocontract_val -= idx * m_ij_strides[i]; + } + if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) { + if (side == Lhs && inner_dim_contiguous) { + eigen_assert(m_nocontract_strides[0] == 1); + linidx += nocontract_val; + } else { + linidx += nocontract_val * m_nocontract_strides[0]; + } + } + + Index contract_val = left ? 
col : row; + for (int i = array_size<contract_t>::value - 1; i > 0; i--) { + const Index idx = contract_val / m_k_strides[i]; + linidx += idx * m_contract_strides[i]; + contract_val -= idx * m_k_strides[i]; + } + EIGEN_STATIC_ASSERT(array_size<contract_t>::value > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + if (side == Rhs && inner_dim_contiguous) { + eigen_assert(m_contract_strides[0] == 1); + linidx += contract_val; + } else { + linidx += contract_val * m_contract_strides[0]; + } + + return linidx; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE IndexPair<Index> computeIndexPair(Index row, Index col, const Index distance) const { + const bool left = (side == Lhs); + Index nocontract_val[2] = {left ? row : col, left ? row + distance : col}; + Index linidx[2] = {0, 0}; + for (int i = array_size<nocontract_t>::value - 1; i > 0; i--) { + const Index idx0 = nocontract_val[0] / m_ij_strides[i]; + const Index idx1 = nocontract_val[1] / m_ij_strides[i]; + linidx[0] += idx0 * m_nocontract_strides[i]; + linidx[1] += idx1 * m_nocontract_strides[i]; + nocontract_val[0] -= idx0 * m_ij_strides[i]; + nocontract_val[1] -= idx1 * m_ij_strides[i]; + } + if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) { + if (side == Lhs && inner_dim_contiguous) { + eigen_assert(m_nocontract_strides[0] == 1); + linidx[0] += nocontract_val[0]; + linidx[1] += nocontract_val[1]; + } else { + linidx[0] += nocontract_val[0] * m_nocontract_strides[0]; + linidx[1] += nocontract_val[1] * m_nocontract_strides[0]; + } + } + + Index contract_val[2] = {left ? col : row, left ? col : row + distance}; + for (int i = array_size<contract_t>::value - 1; i > 0; i--) { + const Index idx0 = contract_val[0] / m_k_strides[i]; + const Index idx1 = contract_val[1] / m_k_strides[i]; + linidx[0] += idx0 * m_contract_strides[i]; + linidx[1] += idx1 * m_contract_strides[i]; + contract_val[0] -= idx0 * m_k_strides[i]; + contract_val[1] -= idx1 * m_k_strides[i]; + } + EIGEN_STATIC_ASSERT(array_size<contract_t>::value > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + if (side == Rhs && inner_dim_contiguous) { + eigen_assert(m_contract_strides[0] == 1); + linidx[0] += contract_val[0]; + linidx[1] += contract_val[1]; + } else { + linidx[0] += contract_val[0] * m_contract_strides[0]; + linidx[1] += contract_val[1] * m_contract_strides[0]; + } + return IndexPair<Index>(linidx[0], linidx[1]); + } + + Index firstAligned(Index size) const { + return size; + } + Index stride() const { + return 1; + } + + protected: + const Tensor m_tensor; + const nocontract_t m_nocontract_strides; + const nocontract_t m_ij_strides; + const contract_t m_contract_strides; + const contract_t m_k_strides; +}; + + + +template<typename Scalar, typename Index, int side, + typename Tensor, + typename nocontract_t, typename contract_t, + size_t packet_size, bool inner_dim_contiguous, + bool inner_dim_reordered, int Alignment> + class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous> +{ + public: + typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous> ParentMapper; + + EIGEN_DEVICE_FUNC + BaseTensorContractionMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) : + ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + typedef typename 
packet_traits<Scalar>::type Packet; + typedef typename packet_traits<Scalar>::half HalfPacket; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { + // whole method makes column major assumption + + // don't need to add offsets for now (because operator handles that) + // current code assumes packet size must be a multiple of 2 + EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + + if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) { + const Index index = this->computeIndex(i, j); + eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1); + return this->m_tensor.template packet<Alignment>(index); + } + + const IndexPair<Index> indexPair = this->computeIndexPair(i, j, packet_size - 1); + const Index first = indexPair.first; + const Index last = indexPair.second; + + // We can always do optimized packet reads from left hand side right now, because + // the vertical matrix dimension on the left hand side is never contracting. + // On the right hand side we need to check if the contracting dimensions may have + // been shuffled first. + if (Tensor::PacketAccess && + (side == Lhs || internal::array_size<contract_t>::value <= 1 || !inner_dim_reordered) && + (last - first) == (packet_size - 1)) { + + return this->m_tensor.template packet<Alignment>(first); + } + + EIGEN_ALIGN_DEFAULT Scalar data[packet_size]; + + data[0] = this->m_tensor.coeff(first); + for (Index k = 1; k < packet_size - 1; k += 2) { + const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1); + data[k] = this->m_tensor.coeff(internal_pair.first); + data[k + 1] = this->m_tensor.coeff(internal_pair.second); + } + data[packet_size - 1] = this->m_tensor.coeff(last); + + return pload<Packet>(data); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { + // whole method makes column major assumption + + // don't need to add offsets for now (because operator handles that) + const Index half_packet_size = unpacket_traits<HalfPacket>::size; + if (half_packet_size == packet_size) { + return loadPacket(i, j); + } + EIGEN_ALIGN_DEFAULT Scalar data[half_packet_size]; + for (Index k = 0; k < half_packet_size; k++) { + data[k] = operator()(i + k, j); + } + return pload<HalfPacket>(data); + } +}; + + +template<typename Scalar, typename Index, int side, + typename Tensor, + typename nocontract_t, typename contract_t, + bool inner_dim_contiguous, + bool inner_dim_reordered, int Alignment> +class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment> : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous> +{ + public: + typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous> ParentMapper; + + EIGEN_DEVICE_FUNC + BaseTensorContractionMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) : + ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + typedef typename packet_traits<Scalar>::type Packet; + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { + EIGEN_ALIGN_DEFAULT Scalar data[1]; + data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); + return pload<typename 
packet_traits<Scalar>::type>(data); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const { + return loadPacket(i, j); + } +}; + +template<typename Scalar, typename Index, int side, + typename Tensor, + typename nocontract_t, typename contract_t, + size_t packet_size, + bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment> +class TensorContractionInputMapper; + +template<typename Scalar, typename Index, int side, + typename Tensor, + typename nocontract_t, typename contract_t, + size_t packet_size, + bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment> +class TensorContractionSubMapper { + public: + typedef typename packet_traits<Scalar>::type Packet; + typedef typename packet_traits<Scalar>::half HalfPacket; + + typedef TensorContractionInputMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> ParentMapper; + typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Self; + typedef Self LinearMapper; + + EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset) + : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) { } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + return m_base_mapper(i + m_vert_offset, m_horiz_offset); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const { + return m_base_mapper(i + m_vert_offset, j + m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + return m_base_mapper.loadPacket(i + m_vert_offset, m_horiz_offset); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { + return m_base_mapper.loadPacket(i + m_vert_offset, j + m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { + return m_base_mapper.loadHalfPacket(i + m_vert_offset, m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { + m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset); + } + + template <typename PacketT, int AlignmentType> + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const { + EIGEN_STATIC_ASSERT((internal::is_same<PacketT, Packet>::value), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((AlignmentType == Aligned || Alignment == Unaligned), YOU_MADE_A_PROGRAMMING_MISTAKE); + return loadPacket(i); + } + + template <typename Packet> + EIGEN_DEVICE_FUNC bool aligned(Index i) const { + return false; + } + + private: + const ParentMapper& m_base_mapper; + const Index m_vert_offset; + const Index m_horiz_offset; +}; + + +template<typename Scalar, typename Index, int side, + typename Tensor, + typename nocontract_t, typename contract_t, + size_t packet_size, + bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment> +class TensorContractionInputMapper + : public BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> { + + public: + typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, 
contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Base; + typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper; + typedef SubMapper VectorMapper; + + EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) + : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { + return SubMapper(*this, i, j); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { + return VectorMapper(*this, i, j); + } +}; + + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPERS_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h new file mode 100644 index 0000000000..c335086902 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -0,0 +1,713 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H + +namespace Eigen { +namespace internal { + +// Specify blocking strategy for thread pool by cols +template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index> +struct ComputeGemmByColBlockingSizes { + void operator()(Index& k, Index& m, Index& n, Index num_threads = 1) + { + computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n, num_threads); + } +}; + +// Specify blocking strategy for thread pool by rows +template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index> +struct ComputeGemmByRowBlockingSizes { + void operator()(Index& k, Index& m, Index& n, Index num_threads = 1) + { + if (!k || !m || !n) { + return; + } + m = (((m / num_threads) + 15) / 16) * 16; + } +}; + +} // namespace internal +} // namespace Eigen + +// evaluator for thread pool device +#ifdef EIGEN_USE_THREADS + +namespace Eigen { +namespace internal { + +template<typename LhsScalar, typename LhsMapper, typename Index> +struct packLhsArg { + LhsScalar* blockA; + const LhsMapper& lhs; + const Index m_start; + const Index k_start; + const Index mc; + const Index kc; +}; + +template<typename LhsScalar, typename RhsScalar, typename RhsMapper, typename OutputMapper, typename Index> +struct packRhsAndKernelArg { + const FixedSizeVector<LhsScalar*>* blockAs; + RhsScalar* blockB; + const RhsMapper& rhs; + OutputMapper& output; + const Index m; + const Index k; + const Index n; + const Index mc; + const Index kc; + const Index nc; + const Index num_threads; + const Index num_blockAs; + const Index max_m; + const Index k_block_idx; + const Index m_block_idx; + const Index n_block_idx; + const Index m_blocks; + const Index n_blocks; + FixedSizeVector<Notification*>* kernel_notifications; + 
const FixedSizeVector<Notification*>* lhs_notifications; + const bool need_to_pack; +}; + +template<typename RhsScalar, typename RhsMapper, typename Index> +struct packRhsArg { + RhsScalar* blockB; + const RhsMapper& rhs; + const Index n_start; + const Index k_start; + const Index nc; + const Index kc; +}; + +template<typename LhsScalar, typename RhsScalar, typename LhsMapper, typename OutputMapper, typename Index> +struct packLhsAndKernelArg { + const FixedSizeVector<RhsScalar*>* blockBs; + LhsScalar* blockA; + const LhsMapper& lhs; + OutputMapper& output; + const Index m; + const Index k; + const Index n; + const Index mc; + const Index kc; + const Index nc; + const Index num_threads; + const Index num_blockBs; + const Index max_n; + const Index k_block_idx; + const Index m_block_idx; + const Index n_block_idx; + const Index m_blocks; + const Index n_blocks; + FixedSizeVector<Notification*>* kernel_notifications; + const FixedSizeVector<Notification*>* rhs_notifications; + const bool need_to_pack; +}; + +} // end namespace internal + + +template<typename Indices, typename LeftArgType, typename RightArgType> +struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> : + public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> > { + + typedef ThreadPoolDevice Device; + + typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self; + typedef TensorContractionEvaluatorBase<Self> Base; + + typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType; + typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, ThreadPoolDevice>::type PacketReturnType; + + enum { + Layout = TensorEvaluator<LeftArgType, Device>::Layout, + }; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. 
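The RowMajor/ColMajor "cheat" described in the comment above can be checked with a small standalone sketch (illustration only, written against the dense Eigen API rather than this evaluator): reinterpreting a RowMajor result buffer as ColMajor yields the transpose, and since (A*B)^T = B^T * A^T, running the ColMajor code path with the operands swapped writes exactly the RowMajor result.

#include <Eigen/Dense>
#include <cassert>

int main() {
  typedef Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> RowM;
  typedef Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor> ColM;
  RowM A = RowM::Random(3, 4);
  RowM B = RowM::Random(4, 5);
  RowM C = A * B;  // the RowMajor result we actually want

  // "Cheat" as the contraction code does: pretend B is the LHS and A is the
  // RHS and evaluate in ColMajor. The product equals C^T.
  ColM D = ColM(B).transpose() * ColM(A).transpose();

  // The ColMajor memory of C^T is exactly the RowMajor memory of C.
  assert(Eigen::Map<ColM>(C.data(), 5, 3).isApprox(D));
  return 0;
}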
+ typedef typename internal::conditional< + static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value; + static const int RDims = + internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value; + static const int ContractDims = internal::array_size<Indices>::value; + + typedef array<Index, LDims> left_dim_mapper_t; + typedef array<Index, RDims> right_dim_mapper_t; + + typedef array<Index, ContractDims> contract_t; + typedef array<Index, LDims - ContractDims> left_nocontract_t; + typedef array<Index, RDims - ContractDims> right_nocontract_t; + + static const int NumDims = LDims + RDims - 2 * ContractDims; + + typedef DSizes<Index, NumDims> Dimensions; + + // typedefs needed in evalTo + typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar; + typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar; + typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits; + + typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator; + typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator; + + TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) {} + + template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> + void evalProduct(Scalar* buffer) const { + // Disable Gemv on ARM/AVX or if multiple threads are in use +#if !defined(EIGEN_VECTORIZE_NEON) && !defined(EIGEN_VECTORIZE_AVX) + if (this->m_j_size == 1 && this->m_device.numThreads() == 1) { + this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer); + return; + } +#endif + + if (this->m_j_size / this->m_device.numThreads() < Traits::nr && + this->m_i_size / this->m_device.numThreads() >= Traits::mr) { + evalGemmByRows<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer); + } else { + evalGemmByCols<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer); + } + } + + template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> + void evalGemmByCols(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + + const int lhs_packet_size = PacketType<LhsScalar, Device>::size; + const int rhs_packet_size = PacketType<RhsScalar, Device>::size; + + typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, + LeftEvaluator, left_nocontract_t, + contract_t, lhs_packet_size, + lhs_inner_dim_contiguous, + false, Unaligned> LhsMapper; + + typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, + RightEvaluator, right_nocontract_t, + contract_t, rhs_packet_size, + rhs_inner_dim_contiguous, + rhs_inner_dim_reordered, Unaligned> RhsMapper; + + typedef internal::blas_data_mapper<Scalar, 
Index, ColMajor> OutputMapper; + + // TODO: packing could be faster sometimes if we supported row major tensor mappers + typedef internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, Traits::mr, + Traits::LhsProgress, ColMajor> LhsPacker; + typedef internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor> RhsPacker; + + // TODO: replace false, false with conjugate values? + typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, + Traits::mr, Traits::nr, false, false> GebpKernel; + + typedef internal::packLhsArg<LhsScalar, LhsMapper, Index> packLArg; + typedef internal::packRhsAndKernelArg<LhsScalar, RhsScalar, RhsMapper, OutputMapper, Index> packRKArg; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + LhsPacker pack_lhs; + + // compute block sizes (which depend on number of threads) + const Index num_threads = this->m_device.numThreads(); + Index mc = m; + Index nc = n; + Index kc = k; + internal::ComputeGemmByColBlockingSizes<LhsScalar,RhsScalar,1,Index> block; + block(kc, mc, nc, num_threads); + eigen_assert(mc <= m); + eigen_assert(nc <= n); + eigen_assert(kc <= k); + +#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b)) + const Index k_blocks = CEIL_DIV(k, kc); + const Index n_blocks = CEIL_DIV(n, nc); + const Index m_blocks = CEIL_DIV(m, mc); +#undef CEIL_DIV + + const int sizeA = mc * kc; + const int sizeB = kc * nc; + + /* cout << "m: " << m << " n: " << n << " k: " << k << endl; + cout << "mc: " << mc << " nc: " << nc << " kc: " << kc << endl; + cout << "m_blocks: " << m_blocks << " n_blocks: " << n_blocks << " k_blocks: " << k_blocks << endl; + cout << "num threads: " << num_threads << endl; + */ + + // note: m_device.allocate should return 16 byte aligned pointers, but if blockA and blockB + // aren't 16 byte aligned segfaults will happen due to SIMD instructions + // note: You can get away with allocating just a single blockA and offsets and meet the + // the alignment requirements with the assumption that + // (Traits::mr * sizeof(ResScalar)) % 16 == 0 + const Index numBlockAs = (std::min)(num_threads, m_blocks); + FixedSizeVector<LhsScalar *> blockAs(num_threads); + for (int i = 0; i < num_threads; i++) { + blockAs.push_back(static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar)))); + } + + // To circumvent alignment issues, I'm just going to separately allocate the memory for each thread + // TODO: is this too much memory to allocate? This simplifies coding a lot, but is wasteful. + // Other options: (1) reuse memory when a thread finishes. con: tricky + // (2) allocate block B memory in each thread. 
con: overhead + FixedSizeVector<RhsScalar *> blockBs(n_blocks); + for (int i = 0; i < n_blocks; i++) { + blockBs.push_back(static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar)))); + } + + // lhs_notifications starts with all null Notifications + FixedSizeVector<Notification*> lhs_notifications(num_threads, nullptr); + + // this should really be numBlockAs * n_blocks; + const Index num_kernel_notifications = num_threads * n_blocks; + FixedSizeVector<Notification*> kernel_notifications(num_kernel_notifications, + nullptr); + + for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) { + const Index k_start = k_block_idx * kc; + // make sure we don't overshoot right edge of left matrix + const Index actual_kc = (std::min)(k_start + kc, k) - k_start; + + for (Index m_block_idx = 0; m_block_idx < m_blocks; m_block_idx += numBlockAs) { + const int num_blocks = (std::min)(m_blocks-m_block_idx, numBlockAs); + + for (Index mt_block_idx = m_block_idx; mt_block_idx < m_block_idx+num_blocks; mt_block_idx++) { + const Index m_start = mt_block_idx * mc; + const Index actual_mc = (std::min)(m_start + mc, m) - m_start; + eigen_assert(actual_mc > 0); + + int blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads; + + // Wait for previous RHS kernels to complete. + for (int i = 0; i < n_blocks; ++i) { + int notification_id = (blockAId * n_blocks + i); + + // Wait for any current kernels using this slot to complete + // before using it. + if (kernel_notifications[notification_id]) { + wait_until_ready(kernel_notifications[notification_id]); + delete kernel_notifications[notification_id]; + } + kernel_notifications[notification_id] = new Notification(); + } + const packLArg arg = { + blockAs[blockAId], // blockA + lhs, // lhs + m_start, // m + k_start, // k + actual_mc, // mc + actual_kc, // kc + }; + + // Delete any existing notification since we may be + // replacing it. The algorithm should ensure that there are + // no existing waiters on this notification. + delete lhs_notifications[blockAId]; + lhs_notifications[blockAId] = + this->m_device.enqueue(&Self::packLhs<packLArg, LhsPacker>, arg); + } + + // now start kernels. + const Index m_base_start = m_block_idx * mc; + const bool need_to_pack = m_block_idx == 0; + + for (Index n_block_idx = 0; n_block_idx < n_blocks; n_block_idx++) { + const Index n_start = n_block_idx * nc; + const Index actual_nc = (std::min)(n_start + nc, n) - n_start; + + // first make sure the previous kernels are all done before overwriting rhs. Also wait if + // we're going to start new k. In both cases need_to_pack is true. + if (need_to_pack) { + for (int i = num_blocks; i < num_threads; ++i) { + Index blockAId = (k_block_idx * m_blocks + i + m_block_idx) % num_threads; + Index future_id = (blockAId * n_blocks + n_block_idx); + wait_until_ready(kernel_notifications[future_id]); + } + } + + packRKArg arg = { + &blockAs, // blockA + blockBs[n_block_idx], // blockB + rhs, // rhs + output, // output + m_base_start, // m + k_start, // k + n_start, // n + mc, // mc + actual_kc, // kc + actual_nc, // nc + num_threads, + numBlockAs, + m, + k_block_idx, + m_block_idx, + n_block_idx, // n_block_idx + m_blocks, // m_blocks + n_blocks, // n_blocks + &kernel_notifications, // kernel_notifications + &lhs_notifications, // lhs_notifications + need_to_pack, // need_to_pack + }; + + // We asynchronously kick off this function, which ends up + // notifying the appropriate kernel_notifications objects, + // which this thread waits on before exiting. 
+ // + // The wait for kernel_notifications below ensures that we + // don't have to keep track of the launch of this work. + this->m_device.enqueue_and_forget(&Self::packRhsAndKernel<packRKArg, RhsPacker, GebpKernel>, arg); + } + } + } + + // Make sure all the kernels are done. + for (int i = 0; i < kernel_notifications.size(); ++i) { + wait_until_ready(kernel_notifications[i]); + delete kernel_notifications[i]; + } + + // No need to wait for lhs notifications since they should have + // already been waited on. Just clean them up. + for (int i = 0; i < lhs_notifications.size(); ++i) { + delete lhs_notifications[i]; + } + + // deallocate all of the memory for both A and B's + for (int i = 0; i < blockAs.size(); i++) { + this->m_device.deallocate(blockAs[i]); + } + for (int i = 0; i < blockBs.size(); i++) { + this->m_device.deallocate(blockBs[i]); + } + } + + /* + * Packs a LHS block of size (mt, kc) starting at lhs(m, k). Before packing + * the LHS block, check that all of the kernels that worked on the same + * mt_block_idx in the previous m_block are done. + */ + template <typename packLArg, typename LhsPacker> + static void packLhs(const packLArg arg) { + // perform actual packing + LhsPacker pack_lhs; + pack_lhs(arg.blockA, arg.lhs.getSubMapper(arg.m_start, arg.k_start), arg.kc, arg.mc); + } + + /* + * Packs a RHS block of size (kc, nc) starting at (k, n) after checking that + * all kernels in the previous block are done. + * Then for each LHS future, we wait on the future and then call GEBP + * on the area packed by the future (which starts at + * blockA + future_idx * mt * kc) on the LHS and with the full packed + * RHS block. + * The output of this GEBP is written to output(m + i * mt, n). + */ + template <typename packRKArg, typename RhsPacker, typename GebpKernel> + static void packRhsAndKernel(packRKArg arg) { + if (arg.need_to_pack) { + RhsPacker pack_rhs; + pack_rhs(arg.blockB, arg.rhs.getSubMapper(arg.k, arg.n), arg.kc, arg.nc); + } + + GebpKernel gebp; + for (Index mt_block_idx = 0; mt_block_idx < arg.num_blockAs; mt_block_idx++) { + const Index m_base_start = arg.m + arg.mc*mt_block_idx; + if (m_base_start < arg.max_m) { + int blockAId = (arg.k_block_idx * arg.m_blocks + mt_block_idx + arg.m_block_idx) % arg.num_threads; + wait_until_ready((*arg.lhs_notifications)[blockAId]); + const Index actual_mc = (std::min)(m_base_start + arg.mc, arg.max_m) - m_base_start; + gebp(arg.output.getSubMapper(m_base_start, arg.n), + (*arg.blockAs)[blockAId], arg.blockB, + actual_mc, arg.kc, arg.nc, Scalar(1), -1, -1, 0, 0); + + // Notify that the kernel is done. 
+ const Index set_idx = blockAId * arg.n_blocks + arg.n_block_idx; + (*arg.kernel_notifications)[set_idx]->Notify(); + } + } + } + + template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> + void evalGemmByRows(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + const int lhs_packet_size = PacketType<LhsScalar, ThreadPoolDevice>::size; + const int rhs_packet_size = PacketType<RhsScalar, ThreadPoolDevice>::size; + + typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, + LeftEvaluator, left_nocontract_t, + contract_t, lhs_packet_size, + lhs_inner_dim_contiguous, + false, Unaligned> LhsMapper; + + typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, + RightEvaluator, right_nocontract_t, + contract_t, rhs_packet_size, + rhs_inner_dim_contiguous, + rhs_inner_dim_reordered, Unaligned> RhsMapper; + + typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper; + + // TODO: packing could be faster sometimes if we supported row major tensor mappers + typedef internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, Traits::mr, + Traits::LhsProgress, ColMajor> LhsPacker; + typedef internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor> RhsPacker; + + // TODO: replace false, false with conjugate values? + typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, + Traits::mr, Traits::nr, false, false> GebpKernel; + + typedef internal::packRhsArg<RhsScalar, RhsMapper, Index> packRArg; + typedef internal::packLhsAndKernelArg<LhsScalar, RhsScalar, LhsMapper, OutputMapper, Index> packLKArg; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + RhsPacker pack_rhs; + + // compute block sizes (which depend on number of threads) + const Index num_threads = this->m_device.numThreads(); + Index mc = m; + Index nc = n; + Index kc = k; + internal::ComputeGemmByRowBlockingSizes<LhsScalar,RhsScalar,1,Index> block; + block(kc, mc, nc, num_threads); + eigen_assert(mc <= m); + eigen_assert(nc <= n); + eigen_assert(kc <= k); + +#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b)) + const Index k_blocks = CEIL_DIV(k, kc); + const Index n_blocks = CEIL_DIV(n, nc); + const Index m_blocks = CEIL_DIV(m, mc); +#undef CEIL_DIV + + + const int sizeA = mc * kc; + const int sizeB = kc * nc; + + const Index numBlockBs = (std::min)(num_threads, n_blocks); + FixedSizeVector<RhsScalar *> blockBs(num_threads); + for (int i = 0; i < num_threads; i++) { + blockBs.push_back(static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar)))); + } + + FixedSizeVector<LhsScalar *> blockAs(m_blocks); + for (int i = 0; i < m_blocks; i++) { + blockAs.push_back(static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar)))); + } + + // lhs_notifications starts with all null Notifications + 
FixedSizeVector<Notification*> rhs_notifications(num_threads, nullptr); + + // this should really be numBlockBs * m_blocks; + const Index num_kernel_notifications = num_threads * m_blocks; + FixedSizeVector<Notification*> kernel_notifications(num_kernel_notifications, + nullptr); + + for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) { + const Index k_start = k_block_idx * kc; + // make sure we don't overshoot right edge of left matrix + const Index actual_kc = (std::min)(k_start + kc, k) - k_start; + + for (Index n_block_idx = 0; n_block_idx < n_blocks; n_block_idx += numBlockBs) { + const int num_blocks = (std::min)(n_blocks-n_block_idx, numBlockBs); + + for (Index nt_block_idx = n_block_idx; nt_block_idx < n_block_idx+num_blocks; nt_block_idx++) { + const Index n_start = nt_block_idx * nc; + const Index actual_nc = (std::min)(n_start + nc, n) - n_start; + eigen_assert(actual_nc > 0); + + int blockBId = (k_block_idx * n_blocks + nt_block_idx) % num_threads; + // Wait for previous RHS kernels to complete. + for (int i = 0; i < m_blocks; ++i) { + int notification_id = (blockBId * m_blocks + i); + + // Wait for any current kernels using this slot to complete + // before using it. + if (kernel_notifications[notification_id]) { + wait_until_ready(kernel_notifications[notification_id]); + delete kernel_notifications[notification_id]; + } + kernel_notifications[notification_id] = new Notification(); + } + const packRArg arg = { + blockBs[blockBId], // blockB + rhs, // rhs + n_start, // n + k_start, // k + actual_nc, // nc + actual_kc, // kc + }; + + // Delete any existing notification since we may be + // replacing it. The algorithm should ensure that there are + // no existing waiters on this notification. + delete rhs_notifications[blockBId]; + rhs_notifications[blockBId] = + this->m_device.enqueue(&Self::packRhs<packRArg, RhsPacker>, arg); + } + + // now start kernels. + const Index n_base_start = n_block_idx * nc; + const bool need_to_pack = n_block_idx == 0; + + for (Index m_block_idx = 0; m_block_idx < m_blocks; m_block_idx++) { + const Index m_start = m_block_idx * mc; + const Index actual_mc = (std::min)(m_start + mc, m) - m_start; + + // first make sure the previous kernels are all done before overwriting rhs. Also wait if + // we're going to start new k. In both cases need_to_pack is true. + if (need_to_pack) { + for (int i = num_blocks; i < num_threads; ++i) { + Index blockBId = (k_block_idx * n_blocks + i + n_block_idx) % num_threads; + Index future_id = (blockBId * m_blocks + m_block_idx); + wait_until_ready(kernel_notifications[future_id]); + } + } + + packLKArg arg = { + &blockBs, // blockB + blockAs[m_block_idx], // blockA + lhs, // lhs + output, // output + m_start, // m + k_start, // k + n_base_start, // n + actual_mc, // mc + actual_kc, // kc + nc, // nc + num_threads, + numBlockBs, + n, + k_block_idx, + m_block_idx, + n_block_idx, + m_blocks, + n_blocks, + &kernel_notifications, + &rhs_notifications, + need_to_pack, + }; + + // We asynchronously kick off this function, which ends up + // notifying the appropriate kernel_notifications objects, + // which this thread waits on before exiting. + // + // The wait for kernel_notifications below ensures that we + // don't have to keep track of the launch of this work. + this->m_device.enqueue_and_forget(&Self::packLhsAndKernel<packLKArg, LhsPacker, GebpKernel>, arg); + } + } + } + + // Make sure all the kernels are done. 
+ for (int i = 0; i < kernel_notifications.size(); ++i) { + wait_until_ready(kernel_notifications[i]); + delete kernel_notifications[i]; + } + + // No need to wait for lhs notifications since they should have + // already been waited on. Just clean them up. + for (int i = 0; i < rhs_notifications.size(); ++i) { + delete rhs_notifications[i]; + } + + // deallocate all of the memory for both A and B's + for (int i = 0; i < blockAs.size(); i++) { + this->m_device.deallocate(blockAs[i]); + } + for (int i = 0; i < blockBs.size(); i++) { + this->m_device.deallocate(blockBs[i]); + } + } + + template <typename packRArg, typename RhsPacker> + static void packRhs(const packRArg arg) { + // perform actual packing + RhsPacker pack_rhs; + pack_rhs(arg.blockB, arg.rhs.getSubMapper(arg.k_start, arg.n_start), arg.kc, arg.nc); + } + + template <typename packLKArg, typename LhsPacker, typename GebpKernel> + static void packLhsAndKernel(packLKArg arg) { + if (arg.need_to_pack) { + LhsPacker pack_lhs; + pack_lhs(arg.blockA, arg.lhs.getSubMapper(arg.m, arg.k), arg.kc, arg.mc); + } + + GebpKernel gebp; + for (Index nt_block_idx = 0; nt_block_idx < arg.num_blockBs; nt_block_idx++) { + const Index n_base_start = arg.n + arg.nc*nt_block_idx; + if (n_base_start < arg.max_n) { + int blockBId = (arg.k_block_idx * arg.n_blocks + nt_block_idx + arg.n_block_idx) % arg.num_threads; + wait_until_ready((*arg.rhs_notifications)[blockBId]); + const Index actual_nc = (std::min)(n_base_start + arg.nc, arg.max_n) - n_base_start; + gebp(arg.output.getSubMapper(arg.m, n_base_start), + arg.blockA, (*arg.blockBs)[blockBId], + arg.mc, arg.kc, actual_nc, Scalar(1), -1, -1, 0, 0); + + // Notify that the kernel is done. + const Index set_idx = blockBId * arg.m_blocks + arg.m_block_idx; + (*arg.kernel_notifications)[set_idx]->Notify(); + } + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_USE_THREADS +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h new file mode 100644 index 0000000000..d54091fa1c --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -0,0 +1,226 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H + +namespace Eigen { + +/** \class TensorConversionOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor conversion class. This class makes it possible to vectorize + * type casting operations when the number of scalars per packet in the source + * and the destination type differ + */ +namespace internal { +template<typename TargetType, typename XprType> +struct traits<TensorConversionOp<TargetType, XprType> > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. 
+ typedef TargetType Scalar; + typedef typename traits<XprType>::StorageKind StorageKind; + typedef typename traits<XprType>::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = traits<XprType>::NumDimensions; + static const int Layout = traits<XprType>::Layout; + enum { Flags = 0 }; +}; + +template<typename TargetType, typename XprType> +struct eval<TensorConversionOp<TargetType, XprType>, Eigen::Dense> +{ + typedef const TensorConversionOp<TargetType, XprType>& type; +}; + +template<typename TargetType, typename XprType> +struct nested<TensorConversionOp<TargetType, XprType>, 1, typename eval<TensorConversionOp<TargetType, XprType> >::type> +{ + typedef TensorConversionOp<TargetType, XprType> type; +}; + +} // end namespace internal + + +template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket, int SrcCoeffRatio, int TgtCoeffRatio> +struct PacketConverter { + PacketConverter(const TensorEvaluator& impl) + : m_impl(impl) {} + + template<int LoadMode, typename Index> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + return internal::pcast<SrcPacket, TgtPacket>(m_impl.template packet<LoadMode>(index)); + } + + private: + const TensorEvaluator& m_impl; +}; + + +template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket> +struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 2, 1> { + PacketConverter(const TensorEvaluator& impl) + : m_impl(impl) {} + + template<int LoadMode, typename Index> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size; + + SrcPacket src1 = m_impl.template packet<LoadMode>(index); + SrcPacket src2 = m_impl.template packet<LoadMode>(index + SrcPacketSize); + TgtPacket result = internal::pcast<SrcPacket, TgtPacket>(src1, src2); + return result; + } + + private: + const TensorEvaluator& m_impl; +}; + +template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket> +struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 4, 1> { + PacketConverter(const TensorEvaluator& impl) + : m_impl(impl) {} + + template<int LoadMode, typename Index> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size; + + SrcPacket src1 = m_impl.template packet<LoadMode>(index); + SrcPacket src2 = m_impl.template packet<LoadMode>(index + SrcPacketSize); + SrcPacket src3 = m_impl.template packet<LoadMode>(index + 2 * SrcPacketSize); + SrcPacket src4 = m_impl.template packet<LoadMode>(index + 3 * SrcPacketSize); + TgtPacket result = internal::pcast<SrcPacket, TgtPacket>(src1, src2, src3, src4); + return result; + } + + private: + const TensorEvaluator& m_impl; +}; + + +template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket> +struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 1, 2> { + PacketConverter(const TensorEvaluator& impl) + : m_impl(impl), m_maxIndex(impl.dimensions().TotalSize()) {} + + template<int LoadMode, typename Index> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size; + if (index + SrcPacketSize < m_maxIndex) { + return internal::pcast<SrcPacket, TgtPacket>(m_impl.template packet<LoadMode>(index)); + } else { + const int TgtPacketSize = internal::unpacket_traits<TgtPacket>::size; + 
EIGEN_ALIGN_DEFAULT typename internal::unpacket_traits<TgtPacket>::type values[TgtPacketSize]; + for (int i = 0; i < TgtPacketSize; ++i) { + values[i] = m_impl.coeff(index+i); + } + TgtPacket rslt = internal::pload<TgtPacket>(values); + return rslt; + } + } + + private: + const TensorEvaluator& m_impl; + const typename TensorEvaluator::Index m_maxIndex; +}; + +template<typename TargetType, typename XprType> +class TensorConversionOp : public TensorBase<TensorConversionOp<TargetType, XprType>, ReadOnlyAccessors> +{ + public: + typedef typename internal::traits<TensorConversionOp>::Scalar Scalar; + typedef typename internal::traits<TensorConversionOp>::StorageKind StorageKind; + typedef typename internal::traits<TensorConversionOp>::Index Index; + typedef typename internal::nested<TensorConversionOp>::type Nested; + typedef Scalar CoeffReturnType; + typedef typename NumTraits<Scalar>::Real RealScalar; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConversionOp(const XprType& xpr) + : m_xpr(xpr) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; +}; + + + + +// Eval as rvalue +template<typename TargetType, typename ArgType, typename Device> +struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device> +{ + typedef TensorConversionOp<TargetType, ArgType> XprType; + typedef typename XprType::Index Index; + typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; + typedef TargetType Scalar; + typedef TargetType CoeffReturnType; + typedef typename internal::remove_all<typename internal::traits<ArgType>::Scalar>::type SrcType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef typename PacketType<SrcType, Device>::type PacketSourceType; + + enum { + IsAligned = false, + PacketAccess = + TensorEvaluator<ArgType, Device>::PacketAccess && + internal::type_casting_traits<SrcType, TargetType>::VectorizedCast, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) + { + if (internal::is_same<TargetType, SrcType>::value) { + return m_impl.evalSubExprsIfNeeded((SrcType*)data); + } + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() + { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + internal::scalar_cast_op<SrcType, TargetType> converter; + return converter(m_impl.coeff(index)); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int SrcCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio; + const int TgtCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio; + PacketConverter<TensorEvaluator<ArgType, Device>, PacketSourceType, PacketReturnType, + SrcCoeffRatio, TgtCoeffRatio> converter(m_impl); + return converter.template packet<LoadMode>(index); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + TensorEvaluator<ArgType, Device> m_impl; +}; + +} // end namespace Eigen + 
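A minimal usage sketch for the conversion evaluator above, assuming the standard unsupported include path: TensorBase's cast<T>() is the usual way to create a TensorConversionOp, and the PacketConverter specializations handle the cases where a source packet holds a different number of scalars than a destination packet.

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> t(4, 4);
  t.setConstant(1.5f);

  // cast<double>() wraps t in a TensorConversionOp; evaluating the expression
  // converts each coefficient (vectorized when type_casting_traits allows it).
  Eigen::Tensor<double, 2> d = t.cast<double>();

  return d(3, 3) == 1.5 ? 0 : 1;
}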
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h new file mode 100644 index 0000000000..58cae7162c --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -0,0 +1,1076 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H + +namespace Eigen { + +/** \class TensorConvolution + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor convolution class. + * + * + */ +namespace internal { + +template <typename Index, typename InputDims, size_t NumKernelDims, int Layout> +class IndexMapper { + public: + IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims, + const array<Index, NumKernelDims>& indices) { + + array<Index, NumDims> dimensions = input_dims; + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = indices[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + dimensions[index] = result_dim; + } + + array<Index, NumDims> inputStrides; + array<Index, NumDims> outputStrides; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + inputStrides[0] = 1; + outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + inputStrides[i] = inputStrides[i-1] * input_dims[i-1]; + outputStrides[i] = outputStrides[i-1] * dimensions[i-1]; + } + } else { + inputStrides[NumDims - 1] = 1; + outputStrides[NumDims - 1] = 1; + for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) { + inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1]; + outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1]; + } + } + + array<Index, NumDims> cudaInputDimensions; + array<Index, NumDims> cudaOutputDimensions; + array<Index, NumDims> tmp = dimensions; + array<Index, NumDims> ordering; + const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 0 + : NumDims - NumKernelDims; + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = i + offset; + ordering[index] = indices[i]; + tmp[indices[i]] = -1; + cudaInputDimensions[index] = input_dims[indices[i]]; + cudaOutputDimensions[index] = dimensions[indices[i]]; + } + + int written = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 
NumKernelDims + : 0; + for (int i = 0; i < NumDims; ++i) { + if (tmp[i] >= 0) { + ordering[written] = i; + cudaInputDimensions[written] = input_dims[i]; + cudaOutputDimensions[written] = dimensions[i]; + ++written; + } + } + + for (int i = 0; i < NumDims; ++i) { + m_inputStrides[i] = inputStrides[ordering[i]]; + m_outputStrides[i] = outputStrides[ordering[i]]; + } + + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = 0; i < NumDims; ++i) { + if (i > NumKernelDims) { + m_cudaInputStrides[i] = + m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1]; + m_cudaOutputStrides[i] = + m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1]; + } else { + m_cudaInputStrides[i] = 1; + m_cudaOutputStrides[i] = 1; + } + } + } else { + for (int i = NumDims - 1; i >= 0; --i) { + if (i + 1 < offset) { + m_cudaInputStrides[i] = + m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1]; + m_cudaOutputStrides[i] = + m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1]; + } else { + m_cudaInputStrides[i] = 1; + m_cudaOutputStrides[i] = 1; + } + } + } + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const { + Index inputIndex = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int d = NumDims - 1; d > NumKernelDims; --d) { + const Index idx = p / m_cudaInputStrides[d]; + inputIndex += idx * m_inputStrides[d]; + p -= idx * m_cudaInputStrides[d]; + } + inputIndex += p * m_inputStrides[NumKernelDims]; + } else { + int limit = 0; + if (NumKernelDims < NumDims) { + limit = NumDims - NumKernelDims - 1; + } + for (int d = 0; d < limit; ++d) { + const Index idx = p / m_cudaInputStrides[d]; + inputIndex += idx * m_inputStrides[d]; + p -= idx * m_cudaInputStrides[d]; + } + inputIndex += p * m_inputStrides[limit]; + } + return inputIndex; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const { + Index outputIndex = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int d = NumDims - 1; d > NumKernelDims; --d) { + const Index idx = p / m_cudaOutputStrides[d]; + outputIndex += idx * m_outputStrides[d]; + p -= idx * m_cudaOutputStrides[d]; + } + outputIndex += p * m_outputStrides[NumKernelDims]; + } else { + int limit = 0; + if (NumKernelDims < NumDims) { + limit = NumDims - NumKernelDims - 1; + } + for (int d = 0; d < limit; ++d) { + const Index idx = p / m_cudaOutputStrides[d]; + outputIndex += idx * m_outputStrides[d]; + p -= idx * m_cudaOutputStrides[d]; + } + outputIndex += p * m_outputStrides[limit]; + } + return outputIndex; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const { + const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_inputStrides[offset]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const { + const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_outputStrides[offset]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const { + const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 
0 + : NumDims - NumKernelDims; + return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const { + const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const { + const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] + + k * m_inputStrides[offset + 2]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const { + const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] + + k * m_outputStrides[offset + 2]; + } + + private: + static const size_t NumDims = internal::array_size<InputDims>::value; + array<Index, NumDims> m_inputStrides; + array<Index, NumDims> m_outputStrides; + array<Index, NumDims> m_cudaInputStrides; + array<Index, NumDims> m_cudaOutputStrides; +}; + + + +template<typename Dimensions, typename InputXprType, typename KernelXprType> +struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename promote_storage_type<typename InputXprType::Scalar, + typename KernelXprType::Scalar>::ret Scalar; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind, + typename traits<KernelXprType>::StorageKind>::ret StorageKind; + typedef typename promote_index_type<typename traits<InputXprType>::Index, + typename traits<KernelXprType>::Index>::type Index; + typedef typename InputXprType::Nested LhsNested; + typedef typename KernelXprType::Nested RhsNested; + typedef typename remove_reference<LhsNested>::type _LhsNested; + typedef typename remove_reference<RhsNested>::type _RhsNested; + static const int NumDimensions = traits<InputXprType>::NumDimensions; + static const int Layout = traits<InputXprType>::Layout; + + enum { + Flags = 0, + }; +}; + +template<typename Dimensions, typename InputXprType, typename KernelXprType> +struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense> +{ + typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type; +}; + +template<typename Dimensions, typename InputXprType, typename KernelXprType> +struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type> +{ + typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type; +}; + +} // end namespace internal + + + +template<typename Indices, typename InputXprType, typename KernelXprType> +class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorConvolutionOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename 
internal::promote_storage_type<typename InputXprType::CoeffReturnType, + typename KernelXprType::CoeffReturnType>::ret CoeffReturnType; + typedef typename internal::promote_storage_type<typename InputXprType::PacketReturnType, + typename KernelXprType::PacketReturnType>::ret PacketReturnType; + typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested; + typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims) + : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Indices& indices() const { return m_indices; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const typename internal::remove_all<typename InputXprType::Nested>::type& + inputExpression() const { return m_input_xpr; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const typename internal::remove_all<typename KernelXprType::Nested>::type& + kernelExpression() const { return m_kernel_xpr; } + + protected: + typename InputXprType::Nested m_input_xpr; + typename KernelXprType::Nested m_kernel_xpr; + const Indices m_indices; +}; + + +template<typename Indices, typename InputArgType, typename KernelArgType, typename Device> +struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device> +{ + typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType; + + static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value; + static const int NumKernelDims = internal::array_size<Indices>::value; + typedef typename XprType::Index Index; + typedef DSizes<Index, NumDims> Dimensions; + + enum { + IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & + TensorEvaluator<KernelArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & + TensorEvaluator<KernelArgType, Device>::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator<InputArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device) + { + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_inputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1]; + } + } else { + m_inputStride[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1]; + } + } + + m_dimensions = m_inputImpl.dimensions(); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index 
input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + if (i > 0) { + m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1]; + } else { + m_kernelStride[0] = 1; + } + m_indexStride[i] = m_inputStride[index]; + } + + m_outputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1]; + } + } else { + for (int i = NumKernelDims - 1; i >= 0; --i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + if (i < NumKernelDims - 1) { + m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1]; + } else { + m_kernelStride[NumKernelDims - 1] = 1; + } + m_indexStride[i] = m_inputStride[index]; + } + + m_outputStride[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1]; + } + } + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_inputImpl.evalSubExprsIfNeeded(NULL); + preloadKernel(); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_local_kernel) { + m_device.deallocate((void*)m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + + void evalTo(typename XprType::Scalar* buffer) { + evalSubExprsIfNeeded(NULL); + for (int i = 0; i < dimensions().TotalSize(); ++i) { + buffer[i] += coeff(i); + } + cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + CoeffReturnType result = CoeffReturnType(0); + convolve(firstInput(index), 0, NumKernelDims-1, result); + return result; + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const + { + const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; + Index indices[2] = {index, index+PacketSize-1}; + Index startInputs[2] = {0, 0}; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStride[i]; + const Index idx1 = indices[1] / m_outputStride[i]; + startInputs[0] += idx0 * m_inputStride[i]; + startInputs[1] += idx1 * m_inputStride[i]; + indices[0] -= idx0 * m_outputStride[i]; + indices[1] -= idx1 * m_outputStride[i]; + } + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / m_outputStride[i]; + const Index idx1 = indices[1] / m_outputStride[i]; + startInputs[0] += idx0 * m_inputStride[i]; + startInputs[1] += idx1 * m_inputStride[i]; + indices[0] -= idx0 * m_outputStride[i]; + indices[1] -= idx1 * m_outputStride[i]; + } + } + startInputs[0] += indices[0]; + startInputs[1] += indices[1]; + + if (startInputs[1]-startInputs[0] == PacketSize-1) { + PacketReturnType result = internal::pset1<PacketReturnType>(0); + convolvePacket(startInputs[0], 0, NumKernelDims-1, result); + return result; + } else { + EIGEN_ALIGN_DEFAULT Scalar data[PacketSize]; + data[0] = Scalar(0); + convolve(startInputs[0], 0, NumKernelDims-1, data[0]); + for (int i = 1; i < 
PacketSize-1; ++i) { + data[i] = Scalar(0); + convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]); + } + data[PacketSize-1] = Scalar(0); + convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]); + return internal::pload<PacketReturnType>(data); + } + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + private: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { + Index startInput = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStride[i]; + startInput += idx * m_inputStride[i]; + index -= idx * m_outputStride[i]; + } + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStride[i]; + startInput += idx * m_inputStride[i]; + index -= idx * m_outputStride[i]; + } + } + startInput += index; + return startInput; + } + + EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const { + for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { + const Index input = firstIndex + j * m_indexStride[DimIndex]; + const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; + if (DimIndex > 0) { + convolve(input, kernel, DimIndex-1, accum); + } else { + accum += m_inputImpl.coeff(input) * m_kernel[kernel]; + } + } + } + + template <typename Packet> + EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const { + for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { + const Index input = firstIndex + j * m_indexStride[DimIndex]; + const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; + if (DimIndex > 0) { + convolvePacket(input, kernel, DimIndex-1, accum); + } else { + accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. it's an + // expression that needs to be evaluated) + const Scalar* in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + Scalar* local = (Scalar*)m_device.allocate(kernel_sz); + typedef TensorEvalToOp<const KernelArgType> EvalTo; + EvalTo evalToTmp(local, m_kernelArg); + const bool PacketAccess = internal::IsVectorizable<Device, KernelArgType>::value; + const bool BlockAccess = false; + internal::TensorExecutor<const EvalTo, Device, PacketAccess, BlockAccess>::run(evalToTmp, m_device); + + m_kernel = local; + m_local_kernel = true; + } + } + + array<Index, NumDims> m_inputStride; + array<Index, NumDims> m_outputStride; + + array<Index, NumKernelDims> m_indexStride; + array<Index, NumKernelDims> m_kernelStride; + TensorEvaluator<InputArgType, Device> m_inputImpl; + TensorEvaluator<KernelArgType, Device> m_kernelImpl; + Dimensions m_dimensions; + + KernelArgType m_kernelArg; + const Scalar* m_kernel; + bool m_local_kernel; + const Device& m_device; +}; + + + + +// Use an optimized implementation of the evaluation code for GPUs whenever possible. 
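The CPU evaluator above sizes every convolved dimension as result_dim = input_dim - kernel_dim + 1 and then accumulates each output coefficient by recursing over the kernel dimensions in convolve(). A minimal, hedged usage sketch of the expression this evaluator serves, via TensorBase::convolve(); the tensor names, shapes and include path are illustrative and not taken from this diff:

    #include <unsupported/Eigen/CXX11/Tensor>

    void convolution_sketch() {
      Eigen::Tensor<float, 2> input(20, 30);   // 20 x 30 input
      Eigen::Tensor<float, 1> kernel(3);       // 1-D kernel of length 3
      input.setRandom();
      kernel.setRandom();

      // Convolve along dimension 0 only.
      Eigen::array<ptrdiff_t, 1> dims;
      dims[0] = 0;

      // The result is (20 - 3 + 1) x 30 = 18 x 30, per the
      // result_dim = input_dim - kernel_dim + 1 rule used above.
      Eigen::Tensor<float, 2> output = input.convolve(kernel, dims);
    }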
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) + +template <int StaticKernelSize> +struct GetKernelSize { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const { + return StaticKernelSize; + } +}; +template <> +struct GetKernelSize<Dynamic> { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const { + return kernelSize; + } +}; + +template <typename InputEvaluator, typename Index, typename InputDims, + int StaticKernelSize> +__global__ void EigenConvolutionKernel1D( + InputEvaluator eval, + const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout> + indexMapper, + const float* __restrict kernel, const int numPlanes, const int numX, + const int maxX, const int kernelSize, float* buffer) { + extern __shared__ float s[]; + + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize); + const int num_x_output = last_x - first_x + 1; + + const int first_plane = blockIdx.y * blockDim.y; + const int plane_stride = blockDim.y * gridDim.y; + + for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) { + // Load inputs to shared memory + const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = threadIdx.y * num_x_input; + #pragma unroll + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { + const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x); + s[i + plane_kernel_offset] = eval.coeff(tensor_index); + } + + __syncthreads(); + + // Compute the convolution + const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); + + #pragma unroll + for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { + const int kernel_offset = plane_kernel_offset + i; + float result = 0.0f; + #pragma unroll + for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) { + result += s[k + kernel_offset] * kernel[k]; + } + const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x); + buffer[tensor_index] = result; + } + __syncthreads(); + } +}; + +template <typename InputEvaluator, typename Index, typename InputDims, + int StaticKernelSizeX, int StaticKernelSizeY> +__global__ __launch_bounds__(1024, 1) void EigenConvolutionKernel2D( + InputEvaluator eval, + const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout> + indexMapper, + const float* __restrict kernel, const int numPlanes, const int numX, + const int maxX, const int numY, const int maxY, const int kernelSizeX, + const int kernelSizeY, float* buffer) { + extern __shared__ float s[]; + + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX); + const int num_x_output = last_x - first_x + 1; + + const int first_y = blockIdx.y * maxY; + const int last_y = (first_y + maxY < numY ? 
first_y + maxY : numY) - 1; + const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY); + const int num_y_output = last_y - first_y + 1; + + const int first_plane = blockIdx.z * blockDim.z; + const int plane_stride = blockDim.z * gridDim.z; + + for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) { + + const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = threadIdx.z * num_y_input; + + // Load inputs to shared memory + #pragma unroll + for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { + const int input_offset = num_x_input * (j + plane_kernel_offset); + #pragma unroll + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { + const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y); + s[i + input_offset] = eval.coeff(tensor_index); + } + } + + __syncthreads(); + + // Convolution + const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); + + #pragma unroll + for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { + #pragma unroll + for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { + float result = 0.0f; + #pragma unroll + for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) { + const int kernel_offset = kernelSizeX * l; + const int input_offset = i + num_x_input * (j + l + plane_kernel_offset); + #pragma unroll + for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) { + result += s[k + input_offset] * kernel[k + kernel_offset]; + } + } + const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y); + buffer[tensor_index] = result; + } + } + + __syncthreads(); + } +}; + +template <typename InputEvaluator, typename Index, typename InputDims> +__global__ void EigenConvolutionKernel3D( + InputEvaluator eval, + const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout> + indexMapper, + const float* __restrict kernel, const size_t numPlanes, const size_t numX, + const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ, + const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, + const size_t kernelSizeZ, float* buffer) { + extern __shared__ float s[]; + + // Load inputs to shared memory + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + kernelSizeX; + + const int first_y = blockIdx.y * maxY; + const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1; + const int num_y_input = last_y - first_y + kernelSizeY; + + const int first_z = blockIdx.z * maxZ; + const int last_z = (first_z + maxZ < numZ ? 
first_z + maxZ : numZ) - 1; + const int num_z_input = last_z - first_z + kernelSizeZ; + + for (int p = 0; p < numPlanes; ++p) { + + const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = 0; + + for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) { + for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { + const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z); + s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index); + } + } + } + + __syncthreads(); + + // Convolution + const int num_z_output = last_z - first_z + 1; + const int num_y_output = last_y - first_y + 1; + const int num_x_output = last_x - first_x + 1; + const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); + + for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) { + for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { + for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { + float result = 0.0f; + for (int n = 0; n < kernelSizeZ; ++n) { + for (int m = 0; m < kernelSizeY; ++m) { + for (int l = 0; l < kernelSizeX; ++l) { + result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)]; + } + } + } + const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z); + buffer[tensor_index] = result; + } + } + } + __syncthreads(); + } +}; + + + +template<typename Indices, typename InputArgType, typename KernelArgType> +struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice> +{ + typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType; + + static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value; + static const int NumKernelDims = internal::array_size<Indices>::value; + typedef typename XprType::Index Index; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions; + + enum { + IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & + TensorEvaluator<KernelArgType, GpuDevice>::IsAligned, + PacketAccess = false, + BlockAccess = false, + Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device) + : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device) + { + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + + m_dimensions = m_inputImpl.dimensions(); + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const 
Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + } + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename InputArgType::Scalar Scalar; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + preloadKernel(); + m_inputImpl.evalSubExprsIfNeeded(NULL); + if (data) { + executeEval(data); + return false; + } else { + m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)); + executeEval(m_buf); + return true; + } + } + + EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_buf) { + m_device.deallocate(m_buf); + m_buf = NULL; + } + if (m_local_kernel) { + m_device.deallocate((void*)m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + + EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. it's an + // expression that needs to be evaluated) + const Scalar* in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + Scalar* local = (Scalar*)m_device.allocate(kernel_sz); + typedef TensorEvalToOp<const KernelArgType> EvalTo; + EvalTo evalToTmp(local, m_kernelArg); + const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value; + const bool BlockAccess = false; + internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess, BlockAccess>::run(evalToTmp, m_device); + + m_kernel = local; + m_local_kernel = true; + } + } + + static unsigned int ceil(unsigned int num, unsigned int denom) { + const unsigned int rounded_toward_zero = num / denom; + if (num > rounded_toward_zero * denom) { + return rounded_toward_zero + 1; + } + return rounded_toward_zero; + } + + void executeEval(Scalar* data) const { + typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims; + + const int maxSharedMem = m_device.sharedMemPerBlock(); + const int maxThreadsPerBlock = m_device.maxCudaThreadsPerBlock(); + const int maxBlocksPerProcessor = m_device.maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock; + const int numMultiProcessors = m_device.getNumCudaMultiProcessors(); + const int warpSize = 32; + + switch (NumKernelDims) { + case 1: { + const int kernel_size = m_kernelImpl.dimensions().TotalSize(); + + const int numX = dimensions()[m_indices[0]]; + const int numP = dimensions().TotalSize() / numX; + int maxX; + dim3 block_size; + + const int single_stride_dim = + static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 
0 + : m_inputImpl.dimensions().rank() - 1; + if (m_indices[0] == single_stride_dim) { + // Maximum the reuse + const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32; + maxX = (std::min<int>)(inner_dim, numX); + const int maxP = (std::min<int>)(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP); + block_size.x = numext::mini(maxThreadsPerBlock, maxX); + block_size.y = (std::min<int>)(maxThreadsPerBlock / block_size.x, maxP); + } + else { + // Read as much as possible alongside the inner most dimension, that is the plane + const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar)); + const int maxP = (std::min<int>)(inner_dim, numP); + maxX = (std::min<int>)(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX); + + block_size.x = numext::mini(warpSize, maxX); + block_size.y = (std::min<int>)(maxThreadsPerBlock/block_size.x, maxP); + } + + const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar); + assert(shared_mem <= maxSharedMem); + + const int num_x_blocks = ceil(numX, maxX); + const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem); + const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks); + + dim3 num_blocks(num_x_blocks, std::min<int>(num_y_blocks, ceil(numP, block_size.y))); + + + //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + + const array<Index, 1> indices(m_indices[0]); + const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]); + internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper( + m_inputImpl.dimensions(), kernel_dims, indices); + switch(kernel_size) { + case 4: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data); + break; + } + case 7: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data); + break; + } + default: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data); + } + } + break; + } + + case 2: { + const int idxX = + static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1; + const int idxY = + static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 
1 : 0; + const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; + const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; + + const int numX = dimensions()[m_indices[idxX]]; + const int numY = dimensions()[m_indices[idxY]]; + const int numP = dimensions().TotalSize() / (numX*numY); + + const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x)); + + // Snap maxX to warp size + int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32; + const int maxX = (std::min<int>)(inner_dim, numX); + const int maxY = (std::min<int>)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY); + const int maxP = (std::min<int>)(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP); + + dim3 block_size; + block_size.x = numext::mini(1024, maxX); + block_size.y = (std::min<int>)(1024/block_size.x, maxY); + block_size.z = (std::min<int>)(1024/(block_size.x*block_size.y), maxP); + + const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar); + assert(shared_mem <= maxSharedMem); + + const int num_x_blocks = ceil(numX, maxX); + const int num_y_blocks = ceil(numY, maxY); + const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem); + const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks); + + dim3 num_blocks(num_x_blocks, num_y_blocks, std::min<int>(num_z_blocks, ceil(numP, block_size.z))); + + + //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + + const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]); + const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX], + m_kernelImpl.dimensions()[idxY]); + internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper( + m_inputImpl.dimensions(), kernel_dims, indices); + switch (kernel_size_x) { + case 4: { + switch (kernel_size_y) { + case 7: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data); + break; + } + default: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data); + break; + } + } + break; + } + case 7: { + switch (kernel_size_y) { + case 4: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data); + break; + } + default: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data); + break; + } + } + break; + } + default: { + 
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data); + break; + } + } + break; + } + + case 3: { + const int idxX = + static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2; + const int idxY = + static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1; + const int idxZ = + static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0; + + const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; + const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; + const int kernel_size_z = m_kernelImpl.dimensions()[idxZ]; + + const int numX = dimensions()[m_indices[idxX]]; + const int numY = dimensions()[m_indices[idxY]]; + const int numZ = dimensions()[m_indices[idxZ]]; + const int numP = dimensions().TotalSize() / (numX*numY*numZ); + + const int maxX = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX)); + const int maxY = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY)); + const int maxZ = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ)); + + dim3 block_size; + block_size.x = numext::mini(32, maxX); + block_size.y = numext::mini(32, maxY); + block_size.z = (std::min<int>)(1024/(block_size.x*block_size.y), maxZ); + dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ)); + + const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar); + assert(shared_mem <= maxSharedMem); + + //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + const array<Index, 3> indices(m_indices[idxX], m_indices[idxY], + m_indices[idxZ]); + const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX], + m_kernelImpl.dimensions()[idxY], + m_kernelImpl.dimensions()[idxZ]); + internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper( + m_inputImpl.dimensions(), kernel_dims, indices); + + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); + break; + } + + default: { + EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + eigen_assert(m_buf); + eigen_assert(index < m_dimensions.TotalSize()); + return m_buf[index]; + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const + { + eigen_assert(m_buf); + eigen_assert(index < m_dimensions.TotalSize()); + return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index); + } + + private: + // No assignment (copies are needed by the kernels) + TensorEvaluator& operator = 
(const TensorEvaluator&); + + TensorEvaluator<InputArgType, GpuDevice> m_inputImpl; + TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl; + KernelArgType m_kernelArg; + Indices m_indices; + Dimensions m_dimensions; + Scalar* m_buf; + const Scalar* m_kernel; + bool m_local_kernel; + + const GpuDevice& m_device; +}; +#endif + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h new file mode 100644 index 0000000000..dc39565d6b --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h @@ -0,0 +1,302 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H +#define EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H + +namespace Eigen { + +/** \class TensorCustomUnaryOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor custom class. + * + * + */ +namespace internal { +template<typename CustomUnaryFunc, typename XprType> +struct traits<TensorCustomUnaryOp<CustomUnaryFunc, XprType> > +{ + typedef typename XprType::Scalar Scalar; + typedef typename XprType::StorageKind StorageKind; + typedef typename XprType::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = traits<XprType>::NumDimensions; + static const int Layout = traits<XprType>::Layout; +}; + +template<typename CustomUnaryFunc, typename XprType> +struct eval<TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Eigen::Dense> +{ + typedef const TensorCustomUnaryOp<CustomUnaryFunc, XprType>& type; +}; + +template<typename CustomUnaryFunc, typename XprType> +struct nested<TensorCustomUnaryOp<CustomUnaryFunc, XprType>, 1, typename eval<TensorCustomUnaryOp<CustomUnaryFunc, XprType> >::type> +{ + typedef TensorCustomUnaryOp<CustomUnaryFunc, XprType> type; +}; + +} // end namespace internal + + + +template<typename CustomUnaryFunc, typename XprType> +class TensorCustomUnaryOp : public TensorBase<TensorCustomUnaryOp<CustomUnaryFunc, XprType>, ReadOnlyAccessors> +{ + public: + typedef typename internal::traits<TensorCustomUnaryOp>::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename internal::nested<TensorCustomUnaryOp>::type Nested; + typedef typename internal::traits<TensorCustomUnaryOp>::StorageKind StorageKind; + typedef typename internal::traits<TensorCustomUnaryOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomUnaryOp(const XprType& expr, const CustomUnaryFunc& func) + : m_expr(expr), m_func(func) {} + + EIGEN_DEVICE_FUNC + const CustomUnaryFunc& func() const { return m_func; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_expr; } + + protected: + typename XprType::Nested m_expr; + const CustomUnaryFunc m_func; +}; + + +// Eval as rvalue +template<typename CustomUnaryFunc, typename XprType, typename Device> +struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, 
XprType>, Device> +{ + typedef TensorCustomUnaryOp<CustomUnaryFunc, XprType> ArgType; + typedef typename internal::traits<ArgType>::Index Index; + static const int NumDims = internal::traits<ArgType>::NumDimensions; + typedef DSizes<Index, NumDims> Dimensions; + typedef + typename internal::remove_const<typename ArgType::Scalar>::type Scalar; + + enum { + IsAligned = false, + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + BlockAccess = false, + Layout = TensorEvaluator<XprType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device) + : m_op(op), m_device(device), m_result(NULL) + { + m_dimensions = op.func().dimensions(op.expression()); + } + + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + if (data) { + evalTo(data); + return false; + } else { + m_result = static_cast<CoeffReturnType*>( + m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + evalTo(m_result); + return true; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + if (m_result != NULL) { + m_device.deallocate(m_result); + m_result = NULL; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_result[index]; + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + return internal::ploadt<PacketReturnType, LoadMode>(m_result + index); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; } + + protected: + EIGEN_DEVICE_FUNC void evalTo(Scalar* data) { + TensorMap<Tensor<CoeffReturnType, NumDims, Layout, Index> > result( + data, m_dimensions); + m_op.func().eval(m_op.expression(), result, m_device); + } + + Dimensions m_dimensions; + const ArgType m_op; + const Device& m_device; + CoeffReturnType* m_result; +}; + + + +/** \class TensorCustomBinaryOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor custom class. 
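Neither doc comment spells out what the custom functor must provide, but the evaluators make the contract clear: a dimensions() method that reports the output shape from the input expression(s), and an eval() method that writes into a preallocated output on the supplied device (the binary variant, evaluated below, simply takes two input expressions). A hedged sketch of a unary functor meeting that contract; the functor name and the customOp() call in the usage comment are illustrative assumptions, not taken from this header:

    // Illustrative only: the operation is trivial; the two required members are the point.
    struct ScaleByTwo {
      // Output shape equals the input shape.
      Eigen::DSizes<Eigen::DenseIndex, 2> dimensions(
          const Eigen::Tensor<float, 2>& input) const {
        return input.dimensions();
      }

      // Fill the preallocated output on the given device.
      template <typename Output, typename Device>
      void eval(const Eigen::Tensor<float, 2>& input, Output& output,
                const Device& device) const {
        output.device(device) = input * 2.0f;
      }
    };

    // Hypothetical usage, assuming TensorBase::customOp() is present in this snapshot:
    //   Eigen::Tensor<float, 2> a(30, 40);
    //   a.setRandom();
    //   Eigen::Tensor<float, 2> b = a.customOp(ScaleByTwo());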
+ * + * + */ +namespace internal { +template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> +struct traits<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> > +{ + typedef typename internal::promote_storage_type<typename LhsXprType::Scalar, + typename RhsXprType::Scalar>::ret Scalar; + typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType, + typename RhsXprType::CoeffReturnType>::ret CoeffReturnType; + typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind, + typename traits<RhsXprType>::StorageKind>::ret StorageKind; + typedef typename promote_index_type<typename traits<LhsXprType>::Index, + typename traits<RhsXprType>::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference<LhsNested>::type _LhsNested; + typedef typename remove_reference<RhsNested>::type _RhsNested; + static const int NumDimensions = traits<LhsXprType>::NumDimensions; + static const int Layout = traits<LhsXprType>::Layout; +}; + +template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> +struct eval<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Eigen::Dense> +{ + typedef const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>& type; +}; + +template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> +struct nested<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, 1, typename eval<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> >::type> +{ + typedef TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> type; +}; + +} // end namespace internal + + + +template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> +class TensorCustomBinaryOp : public TensorBase<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, ReadOnlyAccessors> +{ + public: + typedef typename internal::traits<TensorCustomBinaryOp>::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename internal::traits<TensorCustomBinaryOp>::CoeffReturnType CoeffReturnType; + typedef typename internal::nested<TensorCustomBinaryOp>::type Nested; + typedef typename internal::traits<TensorCustomBinaryOp>::StorageKind StorageKind; + typedef typename internal::traits<TensorCustomBinaryOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const CustomBinaryFunc& func) + + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_func(func) {} + + EIGEN_DEVICE_FUNC + const CustomBinaryFunc& func() const { return m_func; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename LhsXprType::Nested>::type& + lhsExpression() const { return m_lhs_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename RhsXprType::Nested>::type& + rhsExpression() const { return m_rhs_xpr; } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const CustomBinaryFunc m_func; +}; + + +// Eval as rvalue +template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType, typename Device> +struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Device> +{ + typedef TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> XprType; + typedef typename internal::traits<XprType>::Index Index; + static const int NumDims = internal::traits<XprType>::NumDimensions; + typedef 
DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + + enum { + IsAligned = false, + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + BlockAccess = false, + Layout = TensorEvaluator<LhsXprType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_op(op), m_device(device), m_result(NULL) + { + m_dimensions = op.func().dimensions(op.lhsExpression(), op.rhsExpression()); + } + + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + if (data) { + evalTo(data); + return false; + } else { + m_result = static_cast<Scalar *>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + evalTo(m_result); + return true; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + if (m_result != NULL) { + m_device.deallocate(m_result); + m_result = NULL; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_result[index]; + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + return internal::ploadt<PacketReturnType, LoadMode>(m_result + index); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; } + + protected: + EIGEN_DEVICE_FUNC void evalTo(Scalar* data) { + TensorMap<Tensor<Scalar, NumDims, Layout> > result(data, m_dimensions); + m_op.func().eval(m_op.lhsExpression(), m_op.rhsExpression(), result, m_device); + } + + Dimensions m_dimensions; + const XprType m_op; + const Device& m_device; + CoeffReturnType* m_result; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h new file mode 100644 index 0000000000..3c33015bc4 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h @@ -0,0 +1,154 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H + +namespace Eigen { + +/** \class TensorDevice + * \ingroup CXX11_Tensor_Module + * + * \brief Pseudo expression providing an operator = that will evaluate its argument + * on the specified computing 'device' (GPU, thread pool, ...) + * + * Example: + * C.device(EIGEN_GPU) = A + B; + * + * Todo: thread pools. + * Todo: operator +=, -=, *= and so on. 
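The one-line example above only hints at the mechanics, so here is a slightly fuller, hedged sketch of the thread-pool path. It assumes EIGEN_USE_THREADS (and EIGEN_USE_CUSTOM_THREAD_POOL, so the ThreadPool class defined later in TensorDeviceType.h is available); the pool size, tensor shapes and include path are illustrative:

    #define EIGEN_USE_THREADS
    #define EIGEN_USE_CUSTOM_THREAD_POOL
    #include <unsupported/Eigen/CXX11/Tensor>

    void thread_pool_assign() {
      Eigen::Tensor<float, 2> A(200, 200), B(200, 200), C(200, 200);
      A.setRandom();
      B.setRandom();

      Eigen::ThreadPool pool(4);                 // pool ownership stays with the caller
      Eigen::ThreadPoolDevice device(&pool, 4);  // (ThreadPoolInterface*, num_cores), see TensorDeviceType.h

      C.device(device) = A + B;                  // assignment evaluated on the pool
      C.device(device) += A;                     // operator+= is provided by this class
    }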
+ */ + +template <typename ExpressionType, typename DeviceType> class TensorDevice { + public: + TensorDevice(const DeviceType& device, ExpressionType& expression) : m_device(device), m_expression(expression) {} + + template<typename OtherDerived> + EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { + typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign; + Assign assign(m_expression, other); + internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device); + return *this; + } + + template<typename OtherDerived> + EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ExpressionType, const OtherDerived> Sum; + Sum sum(m_expression, other); + typedef TensorAssignOp<ExpressionType, const Sum> Assign; + Assign assign(m_expression, sum); + internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device); + return *this; + } + + template<typename OtherDerived> + EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp<internal::scalar_difference_op<Scalar>, const ExpressionType, const OtherDerived> Difference; + Difference difference(m_expression, other); + typedef TensorAssignOp<ExpressionType, const Difference> Assign; + Assign assign(m_expression, difference); + internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device); + return *this; + } + + protected: + const DeviceType& m_device; + ExpressionType& m_expression; +}; + + +#ifdef EIGEN_USE_THREADS +template <typename ExpressionType> class TensorDevice<ExpressionType, ThreadPoolDevice> { + public: + TensorDevice(const ThreadPoolDevice& device, ExpressionType& expression) : m_device(device), m_expression(expression) {} + + template<typename OtherDerived> + EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { + typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign; + Assign assign(m_expression, other); + internal::TensorExecutor<const Assign, ThreadPoolDevice>::run(assign, m_device); + return *this; + } + + template<typename OtherDerived> + EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ExpressionType, const OtherDerived> Sum; + Sum sum(m_expression, other); + typedef TensorAssignOp<ExpressionType, const Sum> Assign; + Assign assign(m_expression, sum); + internal::TensorExecutor<const Assign, ThreadPoolDevice>::run(assign, m_device); + return *this; + } + + template<typename OtherDerived> + EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp<internal::scalar_difference_op<Scalar>, const ExpressionType, const OtherDerived> Difference; + Difference difference(m_expression, other); + typedef TensorAssignOp<ExpressionType, const Difference> Assign; + Assign assign(m_expression, difference); + internal::TensorExecutor<const Assign, ThreadPoolDevice>::run(assign, m_device); + return *this; + } + + protected: + const ThreadPoolDevice& m_device; + ExpressionType& m_expression; +}; +#endif + +#if defined(EIGEN_USE_GPU) +template <typename ExpressionType> class TensorDevice<ExpressionType, GpuDevice> +{ + public: + TensorDevice(const GpuDevice& device, ExpressionType& expression) : 
m_device(device), m_expression(expression) {} + + template<typename OtherDerived> + EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { + typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign; + Assign assign(m_expression, other); + internal::TensorExecutor<const Assign, GpuDevice>::run(assign, m_device); + return *this; + } + + template<typename OtherDerived> + EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ExpressionType, const OtherDerived> Sum; + Sum sum(m_expression, other); + typedef TensorAssignOp<ExpressionType, const Sum> Assign; + Assign assign(m_expression, sum); + internal::TensorExecutor<const Assign, GpuDevice>::run(assign, m_device); + return *this; + } + + template<typename OtherDerived> + EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp<internal::scalar_difference_op<Scalar>, const ExpressionType, const OtherDerived> Difference; + Difference difference(m_expression, other); + typedef TensorAssignOp<ExpressionType, const Difference> Assign; + Assign assign(m_expression, difference); + internal::TensorExecutor<const Assign, GpuDevice>::run(assign, m_device); + return *this; + } + + protected: + const GpuDevice& m_device; + ExpressionType& m_expression; +}; +#endif + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h new file mode 100644 index 0000000000..b6eeb73832 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h @@ -0,0 +1,920 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
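For completeness, a hedged GPU counterpart of the sketch above. CudaStreamDevice is the StreamInterface implementation defined further down in this file; a GpuDevice constructor taking a StreamInterface pointer is assumed from this same file (it is not visible in this hunk), and the buffers are presumed to already live in device memory:

    #define EIGEN_USE_GPU
    #include <unsupported/Eigen/CXX11/Tensor>

    // Compiled with nvcc; gpu_a/gpu_b/gpu_c point to preallocated device buffers.
    void gpu_assign(float* gpu_a, float* gpu_b, float* gpu_c, int rows, int cols) {
      Eigen::TensorMap<Eigen::Tensor<float, 2> > A(gpu_a, rows, cols);
      Eigen::TensorMap<Eigen::Tensor<float, 2> > B(gpu_b, rows, cols);
      Eigen::TensorMap<Eigen::Tensor<float, 2> > C(gpu_c, rows, cols);

      Eigen::CudaStreamDevice stream;   // default CUDA stream on the current device
      Eigen::GpuDevice device(&stream); // assumed constructor, see the rest of this file

      C.device(device) = A + B;         // kernel launched on the device's stream
    }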
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H + +namespace Eigen { + +// Default device for the machine (typically a single cpu core) +struct DefaultDevice { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return internal::aligned_malloc(num_bytes); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + internal::aligned_free(buffer); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { + ::memcpy(dst, src, n); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { + memcpy(dst, src, n); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { + memcpy(dst, src, n); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { + ::memset(buffer, c, n); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { +#ifndef __CUDA_ARCH__ + // Running on the host CPU + return 1; +#else + // Running on a CUDA device + return 32; +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t memcpyThreshold() const { + return 2 * numThreads(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { +#ifndef __CUDA_ARCH__ + // Running on the host CPU + return l1CacheSize(); +#else + // Running on a CUDA device, return the amount of shared memory available. + return 48*1024; +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { +#ifndef __CUDA_ARCH__ + // Running single threaded on the host CPU + return l3CacheSize(); +#else + // Running on a CUDA device + return firstLevelCacheSize(); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { +#ifndef __CUDA_ARCH__ + // Running single threaded on the host CPU + // Should return an enum that encodes the ISA supported by the CPU + return 1; +#else + // Running on a CUDA device + return __CUDA_ARCH__ / 100; +#endif + } +}; + +// Multiple cpu cores +#ifdef EIGEN_USE_THREADS + +#if __cplusplus > 199711 +// This defines an interface that ThreadPoolDevice can take to use +// custom thread pools underneath. +class ThreadPoolInterface { + public: + virtual void Schedule(std::function<void()> fn) = 0; + + virtual ~ThreadPoolInterface() {} +}; +#endif + +// The implementation of the ThreadPool type ensures that the Schedule method +// runs the functions it is provided in FIFO order when the scheduling is done +// by a single thread. +#ifdef EIGEN_USE_CUSTOM_THREAD_POOL +class ThreadPool : public ThreadPoolInterface { + public: + // Construct a pool that contains "num_threads" threads. + explicit ThreadPool(int num_threads) : threads_(num_threads), waiters_(num_threads) { + for (int i = 0; i < num_threads; i++) { + threads_.push_back(new std::thread([this]() { WorkerLoop(); })); + } + } + + // Wait until all scheduled work has finished and then destroy the + // set of threads. + ~ThreadPool() { + { + // Wait for all work to get done. + std::unique_lock<std::mutex> l(mu_); + while (!pending_.empty()) { + empty_.wait(l); + } + exiting_ = true; + + // Wakeup all waiters. + for (auto w : waiters_) { + w->ready = true; + w->work = nullptr; + w->cv.notify_one(); + } + } + + // Wait for threads to finish. + for (auto t : threads_) { + t->join(); + delete t; + } + } + + // Schedule fn() for execution in the pool of threads. 
The functions are + // executed in the order in which they are scheduled. + void Schedule(std::function<void()> fn) final { + std::unique_lock<std::mutex> l(mu_); + if (waiters_.empty()) { + pending_.push_back(fn); + } else { + Waiter* w = waiters_.back(); + waiters_.pop_back(); + w->ready = true; + w->work = fn; + w->cv.notify_one(); + } + } + + protected: + void WorkerLoop() { + std::unique_lock<std::mutex> l(mu_); + Waiter w; + while (!exiting_) { + std::function<void()> fn; + if (pending_.empty()) { + // Wait for work to be assigned to me + w.ready = false; + waiters_.push_back(&w); + while (!w.ready) { + w.cv.wait(l); + } + fn = w.work; + w.work = nullptr; + } else { + // Pick up pending work + fn = pending_.front(); + pending_.pop_front(); + if (pending_.empty()) { + empty_.notify_all(); + } + } + if (fn) { + mu_.unlock(); + fn(); + mu_.lock(); + } + } + } + + private: + struct Waiter { + std::condition_variable cv; + std::function<void()> work; + bool ready; + }; + + std::mutex mu_; + FixedSizeVector<std::thread*> threads_; // All threads + FixedSizeVector<Waiter*> waiters_; // Stack of waiting threads. + std::deque<std::function<void()>> pending_; // Queue of pending work + std::condition_variable empty_; // Signaled on pending_.empty() + bool exiting_ = false; +}; + + +// Notification is an object that allows a user to to wait for another +// thread to signal a notification that an event has occurred. +// +// Multiple threads can wait on the same Notification object. +// but only one caller must call Notify() on the object. +class Notification { + public: + Notification() : notified_(false) {} + ~Notification() {} + + void Notify() { + std::unique_lock<std::mutex> l(mu_); + eigen_assert(!notified_); + notified_ = true; + cv_.notify_all(); + } + + void WaitForNotification() { + std::unique_lock<std::mutex> l(mu_); + while (!notified_) { + cv_.wait(l); + } + } + + private: + std::mutex mu_; + std::condition_variable cv_; + bool notified_; +}; + +#else + +// Notification is an object that allows a user to to wait for another +// thread to signal a notification that an event has occurred. +// +// Multiple threads can wait on the same Notification object. +// but only one caller must call Notify() on the object. +class Notification { + public: + Notification() : notified_(false) {} + ~Notification() {} + + void Notify() { + tensorflow::mutex_lock l(mu_); + eigen_assert(!notified_); + notified_ = true; + cv_.notify_all(); + } + + void WaitForNotification() { + tensorflow::mutex_lock l(mu_); + while (!notified_) { + cv_.wait(l); + } + } + + private: + tensorflow::mutex mu_; + tensorflow::condition_variable cv_; + bool notified_; +}; +#endif + +// Runs an arbitrary function and then calls Notify() on the passed in +// Notification. +template <typename Function, typename... Args> struct FunctionWrapper +{ + static void run(Notification* n, Function f, Args... 
args) { + f(args...); + n->Notify(); + } +}; + +static EIGEN_STRONG_INLINE void wait_until_ready(Notification* n) { + if (n) { + n->WaitForNotification(); + } +} + + +struct MemcpyExecutor { + typedef MemcpyExecutor Self; + + MemcpyExecutor(void *dst, const void *src) : + m_dst(static_cast<char *>(dst)), m_src(static_cast<const char *>(src)) { } + + static EIGEN_STRONG_INLINE void run(const MemcpyExecutor* exec, size_t idx, size_t block_size) { + ::memcpy(&(exec->m_dst[idx]), &(exec->m_src[idx]), block_size); + } + + private: + char* m_dst; + const char* m_src; +}; + +struct MemsetExecutor { + typedef MemsetExecutor Self; + + MemsetExecutor(void *buffer, int val) : + m_buffer(static_cast<char *>(buffer)), m_val(val) { } + + static EIGEN_STRONG_INLINE void run(const MemsetExecutor* exec, size_t idx, size_t block_size) { + ::memset(&(exec->m_buffer[idx]), exec->m_val, block_size); + } + + private: + char* m_buffer; + const int m_val; +}; + + +struct ThreadPoolDevice { + // The ownership of the thread pool remains with the caller. + ThreadPoolDevice(ThreadPoolInterface* pool, size_t num_cores) + : pool_(pool), num_threads_(num_cores) {} + + EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return internal::aligned_malloc(num_bytes); + } + + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + internal::aligned_free(buffer); + } + + EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { +#ifdef __ANDROID__ + ::memcpy(dst, src, n); +#else + if (n <= 32768) { + ::memcpy(dst, src, n); + } else { + MemcpyExecutor memcpy_executor(dst, src); + execute(memcpy_executor, n); + } +#endif + } + + EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { + memcpy(dst, src, n); + } + + EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { + memcpy(dst, src, n); + } + + EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { +#ifdef __ANDROID__ + ::memset(buffer, c, n); +#else + if (n <= 32768) { + ::memset(buffer, c, n); + } else { + MemsetExecutor memset_executor(buffer, c); + execute(memset_executor, n); + } +#endif + } + + EIGEN_STRONG_INLINE size_t numThreads() const { + return num_threads_; + } + + EIGEN_STRONG_INLINE size_t memcpyThreshold() const { + return 2 * numThreads(); + } + + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { + return l1CacheSize(); + } + + EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { + // The l3 cache size is shared between all the cores. + return l3CacheSize() / num_threads_; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { + // Should return an enum that encodes the ISA supported by the CPU + return 1; + } + + template <class Function, class... Args> + EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const { + Notification* n = new Notification(); + std::function<void()> func = + std::bind(&FunctionWrapper<Function, Args...>::run, n, f, args...); + pool_->Schedule(func); + return n; + } + + template <class Function, class... Args> + EIGEN_STRONG_INLINE void enqueue_and_forget(Function&& f, Args&&... 
args) const { + std::function<void()> func = std::bind(f, args...); + pool_->Schedule(func); + } + + private: + template<typename Executor> + EIGEN_STRONG_INLINE void execute(const Executor& exec, size_t n) const { + // don't spawn a thread to process fewer than 1024 bytes (chosen by small amount of + // experimentation) + // TODO: make block_size a multiple of packet_size and align everything + const size_t block_size = numext::maxi(static_cast<size_t>(1024), n / numThreads()); + const size_t block_count = n / block_size; + eigen_assert(block_count <= numThreads()); + + FixedSizeVector<Notification*> results(block_count); + for (size_t block_idx = 0; block_idx < block_count; block_idx++) { + results.push_back(enqueue(&Executor::run, &exec, block_idx * block_size, block_size)); + } + + if (block_count * block_size < n) { + Executor::run(&exec, block_count * block_size, n - block_count * block_size); + } + + // wait for threads to finish + for (size_t block_idx = 0; block_idx < block_count; block_idx++) { + results[block_idx]->WaitForNotification(); + delete results[block_idx]; + } + } + + // todo: NUMA, ... + size_t num_threads_; + ThreadPoolInterface* pool_; +}; +#endif + + +// GPU offloading +#ifdef EIGEN_USE_GPU + +// An interface abstracting away device specific memory allocator. +class Allocator { + public: + virtual ~Allocator() {} + EIGEN_DEVICE_FUNC virtual void* allocate(size_t num_bytes) const = 0; + EIGEN_DEVICE_FUNC virtual void deallocate(void* buffer) const = 0; +}; + +#if !defined(__GCUDACC__) && !defined(__GCUDACC_HOST__) + +// This defines an interface that GPUDevice can take to use +// CUDA streams underneath. +class StreamInterface { + public: + virtual ~StreamInterface() {} + + virtual const cudaStream_t& stream() const = 0; + virtual const cudaDeviceProp& deviceProperties() const = 0; + + // Allocate memory on the actual device where the computation will run + virtual void* allocate(size_t num_bytes) const = 0; + virtual void deallocate(void* buffer) const = 0; +}; + +static cudaDeviceProp* m_deviceProperties; +static bool m_devicePropInitialized = false; +static tensorflow::mutex m_devicePropInitMutex(tensorflow::LINKER_INITIALIZED); + +static void initializeDeviceProp() { + if (!m_devicePropInitialized) { + tensorflow::mutex_lock l(m_devicePropInitMutex); + if (!m_devicePropInitialized) { + int num_devices; + cudaError_t status = cudaGetDeviceCount(&num_devices); + eigen_check(status == cudaSuccess); + m_deviceProperties = new cudaDeviceProp[num_devices]; + for (int i = 0; i < num_devices; ++i) { + status = cudaGetDeviceProperties(&m_deviceProperties[i], i); + eigen_check(status == cudaSuccess); + } + m_devicePropInitialized = true; + } + } +} + +static const cudaStream_t default_stream = cudaStreamDefault; + +class CudaStreamDevice : public StreamInterface { + public: + // Use the default stream on the current device + CudaStreamDevice() : stream_(&default_stream) { + cudaGetDevice(&device_); + initializeDeviceProp(); + } + // Use the default stream on the specified device + CudaStreamDevice(int device) : stream_(&default_stream), device_(device) { + initializeDeviceProp(); + } + // Use the specified stream. Note that it's the + // caller responsibility to ensure that the stream can run on + // the specified device. If no device is specified the code + // assumes that the stream is associated to the current gpu device. 
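+  //
+  // Illustrative usage sketch (assumes the CUDA runtime is initialized and
+  // that `my_stream` is a hypothetical stream created with cudaStreamCreate;
+  // GpuDevice is defined further below in this file):
+  //
+  //   cudaStream_t my_stream;
+  //   cudaStreamCreate(&my_stream);
+  //   Eigen::CudaStreamDevice stream_device(&my_stream, /*device=*/0);
+  //   Eigen::GpuDevice gpu_device(&stream_device);
+  //   void* buf = gpu_device.allocate(1024);  // allocated on device 0
+  //   gpu_device.deallocate(buf);
+  //   cudaStreamDestroy(my_stream);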
+ CudaStreamDevice(const cudaStream_t* stream, int device = -1) + : stream_(stream), device_(device) { + if (device < 0) { + cudaGetDevice(&device_); + } else { + int num_devices; + cudaError_t err = cudaGetDeviceCount(&num_devices); + eigen_check(err == cudaSuccess); + eigen_check(device < num_devices); + device_ = device; + } + initializeDeviceProp(); + } + + const cudaStream_t& stream() const { return *stream_; } + const cudaDeviceProp& deviceProperties() const { + return m_deviceProperties[device_]; + } + virtual void* allocate(size_t num_bytes) const { + cudaError_t err = cudaSetDevice(device_); + eigen_check(err == cudaSuccess); + void* result; + err = cudaMalloc(&result, num_bytes); + eigen_check(err == cudaSuccess); + eigen_check(result != NULL); + return result; + } + virtual void deallocate(void* buffer) const { + cudaError_t err = cudaSetDevice(device_); + eigen_check(err == cudaSuccess); + assert(buffer != NULL); + err = cudaFree(buffer); + assert(err == cudaSuccess); + } + + private: + const cudaStream_t* stream_; + int device_; +}; + +static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) { + cudaError_t status = cudaDeviceSetSharedMemConfig(config); + eigen_check(status == cudaSuccess); +} + +struct GpuDevice { + // Neither the cudastream nor the allocator is not owned: the caller is + // responsible for their initialization and eventual destruction. + explicit GpuDevice(const StreamInterface* stream) : stream_(stream) { + eigen_assert(stream); + } + + // TODO(bsteiner): This is an internal API, we should not expose it. + EIGEN_STRONG_INLINE const cudaStream_t& stream() const { + return stream_->stream(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { +#ifndef __CUDA_ARCH__ + return stream_->allocate(num_bytes); +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); + return NULL; +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { +#ifndef __CUDA_ARCH__ + stream_->deallocate(buffer); +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { +#ifndef __CUDA_ARCH__ + cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, + stream_->stream()); + assert(err == cudaSuccess); +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { +#ifndef __CUDA_ARCH__ + cudaError_t err = + cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream()); + assert(err == cudaSuccess); +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { +#ifndef __CUDA_ARCH__ + cudaError_t err = + cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream()); + assert(err == cudaSuccess); +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { +#ifndef __CUDA_ARCH__ + cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream()); + assert(err == cudaSuccess); +#else + eigen_assert(false && "The 
default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { + // FIXME + return 32; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t memcpyThreshold() const { + return 4 * 1024 * 1024; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { + // FIXME + return 48*1024; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { + // We won't try to take advantage of the l2 cache for the time being, and + // there is no l3 cache on cuda devices. + return firstLevelCacheSize(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { +#ifndef __CUDA_ARCH__ + cudaError_t err = cudaStreamSynchronize(stream_->stream()); + assert(err == cudaSuccess); +#else + assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + inline int getNumCudaMultiProcessors() const { + return stream_->deviceProperties().multiProcessorCount; + } + inline int maxCudaThreadsPerBlock() const { + return stream_->deviceProperties().maxThreadsPerBlock; + } + inline int maxCudaThreadsPerMultiProcessor() const { + return stream_->deviceProperties().maxThreadsPerMultiProcessor; + } + inline int sharedMemPerBlock() const { + return stream_->deviceProperties().sharedMemPerBlock; + } + inline int majorDeviceVersion() const { + return stream_->deviceProperties().major; + } + + // This function checks if the CUDA runtime recorded an error for the + // underlying stream device. + inline bool ok() const { + cudaError_t error = cudaStreamQuery(stream_->stream()); + return (error == cudaSuccess) || (error == cudaErrorNotReady); + } + + private: + const StreamInterface* stream_; +}; + +inline void assertCudaOk() { + cudaError_t err = cudaGetLastError(); + + assert(err != cudaErrorMissingConfiguration); + assert(err != cudaErrorMemoryAllocation); + assert(err != cudaErrorInitializationError); + assert(err != cudaErrorLaunchFailure); + assert(err != cudaErrorPriorLaunchFailure); + assert(err != cudaErrorLaunchTimeout); + assert(err != cudaErrorLaunchOutOfResources); + assert(err != cudaErrorInvalidDeviceFunction); + assert(err != cudaErrorInvalidConfiguration); + assert(err != cudaErrorInvalidDevice); + assert(err != cudaErrorInvalidValue); + assert(err != cudaErrorInvalidPitchValue); + assert(err != cudaErrorInvalidSymbol); + assert(err != cudaErrorMapBufferObjectFailed); + assert(err != cudaErrorUnmapBufferObjectFailed); + assert(err != cudaErrorInvalidHostPointer); + assert(err != cudaErrorInvalidDevicePointer); + assert(err != cudaErrorInvalidTexture); + assert(err != cudaErrorInvalidTextureBinding); + assert(err != cudaErrorInvalidChannelDescriptor); + assert(err != cudaErrorInvalidMemcpyDirection); + assert(err != cudaErrorAddressOfConstant); + assert(err != cudaErrorTextureFetchFailed); + assert(err != cudaErrorTextureNotBound); + assert(err != cudaErrorSynchronizationError); + assert(err != cudaErrorInvalidFilterSetting); + assert(err != cudaErrorInvalidNormSetting); + assert(err != cudaErrorMixedDeviceExecution); + assert(err != cudaErrorCudartUnloading); + assert(err != cudaErrorUnknown); + assert(err != cudaErrorNotYetImplemented); + assert(err != cudaErrorMemoryValueTooLarge); + assert(err != cudaErrorInvalidResourceHandle); + assert(err != cudaErrorNotReady); + assert(err != cudaErrorInsufficientDriver); + assert(err != cudaErrorSetOnActiveProcess); + assert(err != cudaErrorInvalidSurface); + assert(err != cudaErrorNoDevice); + 
assert(err != cudaErrorECCUncorrectable); + assert(err != cudaErrorSharedObjectSymbolNotFound); + assert(err != cudaErrorSharedObjectInitFailed); + assert(err != cudaErrorUnsupportedLimit); + assert(err != cudaErrorDuplicateVariableName); + assert(err != cudaErrorDuplicateTextureName); + assert(err != cudaErrorDuplicateSurfaceName); + assert(err != cudaErrorDevicesUnavailable); + assert(err != cudaErrorInvalidKernelImage); + assert(err != cudaErrorNoKernelImageForDevice); + assert(err != cudaErrorIncompatibleDriverContext); + assert(err != cudaErrorPeerAccessAlreadyEnabled); + assert(err != cudaErrorPeerAccessNotEnabled); + assert(err != cudaErrorDeviceAlreadyInUse); + assert(err != cudaErrorProfilerDisabled); + assert(err != cudaErrorProfilerNotInitialized); + assert(err != cudaErrorProfilerAlreadyStarted); + assert(err != cudaErrorProfilerAlreadyStopped); + assert(err != cudaErrorAssert); + assert(err != cudaErrorTooManyPeers); + assert(err != cudaErrorHostMemoryAlreadyRegistered); + assert(err != cudaErrorHostMemoryNotRegistered); + assert(err != cudaErrorOperatingSystem); + assert(err != cudaErrorStartupFailure); + assert(err != cudaErrorApiFailureBase); + + // catch errors types introduced after this function was written + assert(err == cudaSuccess); +} + +#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, \ + ...) \ + do { \ + (kernel)<<<(gridsize), (blocksize), (sharedmem), (device).stream()>>>( \ + __VA_ARGS__); \ + assertCudaOk(); \ + } while (false) + +#else // __GCUDACC__ + +// The following is the version of GpuDevice for StreamExecutor +// (go/gpuexecutor) a GPU runtime that supports both CUDA and OpenCL. +// StreamExecutor is being developed as an open-source replacement for the CUDA +// runtime and is the runtime used when compiling with gcudacc. Differences +// between the CUDA runtime and StreamExecutor are abstracted away behind +// GpuDevice. + +// TODO(jpienaar): Temporary workaround until b/18409724 is addressed. 
+enum cudaSharedMemConfig +{ + cudaSharedMemBankSizeDefault = 0, + cudaSharedMemBankSizeFourByte = 1, + cudaSharedMemBankSizeEightByte = 2 +}; + +static inline void setCudaSharedMemConfig(cudaSharedMemConfig cache_config) { + // TODO(jpienaar): fix when implemented (b/18409724) +} + +struct GpuDevice { + GpuDevice() + : stream_(perftools::gputools::MachineManager::singleton()->stream_for_device(0)), + allocator_(nullptr), + stream_exec_(stream_->parent()) {} + + GpuDevice(perftools::gputools::Stream* stream, + const Allocator* alloc = nullptr) + : stream_(stream), allocator_(alloc), stream_exec_(stream_->parent()) { } + + EIGEN_STRONG_INLINE perftools::gputools::Stream* stream() const { + return stream_; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + if (allocator_ != nullptr) return allocator_->allocate(num_bytes); +#ifndef __CUDA_ARCH__ + perftools::gputools::DeviceMemory<char> mem = + stream_exec_->AllocateArray<char>(num_bytes); + return mem.opaque(); +#else + assert(false && + "The default device should be used instead to generate kernel code"); + return nullptr; +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + if (allocator_ != nullptr) { + allocator_->deallocate(buffer); + return; + } +#ifndef __CUDA_ARCH__ + perftools::gputools::DeviceMemoryBase gpu_mem(buffer); + stream_exec_->Deallocate(&gpu_mem); +#else + assert(false && + "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, + size_t n) const { +#ifndef __CUDA_ARCH__ + perftools::gputools::DeviceMemoryBase gpu_to(dst); + if (!stream_->ThenMemcpy(&gpu_to, perftools::gputools::DeviceMemoryBase( + const_cast<void*>(src)), + n).ok()) { + assert(false && + "failed during enqueue of 'copy perftools::gputools to " + "perftools::gputools'"); + } +#else + assert(false && + "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { +#ifndef __CUDA_ARCH__ + perftools::gputools::DeviceMemoryBase gpu_to(dst); + if (!stream_->ThenMemcpy(&gpu_to, src, n).ok()) { + assert(false && "failed while enqueuing memcpy from host to device"); + } +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { +#ifndef __CUDA_ARCH__ + if (!stream_->ThenMemcpy(dst, perftools::gputools::DeviceMemoryBase( + const_cast<void*>(src)), + n).ok()) { + assert(false && "failed while enqueuing memcpy from device to host"); + } +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { +#ifndef __CUDA_ARCH__ + perftools::gputools::DeviceMemoryBase gpu_buffer{buffer}; + if (!stream_exec_->Memset32(stream_, &gpu_buffer, c, n)) { + assert(false && "GPU memset failed."); + } +#else + assert(false && + "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { + // FIXME + return 32; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t memcpyThreshold() const { + return 4 * 1024 * 1024; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { 
+ // FIXME + return 48*1024; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { + // We won't try to take advantage of the l2 cache for the time being, and + // there is no l3 cache on cuda devices. + return firstLevelCacheSize(); + } + + EIGEN_STRONG_INLINE void synchronize() const { + stream_->BlockHostUntilDone(); + } + + // A gpu::DeviceDescription is cached inside a StreamExecutor, so these calls + // aren't expensive/wasteful. + EIGEN_DEVICE_FUNC inline int getNumCudaMultiProcessors() const { + return stream_exec_->GetDeviceDescription().core_count(); + } + + EIGEN_DEVICE_FUNC inline int maxCudaThreadsPerBlock() const { + return stream_exec_->GetDeviceDescription().threads_per_block_limit(); + } + + EIGEN_DEVICE_FUNC inline int maxCudaThreadsPerMultiProcessor() const { + return stream_exec_->GetDeviceDescription().threads_per_core_limit(); + } + + EIGEN_DEVICE_FUNC inline int sharedMemPerBlock() const { + return stream_exec_->GetDeviceDescription().shared_memory_per_block(); + } + + EIGEN_DEVICE_FUNC inline int majorDeviceVersion() const { + int major, minor; + if (stream_exec_->GetDeviceDescription().cuda_compute_capability(&major, + &minor)) { + return major; + } else { + return 0; + } + } + + inline bool ok() const { return stream_->ok(); } + + private: + perftools::gputools::Stream* stream_; + perftools::gputools::StreamExecutor* stream_exec_; + const Allocator* allocator_; +}; + +#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)\ + (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ + CHECK((device).stream()->ok()); +#endif // __GCUDACC__ + +#endif // EIGEN_USE_GPU +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h new file mode 100644 index 0000000000..19e922f92f --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h @@ -0,0 +1,235 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H +#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H + +namespace Eigen { + +/** \internal + * + * \class TensorDimensionList + * \ingroup CXX11_Tensor_Module + * + * \brief Special case of tensor index list used to list all the dimensions of a tensor of rank n. 
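+ *
+ * Illustrative sketch: DimensionList simply maps every index to itself, so for
+ * a rank-3 tensor
+ * \code
+ * DimensionList<DenseIndex, 3> dims;
+ * // dims[0] == 0, dims[1] == 1, dims[2] == 2, i.e. every dimension is listed.
+ * \endcode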
+ * + * \sa Tensor + */ + +template <typename Index, std::size_t Rank> struct DimensionList { + const Index operator[] (const Index i) const { return i; } +}; + +namespace internal { + +template<typename Index, std::size_t Rank> struct array_size<DimensionList<Index, Rank> > { + static const size_t value = Rank; +}; +template<typename Index, std::size_t Rank> struct array_size<const DimensionList<Index, Rank> > { + static const size_t value = Rank; +}; + +template<DenseIndex n, typename Index, std::size_t Rank> const Index array_get(DimensionList<Index, Rank>& a) { + return n; +} +template<DenseIndex n, typename Index, std::size_t Rank> const Index array_get(const DimensionList<Index, Rank>& a) { + return n; +} + + +#if defined(EIGEN_HAS_CONSTEXPR) +template <typename Index, std::size_t Rank> +struct index_known_statically<DimensionList<Index, Rank> > { + constexpr bool operator() (const DenseIndex) const { + return true; + } +}; +template <typename Index, std::size_t Rank> +struct index_known_statically<const DimensionList<Index, Rank> > { + constexpr bool operator() (const DenseIndex) const { + return true; + } +}; + +template <typename Index, std::size_t Rank> +struct all_indices_known_statically<DimensionList<Index, Rank> > { + constexpr bool operator() () const { + return true; + } +}; +template <typename Index, std::size_t Rank> +struct all_indices_known_statically<const DimensionList<Index, Rank> > { + constexpr bool operator() () const { + return true; + } +}; + +template <typename Index, std::size_t Rank> +struct indices_statically_known_to_increase<DimensionList<Index, Rank> > { + constexpr bool operator() () const { + return true; + } +}; +template <typename Index, std::size_t Rank> +struct indices_statically_known_to_increase<const DimensionList<Index, Rank> > { + constexpr bool operator() () const { + return true; + } +}; + +template <typename Index, std::size_t Rank> +struct index_statically_eq<DimensionList<Index, Rank> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return i == value; + } +}; +template <typename Index, std::size_t Rank> +struct index_statically_eq<const DimensionList<Index, Rank> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return i == value; + } +}; + +template <typename Index, std::size_t Rank> +struct index_statically_ne<DimensionList<Index, Rank> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return i != value; + } +}; +template <typename Index, std::size_t Rank> +struct index_statically_ne<const DimensionList<Index, Rank> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return i != value; + } +}; + +template <typename Index, std::size_t Rank> +struct index_statically_gt<DimensionList<Index, Rank> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return i > value; + } +}; +template <typename Index, std::size_t Rank> +struct index_statically_gt<const DimensionList<Index, Rank> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return i > value; + } +}; + +template <typename Index, std::size_t Rank> +struct index_statically_lt<DimensionList<Index, Rank> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return i < value; + } +}; +template <typename Index, std::size_t Rank> +struct index_statically_lt<const DimensionList<Index, Rank> > { + constexpr bool operator() (const DenseIndex i, const 
DenseIndex value) const { + return i < value; + } +}; + +#else +template <typename Index, std::size_t Rank> +struct index_known_statically<DimensionList<Index, Rank> > { + EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex) const { + return true; + } +}; +template <typename Index, std::size_t Rank> +struct index_known_statically<const DimensionList<Index, Rank> > { + EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex) const { + return true; + } +}; + +template <typename Index, std::size_t Rank> +struct all_indices_known_statically<DimensionList<Index, Rank> > { + EIGEN_ALWAYS_INLINE bool operator() () const { + return true; + } +}; +template <typename Index, std::size_t Rank> +struct all_indices_known_statically<const DimensionList<Index, Rank> > { + EIGEN_ALWAYS_INLINE bool operator() () const { + return true; + } +}; + +template <typename Index, std::size_t Rank> +struct indices_statically_known_to_increase<DimensionList<Index, Rank> > { + EIGEN_ALWAYS_INLINE bool operator() () const { + return true; + } +}; +template <typename Index, std::size_t Rank> +struct indices_statically_known_to_increase<const DimensionList<Index, Rank> > { + EIGEN_ALWAYS_INLINE bool operator() () const { + return true; + } +}; + +template <typename Index, std::size_t Rank> +struct index_statically_eq<DimensionList<Index, Rank> > { + EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex i, const DenseIndex value) const { + return false; + } +}; +template <typename Index, std::size_t Rank> +struct index_statically_eq<const DimensionList<Index, Rank> > { + EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex i, const DenseIndex value) const { + return false; + } +}; + +template <typename Index, std::size_t Rank> +struct index_statically_ne<DimensionList<Index, Rank> > { + EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex i, const DenseIndex value) const { + return false; + } +}; +template <typename Index, std::size_t Rank> +struct index_statically_ne<const DimensionList<Index, Rank> > { + EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex i, const DenseIndex value) const { + return false; + } +}; + +template <typename Index, std::size_t Rank> +struct index_statically_gt<DimensionList<Index, Rank> > { + EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex i, const DenseIndex value) const { + return false; + } +}; +template <typename Index, std::size_t Rank> +struct index_statically_gt<const DimensionList<Index, Rank> > { + EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex i, const DenseIndex value) const { + return false; + } +}; + +template <typename Index, std::size_t Rank> +struct index_statically_lt<DimensionList<Index, Rank> > { + EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex i, const DenseIndex value) const { + return false; + } +}; +template <typename Index, std::size_t Rank> +struct index_statically_lt<const DimensionList<Index, Rank> > { + EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex i, const DenseIndex value) const { + return false; + } +}; +#endif + +} // end namespace internal +} // end namespace Eigen + + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h new file mode 100644 index 0000000000..8bf5272ec8 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -0,0 +1,597 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H +#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H + + +namespace Eigen { + +/** \internal + * + * \class TensorDimensions + * \ingroup CXX11_Tensor_Module + * + * \brief Set of classes used to encode and store the dimensions of a Tensor. + * + * The Sizes class encodes as part of the type the number of dimensions and the + * sizes corresponding to each dimension. It uses no storage space since it is + * entirely known at compile time. + * The DSizes class is its dynamic sibling: the number of dimensions is known + * at compile time but the sizes are set during execution. + * + * \sa Tensor + */ + +// Can't use std::pairs on cuda devices +template <typename Index> struct IndexPair { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) { } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Index f, Index s) : first(f), second(s) { } + Index first; + Index second; +}; + +// Boilerplate code +namespace internal { + +template<std::size_t n, typename Dimension> struct dget { + static const std::size_t value = get<n, typename Dimension::Base>::value; +}; + + +template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor> +struct fixed_size_tensor_index_linearization_helper +{ + template <typename Dimensions> EIGEN_DEVICE_FUNC + static inline Index run(array<Index, NumIndices> const& indices, + const Dimensions& dimensions) + { + return array_get<RowMajor ? n - 1 : (NumIndices - n)>(indices) + + dget<RowMajor ? n - 1 : (NumIndices - n), Dimensions>::value * + fixed_size_tensor_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions); + } +}; + +template<typename Index, std::size_t NumIndices, bool RowMajor> +struct fixed_size_tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor> +{ + template <typename Dimensions> EIGEN_DEVICE_FUNC + static inline Index run(array<Index, NumIndices> const& indices, + const Dimensions&) + { + return 0; + } +}; + +template<typename Index, std::size_t n> +struct fixed_size_tensor_index_extraction_helper +{ + template <typename Dimensions> EIGEN_DEVICE_FUNC + static inline Index run(const Index index, + const Dimensions& dimensions) + { + const Index mult = (index == n) ? 1 : 0; + return array_get<n>(dimensions) * mult + + fixed_size_tensor_index_extraction_helper<Index, n - 1>::run(index, dimensions); + } +}; + +template<typename Index> +struct fixed_size_tensor_index_extraction_helper<Index, 0> +{ + template <typename Dimensions> EIGEN_DEVICE_FUNC + static inline Index run(const Index index, + const Dimensions& dimensions) + { + const Index mult = (index == 0) ? 1 : 0; + return array_get<0>(dimensions) * mult; + } +}; + +} // end namespace internal + + +// Fixed size +#ifndef EIGEN_EMULATE_CXX11_META_H +template <typename std::size_t... 
Indices> +struct Sizes : internal::numeric_list<std::size_t, Indices...> { + typedef internal::numeric_list<std::size_t, Indices...> Base; + static const std::size_t total_size = internal::arg_prod(Indices...); + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const { + return Base::count; + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t TotalSize() { + return internal::arg_prod(Indices...); + } + + Sizes() { } + template <typename DenseIndex> + explicit Sizes(const array<DenseIndex, Base::count>& /*indices*/) { + // todo: add assertion + } +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template <typename... DenseIndex> Sizes(DenseIndex...) { } + explicit Sizes(std::initializer_list<std::size_t> /*l*/) { + // todo: add assertion + } +#endif + + template <typename T> Sizes& operator = (const T& /*other*/) { + // add assertion failure if the size of other is different + return *this; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const int index) const { + return internal::fixed_size_tensor_index_extraction_helper<std::ptrdiff_t, Base::count - 1>::run(index, *this); + } + + template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + size_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const { + return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, *static_cast<const Base*>(this)); + } + template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + size_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const { + return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, *static_cast<const Base*>(this)); + } +}; + +namespace internal { +template <typename std::size_t... Indices> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes<Indices...>&) { + return Sizes<Indices...>::total_size; +} +} + +#else + +template <std::size_t n> +struct non_zero_size { + typedef internal::type2val<std::size_t, n> type; +}; +template <> +struct non_zero_size<0> { + typedef internal::null_type type; +}; + +template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0, std::size_t V5=0> struct Sizes { + typedef typename internal::make_type_list<typename non_zero_size<V1>::type, typename non_zero_size<V2>::type, typename non_zero_size<V3>::type, typename non_zero_size<V4>::type, typename non_zero_size<V5>::type >::type Base; + static const size_t count = Base::count; + static const std::size_t total_size = internal::arg_prod<Base>::value; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const { + return count; + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() { + return internal::arg_prod<Base>::value; + } + + Sizes() { } + template <typename DenseIndex> + explicit Sizes(const array<DenseIndex, Base::count>& indices) { + // todo: add assertion + } +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template <typename... DenseIndex> Sizes(DenseIndex... 
indices) { } + explicit Sizes(std::initializer_list<std::size_t> l) { + // todo: add assertion + } +#else + EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0) { + } + EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1) { + } + EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) { + } + EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) { + } + EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) { + } +#endif + + template <typename T> Sizes& operator = (const T& other) { + // to do: check the size of other + return *this; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t operator[] (const int index) const { + switch (index) { + case 0: + return internal::get<0, Base>::value; + case 1: + return internal::get<1, Base>::value; + case 2: + return internal::get<2, Base>::value; + case 3: + return internal::get<3, Base>::value; + case 4: + return internal::get<4, Base>::value; + default: + eigen_assert(false && "index overflow"); + return static_cast<std::size_t>(-1); + } + } + + template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + size_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const { + return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, *this); + } + template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + size_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const { + return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, *this); + } +}; + +namespace internal { +template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes<V1, V2, V3, V4, V5>&) { + return Sizes<V1, V2, V3, V4, V5>::total_size; +} +} + +#endif + +// Boilerplate +namespace internal { +template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor> +struct tensor_index_linearization_helper +{ + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index run(array<Index, NumIndices> const& indices, array<Index, NumIndices> const& dimensions) + { + return array_get<RowMajor ? n : (NumIndices - n - 1)>(indices) + + array_get<RowMajor ? n : (NumIndices - n - 1)>(dimensions) * + tensor_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions); + } +}; + +template<typename Index, std::size_t NumIndices, bool RowMajor> +struct tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor> +{ + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index run(array<Index, NumIndices> const& indices, array<Index, NumIndices> const&) + { + return array_get<RowMajor ? 
0 : NumIndices - 1>(indices); + } +}; +} // end namespace internal + + + +// Dynamic size +template <typename DenseIndex, std::size_t NumDims> +struct DSizes : array<DenseIndex, NumDims> { + typedef array<DenseIndex, NumDims> Base; + static const std::size_t count = NumDims; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const { + return NumDims; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() const { + return internal::array_prod(*static_cast<const Base*>(this)); + } + + EIGEN_DEVICE_FUNC DSizes() { + for (int i = 0 ; i < NumDims; ++i) { + (*this)[i] = 0; + } + } + EIGEN_DEVICE_FUNC DSizes(const array<DenseIndex, NumDims>& a) : Base(a) { } + + EIGEN_DEVICE_FUNC DSizes(const DimensionList<DenseIndex, NumDims>& a) { + for (int i = 0 ; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + } + +#ifndef EIGEN_EMULATE_CXX11_META_H + template <typename std::size_t... Indices> + EIGEN_DEVICE_FUNC DSizes(const Sizes<Indices...>& a) { + for (int i = 0 ; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + } +#else + template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> + EIGEN_DEVICE_FUNC DSizes(const Sizes<V1, V2, V3, V4, V5>& a) { + for (int i = 0 ; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + } +#endif + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, IndexTypes... otherDimensions) { + EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE) + (*this) = array<DenseIndex, NumDims>{{firstDimension, otherDimensions...}}; + } +#else + EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0) { + eigen_assert(NumDims == 1); + (*this)[0] = i0; + } + EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1) { + eigen_assert(NumDims == 2); + (*this)[0] = i0; + (*this)[1] = i1; + } + EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) { + eigen_assert(NumDims == 3); + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + } + EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) { + eigen_assert(NumDims == 4); + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + (*this)[3] = i3; + } + EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) { + eigen_assert(NumDims == 5); + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + (*this)[3] = i3; + (*this)[4] = i4; + } +#endif + + EIGEN_DEVICE_FUNC DSizes& operator = (const array<DenseIndex, NumDims>& other) { + *static_cast<Base*>(this) = other; + return *this; + } + + // A constexpr would be so much better here + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfColMajor(const array<DenseIndex, NumDims>& indices) const { + return internal::tensor_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, false>::run(indices, *static_cast<const Base*>(this)); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfRowMajor(const array<DenseIndex, NumDims>& indices) const { + return internal::tensor_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, true>::run(indices, *static_cast<const Base*>(this)); + } +}; + + + + +// Boilerplate +namespace internal { +template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor> +struct tensor_vsize_index_linearization_helper +{ + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + 
Index run(array<Index, NumIndices> const& indices, std::vector<DenseIndex> const& dimensions) + { + return array_get<RowMajor ? n : (NumIndices - n - 1)>(indices) + + array_get<RowMajor ? n : (NumIndices - n - 1)>(dimensions) * + tensor_vsize_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions); + } +}; + +template<typename Index, std::size_t NumIndices, bool RowMajor> +struct tensor_vsize_index_linearization_helper<Index, NumIndices, 0, RowMajor> +{ + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index run(array<Index, NumIndices> const& indices, std::vector<DenseIndex> const&) + { + return array_get<RowMajor ? 0 : NumIndices - 1>(indices); + } +}; +} // end namespace internal + + +template <typename DenseIndex> +struct VSizes : std::vector<DenseIndex> { + typedef std::vector<DenseIndex> Base; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const { + return Base::size(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() const { + return internal::array_prod(*static_cast<const Base*>(this)); + } + + EIGEN_DEVICE_FUNC VSizes() { } + EIGEN_DEVICE_FUNC explicit VSizes(const std::vector<DenseIndex>& a) : Base(a) { } + + template <std::size_t NumDims> + EIGEN_DEVICE_FUNC explicit VSizes(const array<DenseIndex, NumDims>& a) { + this->resize(NumDims); + for (int i = 0; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + } + template <std::size_t NumDims> + EIGEN_DEVICE_FUNC explicit VSizes(const DSizes<DenseIndex, NumDims>& a) { + this->resize(NumDims); + for (int i = 0; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + } + + EIGEN_DEVICE_FUNC explicit VSizes(const DenseIndex i0) { + this->resize(1); + (*this)[0] = i0; + } + EIGEN_DEVICE_FUNC explicit VSizes(const DenseIndex i0, const DenseIndex i1) { + this->resize(2); + (*this)[0] = i0; + (*this)[1] = i1; + } + EIGEN_DEVICE_FUNC explicit VSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) { + this->resize(3); + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + } + EIGEN_DEVICE_FUNC explicit VSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) { + this->resize(4); + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + (*this)[3] = i3; + } + EIGEN_DEVICE_FUNC explicit VSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) { + this->resize(5); + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + (*this)[3] = i3; + (*this)[4] = i4; + } + + EIGEN_DEVICE_FUNC VSizes& operator = (const std::vector<DenseIndex>& other) { + *static_cast<Base*>(this) = other; + return *this; + } + template <std::size_t NumDims> + EIGEN_DEVICE_FUNC VSizes& operator = (const array<DenseIndex, NumDims>& a) { + this->resize(NumDims); + for (int i = 0; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + return *this; + } + template <std::size_t NumDims> + EIGEN_DEVICE_FUNC VSizes& operator = (const DSizes<DenseIndex, NumDims>& a) { + this->resize(NumDims); + for (int i = 0; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + return *this; + } + + // A constexpr would be so much better here + template <std::size_t NumDims> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfColMajor(const array<DenseIndex, NumDims>& indices) const { + return internal::tensor_vsize_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, false>::run(indices, *static_cast<const Base*>(this)); + } + template <std::size_t NumDims> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfRowMajor(const array<DenseIndex, 
NumDims>& indices) const { + return internal::tensor_vsize_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, true>::run(indices, *static_cast<const Base*>(this)); + } +}; + + +// Boilerplate +namespace internal { +template <typename DenseIndex> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex array_prod(const VSizes<DenseIndex>& sizes) { + DenseIndex total_size = 1; + for (int i = 0; i < sizes.size(); ++i) { + total_size *= sizes[i]; + } + return total_size; +}; +} + +namespace internal { + +template <typename DenseIndex, std::size_t NumDims> struct array_size<const DSizes<DenseIndex, NumDims> > { + static const size_t value = NumDims; +}; +template <typename DenseIndex, std::size_t NumDims> struct array_size<DSizes<DenseIndex, NumDims> > { + static const size_t value = NumDims; +}; +template <typename DenseIndex> +struct array_size<VSizes<DenseIndex> > { + static const ptrdiff_t value = -1; +}; +#ifndef EIGEN_EMULATE_CXX11_META_H +template <typename std::size_t... Indices> struct array_size<const Sizes<Indices...> > { +static const size_t value = Sizes<Indices...>::count; +}; +template <typename std::size_t... Indices> struct array_size<Sizes<Indices...> > { +static const size_t value = Sizes<Indices...>::count; +}; +template <std::size_t n, typename std::size_t... Indices> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<Indices...>&) { + return get<n, internal::numeric_list<std::size_t, Indices...> >::value; +} +#else +template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> struct array_size<const Sizes<V1,V2,V3,V4,V5> > { + static const size_t value = Sizes<V1,V2,V3,V4,V5>::count; +}; +template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> struct array_size<Sizes<V1,V2,V3,V4,V5> > { + static const size_t value = Sizes<V1,V2,V3,V4,V5>::count; +}; +template <std::size_t n, std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<V1,V2,V3,V4,V5>& a) { + return get<n, typename Sizes<V1,V2,V3,V4,V5>::Base>::value; +} + +#endif + + +template <typename Dims1, typename Dims2, size_t n, size_t m> +struct sizes_match_below_dim { + static inline bool run(Dims1& dims1, Dims2& dims2) { + return false; + } +}; +template <typename Dims1, typename Dims2, size_t n> +struct sizes_match_below_dim<Dims1, Dims2, n, n> { + static inline bool run(Dims1& dims1, Dims2& dims2) { + return (array_get<n-1>(dims1) == array_get<n-1>(dims2)) & + sizes_match_below_dim<Dims1, Dims2, n-1, n-1>::run(dims1, dims2); + } +}; +template <typename Dims1, typename Dims2> +struct sizes_match_below_dim<Dims1, Dims2, 0, 0> { + static inline bool run(Dims1& dims1, Dims2& dims2) { + return true; + } +}; + +} // end namespace internal + + +template <typename Dims1, typename Dims2> +bool dimensions_match(Dims1& dims1, Dims2& dims2) { + return internal::sizes_match_below_dim<Dims1, Dims2, internal::array_size<Dims1>::value, internal::array_size<Dims2>::value>::run(dims1, dims2); +} + +template <typename IndexType, typename Dims2> +bool dimensions_match(const VSizes<IndexType>& dims1, Dims2& dims2) { + if (dims1.size() != internal::array_size<Dims2>::value) { + return false; + } + for (int i = 0; i < internal::array_size<Dims2>::value; ++i) { + if (dims1[i] != dims2[i]) { + return false; + } + } + return true; +} + +template <typename Dims1, typename IndexType> +bool dimensions_match(Dims1& dims1, const VSizes<IndexType>& dims2) { + if 
(internal::array_size<Dims1>::value != dims2.size()) { + return false; + } + for (int i = 0; i < internal::array_size<Dims1>::value; ++i) { + if (dims1[i] != dims2[i]) { + return false; + } + } + return true; +} + +template <typename IndexType> +bool dimensions_match(const VSizes<IndexType>& dims1, const VSizes<IndexType>& dims2) { + return dims1 == dims2; +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h new file mode 100644 index 0000000000..4ad431abae --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -0,0 +1,151 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H +#define EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H + +namespace Eigen { + +/** \class TensorForcedEval + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. + * + * + */ +namespace internal { +template<typename XprType> +struct traits<TensorEvalToOp<XprType> > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + + enum { + Flags = 0, + }; +}; + +template<typename XprType> +struct eval<TensorEvalToOp<XprType>, Eigen::Dense> +{ + typedef const TensorEvalToOp<XprType>& type; +}; + +template<typename XprType> +struct nested<TensorEvalToOp<XprType>, 1, typename eval<TensorEvalToOp<XprType> >::type> +{ + typedef TensorEvalToOp<XprType> type; +}; + +} // end namespace internal + + + + +template<typename XprType> +class TensorEvalToOp : public TensorBase<TensorEvalToOp<XprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorEvalToOp>::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename Eigen::internal::nested<TensorEvalToOp>::type Nested; + typedef typename Eigen::internal::traits<TensorEvalToOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorEvalToOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(CoeffReturnType* buffer, const XprType& expr) + : m_xpr(expr), m_buffer(buffer) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC CoeffReturnType* buffer() const { return m_buffer; } + + protected: + typename XprType::Nested m_xpr; + CoeffReturnType* m_buffer; +}; + + + +template<typename ArgType, typename Device> +struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device> +{ + typedef TensorEvalToOp<ArgType> XprType; + typedef typename ArgType::Scalar Scalar; + typedef typename 
TensorEvaluator<ArgType, Device>::Dimensions Dimensions; + + enum { + IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_device(device), m_buffer(op.buffer()) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() { + } + + typedef typename XprType::Index Index; + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* scalar) { + assert(scalar == NULL); + return m_impl.evalSubExprsIfNeeded(m_buffer); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { + m_buffer[i] = m_impl.coeff(i); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { + internal::pstoret<CoeffReturnType, PacketReturnType, Aligned>(m_buffer + i, m_impl.template packet<TensorEvaluator<ArgType, Device>::IsAligned ? Aligned : Unaligned>(i)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_buffer[index]; + } + + template<int LoadMode> + EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_buffer; } + + private: + TensorEvaluator<ArgType, Device> m_impl; + const Device& m_device; + CoeffReturnType* m_buffer; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h new file mode 100644 index 0000000000..f2ef2d85c1 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -0,0 +1,505 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H + +namespace Eigen { + +/** \class TensorEvaluator + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor evaluator classes. + * + * These classes are responsible for the evaluation of the tensor expression. + * + * TODO: add support for more types of expressions, in particular expressions + * leading to lvalues (slicing, reshaping, etc...) 
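+ *
+ * Illustrative sketch of the evaluation protocol defined below (Expr, expr and
+ * use() are placeholders for an arbitrary tensor expression and its consumer):
+ * \code
+ * Eigen::DefaultDevice device;
+ * TensorEvaluator<const Expr, Eigen::DefaultDevice> eval(expr, device);
+ * if (eval.evalSubExprsIfNeeded(NULL)) {
+ *   for (DenseIndex i = 0; i < eval.dimensions().TotalSize(); ++i) {
+ *     use(eval.coeff(i));  // coefficient-wise access
+ *   }
+ * }
+ * eval.cleanup();
+ * \endcode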
+ */ + +// Generic evaluator +template<typename Derived, typename Device> +struct TensorEvaluator +{ + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + + // NumDimensions is -1 for variable dim tensors + static const int NumCoords = internal::traits<Derived>::NumDimensions; + static const int SafeNumCoords = NumCoords >= 0 ? NumCoords : 0; + + enum { + IsAligned = Derived::IsAligned, + PacketAccess = Derived::PacketAccess, + BlockAccess = internal::is_arithmetic< + typename internal::remove_const<Scalar>::type>::value && + NumCoords >= 0, + Layout = Derived::Layout, + CoordAccess = NumCoords >= 0, + }; + + typedef typename internal::TensorBlock< + Index, typename internal::remove_const<Scalar>::type, SafeNumCoords, Layout> + TensorBlock; + typedef typename internal::TensorBlockReader< + Index, typename internal::remove_const<Scalar>::type, SafeNumCoords, Layout, + PacketAccess> TensorBlockReader; + typedef typename internal::TensorBlockWriter< + Index, typename internal::remove_const<Scalar>::type, SafeNumCoords, Layout, + PacketAccess> TensorBlockWriter; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorEvaluator(const Derived& m, const Device& device) + : m_data(const_cast<Scalar*>(m.data())), + m_dims(m.dimensions()), + m_device(device) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* dest) { + if (dest) { + m_device.memcpy((void*)dest, m_data, sizeof(Scalar) * m_dims.TotalSize()); + return false; + } + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(m_data); + return m_data[index]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + eigen_assert(m_data); + return m_data[index]; + } + + template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketReturnType packet(Index index) const + { + return internal::ploadt<PacketReturnType, LoadMode>(m_data + index); + } + + template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + return internal::pstoret<Scalar, PacketReturnType, StoreMode>(m_data + index, x); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, SafeNumCoords>& coords) const { + eigen_assert(m_data); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + return m_data[m_dims.IndexOfColMajor(coords)]; + } else { + return m_data[m_dims.IndexOfRowMajor(coords)]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, SafeNumCoords>& coords) { + eigen_assert(m_data); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + return m_data[m_dims.IndexOfColMajor(coords)]; + } else { + return m_data[m_dims.IndexOfRowMajor(coords)]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector<internal::TensorOpResourceRequirements>* resources) const { + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const { + assert(m_data != NULL); + TensorBlockReader::Run(block, m_data); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlock& block) { + 
assert(m_data != NULL); + TensorBlockWriter::Run(block, m_data); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; } + + protected: + Scalar* m_data; + Dimensions m_dims; + const Device& m_device; +}; + + +namespace { +template <typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T loadConstant(const T* address) { + return *address; + +} +// Use the texture cache on CUDA devices whenever possible +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 +template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float loadConstant(const float* address) { + return __ldg(address); +} +template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double loadConstant(const double* address) { + return __ldg(address); + + +} +#endif +} + + +// Default evaluator for rvalues +template<typename Derived, typename Device> +struct TensorEvaluator<const Derived, Device> +{ + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + + // NumDimensions is -1 for variable dim tensors + static const int NumCoords = internal::traits<Derived>::NumDimensions; + static const int SafeNumCoords = NumCoords >= 0 ? NumCoords : 0; + + enum { + IsAligned = Derived::IsAligned, + PacketAccess = Derived::PacketAccess, + BlockAccess = internal::is_arithmetic< + typename internal::remove_const<Scalar>::type>::value && + NumCoords >= 0, + Layout = Derived::Layout, + CoordAccess = NumCoords >= 0, + }; + + // TODO(andydavis) Add block/writeBlock accessors to Tensor and TensorMap so + // we can default BlockAccess to true above. + typedef typename internal::TensorBlock< + Index, typename internal::remove_const<Scalar>::type, SafeNumCoords, Layout> + TensorBlock; + typedef typename internal::TensorBlockReader< + Index, typename internal::remove_const<Scalar>::type, SafeNumCoords, Layout, + PacketAccess> TensorBlockReader; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) + : m_data(m.data()), m_dims(m.dimensions()), m_device(device) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + if (internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value && data) { + m_device.memcpy((void*)data, m_data, m_dims.TotalSize() * sizeof(Scalar)); + return false; + } + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(m_data); + return loadConstant(m_data+index); + } + + template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketReturnType packet(Index index) const + { + return internal::ploadt_ro<PacketReturnType, LoadMode>(m_data + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, SafeNumCoords>& coords) const { + eigen_assert(m_data); + const Index index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 
m_dims.IndexOfColMajor(coords) + : m_dims.IndexOfRowMajor(coords); + return loadConstant(m_data+index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector<internal::TensorOpResourceRequirements>* resources) const { + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const { + assert(m_data != NULL); + TensorBlockReader::Run(block, m_data); + } + + EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; } + + protected: + const Scalar* m_data; + Dimensions m_dims; + const Device& m_device; +}; + + + + +// -------------------- CwiseNullaryOp -------------------- + +template<typename NullaryOp, typename ArgType, typename Device> +struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device> +{ + typedef TensorCwiseNullaryOp<NullaryOp, ArgType> XprType; + + enum { + IsAligned = true, + PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC + TensorEvaluator(const XprType& op, const Device& device) + : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::traits<XprType>::Scalar CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { return true; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_functor(index); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_functor.packetOp(index); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + + private: + const NullaryOp m_functor; + TensorEvaluator<ArgType, Device> m_argImpl; +}; + + + +// -------------------- CwiseUnaryOp -------------------- + +template<typename UnaryOp, typename ArgType, typename Device> +struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device> +{ + typedef TensorCwiseUnaryOp<UnaryOp, ArgType> XprType; + + enum { + IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess & + internal::functor_traits<UnaryOp>::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_functor(op.functor()), + m_argImpl(op.nestedExpression(), device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::traits<XprType>::Scalar CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_argImpl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + 
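+    // The unary evaluator owns no storage of its own; simply release any
+    // temporary buffers the nested argument evaluator may have acquired in
+    // evalSubExprsIfNeeded().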
m_argImpl.cleanup(); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_functor(m_argImpl.coeff(index)); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_functor.packetOp(m_argImpl.template packet<LoadMode>(index)); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + + private: + const UnaryOp m_functor; + TensorEvaluator<ArgType, Device> m_argImpl; +}; + + +// -------------------- CwiseBinaryOp -------------------- + +template<typename BinaryOp, typename LeftArgType, typename RightArgType, typename Device> +struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType>, Device> +{ + typedef TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType> XprType; + + enum { + IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & + TensorEvaluator<RightArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & + TensorEvaluator<RightArgType, Device>::PacketAccess & + internal::functor_traits<BinaryOp>::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator<LeftArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_functor(op.functor()), + m_leftImpl(op.lhsExpression(), device), + m_rightImpl(op.rhsExpression(), device) + { + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout) || internal::traits<XprType>::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::traits<XprType>::Scalar CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const + { + // TODO: use right impl instead if right impl dimensions are known at compile time. 
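+    // The constructor asserted that both operands have matching dimensions,
+    // so reporting the left-hand side's dimensions is sufficient here.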
+ return m_leftImpl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + m_leftImpl.evalSubExprsIfNeeded(NULL); + m_rightImpl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_functor(m_leftImpl.coeff(index), m_rightImpl.coeff(index)); + } + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_functor.packetOp(m_leftImpl.template packet<LoadMode>(index), m_rightImpl.template packet<LoadMode>(index)); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + + private: + const BinaryOp m_functor; + TensorEvaluator<LeftArgType, Device> m_leftImpl; + TensorEvaluator<RightArgType, Device> m_rightImpl; +}; + + +// -------------------- SelectOp -------------------- + +template<typename IfArgType, typename ThenArgType, typename ElseArgType, typename Device> +struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>, Device> +{ + typedef TensorSelectOp<IfArgType, ThenArgType, ElseArgType> XprType; + typedef typename XprType::Scalar Scalar; + + enum { + IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned & + TensorEvaluator<ElseArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & + TensorEvaluator<ElseArgType, Device>::PacketAccess & + internal::packet_traits<Scalar>::HasBlend, + BlockAccess = false, + Layout = TensorEvaluator<IfArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_condImpl(op.ifExpression(), device), + m_thenImpl(op.thenExpression(), device), + m_elseImpl(op.elseExpression(), device) + { + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<IfArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<ThenArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<IfArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<ElseArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(dimensions_match(m_condImpl.dimensions(), m_thenImpl.dimensions())); + eigen_assert(dimensions_match(m_thenImpl.dimensions(), m_elseImpl.dimensions())); + } + + typedef typename XprType::Index Index; + typedef typename internal::traits<XprType>::Scalar CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef typename TensorEvaluator<IfArgType, Device>::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const + { + // TODO: use then or else impl instead if they happen to be known at compile time. + return m_condImpl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + m_condImpl.evalSubExprsIfNeeded(NULL); + m_thenImpl.evalSubExprsIfNeeded(NULL); + m_elseImpl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_condImpl.cleanup(); + m_thenImpl.cleanup(); + m_elseImpl.cleanup(); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_condImpl.coeff(index) ? 
m_thenImpl.coeff(index) : m_elseImpl.coeff(index); + } + template<int LoadMode> + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + { + const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; + internal::Selector<PacketSize> select; + for (Index i = 0; i < PacketSize; ++i) { + select.select[i] = m_condImpl.coeff(index+i); + } + return internal::pblend(select, + m_thenImpl.template packet<LoadMode>(index), + m_elseImpl.template packet<LoadMode>(index)); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + + private: + TensorEvaluator<IfArgType, Device> m_condImpl; + TensorEvaluator<ThenArgType, Device> m_thenImpl; + TensorEvaluator<ElseArgType, Device> m_elseImpl; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h new file mode 100644 index 0000000000..863c28ab43 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -0,0 +1,461 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H + +namespace Eigen { + +/** \class TensorExecutor + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor executor class. + * + * This class is responsible for launch the evaluation of the expression on + * the specified computing device. + */ +namespace internal { + +// Default strategy: the expression is evaluated with a single cpu thread. +template <typename Expression, typename Device, + bool Vectorizable, bool Tileable> +class TensorExecutor { + public: + typedef typename Expression::Index Index; + EIGEN_DEVICE_FUNC static inline void run(const Expression& expr, const Device& device = Device()) + { + TensorEvaluator<Expression, Device> evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const Index size = array_prod(evaluator.dimensions()); + for (Index i = 0; i < size; ++i) { + evaluator.evalScalar(i); + } + } + evaluator.cleanup(); + } +}; + +template <typename Expression> +class TensorExecutor<Expression, DefaultDevice, true, false> { + public: + typedef typename Expression::Index Index; + EIGEN_DEVICE_FUNC + static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) + { + TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const Index size = array_prod(evaluator.dimensions()); + const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size; + + // Manually unroll this loop since compilers don't do it. 
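+      // Four packets are processed per iteration, so UnrolledSize is `size`
+      // rounded down to a multiple of 4*PacketSize. For instance, with
+      // PacketSize == 4 and size == 103 the unrolled loop covers [0, 96),
+      // the single-packet loop covers [96, 100), and the scalar loop below
+      // handles the remaining 3 coefficients.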
+ const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize; + for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) { + evaluator.evalPacket(i); + evaluator.evalPacket(i+PacketSize); + evaluator.evalPacket(i+2*PacketSize); + evaluator.evalPacket(i+3*PacketSize); + } + const Index VectorizedSize = (size / PacketSize) * PacketSize; + for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) { + evaluator.evalPacket(i); + } + for (Index i = VectorizedSize; i < size; ++i) { + evaluator.evalScalar(i); + } + } + evaluator.cleanup(); + } +}; + +template <typename Expression, bool Vectorizable> +class TensorExecutor<Expression, DefaultDevice, Vectorizable, true> { + public: + typedef typename Expression::Index Index; + EIGEN_DEVICE_FUNC + static inline void run(const Expression& expr, + const DefaultDevice& device = DefaultDevice()) { + typedef TensorEvaluator<Expression, DefaultDevice> Evaluator; + typedef typename traits<Expression>::Scalar Scalar; + typedef typename traits<Expression>::Index Index; + const std::size_t NumDims = traits<Expression>::NumDimensions; + + typedef TensorBlockMapper<Index, + typename internal::remove_const<Scalar>::type, + NumDims, Evaluator::Layout> TensorBlockMapper; + typedef TensorBlock<Index, typename internal::remove_const<Scalar>::type, + NumDims, Evaluator::Layout> TensorBlock; + + Evaluator evaluator(expr, device); + std::size_t total_size = array_prod(evaluator.dimensions()); + std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar); + if (total_size < cache_size) { + // TODO(andydavis) Reduce block management overhead for small tensors. + internal::TensorExecutor<Expression, DefaultDevice, Vectorizable, + false>::run(expr, device); + return; + } + + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + // Size tensor blocks to fit in cache (or requested target block size). + size_t block_total_size = numext::mini(cache_size, total_size); + TensorBlockShapeType block_shape = kUniformAllDims; + // Query expression tree for desired block size/shape. + std::vector<internal::TensorOpResourceRequirements> resources; + evaluator.getResourceRequirements(&resources); + if (!resources.empty()) { + // TODO(andydavis) Implement different policies (i.e. revert to a + // default policy if block shapes/sizes conflict). 
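+        // For now the requirements reported by the first expression node win:
+        // they override both the default uniform block shape and the
+        // cache-derived block size chosen above.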
+ block_shape = resources[0].block_shape; + block_total_size = resources[0].block_total_size; + } + + TensorBlockMapper block_mapper(evaluator.dimensions(), + block_shape, + block_total_size); + + Scalar* data = static_cast<Scalar*>(device.allocate( + block_total_size * sizeof(Scalar))); + + const Index total_block_count = block_mapper.total_block_count(); + for (Index i = 0; i < total_block_count; ++i) { + TensorBlock block = block_mapper.GetBlockForIndex(i, data); + evaluator.evalBlock(&block); + } + device.deallocate(data); + } + evaluator.cleanup(); + } +}; + +// Multicore strategy: the index space is partitioned and each partition is executed on a single core +#ifdef EIGEN_USE_THREADS +template <typename Evaluator, typename Index, bool Vectorizable> +struct EvalRange { + static void run(Evaluator evaluator, const Index first, const Index last) { + eigen_assert(last > first); + for (Index i = first; i < last; ++i) { + evaluator.evalScalar(i); + } + } +}; + +template <typename Evaluator, typename Index> +struct EvalRange<Evaluator, Index, true> { + static void run(Evaluator evaluator, const Index first, const Index last) { + eigen_assert(last > first); + + Index i = first; + static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size; + if (last - first >= PacketSize) { + eigen_assert(first % PacketSize == 0); + Index lastPacket = last - (last % PacketSize); + for (; i < lastPacket; i += PacketSize) { + evaluator.evalPacket(i); + } + } + + for (; i < last; ++i) { + evaluator.evalScalar(i); + } + } +}; + +template <typename Expression, bool Vectorizable, bool Tileable> +class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> { + public: + typedef typename Expression::Index Index; + static inline void run(const Expression& expr, const ThreadPoolDevice& device) + { + if (device.numThreads() <= 1) { + DefaultDevice dd; + TensorExecutor<Expression, DefaultDevice, Vectorizable, Tileable>::run(expr, dd); + return; + } + + typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator; + Evaluator evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const Index size = array_prod(evaluator.dimensions()); + + static const Index PacketSize = Vectorizable ? 
unpacket_traits<typename Evaluator::PacketReturnType>::size : 1; + Index blocksz = std::ceil<Index>(static_cast<float>(size)/device.numThreads()) + PacketSize - 1; + const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize))); + const Index numblocks = size / blocksize; + + Index i = 0; + FixedSizeVector<Notification*> results(numblocks); + for (int i = 0; i < numblocks; ++i) { + results.push_back(device.enqueue(&EvalRange<Evaluator, Index, Vectorizable>::run, evaluator, i*blocksize, (i+1)*blocksize)); + } + + if (numblocks * blocksize < size) { + EvalRange<Evaluator, Index, Vectorizable>::run(evaluator, numblocks * blocksize, size); + } + + for (int i = 0; i < numblocks; ++i) { + wait_until_ready(results[i]); + delete results[i]; + } + } + evaluator.cleanup(); + } +}; + +template <typename Index, typename Scalar> +struct BlockRange { + BlockRange(Index s, Index l, Scalar* d) + : index_start(s), index_limit(l), data(d) {} + const Index index_start; + const Index index_limit; + Scalar* data; +}; + +template <typename Evaluator, typename Index, typename Scalar, + std::size_t NumDims> +struct EvalBlockRange { + typedef TensorBlockMapper<Index, Scalar, NumDims, Evaluator::Layout> + BlockMapper; + + static void run(Evaluator evaluator, const BlockMapper& block_mapper, + BlockRange<Index, Scalar> block_range) { + typedef TensorBlock<Index, Scalar, NumDims, Evaluator::Layout> + TensorBlock; + eigen_assert(block_range.index_limit > block_range.index_start); + + for (Index i = block_range.index_start; i < block_range.index_limit; ++i) { + TensorBlock block = block_mapper.GetBlockForIndex(i, block_range.data); + evaluator.evalBlock(&block); + } + } +}; + +template <typename Expression, bool Vectorizable> +class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, true> { + public: + typedef typename Expression::Index Index; + static inline void run(const Expression& expr, + const ThreadPoolDevice& device) { + typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator; + typedef typename internal::remove_const< + typename traits<Expression>::Scalar>::type Scalar; + typedef typename traits<Expression>::Index Index; + static const std::size_t NumDims = traits<Expression>::NumDimensions; + typedef TensorBlockMapper<Index, Scalar, NumDims, Evaluator::Layout> + TensorBlockMapper; + typedef TensorBlock<Index, Scalar, NumDims, Evaluator::Layout> + TensorBlock; + typedef BlockRange<Index, Scalar> BlockRange; + + Evaluator evaluator(expr, device); + std::size_t total_size = array_prod(evaluator.dimensions()); + std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar); + if (total_size < cache_size || device.numThreads() <= 1) { + // TODO(andydavis) Reduce block management overhead for small tensors. + DefaultDevice dd; + internal::TensorExecutor<Expression, DefaultDevice, Vectorizable, false>::run(expr, dd); + return; + } + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + TensorBlockShapeType block_shape = kUniformAllDims; + size_t block_total_size = 0; + // Query expression tree for desired block size/shape. + std::vector<internal::TensorOpResourceRequirements> resources; + evaluator.getResourceRequirements(&resources); + if (!resources.empty()) { + // TODO(andydavis) Implement different shape/size policies. 
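+        // As in the single-threaded tiled executor, the first reported
+        // requirement wins; if it leaves block_total_size at 0, the code
+        // below falls back to a block size derived from the last-level cache.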
+ block_shape = resources[0].block_shape; + block_total_size = resources[0].block_total_size; + } + + // Divide the tensor coefficients across the number of threads, subject + // to min/max block size constraints. + const size_t min_block_size = + device.firstLevelCacheSize() / sizeof(Scalar); + const size_t max_block_size = block_total_size > 0 ? block_total_size : + device.lastLevelCacheSize() / sizeof(Scalar); + const size_t target_block_size = numext::maxi( + min_block_size, + numext::mini(static_cast<size_t>(array_prod(evaluator.dimensions())) / device.numThreads(), + max_block_size)); + + TensorBlockMapper block_mapper(evaluator.dimensions(), + block_shape, + target_block_size); + + const Index block_partition_size = + (block_mapper.total_block_count() + device.numThreads() - 1) / + device.numThreads(); + const Index block_partition_count = + (block_mapper.total_block_count() + block_partition_size - 1) / + block_partition_size; + + if (block_partition_count == 1) { + // Avoid thread hop if no parallelism is possible. + Scalar* data = static_cast<Scalar*>( + device.allocate(target_block_size * sizeof(Scalar))); + EvalBlockRange<Evaluator, Index, Scalar, NumDims>::run( + evaluator, block_mapper, + BlockRange(0, block_mapper.total_block_count(), data)); + device.deallocate(data); + } else { + // Multi-threaded case. + struct ThreadState { + Notification* done; + Scalar* data; + }; + FixedSizeVector<ThreadState> thread_state(block_partition_count, + ThreadState()); + + // Dispatch threads. + for (int i = 0; i < block_partition_count; ++i) { + thread_state[i].data = static_cast<Scalar*>( + device.allocate(target_block_size * sizeof(Scalar))); + thread_state[i].done = device.enqueue( + &EvalBlockRange<Evaluator, Index, Scalar, NumDims>::run, + evaluator, block_mapper, + BlockRange(i * block_partition_size, + numext::mini((i + 1) * block_partition_size, + block_mapper.total_block_count()), + thread_state[i].data)); + } + + // Join threads. + for (int i = 0; i < block_partition_count; ++i) { + wait_until_ready(thread_state[i].done); + delete thread_state[i].done; + device.deallocate(thread_state[i].data); + } + } + } + evaluator.cleanup(); + } +}; + +#endif + + +// GPU: the evaluation of the expression is offloaded to a GPU. +#if defined(EIGEN_USE_GPU) + +template <typename Expression, bool Tileable> +class TensorExecutor<Expression, GpuDevice, false, Tileable> { + public: + typedef typename Expression::Index Index; + static void run(const Expression& expr, const GpuDevice& device); +}; + +template <typename Expression, bool Tileable> +class TensorExecutor<Expression, GpuDevice, true, Tileable> { + public: + typedef typename Expression::Index Index; + static void run(const Expression& expr, const GpuDevice& device); +}; + +#if defined(__CUDACC__) +template <typename Evaluator, typename Index> +__global__ void +__launch_bounds__(1024) + EigenMetaKernel_NonVectorizable(Evaluator memcopied_eval, Index size) { + + const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; + const Index step_size = blockDim.x * gridDim.x; + + // Cuda memcopies the kernel arguments. That's fine for POD, but for more + // complex types such as evaluators we should really conform to the C++ + // standard and call a proper copy constructor. 
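+  // Invoking the copy constructor here rebuilds the evaluator from the
+  // bit-copied kernel argument. The loop below then walks the index space
+  // with a grid-stride pattern: each thread starts at its global index and
+  // advances by blockDim.x * gridDim.x.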
+ Evaluator eval(memcopied_eval); + + // Use the scalar path + for (Index i = first_index; i < size; i += step_size) { + eval.evalScalar(i); + } +} + +template <typename Evaluator, typename Index> +__global__ void +__launch_bounds__(1024) + EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) { + + const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; + const Index step_size = blockDim.x * gridDim.x; + + // Cuda memcopies the kernel arguments. That's fine for POD, but for more + // complex types such as evaluators we should really conform to the C++ + // standard and call a proper copy constructor. + Evaluator eval(memcopied_eval); + + // Use the vector path + const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size; + const Index vectorized_step_size = step_size * PacketSize; + const Index vectorized_size = (size / PacketSize) * PacketSize; + for (Index i = first_index * PacketSize; i < vectorized_size; + i += vectorized_step_size) { + eval.evalPacket(i); + } + for (Index i = vectorized_size + first_index; i < size; i += step_size) { + eval.evalScalar(i); + } +} + +/*static*/ +template <typename Expression, bool Tileable> +inline void TensorExecutor<Expression, GpuDevice, false, Tileable>::run( + const Expression& expr, const GpuDevice& device) { + TensorEvaluator<Expression, GpuDevice> evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + const int num_blocks = device.getNumCudaMultiProcessors() * + device.maxCudaThreadsPerMultiProcessor() / + device.maxCudaThreadsPerBlock(); + const int block_size = device.maxCudaThreadsPerBlock(); + const Index size = array_prod(evaluator.dimensions()); + LAUNCH_CUDA_KERNEL( + (EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, + Index>), + num_blocks, block_size, 0, device, evaluator, size); + } + evaluator.cleanup(); +} + +/*static*/ +template <typename Expression, bool Tileable> +inline void TensorExecutor<Expression, GpuDevice, true, Tileable>::run( + const Expression& expr, const GpuDevice& device) { + TensorEvaluator<Expression, GpuDevice> evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + const int num_blocks = device.getNumCudaMultiProcessors() * + device.maxCudaThreadsPerMultiProcessor() / + device.maxCudaThreadsPerBlock(); + const int block_size = device.maxCudaThreadsPerBlock(); + const Index size = array_prod(evaluator.dimensions()); + LAUNCH_CUDA_KERNEL( + (EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, + Index>), + num_blocks, block_size, 0, device, evaluator, size); + } + evaluator.cleanup(); +} + +#endif // __CUDACC__ +#endif // EIGEN_USE_GPU + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h new file mode 100644 index 0000000000..49d849e233 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -0,0 +1,291 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXPR_H +#define EIGEN_CXX11_TENSOR_TENSOR_EXPR_H + +namespace Eigen { + +/** \class TensorExpr + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor expression classes. + * + * The TensorCwiseNullaryOp class applies a nullary operators to an expression. + * This is typically used to generate constants. + * + * The TensorCwiseUnaryOp class represents an expression where a unary operator + * (e.g. cwiseSqrt) is applied to an expression. + * + * The TensorCwiseBinaryOp class represents an expression where a binary + * operator (e.g. addition) is applied to a lhs and a rhs expression. + * + */ +namespace internal { +template<typename NullaryOp, typename XprType> +struct traits<TensorCwiseNullaryOp<NullaryOp, XprType> > + : traits<XprType> +{ + typedef traits<XprType> XprTraits; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::Nested XprTypeNested; + typedef typename remove_reference<XprTypeNested>::type _XprTypeNested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + + enum { + Flags = 0, + }; +}; + +} // end namespace internal + + + +template<typename NullaryOp, typename XprType> +class TensorCwiseNullaryOp : public TensorBase<TensorCwiseNullaryOp<NullaryOp, XprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef TensorCwiseNullaryOp<NullaryOp, XprType> Nested; + typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseNullaryOp(const XprType& xpr, const NullaryOp& func = NullaryOp()) + : m_xpr(xpr), m_functor(func) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + nestedExpression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + const NullaryOp& functor() const { return m_functor; } + + protected: + typename XprType::Nested m_xpr; + const NullaryOp m_functor; +}; + + + +namespace internal { +template<typename UnaryOp, typename XprType> +struct traits<TensorCwiseUnaryOp<UnaryOp, XprType> > + : traits<XprType> +{ + // TODO(phli): Add InputScalar, InputPacket. Check references to + // current Scalar/Packet to see if the intent is Input or Output. + typedef typename result_of<UnaryOp(typename XprType::Scalar)>::type Scalar; + typedef traits<XprType> XprTraits; + typedef typename XprType::Nested XprTypeNested; + typedef typename remove_reference<XprTypeNested>::type _XprTypeNested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename UnaryOp, typename XprType> +struct eval<TensorCwiseUnaryOp<UnaryOp, XprType>, Eigen::Dense> +{ + typedef const TensorCwiseUnaryOp<UnaryOp, XprType>& type; +}; + +template<typename UnaryOp, typename XprType> +struct nested<TensorCwiseUnaryOp<UnaryOp, XprType>, 1, typename eval<TensorCwiseUnaryOp<UnaryOp, XprType> >::type> +{ + typedef TensorCwiseUnaryOp<UnaryOp, XprType> type; +}; + +} // end namespace internal + + + +template<typename UnaryOp, typename XprType> +class TensorCwiseUnaryOp : public TensorBase<TensorCwiseUnaryOp<UnaryOp, XprType>, ReadOnlyAccessors> +{ + public: + // TODO(phli): Add InputScalar, InputPacket. 
Check references to + // current Scalar/Packet to see if the intent is Input or Output. + typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef Scalar CoeffReturnType; + typedef typename Eigen::internal::nested<TensorCwiseUnaryOp>::type Nested; + typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp()) + : m_xpr(xpr), m_functor(func) {} + + EIGEN_DEVICE_FUNC + const UnaryOp& functor() const { return m_functor; } + + /** \returns the nested expression */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + nestedExpression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const UnaryOp m_functor; +}; + + +namespace internal { +template<typename BinaryOp, typename LhsXprType, typename RhsXprType> +struct traits<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > +{ + // Type promotion to handle the case where the types of the lhs and the rhs + // are different. + // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to + // current Scalar/Packet to see if the intent is Inputs or Output. + typedef typename result_of< + BinaryOp(typename LhsXprType::Scalar, + typename RhsXprType::Scalar)>::type Scalar; + typedef traits<LhsXprType> XprTraits; + typedef typename promote_storage_type< + typename traits<LhsXprType>::StorageKind, + typename traits<RhsXprType>::StorageKind>::ret StorageKind; + typedef typename promote_index_type< + typename traits<LhsXprType>::Index, + typename traits<RhsXprType>::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference<LhsNested>::type _LhsNested; + typedef typename remove_reference<RhsNested>::type _RhsNested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + + enum { + Flags = 0, + }; +}; + +template<typename BinaryOp, typename LhsXprType, typename RhsXprType> +struct eval<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, Eigen::Dense> +{ + typedef const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>& type; +}; + +template<typename BinaryOp, typename LhsXprType, typename RhsXprType> +struct nested<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, 1, typename eval<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> >::type> +{ + typedef TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> type; +}; + +} // end namespace internal + + + +template<typename BinaryOp, typename LhsXprType, typename RhsXprType> +class TensorCwiseBinaryOp : public TensorBase<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, ReadOnlyAccessors> +{ + public: + // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to + // current Scalar/Packet to see if the intent is Inputs or Output. 
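+  // The Scalar/CoeffReturnType typedefs below come from
+  // result_of<BinaryOp(lhs Scalar, rhs Scalar)> (see the traits above), so
+  // expressions mixing scalar types yield the functor's promoted result type.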
+ typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef Scalar CoeffReturnType; + typedef typename Eigen::internal::nested<TensorCwiseBinaryOp>::type Nested; + typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const BinaryOp& func = BinaryOp()) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_functor(func) {} + + EIGEN_DEVICE_FUNC + const BinaryOp& functor() const { return m_functor; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename LhsXprType::Nested>::type& + lhsExpression() const { return m_lhs_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename RhsXprType::Nested>::type& + rhsExpression() const { return m_rhs_xpr; } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const BinaryOp m_functor; +}; + + +namespace internal { +template<typename IfXprType, typename ThenXprType, typename ElseXprType> +struct traits<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> > + : traits<ThenXprType> +{ + typedef typename traits<ThenXprType>::Scalar Scalar; + typedef traits<ThenXprType> XprTraits; + typedef typename promote_storage_type<typename traits<ThenXprType>::StorageKind, + typename traits<ElseXprType>::StorageKind>::ret StorageKind; + typedef typename promote_index_type<typename traits<ElseXprType>::Index, + typename traits<ThenXprType>::Index>::type Index; + typedef typename IfXprType::Nested IfNested; + typedef typename ThenXprType::Nested ThenNested; + typedef typename ElseXprType::Nested ElseNested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename IfXprType, typename ThenXprType, typename ElseXprType> +struct eval<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, Eigen::Dense> +{ + typedef const TensorSelectOp<IfXprType, ThenXprType, ElseXprType>& type; +}; + +template<typename IfXprType, typename ThenXprType, typename ElseXprType> +struct nested<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, 1, typename eval<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >::type> +{ + typedef TensorSelectOp<IfXprType, ThenXprType, ElseXprType> type; +}; + +} // end namespace internal + + +template<typename IfXprType, typename ThenXprType, typename ElseXprType> +class TensorSelectOp : public TensorBase<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorSelectOp>::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename internal::promote_storage_type<typename ThenXprType::CoeffReturnType, + typename ElseXprType::CoeffReturnType>::ret CoeffReturnType; + typedef typename Eigen::internal::nested<TensorSelectOp>::type Nested; + typedef typename Eigen::internal::traits<TensorSelectOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorSelectOp>::Index Index; + + EIGEN_DEVICE_FUNC + TensorSelectOp(const IfXprType& a_condition, + const ThenXprType& a_then, + const ElseXprType& a_else) + : m_condition(a_condition), m_then(a_then), m_else(a_else) + { } + + EIGEN_DEVICE_FUNC + const IfXprType& ifExpression() const { return m_condition; } + + EIGEN_DEVICE_FUNC + 
const ThenXprType& thenExpression() const { return m_then; } + + EIGEN_DEVICE_FUNC + const ElseXprType& elseExpression() const { return m_else; } + + protected: + typename IfXprType::Nested m_condition; + typename ThenXprType::Nested m_then; + typename ElseXprType::Nested m_else; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EXPR_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h new file mode 100644 index 0000000000..ac73366762 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -0,0 +1,846 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Jianwei Cui <thucjw@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FFT_H +#define EIGEN_CXX11_TENSOR_TENSOR_FFT_H +namespace Eigen { + +/** \class TensorFFT + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor FFT class. + * + * TODO: + * Vectorize the Cooley Tukey and the Bluestein algorithm + * Add support for multithreaded evaluation + * Improve the performance on GPU + */ + +template <bool NeedUprade> struct MakeComplex { + template <typename T> + #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && !defined(__GCUDACC__) + EIGEN_DEVICE_FUNC + #endif + T operator() (const T& val) const { return val; } +}; + +template <> struct MakeComplex<true> { + template <typename T> + #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && !defined(__GCUDACC__) + EIGEN_DEVICE_FUNC + #endif + std::complex<T> operator() (const T& val) const { return std::complex<T>(val, 0); } +}; + +template <> struct MakeComplex<false> { + template <typename T> + #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && !defined(__GCUDACC__) + EIGEN_DEVICE_FUNC + #endif + std::complex<T> operator() (const std::complex<T>& val) const { return val; } +}; + +template <int ResultType> struct PartOf { + template <typename T> T operator() (const T& val) const { return val; } +}; + +template <> struct PartOf<RealPart> { + template <typename T> T operator() (const std::complex<T>& val) const { return val.real(); } +}; + +template <> struct PartOf<ImagPart> { + template <typename T> T operator() (const std::complex<T>& val) const { return val.imag(); } +}; + +namespace internal { +template <typename FFT, typename XprType, int FFTResultType, int FFTDir> +struct traits<TensorFFTOp<FFT, XprType, FFTResultType, FFTDir> > : public traits<XprType> { + typedef traits<XprType> XprTraits; + typedef typename NumTraits<typename XprTraits::Scalar>::Real RealScalar; + typedef typename std::complex<RealScalar> ComplexScalar; + typedef typename XprTraits::Scalar InputScalar; + typedef typename conditional<FFTResultType == RealPart || FFTResultType == ImagPart, RealScalar, ComplexScalar>::type OutputScalar; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template <typename FFT, typename XprType, int FFTResultType, int FFTDirection> +struct eval<TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection>, Eigen::Dense> { 
+ typedef const TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection>& type; +}; + +template <typename FFT, typename XprType, int FFTResultType, int FFTDirection> +struct nested<TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection>, 1, typename eval<TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection> >::type> { + typedef TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection> type; +}; + +} // end namespace internal + +template <typename FFT, typename XprType, int FFTResultType, int FFTDir> +class TensorFFTOp : public TensorBase<TensorFFTOp<FFT, XprType, FFTResultType, FFTDir>, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits<TensorFFTOp>::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename std::complex<RealScalar> ComplexScalar; + typedef typename internal::conditional<FFTResultType == RealPart || FFTResultType == ImagPart, RealScalar, ComplexScalar>::type OutputScalar; + typedef OutputScalar CoeffReturnType; + typedef typename Eigen::internal::nested<TensorFFTOp>::type Nested; + typedef typename Eigen::internal::traits<TensorFFTOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorFFTOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFFTOp(const XprType& expr, const FFT& fft) + : m_xpr(expr), m_fft(fft) {} + + EIGEN_DEVICE_FUNC + const FFT& fft() const { return m_fft; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& expression() const { + return m_xpr; + } + + protected: + typename XprType::Nested m_xpr; + const FFT m_fft; +}; + +// Eval as rvalue +template <typename FFT, typename ArgType, typename Device, int FFTResultType, int FFTDir> +struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, Device> { + typedef TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename std::complex<RealScalar> ComplexScalar; + typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions; + typedef internal::traits<XprType> XprTraits; + typedef typename XprTraits::Scalar InputScalar; + typedef typename internal::conditional<FFTResultType == RealPart || FFTResultType == ImagPart, RealScalar, ComplexScalar>::type OutputScalar; + typedef OutputScalar CoeffReturnType; + typedef typename PacketType<OutputScalar, Device>::type PacketReturnType; + + enum { + IsAligned = false, + PacketAccess = true, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_data(NULL), m_impl(op.expression(), device), m_fft(op.fft()), m_device(device) { + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + eigen_assert(input_dims[i] > 0); + m_dimensions[i] = input_dims[i]; + } + + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; + } + } else { + m_strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_strides[i] = m_strides[i + 1] * m_dimensions[i 
+ 1]; + } + } + m_size = m_dimensions.TotalSize(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { + return m_dimensions; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(OutputScalar* data) { + m_impl.evalSubExprsIfNeeded(NULL); + if (data) { + evalToBuf(data); + return false; + } else { + m_data = (CoeffReturnType*)m_device.allocate(sizeof(CoeffReturnType) * m_size); + evalToBuf(m_data); + return true; + } + } + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + if (m_data) { + m_device.deallocate(m_data); + m_data = NULL; + } + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const { + return m_data[index]; + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const { + return internal::ploadt<PacketReturnType, LoadMode>(m_data + index); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; } + + + private: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(OutputScalar* data) { + const bool write_to_out = internal::is_same<OutputScalar, ComplexScalar>::value; + ComplexScalar* buf = write_to_out ? (ComplexScalar*)data : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * m_size); + + for (int i = 0; i < m_size; ++i) { + buf[i] = MakeComplex<internal::is_same<InputScalar, RealScalar>::value>()(m_impl.coeff(i)); + } + + for (int i = 0; i < m_fft.size(); ++i) { + int dim = m_fft[i]; + eigen_assert(dim >= 0 && dim < NumDims); + Index line_len = m_dimensions[dim]; + eigen_assert(line_len >= 1); + ComplexScalar* line_buf = (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * line_len); + const bool is_power_of_two = isPowerOfTwo(line_len); + const int good_composite = is_power_of_two ? 0 : findGoodComposite(line_len); + const int log_len = is_power_of_two ? getLog2(line_len) : getLog2(good_composite); + + ComplexScalar* a = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite); + ComplexScalar* b = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite); + ComplexScalar* pos_j_base_powered = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * (line_len + 1)); + if (!is_power_of_two) { + ComplexScalar pos_j_base = ComplexScalar(std::cos(M_PI/line_len), std::sin(M_PI/line_len)); + for (int i = 0; i < line_len + 1; ++i) { + pos_j_base_powered[i] = std::pow(pos_j_base, i * i); + } + } + + for (Index partial_index = 0; partial_index < m_size / line_len; ++partial_index) { + Index base_offset = getBaseOffsetFromIndex(partial_index, dim); + + // get data into line_buf + for (int j = 0; j < line_len; ++j) { + Index offset = getIndexFromOffset(base_offset, dim, j); + line_buf[j] = buf[offset]; + } + + // processs the line + if (is_power_of_two) { + processDataLineCooleyTukey(line_buf, line_len, log_len); + } + else { + processDataLineBluestein(line_buf, line_len, good_composite, log_len, a, b, pos_j_base_powered); + } + + // write back + for (int j = 0; j < line_len; ++j) { + const ComplexScalar div_factor = (FFTDir == FFT_FORWARD) ? 
ComplexScalar(1, 0) : ComplexScalar(line_len, 0); + Index offset = getIndexFromOffset(base_offset, dim, j); + buf[offset] = line_buf[j] / div_factor; + } + } + m_device.deallocate(line_buf); + if (!pos_j_base_powered) { + m_device.deallocate(a); + m_device.deallocate(b); + m_device.deallocate(pos_j_base_powered); + } + } + + if(!write_to_out) { + for (int i = 0; i < m_size; ++i) { + data[i] = PartOf<FFTResultType>()(buf[i]); + } + m_device.deallocate(buf); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool isPowerOfTwo(int x) { + eigen_assert(x > 0); + return !(x & (x - 1)); + } + + //the composite number for padding, used in Bluestein's FFT algorithm + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int findGoodComposite(int n) { + int i = 2; + while (i < 2 * n - 1) i *= 2; + return i; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int getLog2(int m) { + int log2m = 0; + while (m >>= 1) log2m++; + return log2m; + } + + // Call Cooley Tukey algorithm directly, data length must be power of 2 + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineCooleyTukey(ComplexScalar* line_buf, int line_len, int log_len) { + eigen_assert(isPowerOfTwo(line_len)); + scramble_FFT(line_buf, line_len); + compute_1D_Butterfly<FFTDir>(line_buf, line_len, log_len); + } + + // Call Bluestein's FFT algorithm, m is a good composite number greater than (2 * n - 1), used as the padding length + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineBluestein(ComplexScalar* line_buf, int line_len, int good_composite, int log_len, ComplexScalar* a, ComplexScalar* b, const ComplexScalar* pos_j_base_powered) { + int n = line_len; + int m = good_composite; + ComplexScalar* data = line_buf; + + for (int i = 0; i < n; ++i) { + if(FFTDir == FFT_FORWARD) { + a[i] = data[i] * std::conj(pos_j_base_powered[i]); + } + else { + a[i] = data[i] * pos_j_base_powered[i]; + } + } + for (int i = n; i < m; ++i) { + a[i] = ComplexScalar(0, 0); + } + + for (int i = 0; i < n; ++i) { + if(FFTDir == FFT_FORWARD) { + b[i] = pos_j_base_powered[i]; + } + else { + b[i] = std::conj(pos_j_base_powered[i]); + } + } + for (int i = n; i < m - n; ++i) { + b[i] = ComplexScalar(0, 0); + } + for (int i = m - n; i < m; ++i) { + if(FFTDir == FFT_FORWARD) { + b[i] = pos_j_base_powered[m-i]; + } + else { + b[i] = std::conj(pos_j_base_powered[m-i]); + } + } + + scramble_FFT(a, m); + compute_1D_Butterfly<FFT_FORWARD>(a, m, log_len); + + scramble_FFT(b, m); + compute_1D_Butterfly<FFT_FORWARD>(b, m, log_len); + + for (int i = 0; i < m; ++i) { + a[i] *= b[i]; + } + + scramble_FFT(a, m); + compute_1D_Butterfly<FFT_REVERSE>(a, m, log_len); + + //Do the scaling after ifft + for (int i = 0; i < m; ++i) { + a[i] /= m; + } + + for (int i = 0; i < n; ++i) { + if(FFTDir == FFT_FORWARD) { + data[i] = a[i] * std::conj(pos_j_base_powered[i]); + } + else { + data[i] = a[i] * pos_j_base_powered[i]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void scramble_FFT(ComplexScalar* data, int n) { + eigen_assert(isPowerOfTwo(n)); + int j = 1; + for (int i = 1; i < n; ++i){ + if (j > i) { + std::swap(data[j-1], data[i-1]); + } + int m = n >> 1; + while (m >= 2 && j > m) { + j -= m; + m >>= 1; + } + j += m; + } + } + + template<int Dir> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_1D_Butterfly(ComplexScalar* data, int n, int n_power_of_2) { + eigen_assert(isPowerOfTwo(n)); + if (n == 1) { + return; + } + else if (n == 2) { + ComplexScalar tmp = data[1]; + data[1] = data[0] - data[1]; + data[0] += tmp; + return; + } + else if (n == 4) { + 
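+      // Hand-unrolled 4-point butterfly: two 2-point DFTs over
+      // (data[0], data[1]) and (data[2], data[3]) are combined below, with
+      // ComplexScalar(0, -1) as the forward twiddle factor exp(-i*pi/2) = -i
+      // (the inverse direction uses its conjugate, (0, 1)).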
ComplexScalar tmp[4]; + tmp[0] = data[0] + data[1]; + tmp[1] = data[0] - data[1]; + tmp[2] = data[2] + data[3]; + if(Dir == FFT_FORWARD) { + tmp[3] = ComplexScalar(0.0, -1.0) * (data[2] - data[3]); + } + else { + tmp[3] = ComplexScalar(0.0, 1.0) * (data[2] - data[3]); + } + data[0] = tmp[0] + tmp[2]; + data[1] = tmp[1] + tmp[3]; + data[2] = tmp[0] - tmp[2]; + data[3] = tmp[1] - tmp[3]; + return; + } + else if (n == 8) { + ComplexScalar tmp_1[8]; + ComplexScalar tmp_2[8]; + + tmp_1[0] = data[0] + data[1]; + tmp_1[1] = data[0] - data[1]; + tmp_1[2] = data[2] + data[3]; + if (Dir == FFT_FORWARD) { + tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, -1); + } + else { + tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, 1); + } + tmp_1[4] = data[4] + data[5]; + tmp_1[5] = data[4] - data[5]; + tmp_1[6] = data[6] + data[7]; + if (Dir == FFT_FORWARD) { + tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, -1); + } + else { + tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, 1); + } + tmp_2[0] = tmp_1[0] + tmp_1[2]; + tmp_2[1] = tmp_1[1] + tmp_1[3]; + tmp_2[2] = tmp_1[0] - tmp_1[2]; + tmp_2[3] = tmp_1[1] - tmp_1[3]; + tmp_2[4] = tmp_1[4] + tmp_1[6]; + // SQRT2DIV2 = sqrt(2)/2 + #define SQRT2DIV2 0.7071067811865476 + if (Dir == FFT_FORWARD) { + tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, -SQRT2DIV2); + tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, -1); + tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, -SQRT2DIV2); + } + else { + tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, SQRT2DIV2); + tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, 1); + tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, SQRT2DIV2); + } + data[0] = tmp_2[0] + tmp_2[4]; + data[1] = tmp_2[1] + tmp_2[5]; + data[2] = tmp_2[2] + tmp_2[6]; + data[3] = tmp_2[3] + tmp_2[7]; + data[4] = tmp_2[0] - tmp_2[4]; + data[5] = tmp_2[1] - tmp_2[5]; + data[6] = tmp_2[2] - tmp_2[6]; + data[7] = tmp_2[3] - tmp_2[7]; + + return; + } + else { + compute_1D_Butterfly<Dir>(data, n/2, n_power_of_2 - 1); + compute_1D_Butterfly<Dir>(data + n/2, n/2, n_power_of_2 - 1); + //Original code: + //RealScalar wtemp = std::sin(M_PI/n); + //RealScalar wpi = -std::sin(2 * M_PI/n); + RealScalar wtemp = m_sin_PI_div_n_LUT[n_power_of_2]; + RealScalar wpi; + if (Dir == FFT_FORWARD) { + wpi = m_minus_sin_2_PI_div_n_LUT[n_power_of_2]; + } + else { + wpi = 0 - m_minus_sin_2_PI_div_n_LUT[n_power_of_2]; + } + + const ComplexScalar wp(wtemp, wpi); + ComplexScalar w(1.0, 0.0); + for(int i = 0; i < n/2; i++) { + ComplexScalar temp(data[i + n/2] * w); + data[i + n/2] = data[i] - temp; + data[i] += temp; + w += w * wp; + } + return; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getBaseOffsetFromIndex(Index index, Index omitted_dim) const { + Index result = 0; + + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > omitted_dim; --i) { + const Index partial_m_stride = m_strides[i] / m_dimensions[omitted_dim]; + const Index idx = index / partial_m_stride; + index -= idx * partial_m_stride; + result += idx * m_strides[i]; + } + result += index; + } + else { + for (int i = 0; i < omitted_dim; ++i) { + const Index partial_m_stride = m_strides[i] / m_dimensions[omitted_dim]; + const Index idx = index / partial_m_stride; + index -= idx * partial_m_stride; + result += idx * m_strides[i]; + } + result += index; + } + // Value of index_coords[omitted_dim] is not determined to this step + return result; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getIndexFromOffset(Index base, 
Index omitted_dim, Index offset) const { + Index result = base + offset * m_strides[omitted_dim] ; + return result; + } + + protected: + int m_size; + const FFT& m_fft; + Dimensions m_dimensions; + array<Index, NumDims> m_strides; + TensorEvaluator<ArgType, Device> m_impl; + CoeffReturnType* m_data; + const Device& m_device; + + // This will support a maximum FFT size of 2^32 for each dimension + // m_sin_PI_div_n_LUT[i] = (-2) * std::sin(M_PI / std::pow(2,i)) ^ 2; + RealScalar m_sin_PI_div_n_LUT[32] = { + 0.0, + -2, + -0.999999999999999, + -0.292893218813453, + -0.0761204674887130, + -0.0192147195967696, + -0.00481527332780311, + -0.00120454379482761, + -3.01181303795779e-04, + -7.52981608554592e-05, + -1.88247173988574e-05, + -4.70619042382852e-06, + -1.17654829809007e-06, + -2.94137117780840e-07, + -7.35342821488550e-08, + -1.83835707061916e-08, + -4.59589268710903e-09, + -1.14897317243732e-09, + -2.87243293150586e-10, + -7.18108232902250e-11, + -1.79527058227174e-11, + -4.48817645568941e-12, + -1.12204411392298e-12, + -2.80511028480785e-13, + -7.01277571201985e-14, + -1.75319392800498e-14, + -4.38298482001247e-15, + -1.09574620500312e-15, + -2.73936551250781e-16, + -6.84841378126949e-17, + -1.71210344531737e-17, + -4.28025861329343e-18 + }; + + // m_minus_sin_2_PI_div_n_LUT[i] = -std::sin(2 * M_PI / std::pow(2,i)); + RealScalar m_minus_sin_2_PI_div_n_LUT[32] = { + 0.0, + 0.0, + -1.00000000000000e+00, + -7.07106781186547e-01, + -3.82683432365090e-01, + -1.95090322016128e-01, + -9.80171403295606e-02, + -4.90676743274180e-02, + -2.45412285229123e-02, + -1.22715382857199e-02, + -6.13588464915448e-03, + -3.06795676296598e-03, + -1.53398018628477e-03, + -7.66990318742704e-04, + -3.83495187571396e-04, + -1.91747597310703e-04, + -9.58737990959773e-05, + -4.79368996030669e-05, + -2.39684498084182e-05, + -1.19842249050697e-05, + -5.99211245264243e-06, + -2.99605622633466e-06, + -1.49802811316901e-06, + -7.49014056584716e-07, + -3.74507028292384e-07, + -1.87253514146195e-07, + -9.36267570730981e-08, + -4.68133785365491e-08, + -2.34066892682746e-08, + -1.17033446341373e-08, + -5.85167231706864e-09, + -2.92583615853432e-09 + }; +}; + +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && !defined(__GCUDACC__) + +template<typename OutputScalar, typename RealScalar, typename ComplexScalar, int ResultType> +struct writeToDeviceData { + void operator()(OutputScalar* d_data, ComplexScalar* data_buf, size_t size) { + } +}; + +template<typename OutputScalar, typename RealScalar, typename ComplexScalar> +struct writeToDeviceData<OutputScalar, RealScalar, ComplexScalar, Eigen::BothParts> { + void operator()(OutputScalar* d_data, ComplexScalar* data_buf, size_t size) { + cudaMemcpy(d_data, data_buf, size * sizeof(ComplexScalar), cudaMemcpyDeviceToDevice); + } +}; + +template<typename OutputScalar, typename RealScalar, typename ComplexScalar> +struct writeToDeviceData<OutputScalar, RealScalar, ComplexScalar, Eigen::RealPart> { + void operator()(OutputScalar* d_data, ComplexScalar* data_buf, size_t size) { + cudaMemcpy2D(d_data, sizeof(RealScalar), (RealScalar*) data_buf, 2 * sizeof(RealScalar), sizeof(RealScalar), size, cudaMemcpyDeviceToDevice); + } +}; + +template<typename OutputScalar, typename RealScalar, typename ComplexScalar> +struct writeToDeviceData<OutputScalar, RealScalar, ComplexScalar, Eigen::ImagPart> { + void operator()(OutputScalar* d_data, ComplexScalar* data_buf, size_t size) { + RealScalar* data_buf_offset = &(((RealScalar*) data_buf)[1]); + cudaMemcpy2D(d_data, sizeof(RealScalar), 
data_buf_offset, 2 * sizeof(RealScalar), sizeof(RealScalar), size, cudaMemcpyDeviceToDevice); + } +}; + +template <typename InputScalar, typename RealScalar, typename ComplexScalar, typename InputEvaluator> +__global__ void copyValues(ComplexScalar* d_data, InputEvaluator eval, int total_size) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < total_size) { + d_data[i] = MakeComplex<internal::is_same<InputScalar, RealScalar>::value>()(eval.coeff(i)); + } +} + +template<typename Scalar, typename Index, int NumDims> +__global__ void fillLineBuf(Scalar* line_buf, Scalar* data_buf, int line_len, + array<Index, NumDims> coords, array<Index, NumDims> m_strides, int dim) { + int j = blockIdx.x * blockDim.x + threadIdx.x; + if(j < line_len) { + coords[dim] = j; + Index index = 0; + for (int i = 0; i < NumDims; ++i) { + index += coords[i] * m_strides[i]; + } + line_buf[j] = data_buf[index]; + } +} + +template<typename ComplexScalar, typename RealScalar, typename Index, int NumDims> +__global__ void writebackLineBuf(ComplexScalar* line_buf, ComplexScalar* data_buf, int line_len, + array<Index, NumDims> coords, array<Index, NumDims> m_strides, int dim, RealScalar div_factor) { + int j = blockIdx.x * blockDim.x + threadIdx.x; + if(j < line_len) { + coords[dim] = j; + Index index = 0; + for (int i = 0; i < NumDims; ++i) { + index += coords[i] * m_strides[i]; + } + + data_buf[index] = line_buf[j]; + ((RealScalar*) data_buf)[2*index] /= div_factor; + ((RealScalar*) data_buf)[2*index + 1] /= div_factor; + } +} + +template <typename FFT, typename ArgType, int FFTResultType, int FFTDir> +struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, GpuDevice> { + typedef TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, GpuDevice>::Dimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::Scalar InputScalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename std::complex<RealScalar> ComplexScalar; + typedef typename internal::conditional<FFTResultType == Eigen::BothParts, std::complex<RealScalar>, RealScalar>::type OutputScalar; + typedef typename TensorEvaluator<ArgType, GpuDevice>::Dimensions InputDimensions; + typedef OutputScalar CoeffReturnType; + typedef typename PacketType<OutputScalar, GpuDevice>::type PacketReturnType; + + enum { + IsAligned = false, + PacketAccess = false, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, GpuDevice>::Layout, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const GpuDevice& device) : m_data_buf(NULL), m_impl(op.expression(), device), m_fft(op.fft()) { + const typename TensorEvaluator<ArgType, GpuDevice>::Dimensions& input_dims = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + eigen_assert(input_dims[i] > 0); + m_dimensions[i] = input_dims[i]; + } + + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; + } + } else { + m_strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1]; + } + } + m_size = m_dimensions.TotalSize(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { + return m_dimensions; + } + + EIGEN_STRONG_INLINE bool 
evalSubExprsIfNeeded(OutputScalar* d_data) { + m_impl.evalSubExprsIfNeeded(NULL); + if (d_data) { + evalToDeviceData(d_data); + return false; + } else { + evalToSelfDataBuf(); + return true; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getIndexFromCoords(const array<Index, NumDims> & coords) const { + Index result = 0; + for (int i = 0; i < NumDims; ++i) { + result += coords[i] * m_strides[i]; + } + return result; + } + + EIGEN_STRONG_INLINE array<Index, NumDims> getPartialCoordsFromIndex(Index index, Index omitted_dim) const { + array<Index, NumDims> partial_m_strides = m_strides; + array<Index, NumDims> index_coords; + + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (Index i = omitted_dim + 1; i < NumDims; ++i) { + partial_m_strides[i] /= m_dimensions[omitted_dim]; + } + for (int i = NumDims - 1; i > 0; --i) { + if(omitted_dim == i) { + } + else { + const Index idx = index / partial_m_strides[i]; + index -= idx * partial_m_strides[i]; + index_coords[i] = idx; + } + } + index_coords[0] = index; + } + else { + for (Index i = omitted_dim - 1; i >= 0; --i) { + partial_m_strides[i] /= m_dimensions[omitted_dim]; + } + for (int i = 0; i < NumDims - 1; ++i) { + if(omitted_dim == i) { + } + else { + const Index idx = index / partial_m_strides[i]; + index -= idx * partial_m_strides[i]; + index_coords[i] = idx; + } + } + index_coords[NumDims - 1] = index; + } + // Value of index_coords[omitted_dim] is not determined to this step + return index_coords; + } + + void evalToSelfDataBuf() { + cudaMalloc((void**) &m_data_buf, sizeof(OutputScalar) * m_size); + evalToDeviceData(m_data_buf); + } + + EIGEN_STRONG_INLINE void evalToDeviceData(OutputScalar* d_data) { + ComplexScalar* data_buf; + cudaMalloc((void**) &data_buf, sizeof(ComplexScalar) * m_size); + + int block_size = 128; + int grid_size = m_size / block_size + 1; + + copyValues<InputScalar, RealScalar, ComplexScalar, TensorEvaluator<ArgType, GpuDevice> > <<<grid_size, block_size>>>(data_buf, m_impl, m_size); + + for (int i = 0; i < m_fft.size(); ++i) { + int dim = m_fft[i]; + eigen_assert(dim >= 0 && dim < NumDims); + int line_len = m_dimensions[dim]; + ComplexScalar* line_buf; + cudaMalloc((void**) &line_buf, sizeof(ComplexScalar) * line_len); + + cufftHandle plan; + cufftPlan1d(&plan, line_len, CUFFT_C2C, 1); + + for (Index partial_index = 0; partial_index < m_size/line_len; ++partial_index) { + array<Index, NumDims> coords = getPartialCoordsFromIndex(partial_index, dim); + // get data into line_buf + int block_size = 128; + int grid_size = line_len / block_size + 1; + fillLineBuf<ComplexScalar, Index, NumDims> <<<grid_size, block_size>>>(line_buf, data_buf, line_len, coords, m_strides, dim); + + if(FFTDir == Eigen::FFT_FORWARD) { + cufftExecC2C(plan, reinterpret_cast<cufftComplex *>(line_buf), reinterpret_cast<cufftComplex*>(line_buf), CUFFT_FORWARD); + } + else { + cufftExecC2C(plan, reinterpret_cast<cufftComplex*>(line_buf), reinterpret_cast<cufftComplex*>(line_buf), CUFFT_INVERSE); + } + // write back + RealScalar div_factor = (FFTDir == FFT_FORWARD) ? 
1.0 : line_len; + writebackLineBuf<ComplexScalar, RealScalar, Index, NumDims> <<<grid_size, block_size>>>(line_buf, data_buf, line_len, coords, m_strides, dim, div_factor); + cudaDeviceSynchronize(); + + } + cufftDestroy(plan); + cudaFree(line_buf); + } + writeToDeviceData<OutputScalar, RealScalar, ComplexScalar, FFTResultType>()(d_data, data_buf, m_size); + cudaFree(data_buf); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + if(m_data_buf != NULL) cudaFree(m_data_buf); + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const { + return m_data_buf[index]; + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const { + return internal::ploadt<PacketReturnType, LoadMode>(m_data_buf + index); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return m_data_buf; } + + protected: + int m_size; + const FFT& m_fft; + Dimensions m_dimensions; + array<Index, NumDims> m_strides; + TensorEvaluator<ArgType, GpuDevice> m_impl; + OutputScalar* m_data_buf; + +}; +#endif + +} // end namespace Eigen +#endif //EIGEN_CXX11_TENSOR_TENSOR_FFT_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h new file mode 100644 index 0000000000..a7af67230f --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -0,0 +1,277 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H +#define EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H + +namespace Eigen { + +/** \class TensorFixedSize + * \ingroup CXX11_Tensor_Module + * + * \brief The fixed sized version of the tensor class. + * + * The fixed sized equivalent of + * Eigen::Tensor<float, 3> t(3, 5, 7); + * is + * Eigen::TensorFixedSize<float, Sizes<3,5,7>> t; + */ + +template<typename Scalar_, typename Dimensions_, int Options_, typename IndexType> +class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, Options_, IndexType> > +{ + public: + typedef TensorFixedSize<Scalar_, Dimensions_, Options_, IndexType> Self; + typedef TensorBase<TensorFixedSize<Scalar_, Dimensions_, Options_, IndexType> > Base; + typedef typename Eigen::internal::nested<Self>::type Nested; + typedef typename internal::traits<Self>::StorageKind StorageKind; + typedef typename internal::traits<Self>::Index Index; + typedef Scalar_ Scalar; + typedef typename internal::packet_traits<Scalar>::type Packet; + typedef typename NumTraits<Scalar>::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + + static const int Options = Options_; + + enum { + IsAligned = bool(EIGEN_ALIGN), + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + BlockAccess = false, + Layout = Options_ & RowMajor ? 
RowMajor : ColMajor, + CoordAccess = true, + }; + + typedef Dimensions_ Dimensions; + static const std::size_t NumIndices = Dimensions::count; + + protected: + TensorStorage<Scalar, Dimensions, Options> m_storage; + + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } + + // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + // work, because that uses base().coeffRef() - and we don't yet + // implement a similar class hierarchy + inline Self& base() { return *this; } + inline const Self& base() const { return *this; } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + EIGEN_DEVICE_FUNC inline const Scalar& coeff(Index firstIndex, IndexTypes... otherIndices) const + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeff(array<Index, NumIndices>{{firstIndex, otherIndices...}}); + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& coeff(const array<Index, NumIndices>& indices) const + { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& coeff() const + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return m_storage.data()[0]; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + inline Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices) + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(array<Index, NumIndices>{{firstIndex, otherIndices...}}); + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices) + { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef() + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return m_storage.data()[0]; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) + { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + inline const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
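+      // For example, a TensorFixedSize<float, Sizes<3,5,7>> must be addressed with
+      // exactly three indices: t(1, 2, 3) is valid, while t(1, 2) trips this assertion.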
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return this->operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}}); + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const + { + eigen_assert(checkIndexRange(indices)); + return coeff(indices); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()() const + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeff(); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return coeff(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const + { + // The bracket operator is only for vectors, use the parenthesis operator instead. + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeff(index); + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + inline Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}}); + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices) + { + eigen_assert(checkIndexRange(indices)); + return coeffRef(indices); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()() + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeffRef(); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index index) + { + eigen_assert(index >= 0 && index < size()); + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator[](Index index) + { + // The bracket operator is only for vectors, use the parenthesis operator instead + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize() { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorFixedSize(const Self& other) + : m_storage(other.m_storage) + { + } + +#ifdef EIGEN_HAVE_RVALUE_REFERENCES + inline TensorFixedSize(Self&& other) + : m_storage(other.m_storage) + { + } +#endif + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorFixedSize(const TensorBase<OtherDerived, ReadOnlyAccessors>& other) + { + typedef TensorAssignOp<TensorFixedSize, const OtherDerived> Assign; + Assign assign(*this, other.derived()); + internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); + } + + template<typename Other> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorFixedSize& operator=(const Other& other) + { + // FIXME: check that the dimensions of other match the dimensions of *this. + // Unfortunately this isn't possible yet when the rhs is an expression. 
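+      // The assignment is expressed as a TensorAssignOp and handed to the
+      // TensorExecutor, which evaluates the right-hand side directly into this
+      // tensor's fixed-size storage on the default (single-threaded) device.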
+ typedef TensorAssignOp<Self, const Other> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); + return *this; + } + + protected: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE bool checkIndexRange(const array<Index, NumIndices>& /*indices*/) const + { + using internal::array_apply_and_reduce; + using internal::array_zip_and_reduce; + using internal::greater_equal_zero_op; + using internal::logical_and_op; + using internal::lesser_op; + + return true; + // check whether the indices are all >= 0 + /* array_apply_and_reduce<logical_and_op, greater_equal_zero_op>(indices) && + // check whether the indices fit in the dimensions + array_zip_and_reduce<logical_and_op, lesser_op>(indices, m_storage.dimensions());*/ + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index linearizedIndex(const array<Index, NumIndices>& indices) const + { + if (Options&RowMajor) { + return m_storage.dimensions().IndexOfRowMajor(indices); + } else { + return m_storage.dimensions().IndexOfColMajor(indices); + } + } +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h new file mode 100644 index 0000000000..1d1ce47174 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -0,0 +1,150 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H +#define EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H + +namespace Eigen { + +/** \class TensorForcedEval + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. + * + * + */ +namespace internal { +template<typename XprType> +struct traits<TensorForcedEvalOp<XprType> > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. 
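+  // In practice these traits simply forward the scalar type, storage kind, index
+  // type, rank and layout of the wrapped expression: forcing evaluation changes
+  // where the result lives, not what it looks like.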
+ typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename traits<XprType>::StorageKind StorageKind; + typedef typename traits<XprType>::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + + enum { + Flags = 0, + }; +}; + +template<typename XprType> +struct eval<TensorForcedEvalOp<XprType>, Eigen::Dense> +{ + typedef const TensorForcedEvalOp<XprType>& type; +}; + +template<typename XprType> +struct nested<TensorForcedEvalOp<XprType>, 1, typename eval<TensorForcedEvalOp<XprType> >::type> +{ + typedef TensorForcedEvalOp<XprType> type; +}; + +} // end namespace internal + + + +template<typename XprType> +class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename Eigen::internal::nested<TensorForcedEvalOp>::type Nested; + typedef typename Eigen::internal::traits<TensorForcedEvalOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorForcedEvalOp(const XprType& expr) + : m_xpr(expr) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; +}; + + +template<typename ArgType, typename Device> +struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device> +{ + typedef TensorForcedEvalOp<ArgType> XprType; + typedef typename ArgType::Scalar Scalar; + typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; + + enum { + IsAligned = true, + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + m_impl.evalSubExprsIfNeeded(NULL); + const Index numValues = m_impl.dimensions().TotalSize(); + m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType)); + // Should initialize the memory in case we're dealing with non POD types. 
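+    // Non-arithmetic coefficient types are default-constructed in place below,
+    // since the buffer returned by the device allocator is raw, uninitialized memory.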
+ if (!internal::is_arithmetic<CoeffReturnType>::value) { + for (Index i = 0; i < numValues; ++i) { + new(m_buffer+i) CoeffReturnType(); + } + } + typedef TensorEvalToOp<const ArgType> EvalTo; + EvalTo evalToTmp(m_buffer, m_op); + const bool PacketAccess = internal::IsVectorizable<Device, ArgType>::value; + const bool BlockAccess = false; + internal::TensorExecutor<const EvalTo, Device, PacketAccess, BlockAccess>::run(evalToTmp, m_device); + m_impl.cleanup(); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_device.deallocate(m_buffer); + m_buffer = NULL; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_buffer[index]; + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return m_buffer; } + + private: + TensorEvaluator<ArgType, Device> m_impl; + const ArgType m_op; + const Device& m_device; + CoeffReturnType* m_buffer; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h new file mode 100644 index 0000000000..e11d5ed22e --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -0,0 +1,104 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
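+// Editorial note (not part of the original source): TensorForcedEvalOp, defined in
+// the header above, is normally reached through TensorBase::eval(). A minimal
+// usage sketch, assuming that entry point and the dynamic-size Tensor class:
+//
+//   Eigen::Tensor<float, 2> a(100, 100), b(100, 100);
+//   a.setRandom(); b.setRandom();
+//   // Materialize a + b into a temporary buffer before the scaling is evaluated.
+//   Eigen::Tensor<float, 2> c = (a + b).eval() * 0.5f;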
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H +#define EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H + +namespace Eigen { + +template<typename Scalar_, std::size_t NumIndices_, int Options_ = 0, typename IndexType = DenseIndex> class Tensor; +template<typename Scalar_, typename Dimensions, int Options_ = 0, typename IndexType = DenseIndex> class TensorFixedSize; +template<typename Scalar_, int Options_ = 0, typename IndexType = DenseIndex> class TensorVarDim; +template<typename PlainObjectType, int Options_ = Unaligned> class TensorMap; +template<typename PlainObjectType> class TensorRef; +template<typename Derived, int AccessLevel = internal::accessors_level<Derived>::value> class TensorBase; + +template<typename NullaryOp, typename PlainObjectType> class TensorCwiseNullaryOp; +template<typename UnaryOp, typename XprType> class TensorCwiseUnaryOp; +template<typename BinaryOp, typename LeftXprType, typename RightXprType> class TensorCwiseBinaryOp; +template<typename IfXprType, typename ThenXprType, typename ElseXprType> class TensorSelectOp; +template<typename Op, typename Dims, typename XprType> class TensorReductionOp; +template<typename XprType> class TensorIndexTupleOp; +template<typename ReduceOp, typename Dims, typename XprType> class TensorTupleReducerOp; +template<typename Axis, typename LeftXprType, typename RightXprType> class TensorConcatenationOp; +template<typename Dimensions, typename LeftXprType, typename RightXprType> class TensorContractionOp; +template<typename TargetType, typename XprType> class TensorConversionOp; +template<typename Dimensions, typename InputXprType, typename KernelXprType> class TensorConvolutionOp; +template<typename Dimensions, typename InputXprType, typename KernelXprType> class TensorConvolutionByFFTOp; +template<typename FFT, typename XprType, int FFTDataType, int FFTDirection> class TensorFFTOp; +template<typename IFFT, typename XprType, int ResultType> class TensorIFFTOp; +template<typename DFT, typename XprType, int ResultType> class TensorDFTOp; +template<typename IDFT, typename XprType, int ResultType> class TensorIDFTOp; +template<typename PatchDim, typename XprType> class TensorPatchOp; +template<DenseIndex Rows, DenseIndex Cols, typename XprType> class TensorImagePatchOp; +template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType> class TensorVolumePatchOp; +template<typename Broadcast, typename XprType> class TensorBroadcastingOp; +template<DenseIndex DimId, typename XprType> class TensorChippingOp; +template<typename NewDimensions, typename XprType> class TensorReshapingOp; +template<typename XprType> class TensorLayoutSwapOp; +template<typename StartIndices, typename Sizes, typename XprType> class TensorSlicingOp; +template<typename ReverseDimensions, typename XprType> class TensorReverseOp; +template<typename XprType> class TensorTrueIndicesOp; +template<typename PaddingDimensions, typename XprType> class TensorPaddingOp; +template<typename Shuffle, typename XprType> class TensorShufflingOp; +template<typename Strides, typename XprType> class TensorStridingOp; +template<typename Strides, typename XprType> class TensorInflationOp; +template<typename Generator, typename XprType> class TensorGeneratorOp; +template<typename LeftXprType, typename RightXprType> class TensorAssignOp; + +template<typename CustomUnaryFunc, typename XprType> class TensorCustomUnaryOp; +template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> class TensorCustomBinaryOp; + +template<typename XprType> class 
TensorEvalToOp; +template<typename XprType> class TensorForcedEvalOp; + +template<typename ExpressionType, typename DeviceType> class TensorDevice; +template<typename Derived, typename Device> struct TensorEvaluator; + +class DefaultDevice; +class ThreadPoolDevice; +class GpuDevice; + +enum DFTResultType { + RealPart = 0, + ImagPart = 1, + BothParts = 2 +}; + +enum FFTDirection { + FFT_FORWARD = 0, + FFT_REVERSE = 1 +}; + +namespace internal { +template <typename Device, typename Expression> +struct IsVectorizable { + static const bool value = TensorEvaluator<Expression, Device>::PacketAccess; +}; + +template <typename Expression> +struct IsVectorizable<GpuDevice, Expression> { + static const bool value = TensorEvaluator<Expression, GpuDevice>::PacketAccess && + TensorEvaluator<Expression, GpuDevice>::IsAligned; +}; + +template <typename Device, typename Expression> +struct IsTileable { + static const bool value = TensorEvaluator<Expression, Device>::BlockAccess; +}; + +template <typename Expression, typename Device, + bool Vectorizable = IsVectorizable<Device, Expression>::value, + bool Tileable = IsTileable<Device, Expression>::value> +class TensorExecutor; +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h new file mode 100644 index 0000000000..526301ad5b --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -0,0 +1,706 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
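+// Editorial note (not part of the original source): the reducers defined below share
+// one protocol, driven by the reduction evaluator. A rough sketch of the scalar path:
+//
+//   Eigen::internal::SumReducer<float> reducer;
+//   float accum = reducer.initialize();              // 0 for a sum
+//   for (float v : values) reducer.reduce(v, &accum);
+//   float result = reducer.finalize(accum);          // identity for a sum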
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H +#define EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H + +namespace Eigen { +namespace internal { + +namespace { +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && defined(__CUDA_ARCH__) +__device__ int get_random_seed() { + return clock(); +} +#else +int get_random_seed() { +#ifdef _WIN32 + SYSTEMTIME st; + GetSystemTime(&st); + return st.wSecond + 1000 * st.wMilliseconds; +#elif __APPLE__ + return mach_absolute_time(); +#else + timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + return ts.tv_nsec; +#endif +} +#endif +} + + +// Standard reduction functors +template <typename T> struct SumReducer +{ + static const bool PacketAccess = true; + static const bool IsStateful = false; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + (*accum) += t; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = padd<Packet>(*accum, p); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return static_cast<T>(0); + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1<Packet>(0); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + return saccum + predux(vaccum); + } +}; + +template <typename T> struct MeanReducer +{ + static const bool PacketAccess = true; + static const bool IsStateful = true; + + MeanReducer() : scalarCount_(0), packetCount_(0) { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) { + (*accum) += t; + scalarCount_++; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) { + (*accum) = padd<Packet>(*accum, p); + packetCount_++; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return static_cast<T>(0); + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1<Packet>(0); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum / scalarCount_; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return pdiv(vaccum, pset1<Packet>(packetCount_)); + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + return (saccum + predux(vaccum)) / (scalarCount_ + packetCount_ * unpacket_traits<Packet>::size); + } + + protected: + int scalarCount_; + int packetCount_; +}; + +struct AndReducer +{ + static const bool PacketAccess = false; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const { + *accum = *accum && t; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const { + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const { + return accum; + } +}; + +struct OrReducer { + static const bool PacketAccess = false; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const { + *accum = *accum || t; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const { + return false; + } + 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const { + return accum; + } +}; + +template <typename T> struct MaxReducer +{ + static const bool PacketAccess = true; + static const bool IsStateful = false; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + if (t > *accum) { *accum = t; } + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = pmax<Packet>(*accum, p); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return Eigen::NumTraits<T>::lowest(); + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1<Packet>(initialize()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + return numext::maxi(saccum, predux_max(vaccum)); + } +}; + +template <typename T> struct MinReducer +{ + static const bool PacketAccess = true; + static const bool IsStateful = false; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + if (t < *accum) { *accum = t; } + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = pmin<Packet>(*accum, p); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return Eigen::NumTraits<T>::highest(); + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1<Packet>(initialize()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + return numext::mini(saccum, predux_min(vaccum)); + } +}; + + +template <typename T> struct ProdReducer +{ + static const bool PacketAccess = true; + static const bool IsStateful = false; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + (*accum) *= t; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = pmul<Packet>(*accum, p); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return static_cast<T>(1); + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1<Packet>(1); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + return saccum * predux_mul(vaccum); + } +}; + +#if !defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__) +// We're not compiling a cuda kernel +template <typename T> class UniformRandomGenerator { + + public: + static const bool PacketAccess 
= true; + + // Uses the given "seed" if non-zero, otherwise uses a random seed. + UniformRandomGenerator(unsigned int seed = 0) : m_seed(seed) { + seed = seed ? seed : get_random_seed(); + srand(seed); + } + UniformRandomGenerator(const UniformRandomGenerator& other) { + m_seed = other.m_seed; + } + + template<typename Index> + T operator()(Index, Index = 0) const { + return random<T>(); + } + template<typename Index> + typename internal::packet_traits<T>::type packetOp(Index i, Index j = 0) const { + const int packetSize = internal::packet_traits<T>::size; + EIGEN_ALIGN_DEFAULT T values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = random<T>(); + } + return internal::pload<typename internal::packet_traits<T>::type>(values); + } + + private: + unsigned int m_seed; +}; + +#if __cplusplus > 199711 +template <> class UniformRandomGenerator<float> { + public: + static const bool PacketAccess = true; + + // Uses the given "seed" if non-zero, otherwise uses a random seed. + UniformRandomGenerator(unsigned int seed = 0) : m_seed(seed) { + seed = seed ? seed : get_random_seed(); + m_generator.seed(seed); + } + UniformRandomGenerator(const UniformRandomGenerator<float>& other) { + m_generator.seed(other(0, 0) * UINT_MAX); + m_seed = other.m_seed; + } + + template<typename Index> + float operator()(Index, Index = 0) const { + return m_distribution(m_generator); + } + template<typename Index> + typename internal::packet_traits<float>::type packetOp(Index i, Index j = 0) const { + const int packetSize = internal::packet_traits<float>::size; + EIGEN_ALIGN_DEFAULT float values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = this->operator()(i, j); + } + return internal::pload<typename internal::packet_traits<float>::type>(values); + } + + private: + UniformRandomGenerator& operator = (const UniformRandomGenerator&); + // Make sure m_seed comes first to match the layout of the cpu + // version of the code. + unsigned int m_seed; + mutable std::mt19937 m_generator; + mutable std::uniform_real_distribution<float> m_distribution; +}; + +template <> class UniformRandomGenerator<double> { + public: + static const bool PacketAccess = true; + + // Uses the given "seed" if non-zero, otherwise uses a random seed. + UniformRandomGenerator(unsigned int seed = 0) : m_seed(seed) { + seed = seed ? seed : get_random_seed(); + m_generator.seed(seed); + } + UniformRandomGenerator(const UniformRandomGenerator<double>& other) { + m_generator.seed(other(0, 0) * UINT_MAX); + m_seed = other.m_seed; + } + + template<typename Index> + double operator()(Index, Index = 0) const { + return m_distribution(m_generator); + } + template<typename Index> + typename internal::packet_traits<double>::type packetOp(Index i, Index j = 0) const { + const int packetSize = internal::packet_traits<double>::size; + EIGEN_ALIGN_DEFAULT double values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = this->operator()(i, j); + } + return internal::pload<typename internal::packet_traits<double>::type>(values); + } + + private: + UniformRandomGenerator& operator = (const UniformRandomGenerator&); + // Make sure m_seed comes first to match the layout of the cpu + // version of the code. 
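+  // (Presumably so that the generator object can be copied bit-for-bit between the
+  // host and device instantiations of this template.)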
+ unsigned int m_seed; + mutable std::mt19937 m_generator; + mutable std::uniform_real_distribution<double> m_distribution; +}; +#endif + +#else + +// We're compiling a cuda kernel +template <typename T> class UniformRandomGenerator; + +template <> class UniformRandomGenerator<float> { + public: + static const bool PacketAccess = true; + + // Uses the given "seed" if non-zero, otherwise uses a random seed. + __device__ UniformRandomGenerator(unsigned int seed = 0) : m_seed(seed) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + seed = seed ? seed : get_random_seed(); + curand_init(seed, tid, 0, &m_state); + } + + __device__ UniformRandomGenerator(const UniformRandomGenerator& other) { + m_seed = other.m_seed; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int seed = m_seed ? m_seed : get_random_seed(); + curand_init(seed, tid, 0, &m_state); + } + + template<typename Index> + __device__ float operator()(Index, Index = 0) const { + return curand_uniform(&m_state); + } + template<typename Index> + __device__ float4 packetOp(Index, Index = 0) const { + return curand_uniform4(&m_state); + } + + private: + unsigned int m_seed; + mutable curandStatePhilox4_32_10_t m_state; +}; + +template <> class UniformRandomGenerator<double> { + public: + static const bool PacketAccess = true; + + // Uses the given "seed" if non-zero, otherwise uses a random seed. + __device__ UniformRandomGenerator(unsigned int seed = 0) : m_seed(seed) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + seed = seed ? seed : get_random_seed(); + curand_init(seed, tid, 0, &m_state); + } + __device__ UniformRandomGenerator(const UniformRandomGenerator& other) { + m_seed = other.m_seed; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int seed = m_seed ? m_seed : get_random_seed(); + curand_init(seed, tid, 0, &m_state); + } + template<typename Index> + __device__ double operator()(Index, Index = 0) const { + return curand_uniform_double(&m_state); + } + template<typename Index> + __device__ double2 packetOp(Index, Index = 0) const { + return curand_uniform2_double(&m_state); + } + + private: + unsigned int m_seed; + mutable curandStatePhilox4_32_10_t m_state; +}; + +template <> class UniformRandomGenerator<std::complex<float> > { + public: + static const bool PacketAccess = false; + + // Uses the given "seed" if non-zero, otherwise uses a random seed. + __device__ UniformRandomGenerator(unsigned int seed = 0) : m_seed(seed) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + seed = seed ? seed : get_random_seed(); + curand_init(seed, tid, 0, &m_state); + } + __device__ UniformRandomGenerator(const UniformRandomGenerator& other) { + m_seed = other.m_seed; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int seed = m_seed ? m_seed : get_random_seed(); + curand_init(seed, tid, 0, &m_state); + } + template<typename Index> + __device__ std::complex<float> operator()(Index, Index = 0) const { + float4 vals = curand_uniform4(&m_state); + return std::complex<float>(vals.x, vals.y); + } + + private: + unsigned int m_seed; + mutable curandStatePhilox4_32_10_t m_state; +}; + +template <> class UniformRandomGenerator<std::complex<double> > { + public: + static const bool PacketAccess = false; + + // Uses the given "seed" if non-zero, otherwise uses a random seed. + __device__ UniformRandomGenerator(unsigned int seed = 0) : m_seed(seed) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + seed = seed ? 
seed : get_random_seed(); + curand_init(seed, tid, 0, &m_state); + } + __device__ UniformRandomGenerator(const UniformRandomGenerator& other) { + m_seed = other.m_seed; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int seed = m_seed ? m_seed : get_random_seed(); + curand_init(seed, tid, 0, &m_state); + } + template<typename Index> + __device__ std::complex<double> operator()(Index, Index = 0) const { + double2 vals = curand_uniform2_double(&m_state); + return std::complex<double>(vals.x, vals.y); + } + + private: + unsigned int m_seed; + mutable curandStatePhilox4_32_10_t m_state; +}; + +#endif + + +#if (!defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)) && __cplusplus > 199711 +// We're not compiling a cuda kernel +template <typename T> class NormalRandomGenerator { + public: + static const bool PacketAccess = true; + + // Uses the given "seed" if non-zero, otherwise uses a random seed. + NormalRandomGenerator(unsigned int seed = 0) : m_distribution(0, 1), m_seed(seed) { + seed = seed ? seed : get_random_seed(); + m_generator.seed(seed); + } + NormalRandomGenerator(const NormalRandomGenerator& other) + : m_distribution(other.m_distribution), m_seed(other.m_seed) { + m_generator.seed(other(0, 0) * UINT_MAX); + } + + template<typename Index> + T operator()(Index, Index = 0) const { + return m_distribution(m_generator); + } + template<typename Index> + typename internal::packet_traits<T>::type packetOp(Index, Index = 0) const { + const int packetSize = internal::packet_traits<T>::size; + EIGEN_ALIGN_DEFAULT T values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = m_distribution(m_generator); + } + return internal::pload<typename internal::packet_traits<T>::type>(values); + } + + private: + unsigned int m_seed; + mutable std::normal_distribution<T> m_distribution; + mutable std::mt19937 m_generator; +}; + +#elif defined (EIGEN_USE_GPU) && defined(__CUDACC__) && defined(__CUDA_ARCH__) + +// We're compiling a cuda kernel +template <typename T> class NormalRandomGenerator; + +template <> class NormalRandomGenerator<float> { + public: + static const bool PacketAccess = true; + + // Uses the given "seed" if non-zero, otherwise uses a random seed. + __device__ NormalRandomGenerator(unsigned int seed = 0) : m_seed(seed) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + seed = seed ? seed : get_random_seed(); + curand_init(seed, tid, 0, &m_state); + } + __device__ NormalRandomGenerator(const NormalRandomGenerator<float>& other) { + m_seed = other.m_seed; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int seed = m_seed ? m_seed : get_random_seed(); + curand_init(seed, tid, 0, &m_state); + } + template<typename Index> + __device__ float operator()(Index, Index = 0) const { + return curand_normal(&m_state); + } + template<typename Index> + __device__ float4 packetOp(Index, Index = 0) const { + return curand_normal4(&m_state); + } + + private: + unsigned int m_seed; + mutable curandStatePhilox4_32_10_t m_state; +}; + +template <> class NormalRandomGenerator<double> { + public: + static const bool PacketAccess = true; + + // Uses the given "seed" if non-zero, otherwise uses a random seed. + __device__ NormalRandomGenerator(unsigned int seed = 0) : m_seed(seed) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + seed = seed ? 
seed : get_random_seed(); + curand_init(seed, tid, 0, &m_state); + } + __device__ NormalRandomGenerator(const NormalRandomGenerator<double>& other) { + m_seed = other.m_seed; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int seed = m_seed ? m_seed : get_random_seed(); + curand_init(seed, tid, 0, &m_state); + } + template<typename Index> + __device__ double operator()(Index, Index = 0) const { + return curand_normal_double(&m_state); + } + template<typename Index> + __device__ double2 packetOp(Index, Index = 0) const { + return curand_normal2_double(&m_state); + } + + private: + unsigned int m_seed; + mutable curandStatePhilox4_32_10_t m_state; +}; + + +template <> class NormalRandomGenerator<std::complex<float> > { + public: + static const bool PacketAccess = false; + + // Uses the given "seed" if non-zero, otherwise uses a random seed. + __device__ NormalRandomGenerator(unsigned int seed = 0) : m_seed(seed) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + seed = seed ? seed : get_random_seed(); + curand_init(seed, tid, 0, &m_state); + } + __device__ NormalRandomGenerator(const NormalRandomGenerator& other) { + m_seed = other.m_seed; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int seed = m_seed ? m_seed : get_random_seed(); + curand_init(seed, tid, 0, &m_state); + } + template<typename Index> + __device__ std::complex<float> operator()(Index, Index = 0) const { + float4 vals = curand_normal4(&m_state); + return std::complex<float>(vals.x, vals.y); + } + + private: + unsigned int m_seed; + mutable curandStatePhilox4_32_10_t m_state; +}; + +template <> class NormalRandomGenerator<std::complex<double> > { + public: + static const bool PacketAccess = false; + + // Uses the given "seed" if non-zero, otherwise uses a random seed. + __device__ NormalRandomGenerator(unsigned int seed = 0) : m_seed(seed) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + seed = seed ? seed : get_random_seed(); + curand_init(seed, tid, 0, &m_state); + } + __device__ NormalRandomGenerator(const NormalRandomGenerator& other) { + m_seed = other.m_seed; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int seed = m_seed ? m_seed : get_random_seed(); + curand_init(seed, tid, 0, &m_state); + } + template<typename Index> + __device__ std::complex<double> operator()(Index, Index = 0) const { + double2 vals = curand_normal2_double(&m_state); + return std::complex<double>(vals.x, vals.y); + } + + private: + unsigned int m_seed; + mutable curandStatePhilox4_32_10_t m_state; +}; +#else + +template <typename T> class NormalRandomGenerator { + public: + // Uses the given "seed" if non-zero, otherwise uses a random seed. 
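+  // Note that this fallback only records the seed: without C++11 <random> and
+  // outside CUDA kernels no distribution is available, so this generator is a
+  // placeholder rather than a usable source of normally distributed values.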
+ NormalRandomGenerator(unsigned int seed = 0) : m_seed(seed) {} + + private: + unsigned int m_seed; +}; + +#endif + + +template <typename T, typename Index, size_t NumDims> +class GaussianGenerator { + public: + static const bool PacketAccess = false; + + EIGEN_DEVICE_FUNC GaussianGenerator(const array<T, NumDims>& means, + const array<T, NumDims>& std_devs) + : m_means(means) { + for (int i = 0; i < NumDims; ++i) { + m_two_sigmas[i] = std_devs[i] * std_devs[i] * 2; + } + } + + T operator()(const array<Index, NumDims>& coordinates) const { + T tmp = T(0); + for (int i = 0; i < NumDims; ++i) { + T offset = coordinates[i] - m_means[i]; + tmp += offset * offset / m_two_sigmas[i]; + } + return std::exp(-tmp); + } + + private: + array<T, NumDims> m_means; + array<T, NumDims> m_two_sigmas; +}; + +template <typename T> struct ArgMaxTupleReducer +{ + static const bool PacketAccess = false; + static const bool IsStateful = false; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + if (t.second > accum->second) { *accum = t; } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return T(0, NumTraits<typename T::second_type>::lowest()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const { + return accum; + } +}; + +template <typename T> struct ArgMinTupleReducer +{ + static const bool PacketAccess = false; + static const bool IsStateful = false; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T& t, T* accum) const { + if (t.second < accum->second) { *accum = t; } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return T(0, NumTraits<typename T::second_type>::highest()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const { + return accum; + } +}; + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h new file mode 100644 index 0000000000..91a73669a4 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -0,0 +1,185 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H + +namespace Eigen { + +/** \class TensorGenerator + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor generator class. 
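+ *
+ * Editorial illustration (assuming the generate() entry point exposed by the tensor
+ * base class): fill a tensor from a functor that is handed the coordinates of each
+ * coefficient.
+ * \code
+ * struct Iota {
+ *   float operator()(const Eigen::array<Eigen::DenseIndex, 2>& coords) const {
+ *     return static_cast<float>(coords[0] * 10 + coords[1]);
+ *   }
+ * };
+ * Eigen::Tensor<float, 2> t(10, 10);
+ * Eigen::Tensor<float, 2> filled = t.generate(Iota());
+ * \endcode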
+ * + * + */ +namespace internal { +template<typename Generator, typename XprType> +struct traits<TensorGeneratorOp<Generator, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename Generator, typename XprType> +struct eval<TensorGeneratorOp<Generator, XprType>, Eigen::Dense> +{ + typedef const TensorGeneratorOp<Generator, XprType>& type; +}; + +template<typename Generator, typename XprType> +struct nested<TensorGeneratorOp<Generator, XprType>, 1, typename eval<TensorGeneratorOp<Generator, XprType> >::type> +{ + typedef TensorGeneratorOp<Generator, XprType> type; +}; + +} // end namespace internal + + + +template<typename Generator, typename XprType> +class TensorGeneratorOp : public TensorBase<TensorGeneratorOp<Generator, XprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorGeneratorOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorGeneratorOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorGeneratorOp>::type Nested; + typedef typename Eigen::internal::traits<TensorGeneratorOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorGeneratorOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorGeneratorOp(const XprType& expr, const Generator& generator) + : m_xpr(expr), m_generator(generator) {} + + EIGEN_DEVICE_FUNC + const Generator& generator() const { return m_generator; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const Generator m_generator; +}; + + +// Eval as rvalue +template<typename Generator, typename ArgType, typename Device> +struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device> +{ + typedef TensorGeneratorOp<Generator, ArgType> XprType; + typedef typename XprType::Index Index; + typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; + static const int NumDims = internal::array_size<Dimensions>::value; + typedef typename XprType::Scalar Scalar; + + enum { + IsAligned = false, + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_generator(op.generator()) + { + TensorEvaluator<ArgType, Device> impl(op.expression(), device); + m_dimensions = impl.dimensions(); + + if (NumDims > 0) { + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; + } + } else { + m_strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1]; + } + } + } + } + + typedef typename 
XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + array<Index, NumDims> coords; + extract_coordinates(index, coords); + return m_generator(coords); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void extract_coordinates(Index index, array<Index, NumDims>& coords) const { + if (NumDims > 0) { + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_strides[i]; + index -= idx * m_strides[i]; + coords[i] = idx; + } + coords[0] = index; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_strides[i]; + index -= idx * m_strides[i]; + coords[i] = idx; + } + coords[NumDims-1] = index; + } + } + } + + Dimensions m_dimensions; + array<Index, NumDims> m_strides; + Generator m_generator; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h new file mode 100644 index 0000000000..53dc0b04aa --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h @@ -0,0 +1,56 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
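// A brief usage sketch for the stream operator defined just below: a minimal
// example, assuming a small fixed-size float tensor (setValues() needs
// variadic-template support, and the caller includes <iostream>).
//
//   Eigen::Tensor<float, 2> t(2, 3);
//   t.setValues({{0.f, 1.f, 2.f}, {3.f, 4.f, 5.f}});
//   std::cout << t << std::endl;        // rank 2: printed as a 2x3 matrix
//   std::cout << t.sum() << std::endl;  // expressions are force-evaluated first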
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_IO_H +#define EIGEN_CXX11_TENSOR_TENSOR_IO_H + +namespace Eigen { + +namespace internal { +template<> +struct significant_decimals_impl<std::string> + : significant_decimals_default_impl<std::string, true> +{}; +} + + +template <typename T> +std::ostream& operator << (std::ostream& os, const TensorBase<T, ReadOnlyAccessors>& expr) { + // Evaluate the expression if needed + TensorForcedEvalOp<const T> eval = expr.eval(); + TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice> tensor(eval, DefaultDevice()); + tensor.evalSubExprsIfNeeded(NULL); + + typedef typename internal::remove_const<typename T::Scalar>::type Scalar; + typedef typename T::Index Index; + typedef typename TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice>::Dimensions Dimensions; + const Index total_size = internal::array_prod(tensor.dimensions()); + + // Print the tensor as a 1d vector or a 2d matrix. + static const int rank = internal::array_size<Dimensions>::value; + if (rank == 0) { + os << tensor.coeff(0); + } else if (rank == 1) { + Map<const Array<Scalar, Dynamic, 1> > array(const_cast<Scalar*>(tensor.data()), total_size); + os << array; + } else { + const Index first_dim = tensor.dimensions()[0]; + static const int layout = TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice>::Layout; + Map<const Array<Scalar, Dynamic, Dynamic, layout> > matrix(const_cast<Scalar*>(tensor.data()), first_dim, total_size/first_dim); + os << matrix; + } + + // Cleanup. + tensor.cleanup(); + return os; +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_IO_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h new file mode 100644 index 0000000000..a1d33d964e --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -0,0 +1,757 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H +#define EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H + +namespace Eigen { + +/** \class TensorImagePatch + * \ingroup CXX11_Tensor_Module + * + * \brief Patch extraction specialized for image processing. + * This assumes that the input has at least 3 dimensions ordered as follows: + * 1st dimension: channels (of size d) + * 2nd dimension: rows (of size r) + * 3rd dimension: columns (of size c) + * There can be additional dimensions such as time (for video) or batch (for + * bulk processing) after the first 3. + * Calling the image patch code with patch_rows and patch_cols is equivalent + * to calling the regular patch extraction code with parameters d, patch_rows, + * patch_cols, and 1 for all the additional dimensions. 
+ */ +namespace internal { +template<DenseIndex Rows, DenseIndex Cols, typename XprType> +struct traits<TensorImagePatchOp<Rows, Cols, XprType> > : public traits<XprType> +{ + typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions + 1; + static const int Layout = XprTraits::Layout; +}; + +template<DenseIndex Rows, DenseIndex Cols, typename XprType> +struct eval<TensorImagePatchOp<Rows, Cols, XprType>, Eigen::Dense> +{ + typedef const TensorImagePatchOp<Rows, Cols, XprType>& type; +}; + +template<DenseIndex Rows, DenseIndex Cols, typename XprType> +struct nested<TensorImagePatchOp<Rows, Cols, XprType>, 1, typename eval<TensorImagePatchOp<Rows, Cols, XprType> >::type> +{ + typedef TensorImagePatchOp<Rows, Cols, XprType> type; +}; + +template <typename Self, bool Vectorizable> +struct ImagePatchCopyOp { + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::Impl Impl; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const Self& self, const Index num_coeff_to_copy, const Index dst_index, + Scalar* dst_data, const Index src_index) { + const Impl& impl = self.impl(); + for (Index i = 0; i < num_coeff_to_copy; ++i) { + dst_data[dst_index + i] = impl.coeff(src_index + i); + } + } +}; + +template <typename Self> +struct ImagePatchCopyOp<Self, true> { + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::Impl Impl; + typedef typename packet_traits<Scalar>::type Packet; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const Self& self, const Index num_coeff_to_copy, const Index dst_index, + Scalar* dst_data, const Index src_index) { + const Impl& impl = self.impl(); + const Index packet_size = internal::unpacket_traits<Packet>::size; + const Index vectorized_size = (num_coeff_to_copy / packet_size) * + packet_size; + for (Index i = 0; i < vectorized_size; i += packet_size) { + Packet p = impl.template packet<Unaligned>(src_index + i); + internal::pstoret<Scalar, Packet, Unaligned>(dst_data + dst_index + i, p); + } + for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) { + dst_data[dst_index + i] = impl.coeff(src_index + i); + } + } +}; + +template <typename Self> +struct ImagePatchPaddingOp { + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename packet_traits<Scalar>::type Packet; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const Index num_coeff_to_pad, const Scalar padding_value, + const Index dst_index, Scalar* dst_data) { + const Index packet_size = internal::unpacket_traits<Packet>::size; + const Packet padded_packet = internal::pset1<Packet>(padding_value); + const Index vectorized_size = (num_coeff_to_pad / packet_size) * + packet_size; + for (Index i = 0; i < vectorized_size; i += packet_size) { + internal::pstoret<Scalar, Packet, Unaligned>(dst_data + dst_index + i, + padded_packet); + } + for (Index i = vectorized_size; i < num_coeff_to_pad; ++i) { + dst_data[dst_index + i] = padding_value; + } + } +}; + +} // end namespace internal + +template<DenseIndex Rows, DenseIndex Cols, typename XprType> +class TensorImagePatchOp : public 
TensorBase<TensorImagePatchOp<Rows, Cols, XprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorImagePatchOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorImagePatchOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorImagePatchOp>::type Nested; + typedef typename Eigen::internal::traits<TensorImagePatchOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorImagePatchOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols, + DenseIndex row_strides, DenseIndex col_strides, + DenseIndex in_row_strides, DenseIndex in_col_strides, + DenseIndex row_inflate_strides, DenseIndex col_inflate_strides, + PaddingType padding_type, Scalar padding_value) + : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols), + m_row_strides(row_strides), m_col_strides(col_strides), + m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), + m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides), + m_padding_explicit(false), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0), + m_padding_type(padding_type), m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols, + DenseIndex row_strides, DenseIndex col_strides, + DenseIndex in_row_strides, DenseIndex in_col_strides, + DenseIndex row_inflate_strides, DenseIndex col_inflate_strides, + DenseIndex padding_top, DenseIndex padding_bottom, + DenseIndex padding_left, DenseIndex padding_right, + Scalar padding_value) + : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols), + m_row_strides(row_strides), m_col_strides(col_strides), + m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), + m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides), + m_padding_explicit(true), m_padding_top(padding_top), m_padding_bottom(padding_bottom), + m_padding_left(padding_left), m_padding_right(padding_right), + m_padding_type(PADDING_VALID), m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC + DenseIndex patch_rows() const { return m_patch_rows; } + EIGEN_DEVICE_FUNC + DenseIndex patch_cols() const { return m_patch_cols; } + EIGEN_DEVICE_FUNC + DenseIndex row_strides() const { return m_row_strides; } + EIGEN_DEVICE_FUNC + DenseIndex col_strides() const { return m_col_strides; } + EIGEN_DEVICE_FUNC + DenseIndex in_row_strides() const { return m_in_row_strides; } + EIGEN_DEVICE_FUNC + DenseIndex in_col_strides() const { return m_in_col_strides; } + EIGEN_DEVICE_FUNC + DenseIndex row_inflate_strides() const { return m_row_inflate_strides; } + EIGEN_DEVICE_FUNC + DenseIndex col_inflate_strides() const { return m_col_inflate_strides; } + EIGEN_DEVICE_FUNC + bool padding_explicit() const { return m_padding_explicit; } + EIGEN_DEVICE_FUNC + DenseIndex padding_top() const { return m_padding_top; } + EIGEN_DEVICE_FUNC + DenseIndex padding_bottom() const { return m_padding_bottom; } + EIGEN_DEVICE_FUNC + DenseIndex padding_left() const { return m_padding_left; } + EIGEN_DEVICE_FUNC + DenseIndex padding_right() const { return m_padding_right; } + EIGEN_DEVICE_FUNC + PaddingType padding_type() 
const { return m_padding_type; } + EIGEN_DEVICE_FUNC + Scalar padding_value() const { return m_padding_value; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const DenseIndex m_patch_rows; + const DenseIndex m_patch_cols; + const DenseIndex m_row_strides; + const DenseIndex m_col_strides; + const DenseIndex m_in_row_strides; + const DenseIndex m_in_col_strides; + const DenseIndex m_row_inflate_strides; + const DenseIndex m_col_inflate_strides; + const bool m_padding_explicit; + const DenseIndex m_padding_top; + const DenseIndex m_padding_bottom; + const DenseIndex m_padding_left; + const DenseIndex m_padding_right; + const PaddingType m_padding_type; + const Scalar m_padding_value; +}; + +// Eval as rvalue +template<DenseIndex Rows, DenseIndex Cols, typename ArgType, typename Device> +struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> +{ + typedef TensorImagePatchOp<Rows, Cols, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + static const int NumDims = NumInputDims + 1; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; + typedef TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, + Device> Self; + typedef TensorEvaluator<ArgType, Device> Impl; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = true, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = NumDims == 5, + }; + + typedef typename internal::TensorBlock<Index, Scalar, NumDims, Layout> + OutputTensorBlock; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + EIGEN_STATIC_ASSERT(NumDims >= 4, YOU_MADE_A_PROGRAMMING_MISTAKE); + + m_paddingValue = op.padding_value(); + + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + + // Caches a few variables. + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_inputDepth = input_dims[0]; + m_inputRows = input_dims[1]; + m_inputCols = input_dims[2]; + } else { + m_inputDepth = input_dims[NumInputDims-1]; + m_inputRows = input_dims[NumInputDims-2]; + m_inputCols = input_dims[NumInputDims-3]; + } + + m_row_strides = op.row_strides(); + m_col_strides = op.col_strides(); + + // Input strides and effective input/patch size + m_in_row_strides = op.in_row_strides(); + m_in_col_strides = op.in_col_strides(); + m_row_inflate_strides = op.row_inflate_strides(); + m_col_inflate_strides = op.col_inflate_strides(); + // The "effective" input rows and input cols are the input rows and cols + // after inflating them with zeros. + // For examples, a 2x3 matrix with row_inflate_strides and + // col_inflate_strides of 2 comes from: + // A B C + // D E F + // + // to a matrix is 3 x 5: + // + // A . B . C + // . . . . . + // D . E . 
F + + m_input_rows_eff = (m_inputRows - 1) * m_row_inflate_strides + 1; + m_input_cols_eff = (m_inputCols - 1) * m_col_inflate_strides + 1; + m_patch_rows_eff = op.patch_rows() + (op.patch_rows() - 1) * (m_in_row_strides - 1); + m_patch_cols_eff = op.patch_cols() + (op.patch_cols() - 1) * (m_in_col_strides - 1); + + if (op.padding_explicit()) { + m_outputRows = ceil((m_input_rows_eff + op.padding_top() + op.padding_bottom() - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides)); + m_outputCols = ceil((m_input_cols_eff + op.padding_left() + op.padding_right() - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides)); + m_rowPaddingTop = op.padding_top(); + m_colPaddingLeft = op.padding_left(); + } else { + // Computing padding from the type + switch (op.padding_type()) { + case PADDING_VALID: + m_outputRows = ceil((m_input_rows_eff - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides)); + m_outputCols = ceil((m_input_cols_eff - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides)); + // Calculate the padding + m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2; + m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2; + break; + case PADDING_SAME: + m_outputRows = ceil(m_input_rows_eff / static_cast<float>(m_row_strides)); + m_outputCols = ceil(m_input_cols_eff / static_cast<float>(m_col_strides)); + // Calculate the padding + m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2; + m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2; + break; + default: + eigen_assert(false && "unexpected padding"); + } + } + eigen_assert(m_outputRows > 0); + eigen_assert(m_outputCols > 0); + + // Dimensions for result of extraction. + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + // ColMajor + // 0: depth + // 1: patch_rows + // 2: patch_cols + // 3: number of patches + // 4 and beyond: anything else (such as batch). + m_dimensions[0] = input_dims[0]; + m_dimensions[1] = op.patch_rows(); + m_dimensions[2] = op.patch_cols(); + m_dimensions[3] = m_outputRows * m_outputCols; + for (int i = 4; i < NumDims; ++i) { + m_dimensions[i] = input_dims[i-1]; + } + } else { + // RowMajor + // NumDims-1: depth + // NumDims-2: patch_rows + // NumDims-3: patch_cols + // NumDims-4: number of patches + // NumDims-5 and beyond: anything else (such as batch). + m_dimensions[NumDims-1] = input_dims[NumInputDims-1]; + m_dimensions[NumDims-2] = op.patch_rows(); + m_dimensions[NumDims-3] = op.patch_cols(); + m_dimensions[NumDims-4] = m_outputRows * m_outputCols; + for (int i = NumDims-5; i >= 0; --i) { + m_dimensions[i] = input_dims[i]; + } + } + + // Strides for moving the patch in various dimensions. + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_colStride = m_dimensions[1]; + m_patchStride = m_colStride * m_dimensions[2] * m_dimensions[0]; + m_otherStride = m_patchStride * m_dimensions[3]; + } else { + m_colStride = m_dimensions[NumDims-2]; + m_patchStride = m_colStride * m_dimensions[NumDims-3] * m_dimensions[NumDims-1]; + m_otherStride = m_patchStride * m_dimensions[NumDims-4]; + } + + // Strides for navigating through the input tensor. + m_rowInputStride = m_inputDepth; + m_colInputStride = m_inputDepth * m_inputRows; + m_patchInputStride = m_inputDepth * m_inputRows * m_inputCols; + + // Fast representations of different variables. 
+ m_fastOtherStride = internal::TensorIntDivisor<Index>(m_otherStride); + m_fastPatchStride = internal::TensorIntDivisor<Index>(m_patchStride); + m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride); + m_fastInputRowStride = internal::TensorIntDivisor<Index>(m_row_inflate_strides); + m_fastInputColStride = internal::TensorIntDivisor<Index>(m_col_inflate_strides); + m_fastInputColsEff = internal::TensorIntDivisor<Index>(m_input_cols_eff); + + // Number of patches in the width dimension. + m_fastOutputRows = internal::TensorIntDivisor<Index>(m_outputRows); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[0]); + } else { + m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[NumDims-1]); + } + + m_block_total_size_max = numext::maxi(static_cast<std::size_t>(1), + device.lastLevelCacheSize() / + sizeof(Scalar)); + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + // Patch index corresponding to the passed in index. + const Index patchIndex = index / m_fastPatchStride; + // Find the offset of the element wrt the location of the first element. + const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastOutputDepth; + + // Other ways to index this element. + const Index otherIndex = (NumDims == 4) ? 0 : index / m_fastOtherStride; + const Index patch2DIndex = (NumDims == 4) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride; + + // Calculate col index in the input original tensor. + const Index colIndex = patch2DIndex / m_fastOutputRows; + const Index colOffset = patchOffset / m_fastColStride; + const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft; + const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); + if (inputCol < 0 || inputCol >= m_input_cols_eff || + ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) { + return Scalar(m_paddingValue); + } + + // Calculate row index in the original input tensor. + const Index rowIndex = patch2DIndex - colIndex * m_outputRows; + const Index rowOffset = patchOffset - colOffset * m_colStride; + const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop; + const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0); + if (inputRow < 0 || inputRow >= m_input_rows_eff || + ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) { + return Scalar(m_paddingValue); + } + + const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 
0 : NumDims - 1; + const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; + + const Index inputIndex = depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex * m_patchInputStride; + return m_impl.coeff(inputIndex); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const Index packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1) { + return packetWithPossibleZero(index); + } + + const Index indices[2] = {index, index + packetSize - 1}; + const Index patchIndex = indices[0] / m_fastPatchStride; + if (patchIndex != indices[1] / m_fastPatchStride) { + return packetWithPossibleZero(index); + } + const Index otherIndex = (NumDims == 4) ? 0 : indices[0] / m_fastOtherStride; + eigen_assert(otherIndex == indices[1] / m_fastOtherStride); + + // Find the offset of the element wrt the location of the first element. + const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastOutputDepth, + (indices[1] - patchIndex * m_patchStride) / m_fastOutputDepth}; + + const Index patch2DIndex = (NumDims == 4) ? patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride; + eigen_assert(patch2DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride); + + const Index colIndex = patch2DIndex / m_fastOutputRows; + const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, patchOffsets[1] / m_fastColStride}; + + // Calculate col indices in the original input tensor. + const Index inputCols[2] = {colIndex * m_col_strides + colOffsets[0] - + m_colPaddingLeft, colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft}; + if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) { + return internal::pset1<PacketReturnType>(Scalar(m_paddingValue)); + } + + if (inputCols[0] == inputCols[1]) { + const Index rowIndex = patch2DIndex - colIndex * m_outputRows; + const Index rowOffsets[2] = {patchOffsets[0] - colOffsets[0]*m_colStride, patchOffsets[1] - colOffsets[1]*m_colStride}; + eigen_assert(rowOffsets[0] <= rowOffsets[1]); + // Calculate col indices in the original input tensor. + const Index inputRows[2] = {rowIndex * m_row_strides + rowOffsets[0] - + m_rowPaddingTop, rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop}; + + if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) { + return internal::pset1<PacketReturnType>(Scalar(m_paddingValue)); + } + + if (inputRows[0] >= 0 && inputRows[1] < m_inputRows) { + // no padding + const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 
0 : NumDims - 1; + const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; + const Index inputIndex = depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex * m_patchInputStride; + return m_impl.template packet<Unaligned>(inputIndex); + } + } + + return packetWithPossibleZero(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector<internal::TensorOpResourceRequirements>* resources) const { + resources->push_back(internal::TensorOpResourceRequirements( + internal::kSkewedInnerDims, m_block_total_size_max)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + OutputTensorBlock* output_block) const { + typedef typename internal::ImagePatchCopyOp<Self, PacketAccess> + ImagePatchCopyOp; + typedef typename internal::ImagePatchPaddingOp<Self> ImagePatchPaddingOp; + + // Calculate loop limits and various input/output dim sizes. + const DSizes<Index, NumDims>& block_sizes = output_block->block_sizes(); + const bool col_major = + static_cast<int>(Layout) == static_cast<int>(ColMajor); + const Index depth_dim_size = block_sizes[col_major ? 0 : NumDims - 1]; + const Index output_depth_dim_size = m_dimensions[ + col_major ? 0 : NumDims - 1]; + const Index row_dim_size = block_sizes[col_major ? 1 : NumDims - 2]; + const Index output_row_dim_size = m_dimensions[col_major ? 1 : NumDims - 2]; + const Index col_dim_size = block_sizes[col_major ? 2 : NumDims - 3]; + const Index block_col_stride = row_dim_size * depth_dim_size; + const Index patch_index_dim_size = block_sizes[col_major ? 3 : NumDims - 4]; + const Index outer_dim_size = block_sizes.TotalSize() / + (depth_dim_size * row_dim_size * col_dim_size * patch_index_dim_size); + + const Index patch_size = row_dim_size * col_dim_size * depth_dim_size; + const Index batch_size = patch_size * patch_index_dim_size; + + Index output_index = output_block->first_coeff_index(); + + // Loop through outer dimensions. + for (Index outer_dim_index = 0; + outer_dim_index < outer_dim_size; + ++outer_dim_index) { + const Index outer_output_base_index = outer_dim_index * batch_size; + // Find the offset of the element wrt the location of the first element. + const Index patchIndexStart = output_index / m_fastPatchStride; + const Index patchOffset = + (output_index - patchIndexStart * m_patchStride) / m_fastOutputDepth; + const Index colOffsetStart = patchOffset / m_fastColStride; + // Other ways to index this element. + const Index otherIndex = (NumDims == 4) ? + 0 : output_index / m_fastOtherStride; + const Index patch2DIndexStart = (NumDims == 4) ? + 0 : (output_index - otherIndex * m_otherStride) / m_fastPatchStride; + // Calculate starting depth index. + const Index depth = output_index - (output_index / m_fastOutputDepth) * + output_depth_dim_size; + const Index patch_input_base_index = depth + otherIndex * + m_patchInputStride; + + // Loop through patches. + for (Index patch_index_dim_index = 0; + patch_index_dim_index < patch_index_dim_size; + ++patch_index_dim_index) { + const Index patch_output_base_index = outer_output_base_index + + patch_index_dim_index * patch_size; + // Patch index corresponding to the passed in index. + const Index patchIndex = patchIndexStart + patch_index_dim_index; + const Index patch2DIndex = (NumDims == 4) ? 
+ patchIndex : patch2DIndexStart + patch_index_dim_index; + const Index colIndex = patch2DIndex / m_fastOutputRows; + const Index input_col_base = colIndex * m_col_strides; + const Index row_offset_base = (patch2DIndex - colIndex * m_outputRows) * + m_row_strides - m_rowPaddingTop; + + // Loop through columns. + for (Index col_dim_index = 0; + col_dim_index < col_dim_size; + ++col_dim_index) { + const Index col_output_base_index = patch_output_base_index + + col_dim_index * block_col_stride; + + // Calculate col index in the input original tensor. + Index colOffset = colOffsetStart + col_dim_index; + Index inputCol = input_col_base + colOffset * m_in_col_strides - + m_colPaddingLeft; + Index origInputCol = (m_col_inflate_strides == 1) ? + inputCol : ((inputCol >= 0) ? + (inputCol / m_fastInputColStride) : 0); + + bool pad_column = false; + if (inputCol < 0 || inputCol >= m_input_cols_eff || + ((m_col_inflate_strides != 1) && + (inputCol != origInputCol * m_col_inflate_strides))) { + pad_column = true; + } + + const Index col_input_base_index = patch_input_base_index + + origInputCol * m_colInputStride; + const Index input_row_base = row_offset_base + + ((patchOffset + col_dim_index * output_row_dim_size) - + colOffset * m_colStride) * m_in_row_strides; + // Loop through rows. + for (Index row_dim_index = 0; + row_dim_index < row_dim_size; + ++row_dim_index) { + const Index output_base_index = col_output_base_index + + row_dim_index * depth_dim_size; + bool pad_row = false; + Index inputIndex; + if (!pad_column) { + Index inputRow = input_row_base + row_dim_index * + m_in_row_strides; + Index origInputRow = (m_row_inflate_strides == 1) ? + inputRow : ((inputRow >= 0) ? + (inputRow / m_fastInputRowStride) : 0); + if (inputRow < 0 || inputRow >= m_input_rows_eff || + ((m_row_inflate_strides != 1) && + (inputRow != origInputRow * m_row_inflate_strides))) { + pad_row = true; + } else { + inputIndex = col_input_base_index + origInputRow * + m_rowInputStride; + } + } + // Copy (or pad) along depth dimension. + if (pad_column || pad_row) { + ImagePatchPaddingOp::Run(depth_dim_size, Scalar(m_paddingValue), + output_base_index, output_block->data()); + } else { + ImagePatchCopyOp::Run(*this, depth_dim_size, + output_base_index, output_block->data(), + inputIndex); + } + } + } + } + output_index += m_otherStride; + } + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + + Index rowPaddingTop() const { return m_rowPaddingTop; } + Index colPaddingLeft() const { return m_colPaddingLeft; } + Index outputRows() const { return m_outputRows; } + Index outputCols() const { return m_outputCols; } + Index userRowStride() const { return m_row_strides; } + Index userColStride() const { return m_col_strides; } + Index userInRowStride() const { return m_in_row_strides; } + Index userInColStride() const { return m_in_col_strides; } + Index rowInflateStride() const { return m_row_inflate_strides; } + Index colInflateStride() const { return m_col_inflate_strides; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const + { + // Location of the first element of the patch. + // ColMajor + // 0: d, 1: patch_rows, 2: patch_cols, 3: number of patches, 4: number of batches + // RowMajor + // 0: number of batches, 1: number of patches, 2: patch_cols , 3: patch_rows, 4: d + const Index patch2DIndex = coords[static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 
3 : 1]; + + array<Index, NumDims-1> inputCoords; + Index input_col_idx = patch2DIndex / m_fastInputColsEff; + Index inputCol = input_col_idx + coords[1] * m_in_row_strides - m_rowPaddingTop; + Index inputRow = patch2DIndex - input_col_idx * m_input_cols_eff + coords[2] * m_in_col_strides - m_colPaddingLeft; + const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); + const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + inputCoords[0] = coords[0]; // depth + inputCoords[1] = origInputCol; + inputCoords[2] = origInputRow; + inputCoords[3] = coords[4]; // batch + } else { + inputCoords[3] = coords[4]; // depth + inputCoords[2] = origInputCol; + inputCoords[1] = origInputRow; + inputCoords[0] = coords[0]; // batch + } + // If the computed coordinates are outside the original image perimeter, return 0. + if (inputCol < 0 || inputCol >= m_input_cols_eff || inputRow < 0 || inputRow >= m_input_rows_eff || + ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides)) || + ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) { + return Scalar(m_paddingValue); + } + if (TensorEvaluator<ArgType, Device>::CoordAccess) { + return m_impl.coeff(inputCoords); + } else { + Index inputIndex; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + inputIndex = + inputCoords[3] * m_patchInputStride + + inputCoords[2] * m_colInputStride + + inputCoords[1] * m_rowInputStride + + inputCoords[0]; + } else { + inputIndex = + inputCoords[1] * m_patchInputStride + + inputCoords[2] * m_colInputStride + + inputCoords[3] * m_rowInputStride + + inputCoords[4]; + } + return m_impl.coeff(inputIndex); + } + } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + + Dimensions m_dimensions; + + Index m_otherStride; + Index m_patchStride; + Index m_colStride; + Index m_row_strides; + Index m_col_strides; + + Index m_in_row_strides; + Index m_in_col_strides; + Index m_row_inflate_strides; + Index m_col_inflate_strides; + + Index m_input_rows_eff; + Index m_input_cols_eff; + Index m_patch_rows_eff; + Index m_patch_cols_eff; + + internal::TensorIntDivisor<Index> m_fastOtherStride; + internal::TensorIntDivisor<Index> m_fastPatchStride; + internal::TensorIntDivisor<Index> m_fastColStride; + internal::TensorIntDivisor<Index> m_fastInputRowStride; + internal::TensorIntDivisor<Index> m_fastInputColStride; + internal::TensorIntDivisor<Index> m_fastInputColsEff; + + Index m_rowInputStride; + Index m_colInputStride; + Index m_patchInputStride; + + Index m_inputDepth; + Index m_inputRows; + Index m_inputCols; + + Index m_outputRows; + Index m_outputCols; + + Index m_rowPaddingTop; + Index m_colPaddingLeft; + + internal::TensorIntDivisor<Index> m_fastOutputRows; + internal::TensorIntDivisor<Index> m_fastOutputDepth; + + Scalar m_paddingValue; + std::size_t m_block_total_size_max; + + TensorEvaluator<ArgType, Device> m_impl; +}; + + +} // end namespace Eigen + +#endif // 
EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h new file mode 100644 index 0000000000..7631b54f2f --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -0,0 +1,421 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H +#define EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H + +#if defined(EIGEN_HAS_CONSTEXPR) && defined(EIGEN_HAS_VARIADIC_TEMPLATES) + +#define EIGEN_HAS_INDEX_LIST + +namespace Eigen { + +/** \internal + * + * \class TensorIndexList + * \ingroup CXX11_Tensor_Module + * + * \brief Set of classes used to encode a set of Tensor dimensions/indices. + * + * The indices in the list can be known at compile time or at runtime. A mix + * of static and dynamic indices can also be provided if needed. The tensor + * code will attempt to take advantage of the indices that are known at + * compile time to optimize the code it generates. + * + * This functionality requires a c++11 compliant compiler. If your compiler + * is older you need to use arrays of indices instead. + * + * Several examples are provided in the cxx11_tensor_index_list.cpp file. + * + * \sa Tensor + */ + +template <DenseIndex n> +struct type2index { + static const DenseIndex value = n; + constexpr operator DenseIndex() const { return n; } + void set(DenseIndex val) { + eigen_assert(val == n); + } +}; + +namespace internal { +template <typename T> +void update_value(T& val, DenseIndex new_val) { + val = new_val; +} +template <DenseIndex n> +void update_value(type2index<n>& val, DenseIndex new_val) { + val.set(new_val); +} + +template <typename T> +struct is_compile_time_constant { + static constexpr bool value = false; +}; + +template <DenseIndex idx> +struct is_compile_time_constant<type2index<idx> > { + static constexpr bool value = true; +}; +template <DenseIndex idx> +struct is_compile_time_constant<const type2index<idx> > { + static constexpr bool value = true; +}; +template <DenseIndex idx> +struct is_compile_time_constant<type2index<idx>& > { + static constexpr bool value = true; +}; +template <DenseIndex idx> +struct is_compile_time_constant<const type2index<idx>& > { + static constexpr bool value = true; +}; + +template <DenseIndex Idx> +struct tuple_coeff { + template <typename... T> + static constexpr DenseIndex get(const DenseIndex i, const std::tuple<T...>& t) { + return std::get<Idx>(t) * (i == Idx) + tuple_coeff<Idx-1>::get(i, t) * (i != Idx); + } + template <typename... T> + static void set(const DenseIndex i, std::tuple<T...>& t, const DenseIndex value) { + if (i == Idx) { + update_value(std::get<Idx>(t), value); + } else { + tuple_coeff<Idx-1>::set(i, t, value); + } + } + + template <typename... T> + static constexpr bool value_known_statically(const DenseIndex i, const std::tuple<T...>& t) { + return ((i == Idx) & is_compile_time_constant<typename std::tuple_element<Idx, std::tuple<T...> >::type>::value) || + tuple_coeff<Idx-1>::value_known_statically(i, t); + } + + template <typename... 
T> + static constexpr bool values_up_to_known_statically(const std::tuple<T...>& t) { + return is_compile_time_constant<typename std::tuple_element<Idx, std::tuple<T...> >::type>::value && + tuple_coeff<Idx-1>::values_up_to_known_statically(t); + } + + template <typename... T> + static constexpr bool values_up_to_statically_known_to_increase(const std::tuple<T...>& t) { + return is_compile_time_constant<typename std::tuple_element<Idx, std::tuple<T...> >::type>::value && + is_compile_time_constant<typename std::tuple_element<Idx-1, std::tuple<T...> >::type>::value && + std::get<Idx>(t) > std::get<Idx-1>(t) && + tuple_coeff<Idx-1>::values_up_to_statically_known_to_increase(t); + } +}; + +template <> +struct tuple_coeff<0> { + template <typename... T> + static constexpr DenseIndex get(const DenseIndex i, const std::tuple<T...>& t) { + // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr + return std::get<0>(t) * (i == 0); + } + template <typename... T> + static void set(const DenseIndex i, std::tuple<T...>& t, const DenseIndex value) { + eigen_assert (i == 0); + update_value(std::get<0>(t), value); + } + template <typename... T> + static constexpr bool value_known_statically(const DenseIndex i, const std::tuple<T...>& t) { + // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr + return is_compile_time_constant<typename std::tuple_element<0, std::tuple<T...> >::type>::value & (i == 0); + } + + template <typename... T> + static constexpr bool values_up_to_known_statically(const std::tuple<T...>& t) { + return is_compile_time_constant<typename std::tuple_element<0, std::tuple<T...> >::type>::value; + } + + template <typename... T> + static constexpr bool values_up_to_statically_known_to_increase(const std::tuple<T...>& t) { + return true; + } +}; +} // namespace internal + + +template<typename FirstType, typename... OtherTypes> +struct IndexList : std::tuple<FirstType, OtherTypes...> { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex operator[] (const DenseIndex i) const { + return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1>::get(i, *this); + } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const DenseIndex value) { + return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1>::set(i, *this, value); + } + + constexpr IndexList(const std::tuple<FirstType, OtherTypes...>& other) : std::tuple<FirstType, OtherTypes...>(other) { } + constexpr IndexList() : std::tuple<FirstType, OtherTypes...>() { } + + constexpr bool value_known_statically(const DenseIndex i) const { + return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1>::value_known_statically(i, *this); + } + constexpr bool all_values_known_statically() const { + return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1>::values_up_to_known_statically(*this); + } + + constexpr bool values_statically_known_to_increase() const { + return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1>::values_up_to_statically_known_to_increase(*this); + } +}; + + +template<typename FirstType, typename... OtherTypes> +constexpr IndexList<FirstType, OtherTypes...> make_index_list(FirstType val1, OtherTypes... other_vals) { + return std::make_tuple(val1, other_vals...); +} + + +namespace internal { + +template<typename FirstType, typename... 
OtherTypes> size_t array_prod(const IndexList<FirstType, OtherTypes...>& sizes) { + size_t result = 1; + for (int i = 0; i < array_size<IndexList<FirstType, OtherTypes...> >::value; ++i) { + result *= sizes[i]; + } + return result; +}; + +template<typename FirstType, typename... OtherTypes> struct array_size<IndexList<FirstType, OtherTypes...> > { + static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value; +}; +template<typename FirstType, typename... OtherTypes> struct array_size<const IndexList<FirstType, OtherTypes...> > { + static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value; +}; + +template<DenseIndex n, typename FirstType, typename... OtherTypes> constexpr DenseIndex array_get(IndexList<FirstType, OtherTypes...>& a) { + return std::get<n>(a); +} +template<DenseIndex n, typename FirstType, typename... OtherTypes> constexpr DenseIndex array_get(const IndexList<FirstType, OtherTypes...>& a) { + return std::get<n>(a); +} + +template <typename T> +struct index_known_statically { + constexpr bool operator() (DenseIndex) const { + return false; + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_known_statically<IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i); + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_known_statically<const IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i); + } +}; + +template <typename T> +struct all_indices_known_statically { + constexpr bool operator() () const { + return false; + } +}; + +template <typename FirstType, typename... OtherTypes> +struct all_indices_known_statically<IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() () const { + return IndexList<FirstType, OtherTypes...>().all_values_known_statically(); + } +}; + +template <typename FirstType, typename... OtherTypes> +struct all_indices_known_statically<const IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() () const { + return IndexList<FirstType, OtherTypes...>().all_values_known_statically(); + } +}; + +template <typename T> +struct indices_statically_known_to_increase { + constexpr bool operator() () const { + return false; + } +}; + +template <typename FirstType, typename... OtherTypes> +struct indices_statically_known_to_increase<IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() () const { + return IndexList<FirstType, OtherTypes...>().values_statically_known_to_increase(); + } +}; + +template <typename FirstType, typename... OtherTypes> +struct indices_statically_known_to_increase<const IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() () const { + return IndexList<FirstType, OtherTypes...>().values_statically_known_to_increase(); + } +}; + +template <typename Tx> +struct index_statically_eq { + constexpr bool operator() (DenseIndex, DenseIndex) const { + return false; + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_statically_eq<IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i) && + IndexList<FirstType, OtherTypes...>()[i] == value; + } +}; + +template <typename FirstType, typename... 
OtherTypes> +struct index_statically_eq<const IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i) && + IndexList<FirstType, OtherTypes...>()[i] == value; + } +}; + +template <typename T> +struct index_statically_ne { + constexpr bool operator() (DenseIndex, DenseIndex) const { + return false; + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_statically_ne<IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i) && + IndexList<FirstType, OtherTypes...>()[i] != value; + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_statically_ne<const IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i) && + IndexList<FirstType, OtherTypes...>()[i] != value; + } +}; + + +template <typename T> +struct index_statically_gt { + constexpr bool operator() (DenseIndex, DenseIndex) const { + return false; + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_statically_gt<IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i) && + IndexList<FirstType, OtherTypes...>()[i] > value; + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_statically_gt<const IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i) && + IndexList<FirstType, OtherTypes...>()[i] > value; + } +}; + +template <typename T> +struct index_statically_lt { + constexpr bool operator() (DenseIndex, DenseIndex) const { + return false; + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_statically_lt<IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i) && + IndexList<FirstType, OtherTypes...>()[i] < value; + } +}; + +template <typename FirstType, typename... 
OtherTypes> +struct index_statically_lt<const IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i) && + IndexList<FirstType, OtherTypes...>()[i] < value; + } +}; + +} // end namespace internal +} // end namespace Eigen + +#else + +namespace Eigen { +namespace internal { + +// No C++11 support +template <typename T> +struct index_known_statically { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex) const{ + return false; + } +}; + +template <typename T> +struct all_indices_known_statically { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() () const { + return false; + } +}; + +template <typename T> +struct indices_statically_known_to_increase { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() () const { + return false; + } +}; + +template <typename T> +struct index_statically_eq { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{ + return false; + } +}; + +template <typename T> +struct index_statically_ne { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{ + return false; + } +}; + +template <typename T> +struct index_statically_gt { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{ + return false; + } +}; + +template <typename T> +struct index_statically_lt { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{ + return false; + } +}; + +} // end namespace internal +} // end namespace Eigen + +#endif + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h new file mode 100644 index 0000000000..40a50e4662 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -0,0 +1,219 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Ke Yang <yangke@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H +#define EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H + +namespace Eigen { + +/** \class TensorInflation + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor inflation class. 
+ * + * + */ +namespace internal { +template<typename Strides, typename XprType> +struct traits<TensorInflationOp<Strides, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename Strides, typename XprType> +struct eval<TensorInflationOp<Strides, XprType>, Eigen::Dense> +{ + typedef const TensorInflationOp<Strides, XprType>& type; +}; + +template<typename Strides, typename XprType> +struct nested<TensorInflationOp<Strides, XprType>, 1, typename eval<TensorInflationOp<Strides, XprType> >::type> +{ + typedef TensorInflationOp<Strides, XprType> type; +}; + +} // end namespace internal + +template<typename Strides, typename XprType> +class TensorInflationOp : public TensorBase<TensorInflationOp<Strides, XprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorInflationOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorInflationOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorInflationOp>::type Nested; + typedef typename Eigen::internal::traits<TensorInflationOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorInflationOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorInflationOp(const XprType& expr, const Strides& strides) + : m_xpr(expr), m_strides(strides) {} + + EIGEN_DEVICE_FUNC + const Strides& strides() const { return m_strides; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const Strides m_strides; +}; + +// Eval as rvalue +template<typename Strides, typename ArgType, typename Device> +struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device> +{ + typedef TensorInflationOp<Strides, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + + enum { + IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_strides(op.strides()) + { + m_dimensions = m_impl.dimensions(); + // Expand each dimension to the inflated dimension. + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] = (m_dimensions[i] - 1) * op.strides()[i] + 1; + } + + // Remember the strides for fast division. 
+ for (int i = 0; i < NumDims; ++i) { + m_fastStrides[i] = internal::TensorIntDivisor<Index>(m_strides[i]); + } + + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_outputStrides[0] = 1; + m_inputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + } + } else { // RowMajor + m_outputStrides[NumDims-1] = 1; + m_inputStrides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + } + } + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + // Computes the input index given the output index. Returns true if the output + // index doesn't fall into a hole. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool getInputIndex(Index index, Index* inputIndex) const + { + eigen_assert(index < dimensions().TotalSize()); + *inputIndex = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (idx != idx / m_fastStrides[i] * m_strides[i]) { + return false; + } + *inputIndex += idx / m_strides[i] * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (index != index / m_fastStrides[0] * m_strides[0]) { + return false; + } + *inputIndex += index / m_strides[0]; + return true; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + if (idx != idx / m_fastStrides[i] * m_strides[i]) { + return false; + } + *inputIndex += idx / m_strides[i] * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (index != index / m_fastStrides[NumDims-1] * m_strides[NumDims-1]) { + return false; + } + *inputIndex += index / m_strides[NumDims - 1]; + } + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + Index inputIndex = 0; + if (getInputIndex(index, &inputIndex)) { + return m_impl.coeff(inputIndex); + } else { + return Scalar(0); + } + } + + // TODO(yangke): optimize this function so that we can detect and produce + // all-zero packets + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array<Index, NumDims> m_outputStrides; + array<Index, NumDims> m_inputStrides; + 
TensorEvaluator<ArgType, Device> m_impl; + const Strides m_strides; + array<internal::TensorIntDivisor<Index>, NumDims> m_fastStrides; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h new file mode 100644 index 0000000000..375c763152 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h @@ -0,0 +1,82 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H +#define EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + +#include <initializer_list> + +namespace Eigen { + +/** \class TensorInitializer + * \ingroup CXX11_Tensor_Module + * + * \brief Helper template to initialize Tensors from std::initializer_lists. + */ +namespace internal { + +template <typename Derived, int N> +struct Initializer { + typedef std::initializer_list< + typename Initializer<Derived, N - 1>::InitList> InitList; + + static void run(TensorEvaluator<Derived, DefaultDevice>& tensor, + Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>* indices, + const InitList& vals) { + int i = 0; + for (auto v : vals) { + (*indices)[traits<Derived>::NumDimensions - N] = i++; + Initializer<Derived, N - 1>::run(tensor, indices, v); + } + } +}; + +template <typename Derived> +struct Initializer<Derived, 1> { + typedef std::initializer_list<typename traits<Derived>::Scalar> InitList; + + static void run(TensorEvaluator<Derived, DefaultDevice>& tensor, + Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>* indices, + const InitList& vals) { + int i = 0; + // There is likely a faster way to do that than iterating. + for (auto v : vals) { + (*indices)[traits<Derived>::NumDimensions - 1] = i++; + tensor.coeffRef(*indices) = v; + } + } +}; + +template <typename Derived> +struct Initializer<Derived, Dynamic> { + typedef std::initializer_list<typename traits<Derived>::Scalar> InitList; + + static void run(TensorEvaluator<Derived, DefaultDevice>& tensor, + Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>* indices, + const InitList& vals) { + // Static initialization not implemented for VarDims tensors. 
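+ // The nesting depth of the braced initializer has to match the rank at
+ // compile time, which is impossible for dynamic-rank (VarDims) tensors, so
+ // this specialization only asserts. Fixed-rank tensors can be initialized
+ // this way, e.g. (rough sketch, assuming the usual TensorBase::setValues()
+ // entry point that forwards to initialize_tensor() below):
+ //   Eigen::Tensor<float, 2> t(2, 3);
+ //   t.setValues({{1.f, 2.f, 3.f}, {4.f, 5.f, 6.f}});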
+ eigen_assert(false); + } +}; + +template <typename Derived, int N> +void initialize_tensor(TensorEvaluator<Derived, DefaultDevice>& tensor, + const typename Initializer<Derived, traits<Derived>::NumDimensions>::InitList& vals) { + Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions> indices; + Initializer<Derived, traits<Derived>::NumDimensions>::run(tensor, &indices, vals); +} + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_HAS_VARIADIC_TEMPLATES + +#endif // EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h new file mode 100644 index 0000000000..3e90b08c99 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -0,0 +1,357 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H +#define EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H + + +namespace Eigen { + +/** \internal + * + * \class TensorIntDiv + * \ingroup CXX11_Tensor_Module + * + * \brief Fast integer division by a constant. + * + * See the paper from Granlund and Montgomery for explanation. + * (at http://dx.doi.org/10.1145/773473.178249) + * + * \sa Tensor + */ + +namespace internal { + +#if !defined(__GCUDACC__) && !defined(__GCUDACC_HOST__) + +namespace { + // Note: result is undefined if val == 0 + template <typename T> + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int count_leading_zeros(const T val) + { +#ifdef __CUDA_ARCH__ + if (sizeof(T) == 8) { + return __clzll(val); + } + return __clz(val); +#elif EIGEN_COMP_MSVC + DWORD leading_zeros = 0; + if (sizeof(T) == 8) { + _BitScanReverse64(&leading_zero, val); + } + else { + _BitScanReverse(&leading_zero, val); + } +#else + if (sizeof(T) == 8) { + return __builtin_clzl(static_cast<uint64_t>(val)); + } + return __builtin_clz(static_cast<uint32_t>(val)); +#endif + } + + + template <typename T> + struct DividerTraits { +#if defined(__SIZEOF_INT128__) && !defined(__CUDACC__) + typedef typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type type; + static const int N = sizeof(T) * 8; +#else + typedef uint32_t type; + static const int N = 32; +#endif + }; + + + template <typename T> + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) { +#if defined(__CUDA_ARCH__) + return __umulhi(a, b); +#else + return (static_cast<uint64_t>(a) * b) >> 32; +#endif + } + +#if defined(__CUDA_ARCH__) + template <typename T> + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) { + return __umul64hi(a, b); + } +#else + template <typename T> + EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) { +#if defined(__SIZEOF_INT128__) && !defined(__CUDACC__) + __uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b); + return static_cast<uint64_t>(v >> 64); +#else + EIGEN_STATIC_ASSERT(sizeof(T) == 4, YOU_MADE_A_PROGRAMMING_MISTAKE); + return (a * b) >> 32; +#endif + } +#endif + + template <int N, typename T> + struct DividerHelper { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t computeMultiplier (const int log_div, const T divider) { + 
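+ // Granlund-Montgomery style multiplier: with N-bit operands and a shift of
+ // log_div bits, m = floor(2^(N+log_div) / divider) - 2^N + 1 fits in N bits,
+ // and divide() below recovers numerator / divider from the high N bits of
+ // m * numerator plus one addition and two shifts.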
EIGEN_STATIC_ASSERT(N == 32, YOU_MADE_A_PROGRAMMING_MISTAKE); + return (static_cast<uint64_t>(1) << (N+log_div)) / divider - (static_cast<uint64_t>(1) << N) + 1; + } + }; + +#if defined(__SIZEOF_INT128__) && !defined(__CUDACC__) + template <typename T> + struct DividerHelper<64, T> { + static EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) { + return ((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1); + } + }; +#endif +} + + +template <typename T> +struct TensorIntDivisor { + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() { + multiplier = 0; + shift1 = 0; + shift2 = 0; + } + + // Must have 0 < divider < 2^31. This is relaxed to + // 0 < divider < 2^63 when using 64-bit indices on platforms that support + // the __uint128_t type. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor(const T divider) { + const int N = DividerTraits<T>::N; + eigen_assert(divider < NumTraits<UnsignedType>::highest()/2); + eigen_assert(divider > 0); + + // fast ln2 + const int leading_zeros = count_leading_zeros(static_cast<UnsignedType>(divider)); + int log_div = N - leading_zeros; + // if divider is a power of two then log_div is 1 more than it should be. + if ((1ull << (log_div-1)) == divider) + log_div--; + + multiplier = DividerHelper<N, T>::computeMultiplier(log_div, divider); + shift1 = log_div > 1 ? 1 : log_div; + shift2 = log_div > 1 ? log_div-1 : 0; + } + + // Must have 0 <= numerator. On platforms that dont support the __uint128_t + // type numerator should also be less than 2^32-1. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const { + eigen_assert(numerator < NumTraits<UnsignedType>::highest()/2); + eigen_assert(numerator >= 0); + + UnsignedType t1 = muluh(multiplier, numerator); + UnsignedType t = (static_cast<UnsignedType>(numerator) - t1) >> shift1; + return (t1 + t) >> shift2; + } + + private: + typedef typename DividerTraits<T>::type UnsignedType; + UnsignedType multiplier; + int32_t shift1; + int32_t shift2; +}; + + +// Optimized version for signed 32 bit integers. +// Derived from Hacker's Delight. +template <> +class TensorIntDivisor<int32_t> { + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() { + magic = 0; + shift = 0; + } + // Must have 2 <= divider + EIGEN_DEVICE_FUNC TensorIntDivisor(int32_t divider) { + eigen_assert(divider >= 2); + calcMagic(divider); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const { +#ifdef __CUDA_ARCH__ + return (__umulhi(magic, n) >> shift); +#else + uint64_t v = static_cast<uint64_t>(magic) * static_cast<uint64_t>(n); + return (static_cast<uint32_t>(v >> 32) >> shift); +#endif + } + +private: + // Compute the magic numbers. See Hacker's Delight section 10 for an in + // depth explanation. + EIGEN_DEVICE_FUNC void calcMagic(int32_t d) { + const unsigned two31 = 0x80000000; // 2**31. + unsigned ad = d; + unsigned t = two31 + (ad >> 31); + unsigned anc = t - 1 - t%ad; // Absolute value of nc. + int p = 31; // Init. p. + unsigned q1 = two31/anc; // Init. q1 = 2**p/|nc|. + unsigned r1 = two31 - q1*anc; // Init. r1 = rem(2**p, |nc|). + unsigned q2 = two31/ad; // Init. q2 = 2**p/|d|. + unsigned r2 = two31 - q2*ad; // Init. r2 = rem(2**p, |d|). + unsigned delta = 0; + do { + p = p + 1; + q1 = 2*q1; // Update q1 = 2**p/|nc|. + r1 = 2*r1; // Update r1 = rem(2**p, |nc|). + if (r1 >= anc) { // (Must be an unsigned + q1 = q1 + 1; // comparison here). 
+ r1 = r1 - anc;} + q2 = 2*q2; // Update q2 = 2**p/|d|. + r2 = 2*r2; // Update r2 = rem(2**p, |d|). + if (r2 >= ad) { // (Must be an unsigned + q2 = q2 + 1; // comparison here). + r2 = r2 - ad;} + delta = ad - r2; + } while (q1 < delta || (q1 == delta && r1 == 0)); + + magic = (unsigned)(q2 + 1); + shift = p - 32; + } + + uint32_t magic; + int32_t shift; +}; + + +template <typename T> +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor<T>& divisor) { + return divisor.divide(numerator); +} + + +#else +// Reverse to the old code since gcudacc doesn't support the code above. +template <typename T> +struct TensorIntDivisor { + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() { + multiplier = 0; + shift1 = 0; + shift2 = 0; + } + + // Must have 1 <= divider <= 2^31-1 + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor(const T divider) { + const int N = 32; + eigen_assert(divider > 0); + eigen_assert(divider < (1ull<<(N-1))); + + // fast ln2 +#ifndef __CUDA_ARCH__ + const int leading_zeros = __builtin_clz(divider); +#else + const int leading_zeros = __clz(divider); +#endif + int log_div = N - leading_zeros; + // if divider is a power of two then log_div is 1 more than it should be. + if ((1ull << (log_div-1)) == divider) + log_div--; + + multiplier = (static_cast<uint64_t>(1) << (N+log_div)) / divider - (static_cast<uint64_t>(1) << N) + 1; + shift1 = log_div > 1 ? 1 : log_div; + shift2 = log_div > 1 ? log_div-1 : 0; + } + + // Must have 0 <= numerator <= 2^32-1 + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const { + const int N = 32; + eigen_assert(numerator >= 0); + eigen_assert(static_cast<uint64_t>(numerator) < 1ull<<N); + + uint32_t t1 = (multiplier * numerator) >> N; + uint32_t t = (static_cast<uint32_t>(numerator) - t1) >> shift1; + return (t1 + t) >> shift2; + } + + private: + uint64_t multiplier; + int32_t shift1; + int32_t shift2; +}; + + +// Optimized version for signed 32 bit integers. +// Derived from Hacker's Delight. +template <> +class TensorIntDivisor<int> { + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() { + magic = 0; + shift = 0; + } + // Must have 2 <= divider + EIGEN_DEVICE_FUNC TensorIntDivisor(int divider) { + eigen_assert(divider >= 2); + calcMagic(divider); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int n) const { +#ifdef __CUDA_ARCH__ + return (__umulhi(magic, n) >> shift); +#else + uint64_t v = static_cast<uint64_t>(magic) * static_cast<uint64_t>(n); + return (static_cast<unsigned int>(v >> 32) >> shift); +#endif + } + +private: + // Compute the magic numbers. See Hacker's Delight section 10 for an in + // depth explanation. + EIGEN_DEVICE_FUNC void calcMagic(int d) { + const unsigned two31 = 0x80000000; // 2**31. + unsigned ad = d; + unsigned t = two31 + (ad >> 31); + unsigned anc = t - 1 - t%ad; // Absolute value of nc. + int p = 31; // Init. p. + unsigned q1 = two31/anc; // Init. q1 = 2**p/|nc|. + unsigned r1 = two31 - q1*anc; // Init. r1 = rem(2**p, |nc|). + unsigned q2 = two31/ad; // Init. q2 = 2**p/|d|. + unsigned r2 = two31 - q2*ad; // Init. r2 = rem(2**p, |d|). + unsigned delta = 0; + do { + p = p + 1; + q1 = 2*q1; // Update q1 = 2**p/|nc|. + r1 = 2*r1; // Update r1 = rem(2**p, |nc|). + if (r1 >= anc) { // (Must be an unsigned + q1 = q1 + 1; // comparison here). + r1 = r1 - anc;} + q2 = 2*q2; // Update q2 = 2**p/|d|. + r2 = 2*r2; // Update r2 = rem(2**p, |d|). 
+ if (r2 >= ad) { // (Must be an unsigned + q2 = q2 + 1; // comparison here). + r2 = r2 - ad;} + delta = ad - r2; + } while (q1 < delta || (q1 == delta && r1 == 0)); + + magic = (unsigned)(q2 + 1); + shift = p - 32; + } + + unsigned int magic; + int shift; +}; + + +template <typename T> +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor<T>& divisor) { + return divisor.divide(numerator); +} + +#endif + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h new file mode 100644 index 0000000000..bd795d54b0 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -0,0 +1,217 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H +#define EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H + +namespace Eigen { + +/** \class TensorLayoutSwap + * \ingroup CXX11_Tensor_Module + * + * \brief Swap the layout from col-major to row-major, or row-major + * to col-major, and invert the order of the dimensions. + * + * Beware: the dimensions are reversed by this operation. If you want to + * preserve the ordering of the dimensions, you need to combine this + * operation with a shuffle. + * + * \example: + * Tensor<float, 2, ColMajor> input(2, 4); + * Tensor<float, 2, RowMajor> output = input.swap_layout(); + * eigen_assert(output.dimension(0) == 4); + * eigen_assert(output.dimension(1) == 2); + * + * array<int, 2> shuffle(1, 0); + * output = input.swap_layout().shuffle(shuffle); + * eigen_assert(output.dimension(0) == 2); + * eigen_assert(output.dimension(1) == 4); + * + */ +namespace internal { +template<typename XprType> +struct traits<TensorLayoutSwapOp<XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = traits<XprType>::NumDimensions; + static const int Layout = (static_cast<int>(traits<XprType>::Layout) == static_cast<int>(ColMajor)) ? 
RowMajor : ColMajor; +}; + +template<typename XprType> +struct eval<TensorLayoutSwapOp<XprType>, Eigen::Dense> +{ + typedef const TensorLayoutSwapOp<XprType>& type; +}; + +template<typename XprType> +struct nested<TensorLayoutSwapOp<XprType>, 1, typename eval<TensorLayoutSwapOp<XprType> >::type> +{ + typedef TensorLayoutSwapOp<XprType> type; +}; + +} // end namespace internal + + + +template<typename XprType> +class TensorLayoutSwapOp : public TensorBase<TensorLayoutSwapOp<XprType>, WriteAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType; + typedef typename Eigen::internal::nested<TensorLayoutSwapOp>::type Nested; + typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp(const XprType& expr) + : m_xpr(expr) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const TensorLayoutSwapOp& other) + { + typedef TensorAssignOp<TensorLayoutSwapOp, const TensorLayoutSwapOp> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run( + assign, DefaultDevice()); + return *this; + } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp<TensorLayoutSwapOp, const OtherDerived> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run( + assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; +}; + + +// Eval as rvalue +template<typename ArgType, typename Device> +struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device> +{ + typedef TensorLayoutSwapOp<ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + + enum { + IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, + Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == + static_cast<int>(ColMajor)) + ? 
RowMajor + : ColMajor, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + for(int i = 0; i < NumDims; ++i) { + m_dimensions[i] = m_impl.dimensions()[NumDims-1-i]; + } + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + return m_impl.evalSubExprsIfNeeded(data); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(index); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_impl.template packet<LoadMode>(index); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return m_impl.data(); } + + const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + + protected: + TensorEvaluator<ArgType, Device> m_impl; + Dimensions m_dimensions; +}; + + +// Eval as lvalue +template<typename ArgType, typename Device> + struct TensorEvaluator<TensorLayoutSwapOp<ArgType>, Device> + : public TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device> +{ + typedef TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device> Base; + typedef TensorLayoutSwapOp<ArgType> XprType; + + enum { + IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, + Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == + static_cast<int>(ColMajor)) + ? RowMajor + : ColMajor, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) + { + return this->m_impl.coeffRef(index); + } + template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + this->m_impl.template writePacket<StoreMode>(index, x); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h new file mode 100644 index 0000000000..908bdc38ad --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -0,0 +1,320 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
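+
+// The TensorMap class defined below wraps a caller-owned buffer in the tensor
+// expression interface without taking ownership of or copying the data. Rough
+// usage sketch (assuming the default Unaligned value of the Options_ template
+// argument supplied by the forward declarations):
+//   float data[6] = {0, 1, 2, 3, 4, 5};
+//   Eigen::TensorMap<Eigen::Tensor<float, 2> > m(data, 2, 3);
+//   m(1, 2) = 42.0f;  // writes through to data[5] (column-major layout)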
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_MAP_H +#define EIGEN_CXX11_TENSOR_TENSOR_MAP_H + +namespace Eigen { + +/** \class TensorMap + * \ingroup CXX11_Tensor_Module + * + * \brief A tensor expression mapping an existing array of data. + * + */ + +template<typename PlainObjectType, int Options_> class TensorMap : public TensorBase<TensorMap<PlainObjectType, Options_> > +{ + public: + typedef TensorMap<PlainObjectType, Options_> Self; + typedef typename PlainObjectType::Base Base; + typedef typename Eigen::internal::nested<Self>::type Nested; + typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind; + typedef typename internal::traits<PlainObjectType>::Index Index; + typedef typename internal::traits<PlainObjectType>::Scalar Scalar; + typedef typename internal::packet_traits<Scalar>::type Packet; + typedef typename NumTraits<Scalar>::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + + /* typedef typename internal::conditional< + bool(internal::is_lvalue<PlainObjectType>::value), + Scalar *, + const Scalar *>::type + PointerType;*/ + typedef Scalar* PointerType; + typedef PointerType PointerArgType; + + static const int Options = Options_; + + static const Index NumIndices = PlainObjectType::NumIndices; + typedef typename PlainObjectType::Dimensions Dimensions; + + enum { + IsAligned = ((int(Options_) & Aligned) == Aligned), + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + BlockAccess = false, + Layout = PlainObjectType::Layout, + CoordAccess = true, + }; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr) : m_data(dataPtr), m_dimensions() { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT((0 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. 
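+ // Without variadic template support only ranks up to 5 get a dedicated
+ // constructor; each one statically checks that its argument count matches
+ // NumIndices (or that NumIndices is Dynamic).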
+ EIGEN_STATIC_ASSERT((1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) { + EIGEN_STATIC_ASSERT(2 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) { + EIGEN_STATIC_ASSERT(3 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) { + EIGEN_STATIC_ASSERT(4 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4, dim5) { + EIGEN_STATIC_ASSERT(5 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const array<Index, NumIndices>& dimensions) + : m_data(dataPtr), m_dimensions(dimensions) + { } + + template <typename Dimensions> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const Dimensions& dimensions) + : m_data(dataPtr), m_dimensions(dimensions) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PlainObjectType& tensor) + : m_data(tensor.data()), m_dimensions(tensor.dimensions()) + { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index rank() const { return m_dimensions.rank(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_dimensions[n]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar* data() { return m_data; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar* data() const { return m_data; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const + { + // eigen_assert(checkIndexRange(indices)); + if (PlainObjectType::Options&RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(indices); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(indices); + return m_data[index]; + } + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()() const + { + EIGEN_STATIC_ASSERT(NumIndices == 0 || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + eigen_assert(rank() == 0); + return m_data[0]; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes... 
otherIndices) const + { + static_assert(sizeof...(otherIndices) + 1 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + if (PlainObjectType::Options&RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumIndices>{{firstIndex, otherIndices...}}); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(array<Index, NumIndices>{{firstIndex, otherIndices...}}); + return m_data[index]; + } + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return m_data[index]; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i1 + i0 * m_dimensions[0]; + return m_data[index]; + } else { + const Index index = i0 + i1 * m_dimensions[0]; + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i2 + m_dimensions[1] * (i1 + m_dimensions[0] * i0); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2); + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3)); + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0))); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4))); + return m_data[index]; + } + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices) + { + // eigen_assert(checkIndexRange(indices)); + if (PlainObjectType::Options&RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(indices); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(indices); + return m_data[index]; + } + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()() + { + static_assert(NumIndices == 0 || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + eigen_internal_assert(rank() == 0); + return m_data[0]; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... 
otherIndices) + { + static_assert(sizeof...(otherIndices) + 1 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + const std::size_t NumDims = sizeof...(otherIndices) + 1; + if (PlainObjectType::Options&RowMajor) { + const array<Index, NumDims> dims = {firstIndex, otherIndices...}; + const Index index = m_dimensions.IndexOfRowMajor(dims); + return m_data[index]; + } else { + const array<Index, NumDims> dims = {firstIndex, otherIndices...}; + const Index index = m_dimensions.IndexOfColMajor(dims); + return m_data[index]; + } + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index index) + { + eigen_internal_assert(index >= 0 && index < size()); + return m_data[index]; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1) + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i1 + i0 * m_dimensions[0]; + return m_data[index]; + } else { + const Index index = i0 + i1 * m_dimensions[0]; + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2) + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i2 + m_dimensions[1] * (i1 + m_dimensions[0] * i0); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2); + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3)); + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0))); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4))); + return m_data[index]; + } + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Self& operator=(const Self& other) + { + typedef TensorAssignOp<Self, const Self> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); + return *this; + } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Self& operator=(const OtherDerived& other) + { + typedef TensorAssignOp<Self, const OtherDerived> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); + return *this; + } + + private: + Scalar* m_data; + Dimensions m_dimensions; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_MAP_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h new file mode 100644 index 0000000000..4dd9af6f92 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -0,0 +1,103 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_H +#define EIGEN_CXX11_TENSOR_TENSOR_META_H + +namespace Eigen { + +template<bool cond> struct Cond {}; + +template<typename T1, typename T2> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +const T1& choose(Cond<true>, const T1& first, const T2&) { + return first; +} + +template<typename T1, typename T2> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +const T2& choose(Cond<false>, const T1&, const T2& second) { + return second; +} + + +// Default packet types +template <typename Scalar, typename Device> +struct PacketType { + typedef typename internal::packet_traits<Scalar>::type type; + static const int size = internal::unpacket_traits<type>::size; +}; + +// For CUDA packet types when using a GpuDevice +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +template <> +struct PacketType<float, GpuDevice> { + typedef float4 type; + static const int size = 4; +}; +template <> +struct PacketType<double, GpuDevice> { + typedef double2 type; + static const int size = 2; +}; +#endif + + +#if defined(EIGEN_HAS_CONSTEXPR) +#define EIGEN_CONSTEXPR constexpr +#else +#define EIGEN_CONSTEXPR +#endif + +// Tuple mimics std::pair but works on e.g. nvcc. +template <typename U, typename V> struct Tuple { + public: + U first; + V second; + + typedef U first_type; + typedef V second_type; + + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Tuple() : first(), second() {} + + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Tuple(const U& f, const V& s) : first(f), second(s) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Tuple& operator= (const Tuple& rhs) { + if (&rhs == this) return *this; + first = rhs.first; + second = rhs.second; + return *this; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void swap(Tuple& rhs) { + using numext::swap; + swap(first, rhs.first); + swap(second, rhs.second); + } +}; + +template <typename U, typename V> +EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +bool operator==(const Tuple<U, V>& x, const Tuple<U, V>& y) { + return (x.first == y.first && x.second == y.second); +} + +template <typename U, typename V> +EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +bool operator!=(const Tuple<U, V>& x, const Tuple<U, V>& y) { + return !(x == y); +} + +#undef EIGEN_CONSTEXPR + +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_META_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h new file mode 100644 index 0000000000..e67f3da31a --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -0,0 +1,817 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H +#define EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H + +namespace Eigen { + +/** \class TensorReshaping + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. 
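+ *
+ * The result reinterprets the coefficients of the input expression with a new
+ * set of dimensions; the total number of coefficients must be preserved.
+ * Rough usage sketch:
+ * \code
+ * Eigen::Tensor<float, 2> input(4, 3);
+ * input.setRandom();
+ * Eigen::DSizes<Eigen::DenseIndex, 3> new_dims(2, 2, 3);
+ * Eigen::Tensor<float, 3> output = input.reshape(new_dims);
+ * \endcode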
+ * + * + */ +namespace internal { +template<typename NewDimensions, typename XprType> +struct traits<TensorReshapingOp<NewDimensions, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = array_size<NewDimensions>::value; + static const int Layout = XprTraits::Layout; +}; + +template<typename NewDimensions, typename XprType> +struct eval<TensorReshapingOp<NewDimensions, XprType>, Eigen::Dense> +{ + typedef const TensorReshapingOp<NewDimensions, XprType>& type; +}; + +template<typename NewDimensions, typename XprType> +struct nested<TensorReshapingOp<NewDimensions, XprType>, 1, typename eval<TensorReshapingOp<NewDimensions, XprType> >::type> +{ + typedef TensorReshapingOp<NewDimensions, XprType> type; +}; + +} // end namespace internal + + + +template<typename NewDimensions, typename XprType> +class TensorReshapingOp : public TensorBase<TensorReshapingOp<NewDimensions, XprType>, WriteAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorReshapingOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorReshapingOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType; + typedef typename Eigen::internal::nested<TensorReshapingOp>::type Nested; + typedef typename Eigen::internal::traits<TensorReshapingOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorReshapingOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReshapingOp(const XprType& expr, const NewDimensions& dims) + : m_xpr(expr), m_dims(dims) {} + + EIGEN_DEVICE_FUNC + const NewDimensions& dimensions() const { return m_dims; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const TensorReshapingOp& other) + { + typedef TensorAssignOp<TensorReshapingOp, const TensorReshapingOp> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run( + assign, DefaultDevice()); + return *this; + } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp<TensorReshapingOp, const OtherDerived> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run( + assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + const NewDimensions m_dims; +}; + + +// Eval as rvalue +template<typename NewDimensions, typename ArgType, typename Device> +struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> +{ + typedef TensorReshapingOp<NewDimensions, ArgType> XprType; + typedef NewDimensions Dimensions; + + enum { + IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + // TODO(andydavis) Re-enable BlockAccess when the performance issue + // with 
block-based reshape is resolved. + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_dimensions(op.dimensions()) + { + // The total size of the reshaped tensor must be equal to the total size + // of the input tensor. + eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions())); + + if (BlockAccess) { + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = + m_impl.dimensions(); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_outputStrides[0] = 1; + for (int i = 1; i < NumOutputDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + } + m_inputStrides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1]; + } + } else { +#ifdef __CUDACC__ + // TODO(andydavis) Remove the following line of code when associated + // nvcc bug b/22973013 is fixed. + for (int i = 0; i < 1; ++i) {} +#endif + m_outputStrides[NumOutputDims - 1] = 1; + for (int i = NumOutputDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + } + m_inputStrides[NumInputDims - 1] = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; + } + } + } + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + static const std::size_t NumOutputDims = + internal::array_size<Dimensions>::value; + static const std::size_t NumInputDims = internal::array_size< + typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + typedef typename internal::TensorBlock< + Index, typename internal::remove_const<Scalar>::type, NumOutputDims, Layout> + OutputTensorBlock; + typedef typename internal::TensorBlock< + Index, typename internal::remove_const<Scalar>::type, NumInputDims, Layout> + InputTensorBlock; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + return m_impl.evalSubExprsIfNeeded(data); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(index); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_impl.template packet<LoadMode>(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector<internal::TensorOpResourceRequirements>* resources) const { + m_impl.getResourceRequirements(resources); + } + + // TODO(andydavis) Reduce the overhead of this function. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + OutputTensorBlock* output_block) const { + // Calculate output block unit-stride inner dimension length. + const DSizes<Index, NumOutputDims>& output_block_sizes = + output_block->block_sizes(); + Index output_inner_dim_size = 1; + Index output_outer_dim_start = NumOutputDims; + for (Index i = 0; i < NumOutputDims; ++i) { + const Index dim = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 
i : NumOutputDims - i - 1; + output_inner_dim_size *= output_block_sizes[dim]; + if (output_block_sizes[dim] < m_dimensions[dim]) { + output_outer_dim_start = i + 1; + break; + } + } + + // Initialize output block iterator state. + struct BlockIteratorState { + Index stride; + Index span; + Index size; + Index count; + }; + array<BlockIteratorState, NumOutputDims> block_iter_state; + + for (Index i = 0; i < NumOutputDims; ++i) { + const Index dim = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? i : NumOutputDims - i - 1; + block_iter_state[i].size = output_block_sizes[dim]; + block_iter_state[i].stride = m_outputStrides[dim]; + block_iter_state[i].span = + block_iter_state[i].stride * (block_iter_state[i].size - 1); + block_iter_state[i].count = 0; + } + + const Index output_outer_dim_size = output_block_sizes.TotalSize() / + output_inner_dim_size; + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = + m_impl.dimensions(); + + Index index = output_block->first_coeff_index(); + for (Index outer_idx = 0; outer_idx < output_outer_dim_size; ++outer_idx) { + Index inner_idx = 0; + while (inner_idx < output_inner_dim_size) { + // Calculate input coords based on 'index'. + array<Index, NumInputDims> input_coords; + Index idx = index; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumInputDims - 1; i > 0; --i) { + input_coords[i] = idx / m_inputStrides[i]; + idx -= input_coords[i] * m_inputStrides[i]; + } + input_coords[0] = idx; + } else { + for (int i = 0; i < NumInputDims - 1; ++i) { + input_coords[i] = idx / m_inputStrides[i]; + idx -= input_coords[i] * m_inputStrides[i]; + } + input_coords[NumInputDims - 1] = idx; + } + + // Calculate target input block shape, using at most + // 'output_inner_dim_size' coefficients along the input block's inner + // dimensions. + DSizes<Index, NumInputDims> input_block_sizes; + Index num_to_allocate = output_inner_dim_size - inner_idx; + for (Index i = 0; i < NumInputDims; ++i) { + const Index dim = + static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? i : NumInputDims - i - 1; + input_block_sizes[dim] = numext::mini( + num_to_allocate, (static_cast<Index>(input_dims[dim]) - + input_coords[dim])); + if (input_coords[dim] == 0) { + num_to_allocate /= input_block_sizes[dim]; + } else { + num_to_allocate = 1; + } + } + + // Calculate input block strides. + DSizes<Index, NumInputDims> input_block_strides; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + input_block_strides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + input_block_strides[i] = input_block_strides[i - 1] * + input_block_sizes[i - 1]; + } + } else { + input_block_strides[NumInputDims - 1] = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + input_block_strides[i] = input_block_strides[i + 1] * + input_block_sizes[i + 1]; + } + } + + // Instantiate and read input block from input tensor. + InputTensorBlock input_block(index, input_block_sizes, + input_block_strides, m_inputStrides, + output_block->data() + outer_idx * + output_inner_dim_size + inner_idx); + + m_impl.block(&input_block); + + const Index input_block_total_size = input_block_sizes.TotalSize(); + index += input_block_total_size; + inner_idx += input_block_total_size; + } + eigen_assert(inner_idx == output_inner_dim_size); + index -= output_inner_dim_size; + // Update index. 
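+ // Advance the block iterator state like an odometer: bump the first outer
+ // dimension that still has room, and rewind every faster-varying dimension
+ // that has wrapped around.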
+ for (Index i = output_outer_dim_start; i < NumOutputDims; ++i) { + if (++block_iter_state[i].count < block_iter_state[i].size) { + index += block_iter_state[i].stride; + break; + } + block_iter_state[i].count = 0; + index -= block_iter_state[i].span; + } + } + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast<Scalar*>(m_impl.data()); } + + EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + + protected: + TensorEvaluator<ArgType, Device> m_impl; + NewDimensions m_dimensions; + DSizes<Index, NumOutputDims> m_outputStrides; + DSizes<Index, NumInputDims> m_inputStrides; +}; + + +// Eval as lvalue +template<typename NewDimensions, typename ArgType, typename Device> + struct TensorEvaluator<TensorReshapingOp<NewDimensions, ArgType>, Device> + : public TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> + +{ + typedef TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> Base; + typedef TensorReshapingOp<NewDimensions, ArgType> XprType; + typedef NewDimensions Dimensions; + + enum { + IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(index); + } + template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + this->m_impl.template writePacket<StoreMode>(index, x); + } +}; + + +/** \class TensorSlicing + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor slicing class. 
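+ *
+ * The result is a view of a rectangular region of the input expression,
+ * described by one start index and one extent per dimension. Rough usage
+ * sketch:
+ * \code
+ * Eigen::Tensor<float, 2> input(4, 5);
+ * input.setRandom();
+ * Eigen::DSizes<Eigen::DenseIndex, 2> offsets(1, 2);
+ * Eigen::DSizes<Eigen::DenseIndex, 2> extents(2, 3);
+ * Eigen::Tensor<float, 2> output = input.slice(offsets, extents);
+ * \endcode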
+ * + * + */ +namespace internal { +template<typename StartIndices, typename Sizes, typename XprType> +struct traits<TensorSlicingOp<StartIndices, Sizes, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = array_size<StartIndices>::value; + static const int Layout = XprTraits::Layout; +}; + +template<typename StartIndices, typename Sizes, typename XprType> +struct eval<TensorSlicingOp<StartIndices, Sizes, XprType>, Eigen::Dense> +{ + typedef const TensorSlicingOp<StartIndices, Sizes, XprType>& type; +}; + +template<typename StartIndices, typename Sizes, typename XprType> +struct nested<TensorSlicingOp<StartIndices, Sizes, XprType>, 1, typename eval<TensorSlicingOp<StartIndices, Sizes, XprType> >::type> +{ + typedef TensorSlicingOp<StartIndices, Sizes, XprType> type; +}; + +} // end namespace internal + + + +template<typename StartIndices, typename Sizes, typename XprType> +class TensorSlicingOp : public TensorBase<TensorSlicingOp<StartIndices, Sizes, XprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorSlicingOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorSlicingOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorSlicingOp>::type Nested; + typedef typename Eigen::internal::traits<TensorSlicingOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorSlicingOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSlicingOp(const XprType& expr, const StartIndices& indices, const Sizes& sizes) + : m_xpr(expr), m_indices(indices), m_sizes(sizes) {} + + EIGEN_DEVICE_FUNC + const StartIndices& startIndices() const { return m_indices; } + EIGEN_DEVICE_FUNC + const Sizes& sizes() const { return m_sizes; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const TensorSlicingOp& other) + { + typedef TensorAssignOp<TensorSlicingOp, const TensorSlicingOp> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run( + assign, DefaultDevice()); + return *this; + } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp<TensorSlicingOp, const OtherDerived> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run( + assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + const StartIndices m_indices; + const Sizes m_sizes; +}; + + +// Eval as rvalue +template<typename StartIndices, typename Sizes, typename ArgType, typename Device> +struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> +{ + typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType; + static const int NumDims = internal::array_size<Sizes>::value; + + enum { + // Alignment can't be guaranteed at compile time 
since it depends on the + // slice offsets and sizes. + IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) + { + for (int i = 0; i < internal::array_size<Dimensions>::value; ++i) { + eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]); + } + + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + const Sizes& output_dims = op.sizes(); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_inputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + } + + // Don't initialize m_fastOutputStrides[0] since it won't ever be accessed. + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]); + } + } else { + m_inputStrides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + } + + m_outputStrides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]); + } + } + + m_block_total_size_max = numext::maxi(static_cast<std::size_t>(1), + device.lastLevelCacheSize() / + sizeof(Scalar)); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const<Scalar>::type ScalarNonConst; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef Sizes Dimensions; + typedef internal::TensorBlock<Index, ScalarNonConst, NumDims, Layout> + TensorBlock; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + m_impl.evalSubExprsIfNeeded(NULL); + if (internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value && data && m_impl.data()) { + Index contiguous_values = 1; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = 0; i < NumDims; ++i) { + contiguous_values *= dimensions()[i]; + if (dimensions()[i] != m_impl.dimensions()[i]) { + break; + } + } + } else { + for (int i = NumDims-1; i >= 0; --i) { + contiguous_values *= dimensions()[i]; + if (dimensions()[i] != m_impl.dimensions()[i]) { + break; + } + } + } + // Use memcpy if it's going to be faster than using the regular evaluation. 
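+ // contiguous_values is the length of the run of coefficients that is laid out
+ // contiguously in both the slice and its input. When that run is long enough,
+ // block-copying each run with memcpy and returning false (meaning the result
+ // has already been written into `data`) beats evaluating the slice
+ // coefficient by coefficient.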
+ if (contiguous_values > m_device.memcpyThreshold()) { + Scalar* src = (Scalar*)m_impl.data(); + for (int i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) { + Index offset = srcCoeff(i); + m_device.memcpy((void*)(data+i), src+offset, contiguous_values * sizeof(Scalar)); + } + return false; + } + } + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(srcCoeff(index)); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < internal::array_prod(dimensions())); + + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_fastOutputStrides[i]; + const Index idx1 = indices[1] / m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; + inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + m_offsets[0]); + inputIndices[1] += (indices[1] + m_offsets[0]); + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / m_fastOutputStrides[i]; + const Index idx1 = indices[1] / m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; + inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + m_offsets[NumDims-1]); + inputIndices[1] += (indices[1] + m_offsets[NumDims-1]); + } + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]); + return rslt; + } + else { + typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + values[0] = m_impl.coeff(inputIndices[0]); + values[packetSize-1] = m_impl.coeff(inputIndices[1]); + for (int i = 1; i < packetSize-1; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) + { + array<Index, NumDims> inputCoords; + for (int i = 0; i < NumDims; ++i) { + inputCoords = coords[i] + this->m_offsets[i]; + } + return m_impl.coeff(inputCoords); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector<internal::TensorOpResourceRequirements>* resources) const { + resources->push_back(internal::TensorOpResourceRequirements( + internal::kSkewedInnerDims, m_block_total_size_max)); + m_impl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + TensorBlock* output_block) const { + TensorBlock input_block(srcCoeff(output_block->first_coeff_index()), + output_block->block_sizes(), + output_block->block_strides(), + m_inputStrides, + output_block->data()); + m_impl.block(&input_block); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { + Scalar* result = m_impl.data(); + if (result) { + Index 
offset = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = 0; i < NumDims; ++i) { + if (m_dimensions[i] != m_impl.dimensions()[i]) { + offset += m_offsets[i] * m_inputStrides[i]; + for (int j = i+1; j < NumDims; ++j) { + if (m_dimensions[j] > 1) { + return NULL; + } + offset += m_offsets[j] * m_inputStrides[j]; + } + break; + } + } + } else { + for (int i = NumDims - 1; i >= 0; --i) { + if (m_dimensions[i] != m_impl.dimensions()[i]) { + offset += m_offsets[i] * m_inputStrides[i]; + for (int j = i-1; j >= 0; --j) { + if (m_dimensions[j] > 1) { + return NULL; + } + offset += m_offsets[j] * m_inputStrides[j]; + } + break; + } + } + } + return result + offset; + } + return NULL; + } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const + { + Index inputIndex = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += (index + m_offsets[0]); + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += (index + m_offsets[NumDims-1]); + } + return inputIndex; + } + + array<Index, NumDims> m_outputStrides; + array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides; + array<Index, NumDims> m_inputStrides; + TensorEvaluator<ArgType, Device> m_impl; + const Device& m_device; + Dimensions m_dimensions; + const StartIndices m_offsets; + std::size_t m_block_total_size_max; +}; + + +// Eval as lvalue +template<typename StartIndices, typename Sizes, typename ArgType, typename Device> +struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> + : public TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> +{ + typedef TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> Base; + typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType; + static const int NumDims = internal::array_size<Sizes>::value; + + enum { + IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const<Scalar>::type ScalarNonConst; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef Sizes Dimensions; + typedef internal::TensorBlock<Index, ScalarNonConst, NumDims, Layout> + TensorBlock; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + if 
(static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; + const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i]; + inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + this->m_offsets[0]); + inputIndices[1] += (indices[1] + this->m_offsets[0]); + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; + const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i]; + inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + this->m_offsets[NumDims-1]); + inputIndices[1] += (indices[1] + this->m_offsets[NumDims-1]); + } + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + this->m_impl.template writePacket<StoreMode>(inputIndices[0], x); + } + else { + EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; + internal::pstore<CoeffReturnType, PacketReturnType>(values, x); + this->m_impl.coeffRef(inputIndices[0]) = values[0]; + this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1]; + for (int i = 1; i < packetSize-1; ++i) { + this->coeffRef(index+i) = values[i]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(const array<Index, NumDims>& coords) + { + array<Index, NumDims> inputCoords; + for (int i = 0; i < NumDims; ++i) { + inputCoords = coords[i] + this->m_offsets[i]; + } + return this->m_impl.coeffRef(inputCoords); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlock& block) { + this->m_impl.writeBlock( + TensorBlock(this->srcCoeff(block.first_coeff_index()), + block.block_sizes(), + block.block_strides(), + this->m_inputStrides, + const_cast<ScalarNonConst*>(block.data()))); + + } +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h new file mode 100644 index 0000000000..d1dff3f38b --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -0,0 +1,388 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_PADDING_H +#define EIGEN_CXX11_TENSOR_TENSOR_PADDING_H + +namespace Eigen { + +/** \class TensorPadding + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor padding class. + * At the moment only padding with a constant value is supported. 
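+ *
+ * A minimal usage sketch (assuming the pad() helper declared on TensorBase
+ * elsewhere in this module, with a padding value of zero):
+ * \code
+ * Eigen::Tensor<float, 2> input(3, 4);
+ * input.setRandom();
+ * Eigen::array<std::pair<int, int>, 2> paddings;
+ * paddings[0] = std::make_pair(1, 2);  // 1 value before, 2 after dimension 0
+ * paddings[1] = std::make_pair(0, 1);  // 1 value after dimension 1
+ * Eigen::Tensor<float, 2> padded = input.pad(paddings);
+ * // padded is 6 x 5; coefficients outside the original 3 x 4 block are 0.
+ * \endcode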
+ * + */ +namespace internal { +template<typename PaddingDimensions, typename XprType> +struct traits<TensorPaddingOp<PaddingDimensions, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename PaddingDimensions, typename XprType> +struct eval<TensorPaddingOp<PaddingDimensions, XprType>, Eigen::Dense> +{ + typedef const TensorPaddingOp<PaddingDimensions, XprType>& type; +}; + +template<typename PaddingDimensions, typename XprType> +struct nested<TensorPaddingOp<PaddingDimensions, XprType>, 1, typename eval<TensorPaddingOp<PaddingDimensions, XprType> >::type> +{ + typedef TensorPaddingOp<PaddingDimensions, XprType> type; +}; + +} // end namespace internal + + + +template<typename PaddingDimensions, typename XprType> +class TensorPaddingOp : public TensorBase<TensorPaddingOp<PaddingDimensions, XprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorPaddingOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorPaddingOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorPaddingOp>::type Nested; + typedef typename Eigen::internal::traits<TensorPaddingOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorPaddingOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPaddingOp(const XprType& expr, const PaddingDimensions& padding_dims, + const Scalar padding_value) + : m_xpr(expr), m_padding_dims(padding_dims), m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC + const PaddingDimensions& padding() const { return m_padding_dims; } + EIGEN_DEVICE_FUNC + Scalar padding_value() const { return m_padding_value; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const PaddingDimensions m_padding_dims; + const Scalar m_padding_value; +}; + + +// Eval as rvalue +template<typename PaddingDimensions, typename ArgType, typename Device> +struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device> +{ + typedef TensorPaddingOp<PaddingDimensions, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<PaddingDimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = true, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()) + { + // Compute dimensions + m_dimensions = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] += m_padding[i].first + m_padding[i].second; + } + const typename TensorEvaluator<ArgType, 
Device>::Dimensions& input_dims = m_impl.dimensions(); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_outputStrides[0] = 1; + if (NumDims > 0) { + m_inputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + m_outputStrides[NumDims] = m_outputStrides[NumDims-1] * m_dimensions[NumDims-1]; + } + } else { + m_outputStrides[NumDims] = 1; + if (NumDims > 0) { + m_inputStrides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + m_outputStrides[i+1] = m_outputStrides[i+2] * m_dimensions[i+1]; + } + m_outputStrides[0] = m_outputStrides[1] * m_dimensions[0]; + } + } + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + eigen_assert(index < dimensions().TotalSize()); + Index inputIndex = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { + return m_paddingValue; + } + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (NumDims > 0) { + if (index < m_padding[0].first || index >= m_dimensions[0] - m_padding[0].second) { + return m_paddingValue; + } + inputIndex += (index - m_padding[0].first); + } + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i+1]; + if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { + return m_paddingValue; + } + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i+1]; + } + if (NumDims > 0) { + if (index < m_padding[NumDims-1].first || + index >= m_dimensions[NumDims-1] - m_padding[NumDims-1].second) { + return m_paddingValue; + } + inputIndex += (index - m_padding[NumDims-1].first); + } + } + return m_impl.coeff(inputIndex); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + return packetColMajor(index); + } + return packetRowMajor(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const + { + Index inputIndex = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + if (NumDims > 0) { + const Index idx = coords[0]; + if (idx < m_padding[0].first || idx >= m_dimensions[0] - m_padding[0].second) { + return m_paddingValue; + } + inputIndex = idx - m_padding[0].first; + } + for (int i = 1; i < NumDims; ++i) { + const Index idx = coords[i]; + if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { + return m_paddingValue; + } + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + } + } else { + if (NumDims > 0) { + const Index idx = 
coords[NumDims-1]; + if (idx < m_padding[NumDims-1].first || idx >= m_dimensions[NumDims-1] - m_padding[NumDims-1].second) { + return m_paddingValue; + } + inputIndex = idx - m_padding[NumDims-1].first; + } + for (int i = NumDims - 2; i >= 0; --i) { + const Index idx = coords[i]; + if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { + return m_paddingValue; + } + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + } + } + return m_impl.coeff(inputIndex); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + const Index initialIndex = index; + Index inputIndex = 0; + for (int i = NumDims - 1; i > 0; --i) { + const Index first = index; + const Index last = index + packetSize - 1; + const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i]; + const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i]; + const Index lastPaddedRight = m_outputStrides[i+1]; + + if (last < lastPaddedLeft) { + // all the coefficient are in the padding zone. + return internal::pset1<PacketReturnType>(m_paddingValue); + } + else if (first >= firstPaddedRight && last < lastPaddedRight) { + // all the coefficient are in the padding zone. + return internal::pset1<PacketReturnType>(m_paddingValue); + } + else if (first >= lastPaddedLeft && last < firstPaddedRight) { + // all the coefficient are between the 2 padding zones. + const Index idx = index / m_outputStrides[i]; + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + else { + // Every other case + return packetWithPossibleZero(initialIndex); + } + } + + const Index last = index + packetSize - 1; + const Index first = index; + + if (NumDims > 0) { + const Index lastPaddedLeft = m_padding[0].first; + const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second); + const Index lastPaddedRight = m_outputStrides[1]; + + if (last < lastPaddedLeft) { + // all the coefficient are in the padding zone. + return internal::pset1<PacketReturnType>(m_paddingValue); + } + else if (first >= firstPaddedRight && last < lastPaddedRight) { + // all the coefficient are in the padding zone. + return internal::pset1<PacketReturnType>(m_paddingValue); + } + else if (first >= lastPaddedLeft && last < firstPaddedRight) { + // all the coefficient are between the 2 padding zones. 
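+      // Example (illustration): padding a 1-D tensor of size 10 with
+      // (2 before, 3 after) yields 15 outputs; output indices 2..11 map back
+      // to inputs 0..9, so a packet whose last lane stays below index 12 is
+      // loaded directly from input index (index - 2).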
+ inputIndex += (index - m_padding[0].first); + return m_impl.template packet<Unaligned>(inputIndex); + } + } + + // Every other case + return packetWithPossibleZero(initialIndex); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + const Index initialIndex = index; + Index inputIndex = 0; + + for (int i = 0; i < NumDims - 1; ++i) { + const Index first = index; + const Index last = index + packetSize - 1; + const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1]; + const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1]; + const Index lastPaddedRight = m_outputStrides[i]; + + if (last < lastPaddedLeft) { + // all the coefficient are in the padding zone. + return internal::pset1<PacketReturnType>(m_paddingValue); + } + else if (first >= firstPaddedRight && last < lastPaddedRight) { + // all the coefficient are in the padding zone. + return internal::pset1<PacketReturnType>(m_paddingValue); + } + else if (first >= lastPaddedLeft && last < firstPaddedRight) { + // all the coefficient are between the 2 padding zones. + const Index idx = index / m_outputStrides[i+1]; + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i+1]; + } + else { + // Every other case + return packetWithPossibleZero(initialIndex); + } + } + + const Index last = index + packetSize - 1; + const Index first = index; + + if (NumDims > 0) { + const Index lastPaddedLeft = m_padding[NumDims-1].first; + const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second); + const Index lastPaddedRight = m_outputStrides[NumDims-1]; + + if (last < lastPaddedLeft) { + // all the coefficient are in the padding zone. + return internal::pset1<PacketReturnType>(m_paddingValue); + } + else if (first >= firstPaddedRight && last < lastPaddedRight) { + // all the coefficient are in the padding zone. + return internal::pset1<PacketReturnType>(m_paddingValue); + } + else if (first >= lastPaddedLeft && last < firstPaddedRight) { + // all the coefficient are between the 2 padding zones. 
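+      // Example (illustration): this mirrors the column-major case above; the
+      // unpadded band now lives in the last dimension, and a packet entirely
+      // inside it is read from input index (index - m_padding[NumDims-1].first).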
+ inputIndex += (index - m_padding[NumDims-1].first); + return m_impl.template packet<Unaligned>(inputIndex); + } + } + + // Every other case + return packetWithPossibleZero(initialIndex); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + + Dimensions m_dimensions; + array<Index, NumDims+1> m_outputStrides; + array<Index, NumDims> m_inputStrides; + TensorEvaluator<ArgType, Device> m_impl; + PaddingDimensions m_padding; + + Scalar m_paddingValue; +}; + + + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_PADDING_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h new file mode 100644 index 0000000000..c89022ab8e --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -0,0 +1,314 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_PATCH_H +#define EIGEN_CXX11_TENSOR_TENSOR_PATCH_H + +namespace Eigen { + +/** \class TensorPatch + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor patch class. 
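+ *
+ * A rough usage sketch (assuming the extract_patches() helper declared on
+ * TensorBase elsewhere in this module; for a column-major input the patch
+ * dimensions come first and the patch index is the last output dimension):
+ * \code
+ * Eigen::Tensor<float, 2> input(3, 3);
+ * input.setRandom();
+ * Eigen::array<ptrdiff_t, 2> patch_dims;
+ * patch_dims[0] = 2;
+ * patch_dims[1] = 2;
+ * // Every 2x2 patch of the 3x3 input: (3-2+1) * (3-2+1) = 4 patches.
+ * Eigen::Tensor<float, 3> patches = input.extract_patches(patch_dims);
+ * // patches is 2 x 2 x 4 for the default column-major layout.
+ * \endcode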
+ * + * + */ +namespace internal { +template<typename PatchDim, typename XprType> +struct traits<TensorPatchOp<PatchDim, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions + 1; + static const int Layout = XprTraits::Layout; +}; + +template<typename PatchDim, typename XprType> +struct eval<TensorPatchOp<PatchDim, XprType>, Eigen::Dense> +{ + typedef const TensorPatchOp<PatchDim, XprType>& type; +}; + +template<typename PatchDim, typename XprType> +struct nested<TensorPatchOp<PatchDim, XprType>, 1, typename eval<TensorPatchOp<PatchDim, XprType> >::type> +{ + typedef TensorPatchOp<PatchDim, XprType> type; +}; + +} // end namespace internal + + + +template<typename PatchDim, typename XprType> +class TensorPatchOp : public TensorBase<TensorPatchOp<PatchDim, XprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorPatchOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorPatchOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorPatchOp>::type Nested; + typedef typename Eigen::internal::traits<TensorPatchOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorPatchOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPatchOp(const XprType& expr, const PatchDim& patch_dims) + : m_xpr(expr), m_patch_dims(patch_dims) {} + + EIGEN_DEVICE_FUNC + const PatchDim& patch_dims() const { return m_patch_dims; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const PatchDim m_patch_dims; +}; + + +// Eval as rvalue +template<typename PatchDim, typename ArgType, typename Device> +struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> +{ + typedef TensorPatchOp<PatchDim, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value + 1; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = true, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + Index num_patches = 1; + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + const PatchDim& patch_dims = op.patch_dims(); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = 0; i < NumDims-1; ++i) { + m_dimensions[i] = patch_dims[i]; + num_patches *= (input_dims[i] - patch_dims[i] + 1); + } + m_dimensions[NumDims-1] = num_patches; + + m_inputStrides[0] = 1; + m_patchStrides[0] = 1; + for (int i = 1; i < NumDims-1; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * 
input_dims[i-1]; + m_patchStrides[i] = m_patchStrides[i-1] * (input_dims[i-1] - patch_dims[i-1] + 1); + } + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + } else { + for (int i = 0; i < NumDims-1; ++i) { + m_dimensions[i+1] = patch_dims[i]; + num_patches *= (input_dims[i] - patch_dims[i] + 1); + } + m_dimensions[0] = num_patches; + + m_inputStrides[NumDims-2] = 1; + m_patchStrides[NumDims-2] = 1; + for (int i = NumDims-3; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + m_patchStrides[i] = m_patchStrides[i+1] * (input_dims[i+1] - patch_dims[i+1] + 1); + } + m_outputStrides[NumDims-1] = 1; + for (int i = NumDims-2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; + } + } + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + Index output_stride_index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? NumDims - 1 : 0; + // Find the location of the first element of the patch. + Index patchIndex = index / m_outputStrides[output_stride_index]; + // Find the offset of the element wrt the location of the first element. + Index patchOffset = index - patchIndex * m_outputStrides[output_stride_index]; + Index inputIndex = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 2; i > 0; --i) { + const Index patchIdx = patchIndex / m_patchStrides[i]; + patchIndex -= patchIdx * m_patchStrides[i]; + const Index offsetIdx = patchOffset / m_outputStrides[i]; + patchOffset -= offsetIdx * m_outputStrides[i]; + inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; + } + } else { + for (int i = 0; i < NumDims - 2; ++i) { + const Index patchIdx = patchIndex / m_patchStrides[i]; + patchIndex -= patchIdx * m_patchStrides[i]; + const Index offsetIdx = patchOffset / m_outputStrides[i+1]; + patchOffset -= offsetIdx * m_outputStrides[i+1]; + inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; + } + } + inputIndex += (patchIndex + patchOffset); + return m_impl.coeff(inputIndex); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + Index output_stride_index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 
NumDims - 1 : 0; + Index indices[2] = {index, index + packetSize - 1}; + Index patchIndices[2] = {indices[0] / m_outputStrides[output_stride_index], + indices[1] / m_outputStrides[output_stride_index]}; + Index patchOffsets[2] = {indices[0] - patchIndices[0] * m_outputStrides[output_stride_index], + indices[1] - patchIndices[1] * m_outputStrides[output_stride_index]}; + + Index inputIndices[2] = {0, 0}; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 2; i > 0; --i) { + const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], + patchIndices[1] / m_patchStrides[i]}; + patchIndices[0] -= patchIdx[0] * m_patchStrides[i]; + patchIndices[1] -= patchIdx[1] * m_patchStrides[i]; + + const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i], + patchOffsets[1] / m_outputStrides[i]}; + patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i]; + patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i]; + + inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i]; + inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i]; + } + } else { + for (int i = 0; i < NumDims - 2; ++i) { + const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], + patchIndices[1] / m_patchStrides[i]}; + patchIndices[0] -= patchIdx[0] * m_patchStrides[i]; + patchIndices[1] -= patchIdx[1] * m_patchStrides[i]; + + const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i+1], + patchOffsets[1] / m_outputStrides[i+1]}; + patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i+1]; + patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i+1]; + + inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i]; + inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i]; + } + } + inputIndices[0] += (patchIndices[0] + patchOffsets[0]); + inputIndices[1] += (patchIndices[1] + patchOffsets[1]); + + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]); + return rslt; + } + else { + EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; + values[0] = m_impl.coeff(inputIndices[0]); + values[packetSize-1] = m_impl.coeff(inputIndices[1]); + for (int i = 1; i < packetSize-1; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const + { + Index patch_coord_idx = Layout == ColMajor ? NumDims - 1 : 0; + // Location of the first element of the patch. + const Index patchIndex = coords[patch_coord_idx]; + + if (TensorEvaluator<ArgType, Device>::CoordAccess) { + array<Index, NumDims-1> inputCoords; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 2; i > 0; --i) { + const Index patchIdx = patchIndex / m_patchStrides[i]; + patchIndex -= patchIdx * m_patchStrides[i]; + const Index offsetIdx = coords[i]; + inputCoords[i] = coords[i] + patchIdx; + } + } else { + for (int i = 0; i < NumDims - 2; ++i) { + const Index patchIdx = patchIndex / m_patchStrides[i]; + patchIndex -= patchIdx * m_patchStrides[i]; + const Index offsetIdx = coords[i+1]; + inputCoords[i] = coords[i+1] + patchIdx; + } + } + Index coords_idx = Layout == ColMajor ? 
0 : NumDims - 1; + inputCoords[0] = (patchIndex + coords[coords_idx]); + return m_impl.coeff(inputCoords); + } + else { + Index inputIndex = 0; + if (Layout == ColMajor) { + for (int i = NumDims - 2; i > 0; --i) { + const Index patchIdx = patchIndex / m_patchStrides[i]; + patchIndex -= patchIdx * m_patchStrides[i]; + const Index offsetIdx = coords[i]; + inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; + } + } else { + for (int i = 0; i < NumDims - 2; ++i) { + const Index patchIdx = patchIndex / m_patchStrides[i]; + patchIndex -= patchIdx * m_patchStrides[i]; + const Index offsetIdx = coords[i+1]; + inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; + } + } + Index coords_idx = Layout == ColMajor ? 0 : NumDims - 1; + inputIndex += (patchIndex + coords[coords_idx]); + return m_impl.coeff(inputIndex); + } + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array<Index, NumDims> m_outputStrides; + array<Index, NumDims-1> m_inputStrides; + array<Index, NumDims-1> m_patchStrides; + + TensorEvaluator<ArgType, Device> m_impl; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_PATCH_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h new file mode 100644 index 0000000000..a70d5ae1f0 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -0,0 +1,1141 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H + +namespace Eigen { + +/** \class TensorReduction + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reduction class. 
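+ *
+ * A minimal usage sketch (assuming the sum() helper declared on TensorBase
+ * elsewhere in this module; reduce() with an explicit reducer follows the
+ * same pattern):
+ * \code
+ * Eigen::Tensor<float, 3> input(2, 3, 4);
+ * input.setRandom();
+ * Eigen::array<ptrdiff_t, 1> dims;
+ * dims[0] = 1;  // reduce over the middle dimension
+ * Eigen::Tensor<float, 2> summed = input.sum(dims);
+ * // summed is 2 x 4; summed(i, k) accumulates input(i, j, k) over j.
+ * \endcode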
+ * + */ + +namespace internal { +template<typename Op, typename Dims, typename XprType> +struct traits<TensorReductionOp<Op, Dims, XprType> > + : traits<XprType> +{ + typedef typename traits<XprType>::Scalar Scalar; + typedef typename traits<XprType>::StorageKind StorageKind; + typedef typename traits<XprType>::Index Index; + typedef typename XprType::Nested Nested; +}; + +template<typename Op, typename Dims, typename XprType> +struct eval<TensorReductionOp<Op, Dims, XprType>, Eigen::Dense> +{ + typedef const TensorReductionOp<Op, Dims, XprType>& type; +}; + +template<typename Op, typename Dims, typename XprType> +struct nested<TensorReductionOp<Op, Dims, XprType>, 1, typename eval<TensorReductionOp<Op, Dims, XprType> >::type> +{ + typedef TensorReductionOp<Op, Dims, XprType> type; +}; + + + +template <typename InputDims, typename OutputDims, typename ReducedDims> EIGEN_DEVICE_FUNC +static void partition_dims(const InputDims& input_dims, + const array<bool, internal::array_size<InputDims>::value>& reduced, + OutputDims* output_dims, ReducedDims* reduced_dims) { + const int NumInputDims = internal::array_size<InputDims>::value; + int outputIndex = 0; + int reduceIndex = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (OutputDims::count == 0 || reduced[i]) { + (*reduced_dims)[reduceIndex] = input_dims[i]; + ++reduceIndex; + } else { + (*output_dims)[outputIndex] = input_dims[i]; + ++outputIndex; + } + } +} + + + +template <typename ReducedDims, int NumTensorDims, int Layout> +struct are_inner_most_dims { + static const bool value = false; +}; +template <typename ReducedDims, int NumTensorDims, int Layout> +struct preserve_inner_most_dims { + static const bool value = false; +}; + +#if defined(EIGEN_HAS_CONSTEXPR) && defined(EIGEN_HAS_VARIADIC_TEMPLATES) +// The use of the tmp1, tmp2, tmp3 intermediate variables is needed for nvcc 7 +// to compile the code below. NVidia is working on a fix. 
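+// Illustration of what these traits compute: for a column-major tensor,
+// reducing the statically known index list {0, 1} touches only the innermost
+// dimensions, so are_inner_most_dims evaluates to true and the reduction can
+// walk the input linearly; a list such as {0, 2} does not qualify and the
+// generic per-dimension path is used instead.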
+template <typename ReducedDims, int NumTensorDims> +struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{ + static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>()(); + static const bool tmp2 = index_statically_eq<ReducedDims>()(0, 0); + static const bool tmp3 = index_statically_eq<ReducedDims>()(array_size<ReducedDims>::value-1, array_size<ReducedDims>::value-1); + static const bool value = tmp1 & tmp2 & tmp3; +}; +template <typename ReducedDims, int NumTensorDims> +struct are_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{ + static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>()(); + static const bool tmp2 = index_statically_eq<ReducedDims>()(0, NumTensorDims - array_size<ReducedDims>::value); + static const bool tmp3 = index_statically_eq<ReducedDims>()(array_size<ReducedDims>::value - 1, NumTensorDims - 1); + static const bool value = tmp1 & tmp2 & tmp3; + +}; +template <typename ReducedDims, int NumTensorDims> +struct preserve_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{ + static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>()(); + static const bool tmp2 = index_statically_gt<ReducedDims>()(0, 0); + static const bool value = tmp1 & tmp2; + +}; +template <typename ReducedDims, int NumTensorDims> +struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{ + static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>()(); + static const bool tmp2 = index_statically_lt<ReducedDims>()(array_size<ReducedDims>::value - 1, NumTensorDims - 1); + static const bool value = tmp1 & tmp2; +}; +#endif + + +template <int DimIndex, typename Self, typename Op> +struct GenericDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) { + EIGEN_STATIC_ASSERT(DimIndex >= 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex]; + GenericDimReducer<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum); + } + } +}; +template <typename Self, typename Op> +struct GenericDimReducer<-1, Self, Op> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) { + reducer.reduce(self.m_impl.coeff(firstIndex), accum); + } +}; + +template <typename Self, typename Op, bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)> +struct InnerMostDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { + typename Self::CoeffReturnType accum = reducer.initialize(); + for (typename Self::Index j = 0; j < numValuesToReduce; ++j) { + reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); + } + return reducer.finalize(accum); + } +}; + +template <typename Self, typename Op> +struct InnerMostDimReducer<Self, Op, true> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { + const int packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size; + const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize; + typename 
Self::PacketReturnType p = reducer.template initializePacket<typename Self::PacketReturnType>(); + for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) { + reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &p); + } + typename Self::CoeffReturnType accum = reducer.initialize(); + for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) { + reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); + } + return reducer.finalizeBoth(accum, p); + } +}; + +template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess & Op::PacketAccess)> +struct InnerMostDimPreserver { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) { + eigen_assert(false && "should never be called"); + } +}; + +template <int DimIndex, typename Self, typename Op> +struct InnerMostDimPreserver<DimIndex, Self, Op, true> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) { + EIGEN_STATIC_ASSERT(DimIndex >= 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex]; + InnerMostDimPreserver<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum); + } + } +}; + +template <typename Self, typename Op> +struct InnerMostDimPreserver<-1, Self, Op, true> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) { + reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex), accum); + } +}; + +// Default full reducer +template <typename Self, typename Op, typename Device, bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)> +struct FullReducer { + static const bool HasOptimizedImplementation = false; + + static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) { + const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions()); + *output = InnerMostDimReducer<Self, Op>::reduce(self, 0, num_coeffs, reducer); + } +}; + + +#ifdef EIGEN_USE_THREADS +// Multithreaded full reducers +template <typename Eval, typename Op, bool Vectorizable = (Eval::InputPacketAccess & Op::PacketAccess)> +struct FullReducerShard { + static void run(const Eval& eval, typename Eval::Index firstIndex, typename Eval::Index numValuesToReduce, Op& reducer, FullReducerShard* shard) { + + shard->saccum = reducer.initialize(); + for (typename Eval::Index j = 0; j < numValuesToReduce; ++j) { + reducer.reduce(eval.m_impl.coeff(firstIndex + j), &shard->saccum); + } + } + + typename Eval::CoeffReturnType saccum; +}; + +template <typename Eval, typename Op> +struct FullReducerShard<Eval, Op, true> { + static void run(const Eval& eval, typename Eval::Index firstIndex, typename Eval::Index numValuesToReduce, Op& reducer, FullReducerShard* shard) { + + const int packetSize = internal::unpacket_traits<typename Eval::PacketReturnType>::size; + const typename Eval::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize; + + shard->paccum = reducer.template initializePacket<typename Eval::PacketReturnType>(); + for (typename Eval::Index j = 0; j < VectorizedSize; j += packetSize) { + 
reducer.reducePacket(eval.m_impl.template packet<Unaligned>(firstIndex + j), &shard->paccum); + } + shard->saccum = reducer.initialize(); + for (typename Eval::Index j = VectorizedSize; j < numValuesToReduce; ++j) { + reducer.reduce(eval.m_impl.coeff(firstIndex + j), &shard->saccum); + } + } + + typename Eval::PacketReturnType paccum; + typename Eval::CoeffReturnType saccum; +}; + + +template <typename Self, typename Op> +struct FullReducer<Self, Op, ThreadPoolDevice, false> { + static const bool HasOptimizedImplementation = !Op::IsStateful; + + // launch one reducer per thread and accumulate the result. + static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device, typename Self::CoeffReturnType* output) { + typedef typename Self::Index Index; + const Index num_coeffs = array_prod(self.m_impl.dimensions()); + const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs)/device.numThreads()); + const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0; + eigen_assert(num_coeffs >= numblocks * blocksize); + + FixedSizeVector<Notification*> results(numblocks); + FixedSizeVector<FullReducerShard<Self, Op, false> > shards(numblocks, FullReducerShard<Self, Op, false>()); + for (Index i = 0; i < numblocks; ++i) { + results.push_back(device.enqueue(&FullReducerShard<Self, Op, false>::run, self, i*blocksize, blocksize, reducer, &shards[i])); + } + + FullReducerShard<Self, Op, false> finalShard; + if (numblocks * blocksize < num_coeffs) { + FullReducerShard<Self, Op, false>::run(self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer, &finalShard); + } else { + finalShard.saccum = reducer.initialize(); + } + + for (Index i = 0; i < numblocks; ++i) { + wait_until_ready(results[i]); + delete results[i]; + } + + for (Index i = 0; i < numblocks; ++i) { + reducer.reduce(shards[i].saccum, &finalShard.saccum); + } + *output = reducer.finalize(finalShard.saccum); + } +}; + +template <typename Self, typename Op> +struct FullReducer<Self, Op, ThreadPoolDevice, true> { + static const bool HasOptimizedImplementation = !Op::IsStateful; + + // launch one reducer per thread and accumulate the result. + static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device, typename Self::CoeffReturnType* output) { + typedef typename Self::Index Index; + const Index num_coeffs = array_prod(self.m_impl.dimensions()); + const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs)/device.numThreads()); + const Index numblocks = blocksize > 0 ? 
num_coeffs / blocksize : 0; + eigen_assert(num_coeffs >= numblocks * blocksize); + + FixedSizeVector<Notification*> results(numblocks); + FixedSizeVector<FullReducerShard<Self, Op, true> > shards(numblocks, FullReducerShard<Self, Op, true>()); + for (Index i = 0; i < numblocks; ++i) { + results.push_back(device.enqueue(&FullReducerShard<Self, Op, true>::run, self, i*blocksize, blocksize, reducer, &shards[i])); + } + + FullReducerShard<Self, Op, true> finalShard; + if (numblocks * blocksize < num_coeffs) { + FullReducerShard<Self, Op, true>::run(self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer, &finalShard); + } else { + finalShard.paccum = reducer.template initializePacket<typename Self::PacketReturnType>(); + finalShard.saccum = reducer.initialize(); + } + + for (Index i = 0; i < numblocks; ++i) { + wait_until_ready(results[i]); + delete results[i]; + } + + for (Index i = 0; i < numblocks; ++i) { + reducer.reducePacket(shards[i].paccum, &finalShard.paccum); + reducer.reduce(shards[i].saccum, &finalShard.saccum); + } + + *output = reducer.finalizeBoth(finalShard.saccum, finalShard.paccum); + } +}; +#endif + + +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +// Full reducers for GPU, don't vectorize for now + +// Reducer function that enables multiple cuda thread to safely accumulate at the same +// output address. It basically reads the current value of the output variable, and +// attempts to update it with the new value. If in the meantime another cuda thread +// updated the content of the output address it will try again. +template <typename T, typename R> +__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) { +#if __CUDA_ARCH__ >= 300 + if (sizeof(T) == 4) + { + unsigned int oldval = *reinterpret_cast<unsigned int*>(output); + unsigned int newval = oldval; + reducer.reduce(accum, reinterpret_cast<T*>(&newval)); + if (newval == oldval) { + return; + } + unsigned int readback; + while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reduce(accum, reinterpret_cast<T*>(&newval)); + if (newval == oldval) { + return; + } + } + } + else if (sizeof(T) == 8) { + unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output); + unsigned long long newval = oldval; + reducer.reduce(accum, reinterpret_cast<T*>(&newval)); + if (newval == oldval) { + return; + } + unsigned long long readback; + while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reduce(accum, reinterpret_cast<T*>(&newval)); + if (newval == oldval) { + return; + } + } + } + else { + assert(0 && "Wordsize not supported"); + } +#else + assert(0 && "Shouldn't be called on unsupported device"); +#endif +} + +template <typename T> +__device__ inline void atomicReduce(T* output, T accum, SumReducer<T>&) { +#if __CUDA_ARCH__ >= 300 + atomicAdd(output, accum); +#else + assert(0 && "Shouldn't be called on unsupported device"); +#endif +} + +template <int BlockSize, int NumPerThread, typename Self, + typename Reducer, typename Index> +__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs, + typename Self::CoeffReturnType* output) { + const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x; + + if (first_index == 0) { + *output = reducer.initialize(); + } + + typename Self::CoeffReturnType accum = reducer.initialize(); + for (Index i = 0; i < NumPerThread; ++i) { 
+ const Index index = first_index + i * BlockSize; + if (index >= num_coeffs) { + break; + } + typename Self::CoeffReturnType val = input.m_impl.coeff(index); + reducer.reduce(val, &accum); + } + + for (int offset = warpSize/2; offset > 0; offset /= 2) { + reducer.reduce(__shfl_down(accum, offset), &accum); + } + + if ((threadIdx.x & (warpSize - 1)) == 0) { + atomicReduce(output, accum, reducer); + } +} + + +template <typename Self, typename Op, bool Vectorizable> +struct FullReducer<Self, Op, GpuDevice, Vectorizable> { + // Unfortunately nvidia doesn't support well exotic types such as complex, + // so reduce the scope of the optimized version of the code to the simple case + // of floats. + static const bool HasOptimizedImplementation = !Op::IsStateful && + internal::is_same<typename Self::CoeffReturnType, float>::value; + + template <typename OutputType> + static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { + assert(false && "Should only be called on floats"); + } + + static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { + typedef typename Self::Index Index; + + const Index num_coeffs = array_prod(self.m_impl.dimensions()); + const int block_size = 256; + const int num_per_thread = 128; + const int num_blocks = std::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread)); + LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread>), + num_blocks, block_size, 0, device, reducer, self, num_coeffs, output); + } +}; + +#endif + + +template <typename Self, typename Op, + bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)> +class BlockReducer { + public: + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::CoeffReturnType CoeffReturnType; + typedef typename Self::PacketReturnType PacketReturnType; + explicit BlockReducer(const Op& reducer) : op_(reducer) { + accum_ = op_.initialize(); + } + void Reduce(Index index, Index num_values_to_reduce, Scalar* data) { + for (Index i = 0; i < num_values_to_reduce; ++i) { + op_.reduce(data[index + i], &accum_); + } + } + CoeffReturnType Finalize() { + return op_.finalize(accum_); + } + PacketReturnType FinalizePacket() { + // TODO(andydavis) This function should not be called for Scalar + // reductions: clean this up or add an assert here. 
+ return PacketReturnType(); + } + + private: + CoeffReturnType accum_; + Op op_; +}; + +template <typename Self, typename Op> +class BlockReducer<Self, Op, true> { + public: + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::CoeffReturnType CoeffReturnType; + typedef typename Self::PacketReturnType PacketReturnType; + explicit BlockReducer(const Op& reducer) : op_(reducer) { + vaccum_ = op_.template initializePacket<PacketReturnType>(); + accum_ = op_.initialize(); + } + void Reduce(Index index, Index num_values_to_reduce, Scalar* data) { + const int packet_size = internal::unpacket_traits<PacketReturnType>::size; + const Index vectorized_size = (num_values_to_reduce / packet_size) * + packet_size; + for (Index i = 0; i < vectorized_size; i += packet_size) { + op_.reducePacket(internal::ploadt<PacketReturnType, Unaligned>( + &data[index + i]), &vaccum_); + } + for (Index i = vectorized_size; i < num_values_to_reduce; ++i) { + op_.reduce(data[index + i], &accum_); + } + } + CoeffReturnType Finalize() { + return op_.finalizeBoth(accum_, vaccum_); + } + PacketReturnType FinalizePacket() { + return op_.finalizePacket(vaccum_); + } + + private: + PacketReturnType vaccum_; + CoeffReturnType accum_; + Op op_; +}; + +} // end namespace internal + + +template <typename Op, typename Dims, typename XprType> +class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType>, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits<TensorReductionOp>::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename Eigen::internal::nested<TensorReductionOp>::type Nested; + typedef typename Eigen::internal::traits<TensorReductionOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorReductionOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReductionOp(const XprType& expr, const Dims& dims) : m_expr(expr), m_dims(dims) + { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer) : m_expr(expr), m_dims(dims), m_reducer(reducer) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const XprType& expression() const { return m_expr; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Dims& dims() const { return m_dims; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Op& reducer() const { return m_reducer; } + + protected: + typename XprType::Nested m_expr; + const Dims m_dims; + const Op m_reducer; +}; + + +// Eval as rvalue +template<typename Op, typename Dims, typename ArgType, typename Device> +struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> +{ + typedef TensorReductionOp<Op, Dims, ArgType> XprType; + typedef typename XprType::Index Index; + typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions; + static const int NumInputDims = internal::array_size<InputDimensions>::value; + static const int NumReducedDims = internal::array_size<Dims>::value; + EIGEN_STATIC_ASSERT(NumInputDims >= NumReducedDims, YOU_MADE_A_PROGRAMMING_MISTAKE) + static const int NumOutputDims = NumInputDims - NumReducedDims; + typedef DSizes<Index, NumOutputDims> Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const<Scalar>::type ScalarNonConst; + typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> Self; + 
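+  // Note (illustration): InputPacketAccess records whether the wrapped
+  // expression itself can be read packet-wise; vectorized reduction is only
+  // enabled when both the input and the reducer support packets (see
+  // PacketAccess in the enum below).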
static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess; + + enum { + IsAligned = false, + PacketAccess = Self::InputPacketAccess && Op::PacketAccess, + BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + typedef typename internal::TensorBlock<Index, ScalarNonConst, NumOutputDims, + Layout> OutputTensorBlock; + typedef typename internal::TensorBlock<Index, ScalarNonConst, NumInputDims, + Layout> InputTensorBlock; + + static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value; + static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value; + static const bool RunningFullReduction = (NumInputDims==NumReducedDims); + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device) + { + EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + for (int i = 0; i < NumInputDims; ++i) { + m_reduced_dim[i] = false; + } + for (int i = 0; i < NumReducedDims; ++i) { + eigen_assert(op.dims()[i] >= 0); + eigen_assert(op.dims()[i] < NumInputDims); + m_reduced_dim[op.dims()[i]] = true; + } + + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + internal::partition_dims(input_dims, m_reduced_dim, &m_dimensions, &m_reducedDims); + + // Precompute output strides. + if (NumOutputDims > 0) { + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_outputStrides[0] = 1; + for (int i = 1; i < NumOutputDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]); + } + } else { + m_outputStrides[NumOutputDims - 1] = 1; + for (int i = NumOutputDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]); + } + } + } + + // Precompute input strides. + if (NumInputDims > 0) { + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_inputStrides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + } + } else { + m_inputStrides[NumInputDims - 1] = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; + } + } + } + + int outputIndex = 0; + int reduceIndex = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (m_reduced_dim[i]) { + m_reducedStrides[reduceIndex] = m_inputStrides[i]; + ++reduceIndex; + } else { + m_preservedStrides[outputIndex] = m_inputStrides[i]; + m_output_to_input_dim_map[outputIndex] = i; + ++outputIndex; + } + } + + m_numValuesToReduce + = NumOutputDims == 0 ? internal::array_prod(input_dims) + : (static_cast<int>(Layout) == static_cast<int>(ColMajor)) + ? 
m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1]; + + m_block_total_size_max = numext::maxi(static_cast<std::size_t>(1), + device.lastLevelCacheSize() / + sizeof(Scalar)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + m_impl.evalSubExprsIfNeeded(NULL); + + // Use the FullReducer if possible. + if (RunningFullReduction && internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation && + ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || + (internal::array_prod(m_impl.dimensions()) > 1024 * 1024))) { + + bool need_assign = false; + if (!data) { + m_result = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType))); + data = m_result; + need_assign = true; + } + + Op reducer(m_reducer); + internal::FullReducer<Self, Op, Device>::run(*this, reducer, m_device, data); + return need_assign; + } + + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + + if (m_result) { + m_device.deallocate(m_result); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + if (RunningFullReduction && m_result) { + return *m_result; + } + Op reducer(m_reducer); + if (ReducingInnerMostDims) { + return internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstInput(index), + m_numValuesToReduce, reducer); + } else { + typename Self::CoeffReturnType accum = reducer.initialize(); + internal::GenericDimReducer<NumReducedDims-1, Self, Op>::reduce(*this, firstInput(index), reducer, &accum); + return reducer.finalize(accum); + } + } + + // TODO(bsteiner): provide a more efficient implementation. + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); + + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + if (ReducingInnerMostDims) { + const Index num_values_to_reduce = m_numValuesToReduce; + const Index firstIndex = firstInput(index); + for (Index i = 0; i < packetSize; ++i) { + Op reducer(m_reducer); + values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce, + num_values_to_reduce, reducer); + } + } else if (PreservingInnerMostDims) { + const Index firstIndex = firstInput(index); + const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumOutputDims - 1; + // TBD: extend this the the n innermost dimensions that we preserve. 
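+      // The vectorized path below is only taken when the whole packet lies
+      // inside the innermost preserved dimension; otherwise we fall back to
+      // per-coefficient calls to coeff().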
+ if (((firstIndex % m_dimensions[innermost_dim]) + packetSize - 1) < m_dimensions[innermost_dim]) { + Op reducer(m_reducer); + typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>(); + internal::InnerMostDimPreserver<NumReducedDims-1, Self, Op>::reduce(*this, firstIndex, reducer, &accum); + return reducer.finalizePacket(accum); + } else { + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index + i); + } + } + } else { + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index + i); + } + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector<internal::TensorOpResourceRequirements>* resources) const { + resources->push_back(internal::TensorOpResourceRequirements( + internal::kSkewedInnerDims, m_block_total_size_max)); + m_impl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + OutputTensorBlock* output_block) const { + // Special case full reductions to avoid input block copy below. + if (NumInputDims == NumReducedDims) { + eigen_assert(output_block->first_coeff_index() == 0); + eigen_assert(output_block->block_sizes().TotalSize() == 1); + Op reducer(m_reducer); + output_block->data()[0] = internal::InnerMostDimReducer<Self, Op>::reduce( + *this, 0, m_numValuesToReduce, reducer); + return; + } + + // Calculate input tensor 'slice' required to reduce output block coeffs. + DSizes<Index, NumInputDims> input_slice_sizes(m_impl.dimensions()); + for (int i = 0; i < NumOutputDims; ++i) { + // Clip preserved input dimensions by output block size. + input_slice_sizes[m_output_to_input_dim_map[i]] = + output_block->block_sizes()[i]; + } + + // Shard input tensor slice into blocks (because it could be large if we + // need to reduce along several dimensions to calculate required output + // coefficients). + const Index max_coeff_count = + numext::mini(((m_device.firstLevelCacheSize()) / sizeof(Scalar)), + input_slice_sizes.TotalSize()); + + // Calculate max output shard size needed to keep working set of reducers + // in L1, while leaving enough space for reducer overhead and 'packet_size' + // reductions. + DSizes<Index, NumInputDims> target_input_block_sizes; + CalculateTargetInputBlockShape(max_coeff_count, input_slice_sizes, + &target_input_block_sizes); + // Calculate indices for first preserved dimension. + const Index first_preserved_dim_output_index = + static_cast<int>(Layout) == static_cast<int>(ColMajor) ? + 0 : NumOutputDims - 1; + const Index first_preserved_dim_input_index = m_output_to_input_dim_map[ + first_preserved_dim_output_index]; + const bool inner_most_dim_preserved = first_preserved_dim_input_index == + (static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : + NumInputDims - 1) | PreservingInnerMostDims; + + // Calculate output block inner/outer dimension sizes. + const Index output_block_inner_dim_size = output_block->block_sizes()[ + first_preserved_dim_output_index]; + const Index output_block_outer_dim_size = + output_block->block_sizes().TotalSize() / output_block_inner_dim_size; + // Calculate shard size for first preserved dimension. + const Index output_shard_size = target_input_block_sizes[ + first_preserved_dim_input_index]; + const Index num_output_shards = + (output_block_inner_dim_size + output_shard_size - 1) / + output_shard_size; + + // Initialize 'tensor_slice_offsets' from input coords of output index. 
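+    // 'tensor_slice_offsets' holds, for each input dimension, the coordinate
+    // of this output block's first coefficient (reduced dimensions start at 0);
+    // it is advanced below as output shards are processed.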
+ DSizes<Index, NumInputDims> tensor_slice_offsets; + GetInputCoordsForOutputIndex(output_block->first_coeff_index(), + &tensor_slice_offsets); + + // Store tensor slice offset in first preserved dimension to be used + // to update tensor slice extents in loop below. + const Index first_preserved_dim_offset_start = tensor_slice_offsets[ + first_preserved_dim_input_index]; + + array<BlockIteratorState, NumOutputDims> block_iter_state; + + // Initialize state used to iterate through output coefficients + // and update 'tensor_slice_offsets' in outer preserved dims. + for (int i = 0; i < NumOutputDims - 1; ++i) { + const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? i + 1 : NumOutputDims - i - 2; + block_iter_state[i].input_dim = m_output_to_input_dim_map[dim]; + block_iter_state[i].output_size = output_block->block_sizes()[dim]; + block_iter_state[i].output_count = 0; + } + + // Allocate input block memory. + ScalarNonConst* input_block_data = static_cast<ScalarNonConst*>( + m_device.allocate(max_coeff_count * sizeof(Scalar))); + // Allocate reducer memory. + const bool packet_reductions_enabled = (Self::InputPacketAccess & + Op::PacketAccess); + const Index packet_size = internal::unpacket_traits<PacketReturnType>::size; + const Index num_reducers = + (inner_most_dim_preserved && packet_reductions_enabled) ? + (output_shard_size / packet_size + output_shard_size % packet_size + + packet_size) : output_shard_size; + typedef internal::BlockReducer<Self, Op> BlockReducer; + BlockReducer* reducers = static_cast<BlockReducer*>( + m_device.allocate(num_reducers * sizeof(BlockReducer))); + + InputDimensions input_tensor_dims(m_impl.dimensions()); + for (Index output_outer_index = 0; + output_outer_index < output_block_outer_dim_size; + ++output_outer_index) { + for (Index output_shard_index = 0; + output_shard_index < num_output_shards; + ++output_shard_index) { + // Initialize 'tensor_slice_extents' for this output shard. + DSizes<Index, NumInputDims> tensor_slice_extents(input_slice_sizes); + for (int i = 0; i < NumInputDims; ++i) { + if (i == first_preserved_dim_input_index) { + // Clip first preserved dim size to output shard size. + tensor_slice_extents[i] = numext::mini( + output_shard_size, + input_slice_sizes[i] - (tensor_slice_offsets[i] - + first_preserved_dim_offset_start)); + + } else if (!m_reduced_dim[i]) { + // Clip outer preserved dims to size 1, so that we reduce a + // contiguous set of output coefficients. + tensor_slice_extents[i] = 1; + } + } + + // Intialize output coefficient reducers. + for (int i = 0; i < num_reducers; ++i) { + new (&reducers[i]) BlockReducer(m_reducer); + } + + typedef internal::TensorSliceBlockMapper< + Index, ScalarNonConst, NumInputDims, Layout> TensorSliceBlockMapper; + + // TODO(andydavis) Consider removing 'input_block_stride_order' if we + // find that scattered reads are not worth supporting in + // TensorSliceBlockMapper. + TensorSliceBlockMapper block_mapper( + input_tensor_dims, tensor_slice_offsets, tensor_slice_extents, + target_input_block_sizes, DimensionList<Index, NumInputDims>()); + + const Index num_outputs_to_update = tensor_slice_extents[ + first_preserved_dim_input_index]; + const Index preserved_dim_vector_reducer_count = + (inner_most_dim_preserved && packet_reductions_enabled) ? + num_outputs_to_update / packet_size: 0; + const Index preserved_dim_vector_coeff_count = + inner_most_dim_preserved ? 
preserved_dim_vector_reducer_count * + packet_size : 0; + const Index preserved_dim_reducer_limit = + (inner_most_dim_preserved && packet_reductions_enabled) ? + (preserved_dim_vector_reducer_count + + num_outputs_to_update % packet_size) : num_outputs_to_update; + + const Index total_block_count = block_mapper.total_block_count(); + for (Index b = 0; b < total_block_count; ++b) { + InputTensorBlock input_block = block_mapper.GetBlockForIndex( + b, input_block_data); + // Read. + m_impl.block(&input_block); + + Index num_values_to_reduce = 1; + for (Index i = 0; i < NumInputDims; ++i) { + if (m_reduced_dim[i]) { + num_values_to_reduce *= input_block.block_sizes()[i]; + } + } + // Reduce. + if (inner_most_dim_preserved) { + const Index input_outer_dim_size = + input_block.block_sizes().TotalSize() / num_outputs_to_update; + for (Index input_outer_dim_index = 0; + input_outer_dim_index < input_outer_dim_size; + ++input_outer_dim_index) { + const Index input_outer_dim_base = input_outer_dim_index * + num_outputs_to_update; + for (Index i = 0; i < preserved_dim_vector_reducer_count; ++i) { + reducers[i].Reduce(input_outer_dim_base + i * packet_size, + packet_size, input_block.data()); + } + const Index scalar_reducer_base = input_outer_dim_base + + preserved_dim_vector_coeff_count; + for (Index i = preserved_dim_vector_reducer_count; + i < preserved_dim_reducer_limit; ++i) { + reducers[i].Reduce(scalar_reducer_base + i - + preserved_dim_vector_reducer_count, + 1, + input_block.data()); + } + } + } else { + for (Index i = 0; i < num_outputs_to_update; ++i) { + reducers[i].Reduce(i * num_values_to_reduce, + num_values_to_reduce, + input_block.data()); + } + } + } + + // Finalize all reducers for this output shard. + const Index output_base_index = + output_outer_index * output_block_inner_dim_size + + output_shard_index * output_shard_size; + if (inner_most_dim_preserved) { + EIGEN_ALIGN_DEFAULT CoeffReturnType values[packet_size]; + for (Index i = 0; i < preserved_dim_vector_reducer_count; ++i) { + const Index reducer_base = output_base_index + i * packet_size; + internal::pstore<CoeffReturnType, PacketReturnType>( + values, reducers[i].FinalizePacket()); + for (Index j = 0; j < packet_size; ++j) { + output_block->data()[reducer_base + j] = values[j]; + } + } + const Index scalar_reducer_base = output_base_index + + preserved_dim_vector_coeff_count; + + for (Index i = preserved_dim_vector_reducer_count; + i < preserved_dim_reducer_limit; ++i) { + output_block->data()[ + scalar_reducer_base + i - preserved_dim_vector_reducer_count] = + reducers[i].Finalize(); + } + } else { + for (int i = 0; i < num_outputs_to_update; ++i) { + output_block->data()[output_base_index + i] = + reducers[i].Finalize(); + } + } + + // Update 'tensor_slice_offsets' by num outputs for this output shard. + tensor_slice_offsets[first_preserved_dim_input_index] += + num_outputs_to_update; + } + // Update slice offset for inner preserved dim. + tensor_slice_offsets[first_preserved_dim_input_index] -= + output_block_inner_dim_size; + // Update slice offsets for remaining output dims. + for (int i = 0; i < NumOutputDims - 1; ++i) { + BlockIteratorState& b = block_iter_state[i]; + if (++b.output_count < b.output_size) { + ++tensor_slice_offsets[b.input_dim]; + break; + } + b.output_count = 0; + tensor_slice_offsets[b.input_dim] -= b.output_size - 1; + } + } + + // Free memory. 
+ m_device.deallocate(input_block_data); + m_device.deallocate(reducers); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + private: + template <int, typename, typename> friend struct internal::GenericDimReducer; + template <typename, typename, bool> friend struct internal::InnerMostDimReducer; + template <int, typename, typename, bool> friend struct internal::InnerMostDimPreserver; + template <typename S, typename O, typename D, bool V> friend struct internal::FullReducer; +#ifdef EIGEN_USE_THREADS + template <typename S, typename O, bool V> friend struct internal::FullReducerShard; +#endif +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) + template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*); +#endif + + struct BlockIteratorState { + Index input_dim; + Index output_size; + Index output_count; + }; + + // Returns the Index in the input tensor of the first value that needs to be + // used to compute the reduction at output index "index". + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { + if (ReducingInnerMostDims) { + return index * m_numValuesToReduce; + } + Index startInput = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumOutputDims - 1; i > 0; --i) { + // This is index_i in the output tensor. + const Index idx = index / m_fastOutputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + } else { + for (int i = 0; i < NumOutputDims - 1; ++i) { + // This is index_i in the output tensor. + const Index idx = index / m_fastOutputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + } + if (PreservingInnerMostDims) { + eigen_assert(m_numValuesToReduce == 1); + startInput += index; + } else { + startInput += index * m_numValuesToReduce; + } + return startInput; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void GetInputCoordsForOutputIndex( + Index index, + DSizes<Index, NumInputDims>* coords) const { + for (int i = 0; i < NumInputDims; ++i) { + (*coords)[i] = 0; + } + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumOutputDims - 1; i > 0; --i) { + const Index idx = index / m_fastOutputStrides[i]; + (*coords)[m_output_to_input_dim_map[i]] = idx; + index -= idx * m_outputStrides[i]; + } + (*coords)[m_output_to_input_dim_map[0]] = index; + } else { + for (int i = 0; i < NumOutputDims - 1; ++i) { + const Index idx = index / m_fastOutputStrides[i]; + (*coords)[m_output_to_input_dim_map[i]] = idx; + index -= idx * m_outputStrides[i]; + } + (*coords)[m_output_to_input_dim_map[NumOutputDims-1]] = index; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void CalculateTargetInputBlockShape( + const Index max_coeff_count, + const DSizes<Index, NumInputDims>& input_slice_sizes, + DSizes<Index, NumInputDims>* target_input_block_sizes) const { + typedef typename internal::packet_traits<Scalar>::type Packet; + const Index packet_size = internal::unpacket_traits<Packet>::size; + typedef internal::BlockReducer<Self, Op> BlockReducer; + // TODO(andydavis) Compute reducer overhead correctly for the case where + // we are preserving the inner most dimension, and a single reducer + // reduces a packet's worth of output coefficients. 
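+    // Approximate cost of one BlockReducer expressed in units of Scalar, used
+    // below to trade reducer storage against input coefficients when sizing
+    // blocks to fit the L1 working set.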
+ const Index reducer_overhead = sizeof(BlockReducer) / sizeof(Scalar); + + Index coeff_to_allocate = max_coeff_count; + bool first_preserved_dim_allocated = false; + bool first_reduced_dim_allocated = false; + for (int i = 0; i < NumInputDims; ++i) { + const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? i : NumInputDims - i - 1; + (*target_input_block_sizes)[dim] = 1; + if (m_reduced_dim[dim]) { + // TODO(andydavis) Consider allocating to multiple reduced dimensions. + // Watch out for cases where reduced dimensions are not contiguous, + // which induces scattered reads. + if (!first_reduced_dim_allocated) { + (*target_input_block_sizes)[dim] = numext::mini(input_slice_sizes[dim], + coeff_to_allocate); + coeff_to_allocate /= (*target_input_block_sizes)[dim]; + first_reduced_dim_allocated = true; + } + } else if (!first_preserved_dim_allocated) { + // TODO(andydavis) Include output block size in this L1 working set + // calculation. + const Index allocated = max_coeff_count - coeff_to_allocate; + const Index alloc_size = numext::maxi(static_cast<Index>(1), + coeff_to_allocate / + reducer_overhead); + (*target_input_block_sizes)[dim] = numext::mini(input_slice_sizes[dim], + alloc_size); + coeff_to_allocate = numext::maxi( + static_cast<Index>(1), + coeff_to_allocate / ((*target_input_block_sizes)[dim] * + reducer_overhead)); + first_preserved_dim_allocated = true; + } + } + } + + // Bitmap indicating if an input dimension is reduced or not. + array<bool, NumInputDims> m_reduced_dim; + // Dimensions of the output of the operation. + Dimensions m_dimensions; + // Precomputed strides for the input tensor. + array<Index, NumInputDims> m_inputStrides; + // Precomputed strides for the output tensor. + array<Index, NumOutputDims> m_outputStrides; + array<internal::TensorIntDivisor<Index>, NumOutputDims> m_fastOutputStrides; + // Subset of strides of the input tensor for the non-reduced dimensions. + // Indexed by output dimensions. + array<Index, NumOutputDims> m_preservedStrides; + // Map from output to input dimension index. + array<Index, NumOutputDims> m_output_to_input_dim_map; + // How many values go into each reduction + Index m_numValuesToReduce; + + // Subset of strides of the input tensor for the reduced dimensions. + // Indexed by reduced dimensions. + array<Index, NumReducedDims> m_reducedStrides; + // Size of the input dimensions that are reduced. + // Indexed by reduced dimensions. + array<Index, NumReducedDims> m_reducedDims; + + // Evaluator for the input expression. + TensorEvaluator<ArgType, Device> m_impl; + + // Operation to apply for computing the reduction. + Op m_reducer; + + // For full reductions +#ifdef EIGEN_USE_GPU + static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value; +#else + static const bool RunningOnGPU = false; +#endif + CoeffReturnType* m_result; + std::size_t m_block_total_size_max; + + const Device& m_device; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h new file mode 100644 index 0000000000..d052dcdf69 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -0,0 +1,642 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2015 Manjunath Kudlur <keveman@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H +#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H + +#if defined(EIGEN_USE_GPU) + +namespace Eigen { +namespace internal { + +template <typename OutExpr, typename InExpr, typename Op, typename Indices, + bool Tileable> +class TensorExecutor< + const TensorAssignOp< + OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>, + GpuDevice, false, Tileable> { + public: + typedef const TensorAssignOp< + OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const> + Expression; + static void run(const Expression& expr, const GpuDevice& device); +}; + +template <typename OutExpr, typename InExpr, typename Op, typename Indices, + bool Tileable> +class TensorExecutor< + const TensorAssignOp< + OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>, + GpuDevice, true, Tileable> { + public: + typedef const TensorAssignOp< + OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const> + Expression; + static void run(const Expression& expr, const GpuDevice& device); +}; + +template <typename InExpr, typename Op, typename Indices, bool Tileable> +class TensorExecutor<const TensorEvalToOp<const TensorReductionOp< + Op, const Indices, const InExpr> >, + GpuDevice, false, Tileable> { + public: + typedef const TensorEvalToOp< + const TensorReductionOp<Op, const Indices, const InExpr> > Expression; + static void run(const Expression& expr, const GpuDevice& device); +}; + +template <typename InExpr, typename Op, typename Indices, bool Tileable> +class TensorExecutor<const TensorEvalToOp<const TensorReductionOp< + Op, const Indices, const InExpr> >, + GpuDevice, true, Tileable> { + public: + typedef const TensorEvalToOp< + const TensorReductionOp<Op, const Indices, const InExpr> > Expression; + static void run(const Expression& expr, const GpuDevice& device); +}; + +} // end namespace internal +} // end namespace Eigen + +#if defined(__CUDACC__) + +namespace Eigen { + +namespace internal { + +namespace { + +#define DIVUP(x, y) (((x) + (y)-1) / (y)) + +// Initialize output[0..size-1] with val +template <typename Output> +__global__ void InitVector(const float val, int size, Output output) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + for (int i = idx; i < size; i += gridDim.x * blockDim.x) { + output.coeffRef(i) = val; + } +} + +// ----------------------------------------------------------------------------- +// Column Reduction kernels +// ----------------------------------------------------------------------------- +template <int GRID_DIM, int BLOCK_DIM, int NUM_PER_THREAD, typename Input, + typename Output, typename Reducer> +__global__ void ColumnReduceKernel(Reducer reducer, const Input input, int rows, + int cols, Output output) { + assert(blockDim.x == BLOCK_DIM); + assert(blockDim.y == 1); + assert(blockDim.z == 1); + + assert(gridDim.x == GRID_DIM); + assert(gridDim.y == 1); + assert(gridDim.z == 1); + + typedef typename Input::Index Index; + + const Index num_input_points = DIVUP(rows, NUM_PER_THREAD) * cols; + const int bx = blockIdx.x; + const int tx = threadIdx.x; + + for (Index i = bx * BLOCK_DIM + tx; i < num_input_points; + i += BLOCK_DIM * GRID_DIM) { + const Index input_col = i % cols; + const Index input_row_begin = + ((i / 
cols) % DIVUP(rows, NUM_PER_THREAD)) * NUM_PER_THREAD; + float reduced_val = reducer.bottom_value(); + for (int j = 0; j < NUM_PER_THREAD; ++j) { + float val = ((input_col < cols) && (input_row_begin + j < rows)) + ? input.coeff((input_row_begin + j) * cols + input_col) + : reducer.bottom_value(); + reduced_val = reducer(reduced_val, val); + } +#if __CUDA_ARCH__ >= 300 + reducer.atomic_reduce(&output.coeffRef(input_col), reduced_val); +#endif + } +} + +// ----------------------------------------------------------------------------- +// Row Reduction kernels +// ----------------------------------------------------------------------------- +template <int GRID_DIM, int BLOCK_DIM, int NUM_PER_THREAD, typename Input, + typename Output, typename Reducer> +__global__ void RowReduceKernel(Reducer reducer, const Input input, int rows, + int cols, Output output) { + assert(BLOCK_DIM % 32 == 0); + assert(blockDim.x == BLOCK_DIM); + assert(blockDim.y == 1); + assert(blockDim.z == 1); + + assert(gridDim.x == GRID_DIM); + assert(gridDim.y == 1); + assert(gridDim.z == 1); + + const int unroll_times = 16; + assert(NUM_PER_THREAD % unroll_times == 0); + + typedef typename Input::Index Index; + + __shared__ float temp[BLOCK_DIM]; + + const Index input_col_blocks = DIVUP(cols, BLOCK_DIM * NUM_PER_THREAD); + const Index num_input_blocks = input_col_blocks * rows; + + const int bx = blockIdx.x; + const int tx = threadIdx.x; + + for (Index i = bx; i < num_input_blocks; i += GRID_DIM) { + const Index col_block = i % input_col_blocks; + const Index row_block = i / input_col_blocks; + const Index col_begin = col_block * BLOCK_DIM * NUM_PER_THREAD + tx; + const Index row = row_block; + float reduced_val = reducer.bottom_value(); + if (row < rows) { + for (Index j = 0; j < NUM_PER_THREAD; j += unroll_times) { + const Index last_col = col_begin + BLOCK_DIM * (j + unroll_times - 1); + if (last_col >= cols) { + // We can skip the last iteration of the loop since we know + // that col >= cols there. +#pragma unroll + for (int k = 0; k < unroll_times - 1; ++k) { + const Index col = col_begin + BLOCK_DIM * (j + k); + const float val = (col < cols ? input.coeff(row * cols + col) + : reducer.bottom_value()); + reduced_val = reducer(reduced_val, val); + } + break; // col < cols for all later iterations. + } else { + // Faster version of the loop with no branches after unrolling. 
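+          // Here last_col < cols, so every column touched by this unrolled
+          // run is in range and no per-element bounds check is needed.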
+#pragma unroll + for (int k = 0; k < unroll_times; ++k) { + const Index col = col_begin + BLOCK_DIM * (j + k); + reduced_val = reducer(reduced_val, input.coeff(row * cols + col)); + } + } + } + } + temp[tx] = reduced_val; + + __syncthreads(); + const int warp_id = tx & 31; + if (warp_id < 16) temp[tx] = reducer(temp[tx], temp[tx + 16]); + if (warp_id < 8) temp[tx] = reducer(temp[tx], temp[tx + 8]); + if (warp_id < 4) temp[tx] = reducer(temp[tx], temp[tx + 4]); + if (warp_id < 2) temp[tx] = reducer(temp[tx], temp[tx + 2]); + if (warp_id < 1) temp[tx] = reducer(temp[tx], temp[tx + 1]); + + if (warp_id == 0) { + if (row < rows) { +#if __CUDA_ARCH__ >= 300 + reducer.atomic_reduce(&output.coeffRef(row), temp[tx]); +#endif + } + } + + __syncthreads(); + } +} + +template <typename Input, typename Output, typename Reducer> +void ColumnReduceCuda(Reducer reducer, const GpuDevice& device, + const Input input, int rows, int cols, Output output) { + const int block_size = 256; + const int grid_size = 128; + const int num_per_thread = 16; + LAUNCH_CUDA_KERNEL(InitVector, 32, 1024, 0, device, reducer.bottom_value(), + cols, output); + LAUNCH_CUDA_KERNEL( + (ColumnReduceKernel<grid_size, block_size, num_per_thread>), grid_size, + block_size, 0, device, reducer, input, rows, cols, output); +} + +template <typename Input, typename Output, typename Reducer> +void RowReduceCuda(Reducer reducer, const GpuDevice& device, const Input input, + int rows, int cols, Output output) { + const int block_size = 256; + const int grid_size = 32; + const int num_per_thread = 128; + LAUNCH_CUDA_KERNEL(InitVector, 32, 1024, 0, device, reducer.bottom_value(), + rows, output); + LAUNCH_CUDA_KERNEL((RowReduceKernel<grid_size, block_size, num_per_thread>), + grid_size, block_size, 0, device, reducer, input, rows, + cols, output); +} + +// Provides arbitrary sum reductions, applying a function across the +// right argument being reduced prior to summing +template <typename F> +struct FnSumReducer { + __host__ __device__ FnSumReducer(F f) : f_(f) {} + __host__ __device__ float bottom_value() { return 0.0f; } + __device__ float operator()(float x, float y) const { return x + f_(y); } + __device__ void atomic_reduce(float* x, float y) const { atomicAdd(x, y); } + + F f_; +}; + +// Identity is used for the basic SumReduction +struct Identity { + __device__ float operator()(float x) const { return x; } +}; + +struct CudaSumReducer : FnSumReducer<Identity> { + __host__ __device__ CudaSumReducer() : FnSumReducer(Identity()) {} +}; + +struct CudaMaxReducer { + // nvcc doesn't recognize numeric_limits<float>::lowest for some reason. + CudaMaxReducer() { + bottom_value_ = -3.40282347E+38F; // std::numeric_limits<float>::lowest(); + } + __host__ __device__ float bottom_value() { return bottom_value_; } + __device__ float operator()(float x, float y) const { return fmax(x, y); } + + // This is equivalent to atomicMax(x, y), but CUDA does not have atomicMax for + // float data type. Instead, this atomically compares-and-swaps the old value + // at x with y. If the old value returned by the CAS operation was already + // larger than y, or what was read before, it declares success and finishes, + // otherwise repeats the procedure. 
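+  // In pseudo-code, roughly:  do { old = *x; } while (old < y && !CAS(x, old, y));
+  // The float is reinterpreted as an unsigned int because atomicCAS has no
+  // float overload.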
+ __device__ void atomic_reduce(float* x, float y) { + unsigned int old_val = *reinterpret_cast<unsigned int*>(x); + while (*reinterpret_cast<float*>(&old_val) < y) { + unsigned int current_val = + atomicCAS(reinterpret_cast<unsigned int*>(x), old_val, + *reinterpret_cast<unsigned int*>(&y)); + if (old_val == current_val) { + break; + } + old_val = current_val; + } + } + float bottom_value_; +}; + +} // end namespace + +template <typename Op> +struct IsFloatSumReduction { + static const bool value = false; +}; + +template <> +struct IsFloatSumReduction<SumReducer<float> > { + static const bool value = true; +}; + +template <typename Op> +struct IsFloatMaxReduction { + static const bool value = false; +}; + +template <> +struct IsFloatMaxReduction<MaxReducer<float> > { + static const bool value = true; +}; + +template <typename Op> +struct SumOrMaxOfFloat { + static const bool value = + IsFloatSumReduction<Op>::value || IsFloatMaxReduction<Op>::value; +}; + +enum ReductionType { ROW_REDUCE, COL_REDUCE, UNOPTIMIZED }; + +template <typename Op, typename Expr, typename ReductionExpr> +ReductionType GetReductionType(const Expr& expr, + const ReductionExpr& reduction_expr, + const GpuDevice& device, std::size_t* rows, + std::size_t* cols) { + typedef TensorEvaluator<const Expr, GpuDevice> EvalExpr; + typedef TensorEvaluator<const ReductionExpr, GpuDevice> ReductionEvalExpr; + + if (device.majorDeviceVersion() < 3) { + return UNOPTIMIZED; + } + const EvalExpr eval_expr(expr, device); + + // We only have fast reductions for sum/max of float. + if (!SumOrMaxOfFloat<Op>::value) { + return UNOPTIMIZED; + } + + // For sum/max of float, if we are doing a full reduction, we can + // use the ROW_REDUCE optimization. + if (ReductionEvalExpr::NumReducedDims == ReductionEvalExpr::NumInputDims) { + *rows = 1; + *cols = array_prod(eval_expr.dimensions()); + return ROW_REDUCE; + } + + if (ReductionEvalExpr::NumReducedDims > 1) { + return UNOPTIMIZED; + } + + const int dim = reduction_expr.dims()[0]; + if (static_cast<int>(ReductionEvalExpr::Layout) == + static_cast<int>(RowMajor)) { + if (dim == ReductionEvalExpr::NumInputDims - 1) { + *rows = array_prod(eval_expr.dimensions()) / + eval_expr.dimensions()[ReductionEvalExpr::NumInputDims - 1]; + *cols = eval_expr.dimensions()[ReductionEvalExpr::NumInputDims - 1]; + if (*cols < 32) return UNOPTIMIZED; + return ROW_REDUCE; + } else if (dim == 0) { + *rows = eval_expr.dimensions()[0]; + *cols = array_prod(eval_expr.dimensions()) / eval_expr.dimensions()[0]; + if (*rows < 32) return UNOPTIMIZED; + return COL_REDUCE; + } + } else if (static_cast<int>(ReductionEvalExpr::Layout) == + static_cast<int>(ColMajor)) { + if (dim == ReductionEvalExpr::NumInputDims - 1) { + *rows = eval_expr.dimensions()[ReductionEvalExpr::NumInputDims - 1]; + *cols = array_prod(eval_expr.dimensions()) / + eval_expr.dimensions()[ReductionEvalExpr::NumInputDims - 1]; + if (*rows < 32) return UNOPTIMIZED; + return COL_REDUCE; + } else if (dim == 0) { + *rows = array_prod(eval_expr.dimensions()) / eval_expr.dimensions()[0]; + *cols = eval_expr.dimensions()[0]; + if (*cols < 32) return UNOPTIMIZED; + return ROW_REDUCE; + } + } + return UNOPTIMIZED; +} + +template <typename Expression, typename Index, bool Vectorizable> +struct LaunchKernel; + +template <typename Expression, typename Index> +struct LaunchKernel<Expression, Index, true> { + static void launch(int num_blocks, int block_size, const GpuDevice& device, + const TensorEvaluator<Expression, GpuDevice>& evaluator, + Index size) { + 
LAUNCH_CUDA_KERNEL( + (EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, + Index>), + num_blocks, block_size, 0, device, evaluator, size); + } +}; + +template <typename Expression, typename Index> +struct LaunchKernel<Expression, Index, false> { + static void launch(int num_blocks, int block_size, const GpuDevice& device, + const TensorEvaluator<Expression, GpuDevice>& evaluator, + Index size) { + LAUNCH_CUDA_KERNEL( + (EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, + Index>), + num_blocks, block_size, 0, device, evaluator, size); + } +}; + +template <typename F, typename LHS, typename RHS, bool Compatible> +struct LaunchRowReduce; + +template <typename F, typename LHS, typename RHS> +struct LaunchRowReduce<F, LHS, RHS, true> { + static void launch(const GpuDevice& device, RHS input, std::size_t rows, + std::size_t cols, LHS output) { + RowReduceCuda(F(), device, input, rows, cols, output); + } +}; + +template <typename F, typename LHS, typename RHS> +struct LaunchRowReduce<F, LHS, RHS, false> { + static void launch(const GpuDevice& device, RHS input, std::size_t rows, + std::size_t cols, LHS output) {} +}; + +template <typename F, typename LHS, typename RHS, bool Compatible> +struct LaunchColReduce; + +template <typename F, typename LHS, typename RHS> +struct LaunchColReduce<F, LHS, RHS, true> { + static void launch(const GpuDevice& device, RHS input, std::size_t rows, + std::size_t cols, LHS output) { + ColumnReduceCuda(F(), device, input, rows, cols, output); + } +}; + +template <typename F, typename LHS, typename RHS> +struct LaunchColReduce<F, LHS, RHS, false> { + static void launch(const GpuDevice& device, RHS input, std::size_t rows, + std::size_t cols, LHS output) {} +}; + +template <typename Expression, typename Device, bool Vectorizable> +class TensorAssignExecutorHelper; + +template <typename OutExpr, typename InExpr, typename Op, typename Indices, + bool Vectorizable> +class TensorAssignExecutorHelper< + const TensorAssignOp< + OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>, + GpuDevice, Vectorizable> { + public: + typedef const TensorAssignOp< + OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const> + Expression; + + typedef typename Expression::Index Index; + typedef TensorEvaluator<OutExpr, GpuDevice> LHSEval; + typedef TensorEvaluator<const InExpr, GpuDevice> RHSEval; + static inline void run(const Expression& expr, const GpuDevice& device) { + std::size_t rows, cols; + const ReductionType reduction_type = + GetReductionType<Op>(expr.rhsExpression().expression(), + expr.rhsExpression(), device, &rows, &cols); + if (reduction_type == UNOPTIMIZED) { + TensorEvaluator<Expression, GpuDevice> evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + const int num_blocks = device.getNumCudaMultiProcessors() * + device.maxCudaThreadsPerMultiProcessor() / + device.maxCudaThreadsPerBlock(); + const int block_size = device.maxCudaThreadsPerBlock(); + const Index size = array_prod(evaluator.dimensions()); + LaunchKernel<Expression, Index, Vectorizable>::launch( + num_blocks, block_size, device, evaluator, size); + } + evaluator.cleanup(); + } else { + LHSEval output(expr.lhsExpression(), device); + RHSEval input(expr.rhsExpression().expression(), device); + bool lhs_needs_assign = output.evalSubExprsIfNeeded(NULL); + bool rhs_needs_assign = input.evalSubExprsIfNeeded(NULL); + if (lhs_needs_assign && rhs_needs_assign) { + const bool Compatible = + 
IsFloatSumReduction<Op>::value || IsFloatMaxReduction<Op>::value; + if (reduction_type == ROW_REDUCE) { + if (IsFloatSumReduction<Op>::value) { + LaunchRowReduce<CudaSumReducer, LHSEval, RHSEval, + Compatible>::launch(device, input, rows, cols, + output); + } else if (IsFloatMaxReduction<Op>::value) { + LaunchRowReduce<CudaMaxReducer, LHSEval, RHSEval, + Compatible>::launch(device, input, rows, cols, + output); + } else { + // Unsupported reduction type + assert(false && "Unsupported reduction function for ROW_REDUCE"); + } + } else { + if (IsFloatSumReduction<Op>::value) { + LaunchColReduce<CudaSumReducer, LHSEval, RHSEval, + Compatible>::launch(device, input, rows, cols, + output); + } else if (IsFloatMaxReduction<Op>::value) { + LaunchColReduce<CudaMaxReducer, LHSEval, RHSEval, + Compatible>::launch(device, input, rows, cols, + output); + } else { + // Unsupported reduction type + assert(false && "Unsupported reduction function for COL_REDUCE"); + } + } + } + input.cleanup(); + output.cleanup(); + } + } +}; + +template <typename OutExpr, typename InExpr, typename Op, typename Indices, + bool Tileable> +inline void TensorExecutor< + const TensorAssignOp< + OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>, + GpuDevice, false, Tileable>::run(const Expression& expr, + const GpuDevice& device) { + TensorAssignExecutorHelper< + const TensorAssignOp< + OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>, + GpuDevice, false>::run(expr, device); +} + +template <typename OutExpr, typename InExpr, typename Op, typename Indices, + bool Tileable> +inline void TensorExecutor< + const TensorAssignOp< + OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>, + GpuDevice, true, Tileable>::run(const Expression& expr, + const GpuDevice& device) { + TensorAssignExecutorHelper< + const TensorAssignOp< + OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>, + GpuDevice, true>::run(expr, device); +} + +template <typename T, typename Index> +struct PtrWrapper { + EIGEN_DEVICE_FUNC PtrWrapper(T* ptr) : m_ptr(ptr) {} + EIGEN_DEVICE_FUNC T& coeffRef(Index i) { return *(m_ptr + i); } + T* m_ptr; +}; + +template <typename Expression, typename Device, bool Vectorizable> +class TensorEvalToExecutorHelper; + +template <typename InExpr, typename Op, typename Indices, bool Vectorizable> +class TensorEvalToExecutorHelper<const TensorEvalToOp<const TensorReductionOp< + Op, const Indices, const InExpr> >, + GpuDevice, Vectorizable> { + public: + typedef const TensorEvalToOp<const TensorReductionOp< + Op, const Indices, const InExpr> > Expression; + typedef typename Expression::Index Index; + typedef TensorEvaluator<const InExpr, GpuDevice> RHSEval; + + static inline void run(const Expression& expr, const GpuDevice& device) { + std::size_t rows, cols; + const ReductionType reduction_type = + GetReductionType<Op>(expr.expression().expression(), expr.expression(), + device, &rows, &cols); + if (reduction_type == UNOPTIMIZED) { + TensorEvaluator<Expression, GpuDevice> evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + const int num_blocks = device.getNumCudaMultiProcessors() * + device.maxCudaThreadsPerMultiProcessor() / + device.maxCudaThreadsPerBlock(); + const int block_size = device.maxCudaThreadsPerBlock(); + const Index size = array_prod(evaluator.dimensions()); + LaunchKernel<Expression, Index, Vectorizable>::launch( + num_blocks, block_size, device, evaluator, size); + } + evaluator.cleanup(); + } 
else { + typedef typename internal::remove_const<typename Expression::Scalar>::type Scalar; + PtrWrapper<Scalar, Index> output(expr.buffer()); + TensorEvaluator<const InExpr, GpuDevice> input( + expr.expression().expression(), device); + typedef PtrWrapper<Scalar, Index> LHSEval; + typedef TensorEvaluator<const InExpr, GpuDevice> RHSEval; + bool rhs_needs_assign = input.evalSubExprsIfNeeded(NULL); + if (rhs_needs_assign) { + const bool Compatible = + IsFloatSumReduction<Op>::value || IsFloatMaxReduction<Op>::value; + if (reduction_type == ROW_REDUCE) { + if (IsFloatSumReduction<Op>::value) { + LaunchRowReduce<CudaSumReducer, LHSEval, RHSEval, + Compatible>::launch(device, input, rows, cols, + output); + } else if (IsFloatMaxReduction<Op>::value) { + LaunchRowReduce<CudaMaxReducer, LHSEval, RHSEval, + Compatible>::launch(device, input, rows, cols, + output); + } + } else { + if (IsFloatSumReduction<Op>::value) { + LaunchColReduce<CudaSumReducer, LHSEval, RHSEval, + Compatible>::launch(device, input, rows, cols, + output); + } else if (IsFloatMaxReduction<Op>::value) { + LaunchColReduce<CudaMaxReducer, LHSEval, RHSEval, + Compatible>::launch(device, input, rows, cols, + output); + } + } + } + input.cleanup(); + } + } +}; + +template <typename InExpr, typename Op, typename Indices, bool Tileable> +inline void +TensorExecutor<const TensorEvalToOp< + const TensorReductionOp<Op, const Indices, const InExpr> >, + GpuDevice, false, Tileable>::run(const Expression& expr, + const GpuDevice& device) { + TensorEvalToExecutorHelper<const TensorEvalToOp<const TensorReductionOp< + Op, const Indices, const InExpr> >, + GpuDevice, false>::run(expr, device); +} + +template <typename InExpr, typename Op, typename Indices, bool Tileable> +inline void +TensorExecutor<const TensorEvalToOp< + const TensorReductionOp<Op, const Indices, const InExpr> >, + GpuDevice, true, Tileable>::run(const Expression& expr, + const GpuDevice& device) { + TensorEvalToExecutorHelper<const TensorEvalToOp<const TensorReductionOp< + Op, const Indices, const InExpr> >, + GpuDevice, true>::run(expr, device); +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // __CUDACC__ +#endif // EIGEN_USE_GPU +#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h new file mode 100644 index 0000000000..fb8ba09dd3 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h @@ -0,0 +1,442 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REF_H +#define EIGEN_CXX11_TENSOR_TENSOR_REF_H + +namespace Eigen { + +namespace internal { + +template <typename Dimensions, typename Scalar> +class TensorLazyBaseEvaluator { + public: + TensorLazyBaseEvaluator() : m_refcount(0) { } + virtual ~TensorLazyBaseEvaluator() { } + + EIGEN_DEVICE_FUNC virtual const Dimensions& dimensions() const = 0; + EIGEN_DEVICE_FUNC virtual const Scalar* data() const = 0; + + EIGEN_DEVICE_FUNC virtual const Scalar coeff(DenseIndex index) const = 0; + EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex index) = 0; + + void incrRefCount() { ++m_refcount; } + void decrRefCount() { --m_refcount; } + int refCount() const { return m_refcount; } + + private: + // No copy, no assigment; + TensorLazyBaseEvaluator(const TensorLazyBaseEvaluator& other); + TensorLazyBaseEvaluator& operator = (const TensorLazyBaseEvaluator& other); + + int m_refcount; +}; + + +template <typename Dimensions, typename Expr, typename Device> +class TensorLazyEvaluatorReadOnly : public TensorLazyBaseEvaluator<Dimensions, typename TensorEvaluator<Expr, Device>::Scalar> { + public: + // typedef typename TensorEvaluator<Expr, Device>::Dimensions Dimensions; + typedef typename TensorEvaluator<Expr, Device>::Scalar Scalar; + + TensorLazyEvaluatorReadOnly(const Expr& expr, const Device& device) : m_impl(expr, device), m_dummy(Scalar(0)) { + m_dims = m_impl.dimensions(); + m_impl.evalSubExprsIfNeeded(NULL); + } + virtual ~TensorLazyEvaluatorReadOnly() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC virtual const Dimensions& dimensions() const { + return m_dims; + } + EIGEN_DEVICE_FUNC virtual const Scalar* data() const { + return m_impl.data(); + } + + EIGEN_DEVICE_FUNC virtual const Scalar coeff(DenseIndex index) const { + return m_impl.coeff(index); + } + EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex /*index*/) { + eigen_assert(false && "can't reference the coefficient of a rvalue"); + return m_dummy; + }; + + protected: + TensorEvaluator<Expr, Device> m_impl; + Dimensions m_dims; + Scalar m_dummy; +}; + +template <typename Dimensions, typename Expr, typename Device> +class TensorLazyEvaluatorWritable : public TensorLazyEvaluatorReadOnly<Dimensions, Expr, Device> { + public: + typedef TensorLazyEvaluatorReadOnly<Dimensions, Expr, Device> Base; + typedef typename Base::Scalar Scalar; + + TensorLazyEvaluatorWritable(const Expr& expr, const Device& device) : Base(expr, device) { + } + virtual ~TensorLazyEvaluatorWritable() { + } + + EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex index) { + return this->m_impl.coeffRef(index); + } +}; + +template <typename Dimensions, typename Expr, typename Device> +class TensorLazyEvaluator : public internal::conditional<bool(internal::is_lvalue<Expr>::value), + TensorLazyEvaluatorWritable<Dimensions, Expr, Device>, + TensorLazyEvaluatorReadOnly<Dimensions, const Expr, Device> >::type { + public: + typedef typename internal::conditional<bool(internal::is_lvalue<Expr>::value), + TensorLazyEvaluatorWritable<Dimensions, Expr, Device>, + TensorLazyEvaluatorReadOnly<Dimensions, const Expr, Device> >::type Base; + typedef typename Base::Scalar Scalar; + + TensorLazyEvaluator(const Expr& expr, const Device& device) : Base(expr, device) { + } + virtual ~TensorLazyEvaluator() { + } +}; + +} // namespace internal + + +/** \class TensorRef + * \ingroup CXX11_Tensor_Module + * + * \brief A reference to a tensor expression + * The expression will be evaluated lazily (as much as possible). 
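+ *
+ * A TensorRef provides a cheap, copyable handle to an expression and, where
+ * possible, computes coefficients on demand instead of materializing the full
+ * result. A minimal usage sketch (illustrative only, names chosen for the
+ * example):
+ * \code
+ * Eigen::Tensor<float, 3> input(3, 5, 7);
+ * input.setRandom();
+ * Eigen::TensorRef<Eigen::Tensor<float, 3> > ref = input * 2.0f;
+ * float x = ref(0, 1, 2);  // evaluates only what this coefficient requires
+ * \endcode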
+ * + */ +template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef<PlainObjectType> > +{ + public: + typedef TensorRef<PlainObjectType> Self; + typedef typename PlainObjectType::Base Base; + typedef typename Eigen::internal::nested<Self>::type Nested; + typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind; + typedef typename internal::traits<PlainObjectType>::Index Index; + typedef typename internal::traits<PlainObjectType>::Scalar Scalar; + typedef typename internal::packet_traits<Scalar>::type Packet; + typedef typename NumTraits<Scalar>::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + typedef Scalar* PointerType; + typedef PointerType PointerArgType; + + static const Index NumIndices = PlainObjectType::NumIndices; + typedef typename PlainObjectType::Dimensions Dimensions; + + enum { + IsAligned = false, + PacketAccess = false, + BlockAccess = false, + Layout = PlainObjectType::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_STRONG_INLINE TensorRef() : m_evaluator(NULL) { + } + + template <typename Expression> + EIGEN_STRONG_INLINE TensorRef(Expression& expr) : m_evaluator(new internal::TensorLazyEvaluator<Dimensions, Expression, DefaultDevice>(expr, DefaultDevice())) { + m_evaluator->incrRefCount(); + } + + template <typename Expression> + EIGEN_STRONG_INLINE TensorRef(const Expression& expr) : m_evaluator(new internal::TensorLazyEvaluator<Dimensions, const Expression, DefaultDevice>(expr, DefaultDevice())) { + m_evaluator->incrRefCount(); + } + + template <typename Expression> + EIGEN_STRONG_INLINE TensorRef& operator = (const Expression& expr) { + unrefEvaluator(); + m_evaluator = new internal::TensorLazyEvaluator<Dimensions, Expression, DefaultDevice>(expr, DefaultDevice()); + m_evaluator->incrRefCount(); + return *this; + } + + ~TensorRef() { + unrefEvaluator(); + } + + TensorRef(const TensorRef& other) : m_evaluator(other.m_evaluator) { + eigen_assert(m_evaluator->refCount() > 0); + m_evaluator->incrRefCount(); + } + + TensorRef(TensorRef& other) : m_evaluator(other.m_evaluator) { + eigen_assert(m_evaluator->refCount() > 0); + m_evaluator->incrRefCount(); + } + + TensorRef& operator = (const TensorRef& other) { + if (this != &other) { + unrefEvaluator(); + m_evaluator = other.m_evaluator; + eigen_assert(m_evaluator->refCount() > 0); + m_evaluator->incrRefCount(); + } + return *this; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index rank() const { return m_evaluator->dimensions().size(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_evaluator->dimensions()[n]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_evaluator->dimensions(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index size() const { return m_evaluator->dimensions().TotalSize(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar* data() const { return m_evaluator->data(); } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index index) const + { + return m_evaluator->coeff(index); + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index firstIndex, IndexTypes... otherIndices) const + { + const std::size_t NumIndices = (sizeof...(otherIndices) + 1); + const array<Index, NumIndices> indices{{firstIndex, otherIndices...}}; + return coeff(indices); + } + template<typename... 
IndexTypes> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices) + { + const std::size_t NumIndices = (sizeof...(otherIndices) + 1); + const array<Index, NumIndices> indices{{firstIndex, otherIndices...}}; + return coeffRef(indices); + } +#else + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1) const + { + array<Index, 2> indices; + indices[0] = i0; + indices[1] = i1; + return coeff(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2) const + { + array<Index, 3> indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + return coeff(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2, Index i3) const + { + array<Index, 4> indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + indices[3] = i3; + return coeff(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const + { + array<Index, 5> indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + indices[3] = i3; + indices[4] = i4; + return coeff(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1) + { + array<Index, 2> indices; + indices[0] = i0; + indices[1] = i1; + return coeffRef(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1, Index i2) + { + array<Index, 3> indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + return coeffRef(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) + { + array<Index, 4> indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + indices[3] = i3; + return coeffRef(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1, Index i2, Index i3, Index i4) + { + array<Index, 5> indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + indices[3] = i3; + indices[4] = i4; + return coeffRef(indices); + } +#endif + + template <std::size_t NumIndices> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar coeff(const array<Index, NumIndices>& indices) const + { + const Dimensions& dims = this->dimensions(); + Index index = 0; + if (PlainObjectType::Options & RowMajor) { + index += indices[0]; + for (int i = 1; i < NumIndices; ++i) { + index = index * dims[i] + indices[i]; + } + } else { + index += indices[NumIndices-1]; + for (int i = NumIndices-2; i >= 0; --i) { + index = index * dims[i] + indices[i]; + } + } + return m_evaluator->coeff(index); + } + template <std::size_t NumIndices> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices) + { + const Dimensions& dims = this->dimensions(); + Index index = 0; + if (PlainObjectType::Options & RowMajor) { + index += indices[0]; + for (int i = 1; i < NumIndices; ++i) { + index = index * dims[i] + indices[i]; + } + } else { + index += indices[NumIndices-1]; + for (int i = NumIndices-2; i >= 0; --i) { + index = index * dims[i] + indices[i]; + } + } + return m_evaluator->coeffRef(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar coeff(Index index) const + { + return m_evaluator->coeff(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) + { + return m_evaluator->coeffRef(index); + } + + private: + EIGEN_STRONG_INLINE void unrefEvaluator() { + if (m_evaluator) { + 
m_evaluator->decrRefCount(); + if (m_evaluator->refCount() == 0) { + delete m_evaluator; + } + } + } + + internal::TensorLazyBaseEvaluator<Dimensions, Scalar>* m_evaluator; +}; + + +// evaluator for rvalues +template<typename Derived, typename Device> +struct TensorEvaluator<const TensorRef<Derived>, Device> +{ + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Packet Packet; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename Derived::Packet PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + + enum { + IsAligned = false, + PacketAccess = false, + BlockAccess = false, + Layout = TensorRef<Derived>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef<Derived>& m, const Device&) + : m_ref(m) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_ref.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_ref.coeff(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + return m_ref.coeffRef(index); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return m_ref.data(); } + + protected: + TensorRef<Derived> m_ref; +}; + + +// evaluator for lvalues +template<typename Derived, typename Device> +struct TensorEvaluator<TensorRef<Derived>, Device> : public TensorEvaluator<const TensorRef<Derived>, Device> +{ + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Packet Packet; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename Derived::Packet PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + + typedef TensorEvaluator<const TensorRef<Derived>, Device> Base; + + enum { + IsAligned = false, + PacketAccess = false, + BlockAccess = false, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(TensorRef<Derived>& m, const Device& d) : Base(m, d) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + return this->m_ref.coeffRef(index); + } +}; + + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_REF_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h new file mode 100644 index 0000000000..44e147de3e --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -0,0 +1,278 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com> +// Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H +#define EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H +namespace Eigen { + +/** \class TensorReverse + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reverse elements class. 
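+ *
+ * The dimensions to flip are selected with an array of booleans, one entry per
+ * dimension. A minimal usage sketch (illustrative only):
+ * \code
+ * Eigen::Tensor<float, 2> t(2, 3);
+ * t.setRandom();
+ * Eigen::array<bool, 2> flip;
+ * flip[0] = true;   // reverse along dimension 0
+ * flip[1] = false;  // keep dimension 1 as is
+ * Eigen::Tensor<float, 2> r = t.reverse(flip);
+ * \endcode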
+ * + */ +namespace internal { +template<typename ReverseDimensions, typename XprType> +struct traits<TensorReverseOp<ReverseDimensions, + XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename ReverseDimensions, typename XprType> +struct eval<TensorReverseOp<ReverseDimensions, XprType>, Eigen::Dense> +{ + typedef const TensorReverseOp<ReverseDimensions, XprType>& type; +}; + +template<typename ReverseDimensions, typename XprType> +struct nested<TensorReverseOp<ReverseDimensions, XprType>, 1, + typename eval<TensorReverseOp<ReverseDimensions, XprType> >::type> +{ + typedef TensorReverseOp<ReverseDimensions, XprType> type; +}; + +} // end namespace internal + +template<typename ReverseDimensions, typename XprType> +class TensorReverseOp : public TensorBase<TensorReverseOp<ReverseDimensions, + XprType>, WriteAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorReverseOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorReverseOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorReverseOp>::type Nested; + typedef typename Eigen::internal::traits<TensorReverseOp>::StorageKind + StorageKind; + typedef typename Eigen::internal::traits<TensorReverseOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp( + const XprType& expr, const ReverseDimensions& reverse_dims) + : m_xpr(expr), m_reverse_dims(reverse_dims) {} + + EIGEN_DEVICE_FUNC + const ReverseDimensions& reverse() const { return m_reverse_dims; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorReverseOp& operator = (const TensorReverseOp& other) + { + typedef TensorAssignOp<TensorReverseOp, const TensorReverseOp> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run( + assign, DefaultDevice()); + return *this; + } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorReverseOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp<TensorReverseOp, const OtherDerived> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run( + assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + const ReverseDimensions m_reverse_dims; +}; + +// Eval as rvalue +template<typename ReverseDimensions, typename ArgType, typename Device> +struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device> +{ + typedef TensorReverseOp<ReverseDimensions, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<ReverseDimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, + 
Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, + const Device& device) + : m_impl(op.expression(), device), m_reverse(op.reverse()) + { + // Compute strides + m_dimensions = m_impl.dimensions(); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_strides[i] = m_strides[i-1] * m_dimensions[i-1]; + } + } else { + m_strides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_strides[i] = m_strides[i+1] * m_dimensions[i+1]; + } + } + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index reverseIndex( + Index index) const { + eigen_assert(index < dimensions().TotalSize()); + Index inputIndex = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + Index idx = index / m_strides[i]; + index -= idx * m_strides[i]; + if (m_reverse[i]) { + idx = m_dimensions[i] - idx - 1; + } + inputIndex += idx * m_strides[i] ; + } + if (m_reverse[0]) { + inputIndex += (m_dimensions[0] - index - 1); + } else { + inputIndex += index; + } + } else { + for (int i = 0; i < NumDims - 1; ++i) { + Index idx = index / m_strides[i]; + index -= idx * m_strides[i]; + if (m_reverse[i]) { + idx = m_dimensions[i] - idx - 1; + } + inputIndex += idx * m_strides[i] ; + } + if (m_reverse[NumDims-1]) { + inputIndex += (m_dimensions[NumDims-1] - index - 1); + } else { + inputIndex += index; + } + } + return inputIndex; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff( + Index index) const { + return m_impl.coeff(reverseIndex(index)); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + // TODO(ndjaitly): write a better packing routine that uses + // local structure. 
+ EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type + values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array<Index, NumDims> m_strides; + TensorEvaluator<ArgType, Device> m_impl; + ReverseDimensions m_reverse; +}; + +// Eval as lvalue + +template <typename ReverseDimensions, typename ArgType, typename Device> +struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device> + : public TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, + Device> { + typedef TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, + Device> Base; + typedef TensorReverseOp<ReverseDimensions, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<ReverseDimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, + const Device& device) + : Base(op, device) {} + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Dimensions& dimensions() const { return this->m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + return this->m_impl.coeffRef(Base::reverseIndex(index)); + } + + template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + // This code is pilfered from TensorMorphing.h + EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; + internal::pstore<CoeffReturnType, PacketReturnType>(values, x); + for (int i = 0; i < packetSize; ++i) { + this->coeffRef(index+i) = values[i]; + } + } + +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h new file mode 100644 index 0000000000..2e59a147bc --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -0,0 +1,412 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H +#define EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H + +namespace Eigen { + +/** \class TensorShuffling + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor shuffling class. 
+ * + * + */ +namespace internal { +template<typename Shuffle, typename XprType> +struct traits<TensorShufflingOp<Shuffle, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename Shuffle, typename XprType> +struct eval<TensorShufflingOp<Shuffle, XprType>, Eigen::Dense> +{ + typedef const TensorShufflingOp<Shuffle, XprType>& type; +}; + +template<typename Shuffle, typename XprType> +struct nested<TensorShufflingOp<Shuffle, XprType>, 1, typename eval<TensorShufflingOp<Shuffle, XprType> >::type> +{ + typedef TensorShufflingOp<Shuffle, XprType> type; +}; + +} // end namespace internal + + + +template<typename Shuffle, typename XprType> +class TensorShufflingOp : public TensorBase<TensorShufflingOp<Shuffle, XprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorShufflingOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorShufflingOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorShufflingOp>::type Nested; + typedef typename Eigen::internal::traits<TensorShufflingOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorShufflingOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shuffle) + : m_xpr(expr), m_shuffle(shuffle) {} + + EIGEN_DEVICE_FUNC + const Shuffle& shufflePermutation() const { return m_shuffle; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorShufflingOp& operator = (const TensorShufflingOp& other) + { + typedef TensorAssignOp<TensorShufflingOp, const TensorShufflingOp> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run( + assign, DefaultDevice()); + return *this; + } + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorShufflingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp<TensorShufflingOp, const OtherDerived> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run( + assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + const Shuffle m_shuffle; +}; + + +// Eval as rvalue +template<typename Shuffle, typename ArgType, typename Device> +struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> +{ + typedef TensorShufflingOp<Shuffle, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const<Scalar>::type ScalarNonConst; + + enum { + IsAligned = false, + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + BlockAccess = 
TensorEvaluator<ArgType, Device>::BlockAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + typedef typename internal::TensorBlock< + Index, typename internal::remove_const<Scalar>::type, NumDims, + TensorEvaluator<ArgType, Device>::Layout> TensorBlock; + typedef typename internal::TensorBlockReader< + Index, typename internal::remove_const<Scalar>::type, NumDims, + TensorEvaluator<ArgType, Device>::Layout, PacketAccess> TensorBlockReader; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_shuffle(op.shufflePermutation()), m_impl(op.expression(), device) + { + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] = input_dims[m_shuffle[i]]; + m_inverseShuffle[m_shuffle[i]] = i; + } + + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_unshuffledInputStrides[0] = 1; + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_unshuffledInputStrides[i] = + m_unshuffledInputStrides[i - 1] * input_dims[i - 1]; + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + } + } else { + m_unshuffledInputStrides[NumDims - 1] = 1; + m_outputStrides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_unshuffledInputStrides[i] = + m_unshuffledInputStrides[i + 1] * input_dims[i + 1]; + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + } + } + + for (int i = 0; i < NumDims; ++i) { + m_inputStrides[i] = m_unshuffledInputStrides[m_shuffle[i]]; + } + + m_block_total_size_max = numext::maxi(static_cast<std::size_t>(1), + device.firstLevelCacheSize() / + sizeof(Scalar)); + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(srcCoeff(index)); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector<internal::TensorOpResourceRequirements>* resources) const { + resources->push_back(internal::TensorOpResourceRequirements( + internal::kUniformAllDims, m_block_total_size_max)); + m_impl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + TensorBlock* output_block) const { + if (m_impl.data() != NULL) { + // Fast path: we have direct access to the data, so shuffle as we read. 
+ TensorBlockReader::Run(output_block, + srcCoeff(output_block->first_coeff_index()), + m_inverseShuffle, + m_unshuffledInputStrides, + m_impl.data()); + return; + } + + // Slow path: read unshuffled block from the input and shuffle in-place. + // Initialize input block sizes using input-to-output shuffle map. + DSizes<Index, NumDims> input_block_sizes; + for (Index i = 0; i < NumDims; ++i) { + input_block_sizes[i] = output_block->block_sizes()[m_inverseShuffle[i]]; + } + + // Calculate input block strides. + DSizes<Index, NumDims> input_block_strides; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + input_block_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + input_block_strides[i] = input_block_strides[i - 1] * + input_block_sizes[i - 1]; + } + } else { + input_block_strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + input_block_strides[i] = input_block_strides[i + 1] * + input_block_sizes[i + 1]; + } + } + + // Read input block. + TensorBlock input_block(srcCoeff(output_block->first_coeff_index()), + input_block_sizes, + input_block_strides, + m_unshuffledInputStrides, + output_block->data()); + + m_impl.block(&input_block); + + // Naive In-place shuffle: random IO but block size is O(L1 cache size). + // TODO(andydavis) Improve the performance of this in-place shuffle. + const Index total_size = input_block_sizes.TotalSize(); + std::vector<bool> bitmap(total_size, false); + ScalarNonConst* data = const_cast<ScalarNonConst*>(output_block->data()); + const DSizes<Index, NumDims>& output_block_strides = + output_block->block_strides(); + for (Index input_index = 0; input_index < total_size; ++input_index) { + if (bitmap[input_index]) { + // Coefficient at this index has already been shuffled. + continue; + } + + Index output_index = GetBlockOutputIndex(input_index, + input_block_strides, + output_block_strides); + if (output_index == input_index) { + // Coefficient already in place. + bitmap[output_index] = true; + continue; + } + + // The following loop starts at 'input_index', and shuffles + // coefficients into their shuffled location at 'output_index'. + // It skips through the array shuffling coefficients by following + // the shuffle cycle starting and ending a 'start_index'. 
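+      // For example, if the block permutation forms the cycle 0 -> 2 -> 5 -> 0,
+      // the loop below places data[0] at offset 2, the evicted data[2] at
+      // offset 5, and finally the evicted data[5] back at offset 0, marking
+      // each visited offset in 'bitmap' so the cycle is traversed only once.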
+ ScalarNonConst evicted_value; + ScalarNonConst shuffled_value = data[input_index]; + do { + evicted_value = data[output_index]; + data[output_index] = shuffled_value; + shuffled_value = evicted_value; + bitmap[output_index] = true; + output_index = GetBlockOutputIndex(output_index, + input_block_strides, + output_block_strides); + } while (output_index != input_index); + + data[output_index] = shuffled_value; + bitmap[output_index] = true; + } + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex( + Index input_index, + const DSizes<Index, NumDims>& input_block_strides, + const DSizes<Index, NumDims>& output_block_strides) const { + Index output_index = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = input_index / input_block_strides[i]; + output_index += idx * output_block_strides[m_inverseShuffle[i]]; + input_index -= idx * input_block_strides[i]; + } + return output_index + input_index * + output_block_strides[m_inverseShuffle[0]]; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = input_index / input_block_strides[i]; + output_index += idx * output_block_strides[m_inverseShuffle[i]]; + input_index -= idx * input_block_strides[i]; + } + return output_index + input_index * + output_block_strides[m_inverseShuffle[NumDims - 1]]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { + Index inputIndex = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + return inputIndex + index * m_inputStrides[0]; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + return inputIndex + index * m_inputStrides[NumDims - 1]; + } + } + + const Shuffle& m_shuffle; + Dimensions m_dimensions; + array<Index, NumDims> m_inverseShuffle; + array<Index, NumDims> m_outputStrides; + array<Index, NumDims> m_inputStrides; + array<Index, NumDims> m_unshuffledInputStrides; + TensorEvaluator<ArgType, Device> m_impl; + std::size_t m_block_total_size_max; +}; + + +// Eval as lvalue +template<typename Shuffle, typename ArgType, typename Device> +struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device> + : public TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> +{ + typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Base; + + typedef TensorShufflingOp<Shuffle, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + + enum { + IsAligned = false, + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + }; + + typedef typename internal::TensorBlock< + Index, typename internal::remove_const<Scalar>::type, NumDims, + TensorEvaluator<ArgType, Device>::Layout> TensorBlock; + typedef typename internal::TensorBlockWriter< + Index, typename internal::remove_const<Scalar>::type, NumDims, + TensorEvaluator<ArgType, 
Device>::Layout, PacketAccess> TensorBlockWriter; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + template <int StoreMode> EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + static const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + internal::pstore<CoeffReturnType, PacketReturnType>(values, x); + for (int i = 0; i < packetSize; ++i) { + this->coeffRef(index+i) = values[i]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlock& block) { + eigen_assert(this->m_impl.data() != NULL); + TensorBlockWriter::Run(block, this->srcCoeff(block.first_coeff_index()), + this->m_inverseShuffle, + this->m_unshuffledInputStrides, this->m_impl.data()); + } +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h new file mode 100644 index 0000000000..cfde4fdc72 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -0,0 +1,247 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler <christian@iwakd.de> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSORSTORAGE_H +#define EIGEN_CXX11_TENSOR_TENSORSTORAGE_H + +#ifdef EIGEN_TENSOR_STORAGE_CTOR_PLUGIN + #define EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN EIGEN_TENSOR_STORAGE_CTOR_PLUGIN; +#else + #define EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN +#endif + +namespace Eigen { + +/** \internal + * + * \class TensorStorage + * \ingroup CXX11_Tensor_Module + * + * \brief Stores the data of a tensor + * + * This class stores the data of fixed-size, dynamic-size or mixed tensors + * in a way as compact as possible. 
+ * + * \sa Tensor + */ +template<typename T, typename Dimensions, int Options_> class TensorStorage; + + +// Pure fixed-size storage +template<typename T, int Options_, typename FixedDimensions> +class TensorStorage<T, FixedDimensions, Options_> +{ + private: + static const std::size_t Size = FixedDimensions::total_size; + + EIGEN_ALIGN_DEFAULT T m_data[Size]; + FixedDimensions m_dimensions; + + public: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorStorage() { + EIGEN_STATIC_ASSERT(Size == FixedDimensions::total_size, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T *data() { return m_data; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T *data() const { return m_data; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const FixedDimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE DenseIndex size() const { return m_dimensions.TotalSize(); } +}; + + +// pure dynamic +template<typename T, int Options_, typename IndexType, std::size_t NumIndices_> +class TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_> +{ + public: + typedef IndexType Index; + typedef DSizes<IndexType, NumIndices_> Dimensions; + typedef TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_> Self; + + EIGEN_DEVICE_FUNC TensorStorage() + : m_data(NumIndices_ ? 0 : internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(1)) + , m_dimensions() {} + + EIGEN_DEVICE_FUNC TensorStorage(internal::constructor_without_unaligned_array_assert) + : m_data(NumIndices_ ? 0 : internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(1)) + , m_dimensions(internal::template repeat<NumIndices_, Index>(0)) {} + + EIGEN_DEVICE_FUNC TensorStorage(Index size, const array<Index, NumIndices_>& dimensions) + : m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size)), m_dimensions(dimensions) + { EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN } + + EIGEN_DEVICE_FUNC TensorStorage(const Self& other) + : m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(internal::array_prod(other.m_dimensions))) + , m_dimensions(other.m_dimensions) + { + internal::smart_copy(other.m_data, other.m_data+internal::array_prod(other.m_dimensions), m_data); + } + EIGEN_DEVICE_FUNC Self& operator=(const Self& other) + { + if (this != &other) { + Self tmp(other); + this->swap(tmp); + } + return *this; + } + + EIGEN_DEVICE_FUNC ~TensorStorage() { internal::conditional_aligned_delete_auto<T,(Options_&DontAlign)==0>(m_data, internal::array_prod(m_dimensions)); } + EIGEN_DEVICE_FUNC void swap(Self& other) + { numext::swap(m_data,other.m_data); numext::swap(m_dimensions,other.m_dimensions); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {return m_dimensions;} + + EIGEN_DEVICE_FUNC void resize(Index size, const array<Index, NumIndices_>& nbDimensions) + { + const Index currentSz = internal::array_prod(m_dimensions); + if(size != currentSz) + { + internal::conditional_aligned_delete_auto<T,(Options_&DontAlign)==0>(m_data, currentSz); + if (size) + m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size); + else + m_data = 0; + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + } + m_dimensions = nbDimensions; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T *data() { return m_data; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T *data() const { return m_data; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); } + + private: + T *m_data; + 
Dimensions m_dimensions; +}; + + +// pure dynamic +template<typename T, int Options_> +class TensorStorage<T, VSizes<DenseIndex>, Options_> +{ + T* m_data; + VSizes<DenseIndex> m_dimensions; + typedef TensorStorage<T, VSizes<DenseIndex>, Options_> Self_; + + public: + EIGEN_DEVICE_FUNC TensorStorage() : m_data(0), m_dimensions() {} + + template <DenseIndex NumDims> + EIGEN_DEVICE_FUNC TensorStorage(const array<DenseIndex, NumDims>& dimensions) + { + m_dimensions.resize(NumDims); + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] = dimensions[i]; + } + const DenseIndex size = array_prod(dimensions); + m_data = internal::conditional_managed_new_auto<T,(Options_&DontAlign)==0,(Options_&AllocateUVM)>(size); + EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN + } + + EIGEN_DEVICE_FUNC TensorStorage(const std::vector<DenseIndex>& dimensions) + : m_dimensions(dimensions) + { + const DenseIndex size = internal::array_prod(dimensions); + m_data = internal::conditional_managed_new_auto<T,(Options_&DontAlign)==0,(Options_&AllocateUVM)>(size); + EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> EIGEN_DEVICE_FUNC + TensorStorage(IndexTypes... dimensions) { + const int NumDims = sizeof...(dimensions); + m_dimensions.resize(NumDims); + const array<DenseIndex, NumDims> dim{{dimensions...}}; + DenseIndex size = 1; + for (int i = 0; i < NumDims; ++i) { + size *= dim[i]; + m_dimensions[i] = dim[i]; + } + m_data = internal::conditional_managed_new_auto<T,(Options_&DontAlign)==0,(Options_&AllocateUVM)>(size); + EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN + } +#endif + + EIGEN_DEVICE_FUNC TensorStorage(const Self_& other) + : m_data(internal::conditional_managed_new_auto<T,(Options_&DontAlign)==0,(Options_&AllocateUVM)>(internal::array_prod(other.m_dimensions))) + , m_dimensions(other.m_dimensions) + { + internal::smart_copy(other.m_data, other.m_data+internal::array_prod(other.m_dimensions), m_data); + } + + EIGEN_DEVICE_FUNC Self_& operator=(const Self_& other) + { + if (this != &other) { + Self_ tmp(other); + this->swap(tmp); + } + return *this; + } + + EIGEN_DEVICE_FUNC ~TensorStorage() + { + internal::conditional_managed_delete_auto<T,(Options_&DontAlign)==0,(Options_&AllocateUVM)>(m_data, internal::array_prod(m_dimensions)); + } + + EIGEN_DEVICE_FUNC void swap(Self_& other) + { std::swap(m_data,other.m_data); std::swap(m_dimensions,other.m_dimensions); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const VSizes<DenseIndex>& dimensions() const { return m_dimensions; } + + template <typename NewDimensions> EIGEN_DEVICE_FUNC + void resize(DenseIndex size, const NewDimensions& nbDimensions) + { + const DenseIndex currentSz = internal::array_prod(m_dimensions); + if(size != currentSz) + { + internal::conditional_managed_delete_auto<T,(Options_&DontAlign)==0,(Options_&AllocateUVM)>(m_data, currentSz); + if (size) + m_data = internal::conditional_managed_new_auto<T,(Options_&DontAlign)==0,(Options_&AllocateUVM)>(size); + else + m_data = 0; + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + } + m_dimensions.resize(internal::array_size<NewDimensions>::value); + for (int i = 0; i < internal::array_size<NewDimensions>::value; ++i) { + m_dimensions[i] = nbDimensions[i]; + } + } + EIGEN_DEVICE_FUNC void resize(DenseIndex size, const std::vector<DenseIndex>& nbDimensions) + { + const DenseIndex currentSz = internal::array_prod(m_dimensions); + if(size != currentSz) + { + 
internal::conditional_managed_delete_auto<T,(Options_&DontAlign)==0,(Options_&AllocateUVM)>(m_data, currentSz); + if (size) + m_data = internal::conditional_managed_new_auto<T,(Options_&DontAlign)==0,(Options_&AllocateUVM)>(size); + else + m_data = 0; + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + } + m_dimensions = nbDimensions; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T *data() { return m_data; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T *data() const { return m_data; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex size() const { return m_dimensions.TotalSize(); } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSORSTORAGE_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h new file mode 100644 index 0000000000..8abe5ea8e4 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -0,0 +1,329 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H +#define EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H + +namespace Eigen { + +/** \class TensorStriding + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor striding class. + * + * + */ +namespace internal { +template<typename Strides, typename XprType> +struct traits<TensorStridingOp<Strides, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename Strides, typename XprType> +struct eval<TensorStridingOp<Strides, XprType>, Eigen::Dense> +{ + typedef const TensorStridingOp<Strides, XprType>& type; +}; + +template<typename Strides, typename XprType> +struct nested<TensorStridingOp<Strides, XprType>, 1, typename eval<TensorStridingOp<Strides, XprType> >::type> +{ + typedef TensorStridingOp<Strides, XprType> type; +}; + +} // end namespace internal + + + +template<typename Strides, typename XprType> +class TensorStridingOp : public TensorBase<TensorStridingOp<Strides, XprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorStridingOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorStridingOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorStridingOp>::type Nested; + typedef typename Eigen::internal::traits<TensorStridingOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorStridingOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingOp(const XprType& expr, const Strides& dims) + : m_xpr(expr), m_dims(dims) {} + + EIGEN_DEVICE_FUNC + const Strides& strides() const { return m_dims; } + + 
EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorStridingOp& operator = (const TensorStridingOp& other) + { + typedef TensorAssignOp<TensorStridingOp, const TensorStridingOp> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run( + assign, DefaultDevice()); + return *this; + } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorStridingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp<TensorStridingOp, const OtherDerived> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run( + assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + const Strides m_dims; +}; + + +// Eval as rvalue +template<typename Strides, typename ArgType, typename Device> +struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> +{ + typedef TensorStridingOp<Strides, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + + enum { + IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + m_dimensions = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] = ceilf(static_cast<float>(m_dimensions[i]) / op.strides()[i]); + } + + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_outputStrides[0] = 1; + m_inputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_inputStrides[i-1] *= op.strides()[i-1]; + } + m_inputStrides[NumDims-1] *= op.strides()[NumDims-1]; + } else { // RowMajor + m_outputStrides[NumDims-1] = 1; + m_inputStrides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + m_inputStrides[i+1] *= op.strides()[i+1]; + } + m_inputStrides[0] *= op.strides()[0]; + } + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(srcCoeff(index)); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + 
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStrides[i]; + const Index idx1 = indices[1] / m_outputStrides[i]; + inputIndices[0] += idx0 * m_inputStrides[i]; + inputIndices[1] += idx1 * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += indices[0] * m_inputStrides[0]; + inputIndices[1] += indices[1] * m_inputStrides[0]; + } else { // RowMajor + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / m_outputStrides[i]; + const Index idx1 = indices[1] / m_outputStrides[i]; + inputIndices[0] += idx0 * m_inputStrides[i]; + inputIndices[1] += idx1 * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += indices[0] * m_inputStrides[NumDims-1]; + inputIndices[1] += indices[1] * m_inputStrides[NumDims-1]; + } + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]); + return rslt; + } + else { + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + values[0] = m_impl.coeff(inputIndices[0]); + values[packetSize-1] = m_impl.coeff(inputIndices[1]); + for (int i = 1; i < packetSize-1; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const + { + Index inputIndex = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += index * m_inputStrides[0]; + } else { // RowMajor + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += index * m_inputStrides[NumDims-1]; + } + return inputIndex; + } + + Dimensions m_dimensions; + array<Index, NumDims> m_outputStrides; + array<Index, NumDims> m_inputStrides; + TensorEvaluator<ArgType, Device> m_impl; +}; + + +// Eval as lvalue +template<typename Strides, typename ArgType, typename Device> +struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device> + : public TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> +{ + typedef TensorStridingOp<Strides, ArgType> XprType; + typedef TensorEvaluator<const XprType, Device> Base; + // typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + // typedef DSizes<Index, NumDims> Dimensions; + + enum { + IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : 
Base(op, device) { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) + { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < this->dimensions().TotalSize()); + + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / this->m_outputStrides[i]; + const Index idx1 = indices[1] / this->m_outputStrides[i]; + inputIndices[0] += idx0 * this->m_inputStrides[i]; + inputIndices[1] += idx1 * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += indices[0] * this->m_inputStrides[0]; + inputIndices[1] += indices[1] * this->m_inputStrides[0]; + } else { // RowMajor + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / this->m_outputStrides[i]; + const Index idx1 = indices[1] / this->m_outputStrides[i]; + inputIndices[0] += idx0 * this->m_inputStrides[i]; + inputIndices[1] += idx1 * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += indices[0] * this->m_inputStrides[NumDims-1]; + inputIndices[1] += indices[1] * this->m_inputStrides[NumDims-1]; + } + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + this->m_impl.template writePacket<Unaligned>(inputIndices[0], x); + } + else { + EIGEN_ALIGN_DEFAULT Scalar values[packetSize]; + internal::pstore<Scalar, PacketReturnType>(values, x); + this->m_impl.coeffRef(inputIndices[0]) = values[0]; + this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1]; + for (int i = 1; i < packetSize-1; ++i) { + this->coeffRef(index+i) = values[i]; + } + } + } +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h new file mode 100644 index 0000000000..b8c1eadfc3 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h @@ -0,0 +1,294 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H +#define EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H + +namespace Eigen { +namespace internal { + + +template<typename Scalar, int Options> +class compute_tensor_flags +{ + enum { + is_dynamic_size_storage = 1, + + aligned_bit = + ( + ((Options&DontAlign)==0) && ( +#if EIGEN_ALIGN_STATICALLY + (!is_dynamic_size_storage) +#else + 0 +#endif + || +#if EIGEN_ALIGN + is_dynamic_size_storage +#else + 0 +#endif + ) + ) ? 
AlignedBit : 0, + packet_access_bit = packet_traits<Scalar>::Vectorizable && aligned_bit ? PacketAccessBit : 0 + }; + + public: + enum { ret = packet_access_bit | aligned_bit}; +}; + + +template<typename Scalar_, std::size_t NumIndices_, int Options_, typename IndexType_> +struct traits<Tensor<Scalar_, NumIndices_, Options_, IndexType_> > +{ + typedef Scalar_ Scalar; + typedef Dense StorageKind; + typedef IndexType_ Index; + static const int NumDimensions = NumIndices_; + static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor; + enum { + Options = Options_, + Flags = compute_tensor_flags<Scalar_, Options_>::ret | (is_const<Scalar_>::value ? 0 : LvalueBit), + }; +}; + + +template<typename Scalar_, typename Dimensions, int Options_, typename IndexType_> +struct traits<TensorFixedSize<Scalar_, Dimensions, Options_, IndexType_> > +{ + typedef Scalar_ Scalar; + typedef Dense StorageKind; + typedef IndexType_ Index; + static const int NumDimensions = array_size<Dimensions>::value; + static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor; + enum { + Options = Options_, + Flags = compute_tensor_flags<Scalar_, Options_>::ret | (is_const<Scalar_>::value ? 0: LvalueBit), + }; +}; + + +template<typename Scalar_, int Options_, typename IndexType_> +struct traits<TensorVarDim<Scalar_, Options_, IndexType_> > +{ + typedef Scalar_ Scalar; + typedef Dense StorageKind; + typedef IndexType_ Index; + static const int NumDimensions = -1; + static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor; + enum { + Options = Options_, + Flags = compute_tensor_flags<Scalar_, Options_>::ret | (is_const<Scalar_>::value ? 0 : LvalueBit), + }; +}; + +template<typename PlainObjectType, int Options_> +struct traits<TensorMap<PlainObjectType, Options_> > + : public traits<PlainObjectType> +{ + typedef traits<PlainObjectType> BaseTraits; + typedef typename BaseTraits::Scalar Scalar; + typedef typename BaseTraits::StorageKind StorageKind; + typedef typename BaseTraits::Index Index; + static const int NumDimensions = BaseTraits::NumDimensions; + static const int Layout = BaseTraits::Layout; + enum { + Options = Options_, + Flags = (BaseTraits::Flags & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0), + }; +}; + +template<typename PlainObjectType> +struct traits<TensorRef<PlainObjectType> > + : public traits<PlainObjectType> +{ + typedef traits<PlainObjectType> BaseTraits; + typedef typename BaseTraits::Scalar Scalar; + typedef typename BaseTraits::StorageKind StorageKind; + typedef typename BaseTraits::Index Index; + static const int NumDimensions = BaseTraits::NumDimensions; + static const int Layout = BaseTraits::Layout; + enum { + Options = BaseTraits::Options, + Flags = (BaseTraits::Flags & ~AlignedBit) | (Options&Aligned ? 
AlignedBit : 0), + }; +}; + + +template<typename _Scalar, std::size_t NumIndices_, int Options, typename IndexType_> +struct eval<Tensor<_Scalar, NumIndices_, Options, IndexType_>, Eigen::Dense> +{ + typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>& type; +}; + +template<typename _Scalar, std::size_t NumIndices_, int Options, typename IndexType_> +struct eval<const Tensor<_Scalar, NumIndices_, Options, IndexType_>, Eigen::Dense> +{ + typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>& type; +}; + +template<typename Scalar_, typename Dimensions, int Options, typename IndexType_> +struct eval<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, Eigen::Dense> +{ + typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type; +}; + +template<typename Scalar_, typename Dimensions, int Options, typename IndexType_> +struct eval<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, Eigen::Dense> +{ + typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type; +}; + +template<typename Scalar_, int Options, typename IndexType_> +struct eval<TensorVarDim<Scalar_, Options, IndexType_>, Eigen::Dense> +{ + typedef const TensorVarDim<Scalar_, Options, IndexType_>& type; +}; + +template<typename Scalar_, int Options, typename IndexType_> +struct eval<const TensorVarDim<Scalar_, Options, IndexType_>, Eigen::Dense> +{ + typedef const TensorVarDim<Scalar_, Options, IndexType_>& type; +}; + +template<typename PlainObjectType, int Options> +struct eval<TensorMap<PlainObjectType, Options>, Eigen::Dense> +{ + typedef const TensorMap<PlainObjectType, Options>& type; +}; + +template<typename PlainObjectType, int Options> +struct eval<const TensorMap<PlainObjectType, Options>, Eigen::Dense> +{ + typedef const TensorMap<PlainObjectType, Options>& type; +}; + +template<typename PlainObjectType> +struct eval<TensorRef<PlainObjectType>, Eigen::Dense> +{ + typedef const TensorRef<PlainObjectType>& type; +}; + +template<typename PlainObjectType> +struct eval<const TensorRef<PlainObjectType>, Eigen::Dense> +{ + typedef const TensorRef<PlainObjectType>& type; +}; + + +template <typename Scalar_, std::size_t NumIndices_, int Options_, typename IndexType_> +struct nested<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, 1, typename eval<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >::type> +{ + typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>& type; +}; + +template <typename Scalar_, std::size_t NumIndices_, int Options_, typename IndexType_> +struct nested<const Tensor<Scalar_, NumIndices_, Options_, IndexType_>, 1, typename eval<const Tensor<Scalar_, NumIndices_, Options_, IndexType_> >::type> +{ + typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>& type; +}; + +template <typename Scalar_, typename Dimensions, int Options, typename IndexType_> +struct nested<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, 1, typename eval<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> >::type> +{ + typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type; +}; + +template <typename Scalar_, typename Dimensions, int Options, typename IndexType_> +struct nested<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, 1, typename eval<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> >::type> +{ + typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type; +}; + +template <typename Scalar_, int Options> +struct nested<TensorVarDim<Scalar_, 
Options>, 1, typename eval<TensorVarDim<Scalar_, Options> >::type> +{ + typedef const TensorVarDim<Scalar_, Options>& type; +}; + +template <typename Scalar_, int Options> +struct nested<const TensorVarDim<Scalar_, Options>, 1, typename eval<const TensorVarDim<Scalar_, Options> >::type> +{ + typedef const TensorVarDim<Scalar_, Options>& type; +}; + + +template <typename PlainObjectType, int Options> +struct nested<TensorMap<PlainObjectType, Options>, 1, typename eval<TensorMap<PlainObjectType, Options> >::type> +{ + typedef const TensorMap<PlainObjectType, Options>& type; +}; + +template <typename PlainObjectType, int Options> +struct nested<const TensorMap<PlainObjectType, Options>, 1, typename eval<TensorMap<PlainObjectType, Options> >::type> +{ + typedef const TensorMap<PlainObjectType, Options>& type; +}; + +template <typename PlainObjectType> +struct nested<TensorRef<PlainObjectType>, 1, typename eval<TensorRef<PlainObjectType> >::type> +{ + typedef const TensorRef<PlainObjectType>& type; +}; + +template <typename PlainObjectType> +struct nested<const TensorRef<PlainObjectType>, 1, typename eval<TensorRef<PlainObjectType> >::type> +{ + typedef const TensorRef<PlainObjectType>& type; +}; + +} // end namespace internal + +// Convolutional layers take in an input tensor of shape (D, R, C, B), or (D, C, +// R, B), and convolve it with a set of filters, which can also be presented as +// a tensor (D, K, K, M), where M is the number of filters, K is the filter +// size, and each 3-dimensional tensor of size (D, K, K) is a filter. For +// simplicity we assume that we always use square filters (which is usually the +// case in images), hence the two Ks in the tensor dimension. It also takes in +// a few additional parameters: +// Stride (S): The convolution stride is the offset between locations where we +// apply the filters. A larger stride means that the output will be +// spatially smaller. +// Padding (P): The padding we apply to the input tensor along the R and C +// dimensions. This is usually used to make sure that the spatial +// dimensions of the output matches our intention. +// +// Two types of padding are often used: +// SAME: The pad value is computed so that the output will have size +// R/S and C/S. +// VALID: no padding is carried out. +// When we do padding, the padded values at the padded locations are usually +// zero. +// +// The output dimensions for convolution, when given all the parameters above, +// are as follows: +// When Padding = SAME: the output size is (B, R', C', M), where +// R' = ceil(float(R) / float(S)) +// C' = ceil(float(C) / float(S)) +// where ceil is the ceiling function. The input tensor is padded with 0 as +// needed. The number of padded rows and columns are computed as: +// Pr = ((R' - 1) * S + K - R) / 2 +// Pc = ((C' - 1) * S + K - C) / 2 +// when the stride is 1, we have the simplified case R'=R, C'=C, Pr=Pc=(K-1)/2. +// This is where SAME comes from - the output has the same size as the input has. +// When Padding = VALID: the output size is computed as +// R' = ceil(float(R - K + 1) / float(S)) +// C' = ceil(float(C - K + 1) / float(S)) +// and the number of padded rows and columns are computed in the same way as in +// the SAME case. +// When the stride is 1, we have the simplified case R'=R-K+1, C'=C-K+1, Pr=0, +// Pc=0. 
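The padding arithmetic described in the comment above is easy to get wrong in integer code. The following minimal sketch is illustrative only and is not part of the Eigen sources; the struct and helper name are hypothetical. It computes, for one spatial dimension, the output extent and the total zero padding for the SAME and VALID schemes using the formulas given above.

#include <algorithm>
#include <cassert>

struct PaddedShape {
  long out;  // output extent R' (or C')
  long pad;  // total zero padding along this dimension (per side: pad / 2)
};

// SAME:  out = ceil(R / S),           pad = max((out - 1) * S + K - R, 0)
// VALID: out = ceil((R - K + 1) / S), pad = 0
inline PaddedShape ComputePaddedShape(long R, long K, long S, bool same_padding) {
  assert(R > 0 && K > 0 && S > 0);
  PaddedShape result;
  if (same_padding) {
    result.out = (R + S - 1) / S;                           // ceil(R / S)
    result.pad = std::max((result.out - 1) * S + K - R, 0L);
  } else {
    result.out = (R - K + 1 + S - 1) / S;                   // ceil((R - K + 1) / S)
    result.pad = 0;
  }
  return result;
}

// With R = 5, K = 3, S = 1 and SAME padding this yields out = 5 and pad = 2,
// i.e. Pr = pad / 2 = 1 = (K - 1) / 2, matching the simplified stride-1 case
// described above.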
+typedef enum { + PADDING_VALID = 1, + PADDING_SAME = 2, +} PaddingType; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorTrueIndices.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorTrueIndices.h new file mode 100644 index 0000000000..ec1d44e6a6 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorTrueIndices.h @@ -0,0 +1,250 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Eugene Brevdo <ebrevdo@google.com> +// Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRUE_INDICES_H +#define EIGEN_CXX11_TENSOR_TENSOR_TRUE_INDICES_H +namespace Eigen { + +/** \class TensorTrueIndices + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor provide indices of true values class. + * + */ +namespace internal { +template<typename XprType> +struct traits<TensorTrueIndicesOp<XprType> > : public traits<XprType> +{ + typedef DenseIndex Scalar; + typedef DenseIndex CoeffReturnType; + typedef traits<XprType> XprTraits; + //typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = 2; // XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename XprType> +struct eval<TensorTrueIndicesOp<XprType>, Eigen::Dense> +{ + typedef const TensorTrueIndicesOp<XprType>& type; +}; + +template<typename XprType> +struct nested<TensorTrueIndicesOp<XprType>, 1, + typename eval<TensorTrueIndicesOp<XprType> >::type> +{ + typedef TensorTrueIndicesOp<XprType> type; +}; + +} // end namespace internal + +template<typename XprType> +class TensorTrueIndicesOp : public TensorBase<TensorTrueIndicesOp<XprType>, WriteAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorTrueIndicesOp>::Scalar Scalar; + //typedef typename Eigen::internal::traits<TensorTrueIndicesOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename Eigen::internal::traits<TensorTrueIndicesOp>::CoeffReturnType CoeffReturnType; + typedef typename internal::packet_traits<CoeffReturnType>::type PacketReturnType; + typedef typename Eigen::internal::nested<TensorTrueIndicesOp>::type Nested; + typedef typename Eigen::internal::traits<TensorTrueIndicesOp>::StorageKind + StorageKind; + typedef typename Eigen::internal::traits<TensorTrueIndicesOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTrueIndicesOp( + const XprType& expr, const CoeffReturnType& not_found = -1) + : m_xpr(expr), m_not_found(not_found) { + } + + EIGEN_DEVICE_FUNC + const CoeffReturnType& not_found() const { return m_not_found; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorTrueIndicesOp& operator = (const TensorTrueIndicesOp& other) + { + typedef TensorAssignOp<TensorTrueIndicesOp, const TensorTrueIndicesOp> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, 
DefaultDevice>::run( + assign, DefaultDevice()); + return *this; + } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorTrueIndicesOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp<TensorTrueIndicesOp, const OtherDerived> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run( + assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + CoeffReturnType m_not_found; +}; + +// Eval as rvalue +template<typename ArgType, typename Device> +struct TensorEvaluator<const TensorTrueIndicesOp<ArgType>, Device> +{ + typedef TensorTrueIndicesOp<ArgType> XprType; + typedef typename XprType::Index InputIndex; + typedef typename XprType::Index Index; + static const int NumDims = 2; + typedef DSizes<Index, 2> Dimensions; + typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions; + static const int NumInputDims = internal::array_size<InputDimensions>::value; + + enum { + IsAligned = true, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, + const Device& device) + : m_impl(op.expression(), device), m_not_found(op.not_found()) + { + // Store original dimensions + m_orig_dimensions = m_impl.dimensions(); + + // Calculate output dimensions + m_dimensions[0] = m_orig_dimensions.TotalSize(); + m_dimensions[1] = NumInputDims; + + // Calculate strides of input expression + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_strides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + m_strides[i] = m_strides[i-1] * m_orig_dimensions[i-1]; + } + } else { + m_strides[NumInputDims-1] = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + m_strides[i] = m_strides[i+1] * m_orig_dimensions[i+1]; + } + } + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE InputIndex origIndices( + Index index) const { + eigen_assert(index < dimensions().TotalSize()); + Index inputIndex = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + inputIndex = index % m_dimensions[0]; + } else { + inputIndex = index / m_dimensions[1]; + } + return inputIndex; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int whichDim( + Index index) const { + eigen_assert(index < dimensions().TotalSize()); + int inputDim = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + inputDim = index / m_dimensions[0]; + } else { + inputDim = index % m_dimensions[1]; + } + return inputDim; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType origDim( + int dim, InputIndex index) const { + eigen_assert(index < m_orig_dimensions.TotalSize()); + eigen_assert(dim > -1 && dim < m_orig_dimensions.size()); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumInputDims - 1; i > 0; --i) { + Index idx = index / m_strides[i]; + if (i == dim) return idx; 
// Found our dimension + index -= idx * m_strides[i]; + } + return index; + } else { + for (int i = 0; i < NumInputDims - 1; ++i) { + Index idx = index / m_strides[i]; + if (i == dim) return idx; // Found our dimension + index -= idx * m_strides[i]; + } + return index; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff( + Index index) const { + InputIndex orig_index = origIndices(index); + if (m_impl.coeff(orig_index)) + return origDim(whichDim(index), orig_index); + else { + return m_not_found; + } + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + // TODO(ndjaitly): write a better packing routine that uses + // local structure. + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type + values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + InputDimensions m_orig_dimensions; + Dimensions m_dimensions; + TensorEvaluator<ArgType, Device> m_impl; + array<Index, NumInputDims> m_strides; + CoeffReturnType m_not_found; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_TRUE_INDICES_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorVarDim.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorVarDim.h new file mode 100644 index 0000000000..49954b955e --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorVarDim.h @@ -0,0 +1,315 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_VAR_DIM_H +#define EIGEN_CXX11_TENSOR_TENSOR_VAR_DIM_H + +namespace Eigen { + +/** \class Tensor + * \ingroup CXX11_Tensor_Module + * + * \brief A version of the tensor class that supports a variable number of dimensions. + * + * The variable equivalent of + * Eigen::Tensor<float, 3> t(3, 5, 7); + * is + * Eigen::TensorVarDim<float> t(3, 5, 7); + */ + +template<typename Scalar_, int Options_, typename IndexType_> +class TensorVarDim : public TensorBase<TensorVarDim<Scalar_, Options_, IndexType_> > +{ + public: + typedef TensorVarDim<Scalar_, Options_, IndexType_> Self; + typedef TensorBase<TensorVarDim<Scalar_, Options_, IndexType_> > Base; + typedef typename Eigen::internal::nested<Self>::type Nested; + typedef typename internal::traits<Self>::StorageKind StorageKind; + typedef typename internal::traits<Self>::Index Index; + typedef Scalar_ Scalar; + typedef typename internal::packet_traits<Scalar>::type Packet; + typedef typename NumTraits<Scalar>::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + typedef typename Base::PacketReturnType PacketReturnType; + + enum { + IsAligned = bool(EIGEN_ALIGN) & !(Options_ & DontAlign), + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + BlockAccess = false, + Layout = Options_ & RowMajor ? 
RowMajor : ColMajor, + // disabled for now as the number of coefficients is not known by the + // caller at compile time. + CoordAccess = false, + }; + + static const int Options = Options_; + + static const Index NumIndices = Dynamic; + + typedef VSizes<Index> Dimensions; + + protected: + TensorStorage<Scalar, VSizes<Index>, Options_> m_storage; + + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return m_storage.dimensions().size(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } + + // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + // work, because that uses base().coeffRef() - and we don't yet + // implement a similar class hierarchy + inline Self& base() { return *this; } + inline const Self& base() const { return *this; } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + EIGEN_DEVICE_FUNC inline const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + static const std::size_t NumIndices = sizeof...(otherIndices) + 2; + return coeff(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}}); + } +#endif + + template <std::size_t NumIndices> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(const array<Index, NumIndices>& indices) const + { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + inline Scalar& coeffRef(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) + { + static const std::size_t NumIndices = sizeof...(otherIndices) + 2; + return coeffRef(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}}); + } +#endif + + template <std::size_t NumIndices> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices) + { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) + { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + inline const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
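+      // The indices are packed into a fixed-size array and dispatched to the
+      // array-based operator() overload defined below.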
+ static const std::size_t NumIndices = sizeof...(otherIndices) + 2; + return this->operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}}); + } +#endif + + template <std::size_t NumIndices> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const + { + eigen_assert(checkIndexRange(indices)); + return coeff(indices); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return coeff(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const + { + return coeff(index); + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + inline Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + static const size_t NumIndices = sizeof...(otherIndices) + 1; + return operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}}); + } +#endif + + template <std::size_t NumIndices> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices) + { + eigen_assert(checkIndexRange(indices)); + return coeffRef(indices); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index index) + { + eigen_assert(index >= 0 && index < size()); + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator[](Index index) + { + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorVarDim() + : m_storage() + { + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorVarDim(const Self& other) + : m_storage(other.m_storage) + { + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + EIGEN_STRONG_INLINE TensorVarDim(Index firstDimension, IndexTypes... otherDimensions) + : m_storage(firstDimension, otherDimensions...) 
+ { + } +#endif + + EIGEN_STRONG_INLINE explicit TensorVarDim(const std::vector<Index>& dimensions) + : m_storage(dimensions) + { + EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorVarDim(const TensorBase<OtherDerived, ReadOnlyAccessors>& other) + { + typedef TensorAssignOp<TensorVarDim, const OtherDerived> Assign; + Assign assign(*this, other.derived()); + resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions()); + internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); + } + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorVarDim(const TensorBase<OtherDerived, WriteAccessors>& other) + { + typedef TensorAssignOp<TensorVarDim, const OtherDerived> Assign; + Assign assign(*this, other.derived()); + resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions()); + internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorVarDim& operator=(const TensorVarDim& other) + { + typedef TensorAssignOp<TensorVarDim, const TensorVarDim> Assign; + Assign assign(*this, other); + resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions()); + internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); + return *this; + } + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorVarDim& operator=(const OtherDerived& other) + { + typedef TensorAssignOp<TensorVarDim, const OtherDerived> Assign; + Assign assign(*this, other); + resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions()); + internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); + return *this; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + void resize(Index firstDimension, IndexTypes... otherDimensions) + { + // The number of dimensions used to resize a tensor must be equal to the rank of the tensor. 
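+      // The dimensions are packed into a fixed-size array and forwarded to the
+      // array-based resize() overload defined below.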
+ EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + static const std::size_t NumIndices = sizeof...(otherDimensions) + 1; + resize(array<Index, NumIndices>{{firstDimension, otherDimensions...}}); + } +#endif + + template <size_t NumIndices> + void resize(const array<Index, NumIndices>& dimensions) + { + Index size = Index(1); + for (std::size_t i = 0; i < NumIndices; i++) { + internal::check_rows_cols_for_overflow<Dynamic>::run(size, dimensions[i]); + size *= dimensions[i]; + } + #ifdef EIGEN_INITIALIZE_COEFFS + bool size_changed = size != this->size(); + m_storage.resize(size, dimensions); + if(size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + #else + m_storage.resize(size, dimensions); + #endif + } + void resize(const std::vector<Index>& dimensions) + { + Index size = Index(1); + for (std::size_t i = 0; i < dimensions.size(); i++) { + internal::check_rows_cols_for_overflow<Dynamic>::run(size, dimensions[i]); + size *= dimensions[i]; + } + #ifdef EIGEN_INITIALIZE_COEFFS + bool size_changed = size != this->size(); + m_storage.resize(size, dimensions); + if(size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + #else + m_storage.resize(size, dimensions); + #endif + } + + protected: + template <std::size_t NumIndices> + bool checkIndexRange(const array<Index, NumIndices>& indices) const + { + /* using internal::array_apply_and_reduce; + using internal::array_zip_and_reduce; + using internal::greater_equal_zero_op; + using internal::logical_and_op; + using internal::lesser_op; + + return + // check whether the indices are all >= 0 + array_apply_and_reduce<logical_and_op, greater_equal_zero_op>(indices) && + // check whether the indices fit in the dimensions + array_zip_and_reduce<logical_and_op, lesser_op>(indices, m_storage.dimensions()); + */ + return true; + } + + template <std::size_t NumIndices> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index linearizedIndex(const array<Index, NumIndices>& indices) const + { + if (Options&RowMajor) { + return m_storage.dimensions().IndexOfRowMajor(indices); + } else { + return m_storage.dimensions().IndexOfColMajor(indices); + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_VAR_DIM_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h new file mode 100644 index 0000000000..de86c57f11 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -0,0 +1,677 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H +#define EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H + +namespace Eigen { + +/** \class TensorVolumePatch + * \ingroup CXX11_Tensor_Module + * + * \brief Patch extraction specialized for processing of volumetric data. + * This assumes that the input has a least 4 dimensions ordered as follows: + * - channels + * - planes + * - rows + * - columns + * - (optional) additional dimensions such as time or batch size. + * Calling the volume patch code with patch_planes, patch_rows, and patch_cols + * is equivalent to calling the regular patch extraction code with parameters + * d, patch_planes, patch_rows, patch_cols, and 1 for all the additional + * dimensions. 
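+ *
+ * Illustrative sketch (assuming this op is reached through the
+ * extract_volume_patches() entry point of TensorBase): extracting 2x2x2
+ * patches from a ColMajor input of shape (depth, planes, rows, cols, batch)
+ * yields a tensor of shape (depth, 2, 2, 2, number_of_patches, batch).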
+ */ +namespace internal { +template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType> +struct traits<TensorVolumePatchOp<Planes, Rows, Cols, XprType> > : public traits<XprType> +{ + typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions + 1; + static const int Layout = XprTraits::Layout; +}; + +template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType> +struct eval<TensorVolumePatchOp<Planes, Rows, Cols, XprType>, Eigen::Dense> +{ + typedef const TensorVolumePatchOp<Planes, Rows, Cols, XprType>& type; +}; + +template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType> +struct nested<TensorVolumePatchOp<Planes, Rows, Cols, XprType>, 1, typename eval<TensorVolumePatchOp<Planes, Rows, Cols, XprType> >::type> +{ + typedef TensorVolumePatchOp<Planes, Rows, Cols, XprType> type; +}; + +} // end namespace internal + +template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType> +class TensorVolumePatchOp : public TensorBase<TensorVolumePatchOp<Planes, Rows, Cols, XprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorVolumePatchOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorVolumePatchOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorVolumePatchOp>::type Nested; + typedef typename Eigen::internal::traits<TensorVolumePatchOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorVolumePatchOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp(const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols, + DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides, + DenseIndex in_plane_strides, DenseIndex in_row_strides, DenseIndex in_col_strides, + DenseIndex plane_inflate_strides, DenseIndex row_inflate_strides, DenseIndex col_inflate_strides, + PaddingType padding_type, Scalar padding_value) + : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols), + m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides), + m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), + m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides), + m_padding_explicit(false), m_padding_top_z(0), m_padding_bottom_z(0), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0), + m_padding_type(padding_type), m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp(const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols, + DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides, + DenseIndex in_plane_strides, DenseIndex in_row_strides, DenseIndex in_col_strides, + DenseIndex plane_inflate_strides, DenseIndex 
row_inflate_strides, DenseIndex col_inflate_strides, + DenseIndex padding_top_z, DenseIndex padding_bottom_z, + DenseIndex padding_top, DenseIndex padding_bottom, + DenseIndex padding_left, DenseIndex padding_right, + Scalar padding_value) + : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols), + m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides), + m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), + m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides), + m_padding_explicit(true), m_padding_top_z(padding_top_z), m_padding_bottom_z(padding_bottom_z), m_padding_top(padding_top), m_padding_bottom(padding_bottom), + m_padding_left(padding_left), m_padding_right(padding_right), + m_padding_type(PADDING_VALID), m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC + DenseIndex patch_planes() const { return m_patch_planes; } + EIGEN_DEVICE_FUNC + DenseIndex patch_rows() const { return m_patch_rows; } + EIGEN_DEVICE_FUNC + DenseIndex patch_cols() const { return m_patch_cols; } + EIGEN_DEVICE_FUNC + DenseIndex plane_strides() const { return m_plane_strides; } + EIGEN_DEVICE_FUNC + DenseIndex row_strides() const { return m_row_strides; } + EIGEN_DEVICE_FUNC + DenseIndex col_strides() const { return m_col_strides; } + EIGEN_DEVICE_FUNC + DenseIndex in_plane_strides() const { return m_in_plane_strides; } + EIGEN_DEVICE_FUNC + DenseIndex in_row_strides() const { return m_in_row_strides; } + EIGEN_DEVICE_FUNC + DenseIndex in_col_strides() const { return m_in_col_strides; } + EIGEN_DEVICE_FUNC + DenseIndex plane_inflate_strides() const { return m_plane_inflate_strides; } + EIGEN_DEVICE_FUNC + DenseIndex row_inflate_strides() const { return m_row_inflate_strides; } + EIGEN_DEVICE_FUNC + DenseIndex col_inflate_strides() const { return m_col_inflate_strides; } + EIGEN_DEVICE_FUNC + bool padding_explicit() const { return m_padding_explicit; } + EIGEN_DEVICE_FUNC + DenseIndex padding_top_z() const { return m_padding_top_z; } + EIGEN_DEVICE_FUNC + DenseIndex padding_bottom_z() const { return m_padding_bottom_z; } + EIGEN_DEVICE_FUNC + DenseIndex padding_top() const { return m_padding_top; } + EIGEN_DEVICE_FUNC + DenseIndex padding_bottom() const { return m_padding_bottom; } + EIGEN_DEVICE_FUNC + DenseIndex padding_left() const { return m_padding_left; } + EIGEN_DEVICE_FUNC + DenseIndex padding_right() const { return m_padding_right; } + EIGEN_DEVICE_FUNC + PaddingType padding_type() const { return m_padding_type; } + EIGEN_DEVICE_FUNC + Scalar padding_value() const { return m_padding_value; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const DenseIndex m_patch_planes; + const DenseIndex m_patch_rows; + const DenseIndex m_patch_cols; + const DenseIndex m_plane_strides; + const DenseIndex m_row_strides; + const DenseIndex m_col_strides; + const DenseIndex m_in_plane_strides; + const DenseIndex m_in_row_strides; + const DenseIndex m_in_col_strides; + const DenseIndex m_plane_inflate_strides; + const DenseIndex m_row_inflate_strides; + const DenseIndex m_col_inflate_strides; + const bool m_padding_explicit; + const DenseIndex m_padding_top_z; + const DenseIndex m_padding_bottom_z; + const DenseIndex m_padding_top; + const DenseIndex m_padding_bottom; + const 
DenseIndex m_padding_left; + const DenseIndex m_padding_right; + const PaddingType m_padding_type; + const Scalar m_padding_value; +}; + + +// Eval as rvalue +template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename ArgType, typename Device> +struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, Device> +{ + typedef TensorVolumePatchOp<Planes, Rows, Cols, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + static const int NumDims = NumInputDims + 1; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = NumDims == 6, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + EIGEN_STATIC_ASSERT(NumDims >= 5, YOU_MADE_A_PROGRAMMING_MISTAKE); + + m_paddingValue = op.padding_value(); + + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + + // Cache a few variables. + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_inputDepth = input_dims[0]; + m_inputPlanes = input_dims[1]; + m_inputRows = input_dims[2]; + m_inputCols = input_dims[3]; + } else { + m_inputDepth = input_dims[NumInputDims-1]; + m_inputPlanes = input_dims[NumInputDims-2]; + m_inputRows = input_dims[NumInputDims-3]; + m_inputCols = input_dims[NumInputDims-4]; + } + + m_plane_strides = op.plane_strides(); + m_row_strides = op.row_strides(); + m_col_strides = op.col_strides(); + + // Input strides and effective input/patch size + m_in_plane_strides = op.in_plane_strides(); + m_in_row_strides = op.in_row_strides(); + m_in_col_strides = op.in_col_strides(); + m_plane_inflate_strides = op.plane_inflate_strides(); + m_row_inflate_strides = op.row_inflate_strides(); + m_col_inflate_strides = op.col_inflate_strides(); + + // The "effective" spatial size after inflating data with zeros. 
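+    // Illustrative numbers: with 5 input planes and a plane inflate stride of 2,
+    // the effective input spans (5 - 1) * 2 + 1 = 9 planes, while a patch of 3
+    // planes sampled with an input plane stride of 2 covers 3 + (3 - 1) * (2 - 1)
+    // = 5 effective planes. The same formulas apply to rows and columns.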
+ m_input_planes_eff = (m_inputPlanes - 1) * m_plane_inflate_strides + 1; + m_input_rows_eff = (m_inputRows - 1) * m_row_inflate_strides + 1; + m_input_cols_eff = (m_inputCols - 1) * m_col_inflate_strides + 1; + m_patch_planes_eff = op.patch_planes() + (op.patch_planes() - 1) * (m_in_plane_strides - 1); + m_patch_rows_eff = op.patch_rows() + (op.patch_rows() - 1) * (m_in_row_strides - 1); + m_patch_cols_eff = op.patch_cols() + (op.patch_cols() - 1) * (m_in_col_strides - 1); + + if (op.padding_explicit()) { + m_outputPlanes = ceil((m_input_planes_eff + op.padding_top_z() + op.padding_bottom_z() - m_patch_planes_eff + 1.f) / static_cast<float>(m_plane_strides)); + m_outputRows = ceil((m_input_rows_eff + op.padding_top() + op.padding_bottom() - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides)); + m_outputCols = ceil((m_input_cols_eff + op.padding_left() + op.padding_right() - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides)); + m_planePaddingTop = op.padding_top_z(); + m_rowPaddingTop = op.padding_top(); + m_colPaddingLeft = op.padding_left(); + } else { + // Computing padding from the type + switch (op.padding_type()) { + case PADDING_VALID: + m_outputPlanes = ceil((m_input_planes_eff - m_patch_planes_eff + 1.f) / static_cast<float>(m_plane_strides)); + m_outputRows = ceil((m_input_rows_eff - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides)); + m_outputCols = ceil((m_input_cols_eff - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides)); + m_planePaddingTop = 0; + m_rowPaddingTop = 0; + m_colPaddingLeft = 0; + break; + case PADDING_SAME: { + m_outputPlanes = ceil(m_input_planes_eff / static_cast<float>(m_plane_strides)); + m_outputRows = ceil(m_input_rows_eff / static_cast<float>(m_row_strides)); + m_outputCols = ceil(m_input_cols_eff / static_cast<float>(m_col_strides)); + const Index dz = m_outputPlanes * m_plane_strides + m_patch_planes_eff - 1 - m_input_planes_eff; + const Index dy = m_outputRows * m_row_strides + m_patch_rows_eff - 1 - m_input_rows_eff; + const Index dx = m_outputCols * m_col_strides + m_patch_cols_eff - 1 - m_input_cols_eff; + m_planePaddingTop = dz - dz / 2; + m_rowPaddingTop = dy - dy / 2; + m_colPaddingLeft = dx - dx / 2; + break; + } + default: + eigen_assert(false && "unexpected padding"); + } + } + eigen_assert(m_outputRows > 0); + eigen_assert(m_outputCols > 0); + eigen_assert(m_outputPlanes > 0); + + // Dimensions for result of extraction. + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + // ColMajor + // 0: depth + // 1: patch_planes + // 2: patch_rows + // 3: patch_cols + // 4: number of patches + // 5 and beyond: anything else (such as batch). + m_dimensions[0] = input_dims[0]; + m_dimensions[1] = op.patch_planes(); + m_dimensions[2] = op.patch_rows(); + m_dimensions[3] = op.patch_cols(); + m_dimensions[4] = m_outputPlanes * m_outputRows * m_outputCols; + for (int i = 5; i < NumDims; ++i) { + m_dimensions[i] = input_dims[i-1]; + } + } else { + // RowMajor + // NumDims-1: depth + // NumDims-2: patch_planes + // NumDims-3: patch_rows + // NumDims-4: patch_cols + // NumDims-5: number of patches + // NumDims-6 and beyond: anything else (such as batch). 
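+      // Illustrative example: a RowMajor input of shape (batch, cols, rows,
+      // planes, depth) = (10, 16, 16, 8, 3) with 2x2x2 patches produces
+      // m_dimensions = (10, number_of_patches, 2, 2, 2, 3).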
+ m_dimensions[NumDims-1] = input_dims[NumInputDims-1]; + m_dimensions[NumDims-2] = op.patch_planes(); + m_dimensions[NumDims-3] = op.patch_rows(); + m_dimensions[NumDims-4] = op.patch_cols(); + m_dimensions[NumDims-5] = m_outputPlanes * m_outputRows * m_outputCols; + for (int i = NumDims-6; i >= 0; --i) { + m_dimensions[i] = input_dims[i]; + } + } + + // Strides for the output tensor. + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_rowStride = m_dimensions[1]; + m_colStride = m_dimensions[2] * m_rowStride; + m_patchStride = m_colStride * m_dimensions[3] * m_dimensions[0]; + m_otherStride = m_patchStride * m_dimensions[4]; + } else { + m_rowStride = m_dimensions[NumDims-2]; + m_colStride = m_dimensions[NumDims-3] * m_rowStride; + m_patchStride = m_colStride * m_dimensions[NumDims-4] * m_dimensions[NumDims-1]; + m_otherStride = m_patchStride * m_dimensions[NumDims-5]; + } + + // Strides for navigating through the input tensor. + m_planeInputStride = m_inputDepth; + m_rowInputStride = m_inputDepth * m_inputPlanes; + m_colInputStride = m_inputDepth * m_inputRows * m_inputPlanes; + m_otherInputStride = m_inputDepth * m_inputRows * m_inputCols * m_inputPlanes; + + m_outputPlanesRows = m_outputPlanes * m_outputRows; + + // Fast representations of different variables. + m_fastOtherStride = internal::TensorIntDivisor<Index>(m_otherStride); + m_fastPatchStride = internal::TensorIntDivisor<Index>(m_patchStride); + m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride); + m_fastRowStride = internal::TensorIntDivisor<Index>(m_rowStride); + m_fastInputRowStride = internal::TensorIntDivisor<Index>(m_row_inflate_strides); + m_fastInputColStride = internal::TensorIntDivisor<Index>(m_col_inflate_strides); + m_fastInputPlaneStride = internal::TensorIntDivisor<Index>(m_plane_inflate_strides); + m_fastInputColsEff = internal::TensorIntDivisor<Index>(m_input_cols_eff); + m_fastOutputPlanes = internal::TensorIntDivisor<Index>(m_outputPlanes); + m_fastOutputPlanesRows = internal::TensorIntDivisor<Index>(m_outputPlanesRows); + + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[0]); + } else { + m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[NumDims-1]); + } + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + // Patch index corresponding to the passed in index. + const Index patchIndex = index / m_fastPatchStride; + + // Spatial offset within the patch. This has to be translated into 3D + // coordinates within the patch. + const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastOutputDepth; + + // Batch, etc. + const Index otherIndex = (NumDims == 5) ? 0 : index / m_fastOtherStride; + const Index patch3DIndex = (NumDims == 5) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride; + + // Calculate column index in the input original tensor. 
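+    // (The same pattern repeats below for rows and planes: decode the output
+    // coordinate with the precomputed fast divisors, map it back to an input
+    // coordinate, and return the padding value for positions that fall outside
+    // the input or on an inflated, i.e. zero-inserted, location.)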
+ const Index colIndex = patch3DIndex / m_fastOutputPlanesRows; + const Index colOffset = patchOffset / m_fastColStride; + const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft; + const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); + if (inputCol < 0 || inputCol >= m_input_cols_eff || + ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) { + return Scalar(m_paddingValue); + } + + // Calculate row index in the original input tensor. + const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes; + const Index rowOffset = (patchOffset - colOffset * m_colStride) / m_fastRowStride; + const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop; + const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0); + if (inputRow < 0 || inputRow >= m_input_rows_eff || + ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) { + return Scalar(m_paddingValue); + } + + // Calculate plane index in the original input tensor. + const Index planeIndex = (patch3DIndex - m_outputPlanes * (colIndex * m_outputRows + rowIndex)); + const Index planeOffset = patchOffset - colOffset * m_colStride - rowOffset * m_rowStride; + const Index inputPlane = planeIndex * m_plane_strides + planeOffset * m_in_plane_strides - m_planePaddingTop; + const Index origInputPlane = (m_plane_inflate_strides == 1) ? inputPlane : ((inputPlane >= 0) ? (inputPlane / m_fastInputPlaneStride) : 0); + if (inputPlane < 0 || inputPlane >= m_input_planes_eff || + ((m_plane_inflate_strides != 1) && (inputPlane != origInputPlane * m_plane_inflate_strides))) { + return Scalar(m_paddingValue); + } + + const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - 1; + const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; + + const Index inputIndex = depth + + origInputRow * m_rowInputStride + + origInputCol * m_colInputStride + + origInputPlane * m_planeInputStride + + otherIndex * m_otherInputStride; + + return m_impl.coeff(inputIndex); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const Index packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1 || + m_in_plane_strides != 1 || m_plane_inflate_strides != 1) { + return packetWithPossibleZero(index); + } + + const Index indices[2] = {index, index + packetSize - 1}; + const Index patchIndex = indices[0] / m_fastPatchStride; + if (patchIndex != indices[1] / m_fastPatchStride) { + return packetWithPossibleZero(index); + } + const Index otherIndex = (NumDims == 5) ? 0 : indices[0] / m_fastOtherStride; + eigen_assert(otherIndex == indices[1] / m_fastOtherStride); + + // Find the offset of the element wrt the location of the first element. + const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastOutputDepth, + (indices[1] - patchIndex * m_patchStride) / m_fastOutputDepth}; + + const Index patch3DIndex = (NumDims == 5) ? 
patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride; + eigen_assert(patch3DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride); + + const Index colIndex = patch3DIndex / m_fastOutputPlanesRows; + const Index colOffsets[2] = { + patchOffsets[0] / m_fastColStride, + patchOffsets[1] / m_fastColStride}; + + // Calculate col indices in the original input tensor. + const Index inputCols[2] = { + colIndex * m_col_strides + colOffsets[0] - m_colPaddingLeft, + colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft}; + if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) { + return internal::pset1<PacketReturnType>(Scalar(m_paddingValue)); + } + + if (inputCols[0] != inputCols[1]) { + return packetWithPossibleZero(index); + } + + const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes; + const Index rowOffsets[2] = { + (patchOffsets[0] - colOffsets[0] * m_colStride) / m_fastRowStride, + (patchOffsets[1] - colOffsets[1] * m_colStride) / m_fastRowStride}; + eigen_assert(rowOffsets[0] <= rowOffsets[1]); + // Calculate col indices in the original input tensor. + const Index inputRows[2] = { + rowIndex * m_row_strides + rowOffsets[0] - m_rowPaddingTop, + rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop}; + + if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) { + return internal::pset1<PacketReturnType>(Scalar(m_paddingValue)); + } + + if (inputRows[0] != inputRows[1]) { + return packetWithPossibleZero(index); + } + + const Index planeIndex = (patch3DIndex - m_outputPlanes * (colIndex * m_outputRows + rowIndex)); + const Index planeOffsets[2] = { + patchOffsets[0] - colOffsets[0] * m_colStride - rowOffsets[0] * m_rowStride, + patchOffsets[1] - colOffsets[1] * m_colStride - rowOffsets[1] * m_rowStride}; + eigen_assert(planeOffsets[0] <= planeOffsets[1]); + const Index inputPlanes[2] = { + planeIndex * m_plane_strides + planeOffsets[0] - m_planePaddingTop, + planeIndex * m_plane_strides + planeOffsets[1] - m_planePaddingTop}; + + if (inputPlanes[1] < 0 || inputPlanes[0] >= m_inputPlanes) { + return internal::pset1<PacketReturnType>(Scalar(m_paddingValue)); + } + + if (inputPlanes[0] >= 0 && inputPlanes[1] < m_inputPlanes) { + // no padding + const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 
0 : NumDims - 1; + const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; + const Index inputIndex = depth + + inputRows[0] * m_rowInputStride + + inputCols[0] * m_colInputStride + + m_planeInputStride * inputPlanes[0] + + otherIndex * m_otherInputStride; + return m_impl.template packet<Unaligned>(inputIndex); + } + + return packetWithPossibleZero(index); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + + Index planePaddingTop() const { return m_planePaddingTop; } + Index rowPaddingTop() const { return m_rowPaddingTop; } + Index colPaddingLeft() const { return m_colPaddingLeft; } + Index outputPlanes() const { return m_outputPlanes; } + Index outputRows() const { return m_outputRows; } + Index outputCols() const { return m_outputCols; } + Index userPlaneStride() const { return m_plane_strides; } + Index userRowStride() const { return m_row_strides; } + Index userColStride() const { return m_col_strides; } + Index userInPlaneStride() const { return m_in_plane_strides; } + Index userInRowStride() const { return m_in_row_strides; } + Index userInColStride() const { return m_in_col_strides; } + Index planeInflateStride() const { return m_plane_inflate_strides; } + Index rowInflateStride() const { return m_row_inflate_strides; } + Index colInflateStride() const { return m_col_inflate_strides; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const + { + // ColMajor + // 0: depth, 1: patch_planes, 2: patch_rows, 3: patch_cols, 4: number of patches, 5: batches + // RowMajor + // 0: batches, 1: number of patches, 2: patch_cols , 3: patch_rows, 4: patch_planes, 5: depth + const Index patch3DIndex = coords[static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 4 : 1]; + const Index colOffset = coords[static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 3 : 2]; + const Index rowOffset= coords[static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 3]; + const Index planeOffset = coords[static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 4]; + + array<Index, NumDims-1> inputCoords; + + const Index colIndex = patch3DIndex / m_fastOutputPlanesRows; + const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft; + const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); + if (inputCol < 0 || inputCol >= m_input_cols_eff || + ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) { + return Scalar(m_paddingValue); + } + + const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes; + const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop; + const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0); + if (inputRow < 0 || inputRow >= m_input_rows_eff || + ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) { + return Scalar(m_paddingValue); + } + + const Index planeIndex = patch3DIndex - colIndex * m_outputPlanesRows - rowIndex * m_outputRows; + const Index inputPlane = planeIndex * m_plane_strides + planeOffset * m_in_plane_strides - m_planePaddingTop; + const Index origInputPlane = (m_plane_inflate_strides == 1) ? inputPlane : ((inputPlane >= 0) ? 
(inputPlane / m_fastInputPlaneStride) : 0); + if (inputPlane < 0 || inputPlane >= m_input_planes_eff || + ((m_plane_inflate_strides != 1) && (inputPlane != origInputPlane * m_plane_inflate_strides))) { + return Scalar(m_paddingValue); + } + + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + inputCoords[0] = coords[0]; // depth + inputCoords[1] = origInputPlane; + inputCoords[2] = origInputRow; + inputCoords[3] = origInputCol; + inputCoords[4] = coords[5]; // batch + } else { + inputCoords[4] = coords[5]; // depth + inputCoords[3] = origInputPlane; + inputCoords[2] = origInputRow; + inputCoords[1] = origInputCol; + inputCoords[0] = coords[0]; // batch + } + if (TensorEvaluator<ArgType, Device>::CoordAccess) { + return m_impl.coeff(inputCoords); + } else { + Index inputIndex; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + inputIndex = + inputCoords[4] * m_otherInputStride + + inputCoords[3] * m_colInputStride + + inputCoords[2] * m_rowInputStride + + inputCoords[1] * m_planeInputStride + + inputCoords[0]; + } else { + inputIndex = + inputCoords[0] * m_otherInputStride + + inputCoords[1] * m_colInputStride + + inputCoords[2] * m_rowInputStride + + inputCoords[3] * m_planeInputStride + + inputCoords[4]; + } + return m_impl.coeff(inputIndex); + } + } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + + Dimensions m_dimensions; + + // Parameters passed to the costructor. + Index m_plane_strides; + Index m_row_strides; + Index m_col_strides; + + Index m_outputPlanes; + Index m_outputRows; + Index m_outputCols; + + Index m_planePaddingTop; + Index m_rowPaddingTop; + Index m_colPaddingLeft; + + Index m_in_plane_strides; + Index m_in_row_strides; + Index m_in_col_strides; + + Index m_plane_inflate_strides; + Index m_row_inflate_strides; + Index m_col_inflate_strides; + + // Cached input size. + Index m_inputDepth; + Index m_inputPlanes; + Index m_inputRows; + Index m_inputCols; + + // Other cached variables. + Index m_outputPlanesRows; + + // Effective input/patch post-inflation size. + Index m_input_planes_eff; + Index m_input_rows_eff; + Index m_input_cols_eff; + Index m_patch_planes_eff; + Index m_patch_rows_eff; + Index m_patch_cols_eff; + + // Strides for the output tensor. + Index m_otherStride; + Index m_patchStride; + Index m_rowStride; + Index m_colStride; + + // Strides for the input tensor. 
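+  // Expressed in elements of the unpatched input: plane stride = depth,
+  // row stride = depth * planes, and col stride = depth * planes * rows,
+  // as computed once in the constructor above.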
+ Index m_planeInputStride; + Index m_rowInputStride; + Index m_colInputStride; + Index m_otherInputStride; + + internal::TensorIntDivisor<Index> m_fastOtherStride; + internal::TensorIntDivisor<Index> m_fastPatchStride; + internal::TensorIntDivisor<Index> m_fastColStride; + internal::TensorIntDivisor<Index> m_fastRowStride; + internal::TensorIntDivisor<Index> m_fastInputPlaneStride; + internal::TensorIntDivisor<Index> m_fastInputRowStride; + internal::TensorIntDivisor<Index> m_fastInputColStride; + internal::TensorIntDivisor<Index> m_fastInputColsEff; + internal::TensorIntDivisor<Index> m_fastOutputPlanesRows; + internal::TensorIntDivisor<Index> m_fastOutputPlanes; + internal::TensorIntDivisor<Index> m_fastOutputDepth; + + Scalar m_paddingValue; + + TensorEvaluator<ArgType, Device> m_impl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/g3doc/README.md b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/g3doc/README.md new file mode 100644 index 0000000000..1c3fe32f9b --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/g3doc/README.md @@ -0,0 +1,1792 @@ +# Eigen Tensors + +Tensors are multidimensional arrays of elements. Elements are typically scalars, +but more complex types such as strings are also supported. + +[TOC] + +## Tensor Classes + +You can manipulate a tensor with one of the following classes. They all are in +the namespace ```::Eigen.``` + + +### Class Tensor<data_type, rank> + +This is the class to use to create a tensor and allocate memory for it. The +class is templatized with the tensor datatype, such as float or int, and the +tensor rank. The rank is the number of dimensions, for example rank 2 is a +matrix. + +Tensors of this class are resizable. For example, if you assign a tensor of a +different size to a Tensor, that tensor is resized to match its new value. + +#### Constructor Tensor<data_type, rank>(size0, size1, ...) + +Constructor for a Tensor. The constructor must be passed ```rank``` integers +indicating the sizes of the instance along each of the the ```rank``` +dimensions. + + // Create a tensor of rank 3 of sizes 2, 3, 4. This tensor owns + // memory to hold 24 floating point values (24 = 2 x 3 x 4). + Tensor<float, 3> t_3d(2, 3, 4); + + // Resize t_3d by assigning a tensor of different sizes, but same rank. + t_3d = Tensor<float, 3>(3, 4, 3); + +#### Constructor Tensor<data_type, rank>(size_array) + +Constructor where the sizes for the constructor are specified as an array of +values instead of an explicitly list of parameters. The array type to use is +```Eigen::array<Eigen::Index>```. The array can be constructed automatically +from an initializer list. + + // Create a tensor of strings of rank 2 with sizes 5, 7. + Tensor<string, 2> t_2d({5, 7}); + + +### Class TensorFixedSize<data_type, Sizes<size0, size1, ...>> + +Class to use for tensors of fixed size, where the size is known at compile +time. Fixed sized tensors can provide very fast computations because all their +dimensions are known by the compiler. FixedSize tensors are not resizable. + +If the total number of elements in a fixed size tensor is small enough the +tensor data is held onto the stack and does not cause heap allocation and free. + + // Create a 4 x 3 tensor of floats. 
+ TensorFixedSize<float, Sizes<4, 3>> t_4x3; + +### Class TensorMap<Tensor<data_type, rank>> + +This is the class to use to create a tensor on top of memory allocated and +owned by another part of your code. It allows to view any piece of allocated +memory as a Tensor. Instances of this class do not own the memory where the +data are stored. + +A TensorMap is not resizable because it does not own the memory where its data +are stored. + +#### Constructor TensorMap<Tensor<data_type, rank>>(data, size0, size1, ...) + +Constructor for a Tensor. The constructor must be passed a pointer to the +storage for the data, and "rank" size attributes. The storage has to be +large enough to hold all the data. + + // Map a tensor of ints on top of stack-allocated storage. + int storage[128]; // 2 x 4 x 2 x 8 = 128 + TensorMap<int, 4> t_4d(storage, 2, 4, 2, 8); + + // The same storage can be viewed as a different tensor. + // You can also pass the sizes as an array. + TensorMap<int, 2> t_2d(storage, 16, 8); + + // You can also map fixed-size tensors. Here we get a 1d view of + // the 2d fixed-size tensor. + TensorFixedSize<float, Sizes<4, 5>> t_4x3; + TensorMap<float, 1> t_12(t_4x3, 12); + + +#### Class TensorRef + +See Assigning to a TensorRef below. + +## Accessing Tensor Elements + +#### <data_type> tensor(index0, index1...) + +Return the element at position ```(index0, index1...)``` in tensor +```tensor```. You must pass as many parameters as the rank of ```tensor```. +The expression can be used as an l-value to set the value of the element at the +specified position. The value returned is of the datatype of the tensor. + + // Set the value of the element at position (0, 1, 0); + Tensor<float, 3> t_3d(2, 3, 4); + t_3d(0, 1, 0) = 12.0f; + + // Initialize all elements to random values. + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 4; ++k) { + t_3d(i, j, k) = ...some random value...; + } + } + } + + // Print elements of a tensor. + for (int i = 0; i < 2; ++i) { + LOG(INFO) << t_3d(i, 0, 0); + } + + +## TensorLayout + +The tensor library supports 2 layouts: ```ColMajor``` (the default) and +```RowMajor```. Only the default column major layout is currently fully +supported, and it is therefore not recommended to attempt to use the row major +layout at the moment. + +The layout of a tensor is optionally specified as part of its type. If not +specified explicitly column major is assumed. + + Tensor<float, 3, ColMajor> col_major; // equivalent to Tensor<float, 3> + TensorMap<Tensor<float, 3, RowMajor> > row_major(data, ...); + +All the arguments to an expression must use the same layout. Attempting to mix +different layouts will result in a compilation error. + +It is possible to change the layout of a tensor or an expression using the +```swap_layout()``` method. Note that this will also reverse the order of the +dimensions. 
+ + Tensor<float, 2, ColMajor> col_major(2, 4); + Tensor<float, 2, RowMajor> row_major(2, 4); + + Tensor<float, 2> col_major_result = col_major; // ok, layouts match + Tensor<float, 2> col_major_result = row_major; // will not compile + + // Simple layout swap + col_major_result = row_major.swap_layout(); + eigen_assert(col_major_result.dimension(0) == 4); + eigen_assert(col_major_result.dimension(1) == 2); + + // Swap the layout and preserve the order of the dimensions + array<int, 2> shuffle(1, 0); + col_major_result = row_major.swap_layout().shuffle(shuffle); + eigen_assert(col_major_result.dimension(0) == 2); + eigen_assert(col_major_result.dimension(1) == 4); + + +## Tensor Operations + +The Eigen Tensor library provides a vast library of operations on Tensors: +numerical operations such as addition and multiplication, geometry operations +such as slicing and shuffling, etc. These operations are available as methods +of the Tensor classes, and in some cases as operator overloads. For example +the following code computes the elementwise addition of two tensors: + + Tensor<float, 3> t1(2, 3, 4); + ...set some values in t1... + Tensor<float, 3> t2(2, 3, 4); + ...set some values in t2... + // Set t3 to the element wise sum of t1 and t2 + Tensor<float, 3> t3 = t1 + t2; + +While the code above looks easy enough, it is important to understand that the +expression ```t1 + t2``` is not actually adding the values of the tensors. The +expression instead constructs a "tensor operator" object of the class +TensorCwiseBinaryOp<scalar_sum>, which has references to the tensors +```t1``` and ```t2```. This is a small C++ object that knows how to add +```t1``` and ```t2```. It is only when the value of the expression is assigned +to the tensor ```t3``` that the addition is actually performed. Technically, +this happens through the overloading of ```operator=()``` in the Tensor class. + +This mechanism for computing tensor expressions allows for lazy evaluation and +optimizations which are what make the tensor library very fast. + +Of course, the tensor operators do nest, and the expression ```t1 + t2 * +0.3f``` is actually represented with the (approximate) tree of operators: + + TensorCwiseBinaryOp<scalar_sum>(t1, TensorCwiseUnaryOp<scalar_mul>(t2, 0.3f)) + + +### Tensor Operations and C++ "auto" + +Because Tensor operations create tensor operators, the C++ ```auto``` keyword +does not have its intuitive meaning. Consider these 2 lines of code: + + Tensor<float, 3> t3 = t1 + t2; + auto t4 = t1 + t2; + +In the first line we allocate the tensor ```t3``` and it will contain the +result of the addition of ```t1``` and ```t2```. In the second line, ```t4``` +is actually the tree of tensor operators that will compute the addition of +```t1``` and ```t2```. In fact, ```t4``` is *not* a tensor and you cannot get +the values of its elements: + + Tensor<float, 3> t3 = t1 + t2; + cout << t3(0, 0, 0); // OK prints the value of t1(0, 0, 0) + t2(0, 0, 0) + + auto t4 = t1 + t2; + cout << t4(0, 0, 0); // Compilation error! + +When you use ```auto``` you do not get a Tensor as a result but instead a +non-evaluated expression. So only use ```auto``` to delay evaluation. + +Unfortunately, there is no single underlying concrete type for holding +non-evaluated expressions, hence you have to use auto in the case when you do +want to hold non-evaluated expressions. + +When you need the results of a set of tensor computations you have to assign the +result to a Tensor that will be capable of holding them. 
This can be +either a normal Tensor, a fixed size Tensor, or a TensorMap on an existing +piece of memory. All the following will work: + + auto t4 = t1 + t2; + + Tensor<float, 3> result = t4; // Could also be: result(t4); + cout << result(0, 0, 0); + + TensorMap<float, 4> result(<a float* with enough space>, <size0>, ...) = t4; + cout << result(0, 0, 0); + + TensorFixedSize<float, Sizes<size0, ...>> result = t4; + cout << result(0, 0, 0); + +Until you need the results, you can keep the operation around, and even reuse +it for additional operations. As long as you keep the expression as an +operation, no computation is performed. + + // One way to compute exp((t1 + t2) * 0.2f); + auto t3 = t1 + t2; + auto t4 = t3 * 0.2f; + auto t5 = t4.exp(); + Tensor<float, 3> result = t5; + + // Another way, exactly as efficient as the previous one: + Tensor<float, 3> result = ((t1 + t2) * 0.2f).exp(); + +### Controlling When Expression are Evaluated + +There are several ways to control when expressions are evaluated: + +* Assignment to a Tensor, TensorFixedSize, or TensorMap. +* Use of the eval() method. +* Assignment to a TensorRef. + +#### Assigning to a Tensor, TensorFixedSize, or TensorMap. + +The most common way to evaluate an expression is to assign it to a Tensor. In +the example below, the ```auto``` declarations make the intermediate values +"Operations", not Tensors, and do not cause the expressions to be evaluated. +The assignment to the Tensor ```result``` causes the evaluation of all the +operations. + + auto t3 = t1 + t2; // t3 is an Operation. + auto t4 = t3 * 0.2f; // t4 is an Operation. + auto t5 = t4.exp(); // t5 is an Operation. + Tensor<float, 3> result = t5; // The operations are evaluated. + +If you know the ranks and sizes of the Operation value you can assign the +Operation to a TensorFixedSize instead of a Tensor, which is a bit more +efficient. + + // We know that the result is a 4x4x2 tensor! + TensorFixedSize<float, Sizes<4, 4, 2>> result = t5; + +Simiarly, assigning an expression to a TensorMap causes its evaluation. Like +tensors of type TensorFixedSize, TensorMaps cannot be resized so they have to +have the rank and sizes of the expression that are assigned to them. + +#### Calling eval(). + +When you compute large composite expressions, you sometimes want to tell Eigen +that an intermediate value in the expression tree is worth evaluating ahead of +time. This is done by inserting a call to the ```eval()``` method of the +expression Operation. + + // The previous example could have been written: + Tensor<float, 3> result = ((t1 + t2) * 0.2f).exp(); + + // If you want to compute (t1 + t2) once ahead of time you can write: + Tensor<float, 3> result = ((t1 + t2).eval() * 0.2f).exp(); + +Semantically, calling ```eval()``` is equivalent to materializing the value of +the expression in a temporary Tensor of the right size. The code above in +effect does: + + // .eval() knows the size! + TensorFixedSize<float, Sizes<4, 4, 2>> tmp = t1 + t2; + Tensor<float, 3> result = (tmp * 0.2f).exp(); + +Note that the return value of ```eval()``` is itself an Operation, so the +following code does not do what you may think: + + // Here t3 is an evaluation Operation. t3 has not been evaluated yet. + auto t3 = (t1 + t2).eval(); + + // You can use t3 in another expression. Still no evaluation. 
+ auto t4 = (t3 * 0.2f).exp(); + + // The value is evaluated when you assign the Operation to a Tensor, using + // an intermediate tensor to represent t3.x + Tensor<float, 3> result = t4; + +While in the examples above calling ```eval()``` does not make a difference in +performance, in other cases it can make a huge difference. In the expression +below the ```broadcast()``` expression causes the ```X.maximum()``` expression +to be evaluated many times: + + Tensor<...> X ...; + Tensor<...> Y = ((X - X.maximum(depth_dim).reshape(dims2d).broadcast(bcast)) + * beta).exp(); + +Inserting a call to ```eval()``` between the ```maximum()``` and +```reshape()``` calls guarantees that maximum() is only computed once and +greatly speeds-up execution: + + Tensor<...> Y = + ((X - X.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)) + * beta).exp(); + +In the other example below, the tensor ```Y``` is both used in the expression +and its assignment. This is an aliasing problem and if the evaluation is not +done in the right order Y will be updated incrementally during the evaluation +resulting in bogus results: + + Tensor<...> Y ...; + Y = Y / (Y.sum(depth_dim).reshape(dims2d).broadcast(bcast)); + +Inserting a call to ```eval()``` between the ```sum()``` and ```reshape()``` +expressions ensures that the sum is computed before any updates to ```Y``` are +done. + + Y = Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast)); + +Note that an eval around the full right hand side expression is not needed +because the generated has to compute the i-th value of the right hand side +before assigning it to the left hand side. + +However, if you were assigning the expression value to a shuffle of ```Y``` +then you would need to force an eval for correctness by adding an ```eval()``` +call for the right hand side: + + Y.shuffle(...) = + (Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast))).eval(); + + +#### Assigning to a TensorRef. + +If you need to access only a few elements from the value of an expression you +can avoid materializing the value in a full tensor by using a TensorRef. + +A TensorRef is a small wrapper class for any Eigen Operation. It provides +overloads for the ```()``` operator that let you access individual values in +the expression. TensorRef is convenient, because the Operation themselves do +not provide a way to access individual elements. + + // Create a TensorRef for the expression. The expression is not + // evaluated yet. + TensorRef<Tensor<float, 3> > ref = ((t1 + t2) * 0.2f).exp(); + + // Use "ref" to access individual elements. The expression is evaluated + // on the fly. + float at_0 = ref(0, 0, 0); + cout << ref(0, 1, 0); + +Only use TensorRef when you need a subset of the values of the expression. +TensorRef only computes the values you access. However note that if you are +going to access all the values it will be much faster to materialize the +results in a Tensor first. + +In some cases, if the full Tensor result would be very large, you may save +memory by accessing it as a TensorRef. But not always. So don't count on it. + + +### Controlling How Expressions Are Evaluated + +The tensor library provides several implementations of the various operations +such as contractions and convolutions. The implementations are optimized for +different environments: single threaded on CPU, multi threaded on CPU, or on a +GPU using cuda. Additional implementations may be added later. + +You can choose which implementation to use with the ```device()``` call. 
If +you do not choose an implementation explicitly the default implementation that +uses a single thread on the CPU is used. + +The default implementation has been optimized for recent Intel CPUs, taking +advantage of SSE, AVX, and FMA instructions. Work is ongoing to tune the +library on ARM CPUs. Note that you need to pass compiler-dependent flags +to enable the use of SSE, AVX, and other instructions. + +For example, the following code adds two tensors using the default +single-threaded CPU implementation: + + Tensor<float, 2> a(30, 40); + Tensor<float, 2> b(30, 40); + Tensor<float, 2> c = a + b; + +To choose a different implementation you have to insert a ```device()``` call +before the assignment of the result. For technical C++ reasons this requires +that the Tensor for the result be declared on its own. This means that you +have to know the size of the result. + + Eigen::Tensor<float, 2> c(30, 40); + c.device(...) = a + b; + +The call to ```device()``` must be the last call on the left of the operator=. + +You must pass to the ```device()``` call an Eigen device object. There are +presently three devices you can use: DefaultDevice, ThreadPoolDevice and +GpuDevice. + + +#### Evaluating With the DefaultDevice + +This is exactly the same as not inserting a ```device()``` call. + + DefaultDevice my_device; + c.device(my_device) = a + b; + +#### Evaluating with a Thread Pool + + #include "thread/threadpool.h" + + // Create a threadpool and start the threads. This is the Google way, + // other environments use different mechanism to create a thread pool. + ThreadPool my_pool(4 /* number of threads in the pool */); + my_pool.StartWorkers(); + + // Create the Eigen ThreadPoolDevice. + // You typically use up to all the available threads in the pool. + Eigen::ThreadPoolDevice my_device(&my_pool, 4 /* number of threads to use */); + + // Now just use the device when evaluating expressions. + Eigen::Tensor<float, 2> c(30, 50); + c.device(my_device) = a.contract(b, dot_product_dims); + + +#### Evaluating On GPU + +This is presently a bit more complicated than just using a thread pool device. +You need to create a GPU device but you also need to explicitly allocate the +memory for tensors with cuda. + + +## API Reference + +### Datatypes + +In the documentation of the tensor methods and Operation we mention datatypes +that are tensor-type specific: + +#### <Tensor-Type>::Dimensions + +Acts like an array of ints. Has an ```int size``` attribute, and can be +indexed like an array to access individual values. Used to represent the +dimensions of a tensor. See ```dimensions()```. + +#### <Tensor-Type>::Index + +Acts like an ```int```. Used for indexing tensors along their dimensions. See +```operator()```, ```dimension()```, and ```size()```. + +#### <Tensor-Type>::Scalar + +Represents the datatype of individual tensor elements. For example, for a +```Tensor<float>```, ```Scalar``` is the type ```float```. See +```setConstant()```. + +#### <Operation> + +We use this pseudo type to indicate that a tensor Operation is returned by a +method. We indicate in the text the type and dimensions of the tensor that the +Operation returns after evaluation. + +The Operation will have to be evaluated, for example by assigning it to a +tensor, before you can access the values of the resulting tensor. You can also +access the values through a TensorRef. + + +## Built-in Tensor Methods + +These are usual C++ methods that act on tensors immediately. 
They are not +Operations which provide delayed evaluation of their results. Unless specified +otherwise, all the methods listed below are available on all tensor classes: +Tensor, TensorFixedSize, and TensorMap. + +## Metadata + +### int NumDimensions + +Constant value indicating the number of dimensions of a Tensor. This is also +known as the tensor "rank". + + Eigen::Tensor<float, 2> a(3, 4); + cout << "Dims " << a.NumDimensions; + => Dims 2 + +### Dimensions dimensions() + +Returns an array-like object representing the dimensions of the tensor. +The actual type of the dimensions() result is <Tensor-Type>::Dimensions. + + Eigen::Tensor<float, 2> a(3, 4); + const Eigen::Tensor<float, 2>::Dimensions& d = a.dimensions(); + cout << "Dim size: " << d.size << ", dim 0: " << d[0] + << ", dim 1: " << d[1]; + => Dim size: 2, dim 0: 3, dim 1: 4 + +If you use a C++11 compiler, you can use ```auto``` to simplify the code: + + const auto& d = a.dimensions(); + cout << "Dim size: " << d.size << ", dim 0: " << d[0] + << ", dim 1: " << d[1]; + => Dim size: 2, dim 0: 3, dim 1: 4 + +### Index dimension(Index n) + +Returns the n-th dimension of the tensor. The actual type of the +```dimension()``` result is ```<Tensor-Type>::Index```, but you can +always use it like an int. + + Eigen::Tensor<float, 2> a(3, 4); + int dim1 = a.dimension(1); + cout << "Dim 1: " << dim1; + => Dim 1: 4 + +### Index size() + +Returns the total number of elements in the tensor. This is the product of all +the tensor dimensions. The actual type of the ```size()``` result is +```<Tensor-Type>::Index```, but you can always use it like an int. + + Eigen::Tensor<float, 2> a(3, 4); + cout << "Size: " << a.size(); + => Size: 12 + + +### Getting Dimensions From An Operation + +A few operations provide ```dimensions()``` directly, +e.g. ```TensorReslicingOp```. Most operations defer calculating dimensions +until the operation is being evaluated. If you need access to the dimensions +of a deferred operation, you can wrap it in a TensorRef (see Assigning to a +TensorRef above), which provides ```dimensions()``` and ```dimension()``` as +above. + +TensorRef can also wrap the plain Tensor types, so this is a useful idiom in +templated contexts where the underlying object could be either a raw Tensor +or some deferred operation (e.g. a slice of a Tensor). In this case, the +template code can wrap the object in a TensorRef and reason about its +dimensionality while remaining agnostic to the underlying type. + + +## Constructors + +### Tensor + +Creates a tensor of the specified size. The number of arguments must be equal +to the rank of the tensor. The content of the tensor is not initialized. + + Eigen::Tensor<float, 2> a(3, 4); + cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl; + => NumRows: 3 NumCols: 4 + +### TensorFixedSize + +Creates a tensor of the specified size. The number of arguments in the Size<> +template parameter determines the rank of the tensor. The content of the tensor +is not initialized. + + Eigen::TensorFixedSize<float, Sizes<3, 4>> a; + cout << "Rank: " << a.rank() << endl; + => Rank: 2 + cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl; + => NumRows: 3 NumCols: 4 + +### TensorMap + +Creates a tensor mapping an existing array of data. The data must not be freed +until the TensorMap is discarded, and the size of the data must be large enough +to accomodate the coefficients of the tensor. 
+ + float data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + Eigen::TensorMap<float, 2> a(data, 3, 4); + cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl; + => NumRows: 3 NumCols: 4 + cout << "a(1, 2): " << a(1, 2) << endl; + => a(1, 2): 9 + + +## Contents Initialization + +When a new Tensor or a new TensorFixedSize are created, memory is allocated to +hold all the tensor elements, but the memory is not initialized. Similarly, +when a new TensorMap is created on top of non-initialized memory, its +contents are not initialized. + +You can use one of the methods below to initialize the tensor memory. These +have an immediate effect on the tensor and return the tensor itself as a +result. These are not tensor Operations which delay evaluation. + +### <Tensor-Type> setConstant(const Scalar& val) + +Sets all elements of the tensor to the constant value ```val```. ```Scalar``` +is the type of data stored in the tensor. You can pass any value that is +convertible to that type. + +Returns the tensor itself in case you want to chain another call. + + a.setConstant(12.3f); + cout << "Constant: " << endl << a << endl << endl; + => + Constant: + 12.3 12.3 12.3 12.3 + 12.3 12.3 12.3 12.3 + 12.3 12.3 12.3 12.3 + +Note that ```setConstant()``` can be used on any tensor where the element type +has a copy constructor and an ```operator=()```: + + Eigen::Tensor<string, 2> a(2, 3); + a.setConstant("yolo"); + cout << "String tensor: " << endl << a << endl << endl; + => + String tensor: + yolo yolo yolo + yolo yolo yolo + + +### <Tensor-Type> setZero() + +Fills the tensor with zeros. Equivalent to ```setConstant(Scalar(0))```. +Returns the tensor itself in case you want to chain another call. + + a.setZero(); + cout << "Zeros: " << endl << a << endl << endl; + => + Zeros: + 0 0 0 0 + 0 0 0 0 + 0 0 0 0 + + +### <Tensor-Type> setValues({..initializer_list}) + +Fills the tensor with explicit values specified in a std::initializer_list. +The type of the initializer list depends on the type and rank of the tensor. + +If the tensor has rank N, the initializer list must be nested N times. The +most deeply nested lists must contains P scalars of the Tensor type where P is +the size of the last dimension of the Tensor. + +For example, for a ```TensorFixedSize<float, Sizes<2, 3>>``` the initializer list must +contains 2 lists of 3 floats each. + +```setValues()``` returns the tensor itself in case you want to chain another +call. + + Eigen::Tensor<float, 2> a(2, 3); + a.setValues({{0.0f, 1.0f, 2.0f}, {3.0f, 4.0f, 5.0f}}); + cout << "a" << endl << a << endl << endl; + => + a + 0 1 2 + 3 4 5 + +If a list is too short, the corresponding elements of the tensor will not be +changed. This is valid at each level of nesting. For example the following +code only sets the values of the first row of the tensor. + + Eigen::Tensor<int, 2> a(2, 3); + a.setConstant(1000); + a.setValues({{10, 20, 30}}); + cout << "a" << endl << a << endl << endl; + => + a + 10 20 30 + 1000 1000 1000 + +### <Tensor-Type> setRandom() + +Fills the tensor with random values. Returns the tensor itself in case you +want to chain another call. 
+ + a.setRandom(); + cout << "Random: " << endl << a << endl << endl; + => + Random: + 0.680375 0.59688 -0.329554 0.10794 + -0.211234 0.823295 0.536459 -0.0452059 + 0.566198 -0.604897 -0.444451 0.257742 + +You can customize ```setRandom()``` by providing your own random number +generator as a template argument: + + a.setRandom<MyRandomGenerator>(); + +Here, ```MyRandomGenerator``` must be a struct with the following member +functions, where Scalar and Index are the same as ```<Tensor-Type>::Scalar``` +and ```<Tensor-Type>::Index```. + +See ```struct UniformRandomGenerator``` in TensorFunctors.h for an example. + + // Custom number generator for use with setRandom(). + struct MyRandomGenerator { + // Default and copy constructors. Both are needed + MyRandomGenerator() { } + MyRandomGenerator(const MyRandomGenerator& ) { } + + // Return a random value to be used. "element_location" is the + // location of the entry to set in the tensor, it can typically + // be ignored. + Scalar operator()(Eigen::DenseIndex element_location, + Eigen::DenseIndex /*unused*/ = 0) const { + return <randomly generated value of type T>; + } + + // Same as above but generates several numbers at a time. + typename internal::packet_traits<Scalar>::type packetOp( + Eigen::DenseIndex packet_location, Eigen::DenseIndex /*unused*/ = 0) const { + return <a packet of randomly generated values>; + } + }; + +You can also use one of the 2 random number generators that are part of the +tensor library: +* UniformRandomGenerator +* NormalRandomGenerator + + +## Data Access + +The Tensor, TensorFixedSize, and TensorRef classes provide the following +accessors to access the tensor coefficients: + + const Scalar& operator()(const array<Index, NumIndices>& indices) + const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) + Scalar& operator()(const array<Index, NumIndices>& indices) + Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) + +The number of indices must be equal to the rank of the tensor. Moreover, these +accessors are not available on tensor expressions. In order to access the +values of a tensor expression, the expression must either be evaluated or +wrapped in a TensorRef. + + +### Scalar* data() and const Scalar* data() const + +Returns a pointer to the storage for the tensor. The pointer is const if the +tensor was const. This allows direct access to the data. The layout of the +data depends on the tensor layout: RowMajor or ColMajor. + +This access is usually only needed for special cases, for example when mixing +Eigen Tensor code with other libraries. + +Scalar is the type of data stored in the tensor. + + Eigen::Tensor<float, 2> a(3, 4); + float* a_data = a.data(); + a_data[0] = 123.45f; + cout << "a(0, 0): " << a(0, 0); + => a(0, 0): 123.45 + + +## Tensor Operations + +All the methods documented below return non evaluated tensor ```Operations```. +These can be chained: you can apply another Tensor Operation to the value +returned by the method. + +The chain of Operation is evaluated lazily, typically when it is assigned to a +tensor. See "Controlling when Expressions are Evaluated" for more details about +their evaluation. + +### <Operation> constant(const Scalar& val) + +Returns a tensor of the same type and dimensions as the original tensor but +where all elements have the value ```val```. + +This is useful, for example, when you want to add or subtract a constant from a +tensor, or multiply every element of a tensor by a scalar. 
+ + Eigen::Tensor<float, 2> a(2, 3); + a.setConstant(1.0f); + Eigen::Tensor<float, 2> b = a + a.constant(2.0f); + Eigen::Tensor<float, 2> c = b * b.constant(0.2f); + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + cout << "c" << endl << c << endl << endl; + => + a + 1 1 1 + 1 1 1 + + b + 3 3 3 + 3 3 3 + + c + 0.6 0.6 0.6 + 0.6 0.6 0.6 + +### <Operation> random() + +Returns a tensor of the same type and dimensions as the current tensor +but where all elements have random values. + +This is for example useful to add random values to an existing tensor. +The generation of random values can be customized in the same manner +as for ```setRandom()```. + + Eigen::Tensor<float, 2> a(2, 3); + a.setConstant(1.0f); + Eigen::Tensor<float, 2> b = a + a.random(); + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + => + a + 1 1 1 + 1 1 1 + + b + 1.68038 1.5662 1.82329 + 0.788766 1.59688 0.395103 + + +## Unary Element Wise Operations + +All these operations take a single input tensor as argument and return a tensor +of the same type and dimensions as the tensor to which they are applied. The +requested operations are applied to each element independently. + +### <Operation> operator-() + +Returns a tensor of the same type and dimensions as the original tensor +containing the opposite values of the original tensor. + + Eigen::Tensor<float, 2> a(2, 3); + a.setConstant(1.0f); + Eigen::Tensor<float, 2> b = -a; + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + => + a + 1 1 1 + 1 1 1 + + b + -1 -1 -1 + -1 -1 -1 + +### <Operation> sqrt() + +Returns a tensor of the same type and dimensions as the original tensor +containing the square roots of the original tensor. + +### <Operation> rsqrt() + +Returns a tensor of the same type and dimensions as the original tensor +containing the inverse square roots of the original tensor. + +### <Operation> square() + +Returns a tensor of the same type and dimensions as the original tensor +containing the squares of the original tensor values. + +### <Operation> inverse() + +Returns a tensor of the same type and dimensions as the original tensor +containing the inverse of the original tensor values. + +### <Operation> exp() + +Returns a tensor of the same type and dimensions as the original tensor +containing the exponential of the original tensor. + +### <Operation> log() + +Returns a tensor of the same type and dimensions as the original tensor +containing the natural logarithms of the original tensor. + +### <Operation> abs() + +Returns a tensor of the same type and dimensions as the original tensor +containing the absolute values of the original tensor. + +### <Operation> pow(Scalar exponent) + +Returns a tensor of the same type and dimensions as the original tensor +containing the coefficients of the original tensor to the power of the +exponent. + +The type of the exponent, Scalar, is always the same as the type of the +tensor coefficients. For example, only integer exponents can be used in +conjuntion with tensors of integer values. + +You can use cast() to lift this restriction. 
For example this computes +cubic roots of an int Tensor: + + Eigen::Tensor<int, 2> a(2, 3); + a.setValues({{0, 1, 8}, {27, 64, 125}}); + Eigen::Tensor<double, 2> b = a.cast<double>().pow(1.0 / 3.0); + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + => + a + 0 1 8 + 27 64 125 + + b + 0 1 2 + 3 4 5 + +### <Operation> operator * (Scalar scale) + +Multiplies all the coefficients of the input tensor by the provided scale. + +### <Operation> cwiseMax(Scalar threshold) +TODO + +### <Operation> cwiseMin(Scalar threshold) +TODO + +### <Operation> unaryExpr(const CustomUnaryOp& func) +TODO + + +## Binary Element Wise Operations + +These operations take two input tensors as arguments. The 2 input tensors should +be of the same type and dimensions. The result is a tensor of the same +dimensions as the tensors to which they are applied, and unless otherwise +specified it is also of the same type. The requested operations are applied to +each pair of elements independently. + +### <Operation> operator+(const OtherDerived& other) + +Returns a tensor of the same type and dimensions as the input tensors +containing the coefficient wise sums of the inputs. + +### <Operation> operator-(const OtherDerived& other) + +Returns a tensor of the same type and dimensions as the input tensors +containing the coefficient wise differences of the inputs. + +### <Operation> operator*(const OtherDerived& other) + +Returns a tensor of the same type and dimensions as the input tensors +containing the coefficient wise products of the inputs. + +### <Operation> operator/(const OtherDerived& other) + +Returns a tensor of the same type and dimensions as the input tensors +containing the coefficient wise quotients of the inputs. + +This operator is not supported for integer types. + +### <Operation> cwiseMax(const OtherDerived& other) + +Returns a tensor of the same type and dimensions as the input tensors +containing the coefficient wise maximums of the inputs. + +### <Operation> cwiseMin(const OtherDerived& other) + +Returns a tensor of the same type and dimensions as the input tensors +containing the coefficient wise mimimums of the inputs. + +### <Operation> Logical operators + +The following logical operators are supported as well: + +* operator&&(const OtherDerived& other) +* operator||(const OtherDerived& other) +* operator<(const OtherDerived& other) +* operator<=(const OtherDerived& other) +* operator>(const OtherDerived& other) +* operator>=(const OtherDerived& other) +* operator==(const OtherDerived& other) +* operator!=(const OtherDerived& other) + +They all return a tensor of boolean values. + + +## Selection (select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) + +Selection is a coefficient-wise ternary operator that is the tensor equivalent +to the if-then-else operation. + + Tensor<bool, 3> if = ...; + Tensor<float, 3> then = ...; + Tensor<float, 3> else = ...; + Tensor<float, 3> result = if.select(then, else); + +The 3 arguments must be of the same dimensions, which will also be the dimension +of the result. The 'if' tensor must be of type boolean, the 'then' and the +'else' tensor must be of the same type, which will also be the type of the +result. + +Each coefficient in the result is equal to the corresponding coefficient in the +'then' tensor if the corresponding value in the 'if' tensor is true. If not, the +resulting coefficient will come from the 'else' tensor. 
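+
+Since ```if```, ```then``` and ```else``` are C++ keywords, the snippet above
+is only illustrative. A minimal sketch that compiles, using ordinary variable
+names chosen here purely for illustration, looks like this:
+
+    Eigen::Tensor<bool, 2> choose(2, 3);
+    choose.setConstant(true);
+    Eigen::Tensor<float, 2> first(2, 3);
+    Eigen::Tensor<float, 2> second(2, 3);
+    first.setConstant(1.0f);
+    second.setConstant(-1.0f);
+    // Every coefficient of "choose" is true, so "result" is a copy of "first".
+    Eigen::Tensor<float, 2> result = choose.select(first, second);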
+
+
+## Contraction
+
+Tensor *contractions* are a generalization of the matrix product to the
+multidimensional case.
+
+    // Create 2 matrices using tensors of rank 2
+    Eigen::Tensor<int, 2> a(2, 3);
+    a.setValues({{1, 2, 3}, {6, 5, 4}});
+    Eigen::Tensor<int, 2> b(3, 2);
+    b.setValues({{1, 2}, {4, 5}, {5, 6}});
+
+    // Compute the traditional matrix product
+    array<IndexPair<int>, 1> product_dims = { IndexPair<int>(1, 0) };
+    Eigen::Tensor<int, 2> AB = a.contract(b, product_dims);
+
+    // Compute the product of the transposes of the matrices
+    array<IndexPair<int>, 1> transpose_product_dims = { IndexPair<int>(0, 1) };
+    Eigen::Tensor<int, 2> AtBt = a.contract(b, transpose_product_dims);
+
+
+## Reduction Operations
+
+A *Reduction* operation returns a tensor with fewer dimensions than the
+original tensor. The values in the returned tensor are computed by applying a
+*reduction operator* to slices of values from the original tensor. You specify
+the dimensions along which the slices are made.
+
+The Eigen Tensor library provides a set of predefined reduction operators such
+as ```maximum()``` and ```sum()``` and lets you define additional operators by
+implementing a few methods from a reductor template.
+
+### Reduction Dimensions
+
+All reduction operations take a single parameter of type
+```<TensorType>::Dimensions``` which can always be specified as an array of
+ints. These are called the "reduction dimensions." The values are the indices
+of the dimensions of the input tensor over which the reduction is done. The
+parameter can have at most as many elements as the rank of the input tensor;
+each element must be less than the tensor rank, as it indicates one of the
+dimensions to reduce.
+
+Each dimension of the input tensor should occur at most once in the reduction
+dimensions as the implementation does not remove duplicates.
+
+The order of the values in the reduction dimensions does not affect the
+results, but the code may execute faster if you list the dimensions in
+increasing order.
+
+Example: Reduction along one dimension.
+
+    // Create a tensor of 2 dimensions
+    Eigen::Tensor<int, 2> a(2, 3);
+    a.setValues({{1, 2, 3}, {6, 5, 4}});
+    // Reduce it along the second dimension (1)...
+    Eigen::array<int, 1> dims({1 /* dimension to reduce */});
+    // ...using the "maximum" operator.
+    // The result is a tensor with one dimension. The size of
+    // that dimension is the same as the first (non-reduced) dimension of a.
+    Eigen::Tensor<int, 1> b = a.maximum(dims);
+    cout << "a" << endl << a << endl << endl;
+    cout << "b" << endl << b << endl << endl;
+    =>
+    a
+    1 2 3
+    6 5 4
+
+    b
+    3
+    6
+
+Example: Reduction along two dimensions.
+
+    Eigen::Tensor<float, 3, Eigen::ColMajor> a(2, 3, 4);
+    a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f},
+                  {7.0f, 6.0f, 5.0f, 4.0f},
+                  {8.0f, 9.0f, 10.0f, 11.0f}},
+                 {{12.0f, 13.0f, 14.0f, 15.0f},
+                  {19.0f, 18.0f, 17.0f, 16.0f},
+                  {20.0f, 21.0f, 22.0f, 23.0f}}});
+    // The tensor a has 3 dimensions. We reduce along the
+    // first 2, resulting in a tensor with a single dimension
+    // of size 4 (the last dimension of a.)
+    // Note that we pass the array of reduction dimensions
+    // directly to the maximum() call.
+    Eigen::Tensor<float, 1, Eigen::ColMajor> b =
+        a.maximum(Eigen::array<int, 2>({0, 1}));
+    cout << "b" << endl << b << endl << endl;
+    =>
+    b
+    20
+    21
+    22
+    23
+
+#### Reduction along all dimensions
+
+As a special case, if you pass no parameter to a reduction operation the
+original tensor is reduced along *all* its dimensions.
The result is a +one-dimension tensor with a single value. + + Eigen::Tensor<float, 3> a(2, 3, 4); + a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f}, + {7.0f, 6.0f, 5.0f, 4.0f}, + {8.0f, 9.0f, 10.0f, 11.0f}}, + {{12.0f, 13.0f, 14.0f, 15.0f}, + {19.0f, 18.0f, 17.0f, 16.0f}, + {20.0f, 21.0f, 22.0f, 23.0f}}}); + // Reduce along all dimensions using the sum() operator. + Eigen::Tensor<float, 1> b = a.sum(); + cout << "b" << endl << b << endl << endl; + => + b + 276 + + +### <Operation> sum(const Dimensions& new_dims) +### <Operation> sum() + +Reduce a tensor using the sum() operator. The resulting values +are the sum of the reduced values. + +### <Operation> mean(const Dimensions& new_dims) +### <Operation> mean() + +Reduce a tensor using the mean() operator. The resulting values +are the mean of the reduced values. + +### <Operation> maximum(const Dimensions& new_dims) +### <Operation> maximum() + +Reduce a tensor using the maximum() operator. The resulting values are the +largest of the reduced values. + +### <Operation> minimum(const Dimensions& new_dims) +### <Operation> minimum() + +Reduce a tensor using the minimum() operator. The resulting values +are the smallest of the reduced values. + +### <Operation> prod(const Dimensions& new_dims) +### <Operation> prod() + +Reduce a tensor using the prod() operator. The resulting values +are the product of the reduced values. + +### <Operation> all(const Dimensions& new_dims) +### <Operation> all() +Reduce a tensor using the all() operator. Casts tensor to bool and then checks +whether all elements are true. Runs through all elements rather than +short-circuiting, so may be significantly inefficient. + +### <Operation> any(const Dimensions& new_dims) +### <Operation> any() +Reduce a tensor using the any() operator. Casts tensor to bool and then checks +whether any element is true. Runs through all elements rather than +short-circuiting, so may be significantly inefficient. + +### <Operation> reduce(const Dimensions& new_dims, const Reducer& reducer) + +Reduce a tensor using a user-defined reduction operator. See ```SumReducer``` +in TensorFunctors.h for information on how to implement a reduction operator. + + +## Convolutions + +### <Operation> convolve(const KernelDerived& kernel, const Dimensions& dims) + +Returns a tensor that is the output of the convolution of the of the input tensor with the kernel, +along the specified dimensions of the input tensor. The dimension size for dimensions of the output tensor +which were part of the convolution will be reduced by the formula: +output_dim_size = input_dim_size - kernel_dim_size + 1 (requires: input_dim_size >= kernel_dim_size). +The dimension sizes for dimensions that were not part of the convolution will remain the same. +Performance of the convolution can depend on the length of the stride(s) of the input tensor dimension(s) along which the +convolution is computed (the first dimension has the shortest stride for ColMajor, whereas RowMajor's shortest stride is +for the last dimension). + + // Compute convolution along the second and third dimension. + Tensor<float, 4, DataLayout> input(3, 3, 7, 11); + Tensor<float, 2, DataLayout> kernel(2, 2); + Tensor<float, 4, DataLayout> output(3, 2, 6, 11); + input.setRandom(); + kernel.setRandom(); + + Eigen::array<ptrdiff_t, 2> dims({1, 2}); // Specify second and third dimension for convolution. 
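+    // Along each convolved dimension the output size is
+    // input_dim_size - kernel_dim_size + 1: here 3 - 2 + 1 = 2 and
+    // 7 - 2 + 1 = 6, matching the declared dimensions of "output".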
+ output = input.convolve(kernel, dims); + + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 6; ++k) { + for (int l = 0; l < 11; ++l) { + const float result = output(i,j,k,l); + const float expected = input(i,j+0,k+0,l) * kernel(0,0) + + input(i,j+1,k+0,l) * kernel(1,0) + + input(i,j+0,k+1,l) * kernel(0,1) + + input(i,j+1,k+1,l) * kernel(1,1); + VERIFY_IS_APPROX(result, expected); + } + } + } + } + + + +## Geometrical Operations + +These operations return a Tensor with different dimensions than the original +Tensor. They can be used to access slices of tensors, see them with different +dimensions, or pad tensors with additional data. + +### <Operation> reshape(const Dimensions& new_dims) + +Returns a view of the input tensor that has been reshaped to the specified +new dimensions. The argument new_dims is an array of Index values. The +rank of the resulting tensor is equal to the number of elements in new_dims. + +The product of all the sizes in the new dimension array must be equal to +the number of elements in the input tensor. + + // Increase the rank of the input tensor by introducing a new dimension + // of size 1. + Tensor<float, 2> input(7, 11); + array<int, 3> three_dims{{7, 11, 1}}; + Tensor<float, 3> result = input.reshape(three_dims); + + // Decrease the rank of the input tensor by merging 2 dimensions; + array<int, 1> one_dim{{7 * 11}}; + Tensor<float, 1> result = input.reshape(one_dim); + +This operation does not move any data in the input tensor, so the resulting +contents of a reshaped Tensor depend on the data layout of the original Tensor. + +For example this is what happens when you ```reshape()``` a 2D ColMajor tensor +to one dimension: + + Eigen::Tensor<float, 2, Eigen::ColMajor> a(2, 3); + a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}}); + Eigen::array<Eigen::DenseIndex, 1> one_dim({3 * 2}); + Eigen::Tensor<float, 1, Eigen::ColMajor> b = a.reshape(one_dim); + cout << "b" << endl << b << endl; + => + b + 0 + 300 + 100 + 400 + 200 + 500 + +This is what happens when the 2D Tensor is RowMajor: + + Eigen::Tensor<float, 2, Eigen::RowMajor> a(2, 3); + a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}}); + Eigen::array<Eigen::DenseIndex, 1> one_dim({3 * 2}); + Eigen::Tensor<float, 1, Eigen::RowMajor> b = a.reshape(one_dim); + cout << "b" << endl << b << endl; + => + b + 0 + 100 + 200 + 300 + 400 + 500 + +The reshape operation is a lvalue. In other words, it can be used on the left +side of the assignment operator. + +The previous example can be rewritten as follow: + + Eigen::Tensor<float, 2, Eigen::ColMajor> a(2, 3); + a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}}); + Eigen::array<Eigen::DenseIndex, 2> two_dim({2, 3}); + Eigen::Tensor<float, 1, Eigen::ColMajor> b; + b.reshape(two_dim) = a; + cout << "b" << endl << b << endl; + => + b + 0 + 300 + 100 + 400 + 200 + 500 + +Note that "b" itself was not reshaped but that instead the assignment is done to +the reshape view of b. + + +### <Operation> shuffle(const Shuffle& shuffle) + +Returns a copy of the input tensor whose dimensions have been +reordered according to the specified permutation. The argument shuffle +is an array of Index values. Its size is the rank of the input +tensor. It must contain a permutation of 0, 1, ..., rank - 1. The i-th +dimension of the output tensor equals to the size of the shuffle[i]-th +dimension of the input tensor. For example: + + // Shuffle all dimensions to the left by 1. 
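+    // Output dimension i takes the size of input dimension shuffle[i]:
+    // (30, 50, 20) for the permutation {1, 2, 0}.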
+ Tensor<float, 3> input(20, 30, 50); + // ... set some values in input. + Tensor<float, 3> output = input.shuffle({1, 2, 0}) + + eigen_assert(output.dimension(0) == 30); + eigen_assert(output.dimension(1) == 50); + eigen_assert(output.dimension(2) == 20); + +Indices into the output tensor are shuffled accordingly to formulate +indices into the input tensor. For example, one can assert in the above +code snippet that: + + eigen_assert(output(3, 7, 11) == input(11, 3, 7)); + +In general, one can assert that + + eigen_assert(output(..., indices[shuffle[i]], ...) == + input(..., indices[i], ...)) + +The shuffle operation results in a lvalue, which means that it can be assigned +to. In other words, it can be used on the left side of the assignment operator. + +Let's rewrite the previous example to take advantage of this feature: + + // Shuffle all dimensions to the left by 1. + Tensor<float, 3> input(20, 30, 50); + // ... set some values in input. + Tensor<float, 3> output(30, 50, 20); + output.shuffle({2, 0, 1}) = input; + + +### <Operation> stride(const Strides& strides) + +Returns a view of the input tensor that strides (skips stride-1 +elements) along each of the dimensions. The argument strides is an +array of Index values. The dimensions of the resulting tensor are +ceil(input_dimensions[i] / strides[i]). + +For example this is what happens when you ```stride()``` a 2D tensor: + + Eigen::Tensor<int, 2> a(4, 3); + a.setValues({{0, 100, 200}, {300, 400, 500}, {600, 700, 800}, {900, 1000, 1100}}); + Eigen::array<Eigen::DenseIndex, 2> strides({3, 2}); + Eigen::Tensor<int, 2> b = a.stride(strides); + cout << "b" << endl << b << endl; + => + b + 0 200 + 900 1100 + +It is possible to assign a tensor to a stride: + Tensor<float, 3> input(20, 30, 50); + // ... set some values in input. + Tensor<float, 3> output(40, 90, 200); + output.stride({2, 3, 4}) = input; + +### <Operation> inflate(const Strides& strides) + +Returns a view of an "inflated" tensor of the input tensor by inserting zeros +between the original elements in the input tensor. The argument strides is an +array of Index values, indicating how much "inflation" there is. The dimensions + of the resulting tensor are (input_dimensions[i] - 1) * strides[i] + 1. In +some sense it is the inverse of the ```stride()``` operation. + +For example this is what happens when you ```inflate()``` a 2D tensor: + + Eigen::Tensor<int, 2> a(2, 3); + a.setValues({{0, 100, 200}, {300, 400, 500}}); + Eigen::array<Eigen::DenseIndex, 2> strides({3, 2}); + Eigen::Tensor<int, 2> b = a.inflate(strides); + cout << "b" << endl << b << endl; + => + b + 0 0 0 100 0 0 200 + 0 0 0 0 0 0 0 + 300 0 0 400 0 0 500 + +The ```inflate()``` operation is an r-value only operation as it doesn't make +sense to assign a value to an inflated tensor in positions where the values are +hardwired to zero. + +### <Operation> slice(const StartIndices& offsets, const Sizes& extents) + +Returns a sub-tensor of the given tensor. For each dimension i, the slice is +made of the coefficients stored between offset[i] and offset[i] + extents[i] in +the input tensor. 
+
+    Eigen::Tensor<int, 2> a(4, 3);
+    a.setValues({{0, 100, 200}, {300, 400, 500},
+                 {600, 700, 800}, {900, 1000, 1100}});
+    Eigen::array<int, 2> offsets = {1, 0};
+    Eigen::array<int, 2> extents = {2, 2};
+    Eigen::Tensor<int, 2> slice = a.slice(offsets, extents);
+    cout << "a" << endl << a << endl;
+    =>
+    a
+    0 100 200
+    300 400 500
+    600 700 800
+    900 1000 1100
+    cout << "slice" << endl << slice << endl;
+    =>
+    slice
+    300 400
+    600 700
+
+
+### <Operation> chip(const Index offset, const Index dim)
+
+A chip is a special kind of slice. It is the subtensor at the given offset in
+the dimension dim. The returned tensor has one fewer dimension than the input
+tensor: the dimension dim is removed.
+
+For example, a matrix chip would be either a row or a column of the input
+matrix.
+
+    Eigen::Tensor<int, 2> a(4, 3);
+    a.setValues({{0, 100, 200}, {300, 400, 500},
+                 {600, 700, 800}, {900, 1000, 1100}});
+    Eigen::Tensor<int, 1> row_3 = a.chip(2, 0);
+    Eigen::Tensor<int, 1> col_2 = a.chip(1, 1);
+    cout << "a" << endl << a << endl;
+    =>
+    a
+    0 100 200
+    300 400 500
+    600 700 800
+    900 1000 1100
+    cout << "row_3" << endl << row_3 << endl;
+    =>
+    row_3
+    600 700 800
+    cout << "col_2" << endl << col_2 << endl;
+    =>
+    col_2
+    100 400 700 1000
+
+It is possible to assign values to a tensor chip since the chip operation is an
+lvalue. For example:
+
+    Eigen::Tensor<int, 1> a(3);
+    a.setValues({{100, 200, 300}});
+    Eigen::Tensor<int, 2> b(2, 3);
+    b.setZero();
+    b.chip(0, 0) = a;
+    cout << "a" << endl << a << endl;
+    =>
+    a
+    100
+    200
+    300
+    cout << "b" << endl << b << endl;
+    =>
+    b
+    100 200 300
+    0 0 0
+
+
+### <Operation> reverse(const ReverseDimensions& reverse)
+
+Returns a view of the input tensor that reverses the order of the coefficients
+along a subset of the dimensions. The argument reverse is an array of boolean
+values that indicates whether or not the order of the coefficients should be
+reversed along each of the dimensions. This operation preserves the dimensions
+of the input tensor.
+
+For example this is what happens when you ```reverse()``` the first dimension
+of a 2D tensor:
+
+    Eigen::Tensor<int, 2> a(4, 3);
+    a.setValues({{0, 100, 200}, {300, 400, 500},
+                 {600, 700, 800}, {900, 1000, 1100}});
+    Eigen::array<bool, 2> reverse({true, false});
+    Eigen::Tensor<int, 2> b = a.reverse(reverse);
+    cout << "a" << endl << a << endl << "b" << endl << b << endl;
+    =>
+    a
+    0 100 200
+    300 400 500
+    600 700 800
+    900 1000 1100
+    b
+    900 1000 1100
+    600 700 800
+    300 400 500
+    0 100 200
+
+
+### <Operation> broadcast(const Broadcast& broadcast)
+
+Returns a view of the input tensor in which the input is replicated one or
+more times along each of its dimensions.
+The broadcast argument specifies how many copies of the input tensor need to be
+made in each of the dimensions.
+
+    Eigen::Tensor<int, 2> a(2, 3);
+    a.setValues({{0, 100, 200}, {300, 400, 500}});
+    Eigen::array<int, 2> bcast({3, 2});
+    Eigen::Tensor<int, 2> b = a.broadcast(bcast);
+    cout << "a" << endl << a << endl << "b" << endl << b << endl;
+    =>
+    a
+    0 100 200
+    300 400 500
+    b
+    0 100 200 0 100 200
+    300 400 500 300 400 500
+    0 100 200 0 100 200
+    300 400 500 300 400 500
+    0 100 200 0 100 200
+    300 400 500 300 400 500
+
+### <Operation> concatenate(const OtherDerived& other, Axis axis)
+
+TODO
+
+### <Operation> pad(const PaddingDimensions& padding)
+
+Returns a view of the input tensor in which the input is padded with zeros.
+ + Eigen::Tensor<int, 2> a(2, 3); + a.setValues({{0, 100, 200}, {300, 400, 500}}); + Eigen::array<std::pair<int, int>, 2> paddings; + paddings[0] = make_pair(0, 1); + paddings[1] = make_pair(2, 3); + Eigen::Tensor<int, 2> b = a.pad(paddings); + cout << "a" << endl << a << endl << "b" << endl << b << endl; + => + a + 0 100 200 + 300 400 500 + b + 0 0 0 0 + 0 0 0 0 + 0 100 200 0 + 300 400 500 0 + 0 0 0 0 + 0 0 0 0 + 0 0 0 0 + + +### <Operation> extract_patches(const PatchDims& patch_dims) + +Returns a tensor of coefficient patches extracted from the input tensor, where +each patch is of dimension specified by 'patch_dims'. The returned tensor has +one greater dimension than the input tensor, which is used to index each patch. +The patch index in the output tensor depends on the data layout of the input +tensor: the patch index is the last dimension ColMajor layout, and the first +dimension in RowMajor layout. + +For example, given the following input tensor: + + Eigen::Tensor<float, 2, DataLayout> tensor(3,4); + tensor.setValues({{0.0f, 1.0f, 2.0f, 3.0f}, + {4.0f, 5.0f, 6.0f, 7.0f}, + {8.0f, 9.0f, 10.0f, 11.0f}}); + + cout << "tensor: " << endl << tensor << endl; + => + tensor: + 0 1 2 3 + 4 5 6 7 + 8 9 10 11 + +Six 2x2 patches can be extracted and indexed using the following code: + + Eigen::Tensor<float, 3, DataLayout> patch; + Eigen::array<ptrdiff_t, 2> patch_dims; + patch_dims[0] = 2; + patch_dims[1] = 2; + patch = tensor.extract_patches(patch_dims); + for (int k = 0; k < 6; ++k) { + cout << "patch index: " << k << endl; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + if (DataLayout == ColMajor) { + cout << patch(i, j, k) << " "; + } else { + cout << patch(k, i, j) << " "; + } + } + cout << endl; + } + } + +This code results in the following output when the data layout is ColMajor: + + patch index: 0 + 0 1 + 4 5 + patch index: 1 + 4 5 + 8 9 + patch index: 2 + 1 2 + 5 6 + patch index: 3 + 5 6 + 9 10 + patch index: 4 + 2 3 + 6 7 + patch index: 5 + 6 7 + 10 11 + +This code results in the following output when the data layout is RowMajor: +(NOTE: the set of patches is the same as in ColMajor, but are indexed differently). + + patch index: 0 + 0 1 + 4 5 + patch index: 1 + 1 2 + 5 6 + patch index: 2 + 2 3 + 6 7 + patch index: 3 + 4 5 + 8 9 + patch index: 4 + 5 6 + 9 10 + patch index: 5 + 6 7 + 10 11 + +### <Operation> extract_image_patches(const Index patch_rows, const Index patch_cols, + const Index row_stride, const Index col_stride, + const Index in_row_stride, const Index in_col_stride, + const Index row_inflate_stride, const Index col_inflate_stride, + const PaddingType padding_type, const Scalar padding_value) + +Returns a tensor of coefficient image patches extracted from the input tensor, +which is expected to have dimensions ordered as follows (depending on the data +layout of the input tensor, and the number of additional dimensions 'N'): + +* ColMajor + * 1st dimension: channels (of size d) + * 2nd dimension: rows (of size r) + * 3rd dimension: columns (of size c) + * 4th-Nth dimension: time (for video) or batch (for bulk processing). + +* RowMajor (reverse order of ColMajor) + * 1st-Nth dimension: time (for video) or batch (for bulk processing). + * N+1'th dimension: columns (of size c) + * N+2'th dimension: rows (of size r) + * N+3'th dimension: channels (of size d) + +The returned tensor has one greater dimension than the input tensor, which is +used to index each patch. 
The patch index in the output tensor depends on the +data layout of the input tensor: the patch index is the 4'th dimension in +ColMajor layout, and the 4'th from the last dimension in RowMajor layout. + +For example, given the following input tensor with the following dimension +sizes: + +* depth: 2 +* rows: 3 +* columns: 5 +* batch: 7 + + Tensor<float, 4> tensor(2,3,5,7); + Tensor<float, 4, RowMajor> tensor_row_major = tensor.swap_layout(); + +2x2 image patches can be extracted and indexed using the following code: + +* 2D patch: ColMajor (patch indexed by second-to-last dimension) + + Tensor<float, 5> twod_patch; + twod_patch = tensor.extract_image_patches<2, 2>(); + // twod_patch.dimension(0) == 2 + // twod_patch.dimension(1) == 2 + // twod_patch.dimension(2) == 2 + // twod_patch.dimension(3) == 3*5 + // twod_patch.dimension(4) == 7 + +* 2D patch: RowMajor (patch indexed by the second dimension) + + Tensor<float, 5, RowMajor> twod_patch_row_major; + twod_patch_row_major = tensor_row_major.extract_image_patches<2, 2>(); + // twod_patch_row_major.dimension(0) == 7 + // twod_patch_row_major.dimension(1) == 3*5 + // twod_patch_row_major.dimension(2) == 2 + // twod_patch_row_major.dimension(3) == 2 + // twod_patch_row_major.dimension(4) == 2 + +Input parameters: + +* patch_rows, patch_cols: Spatial extent of the extracted patches. +* row_stride, col_stride: Image Displacement (in pixels) between the + upper-left coordinates of consecutive patches. +* in_row_stride, in_col_stride: Image displacement (in pixels) between + two consecutive patch samples. If larger than 1 (default), they allow + for sparsely sampling the input image. +* row_inflate_stride, col_inflate_stride: If larger than 1 (default), "inflates" + the inputs by inserting zeros between the original elements. This is useful + for backward convolution. +* padding_type: Boundary conditions. Either PADDING_SAME (default) + or PADDING_VALID. +* padding_value: the value used in padding, defaults to 0. + +## Special Operations + +### <Operation> cast<T>() + +Returns a tensor of type T with the same dimensions as the original tensor. +The returned tensor contains the values of the original tensor converted to +type T. + + Eigen::Tensor<float, 2> a(2, 3); + Eigen::Tensor<int, 2> b = a.cast<int>(); + +This can be useful for example if you need to do element-wise division of +Tensors of integers. This is not currently supported by the Tensor library +but you can easily cast the tensors to floats to do the division: + + Eigen::Tensor<int, 2> a(2, 3); + a.setValues({{0, 1, 2}, {3, 4, 5}}); + Eigen::Tensor<int, 2> b = + (a.cast<float>() / a.constant(2).cast<float>()).cast<int>(); + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + => + a + 0 1 2 + 3 4 5 + + b + 0 0 1 + 1 2 2 + + +### <Operation> eval() + +TODO + + +## Representation of scalar values + +Scalar values are often represented by tensors of size 1 and rank 1. It would be +more logical and user friendly to use tensors of rank 0 instead. For example +Tensor<T, N>::maximum() currently returns a Tensor<T, 1>. Similarly, the inner +product of 2 1d tensors (through contractions) returns a 1d tensor. In the +future these operations might be updated to return 0d tensors instead. + +## GPU Support + +NVidia GPU support can be enabled using: + + #define EIGEN_USE_GPU + +To speedup operations on GPU, it is also recommended to use 32 bit indices. 
This +prevents Eigen from using 64 bit loop indices, which have to be emulated in +software and make any operation extremely slow. + +This can be achieved globally by using the EIGEN_DEFAULT_DENSE_INDEX_TYPE define +as follow: + + #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int + +This can also be done individually for each tensor by using the Index32Bit +option as follow: + + Eigen::Tensor<DataType, Rank, Eigen::Index32Bit> t; + Eigen::TensorMap<Eigen::Tensor<DataType, Rank, Eigen::Index32Bit> > t_map; + + +## Limitations + +* The number of tensor dimensions is currently limited to 250 when using a + compiler that supports cxx11. It is limited to only 5 for older compilers. +* The IndexList class requires a cxx11 compliant compiler. You can use an + array of indices instead if you don't have access to a modern compiler. +* TensorVarDims are only partially supported +* On GPUs only floating point values are properly tested and optimized for. +* Complex and integer values are known to be broken on GPUs. If you try to use + them you'll most likely end up triggering a static assertion failure such as + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h new file mode 100644 index 0000000000..13cb2157f2 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h @@ -0,0 +1,293 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler <christian@iwakd.de> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H +#define EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H + +namespace Eigen { + +class DynamicSGroup +{ + public: + inline explicit DynamicSGroup() : m_numIndices(1), m_elements(), m_generators(), m_globalFlags(0) { m_elements.push_back(ge(Generator(0, 0, 0))); } + inline DynamicSGroup(const DynamicSGroup& o) : m_numIndices(o.m_numIndices), m_elements(o.m_elements), m_generators(o.m_generators), m_globalFlags(o.m_globalFlags) { } + inline DynamicSGroup(DynamicSGroup&& o) : m_numIndices(o.m_numIndices), m_elements(), m_generators(o.m_generators), m_globalFlags(o.m_globalFlags) { std::swap(m_elements, o.m_elements); } + inline DynamicSGroup& operator=(const DynamicSGroup& o) { m_numIndices = o.m_numIndices; m_elements = o.m_elements; m_generators = o.m_generators; m_globalFlags = o.m_globalFlags; return *this; } + inline DynamicSGroup& operator=(DynamicSGroup&& o) { m_numIndices = o.m_numIndices; std::swap(m_elements, o.m_elements); m_generators = o.m_generators; m_globalFlags = o.m_globalFlags; return *this; } + + void add(int one, int two, int flags = 0); + + template<typename Gen_> + inline void add(Gen_) { add(Gen_::One, Gen_::Two, Gen_::Flags); } + inline void addSymmetry(int one, int two) { add(one, two, 0); } + inline void addAntiSymmetry(int one, int two) { add(one, two, NegationFlag); } + inline void addHermiticity(int one, int two) { add(one, two, ConjugationFlag); } + inline void addAntiHermiticity(int one, int two) { add(one, two, NegationFlag | ConjugationFlag); } + + template<typename Op, typename RV, typename Index, std::size_t N, typename... 
Args> + inline RV apply(const std::array<Index, N>& idx, RV initial, Args&&... args) const + { + eigen_assert(N >= m_numIndices && "Can only apply symmetry group to objects that have at least the required number of indices."); + for (std::size_t i = 0; i < size(); i++) + initial = Op::run(h_permute(i, idx, typename internal::gen_numeric_list<int, N>::type()), m_elements[i].flags, initial, std::forward<Args>(args)...); + return initial; + } + + template<typename Op, typename RV, typename Index, typename... Args> + inline RV apply(const std::vector<Index>& idx, RV initial, Args&&... args) const + { + eigen_assert(idx.size() >= m_numIndices && "Can only apply symmetry group to objects that have at least the required number of indices."); + for (std::size_t i = 0; i < size(); i++) + initial = Op::run(h_permute(i, idx), m_elements[i].flags, initial, std::forward<Args>(args)...); + return initial; + } + + inline int globalFlags() const { return m_globalFlags; } + inline std::size_t size() const { return m_elements.size(); } + + template<typename Tensor_, typename... IndexTypes> + inline internal::tensor_symmetry_value_setter<Tensor_, DynamicSGroup> operator()(Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... otherIndices) const + { + static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + return operator()(tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices>{{firstIndex, otherIndices...}}); + } + + template<typename Tensor_> + inline internal::tensor_symmetry_value_setter<Tensor_, DynamicSGroup> operator()(Tensor_& tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices> const& indices) const + { + return internal::tensor_symmetry_value_setter<Tensor_, DynamicSGroup>(tensor, *this, indices); + } + private: + struct GroupElement { + std::vector<int> representation; + int flags; + bool isId() const + { + for (std::size_t i = 0; i < representation.size(); i++) + if (i != (size_t)representation[i]) + return false; + return true; + } + }; + struct Generator { + int one; + int two; + int flags; + constexpr inline Generator(int one_, int two_, int flags_) : one(one_), two(two_), flags(flags_) {} + }; + + std::size_t m_numIndices; + std::vector<GroupElement> m_elements; + std::vector<Generator> m_generators; + int m_globalFlags; + + template<typename Index, std::size_t N, int... n> + inline std::array<Index, N> h_permute(std::size_t which, const std::array<Index, N>& idx, internal::numeric_list<int, n...>) const + { + return std::array<Index, N>{{ idx[n >= m_numIndices ? n : m_elements[which].representation[n]]... 
}}; + } + + template<typename Index> + inline std::vector<Index> h_permute(std::size_t which, std::vector<Index> idx) const + { + std::vector<Index> result; + result.reserve(idx.size()); + for (auto k : m_elements[which].representation) + result.push_back(idx[k]); + for (std::size_t i = m_numIndices; i < idx.size(); i++) + result.push_back(idx[i]); + return result; + } + + inline GroupElement ge(Generator const& g) const + { + GroupElement result; + result.representation.reserve(m_numIndices); + result.flags = g.flags; + for (std::size_t k = 0; k < m_numIndices; k++) { + if (k == (std::size_t)g.one) + result.representation.push_back(g.two); + else if (k == (std::size_t)g.two) + result.representation.push_back(g.one); + else + result.representation.push_back(int(k)); + } + return result; + } + + GroupElement mul(GroupElement, GroupElement) const; + inline GroupElement mul(Generator g1, GroupElement g2) const + { + return mul(ge(g1), g2); + } + + inline GroupElement mul(GroupElement g1, Generator g2) const + { + return mul(g1, ge(g2)); + } + + inline GroupElement mul(Generator g1, Generator g2) const + { + return mul(ge(g1), ge(g2)); + } + + inline int findElement(GroupElement e) const + { + for (auto ee : m_elements) { + if (ee.representation == e.representation) + return ee.flags ^ e.flags; + } + return -1; + } + + void updateGlobalFlags(int flagDiffOfSameGenerator); +}; + +// dynamic symmetry group that auto-adds the template parameters in the constructor +template<typename... Gen> +class DynamicSGroupFromTemplateArgs : public DynamicSGroup +{ + public: + inline DynamicSGroupFromTemplateArgs() : DynamicSGroup() + { + add_all(internal::type_list<Gen...>()); + } + inline DynamicSGroupFromTemplateArgs(DynamicSGroupFromTemplateArgs const& other) : DynamicSGroup(other) { } + inline DynamicSGroupFromTemplateArgs(DynamicSGroupFromTemplateArgs&& other) : DynamicSGroup(other) { } + inline DynamicSGroupFromTemplateArgs<Gen...>& operator=(const DynamicSGroupFromTemplateArgs<Gen...>& o) { DynamicSGroup::operator=(o); return *this; } + inline DynamicSGroupFromTemplateArgs<Gen...>& operator=(DynamicSGroupFromTemplateArgs<Gen...>&& o) { DynamicSGroup::operator=(o); return *this; } + + private: + template<typename Gen1, typename... GenNext> + inline void add_all(internal::type_list<Gen1, GenNext...>) + { + add(Gen1()); + add_all(internal::type_list<GenNext...>()); + } + + inline void add_all(internal::type_list<>) + { + } +}; + +inline DynamicSGroup::GroupElement DynamicSGroup::mul(GroupElement g1, GroupElement g2) const +{ + eigen_internal_assert(g1.representation.size() == m_numIndices); + eigen_internal_assert(g2.representation.size() == m_numIndices); + + GroupElement result; + result.representation.reserve(m_numIndices); + for (std::size_t i = 0; i < m_numIndices; i++) { + int v = g2.representation[g1.representation[i]]; + eigen_assert(v >= 0); + result.representation.push_back(v); + } + result.flags = g1.flags ^ g2.flags; + return result; +} + +inline void DynamicSGroup::add(int one, int two, int flags) +{ + eigen_assert(one >= 0); + eigen_assert(two >= 0); + eigen_assert(one != two); + + if ((std::size_t)one >= m_numIndices || (std::size_t)two >= m_numIndices) { + std::size_t newNumIndices = (one > two) ? 
one : two + 1; + for (auto& gelem : m_elements) { + gelem.representation.reserve(newNumIndices); + for (std::size_t i = m_numIndices; i < newNumIndices; i++) + gelem.representation.push_back(i); + } + m_numIndices = newNumIndices; + } + + Generator g{one, two, flags}; + GroupElement e = ge(g); + + /* special case for first generator */ + if (m_elements.size() == 1) { + while (!e.isId()) { + m_elements.push_back(e); + e = mul(e, g); + } + + if (e.flags > 0) + updateGlobalFlags(e.flags); + + // only add in case we didn't have identity + if (m_elements.size() > 1) + m_generators.push_back(g); + return; + } + + int p = findElement(e); + if (p >= 0) { + updateGlobalFlags(p); + return; + } + + std::size_t coset_order = m_elements.size(); + m_elements.push_back(e); + for (std::size_t i = 1; i < coset_order; i++) + m_elements.push_back(mul(m_elements[i], e)); + m_generators.push_back(g); + + std::size_t coset_rep = coset_order; + do { + for (auto g : m_generators) { + e = mul(m_elements[coset_rep], g); + p = findElement(e); + if (p < 0) { + // element not yet in group + m_elements.push_back(e); + for (std::size_t i = 1; i < coset_order; i++) + m_elements.push_back(mul(m_elements[i], e)); + } else if (p > 0) { + updateGlobalFlags(p); + } + } + coset_rep += coset_order; + } while (coset_rep < m_elements.size()); +} + +inline void DynamicSGroup::updateGlobalFlags(int flagDiffOfSameGenerator) +{ + switch (flagDiffOfSameGenerator) { + case 0: + default: + // nothing happened + break; + case NegationFlag: + // every element is it's own negative => whole tensor is zero + m_globalFlags |= GlobalZeroFlag; + break; + case ConjugationFlag: + // every element is it's own conjugate => whole tensor is real + m_globalFlags |= GlobalRealFlag; + break; + case (NegationFlag | ConjugationFlag): + // every element is it's own negative conjugate => whole tensor is imaginary + m_globalFlags |= GlobalImagFlag; + break; + /* NOTE: + * since GlobalZeroFlag == GlobalRealFlag | GlobalImagFlag, if one generator + * causes the tensor to be real and the next one to be imaginary, this will + * trivially give the correct result + */ + } +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H + +/* + * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; + */ diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h new file mode 100644 index 0000000000..942293bd71 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h @@ -0,0 +1,236 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler <christian@iwakd.de> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H +#define EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H + +namespace Eigen { + +namespace internal { + +template<typename list> struct tensor_static_symgroup_permutate; + +template<int... 
nn> +struct tensor_static_symgroup_permutate<numeric_list<int, nn...>> +{ + constexpr static std::size_t N = sizeof...(nn); + + template<typename T> + constexpr static inline std::array<T, N> run(const std::array<T, N>& indices) + { + return {{indices[nn]...}}; + } +}; + +template<typename indices_, int flags_> +struct tensor_static_symgroup_element +{ + typedef indices_ indices; + constexpr static int flags = flags_; +}; + +template<typename Gen, int N> +struct tensor_static_symgroup_element_ctor +{ + typedef tensor_static_symgroup_element< + typename gen_numeric_list_swapped_pair<int, N, Gen::One, Gen::Two>::type, + Gen::Flags + > type; +}; + +template<int N> +struct tensor_static_symgroup_identity_ctor +{ + typedef tensor_static_symgroup_element< + typename gen_numeric_list<int, N>::type, + 0 + > type; +}; + +template<typename iib> +struct tensor_static_symgroup_multiply_helper +{ + template<int... iia> + constexpr static inline numeric_list<int, get<iia, iib>::value...> helper(numeric_list<int, iia...>) { + return numeric_list<int, get<iia, iib>::value...>(); + } +}; + +template<typename A, typename B> +struct tensor_static_symgroup_multiply +{ + private: + typedef typename A::indices iia; + typedef typename B::indices iib; + constexpr static int ffa = A::flags; + constexpr static int ffb = B::flags; + + public: + static_assert(iia::count == iib::count, "Cannot multiply symmetry elements with different number of indices."); + + typedef tensor_static_symgroup_element< + decltype(tensor_static_symgroup_multiply_helper<iib>::helper(iia())), + ffa ^ ffb + > type; +}; + +template<typename A, typename B> +struct tensor_static_symgroup_equality +{ + typedef typename A::indices iia; + typedef typename B::indices iib; + constexpr static int ffa = A::flags; + constexpr static int ffb = B::flags; + static_assert(iia::count == iib::count, "Cannot compare symmetry elements with different number of indices."); + + constexpr static bool value = is_same<iia, iib>::value; + + private: + /* this should be zero if they are identical, or else the tensor + * will be forced to be pure real, pure imaginary or even pure zero + */ + constexpr static int flags_cmp_ = ffa ^ ffb; + + /* either they are not equal, then we don't care whether the flags + * match, or they are equal, and then we have to check + */ + constexpr static bool is_zero = value && flags_cmp_ == NegationFlag; + constexpr static bool is_real = value && flags_cmp_ == ConjugationFlag; + constexpr static bool is_imag = value && flags_cmp_ == (NegationFlag | ConjugationFlag); + + public: + constexpr static int global_flags = + (is_real ? GlobalRealFlag : 0) | + (is_imag ? GlobalImagFlag : 0) | + (is_zero ? GlobalZeroFlag : 0); +}; + +template<std::size_t NumIndices, typename... Gen> +struct tensor_static_symgroup +{ + typedef StaticSGroup<Gen...> type; + constexpr static std::size_t size = type::static_size; +}; + +template<typename Index, std::size_t N, int... ii, int... jj> +constexpr static inline std::array<Index, N> tensor_static_symgroup_index_permute(std::array<Index, N> idx, internal::numeric_list<int, ii...>, internal::numeric_list<int, jj...>) +{ + return {{ idx[ii]..., idx[jj]... }}; +} + +template<typename Index, int... ii> +static inline std::vector<Index> tensor_static_symgroup_index_permute(std::vector<Index> idx, internal::numeric_list<int, ii...>) +{ + std::vector<Index> result{{ idx[ii]... 
}}; + std::size_t target_size = idx.size(); + for (std::size_t i = result.size(); i < target_size; i++) + result.push_back(idx[i]); + return result; +} + +template<typename T> struct tensor_static_symgroup_do_apply; + +template<typename first, typename... next> +struct tensor_static_symgroup_do_apply<internal::type_list<first, next...>> +{ + template<typename Op, typename RV, std::size_t SGNumIndices, typename Index, std::size_t NumIndices, typename... Args> + static inline RV run(const std::array<Index, NumIndices>& idx, RV initial, Args&&... args) + { + static_assert(NumIndices >= SGNumIndices, "Can only apply symmetry group to objects that have at least the required amount of indices."); + typedef typename internal::gen_numeric_list<int, NumIndices - SGNumIndices, SGNumIndices>::type remaining_indices; + initial = Op::run(tensor_static_symgroup_index_permute(idx, typename first::indices(), remaining_indices()), first::flags, initial, std::forward<Args>(args)...); + return tensor_static_symgroup_do_apply<internal::type_list<next...>>::template run<Op, RV, SGNumIndices>(idx, initial, args...); + } + + template<typename Op, typename RV, std::size_t SGNumIndices, typename Index, typename... Args> + static inline RV run(const std::vector<Index>& idx, RV initial, Args&&... args) + { + eigen_assert(idx.size() >= SGNumIndices && "Can only apply symmetry group to objects that have at least the required amount of indices."); + initial = Op::run(tensor_static_symgroup_index_permute(idx, typename first::indices()), first::flags, initial, std::forward<Args>(args)...); + return tensor_static_symgroup_do_apply<internal::type_list<next...>>::template run<Op, RV, SGNumIndices>(idx, initial, args...); + } +}; + +template<EIGEN_TPL_PP_SPEC_HACK_DEF(typename, empty)> +struct tensor_static_symgroup_do_apply<internal::type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>> +{ + template<typename Op, typename RV, std::size_t SGNumIndices, typename Index, std::size_t NumIndices, typename... Args> + static inline RV run(const std::array<Index, NumIndices>&, RV initial, Args&&...) + { + // do nothing + return initial; + } + + template<typename Op, typename RV, std::size_t SGNumIndices, typename Index, typename... Args> + static inline RV run(const std::vector<Index>&, RV initial, Args&&...) + { + // do nothing + return initial; + } +}; + +} // end namespace internal + +template<typename... Gen> +class StaticSGroup +{ + constexpr static std::size_t NumIndices = internal::tensor_symmetry_num_indices<Gen...>::value; + typedef internal::group_theory::enumerate_group_elements< + internal::tensor_static_symgroup_multiply, + internal::tensor_static_symgroup_equality, + typename internal::tensor_static_symgroup_identity_ctor<NumIndices>::type, + internal::type_list<typename internal::tensor_static_symgroup_element_ctor<Gen, NumIndices>::type...> + > group_elements; + typedef typename group_elements::type ge; + public: + constexpr inline StaticSGroup() {} + constexpr inline StaticSGroup(const StaticSGroup<Gen...>&) {} + constexpr inline StaticSGroup(StaticSGroup<Gen...>&&) {} + + template<typename Op, typename RV, typename Index, std::size_t N, typename... Args> + static inline RV apply(const std::array<Index, N>& idx, RV initial, Args&&... args) + { + return internal::tensor_static_symgroup_do_apply<ge>::template run<Op, RV, NumIndices>(idx, initial, args...); + } + + template<typename Op, typename RV, typename Index, typename... Args> + static inline RV apply(const std::vector<Index>& idx, RV initial, Args&&... 
args) + { + eigen_assert(idx.size() == NumIndices); + return internal::tensor_static_symgroup_do_apply<ge>::template run<Op, RV, NumIndices>(idx, initial, args...); + } + + constexpr static std::size_t static_size = ge::count; + + constexpr static inline std::size_t size() { + return ge::count; + } + constexpr static inline int globalFlags() { return group_elements::global_flags; } + + template<typename Tensor_, typename... IndexTypes> + inline internal::tensor_symmetry_value_setter<Tensor_, StaticSGroup<Gen...>> operator()(Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... otherIndices) const + { + static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + return operator()(tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices>{{firstIndex, otherIndices...}}); + } + + template<typename Tensor_> + inline internal::tensor_symmetry_value_setter<Tensor_, StaticSGroup<Gen...>> operator()(Tensor_& tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices> const& indices) const + { + return internal::tensor_symmetry_value_setter<Tensor_, StaticSGroup<Gen...>>(tensor, *this, indices); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H + +/* + * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; + */ diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h new file mode 100644 index 0000000000..879d6cd77b --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h @@ -0,0 +1,338 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler <christian@iwakd.de> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H +#define EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H + +namespace Eigen { + +enum { + NegationFlag = 0x01, + ConjugationFlag = 0x02 +}; + +enum { + GlobalRealFlag = 0x01, + GlobalImagFlag = 0x02, + GlobalZeroFlag = 0x03 +}; + +namespace internal { + +template<std::size_t NumIndices, typename... Sym> struct tensor_symmetry_pre_analysis; +template<std::size_t NumIndices, typename... Sym> struct tensor_static_symgroup; +template<bool instantiate, std::size_t NumIndices, typename... Sym> struct tensor_static_symgroup_if; +template<typename Tensor_> struct tensor_symmetry_calculate_flags; +template<typename Tensor_> struct tensor_symmetry_assign_value; +template<typename... 
Sym> struct tensor_symmetry_num_indices; + +} // end namespace internal + +template<int One_, int Two_> +struct Symmetry +{ + static_assert(One_ != Two_, "Symmetries must cover distinct indices."); + constexpr static int One = One_; + constexpr static int Two = Two_; + constexpr static int Flags = 0; +}; + +template<int One_, int Two_> +struct AntiSymmetry +{ + static_assert(One_ != Two_, "Symmetries must cover distinct indices."); + constexpr static int One = One_; + constexpr static int Two = Two_; + constexpr static int Flags = NegationFlag; +}; + +template<int One_, int Two_> +struct Hermiticity +{ + static_assert(One_ != Two_, "Symmetries must cover distinct indices."); + constexpr static int One = One_; + constexpr static int Two = Two_; + constexpr static int Flags = ConjugationFlag; +}; + +template<int One_, int Two_> +struct AntiHermiticity +{ + static_assert(One_ != Two_, "Symmetries must cover distinct indices."); + constexpr static int One = One_; + constexpr static int Two = Two_; + constexpr static int Flags = ConjugationFlag | NegationFlag; +}; + +/** \class DynamicSGroup + * \ingroup TensorSymmetry_Module + * + * \brief Dynamic symmetry group + * + * The %DynamicSGroup class represents a symmetry group that need not be known at + * compile time. It is useful if one wants to support arbitrary run-time defineable + * symmetries for tensors, but it is also instantiated if a symmetry group is defined + * at compile time that would be either too large for the compiler to reasonably + * generate (using templates to calculate this at compile time is very inefficient) + * or that the compiler could generate the group but that it wouldn't make sense to + * unroll the loop for setting coefficients anymore. + */ +class DynamicSGroup; + +/** \internal + * + * \class DynamicSGroupFromTemplateArgs + * \ingroup TensorSymmetry_Module + * + * \brief Dynamic symmetry group, initialized from template arguments + * + * This class is a child class of DynamicSGroup. It uses the template arguments + * specified to initialize itself. + */ +template<typename... Gen> +class DynamicSGroupFromTemplateArgs; + +/** \class StaticSGroup + * \ingroup TensorSymmetry_Module + * + * \brief Static symmetry group + * + * This class represents a symmetry group that is known and resolved completely + * at compile time. Ideally, no run-time penalty is incurred compared to the + * manual unrolling of the symmetry. + * + * <b><i>CAUTION:</i></b> + * + * Do not use this class directly for large symmetry groups. The compiler + * may run into a limit, or segfault or in the very least will take a very, + * very, very long time to compile the code. Use the SGroup class instead + * if you want a static group. That class contains logic that will + * automatically select the DynamicSGroup class instead if the symmetry + * group becomes too large. (In that case, unrolling may not even be + * beneficial.) + */ +template<typename... Gen> +class StaticSGroup; + +/** \class SGroup + * \ingroup TensorSymmetry_Module + * + * \brief Symmetry group, initialized from template arguments + * + * This class represents a symmetry group whose generators are already + * known at compile time. It may or may not be resolved at compile time, + * depending on the estimated size of the group. + * + * \sa StaticSGroup + * \sa DynamicSGroup + */ +template<typename... 
Gen> +class SGroup : public internal::tensor_symmetry_pre_analysis<internal::tensor_symmetry_num_indices<Gen...>::value, Gen...>::root_type +{ + public: + constexpr static std::size_t NumIndices = internal::tensor_symmetry_num_indices<Gen...>::value; + typedef typename internal::tensor_symmetry_pre_analysis<NumIndices, Gen...>::root_type Base; + + // make standard constructors + assignment operators public + inline SGroup() : Base() { } + inline SGroup(const SGroup<Gen...>& other) : Base(other) { } + inline SGroup(SGroup<Gen...>&& other) : Base(other) { } + inline SGroup<Gen...>& operator=(const SGroup<Gen...>& other) { Base::operator=(other); return *this; } + inline SGroup<Gen...>& operator=(SGroup<Gen...>&& other) { Base::operator=(other); return *this; } + + // all else is defined in the base class +}; + +namespace internal { + +template<typename... Sym> struct tensor_symmetry_num_indices +{ + constexpr static std::size_t value = 1; +}; + +template<int One_, int Two_, typename... Sym> struct tensor_symmetry_num_indices<Symmetry<One_, Two_>, Sym...> +{ +private: + constexpr static std::size_t One = static_cast<std::size_t>(One_); + constexpr static std::size_t Two = static_cast<std::size_t>(Two_); + constexpr static std::size_t Three = tensor_symmetry_num_indices<Sym...>::value; + + // don't use std::max, since it's not constexpr until C++14... + constexpr static std::size_t maxOneTwoPlusOne = ((One > Two) ? One : Two) + 1; +public: + constexpr static std::size_t value = (maxOneTwoPlusOne > Three) ? maxOneTwoPlusOne : Three; +}; + +template<int One_, int Two_, typename... Sym> struct tensor_symmetry_num_indices<AntiSymmetry<One_, Two_>, Sym...> + : public tensor_symmetry_num_indices<Symmetry<One_, Two_>, Sym...> {}; +template<int One_, int Two_, typename... Sym> struct tensor_symmetry_num_indices<Hermiticity<One_, Two_>, Sym...> + : public tensor_symmetry_num_indices<Symmetry<One_, Two_>, Sym...> {}; +template<int One_, int Two_, typename... Sym> struct tensor_symmetry_num_indices<AntiHermiticity<One_, Two_>, Sym...> + : public tensor_symmetry_num_indices<Symmetry<One_, Two_>, Sym...> {}; + +/** \internal + * + * \class tensor_symmetry_pre_analysis + * \ingroup TensorSymmetry_Module + * + * \brief Pre-select whether to use a static or dynamic symmetry group + * + * When a symmetry group could in principle be determined at compile time, + * this template implements the logic whether to actually do that or whether + * to rather defer that to runtime. + * + * The logic is as follows: + * <dl> + * <dt><b>No generators (trivial symmetry):</b></dt> + * <dd>Use a trivial static group. Ideally, this has no performance impact + * compared to not using symmetry at all. In practice, this might not + * be the case.</dd> + * <dt><b>More than 4 generators:</b></dt> + * <dd>Calculate the group at run time, it is likely far too large for the + * compiler to be able to properly generate it in a realistic time.</dd> + * <dt><b>Up to and including 4 generators:</b></dt> + * <dd>Actually enumerate all group elements, but then check how many there + * are. If there are more than 16, it is unlikely that unrolling the + * loop (as is done in the static compile-time case) is sensible, so + * use a dynamic group instead. If there are at most 16 elements, actually + * use that static group. 
Note that the largest group with 4 generators + * still compiles with reasonable resources.</dd> + * </dl> + * + * Note: Example compile time performance with g++-4.6 on an Intenl Core i5-3470 + * with 16 GiB RAM (all generators non-redundant and the subgroups don't + * factorize): + * + * # Generators -O0 -ggdb -O2 + * ------------------------------------------------------------------- + * 1 0.5 s / 250 MiB 0.45s / 230 MiB + * 2 0.5 s / 260 MiB 0.5 s / 250 MiB + * 3 0.65s / 310 MiB 0.62s / 310 MiB + * 4 2.2 s / 860 MiB 1.7 s / 770 MiB + * 5 130 s / 13000 MiB 120 s / 11000 MiB + * + * It is clear that everything is still very efficient up to 4 generators, then + * the memory and CPU requirements become unreasonable. Thus we only instantiate + * the template group theory logic if the number of generators supplied is 4 or + * lower, otherwise this will be forced to be done during runtime, where the + * algorithm is reasonably fast. + */ +template<std::size_t NumIndices> +struct tensor_symmetry_pre_analysis<NumIndices> +{ + typedef StaticSGroup<> root_type; +}; + +template<std::size_t NumIndices, typename Gen_, typename... Gens_> +struct tensor_symmetry_pre_analysis<NumIndices, Gen_, Gens_...> +{ + constexpr static std::size_t max_static_generators = 4; + constexpr static std::size_t max_static_elements = 16; + typedef tensor_static_symgroup_if<(sizeof...(Gens_) + 1 <= max_static_generators), NumIndices, Gen_, Gens_...> helper; + constexpr static std::size_t possible_size = helper::size; + + typedef typename conditional< + possible_size == 0 || possible_size >= max_static_elements, + DynamicSGroupFromTemplateArgs<Gen_, Gens_...>, + typename helper::type + >::type root_type; +}; + +template<bool instantiate, std::size_t NumIndices, typename... Gens> +struct tensor_static_symgroup_if +{ + constexpr static std::size_t size = 0; + typedef void type; +}; + +template<std::size_t NumIndices, typename... 
Gens> +struct tensor_static_symgroup_if<true, NumIndices, Gens...> : tensor_static_symgroup<NumIndices, Gens...> {}; + +template<typename Tensor_> +struct tensor_symmetry_assign_value +{ + typedef typename Tensor_::Index Index; + typedef typename Tensor_::Scalar Scalar; + constexpr static std::size_t NumIndices = Tensor_::NumIndices; + + static inline int run(const std::array<Index, NumIndices>& transformed_indices, int transformation_flags, int dummy, Tensor_& tensor, const Scalar& value_) + { + Scalar value(value_); + if (transformation_flags & ConjugationFlag) + value = numext::conj(value); + if (transformation_flags & NegationFlag) + value = -value; + tensor.coeffRef(transformed_indices) = value; + return dummy; + } +}; + +template<typename Tensor_> +struct tensor_symmetry_calculate_flags +{ + typedef typename Tensor_::Index Index; + constexpr static std::size_t NumIndices = Tensor_::NumIndices; + + static inline int run(const std::array<Index, NumIndices>& transformed_indices, int transform_flags, int current_flags, const std::array<Index, NumIndices>& orig_indices) + { + if (transformed_indices == orig_indices) { + if (transform_flags & (ConjugationFlag | NegationFlag)) + return current_flags | GlobalImagFlag; // anti-hermitian diagonal + else if (transform_flags & ConjugationFlag) + return current_flags | GlobalRealFlag; // hermitian diagonal + else if (transform_flags & NegationFlag) + return current_flags | GlobalZeroFlag; // anti-symmetric diagonal + } + return current_flags; + } +}; + +template<typename Tensor_, typename Symmetry_, int Flags = 0> +class tensor_symmetry_value_setter +{ + public: + typedef typename Tensor_::Index Index; + typedef typename Tensor_::Scalar Scalar; + constexpr static std::size_t NumIndices = Tensor_::NumIndices; + + inline tensor_symmetry_value_setter(Tensor_& tensor, Symmetry_ const& symmetry, std::array<Index, NumIndices> const& indices) + : m_tensor(tensor), m_symmetry(symmetry), m_indices(indices) { } + + inline tensor_symmetry_value_setter<Tensor_, Symmetry_, Flags>& operator=(Scalar const& value) + { + doAssign(value); + return *this; + } + private: + Tensor_& m_tensor; + Symmetry_ m_symmetry; + std::array<Index, NumIndices> m_indices; + + inline void doAssign(Scalar const& value) + { + #ifdef EIGEN_TENSOR_SYMMETRY_CHECK_VALUES + int value_flags = m_symmetry.template apply<internal::tensor_symmetry_calculate_flags<Tensor_>, int>(m_indices, m_symmetry.globalFlags(), m_indices); + if (value_flags & GlobalRealFlag) + eigen_assert(numext::imag(value) == 0); + if (value_flags & GlobalImagFlag) + eigen_assert(numext::real(value) == 0); + #endif + m_symmetry.template apply<internal::tensor_symmetry_assign_value<Tensor_>, int>(m_indices, 0, m_tensor, value); + } +}; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H + +/* + * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; + */ diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h new file mode 100644 index 0000000000..0fe0b7c46d --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h @@ -0,0 +1,666 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2013 Christian Seiler <christian@iwakd.de> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSORSYMMETRY_TEMPLATEGROUPTHEORY_H +#define EIGEN_CXX11_TENSORSYMMETRY_TEMPLATEGROUPTHEORY_H + +namespace Eigen { + +namespace internal { + +namespace group_theory { + +/** \internal + * \file CXX11/Tensor/util/TemplateGroupTheory.h + * This file contains C++ templates that implement group theory algorithms. + * + * The algorithms allow for a compile-time analysis of finite groups. + * + * Currently only Dimino's algorithm is implemented, which returns a list + * of all elements in a group given a set of (possibly redundant) generators. + * (One could also do that with the so-called orbital algorithm, but that + * is much more expensive and usually has no advantages.) + */ + +/********************************************************************** + * "Ok kid, here is where it gets complicated." + * - Amelia Pond in the "Doctor Who" episode + * "The Big Bang" + * + * Dimino's algorithm + * ================== + * + * The following is Dimino's algorithm in sequential form: + * + * Input: identity element, list of generators, equality check, + * multiplication operation + * Output: list of group elements + * + * 1. add identity element + * 2. remove identities from list of generators + * 3. add all powers of first generator that aren't the + * identity element + * 4. go through all remaining generators: + * a. if generator is already in the list of elements + * -> do nothing + * b. otherwise + * i. remember current # of elements + * (i.e. the size of the current subgroup) + * ii. add all current elements (which includes + * the identity) each multiplied from right + * with the current generator to the group + * iii. add all remaining cosets that are generated + * by products of the new generator with itself + * and all other generators seen so far + * + * In functional form, this is implemented as a long set of recursive + * templates that have a complicated relationship. + * + * The main interface for Dimino's algorithm is the template + * enumerate_group_elements. All lists are implemented as variadic + * type_list<typename...> and numeric_list<typename = int, int...> + * templates. + * + * 'Calling' templates is usually done via typedefs. + * + * This algorithm is an extended version of the basic version. The + * extension consists in the fact that each group element has a set + * of flags associated with it. Multiplication of two group elements + * with each other results in a group element whose flags are the + * XOR of the flags of the previous elements. Each time the algorithm + * notices that a group element it just calculated is already in the + * list of current elements, the flags of both will be compared and + * added to the so-called 'global flags' of the group. + * + * The rationale behind this extension is that this allows not only + * for the description of symmetries between tensor indices, but + * also allows for the description of hermiticity, antisymmetry and + * antihermiticity. Negation and conjugation each are specific bit + * in the flags value and if two different ways to reach a group + * element lead to two different flags, this poses a constraint on + * the allowed values of the resulting tensor. 
For example, if a + * group element is reach both with and without the conjugation + * flags, it is clear that the resulting tensor has to be real. + * + * Note that this flag mechanism is quite generic and may have other + * uses beyond tensor properties. + * + * IMPORTANT: + * This algorithm assumes the group to be finite. If you try to + * run it with a group that's infinite, the algorithm will only + * terminate once you hit a compiler limit (max template depth). + * Also note that trying to use this implementation to create a + * very large group will probably either make you hit the same + * limit, cause the compiler to segfault or at the very least + * take a *really* long time (hours, days, weeks - sic!) to + * compile. It is not recommended to plug in more than 4 + * generators, unless they are independent of each other. + */ + +/** \internal + * + * \class strip_identities + * \ingroup CXX11_TensorSymmetry_Module + * + * \brief Cleanse a list of group elements of the identity element + * + * This template is used to make a first pass through all initial + * generators of Dimino's algorithm and remove the identity + * elements. + * + * \sa enumerate_group_elements + */ +template<template<typename, typename> class Equality, typename id, typename L> struct strip_identities; + +template< + template<typename, typename> class Equality, + typename id, + typename t, + typename... ts +> +struct strip_identities<Equality, id, type_list<t, ts...>> +{ + typedef typename conditional< + Equality<id, t>::value, + typename strip_identities<Equality, id, type_list<ts...>>::type, + typename concat<type_list<t>, typename strip_identities<Equality, id, type_list<ts...>>::type>::type + >::type type; + constexpr static int global_flags = Equality<id, t>::global_flags | strip_identities<Equality, id, type_list<ts...>>::global_flags; +}; + +template< + template<typename, typename> class Equality, + typename id + EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, ts) +> +struct strip_identities<Equality, id, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(ts)>> +{ + typedef type_list<> type; + constexpr static int global_flags = 0; +}; + +/** \internal + * + * \class dimino_first_step_elements_helper + * \ingroup CXX11_TensorSymmetry_Module + * + * \brief Recursive template that adds powers of the first generator to the list of group elements + * + * This template calls itself recursively to add powers of the first + * generator to the list of group elements. It stops if it reaches + * the identity element again. 
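A sketch of the recursion for a hypothetical group built from a single pair-swap generator g (so g*g == id); dimino_first_step_elements seeds it with elements = type_list<id> and current_element = g:

    elements = {id}        current = g      (g is not the identity, so it is added)
    elements = {id, g}     current = g*g    (g*g == id, so the recursion terminates)
    resulting type:  type_list<id, g>

For a generator of order n the same recursion yields {id, g, g^2, ..., g^(n-1)}.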
+ * + * \sa enumerate_group_elements, dimino_first_step_elements + */ +template< + template<typename, typename> class Multiply, + template<typename, typename> class Equality, + typename id, + typename g, + typename current_element, + typename elements, + bool dont_add_current_element // = false +> +struct dimino_first_step_elements_helper : + public dimino_first_step_elements_helper< + Multiply, + Equality, + id, + g, + typename Multiply<current_element, g>::type, + typename concat<elements, type_list<current_element>>::type, + Equality<typename Multiply<current_element, g>::type, id>::value + > {}; + +template< + template<typename, typename> class Multiply, + template<typename, typename> class Equality, + typename id, + typename g, + typename current_element, + typename elements +> +struct dimino_first_step_elements_helper<Multiply, Equality, id, g, current_element, elements, true> +{ + typedef elements type; + constexpr static int global_flags = Equality<current_element, id>::global_flags; +}; + +/** \internal + * + * \class dimino_first_step_elements + * \ingroup CXX11_TensorSymmetry_Module + * + * \brief Add all powers of the first generator to the list of group elements + * + * This template takes the first non-identity generator and generates the initial + * list of elements which consists of all powers of that generator. For a group + * with just one generated, it would be enumerated after this. + * + * \sa enumerate_group_elements + */ +template< + template<typename, typename> class Multiply, + template<typename, typename> class Equality, + typename id, + typename generators +> +struct dimino_first_step_elements +{ + typedef typename get<0, generators>::type first_generator; + typedef typename skip<1, generators>::type next_generators; + typedef type_list<first_generator> generators_done; + + typedef dimino_first_step_elements_helper< + Multiply, + Equality, + id, + first_generator, + first_generator, + type_list<id>, + false + > helper; + typedef typename helper::type type; + constexpr static int global_flags = helper::global_flags; +}; + +/** \internal + * + * \class dimino_get_coset_elements + * \ingroup CXX11_TensorSymmetry_Module + * + * \brief Generate all elements of a specific coset + * + * This template generates all the elements of a specific coset by + * multiplying all elements in the given subgroup with the new + * coset representative. Note that the first element of the + * subgroup is always the identity element, so the first element of + * ther result of this template is going to be the coset + * representative itself. + * + * Note that this template accepts an additional boolean parameter + * that specifies whether to actually generate the coset (true) or + * just return an empty list (false). 
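As a small worked case, for a hypothetical subgroup {id, s} and new coset representative r: with generate_coset == true the result is {id*r, s*r} = {r, s*r}, i.e. every subgroup element multiplied by r from the right, which is why the representative itself comes first; with generate_coset == false the result is simply type_list<>.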
+ * + * \sa enumerate_group_elements, dimino_add_cosets_for_rep + */ +template< + template<typename, typename> class Multiply, + typename sub_group_elements, + typename new_coset_rep, + bool generate_coset // = true +> +struct dimino_get_coset_elements +{ + typedef typename apply_op_from_right<Multiply, new_coset_rep, sub_group_elements>::type type; +}; + +template< + template<typename, typename> class Multiply, + typename sub_group_elements, + typename new_coset_rep +> +struct dimino_get_coset_elements<Multiply, sub_group_elements, new_coset_rep, false> +{ + typedef type_list<> type; +}; + +/** \internal + * + * \class dimino_add_cosets_for_rep + * \ingroup CXX11_TensorSymmetry_Module + * + * \brief Recursive template for adding coset spaces + * + * This template multiplies the coset representative with a generator + * from the list of previous generators. If the new element is not in + * the group already, it adds the corresponding coset. Finally it + * proceeds to call itself with the next generator from the list. + * + * \sa enumerate_group_elements, dimino_add_all_coset_spaces + */ +template< + template<typename, typename> class Multiply, + template<typename, typename> class Equality, + typename id, + typename sub_group_elements, + typename elements, + typename generators, + typename rep_element, + int sub_group_size +> +struct dimino_add_cosets_for_rep; + +template< + template<typename, typename> class Multiply, + template<typename, typename> class Equality, + typename id, + typename sub_group_elements, + typename elements, + typename g, + typename... gs, + typename rep_element, + int sub_group_size +> +struct dimino_add_cosets_for_rep<Multiply, Equality, id, sub_group_elements, elements, type_list<g, gs...>, rep_element, sub_group_size> +{ + typedef typename Multiply<rep_element, g>::type new_coset_rep; + typedef contained_in_list_gf<Equality, new_coset_rep, elements> _cil; + constexpr static bool add_coset = !_cil::value; + + typedef typename dimino_get_coset_elements< + Multiply, + sub_group_elements, + new_coset_rep, + add_coset + >::type coset_elements; + + typedef dimino_add_cosets_for_rep< + Multiply, + Equality, + id, + sub_group_elements, + typename concat<elements, coset_elements>::type, + type_list<gs...>, + rep_element, + sub_group_size + > _helper; + + typedef typename _helper::type type; + constexpr static int global_flags = _cil::global_flags | _helper::global_flags; + + /* Note that we don't have to update global flags here, since + * we will only add these elements if they are not part of + * the group already. But that only happens if the coset rep + * is not already in the group, so the check for the coset rep + * will catch this. 
+ */ +}; + +template< + template<typename, typename> class Multiply, + template<typename, typename> class Equality, + typename id, + typename sub_group_elements, + typename elements + EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, empty), + typename rep_element, + int sub_group_size +> +struct dimino_add_cosets_for_rep<Multiply, Equality, id, sub_group_elements, elements, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>, rep_element, sub_group_size> +{ + typedef elements type; + constexpr static int global_flags = 0; +}; + +/** \internal + * + * \class dimino_add_all_coset_spaces + * \ingroup CXX11_TensorSymmetry_Module + * + * \brief Recursive template for adding all coset spaces for a new generator + * + * This template tries to go through the list of generators (with + * the help of the dimino_add_cosets_for_rep template) as long as + * it still finds elements that are not part of the group and add + * the corresponding cosets. + * + * \sa enumerate_group_elements, dimino_add_cosets_for_rep + */ +template< + template<typename, typename> class Multiply, + template<typename, typename> class Equality, + typename id, + typename sub_group_elements, + typename elements, + typename generators, + int sub_group_size, + int rep_pos, + bool stop_condition // = false +> +struct dimino_add_all_coset_spaces +{ + typedef typename get<rep_pos, elements>::type rep_element; + typedef dimino_add_cosets_for_rep< + Multiply, + Equality, + id, + sub_group_elements, + elements, + generators, + rep_element, + sub_group_elements::count + > _ac4r; + typedef typename _ac4r::type new_elements; + + constexpr static int new_rep_pos = rep_pos + sub_group_elements::count; + constexpr static bool new_stop_condition = new_rep_pos >= new_elements::count; + + typedef dimino_add_all_coset_spaces< + Multiply, + Equality, + id, + sub_group_elements, + new_elements, + generators, + sub_group_size, + new_rep_pos, + new_stop_condition + > _helper; + + typedef typename _helper::type type; + constexpr static int global_flags = _helper::global_flags | _ac4r::global_flags; +}; + +template< + template<typename, typename> class Multiply, + template<typename, typename> class Equality, + typename id, + typename sub_group_elements, + typename elements, + typename generators, + int sub_group_size, + int rep_pos +> +struct dimino_add_all_coset_spaces<Multiply, Equality, id, sub_group_elements, elements, generators, sub_group_size, rep_pos, true> +{ + typedef elements type; + constexpr static int global_flags = 0; +}; + +/** \internal + * + * \class dimino_add_generator + * \ingroup CXX11_TensorSymmetry_Module + * + * \brief Enlarge the group by adding a new generator. + * + * It accepts a boolean parameter that determines if the generator is redundant, + * i.e. was already seen in the group. In that case, it reduces to a no-op. 
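A sketch with hypothetical 3-index generators: if the elements found so far are {id, (01)} and the new, non-redundant generator is (12), this template first appends the right-multiplied copies {(12), (01)(12)}, giving four elements, and then hands off to dimino_add_all_coset_spaces at rep_pos == 2 to pick up the cosets that are still missing, so the final list holds all six permutations of S3.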
+ * + * \sa enumerate_group_elements, dimino_add_all_coset_spaces + */ +template< + template<typename, typename> class Multiply, + template<typename, typename> class Equality, + typename id, + typename elements, + typename generators_done, + typename current_generator, + bool redundant // = false +> +struct dimino_add_generator +{ + /* this template is only called if the generator is not redundant + * => all elements of the group multiplied with the new generator + * are going to be new elements of the most trivial coset space + */ + typedef typename apply_op_from_right<Multiply, current_generator, elements>::type multiplied_elements; + typedef typename concat<elements, multiplied_elements>::type new_elements; + + constexpr static int rep_pos = elements::count; + + typedef dimino_add_all_coset_spaces< + Multiply, + Equality, + id, + elements, // elements of previous subgroup + new_elements, + typename concat<generators_done, type_list<current_generator>>::type, + elements::count, // size of previous subgroup + rep_pos, + false // don't stop (because rep_pos >= new_elements::count is always false at this point) + > _helper; + typedef typename _helper::type type; + constexpr static int global_flags = _helper::global_flags; +}; + +template< + template<typename, typename> class Multiply, + template<typename, typename> class Equality, + typename id, + typename elements, + typename generators_done, + typename current_generator +> +struct dimino_add_generator<Multiply, Equality, id, elements, generators_done, current_generator, true> +{ + // redundant case + typedef elements type; + constexpr static int global_flags = 0; +}; + +/** \internal + * + * \class dimino_add_remaining_generators + * \ingroup CXX11_TensorSymmetry_Module + * + * \brief Recursive template that adds all remaining generators to a group + * + * Loop through the list of generators that remain and successively + * add them to the group. 
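For a hypothetical generator list {g1, g2, g3}: dimino_first_step_elements has already consumed g1, so this template runs with remaining_generators = {g2, g3}; g2 is passed to dimino_add_generator (a no-op when the contained_in_list_gf check marks it as redundant), and the recursion then repeats for g3 until the type_list<> specialization below ends it.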
+ * + * \sa enumerate_group_elements, dimino_add_generator + */ +template< + template<typename, typename> class Multiply, + template<typename, typename> class Equality, + typename id, + typename generators_done, + typename remaining_generators, + typename elements +> +struct dimino_add_remaining_generators +{ + typedef typename get<0, remaining_generators>::type first_generator; + typedef typename skip<1, remaining_generators>::type next_generators; + + typedef contained_in_list_gf<Equality, first_generator, elements> _cil; + + typedef dimino_add_generator< + Multiply, + Equality, + id, + elements, + generators_done, + first_generator, + _cil::value + > _helper; + + typedef typename _helper::type new_elements; + + typedef dimino_add_remaining_generators< + Multiply, + Equality, + id, + typename concat<generators_done, type_list<first_generator>>::type, + next_generators, + new_elements + > _next_iter; + + typedef typename _next_iter::type type; + constexpr static int global_flags = + _cil::global_flags | + _helper::global_flags | + _next_iter::global_flags; +}; + +template< + template<typename, typename> class Multiply, + template<typename, typename> class Equality, + typename id, + typename generators_done, + typename elements +> +struct dimino_add_remaining_generators<Multiply, Equality, id, generators_done, type_list<>, elements> +{ + typedef elements type; + constexpr static int global_flags = 0; +}; + +/** \internal + * + * \class enumerate_group_elements_noid + * \ingroup CXX11_TensorSymmetry_Module + * + * \brief Helper template that implements group element enumeration + * + * This is a helper template that implements the actual enumeration + * of group elements. This has been split so that the list of + * generators can be cleansed of the identity element before + * performing the actual operation. + * + * \sa enumerate_group_elements + */ +template< + template<typename, typename> class Multiply, + template<typename, typename> class Equality, + typename id, + typename generators, + int initial_global_flags = 0 +> +struct enumerate_group_elements_noid +{ + typedef dimino_first_step_elements<Multiply, Equality, id, generators> first_step; + typedef typename first_step::type first_step_elements; + + typedef dimino_add_remaining_generators< + Multiply, + Equality, + id, + typename first_step::generators_done, + typename first_step::next_generators, // remaining_generators + typename first_step::type // first_step elements + > _helper; + + typedef typename _helper::type type; + constexpr static int global_flags = + initial_global_flags | + first_step::global_flags | + _helper::global_flags; +}; + +// in case when no generators are specified +template< + template<typename, typename> class Multiply, + template<typename, typename> class Equality, + typename id, + int initial_global_flags +> +struct enumerate_group_elements_noid<Multiply, Equality, id, type_list<>, initial_global_flags> +{ + typedef type_list<id> type; + constexpr static int global_flags = initial_global_flags; +}; + +/** \internal + * + * \class enumerate_group_elements + * \ingroup CXX11_TensorSymmetry_Module + * + * \brief Enumerate all elements in a finite group + * + * This template enumerates all elements in a finite group. It accepts + * the following template parameters: + * + * \tparam Multiply The multiplication operation that multiplies two group elements + * with each other. + * \tparam Equality The equality check operation that checks if two group elements + * are equal to another. 
+ * \tparam id The identity element + * \tparam _generators A list of (possibly redundant) generators of the group + */ +template< + template<typename, typename> class Multiply, + template<typename, typename> class Equality, + typename id, + typename _generators +> +struct enumerate_group_elements + : public enumerate_group_elements_noid< + Multiply, + Equality, + id, + typename strip_identities<Equality, id, _generators>::type, + strip_identities<Equality, id, _generators>::global_flags + > +{ +}; + +} // end namespace group_theory + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSORSYMMETRY_TEMPLATEGROUPTHEORY_H + +/* + * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; + */ |
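The classes defined in these headers are normally driven through SGroup's operator() on a Tensor. The following is a minimal usage sketch, assuming the TensorSymmetry umbrella header that usually accompanies these files and the upstream std::array-based Tensor interface; include paths and the array emulation in this vendored tree may differ.

#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>
#include <unsupported/Eigen/CXX11/TensorSymmetry>  // assumed umbrella header

int main() {
  // Rank-2 antisymmetric tensor: t(i,j) == -t(j,i).
  Eigen::Tensor<double, 2> t(3, 3);
  t.setZero();

  // A single generator, so the group is {id, swap(0,1) carrying NegationFlag};
  // every assignment through the group also writes the negated transposed entry.
  Eigen::SGroup<Eigen::AntiSymmetry<0, 1>> sym;
  sym(t, 0, 1) = 4.0;  // sets t(0,1) =  4 and t(1,0) = -4
  sym(t, 1, 2) = 7.0;  // sets t(1,2) =  7 and t(2,1) = -7

  std::cout << t(1, 0) << " " << t(2, 1) << std::endl;  // prints: -4 -7
  return 0;
}

The number of index arguments passed to sym(...) must equal the tensor's rank (the static_assert in operator() enforces this). With more than four generators, or when the enumerated group would be too large to be worth unrolling (the max_static_elements limit), the pre-analysis above silently swaps SGroup's base class to DynamicSGroup, which enumerates the same group at run time using the add() logic shown earlier.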