Diffstat (limited to 'third_party/eigen3/unsupported/Eigen/CXX11')
83 files changed, 34023 insertions, 0 deletions
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/Core b/third_party/eigen3/unsupported/Eigen/CXX11/Core new file mode 100644 index 0000000000..1b3690716c --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/Core @@ -0,0 +1,46 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler <christian@iwakd.de> +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_CORE_MODULE +#define EIGEN_CXX11_CORE_MODULE + +#include <Eigen/Core> + +#include <Eigen/src/Core/util/DisableStupidWarnings.h> + +/** \defgroup CXX11_Core_Module C++11 Core Module + * + * This module provides common core features for all modules that + * explicitly depend on C++11. Currently, this is only the Tensor + * module. Note that at this stage, you should not need to include + * this module directly. + * + * It also provides a limited fallback for compilers that don't support + * CXX11 yet, such as nvcc. + * + * \code + * #include <Eigen/CXX11/Core> + * \endcode + */ + +// Only a subset of cxx11 is allowed at Google, so we default to emulate the +// cxx11 functionality that we need. +#include "src/Core/util/FixedSizeVector.h" +#if 1 +#include <vector> +#include "src/Core/util/EmulateCXX11Meta.h" +#else +#include "src/Core/util/CXX11Workarounds.h" +#include "src/Core/util/CXX11Meta.h" +#endif +#include <Eigen/src/Core/util/ReenableStupidWarnings.h> + +#endif // EIGEN_CXX11_CORE_MODULE + diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/FixedPoint b/third_party/eigen3/unsupported/Eigen/CXX11/FixedPoint new file mode 100644 index 0000000000..35b55de46d --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/FixedPoint @@ -0,0 +1,51 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_FIXED_POINT_MODULE +#define EIGEN_CXX11_FIXED_POINT_MODULE + +#include <Eigen/Core> +#include <stdint.h> + +/** \defgroup CXX11_FixedPoint_Module Fixed Point Module + * + * This module provides common core features for all modules that + * explicitly depend on C++11. Currently, this is only the Tensor + * module. Note that at this stage, you should not need to include + * this module directly. + * + * It also provides a limited fallback for compilers that don't support + * CXX11 yet, such as nvcc. 
+ * + * \code + * #include <Eigen/CXX11/FixedPoint> + * \endcode + */ + +#include "src/FixedPoint/FixedPointTypes.h" + +// Use optimized implementations whenever available +#ifdef EIGEN_VECTORIZE_AVX2 +#define EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT +#include "src/Tensor/TensorContractionThreadPool.h" +#include "src/FixedPoint/PacketMathAVX2.h" +#include "src/FixedPoint/MatMatProductAVX2.h" +#include "src/FixedPoint/TypeCastingAVX2.h" + +#elif defined EIGEN_VECTORIZE_NEON +#define EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT +#include "src/FixedPoint/MatMatProductNEON.h" +#endif + +// Use the default implementation when no optimized code is available +#include "src/FixedPoint/MatMatProduct.h" +#include "src/FixedPoint/MatVecProduct.h" + + +#endif // EIGEN_CXX11_FIXED_POINT_MODULE diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks b/third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks new file mode 100644 index 0000000000..7741b68d8a --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks @@ -0,0 +1,35 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_NEURAL_NETWORKS_MODULE +#define EIGEN_CXX11_NEURAL_NETWORKS_MODULE + +#include "unsupported/Eigen/CXX11/Tensor" + +/** \defgroup CXX11_NeuralNetworks_Module Neural Networks Module + * + * This module provides an efficient implementation of the common primitives + * used by neural networks. + * The primitives are built on top of the tensor library. + * + * \code + * #include <Eigen/CXX11/NeuralNetworks> + * \endcode + */ + +#include "unsupported/Eigen/CXX11/src/NeuralNetworks/Activations.h" +#include "unsupported/Eigen/CXX11/src/NeuralNetworks/Attention.h" +#include "unsupported/Eigen/CXX11/src/NeuralNetworks/Pooling.h" +#include "unsupported/Eigen/CXX11/src/NeuralNetworks/SoftMax.h" +#include "unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardCuboidConvolutions.h" +#include "unsupported/Eigen/CXX11/src/NeuralNetworks/CuboidConvolution.h" +#include "unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardSpatialConvolutions.h" +#include "unsupported/Eigen/CXX11/src/NeuralNetworks/SpatialConvolutions.h" + +#endif // EIGEN_CXX11_NEURAL_NETWORKS_MODULE diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/Tensor b/third_party/eigen3/unsupported/Eigen/CXX11/Tensor new file mode 100644 index 0000000000..3904c72eef --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/Tensor @@ -0,0 +1,145 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler <christian@iwakd.de> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_MODULE +#define EIGEN_CXX11_TENSOR_MODULE + +#include "Eigen/src/Core/util/StaticAssert.h" +#include "unsupported/Eigen/CXX11/Core" + +#include "Eigen/src/Core/util/DisableStupidWarnings.h" + +/** \defgroup CXX11_Tensor_Module Tensor Module + * + * This module provides a Tensor class for storing arbitrarily indexed + * objects. 
+ * + * \code + * #include <Eigen/CXX11/Tensor> + * \endcode + */ + +#include <cstddef> +#include <cstring> +#include <stdint.h> + +#if __cplusplus > 199711 +#include <random> +#endif + +#ifdef EIGEN_USE_THREADS +#if defined(EIGEN_USE_CUSTOM_THREAD_POOL) +// Use the Eigen implementation of the ThreadPool class. We only need to +// include a few multithreading headers +#include <condition_variable> +#include <deque> +#include <mutex> +#include <thread> +#else +#include "tensorflow/core/platform/port.h" +#endif // EIGEN_USE_CUSTOM_THREAD_POOL + +#include <functional> + +#endif // EIGEN_USE_THREADS + +#ifdef EIGEN_USE_GPU +#include "tensorflow/core/platform/port.h" +#if !defined(__GCUDACC__) && !defined(__GCUDACC_HOST__) +#include <cuda.h> +#include <cufft.h> +#include <cuda_runtime.h> +#ifdef __CUDACC__ +#include <curand_kernel.h> +#endif // defined(__CUDACC__) +#else +#include "perftools/gputools/executor/gcuda.h" +#ifdef __CUDACC__ +#include "third_party/gpus/cuda/curand_device/curand_kernel.h" +#endif // defined(__CUDACC__) +#endif // __GCUDACC__ +#endif // EIGEN_USE_GPU + +#ifdef _WIN32 +#include <winbase.h> +#elif defined(__APPLE__) +#include <mach/mach_time.h> +#else +#include <time.h> +#endif + +#include "Eigen/Core" + +// Beware: the order of the include matters to some compilers. For example +// TensorIndexList.h should be included before TensorDimensions.h in order to +// use index lists to encode tensor dimensions when compiling with llvm. +#include "unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h" + +#include "unsupported/Eigen/CXX11/src/Tensor/TensorBase.h" + +#include "unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h" + +#include "unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionMappers.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" +#include "unsupported/Eigen/CXX11/src/NeuralNetworks/TensorConvolutionByFFT.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h" +#include 
"unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorTrueIndices.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h" + +#include "unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h" + +#include "unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h" +#include "unsupported/Eigen/CXX11/src/Tensor/Tensor.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorVarDim.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorMap.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorRef.h" + +#include "unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h" + +#include "unsupported/Eigen/CXX11/src/Tensor/TensorIO.h" + +#include "Eigen/src/Core/util/ReenableStupidWarnings.h" + +#endif // EIGEN_CXX11_TENSOR_MODULE diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/TensorSymmetry b/third_party/eigen3/unsupported/Eigen/CXX11/TensorSymmetry new file mode 100644 index 0000000000..027c6087f9 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/TensorSymmetry @@ -0,0 +1,40 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler <christian@iwakd.de> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSORSYMMETRY_MODULE +#define EIGEN_CXX11_TENSORSYMMETRY_MODULE + +#include <Eigen/CXX11/Tensor> + +#include <Eigen/src/Core/util/DisableStupidWarnings.h> + +/** \defgroup CXX11_TensorSymmetry_Module Tensor Symmetry Module + * + * This module provides a classes that allow for the definition of + * symmetries w.r.t. tensor indices. + * + * Including this module will implicitly include the Tensor module. + * + * \code + * #include <Eigen/TensorSymmetry> + * \endcode + */ + +#include "src/TensorSymmetry/util/TemplateGroupTheory.h" +#include "src/TensorSymmetry/Symmetry.h" +#include "src/TensorSymmetry/StaticSymmetry.h" +#include "src/TensorSymmetry/DynamicSymmetry.h" + +#include <Eigen/src/Core/util/ReenableStupidWarnings.h> + +#endif // EIGEN_CXX11_TENSORSYMMETRY_MODULE + +/* + * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; + */ diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h new file mode 100644 index 0000000000..ad6a9dda10 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h @@ -0,0 +1,508 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2013 Christian Seiler <christian@iwakd.de> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11META_H +#define EIGEN_CXX11META_H + +namespace Eigen { + +namespace internal { + +/** \internal + * \file CXX11/Core/util/CXX11Meta.h + * This file contains generic metaprogramming classes which are not specifically related to Eigen. + * This file expands upon Core/util/Meta.h and adds support for C++11 specific features. + */ + +template<typename... tt> +struct type_list { constexpr static int count = sizeof...(tt); }; + +template<typename t, typename... tt> +struct type_list<t, tt...> { constexpr static int count = sizeof...(tt) + 1; typedef t first_type; }; + +template<typename T, T... nn> +struct numeric_list { constexpr static std::size_t count = sizeof...(nn); }; + +template<typename T, T n, T... nn> +struct numeric_list<T, n, nn...> { constexpr static std::size_t count = sizeof...(nn) + 1; constexpr static T first_value = n; }; + +/* numeric list constructors + * + * equivalencies: + * constructor result + * typename gen_numeric_list<int, 5>::type numeric_list<int, 0,1,2,3,4> + * typename gen_numeric_list_reversed<int, 5>::type numeric_list<int, 4,3,2,1,0> + * typename gen_numeric_list_swapped_pair<int, 5,1,2>::type numeric_list<int, 0,2,1,3,4> + * typename gen_numeric_list_repeated<int, 0, 5>::type numeric_list<int, 0,0,0,0,0> + */ + +template<typename T, std::size_t n, T... ii> struct gen_numeric_list : gen_numeric_list<T, n-1, n-1, ii...> {}; +template<typename T, T... ii> struct gen_numeric_list<T, 0, ii...> { typedef numeric_list<T, ii...> type; }; + +template<typename T, std::size_t n, T... ii> struct gen_numeric_list_reversed : gen_numeric_list_reversed<T, n-1, ii..., n-1> {}; +template<typename T, T... ii> struct gen_numeric_list_reversed<T, 0, ii...> { typedef numeric_list<T, ii...> type; }; + +template<typename T, std::size_t n, T a, T b, T... ii> struct gen_numeric_list_swapped_pair : gen_numeric_list_swapped_pair<T, n-1, a, b, (n-1) == a ? b : ((n-1) == b ? a : (n-1)), ii...> {}; +template<typename T, T a, T b, T... ii> struct gen_numeric_list_swapped_pair<T, 0, a, b, ii...> { typedef numeric_list<T, ii...> type; }; + +template<typename T, std::size_t n, T V, T... nn> struct gen_numeric_list_repeated : gen_numeric_list_repeated<T, n-1, V, V, nn...> {}; +template<typename T, T V, T... nn> struct gen_numeric_list_repeated<T, 0, V, nn...> { typedef numeric_list<T, nn...> type; }; + +/* list manipulation: concatenate */ + +template<class a, class b> struct concat; + +template<typename... as, typename... bs> struct concat<type_list<as...>, type_list<bs...>> { typedef type_list<as..., bs...> type; }; +template<typename T, T... as, T... bs> struct concat<numeric_list<T, as...>, numeric_list<T, bs...> > { typedef numeric_list<T, as..., bs...> type; }; + +template<typename... p> struct mconcat; +template<typename a> struct mconcat<a> { typedef a type; }; +template<typename a, typename b> struct mconcat<a, b> : concat<a, b> {}; +template<typename a, typename b, typename... cs> struct mconcat<a, b, cs...> : concat<a, typename mconcat<b, cs...>::type> {}; + +/* list manipulation: extract slices */ + +template<int n, typename x> struct take; +template<int n, typename a, typename... 
as> struct take<n, type_list<a, as...>> : concat<type_list<a>, typename take<n-1, type_list<as...>>::type> {}; +template<int n> struct take<n, type_list<>> { typedef type_list<> type; }; +template<typename a, typename... as> struct take<0, type_list<a, as...>> { typedef type_list<> type; }; +template<> struct take<0, type_list<>> { typedef type_list<> type; }; + +template<typename T, int n, T a, T... as> struct take<n, numeric_list<T, a, as...>> : concat<numeric_list<T, a>, typename take<n-1, numeric_list<T, as...>>::type> {}; +template<typename T, int n> struct take<n, numeric_list<T>> { typedef numeric_list<T> type; }; +template<typename T, T a, T... as> struct take<0, numeric_list<T, a, as...>> { typedef numeric_list<T> type; }; +template<typename T> struct take<0, numeric_list<T>> { typedef numeric_list<T> type; }; + +template<typename T, int n, T... ii> struct h_skip_helper_numeric; +template<typename T, int n, T i, T... ii> struct h_skip_helper_numeric<T, n, i, ii...> : h_skip_helper_numeric<T, n-1, ii...> {}; +template<typename T, T i, T... ii> struct h_skip_helper_numeric<T, 0, i, ii...> { typedef numeric_list<T, i, ii...> type; }; +template<typename T, int n> struct h_skip_helper_numeric<T, n> { typedef numeric_list<T> type; }; +template<typename T> struct h_skip_helper_numeric<T, 0> { typedef numeric_list<T> type; }; + +template<int n, typename... tt> struct h_skip_helper_type; +template<int n, typename t, typename... tt> struct h_skip_helper_type<n, t, tt...> : h_skip_helper_type<n-1, tt...> {}; +template<typename t, typename... tt> struct h_skip_helper_type<0, t, tt...> { typedef type_list<t, tt...> type; }; +template<int n> struct h_skip_helper_type<n> { typedef type_list<> type; }; +template<> struct h_skip_helper_type<0> { typedef type_list<> type; }; + +template<int n> +struct h_skip { + template<typename T, T... ii> + constexpr static inline typename h_skip_helper_numeric<T, n, ii...>::type helper(numeric_list<T, ii...>) { return typename h_skip_helper_numeric<T, n, ii...>::type(); } + template<typename... tt> + constexpr static inline typename h_skip_helper_type<n, tt...>::type helper(type_list<tt...>) { return typename h_skip_helper_type<n, tt...>::type(); } +}; + +template<int n, typename a> struct skip { typedef decltype(h_skip<n>::helper(a())) type; }; + +template<int start, int count, typename a> struct slice : take<count, typename skip<start, a>::type> {}; + +/* list manipulation: retrieve single element from list */ + +template<int n, typename x> struct get; + +template<int n, typename a, typename... as> struct get<n, type_list<a, as...>> : get<n-1, type_list<as...>> {}; +template<typename a, typename... as> struct get<0, type_list<a, as...>> { typedef a type; }; +template<int n EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, as)> struct get<n, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(as)>> { static_assert((n - n) < 0, "meta-template get: The element to extract from a list must be smaller than the size of the list."); }; + +template<typename T, int n, T a, T... as> struct get<n, numeric_list<T, a, as...>> : get<n-1, numeric_list<T, as...>> {}; +template<typename T, T a, T... 
as> struct get<0, numeric_list<T, a, as...>> { constexpr static T value = a; }; +template<typename T, int n EIGEN_TPL_PP_SPEC_HACK_DEFC(T, as)> struct get<n, numeric_list<T EIGEN_TPL_PP_SPEC_HACK_USEC(as)>> { static_assert((n - n) < 0, "meta-template get: The element to extract from a list must be smaller than the size of the list."); }; + +/* always get type, regardless of dummy; good for parameter pack expansion */ + +template<typename T, T dummy, typename t> struct id_numeric { typedef t type; }; +template<typename dummy, typename t> struct id_type { typedef t type; }; + +/* equality checking, flagged version */ + +template<typename a, typename b> struct is_same_gf : is_same<a, b> { constexpr static int global_flags = 0; }; + +/* apply_op to list */ + +template< + bool from_left, // false + template<typename, typename> class op, + typename additional_param, + typename... values +> +struct h_apply_op_helper { typedef type_list<typename op<values, additional_param>::type...> type; }; +template< + template<typename, typename> class op, + typename additional_param, + typename... values +> +struct h_apply_op_helper<true, op, additional_param, values...> { typedef type_list<typename op<additional_param, values>::type...> type; }; + +template< + bool from_left, + template<typename, typename> class op, + typename additional_param +> +struct h_apply_op +{ + template<typename... values> + constexpr static typename h_apply_op_helper<from_left, op, additional_param, values...>::type helper(type_list<values...>) + { return typename h_apply_op_helper<from_left, op, additional_param, values...>::type(); } +}; + +template< + template<typename, typename> class op, + typename additional_param, + typename a +> +struct apply_op_from_left { typedef decltype(h_apply_op<true, op, additional_param>::helper(a())) type; }; + +template< + template<typename, typename> class op, + typename additional_param, + typename a +> +struct apply_op_from_right { typedef decltype(h_apply_op<false, op, additional_param>::helper(a())) type; }; + +/* see if an element is in a list */ + +template< + template<typename, typename> class test, + typename check_against, + typename h_list, + bool last_check_positive = false +> +struct contained_in_list; + +template< + template<typename, typename> class test, + typename check_against, + typename h_list +> +struct contained_in_list<test, check_against, h_list, true> +{ + constexpr static bool value = true; +}; + +template< + template<typename, typename> class test, + typename check_against, + typename a, + typename... 
as +> +struct contained_in_list<test, check_against, type_list<a, as...>, false> : contained_in_list<test, check_against, type_list<as...>, test<check_against, a>::value> {}; + +template< + template<typename, typename> class test, + typename check_against + EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, empty) +> +struct contained_in_list<test, check_against, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>, false> { constexpr static bool value = false; }; + +/* see if an element is in a list and check for global flags */ + +template< + template<typename, typename> class test, + typename check_against, + typename h_list, + int default_flags = 0, + bool last_check_positive = false, + int last_check_flags = default_flags +> +struct contained_in_list_gf; + +template< + template<typename, typename> class test, + typename check_against, + typename h_list, + int default_flags, + int last_check_flags +> +struct contained_in_list_gf<test, check_against, h_list, default_flags, true, last_check_flags> +{ + constexpr static bool value = true; + constexpr static int global_flags = last_check_flags; +}; + +template< + template<typename, typename> class test, + typename check_against, + typename a, + typename... as, + int default_flags, + int last_check_flags +> +struct contained_in_list_gf<test, check_against, type_list<a, as...>, default_flags, false, last_check_flags> : contained_in_list_gf<test, check_against, type_list<as...>, default_flags, test<check_against, a>::value, test<check_against, a>::global_flags> {}; + +template< + template<typename, typename> class test, + typename check_against + EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, empty), + int default_flags, + int last_check_flags +> +struct contained_in_list_gf<test, check_against, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>, default_flags, false, last_check_flags> { constexpr static bool value = false; constexpr static int global_flags = default_flags; }; + +/* generic reductions */ + +template< + typename Reducer, + typename... Ts +> struct reduce; + +template< + typename Reducer, + typename A, + typename... Ts +> struct reduce<Reducer, A, Ts...> +{ + constexpr static inline A run(A a, Ts...) { return a; } +}; + +template< + typename Reducer, + typename A, + typename B, + typename... Ts +> struct reduce<Reducer, A, B, Ts...> +{ + constexpr static inline auto run(A a, B b, Ts... 
ts) -> decltype(Reducer::run(a, reduce<Reducer, B, Ts...>::run(b, ts...))) { + return Reducer::run(a, reduce<Reducer, B, Ts...>::run(b, ts...)); + } +}; + +/* generic binary operations */ + +struct sum_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a + b) { return a + b; } }; +struct product_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a * b) { return a * b; } }; + +struct logical_and_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a && b) { return a && b; } }; +struct logical_or_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a || b) { return a || b; } }; + +struct equal_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a == b) { return a == b; } }; +struct not_equal_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a != b) { return a != b; } }; +struct lesser_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a < b) { return a < b; } }; +struct lesser_equal_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a <= b) { return a <= b; } }; +struct greater_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a > b) { return a > b; } }; +struct greater_equal_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a >= b) { return a >= b; } }; + +/* generic unary operations */ + +struct not_op { template<typename A> constexpr static inline auto run(A a) -> decltype(!a) { return !a; } }; +struct negation_op { template<typename A> constexpr static inline auto run(A a) -> decltype(-a) { return -a; } }; +struct greater_equal_zero_op { template<typename A> constexpr static inline auto run(A a) -> decltype(a >= 0) { return a >= 0; } }; + + +/* reductions for lists */ + +// using auto -> return value spec makes ICC 13.0 and 13.1 crash here, so we have to hack it +// together in front... (13.0 doesn't work with array_prod/array_reduce/... anyway, but 13.1 +// does... +template<typename... Ts> +constexpr inline decltype(reduce<product_op, Ts...>::run((*((Ts*)0))...)) arg_prod(Ts... ts) +{ + return reduce<product_op, Ts...>::run(ts...); +} + +template<typename... Ts> +constexpr inline decltype(reduce<sum_op, Ts...>::run((*((Ts*)0))...)) arg_sum(Ts... ts) +{ + return reduce<sum_op, Ts...>::run(ts...); +} + +/* reverse arrays */ + +template<typename Array, int... 
n> +constexpr inline Array h_array_reverse(Array arr, numeric_list<int, n...>) +{ + return {{array_get<sizeof...(n) - n - 1>(arr)...}}; +} + +template<typename T, std::size_t N> +constexpr inline std::array<T, N> array_reverse(std::array<T, N> arr) +{ + return h_array_reverse(arr, typename gen_numeric_list<int, N>::type()); +} + +/* generic array reductions */ + +// can't reuse standard reduce() interface above because Intel's Compiler +// *really* doesn't like it, so we just reimplement the stuff +// (start from N - 1 and work down to 0 because specialization for +// n == N - 1 also doesn't work in Intel's compiler, so it goes into +// an infinite loop) +template<typename Reducer, typename T, std::size_t N, std::size_t n = N - 1> +struct h_array_reduce { + constexpr static inline auto run(std::array<T, N> arr, T identity) -> decltype(Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr), array_get<n>(arr))) + { + return Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr), array_get<n>(arr)); + } +}; + +template<typename Reducer, typename T, std::size_t N> +struct h_array_reduce<Reducer, T, N, 0> +{ + constexpr static inline T run(std::array<T, N> arr, T identity) + { + return array_get<0>(arr); + } +}; + +template<typename Reducer, typename T, std::size_t N> +struct h_array_reduce<Reducer, T, 0> +{ + constexpr static inline T run(std::array<T, 0> arr, T identity) + { + return identity; + } +}; + +template<typename Reducer, typename T, std::size_t N> +constexpr inline auto array_reduce(std::array<T, N> arr, T identity) -> decltype(h_array_reduce<Reducer, T, N>::run(arr)) +{ + return h_array_reduce<Reducer, T, N>::run(arr, identity); +} + +/* standard array reductions */ + +template<typename T, std::size_t N> +constexpr inline auto array_sum(std::array<T, N> arr) -> decltype(array_reduce<sum_op, T, N>(arr)) +{ + return array_reduce<sum_op, T, N>(arr, 0); +} + +template<typename T, std::size_t N> +constexpr inline auto array_prod(std::array<T, N> arr) -> decltype(array_reduce<product_op, T, N>(arr)) +{ + return array_reduce<product_op, T, N>(arr, 1); +} + +/* zip an array */ + +template<typename Op, typename A, typename B, std::size_t N, int... n> +constexpr inline std::array<decltype(Op::run(A(), B())),N> h_array_zip(std::array<A, N> a, std::array<B, N> b, numeric_list<int, n...>) +{ + return std::array<decltype(Op::run(A(), B())),N>{{ Op::run(array_get<n>(a), array_get<n>(b))... }}; +} + +template<typename Op, typename A, typename B, std::size_t N> +constexpr inline std::array<decltype(Op::run(A(), B())),N> array_zip(std::array<A, N> a, std::array<B, N> b) +{ + return h_array_zip<Op>(a, b, typename gen_numeric_list<int, N>::type()); +} + +/* zip an array and reduce the result */ + +template<typename Reducer, typename Op, typename A, typename B, std::size_t N, int... 
n> +constexpr inline auto h_array_zip_and_reduce(std::array<A, N> a, std::array<B, N> b, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...)) +{ + return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...); +} + +template<typename Reducer, typename Op, typename A, typename B, std::size_t N> +constexpr inline auto array_zip_and_reduce(std::array<A, N> a, std::array<B, N> b) -> decltype(h_array_zip_and_reduce<Reducer, Op, A, B, N>(a, b, typename gen_numeric_list<int, N>::type())) +{ + return h_array_zip_and_reduce<Reducer, Op, A, B, N>(a, b, typename gen_numeric_list<int, N>::type()); +} + +/* apply stuff to an array */ + +template<typename Op, typename A, std::size_t N, int... n> +constexpr inline std::array<decltype(Op::run(A())),N> h_array_apply(std::array<A, N> a, numeric_list<int, n...>) +{ + return std::array<decltype(Op::run(A())),N>{{ Op::run(array_get<n>(a))... }}; +} + +template<typename Op, typename A, std::size_t N> +constexpr inline std::array<decltype(Op::run(A())),N> array_apply(std::array<A, N> a) +{ + return h_array_apply<Op>(a, typename gen_numeric_list<int, N>::type()); +} + +/* apply stuff to an array and reduce */ + +template<typename Reducer, typename Op, typename A, std::size_t N, int... n> +constexpr inline auto h_array_apply_and_reduce(std::array<A, N> arr, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...)) +{ + return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...); +} + +template<typename Reducer, typename Op, typename A, std::size_t N> +constexpr inline auto array_apply_and_reduce(std::array<A, N> a) -> decltype(h_array_apply_and_reduce<Reducer, Op, A, N>(a, typename gen_numeric_list<int, N>::type())) +{ + return h_array_apply_and_reduce<Reducer, Op, A, N>(a, typename gen_numeric_list<int, N>::type()); +} + +/* repeat a value n times (and make an array out of it + * usage: + * std::array<int, 16> = repeat<16>(42); + */ + +template<int n> +struct h_repeat +{ + template<typename t, int... ii> + constexpr static inline std::array<t, n> run(t v, numeric_list<int, ii...>) + { + return {{ typename id_numeric<int, ii, t>::type(v)... }}; + } +}; + +template<int n, typename t> +constexpr std::array<t, n> repeat(t v) { return h_repeat<n>::run(v, typename gen_numeric_list<int, n>::type()); } + +/* instantiate a class by a C-style array */ +template<class InstType, typename ArrType, std::size_t N, bool Reverse, typename... Ps> +struct h_instantiate_by_c_array; + +template<class InstType, typename ArrType, std::size_t N, typename... Ps> +struct h_instantiate_by_c_array<InstType, ArrType, N, false, Ps...> +{ + static InstType run(ArrType* arr, Ps... args) + { + return h_instantiate_by_c_array<InstType, ArrType, N - 1, false, Ps..., ArrType>::run(arr + 1, args..., arr[0]); + } +}; + +template<class InstType, typename ArrType, std::size_t N, typename... Ps> +struct h_instantiate_by_c_array<InstType, ArrType, N, true, Ps...> +{ + static InstType run(ArrType* arr, Ps... args) + { + return h_instantiate_by_c_array<InstType, ArrType, N - 1, false, ArrType, Ps...>::run(arr + 1, arr[0], args...); + } +}; + +template<class InstType, typename ArrType, typename... 
Ps> +struct h_instantiate_by_c_array<InstType, ArrType, 0, false, Ps...> +{ + static InstType run(ArrType* arr, Ps... args) + { + (void)arr; + return InstType(args...); + } +}; + +template<class InstType, typename ArrType, typename... Ps> +struct h_instantiate_by_c_array<InstType, ArrType, 0, true, Ps...> +{ + static InstType run(ArrType* arr, Ps... args) + { + (void)arr; + return InstType(args...); + } +}; + +template<class InstType, typename ArrType, std::size_t N, bool Reverse = false> +InstType instantiate_by_c_array(ArrType* arr) +{ + return h_instantiate_by_c_array<InstType, ArrType, N, Reverse>::run(arr); +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11META_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h new file mode 100644 index 0000000000..a590cf4e18 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h @@ -0,0 +1,116 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler <christian@iwakd.de> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11WORKAROUNDS_H +#define EIGEN_CXX11WORKAROUNDS_H + +/* COMPATIBILITY CHECKS + * (so users of compilers that are too old get some realistic error messages) + */ +#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1310) +#error Intel Compiler only supports required C++ features since version 13.1. +// note that most stuff in principle works with 13.0 but when combining +// some features, at some point 13.0 will just fail with an internal assertion +#elif defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)) +// G++ < 4.6 by default will continue processing the source files - even if we use #error to make +// it error out. For this reason, we use the pragma to make sure G++ aborts at the first error +// it sees. Unfortunately, that is still not our #error directive, but at least the output is +// short enough the user has a chance to see that the compiler version is not sufficient for +// the funky template mojo we use. +#pragma GCC diagnostic error "-Wfatal-errors" +#error GNU C++ Compiler (g++) only supports required C++ features since version 4.6. +#endif + +/* Check that the compiler at least claims to support C++11. It might not be sufficient + * because the compiler may not implement it correctly, but at least we'll know. + */ +#if __cplusplus <= 199711L +#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) +#pragma GCC diagnostic error "-Wfatal-errors" +#endif +#error This library needs at least a C++11 compliant compiler. If you use g++/clang, please enable the -std=c++11 compiler flag. (-std=c++0x on older versions.) 
+#endif + +namespace Eigen { + +// Use std::array as Eigen array +template <typename T, std::size_t N> using array = std::array<T, N>; + +namespace internal { + +/* std::get is only constexpr in C++14, not yet in C++11 + * - libstdc++ from version 4.7 onwards has it nevertheless, + * so use that + * - libstdc++ older versions: use _M_instance directly + * - libc++ all versions so far: use __elems_ directly + * - all other libs: use std::get to be portable, but + * this may not be constexpr + */ +#if defined(__GLIBCXX__) && __GLIBCXX__ < 20120322 +#define STD_GET_ARR_HACK a._M_instance[I] +#elif defined(_LIBCPP_VERSION) +#define STD_GET_ARR_HACK a.__elems_[I] +#else +#define STD_GET_ARR_HACK std::template get<I, T, N>(a) +#endif + +template<std::size_t I, class T, std::size_t N> constexpr inline T& array_get(std::array<T,N>& a) { return (T&) STD_GET_ARR_HACK; } +template<std::size_t I, class T, std::size_t N> constexpr inline T&& array_get(std::array<T,N>&& a) { return (T&&) STD_GET_ARR_HACK; } +template<std::size_t I, class T, std::size_t N> constexpr inline T const& array_get(std::array<T,N> const& a) { return (T const&) STD_GET_ARR_HACK; } + +template<std::size_t I, class T> constexpr inline T& array_get(std::vector<T>& a) { return a[I]; } +template<std::size_t I, class T> constexpr inline T&& array_get(std::vector<T>&& a) { return a[I]; } +template<std::size_t I, class T> constexpr inline T const& array_get(std::vector<T> const& a) { return a[I]; } + +#undef STD_GET_ARR_HACK + +template <typename T> struct array_size; +template<class T, std::size_t N> struct array_size<const std::array<T,N> > { + static const size_t value = N; +}; +template <typename T> struct array_size; +template<class T, std::size_t N> struct array_size<std::array<T,N> > { + static const size_t value = N; +}; + +/* Suppose you have a template of the form + * template<typename T> struct X; + * And you want to specialize it in such a way: + * template<typename S1, typename... SN> struct X<Foo<S1, SN...>> { ::: }; + * template<> struct X<Foo<>> { ::: }; + * This will work in Intel's compiler 13.0, but only to some extent in g++ 4.6, since + * g++ can only match templates called with parameter packs if the number of template + * arguments is not a fixed size (so inside the first specialization, referencing + * X<Foo<Sn...>> will fail in g++). On the other hand, g++ will accept the following: + * template<typename S...> struct X<Foo<S...>> { ::: }: + * as an additional (!) specialization, which will then only match the empty case. + * But Intel's compiler 13.0 won't accept that, it will only accept the empty syntax, + * so we have to create a workaround for this. + */ +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) +#define EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n) mt... n +#define EIGEN_TPL_PP_SPEC_HACK_DEFC(mt, n) , EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n) +#define EIGEN_TPL_PP_SPEC_HACK_USE(n) n... +#define EIGEN_TPL_PP_SPEC_HACK_USEC(n) , n... 
+#else +#define EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n) +#define EIGEN_TPL_PP_SPEC_HACK_DEFC(mt, n) +#define EIGEN_TPL_PP_SPEC_HACK_USE(n) +#define EIGEN_TPL_PP_SPEC_HACK_USEC(n) +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11WORKAROUNDS_H + +/* + * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; + */ diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h new file mode 100644 index 0000000000..a1e1dca8e1 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h @@ -0,0 +1,456 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_EMULATE_CXX11_META_H +#define EIGEN_EMULATE_CXX11_META_H + + + +namespace Eigen { + +// The array class is only available starting with cxx11. Emulate our own here +// if needed +template <typename T, size_t n> class array { + public: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& operator[] (size_t index) { return values[index]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { return values[index]; } + + static EIGEN_ALWAYS_INLINE std::size_t size() { return n; } + + T values[n]; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array() { } + explicit EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v) { + EIGEN_STATIC_ASSERT(n==1, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2) { + EIGEN_STATIC_ASSERT(n==2, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3) { + EIGEN_STATIC_ASSERT(n==3, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, + const T& v4) { + EIGEN_STATIC_ASSERT(n==4, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5) { + EIGEN_STATIC_ASSERT(n==5, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5, const T& v6) { + EIGEN_STATIC_ASSERT(n==6, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + values[5] = v6; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5, const T& v6, const T& v7) { + EIGEN_STATIC_ASSERT(n==7, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + values[5] = v6; + values[6] = v7; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array( + const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5, const T& v6, const T& v7, const T& v8) { + EIGEN_STATIC_ASSERT(n==8, 
YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + values[5] = v6; + values[6] = v7; + values[7] = v8; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + array(std::initializer_list<T> l) { + eigen_assert(l.size() == n); + internal::smart_copy(l.begin(), l.end(), values); + } +#endif +}; + +// Specialize array for zero size +template <typename T> class array<T, 0> { + public: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& operator[] (size_t index) { + eigen_assert(false && "Can't index a zero size array"); + return *static_cast<T*>(NULL); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { + eigen_assert(false && "Can't index a zero size array"); + return *static_cast<const T*>(NULL); + } + + static EIGEN_ALWAYS_INLINE std::size_t size() { return 0; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array() { } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + array(std::initializer_list<T> l) { + eigen_assert(l.size() == 0); + } +#endif +}; + +namespace internal { + +/** \internal + * \file CXX11/Core/util/EmulateCXX11Meta.h + * This file emulates a subset of the functionality provided by CXXMeta.h for + * compilers that don't yet support cxx11 such as nvcc. + */ + +struct empty_list { static const std::size_t count = 0; }; + +template<typename T, typename Tail=empty_list> struct type_list { + typedef T HeadType; + typedef Tail TailType; + static const T head; + static const Tail tail; + static const std::size_t count = 1 + Tail::count; +}; + +struct null_type { }; + +template<typename T1 = null_type, typename T2 = null_type, typename T3 = null_type, + typename T4 = null_type, typename T5 = null_type, typename T6 = null_type, + typename T7 = null_type, typename T8 = null_type> +struct make_type_list { + typedef typename make_type_list<T2, T3, T4, T5, T6, T7, T8>::type tailresult; + + typedef type_list<T1, tailresult> type; +}; + +template<> struct make_type_list<> { + typedef empty_list type; +}; + + +template <std::size_t index, class TList> struct get_type; + +template <class Head, class Tail> +struct get_type<0, type_list<Head, Tail> > +{ + typedef Head type; +}; + +template <std::size_t i, class Head, class Tail> +struct get_type<i, type_list<Head, Tail> > +{ + typedef typename get_type<i-1, Tail>::type type; +}; + + +/* numeric list */ +template <typename T, T n> +struct type2val { + typedef T type; + static const T value = n; +}; + + +template<typename T, size_t n, T V> struct gen_numeric_list_repeated; + +template<typename T, T V> struct gen_numeric_list_repeated<T, 1, V> { + typedef typename make_type_list<type2val<T, V> >::type type; +}; + +template<typename T, T V> struct gen_numeric_list_repeated<T, 2, V> { + typedef typename make_type_list<type2val<T, V>, type2val<T, V> >::type type; +}; + +template<typename T, T V> struct gen_numeric_list_repeated<T, 3, V> { + typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type; +}; + +template<typename T, T V> struct gen_numeric_list_repeated<T, 4, V> { + typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type; +}; + +template<typename T, T V> struct gen_numeric_list_repeated<T, 5, V> { + typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type; +}; + +template<typename T, T V> struct gen_numeric_list_repeated<T, 6, V> { + typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, 
V>, + type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type; +}; + +template<typename T, T V> struct gen_numeric_list_repeated<T, 7, V> { + typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>, + type2val<T, V>, type2val<T, V>, type2val<T, V>, + type2val<T, V> >::type type; +}; + +template<typename T, T V> struct gen_numeric_list_repeated<T, 8, V> { + typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>, + type2val<T, V>, type2val<T, V>, type2val<T, V>, + type2val<T, V>, type2val<T, V> >::type type; +}; + + +template <std::size_t index, class NList> struct get; + +template <std::size_t i> +struct get<i, empty_list> +{ + get() { eigen_assert(false && "index overflow"); } + typedef void type; + static const char value = '\0'; +}; + +template <std::size_t i, class Head> +struct get<i, type_list<Head, empty_list> > +{ + get() { eigen_assert(false && "index overflow"); } + typedef void type; + static const char value = '\0'; +}; + +template <class Head> +struct get<0, type_list<Head, empty_list> > +{ + typedef typename Head::type type; + static const type value = Head::value; +}; + +template <class Head, class Tail> +struct get<0, type_list<Head, Tail> > +{ + typedef typename Head::type type; + static const type value = Head::value; +}; + +template <std::size_t i, class Head, class Tail> +struct get<i, type_list<Head, Tail> > +{ + typedef typename Tail::HeadType::type type; + static const type value = get<i-1, Tail>::value; +}; + + +template <class NList> struct arg_prod { + static const typename NList::HeadType::type value = get<0, NList>::value * arg_prod<typename NList::TailType>::value; +}; +template <> struct arg_prod<empty_list> { + static const int value = 1; +}; + + +template<int n, typename t> +array<t, n> repeat(t v) { + array<t, n> array; + array.fill(v); + return array; +} + +template<std::size_t I, class Head, class Tail> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(type_list<Head, Tail>& a) { + return get<I, type_list<Head, Tail> >::value; +} +template<std::size_t I, class Head, class Tail> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(const type_list<Head, Tail>& a) { + return get<I, type_list<Head, Tail> >::value; +} + +template <class NList> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NList::HeadType::type array_prod(const NList& l) { + return arg_prod<NList>::value; +}; + +template<std::size_t n, typename t> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array<t, n>& a) { + t prod = 1; + for (size_t i = 0; i < n; ++i) { prod *= a[i]; } + return prod; +} + +template<typename t> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector<t>& a) { + t prod = 1; + for (size_t i = 0; i < a.size(); ++i) { prod *= a[i]; } + return prod; +} + +template<std::size_t I, class T, std::size_t N> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array<T,N>& a) { + return a[I]; +} +template<std::size_t I, class T, std::size_t N> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array<T,N>& a) { + return a[I]; +} + +template<std::size_t I, class T> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(std::vector<T>& a) { + return a[I]; +} +template<std::size_t I, class T> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const std::vector<T>& a) { + return a[I]; +} + +template <typename T> struct array_size; +template<class T, std::size_t N> struct array_size<array<T,N> > { + static const size_t value = N; +}; +template <typename T> struct 
array_size; +template<class T, std::size_t N> struct array_size<array<T,N>& > { + static const size_t value = N; +}; +template <typename T> struct array_size; +template<class T, std::size_t N> struct array_size<const array<T,N> > { + static const size_t value = N; +}; +template <typename T> struct array_size; +template<class T, std::size_t N> struct array_size<const array<T,N>& > { + static const size_t value = N; +}; + +struct sum_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a + b; } +}; +struct product_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a * b; } +}; + +struct logical_and_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a && b; } +}; +struct logical_or_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a || b; } +}; + +struct equal_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a == b; } +}; +struct not_equal_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a != b; } +}; +struct lesser_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a < b; } +}; +struct lesser_equal_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a <= b; } +}; + +struct greater_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a > b; } +}; +struct greater_equal_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a >= b; } +}; + +struct not_op { + template<typename A> static inline bool run(A a) { return !a; } +}; +struct negation_op { + template<typename A> static inline bool run(A a) { return -a; } +}; +struct greater_equal_zero_op { + template<typename A> static inline bool run(A a) { return a >= 0; } +}; + + +template<typename Reducer, typename Op, typename A, std::size_t N> +struct ArrayApplyAndReduce { + static inline bool run(const array<A, N>& a) { + EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE); + bool result = Reducer::run(Op::run(a[0]), Op::run(a[1])); + for (size_t i = 2; i < N; ++i) { + result = Reducer::run(result, Op::run(a[i])); + } + return result; + } +}; + +template<typename Reducer, typename Op, typename A> +struct ArrayApplyAndReduce<Reducer, Op, A, 1> { + static inline bool run(const array<A, 1>& a) { + return Op::run(a[0]); + } +}; + +template<typename Reducer, typename Op, typename A, std::size_t N> +inline bool array_apply_and_reduce(const array<A, N>& a) { + return ArrayApplyAndReduce<Reducer, Op, A, N>::run(a); +} + +template<typename Reducer, typename Op, typename A, typename B, std::size_t N> +struct ArrayZipAndReduce { + static inline bool run(const array<A, N>& a, const array<B, N>& b) { + EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE); + bool result = Reducer::run(Op::run(a[0], b[0]), Op::run(a[1], b[1])); + for (size_t i = 2; i < N; ++i) { + result = Reducer::run(result, Op::run(a[i], b[i])); + } + return result; + } +}; + +template<typename Reducer, typename Op, typename A, typename B> +struct ArrayZipAndReduce<Reducer, Op, A, B, 1> { + static inline bool run(const array<A, 1>& a, const array<B, 1>& b) { + return Op::run(a[0], b[0]); + } +}; + +template<typename Reducer, typename Op, typename A, typename B, std::size_t N> +inline bool array_zip_and_reduce(const array<A, N>& a, const array<B, N>& b) { + return ArrayZipAndReduce<Reducer, Op, A, B, N>::run(a, b); +} + +} // end namespace internal + +} // end namespace Eigen + 
+ + +#endif // EIGEN_EMULATE_CXX11_META_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Core/util/FixedSizeVector.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Core/util/FixedSizeVector.h new file mode 100644 index 0000000000..c68119aa03 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Core/util/FixedSizeVector.h @@ -0,0 +1,128 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_FIXEDSIZEVECTOR_H +#define EIGEN_FIXEDSIZEVECTOR_H + +namespace Eigen { + +/** \class FixedSizeVector + * \ingroup Core + * + * \brief The FixedSizeVector class. + * + * The %FixedSizeVector provides a subset of std::vector functionality. + * + * The goal is to provide basic std::vector operations when using + * std::vector is not an option (e.g. on GPU or when compiling using + * FMA/AVX, as this can cause either compilation failures or illegal + * instruction failures). + * + */ +template <typename T> +class FixedSizeVector { + public: + // Construct a new FixedSizeVector, reserve n elements. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit FixedSizeVector(size_t n) + : reserve_(n), size_(0), + data_(static_cast<T*>(internal::aligned_malloc(n * sizeof(T)))) { + for (size_t i = 0; i < n; ++i) { new (&data_[i]) T; } + } + + // Construct a new FixedSizeVector, reserve and resize to n. + // Copy the init value to all elements. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit FixedSizeVector(size_t n, const T& init) + : reserve_(n), size_(n), + data_(static_cast<T*>(internal::aligned_malloc(n * sizeof(T)))) { + for (size_t i = 0; i < n; ++i) { new (&data_[i]) T(init); } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + ~FixedSizeVector() { + for (size_t i = 0; i < size_; ++i) { + data_[i].~T(); + } + internal::aligned_free(data_); + } + + // Append new elements (up to reserved size). + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void push_back(const T& t) { + eigen_assert(size_ < reserve_); + data_[size_++] = t; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const T& operator[] (size_t i) const { + eigen_assert(i < size_); + return data_[i]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + T& operator[] (size_t i) { + eigen_assert(i < size_); + return data_[i]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + T& back() { + eigen_assert(size_ > 0); + return data_[size_ - 1]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const T& back() const { + eigen_assert(size_ > 0); + return data_[size_ - 1]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void pop_back() { + // NOTE: This does not destroy the value at the end the way + // std::vector's version of pop_back() does. That happens when + // the Vector is destroyed. 
+ eigen_assert(size_ > 0); + size_--; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + size_t size() const { return size_; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + bool empty() const { return size_ == 0; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + T* data() { return data_; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const T* data() const { return data_; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + T* begin() { return data_; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + T* end() { return data_ + size_; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const T* begin() const { return data_; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const T* end() const { return data_ + size_; } + + private: + size_t reserve_; + size_t size_; + T* data_; +}; + +} // namespace Eigen + +#endif // EIGEN_FIXEDSIZEVECTOR_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h new file mode 100644 index 0000000000..564729ce48 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h @@ -0,0 +1,341 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_FIXED_POINT_TYPES_H +#define EIGEN_CXX11_FIXED_POINT_TYPES_H + +#include <cmath> +#include <iostream> + +namespace Eigen { + +// The mantissa part of the fixed point representation. See +// go/tensorfixedpoint for details +struct QInt8; +struct QUInt8; +struct QInt16; +struct QUInt16; +struct QInt32; + +template <> +struct NumTraits<QInt8> : GenericNumTraits<int8_t> {}; +template <> +struct NumTraits<QUInt8> : GenericNumTraits<uint8_t> {}; +template <> +struct NumTraits<QInt16> : GenericNumTraits<int16_t> {}; +template <> +struct NumTraits<QUInt16> : GenericNumTraits<uint16_t> {}; +template <> +struct NumTraits<QInt32> : GenericNumTraits<int32_t> {}; + +namespace internal { +template <> +struct scalar_product_traits<QInt32, double> { + enum { + // Cost = NumTraits<T>::MulCost, + Defined = 1 + }; + typedef QInt32 ReturnType; +}; +} + +// Wrap the 8bit int into a QInt8 struct instead of using a typedef to prevent +// the compiler from silently type cast the mantissa into a bigger or a smaller +// representation. 
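// Editorial illustration (not part of the patch) of the silent-cast problem a
// bare typedef would leave open; QInt8Raw below is a hypothetical name used
// only for contrast:
//   typedef int8_t QInt8Raw;
//   QInt8Raw a = 100, b = 100;
//   QInt8Raw p = a * b;   // promoted to int (10000), then silently truncated
//                         // back to 8 bits when stored
// Wrapping the mantissa in the structs below means every narrowing has to go
// through the explicit, saturating QInt32 conversions defined further down.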
+struct QInt8 { + QInt8() {} + QInt8(const int8_t v) : value(v) {} + QInt8(const QInt32 v); + + operator int() const { return static_cast<int>(value); } + + int8_t value; +}; + +struct QUInt8 { + QUInt8() {} + QUInt8(const uint8_t v) : value(v) {} + QUInt8(const QInt32 v); + + operator int() const { return static_cast<int>(value); } + + uint8_t value; +}; + +struct QInt16 { + QInt16() {} + QInt16(const int16_t v) : value(v) {} + QInt16(const QInt32 v); + operator int() const { return static_cast<int>(value); } + + int16_t value; +}; + +struct QUInt16 { + QUInt16() {} + QUInt16(const uint16_t v) : value(v) {} + QUInt16(const QInt32 v); + operator int() const { return static_cast<int>(value); } + + uint16_t value; +}; + +struct QInt32 { + QInt32() {} + QInt32(const int8_t v) : value(v) {} + QInt32(const int32_t v) : value(v) {} + QInt32(const QInt8 v) : value(v.value) {} + QInt32(const float v) : value(static_cast<int32_t>(lrint(v))) {} +#ifdef EIGEN_MAKING_DOCS + // Workaround to fix build on PPC. + QInt32(unsigned long v) : value(v) {} +#endif + + operator float() const { return static_cast<float>(value); } + + int32_t value; +}; + +EIGEN_STRONG_INLINE QInt8::QInt8(const QInt32 v) + : value(v.value > 127 ? 127 : (v.value < -128 ? -128 : v.value)) {} +EIGEN_STRONG_INLINE QUInt8::QUInt8(const QInt32 v) + : value(v.value > 255 ? 255 : (v.value < 0 ? 0 : v.value)) {} +EIGEN_STRONG_INLINE QInt16::QInt16(const QInt32 v) + : value(v.value > 32767 ? 32767 : (v.value < -32768 ? -32768 : v.value)) {} +EIGEN_STRONG_INLINE QUInt16::QUInt16(const QInt32 v) + : value(v.value > 65535 ? 65535 : (v.value < 0 ? 0 : v.value)) {} + +// Basic widening 8-bit operations: This will be vectorized in future CLs. +EIGEN_STRONG_INLINE QInt32 operator*(const QInt8 a, const QInt8 b) { + return QInt32(static_cast<int32_t>(a.value) * static_cast<int32_t>(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator*(const QInt8 a, const QUInt8 b) { + return QInt32(static_cast<int32_t>(a.value) * static_cast<int32_t>(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator+(const QInt8 a, const QInt8 b) { + return QInt32(static_cast<int32_t>(a.value) + static_cast<int32_t>(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt8 a, const QInt8 b) { + return QInt32(static_cast<int32_t>(a.value) - static_cast<int32_t>(b.value)); +} + +// Basic widening 16-bit operations: This will be vectorized in future CLs. +EIGEN_STRONG_INLINE QInt32 operator*(const QInt16 a, const QInt16 b) { + return QInt32(static_cast<int32_t>(a.value) * static_cast<int32_t>(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator*(const QInt16 a, const QUInt16 b) { + return QInt32(static_cast<int32_t>(a.value) * static_cast<int32_t>(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator+(const QInt16 a, const QInt16 b) { + return QInt32(static_cast<int32_t>(a.value) + static_cast<int32_t>(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt16 a, const QInt16 b) { + return QInt32(static_cast<int32_t>(a.value) - static_cast<int32_t>(b.value)); +} + +// Mixed QInt32 op QInt8 operations. This will be vectorized in future CLs. 
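// Editorial sketch (not part of the patch) of what the widening and saturating
// pieces above give you in practice:
//   QInt8  a(100), b(100);
//   QInt32 p = a * b;      // widening product: 10000 is kept exactly
//   QInt8  q(p);           // narrowing goes through QInt8(QInt32): saturates to 127
//   QUInt8 r(QInt32(-5));  // likewise clamps to 0 instead of wrapping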
+EIGEN_STRONG_INLINE QInt32 operator+(const QInt32 a, const QInt8 b) { + return QInt32(a.value + static_cast<int32_t>(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator+(const QInt8 a, const QInt32 b) { + return QInt32(static_cast<int32_t>(a.value) + b.value); +} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a, const QInt8 b) { + return QInt32(a.value - static_cast<int32_t>(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt8 a, const QInt32 b) { + return QInt32(static_cast<int32_t>(a.value) - b.value); +} +EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const QInt8 b) { + return QInt32(a.value * static_cast<int32_t>(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator*(const QInt8 a, const QInt32 b) { + return QInt32(static_cast<int32_t>(a.value) * b.value); +} + +// Mixed QInt32 op QInt16 operations. This will be vectorized in future CLs. +EIGEN_STRONG_INLINE QInt32 operator+(const QInt32 a, const QInt16 b) { + return QInt32(a.value + static_cast<int32_t>(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator+(const QInt16 a, const QInt32 b) { + return QInt32(static_cast<int32_t>(a.value) + b.value); +} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a, const QInt16 b) { + return QInt32(a.value - static_cast<int32_t>(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt16 a, const QInt32 b) { + return QInt32(static_cast<int32_t>(a.value) - b.value); +} +EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const QInt16 b) { + return QInt32(a.value * static_cast<int32_t>(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator*(const QInt16 a, const QInt32 b) { + return QInt32(static_cast<int32_t>(a.value) * b.value); +} + +// Mixed QInt32 op QUInt8 operations. This will be vectorized in future CLs. +EIGEN_STRONG_INLINE QInt32 operator+(const QInt32 a, const QUInt8 b) { + return QInt32(a.value + static_cast<int32_t>(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator+(const QUInt8 a, const QInt32 b) { + return QInt32(static_cast<int32_t>(a.value) + b.value); +} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a, const QUInt8 b) { + return QInt32(a.value - static_cast<int32_t>(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator-(const QUInt8 a, const QInt32 b) { + return QInt32(static_cast<int32_t>(a.value) - b.value); +} +EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const QUInt8 b) { + return QInt32(a.value * static_cast<int32_t>(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator*(const QUInt8 a, const QInt32 b) { + return QInt32(static_cast<int32_t>(a.value) * b.value); +} + +// Mixed QInt32 op QUInt16 operations. This will be vectorized in future CLs. +EIGEN_STRONG_INLINE QInt32 operator+(const QInt32 a, const QUInt16 b) { + return QInt32(a.value + static_cast<int32_t>(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator+(const QUInt16 a, const QInt32 b) { + return QInt32(static_cast<int32_t>(a.value) + b.value); +} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a, const QUInt16 b) { + return QInt32(a.value - static_cast<int32_t>(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator-(const QUInt16 a, const QInt32 b) { + return QInt32(static_cast<int32_t>(a.value) - b.value); +} +EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const QUInt16 b) { + return QInt32(a.value * static_cast<int32_t>(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator*(const QUInt16 a, const QInt32 b) { + return QInt32(static_cast<int32_t>(a.value) * b.value); +} + +// Basic arithmetic operations on QInt32, which behaves like a int32_t. 
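// Editorial note (not part of the patch): unlike the narrowing constructors
// above, none of the QInt32 operators below saturate; they perform the plain
// int32_t arithmetic of the underlying mantissa, so e.g.
//   QInt32 x(0x7FFFFFFF);
//   QInt32 y = x + QInt32(1);   // overflows exactly as a raw int32_t addition would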
+EIGEN_STRONG_INLINE QInt32 operator+(const QInt32 a, const QInt32 b) { + return a.value + b.value; +} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a, const QInt32 b) { + return a.value - b.value; +} +EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const QInt32 b) { + return a.value * b.value; +} +EIGEN_STRONG_INLINE QInt32 operator/(const QInt32 a, const QInt32 b) { + return a.value / b.value; +} +EIGEN_STRONG_INLINE QInt32& operator+=(QInt32& a, const QInt32 b) { + a.value += b.value; + return a; +} +EIGEN_STRONG_INLINE QInt32& operator-=(QInt32& a, const QInt32 b) { + a.value -= b.value; + return a; +} +EIGEN_STRONG_INLINE QInt32& operator*=(QInt32& a, const QInt32 b) { + a.value *= b.value; + return a; +} +EIGEN_STRONG_INLINE QInt32& operator/=(QInt32& a, const QInt32 b) { + a.value /= b.value; + return a; +} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a) { + return -a.value; +} + +// Scaling QInt32 by double. We do the arithmetic in double because +// float only has 23 bits of mantissa, so casting QInt32 to float might reduce +// accuracy by discarding up to 7 (least significant) bits. +EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const double b) { + return static_cast<int32_t>(lrint(static_cast<double>(a.value) * b)); +} +EIGEN_STRONG_INLINE QInt32 operator*(const double a, const QInt32 b) { + return static_cast<int32_t>(lrint(a * static_cast<double>(b.value))); +} +EIGEN_STRONG_INLINE QInt32& operator*=(QInt32& a, const double b) { + a.value = static_cast<int32_t>(lrint(static_cast<double>(a.value) * b)); + return a; +} + +// Comparisons +EIGEN_STRONG_INLINE bool operator==(const QInt8 a, const QInt8 b) { + return a.value == b.value; +} +EIGEN_STRONG_INLINE bool operator==(const QUInt8 a, const QUInt8 b) { + return a.value == b.value; +} +EIGEN_STRONG_INLINE bool operator==(const QInt16 a, const QInt16 b) { + return a.value == b.value; +} +EIGEN_STRONG_INLINE bool operator==(const QUInt16 a, const QUInt16 b) { + return a.value == b.value; +} +EIGEN_STRONG_INLINE bool operator==(const QInt32 a, const QInt32 b) { + return a.value == b.value; +} + +EIGEN_STRONG_INLINE bool operator<(const QInt8 a, const QInt8 b) { + return a.value < b.value; +} +EIGEN_STRONG_INLINE bool operator<(const QUInt8 a, const QUInt8 b) { + return a.value < b.value; +} +EIGEN_STRONG_INLINE bool operator<(const QInt16 a, const QInt16 b) { + return a.value < b.value; +} +EIGEN_STRONG_INLINE bool operator<(const QUInt16 a, const QUInt16 b) { + return a.value < b.value; +} +EIGEN_STRONG_INLINE bool operator<(const QInt32 a, const QInt32 b) { + return a.value < b.value; +} + +EIGEN_STRONG_INLINE bool operator>(const QInt8 a, const QInt8 b) { + return a.value > b.value; +} +EIGEN_STRONG_INLINE bool operator>(const QUInt8 a, const QUInt8 b) { + return a.value > b.value; +} +EIGEN_STRONG_INLINE bool operator>(const QInt16 a, const QInt16 b) { + return a.value > b.value; +} +EIGEN_STRONG_INLINE bool operator>(const QUInt16 a, const QUInt16 b) { + return a.value > b.value; +} +EIGEN_STRONG_INLINE bool operator>(const QInt32 a, const QInt32 b) { + return a.value > b.value; +} + +EIGEN_STRONG_INLINE std::ostream& operator<<(std::ostream& os, QInt8 a) { + os << static_cast<int>(a.value); + return os; +} +EIGEN_STRONG_INLINE std::ostream& operator<<(std::ostream& os, QUInt8 a) { + os << static_cast<int>(a.value); + return os; +} +EIGEN_STRONG_INLINE std::ostream& operator<<(std::ostream& os, QInt16 a) { + os << static_cast<int>(a.value); + return os; +} +EIGEN_STRONG_INLINE std::ostream& 
operator<<(std::ostream& os, QUInt16 a) { + os << static_cast<int>(a.value); + return os; +} +EIGEN_STRONG_INLINE std::ostream& operator<<(std::ostream& os, QInt32 a) { + os << a.value; + return os; +} + +} // namespace Eigen + +#endif // EIGEN_CXX11_FIXED_POINT_TYPES_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h new file mode 100644 index 0000000000..4d0dca07df --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h @@ -0,0 +1,255 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_H +#define EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_H + + +namespace Eigen { +namespace internal { + +// Accumulate the product of 2 QInt8 inputs on 32 bits to prevent +// overflows +template<> struct scalar_product_traits<QInt8, QInt8> +{ + enum { + Defined = 1 + }; + typedef QInt32 ReturnType; +}; + +// Accumulate the product of QInt8 inputs with QUint8 inputs on 32 bits +// to prevent overflows +template<> struct scalar_product_traits<QInt8, QUInt8> +{ + enum { + Defined = 1 + }; + typedef QInt32 ReturnType; +}; + +// Description of the product implementation. It's pretty simple now since +// nothing is vectorized yet. +// This definition tackle the case where both lhs and rhs are encoded using +// signed 8bit integers +#ifndef EIGEN_USE_OPTIMIZED_INT8_INT8_MAT_MAT_PRODUCT + +template<bool _ConjLhs, bool _ConjRhs> +class gebp_traits<QInt8, QInt8, _ConjLhs, _ConjRhs> +{ +public: + typedef QInt8 LhsScalar; + typedef QInt8 RhsScalar; + typedef QInt32 ResScalar; + + enum { + // register block size along the M and N directions + // One for the current implementation + nr = 1, + mr = 1, + // Progress made at each iteration of the product loop + // also 1 for the current implementation + LhsProgress = 1, + RhsProgress = 1 + }; +}; + +// The signed 8bit Mat-Mat product itself. 
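// Editorial sketch (not part of the patch) of why ResScalar is QInt32: a single
// signed 8-bit product already needs more than 8 bits, and the kernel below
// accumulates `depth` of them into each output, e.g.
//   (-128) * (-128) = 16384          // outside int8_t range, and two such terms
//                                    // already exceed int16_t
//   depth = 1024  ->  |accumulator| <= 1024 * 16384 = 16,777,216
// which still fits comfortably in a signed 32-bit accumulator.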
+template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> +struct gebp_kernel<QInt8, QInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +{ + EIGEN_DONT_INLINE + void operator()(const DataMapper& res, const QInt8* blockA, const QInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> +EIGEN_DONT_INLINE +void gebp_kernel<QInt8, QInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +::operator()(const DataMapper& res, const QInt8* blockA, const QInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) +{ + EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + + eigen_assert(alpha.value == 1); + eigen_assert(strideA == -1); + eigen_assert(strideB == -1); + eigen_assert(offsetA == 0); + eigen_assert(offsetB == 0); + + eigen_assert(rows > 0); + eigen_assert(cols > 0); + eigen_assert(depth > 0); + eigen_assert(blockA); + eigen_assert(blockB); + + for (Index j = 0; j < cols; ++j) { + Index startB = j * depth; + + for (Index i = 0; i < rows; ++i) { + Index startA = i * depth; + + for (Index k = 0; k < depth; ++k) { + res(i, j) += blockA[startA + k] * blockB[startB + k]; + } + } + } +} +#endif + + +// This definition tackle the case where the lhs is encoded using signed 8bit +// integers and the rhs using unsigned 8bit integers. +#ifndef EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT +template<bool _ConjLhs, bool _ConjRhs> +class gebp_traits<QInt8, QUInt8, _ConjLhs, _ConjRhs> +{ +public: + typedef QInt8 LhsScalar; + typedef QUInt8 RhsScalar; + typedef QInt32 ResScalar; + + enum { + // register block size along the M and N directions + // One for the current implementation + nr = 1, + mr = 1, + // Progress made at each iteration of the product loop + // also 1 for the current implementation + LhsProgress = 1, + RhsProgress = 1 + }; +}; + +// Mat-Mat product of a signed 8bit lhs with an unsigned 8bit rhs +template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> +struct gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +{ + EIGEN_DONT_INLINE + void operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> +EIGEN_DONT_INLINE +void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +::operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) +{ + EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + + eigen_assert(alpha.value == 1); + eigen_assert(strideA == -1); + eigen_assert(strideB == -1); + eigen_assert(offsetA == 0); + eigen_assert(offsetB == 0); + + eigen_assert(rows > 0); + eigen_assert(cols > 0); + eigen_assert(depth > 0); + eigen_assert(blockA); + eigen_assert(blockB); + + for (Index j = 0; j < cols; ++j) { + Index startB = 
j * depth; + + for (Index i = 0; i < rows; ++i) { + Index startA = i * depth; + + for (Index k = 0; k < depth; ++k) { + res(i, j) += blockA[startA + k] * blockB[startB + k]; + } + } + } +} +#endif + + +// This definition tackle the case where the khs is encoded using unsigned 8bit +// integers and the rhs using signed 8bit integers. +#ifndef EIGEN_USE_OPTIMIZED_UINT8_INT8_MAT_MAT_PRODUCT +template<bool _ConjLhs, bool _ConjRhs> +class gebp_traits<QUInt8, QInt8, _ConjLhs, _ConjRhs> +{ +public: + typedef QUInt8 LhsScalar; + typedef QInt8 RhsScalar; + typedef QInt32 ResScalar; + + enum { + // register block size along the M and N directions + // One for the current implementation + nr = 1, + mr = 1, + // Progress made at each iteration of the product loop + // also 1 for the current implementation + LhsProgress = 1, + RhsProgress = 1 + }; +}; + + +// Mat-Mat product of an unsigned 8bit lhs with a signed 8bit rhs +template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> +struct gebp_kernel<QUInt8, QInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +{ + EIGEN_DONT_INLINE + void operator()(const DataMapper& res, const QUInt8* blockA, const QInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> +EIGEN_DONT_INLINE +void gebp_kernel<QUInt8, QInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +::operator()(const DataMapper& res, const QUInt8* blockA, const QInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) +{ + EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + + eigen_assert(alpha.value == 1); + eigen_assert(strideA == -1); + eigen_assert(strideB == -1); + eigen_assert(offsetA == 0); + eigen_assert(offsetB == 0); + + eigen_assert(rows > 0); + eigen_assert(cols > 0); + eigen_assert(depth > 0); + eigen_assert(blockA); + eigen_assert(blockB); + + for (Index j = 0; j < cols; ++j) { + Index startB = j * depth; + + for (Index i = 0; i < rows; ++i) { + Index startA = i * depth; + + for (Index k = 0; k < depth; ++k) { + res(i, j) += blockA[startA + k] * blockB[startB + k]; + } + } + } +} +#endif + +} // namespace internal +} // namespace Eigen + + + +#endif // EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h new file mode 100644 index 0000000000..d561b79fbd --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h @@ -0,0 +1,1743 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> +// Copyright (C) 2015 Matthew Sarett <msarett@google.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_AVX2_H +#define EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_AVX2_H + +namespace Eigen { +namespace internal { + +// AVX2 optimized implementation of Mat-Mat product. 
+// LHS is encoded using signed 8-bit integers. +// RHS is encoded using unsigned 8-bit integers. +#ifdef EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT + +// Define quantized traits +template<bool _ConjLhs, bool _ConjRhs> +class gebp_traits<QInt8, QUInt8, _ConjLhs, _ConjRhs> +{ +public: + typedef QInt8 LhsScalar; + typedef QUInt8 RhsScalar; + typedef QInt32 ResScalar; + + enum { + // Define register blocking scheme. + nr = 32, + mr = 32, + kr = 8, + // Ignore progress tracking per loop iteration. + LhsProgress = -1, + RhsProgress = -1 + }; +}; + +// Specialized blocking for quantized implementations. +// Used by TensorContractionThreadPool, inputs must have dimensions that are +// multiples of 32. +template<int KcFactor, typename Index> +struct ComputeGemmByColBlockingSizes<QInt8, QUInt8, KcFactor, Index> { + void operator()(Index& k, Index& m, Index& n, Index num_threads) + { + eigen_assert(m % 32 == 0); + eigen_assert(n % 32 == 0); + eigen_assert(k % 32 == 0); + if (!k || !m || !n) { + return; + } + n = (((n / num_threads) + 31) / 32) * 32; + } +}; + +// Specialized blocking for quantized implementations. +// Used by TensorContractionThreadPool, inputs must have dimensions that are +// multiples of 32. +template<int KcFactor, typename Index> +struct ComputeGemmByRowBlockingSizes<QInt8, QUInt8, KcFactor, Index> { + void operator()(Index& k, Index& m, Index& n, Index num_threads) + { + eigen_assert(m % 32 == 0); + eigen_assert(n % 32 == 0 || n == 1); + eigen_assert(k % 32 == 0); + if (!k || !m || !n) { + return; + } + // Special case to avoid breaking the unimplemented matrix-vector case + if (n == 1) { + n = 32; + } + m = (((m / num_threads) + 31) / 32) * 32; + } +}; + +// Specialized blocking for quantized implementations. +// Used by TensorContraction and GeneralMatrixMatrix, inputs are padded to +// multiples of 32. 
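// Editorial example (not part of the patch) of the rounding used by the
// blocking helpers above and by gemm_blocking_space below:
// ((x + 31) / 32) * 32 rounds x up to the next multiple of 32, so e.g.
//   rows = 100                        ->  m_mc = ((100 + 31) / 32) * 32 = 128
//   n = 256 columns, num_threads = 3  ->  (((256 / 3) + 31) / 32) * 32 = 96
// i.e. each thread is handed a 96-column panel, and the 256 columns are covered
// by three such panels, the last of which is only partially used.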
+template <int MaxRows, int MaxCols, int MaxDepth, int KcFactor> +class gemm_blocking_space<ColMajor, QInt8, QInt8, MaxRows, MaxCols, MaxDepth, + KcFactor, false> + : public level3_blocking<QInt8, QInt8> { + DenseIndex m_sizeA; + DenseIndex m_sizeB; + + public: + gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth, + DenseIndex /*num_threads*/, bool /*l3_blocking*/) { + this->m_mc = ((rows + 31) / 32) * 32; + this->m_nc = ((cols + 31) / 32) * 32; + this->m_kc = ((depth + 31) / 32) * 32; + m_sizeA = this->m_mc * this->m_kc; + m_sizeB = this->m_kc * this->m_nc; + } + void allocateA() { + if (this->m_blockA == 0) this->m_blockA = aligned_new<QInt8>(m_sizeA); + } + void allocateB() { + if (this->m_blockB == 0) this->m_blockB = aligned_new<QInt8>(m_sizeB); + } + void allocateAll() { + allocateA(); + allocateB(); + } + ~gemm_blocking_space() { + aligned_delete(this->m_blockA, m_sizeA); + aligned_delete(this->m_blockB, m_sizeB); + } +}; + + +template <int MaxRows, int MaxCols, int MaxDepth, int KcFactor> +class gemm_blocking_space<ColMajor, QInt8, QUInt8, MaxRows, MaxCols, MaxDepth, + KcFactor, false> + : public level3_blocking<QInt8, QUInt8> { + DenseIndex m_sizeA; + DenseIndex m_sizeB; + + public: + gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth, + DenseIndex /*num_threads*/, bool /*l3_blocking*/) { + this->m_mc = ((rows + 31) / 32) * 32; + this->m_nc = ((cols + 31) / 32) * 32; + this->m_kc = ((depth + 31) / 32) * 32; + m_sizeA = this->m_mc * this->m_kc; + m_sizeB = this->m_kc * this->m_nc; + } + void allocateA() { + if (this->m_blockA == 0) this->m_blockA = aligned_new<QInt8>(m_sizeA); + } + void allocateB() { + if (this->m_blockB == 0) this->m_blockB = aligned_new<QUInt8>(m_sizeB); + } + void allocateAll() { + allocateA(); + allocateB(); + } + ~gemm_blocking_space() { + aligned_delete(this->m_blockA, m_sizeA); + aligned_delete(this->m_blockB, m_sizeB); + } +}; + +// Alternate templates for any input sizes +template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false> +struct gemm_pack_lhs_any; +template <typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode> +struct gemm_pack_lhs_any<QInt8, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode> { + EIGEN_DONT_INLINE void operator() + (QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0, Index offset = 0); +}; + +template<typename Scalar, typename Index, typename DataMapper, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false> +struct gemm_pack_rhs_any; +template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode> +struct gemm_pack_rhs_any<QUInt8, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> { + EIGEN_DONT_INLINE void operator() + (QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0, Index offset = 0); +}; + +template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false> +struct gebp_kernel_any; +template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> +struct gebp_kernel_any<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +{ + typedef typename DataMapper::LinearMapper LinearMapper; + + EIGEN_DONT_INLINE + void operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, + Index rows, Index 
depth, Index cols, QInt32 alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +// Alternate implementations for any input sizes +template <typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode> +EIGEN_DONT_INLINE void gemm_pack_lhs_any<QInt8, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>:: +operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { + eigen_assert(stride == 0); + eigen_assert(offset == 0); + + // Get vector pointer + __m256i* blockA_256 = reinterpret_cast<__m256i*>(blockA); + + // Get even multiples of the dimensions + Index rows_32 = (rows / 32) * 32; + Index depth_8 = (depth / 8) * 8; + + // Get padding for when depth is not a multiple of 32 + int padding = 0; + if (depth % 32 != 0) { + int depth_32 = (depth / 32) * 32; + int extra_depth = depth - depth_32; + int extra_depth_8 = ((extra_depth + 7) / 8) * 8; + padding = 32 - extra_depth_8; + } + + // Pack rows in sets of 32 + for (Index m = 0; m < rows_32; m += 32) { + // Pack depth in sets of 8 + for (Index k = 0; k < depth_8; k += 8) { + // Load vectors + __m256i L_A = lhs.loadPacket(m, k); + __m256i L_B = lhs.loadPacket(m, k + 1); + + // Interleave 8-bit elements + __m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B); + __m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B); + + __m256i L_C = lhs.loadPacket(m, k + 2); + __m256i L_D = lhs.loadPacket(m, k + 3); + __m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D); + __m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D); + + // Interleave 16-bit elements + __m256i L_AD0_AD16 = _mm256_unpacklo_epi16(L_AB0_AB16, L_CD0_CD16); + __m256i L_AD4_AD20 = _mm256_unpackhi_epi16(L_AB0_AB16, L_CD0_CD16); + + // Use permute before we store to cross 128-bit lanes + __m256i L_AD0 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x20); + _mm256_store_si256(blockA_256++, L_AD0); + + // Complete packing for 32 x 8 block + __m256i L_AD16 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x31); + __m256i L_AD8_AD24 = _mm256_unpacklo_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD12_AD28 = _mm256_unpackhi_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD8 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x20); + _mm256_store_si256(blockA_256++, L_AD8); + _mm256_store_si256(blockA_256++, L_AD16); + __m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31); + _mm256_store_si256(blockA_256++, L_AD24); + __m256i L_E = lhs.loadPacket(m, k + 4); + __m256i L_F = lhs.loadPacket(m, k + 5); + __m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F); + __m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F); + __m256i L_G = lhs.loadPacket(m, k + 6); + __m256i L_H = lhs.loadPacket(m, k + 7); + __m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H); + __m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H); + __m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH4_EH20 = _mm256_unpackhi_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH0 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x20); + _mm256_store_si256(blockA_256++, L_EH0); + __m256i L_EH16 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x31); + __m256i L_EH8_EH24 = _mm256_unpacklo_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH12_EH28 = _mm256_unpackhi_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH8 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x20); + _mm256_store_si256(blockA_256++, L_EH8); + _mm256_store_si256(blockA_256++, L_EH16); + __m256i L_EH24 = 
_mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x31); + _mm256_store_si256(blockA_256++, L_EH24); + } + + // Finish the k dimension, padding with zeros + if (depth_8 < depth) { + __m256i L_A, L_B, L_C, L_D, L_E, L_F, L_G, L_H; + switch (depth - depth_8) { + case 1: + L_A = lhs.loadPacket(m, depth_8); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + break; + case 2: + L_A = lhs.loadPacket(m, depth_8); + L_B = lhs.loadPacket(m, depth_8 + 1); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + break; + case 3: + L_A = lhs.loadPacket(m, depth_8); + L_B = lhs.loadPacket(m, depth_8 + 1); + L_C = lhs.loadPacket(m, depth_8 + 2); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + break; + case 4: + L_A = lhs.loadPacket(m, depth_8); + L_B = lhs.loadPacket(m, depth_8 + 1); + L_C = lhs.loadPacket(m, depth_8 + 2); + L_D = lhs.loadPacket(m, depth_8 + 3); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + break; + case 5: + L_A = lhs.loadPacket(m, depth_8); + L_B = lhs.loadPacket(m, depth_8 + 1); + L_C = lhs.loadPacket(m, depth_8 + 2); + L_D = lhs.loadPacket(m, depth_8 + 3); + L_E = lhs.loadPacket(m, depth_8 + 4); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + break; + case 6: + L_A = lhs.loadPacket(m, depth_8); + L_B = lhs.loadPacket(m, depth_8 + 1); + L_C = lhs.loadPacket(m, depth_8 + 2); + L_D = lhs.loadPacket(m, depth_8 + 3); + L_E = lhs.loadPacket(m, depth_8 + 4); + L_F = lhs.loadPacket(m, depth_8 + 5); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + break; + case 7: + L_A = lhs.loadPacket(m, depth_8); + L_B = lhs.loadPacket(m, depth_8 + 1); + L_C = lhs.loadPacket(m, depth_8 + 2); + L_D = lhs.loadPacket(m, depth_8 + 3); + L_E = lhs.loadPacket(m, depth_8 + 4); + L_F = lhs.loadPacket(m, depth_8 + 5); + L_G = lhs.loadPacket(m, depth_8 + 6); + L_H = _mm256_setzero_si256(); + break; + } + + // Interleave 8-bit elements + __m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B); + __m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B); + + __m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D); + __m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D); + + // Interleave 16-bit elements + __m256i L_AD0_AD16 = _mm256_unpacklo_epi16(L_AB0_AB16, L_CD0_CD16); + __m256i L_AD4_AD20 = _mm256_unpackhi_epi16(L_AB0_AB16, L_CD0_CD16); + + // Use permute before we store to cross 128-bit lanes + __m256i L_AD0 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x20); + _mm256_store_si256(blockA_256++, L_AD0); + + // Complete packing + __m256i L_AD16 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x31); + __m256i L_AD8_AD24 = _mm256_unpacklo_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD12_AD28 = _mm256_unpackhi_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD8 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x20); + _mm256_store_si256(blockA_256++, L_AD8); + _mm256_store_si256(blockA_256++, L_AD16); + __m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31); + _mm256_store_si256(blockA_256++, L_AD24); + __m256i L_EF0_EF16 = 
_mm256_unpacklo_epi8(L_E, L_F); + __m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F); + __m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H); + __m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H); + __m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH4_EH20 = _mm256_unpackhi_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH0 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x20); + _mm256_store_si256(blockA_256++, L_EH0); + __m256i L_EH16 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x31); + __m256i L_EH8_EH24 = _mm256_unpacklo_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH12_EH28 = _mm256_unpackhi_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH8 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x20); + _mm256_store_si256(blockA_256++, L_EH8); + _mm256_store_si256(blockA_256++, L_EH16); + __m256i L_EH24 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x31); + _mm256_store_si256(blockA_256++, L_EH24); + } + blockA_256 += padding; + } + + // Finish the m dimension, padding with zeros + if (rows_32 < rows) { + // Pack depth in sets of 8 + for (Index k = 0; k < depth_8; k += 8) { + // Load vectors + __m256i L_A = _mm256_setzero_si256(); + __m256i L_B = _mm256_setzero_si256(); + __m256i L_C = _mm256_setzero_si256(); + __m256i L_D = _mm256_setzero_si256(); + __m256i L_E = _mm256_setzero_si256(); + __m256i L_F = _mm256_setzero_si256(); + __m256i L_G = _mm256_setzero_si256(); + __m256i L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + QInt8* ptr = (QInt8*) &L_A; + ptr[m] = lhs(rows_32 + m, k); + ptr = (QInt8*) &L_B; + ptr[m] = lhs(rows_32 + m, k + 1); + ptr = (QInt8*) &L_C; + ptr[m] = lhs(rows_32 + m, k + 2); + ptr = (QInt8*) &L_D; + ptr[m] = lhs(rows_32 + m, k + 3); + ptr = (QInt8*) &L_E; + ptr[m] = lhs(rows_32 + m, k + 4); + ptr = (QInt8*) &L_F; + ptr[m] = lhs(rows_32 + m, k + 5); + ptr = (QInt8*) &L_G; + ptr[m] = lhs(rows_32 + m, k + 6); + ptr = (QInt8*) &L_H; + ptr[m] = lhs(rows_32 + m, k + 7); + } + + // Interleave 8-bit elements + __m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B); + __m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B); + __m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D); + __m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D); + + // Interleave 16-bit elements + __m256i L_AD0_AD16 = _mm256_unpacklo_epi16(L_AB0_AB16, L_CD0_CD16); + __m256i L_AD4_AD20 = _mm256_unpackhi_epi16(L_AB0_AB16, L_CD0_CD16); + + // Use permute before we store to cross 128-bit lanes + __m256i L_AD0 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x20); + _mm256_store_si256(blockA_256++, L_AD0); + + // Complete packing for 32 x 8 block + __m256i L_AD16 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x31); + __m256i L_AD8_AD24 = _mm256_unpacklo_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD12_AD28 = _mm256_unpackhi_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD8 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x20); + _mm256_store_si256(blockA_256++, L_AD8); + _mm256_store_si256(blockA_256++, L_AD16); + __m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31); + _mm256_store_si256(blockA_256++, L_AD24); + __m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F); + __m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F); + __m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H); + __m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H); + __m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH4_EH20 = _mm256_unpackhi_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH0 = 
_mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x20); + _mm256_store_si256(blockA_256++, L_EH0); + __m256i L_EH16 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x31); + __m256i L_EH8_EH24 = _mm256_unpacklo_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH12_EH28 = _mm256_unpackhi_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH8 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x20); + _mm256_store_si256(blockA_256++, L_EH8); + _mm256_store_si256(blockA_256++, L_EH16); + __m256i L_EH24 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x31); + _mm256_store_si256(blockA_256++, L_EH24); + } + + // Finish the k dimension, padding with zeros + if (depth_8 < depth) { + __m256i L_A, L_B, L_C, L_D, L_E, L_F, L_G, L_H; + QInt8* ptr; + switch (depth - depth_8) { + case 1: + L_A = _mm256_setzero_si256(); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + QInt8* ptr = (QInt8*) &L_A; + ptr[m] = lhs(rows_32 + m, depth_8); + } + break; + case 2: + L_A = _mm256_setzero_si256(); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + ptr = (QInt8*) &L_A; + ptr[m] = lhs(rows_32 + m, depth_8); + ptr = (QInt8*) &L_B; + ptr[m] = lhs(rows_32 + m, depth_8 + 1); + } + break; + case 3: + L_A = _mm256_setzero_si256(); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + ptr = (QInt8*) &L_A; + ptr[m] = lhs(rows_32 + m, depth_8); + ptr = (QInt8*) &L_B; + ptr[m] = lhs(rows_32 + m, depth_8 + 1); + ptr = (QInt8*) &L_C; + ptr[m] = lhs(rows_32 + m, depth_8 + 2); + } + break; + case 4: + L_A = _mm256_setzero_si256(); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + ptr = (QInt8*) &L_A; + ptr[m] = lhs(rows_32 + m, depth_8); + ptr = (QInt8*) &L_B; + ptr[m] = lhs(rows_32 + m, depth_8 + 1); + ptr = (QInt8*) &L_C; + ptr[m] = lhs(rows_32 + m, depth_8 + 2); + ptr = (QInt8*) &L_D; + ptr[m] = lhs(rows_32 + m, depth_8 + 3); + } + break; + case 5: + L_A = _mm256_setzero_si256(); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + ptr = (QInt8*) &L_A; + ptr[m] = lhs(rows_32 + m, depth_8); + ptr = (QInt8*) &L_B; + ptr[m] = lhs(rows_32 + m, depth_8 + 1); + ptr = (QInt8*) &L_C; + ptr[m] = lhs(rows_32 + m, depth_8 + 2); + ptr = (QInt8*) &L_D; + ptr[m] = lhs(rows_32 + m, depth_8 + 3); + ptr = (QInt8*) &L_E; + ptr[m] = lhs(rows_32 + m, depth_8 + 4); + } + break; + case 6: + L_A = _mm256_setzero_si256(); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = 
_mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + ptr = (QInt8*) &L_A; + ptr[m] = lhs(rows_32 + m, depth_8); + ptr = (QInt8*) &L_B; + ptr[m] = lhs(rows_32 + m, depth_8 + 1); + ptr = (QInt8*) &L_C; + ptr[m] = lhs(rows_32 + m, depth_8 + 2); + ptr = (QInt8*) &L_D; + ptr[m] = lhs(rows_32 + m, depth_8 + 3); + ptr = (QInt8*) &L_E; + ptr[m] = lhs(rows_32 + m, depth_8 + 4); + ptr = (QInt8*) &L_F; + ptr[m] = lhs(rows_32 + m, depth_8 + 5); + } + break; + case 7: + L_A = _mm256_setzero_si256(); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + ptr = (QInt8*) &L_A; + ptr[m] = lhs(rows_32 + m, depth_8); + ptr = (QInt8*) &L_B; + ptr[m] = lhs(rows_32 + m, depth_8 + 1); + ptr = (QInt8*) &L_C; + ptr[m] = lhs(rows_32 + m, depth_8 + 2); + ptr = (QInt8*) &L_D; + ptr[m] = lhs(rows_32 + m, depth_8 + 3); + ptr = (QInt8*) &L_E; + ptr[m] = lhs(rows_32 + m, depth_8 + 4); + ptr = (QInt8*) &L_F; + ptr[m] = lhs(rows_32 + m, depth_8 + 5); + ptr = (QInt8*) &L_G; + ptr[m] = lhs(rows_32 + m, depth_8 + 6); + } + break; + } + + // Interleave 8-bit elements + __m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B); + __m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B); + __m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D); + __m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D); + + // Interleave 16-bit elements + __m256i L_AD0_AD16 = _mm256_unpacklo_epi16(L_AB0_AB16, L_CD0_CD16); + __m256i L_AD4_AD20 = _mm256_unpackhi_epi16(L_AB0_AB16, L_CD0_CD16); + + // Use permute before we store to cross 128-bit lanes + __m256i L_AD0 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x20); + _mm256_store_si256(blockA_256++, L_AD0); + + // Complete packing + __m256i L_AD16 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x31); + __m256i L_AD8_AD24 = _mm256_unpacklo_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD12_AD28 = _mm256_unpackhi_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD8 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x20); + _mm256_store_si256(blockA_256++, L_AD8); + _mm256_store_si256(blockA_256++, L_AD16); + __m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31); + _mm256_store_si256(blockA_256++, L_AD24); + __m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F); + __m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F); + __m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H); + __m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H); + __m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH4_EH20 = _mm256_unpackhi_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH0 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x20); + _mm256_store_si256(blockA_256++, L_EH0); + __m256i L_EH16 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x31); + __m256i L_EH8_EH24 = _mm256_unpacklo_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH12_EH28 = _mm256_unpackhi_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH8 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x20); + _mm256_store_si256(blockA_256++, L_EH8); + _mm256_store_si256(blockA_256++, L_EH16); + __m256i L_EH24 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x31); + _mm256_store_si256(blockA_256++, L_EH24); + } + } +} + +template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode> +EIGEN_DONT_INLINE void 
gemm_pack_rhs_any<QUInt8, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>:: +operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { + eigen_assert(stride == 0); + eigen_assert(offset == 0); + + // Get vector pointer + __m256i* blockB_256 = reinterpret_cast<__m256i*>(blockB); + + // Get even multiples of the dimensions + Index cols_32 = (cols / 32) * 32; + Index depth_32 = (depth / 32) * 32; + + // Perform a step of the packing for 4 columns + __m256i R_AB_L, R_AB_H, R_CD_L, R_CD_H, R_AD_0, R_AD_8, R_AD_16, R_AD_24; +#define PACK_STEP \ + R_AB_L = _mm256_unpacklo_epi64(R_A, R_B); \ + R_CD_L = _mm256_unpacklo_epi64(R_C, R_D); \ + R_AB_H = _mm256_unpackhi_epi64(R_A, R_B); \ + R_CD_H = _mm256_unpackhi_epi64(R_C, R_D); \ + R_AD_0 = _mm256_permute2x128_si256(R_AB_L, R_CD_L, 0x20); \ + R_AD_16 = _mm256_permute2x128_si256(R_AB_L, R_CD_L, 0x31); \ + R_AD_8 = _mm256_permute2x128_si256(R_AB_H, R_CD_H, 0x20); \ + R_AD_24 = _mm256_permute2x128_si256(R_AB_H, R_CD_H, 0x31); \ + _mm256_store_si256(blockB_256, R_AD_0); \ + _mm256_store_si256(blockB_256 + 8, R_AD_8); \ + _mm256_store_si256(blockB_256 + 16, R_AD_16); \ + _mm256_store_si256(blockB_256 + 24, R_AD_24); \ + blockB_256++; + + // Pack cols in sets of 32 + for (Index n = 0; n < cols_32; n += 32) { + // Pack depth in sets of 32 + for (Index k = 0; k < depth_32; k += 32) { + __m256i R_A = rhs.loadPacket(k, n); + __m256i R_B = rhs.loadPacket(k, n + 1); + __m256i R_C = rhs.loadPacket(k, n + 2); + __m256i R_D = rhs.loadPacket(k, n + 3); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 4); + R_B = rhs.loadPacket(k, n + 5); + R_C = rhs.loadPacket(k, n + 6); + R_D = rhs.loadPacket(k, n + 7); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 8); + R_B = rhs.loadPacket(k, n + 9); + R_C = rhs.loadPacket(k, n + 10); + R_D = rhs.loadPacket(k, n + 11); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 12); + R_B = rhs.loadPacket(k, n + 13); + R_C = rhs.loadPacket(k, n + 14); + R_D = rhs.loadPacket(k, n + 15); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 16); + R_B = rhs.loadPacket(k, n + 17); + R_C = rhs.loadPacket(k, n + 18); + R_D = rhs.loadPacket(k, n + 19); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 20); + R_B = rhs.loadPacket(k, n + 21); + R_C = rhs.loadPacket(k, n + 22); + R_D = rhs.loadPacket(k, n + 23); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 24); + R_B = rhs.loadPacket(k, n + 25); + R_C = rhs.loadPacket(k, n + 26); + R_D = rhs.loadPacket(k, n + 27); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 28); + R_B = rhs.loadPacket(k, n + 29); + R_C = rhs.loadPacket(k, n + 30); + R_D = rhs.loadPacket(k, n + 31); + PACK_STEP; + + blockB_256 += 24; + } + + if (depth_32 < depth) { + QUInt8* ptr; + __m256i R_A = _mm256_setzero_si256(); + __m256i R_B = _mm256_setzero_si256(); + __m256i R_C = _mm256_setzero_si256(); + __m256i R_D = _mm256_setzero_si256(); + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 1); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 2); + ptr = (QUInt8*) &R_D; + ptr[k - depth_32] = rhs(k, n + 3); + } + PACK_STEP; + + R_A = _mm256_setzero_si256(); + R_B = _mm256_setzero_si256(); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n + 4); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 5); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 6); + ptr = (QUInt8*) 
&R_D; + ptr[k - depth_32] = rhs(k, n + 7); + } + PACK_STEP; + + R_A = _mm256_setzero_si256(); + R_B = _mm256_setzero_si256(); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n + 8); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 9); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 10); + ptr = (QUInt8*) &R_D; + ptr[k - depth_32] = rhs(k, n + 11); + } + PACK_STEP; + + R_A = _mm256_setzero_si256(); + R_B = _mm256_setzero_si256(); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n + 12); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 13); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 14); + ptr = (QUInt8*) &R_D; + ptr[k - depth_32] = rhs(k, n + 15); + } + PACK_STEP; + + R_A = _mm256_setzero_si256(); + R_B = _mm256_setzero_si256(); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n + 16); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 17); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 18); + ptr = (QUInt8*) &R_D; + ptr[k - depth_32] = rhs(k, n + 19); + } + PACK_STEP; + + R_A = _mm256_setzero_si256(); + R_B = _mm256_setzero_si256(); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n + 20); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 21); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 22); + ptr = (QUInt8*) &R_D; + ptr[k - depth_32] = rhs(k, n + 23); + } + PACK_STEP; + + R_A = _mm256_setzero_si256(); + R_B = _mm256_setzero_si256(); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n + 24); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 25); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 26); + ptr = (QUInt8*) &R_D; + ptr[k - depth_32] = rhs(k, n + 27); + } + PACK_STEP; + + R_A = _mm256_setzero_si256(); + R_B = _mm256_setzero_si256(); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n + 28); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 29); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 30); + ptr = (QUInt8*) &R_D; + ptr[k - depth_32] = rhs(k, n + 31); + } + PACK_STEP; + blockB_256 += 24; + } + } + + // Finish packing cols + if (cols_32 < cols) { + // Pack depth in sets of 32 + for (Index k = 0; k < depth_32; k += 32) { + __m256i R_A, R_B, R_C, R_D; + Index n; + for (n = cols_32; n < cols; n += 4) { + switch (cols - n) { + case 1: + R_A = rhs.loadPacket(k, n); + R_B = _mm256_setzero_si256(); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + PACK_STEP; + break; + case 2: + R_A = rhs.loadPacket(k, n); + R_B = rhs.loadPacket(k, n + 1); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + PACK_STEP; + break; + case 3: + R_A = rhs.loadPacket(k, n); + R_B = rhs.loadPacket(k, n + 1); + R_C = rhs.loadPacket(k, n + 2); + R_D = _mm256_setzero_si256(); + PACK_STEP; + break; + default: + R_A = rhs.loadPacket(k, n); + R_B = rhs.loadPacket(k, n + 1); + R_C = rhs.loadPacket(k, n + 2); + R_D = 
rhs.loadPacket(k, n + 3); + PACK_STEP; + break; + } + } + + // Increment the block pointer. + // We must pad if cols is not a multiple of 32. + blockB_256 += 32 - (n - cols_32) / 4; + } + + if (depth_32 < depth) { + for (Index n = cols_32; n < cols; n += 4) { + QUInt8* ptr; + __m256i R_A = _mm256_setzero_si256(); + __m256i R_B = _mm256_setzero_si256(); + __m256i R_C = _mm256_setzero_si256(); + __m256i R_D = _mm256_setzero_si256(); + switch (cols - n) { + case 1: + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n); + } + PACK_STEP; + break; + case 2: + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 1); + } + PACK_STEP; + break; + case 3: + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 1); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 2); + } + PACK_STEP; + break; + default: + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 1); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 2); + ptr = (QUInt8*) &R_D; + ptr[k - depth_32] = rhs(k, n + 3); + } + PACK_STEP; + break; + } + } + } + } +#undef PACK_STEP +} + +template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> +EIGEN_DONT_INLINE +void gebp_kernel_any<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +::operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) +{ + EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(alpha.value == 1); + eigen_assert(strideA == -1); + eigen_assert(strideB == -1); + eigen_assert(offsetA == 0); + eigen_assert(offsetB == 0); + eigen_assert(rows > 0); + eigen_assert(cols > 0); + eigen_assert(depth > 0); + eigen_assert(blockA); + eigen_assert(blockB); + + Index rows_32 = ((rows + 31) / 32) * 32; + Index cols_32 = ((cols + 31) / 32) * 32; + Index depth_32 = ((depth + 31) / 32) * 32; + + // Create result block + ei_declare_aligned_stack_constructed_variable(QInt32, blockO, 32 * 32, 0); + memset(blockO, 0, 32 * 32 * sizeof(QInt32)); + + // Get vectorized pointers + __m256i* blockO_256 = reinterpret_cast<__m256i*>(blockO); + const __m256i* blockA_256 = reinterpret_cast<const __m256i*>(blockA); + const __m256i* blockB_256 = reinterpret_cast<const __m256i*>(blockB); + + // Loop over blocks of 32 columns + for (Index n = 0; n < cols_32; n += 32) { + // Reset index into blockA + Index indexL = 0; + // Loop over blocks of 32 rows + for (Index m = 0; m < rows_32; m += 32) { + // Reset index into blockB + Index indexR = n / 32 * depth_32; + // Loop over blocks of 8 on depth + for (Index k = 0; k < depth_32; k += 8) { + // Load inputs + __m256i L_AD0 = blockA_256[indexL++]; + __m256i L_AD8 = blockA_256[indexL++]; + __m256i L_AD16 = blockA_256[indexL++]; + __m256i L_AD24 = blockA_256[indexL++]; + __m256i L_EH0 = blockA_256[indexL++]; + __m256i L_EH8 = blockA_256[indexL++]; + __m256i L_EH16 = blockA_256[indexL++]; + __m256i L_EH24 = blockA_256[indexL++]; + __m256i R_AH0 = blockB_256[indexR++]; + __m256i R_AH4 = blockB_256[indexR++]; + __m256i 
R_AH8 = blockB_256[indexR++]; + __m256i R_AH12 = blockB_256[indexR++]; + __m256i R_AH16 = blockB_256[indexR++]; + __m256i R_AH20 = blockB_256[indexR++]; + __m256i R_AH24 = blockB_256[indexR++]; + __m256i R_AH28 = blockB_256[indexR++]; + + // This constant is used with madd to convert 16 bit to 32 bit + const __m256i ONE = _mm256_set1_epi32(0x00010001); + + // Declare variables used in COMPUTE_STEP + __m256i P_16_A, P_16_B, P_32_A, P_32_B, P_32; + +#define COMPUTE_STEP(R_INPUT_A, R_INPUT_B, OFFSET) \ + P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD0); \ + P_32_A = _mm256_madd_epi16(P_16_A, ONE); \ + P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH0); \ + P_32_B = _mm256_madd_epi16(P_16_B, ONE); \ + P_32 = _mm256_add_epi32(P_32_A, P_32_B); \ + _mm256_store_si256( \ + blockO_256 + 4 * OFFSET, \ + _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET), P_32)); \ + \ + P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD8); \ + P_32_A = _mm256_madd_epi16(P_16_A, ONE); \ + P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH8); \ + P_32_B = _mm256_madd_epi16(P_16_B, ONE); \ + P_32 = _mm256_add_epi32(P_32_A, P_32_B); \ + _mm256_store_si256( \ + blockO_256 + 4 * OFFSET + 1, \ + _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 1), P_32)); \ + \ + P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD16); \ + P_32_A = _mm256_madd_epi16(P_16_A, ONE); \ + P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH16); \ + P_32_B = _mm256_madd_epi16(P_16_B, ONE); \ + P_32 = _mm256_add_epi32(P_32_A, P_32_B); \ + _mm256_store_si256( \ + blockO_256 + 4 * OFFSET + 2, \ + _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 2), P_32)); \ + \ + P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD24); \ + P_32_A = _mm256_madd_epi16(P_16_A, ONE); \ + P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH24); \ + P_32_B = _mm256_madd_epi16(P_16_B, ONE); \ + P_32 = _mm256_add_epi32(P_32_A, P_32_B); \ + _mm256_store_si256( \ + blockO_256 + 4 * OFFSET + 3, \ + _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 3), P_32)); + + // Permute and shuffle to copy a single value across the entire vector + // Then compute the multiplication + __m256i R_AH0_ = _mm256_permute2x128_si256(R_AH0, R_AH0, 0x00); + __m256i R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + __m256i R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 0); + __m256i R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + __m256i R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 1); + R_AH0_ = _mm256_permute2x128_si256(R_AH0, R_AH0, 0x11); + __m256i R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + __m256i R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 2); + __m256i R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + __m256i R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 3); + + R_AH0_ = _mm256_permute2x128_si256(R_AH4, R_AH4, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 4); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 5); + R_AH0_ = _mm256_permute2x128_si256(R_AH4, R_AH4, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 6); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 7); + + R_AH0_ = _mm256_permute2x128_si256(R_AH8, R_AH8, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = 
_mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 8); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 9); + R_AH0_ = _mm256_permute2x128_si256(R_AH8, R_AH8, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 10); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 11); + + R_AH0_ = _mm256_permute2x128_si256(R_AH12, R_AH12, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 12); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 13); + R_AH0_ = _mm256_permute2x128_si256(R_AH12, R_AH12, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 14); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 15); + + R_AH0_ = _mm256_permute2x128_si256(R_AH16, R_AH16, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 16); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 17); + R_AH0_ = _mm256_permute2x128_si256(R_AH16, R_AH16, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 18); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 19); + + R_AH0_ = _mm256_permute2x128_si256(R_AH20, R_AH20, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 20); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 21); + R_AH0_ = _mm256_permute2x128_si256(R_AH20, R_AH20, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 22); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 23); + + R_AH0_ = _mm256_permute2x128_si256(R_AH24, R_AH24, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 24); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 25); + R_AH0_ = _mm256_permute2x128_si256(R_AH24, R_AH24, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 26); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 27); + + R_AH0_ = _mm256_permute2x128_si256(R_AH28, R_AH28, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 28); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 29); + R_AH0_ = _mm256_permute2x128_si256(R_AH28, R_AH28, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 30); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); 
+ R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 31); + +#undef COMPUTE_STEP + } + + // Transfer the results to the result matrix. + if (m + 32 <= rows && n + 32 <= cols) { + Index i = 0; + for (Index j = n; j < n + 32; j++) { + LinearMapper r0 = res.getLinearMapper(m, j); + LinearMapper r1 = res.getLinearMapper(m + 8, j); + LinearMapper r2 = res.getLinearMapper(m + 16, j); + LinearMapper r3 = res.getLinearMapper(m + 24, j); + r0.storePacket( + 0, _mm256_add_epi32(blockO_256[i++], r0.loadPacket(0))); + r1.storePacket( + 0, _mm256_add_epi32(blockO_256[i++], r1.loadPacket(0))); + r2.storePacket( + 0, _mm256_add_epi32(blockO_256[i++], r2.loadPacket(0))); + r3.storePacket( + 0, _mm256_add_epi32(blockO_256[i++], r3.loadPacket(0))); + } + } + else { + for (Index j = n; j < cols; j++) { + for (Index i = m; i < rows; i++) { + res(i, j) = blockO[(j - n) * 32 + (i - m)]; + } + } + } + + // Zero the result block so it can be reused + memset(blockO, 0, 32 * 32 * sizeof(QInt32)); + } + } +} + +// Below are the fully optimized versions that are correct only for sizes that +// are multiple of 32. It is about a 10% performance benefit to keep these +// implementations separate. + +// Arrange a block of the left input matrix in contiguous memory. +// +// Given column major input (A0 beside A1 in memory): +// A0 B0 C0 D0 E0 F0 G0 H0 ... +// A1 B1 C1 D1 E1 F1 G1 H1 ... +// A2 B2 C2 D2 E2 F2 G2 H2 ... +// A3 B3 C3 D3 E3 F3 G3 H3 ... +// A4 B4 C4 D4 E4 F4 G4 H4 ... +// A5 B5 C5 D5 E5 F5 G5 H5 ... +// A6 B6 C6 D6 E6 F6 G6 H6 ... +// A7 B7 C7 D7 E7 F7 G7 H7 ... +// A8 ... +// ... +// +// Packing yields output (A0 beside B0 in memory): +// A0 B0 C0 D0 +// A1 B1 C1 D1 +// A2 B2 C2 D2 +// A3 B3 C3 D3 +// A4 B4 C4 D4 +// A5 B5 C5 D5 +// A6 B6 C6 D6 +// A7 B7 C7 D7 +// ... +// A31 B31 C31 D31 +// E0 F0 G0 H0 +// E1 F1 G1 H1 +// E2 F2 G2 H2 +// E3 F3 G3 H3 +// E4 F4 G4 H4 +// E5 F5 G5 H5 +// E6 F6 G6 H6 +// E7 F7 G7 H7 +// ... +// +// Four elements of the same row are arranged contiguously because maddubs and +// madd both perform an adjacent addition in the kernel. 
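+//
+// To see why groups of four depth values are kept adjacent, note that the
+// COMPUTE_STEP macro in the kernels above reduces four u8*s8 products into a
+// single 32-bit accumulator lane with the maddubs/madd pair. A rough scalar
+// model of one such lane (a hypothetical helper for illustration only, with
+// QInt8/QUInt8 treated as plain int8_t/uint8_t and the 16-bit saturation of
+// the real _mm256_maddubs_epi16 ignored) is:
+//
+//   int32_t dot4(const uint8_t rhs[4], const int8_t lhs[4]) {
+//     int32_t p01 = rhs[0] * lhs[0] + rhs[1] * lhs[1];  // maddubs: pair 0/1
+//     int32_t p23 = rhs[2] * lhs[2] + rhs[3] * lhs[3];  // maddubs: pair 2/3
+//     return p01 + p23;  // madd with the 0x00010001 constant sums the pairs
+//   }
+//
+// The packing below therefore lays out the lhs so that each 4-byte group
+// holds one row at four consecutive depth indices.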
+template <typename Index, typename DataMapper, int Pack1, int Pack2, + bool Conjugate, bool PanelMode> +struct gemm_pack_lhs<QInt8, Index, DataMapper, Pack1, Pack2, ColMajor, + Conjugate, PanelMode> { + EIGEN_DONT_INLINE void operator()(QInt8* blockA, const DataMapper& lhs, + Index depth, Index rows, Index stride = 0, + Index offset = 0); +}; + +template <typename Index, typename DataMapper, int Pack1, int Pack2, + bool Conjugate, bool PanelMode> +EIGEN_DONT_INLINE void gemm_pack_lhs<QInt8, Index, DataMapper, Pack1, Pack2, + ColMajor, Conjugate, PanelMode>:: +operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, + Index stride, Index offset) { + eigen_assert(stride == 0); + eigen_assert(offset == 0); + + // Use alternate function for weird sizes + if (rows % 32 != 0 || depth % 32 != 0) { + gemm_pack_lhs_any<QInt8, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode> lhs_pack; + return lhs_pack(blockA, lhs, depth, rows, stride, offset); + } + + // Get vector pointer + __m256i* blockA_256 = reinterpret_cast<__m256i*>(blockA); + + // Pack rows in sets of 32 + for (Index m = 0; m < rows; m += 32) { + // Pack depth in sets of 8 + for (Index k = 0; k < depth; k += 8) { + // Load vectors + __m256i L_A = lhs.loadPacket(m, k); + __m256i L_B = lhs.loadPacket(m, k + 1); + + // Interleave 8-bit elements + __m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B); + __m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B); + + __m256i L_C = lhs.loadPacket(m, k + 2); + __m256i L_D = lhs.loadPacket(m, k + 3); + __m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D); + __m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D); + + // Interleave 16-bit elements + __m256i L_AD0_AD16 = _mm256_unpacklo_epi16(L_AB0_AB16, L_CD0_CD16); + __m256i L_AD4_AD20 = _mm256_unpackhi_epi16(L_AB0_AB16, L_CD0_CD16); + + // Use permute before we store to cross 128-bit lanes + __m256i L_AD0 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x20); + _mm256_store_si256(blockA_256++, L_AD0); + + // Complete packing for 32 x 8 block + __m256i L_AD16 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x31); + __m256i L_AD8_AD24 = _mm256_unpacklo_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD12_AD28 = _mm256_unpackhi_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD8 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x20); + _mm256_store_si256(blockA_256++, L_AD8); + _mm256_store_si256(blockA_256++, L_AD16); + __m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31); + _mm256_store_si256(blockA_256++, L_AD24); + __m256i L_E = lhs.loadPacket(m, k + 4); + __m256i L_F = lhs.loadPacket(m, k + 5); + __m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F); + __m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F); + __m256i L_G = lhs.loadPacket(m, k + 6); + __m256i L_H = lhs.loadPacket(m, k + 7); + __m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H); + __m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H); + __m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH4_EH20 = _mm256_unpackhi_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH0 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x20); + _mm256_store_si256(blockA_256++, L_EH0); + __m256i L_EH16 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x31); + __m256i L_EH8_EH24 = _mm256_unpacklo_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH12_EH28 = _mm256_unpackhi_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH8 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x20); + _mm256_store_si256(blockA_256++, L_EH8); + 
_mm256_store_si256(blockA_256++, L_EH16); + __m256i L_EH24 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x31); + _mm256_store_si256(blockA_256++, L_EH24); + } + } +} + +// Arrange a block of the right input matrix in contiguous memory. +// +// Given column major input (A0 beside A1 in memory): +// A0 B0 C0 D0 E0 F0 G0 H0 ... +// A1 B1 C1 D1 E1 F1 G1 H1 ... +// A2 B2 C2 D2 E2 F2 G2 H2 ... +// A3 B3 C3 D3 E3 F3 G3 H3 ... +// A4 B4 C4 D4 E4 F4 G4 H4 ... +// A5 B5 C5 D5 E5 F5 G5 H5 ... +// A6 B6 C6 D6 E6 F6 G6 H6 ... +// A7 B7 C7 D7 E7 F7 G7 H7 ... +// A8 ... +// ... +// +// Packing yields row major output (A0 beside A1 in memory): +// A0 A1 A2 A3 A4 A5 A6 A7 +// B0 B1 B2 B3 B4 B5 B6 B7 +// ... +// +// At least four elements of the same col are arranged contiguously because +// maddubs and madd both perform an adjacent addition in the kernel. We can +// save work by leaving 8 adjacent elements because kr = 8. +template <typename Index, typename DataMapper, int nr, bool Conjugate, + bool PanelMode> +struct gemm_pack_rhs<QUInt8, Index, DataMapper, nr, ColMajor, Conjugate, + PanelMode> { + EIGEN_DONT_INLINE void operator()(QUInt8* blockB, const DataMapper& rhs, + Index depth, Index cols, Index stride = 0, + Index offset = 0); +}; + +template <typename Index, typename DataMapper, int nr, bool Conjugate, + bool PanelMode> +EIGEN_DONT_INLINE void gemm_pack_rhs<QUInt8, Index, DataMapper, nr, ColMajor, + Conjugate, PanelMode>:: +operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, + Index stride, Index offset) { + eigen_assert(stride == 0); + eigen_assert(offset == 0); + + // Use alternate function for weird sizes + if (cols % 32 != 0 || depth % 32 != 0) { + gemm_pack_rhs_any<QUInt8, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> rhs_pack; + return rhs_pack(blockB, rhs, depth, cols, stride, offset); + } + + // Get vector pointer + __m256i* blockB_256 = reinterpret_cast<__m256i*>(blockB); + + // Perform a step of the packing for 4 columns + __m256i R_AB_L, R_AB_H, R_CD_L, R_CD_H, R_AD_0, R_AD_8, R_AD_16, R_AD_24; +#define PACK_STEP \ + R_AB_L = _mm256_unpacklo_epi64(R_A, R_B); \ + R_CD_L = _mm256_unpacklo_epi64(R_C, R_D); \ + R_AB_H = _mm256_unpackhi_epi64(R_A, R_B); \ + R_CD_H = _mm256_unpackhi_epi64(R_C, R_D); \ + R_AD_0 = _mm256_permute2x128_si256(R_AB_L, R_CD_L, 0x20); \ + R_AD_16 = _mm256_permute2x128_si256(R_AB_L, R_CD_L, 0x31); \ + R_AD_8 = _mm256_permute2x128_si256(R_AB_H, R_CD_H, 0x20); \ + R_AD_24 = _mm256_permute2x128_si256(R_AB_H, R_CD_H, 0x31); \ + _mm256_store_si256(blockB_256, R_AD_0); \ + _mm256_store_si256(blockB_256 + 8, R_AD_8); \ + _mm256_store_si256(blockB_256 + 16, R_AD_16); \ + _mm256_store_si256(blockB_256 + 24, R_AD_24); \ + blockB_256++; + + // Pack cols in sets of 32 + for (Index n = 0; n < cols; n += 32) { + // Pack depth in sets of 32 + for (Index k = 0; k < depth; k += 32) { + __m256i R_A = rhs.loadPacket(k, n); + __m256i R_B = rhs.loadPacket(k, n + 1); + __m256i R_C = rhs.loadPacket(k, n + 2); + __m256i R_D = rhs.loadPacket(k, n + 3); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 4); + R_B = rhs.loadPacket(k, n + 5); + R_C = rhs.loadPacket(k, n + 6); + R_D = rhs.loadPacket(k, n + 7); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 8); + R_B = rhs.loadPacket(k, n + 9); + R_C = rhs.loadPacket(k, n + 10); + R_D = rhs.loadPacket(k, n + 11); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 12); + R_B = rhs.loadPacket(k, n + 13); + R_C = rhs.loadPacket(k, n + 14); + R_D = rhs.loadPacket(k, n + 15); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 16); 
+ R_B = rhs.loadPacket(k, n + 17); + R_C = rhs.loadPacket(k, n + 18); + R_D = rhs.loadPacket(k, n + 19); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 20); + R_B = rhs.loadPacket(k, n + 21); + R_C = rhs.loadPacket(k, n + 22); + R_D = rhs.loadPacket(k, n + 23); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 24); + R_B = rhs.loadPacket(k, n + 25); + R_C = rhs.loadPacket(k, n + 26); + R_D = rhs.loadPacket(k, n + 27); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 28); + R_B = rhs.loadPacket(k, n + 29); + R_C = rhs.loadPacket(k, n + 30); + R_D = rhs.loadPacket(k, n + 31); + PACK_STEP; + + blockB_256 += 24; + } + } +#undef PACK_STEP +} + +// Perform the actual multiplication on packed inputs +template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> +struct gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +{ + typedef typename DataMapper::LinearMapper LinearMapper; + + EIGEN_DONT_INLINE + void operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> +EIGEN_DONT_INLINE +void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +::operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) +{ + EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(alpha.value == 1); + eigen_assert(strideA == -1); + eigen_assert(strideB == -1); + eigen_assert(offsetA == 0); + eigen_assert(offsetB == 0); + eigen_assert(rows > 0); + eigen_assert(cols > 0); + eigen_assert(depth > 0); + eigen_assert(blockA); + eigen_assert(blockB); + + // Use alternate function for weird sizes + if (rows % 32 != 0 || cols % 32 != 0 || depth % 32 != 0) { + gebp_kernel_any<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> gebp; + return gebp(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + } + + // Create result block + QInt32* blockO = aligned_new<QInt32>(32 * 32); + // Allocating the result block is about 5-10% faster than declaring stack + // space. It is unclear why this is the case. 
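+  // The scratch block holds 32 * 32 QInt32 values (4 KiB); it is reused for
+  // every 32x32 tile of the result and released with aligned_delete at the
+  // end of this function. The stack-based alternative would have been: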
+ // ei_declare_aligned_stack_constructed_variable(QInt32, blockO, 32 * 32, 0); + memset(blockO, 0, 32 * 32 * sizeof(QInt32)); + + // Get vectorized pointers + __m256i* blockO_256 = reinterpret_cast<__m256i*>(blockO); + const __m256i* blockA_256 = reinterpret_cast<const __m256i*>(blockA); + const __m256i* blockB_256 = reinterpret_cast<const __m256i*>(blockB); + + // Loop over blocks of 32 columns + for (Index n = 0; n < cols; n += 32) { + // Reset index into blockA + Index indexL = 0; + // Loop over blocks of 32 rows + for (Index m = 0; m < rows; m += 32) { + // Reset index into blockB + Index indexR = n / 32 * depth; + // Loop over blocks of 8 on depth + for (Index k = 0; k < depth; k += 8) { + // Load inputs + __m256i L_AD0 = blockA_256[indexL++]; + __m256i L_AD8 = blockA_256[indexL++]; + __m256i L_AD16 = blockA_256[indexL++]; + __m256i L_AD24 = blockA_256[indexL++]; + __m256i L_EH0 = blockA_256[indexL++]; + __m256i L_EH8 = blockA_256[indexL++]; + __m256i L_EH16 = blockA_256[indexL++]; + __m256i L_EH24 = blockA_256[indexL++]; + __m256i R_AH0 = blockB_256[indexR++]; + __m256i R_AH4 = blockB_256[indexR++]; + __m256i R_AH8 = blockB_256[indexR++]; + __m256i R_AH12 = blockB_256[indexR++]; + __m256i R_AH16 = blockB_256[indexR++]; + __m256i R_AH20 = blockB_256[indexR++]; + __m256i R_AH24 = blockB_256[indexR++]; + __m256i R_AH28 = blockB_256[indexR++]; + + // This constant is used with madd to convert 16 bit to 32 bit + const __m256i ONE = _mm256_set1_epi32(0x00010001); + + // Declare variables used in COMPUTE_STEP + __m256i P_16_A, P_16_B, P_32_A, P_32_B, P_32; + +#define COMPUTE_STEP(R_INPUT_A, R_INPUT_B, OFFSET) \ + P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD0); \ + P_32_A = _mm256_madd_epi16(P_16_A, ONE); \ + P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH0); \ + P_32_B = _mm256_madd_epi16(P_16_B, ONE); \ + P_32 = _mm256_add_epi32(P_32_A, P_32_B); \ + _mm256_store_si256( \ + blockO_256 + 4 * OFFSET, \ + _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET), P_32)); \ + \ + P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD8); \ + P_32_A = _mm256_madd_epi16(P_16_A, ONE); \ + P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH8); \ + P_32_B = _mm256_madd_epi16(P_16_B, ONE); \ + P_32 = _mm256_add_epi32(P_32_A, P_32_B); \ + _mm256_store_si256( \ + blockO_256 + 4 * OFFSET + 1, \ + _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 1), P_32)); \ + \ + P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD16); \ + P_32_A = _mm256_madd_epi16(P_16_A, ONE); \ + P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH16); \ + P_32_B = _mm256_madd_epi16(P_16_B, ONE); \ + P_32 = _mm256_add_epi32(P_32_A, P_32_B); \ + _mm256_store_si256( \ + blockO_256 + 4 * OFFSET + 2, \ + _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 2), P_32)); \ + \ + P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD24); \ + P_32_A = _mm256_madd_epi16(P_16_A, ONE); \ + P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH24); \ + P_32_B = _mm256_madd_epi16(P_16_B, ONE); \ + P_32 = _mm256_add_epi32(P_32_A, P_32_B); \ + _mm256_store_si256( \ + blockO_256 + 4 * OFFSET + 3, \ + _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 3), P_32)); + + // Permute and shuffle to copy a single value across the entire vector + // Then compute the multiplication + __m256i R_AH0_ = _mm256_permute2x128_si256(R_AH0, R_AH0, 0x00); + __m256i R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + __m256i R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 0); + __m256i R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + __m256i R_EH1 = 
_mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 1); + R_AH0_ = _mm256_permute2x128_si256(R_AH0, R_AH0, 0x11); + __m256i R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + __m256i R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 2); + __m256i R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + __m256i R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 3); + + R_AH0_ = _mm256_permute2x128_si256(R_AH4, R_AH4, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 4); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 5); + R_AH0_ = _mm256_permute2x128_si256(R_AH4, R_AH4, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 6); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 7); + + R_AH0_ = _mm256_permute2x128_si256(R_AH8, R_AH8, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 8); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 9); + R_AH0_ = _mm256_permute2x128_si256(R_AH8, R_AH8, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 10); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 11); + + R_AH0_ = _mm256_permute2x128_si256(R_AH12, R_AH12, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 12); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 13); + R_AH0_ = _mm256_permute2x128_si256(R_AH12, R_AH12, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 14); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 15); + + R_AH0_ = _mm256_permute2x128_si256(R_AH16, R_AH16, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 16); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 17); + R_AH0_ = _mm256_permute2x128_si256(R_AH16, R_AH16, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 18); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 19); + + R_AH0_ = _mm256_permute2x128_si256(R_AH20, R_AH20, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 20); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 21); + R_AH0_ = _mm256_permute2x128_si256(R_AH20, R_AH20, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 22); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 23); + + R_AH0_ = 
_mm256_permute2x128_si256(R_AH24, R_AH24, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 24); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 25); + R_AH0_ = _mm256_permute2x128_si256(R_AH24, R_AH24, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 26); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 27); + + R_AH0_ = _mm256_permute2x128_si256(R_AH28, R_AH28, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 28); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 29); + R_AH0_ = _mm256_permute2x128_si256(R_AH28, R_AH28, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 30); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 31); + +#undef COMPUTE_STEP + } + + // Transfer the results to the result matrix + Index i = 0; + for (Index j = n; j < n + 32; j++) { + LinearMapper r0 = res.getLinearMapper(m, j); + LinearMapper r1 = res.getLinearMapper(m + 8, j); + LinearMapper r2 = res.getLinearMapper(m + 16, j); + LinearMapper r3 = res.getLinearMapper(m + 24, j); + r0.storePacket( + 0, _mm256_add_epi32(blockO_256[i++], r0.loadPacket(0))); + r1.storePacket( + 0, _mm256_add_epi32(blockO_256[i++], r1.loadPacket(0))); + r2.storePacket( + 0, _mm256_add_epi32(blockO_256[i++], r2.loadPacket(0))); + r3.storePacket( + 0, _mm256_add_epi32(blockO_256[i++], r3.loadPacket(0))); + } + + // Zero the result block so it can be reused + memset(blockO, 0, 32 * 32 * sizeof(QInt32)); + } + } + aligned_delete(blockO, 32 * 32); +} + +#endif // EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_AVX2_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h new file mode 100644 index 0000000000..99894cafb5 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h @@ -0,0 +1,95 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> +// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_NEON_H +#define EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_NEON_H + + +namespace Eigen { +namespace internal { + + +// AVX2 optimized implementation of the case where the lhs is encoded using signed 8bit +// integers and the rhs using unsigned 8bit integers. 
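+//
+// Note that this header is the NEON variant of the fixed-point mat-mat
+// product: the gebp_kernel specialization below is a plain scalar reference
+// loop rather than a vectorized kernel. For every output coefficient (i, j)
+// it accumulates the dot product of row i of the packed lhs with column j of
+// the packed rhs over the depth dimension, i.e.
+// res(i, j) += sum_k blockA[i * depth + k] * blockB[j * depth + k].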
+#ifdef EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT + +template<bool _ConjLhs, bool _ConjRhs> +class gebp_traits<QInt8, QUInt8, _ConjLhs, _ConjRhs> +{ +public: + typedef QInt8 LhsScalar; + typedef QUInt8 RhsScalar; + typedef QInt32 ResScalar; + + enum { + // register block size along the M and N directions + // One for the current implementation + nr = 1, + mr = 1, + // Progress made at each iteration of the product loop + // also 1 for the current implementation + LhsProgress = 1, + RhsProgress = 1 + }; +}; + +// Mat-Mat product of a signed 8bit lhs with an unsigned 8bit rhs +template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> +struct gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +{ + EIGEN_DONT_INLINE + void operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> +EIGEN_DONT_INLINE +void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +::operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) +{ + EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + + eigen_assert(alpha.value == 1); + eigen_assert(strideA == -1); + eigen_assert(strideB == -1); + eigen_assert(offsetA == 0); + eigen_assert(offsetB == 0); + + eigen_assert(rows > 0); + eigen_assert(cols > 0); + eigen_assert(depth > 0); + eigen_assert(blockA); + eigen_assert(blockB); + + for (Index j = 0; j < cols; ++j) { + Index startB = j * depth; + + for (Index i = 0; i < rows; ++i) { + Index startA = i * depth; + + for (Index k = 0; k < depth; ++k) { + res(i, j) += blockA[startA + k] * blockB[startB + k]; + } + } + } +} +#endif + + +} // namespace internal +} // namespace Eigen + + + +#endif // EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_NEON_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h new file mode 100644 index 0000000000..18b5085b89 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h @@ -0,0 +1,123 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_CXX11_FIXED_POINT_MAT_VEC_PRODUCT_H +#define EIGEN_CXX11_FIXED_POINT_MAT_VEC_PRODUCT_H + + +namespace Eigen { +namespace internal { + +// Mat-Vec product +// Both lhs and rhs are encoded as 8bit signed integers +template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> +struct general_matrix_vector_product<Index,QInt8,LhsMapper,ColMajor,ConjugateLhs,QInt8,RhsMapper,ConjugateRhs,Version> +{ +EIGEN_DONT_INLINE static void run( + Index rows, Index cols, + const LhsMapper& lhs, + const RhsMapper& rhs, + QInt32* res, Index resIncr, + QInt8 alpha); +}; + +template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> +EIGEN_DONT_INLINE void general_matrix_vector_product<Index,QInt8,LhsMapper,ColMajor,ConjugateLhs,QInt8,RhsMapper,ConjugateRhs,Version>::run( + Index rows, Index cols, + const LhsMapper& lhs, + const RhsMapper& rhs, + QInt32* res, Index resIncr, + QInt8 alpha) +{ + eigen_assert(alpha.value == 1); + eigen_assert(resIncr == 1); + eigen_assert(rows > 0); + eigen_assert(cols > 0); + + for (Index i = 0; i < rows; ++i) { + for (Index j = 0; j < cols; ++j) { + res[i] += lhs(i, j) * rhs(j, 0); + } + } +} + + +// Mat-Vec product +// The lhs is encoded using 8bit signed integers, the rhs using 8bit unsigned integers +template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> +struct general_matrix_vector_product<Index,QInt8,LhsMapper,ColMajor,ConjugateLhs,QUInt8,RhsMapper,ConjugateRhs,Version> +{ +EIGEN_DONT_INLINE static void run( + Index rows, Index cols, + const LhsMapper& lhs, + const RhsMapper& rhs, + QInt32* res, Index resIncr, + QUInt8 alpha); +}; + +template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> +EIGEN_DONT_INLINE void general_matrix_vector_product<Index,QInt8,LhsMapper,ColMajor,ConjugateLhs,QUInt8,RhsMapper,ConjugateRhs,Version>::run( + Index rows, Index cols, + const LhsMapper& lhs, + const RhsMapper& rhs, + QInt32* res, Index resIncr, + QUInt8 alpha) +{ + eigen_assert(alpha.value == 1); + eigen_assert(resIncr == 1); + eigen_assert(rows > 0); + eigen_assert(cols > 0); + + for (Index i = 0; i < rows; ++i) { + for (Index j = 0; j < cols; ++j) { + res[i] += lhs(i, j) * rhs(j, 0); + } + } +} + + +// Mat-Vec product +// The lhs is encoded using bit unsigned integers, the rhs using 8bit signed integers +template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> +struct general_matrix_vector_product<Index,QUInt8,LhsMapper,ColMajor,ConjugateLhs,QInt8,RhsMapper,ConjugateRhs,Version> +{ +EIGEN_DONT_INLINE static void run( + Index rows, Index cols, + const LhsMapper& lhs, + const RhsMapper& rhs, + QInt32* res, Index resIncr, + QInt8 alpha); +}; + +template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> +EIGEN_DONT_INLINE void general_matrix_vector_product<Index,QUInt8,LhsMapper,ColMajor,ConjugateLhs,QInt8,RhsMapper,ConjugateRhs,Version>::run( + Index rows, Index cols, + const LhsMapper& lhs, + const RhsMapper& rhs, + QInt32* res, Index resIncr, + QInt8 alpha) +{ + eigen_assert(alpha.value == 1); + eigen_assert(resIncr == 1); + eigen_assert(rows > 0); + eigen_assert(cols > 0); + + for (Index i = 0; i < rows; ++i) { + for (Index j = 0; j < cols; ++j) { + res[i] += lhs(i, j) * rhs(j, 0); + } + } +} + +} // namespace internal +} // 
namespace Eigen + + + +#endif // EIGEN_CXX11_FIXED_POINT_MAT_VEC_PRODUCT_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h new file mode 100644 index 0000000000..cae1a0b06d --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h @@ -0,0 +1,409 @@ +#ifndef THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_ +#define THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_ + +namespace Eigen { +namespace internal { + +typedef struct Packet32q8i { + __m256i val; + operator __m256i() const { return val; } + Packet32q8i(); + Packet32q8i(__m256i val) : val(val) {} +} Packet32q8i; + +typedef struct Packet32q8u { + __m256i val; + operator __m256i() const { return val; } + Packet32q8u(); + Packet32q8u(__m256i val) : val(val) {} +} Packet32q8u; + +typedef struct Packet16q8i { + __m128i val; + operator __m128i() const { return val; } + Packet16q8i(); + Packet16q8i(__m128i val) : val(val) {} +} Packet16q8i; + +typedef struct Packet16q8u { + __m128i val; + operator __m128i() const { return val; } + Packet16q8u(); + Packet16q8u(__m128i val) : val(val) {} +} Packet16q8u; + +typedef struct Packet8q32i { + __m256i val; + operator __m256i() const { return val; } + Packet8q32i(); + Packet8q32i(__m256i val) : val(val) {} +} Packet8q32i; + +typedef struct Packet4q32i { + __m128i val; + operator __m128i() const { return val; } + Packet4q32i(); + Packet4q32i(__m128i val) : val(val) {} +} Packet4q32i; + +template <> +struct packet_traits<QInt8> : default_packet_traits { + typedef Packet32q8i type; + typedef Packet16q8i half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 32, + }; + enum { + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0 + }; +}; +template <> +struct packet_traits<QUInt8> : default_packet_traits { + typedef Packet32q8u type; + typedef Packet16q8u half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 32, + }; + enum { + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0 + }; +}; +template <> +struct packet_traits<QInt32> : default_packet_traits { + typedef Packet8q32i type; + typedef Packet4q32i half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + }; + enum { + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0 + }; +}; + +template <> +struct unpacket_traits<Packet32q8i> { + typedef QInt8 type; + typedef Packet16q8i half; + enum { size = 32 }; +}; +template <> +struct unpacket_traits<Packet32q8u> { + typedef QUInt8 type; + typedef Packet16q8u half; + enum { size = 32 }; +}; +template <> +struct unpacket_traits<Packet8q32i> { + typedef QInt32 type; + typedef Packet4q32i half; + enum { size = 8 }; +}; + +// Unaligned load +template <> +EIGEN_STRONG_INLINE Packet32q8i ploadu<Packet32q8i>(const QInt8* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256( + reinterpret_cast<const __m256i*>(from)); +} +template <> +EIGEN_STRONG_INLINE Packet32q8u ploadu<Packet32q8u>(const QUInt8* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256( + reinterpret_cast<const __m256i*>(from)); +} +template <> +EIGEN_STRONG_INLINE Packet8q32i ploadu<Packet8q32i>(const 
QInt32* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256( + reinterpret_cast<const __m256i*>(from)); +} + +// Aligned load +template <> +EIGEN_STRONG_INLINE Packet32q8i pload<Packet32q8i>(const QInt8* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256( + reinterpret_cast<const __m256i*>(from)); +} +template <> +EIGEN_STRONG_INLINE Packet32q8u pload<Packet32q8u>(const QUInt8* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256( + reinterpret_cast<const __m256i*>(from)); +} +template <> +EIGEN_STRONG_INLINE Packet8q32i pload<Packet8q32i>(const QInt32* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256( + reinterpret_cast<const __m256i*>(from)); +} + +// Unaligned store +template <> +EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet32q8i& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( + reinterpret_cast<__m256i*>(to), from.val); +} +template <> +EIGEN_STRONG_INLINE void pstoreu<QUInt8>(QUInt8* to, const Packet32q8u& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( + reinterpret_cast<__m256i*>(to), from.val); +} +template <> +EIGEN_STRONG_INLINE void pstoreu<QInt32>(QInt32* to, const Packet8q32i& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( + reinterpret_cast<__m256i*>(to), from.val); +} + +// Aligned store +template <> +EIGEN_STRONG_INLINE void pstore<QInt32>(QInt32* to, const Packet8q32i& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), + from.val); +} +template <> +EIGEN_STRONG_INLINE void pstore<QUInt8>(QUInt8* to, const Packet32q8u& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), + from.val); +} +template <> +EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet32q8i& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), + from.val); +} + +// Extract first element. +template <> +EIGEN_STRONG_INLINE QInt32 pfirst<Packet8q32i>(const Packet8q32i& a) { + return _mm_cvtsi128_si32(_mm256_castsi256_si128(a)); +} +template <> +EIGEN_STRONG_INLINE QUInt8 pfirst<Packet32q8u>(const Packet32q8u& a) { + return static_cast<uint8_t>(_mm256_extract_epi8(a.val, 0)); +} +template <> +EIGEN_STRONG_INLINE QInt8 pfirst<Packet32q8i>(const Packet32q8i& a) { + return _mm256_extract_epi8(a.val, 0); +} + +// Initialize to constant value. +template <> +EIGEN_STRONG_INLINE Packet32q8i pset1<Packet32q8i>(const QInt8& from) { + return _mm256_set1_epi8(from.value); +} +template <> +EIGEN_STRONG_INLINE Packet32q8u pset1<Packet32q8u>(const QUInt8& from) { + return _mm256_set1_epi8(static_cast<uint8_t>(from.value)); +} +template <> +EIGEN_STRONG_INLINE Packet8q32i pset1<Packet8q32i>(const QInt32& from) { + return _mm256_set1_epi32(from.value); +} + +// Basic arithmetic packet ops for QInt32. +template <> +EIGEN_STRONG_INLINE Packet8q32i padd<Packet8q32i>(const Packet8q32i& a, + const Packet8q32i& b) { + return _mm256_add_epi32(a.val, b.val); +} +template <> +EIGEN_STRONG_INLINE Packet8q32i psub<Packet8q32i>(const Packet8q32i& a, + const Packet8q32i& b) { + return _mm256_sub_epi32(a.val, b.val); +} +// Note: mullo truncates the result to 32 bits. +template <> +EIGEN_STRONG_INLINE Packet8q32i pmul<Packet8q32i>(const Packet8q32i& a, + const Packet8q32i& b) { + return _mm256_mullo_epi32(a.val, b.val); +} +template <> +EIGEN_STRONG_INLINE Packet8q32i pnegate<Packet8q32i>(const Packet8q32i& a) { + return _mm256_sub_epi32(_mm256_setzero_si256(), a.val); +} + +// Min and max. 
+template <> +EIGEN_STRONG_INLINE Packet8q32i pmin<Packet8q32i>(const Packet8q32i& a, + const Packet8q32i& b) { + return _mm256_min_epi32(a.val, b.val); +} +template <> +EIGEN_STRONG_INLINE Packet8q32i pmax<Packet8q32i>(const Packet8q32i& a, + const Packet8q32i& b) { + return _mm256_max_epi32(a.val, b.val); +} + +template <> +EIGEN_STRONG_INLINE Packet32q8u pmin<Packet32q8u>(const Packet32q8u& a, + const Packet32q8u& b) { + return _mm256_min_epu8(a.val, b.val); +} +template <> +EIGEN_STRONG_INLINE Packet32q8u pmax<Packet32q8u>(const Packet32q8u& a, + const Packet32q8u& b) { + return _mm256_max_epu8(a.val, b.val); +} + +template <> +EIGEN_STRONG_INLINE Packet32q8i pmin<Packet32q8i>(const Packet32q8i& a, + const Packet32q8i& b) { + return _mm256_min_epi8(a.val, b.val); +} +template <> +EIGEN_STRONG_INLINE Packet32q8i pmax<Packet32q8i>(const Packet32q8i& a, + const Packet32q8i& b) { + return _mm256_max_epi8(a.val, b.val); +} + +// Reductions. +template <> +EIGEN_STRONG_INLINE QInt32 predux_min<Packet8q32i>(const Packet8q32i& a) { + __m256i tmp = _mm256_min_epi32(a, _mm256_permute2f128_si256(a, a, 1)); + tmp = + _mm256_min_epi32(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + return pfirst<Packet8q32i>( + _mm256_min_epi32(tmp, _mm256_shuffle_epi32(tmp, 1))); +} +template <> +EIGEN_STRONG_INLINE QInt32 predux_max<Packet8q32i>(const Packet8q32i& a) { + __m256i tmp = _mm256_max_epi32(a, _mm256_permute2f128_si256(a, a, 1)); + tmp = + _mm256_max_epi32(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + return pfirst<Packet8q32i>( + _mm256_max_epi32(tmp, _mm256_shuffle_epi32(tmp, 1))); +} + +template <> +EIGEN_STRONG_INLINE QUInt8 predux_min<Packet32q8u>(const Packet32q8u& a) { + __m256i tmp = _mm256_min_epu8(a, _mm256_permute2f128_si256(a, a, 1)); + tmp = + _mm256_min_epu8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + tmp = _mm256_min_epu8(tmp, _mm256_shuffle_epi32(tmp, 1)); + tmp = _mm256_min_epu8(tmp, + _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + return std::min(static_cast<uint8_t>(_mm256_extract_epi8(tmp, 0)), + static_cast<uint8_t>(_mm256_extract_epi8(tmp, 1))); +} +template <> +EIGEN_STRONG_INLINE QUInt8 predux_max<Packet32q8u>(const Packet32q8u& a) { + __m256i tmp = _mm256_max_epu8(a, _mm256_permute2f128_si256(a, a, 1)); + tmp = + _mm256_max_epu8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + tmp = _mm256_max_epu8(tmp, _mm256_shuffle_epi32(tmp, 1)); + tmp = _mm256_max_epu8(tmp, + _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + return std::max(static_cast<uint8_t>(_mm256_extract_epi8(tmp, 0)), + static_cast<uint8_t>(_mm256_extract_epi8(tmp, 1))); +} + +template <> +EIGEN_STRONG_INLINE QInt8 predux_min<Packet32q8i>(const Packet32q8i& a) { + __m256i tmp = _mm256_min_epi8(a, _mm256_permute2f128_si256(a, a, 1)); + tmp = _mm256_min_epi8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + tmp = _mm256_min_epi8(tmp, _mm256_shuffle_epi32(tmp, 1)); + tmp = _mm256_min_epi8(tmp, _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + return std::min(_mm256_extract_epi8(tmp, 0), _mm256_extract_epi8(tmp, 1)); +} +template <> +EIGEN_STRONG_INLINE QInt8 predux_max<Packet32q8i>(const Packet32q8i& a) { + __m256i tmp = _mm256_max_epi8(a, _mm256_permute2f128_si256(a, a, 1)); + tmp = _mm256_max_epi8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + tmp = _mm256_max_epi8(tmp, _mm256_shuffle_epi32(tmp, 1)); + tmp = _mm256_max_epi8(tmp, _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + return 
std::max(_mm256_extract_epi8(tmp, 0), _mm256_extract_epi8(tmp, 1)); +} + +// Comparisons +template <> +EIGEN_STRONG_INLINE Packet8q32i peq<Packet8q32i>(const Packet8q32i& a, + const Packet8q32i& b) { + return _mm256_cmpeq_epi32(a.val, b.val); +} +template <> +EIGEN_STRONG_INLINE Packet32q8i peq<Packet32q8i>(const Packet32q8i& a, + const Packet32q8i& b) { + return _mm256_cmpeq_epi8(a.val, b.val); +} +template <> +EIGEN_STRONG_INLINE Packet32q8u peq<Packet32q8u>(const Packet32q8u& a, + const Packet32q8u& b) { + return _mm256_cmpeq_epi8(a.val, b.val); +} + +// Note: There are no instructions in AVX2 for unsigned lt/gt comparison. +// These are added in AVX-512. +template <> +EIGEN_STRONG_INLINE Packet8q32i ple<Packet8q32i>(const Packet8q32i& a, + const Packet8q32i& b) { + const __m256i gt = _mm256_cmpgt_epi32(a.val, b.val); + return _mm256_xor_si256(gt, gt); +} +template <> +EIGEN_STRONG_INLINE Packet32q8i ple<Packet32q8i>(const Packet32q8i& a, + const Packet32q8i& b) { + const __m256i gt = _mm256_cmpgt_epi8(a.val, b.val); + return _mm256_xor_si256(gt, gt); +} + +template <> +EIGEN_STRONG_INLINE Packet8q32i plt<Packet8q32i>(const Packet8q32i& a, + const Packet8q32i& b) { + return _mm256_cmpgt_epi32(b.val, a.val); +} +template <> +EIGEN_STRONG_INLINE Packet32q8i plt<Packet32q8i>(const Packet32q8i& a, + const Packet32q8i& b) { + return _mm256_cmpgt_epi8(b.val, a.val); +} + +// Vectorized scaling of Packet32q8i by float. +template <> +struct functor_traits<scalar_multiple2_op<QInt32, double>> { + enum { Cost = 4 * NumTraits<float>::MulCost, PacketAccess = true }; +}; + +template <> +EIGEN_STRONG_INLINE const Packet8q32i +scalar_multiple2_op<QInt32, double>::packetOp(const Packet8q32i& a) const { + __m256d scale = _mm256_set1_pd(m_other); + __m256d a_lo = _mm256_cvtepi32_pd(_mm256_castsi256_si128(a)); + __m128i result_lo = _mm256_cvtpd_epi32(_mm256_mul_pd(scale, a_lo)); + __m256d a_hi = _mm256_cvtepi32_pd(_mm256_extracti128_si256(a, 1)); + __m128i result_hi = _mm256_cvtpd_epi32(_mm256_mul_pd(scale, a_hi)); + return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, + 1); +} + +} // end namespace internal +} // end namespace Eigen + +#endif // THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_ diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h new file mode 100644 index 0000000000..045384d7fc --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h @@ -0,0 +1,66 @@ +#ifndef THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_ +#define THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_ + +namespace Eigen { +namespace internal { + +typedef __m256 Packet8f; + +template <> +struct type_casting_traits<QInt32, float> { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE Packet8f pcast<Packet8q32i>(const Packet8q32i& a) { + return _mm256_cvtepi32_ps(a.val); +} + +template <> +struct type_casting_traits<float, QInt32> { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE Packet8q32i pcast<Packet8f>(const Packet8f& a) { + return _mm256_cvtps_epi32(a); +} + +template <> +struct type_casting_traits<QInt32, QInt8> { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE Packet32q8i 
+pcast<Packet8q32i, Packet32q8i>(const Packet8q32i& a, const Packet8q32i& b, + const Packet8q32i& c, const Packet8q32i& d) { + __m256i converted = _mm256_packs_epi16(_mm256_packs_epi32(a.val, b.val), + _mm256_packs_epi32(c.val, d.val)); + // Since packs does not cross 128 bit lane boundaries, + // we have to permute to properly order the final result. + const __m256i permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + return _mm256_permutevar8x32_epi32(converted, permute_mask); +} + +template <> +struct type_casting_traits<QInt32, QUInt8> { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE Packet32q8u +pcast<Packet8q32i, Packet32q8u>(const Packet8q32i& a, const Packet8q32i& b, + const Packet8q32i& c, const Packet8q32i& d) { + const __m256i converted = _mm256_packus_epi16( + _mm256_packs_epi32(a.val, b.val), _mm256_packs_epi32(c.val, d.val)); + // Since packus does not cross 128 bit lane boundaries, + // we have to permute to properly order the final result. + const __m256i permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + return _mm256_permutevar8x32_epi32(converted, permute_mask); +} + +} // end namespace internal +} // end namespace Eigen + +#endif // THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_ diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Activations.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Activations.h new file mode 100644 index 0000000000..94d616f2b5 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Activations.h @@ -0,0 +1,116 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#ifndef EIGEN_CXX11_NEURAL_NETWORKS_ACTIVATIONS_H +#define EIGEN_CXX11_NEURAL_NETWORKS_ACTIVATIONS_H + +namespace Eigen { + +/** scalar_sigmoid_fast_derivative_op + * \ingroup CXX11_NeuralNetworks_Module + * \brief Template functor to compute the fast derivative of a sigmoid + * + * Input should be the backpropagated gradient. + * + * \sa class CwiseUnaryOp, Cwise::sigmoid_fast_derivative() + */ +template <typename T> +struct scalar_sigmoid_fast_derivative_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_fast_derivative_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& y) const { + const T one = T(1); + return (one - y) * y; + } + + template <typename Packet> + inline Packet packetOp(const Packet& y) const { + const Packet one = internal::pset1<Packet>(1); + return internal::pmul(internal::psub(one, y), y); + } +}; + +namespace internal { +template <typename T> +struct functor_traits<scalar_sigmoid_fast_derivative_op<T> > { + enum { + Cost = NumTraits<T>::AddCost * 2 + NumTraits<T>::MulCost, + PacketAccess = packet_traits<T>::HasAdd && packet_traits<T>::HasMul && + packet_traits<T>::HasNegate + }; +}; +} // namespace internal + +/** scalar_tanh_fast_derivative_op + * \ingroup CXX11_NeuralNetworks_Module + * \brief Template functor to compute the fast derivative of a tanh + * + * Input should be the backpropagated gradient. 
+ * + * \sa class CwiseUnaryOp, Cwise::tanh_fast_derivative() + */ +template <typename T> +struct scalar_tanh_fast_derivative_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_tanh_fast_derivative_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& y) const { + const T one = T(1); + return one - (y * y); + } + + template <typename Packet> + inline Packet packetOp(const Packet& y) const { + const Packet one = internal::pset1<Packet>(1); + return internal::psub(one, internal::pmul(y, y)); + } +}; + +namespace internal { +template <typename T> +struct functor_traits<scalar_tanh_fast_derivative_op<T> > { + enum { + Cost = NumTraits<T>::AddCost * 2 + NumTraits<T>::MulCost * 1, + PacketAccess = packet_traits<T>::HasAdd && packet_traits<T>::HasMul && + packet_traits<T>::HasNegate + }; +}; +} // namespace internal + +/** + * \ingroup CXX11_NeuralNetworks_Module + * \brief Template functor to clip the the magnitude of the first scalar. + * + * \sa class CwiseBinaryOp, MatrixBase::Clip + */ +template <typename Scalar> +struct scalar_clip_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_clip_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar + operator()(const Scalar& a, const Scalar& b) const { + return numext::mini(numext::maxi(a, -b), b); + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet + packetOp(const Packet& a, const Packet& b) const { + return internal::pmin(internal::pmax(a, internal::pnegate(b)), b); + } +}; + +namespace internal { +template <typename Scalar> +struct functor_traits<scalar_clip_op<Scalar> > { + enum { + Cost = NumTraits<Scalar>::AddCost * 3, + PacketAccess = packet_traits<Scalar>::HasMax && + packet_traits<Scalar>::HasMin && + packet_traits<Scalar>::HasNegate + }; +}; +} // namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_NEURAL_NETWORKS_ACTIVATIONS_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Attention.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Attention.h new file mode 100644 index 0000000000..d4bc7a3515 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Attention.h @@ -0,0 +1,209 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#ifndef EIGEN_CXX11_NEURAL_NETWORKS_ATTENTION_H +#define EIGEN_CXX11_NEURAL_NETWORKS_ATTENTION_H + +namespace Eigen { + +/** ExtractGlimpses + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Extract glimpses from an input tensor. + * + * The input parameter is expected to be a col-major tensor with a rank of 4 (depth, x, y, and batch). + * The width and height parameters specify the extension of the returned glimpses. + * The offsets parameter specifies the x, y locations of the center of the glimpses relative to the center of the input image. The vector is expected to contain one IndexPair for each image in the batch dimension. + * The normalized boolean indicates if incoming coordinates are normalized so that 0.0 and 1.0 correspond to the minimum and maximum of each height and width dimension. 
+ * The centered boolean indicates if incoming coordinates are centered relative to the image, in which case -1.0 and 1.0 correspond to minimum and maximum of each dimension while 0.0 corresponds to the center. + * + * The result can be assigned to a tensor of rank equal to that of the input. The result will be laid out in col-major order (depth, x, y, batch). + * The dimensions of the result will be equal to the dimensions of the input except for width and height which will be equal to the requested glimpse size. + */ +namespace { +template <typename Index> +struct GlimpseExtractionOp { + GlimpseExtractionOp(const Index width, const Index height, + const std::vector<IndexPair<float> >& offsets, + const bool normalized, + const bool centered, + const bool uniform_noise) : + width_(width), height_(height), offsets_(offsets), + normalized_(normalized), centered_(centered), uniform_noise_(uniform_noise) { } + + template <typename Input> + DSizes<Index, 4> dimensions(const Input& input) const { + typedef typename internal::traits<Input>::Index IndexType; + typedef TensorRef<Tensor<typename internal::traits<Input>::Scalar, 4, + internal::traits<Input>::Layout, IndexType> > Ref; + Ref in(input); + + DSizes<Index, 4> dims = in.dimensions(); + + dims[0] = in.dimension(0); + dims[1] = width_; + dims[2] = height_; + dims[3] = in.dimension(3); + return dims; + } + + template <typename Input, typename Output, typename Device> + EIGEN_DEVICE_FUNC + void eval(const Input& input, Output& output, const Device& device) const + { + typedef typename internal::traits<Input>::Index IndexType; + typedef TensorRef<Tensor<typename internal::traits<Input>::Scalar, 4, + internal::traits<Input>::Layout, IndexType> > Ref; + Ref in(input); + + const Index num_channels = in.dimension(0); + const Index input_width = in.dimension(1); + const Index input_height = in.dimension(2); + const Index batch_size = in.dimension(3); + eigen_assert(input_width > 0); + eigen_assert(input_height > 0); + + for (Index i = 0; i < batch_size; ++i) { + float x = offsets_[i].first, y = offsets_[i].second; + + // Un-normalize coordinates back to pixel space if normalized. + if (normalized_) { + x *= input_width; + y *= input_height; + } + // Un-center if coordinates are centered on the image center. + if (centered_) { + x /= 2.0f; + y /= 2.0f; + x += input_width / 2.0f; + y += input_height / 2.0f; + } + // Remove half of the glimpse window. 
+ x -= width_ / 2.0f; + y -= height_ / 2.0f; + + const Index offset_x = (Index) x; + const Index offset_y = (Index) y; + Index glimpse_width = width_; + Index glimpse_height = height_; + bool partial_overlap = false; + DSizes<Index, 3> slice_offset(0, offset_x, offset_y); + DSizes<Index, 3> slice_extent(num_channels, width_, height_); + DSizes<Index, 3> base_offset(0, 0, 0); + + if (offset_x < 0) { + slice_offset[1] = 0; + glimpse_width = (std::max<Index>)(0, width_ + offset_x); + slice_extent[1] = glimpse_width; + base_offset[1] = width_ - glimpse_width; + partial_overlap = true; + } else if (offset_x + width_ >= input_width) { + glimpse_width = (std::max<Index>)(0, input_width - offset_x); + slice_extent[1] = glimpse_width; + partial_overlap = true; + } + if (offset_y < 0) { + slice_offset[2] = 0; + glimpse_height = (std::max<Index>)(0, height_ + offset_y); + slice_extent[2] = glimpse_height; + base_offset[2] = height_ - glimpse_height; + partial_overlap = true; + } else if (offset_y + height_ >= input_height) { + glimpse_height = (std::max<Index>)(0, input_height - offset_y); + slice_extent[2] = glimpse_height; + partial_overlap = true; + } + slice_extent[1] = std::min<Index>(input_width, slice_extent[1]); + slice_extent[2] = std::min<Index>(input_height, slice_extent[2]); + + if (partial_overlap) { + if (uniform_noise_) { + // Initialize the glimpse with uniform noise. + typedef typename internal::remove_const< + typename internal::traits<Input>::Scalar>::type Scalar; + TensorFixedSize<Scalar, Sizes<> > mini; + mini.device(device) = input.template chip<3>(i).minimum(); + TensorFixedSize<float, Sizes<> > range; + range.device(device) = + (input.template chip<3>(i).maximum() - mini).template cast<float>(); + + DSizes<Index, 3> glimpse_size(num_channels, width_, height_); + TensorMap<Tensor<float, 3> > tmp(NULL, glimpse_size); + output.template chip<3>(i).device(device) = + mini.reshape(Sizes<1,1,1>()).broadcast(glimpse_size) + + (tmp.random() * range.reshape(Sizes<1,1,1>()).broadcast(glimpse_size)).template cast<Scalar>(); + } else { + // Initialize the glimpse with white noise: compute the mean and sigma + // of each channel, and use them to shape the gaussian. 
+ DSizes<Index, 2> glimpse_size(width_, height_); + DSizes<Index, 2> input_size(input_width, input_height); + typedef typename internal::remove_const< + typename internal::traits<Input>::Scalar>::type Scalar; + + for (int j = 0; j < num_channels; ++j) { + TensorFixedSize<Scalar, Sizes<> > mean; + mean.device(device) = input.template chip<3>(i).template chip<0>(j).template cast<float>().mean(); + TensorFixedSize<float, Sizes<> > sigma; + sigma.device(device) = + (input.template chip<3>(i).template chip<0>(j).template cast<float>() - mean.reshape(Sizes<1,1>()).broadcast(input_size)).square().mean().sqrt(); + TensorFixedSize<Scalar, Sizes<> > mini; + mini.device(device) = input.template chip<3>(i).template chip<0>(j).minimum(); + TensorFixedSize<float, Sizes<> > maxi; + maxi.device(device) = input.template chip<3>(i).template chip<0>(j).maximum(); + + TensorMap<Tensor<float, 2> > tmp(NULL, glimpse_size); + output.template chip<3>(i).template chip<0>(j).device(device) = + (mean.reshape(Sizes<1,1>()).broadcast(glimpse_size) + + (tmp.random(internal::NormalRandomGenerator<float>()) * sigma.reshape(Sizes<1,1>()).broadcast(glimpse_size)).template cast<Scalar>()).cwiseMin(maxi.reshape(Sizes<1,1>()).broadcast(glimpse_size)).cwiseMax(mini.reshape(Sizes<1,1>()).broadcast(glimpse_size)); + } + } + + // Copy the part of the glimpse that cover the input image if any. + if (glimpse_width == 0 || glimpse_height == 0) { + continue; + } + output.template chip<3>(i).slice(base_offset, slice_extent).device(device) = input.template chip<3>(i).slice(slice_offset, slice_extent); + } else { + output.template chip<3>(i).device(device) = input.template chip<3>(i).slice(slice_offset, slice_extent); + } + } + } + + private: + const Index width_; + const Index height_; + const std::vector<IndexPair<float> > offsets_; + const bool normalized_; + const bool centered_; + const bool uniform_noise_; +}; +} + + +template <typename Input> +EIGEN_ALWAYS_INLINE +static const TensorCustomUnaryOp<const GlimpseExtractionOp<typename internal::traits<Input>::Index>, const Input> +ExtractGlimpses(const Input& input, + const typename internal::traits<Input>::Index width, + const typename internal::traits<Input>::Index height, + const std::vector<IndexPair<float> >& offsets, + const bool normalized = true, const bool centered = true, + const bool uniform_noise = true) +{ + EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 4, YOU_MADE_A_PROGRAMMING_MISTAKE); + + typedef typename internal::traits<Input>::Index Index; + const GlimpseExtractionOp<Index> op(width, height, offsets, normalized, + centered, uniform_noise); + return input.customOp(op); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_NEURAL_NETWORKS_ATTENTION_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardCuboidConvolutions.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardCuboidConvolutions.h new file mode 100644 index 0000000000..12ce23444c --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardCuboidConvolutions.h @@ -0,0 +1,523 @@ +#ifndef EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_CUBOID_CONVOLUTIONS_H +#define EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_CUBOID_CONVOLUTIONS_H + +#include "Patch3d.h" + +namespace Eigen { + +/** CuboidConvolutionBackwardInput + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Computes the backprop for the input of a 3D convolution. 
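+ * In other words, given the gradient of the loss with respect to the output of the forward convolution, this returns the gradient with respect to its input.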
+ * + * The output_backward parameter is expected to be a tensor with a rank of 4 or more (channels, depth, height, width, and optionally others) + * The kernel parameter is expected to be a 5D tensor (filters, channels, kernel_depth, kernel_height, kernel_width) + * output_backward and kernel have to be in the same layout. + * + * The dimensions of the result will be filters, depth, height, width (and others if applicable). + * + * It is possible to swap the order of the depth, width and height dimensions provided that the same order is used in the input, the kernel, and the output. + * + * All dimension orders above are given for col-major, and should be reversed for row-major. + */ + +template <typename OutputBackward, typename Kernel> +EIGEN_ALWAYS_INLINE static const typename internal::conditional< + internal::traits<OutputBackward>::Layout == ColMajor, + TensorReshapingOp< + const DSizes<typename internal::traits<OutputBackward>::Index, + internal::traits<OutputBackward>::NumDimensions>, + const TensorContractionOp< + const array< IndexPair<typename internal::traits<OutputBackward>::Index>, 2>, + const TensorReshapingOp< + const DSizes< typename internal::traits<OutputBackward>::Index, 3>, + const TensorReverseOp<const array<bool, 5>, const Kernel> + >, + const TensorReshapingOp< + const DSizes< typename internal::traits<OutputBackward>::Index, 3>, + const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const OutputBackward> + > + > + >, + TensorReshapingOp< + const DSizes<typename internal::traits<OutputBackward>::Index, + internal::traits<OutputBackward>::NumDimensions>, + const TensorContractionOp< + const array< IndexPair<typename internal::traits<OutputBackward>::Index>, 2>, + const TensorReshapingOp< + const DSizes< typename internal::traits<OutputBackward>::Index, 3>, + const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const OutputBackward> + >, + const TensorReshapingOp< + const DSizes<typename internal::traits<OutputBackward>::Index, 3>, + const TensorReverseOp<const array<bool, 5>, const Kernel> + > + > + > +>::type +CuboidConvolutionBackwardInput( + const Kernel& kernel, const OutputBackward& output_backward, + typename internal::traits<OutputBackward>::Index inputPlanes, + typename internal::traits<OutputBackward>::Index inputRows, + typename internal::traits<OutputBackward>::Index inputCols, + const DenseIndex stridePlanes = 1, const DenseIndex strideRows = 1, + const DenseIndex strideCols = 1) { + typedef typename internal::traits<OutputBackward>::Index TensorIndex; + const TensorRef<const Tensor<typename internal::traits<Kernel>::Scalar, internal::traits<Kernel>::NumDimensions, internal::traits<Kernel>::Layout, TensorIndex> > kern(kernel); + const TensorRef<const Tensor<typename internal::traits<OutputBackward>::Scalar, internal::traits<OutputBackward>::NumDimensions, internal::traits<OutputBackward>::Layout, TensorIndex> > out(output_backward); + + EIGEN_STATIC_ASSERT(internal::traits<Kernel>::Layout == internal::traits<OutputBackward>::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE); + + static const bool isColMajor = (internal::traits<OutputBackward>::Layout == ColMajor); + + static const int NumDims = internal::traits<OutputBackward>::NumDimensions; + + // Number of filters to apply. This is the same as the output depth of the result + const TensorIndex kernelFilters = isColMajor ? kern.dimensions()[0] : kern.dimensions()[4]; + // Number of channels. This is the same as the input depth. + const TensorIndex kernelChannels = isColMajor ? 
kern.dimensions()[1] : kern.dimensions()[3]; + const TensorIndex kernelPlanes = isColMajor ? kern.dimensions()[2] : kern.dimensions()[2]; + const TensorIndex kernelRows = isColMajor ? kern.dimensions()[3] : kern.dimensions()[1]; + const TensorIndex kernelCols = isColMajor ? kern.dimensions()[4] : kern.dimensions()[0]; + + const TensorIndex outputPlanes = isColMajor ? out.dimensions()[1] : out.dimensions()[NumDims - 2]; + const TensorIndex outputRows = isColMajor ? out.dimensions()[2] : out.dimensions()[NumDims - 3]; + const TensorIndex outputCols = isColMajor ? out.dimensions()[3] : out.dimensions()[NumDims - 4]; + + TensorIndex forward_pad_z, forward_pad_y, forward_pad_x; + const TensorIndex size_z = ceil(inputPlanes / static_cast<float>(stridePlanes)); + const TensorIndex size_y = ceil(inputRows / static_cast<float>(strideRows)); + const TensorIndex size_x = ceil(inputCols / static_cast<float>(strideCols)); + + // Infer padding type. + if (size_z == outputPlanes && size_y == outputRows && size_x == outputCols) { + // SAME padding. + const TensorIndex dz = size_z * stridePlanes + kernelPlanes - 1 - inputPlanes; + const TensorIndex dy = size_y * strideRows + kernelRows - 1 - inputRows; + const TensorIndex dx = size_x * strideCols + kernelCols - 1 - inputCols; + + forward_pad_z = dz - dz / 2; + forward_pad_y = dy - dy / 2; + forward_pad_x = dx - dx / 2; + } else { + // VALID padding. + forward_pad_z = 0; + forward_pad_y = 0; + forward_pad_x = 0; + } + const TensorIndex padding_ztop = kernelPlanes - 1 - forward_pad_z; + const TensorIndex padding_top = kernelRows - 1 - forward_pad_y; + const TensorIndex padding_left = kernelCols - 1 - forward_pad_x; + + const TensorIndex padding_zbottom = inputPlanes + kernelPlanes - 1 - (outputPlanes - 1) * stridePlanes - 1 - padding_ztop; + const TensorIndex padding_bottom = inputRows + kernelRows - 1 - (outputRows - 1) * strideRows - 1 - padding_top; + const TensorIndex padding_right = inputCols + kernelCols - 1 - (outputCols - 1) * strideCols - 1 - padding_left; + + eigen_assert(padding_ztop >= 0); + eigen_assert(padding_zbottom >= 0); + eigen_assert(padding_top >= 0); + eigen_assert(padding_left >= 0); + eigen_assert(padding_bottom >= 0); + eigen_assert(padding_right >= 0); + + // The kernel has dimensions filters X channels X patch_planes X patch_rows X patch_cols. + // We need to reverse the kernel along the spatial dimensions. 
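+ // The gradient with respect to the input is a "full"-padded correlation of
+ // output_backward with the spatially flipped kernel; the reverse() below
+ // performs that flip.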
+ array<bool, 5> kernel_reverse; + if (isColMajor) { + kernel_reverse[0] = false; + kernel_reverse[1] = false; + kernel_reverse[2] = true; + kernel_reverse[3] = true; + kernel_reverse[4] = true; + } else { + kernel_reverse[0] = true; + kernel_reverse[1] = true; + kernel_reverse[2] = true; + kernel_reverse[3] = false; + kernel_reverse[4] = false; + } + + DSizes<TensorIndex, 3> kernel_dims; + if (isColMajor) { + kernel_dims[0] = kernelFilters; + kernel_dims[1] = kernelChannels; + kernel_dims[2] = kernelRows * kernelCols * kernelPlanes; + } else { + kernel_dims[0] = kernelRows * kernelCols * kernelPlanes; + kernel_dims[1] = kernelChannels; + kernel_dims[2] = kernelFilters; + } + + // The output_backward has dimensions out_depth X out_planes X out_rows X out_cols X OTHERS + // When we extract the image patches from output_backward, it will have dimensions: + // out_depth X (patch_planes * patch_rows * patch_cols) X (input_planes * input_rows * input_cols * OTHERS) + DSizes<TensorIndex, 3> pre_contract_dims; + if (isColMajor) { + pre_contract_dims[0] = kernelFilters; + pre_contract_dims[1] = kernelRows * kernelCols * kernelPlanes; + pre_contract_dims[2] = inputRows * inputCols * inputPlanes; + for (int i = 4; i < NumDims; ++i) { + pre_contract_dims[2] *= out.dimension(i); + } + } else { + pre_contract_dims[2] = kernelFilters; + pre_contract_dims[1] = kernelRows * kernelCols * kernelPlanes; + pre_contract_dims[0] = inputRows * inputCols * inputPlanes; + for (int i = 0; i < NumDims - 4; ++i) { + pre_contract_dims[0] *= out.dimension(i); + } + } + + // We will contract along dimensions (0, 2) in kernel and (0, 1) in + // output_backward, if this is col-major, and + // dimensions (0, 2) in kernel and (1, 2) in output_backward, if this row-major. + array<IndexPair<TensorIndex>, 2> contract_dims; + if (isColMajor) { + // col-major: kernel.contract(output.patches) + contract_dims[0] = IndexPair<TensorIndex>(0, 0); + contract_dims[1] = IndexPair<TensorIndex>(2, 1); + } else { + // row-major: output.patches.contract(kernel) + contract_dims[0] = IndexPair<TensorIndex>(1, 0); + contract_dims[1] = IndexPair<TensorIndex>(2, 2); + } + + // Post contraction, the dimensions of the input_backprop is + // channels X input_planes X input_rows X input_cols X OTHERS + DSizes<TensorIndex, NumDims> post_contract_dims; + if (isColMajor) { + post_contract_dims[0] = kernelChannels; + post_contract_dims[1] = inputPlanes; + post_contract_dims[2] = inputRows; + post_contract_dims[3] = inputCols; + for (int i = 4; i < NumDims; ++i) { + post_contract_dims[i] = out.dimension(i); + } + } else { + post_contract_dims[NumDims - 1] = kernelChannels; + post_contract_dims[NumDims - 2] = inputPlanes; + post_contract_dims[NumDims - 3] = inputRows; + post_contract_dims[NumDims - 4] = inputCols; + for (int i = 0; i < NumDims - 4; ++i) { + post_contract_dims[i] = out.dimension(i); + } + } + + DSizes<TensorIndex, NumDims> strides; + for (int i = 0; i < NumDims; i++) { + strides[i] = 1; + } + if (isColMajor) { + strides[1] = stridePlanes; + strides[2] = strideRows; + strides[3] = strideCols; + } else { + strides[NumDims - 2] = stridePlanes; + strides[NumDims - 3] = strideRows; + strides[NumDims - 4] = strideCols; + } + + return choose( + Cond<internal::traits<OutputBackward>::Layout == ColMajor>(), + kernel.reverse(kernel_reverse) + .reshape(kernel_dims) + .contract( + output_backward.extract_volume_patches(kernelPlanes, kernelRows, kernelCols, + 1, 1, 1, stridePlanes, strideRows, strideCols, + padding_ztop, padding_zbottom, + padding_top, 
padding_bottom, + padding_left, padding_right) + .reshape(pre_contract_dims), + contract_dims) + .reshape(post_contract_dims), + output_backward.extract_volume_patches(kernelPlanes, kernelRows, kernelCols, + 1, 1, 1, stridePlanes, strideRows, strideCols, + padding_ztop, padding_zbottom, + padding_top, padding_bottom, + padding_left, padding_right) + .reshape(pre_contract_dims) + .contract(kernel.reverse(kernel_reverse).reshape(kernel_dims), + contract_dims) + .reshape(post_contract_dims)); +} + + +/** CuboidConvolutionBackwardKernel + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Computes the backprop for the filter of a 3D convolution. + * + * The output_backward parameter is expected to be a tensor with a rank of 4 or more (channels, depth, height, width, and optionally others) + * The kernel parameter is expected to be a 4D tensor (filters, channels, kernel_depth, kernel_height, kernel_width) + * output_backward and kernel have to be in the same layout. + * + * The dimensions of the result will be filters, depth, height, width (and others if applicable). + * + * It is possible to swap the order of the depth, width and height dimensions provided that the same order is used in the input, the kernel, and the output. + * + * All dimension orders above are given for col-major, and should be reversed for row-major. + */ +template <typename OutputBackward, typename Input> +EIGEN_ALWAYS_INLINE static const typename internal::conditional< + internal::traits<OutputBackward>::Layout == ColMajor, + const TensorShufflingOp< + const array<typename internal::traits<OutputBackward>::Index, 5>, + const TensorReverseOp< + const array<bool, 5>, + const TensorReshapingOp< + const DSizes<typename internal::traits<OutputBackward>::Index, 5>, + const TensorContractionOp< + const array< IndexPair<typename internal::traits<Input>::Index>, 2>, + const TensorReshapingOp< + const DSizes<typename internal::traits<Input>::Index, 3>, + const Input>, + const TensorReshapingOp< + const DSizes< typename internal::traits<OutputBackward>::Index, 4>, + const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const OutputBackward> + > + > + > + > + >, + const TensorShufflingOp< + const array<typename internal::traits<OutputBackward>::Index, 5>, + const TensorReverseOp< + const array<bool, 5>, + const TensorReshapingOp< + const DSizes<typename internal::traits<OutputBackward>::Index, 5>, + const TensorContractionOp< + const array< IndexPair<typename internal::traits<Input>::Index>, 2>, + const TensorReshapingOp< + const DSizes< typename internal::traits<OutputBackward>::Index, 4>, + const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const OutputBackward> + >, + const TensorReshapingOp< + const DSizes<typename internal::traits<Input>::Index, 3>, + const Input + > + > + > + > + > +>::type +CuboidConvolutionBackwardKernel( + const Input& input, const OutputBackward& output_backward, + typename internal::traits<Input>::Index kernelPlanes, + typename internal::traits<Input>::Index kernelRows, + typename internal::traits<Input>::Index kernelCols, + const DenseIndex stridePlanes = 1, + const DenseIndex strideRows = 1, + const DenseIndex strideCols = 1) { + typedef typename internal::traits<Input>::Index TensorIndex; + TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input); + TensorRef<Tensor<typename internal::traits<OutputBackward>::Scalar, internal::traits<OutputBackward>::NumDimensions, 
internal::traits<OutputBackward>::Layout, TensorIndex> > out(output_backward); + + EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == internal::traits<OutputBackward>::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE); + + static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor); + + static const int NumDims = internal::traits<Input>::NumDimensions; + EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == internal::traits<OutputBackward>::NumDimensions, YOU_MADE_A_PROGRAMMING_MISTAKE); + + const TensorIndex inputPlanes = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2); + const TensorIndex inputRows = isColMajor ? in.dimension(2) : in.dimension(NumDims - 3); + const TensorIndex inputCols = isColMajor ? in.dimension(3) : in.dimension(NumDims - 4); + + const TensorIndex outputPlanes = isColMajor ? out.dimension(1) : out.dimension(NumDims - 2); + const TensorIndex outputRows = isColMajor ? out.dimension(2) : out.dimension(NumDims - 3); + const TensorIndex outputCols = isColMajor ? out.dimension(3) : out.dimension(NumDims - 4); + + const TensorIndex kernelFilters = isColMajor ? out.dimension(0) : out.dimension(NumDims - 1); + const TensorIndex kernelChannels = isColMajor ? in.dimension(0) : in.dimension(NumDims - 1); + + TensorIndex forward_pad_z, forward_pad_y, forward_pad_x; + const TensorIndex size_z = ceil(inputPlanes / static_cast<float>(stridePlanes)); + const TensorIndex size_y = ceil(inputRows / static_cast<float>(strideRows)); + const TensorIndex size_x = ceil(inputCols / static_cast<float>(strideCols)); + + // Infer padding type. + if (size_z == outputPlanes && size_y == outputRows && size_x == outputCols) { + // SAME padding. + const TensorIndex dz = size_z * stridePlanes + kernelPlanes - 1 - inputPlanes; + const TensorIndex dy = size_y * strideRows + kernelRows - 1 - inputRows; + const TensorIndex dx = size_x * strideCols + kernelCols - 1 - inputCols; + + forward_pad_z = dz - dz / 2; + forward_pad_y = dy - dy / 2; + forward_pad_x = dx - dx / 2; + } else { + // VALID padding. 
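+ // Since the forward pass used no padding here, the backward pass has to
+ // recreate the full (kernel - 1) border on each side (computed below) when
+ // extracting patches from output_backward.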
+ forward_pad_z = 0; + forward_pad_y = 0; + forward_pad_x = 0; + } + + const TensorIndex padding_ztop = kernelPlanes - 1 - forward_pad_z; + const TensorIndex padding_top = kernelRows - 1 - forward_pad_y; + const TensorIndex padding_left = kernelCols - 1 - forward_pad_x; + + const TensorIndex padding_zbottom = inputPlanes + kernelPlanes - 1 - (outputPlanes - 1) * stridePlanes - 1 - padding_ztop; + const TensorIndex padding_bottom = inputRows + kernelRows - 1 - (outputRows - 1) * strideRows - 1 - padding_top; + const TensorIndex padding_right = inputCols + kernelCols - 1 - (outputCols - 1) * strideCols - 1 - padding_left; + + eigen_assert(padding_ztop >= 0); + eigen_assert(padding_zbottom >= 0); + eigen_assert(padding_top >= 0); + eigen_assert(padding_left >= 0); + eigen_assert(padding_bottom >= 0); + eigen_assert(padding_right >= 0); + + // The output_backward has dimensions out_depth X out_plaens X out_rows X out_cols X OTHERS + // When we extract the image patches from output_backward (with input as the + // kernel), it will have dimensions + // (out_depth) X (input_planes * input_rows * input_cols) X (kernel_planes * kernel_rows * kernel_cols) X OTHERS + DSizes<TensorIndex, 4> pre_contract_dims; + if (isColMajor) { + pre_contract_dims[0] = kernelFilters; + pre_contract_dims[1] = inputRows * inputCols * inputPlanes; + pre_contract_dims[2] = kernelRows * kernelCols * kernelPlanes; + pre_contract_dims[3] = 1; + for (int i = 4; i < NumDims; ++i) { + pre_contract_dims[3] *= out.dimension(i); + } + } else { + pre_contract_dims[3] = kernelFilters; + pre_contract_dims[2] = inputRows * inputCols * inputPlanes; + pre_contract_dims[1] = kernelRows * kernelCols * kernelPlanes; + pre_contract_dims[0] = 1; + for (int i = 0; i < NumDims - 4; ++i) { + pre_contract_dims[0] *= out.dimension(i); + } + } + + // The input has dimensions in_depth X (input_planes * input_rows * input_cols) X OTHERS + DSizes<TensorIndex, 3> input_dims; + if (isColMajor) { + input_dims[0] = kernelChannels; + input_dims[1] = inputRows * inputCols * inputPlanes; + input_dims[2] = 1; + for (int i = 4; i < NumDims; ++i) { + input_dims[2] *= in.dimension(i); + } + eigen_assert(input_dims[2] == pre_contract_dims[3]); + } else { + input_dims[2] = kernelChannels; + input_dims[1] = inputRows * inputCols * inputPlanes; + input_dims[0] = 1; + for (int i = 0; i < NumDims - 4; ++i) { + input_dims[0] *= in.dimension(i); + } + eigen_assert(input_dims[0] == pre_contract_dims[0]); + } + + // We will contract along dimensions (1, 2) in in and (1, 3) in out, if + // this is col-major. + // For row-major, it's dimensions (0, 1) in in and (0, 2) in out. + array<IndexPair<TensorIndex>, 2> contract_dims; + if (isColMajor) { + // col-major: in.contract(output.patches) + contract_dims[0] = IndexPair<TensorIndex>(1, 1); + contract_dims[1] = IndexPair<TensorIndex>(2, 3); + } else { + // row-major: output.patches.contract(in) + contract_dims[0] = IndexPair<TensorIndex>(0, 0); + contract_dims[1] = IndexPair<TensorIndex>(2, 1); + } + + // After the contraction, the kernel will have dimension + // in_depth X out_depth X kernel_patches X kernel_rows X kernel_cols + // We will need to shuffle the first two dimensions and reverse the spatial dimensions. + // The end shape is: + // out_depth X in_shape X kernel_planes X kernel_rows X kernel_cols + + // This is the shape of the kernel *before* the shuffling. 
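+ // In col-major the contraction output is indexed (channels, filters, ...),
+ // hence the swap of the first two dimensions in kernel_shuffle below.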
+ DSizes<TensorIndex, 5> kernel_dims; + if (isColMajor) { + kernel_dims[0] = kernelChannels; + kernel_dims[1] = kernelFilters; + kernel_dims[2] = kernelPlanes; + kernel_dims[3] = kernelRows; + kernel_dims[4] = kernelCols; + } else { + kernel_dims[0] = kernelCols; + kernel_dims[1] = kernelRows; + kernel_dims[2] = kernelPlanes; + kernel_dims[3] = kernelFilters; + kernel_dims[4] = kernelChannels; + } + + // Flip filters and channels. + array<TensorIndex, 5> kernel_shuffle; + if (isColMajor) { + kernel_shuffle[0] = 1; + kernel_shuffle[1] = 0; + kernel_shuffle[2] = 2; + kernel_shuffle[3] = 3; + kernel_shuffle[4] = 4; + } else { + kernel_shuffle[0] = 0; + kernel_shuffle[1] = 1; + kernel_shuffle[2] = 2; + kernel_shuffle[3] = 4; + kernel_shuffle[4] = 3; + } + + // Reverse the spatial dimensions. + array<bool, 5> kernel_reverse; + if (isColMajor) { + kernel_reverse[0] = false; + kernel_reverse[1] = false; + kernel_reverse[2] = true; + kernel_reverse[3] = true; + kernel_reverse[4] = true; + } else { + kernel_reverse[0] = true; + kernel_reverse[1] = true; + kernel_reverse[2] = true; + kernel_reverse[3] = false; + kernel_reverse[4] = false; + } + + DSizes<TensorIndex, NumDims> strides; + for (int i = 0; i < NumDims; i++) { + strides[i] = 1; + } + if (isColMajor) { + strides[1] = stridePlanes; + strides[2] = strideRows; + strides[3] = strideCols; + } else { + strides[NumDims - 2] = stridePlanes; + strides[NumDims - 3] = strideRows; + strides[NumDims - 4] = strideCols; + } + return choose( + Cond<internal::traits<Input>::Layout == ColMajor>(), + input.reshape(input_dims) + .contract( + output_backward.extract_volume_patches( + inputPlanes, inputRows, inputCols, 1, + 1, 1, stridePlanes, strideRows, strideCols, + + padding_ztop, padding_zbottom, padding_top, + padding_bottom, padding_left, padding_right) + .reshape(pre_contract_dims), + contract_dims) + .reshape(kernel_dims) + .reverse(kernel_reverse) + .shuffle(kernel_shuffle), + output_backward.extract_volume_patches( + inputPlanes, inputRows, inputCols, 1, 1, 1, + stridePlanes, strideRows, strideCols, padding_ztop, + padding_zbottom, padding_top, padding_bottom, + padding_left, padding_right) + .reshape(pre_contract_dims) + .contract(input.reshape(input_dims), contract_dims) + .reshape(kernel_dims) + .reverse(kernel_reverse) + .shuffle(kernel_shuffle)); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_CUBOID_CONVOLUTIONS_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardSpatialConvolutions.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardSpatialConvolutions.h new file mode 100644 index 0000000000..188dc75bf6 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardSpatialConvolutions.h @@ -0,0 +1,351 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Ke Yang <yangke@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_SPATIAL_CONVOLUTIONS_H +#define EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_SPATIAL_CONVOLUTIONS_H + +namespace Eigen { + +/** SpatialConvolutionBackwardInput + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Computes the backprop for the input of a 2D convolution. 
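+ * Equivalently: given the gradient with respect to the output of a forward 2D convolution, this returns the gradient with respect to its input, computed as a convolution of output_backward with the spatially reversed kernel.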
+ * + * The output_backward parameter is expected to be a tensor with a rank of 3 or more (channels, height, width, and optionally others) + * The kernel parameter is expected to be a 4D tensor (filters, channels, kernel_height, kernel_width) + * The output_backward and the kernel must both be in col-major layout. The result will also be in col-major layout. + * + * If in_stride > 1, then applies convolution with holes (aka atrous convolution), sampling every in_stride input pixels. + * + * The result can be assigned to a tensor of rank equal to the rank of the output_backward. The dimensions of the result will be filters, height, width (and others if applicable). + * + * It is possible to swap the order of the width and height dimensions provided that the same order is used in the input, the kernel, and the output. + * + */ + +template <typename OutputBackward, typename Kernel> +EIGEN_ALWAYS_INLINE +static const typename internal::conditional< + internal::traits<OutputBackward>::Layout == ColMajor, + TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, internal::traits<OutputBackward>::NumDimensions>, const TensorContractionOp<const array<IndexPair<typename internal::traits<OutputBackward>::Index>, 2>, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 3>, const TensorReverseOp<const array<bool, 4>, const Kernel> >, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 3>, const TensorImagePatchOp<Dynamic, Dynamic, const OutputBackward> > > >, + TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, internal::traits<OutputBackward>::NumDimensions>, const TensorContractionOp<const array<IndexPair<typename internal::traits<OutputBackward>::Index>, 2>, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 3>, const TensorImagePatchOp<Dynamic, Dynamic, const OutputBackward> >, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 3>, const TensorReverseOp<const array<bool, 4>, const Kernel> > > > >::type +SpatialConvolutionBackwardInput(const Kernel& kernel, const OutputBackward& output_backward, typename internal::traits<OutputBackward>::Index inputRows, typename internal::traits<OutputBackward>::Index inputCols, const DenseIndex stride = 1, const DenseIndex in_stride = 1) { + + typedef typename internal::traits<OutputBackward>::Index TensorIndex; + TensorRef<Tensor<typename internal::traits<Kernel>::Scalar, internal::traits<Kernel>::NumDimensions, internal::traits<Kernel>::Layout, TensorIndex> > kern(kernel); + TensorRef<Tensor<typename internal::traits<OutputBackward>::Scalar, internal::traits<OutputBackward>::NumDimensions, internal::traits<OutputBackward>::Layout, TensorIndex> > out(output_backward); + + EIGEN_STATIC_ASSERT(internal::traits<Kernel>::Layout == internal::traits<OutputBackward>::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE); + + static const bool isColMajor = (internal::traits<OutputBackward>::Layout == ColMajor); + + static const int NumDims = internal::traits<OutputBackward>::NumDimensions; + + // Number of filters to apply. This is the same as the output depth of the result + const TensorIndex kernelFilters = isColMajor ? kern.dimensions()[0] : kern.dimensions()[3]; + // Number of channels. This is the same as the input depth. + const TensorIndex kernelChannels = isColMajor ? kern.dimensions()[1] : kern.dimensions()[2]; + const TensorIndex kernelRows = isColMajor ? 
kern.dimensions()[2] : kern.dimensions()[1]; + const TensorIndex kernelCols = isColMajor ? kern.dimensions()[3] : kern.dimensions()[0]; + + // This is the effective kernel size, taking into account the (in_stride - 1) zero-values + // inserted between consecutive kernel elements in atrous convolution + const TensorIndex kernelRowsEff = kernelRows + (kernelRows - 1) * (in_stride - 1); + const TensorIndex kernelColsEff = kernelCols + (kernelCols - 1) * (in_stride - 1); + + const TensorIndex outputRows = isColMajor ? output_backward.dimension(1) : output_backward.dimension(NumDims - 2); + const TensorIndex outputCols = isColMajor ? output_backward.dimension(2) : output_backward.dimension(NumDims - 3); + + // Computing the forward padding + const TensorIndex forward_pad_top = ((outputRows - 1) * stride + kernelRowsEff - inputRows) / 2; + const TensorIndex forward_pad_left = ((outputCols - 1) * stride + kernelColsEff - inputCols) / 2; + + const TensorIndex padding_top = kernelRowsEff - 1 - forward_pad_top; + const TensorIndex padding_left = kernelColsEff - 1 - forward_pad_left; + const TensorIndex padding_bottom = inputRows + kernelRowsEff - 1 - (outputRows - 1) * stride - 1 - padding_top; + const TensorIndex padding_right = inputCols + kernelColsEff - 1 - (outputCols - 1) * stride - 1 - padding_left; + + eigen_assert(padding_top >= 0); + eigen_assert(padding_left >= 0); + eigen_assert(padding_bottom >= 0); + eigen_assert(padding_right >= 0); + + // The kernel has dimensions filters X channels X patch_rows X patch_cols + // We need to reverse the kernel along dimensions corresponding to rows and + // cols. + // TODO(yangke): we can make things slightly faster by collapsing the dimensions + // where we don't reverse. Try that once we have a faster compiler. + array<bool, 4> kernel_reverse; + if (isColMajor) { + kernel_reverse[0] = false; + kernel_reverse[1] = false; + kernel_reverse[2] = true; + kernel_reverse[3] = true; + } else { + kernel_reverse[0] = true; + kernel_reverse[1] = true; + kernel_reverse[2] = false; + kernel_reverse[3] = false; + } + + DSizes<TensorIndex, 3> kernel_dims; + if (isColMajor) { + kernel_dims[0] = kernelFilters; + kernel_dims[1] = kernelChannels; + kernel_dims[2] = kernelRows * kernelCols; + } else { + kernel_dims[0] = kernelRows * kernelCols; + kernel_dims[1] = kernelChannels; + kernel_dims[2] = kernelFilters; + } + + // The output_backward has dimensions out_depth X out_rows X out_cols X OTHERS + // When we extract the image patches from output_backward, it will have dimensions + // out_depth X (patch_rows * patch_cols) X (input_rows * input_cols * OTHERS) + DSizes<TensorIndex, 3> pre_contract_dims; + if (isColMajor) { + pre_contract_dims[0] = kernelFilters; + pre_contract_dims[1] = kernelRows * kernelCols; + pre_contract_dims[2] = inputRows * inputCols; + for (int i = 3; i < NumDims; ++i) { + pre_contract_dims[2] *= out.dimension(i); + } + } else { + pre_contract_dims[2] = kernelFilters; + pre_contract_dims[1] = kernelRows * kernelCols; + pre_contract_dims[0] = inputRows * inputCols; + for (int i = 0; i < NumDims - 3; ++i) { + pre_contract_dims[0] *= out.dimension(i); + } + } + + // We will contract along dimensions (0, 2) in kernel and (0, 1) in + // output_backward, if this is col-major, and + // dimensions (0, 2) in kernel and (1, 2) in output_backward, if this row-major. 
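+ // In col-major terms the contraction below computes
+ //   input_backward(c, i) = sum_{f, p} reversed_kernel(f, c, p) * output_patches(f, p, i)
+ // where f indexes filters, p the kernel patch elements, c the input channels
+ // and i the input pixels (times any remaining dimensions).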
+ array<IndexPair<TensorIndex>, 2> contract_dims; + if (isColMajor) { + // col-major: kernel.contract(output.patches) + contract_dims[0] = IndexPair<TensorIndex>(0, 0); + contract_dims[1] = IndexPair<TensorIndex>(2, 1); + } else { + // row-major: output.patches.contract(kernel) + contract_dims[0] = IndexPair<TensorIndex>(1, 0); + contract_dims[1] = IndexPair<TensorIndex>(2, 2); + } + + // Post contraction, the dimensions of the input_backprop is + // channels X input_rows X input_cols X OTHERS + DSizes<TensorIndex, NumDims> post_contract_dims; + if (isColMajor) { + post_contract_dims[0] = kernelChannels; + post_contract_dims[1] = inputRows; + post_contract_dims[2] = inputCols; + for (int i = 3; i < NumDims; ++i) { + post_contract_dims[i] = out.dimension(i); + } + } else { + post_contract_dims[NumDims - 1] = kernelChannels; + post_contract_dims[NumDims - 2] = inputRows; + post_contract_dims[NumDims - 3] = inputCols; + for (int i = 0; i < NumDims - 3; ++i) { + post_contract_dims[i] = out.dimension(i); + } + } + + return choose(Cond<internal::traits<OutputBackward>::Layout == ColMajor>(), + kernel.reverse(kernel_reverse).reshape(kernel_dims).contract(output_backward.extract_image_patches(kernelRows, kernelCols, 1, 1, in_stride, in_stride, stride, stride, padding_top, padding_bottom, padding_left, padding_right, 0).reshape(pre_contract_dims), contract_dims).reshape(post_contract_dims), + output_backward.extract_image_patches(kernelRows, kernelCols, 1, 1, in_stride, in_stride, stride, stride, padding_top, padding_bottom, padding_left, padding_right, 0).reshape(pre_contract_dims).contract(kernel.reverse(kernel_reverse).reshape(kernel_dims), contract_dims).reshape(post_contract_dims)); +} + + +/** SpatialConvolutionBackwardKernel + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Computes the backprop for the filter of a 2D convolution. + * + * The output_backward parameter is expected to be a tensor with a rank of 3 or more (channels, height, width, and optionally others) + * The kernel parameter is expected to be a 4D tensor (filters, channels, kernel_height, kernel_width) + * The output_backward and the kernel must both be in col-major layout. The result will also be in col-major layout. + * + * If in_stride > 1, then applies convolution with holes (aka atrous convolution), sampling every in_stride input pixels. + * + * The result can be assigned to a tensor of rank equal to the rank of the output_backward. The dimensions of the result will be filters, height, width (and others if applicable). + * + * It is possible to swap the order of the width and height dimensions provided that the same order is used in the input, the kernel, and the output. + * + */ +// TODO(gpapan): Resolve a bug in TensorContractionInputMapper at SpatialConvolutions.h that yangke circumvented by using .reshape().reshape(). +// This can significantly accelerate SpatialConvolutionBackwardKernel. 
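+// A hedged usage sketch (the shapes below are illustrative assumptions, not
+// taken from this file; it assumes the NeuralNetworks module header is
+// included and a forward SAME convolution with a 5x5 kernel at stride 1):
+//
+//   Eigen::Tensor<float, 4> input(3, 32, 32, 8);            // (channels, rows, cols, batch)
+//   Eigen::Tensor<float, 4> output_backward(16, 32, 32, 8); // (filters, rows, cols, batch)
+//   input.setRandom(); output_backward.setRandom();
+//   Eigen::Tensor<float, 4> kernel_grad =
+//       Eigen::SpatialConvolutionBackwardKernel(input, output_backward, 5, 5);
+//   // kernel_grad has shape (16, 3, 5, 5): (filters, channels, rows, cols).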
+ +template <typename OutputBackward, typename Input> +EIGEN_ALWAYS_INLINE +static const typename internal::conditional< + internal::traits<OutputBackward>::Layout == ColMajor, + const TensorShufflingOp<const array<typename internal::traits<OutputBackward>::Index, 4>, const TensorReverseOp<const array<bool, 4>, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 4>, const TensorContractionOp<const array<IndexPair<typename internal::traits<Input>::Index>, 2>, const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 3>, const Input>, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 4>, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 4>, const TensorImagePatchOp<Dynamic, Dynamic, const OutputBackward> > > > > > >, + const TensorShufflingOp<const array<typename internal::traits<OutputBackward>::Index, 4>, const TensorReverseOp<const array<bool, 4>, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 4>, const TensorContractionOp<const array<IndexPair<typename internal::traits<Input>::Index>, 2>, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 4>, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 4>, const TensorImagePatchOp<Dynamic, Dynamic, const OutputBackward> > >, const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 3>, const Input> > > > > >::type +SpatialConvolutionBackwardKernel(const Input& input, const OutputBackward& output_backward, typename internal::traits<Input>::Index kernelRows, typename internal::traits<Input>::Index kernelCols, const DenseIndex stride = 1, const DenseIndex in_stride = 1) { + + typedef typename internal::traits<Input>::Index TensorIndex; + TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input); + TensorRef<Tensor<typename internal::traits<OutputBackward>::Scalar, internal::traits<OutputBackward>::NumDimensions, internal::traits<OutputBackward>::Layout, TensorIndex> > out(output_backward); + + EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == internal::traits<OutputBackward>::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE); + + // stride and in_stride cannot both be larger than 1 + eigen_assert(!(stride > 1 && in_stride > 1)); + + static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor); + + static const int NumDims = internal::traits<Input>::NumDimensions; + EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == internal::traits<OutputBackward>::NumDimensions, YOU_MADE_A_PROGRAMMING_MISTAKE); + + const TensorIndex inputRows = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2); + const TensorIndex inputCols = isColMajor ? in.dimension(2) : in.dimension(NumDims - 3); + + const TensorIndex outputRows = isColMajor ? output_backward.dimension(1) : output_backward.dimension(NumDims - 2); + const TensorIndex outputCols = isColMajor ? output_backward.dimension(2) : output_backward.dimension(NumDims - 3); + + // Number of filters to apply. This is the same as the output depth of the result + const TensorIndex kernelFilters = isColMajor ? out.dimensions()[0] : out.dimensions()[NumDims - 1]; + + // Number of channels. This is the same as the input depth. + const TensorIndex kernelChannels = isColMajor ? 
in.dimensions()[0] : in.dimensions()[NumDims - 1]; + + // This is the effective kernel size, taking into account the (in_stride - 1) zero-values + // inserted between consecutive kernel elements in atrous convolution + const TensorIndex kernelRowsEff = kernelRows + (kernelRows - 1) * (in_stride - 1); + const TensorIndex kernelColsEff = kernelCols + (kernelCols - 1) * (in_stride - 1); + + // Computing the forward padding + const TensorIndex forward_pad_top = ((outputRows - 1) * stride + kernelRowsEff - inputRows) / 2; + const TensorIndex forward_pad_left = ((outputCols - 1) * stride + kernelColsEff - inputCols) / 2; + + // TODO: factor out the padding computation. + const TensorIndex padding_top = kernelRowsEff - 1 - forward_pad_top; + const TensorIndex padding_left = kernelColsEff - 1 - forward_pad_left; + const TensorIndex padding_bottom = inputRows + kernelRowsEff - 1 - (outputRows - 1) * stride - 1 - padding_top; + const TensorIndex padding_right = inputCols + kernelColsEff - 1 - (outputCols - 1) * stride - 1 - padding_left; + + eigen_assert(padding_top >= 0); + eigen_assert(padding_left >= 0); + eigen_assert(padding_bottom >= 0); + eigen_assert(padding_right >= 0); + + // The output_backward has dimensions out_depth X out_rows X out_cols X OTHERS + // When we extract the image patches from output_backward (with input as the + // kernel), it will have dimensions + // (out_depth) X (input_rows * input_cols) X (kernel_rows * kernel_cols) X OTHERS + DSizes<TensorIndex, 4> pre_contract_dims; + if (isColMajor) { + pre_contract_dims[0] = kernelFilters; + pre_contract_dims[1] = inputRows * inputCols; + pre_contract_dims[2] = kernelRows * kernelCols; + pre_contract_dims[3] = 1; + for (int i = 3; i < NumDims; ++i) { + pre_contract_dims[3] *= out.dimension(i); + } + } else { + pre_contract_dims[3] = kernelFilters; + pre_contract_dims[2] = inputRows * inputCols; + pre_contract_dims[1] = kernelRows * kernelCols; + pre_contract_dims[0] = 1; + for (int i = 0; i < NumDims - 3; ++i) { + pre_contract_dims[0] *= out.dimension(i); + } + } + + // The input has dimensions in_depth X (input_rows * input_cols) X OTHERS + DSizes<TensorIndex, 3> input_dims; + if (isColMajor) { + input_dims[0] = kernelChannels; + input_dims[1] = inputRows * inputCols; + input_dims[2] = 1; + for (int i = 3; i < NumDims; ++i) { + input_dims[2] *= in.dimension(i); + } + eigen_assert(input_dims[2] == pre_contract_dims[3]); + } else { + input_dims[2] = kernelChannels; + input_dims[1] = inputRows * inputCols; + input_dims[0] = 1; + for (int i = 0; i < NumDims - 3; ++i) { + input_dims[0] *= in.dimension(i); + } + eigen_assert(input_dims[0] == pre_contract_dims[0]); + } + + // We will contract along dimensions (1, 2) in in and (1, 3) in out, if + // this is col-major. + // For row-major, it's dimensions (0, 1) in in and (0, 2) in out. + array<IndexPair<TensorIndex>, 2> contract_dims; + if (isColMajor) { + // col-major: in.contract(output.patches) + contract_dims[0] = IndexPair<TensorIndex>(1, 1); + contract_dims[1] = IndexPair<TensorIndex>(2, 3); + } else { + // row-major: output.patches.contract(in) + contract_dims[0] = IndexPair<TensorIndex>(0, 0); + contract_dims[1] = IndexPair<TensorIndex>(2, 1); + } + + // After the contraction, the kernel will have dimension + // in_depth X out_depth X kernel_rows X kernel_cols + // We will need to shuffle the first two dimensions and reverse the latter + // two dimensions. 
+ // The end shape is + // out_depth X in_shape X kernel_rows X kernel_cols + + // This is the shape of the kernel *before* the shuffling. + DSizes<TensorIndex, 4> kernel_dims; + if (isColMajor) { + kernel_dims[0] = kernelChannels; + kernel_dims[1] = kernelFilters; + kernel_dims[2] = kernelRows; + kernel_dims[3] = kernelCols; + } else { + kernel_dims[0] = kernelCols; + kernel_dims[1] = kernelRows; + kernel_dims[2] = kernelFilters; + kernel_dims[3] = kernelChannels; + } + + array<TensorIndex, 4> kernel_shuffle; + if (isColMajor) { + kernel_shuffle[0] = 1; + kernel_shuffle[1] = 0; + kernel_shuffle[2] = 2; + kernel_shuffle[3] = 3; + } else { + kernel_shuffle[0] = 0; + kernel_shuffle[1] = 1; + kernel_shuffle[2] = 3; + kernel_shuffle[3] = 2; + } + + array<bool, 4> kernel_reverse; + if (isColMajor) { + kernel_reverse[0] = false; + kernel_reverse[1] = false; + kernel_reverse[2] = true; + kernel_reverse[3] = true; + } else { + kernel_reverse[0] = true; + kernel_reverse[1] = true; + kernel_reverse[2] = false; + kernel_reverse[3] = false; + } + + return choose(Cond<internal::traits<Input>::Layout == ColMajor>(), + input.reshape(input_dims).contract(output_backward.extract_image_patches(inputRows, inputCols, in_stride, in_stride, 1, 1, stride, stride, padding_top, padding_bottom, padding_left, padding_right, 0).reshape(pre_contract_dims).reshape(pre_contract_dims), contract_dims).reshape(kernel_dims).reverse(kernel_reverse).shuffle(kernel_shuffle), + output_backward.extract_image_patches(inputRows, inputCols, in_stride, in_stride, 1, 1, stride, stride, padding_top, padding_bottom, padding_left, padding_right, 0).reshape(pre_contract_dims).reshape(pre_contract_dims).contract(input.reshape(input_dims), contract_dims).reshape(kernel_dims).reverse(kernel_reverse).shuffle(kernel_shuffle)); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_SPATIAL_CONVOLUTIONS_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/CuboidConvolution.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/CuboidConvolution.h new file mode 100644 index 0000000000..dfb9dcedba --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/CuboidConvolution.h @@ -0,0 +1,179 @@ +#ifndef EIGEN_CXX11_SRC_NEURAL_NETWORKS_CUBOID_CONVOLUTION_H +#define EIGEN_CXX11_SRC_NEURAL_NETWORKS_CUBOID_CONVOLUTION_H + +#include "Patch3d.h" + +namespace Eigen { + +/** CuboidConvolution + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies a 3D convolution over a multichannel input voxel block. + * + * The input parameter is expected to be a tensor with a rank of 4 or more (channels, depth, height, width, and optionally others). + * The kernel parameter is expected to be a 5D tensor (filters, channels, kernel_depth, kernel_height, kernel_width). + * The result can be assigned to a tensor of rank equal to the rank of the input. The dimensions of the result will be filters, depth, height, width (and others if applicable). + * + * The input and kernel have to be in the same layout, and both row-major and + * col-major are supported. The shapes given above are for col-major layout. + * For row-major, all dimensions should be reversed. + * + * It is possible to swap the order of the depth, width, and height dimensions provided that the same order is used in the input, the kernel, and the output. 
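+ *
+ * A minimal usage sketch (the shapes are illustrative assumptions, not taken
+ * from this file):
+ *
+ * \code
+ * Eigen::Tensor<float, 5> input(3, 10, 20, 30, 7); // (channels, depth, height, width, batch)
+ * Eigen::Tensor<float, 5> kernel(8, 3, 3, 3, 3);   // (filters, channels, kernel_depth, kernel_height, kernel_width)
+ * input.setRandom(); kernel.setRandom();
+ * Eigen::Tensor<float, 5> output = Eigen::CuboidConvolution(input, kernel);
+ * // With the default stride of 1 and PADDING_SAME, output has shape (8, 10, 20, 30, 7).
+ * \endcode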
+ */ +template <typename Input, typename Kernel> +EIGEN_ALWAYS_INLINE +static const typename internal::conditional < + internal::traits<Input>::Layout == ColMajor, + TensorReshapingOp< + const DSizes<typename internal::traits<Input>::Index, + internal::traits<Input>::NumDimensions>, + const TensorContractionOp< + const array<IndexPair<typename internal::traits<Input>::Index>, 1>, + const TensorReshapingOp< + const DSizes<typename internal::traits<Input>::Index, 2>, + const Kernel>, + const TensorReshapingOp< + const DSizes<typename internal::traits<Input>::Index, 2>, + const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, + const Input> > > >, + TensorReshapingOp< + const DSizes<typename internal::traits<Input>::Index, + internal::traits<Input>::NumDimensions>, + const TensorContractionOp< + const array<IndexPair<typename internal::traits<Input>::Index>, 1>, + const TensorReshapingOp< + const DSizes<typename internal::traits<Input>::Index, 2>, + const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, + const Input> > , + const TensorReshapingOp< + const DSizes<typename internal::traits<Input>::Index, 2>, + const Kernel> > > >::type +CuboidConvolution(const Input& input, const Kernel& kernel, + const DenseIndex stridePlanes = 1, + const DenseIndex strideRows = 1, + const DenseIndex strideCols = 1, + const PaddingType padding_type = PADDING_SAME) { + typedef typename internal::traits<Input>::Index TensorIndex; + TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input); + TensorRef<Tensor<typename internal::traits<Kernel>::Scalar, internal::traits<Kernel>::NumDimensions, internal::traits<Kernel>::Layout, TensorIndex> > kern(kernel); + + EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == internal::traits<Kernel>::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE); + static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor); + static const int NumDims = internal::traits<Input>::NumDimensions; + + // Number of filters to apply. This is the same as the output depth of the result. + const TensorIndex kernelFilters = isColMajor ? kern.dimensions()[0] : kern.dimensions()[4]; + const TensorIndex kernelChannels = isColMajor ? kern.dimensions()[1] : kern.dimensions()[3]; + + // Spatial size of the kernel. + const TensorIndex kernelDepth = isColMajor ? kern.dimensions()[2] : kern.dimensions()[2]; + const TensorIndex kernelRows = isColMajor ? kern.dimensions()[3] : kern.dimensions()[1]; + const TensorIndex kernelCols = isColMajor ? kern.dimensions()[4] : kern.dimensions()[0]; + + if (isColMajor) { + eigen_assert(kernelChannels == in.dimension(0)); + } else { + eigen_assert(kernelChannels == in.dimension(NumDims - 1)); + } + + const TensorIndex inputPlanes = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2); + const TensorIndex inputRows = isColMajor ? in.dimension(2) : in.dimension(NumDims - 3); + const TensorIndex inputCols = isColMajor ? 
in.dimension(3) : in.dimension(NumDims - 4); + + const float stride_planes_f = static_cast<float>(stridePlanes); + const float stride_rows_f = static_cast<float>(strideRows); + const float stride_cols_f = static_cast<float>(strideCols); + TensorIndex out_depth; + TensorIndex out_height; + TensorIndex out_width; + switch (padding_type) { + case PADDING_VALID: + out_depth = ceil((inputPlanes - kernelDepth + 1.f) / stride_planes_f); + out_height = ceil((inputRows - kernelRows + 1.f) / stride_rows_f); + out_width = ceil((inputCols - kernelCols + 1.f) / stride_cols_f); + break; + case PADDING_SAME: + out_depth = ceil(inputPlanes / stride_planes_f); + out_height = ceil(inputRows / stride_rows_f); + out_width = ceil(inputCols / stride_cols_f); + break; + default: + eigen_assert(false && "unexpected padding"); + } + + DSizes<TensorIndex, 2> kernel_dims; + if (isColMajor) { + kernel_dims[0] = kernelFilters; + kernel_dims[1] = kernelChannels * kernelDepth * kernelRows * kernelCols; + } else { + kernel_dims[0] = kernelChannels * kernelDepth * kernelRows * kernelCols; + kernel_dims[1] = kernelFilters; + } + + // Molds the output of the patch extraction result into a 2D tensor: + // - the first dimension (dims[0]): the patch values to be multiplied with the kernels + // - the second dimension (dims[1]): everything else + DSizes<TensorIndex, 2> pre_contract_dims; + if (isColMajor) { + pre_contract_dims[0] = kernelChannels * kernelDepth * kernelRows * kernelCols; + pre_contract_dims[1] = out_depth * out_height * out_width; + for (int i = 4; i < NumDims; ++i) { + pre_contract_dims[1] *= in.dimension(i); + } + } else { + pre_contract_dims[1] = kernelChannels * kernelDepth * kernelRows * kernelCols; + pre_contract_dims[0] = out_depth * out_height * out_width; + for (int i = 0; i < NumDims - 4; ++i) { + pre_contract_dims[0] *= in.dimension(i); + } + } + + array<IndexPair<TensorIndex>, 1> contract_dims; + contract_dims[0] = IndexPair<TensorIndex>(1, 0); + + // Molds the output of the contraction into the shape expected by the user + // (assuming ColMajor): + // - 1st dim: kernel filters + // - 2nd dim: output depth + // - 3nd dim: output height + // - 4rd dim: output width + // - 5th dim and beyond: everything else including batch size + DSizes<TensorIndex, NumDims> post_contract_dims; + if (isColMajor) { + post_contract_dims[0] = kernelFilters; + post_contract_dims[1] = out_depth; + post_contract_dims[2] = out_height; + post_contract_dims[3] = out_width; + for (int i = 4; i < NumDims; ++i) { + post_contract_dims[i] = in.dimension(i); + } + } else { + post_contract_dims[NumDims - 1] = kernelFilters; + post_contract_dims[NumDims - 2] = out_depth; + post_contract_dims[NumDims - 3] = out_height; + post_contract_dims[NumDims - 4] = out_width; + for (int i = 0; i < NumDims - 4; ++i) { + post_contract_dims[i] = in.dimension(i); + } + } + + return choose( + Cond<internal::traits<Input>::Layout == ColMajor>(), + kernel.reshape(kernel_dims) + .contract(input.extract_volume_patches( + kernelDepth, kernelRows, kernelCols, stridePlanes, + strideRows, strideCols, padding_type) + .reshape(pre_contract_dims), + contract_dims) + .reshape(post_contract_dims), + input.extract_volume_patches(kernelDepth, kernelRows, kernelCols, + stridePlanes, strideRows, strideCols, + padding_type) + .reshape(pre_contract_dims) + .contract(kernel.reshape(kernel_dims), contract_dims) + .reshape(post_contract_dims)); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_SRC_NEURAL_NETWORKS_CUBOID_CONVOLUTION_H diff --git 
a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Patch3d.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Patch3d.h new file mode 100644 index 0000000000..df60fe18a3 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Patch3d.h @@ -0,0 +1,233 @@ +#ifndef EIGEN_CXX11_SRC_NEURAL_NETWORKS_PATCH3D_H +#define EIGEN_CXX11_SRC_NEURAL_NETWORKS_PATCH3D_H + +#if not defined(__CUDACC__) +#include <type_traits> +#endif + +namespace Eigen { +namespace internal { + +/** Extract3DPatches + * \ingroup CXX11_NeuralNetworksModule + * + * \brief Extracts 3D patches from a multichannel input volume. + * + * The input parameter is expected to be a tensor with a rank of 4 or more + * (channels, depth, height, width, optional others in col-major, and the + * reverse order in row-major). + + * The return value will be a tensor of 3 more dimension than the input tensor. + * In col-major, the first 4 dimensions of the result are: channels, patch_depth, + * patch_height, patch_width. The next dimensions will identify the patch + * position on the 3D grid of extracted patches: z, y, x. The remaining + * dimensions, if any, will be the same as the 'other' dimensions of the input + * tensor. + */ + +template <typename Input> +EIGEN_ALWAYS_INLINE static const TensorStridingOp< + const array<typename internal::traits<Input>::Index, + internal::traits<Input>::NumDimensions + 3>, + const TensorReshapingOp< + const DSizes<typename internal::traits<Input>::Index, + internal::traits<Input>::NumDimensions + 3>, + const TensorPatchOp< + const DSizes<typename internal::traits<Input>::Index, + internal::traits<Input>::NumDimensions>, + const TensorPaddingOp< + const array<IndexPair<typename internal::traits<Input>::Index>, + internal::traits<Input>::NumDimensions>, + const Input> > > > +Extract3DPatches( + const Input& input, const DenseIndex patchPlanes, + const DenseIndex patchRows, const DenseIndex patchCols, + const DenseIndex stridePlanes, const DenseIndex strideRows, + const DenseIndex strideCols, + const DenseIndex paddingZTop, const DenseIndex paddingZBottom, + const DenseIndex paddingTop, const DenseIndex paddingBottom, + const DenseIndex paddingLeft, const DenseIndex paddingRight, + const typename internal::traits<Input>::Scalar padding_value = 0) { + + typedef typename internal::traits<Input>::Index TensorIndex; + TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input); + + EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions >= 4, YOU_MADE_A_PROGRAMMING_MISTAKE); + + static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor); + static const int NumDims = internal::traits<Input>::NumDimensions; + static const int ExtDims = NumDims + 3; + + // Tensor size after patch extraction. We add three dimensions to unpack the + // linear patch index into a 3D grid over which stride() can work. + DSizes<TensorIndex, ExtDims> pre_stride_dims; + + if (isColMajor) { + pre_stride_dims[0] = in.dimension(0); + pre_stride_dims[1] = patchPlanes; + pre_stride_dims[2] = patchRows; + pre_stride_dims[3] = patchCols; + } else { + pre_stride_dims[ExtDims - 1] = in.dimension(NumDims - 1); + pre_stride_dims[ExtDims - 4] = patchCols; + pre_stride_dims[ExtDims - 3] = patchRows; + pre_stride_dims[ExtDims - 2] = patchPlanes; + } + + const TensorIndex inputPlanes = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2); + const TensorIndex inputRows = isColMajor ? 
in.dimension(2) : in.dimension(NumDims - 3); + const TensorIndex inputCols = isColMajor ? in.dimension(3) : in.dimension(NumDims - 4); + + array<IndexPair<TensorIndex>, NumDims> paddings; + for (int i = 0; i < NumDims; ++i) { + paddings[i] = IndexPair<TensorIndex>(0, 0); + } + + paddings[isColMajor ? 1 : (NumDims - 2)] = IndexPair<TensorIndex>(paddingZTop, paddingZBottom); + paddings[isColMajor ? 2 : (NumDims - 3)] = IndexPair<TensorIndex>(paddingTop, paddingBottom); + paddings[isColMajor ? 3 : (NumDims - 4)] = IndexPair<TensorIndex>(paddingLeft, paddingRight); + + pre_stride_dims[isColMajor ? 4 : (ExtDims - 5)] = inputPlanes + paddingZBottom + paddingZTop - patchPlanes + 1; + pre_stride_dims[isColMajor ? 5 : (ExtDims - 6)] = inputRows + paddingTop + paddingBottom - patchRows + 1; + pre_stride_dims[isColMajor ? 6 : (ExtDims - 7)] = inputCols + paddingLeft + paddingRight - patchCols + 1; + + if (isColMajor) { + for (int i = 7; i < NumDims + 3; ++i) { + pre_stride_dims[i] = in.dimension(i - 3); + } + } else { + for (int i = 0; i < NumDims - 4; ++i) { + pre_stride_dims[i] = in.dimension(i); + } + } + + DSizes<TensorIndex, NumDims> patch_dims; + if (isColMajor) { + patch_dims[0] = in.dimension(0); + patch_dims[1] = patchPlanes; + patch_dims[2] = patchRows; + patch_dims[3] = patchCols; + for (int i = 4; i < NumDims; ++i) { + patch_dims[i] = 1; + } + } else { + patch_dims[NumDims - 1] = in.dimension(NumDims - 1); + patch_dims[NumDims - 4] = patchCols; + patch_dims[NumDims - 3] = patchRows; + patch_dims[NumDims - 2] = patchPlanes; + for (int i = 0; i < NumDims - 4; i++) { + patch_dims[i] = 1; + } + } + + array<TensorIndex, NumDims + 3> strides; + if (isColMajor) { + // No striding within the patches. + for (int i = 0; i < 4; ++i) { + strides[i] = 1; + } + // Apply striding in the spatial patch grid dimensions only. + strides[4] = stridePlanes; + strides[5] = strideRows; + strides[6] = strideCols; + // No striding in the remaining dimensions (batches, ...). + for (int i = 7; i < NumDims + 3; i++) { + strides[i] = 1; + } + } else { + // No striding within the patches. + for (int i = 1; i <= 4; ++i) { + strides[ExtDims - i] = 1; + } + // Apply striding in the spatial patch grid dimensions only. + strides[ExtDims - 7] = strideCols; + strides[ExtDims - 6] = strideRows; + strides[ExtDims - 5] = stridePlanes; + // No striding in the remaining dimensions (batches, ...). + for (int i = 0; i < NumDims - 4; i++) { + strides[i] = 1; + } + } + + // TODO(mjanusz): Consider getting rid of pad(), and stride() and extend + // extract_patches to take additional parameters for padding/striding, + // similarly to etract_image_patches. 
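+ // Pad the input, cut it into overlapping patches, unpack the linear patch
+ // index into a (z, y, x) grid via the reshape, and finally keep only every
+ // stridePlanes/strideRows/strideCols-th patch.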
+ return input.pad(paddings, padding_value).extract_patches(patch_dims).reshape(pre_stride_dims).stride(strides); +} + + +template <typename Input> +EIGEN_ALWAYS_INLINE static const TensorStridingOp< + const array<typename internal::traits<Input>::Index, + internal::traits<Input>::NumDimensions + 3>, + const TensorReshapingOp< + const DSizes<typename internal::traits<Input>::Index, + internal::traits<Input>::NumDimensions + 3>, + const TensorPatchOp< + const DSizes<typename internal::traits<Input>::Index, + internal::traits<Input>::NumDimensions>, + const TensorPaddingOp< + const array<IndexPair<typename internal::traits<Input>::Index>, + internal::traits<Input>::NumDimensions>, + const Input> > > > +Extract3DPatches( + const Input& input, const DenseIndex patchPlanes, + const DenseIndex patchRows, const DenseIndex patchCols, + const DenseIndex stridePlanes, const DenseIndex strideRows, + const DenseIndex strideCols, const PaddingType padding_type, + const typename internal::traits<Input>::Scalar padding_value = 0) { + typedef typename internal::traits<Input>::Index TensorIndex; + TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input); + + EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions >= 4, YOU_MADE_A_PROGRAMMING_MISTAKE); + + static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor); + static const int NumDims = internal::traits<Input>::NumDimensions; + + const TensorIndex inputPlanes = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2); + const TensorIndex inputRows = isColMajor ? in.dimension(2) : in.dimension(NumDims - 3); + const TensorIndex inputCols = isColMajor ? in.dimension(3) : in.dimension(NumDims - 4); + + switch (padding_type) { + case PADDING_VALID: + // No padding in any dimension. + return Extract3DPatches(input, patchPlanes, patchRows, patchCols, + stridePlanes, strideRows, strideCols, + 0, 0, 0, 0, 0, 0, padding_value); + case PADDING_SAME: + // The side of the tensor before striding should be just the expected + // output times the stride. + const TensorIndex size_z = ceil(inputPlanes / static_cast<float>(stridePlanes)) * stridePlanes; + const TensorIndex size_y = ceil(inputRows / static_cast<float>(strideRows)) * strideRows; + const TensorIndex size_x = ceil(inputCols / static_cast<float>(strideCols)) * strideCols; + + // The size of the patch space is going to be: padded_input_size - patch_size + 1. + // This has to match the expected size before striding (pre_stride_dims). + // The deltas below extend the input to the expected size. + const TensorIndex dz = size_z + patchPlanes - 1 - inputPlanes; + const TensorIndex dy = size_y + patchRows - 1 - inputRows; + const TensorIndex dx = size_x + patchCols - 1 - inputCols; + + return Extract3DPatches(input, patchPlanes, patchRows, patchCols, + stridePlanes, strideRows, strideCols, + dz - dz / 2, dz / 2, + dy - dy / 2, dy / 2, + dx - dx / 2, dx / 2, + padding_value); + } +} + +// TODO(mjanusz): Switch this to a 'using' alias once CUDA supports C++11. 
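+// Note (added): Extract3DPatchesType simply names the expression type returned
+// by Extract3DPatches above, so callers can store the unevaluated expression
+// without spelling out the nested TensorStridingOp / TensorReshapingOp /
+// TensorPatchOp / TensorPaddingOp types by hand.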
+template <typename Input> +struct Extract3DPatchesType { + typedef const TensorStridingOp< const array<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions + 3>, + const TensorReshapingOp< const DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions + 3>, + const TensorPatchOp< const DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>, + const TensorPaddingOp< const array< IndexPair<typename internal::traits<Input>::Index>, internal::traits<Input>::NumDimensions>, + const Input> > > > type; +}; + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_SRC_NEURAL_NETWORKS_PATCH3D_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Pooling.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Pooling.h new file mode 100644 index 0000000000..8dea22806c --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Pooling.h @@ -0,0 +1,442 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#ifndef EIGEN_CXX11_NEURAL_NETWORKS_POOLING_H +#define EIGEN_CXX11_NEURAL_NETWORKS_POOLING_H + +#include "Patch3d.h" + +namespace Eigen { + +/** SpatialMaxPooling + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies a max-pooling over a multichannel input image. + * + * The input parameter is expected to be a with a rank of 4 (channels, height, width, others in col-major, and the reverse of that in row-major). + * + * The result can be assigned to a tensor of rank equal to the rank of the input. The dimensions of the result will be channels, height, width, and others (in col-major, and the reverse of that if the input was row-major). + * + * The order of the width and height dimensions can be swapped if needed. 
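+ *
+ * A minimal usage sketch (the shapes below are purely illustrative): 3x3
+ * max-pooling with stride 2 and SAME padding over a col-major input of shape
+ * (channels, height, width, batch):
+ * \code
+ * Eigen::Tensor<float, 4> input(16, 64, 64, 32);
+ * Eigen::Tensor<float, 4> output = Eigen::SpatialMaxPooling(input, 3, 3, 2, 2, Eigen::PADDING_SAME);
+ * \endcode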
+ * +*/ +#if !defined(EIGEN_HAS_INDEX_LIST) +template <typename Input> +EIGEN_ALWAYS_INLINE +static const TensorReshapingOp<const Eigen::DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>, const TensorReductionOp<internal::MaxReducer<typename internal::remove_const<typename internal::traits<Input>::Scalar>::type>, const Eigen::array<int, 2>, const TensorImagePatchOp<Dynamic, Dynamic, const Input> > > +#else +template <typename Input> +EIGEN_ALWAYS_INLINE +static const TensorReshapingOp<const Eigen::DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>, const TensorReductionOp<internal::MaxReducer<typename internal::remove_const<typename internal::traits<Input>::Scalar>::type>, typename internal::conditional<internal::traits<Input>::Layout == ColMajor, const Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> >, const Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3> > >::type, const TensorImagePatchOp<Dynamic, Dynamic, const Input> > > +#endif +SpatialMaxPooling(const Input& input, DenseIndex patchRows, DenseIndex patchCols, + DenseIndex strideRows, DenseIndex strideCols, const PaddingType padding_type, + DenseIndex in_strideRows = 1, DenseIndex in_strideCols = 1) +{ + EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 4, YOU_MADE_A_PROGRAMMING_MISTAKE); + + typedef typename internal::traits<Input>::Index TensorIndex; + TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input); + + const DenseIndex patchRowsEff = patchRows + (patchRows - 1) * (in_strideRows - 1); + const DenseIndex patchColsEff = patchCols + (patchCols - 1) * (in_strideCols - 1); + + static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor); + static const int idxRows = isColMajor ? 1 : 2; + static const int idxCols = isColMajor ? 2 : 1; + + // Molds the output of the reduction into the shape expected by the user. + // (assuming col-major): + // - 1st dim: channels + // - 2nd dim: output height + // - 3rd dim: output width + // - 4th dim and beyond: everything else including batch size + Eigen::DSizes<TensorIndex, internal::traits<Input>::NumDimensions> post_reduce_dims; + post_reduce_dims[0] = in.dimension(0); + if (padding_type == PADDING_VALID) { + post_reduce_dims[idxRows] = numext::ceil((in.dimension(idxRows) - patchRowsEff + 1.f) / static_cast<float>(strideRows)); + post_reduce_dims[idxCols] = numext::ceil((in.dimension(idxCols) - patchColsEff + 1.f) / static_cast<float>(strideCols)); + } else { + post_reduce_dims[idxRows] = numext::ceil(in.dimension(idxRows) / static_cast<float>(strideRows)); + post_reduce_dims[idxCols] = numext::ceil(in.dimension(idxCols) / static_cast<float>(strideCols)); + } + post_reduce_dims[3] = in.dimension(3); + +#if !defined(EIGEN_HAS_INDEX_LIST) + // nvcc doesn't support cxx11 + Eigen::array<int, 2> reduction_dims; + if (isColMajor) { + reduction_dims[0] = 1; + reduction_dims[1] = 2; + } else { + reduction_dims[0] = 2; + reduction_dims[1] = 3; + } +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. 
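+  // Note (added): the IndexList encodes the two reduced dimensions (the patch
+  // rows and columns of the extracted image patches) as compile-time constants,
+  // so the reduction code can specialize on them instead of reading runtime
+  // values.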
+ typename internal::conditional<internal::traits<Input>::Layout == ColMajor, const Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> >, const Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3> > >::type reduction_dims; +#endif + + return input.extract_image_patches(patchRows, patchCols, strideRows, strideCols, in_strideRows, in_strideCols, padding_type, -Eigen::NumTraits<typename internal::remove_const<typename internal::traits<Input>::Scalar>::type>::highest()).maximum(reduction_dims).reshape(post_reduce_dims); +} + +/** CuboidMaxPooling + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies a max-pooling over a multichannel input volume. + * + * The input parameter is expected to be a tensor with a rank of 5 (channels, depth, height, width, others in col-major, and the reverse of that in row-major). + * + * The result can be assigned to a tensor of rank equal to the rank of the input. The dimensions of the result will be channels, depth, height, width, and others (in col-major, and the reverse of that if the input was row-major). + * + * The order of the depth, width and height dimensions can be swapped if needed. + * +*/ +#if !defined(EIGEN_HAS_INDEX_LIST) +template <typename Input> +EIGEN_ALWAYS_INLINE static const TensorReshapingOp< + const Eigen::DSizes<DenseIndex, internal::traits<Input>::NumDimensions>, + const TensorReductionOp< + internal::MaxReducer<float>, const Eigen::array<int, 1>, + const TensorReshapingOp< + const Eigen::DSizes<DenseIndex, 3>, + const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Input> > > > +#else +template <typename Input> +EIGEN_ALWAYS_INLINE static const TensorReshapingOp< + const Eigen::DSizes<DenseIndex, internal::traits<Input>::NumDimensions>, + const TensorReductionOp< + internal::MaxReducer<float>, + const Eigen::IndexList<Eigen::type2index<1> >, + const TensorReshapingOp< + const Eigen::DSizes<DenseIndex, 3>, + const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Input> > > > +#endif +CuboidMaxPooling(const Input& input, DenseIndex patchPlanes, + DenseIndex patchRows, DenseIndex patchCols, + DenseIndex stridePlanes, DenseIndex strideRows, + DenseIndex strideCols, const PaddingType padding_type) { + EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 5, YOU_MADE_A_PROGRAMMING_MISTAKE); + static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor); + + typedef typename internal::traits<Input>::Index TensorIndex; + TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input); + + static const int idxPlanes = isColMajor ? 1 : 3; + static const int idxRows = 2; + static const int idxCols = isColMajor ? 
3 : 1; + + // Molds the output of the reduction into the shape expected by the used + // (assuming col-major): + // - 1st dim: channels + // - 2nd dim: output depth + // - 3rd dim: output height + // - 4th dim: output width + // - 5th dim and beyond: everything else including batch size + Eigen::DSizes<DenseIndex, internal::traits<Input>::NumDimensions> post_reduce_dims; + post_reduce_dims[0] = in.dimension(0); + if (padding_type == PADDING_VALID) { + post_reduce_dims[idxPlanes] = numext::ceil((in.dimension(idxPlanes) - patchPlanes + 1.f) / static_cast<float>(stridePlanes)); + post_reduce_dims[idxRows] = numext::ceil((in.dimension(idxRows) - patchRows + 1.f) / static_cast<float>(strideRows)); + post_reduce_dims[idxCols] = numext::ceil((in.dimension(idxCols) - patchCols + 1.f) / static_cast<float>(strideCols)); + } else { + post_reduce_dims[idxPlanes] = numext::ceil(in.dimension(idxPlanes) / static_cast<float>(stridePlanes)); + post_reduce_dims[idxRows] = numext::ceil(in.dimension(idxRows) / static_cast<float>(strideRows)); + post_reduce_dims[idxCols] = numext::ceil(in.dimension(idxCols) / static_cast<float>(strideCols)); + } + post_reduce_dims[4] = in.dimension(4); + + Eigen::DSizes<DenseIndex, 3> pre_reduce_dims; + pre_reduce_dims[1] = patchRows * patchCols * patchPlanes; + if (isColMajor) { + pre_reduce_dims[0] = post_reduce_dims[0]; + pre_reduce_dims[2] = post_reduce_dims[1] * post_reduce_dims[2] * post_reduce_dims[3] * post_reduce_dims[4]; + } else { + pre_reduce_dims[0] = post_reduce_dims[0] * post_reduce_dims[1] * post_reduce_dims[2] * post_reduce_dims[3]; + pre_reduce_dims[2] = post_reduce_dims[4]; + } + +#if !defined(EIGEN_HAS_INDEX_LIST) + // nvcc doesn't support cxx11 + Eigen::array<int, 1> reduction_dims; + reduction_dims[0] = 1; +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. + Eigen::IndexList<Eigen::type2index<1> > reduction_dims; +#endif + return input.extract_volume_patches(patchPlanes, patchRows, patchCols, + stridePlanes, strideRows, strideCols, + padding_type, -Eigen::NumTraits<float>::highest()) + .reshape(pre_reduce_dims) + .maximum(reduction_dims) + .reshape(post_reduce_dims); +} + + +/** SpatialAvgPooling + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies an average pooling over a multichannel input image. + * + * The input parameter is expected to be a tensor with a rank of 4 (channels, height, width, others in col-major, and the reverse of that in row-major). + * + * The result can be assigned to a tensor of rank equal to the rank of the input. The dimensions of the result will be channels, height, width, and others (in col-major, and the reverse of that if the input was row-major). + * + * The order of the width and height dimensions can be swapped if needed. + * +*/ +namespace internal { + +template <typename T> struct AvgPoolMeanReducer +{ +#if (EIGEN_ARCH_i386 || EIGEN_ARCH_x86_64 || defined (EIGEN_USE_GPU) || defined(__CUDACC__) || defined(__CUDA_ARCH__)) + // We only support packet access for floats. 
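+  // Note (added): this reducer skips inputs equal to -NumTraits<T>::highest(),
+  // the sentinel value the pooling functions pass to the patch extraction for
+  // padded elements, so padding never contributes to the average. The masked
+  // compare/select code used for that in reducePacket below is only implemented
+  // for float packets on these platforms, hence the restriction on PacketAccess.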
+ static const bool PacketAccess = internal::is_same<T, float>::value; +#else + static const bool PacketAccess = false; +#endif + static const bool IsStateful = true; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE AvgPoolMeanReducer() : scalarCount_(0) { + typedef typename packet_traits<T>::type Packet; + packetCount_ = pset1<Packet>(0.0); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) { + if (t != -Eigen::NumTraits<T>::highest()) { + (*accum) = (*accum) + t; + scalarCount_++; + } + } + + +#if (!defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)) +#ifdef EIGEN_VECTORIZE_AVX +#define pequal(a,b) _mm256_cmp_ps(a,b,_CMP_EQ_UQ) +#define psel(a,b,false_mask) _mm256_blendv_ps(a,b,false_mask) +#else +#define pequal(a,b) _mm_cmpeq_ps(a,b) +#define psel(a,b,false_mask) _mm_or_ps(_mm_andnot_ps(false_mask, a), _mm_and_ps(false_mask, b)) +#endif + + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) { + reducePacketWithType(static_cast<T>(0), p, accum); + } + + template <typename Packet> + void reducePacketWithType(T, const Packet& p, Packet* accum) { + Packet skip_mask = pequal(p, pset1<Packet>(-Eigen::NumTraits<T>::highest())); + (*accum) = padd<Packet>(*accum, psel(p, pset1<Packet>(0), skip_mask)); + packetCount_ = padd<Packet>(packetCount_, psel(pset1<Packet>(1), pset1<Packet>(0), skip_mask)); + } + +#else +#define pequal(a,b) make_float4(a.x == b.x ? 1.f : 0, a.y == b.y ? 1.f : 0, a.z == b.z ? 1.f : 0, a.w == b.w ? 1.f : 0) +#define psel(a,b,c) make_float4(c.x ? b.x : a.x, c.y ? b.y : a.y, c.z ? b.z : a.z, c.w ? b.w : a.w) + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const float4& p, float4* accum) { + float4 skip_mask = pequal(p, pset1<float4>(-Eigen::NumTraits<float>::highest())); + (*accum) = padd<float4>(*accum, psel(p, pset1<float4>(0), skip_mask)); + packetCount_ = padd<float4>(packetCount_, psel(pset1<float4>(1), pset1<float4>(0), skip_mask)); + } + +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return static_cast<T>(0); + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1<Packet>(0); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + eigen_assert(scalarCount_ > 0); + return accum / scalarCount_; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return pdiv(vaccum, packetCount_); + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + return (saccum + predux(vaccum)) / (scalarCount_ + predux(packetCount_)); + } + + protected: + typedef typename packet_traits<T>::type Packet; + int scalarCount_; + Packet packetCount_; +}; + +} // namespace internal + +#if !defined(EIGEN_HAS_INDEX_LIST) +template <typename Input> +EIGEN_ALWAYS_INLINE +static const TensorReshapingOp<const Eigen::DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>, const TensorReductionOp<internal::AvgPoolMeanReducer<typename internal::remove_const<typename internal::traits<Input>::Scalar>::type>, const Eigen::array<int, 2>, const TensorImagePatchOp<Dynamic, Dynamic, const Input> > > +#else +template <typename Input> +EIGEN_ALWAYS_INLINE +static const TensorReshapingOp<const Eigen::DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>, const 
TensorReductionOp<internal::AvgPoolMeanReducer<typename internal::remove_const<typename internal::traits<Input>::Scalar>::type>, typename internal::conditional<internal::traits<Input>::Layout == ColMajor, const Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> >, const Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3> > >::type, const TensorImagePatchOp<Dynamic, Dynamic, const Input> > > +#endif +SpatialAvgPooling(const Input& input, DenseIndex patchRows, DenseIndex patchCols, + DenseIndex strideRows, DenseIndex strideCols, const PaddingType padding_type, + DenseIndex in_strideRows = 1, DenseIndex in_strideCols = 1) +{ + EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 4, YOU_MADE_A_PROGRAMMING_MISTAKE); + + typedef typename internal::traits<Input>::Index TensorIndex; + TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input); + + const DenseIndex patchRowsEff = patchRows + (patchRows - 1) * (in_strideRows - 1); + const DenseIndex patchColsEff = patchCols + (patchCols - 1) * (in_strideCols - 1); + + static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor); + static const int idxRows = isColMajor ? 1 : 2; + static const int idxCols = isColMajor ? 2 : 1; + + // Molds the output of the reduction into the shape expected by the user. + // (assuming col-major): + // - 1st dim: channels + // - 2nd dim: output height + // - 3rd dim: output width + // - 4th dim and beyond: everything else including batch size + Eigen::DSizes<TensorIndex, internal::traits<Input>::NumDimensions> post_reduce_dims; + post_reduce_dims[0] = in.dimension(0); + if (padding_type == PADDING_VALID) { + post_reduce_dims[idxRows] = numext::ceil((in.dimension(idxRows) - patchRowsEff + 1.f) / static_cast<float>(strideRows)); + post_reduce_dims[idxCols] = numext::ceil((in.dimension(idxCols) - patchColsEff + 1.f) / static_cast<float>(strideCols)); + } else { + post_reduce_dims[idxRows] = numext::ceil(in.dimension(idxRows) / static_cast<float>(strideRows)); + post_reduce_dims[idxCols] = numext::ceil(in.dimension(idxCols) / static_cast<float>(strideCols)); + } + post_reduce_dims[3] = in.dimension(3); + + typedef typename internal::remove_const<typename internal::traits<Input>::Scalar>::type CoeffReturnType; + internal::AvgPoolMeanReducer<CoeffReturnType> mean_with_nan; + +#if !defined(EIGEN_HAS_INDEX_LIST) + // nvcc doesn't support cxx11 + Eigen::array<int, 2> reduction_dims; + if (isColMajor) { + reduction_dims[0] = 1; + reduction_dims[1] = 2; + } else { + reduction_dims[0] = 2; + reduction_dims[1] = 3; + } +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. + typename internal::conditional<internal::traits<Input>::Layout == ColMajor, const Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> >, const Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3> > >::type reduction_dims; +#endif + return input.extract_image_patches(patchRows, patchCols, strideRows, strideCols, in_strideRows, in_strideCols, padding_type, -Eigen::NumTraits<typename internal::remove_const<typename internal::traits<Input>::Scalar>::type>::highest()).reduce(reduction_dims, mean_with_nan).reshape(post_reduce_dims); +} + + +/** CuboidAvgPooling + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies an average pooling over a multichannel input volume. 
+ * + * The input parameter is expected to be a tensor with a rank of 5 (channels, depth, height, width, others, and the reverse of that in row-major). + * + * The result can be assigned to a tensor of rank equal to the rank of the input. The dimensions of the result will be channels, depth, width, and others (in col-major, and the reverse of that if the input was row-major). + * + * The order of the depth, width and height dimensions can be swapped if needed. + * +*/ +#if !defined(EIGEN_HAS_INDEX_LIST) +template <typename Input> +EIGEN_ALWAYS_INLINE static const TensorReshapingOp< + const Eigen::DSizes<DenseIndex, internal::traits<Input>::NumDimensions>, + const TensorReductionOp< + internal::AvgPoolMeanReducer<float>, const Eigen::array<int, 1>, + const TensorReshapingOp< + const Eigen::DSizes<DenseIndex, 3>, + const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Input> > > > +#else +template <typename Input> +EIGEN_ALWAYS_INLINE static const TensorReshapingOp< + const Eigen::DSizes<DenseIndex, internal::traits<Input>::NumDimensions>, + const TensorReductionOp< + internal::AvgPoolMeanReducer<float>, + const Eigen::IndexList<Eigen::type2index<1> >, + const TensorReshapingOp< + const Eigen::DSizes<DenseIndex, 3>, + const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Input> > > > +#endif +CuboidAvgPooling(const Input& input, DenseIndex patchPlanes, + DenseIndex patchRows, DenseIndex patchCols, + DenseIndex stridePlanes, DenseIndex strideRows, + DenseIndex strideCols, const PaddingType padding_type) { + EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 5, YOU_MADE_A_PROGRAMMING_MISTAKE); + static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor); + + typedef typename internal::traits<Input>::Index TensorIndex; + TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input); + + static const int idxPlanes = isColMajor ? 1 : 3; + static const int idxRows = 2; + static const int idxCols = isColMajor ? 
3 : 1; + // Molds the output of the reduction into the shape expected by the used + // (assuming col-major): + // - 1st dim: channels + // - 2nd dim: outupt depth + // - 3rd dim: output height + // - 4th dim: output width + // - 5th dim and beyond: everything else including batch size + Eigen::DSizes<DenseIndex, internal::traits<Input>::NumDimensions> post_reduce_dims; + post_reduce_dims[0] = in.dimension(0); + if (padding_type == PADDING_VALID) { + post_reduce_dims[idxPlanes] = numext::ceil((in.dimension(idxPlanes) - patchPlanes + 1.f) / static_cast<float>(stridePlanes)); + post_reduce_dims[idxRows] = numext::ceil((in.dimension(idxRows) - patchRows + 1.f) / static_cast<float>(strideRows)); + post_reduce_dims[idxCols] = numext::ceil((in.dimension(idxCols) - patchCols + 1.f) / static_cast<float>(strideCols)); + } else { + post_reduce_dims[idxPlanes] = numext::ceil(in.dimension(idxPlanes) / static_cast<float>(stridePlanes)); + post_reduce_dims[idxRows] = numext::ceil(in.dimension(idxRows) / static_cast<float>(strideRows)); + post_reduce_dims[idxCols] = numext::ceil(in.dimension(idxCols) / static_cast<float>(strideCols)); + } + post_reduce_dims[4] = in.dimension(4); + + Eigen::DSizes<DenseIndex, 3> pre_reduce_dims; + pre_reduce_dims[1] = patchRows * patchCols * patchPlanes; + if (isColMajor) { + pre_reduce_dims[0] = post_reduce_dims[0]; + pre_reduce_dims[2] = post_reduce_dims[1] * post_reduce_dims[2] * post_reduce_dims[3] * post_reduce_dims[4]; + } else { + pre_reduce_dims[0] = post_reduce_dims[0] * post_reduce_dims[1] * post_reduce_dims[2] * post_reduce_dims[3]; + pre_reduce_dims[2] = post_reduce_dims[4]; + } + + typedef typename internal::remove_const<typename internal::traits<Input>::Scalar>::type CoeffReturnType; + internal::AvgPoolMeanReducer<CoeffReturnType> mean_with_nan; + +#if !defined(EIGEN_HAS_INDEX_LIST) + // nvcc doesn't support cxx11 + Eigen::array<int, 1> reduction_dims; + reduction_dims[0] = 1; +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. + Eigen::IndexList<Eigen::type2index<1> > reduction_dims; +#endif + return input.extract_volume_patches(patchPlanes, patchRows, patchCols, + stridePlanes, strideRows, strideCols, + padding_type, -Eigen::NumTraits<float>::highest()) + .reshape(pre_reduce_dims) + .reduce(reduction_dims, mean_with_nan) + .reshape(post_reduce_dims); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_NEURAL_NETWORKS_POOLING_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/SoftMax.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/SoftMax.h new file mode 100644 index 0000000000..223ae28ffd --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/SoftMax.h @@ -0,0 +1,82 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#ifndef EIGEN_CXX11_NEURAL_NETWORKS_SOFTMAX_H +#define EIGEN_CXX11_NEURAL_NETWORKS_SOFTMAX_H + +namespace Eigen { + +/** SoftMax + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies a softmax + * + * The input parameter is expected to be a col-major tensor with a rank of 2 (depth and other). + * + * The result can be assigned to a tensor of rank and dimensions equal to that of the input. 
The result will be laid out in col-major order. + * +*/ + +namespace { +struct SoftmaxOp { + SoftmaxOp(const float beta) : beta_(beta) { } + + template <typename Input> + typename Input::Dimensions dimensions(const Input& input) const { + return input.dimensions(); + } + + template <typename Input, typename Output, typename Device> + void eval(const Input& input, Output& output, const Device& device) const + { +#if !defined(EIGEN_HAS_INDEX_LIST) + // nvcc doesn't support cxx11 + Eigen::array<typename internal::traits<Input>::Index, 1> depth_dim; + depth_dim[0] = 0; + Eigen::array<typename internal::traits<Input>::Index, 2> bcast; + bcast[0] = dimensions(input)[0]; + bcast[1] = 1; + DSizes<typename internal::traits<Input>::Index, 2> dims2d; + dims2d[0] = 1; + dims2d[1] = dimensions(input)[1]; +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. + Eigen::IndexList<Eigen::type2index<0>> depth_dim; + Eigen::IndexList<int, Eigen::type2index<1>> bcast; + bcast.set(0, dimensions(input)[0]); + Eigen::IndexList<Eigen::type2index<1>, typename internal::traits<Input>::Index> dims2d; + dims2d.set(1, dimensions(input)[1]); +#endif + + output.device(device) = ((input - input.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)) * beta_).exp(); + output.device(device) = output / (output.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast)); + } + + private: + const float beta_; +}; +} + + +template <typename Input> +EIGEN_ALWAYS_INLINE +static const TensorCustomUnaryOp<const SoftmaxOp, const Input> +SoftMax(const Input& input, const float beta) +{ + EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 2, YOU_MADE_A_PROGRAMMING_MISTAKE); + + const SoftmaxOp op(beta); + return input.customOp(op); +} + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_NEURAL_NETWORKS_SOFTMAX_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/SpatialConvolutions.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/SpatialConvolutions.h new file mode 100644 index 0000000000..34a9fcf037 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/SpatialConvolutions.h @@ -0,0 +1,634 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#ifndef EIGEN_CXX11_NEURAL_NETWORKS_SPATIAL_CONVOLUTIONS_H +#define EIGEN_CXX11_NEURAL_NETWORKS_SPATIAL_CONVOLUTIONS_H + +namespace Eigen { + +namespace internal { + +// These optimizations require vector instructions +#ifdef EIGEN_VECTORIZE + +// TODO: Consolidate this part of the code with the image patch extraction code +// since they are both very similar. 
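+// Note (added): the specialization of TensorContractionInputMapper below feeds
+// the contraction kernels directly from a reshaped image-patch expression:
+// instead of materializing extract_image_patches() into a temporary, each
+// requested coefficient or packet is computed on the fly from the underlying
+// input tensor, with out-of-bounds (padded) positions returned as zeros.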
+template <typename NewDimension, DenseIndex Rows, DenseIndex Cols, typename ArgType, typename Device, + typename Scalar, typename Index, + typename nocontract_t, typename contract_t, + int Side, size_t packet_size, + bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment> +class TensorContractionInputMapper<Scalar, Index, Side, TensorEvaluator<const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> +{ + public: + typedef TensorContractionInputMapper<Scalar, Index, Side, TensorEvaluator<const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Self; + typedef Self SubMapper; + typedef Self VectorMapper; + typedef Self LinearMapper; + typedef typename packet_traits<Scalar>::type Packet; + + TensorContractionInputMapper(const TensorEvaluator<const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >, Device>& tensor, + const nocontract_t&, const nocontract_t&, + const contract_t&, const contract_t&, + const Index depth_offset = 0, const Index col_offset = 0) + : m_depth_offset(depth_offset), m_col_offset(col_offset), m_impl(tensor.impl().impl()) + { + if (internal::traits<ArgType>::Layout == ColMajor) { + m_patch_depth = tensor.impl().dimensions()[0]; + m_patch_rows = tensor.impl().dimensions()[1]; + m_patch_cols = tensor.impl().dimensions()[2]; + m_num_patches = tensor.impl().dimensions()[3]; + } else { + static const int NumDims = tensor.impl().dimensions().size(); + m_patch_depth = tensor.impl().dimensions()[NumDims - 1]; + m_patch_rows = tensor.impl().dimensions()[NumDims - 2]; + m_patch_cols = tensor.impl().dimensions()[NumDims - 3]; + m_num_patches = tensor.impl().dimensions()[NumDims - 4]; + } + m_patch_row_inflate_strides = tensor.impl().rowInflateStride(); + m_patch_col_inflate_strides = tensor.impl().colInflateStride(); + + m_colStride = m_patch_rows; + + m_outputRows = tensor.impl().outputRows(); + m_row_strides = tensor.impl().userRowStride(); + m_col_strides = tensor.impl().userColStride(); + + m_in_row_strides = tensor.impl().userInRowStride(); + m_in_col_strides = tensor.impl().userInColStride(); + + if (internal::traits<ArgType>::Layout == ColMajor) { + m_inputRows = tensor.impl().impl().dimensions()[1]; + m_inputCols = tensor.impl().impl().dimensions()[2]; + } else { + static const int NumDims = tensor.impl().impl().dimensions().size(); + m_inputRows = tensor.impl().impl().dimensions()[NumDims - 2]; + m_inputCols = tensor.impl().impl().dimensions()[NumDims - 3]; + } + + m_rowInputStride = m_patch_depth; + m_colInputStride = m_patch_depth * m_inputRows; + m_patchInputStride = m_patch_depth * m_inputRows * m_inputCols; + + m_rowPaddingTop = tensor.impl().rowPaddingTop(); + m_colPaddingLeft = tensor.impl().colPaddingLeft(); + + m_fastInputRowStride = internal::TensorIntDivisor<Index>(m_patch_row_inflate_strides); + m_fastInputColStride = internal::TensorIntDivisor<Index>(m_patch_col_inflate_strides); + m_fastNumPatches = internal::TensorIntDivisor<Index>(m_num_patches); + m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride); + m_fastOutputRows = internal::TensorIntDivisor<Index>(m_outputRows); + m_fastDimZero = internal::TensorIntDivisor<Index>(m_patch_depth); + + computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex, m_otherIndex); + } + + 
TensorContractionInputMapper(const TensorContractionInputMapper& base_mapper, + const Index depth_offset, + const Index col_offset) : m_depth_offset(depth_offset), m_col_offset(col_offset), m_impl(base_mapper.m_impl) { + m_patch_depth = base_mapper.m_patch_depth; + m_patch_rows = base_mapper.m_patch_rows; + m_patch_cols = base_mapper.m_patch_cols; + m_num_patches = base_mapper.m_num_patches; + m_patch_row_inflate_strides = base_mapper.m_patch_row_inflate_strides; + m_patch_col_inflate_strides = base_mapper.m_patch_col_inflate_strides; + + m_colStride = base_mapper.m_colStride; + + m_rowInputStride = base_mapper.m_rowInputStride; + m_colInputStride = base_mapper.m_colInputStride; + m_patchInputStride = base_mapper.m_patchInputStride; + + m_inputRows = base_mapper.m_inputRows; + m_inputCols = base_mapper.m_inputCols; + + m_outputRows = base_mapper.m_outputRows; + m_row_strides = base_mapper.m_row_strides; + m_col_strides = base_mapper.m_col_strides; + + m_in_row_strides = base_mapper.m_in_row_strides; + m_in_col_strides = base_mapper.m_in_col_strides; + + m_rowPaddingTop = base_mapper.m_rowPaddingTop; + m_colPaddingLeft = base_mapper.m_colPaddingLeft; + + m_fastInputRowStride = base_mapper.m_fastInputRowStride; + m_fastInputColStride = base_mapper.m_fastInputColStride; + m_fastNumPatches = base_mapper.m_fastNumPatches; + m_fastColStride = base_mapper.m_fastColStride; + m_fastOutputRows = base_mapper.m_fastOutputRows; + m_fastDimZero = base_mapper.m_fastDimZero; + + computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex, m_otherIndex); + } + + // If true, turns off some optimizations for loading packets since the image + // patches are "non-standard" such as there are non-trivial strides or + // inflations in the input. + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE bool nonStandardPatches() const { + return m_in_row_strides != 1 || m_in_col_strides != 1 || m_patch_row_inflate_strides != 1 || m_patch_col_inflate_strides != 1; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { + return SubMapper(*this, m_depth_offset + i, m_col_offset + j); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + return LinearMapper(*this, m_depth_offset + i, m_col_offset + j); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Scalar operator()(Index row) const { + return loadCoeff(row + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex); + } + + // Load the coefficient at the patchIndex location instead of the usual m_rowIndex, + // m_colIndex, m_otherIndex. This is currently only used by the gpu code. EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar operator()(Index row, Index patchIndex) const { + checkZeroOffsets(); + Index rowIndex, colIndex, otherIndex; + computeBaseIndices(patchIndex, rowIndex, colIndex, otherIndex); + return loadCoeff(row, rowIndex, colIndex, otherIndex); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet loadPacket(Index row) const { + return loadPacket(row + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex); + } + + // Load the packet at the patchIndex location instead of the usual m_rowIndex, + // m_colIndex, m_otherIndex. This is currently only used by the gpu code. 
+ EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet loadPacket(Index row, Index patchIndex) const { + checkZeroOffsets(); + Index rowIndex, colIndex, otherIndex; + computeBaseIndices(patchIndex, rowIndex, colIndex, otherIndex); + return loadPacket(row, rowIndex, colIndex, otherIndex); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchDepth() const { return m_patch_depth; } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchRows() const { return m_patch_rows; } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchCols() const { return m_patch_cols; } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE bool padRow(const Index row) const { + const Index r = m_rowIndex + row; + return r < 0 | r >= m_inputRows; + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE bool padCol(const Index col) const { + const Index c = m_colIndex + col; + return c < 0 | c >= m_inputCols; + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index baseIndex(const Index row, const Index col) const { + const Index r = m_rowIndex + row; + const Index c = m_colIndex + col; + return r * m_rowInputStride + c * m_colInputStride + m_otherIndex; + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet packetNoPadding(const Index depth, const Index baseIndex) const { + const Index inputIndex = depth + baseIndex; + return m_impl.template packet<Unaligned>(inputIndex); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index rowOffset() const { + const Index patchOffset = m_depth_offset / m_fastDimZero; + const Index colOffset = patchOffset / m_fastColStride; + return patchOffset-colOffset*m_colStride; + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index colOffset() const { + const Index patchOffset = m_depth_offset / m_fastDimZero; + const Index colOffset = patchOffset / m_fastColStride; + return colOffset; + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index depthOffset() const { + const Index patchOffset = m_depth_offset % m_patch_depth; + return patchOffset; + } + + private: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar loadCoeff(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const { + // Find the offset of the element wrt the location of the first element. + const Index patchOffset = patchId / m_fastDimZero; + + const Index colOffset = patchOffset / m_fastColStride; + const Index inputCol = colIndex + colOffset * m_in_col_strides; + const Index origInputCol = (m_patch_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); + const Index rowOffset = patchOffset - colOffset * m_colStride; + const Index inputRow = rowIndex + rowOffset * m_in_row_strides; + const Index origInputRow = (m_patch_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? 
(inputRow / m_fastInputRowStride) : 0); + if (origInputCol < 0 | origInputRow < 0 | origInputCol >= m_inputCols | origInputRow >= m_inputRows | + (inputCol != origInputCol * m_patch_col_inflate_strides) | (inputRow != origInputRow * m_patch_row_inflate_strides)) { + return Scalar(0); + } + const Index depth = patchId - patchOffset * m_patch_depth; + const Index inputIndex = depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex; + return m_impl.coeff(inputIndex); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet loadPacket(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const { + const Index packetSize = internal::unpacket_traits<Packet>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(patchId < m_patch_depth*m_patch_rows*m_patch_cols); + + if (nonStandardPatches()) { + return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex); + } + + if ((m_patch_depth % packetSize) == 0) { + // Find the offset of the element wrt the location of the first element. + const Index patchOffset = patchId / m_fastDimZero; + eigen_assert((patchId + packetSize - 1) / m_fastDimZero == patchOffset); + + const Index colOffset = patchOffset / m_fastColStride; + const Index inputCol = colIndex + colOffset; + const Index rowOffset = patchOffset - colOffset*m_colStride; + const Index inputRow = rowIndex + rowOffset; + if (inputCol < 0 | inputRow < 0 | inputCol >= m_inputCols | inputRow >= m_inputRows) { + // all zeros + return internal::pset1<Packet>(Scalar(0)); + } + // no padding + const Index depth = patchId - patchOffset * m_patch_depth; + const Index inputIndex = depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex; + return m_impl.template packet<Unaligned>(inputIndex); + } + else { + const Index patchOffsets[2] = {patchId / m_fastDimZero, (patchId + packetSize - 1) / m_fastDimZero}; + + const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, patchOffsets[1] / m_fastColStride}; + + const Index inputCols[2] = {colIndex + colOffsets[0], colIndex + colOffsets[1]}; + if (inputCols[0] >= m_inputCols | inputCols[1] < 0) { + // all zeros + return internal::pset1<Packet>(Scalar(0)); + } + + if (inputCols[0] == inputCols[1]) { + const Index rowOffsets[2] = {patchOffsets[0] - colOffsets[0]*m_colStride, patchOffsets[1] - colOffsets[1]*m_colStride}; + eigen_assert(rowOffsets[0] <= rowOffsets[1]); + const Index inputRows[2] = {rowIndex + rowOffsets[0], rowIndex + rowOffsets[1]}; + + if (inputRows[0] >= m_inputRows | inputRows[1] < 0) { + // all zeros + return internal::pset1<Packet>(Scalar(0)); + } + + if (inputRows[0] >= 0 & inputRows[1] < m_inputRows) { + // no padding + const Index depth = patchId - patchOffsets[0] * m_patch_depth; + const Index inputIndex = depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex; + return m_impl.template packet<Unaligned>(inputIndex); + } + } + } + return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet packetWithPossibleZero(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const + { + const int packetSize = internal::unpacket_traits<Packet>::size; + EIGEN_ALIGN_MAX typename internal::remove_const<Scalar>::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = loadCoeff(patchId+i, rowIndex, colIndex, otherIndex); + } + Packet rslt = internal::pload<Packet>(values); + return rslt; + } + + EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE void computeBaseIndices(Index patchIndex, Index& rowIndex, Index& colIndex, Index& otherIndex) const { + const int NumInputDims = array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + otherIndex = (NumInputDims == 3) ? 0 : patchIndex / m_fastNumPatches; + const Index patch2DIndex = (NumInputDims == 3) ? patchIndex : (patchIndex - otherIndex * m_num_patches); + otherIndex *= m_patchInputStride; + colIndex = patch2DIndex / m_fastOutputRows; + rowIndex = patch2DIndex - colIndex * m_outputRows; + colIndex = colIndex * m_col_strides - m_colPaddingLeft; + rowIndex = rowIndex * m_row_strides - m_rowPaddingTop; + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void checkZeroOffsets() const { + eigen_assert(m_col_offset == 0); + eigen_assert(m_depth_offset == 0); + eigen_assert(m_rowIndex == 0); + eigen_assert(m_colIndex == 0); + eigen_assert(m_otherIndex == 0); + } + + Index m_depth_offset; // First row in the input matrix + Index m_col_offset; // First col in the input matrix + + Index m_patch_depth; // patch depth, which is equal to the input depth + Index m_patch_rows; // number of rows in the patch + Index m_patch_cols; // number of colums in the patch + Index m_num_patches; // number of patches to extract. + Index m_patch_row_inflate_strides; // the strides for row inflation in the image patch + Index m_patch_col_inflate_strides; // the strides for col inflation in the image patch + // Fast representation of inflation strides. + internal::TensorIntDivisor<Index> m_fastInputRowStride; + internal::TensorIntDivisor<Index> m_fastInputColStride; + + Index m_otherStride; + Index m_colStride; + internal::TensorIntDivisor<Index> m_fastNumPatches; + internal::TensorIntDivisor<Index> m_fastColStride; + + Index m_rowInputStride; // row stride in the input tensor + Index m_colInputStride; // col stride in the input tensor + Index m_patchInputStride; // patch stride in the input tensor + + Index m_inputRows; // Number of rows in the input tensor + Index m_inputCols; // Number of cols in the input tensor + + Index m_outputRows; // Number of patch rows + + Index m_row_strides; // User specified row stride + Index m_col_strides; // User specified col stride + + Index m_in_row_strides; // User specified input row stride + Index m_in_col_strides; // User specified input col stride + + Index m_rowPaddingTop; // Row padding + Index m_colPaddingLeft; // Column padding + + internal::TensorIntDivisor<Index> m_fastOutputRows; + internal::TensorIntDivisor<Index> m_fastDimZero; + + Index m_rowIndex; // precomputed row index corresponding to the col offset + Index m_colIndex; // precomputed col index corresponding to the col offset + Index m_otherIndex; // precomputed other index corresponding to the col offset + + const TensorEvaluator<ArgType, Device> m_impl; +}; + + +template <typename NewDimension, DenseIndex Rows, DenseIndex Cols, typename ArgType, typename Device, + typename Scalar, typename Index, + typename nocontract_t, typename contract_t, + int Side, size_t packet_size, + bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, int nr> +struct gemm_pack_rhs<Scalar, Index, TensorContractionInputMapper<Scalar, Index, Side, TensorEvaluator<const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>, nr, ColMajor, false, false> { + + typedef TensorContractionInputMapper<Scalar, Index, Side, TensorEvaluator<const 
TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> DataMapper; + + static inline Index ceil_div(Index a, Index b) { + return (a + b - 1) / b; + } + + EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0) const { + eigen_assert(stride == 0); + eigen_assert(offset == 0); + + EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE); + typedef typename DataMapper::LinearMapper LinearMapper; + typedef typename packet_traits<Scalar>::type Packet; + + const Index packet_cols4 = (cols/4) * 4; + const Index peeled_k = (depth/packet_size) * packet_size; + + for(Index j2=0; j2<packet_cols4; j2+=4) + { + const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0); + const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1); + const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2); + const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3); + + Index k=0; + if((packet_size%4)==0 && !rhs.nonStandardPatches()) + { + const Index patch_depth = rhs.patchDepth(); + if ((patch_depth % packet_size) == 0) { + const Index patch_cols = rhs.patchCols(); + const Index patch_rows = rhs.patchRows(); + + const Index startCol = rhs.colOffset(); + const Index max_cols = std::min<Index>(ceil_div(peeled_k, patch_rows*patch_depth)+startCol, patch_cols); + + for (Index c = startCol; c < max_cols; ++c) { + eigen_assert(k < peeled_k); + const Index startRow = (c == startCol) ? rhs.rowOffset() : 0; + const Index max_rows = std::min<Index>(ceil_div(peeled_k-c*patch_rows*patch_depth, patch_depth)+startRow, patch_rows); + + const bool pad_col0 = dm0.padCol(c); + const bool pad_col1 = dm1.padCol(c); + const bool pad_col2 = dm2.padCol(c); + const bool pad_col3 = dm3.padCol(c); + for (Index r = startRow; r < max_rows; ++r) { + eigen_assert(k < peeled_k); + const bool pad0 = pad_col0 || dm0.padRow(r); + const bool pad1 = pad_col1 || dm1.padRow(r); + const bool pad2 = pad_col2 || dm2.padRow(r); + const bool pad3 = pad_col3 || dm3.padRow(r); + + const Index idx0 = dm0.baseIndex(r, c); + const Index idx1 = dm1.baseIndex(r, c); + const Index idx2 = dm2.baseIndex(r, c); + const Index idx3 = dm3.baseIndex(r, c); + + const Index startDepth = ((c == startCol) && (r == startRow)) ? rhs.depthOffset() : 0; + const Index max_depth = std::min<Index>(peeled_k-c*patch_rows*patch_depth-r*patch_depth+startDepth, patch_depth); + eigen_assert(max_depth % packet_size == 0); + for (Index d = startDepth; d < max_depth; d += packet_size) { + eigen_assert(k < peeled_k); + PacketBlock<Packet, 4> kernel; + kernel.packet[0] = pad0 ? pset1<Packet>(0) : dm0.packetNoPadding(d, idx0); + kernel.packet[1] = pad1 ? pset1<Packet>(0) : dm1.packetNoPadding(d, idx1); + kernel.packet[2] = pad2 ? pset1<Packet>(0) : dm2.packetNoPadding(d, idx2); + kernel.packet[3] = pad3 ? 
pset1<Packet>(0) : dm3.packetNoPadding(d, idx3); + ptranspose(kernel); + pstoreu(block+0*packet_size, kernel.packet[0]); + pstoreu(block+1*packet_size, kernel.packet[1]); + pstoreu(block+2*packet_size, kernel.packet[2]); + pstoreu(block+3*packet_size, kernel.packet[3]); + block+=4*packet_size; + k += packet_size; + } + } + } + } + + for(; k<peeled_k; k+=packet_size) { + PacketBlock<Packet, 4> kernel; + kernel.packet[0] = dm0.loadPacket(k); + kernel.packet[1] = dm1.loadPacket(k); + kernel.packet[2] = dm2.loadPacket(k); + kernel.packet[3] = dm3.loadPacket(k); + ptranspose(kernel); + pstoreu(block+0*packet_size, kernel.packet[0]); + pstoreu(block+1*packet_size, kernel.packet[1]); + pstoreu(block+2*packet_size, kernel.packet[2]); + pstoreu(block+3*packet_size, kernel.packet[3]); + block+=4*packet_size; + } + } + for(; k<depth; k++) + { + block[0] = dm0(k); + block[1] = dm1(k); + block[2] = dm2(k); + block[3] = dm3(k); + block += 4; + } + } + + // copy the remaining columns one at a time (nr==1) + for(Index j2=packet_cols4; j2<cols; ++j2) + { + const LinearMapper dm0 = rhs.getLinearMapper(0, j2); + for(Index k=0; k<depth; k++) + { + *block = dm0(k); + block += 1; + } + } + } +}; + +#endif // EIGEN_VECTORIZE +} // end namespace internal + + +/** SpatialConvolution + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies a 2D convolution over a multichannel input image. + * + * The input parameter is expected to be a tensor with a rank of 3 or more (channels, height, width, and optionally others) + * The kernel parameter is expected to be a 4D tensor (filters, channels, kernel_height, kernel_width) + * The input and the kernel must both be in col-major layout. The result will also be in col-major layout. + * + * If in_stride > 1, then applies convolution with holes (aka atrous convolution), sampling every in_stride input pixels. + * + * The result can be assigned to a tensor of rank equal to the rank of the input. The dimensions of the result will be filters, height, width (and others if applicable). + * + * It is possible to swap the order of the width and height dimensions provided that the same order is used in the input, the kernel, and the output. 
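+ *
+ * A minimal usage sketch (the shapes below are purely illustrative): applying
+ * 7 filters of size 3x3 with the default stride of 1 and SAME padding to a
+ * col-major input of shape (channels, height, width, batch):
+ * \code
+ * Eigen::Tensor<float, 4> input(4, 32, 32, 8);
+ * Eigen::Tensor<float, 4> kernel(7, 4, 3, 3);
+ * Eigen::Tensor<float, 4> output = Eigen::SpatialConvolution(input, kernel);
+ * \endcode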
+ * + */ +template <typename Input, typename Kernel> +EIGEN_ALWAYS_INLINE +static const typename internal::conditional< + internal::traits<Input>::Layout == ColMajor, + TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>, const TensorContractionOp<const array<IndexPair<typename internal::traits<Input>::Index>, 1>, const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, const Kernel>, const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, const TensorImagePatchOp<Dynamic, Dynamic, const Input> > > >, + TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>, const TensorContractionOp<const array<IndexPair<typename internal::traits<Input>::Index>, 1>, const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, const TensorImagePatchOp<Dynamic, Dynamic, const Input> >, const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, const Kernel> > > >::type +SpatialConvolution(const Input& input, const Kernel& kernel, const DenseIndex stride = 1, const PaddingType padding_type = PADDING_SAME, const DenseIndex in_stride = 1) { + + typedef typename internal::traits<Input>::Index TensorIndex; + TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input); + TensorRef<Tensor<typename internal::traits<Kernel>::Scalar, internal::traits<Kernel>::NumDimensions, internal::traits<Kernel>::Layout, TensorIndex> > kern(kernel); + + EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == internal::traits<Kernel>::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE); + static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor); + + static const int NumDims = internal::traits<Input>::NumDimensions; + + // Number of filters to apply. This is the same as the output depth of the result + const TensorIndex kernelFilters = isColMajor ? kern.dimensions()[0] : kern.dimensions()[3]; + // Number of channels. This is the same as the input depth. + const TensorIndex kernelChannels = isColMajor ? kern.dimensions()[1] : kern.dimensions()[2]; + const TensorIndex kernelRows = isColMajor ? kern.dimensions()[2] : kern.dimensions()[1]; + const TensorIndex kernelCols = isColMajor ? kern.dimensions()[3] : kern.dimensions()[0]; + + const DenseIndex kernelRowsEff = kernelRows + (kernelRows - 1) * (in_stride - 1); + const DenseIndex kernelColsEff = kernelCols + (kernelCols - 1) * (in_stride - 1); + + array<IndexPair<TensorIndex>, 1> contract_dims; + contract_dims[0] = IndexPair<TensorIndex>(1, 0); + + const TensorIndex InputRows = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2); + const TensorIndex InputCols = isColMajor ? 
in.dimension(2) : in.dimension(NumDims - 3); + + TensorIndex out_height; + TensorIndex out_width; + switch (padding_type) { + case PADDING_VALID: + out_height = numext::ceil((InputRows - kernelRowsEff + 1.f) / static_cast<float>(stride)); + out_width = numext::ceil((InputCols - kernelColsEff + 1.f) / static_cast<float>(stride)); + break; + case PADDING_SAME: + out_height = numext::ceil(InputRows / static_cast<float>(stride)); + out_width = numext::ceil(InputCols / static_cast<float>(stride)); + break; + default: + eigen_assert(false && "unexpected padding"); + } + + // Molds the output of the patch extraction code into a 2d tensor: + // - the first dimension (dims[0]): the patch values to be multiplied with the kernels + // - the second dimension (dims[1]): everything else + DSizes<TensorIndex, 2> pre_contract_dims; + if (isColMajor) { + pre_contract_dims[0] = kernelChannels * kernelRows * kernelCols; + pre_contract_dims[1] = out_height * out_width; + for (int i = 3; i < NumDims; ++i) { + pre_contract_dims[1] *= in.dimension(i); + } + } else { + pre_contract_dims[1] = kernelChannels * kernelRows * kernelCols; + pre_contract_dims[0] = out_height * out_width; + for (int i = 0; i < NumDims - 3; ++i) { + pre_contract_dims[0] *= in.dimension(i); + } + } + + // Molds the output of the contraction into the shape expected by the used + // (assuming this is ColMajor): + // - 1st dim: kernel filters + // - 2nd dim: output height + // - 3rd dim: output width + // - 4th dim and beyond: everything else including batch size + DSizes<TensorIndex, NumDims> post_contract_dims; + if (isColMajor) { + post_contract_dims[0] = kernelFilters; + post_contract_dims[1] = out_height; + post_contract_dims[2] = out_width; + for (int i = 3; i < NumDims; ++i) { + post_contract_dims[i] = in.dimension(i); + } + } else { + post_contract_dims[NumDims - 1] = kernelFilters; + post_contract_dims[NumDims - 2] = out_height; + post_contract_dims[NumDims - 3] = out_width; + for (int i = 0; i < NumDims - 3; ++i) { + post_contract_dims[i] = in.dimension(i); + } + } + + DSizes<TensorIndex, 2> kernel_dims; + if (isColMajor) { + kernel_dims[0] = kernelFilters; + kernel_dims[1] = kernelChannels * kernelRows * kernelCols; + } else { + kernel_dims[0] = kernelChannels * kernelRows * kernelCols; + kernel_dims[1] = kernelFilters; + } + // TODO(yangke): choose() is defined in TensorContraction.h -- consider + // moving it to somewhere more "common". + return choose(Cond<internal::traits<Input>::Layout == ColMajor>(), + kernel.reshape(kernel_dims).contract(input.extract_image_patches(kernelRows, kernelCols, stride, stride, in_stride, in_stride, padding_type).reshape(pre_contract_dims), contract_dims).reshape(post_contract_dims), + input.extract_image_patches(kernelRows, kernelCols, stride, stride, in_stride, in_stride, padding_type).reshape(pre_contract_dims).contract(kernel.reshape(kernel_dims), contract_dims).reshape(post_contract_dims)); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_NEURAL_NETWORKS_SPATIAL_CONVOLUTIONS_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/TensorConvolutionByFFT.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/TensorConvolutionByFFT.h new file mode 100644 index 0000000000..0e72173536 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/TensorConvolutionByFFT.h @@ -0,0 +1,289 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// Copyright (C) 2015 Jianwei Cui <thucjw@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTIONBYFFT_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTIONBYFFT_H + +namespace Eigen { + +/** \class TensorConvolutionByFFT + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor convolution class. + * + * + */ +namespace internal { + + +template<typename Dimensions, typename InputXprType, typename KernelXprType> +struct traits<TensorConvolutionByFFTOp<Dimensions, InputXprType, KernelXprType> > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename promote_storage_type<typename InputXprType::Scalar, + typename KernelXprType::Scalar>::ret Scalar; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind, + typename traits<KernelXprType>::StorageKind>::ret StorageKind; + typedef typename promote_index_type<typename traits<InputXprType>::Index, + typename traits<KernelXprType>::Index>::type Index; + typedef typename InputXprType::Nested LhsNested; + typedef typename KernelXprType::Nested RhsNested; + typedef typename remove_reference<LhsNested>::type _LhsNested; + typedef typename remove_reference<RhsNested>::type _RhsNested; + static const int NumDimensions = traits<InputXprType>::NumDimensions; + static const int Layout = traits<InputXprType>::Layout; + + enum { + Flags = 0, + }; +}; + +template<typename Dimensions, typename InputXprType, typename KernelXprType> +struct eval<TensorConvolutionByFFTOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense> +{ + typedef const TensorConvolutionByFFTOp<Dimensions, InputXprType, KernelXprType>& type; +}; + +template<typename Dimensions, typename InputXprType, typename KernelXprType> +struct nested<TensorConvolutionByFFTOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionByFFTOp<Dimensions, InputXprType, KernelXprType> >::type> +{ + typedef TensorConvolutionByFFTOp<Dimensions, InputXprType, KernelXprType> type; +}; + +} // end namespace internal + + + +template<typename Indices, typename InputXprType, typename KernelXprType> +class TensorConvolutionByFFTOp : public TensorBase<TensorConvolutionByFFTOp<Indices, InputXprType, KernelXprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorConvolutionByFFTOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorConvolutionByFFTOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType, + typename KernelXprType::CoeffReturnType>::ret CoeffReturnType; + typedef typename internal::promote_storage_type<typename InputXprType::PacketReturnType, + typename KernelXprType::PacketReturnType>::ret PacketReturnType; + typedef typename Eigen::internal::nested<TensorConvolutionByFFTOp>::type Nested; + typedef typename Eigen::internal::traits<TensorConvolutionByFFTOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorConvolutionByFFTOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionByFFTOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims) + : m_input_xpr(input), 
m_kernel_xpr(kernel), m_indices(dims) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Indices& indices() const { return m_indices; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const typename internal::remove_all<typename InputXprType::Nested>::type& + inputExpression() const { return m_input_xpr; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const typename internal::remove_all<typename KernelXprType::Nested>::type& + kernelExpression() const { return m_kernel_xpr; } + + protected: + typename InputXprType::Nested m_input_xpr; + typename KernelXprType::Nested m_kernel_xpr; + const Indices m_indices; +}; + + +template<typename Indices, typename InputArgType, typename KernelArgType, typename Device> +struct TensorEvaluator<const TensorConvolutionByFFTOp<Indices, InputArgType, KernelArgType>, Device> +{ + typedef TensorConvolutionByFFTOp<Indices, InputArgType, KernelArgType> XprType; + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + + static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value; + static const int NumKernelDims = internal::array_size<Indices>::value; + typedef typename XprType::Index Index; + typedef DSizes<Index, NumDims> Dimensions; + + enum { + IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & + TensorEvaluator<KernelArgType, Device>::IsAligned, + PacketAccess = false, + BlockAccess = false, + Layout = TensorEvaluator<InputArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device) + { + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_inputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1]; + } + } else { + m_inputStride[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1]; + } + } + + m_dimensions = m_inputImpl.dimensions(); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + if (i > 0) { + m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1]; + } else { + m_kernelStride[0] = 1; + } + m_indexStride[i] = m_inputStride[index]; + } + + m_outputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1]; + } + } else { + for (int i = NumKernelDims - 1; i >= 0; --i) { + const Index index = 
op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + if (i < NumKernelDims - 1) { + m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1]; + } else { + m_kernelStride[NumKernelDims - 1] = 1; + } + m_indexStride[i] = m_inputStride[index]; + } + + m_outputStride[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + m_inputImpl.evalSubExprsIfNeeded(NULL); + m_kernelImpl.evalSubExprsIfNeeded(NULL); + + typedef typename internal::traits<InputArgType>::Index TensorIndex; + + Tensor<Scalar, NumDims, Layout, TensorIndex> input(m_inputImpl.dimensions()); + for (int i = 0; i < m_inputImpl.dimensions().TotalSize(); ++i) { + input.data()[i] = m_inputImpl.coeff(i); + } + + Tensor<Scalar, NumDims, Layout, TensorIndex> kernel(m_kernelImpl.dimensions()); + for (int i = 0; i < m_kernelImpl.dimensions().TotalSize(); ++i) { + kernel.data()[i] = m_kernelImpl.coeff(i); + } + + array<std::pair<ptrdiff_t, ptrdiff_t>, NumDims> paddings; + for (int i = 0; i < NumDims; ++i) { + paddings[i] = std::make_pair(0, m_inputImpl.dimensions()[i] - m_kernelImpl.dimensions()[i]); + } + + Eigen::array<bool, NumKernelDims> reverse; + for (int i = 0; i < NumKernelDims; ++i) { + reverse[i] = true; + } + + Eigen::array<bool, NumDims> fft; + for (int i = 0; i < NumDims; ++i) { + fft[i] = i; + } + + Eigen::DSizes<TensorIndex, NumDims> slice_offsets; + for (int i = 0; i < NumDims; ++i) { + slice_offsets[i] = m_kernelImpl.dimensions()[i] - 1; + } + + Eigen::DSizes<TensorIndex, NumDims> slice_extents; + for (int i = 0; i < NumDims; ++i) { + slice_extents[i] = m_inputImpl.dimensions()[i] - m_kernelImpl.dimensions()[i] + 1; + } + + Tensor<Scalar, NumDims, Layout, TensorIndex> kernel_variant = kernel.reverse(reverse).pad(paddings); + Tensor<std::complex<Scalar>, NumDims, Layout, TensorIndex> kernel_fft = kernel_variant.template fft<Eigen::BothParts, FFT_FORWARD>(fft); + //Tensor<std::complex<Scalar>, NumDims, Layout|IndexType> kernel_fft = kernel.reverse(reverse).pad(paddings).template fft<2>(fft); + Tensor<std::complex<Scalar>, NumDims, Layout, TensorIndex> input_fft = input.template fft<Eigen::BothParts, FFT_FORWARD>(fft); + Tensor<std::complex<Scalar>, NumDims, Layout, TensorIndex> prod = (input_fft * kernel_fft).template fft<Eigen::BothParts, FFT_REVERSE>(fft); + Tensor<std::complex<Scalar>, NumDims, Layout, TensorIndex> tensor_result = prod.slice(slice_offsets, slice_extents); + + for (int i = 0; i < tensor_result.size(); ++i) { + data[i] = std::real(tensor_result.data()[i]); + } + return false; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_local_kernel) { + m_device.deallocate((void*)m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + + void evalTo(typename XprType::Scalar* buffer) { + evalSubExprsIfNeeded(NULL); + for (int i = 0; i < dimensions().TotalSize(); ++i) { + buffer[i] += coeff(i); + } + cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + CoeffReturnType result = CoeffReturnType(0); + return result; + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + private: + 
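// The members below cache the geometry needed by this evaluator: m_inputStride and
// m_outputStride hold per-dimension strides of the input and of the convolution result,
// m_kernelStride and m_indexStride map each kernel dimension onto the corresponding
// input stride, and m_kernel points to a contiguous copy of the kernel when one had to
// be materialized locally (m_local_kernel), in which case cleanup() returns that buffer
// to m_device.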
array<Index, NumDims> m_inputStride; + array<Index, NumDims> m_outputStride; + + array<Index, NumKernelDims> m_indexStride; + array<Index, NumKernelDims> m_kernelStride; + TensorEvaluator<InputArgType, Device> m_inputImpl; + TensorEvaluator<KernelArgType, Device> m_kernelImpl; + Dimensions m_dimensions; + + KernelArgType m_kernelArg; + const Scalar* m_kernel; + bool m_local_kernel; + const Device& m_device; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTIONBYFFT_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/Tensor.h new file mode 100644 index 0000000000..9db0d2698f --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -0,0 +1,461 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler <christian@iwakd.de> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_H + +namespace Eigen { + +/** \class Tensor + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor class. + * + * The %Tensor class is the work-horse for all \em dense tensors within Eigen. + * + * The %Tensor class encompasses only dynamic-size objects so far. + * + * The first two template parameters are required: + * \tparam Scalar_ \anchor tensor_tparam_scalar Numeric type, e.g. float, double, int or std::complex<float>. + * User defined scalar types are supported as well (see \ref user_defined_scalars "here"). + * \tparam NumIndices_ Number of indices (i.e. rank of the tensor) + * + * The remaining template parameters are optional -- in most cases you don't have to worry about them. + * \tparam Options_ \anchor tensor_tparam_options A combination of either \b #RowMajor or \b #ColMajor, and of either + * \b #AutoAlign or \b #DontAlign. + * The former controls \ref TopicStorageOrders "storage order", and defaults to column-major. The latter controls alignment, which is required + * for vectorization. It defaults to aligning tensors. Note that tensors currently do not support any operations that profit from vectorization. + * Support for such operations (i.e. adding two tensors etc.) is planned. + * + * You can access elements of tensors using normal subscripting: + * + * \code + * Eigen::Tensor<double, 4> t(10, 10, 10, 10); + * t(0, 1, 2, 3) = 42.0; + * \endcode + * + * This class can be extended with the help of the plugin mechanism described on the page + * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_TENSOR_PLUGIN. + * + * <i><b>Some notes:</b></i> + * + * <dl> + * <dt><b>Relation to other parts of Eigen:</b></dt> + * <dd>The midterm development goal for this class is to have a similar hierarchy as Eigen uses for matrices, so that + * taking blocks or using tensors in expressions is easily possible, including an interface with the vector/matrix code + * by providing .asMatrix() and .asVector() (or similar) methods for rank 2 and 1 tensors. However, currently, the %Tensor + * class does not provide any of these features and is only available as a stand-alone class that just allows for + * coefficient access.
Also, when fixed-size tensors are implemented, the number of template arguments is likely to + * change dramatically.</dd> + * </dl> + * + * \ref TopicStorageOrders + */ + +template<typename Scalar_, std::size_t NumIndices_, int Options_, typename IndexType_> +class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexType_> > +{ + public: + typedef Tensor<Scalar_, NumIndices_, Options_, IndexType_> Self; + typedef TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexType_> > Base; + typedef typename Eigen::internal::nested<Self>::type Nested; + typedef typename internal::traits<Self>::StorageKind StorageKind; + typedef typename internal::traits<Self>::Index Index; + typedef Scalar_ Scalar; + typedef typename internal::packet_traits<Scalar>::type Packet; + typedef typename NumTraits<Scalar>::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + typedef typename Base::PacketReturnType PacketReturnType; + + enum { + IsAligned = bool(EIGEN_ALIGN) & !(Options_ & DontAlign), + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + BlockAccess = false, + Layout = Options_ & RowMajor ? RowMajor : ColMajor, + CoordAccess = true, + }; + + static const int Options = Options_; + static const std::size_t NumIndices = NumIndices_; + typedef DSizes<Index, NumIndices_> Dimensions; + + protected: + TensorStorage<Scalar, Dimensions, Options_> m_storage; + + public: + // Metadata + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } + + // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + // work, because that uses base().coeffRef() - and we don't yet + // implement a similar class hierarchy + inline Self& base() { return *this; } + inline const Self& base() const { return *this; } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + EIGEN_DEVICE_FUNC inline const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeff(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}}); + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(const array<Index, NumIndices>& indices) const + { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff() const + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return m_storage.data()[0]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + inline Scalar& coeffRef(Index firstIndex, Index secondIndex, IndexTypes... 
otherIndices) + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}}); + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices) + { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef() + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE) + return m_storage.data()[0]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) + { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + inline const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return this->operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}}); + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const + { + return coeff(array<Index, 2>(i0, i1)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const + { + return coeff(array<Index, 3>(i0, i1, i2)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const + { + return coeff(array<Index, 4>(i0, i1, i2, i3)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const + { + return coeff(array<Index, 5>(i0, i1, i2, i3, i4)); + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const + { + eigen_assert(checkIndexRange(indices)); + return coeff(indices); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()() const + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeff(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return coeff(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const + { + // The bracket operator is only for vectors, use the parenthesis operator instead. + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeff(index); + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + inline Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}}); + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1) + { + return coeffRef(array<Index, 2>(i0, i1)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2) + { + return coeffRef(array<Index, 3>(i0, i1, i2)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) + { + return coeffRef(array<Index, 4>(i0, i1, i2, i3)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) + { + return coeffRef(array<Index, 5>(i0, i1, i2, i3, i4)); + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices) + { + eigen_assert(checkIndexRange(indices)); + return coeffRef(indices); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()() + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeffRef(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index index) + { + eigen_assert(index >= 0 && index < size()); + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator[](Index index) + { + // The bracket operator is only for vectors, use the parenthesis operator instead + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor() + : m_storage() + { + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor(const Self& other) + : m_storage(other.m_storage) + { + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + inline Tensor(Index firstDimension, IndexTypes... otherDimensions) + : m_storage(internal::array_prod(array<Index, NumIndices>{{firstDimension, otherDimensions...}}), array<Index, NumIndices>{{firstDimension, otherDimensions...}}) + { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. 
+ EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } +#else + inline explicit Tensor(Index dim1) + : m_storage(dim1, array<Index, 1>(dim1)) + { + EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + inline explicit Tensor(Index dim1, Index dim2) + : m_storage(dim1*dim2, array<Index, 2>(dim1, dim2)) + { + EIGEN_STATIC_ASSERT(2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + inline explicit Tensor(Index dim1, Index dim2, Index dim3) + : m_storage(dim1*dim2*dim3, array<Index, 3>(dim1, dim2, dim3)) + { + EIGEN_STATIC_ASSERT(3 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + inline explicit Tensor(Index dim1, Index dim2, Index dim3, Index dim4) + : m_storage(dim1*dim2*dim3*dim4, array<Index, 4>(dim1, dim2, dim3, dim4)) + { + EIGEN_STATIC_ASSERT(4 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + inline explicit Tensor(Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) + : m_storage(dim1*dim2*dim3*dim4*dim5, array<Index, 5>(dim1, dim2, dim3, dim4, dim5)) + { + EIGEN_STATIC_ASSERT(5 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } +#endif + + inline explicit Tensor(const array<Index, NumIndices>& dimensions) + : m_storage(internal::array_prod(dimensions), dimensions) + { + EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor(const TensorBase<OtherDerived, ReadOnlyAccessors>& other) + { + typedef TensorAssignOp<Tensor, const OtherDerived> Assign; + Assign assign(*this, other.derived()); + resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions()); + internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); + } + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor(const TensorBase<OtherDerived, WriteAccessors>& other) + { + typedef TensorAssignOp<Tensor, const OtherDerived> Assign; + Assign assign(*this, other.derived()); + resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions()); + internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other) + { + typedef TensorAssignOp<Tensor, const Tensor> Assign; + Assign assign(*this, other); + resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions()); + internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); + return *this; + } + template<typename Other> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor& operator=(const Other& other) + { + typedef TensorAssignOp<Tensor, const Other> Assign; + Assign assign(*this, other); + resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions()); + internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); + return *this; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> EIGEN_DEVICE_FUNC + void resize(Index firstDimension, IndexTypes... otherDimensions) + { + // The number of dimensions used to resize a tensor must be equal to the rank of the tensor.
+ EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + resize(array<Index, NumIndices>{firstDimension, otherDimensions...}); + } +#endif + + EIGEN_DEVICE_FUNC + void resize() + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + // Nothing to do: rank 0 tensors have fixed size + } + + EIGEN_DEVICE_FUNC + void resize(const array<Index, NumIndices>& dimensions) + { + Index size = Index(1); + for (size_t i = 0; i < NumIndices; i++) { + internal::check_rows_cols_for_overflow<Dynamic>::run(size, dimensions[i]); + size *= dimensions[i]; + } + #ifdef EIGEN_INITIALIZE_COEFFS + bool size_changed = size != this->size(); + m_storage.resize(size, dimensions); + if(size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + #else + m_storage.resize(size, dimensions); + #endif + } + + EIGEN_DEVICE_FUNC + void resize(const DSizes<Index, NumIndices>& dimensions) { + array<Index, NumIndices> dims; + for (int i = 0; i < NumIndices; ++i) { + dims[i] = dimensions[i]; + } + resize(dims); + } + +#ifndef EIGEN_EMULATE_CXX11_META_H + template <typename std::size_t... Indices> + EIGEN_DEVICE_FUNC + void resize(const Sizes<Indices...>& dimensions) { + array<Index, NumIndices> dims; + for (int i = 0; i < NumIndices; ++i) { + dims[i] = dimensions[i]; + } + resize(dims); + } +#else + template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> + EIGEN_DEVICE_FUNC + void resize(const Sizes<V1, V2, V3, V4, V5>& dimensions) { + array<Index, NumIndices> dims; + for (int i = 0; i < NumIndices; ++i) { + dims[i] = dimensions[i]; + } + resize(dims); + } +#endif + + protected: + + bool checkIndexRange(const array<Index, NumIndices>& indices) const + { + using internal::array_apply_and_reduce; + using internal::array_zip_and_reduce; + using internal::greater_equal_zero_op; + using internal::logical_and_op; + using internal::lesser_op; + + return + // check whether the indices are all >= 0 + array_apply_and_reduce<logical_and_op, greater_equal_zero_op>(indices) && + // check whether the indices fit in the dimensions + array_zip_and_reduce<logical_and_op, lesser_op>(indices, m_storage.dimensions()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index linearizedIndex(const array<Index, NumIndices>& indices) const + { + if (Options&RowMajor) { + return m_storage.dimensions().IndexOfRowMajor(indices); + } else { + return m_storage.dimensions().IndexOfColMajor(indices); + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h new file mode 100644 index 0000000000..ee3bf7fe34 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -0,0 +1,288 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Eugene Brevdo <ebrevdo@gmail.com> +// Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H +#define EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H + +namespace Eigen { +namespace internal { + +/** \class TensorIndexTuple + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor + Index Tuple class. 
+ * + * + */ +template<typename XprType> +struct traits<TensorIndexTupleOp<XprType> > : public traits<XprType> +{ + typedef traits<XprType> XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef Tuple<Index, typename XprTraits::Scalar> Scalar; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename XprType> +struct eval<TensorIndexTupleOp<XprType>, Eigen::Dense> +{ + typedef const TensorIndexTupleOp<XprType>& type; +}; + +template<typename XprType> +struct nested<TensorIndexTupleOp<XprType>, 1, + typename eval<TensorIndexTupleOp<XprType> >::type> +{ + typedef TensorIndexTupleOp<XprType> type; +}; + +} // end namespace internal + +template<typename XprType> +class TensorIndexTupleOp : public TensorBase<TensorIndexTupleOp<XprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorIndexTupleOp>::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename Eigen::internal::nested<TensorIndexTupleOp>::type Nested; + typedef typename Eigen::internal::traits<TensorIndexTupleOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorIndexTupleOp>::Index Index; + typedef Tuple<Index, typename XprType::CoeffReturnType> CoeffReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIndexTupleOp(const XprType& expr) + : m_xpr(expr) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; +}; + +// Eval as rvalue +template<typename ArgType, typename Device> +struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device> +{ + typedef TensorIndexTupleOp<ArgType> XprType; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + + typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; + static const int NumDims = internal::array_size<Dimensions>::value; + + enum { + IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false, + PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { + return m_impl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return CoeffReturnType(index, m_impl.coeff(index)); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + TensorEvaluator<ArgType, Device> m_impl; +}; + +namespace internal { + +/** \class TensorTupleIndex + * \ingroup CXX11_Tensor_Module + * + * \brief Converts to Tensor<Tuple<Index, Scalar> > and reduces to Tensor<Index>. 
+ * + */ +template<typename ReduceOp, typename Dims, typename XprType> +struct traits<TensorTupleReducerOp<ReduceOp, Dims, XprType> > : public traits<XprType> +{ + typedef traits<XprType> XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef Index Scalar; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename ReduceOp, typename Dims, typename XprType> +struct eval<TensorTupleReducerOp<ReduceOp, Dims, XprType>, Eigen::Dense> +{ + typedef const TensorTupleReducerOp<ReduceOp, Dims, XprType>& type; +}; + +template<typename ReduceOp, typename Dims, typename XprType> +struct nested<TensorTupleReducerOp<ReduceOp, Dims, XprType>, 1, + typename eval<TensorTupleReducerOp<ReduceOp, Dims, XprType> >::type> +{ + typedef TensorTupleReducerOp<ReduceOp, Dims, XprType> type; +}; + +} // end namespace internal + +template<typename ReduceOp, typename Dims, typename XprType> +class TensorTupleReducerOp : public TensorBase<TensorTupleReducerOp<ReduceOp, Dims, XprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorTupleReducerOp>::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename Eigen::internal::nested<TensorTupleReducerOp>::type Nested; + typedef typename Eigen::internal::traits<TensorTupleReducerOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorTupleReducerOp>::Index Index; + typedef Index CoeffReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTupleReducerOp(const XprType& expr, + const ReduceOp& reduce_op, + const int return_dim, + const Dims& reduce_dims) + : m_xpr(expr), m_reduce_op(reduce_op), m_return_dim(return_dim), m_reduce_dims(reduce_dims) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + const ReduceOp& reduce_op() const { return m_reduce_op; } + + EIGEN_DEVICE_FUNC + const Dims& reduce_dims() const { return m_reduce_dims; } + + EIGEN_DEVICE_FUNC + int return_dim() const { return m_return_dim; } + + protected: + typename XprType::Nested m_xpr; + const ReduceOp m_reduce_op; + const int m_return_dim; + const Dims m_reduce_dims; +}; + +// Eval as rvalue +template<typename ReduceOp, typename Dims, typename ArgType, typename Device> +struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Device> +{ + typedef TensorTupleReducerOp<ReduceOp, Dims, ArgType> XprType; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename TensorIndexTupleOp<ArgType>::CoeffReturnType TupleType; + typedef typename TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device>::Dimensions Dimensions; + typedef typename TensorEvaluator<const TensorIndexTupleOp<ArgType> , Device>::Dimensions InputDimensions; + static const int NumDims = internal::array_size<InputDimensions>::value; + typedef array<Index, NumDims> StrideDims; + + enum { + IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false, + PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false, + BlockAccess = false, + Layout = TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> 
>, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_orig_impl(op.expression(), device), + m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device), + m_return_dim(op.return_dim()), + m_strides(gen_strides(m_orig_impl.dimensions())), + m_stride_mod(gen_stride_mod(m_orig_impl.dimensions())), + m_stride_div(gen_stride_div()) { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { + return m_impl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + const TupleType v = m_impl.coeff(index); + return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div; + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + private: + EIGEN_DEVICE_FUNC StrideDims gen_strides(const InputDimensions& dims) { + StrideDims strides; + if (m_return_dim < 0) return strides; // Won't be using these. + eigen_assert(m_return_dim < NumDims && + "Asking to convert index to a dimension outside of the rank"); + + // Calculate m_stride_div and m_stride_mod, which are used to + // calculate the value of an index w.r.t. the m_return_dim. + if (Layout == static_cast<int>(ColMajor)) { + strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + strides[i] = strides[i-1] * dims[i-1]; + } + } else { + strides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + strides[i] = strides[i+1] * dims[i+1]; + } + } + return strides; + } + + EIGEN_DEVICE_FUNC Index gen_stride_mod(const InputDimensions& dims) { + if (Layout == static_cast<int>(ColMajor)) { + return (m_return_dim < NumDims - 1) ? m_strides[m_return_dim + 1] : dims.TotalSize(); + } else { + return (m_return_dim > 0) ? m_strides[m_return_dim - 1] : dims.TotalSize(); + } + } + + EIGEN_DEVICE_FUNC Index gen_stride_div() { + return m_strides[m_return_dim]; + } + + protected: + TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device> m_orig_impl; + TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device> m_impl; + const int m_return_dim; + const StrideDims m_strides; + const Index m_stride_mod; + const Index m_stride_div; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h new file mode 100644 index 0000000000..fdb943e713 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -0,0 +1,179 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H +#define EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H + +namespace Eigen { + +/** \class TensorAssign + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor assignment class. 
+ * + * This class represents the assignment of the values resulting from the evaluation of + * the rhs expression to the memory locations denoted by the lhs expression. + */ +namespace internal { +template<typename LhsXprType, typename RhsXprType> +struct traits<TensorAssignOp<LhsXprType, RhsXprType> > +{ + typedef typename LhsXprType::Scalar Scalar; + typedef typename traits<LhsXprType>::StorageKind StorageKind; + typedef typename promote_index_type<typename traits<LhsXprType>::Index, + typename traits<RhsXprType>::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference<LhsNested>::type _LhsNested; + typedef typename remove_reference<RhsNested>::type _RhsNested; + static const std::size_t NumDimensions = internal::traits<LhsXprType>::NumDimensions; + static const int Layout = internal::traits<LhsXprType>::Layout; + + enum { + Flags = 0, + }; +}; + +template<typename LhsXprType, typename RhsXprType> +struct eval<TensorAssignOp<LhsXprType, RhsXprType>, Eigen::Dense> +{ + typedef const TensorAssignOp<LhsXprType, RhsXprType>& type; +}; + +template<typename LhsXprType, typename RhsXprType> +struct nested<TensorAssignOp<LhsXprType, RhsXprType>, 1, typename eval<TensorAssignOp<LhsXprType, RhsXprType> >::type> +{ + typedef TensorAssignOp<LhsXprType, RhsXprType> type; +}; + +} // end namespace internal + + + +template<typename LhsXprType, typename RhsXprType> +class TensorAssignOp : public TensorBase<TensorAssignOp<LhsXprType, RhsXprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorAssignOp>::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename LhsXprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::traits<TensorAssignOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorAssignOp>::Index Index; + static const std::size_t NumDims = Eigen::internal::traits<TensorAssignOp>::NumDimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs) {} + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + typename internal::remove_all<typename LhsXprType::Nested>::type& + lhsExpression() const { return *((typename internal::remove_all<typename LhsXprType::Nested>::type*)&m_lhs_xpr); } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename RhsXprType::Nested>::type& + rhsExpression() const { return m_rhs_xpr; } + + protected: + typename internal::remove_all<typename LhsXprType::Nested>::type& m_lhs_xpr; + const typename internal::remove_all<typename RhsXprType::Nested>::type& m_rhs_xpr; +}; + + +template<typename LeftArgType, typename RightArgType, typename Device> +struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device> +{ + typedef TensorAssignOp<LeftArgType, RightArgType> XprType; + + enum { + IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & + TensorEvaluator<RightArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & + TensorEvaluator<RightArgType, Device>::PacketAccess, + BlockAccess = TensorEvaluator<LeftArgType, Device>::BlockAccess & + TensorEvaluator<RightArgType, Device>::BlockAccess, + Layout = TensorEvaluator<LeftArgType, Device>::Layout, + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + m_leftImpl(op.lhsExpression(), device), + m_rightImpl(op.rhsExpression(), device)
+ { + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions; + static const std::size_t NumDims = XprType::NumDims; + + typedef typename internal::TensorBlock< + Index, typename internal::remove_const<Scalar>::type, NumDims, Layout> + TensorBlock; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const + { + // TODO: use left impl instead if right impl dimensions are known at compile time. + return m_rightImpl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); + m_leftImpl.evalSubExprsIfNeeded(NULL); + // If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non + // null value), attempt to evaluate the rhs expression in place. Returns true iff in place + // evaluation isn't supported and the caller still needs to manually assign the values generated + // by the rhs to the lhs. + return m_rightImpl.evalSubExprsIfNeeded(m_leftImpl.data()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { + m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { + const int LhsStoreMode = TensorEvaluator<LeftArgType, Device>::IsAligned ? Aligned : Unaligned; + const int RhsLoadMode = TensorEvaluator<RightArgType, Device>::IsAligned ? Aligned : Unaligned; + m_leftImpl.template writePacket<LhsStoreMode>(i, m_rightImpl.template packet<RhsLoadMode>(i)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector<internal::TensorOpResourceRequirements>* resources) const { + m_leftImpl.getResourceRequirements(resources); + m_rightImpl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(TensorBlock* block) { + m_rightImpl.block(block); + m_leftImpl.writeBlock(*block); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_leftImpl.coeff(index); + } + template<int LoadMode> + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + { + return m_leftImpl.template packet<LoadMode>(index); + } + + private: + TensorEvaluator<LeftArgType, Device> m_leftImpl; + TensorEvaluator<RightArgType, Device> m_rightImpl; +}; + +} + + +#endif // EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h new file mode 100644 index 0000000000..35ebca151b --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -0,0 +1,934 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_BASE_H +#define EIGEN_CXX11_TENSOR_TENSOR_BASE_H + +// clang-format off + +namespace Eigen { + +/** \class TensorBase + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor base class. + * + * This class is the common parent of the Tensor and TensorMap classes, thus + * making it possible to use either class interchangeably in expressions. + */ + +template<typename Derived> +class TensorBase<Derived, ReadOnlyAccessors> +{ + public: + typedef internal::traits<Derived> DerivedTraits; + typedef typename DerivedTraits::Scalar Scalar; + typedef typename DerivedTraits::Index Index; + typedef typename internal::remove_const<Scalar>::type CoeffReturnType; + typedef typename internal::packet_traits<CoeffReturnType>::type PacketReturnType; + static const int NumDimensions = DerivedTraits::NumDimensions; + + // Generic nullary operation support. + template <typename CustomNullaryOp> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<CustomNullaryOp, const Derived> + nullaryExpr(const CustomNullaryOp& func) const { + return TensorCwiseNullaryOp<CustomNullaryOp, const Derived>(derived(), func); + } + + // Coefficient-wise nullary operators + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> + constant(const Scalar& value) const { + return nullaryExpr(internal::scalar_constant_op<Scalar>(value)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<internal::UniformRandomGenerator<Scalar>, const Derived> + random() const { + return nullaryExpr(internal::UniformRandomGenerator<Scalar>()); + } + template <typename RandomGenerator> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<RandomGenerator, const Derived> + random(const RandomGenerator& gen = RandomGenerator()) const { + return nullaryExpr(gen); + } + + // Tensor generation + template <typename Generator> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorGeneratorOp<Generator, const Derived> + generate(const Generator& generator) const { + return TensorGeneratorOp<Generator, const Derived>(derived(), generator); + } + + // Generic unary operation support.
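// Usage sketch (an illustrative aside, not part of the original patch): the nullary
// helpers above and the unary helpers below build lazy expressions that are only
// evaluated when assigned to a tensor. The tensor `t` and the functor `Halve` are
// hypothetical names used for illustration only.
Eigen::Tensor<float, 2> t(3, 4);
t = t.constant(2.0f);                              // nullary: fill every coefficient with 2
Eigen::Tensor<float, 2> s = t.square();            // unary: coefficient-wise x*x
struct Halve { float operator()(float x) const { return 0.5f * x; } };
Eigen::Tensor<float, 2> h = t.unaryExpr(Halve());  // custom coefficient-wise functor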
+ template <typename CustomUnaryOp> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<CustomUnaryOp, const Derived> + unaryExpr(const CustomUnaryOp& func) const { + return TensorCwiseUnaryOp<CustomUnaryOp, const Derived>(derived(), func); + } + + // Coefficient-wise unary operators + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const Derived> + operator-() const { + return unaryExpr(internal::scalar_opposite_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived> + sqrt() const { + return unaryExpr(internal::scalar_sqrt_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_rsqrt_op<Scalar>, const Derived> + rsqrt() const { + return unaryExpr(internal::scalar_rsqrt_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_square_op<Scalar>, const Derived> + square() const { + return unaryExpr(internal::scalar_square_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_cube_op<Scalar>, const Derived> + cube() const { + return unaryExpr(internal::scalar_cube_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived> + inverse() const { + return unaryExpr(internal::scalar_inverse_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_tanh_op<Scalar>, const Derived> + tanh() const { + return unaryExpr(internal::scalar_tanh_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sigmoid_op<Scalar>, const Derived> + sigmoid() const { + return unaryExpr(internal::scalar_sigmoid_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_exp_op<Scalar>, const Derived> + exp() const { + return unaryExpr(internal::scalar_exp_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log_op<Scalar>, const Derived> + log() const { + return unaryExpr(internal::scalar_log_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived> + abs() const { + return unaryExpr(internal::scalar_abs_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_pow_op<Scalar>, const Derived> + pow(Scalar exponent) const { + return unaryExpr(internal::scalar_pow_op<Scalar>(exponent)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived> + operator+ (Scalar rhs) const { + return unaryExpr(internal::scalar_add_op<Scalar>(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sub_op<Scalar>, const Derived> + operator- (Scalar rhs) const { + EIGEN_STATIC_ASSERT((std::numeric_limits<Scalar>::is_signed || internal::is_same<Scalar, const std::complex<float> >::value), YOU_MADE_A_PROGRAMMING_MISTAKE); + return unaryExpr(internal::scalar_sub_op<Scalar>(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Derived> + operator* (Scalar rhs) const { + return unaryExpr(internal::scalar_multiple_op<Scalar>(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const 
TensorCwiseUnaryOp<internal::scalar_quotient1_op<Scalar>, const Derived> + operator/ (Scalar rhs) const { + // EIGEN_STATIC_ASSERT(!std::numeric_limits<Scalar>::is_integer, YOU_MADE_A_PROGRAMMING_MISTAKE); + return unaryExpr(internal::scalar_quotient1_op<Scalar>(rhs)); + } + + template <typename Scale> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_multiple2_op<Scalar, Scale>, const Derived> + scale (Scale rhs) const { + return unaryExpr(internal::scalar_multiple2_op<Scalar, Scale>(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_mod_op<Scalar>, const Derived> + operator% (Scalar rhs) const { + EIGEN_STATIC_ASSERT(std::numeric_limits<Scalar>::is_integer, YOU_MADE_A_PROGRAMMING_MISTAKE_TRY_MOD); + return unaryExpr(internal::scalar_mod_op<Scalar>(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_fmod_op<Scalar>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > + mod(Scalar rhs) const { + EIGEN_STATIC_ASSERT(!std::numeric_limits<Scalar>::is_integer, YOU_MADE_A_PROGRAMMING_MISTAKE_FMOD_IS_NOT_FOR_INTEGERS); + return mod(constant(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > + cwiseMax(Scalar threshold) const { + return cwiseMax(constant(threshold)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > + cwiseMin(Scalar threshold) const { + return cwiseMin(constant(threshold)); + } + + template <typename NewType> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorConversionOp<NewType, const Derived> + cast() const { + return TensorConversionOp<NewType, const Derived>(derived()); + } + + // Generic binary operation support. + template <typename CustomBinaryOp, typename OtherDerived> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived> + binaryExpr(const OtherDerived& other, const CustomBinaryOp& func) const { + return TensorCwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived>(derived(), other, func); + } + + // Coefficient-wise binary operators. 
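// Usage sketch (an illustrative aside, not part of the original patch): the
// coefficient-wise binary operators below combine two tensors of identical
// dimensions, again as lazy expressions. The tensors `a` and `b` are hypothetical
// names used for illustration only.
Eigen::Tensor<float, 2> a(3, 4), b(3, 4);
a = a.constant(1.0f);
b = b.constant(2.0f);
Eigen::Tensor<float, 2> sum  = a + b;           // coefficient-wise sum
Eigen::Tensor<float, 2> peak = a.cwiseMax(b);   // coefficient-wise maximum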
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const Derived, const OtherDerived> + operator+(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_sum_op<Scalar>()); + } + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_difference_op<Scalar>, const Derived, const OtherDerived> + operator-(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_difference_op<Scalar>()); + } + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_product_op<Scalar>, const Derived, const OtherDerived> + operator*(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_product_op<Scalar>()); + } + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived> + operator/(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_quotient_op<Scalar>()); + } + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_fmod_op<Scalar>, const Derived, const OtherDerived> + mod(const OtherDerived& other) const { + EIGEN_STATIC_ASSERT(!std::numeric_limits<Scalar>::is_integer, YOU_MADE_A_PROGRAMMING_MISTAKE_FMOD_IS_NOT_FOR_INTEGERS); + return binaryExpr(other.derived(), internal::scalar_fmod_op<Scalar>()); + } + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const OtherDerived> + cwiseMax(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_max_op<Scalar>()); + } + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const OtherDerived> + cwiseMin(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_min_op<Scalar>()); + } + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_boolean_and_op, const Derived, const OtherDerived> + operator&&(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_boolean_and_op()); + } + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_boolean_or_op, const Derived, const OtherDerived> + operator||(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_boolean_or_op()); + } + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_boolean_xor_op, const Derived, const OtherDerived> + operator^(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_boolean_xor_op()); + } + + // Comparisons and tests. 
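// Usage sketch (an illustrative aside, not part of the original patch): the comparison
// operators below produce boolean-valued expressions, coefficient by coefficient, which
// can be stored in a bool tensor or fed to select(). Reusing the hypothetical float
// tensors `a` and `b` from the sketch above:
Eigen::Tensor<bool, 2> mask = (a < b);      // std::less<float> applied coefficient-wise
Eigen::Tensor<bool, 2> big  = (a >= 1.0f);  // comparison against a constant tensor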
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<std::less<Scalar>, const Derived, const OtherDerived> + operator<(const OtherDerived& other) const { + return binaryExpr(other.derived(), std::less<Scalar>()); + } + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<std::less_equal<Scalar>, const Derived, const OtherDerived> + operator<=(const OtherDerived& other) const { + return binaryExpr(other.derived(), std::less_equal<Scalar>()); + } + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<std::greater<Scalar>, const Derived, const OtherDerived> + operator>(const OtherDerived& other) const { + return binaryExpr(other.derived(), std::greater<Scalar>()); + } + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<std::greater_equal<Scalar>, const Derived, const OtherDerived> + operator>=(const OtherDerived& other) const { + return binaryExpr(other.derived(), std::greater_equal<Scalar>()); + } + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<std::equal_to<Scalar>, const Derived, const OtherDerived> + operator==(const OtherDerived& other) const { + return binaryExpr(other.derived(), std::equal_to<Scalar>()); + } + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<std::not_equal_to<Scalar>, const Derived, const OtherDerived> + operator!=(const OtherDerived& other) const { + return binaryExpr(other.derived(), std::not_equal_to<Scalar>()); + } + + // comparisons and tests for Scalars + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<std::less<Scalar>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > + operator<(Scalar threshold) const { + return operator<(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<std::less_equal<Scalar>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > + operator<=(Scalar threshold) const { + return operator<=(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<std::greater<Scalar>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > + operator>(Scalar threshold) const { + return operator>(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<std::greater_equal<Scalar>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > + operator>=(Scalar threshold) const { + return operator>=(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<std::equal_to<Scalar>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > + operator==(Scalar threshold) const { + return operator==(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<std::not_equal_to<Scalar>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > + operator!=(Scalar threshold) const { + return operator!=(constant(threshold)); + } + + // Coefficient-wise ternary operators. 
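+  // Illustrative sketch (hypothetical tensor x): select() chooses between two
+  // expressions coefficient-wise based on a boolean predicate, e.g. a simple
+  // ReLU-style clamp built from the comparison operators above:
+  //
+  //   Eigen::Tensor<float, 2> x(3, 4);
+  //   x.setRandom();
+  //   Eigen::Tensor<float, 2> relu = (x > 0.0f).select(x, x.constant(0.0f));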
+ template<typename ThenDerived, typename ElseDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSelectOp<const Derived, const ThenDerived, const ElseDerived> + select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) const { + return TensorSelectOp<const Derived, const ThenDerived, const ElseDerived>(derived(), thenTensor.derived(), elseTensor.derived()); + } + + // Contractions. + typedef Eigen::IndexPair<Index> DimensionPair; + + template<typename OtherDerived, typename Dimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorContractionOp<const Dimensions, const Derived, const OtherDerived> + contract(const OtherDerived& other, const Dimensions& dims) const { + return TensorContractionOp<const Dimensions, const Derived, const OtherDerived>(derived(), other.derived(), dims); + } + + // Convolutions. + template<typename KernelDerived, typename Dimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorConvolutionOp<const Dimensions, const Derived, const KernelDerived> + convolve(const KernelDerived& kernel, const Dimensions& dims) const { + return TensorConvolutionOp<const Dimensions, const Derived, const KernelDerived>(derived(), kernel.derived(), dims); + } + + // Convolutions by fft. + template<typename KernelDerived, typename Dimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorConvolutionByFFTOp<const Dimensions, const Derived, const KernelDerived> + convolvebyfft(const KernelDerived& kernel, const Dimensions& dims) const { + return TensorConvolutionByFFTOp<const Dimensions, const Derived, const KernelDerived>(derived(), kernel.derived(), dims); + } + + // Reductions. + template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp<internal::SumReducer<CoeffReturnType>, const Dims, const Derived> + sum(const Dims& dims) const { + return TensorReductionOp<internal::SumReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::SumReducer<CoeffReturnType>()); + } + + const TensorReductionOp<internal::SumReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived> + sum() const { + DimensionList<Index, NumDimensions> in_dims; + return TensorReductionOp<internal::SumReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::SumReducer<CoeffReturnType>()); + } + + template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const Dims, const Derived> + mean(const Dims& dims) const { + return TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::MeanReducer<CoeffReturnType>()); + } + + const TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived> + mean() const { + DimensionList<Index, NumDimensions> in_dims; + return TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MeanReducer<CoeffReturnType>()); + } + + template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const Dims, const Derived> + prod(const Dims& dims) const { + return TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::ProdReducer<CoeffReturnType>()); + } + + const TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const DimensionList<Index, 
NumDimensions>, const Derived> + prod() const { + DimensionList<Index, NumDimensions> in_dims; + return TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::ProdReducer<CoeffReturnType>()); + } + + template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const Dims, const Derived> + maximum(const Dims& dims) const { + return TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::MaxReducer<CoeffReturnType>()); + } + + const TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived> + maximum() const { + DimensionList<Index, NumDimensions> in_dims; + return TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MaxReducer<CoeffReturnType>()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorTupleReducerOp< + internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >, + const array<Index, NumDimensions>, const Derived> + argmax() const { + array<Index, NumDimensions> in_dims; + for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d; + return TensorTupleReducerOp< + internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >, + const array<Index, NumDimensions>, + const Derived>(derived(), internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >(), -1, in_dims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorTupleReducerOp< + internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >, + const array<Index, NumDimensions>, const Derived> + argmin() const { + array<Index, NumDimensions> in_dims; + for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d; + return TensorTupleReducerOp< + internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >, + const array<Index, NumDimensions>, + const Derived>(derived(), internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >(), -1, in_dims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorTupleReducerOp< + internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >, + const array<Index, 1>, const Derived> + argmax(const int return_dim) const { + array<Index, 1> in_dims; + in_dims[0] = return_dim; + return TensorTupleReducerOp< + internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >, + const array<Index, 1>, + const Derived>(derived(), internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >(), return_dim, in_dims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorTupleReducerOp< + internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >, + const array<Index, 1>, const Derived> + argmin(const int return_dim) const { + array<Index, 1> in_dims; + in_dims[0] = return_dim; + return TensorTupleReducerOp< + internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >, + const array<Index, 1>, + const Derived>(derived(), internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >(), return_dim, in_dims); + } + + template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp<internal::MinReducer<CoeffReturnType>, const Dims, const Derived> + minimum(const Dims& dims) const { + return TensorReductionOp<internal::MinReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::MinReducer<CoeffReturnType>()); + } + + const TensorReductionOp<internal::MinReducer<CoeffReturnType>, const 
DimensionList<Index, NumDimensions>, const Derived> + minimum() const { + DimensionList<Index, NumDimensions> in_dims; + return TensorReductionOp<internal::MinReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MinReducer<CoeffReturnType>()); + } + + // This does not short-circuit, so is potentially very inefficient. + template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp<internal::AndReducer, const Dims, const TensorConversionOp<bool, const Derived> > + all(const Dims& dims) const { + return cast<bool>().reduce(dims, internal::AndReducer()); + } + + // This does not short-circuit, so is potentially very inefficient. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp<internal::AndReducer, const DimensionList<Index, NumDimensions>, const TensorConversionOp<bool, const Derived> > + all() const { + DimensionList<Index, NumDimensions> in_dims; + return cast<bool>().reduce(in_dims, internal::AndReducer()); + } + + // This does not short-circuit, so is potentially very inefficient. + template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp<internal::OrReducer, const Dims, const TensorConversionOp<bool, const Derived> > + any(const Dims& dims) const { + return cast<bool>().reduce(dims, internal::OrReducer()); + } + + // This does not short-circuit, so is potentially very inefficient. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp<internal::OrReducer, const DimensionList<Index, NumDimensions>, const TensorConversionOp<bool, const Derived> > + any() const { + DimensionList<Index, NumDimensions> in_dims; + return cast<bool>().reduce(in_dims, internal::OrReducer()); + } + + template <typename Reducer, typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp<Reducer, const Dims, const Derived> + reduce(const Dims& dims, const Reducer& reducer) const { + return TensorReductionOp<Reducer, const Dims, const Derived>(derived(), dims, reducer); + } + + template <typename Broadcast> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorBroadcastingOp<const Broadcast, const Derived> + broadcast(const Broadcast& broadcast) const { + return TensorBroadcastingOp<const Broadcast, const Derived>(derived(), broadcast); + } + + template <int FFTDataType, int FFTDirection, typename FFT> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorFFTOp<const FFT, const Derived, FFTDataType, FFTDirection> + fft(const FFT& fft) const { + return TensorFFTOp<const FFT, const Derived, FFTDataType, FFTDirection>(derived(), fft); + } + + template <typename Axis, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorConcatenationOp<Axis, const Derived, const OtherDerived> + concatenate(const OtherDerived& other, Axis axis) const { + return TensorConcatenationOp<Axis, const Derived, const OtherDerived>(derived(), other.derived(), axis); + } + + template <typename PatchDims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPatchOp<const PatchDims, const Derived> + extract_patches(const PatchDims& patch_dims) const { + return TensorPatchOp<const PatchDims, const Derived>(derived(), patch_dims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived> + extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols, + const Index plane_stride = 1, const Index row_stride = 1, const Index col_stride = 1, + const PaddingType padding_type = 
PADDING_SAME, const Scalar padding_value = 0) const { + return TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, 1, 1, 1, padding_type, padding_value); + } + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived> + extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols, + const Index plane_stride, const Index row_stride, const Index col_stride, + const Index plane_inflate_stride, const Index row_inflate_stride, const Index col_inflate_stride, + const Index padding_top_z, const Index padding_bottom_z, + const Index padding_top, const Index padding_bottom, + const Index padding_left, const Index padding_right, const Scalar padding_value = 0) const { + return TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, plane_inflate_stride, row_inflate_stride, col_inflate_stride, padding_top_z, padding_bottom_z, padding_top, padding_bottom, padding_left, padding_right, padding_value); + } + + template <Index Rows, Index Cols> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp<Rows, Cols, const Derived> + extract_image_patches() const { + return TensorImagePatchOp<Rows, Cols, const Derived>(derived(), Rows, Cols, 1, 1, 1, 1, 1, 1, PADDING_SAME, 0); + } + + template <Index Rows, Index Cols> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp<Rows, Cols, const Derived> + extract_image_patches(const PaddingType padding_type) const { + return TensorImagePatchOp<Rows, Cols, const Derived>(derived(), Rows, Cols, 1, 1, 1, 1, 1, 1, padding_type, 0); + } + + template <Index Rows, Index Cols> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp<Rows, Cols, const Derived> + extract_image_patches(const Index stride, const PaddingType padding_type) const { + return TensorImagePatchOp<Rows, Cols, const Derived>(derived(), Rows, Cols, stride, stride, 1, 1, 1, 1, padding_type, 0); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp<Dynamic, Dynamic, const Derived> + extract_image_patches(const Index patch_rows, const Index patch_cols, + const Index row_stride = 1, const Index col_stride = 1) const { + return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride, + 1, 1, 1, 1, PADDING_SAME, 0); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp<Dynamic, Dynamic, const Derived> + extract_image_patches(const Index patch_rows, const Index patch_cols, + const Index row_stride, const Index col_stride, + const PaddingType padding_type) const { + return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride, + 1, 1, 1, 1, padding_type, 0); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp<Dynamic, Dynamic, const Derived> + extract_image_patches(const Index patch_rows, const Index patch_cols, + const Index row_stride, const Index col_stride, + const PaddingType padding_type, const Scalar padding_value) const { + return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride, + 1, 1, 1, 1, padding_type, padding_value); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp<Dynamic, Dynamic, const Derived> + extract_image_patches(const 
Index patch_rows, const Index patch_cols, + const Index row_stride, const Index col_stride, + const Index in_row_stride, const Index in_col_stride) const { + return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride, + in_row_stride, in_col_stride, 1, 1, PADDING_SAME, 0); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp<Dynamic, Dynamic, const Derived> + extract_image_patches(const Index patch_rows, const Index patch_cols, + const Index row_stride, const Index col_stride, + const Index in_row_stride, const Index in_col_stride, + const PaddingType padding_type) const { + return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride, + in_row_stride, in_col_stride, 1, 1, padding_type, 0); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp<Dynamic, Dynamic, const Derived> + extract_image_patches(const Index patch_rows, const Index patch_cols, + const Index row_stride, const Index col_stride, + const Index in_row_stride, const Index in_col_stride, + const PaddingType padding_type, const Scalar padding_value) const { + return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride, + in_row_stride, in_col_stride, 1, 1, padding_type, padding_value); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp<Dynamic, Dynamic, const Derived> + extract_image_patches(const Index patch_rows, const Index patch_cols, + const Index row_stride, const Index col_stride, + const Index in_row_stride, const Index in_col_stride, + const Index row_inflate_stride, const Index col_inflate_stride, + const PaddingType padding_type, const Scalar padding_value) const { + return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride, + in_row_stride, in_col_stride, row_inflate_stride, col_inflate_stride, + padding_type, padding_value); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp<Dynamic, Dynamic, const Derived> + extract_image_patches(const Index patch_rows, const Index patch_cols, + const Index row_stride, const Index col_stride, + const Index in_row_stride, const Index in_col_stride, + const Index row_inflate_stride, const Index col_inflate_stride, + const Index padding_top, const Index padding_bottom, + const Index padding_left,const Index padding_right, + const Scalar padding_value) const { + return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride, + in_row_stride, in_col_stride, row_inflate_stride, col_inflate_stride, + padding_top, padding_bottom, padding_left, padding_right, padding_value); + } + + // Morphing operators. 
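+  // Illustrative sketch (hypothetical shapes): the morphing operators declared
+  // below build lazily evaluated reshaped, sliced, or rank-reduced views of a
+  // tensor expression.
+  //
+  //   Eigen::Tensor<float, 3> t(2, 3, 4);
+  //   t.setRandom();
+  //   Eigen::Tensor<float, 2> flat  = t.reshape(Eigen::DSizes<Eigen::DenseIndex, 2>(6, 4));
+  //   Eigen::Tensor<float, 2> plane = t.chip(1, /*dim=*/2);   // rank reduced by one
+  //   Eigen::array<Eigen::DenseIndex, 3> offsets{{0, 1, 0}};
+  //   Eigen::array<Eigen::DenseIndex, 3> extents{{2, 2, 4}};
+  //   Eigen::Tensor<float, 3> window = t.slice(offsets, extents);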
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorLayoutSwapOp<const Derived> + swap_layout() const { + return TensorLayoutSwapOp<const Derived>(derived()); + } + template <typename NewDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReshapingOp<const NewDimensions, const Derived> + reshape(const NewDimensions& newDimensions) const { + return TensorReshapingOp<const NewDimensions, const Derived>(derived(), newDimensions); + } + template <typename StartIndices, typename Sizes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSlicingOp<const StartIndices, const Sizes, const Derived> + slice(const StartIndices& startIndices, const Sizes& sizes) const { + return TensorSlicingOp<const StartIndices, const Sizes, const Derived>(derived(), startIndices, sizes); + } + template <Index DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp<DimId, const Derived> + chip(const Index offset) const { + return TensorChippingOp<DimId, const Derived>(derived(), offset, DimId); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp<Dynamic, const Derived> + chip(const Index offset, const Index dim) const { + return TensorChippingOp<Dynamic, const Derived>(derived(), offset, dim); + } + template <typename ReverseDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReverseOp<const ReverseDimensions, const Derived> + reverse(const ReverseDimensions& rev) const { + return TensorReverseOp<const ReverseDimensions, const Derived>(derived(), rev); + } + template <typename PaddingDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPaddingOp<const PaddingDimensions, const Derived> + pad(const PaddingDimensions& padding) const { + return TensorPaddingOp<const PaddingDimensions, const Derived>(derived(), padding, Scalar(0)); + } + template <typename PaddingDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPaddingOp<const PaddingDimensions, const Derived> + pad (const PaddingDimensions& padding, const Scalar padding_value) const { + return TensorPaddingOp<const PaddingDimensions, const Derived>(derived(), padding, padding_value); + } + template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorShufflingOp<const Shuffle, const Derived> + shuffle(const Shuffle& shuffle) const { + return TensorShufflingOp<const Shuffle, const Derived>(derived(), shuffle); + } + template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorStridingOp<const Strides, const Derived> + stride(const Strides& strides) const { + return TensorStridingOp<const Strides, const Derived>(derived(), strides); + } + template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorInflationOp<const Strides, const Derived> + inflate(const Strides& strides) const { + return TensorInflationOp<const Strides, const Derived>(derived(), strides); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorTrueIndicesOp<const Derived> + true_indices(const Index& not_true_value = -1) const { + return TensorTrueIndicesOp<const Derived>(derived(), not_true_value); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorIndexTupleOp<const Derived> + index_tuples() const { + return TensorIndexTupleOp<const Derived>(derived()); + } + template <typename CustomUnaryFunc> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCustomUnaryOp<const CustomUnaryFunc, const Derived> customOp(const CustomUnaryFunc& op) const { + return TensorCustomUnaryOp<const CustomUnaryFunc, const Derived>(derived(), op); + } + template <typename 
OtherDerived, typename CustomBinaryFunc> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCustomBinaryOp<const CustomBinaryFunc, const Derived, const OtherDerived> customOp(const OtherDerived& other, const CustomBinaryFunc& op) const { + return TensorCustomBinaryOp<const CustomBinaryFunc, const Derived, const OtherDerived>(derived(), other, op); + } + + // Force the evaluation of the expression. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorForcedEvalOp<const Derived> eval() const { + return TensorForcedEvalOp<const Derived>(derived()); + } + + protected: + template <typename Scalar, std::size_t NumIndices, int Options, typename IndexType> friend class Tensor; + template <typename Scalar, int Option, typename IndexTypes> friend class TensorVarDim; + template <typename Scalar, typename Dimensions, int Option, typename IndexTypes> friend class TensorFixedSize; + template <typename OtherDerived, int AccessLevel> friend class TensorBase; + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); } +}; + +template<typename Derived> +class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyAccessors> { + public: + typedef internal::traits<Derived> DerivedTraits; + typedef typename DerivedTraits::Scalar Scalar; + typedef typename DerivedTraits::Index Index; + typedef Scalar CoeffReturnType; + typedef typename internal::packet_traits<Scalar>::type PacketReturnType; + static const int NumDimensions = DerivedTraits::NumDimensions; + + template <typename Scalar, std::size_t NumIndices, int Options, typename IndexType> friend class Tensor; + template <typename Scalar, int Options, typename IndexType> friend class TensorVarDim; + template <typename OtherDerived, int AccessLevel> friend class TensorBase; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setZero() { + return setConstant(Scalar(0)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setConstant(const Scalar& val) { + return derived() = this->constant(val); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setRandom() { + return derived() = this->random(); + } + template <typename RandomGenerator> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setRandom() { + return derived() = this->template random<RandomGenerator>(); + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setValues( + const typename internal::Initializer<Derived, NumDimensions>::InitList& vals) { + TensorEvaluator<Derived, DefaultDevice> eval(derived(), DefaultDevice()); + internal::initialize_tensor<Derived, NumDimensions>(eval, vals); + return derived(); + } +#endif // EIGEN_HAS_VARIADIC_TEMPLATES + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator+=(const OtherDerived& other) { + return derived() = derived() + other.derived(); + } + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator-=(const OtherDerived& other) { + return derived() = derived() - other.derived(); + } + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator*=(const OtherDerived& other) { + return derived() = derived() * other.derived(); + } + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator/=(const OtherDerived& other) { + return derived() = derived() / other.derived(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorLayoutSwapOp<const Derived> + swap_layout() const { + return 
TensorLayoutSwapOp<const Derived>(derived()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorLayoutSwapOp<Derived> + swap_layout() { + return TensorLayoutSwapOp<Derived>(derived()); + } + + template <typename Axis, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorConcatenationOp<const Axis, const Derived, const OtherDerived> + concatenate(const OtherDerived& other, const Axis& axis) const { + return TensorConcatenationOp<const Axis, const Derived, const OtherDerived>(derived(), other, axis); + } + template <typename Axis, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorConcatenationOp<const Axis, Derived, OtherDerived> + concatenate(const OtherDerived& other, const Axis& axis) { + return TensorConcatenationOp<const Axis, Derived, OtherDerived>(derived(), other, axis); + } + + template <typename NewDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReshapingOp<const NewDimensions, const Derived> + reshape(const NewDimensions& newDimensions) const { + return TensorReshapingOp<const NewDimensions, const Derived>(derived(), newDimensions); + } + template <typename NewDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReshapingOp<const NewDimensions, Derived> + reshape(const NewDimensions& newDimensions) { + return TensorReshapingOp<const NewDimensions, Derived>(derived(), newDimensions); + } + + template <typename StartIndices, typename Sizes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSlicingOp<const StartIndices, const Sizes, const Derived> + slice(const StartIndices& startIndices, const Sizes& sizes) const { + return TensorSlicingOp<const StartIndices, const Sizes, const Derived>(derived(), startIndices, sizes); + } + template <typename StartIndices, typename Sizes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorSlicingOp<const StartIndices, const Sizes, Derived> + slice(const StartIndices& startIndices, const Sizes& sizes) { + return TensorSlicingOp<const StartIndices, const Sizes, Derived>(derived(), startIndices, sizes); + } + + template <DenseIndex DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp<DimId, const Derived> + chip(const Index offset) const { + return TensorChippingOp<DimId, const Derived>(derived(), offset, DimId); + } + template <Index DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorChippingOp<DimId, Derived> + chip(const Index offset) { + return TensorChippingOp<DimId, Derived>(derived(), offset, DimId); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp<Dynamic, const Derived> + chip(const Index offset, const Index dim) const { + return TensorChippingOp<Dynamic, const Derived>(derived(), offset, dim); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorChippingOp<Dynamic, Derived> + chip(const Index offset, const Index dim) { + return TensorChippingOp<Dynamic, Derived>(derived(), offset, dim); + } + + template <typename ReverseDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReverseOp<const ReverseDimensions, const Derived> + reverse(const ReverseDimensions& rev) const { + return TensorReverseOp<const ReverseDimensions, const Derived>(derived(), rev); + } + template <typename ReverseDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReverseOp<const ReverseDimensions, Derived> + reverse(const ReverseDimensions& rev) { + return TensorReverseOp<const ReverseDimensions, Derived>(derived(), rev); + } + + template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorShufflingOp<const Shuffle, const Derived> + 
shuffle(const Shuffle& shuffle) const { + return TensorShufflingOp<const Shuffle, const Derived>(derived(), shuffle); + } + template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorShufflingOp<const Shuffle, Derived> + shuffle(const Shuffle& shuffle) { + return TensorShufflingOp<const Shuffle, Derived>(derived(), shuffle); + } + + template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorStridingOp<const Strides, const Derived> + stride(const Strides& strides) const { + return TensorStridingOp<const Strides, const Derived>(derived(), strides); + } + template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorStridingOp<const Strides, Derived> + stride(const Strides& strides) { + return TensorStridingOp<const Strides, Derived>(derived(), strides); + } + + // Select the device on which to evaluate the expression. + template <typename DeviceType> + TensorDevice<Derived, DeviceType> device(const DeviceType& device) { + return TensorDevice<Derived, DeviceType>(device, derived()); + } + + protected: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& derived() { return *static_cast<Derived*>(this); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_BASE_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h new file mode 100644 index 0000000000..ac428b169f --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -0,0 +1,627 @@ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H +#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H + +namespace Eigen { + +/** \class TensorBlock + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor block class. + * + * This class represents a tensor block specified by the index of the + * first block coefficient, and the size of the block in each dimension. + * + */ + +namespace internal { + +template <typename Index, typename Scalar, std::size_t NumDims, int Layout> +class TensorBlock { + public: + typedef DSizes<Index, NumDims> Dimensions; + + TensorBlock(const Index first_coeff_index, + const Dimensions& block_sizes, + const Dimensions& block_strides, + const Dimensions& tensor_strides, + Scalar* data) + : m_first_coeff_index(first_coeff_index), + m_block_sizes(block_sizes), + m_block_strides(block_strides), + m_tensor_strides(tensor_strides), + m_data(data) {} + + Index first_coeff_index() const { return m_first_coeff_index; } + + const Dimensions& block_sizes() const { return m_block_sizes; } + + const Dimensions& block_strides() const { return m_block_strides; } + + const Dimensions& tensor_strides() const { return m_tensor_strides; } + + Scalar* data() { return m_data; } + + const Scalar* data() const { return m_data; } + + private: + Index m_first_coeff_index; + Dimensions m_block_sizes; + Dimensions m_block_strides; + Dimensions m_tensor_strides; + Scalar* m_data; // Not owned. 
+}; + +template <typename Index, typename Scalar, bool Vectorizable> +struct TensorBlockCopyOp { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const Index num_coeff_to_copy, const Index dst_index, + const Index dst_stride, Scalar* EIGEN_RESTRICT dst_data, const Index src_index, + const Index src_stride, const Scalar* EIGEN_RESTRICT src_data) { + for (Index i = 0; i < num_coeff_to_copy; ++i) { + dst_data[dst_index + i * dst_stride] = + src_data[src_index + i * src_stride]; + } + } +}; + +// NOTE: Benchmarks run on an implementation of this that broke each of the +// loops in these conditionals into it's own template specialization (to +// avoid conditionals in the caller's loop) did not show an improvement. +template <typename Index, typename Scalar> +struct TensorBlockCopyOp<Index, Scalar, true> { + typedef typename packet_traits<Scalar>::type Packet; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const Index num_coeff_to_copy, const Index dst_index, + const Index dst_stride, Scalar* EIGEN_RESTRICT dst_data, + const Index src_index, const Index src_stride, + const Scalar* EIGEN_RESTRICT src_data) { + if (src_stride == 1) { + const Index packet_size = internal::unpacket_traits<Packet>::size; + const Index vectorized_size = + (num_coeff_to_copy / packet_size) * packet_size; + if (dst_stride == 1) { + // LINEAR + for (Index i = 0; i < vectorized_size; i += packet_size) { + Packet p = internal::ploadt<Packet, Unaligned>( + src_data + src_index + i); + internal::pstoret<Scalar, Packet, Unaligned>( + dst_data + dst_index + i, p); + } + for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) { + dst_data[dst_index + i] = src_data[src_index + i]; + } + } else { + // SCATTER + for (Index i = 0; i < vectorized_size; i += packet_size) { + Packet p = internal::ploadt<Packet, Unaligned>( + src_data + src_index + i); + internal::pscatter<Scalar, Packet>( + dst_data + dst_index + i * dst_stride, p, dst_stride); + } + for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) { + dst_data[dst_index + i * dst_stride] = src_data[src_index + i]; + } + } + } else { + if (dst_stride == 1) { + // GATHER + const Index packet_size = internal::unpacket_traits<Packet>::size; + const Index vectorized_size = + (num_coeff_to_copy / packet_size) * packet_size; + for (Index i = 0; i < vectorized_size; i += packet_size) { + Packet p = internal::pgather<Scalar, Packet>( + src_data + src_index + i * src_stride, src_stride); + internal::pstoret<Scalar, Packet, Unaligned>( + dst_data + dst_index + i, p); + } + for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) { + dst_data[dst_index + i] = src_data[src_index + i * src_stride]; + } + } else { + // RANDOM + for (Index i = 0; i < num_coeff_to_copy; ++i) { + dst_data[dst_index + i * dst_stride] = + src_data[src_index + i * src_stride]; + } + } + } + } +}; + +/** \class TensorBlockIO + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor block IO class. + * + * This class is responsible for copying data between a tensor and a tensor + * block. 
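+ *
+ * The BlockRead template parameter selects the copy direction:
+ * TensorBlockReader instantiates this class with BlockRead = true
+ * (tensor -> block) and TensorBlockWriter with BlockRead = false
+ * (block -> tensor).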
+ * + */ +template <typename Index, typename Scalar, std::size_t NumDims, int Layout, + bool Vectorizable, bool BlockRead> +class TensorBlockIO { + public: + typedef typename internal::TensorBlock<Index, Scalar, NumDims, Layout> + TensorBlock; + typedef typename internal::TensorBlockCopyOp<Index, Scalar, Vectorizable> + TensorBlockCopyOp; + + protected: + struct BlockIteratorState { + Index input_stride; + Index output_stride; + Index input_span; + Index output_span; + Index size; + Index count; + }; + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Copy( + const TensorBlock& block, Index first_coeff_index, + const array<Index, NumDims>& tensor_to_block_dim_map, + const array<Index, NumDims>& tensor_strides, const Scalar* src_data, + Scalar* dst_data) { + // Calculate strides and dimensions. + const Index block_dim_for_tensor_stride1_dim = + NumDims == 0 ? 1 : + tensor_to_block_dim_map[static_cast<int>(Layout) == + static_cast<int>(ColMajor) + ? 0 + : NumDims - 1]; + const size_t block_inner_dim_size = + NumDims == 0 ? 1 : + block.block_sizes()[block_dim_for_tensor_stride1_dim]; + const size_t block_outer_dim_size = + NumDims == 0 ? 1 : + block.block_sizes().TotalSize() / block_inner_dim_size; + + Index inputIndex; + Index outputIndex; + Index input_stride; + Index output_stride; + + // Setup strides to read/write along the tensor's stride1 dimension. + if (BlockRead) { + inputIndex = first_coeff_index; + outputIndex = 0; + input_stride = 1; + output_stride = NumDims == 0 ? 1 + : block.block_strides()[block_dim_for_tensor_stride1_dim]; + } else { + inputIndex = 0; + outputIndex = first_coeff_index; + input_stride = NumDims == 0 ? 1 + : block.block_strides()[block_dim_for_tensor_stride1_dim]; + output_stride = 1; + } + + const std::size_t at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1; + array<BlockIteratorState, at_least_1_dim> block_iter_state; + + // Initialize block iterator state. + for (int i = 0; i < static_cast<int>(NumDims) - 1; ++i) { + const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? i + 1 + : NumDims - i - 2; + block_iter_state[i].size = + block.block_sizes()[tensor_to_block_dim_map[dim]]; + if (BlockRead) { + block_iter_state[i].input_stride = tensor_strides[dim]; + block_iter_state[i].output_stride = + block.block_strides()[tensor_to_block_dim_map[dim]]; + } else { + block_iter_state[i].input_stride = + block.block_strides()[tensor_to_block_dim_map[dim]]; + block_iter_state[i].output_stride = tensor_strides[dim]; + } + block_iter_state[i].input_span = + block_iter_state[i].input_stride * (block_iter_state[i].size - 1); + block_iter_state[i].output_span = + block_iter_state[i].output_stride * (block_iter_state[i].size - 1); + block_iter_state[i].count = 0; + } + + // Iterate copying data from src to dst. + for (Index i = 0; i < block_outer_dim_size; ++i) { + TensorBlockCopyOp::Run(block_inner_dim_size, outputIndex, output_stride, + dst_data, inputIndex, input_stride, src_data); + // Update index. + for (int i = 0; i < static_cast<int>(NumDims) - 1; ++i) { + if (++block_iter_state[i].count < block_iter_state[i].size) { + inputIndex += block_iter_state[i].input_stride; + outputIndex += block_iter_state[i].output_stride; + break; + } + block_iter_state[i].count = 0; + inputIndex -= block_iter_state[i].input_span; + outputIndex -= block_iter_state[i].output_span; + } + } + } +}; + +/** \class TensorBlockReader + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor block reader class. + * + * This class is responsible for reading a tensor block. 
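+ *
+ * Typical flow (as used with the TensorBlockMapper defined below): the mapper
+ * enumerates blocks, GetBlockForIndex() describes one block over a scratch
+ * buffer, and TensorBlockReader::Run() then copies the corresponding
+ * coefficients from the source tensor into that buffer.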
+ * + */ + +template <typename Index, typename Scalar, std::size_t NumDims, int Layout, + bool Vectorizable> +class TensorBlockReader : public TensorBlockIO<Index, Scalar, NumDims, + Layout, Vectorizable, true> { + public: + typedef typename internal::TensorBlock<Index, Scalar, NumDims, Layout> + TensorBlock; + typedef TensorBlockIO<Index, Scalar, NumDims, Layout, Vectorizable, true> + Base; + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + TensorBlock* block, const Scalar* src_data) { + array<Index, NumDims> tensor_to_block_dim_map; + for (int i = 0; i < NumDims; ++i) { + tensor_to_block_dim_map[i] = i; + } + Base::Copy(*block, block->first_coeff_index(), tensor_to_block_dim_map, + block->tensor_strides(), src_data, block->data()); + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + TensorBlock* block, Index first_coeff_index, + const array<Index, NumDims>& tensor_to_block_dim_map, + const array<Index, NumDims>& tensor_strides, const Scalar* src_data) { + Base::Copy(*block, first_coeff_index, tensor_to_block_dim_map, + tensor_strides, src_data, block->data()); + } +}; + +/** \class TensorBlockWriter + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor block writer class. + * + * This class is responsible for writing a tensor block. + * + */ + +template <typename Index, typename Scalar, std::size_t NumDims, int Layout, + bool Vectorizable> +class TensorBlockWriter : public TensorBlockIO<Index, Scalar, NumDims, + Layout, Vectorizable, false> { + public: + typedef typename internal::TensorBlock<Index, Scalar, NumDims, Layout> + TensorBlock; + typedef TensorBlockIO<Index, Scalar, NumDims, Layout, Vectorizable, false> + Base; + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const TensorBlock& block, Scalar* dst_data) { + array<Index, NumDims> tensor_to_block_dim_map; + for (int i = 0; i < NumDims; ++i) { + tensor_to_block_dim_map[i] = i; + } + Base::Copy(block, block.first_coeff_index(), tensor_to_block_dim_map, + block.tensor_strides(), block.data(), dst_data); + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const TensorBlock& block, Index first_coeff_index, + const array<Index, NumDims>& tensor_to_block_dim_map, + const array<Index, NumDims>& tensor_strides, Scalar* dst_data) { + Base::Copy(block, first_coeff_index, tensor_to_block_dim_map, + tensor_strides, block.data(), dst_data); + } +}; + +enum TensorBlockShapeType { + kUniformAllDims, + kSkewedInnerDims, +}; + +struct TensorOpResourceRequirements { + TensorBlockShapeType block_shape; + std::size_t block_total_size; + // TODO(andydavis) Add 'target_num_threads' to support communication of + // thread-resource requirements. This will allow ops deep in the + // expression tree (like reductions) to communicate resources + // requirements based on local state (like the total number of reductions + // to be computed). + TensorOpResourceRequirements(internal::TensorBlockShapeType shape, + const std::size_t size) + : block_shape(shape), block_total_size(size) {} +}; + +/** \class TensorBlockMapper + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor block mapper class. + * + * This class is responsible for iterating over the blocks of a tensor. 
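+ *
+ * Block dimensions are chosen so that a block holds at most max_coeff_count
+ * coefficients, either spread evenly over all dimensions (kUniformAllDims)
+ * or skewed towards the innermost dimensions (kSkewedInnerDims).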
+ * + */ + +template <typename Index, typename Scalar, std::size_t NumDims, int Layout> +class TensorBlockMapper { + public: + typedef typename internal::TensorBlock<Index, Scalar, NumDims, Layout> + TensorBlock; + + TensorBlockMapper(const Eigen::DSizes<Index, NumDims>& dims, + const TensorBlockShapeType block_shape, + const size_t max_coeff_count) + : m_dimensions(dims), m_block_dim_sizes(dims), m_total_block_count(1) { + if (m_block_dim_sizes.TotalSize() > max_coeff_count) { + if (block_shape == kUniformAllDims) { + // Tensor will not fit within 'max_coeff_count' budget: calculate tensor + // block dimension sizes based on "square" dimension size target. + const size_t dim_size_target = + std::pow(static_cast<float>(max_coeff_count), + 1.0 / static_cast<float>(m_block_dim_sizes.rank())); + for (size_t i = 0; i < m_block_dim_sizes.rank(); ++i) { + // TODO(andydavis) Adjust the inner most 'm_block_dim_size' to make it + // a multiple of the packet size. Note that reducing 'm_block_dim_size' + // in this manner can increase the number of blocks, and so will + // amplify any per-block overhead. + m_block_dim_sizes[i] = + numext::mini(dim_size_target, static_cast<size_t>(m_dimensions[i])); + } + // Add any un-allocated coefficients to inner dimension(s). + Index total_size = m_block_dim_sizes.TotalSize(); + for (int i = 0; i < NumDims; ++i) { + const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? i : NumDims - i - 1; + if (m_block_dim_sizes[dim] < m_dimensions[dim]) { + const Index total_size_other_dims = total_size / + m_block_dim_sizes[dim]; + const Index alloc_avail = max_coeff_count / total_size_other_dims; + if (alloc_avail == m_block_dim_sizes[dim]) { + // Insufficient excess coefficients to allocate. + break; + } + m_block_dim_sizes[dim] = numext::mini(m_dimensions[dim], alloc_avail); + total_size = total_size_other_dims * m_block_dim_sizes[dim]; + } + } + } else { + eigen_assert(block_shape == kSkewedInnerDims); + Index coeff_to_allocate = max_coeff_count; + for (int i = 0; i < NumDims; ++i) { + const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? i : NumDims - i - 1; + m_block_dim_sizes[dim] = numext::mini(coeff_to_allocate, + m_dimensions[dim]); + coeff_to_allocate /= numext::maxi(static_cast<Index>(1), + m_block_dim_sizes[dim]); + } + } + } + + // Calculate block counts by dimension and total block count. + DSizes<Index, NumDims> block_count; + for (size_t i = 0; i < block_count.rank(); ++i) { + block_count[i] = + (m_dimensions[i] + m_block_dim_sizes[i] - 1) / m_block_dim_sizes[i]; + } + m_total_block_count = array_prod(block_count); + + // Calculate block strides (used for enumerating blocks). 
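+    // Worked example (hypothetical sizes): for a ColMajor tensor with
+    // dimensions {4, 6} and block sizes {2, 3}, block_count is {2, 2}, so the
+    // loop below produces block strides {1, 2} and tensor strides {1, 4};
+    // block_index 3 then decodes to block coordinates {1, 1} in
+    // GetBlockForIndex().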
+ if (NumDims > 0) { + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_block_strides[0] = 1; + m_tensor_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_block_strides[i] = m_block_strides[i - 1] * block_count[i - 1]; + m_tensor_strides[i] = m_tensor_strides[i - 1] * m_dimensions[i - 1]; + } + } else { + m_block_strides[NumDims - 1] = 1; + m_tensor_strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_block_strides[i] = m_block_strides[i + 1] * block_count[i + 1]; + m_tensor_strides[i] = m_tensor_strides[i + 1] * m_dimensions[i + 1]; + } + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + GetBlockForIndex(Index block_index, Scalar* data) const { + Index first_coeff_index = 0; + DSizes<Index, NumDims> coords; + DSizes<Index, NumDims> sizes; + DSizes<Index, NumDims> strides; + if (NumDims > 0) { + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = block_index / m_block_strides[i]; + coords[i] = idx * m_block_dim_sizes[i]; + sizes[i] = + numext::mini((m_dimensions[i] - coords[i]), m_block_dim_sizes[i]); + block_index -= idx * m_block_strides[i]; + first_coeff_index += coords[i] * m_tensor_strides[i]; + } + coords[0] = block_index * m_block_dim_sizes[0]; + sizes[0] = + numext::mini((m_dimensions[0] - coords[0]), m_block_dim_sizes[0]); + first_coeff_index += coords[0] * m_tensor_strides[0]; + + strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + strides[i] = strides[i - 1] * sizes[i - 1]; + } + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = block_index / m_block_strides[i]; + coords[i] = idx * m_block_dim_sizes[i]; + sizes[i] = + numext::mini((m_dimensions[i] - coords[i]), m_block_dim_sizes[i]); + block_index -= idx * m_block_strides[i]; + first_coeff_index += coords[i] * m_tensor_strides[i]; + } + coords[NumDims - 1] = block_index * m_block_dim_sizes[NumDims - 1]; + sizes[NumDims - 1] = + numext::mini((m_dimensions[NumDims - 1] - coords[NumDims - 1]), + m_block_dim_sizes[NumDims - 1]); + first_coeff_index += coords[NumDims - 1] * m_tensor_strides[NumDims - 1]; + + strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * sizes[i + 1]; + } + } + } + + return TensorBlock(first_coeff_index, sizes, strides, m_tensor_strides, + data); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index total_block_count() const { + return m_total_block_count; + } + + private: + DSizes<Index, NumDims> m_dimensions; + DSizes<Index, NumDims> m_block_dim_sizes; + DSizes<Index, NumDims> m_block_strides; + DSizes<Index, NumDims> m_tensor_strides; + Index m_total_block_count; +}; + +/** \class TensorSliceBlockMapper + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor slice block mapper class. + * + * This class is responsible for iterating over the blocks of + * a slice of a tensor. Supports shuffling of the block strides + * for callers that want to reduce strides for dimensions to be + * processed together. 
+ * + */ + +template <typename Index, typename Scalar, std::size_t NumDims, int Layout> +class TensorSliceBlockMapper { + public: + typedef typename internal::TensorBlock<Index, Scalar, NumDims, Layout> + TensorBlock; + typedef DSizes<Index, NumDims> Dimensions; + + TensorSliceBlockMapper(const Dimensions& tensor_dims, + const Dimensions& tensor_slice_offsets, + const Dimensions& tensor_slice_extents, + const Dimensions& block_dim_sizes, + const Dimensions& block_stride_order) + : m_tensor_dimensions(tensor_dims), + m_tensor_slice_offsets(tensor_slice_offsets), + m_tensor_slice_extents(tensor_slice_extents), + m_block_dim_sizes(block_dim_sizes), + m_block_stride_order(block_stride_order), + m_total_block_count(1) { + // Calculate block counts by dimension and total block count. + DSizes<Index, NumDims> block_count; + for (size_t i = 0; i < block_count.rank(); ++i) { + block_count[i] = (m_tensor_slice_extents[i] + m_block_dim_sizes[i] - 1) / + m_block_dim_sizes[i]; + } + m_total_block_count = array_prod(block_count); + + // Calculate block strides (used for enumerating blocks). + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_block_strides[0] = 1; + m_tensor_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_block_strides[i] = m_block_strides[i - 1] * block_count[i - 1]; + m_tensor_strides[i] = m_tensor_strides[i - 1] * + m_tensor_dimensions[i - 1]; + } + } else { + m_block_strides[NumDims - 1] = 1; + m_tensor_strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_block_strides[i] = m_block_strides[i + 1] * block_count[i + 1]; + m_tensor_strides[i] = m_tensor_strides[i + 1] * + m_tensor_dimensions[i + 1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + GetBlockForIndex(Index block_index, Scalar* data) const { + Index first_coeff_index = 0; + DSizes<Index, NumDims> coords; + DSizes<Index, NumDims> sizes; + DSizes<Index, NumDims> strides; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = block_index / m_block_strides[i]; + coords[i] = m_tensor_slice_offsets[i] + idx * m_block_dim_sizes[i]; + sizes[i] = numext::mini(m_tensor_slice_offsets[i] + m_tensor_slice_extents[i] - coords[i], + m_block_dim_sizes[i]); + block_index -= idx * m_block_strides[i]; + first_coeff_index += coords[i] * m_tensor_strides[i]; + } + coords[0] = m_tensor_slice_offsets[0] + + block_index * m_block_dim_sizes[0]; + sizes[0] = numext::mini(m_tensor_slice_offsets[0] + m_tensor_slice_extents[0] - coords[0], + m_block_dim_sizes[0]); + first_coeff_index += coords[0] * m_tensor_strides[0]; + + Index prev_dim = m_block_stride_order[0]; + strides[prev_dim] = 1; + for (int i = 1; i < NumDims; ++i) { + const Index curr_dim = m_block_stride_order[i]; + strides[curr_dim] = strides[prev_dim] * sizes[prev_dim]; + prev_dim = curr_dim; + } + } else { + for (int i = 0; i < static_cast<int>(NumDims) - 1; ++i) { + const Index idx = block_index / m_block_strides[i]; + coords[i] = m_tensor_slice_offsets[i] + idx * m_block_dim_sizes[i]; + sizes[i] = numext::mini(m_tensor_slice_offsets[i] + m_tensor_slice_extents[i] - coords[i], + m_block_dim_sizes[i]); + block_index -= idx * m_block_strides[i]; + first_coeff_index += coords[i] * m_tensor_strides[i]; + } + coords[NumDims - 1] = m_tensor_slice_offsets[NumDims - 1] + + block_index * m_block_dim_sizes[NumDims - 1]; + sizes[NumDims - 1] = numext::mini( + m_tensor_slice_offsets[NumDims - 1] + m_tensor_slice_extents[NumDims - 1] - coords[NumDims - 1], + 
m_block_dim_sizes[NumDims - 1]); + first_coeff_index += coords[NumDims - 1] * m_tensor_strides[NumDims - 1]; + + Index prev_dim = m_block_stride_order[NumDims - 1]; + strides[prev_dim] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + const Index curr_dim = m_block_stride_order[i]; + strides[curr_dim] = strides[prev_dim] * sizes[prev_dim]; + prev_dim = curr_dim; + } + } + + return TensorBlock(first_coeff_index, sizes, strides, m_tensor_strides, + data); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index total_block_count() const { + return m_total_block_count; + } + + private: + Dimensions m_tensor_dimensions; + Dimensions m_tensor_slice_offsets; + Dimensions m_tensor_slice_extents; + Dimensions m_tensor_strides; + Dimensions m_block_dim_sizes; + Dimensions m_block_stride_order; + Dimensions m_block_strides; + Index m_total_block_count; +}; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h new file mode 100644 index 0000000000..7e6d00fad6 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -0,0 +1,352 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H +#define EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H + +namespace Eigen { + +/** \class TensorBroadcasting + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor broadcasting class. 
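+ *
+ * Repeats a tensor expression a fixed number of times along each dimension;
+ * for example (hypothetical sizes), broadcasting an input of dimensions
+ * {3, 1} with a broadcast factor of {1, 4} yields an expression of
+ * dimensions {3, 4}.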
+ * + * + */ +namespace internal { +template<typename Broadcast, typename XprType> +struct traits<TensorBroadcastingOp<Broadcast, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename Broadcast, typename XprType> +struct eval<TensorBroadcastingOp<Broadcast, XprType>, Eigen::Dense> +{ + typedef const TensorBroadcastingOp<Broadcast, XprType>& type; +}; + +template<typename Broadcast, typename XprType> +struct nested<TensorBroadcastingOp<Broadcast, XprType>, 1, typename eval<TensorBroadcastingOp<Broadcast, XprType> >::type> +{ + typedef TensorBroadcastingOp<Broadcast, XprType> type; +}; + +} // end namespace internal + + + +template<typename Broadcast, typename XprType> +class TensorBroadcastingOp : public TensorBase<TensorBroadcastingOp<Broadcast, XprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorBroadcastingOp>::type Nested; + typedef typename Eigen::internal::traits<TensorBroadcastingOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBroadcastingOp(const XprType& expr, const Broadcast& broadcast) + : m_xpr(expr), m_broadcast(broadcast) {} + + EIGEN_DEVICE_FUNC + const Broadcast& broadcast() const { return m_broadcast; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const Broadcast m_broadcast; +}; + + +// Eval as rvalue +template<typename Broadcast, typename ArgType, typename Device> +struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> +{ + typedef TensorBroadcastingOp<Broadcast, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions; + EIGEN_STATIC_ASSERT(NumDims == internal::array_size<Broadcast>::value, "Broadcast cannot change rank") + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + const Broadcast& broadcast = op.broadcast(); + for (int i = 0; i < NumDims; ++i) { + eigen_assert(input_dims[i] > 0); + m_dimensions[i] = input_dims[i] * broadcast[i]; + } + + 
if (NumDims > 0) { + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_inputStrides[0] = 1; + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + } else { + // NumDims is always > 0 here, but use max to avoid compiler warning + m_inputStrides[numext::maxi(0, NumDims-1)] = 1; + m_outputStrides[numext::maxi(0, NumDims-1)] = 1; + for (int i = NumDims-2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; + } + } + } + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const + { + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + return coeffColMajor(index); + } else { + return coeffRowMajor(index); + } + } + + // TODO: attempt to speed this up. The integer divisions and modulo are slow + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const + { + Index inputIndex = 0; + if (NumDims > 0) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq<Broadcast>()(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq<InputDimensions>()(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + if (internal::index_statically_eq<Broadcast>()(0, 1)) { + eigen_assert(index < m_impl.dimensions()[0]); + inputIndex += index; + } else { + if (internal::index_statically_eq<InputDimensions>()(0, 1)) { + eigen_assert(index % m_impl.dimensions()[0] == 0); + } else { + inputIndex += (index % m_impl.dimensions()[0]); + } + } + } + return m_impl.coeff(inputIndex); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const + { + Index inputIndex = 0; + if (NumDims > 0) { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq<Broadcast>()(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq<InputDimensions>()(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + if (internal::index_statically_eq<Broadcast>()(NumDims-1, 1)) { + eigen_assert(index < m_impl.dimensions()[NumDims-1]); + inputIndex += index; + } else { + if (internal::index_statically_eq<InputDimensions>()(NumDims-1, 1)) { + eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0); + } else { + inputIndex += (index % m_impl.dimensions()[NumDims-1]); + } + } + } + return m_impl.coeff(inputIndex); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE 
PacketReturnType packet(Index index) const
+  {
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      return packetColMajor<LoadMode>(index);
+    } else {
+      return packetRowMajor<LoadMode>(index);
+    }
+  }
+
+  // Ignore the LoadMode and always use unaligned loads since we can't guarantee
+  // the alignment at compile time.
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
+  {
+    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+
+    const Index originalIndex = index;
+
+    Index inputIndex = 0;
+    Index innermostLoc = 0;
+    if (NumDims > 0) {
+      for (int i = NumDims - 1; i > 0; --i) {
+        const Index idx = index / m_outputStrides[i];
+        if (internal::index_statically_eq<Broadcast>()(i, 1)) {
+          eigen_assert(idx < m_impl.dimensions()[i]);
+          inputIndex += idx * m_inputStrides[i];
+        } else {
+          if (internal::index_statically_eq<InputDimensions>()(i, 1)) {
+            eigen_assert(idx % m_impl.dimensions()[i] == 0);
+          } else {
+            inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
+          }
+        }
+        index -= idx * m_outputStrides[i];
+      }
+      if (internal::index_statically_eq<Broadcast>()(0, 1)) {
+        eigen_assert(index < m_impl.dimensions()[0]);
+        innermostLoc = index;
+      } else {
+        if (internal::index_statically_eq<InputDimensions>()(0, 1)) {
+          eigen_assert(index % m_impl.dimensions()[0] == 0);
+          innermostLoc = 0;
+        } else {
+          innermostLoc = index % m_impl.dimensions()[0];
+        }
+      }
+      inputIndex += innermostLoc;
+    }
+
+    // TODO: this could be extended to the second dimension if we're not
+    // broadcasting along the first dimension, and so on.
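+    // Fast path below: when the whole packet fits inside the innermost input
+    // dimension a single unaligned packet load is enough; otherwise the
+    // remaining coefficients are gathered one by one via coeffColMajor().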
+    if (innermostLoc + packetSize <= m_impl.dimensions()[0]) {
+      return m_impl.template packet<Unaligned>(inputIndex);
+    } else {
+      EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+      values[0] = m_impl.coeff(inputIndex);
+      for (int i = 1; i < packetSize; ++i) {
+        values[i] = coeffColMajor(originalIndex+i);
+      }
+      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+      return rslt;
+    }
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
+  {
+    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+
+    const Index originalIndex = index;
+
+    Index inputIndex = 0;
+    for (int i = 0; i < NumDims - 1; ++i) {
+      const Index idx = index / m_outputStrides[i];
+      if (internal::index_statically_eq<Broadcast>()(i, 1)) {
+        eigen_assert(idx < m_impl.dimensions()[i]);
+        inputIndex += idx * m_inputStrides[i];
+      } else {
+        if (internal::index_statically_eq<InputDimensions>()(i, 1)) {
+          eigen_assert(idx % m_impl.dimensions()[i] == 0);
+        } else {
+          inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
+        }
+      }
+      index -= idx * m_outputStrides[i];
+    }
+    Index innermostLoc;
+    if (internal::index_statically_eq<Broadcast>()(NumDims-1, 1)) {
+      eigen_assert(index < m_impl.dimensions()[NumDims-1]);
+      innermostLoc = index;
+    } else {
+      if (internal::index_statically_eq<InputDimensions>()(NumDims-1, 1)) {
+        eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0);
+        innermostLoc = 0;
+      } else {
+        innermostLoc = index % m_impl.dimensions()[NumDims-1];
+      }
+    }
+    inputIndex += innermostLoc;
+
+    // TODO: this could be extended to the second dimension if we're not
+    // broadcasting along the first dimension, and so on.
+    if (innermostLoc + packetSize <= m_impl.dimensions()[NumDims-1]) {
+      return m_impl.template packet<Unaligned>(inputIndex);
+    } else {
+      EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+      values[0] = m_impl.coeff(inputIndex);
+      for (int i = 1; i < packetSize; ++i) {
+        values[i] = coeffRowMajor(originalIndex+i);
+      }
+      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+      return rslt;
+    }
+  }
+
+
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+
+ protected:
+  Dimensions m_dimensions;
+  array<Index, NumDims> m_outputStrides;
+  array<Index, NumDims> m_inputStrides;
+  TensorEvaluator<ArgType, Device> m_impl;
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
new file mode 100644
index 0000000000..36c436a613
--- /dev/null
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -0,0 +1,510 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H +#define EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H + +namespace Eigen { + +/** \class TensorKChippingReshaping + * \ingroup CXX11_Tensor_Module + * + * \brief A chip is a thin slice, corresponding to a column or a row in a 2-d tensor. + * + * + */ + +namespace internal { +template<DenseIndex DimId, typename XprType> +struct traits<TensorChippingOp<DimId, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions - 1; + static const int Layout = XprTraits::Layout; +}; + +template<DenseIndex DimId, typename XprType> +struct eval<TensorChippingOp<DimId, XprType>, Eigen::Dense> +{ + typedef const TensorChippingOp<DimId, XprType>& type; +}; + +template<DenseIndex DimId, typename XprType> +struct nested<TensorChippingOp<DimId, XprType>, 1, typename eval<TensorChippingOp<DimId, XprType> >::type> +{ + typedef TensorChippingOp<DimId, XprType> type; +}; + +template <DenseIndex DimId> +struct DimensionId +{ + DimensionId(DenseIndex dim) { + eigen_assert(dim == DimId); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { + return DimId; + } +}; +template <> +struct DimensionId<Dynamic> +{ + DimensionId(DenseIndex dim) : actual_dim(dim) { + eigen_assert(dim >= 0); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { + return actual_dim; + } + private: + const DenseIndex actual_dim; +}; + + +} // end namespace internal + + + +template<DenseIndex DimId, typename XprType> +class TensorChippingOp : public TensorBase<TensorChippingOp<DimId, XprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorChippingOp>::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested<TensorChippingOp>::type Nested; + typedef typename Eigen::internal::traits<TensorChippingOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorChippingOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim) + : m_xpr(expr), m_offset(offset), m_dim(dim) { + } + + EIGEN_DEVICE_FUNC + const Index offset() const { return m_offset; } + EIGEN_DEVICE_FUNC + const Index dim() const { return m_dim.actualDim(); } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorChippingOp& operator = (const TensorChippingOp& other) + { + typedef TensorAssignOp<TensorChippingOp, const TensorChippingOp> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); + return *this; + } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorChippingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp<TensorChippingOp, const OtherDerived> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + const Index m_offset; + const internal::DimensionId<DimId> m_dim; +}; 
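+
+// A minimal usage sketch (the tensor names and sizes are illustrative
+// assumptions only): chipping fixes one index of one dimension and yields an
+// expression of rank reduced by one; chips of an lvalue tensor can also be
+// assigned to, as the lvalue evaluator further down implements.
+//
+//   Eigen::Tensor<float, 3> t(2, 3, 5);
+//   t.setRandom();
+//   Eigen::Tensor<float, 2> slice = t.chip(1, 2);  // 2 x 3 slice at offset 1 of dim 2
+//   t.chip(0, 2) = slice;                          // chips are writable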
+ + +// Eval as rvalue +template<DenseIndex DimId, typename ArgType, typename Device> +struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> +{ + typedef TensorChippingOp<DimId, ArgType> XprType; + static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + static const int NumDims = NumInputDims-1; + typedef typename XprType::Index Index; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const<Scalar>::type ScalarNonConst; + + enum { + // Alignment can't be guaranteed at compile time since it depends on the + // slice offsets. + IsAligned = false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + typedef internal::TensorBlock<Index, ScalarNonConst, NumInputDims, Layout> + InputTensorBlock; + typedef internal::TensorBlock<Index, ScalarNonConst, NumDims, Layout> + OutputTensorBlock; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device) + { + EIGEN_STATIC_ASSERT(NumInputDims >= 1, YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(NumInputDims > m_dim.actualDim()); + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + eigen_assert(op.offset() < input_dims[m_dim.actualDim()]); + + int j = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (i != m_dim.actualDim()) { + m_dimensions[j] = input_dims[i]; + ++j; + } + } + + m_stride = 1; + m_inputStride = 1; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = 0; i < m_dim.actualDim(); ++i) { + m_stride *= input_dims[i]; + m_inputStride *= input_dims[i]; + } + } else { + for (int i = NumInputDims-1; i > m_dim.actualDim(); --i) { + m_stride *= input_dims[i]; + m_inputStride *= input_dims[i]; + } + } + m_inputStride *= input_dims[m_dim.actualDim()]; + m_inputOffset = m_stride * op.offset(); + + if (BlockAccess) { + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_inputStrides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1]; + } + } else { + m_inputStrides[NumInputDims - 1] = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; + } + } + + m_block_total_size_max = numext::maxi(static_cast<std::size_t>(1), + device.lastLevelCacheSize() / + sizeof(Scalar)); + } + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(srcCoeff(index)); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, 
YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && + m_dim.actualDim() == 0) || + (static_cast<int>(Layout) == static_cast<int>(RowMajor) && + m_dim.actualDim() == NumInputDims - 1)) { + // m_stride is equal to 1, so let's avoid the integer division. + eigen_assert(m_stride == 1); + Index inputIndex = index * m_inputStride + m_inputOffset; + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = m_impl.coeff(inputIndex); + inputIndex += m_inputStride; + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && + m_dim.actualDim() == NumInputDims - 1) || + (static_cast<int>(Layout) == static_cast<int>(RowMajor) && + m_dim.actualDim() == 0)) { + // m_stride is aways greater than index, so let's avoid the integer division. + eigen_assert(m_stride > index); + return m_impl.template packet<LoadMode>(index + m_inputOffset); + } else { + const Index idx = index / m_stride; + const Index rem = index - idx * m_stride; + if (rem + packetSize <= m_stride) { + Index inputIndex = idx * m_inputStride + m_inputOffset + rem; + return m_impl.template packet<LoadMode>(inputIndex); + } else { + // Cross the stride boundary. Fallback to slow path. + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index); + ++index; + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector<internal::TensorOpResourceRequirements>* resources) const { + resources->push_back(internal::TensorOpResourceRequirements( + internal::kSkewedInnerDims, m_block_total_size_max)); + m_impl.getResourceRequirements(resources); + } + + // TODO(andydavis) Reduce the overhead of this function (experiment with + // using a fixed block size). + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + OutputTensorBlock* output_block) const { + // Calculate input block sizes. + const DSizes<Index, NumDims>& output_block_sizes = + output_block->block_sizes(); + const DSizes<Index, NumDims>& output_block_strides = + output_block->block_strides(); + const Index chip_dim = m_dim.actualDim(); + DSizes<Index, NumInputDims> input_block_sizes; + DSizes<Index, NumInputDims> input_block_strides; + for (Index i = 0; i < NumInputDims; ++i) { + if (i < chip_dim) { + input_block_sizes[i] = output_block_sizes[i]; + input_block_strides[i] = output_block_strides[i]; + } else if (i > chip_dim) { + input_block_sizes[i] = output_block_sizes[i - 1]; + input_block_strides[i] = output_block_strides[i - 1]; + } else { + input_block_sizes[i] = 1; + } + } + // Fix up input_block_stride for chip dimension. + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + if (chip_dim == 0) { + input_block_strides[chip_dim] = 1; + } else { + input_block_strides[chip_dim] = input_block_strides[chip_dim - 1] * + input_block_sizes[chip_dim - 1]; + } + } else { + if (chip_dim == NumInputDims - 1) { + input_block_strides[chip_dim] = 1; + } else { + input_block_strides[chip_dim] = input_block_strides[chip_dim + 1] * + input_block_sizes[chip_dim + 1]; + } + } + // Instantiate and read input block from input tensor. 
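+    // Because the chipped dimension is materialized as a size-1 dimension in
+    // the input block, the values read from the input tensor land in the
+    // output block's buffer in exactly the order the output expects.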
+ InputTensorBlock input_block(srcCoeff(output_block->first_coeff_index()), + input_block_sizes, + input_block_strides, + m_inputStrides, + output_block->data()); + m_impl.block(&input_block); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { + CoeffReturnType* result = const_cast<CoeffReturnType*>(m_impl.data()); + if (((static_cast<int>(Layout) == static_cast<int>(ColMajor) && + m_dim.actualDim() == NumDims) || + (static_cast<int>(Layout) == static_cast<int>(RowMajor) && + m_dim.actualDim() == 0)) && + result) { + return result + m_inputOffset; + } else { + return NULL; + } + } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const + { + Index inputIndex; + if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && + m_dim.actualDim() == 0) || + (static_cast<int>(Layout) == static_cast<int>(RowMajor) && + m_dim.actualDim() == NumInputDims - 1)) { + // m_stride is equal to 1, so let's avoid the integer division. + eigen_assert(m_stride == 1); + inputIndex = index * m_inputStride + m_inputOffset; + } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && + m_dim.actualDim() == NumInputDims - 1) || + (static_cast<int>(Layout) == static_cast<int>(RowMajor) && + m_dim.actualDim() == 0)) { + // m_stride is aways greater than index, so let's avoid the integer division. + eigen_assert(m_stride > index); + inputIndex = index + m_inputOffset; + } else { + const Index idx = index / m_stride; + inputIndex = idx * m_inputStride + m_inputOffset; + index -= idx * m_stride; + inputIndex += index; + } + return inputIndex; + } + + Dimensions m_dimensions; + Index m_stride; + Index m_inputOffset; + Index m_inputStride; + DSizes<Index, NumInputDims> m_inputStrides; + TensorEvaluator<ArgType, Device> m_impl; + const internal::DimensionId<DimId> m_dim; + const Device& m_device; + std::size_t m_block_total_size_max; +}; + + +// Eval as lvalue +template<DenseIndex DimId, typename ArgType, typename Device> +struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device> + : public TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> +{ + typedef TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> Base; + typedef TensorChippingOp<DimId, ArgType> XprType; + static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + static const int NumDims = NumInputDims-1; + typedef typename XprType::Index Index; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef typename internal::remove_const<Scalar>::type ScalarNonConst; + typedef internal::TensorBlock<Index, ScalarNonConst, NumInputDims, Layout> + InputTensorBlock; + typedef internal::TensorBlock<Index, ScalarNonConst, NumDims, Layout> + OutputTensorBlock; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, 
const PacketReturnType& x) + { + static const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + + if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && + this->m_dim.actualDim() == 0) || + (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && + this->m_dim.actualDim() == NumInputDims - 1)) { + // m_stride is equal to 1, so let's avoid the integer division. + eigen_assert(this->m_stride == 1); + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + internal::pstore<CoeffReturnType, PacketReturnType>(values, x); + Index inputIndex = index * this->m_inputStride + this->m_inputOffset; + for (int i = 0; i < packetSize; ++i) { + this->m_impl.coeffRef(inputIndex) = values[i]; + inputIndex += this->m_inputStride; + } + } else if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && + this->m_dim.actualDim() == NumInputDims - 1) || + (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && + this->m_dim.actualDim() == 0)) { + // m_stride is aways greater than index, so let's avoid the integer division. + eigen_assert(this->m_stride > index); + this->m_impl.template writePacket<StoreMode>(index + this->m_inputOffset, x); + } else { + const Index idx = index / this->m_stride; + const Index rem = index - idx * this->m_stride; + if (rem + packetSize <= this->m_stride) { + const Index inputIndex = idx * this->m_inputStride + this->m_inputOffset + rem; + this->m_impl.template writePacket<StoreMode>(inputIndex, x); + } else { + // Cross stride boundary. Fallback to slow path. + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + internal::pstore<CoeffReturnType, PacketReturnType>(values, x); + for (int i = 0; i < packetSize; ++i) { + this->coeffRef(index) = values[i]; + ++index; + } + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const OutputTensorBlock& output_block) { + // Calculate input block sizes. + const DSizes<Index, NumDims>& output_block_sizes = + output_block.block_sizes(); + const DSizes<Index, NumDims>& output_block_strides = + output_block.block_strides(); + const Index chip_dim = this->m_dim.actualDim(); + DSizes<Index, NumInputDims> input_block_sizes; + DSizes<Index, NumInputDims> input_block_strides; + for (Index i = 0; i < NumInputDims; ++i) { + if (i < chip_dim) { + input_block_sizes[i] = output_block_sizes[i]; + input_block_strides[i] = output_block_strides[i]; + } else if (i > chip_dim) { + input_block_sizes[i] = output_block_sizes[i - 1]; + input_block_strides[i] = output_block_strides[i - 1]; + } else { + input_block_sizes[i] = 1; + } + } + // Fix up input_block_stride for chip dimension. + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + if (chip_dim == 0) { + input_block_strides[chip_dim] = 1; + } else { + input_block_strides[chip_dim] = input_block_strides[chip_dim - 1] * + input_block_sizes[chip_dim - 1]; + } + } else { + if (chip_dim == NumInputDims - 1) { + input_block_strides[chip_dim] = 1; + } else { + input_block_strides[chip_dim] = input_block_strides[chip_dim - 1] * + input_block_sizes[chip_dim - 1]; + } + } + // Write input block. 
+ this->m_impl.writeBlock( + InputTensorBlock(this->srcCoeff(output_block.first_coeff_index()), + input_block_sizes, + input_block_strides, + this->m_inputStrides, + const_cast<ScalarNonConst*>(output_block.data()))); + } + +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h new file mode 100644 index 0000000000..54d9e5f2c8 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -0,0 +1,350 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H + +namespace Eigen { + +/** \class TensorConcatenationOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor concatenation class. + * + * + */ +namespace internal { +template<typename Axis, typename LhsXprType, typename RhsXprType> +struct traits<TensorConcatenationOp<Axis, LhsXprType, RhsXprType> > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename promote_storage_type<typename LhsXprType::Scalar, + typename RhsXprType::Scalar>::ret Scalar; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind, + typename traits<RhsXprType>::StorageKind>::ret StorageKind; + typedef typename promote_index_type<typename traits<LhsXprType>::Index, + typename traits<RhsXprType>::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference<LhsNested>::type _LhsNested; + typedef typename remove_reference<RhsNested>::type _RhsNested; + static const int NumDimensions = traits<LhsXprType>::NumDimensions; + static const int Layout = traits<LhsXprType>::Layout; + enum { Flags = 0 }; +}; + +template<typename Axis, typename LhsXprType, typename RhsXprType> +struct eval<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, Eigen::Dense> +{ + typedef const TensorConcatenationOp<Axis, LhsXprType, RhsXprType>& type; +}; + +template<typename Axis, typename LhsXprType, typename RhsXprType> +struct nested<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, 1, typename eval<TensorConcatenationOp<Axis, LhsXprType, RhsXprType> >::type> +{ + typedef TensorConcatenationOp<Axis, LhsXprType, RhsXprType> type; +}; + +} // end namespace internal + + +template<typename Axis, typename LhsXprType, typename RhsXprType> +class TensorConcatenationOp : public TensorBase<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, WriteAccessors> +{ + public: + typedef typename internal::traits<TensorConcatenationOp>::Scalar Scalar; + typedef typename internal::traits<TensorConcatenationOp>::Packet Packet; + typedef typename internal::traits<TensorConcatenationOp>::StorageKind StorageKind; + typedef typename internal::traits<TensorConcatenationOp>::Index Index; + typedef typename internal::nested<TensorConcatenationOp>::type Nested; + typedef typename internal::promote_storage_type<typename 
LhsXprType::CoeffReturnType, + typename RhsXprType::CoeffReturnType>::ret CoeffReturnType; + typedef typename internal::promote_storage_type<typename LhsXprType::PacketReturnType, + typename RhsXprType::PacketReturnType>::ret PacketReturnType; + typedef typename NumTraits<Scalar>::Real RealScalar; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConcatenationOp(const LhsXprType& lhs, const RhsXprType& rhs, Axis axis) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_axis(axis) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename LhsXprType::Nested>::type& + lhsExpression() const { return m_lhs_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename RhsXprType::Nested>::type& + rhsExpression() const { return m_rhs_xpr; } + + EIGEN_DEVICE_FUNC const Axis& axis() const { return m_axis; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorConcatenationOp& operator = (const TensorConcatenationOp& other) + { + typedef TensorAssignOp<TensorConcatenationOp, const TensorConcatenationOp> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run( + assign, DefaultDevice()); + return *this; + } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorConcatenationOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp<TensorConcatenationOp, const OtherDerived> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run( + assign, DefaultDevice()); + return *this; + } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const Axis m_axis; +}; + + +// Eval as rvalue +template<typename Axis, typename LeftArgType, typename RightArgType, typename Device> +struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device> +{ + typedef TensorConcatenationOp<Axis, LeftArgType, RightArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<LeftArgType, Device>::Dimensions>::value; + static const int RightNumDims = internal::array_size<typename TensorEvaluator<RightArgType, Device>::Dimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & + TensorEvaluator<RightArgType, Device>::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator<LeftArgType, Device>::Layout, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis()) + { + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout) || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(NumDims == RightNumDims, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(0 <= m_axis && m_axis < NumDims); + const Dimensions& lhs_dims = m_leftImpl.dimensions(); + const Dimensions& rhs_dims = m_rightImpl.dimensions(); + int i = 0; + for (; i < m_axis; ++i) { + eigen_assert(lhs_dims[i] > 0); + eigen_assert(lhs_dims[i] == rhs_dims[i]); + m_dimensions[i] = lhs_dims[i]; + } + eigen_assert(lhs_dims[i] > 0); // Now i == m_axis. 
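+    // Along the concatenation axis the output extent is the sum of the two
+    // input extents; every other dimension must match exactly.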
+ eigen_assert(rhs_dims[i] > 0); + m_dimensions[i] = lhs_dims[i] + rhs_dims[i]; + for (++i; i < NumDims; ++i) { + eigen_assert(lhs_dims[i] > 0); + eigen_assert(lhs_dims[i] == rhs_dims[i]); + m_dimensions[i] = lhs_dims[i]; + } + + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_leftStrides[0] = 1; + m_rightStrides[0] = 1; + m_outputStrides[0] = 1; + + for (int i = 1; i < NumDims; ++i) { + m_leftStrides[i] = m_leftStrides[i-1] * lhs_dims[i-1]; + m_rightStrides[i] = m_rightStrides[i-1] * rhs_dims[i-1]; + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + } else { + m_leftStrides[NumDims - 1] = 1; + m_rightStrides[NumDims - 1] = 1; + m_outputStrides[NumDims - 1] = 1; + + for (int i = NumDims - 2; i >= 0; --i) { + m_leftStrides[i] = m_leftStrides[i+1] * lhs_dims[i+1]; + m_rightStrides[i] = m_rightStrides[i+1] * rhs_dims[i+1]; + m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + // TODO(phli): Add short-circuit memcpy evaluation if underlying data are linear? + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) + { + m_leftImpl.evalSubExprsIfNeeded(NULL); + m_rightImpl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() + { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + } + + // TODO(phli): attempt to speed this up. The integer divisions and modulo are slow. + // See CL/76180724 comments for more ideas. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + // Collect dimension-wise indices (subs). + array<Index, NumDims> subs; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + subs[i] = index / m_outputStrides[i]; + index -= subs[i] * m_outputStrides[i]; + } + subs[0] = index; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + subs[i] = index / m_outputStrides[i]; + index -= subs[i] * m_outputStrides[i]; + } + subs[NumDims - 1] = index; + } + + const Dimensions& left_dims = m_leftImpl.dimensions(); + if (subs[m_axis] < left_dims[m_axis]) { + Index left_index; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + left_index = subs[0]; + for (int i = 1; i < NumDims; ++i) { + left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; + } + } else { + left_index = subs[NumDims - 1]; + for (int i = NumDims - 2; i >= 0; --i) { + left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; + } + } + return m_leftImpl.coeff(left_index); + } else { + subs[m_axis] -= left_dims[m_axis]; + const Dimensions& right_dims = m_rightImpl.dimensions(); + Index right_index; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + right_index = subs[0]; + for (int i = 1; i < NumDims; ++i) { + right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; + } + } else { + right_index = subs[NumDims - 1]; + for (int i = NumDims - 2; i >= 0; --i) { + right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; + } + } + return m_rightImpl.coeff(right_index); + } + } + + // TODO(phli): Add a real vectorization. 
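+  // The packet() path below is a scalar fallback: it gathers packetSize
+  // coefficients through coeff() and packs them into a single register, which
+  // is correct but not a true vectorization (hence the TODO above).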
+ template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + static const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); + + EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array<Index, NumDims> m_outputStrides; + array<Index, NumDims> m_leftStrides; + array<Index, NumDims> m_rightStrides; + TensorEvaluator<LeftArgType, Device> m_leftImpl; + TensorEvaluator<RightArgType, Device> m_rightImpl; + const Axis m_axis; +}; + +// Eval as lvalue +template<typename Axis, typename LeftArgType, typename RightArgType, typename Device> + struct TensorEvaluator<TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device> + : public TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device> +{ + typedef TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device> Base; + typedef TensorConcatenationOp<Axis, LeftArgType, RightArgType> XprType; + typedef typename Base::Dimensions Dimensions; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & + TensorEvaluator<RightArgType, Device>::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator<LeftArgType, Device>::Layout, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device) + : Base(op, device) + { + EIGEN_STATIC_ASSERT((static_cast<int>(Layout) == static_cast<int>(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + // Collect dimension-wise indices (subs). 
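+    // Only the ColMajor unrolling is needed here: this lvalue evaluator's
+    // constructor statically asserts that the layout is ColMajor.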
+ array<Index, Base::NumDims> subs; + for (int i = Base::NumDims - 1; i > 0; --i) { + subs[i] = index / this->m_outputStrides[i]; + index -= subs[i] * this->m_outputStrides[i]; + } + subs[0] = index; + + const Dimensions& left_dims = this->m_leftImpl.dimensions(); + if (subs[this->m_axis] < left_dims[this->m_axis]) { + Index left_index = subs[0]; + for (int i = 1; i < Base::NumDims; ++i) { + left_index += (subs[i] % left_dims[i]) * this->m_leftStrides[i]; + } + return this->m_leftImpl.coeffRef(left_index); + } else { + subs[this->m_axis] -= left_dims[this->m_axis]; + const Dimensions& right_dims = this->m_rightImpl.dimensions(); + Index right_index = subs[0]; + for (int i = 1; i < Base::NumDims; ++i) { + right_index += (subs[i] % right_dims[i]) * this->m_rightStrides[i]; + } + return this->m_rightImpl.coeffRef(right_index); + } + } + + template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + static const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + packetSize - 1 < this->dimensions().TotalSize()); + + EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; + internal::pstore<CoeffReturnType, PacketReturnType>(values, x); + for (int i = 0; i < packetSize; ++i) { + coeffRef(index+i) = values[i]; + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h new file mode 100644 index 0000000000..7fb384c65e --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -0,0 +1,635 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Eric Martin <eric@ericmart.in> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H + +namespace Eigen { + +/** \class TensorContraction + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor contraction class. + * + * + */ +namespace internal { +template<typename Dimensions, typename LhsXprType, typename RhsXprType> +struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename scalar_product_traits<typename LhsXprType::Scalar, typename RhsXprType::Scalar>::ReturnType Scalar; + + typedef typename scalar_product_traits<typename traits<LhsXprType>::StorageKind, + typename traits<RhsXprType>::StorageKind>::ReturnType StorageKind; + typedef typename promote_index_type<typename traits<LhsXprType>::Index, + typename traits<RhsXprType>::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference<LhsNested>::type _LhsNested; + typedef typename remove_reference<RhsNested>::type _RhsNested; + + // From NumDims below. 
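+  // A contraction removes every contracted pair of dimensions, so the result
+  // keeps the non-contracted dimensions of the two operands.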
+ static const int NumDimensions = traits<RhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value; + static const int Layout = traits<LhsXprType>::Layout; + + enum { + Flags = 0, + }; +}; + +template<typename Dimensions, typename LhsXprType, typename RhsXprType> +struct eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType>, Eigen::Dense> +{ + typedef const TensorContractionOp<Dimensions, LhsXprType, RhsXprType>& type; +}; + +template<typename Dimensions, typename LhsXprType, typename RhsXprType> +struct nested<TensorContractionOp<Dimensions, LhsXprType, RhsXprType>, 1, typename eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >::type> +{ + typedef TensorContractionOp<Dimensions, LhsXprType, RhsXprType> type; +}; + +template<typename Indices_, typename LeftArgType_, typename RightArgType_, typename Device_> +struct traits<TensorEvaluator<const TensorContractionOp<Indices_, LeftArgType_, RightArgType_>, Device_> > { + typedef Indices_ Indices; + typedef LeftArgType_ LeftArgType; + typedef RightArgType_ RightArgType; + typedef Device_ Device; + + // From NumDims below. + static const int NumDimensions = traits<LeftArgType_>::NumDimensions + traits<RightArgType_>::NumDimensions - 2 * array_size<Indices_>::value; +}; + +} // end namespace internal + +template<typename Indices, typename LhsXprType, typename RhsXprType> +class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXprType, RhsXprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorContractionOp>::Scalar Scalar; + typedef typename internal::scalar_product_traits<typename LhsXprType::CoeffReturnType, + typename RhsXprType::CoeffReturnType>::ReturnType CoeffReturnType; + typedef typename Eigen::internal::nested<TensorContractionOp>::type Nested; + typedef typename Eigen::internal::traits<TensorContractionOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorContractionOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp( + const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims) {} + + EIGEN_DEVICE_FUNC const Indices& indices() const { return m_indices; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename LhsXprType::Nested>::type& + lhsExpression() const { return m_lhs_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename RhsXprType::Nested>::type& + rhsExpression() const { return m_rhs_xpr; } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const Indices m_indices; +}; + + +template<typename Derived> +struct TensorContractionEvaluatorBase +{ + typedef typename internal::traits<Derived>::Indices Indices; + typedef typename internal::traits<Derived>::LeftArgType LeftArgType; + typedef typename internal::traits<Derived>::RightArgType RightArgType; + typedef typename internal::traits<Derived>::Device Device; + + typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType; + typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + + enum { + IsAligned = true, + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + BlockAccess = false, + Layout = TensorEvaluator<LeftArgType, 
Device>::Layout, + CoordAccess = false, // to be implemented + }; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef typename internal::conditional< + static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value; + static const int RDims = + internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value; + static const int ContractDims = internal::array_size<Indices>::value; + static const int NumDims = LDims + RDims - 2 * ContractDims; + + typedef array<Index, LDims> left_dim_mapper_t; + typedef array<Index, RDims> right_dim_mapper_t; + typedef array<Index, ContractDims> contract_t; + typedef array<Index, LDims - ContractDims> left_nocontract_t; + typedef array<Index, RDims - ContractDims> right_nocontract_t; + + typedef DSizes<Index, NumDims> Dimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorContractionEvaluatorBase(const XprType& op, const Device& device) + : m_leftImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), + op.lhsExpression(), op.rhsExpression()), device), + m_rightImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), + op.rhsExpression(), op.lhsExpression()), device), + m_device(device), + m_result(NULL) { + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == + static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + eigen_assert((contract_t::size > 0) && "Must contract on some indices"); + + + DSizes<Index, LDims> eval_left_dims; + DSizes<Index, RDims> eval_right_dims; + array<IndexPair<Index>, ContractDims> eval_op_indices; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + // For ColMajor, we keep using the existing dimensions + for (int i = 0; i < LDims; i++) { + eval_left_dims[i] = m_leftImpl.dimensions()[i]; + } + for (int i = 0; i < RDims; i++) { + eval_right_dims[i] = m_rightImpl.dimensions()[i]; + } + // We keep the pairs of contracting indices. + for (int i = 0; i < ContractDims; i++) { + eval_op_indices[i].first = op.indices()[i].first; + eval_op_indices[i].second = op.indices()[i].second; + } + } else { + // For RowMajor, we need to reverse the existing dimensions + for (int i = 0; i < LDims; i++) { + eval_left_dims[i] = m_leftImpl.dimensions()[LDims - i - 1]; + } + for (int i = 0; i < RDims; i++) { + eval_right_dims[i] = m_rightImpl.dimensions()[RDims - i - 1]; + } + // We need to flip all the pairs of contracting indices as well as + // reversing the dimensions. 
+ for (int i = 0; i < ContractDims; i++) { + eval_op_indices[i].first = LDims - 1 - op.indices()[ContractDims - 1 - i].second; + eval_op_indices[i].second = RDims - 1 - op.indices()[ContractDims - 1 - i].first; + } + } + + array<Index, LDims> lhs_strides; + if (LDims > 0) { + lhs_strides[0] = 1; + for (int i = 0; i < LDims-1; ++i) { + lhs_strides[i+1] = lhs_strides[i] * eval_left_dims[i]; + } + } + + array<Index, RDims> rhs_strides; + if (RDims > 0) { + rhs_strides[0] = 1; + for (int i = 0; i < RDims-1; ++i) { + rhs_strides[i+1] = rhs_strides[i] * eval_right_dims[i]; + } + } + + if (m_i_strides.size() > 0) m_i_strides[0] = 1; + if (m_j_strides.size() > 0) m_j_strides[0] = 1; + if (m_k_strides.size() > 0) m_k_strides[0] = 1; + + m_i_size = 1; + m_j_size = 1; + m_k_size = 1; + + // To compute the dimension, we simply concatenate the non-contracting + // dimensions of the left and then the right tensor. Additionally, I also + // want to compute the cumulative products of the left non-contracting + // dimensions, right non-contracting dimensions, and the contracting + // dimensions (in the order of the contraction) to aid in the later + // computation of tensor indices for matrix indices. + m_lhs_inner_dim_contiguous = true; + int dim_idx = 0; + int nocontract_idx = 0; + + for (int i = 0; i < LDims; i++) { + // find if we are contracting on index i of left tensor + bool contracting = false; + for (int j = 0; j < ContractDims; j++) { + if (eval_op_indices[j].first == i) { + contracting = true; + break; + } + } + if (!contracting) { + // add dimension size to output dimensions + m_dimensions[dim_idx] = eval_left_dims[i]; + m_left_nocontract_strides[nocontract_idx] = lhs_strides[i]; + if (dim_idx != i) { + m_lhs_inner_dim_contiguous = false; + } + if (nocontract_idx+1 < internal::array_size<left_nocontract_t>::value) { + m_i_strides[nocontract_idx+1] = + m_i_strides[nocontract_idx] * eval_left_dims[i]; + } else { + m_i_size = m_i_strides[nocontract_idx] * eval_left_dims[i]; + } + dim_idx++; + nocontract_idx++; + } + } + + nocontract_idx = 0; + for (int i = 0; i < RDims; i++) { + bool contracting = false; + // find if we are contracting on index i of right tensor + for (int j = 0; j < ContractDims; j++) { + if (eval_op_indices[j].second == i) { + contracting = true; + break; + } + } + if (!contracting) { + m_dimensions[dim_idx] = eval_right_dims[i]; + if (nocontract_idx+1 < internal::array_size<right_nocontract_t>::value) { + m_j_strides[nocontract_idx+1] = + m_j_strides[nocontract_idx] * eval_right_dims[i]; + } else { + m_j_size = m_j_strides[nocontract_idx] * eval_right_dims[i]; + } + m_right_nocontract_strides[nocontract_idx] = rhs_strides[i]; + dim_idx++; + nocontract_idx++; + } + } + + // now build contraction cumprod. We assumed above that non-contracting axes + // are represented in the same order in the matrix as they are in the tensor. + // This is not the case for contracting axes. As the contracting axes must be + // of the same size in each tensor, I'll only look at the first tensor here. 
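+    // For every contracting pair, record the stride of that axis in each
+    // operand, accumulate the cumulative product of the contracted sizes
+    // (m_k_strides / m_k_size), and detect whether the rhs contracting axes
+    // stay contiguous and in their original order; the input mappers use
+    // these flags to select faster code paths.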
+ m_rhs_inner_dim_contiguous = true; + m_rhs_inner_dim_reordered = false; + for (int i = 0; i < ContractDims; i++) { + Index left = eval_op_indices[i].first; + Index right = eval_op_indices[i].second; + + Index size = eval_left_dims[left]; + eigen_assert(size == eval_right_dims[right] && + "Contraction axes must be same size"); + + if (i+1 < internal::array_size<contract_t>::value) { + m_k_strides[i+1] = m_k_strides[i] * size; + } else { + m_k_size = m_k_strides[i] * size; + } + m_left_contracting_strides[i] = lhs_strides[left]; + m_right_contracting_strides[i] = rhs_strides[right]; + + if (i > 0 && right < eval_op_indices[i-1].second) { + m_rhs_inner_dim_reordered = true; + } + if (right != i) { + m_rhs_inner_dim_contiguous = false; + } + } + + // If the layout is RowMajor, we need to reverse the m_dimensions + if (static_cast<int>(Layout) == static_cast<int>(RowMajor)) { + for (int i = 0, j = NumDims - 1; i < j; i++, j--) { + numext::swap(m_dimensions[i], m_dimensions[j]); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + m_leftImpl.evalSubExprsIfNeeded(NULL); + m_rightImpl.evalSubExprsIfNeeded(NULL); + if (data) { + evalTo(data); + return false; + } else { + m_result = static_cast<Scalar *>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + evalTo(m_result); + return true; + } + } + + EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const { + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + static_cast<const Derived*>(this)->template evalProduct<true, true, true, Unaligned>(buffer); + } + else { + static_cast<const Derived*>(this)->template evalProduct<true, true, false, Unaligned>(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + static_cast<const Derived*>(this)->template evalProduct<true, false, true, Unaligned>(buffer); + } + else { + static_cast<const Derived*>(this)->template evalProduct<true, false, false, Unaligned>(buffer); + } + } + } + else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + static_cast<const Derived*>(this)->template evalProduct<false, true, true, Unaligned>(buffer); + } + else { + static_cast<const Derived*>(this)->template evalProduct<false, true, false, Unaligned>(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + static_cast<const Derived*>(this)->template evalProduct<false, false, true, Unaligned>(buffer); + } + else { + static_cast<const Derived*>(this)->template evalProduct<false, false, false, Unaligned>(buffer); + } + } + } + } + + template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> + void evalGemv(Scalar* buffer) const { + const Index rows = m_i_size; + const Index cols = m_k_size; + + typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar; + typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar; + typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator; + typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator; + const int lhs_packet_size = PacketType<LhsScalar, Device>::size; + const int rhs_packet_size = PacketType<RhsScalar, Device>::size; + typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, + LeftEvaluator, left_nocontract_t, + contract_t, lhs_packet_size, + 
lhs_inner_dim_contiguous, + false, Unaligned> LhsMapper; + + typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, + RightEvaluator, right_nocontract_t, + contract_t, rhs_packet_size, + rhs_inner_dim_contiguous, + rhs_inner_dim_reordered, Unaligned> RhsMapper; + + LhsMapper lhs(m_leftImpl, m_left_nocontract_strides, m_i_strides, + m_left_contracting_strides, m_k_strides); + RhsMapper rhs(m_rightImpl, m_right_nocontract_strides, m_j_strides, + m_right_contracting_strides, m_k_strides); + + const RhsScalar alpha(1); + const Index resIncr(1); + + // zero out the result buffer (which must be of size at least rows * sizeof(Scalar) + m_device.memset(buffer, 0, rows * sizeof(Scalar)); + + internal::general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,false,RhsScalar,RhsMapper,false>::run( + rows, cols, lhs, rhs, + buffer, resIncr, alpha); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + + if (m_result != NULL) { + m_device.deallocate(m_result); + m_result = NULL; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_result[index]; + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + return internal::ploadt<PacketReturnType, LoadMode>(m_result + index); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return m_result; } + + protected: + // Note: nvcc doesn't like implicit copy constructor. If this is needed anywhere, + // then we'll have to write an explicit copy constructor... + //TensorContractionEvaluatorBase(const TensorContractionEvaluatorBase&); + + TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&); + Dimensions m_dimensions; + + contract_t m_k_strides; + contract_t m_left_contracting_strides; + contract_t m_right_contracting_strides; + + bool m_lhs_inner_dim_contiguous; + bool m_rhs_inner_dim_contiguous; + bool m_rhs_inner_dim_reordered; + + left_nocontract_t m_i_strides; + right_nocontract_t m_j_strides; + left_nocontract_t m_left_nocontract_strides; + right_nocontract_t m_right_nocontract_strides; + + Index m_i_size; + Index m_j_size; + Index m_k_size; + + TensorEvaluator<EvalLeftArgType, Device> m_leftImpl; + TensorEvaluator<EvalRightArgType, Device> m_rightImpl; + const Device& m_device; + Scalar* m_result; +}; + + +// evaluator for default device +template<typename Indices, typename LeftArgType, typename RightArgType, typename Device> +struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> : + public TensorContractionEvaluatorBase< + TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> > { + typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self; + typedef TensorContractionEvaluatorBase<Self> Base; + + typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType; + typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + + enum { + Layout = TensorEvaluator<LeftArgType, Device>::Layout, + }; + + // Most of the code is assuming that both input tensors are ColMajor. 
If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef typename internal::conditional< + static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value; + static const int RDims = + internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value; + static const int ContractDims = internal::array_size<Indices>::value; + + typedef array<Index, LDims> left_dim_mapper_t; + typedef array<Index, RDims> right_dim_mapper_t; + + typedef array<Index, ContractDims> contract_t; + typedef array<Index, LDims - ContractDims> left_nocontract_t; + typedef array<Index, RDims - ContractDims> right_nocontract_t; + + static const int NumDims = LDims + RDims - 2 * ContractDims; + + // Could we use NumDimensions here? + typedef DSizes<Index, NumDims> Dimensions; + + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) { } + + template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> + void evalProduct(Scalar* buffer) const { + if (this->m_j_size == 1) { + this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer); + return; + } + + evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer); + } + + template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> + EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + // define mr, nr, and all of my data mapper types + typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar; + typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar; + typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits; + + const Index nr = Traits::nr; + const Index mr = Traits::mr; + + typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator; + typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator; + + const int lhs_packet_size = internal::packet_traits<LhsScalar>::size; + const int rhs_packet_size = internal::packet_traits<RhsScalar>::size; + + typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, + LeftEvaluator, left_nocontract_t, + contract_t, lhs_packet_size, + lhs_inner_dim_contiguous, + false, Unaligned> LhsMapper; + + typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, + RightEvaluator, right_nocontract_t, + contract_t, rhs_packet_size, + rhs_inner_dim_contiguous, + rhs_inner_dim_reordered, Unaligned> RhsMapper; + + typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper; + + // declare GEBP packing and kernel structs + // 
TODO: packing could be faster sometimes if we supported row major tensor mappers + internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, mr, Traits::LhsProgress, ColMajor> pack_lhs; + internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, nr, ColMajor> pack_rhs; + + // TODO: replace false, false with conjugate values? + internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, mr, nr, false, false> gebp; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + // TODO: refine arguments here (am I row or col major, etc) + typedef typename internal::gemm_blocking_space<ColMajor, LhsScalar, RhsScalar, Dynamic, Dynamic, Dynamic> BlockingType; + + // compute block sizes (which depend on number of threads) + + // last parameter is true to use L3 blocking, 2nd to last parameter is 1 to + // indicate 1 thread + BlockingType blocking(m, n, k, 1, true); + + const Index kc = blocking.kc(); + const Index mc = (std::min<Index>)(m, blocking.mc()); + const Index nc = (std::min<Index>)(n, blocking.nc()); + + // sizes of submatrices to live in cache. see Goto paper. + int sizeA = blocking.mc() * kc; + int sizeB = kc * blocking.nc(); + + // note: m_device.allocate should return 16 byte aligned pointers, but if blockA and blockB + // aren't 16 byte aligned segfaults will happen due to SIMD instructions + LhsScalar* blockA = static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar))); + RhsScalar* blockB = static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar))); + + for(Index i2=0; i2<m; i2+=mc) + { + const Index actual_mc = numext::mini(i2+mc,m)-i2; + for (Index k2 = 0; k2 < k; k2 += kc) { + // make sure we don't overshoot right edge of left matrix, then pack vertical panel + const Index actual_kc = numext::mini(k2 + kc, k) - k2; + pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc, 0, 0); + + // series of horizontal blocks + for (Index j2 = 0; j2 < n; j2 += nc) { + // make sure we don't overshoot right edge of right matrix, then pack block + const Index actual_nc = numext::mini(j2 + nc, n) - j2; + pack_rhs(blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc, 0, 0); + + // call gebp (matrix kernel) + // The parameters here are copied from Eigen's GEMM implementation + gebp(output.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, Scalar(1), -1, -1, 0, 0); + } + } + } + + this->m_device.deallocate(blockA); + this->m_device.deallocate(blockB); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h new file mode 100644 index 0000000000..f05746f298 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -0,0 +1,1387 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Eric Martin <eric@ericmart.in> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H + +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) + +namespace Eigen { + +template<typename Scalar, typename Index, typename LhsMapper, + typename RhsMapper, typename OutputMapper, bool needs_edge_check> +__device__ EIGEN_STRONG_INLINE void +EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, volatile Scalar* lhs_shmem, volatile Scalar* rhs_shmem, + const Index m_size, const Index n_size, const Index k_size) { + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + // declare and initialize 64 registers for output 8x8 block + + // prefetch registers + Scalar lhs_pf0; + Scalar lhs_pf1; + Scalar lhs_pf2; + Scalar lhs_pf3; + Scalar lhs_pf4; + Scalar lhs_pf5; + Scalar lhs_pf6; + Scalar lhs_pf7; + + Scalar rhs_pf0; + Scalar rhs_pf1; + Scalar rhs_pf2; + Scalar rhs_pf3; + Scalar rhs_pf4; + Scalar rhs_pf5; + Scalar rhs_pf6; + Scalar rhs_pf7; + + // shared memory is formatted + // (contract idx in block, nocontract idx in block, block idx) + // where block idx is column major. This transposition limits the number of + // bank conflicts when reading the LHS. The core idea is that since the contracting + // index is shared by both sides, then the contracting index should be in threadIdx.x. + + // On the LHS, we pad each row inside of each block with an extra element. This makes + // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts + // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks. + + // On the RHS we just add 8 padding elements to the end of each block. This gives no bank + // conflicts on writes and also none on reads. 
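+  // A brief illustrative check of the padding claim above (a sketch, assuming
+  // 32 four-byte shared-memory banks and a 4-byte Scalar): an LHS store below
+  // goes to index threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z, so within
+  // one warp (threadIdx.z fixed, x = 0..7, four consecutive values of y) the
+  // bank is (y * 8 + x * 9) % 32 up to the constant threadIdx.z offset, and
+  // those 32 values are all distinct:
+  //
+  //   for (int y = 0; y < 4; ++y)
+  //     for (int x = 0; x < 8; ++x)
+  //       printf("%d ", (y * 8 + x * 9) % 32);  // prints each of 0..31 exactly once
+  //
+  // which is why padding each 8-element row out to 9 removes the LHS write
+  // conflicts.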
+ + // storage indices + const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z; + const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x; + + const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0; + const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1; + const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2; + const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3; + const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4; + const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5; + const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6; + const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7; + + const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0; + const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1; + const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2; + const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3; + const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4; + const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5; + const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6; + const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7; + + // in the loading code, the following variables are important: + // threadIdx.x: the vertical position in an 8x8 block + // threadIdx.y: the vertical index of the 8x8 block in the grid + // threadIdx.z: the horizontal position in an 8x8 block + // k: the horizontal index of the 8x8 block in the grid + // + // The k parameter is implicit (it was the loop counter for a loop that went + // from 0 to <8, but now that loop is unrolled in the below code. + + const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y; + const Index lhs_vert = base_m + load_idx_vert; + +#define prefetchIntoRegisters(base_k) \ + { \ + lhs_pf0 = Scalar(0); \ + lhs_pf1 = Scalar(0); \ + lhs_pf2 = Scalar(0); \ + lhs_pf3 = Scalar(0); \ + lhs_pf4 = Scalar(0); \ + lhs_pf5 = Scalar(0); \ + lhs_pf6 = Scalar(0); \ + lhs_pf7 = Scalar(0); \ + \ + rhs_pf0 = Scalar(0); \ + rhs_pf1 = Scalar(0); \ + rhs_pf2 = Scalar(0); \ + rhs_pf3 = Scalar(0); \ + rhs_pf4 = Scalar(0); \ + rhs_pf5 = Scalar(0); \ + rhs_pf6 = Scalar(0); \ + rhs_pf7 = Scalar(0); \ + \ + if (!needs_edge_check || lhs_vert < m_size) { \ + const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8; \ + const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8; \ + const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8; \ + const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8; \ + const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8; \ + const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8; \ + const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8; \ + const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8; \ + \ + if (!needs_edge_check || lhs_horiz_7 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ + lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \ + } else if (lhs_horiz_6 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ + } else if (lhs_horiz_5 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + 
lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + } else if (lhs_horiz_4 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + } else if (lhs_horiz_3 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + } else if (lhs_horiz_2 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + } else if (lhs_horiz_1 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + } else if (lhs_horiz_0 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + } \ + } \ + \ + const Index rhs_vert = base_k + load_idx_vert; \ + if (!needs_edge_check || rhs_vert < k_size) { \ + const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8; \ + const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8; \ + const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8; \ + const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8; \ + const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8; \ + const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8; \ + const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8; \ + const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8; \ + \ + if (rhs_horiz_7 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ + rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \ + } else if (rhs_horiz_6 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ + } else if (rhs_horiz_5 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + } else if (rhs_horiz_4 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + } else if (rhs_horiz_3 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + } else if (rhs_horiz_2 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + } else if (rhs_horiz_1 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + } else if (rhs_horiz_0 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + } \ + } \ + } \ + +#define writeRegToShmem(_) \ + lhs_shmem[lhs_store_idx_0] = lhs_pf0; \ + 
rhs_shmem[rhs_store_idx_0] = rhs_pf0; \ + \ + lhs_shmem[lhs_store_idx_1] = lhs_pf1; \ + rhs_shmem[rhs_store_idx_1] = rhs_pf1; \ + \ + lhs_shmem[lhs_store_idx_2] = lhs_pf2; \ + rhs_shmem[rhs_store_idx_2] = rhs_pf2; \ + \ + lhs_shmem[lhs_store_idx_3] = lhs_pf3; \ + rhs_shmem[rhs_store_idx_3] = rhs_pf3; \ + \ + lhs_shmem[lhs_store_idx_4] = lhs_pf4; \ + rhs_shmem[rhs_store_idx_4] = rhs_pf4; \ + \ + lhs_shmem[lhs_store_idx_5] = lhs_pf5; \ + rhs_shmem[rhs_store_idx_5] = rhs_pf5; \ + \ + lhs_shmem[lhs_store_idx_6] = lhs_pf6; \ + rhs_shmem[rhs_store_idx_6] = rhs_pf6; \ + \ + lhs_shmem[lhs_store_idx_7] = lhs_pf7; \ + rhs_shmem[rhs_store_idx_7] = rhs_pf7; \ + + // declare and initialize result array +#define res(i, j) _res_##i##j +#define initResultRow(i) \ + Scalar res(i, 0) = Scalar(0); \ + Scalar res(i, 1) = Scalar(0); \ + Scalar res(i, 2) = Scalar(0); \ + Scalar res(i, 3) = Scalar(0); \ + Scalar res(i, 4) = Scalar(0); \ + Scalar res(i, 5) = Scalar(0); \ + Scalar res(i, 6) = Scalar(0); \ + Scalar res(i, 7) = Scalar(0); \ + + initResultRow(0); + initResultRow(1); + initResultRow(2); + initResultRow(3); + initResultRow(4); + initResultRow(5); + initResultRow(6); + initResultRow(7); +#undef initResultRow + + for (Index base_k = 0; base_k < k_size; base_k += 64) { + // wait for previous iteration to finish with shmem. Despite common sense, + // the code is a bit faster with this here then at bottom of loop + __syncthreads(); + + prefetchIntoRegisters(base_k); + writeRegToShmem(); + + #undef prefetchIntoRegisters + #undef writeRegToShmem + + // wait for shared mem packing to be done before starting computation + __syncthreads(); + + // compute 8x8 matrix product by outer product. This involves packing one column + // of LHS and one row of RHS into registers (takes 16 registers). 
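+  // Illustrative sketch of the unrolled computation that follows (using the
+  // res/lcol/rrow register names defined below): computePass(i) loads
+  // lcol(r) = lhs_element(r, i) and rrow(c) = rhs_element(i, c), so the eight
+  // passes are an unrolled form of the rank-1 updates
+  //
+  //   for (int i = 0; i < 8; ++i)      // one computePass per little-k index
+  //     for (int c = 0; c < 8; ++c)    // computeCol(c)
+  //       for (int r = 0; r < 8; ++r)
+  //         res(r, c) += lhs_element(r, i) * rhs_element(i, c);
+  //
+  // with every operand of the per-thread 8x8 accumulator kept in registers.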
+ +#define lcol(i) _lcol##i + Scalar lcol(0); + Scalar lcol(1); + Scalar lcol(2); + Scalar lcol(3); + Scalar lcol(4); + Scalar lcol(5); + Scalar lcol(6); + Scalar lcol(7); + +#define rrow(j) _rrow##j + Scalar rrow(0); + Scalar rrow(1); + Scalar rrow(2); + Scalar rrow(3); + Scalar rrow(4); + Scalar rrow(5); + Scalar rrow(6); + Scalar rrow(7); + + // Now x corresponds to k, y to m, and z to n + const volatile Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y]; + const volatile Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z]; + +#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))] +#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))] + +#define loadData(i, j) \ + lcol(0) = lhs_element(0, j); \ + rrow(0) = rhs_element(i, 0); \ + lcol(1) = lhs_element(1, j); \ + rrow(1) = rhs_element(i, 1); \ + lcol(2) = lhs_element(2, j); \ + rrow(2) = rhs_element(i, 2); \ + lcol(3) = lhs_element(3, j); \ + rrow(3) = rhs_element(i, 3); \ + lcol(4) = lhs_element(4, j); \ + rrow(4) = rhs_element(i, 4); \ + lcol(5) = lhs_element(5, j); \ + rrow(5) = rhs_element(i, 5); \ + lcol(6) = lhs_element(6, j); \ + rrow(6) = rhs_element(i, 6); \ + lcol(7) = lhs_element(7, j); \ + rrow(7) = rhs_element(i, 7); \ + +#define computeCol(j) \ + res(0, j) += lcol(0) * rrow(j); \ + res(1, j) += lcol(1) * rrow(j); \ + res(2, j) += lcol(2) * rrow(j); \ + res(3, j) += lcol(3) * rrow(j); \ + res(4, j) += lcol(4) * rrow(j); \ + res(5, j) += lcol(5) * rrow(j); \ + res(6, j) += lcol(6) * rrow(j); \ + res(7, j) += lcol(7) * rrow(j); \ + +#define computePass(i) \ + loadData(i, i); \ + \ + computeCol(0); \ + computeCol(1); \ + computeCol(2); \ + computeCol(3); \ + computeCol(4); \ + computeCol(5); \ + computeCol(6); \ + computeCol(7); \ + + computePass(0); + computePass(1); + computePass(2); + computePass(3); + computePass(4); + computePass(5); + computePass(6); + computePass(7); + +#undef lcol +#undef rrow +#undef lhs_element +#undef rhs_element +#undef loadData +#undef computeCol +#undef computePass + } // end loop over k + + // we've now iterated over all of the large (ie width 64) k blocks and + // accumulated results in registers. At this point thread (x, y, z) contains + // the sum across all big k blocks of the product of little k block of index (x, y) + // with block of index (y, z). To compute the final output, we need to reduce + // the 8 threads over y by summation. +#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask) + +#define reduceRow(i, mask) \ + shuffleInc(i, 0, mask); \ + shuffleInc(i, 1, mask); \ + shuffleInc(i, 2, mask); \ + shuffleInc(i, 3, mask); \ + shuffleInc(i, 4, mask); \ + shuffleInc(i, 5, mask); \ + shuffleInc(i, 6, mask); \ + shuffleInc(i, 7, mask); \ + +#define reduceMatrix(mask) \ + reduceRow(0, mask); \ + reduceRow(1, mask); \ + reduceRow(2, mask); \ + reduceRow(3, mask); \ + reduceRow(4, mask); \ + reduceRow(5, mask); \ + reduceRow(6, mask); \ + reduceRow(7, mask); \ + + // actually perform the reduction, now each thread of index (_, y, z) + // contains the correct values in its registers that belong in the output + // block + reduceMatrix(1); + reduceMatrix(2); + reduceMatrix(4); + +#undef shuffleInc +#undef reduceRow +#undef reduceMatrix + + // now we need to copy the 64 values into main memory. We can't split work + // among threads because all variables are in registers. 
There's 2 ways + // to do this: + // (1) have 1 thread do 64 writes from registers into global memory + // (2) have 1 thread do 64 writes into shared memory, and then 8 threads + // each do 8 writes into global memory. We can just overwrite the shared + // memory from the problem we just solved. + // (2) is slightly faster than (1) due to less branching and more ILP + + // TODO: won't yield much gain, but could just use currently unused shared mem + // and then we won't have to sync + // wait for shared mem to be out of use + __syncthreads(); + +#define writeResultShmem(i, j) \ + lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); \ + +#define writeRow(i) \ + writeResultShmem(i, 0); \ + writeResultShmem(i, 1); \ + writeResultShmem(i, 2); \ + writeResultShmem(i, 3); \ + writeResultShmem(i, 4); \ + writeResultShmem(i, 5); \ + writeResultShmem(i, 6); \ + writeResultShmem(i, 7); \ + + if (threadIdx.x == 0) { + writeRow(0); + writeRow(1); + writeRow(2); + writeRow(3); + writeRow(4); + writeRow(5); + writeRow(6); + writeRow(7); + } +#undef writeResultShmem +#undef writeRow + + const int max_i_write = (min)((int)((m_size - base_m - threadIdx.y + 7) / 8), 8); + const int max_j_write = (min)((int)((n_size - base_n - threadIdx.z + 7) / 8), 8); + + if (threadIdx.x < max_i_write) { + if (max_j_write == 8) { + // TODO: can i trade bank conflicts for coalesced writes? + Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0]; + Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1]; + Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2]; + Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3]; + Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4]; + Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5]; + Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6]; + Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7]; + + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7; + } else { +#pragma unroll 7 + for (int j = 0; j < max_j_write; j++) { + Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j]; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val; + } + } + } +#undef res +} + + +template<typename Scalar, typename Index, typename LhsMapper, + typename RhsMapper, typename OutputMapper> +__global__ void +__launch_bounds__(512) +EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ volatile Scalar lhs_shmem[72 * 64]; + __shared__ volatile Scalar rhs_shmem[72 * 64]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; 
+ + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size && base_n + 63 < n_size) { + EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } else { + EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } +} + + +template<typename Index, typename LhsMapper, + typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY, + bool CHECK_RHS_BOUNDARY> +__device__ EIGEN_STRONG_INLINE void +EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, float2 lhs_shmem2[][16], + float2 rhs_shmem2[][8], const Index m_size, + const Index n_size, const Index k_size, + const Index base_m, const Index base_n) { + typedef float Scalar; + + // prefetch registers + float4 lhs_pf0, rhs_pf0; + + float4 results[4]; + for (int i = 0; i < 4; i++) { + results[i].x = results[i].y = results[i].z = results[i].w = 0; + } + + +#define prefetch_lhs(reg, row, col) \ + if (!CHECK_LHS_BOUNDARY) { \ + if (col < k_size) { \ + reg =lhs.loadPacket(row, col); \ + } \ + } else { \ + if (col < k_size) { \ + if (row + 3 < m_size) { \ + reg =lhs.loadPacket(row, col); \ + } else if (row + 2 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + reg.z =lhs(row + 2, col); \ + } else if (row + 1 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + } else if (row < m_size) { \ + reg.x =lhs(row + 0, col); \ + } \ + } \ + } \ + + + Index lhs_vert = base_m+threadIdx.x*4; + + for (Index k = 0; k < k_size; k += 16) { + lhs_pf0 = internal::pset1<float4>(0); + rhs_pf0 = internal::pset1<float4>(0); + + Index lhs_horiz = threadIdx.y+k; + prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz) + + Index rhs_vert = k+(threadIdx.x%4)*4; + Index rhs_horiz0 = (threadIdx.x>>2)+threadIdx.y*4+base_n; + + if (!CHECK_RHS_BOUNDARY) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if (rhs_vert + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } else { + if (rhs_horiz0 < n_size) { + if ((rhs_vert + 3) < k_size) { + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + } else if ((rhs_vert + 2) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if ((rhs_vert + 1) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } + } + float x1, x2 ; + // the following can be a bitwise operation..... some day. + if((threadIdx.x%8) < 4) { + x1 = rhs_pf0.y; + x2 = rhs_pf0.w; + } else { + x1 = rhs_pf0.x; + x2 = rhs_pf0.z; + } + x1 = __shfl_xor(x1, 4); + x2 = __shfl_xor(x2, 4); + if((threadIdx.x%8) < 4) { + rhs_pf0.y = x1; + rhs_pf0.w = x2; + } else { + rhs_pf0.x = x1; + rhs_pf0.z = x2; + } + + // We have 64 features. + // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1. 
+ // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3. + // ... + // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63 + // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1 + // ... + rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2][threadIdx.x%8] = make_float2(rhs_pf0.x, rhs_pf0.y); + rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2+32][threadIdx.x%8] = make_float2(rhs_pf0.z, rhs_pf0.w); + + // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // ... + // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) + // ... + + lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y); + lhs_shmem2[threadIdx.y+16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w); + + +#define add_vals(fl1, fl2, fr1, fr2)\ + results[0].x += fl1.x * fr1.x;\ + results[0].y += fl1.y * fr1.x;\ + results[0].z += fl2.x * fr1.x;\ + results[0].w += fl2.y * fr1.x;\ +\ + results[1].x += fl1.x * fr1.y;\ + results[1].y += fl1.y * fr1.y;\ + results[1].z += fl2.x * fr1.y;\ + results[1].w += fl2.y * fr1.y;\ +\ + results[2].x += fl1.x * fr2.x;\ + results[2].y += fl1.y * fr2.x;\ + results[2].z += fl2.x * fr2.x;\ + results[2].w += fl2.y * fr2.x;\ +\ + results[3].x += fl1.x * fr2.y;\ + results[3].y += fl1.y * fr2.y;\ + results[3].z += fl2.x * fr2.y;\ + results[3].w += fl2.y * fr2.y;\ + + __syncthreads(); + + // Do the multiplies. + #pragma unroll + for (int koff = 0; koff < 16; koff ++) { + // 32 x threads. + float2 fl1 = lhs_shmem2[koff][threadIdx.x]; + float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x]; + + int start_feature = threadIdx.y * 4; + float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; + float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; + + add_vals(fl1, fl2, fr1, fr2) + } + __syncthreads(); + } + +#undef prefetch_lhs +#undef add_vals + + Index horiz_base = threadIdx.y*4+base_n; + if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (!CHECK_RHS_BOUNDARY) { + // CHECK LHS + if (lhs_vert + 3 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (lhs_vert + 2 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + } + } else if (lhs_vert + 1 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + } + } else if (lhs_vert < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + } + } + } else if (!CHECK_LHS_BOUNDARY) { + // CHECK RHS + /* + int ncols_rem = fminf(n_size- horiz_base, 4); + for (int i = 0; i < ncols_rem; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + 
output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + }*/ + for (int i = 0; i < 4; i++) { + if (horiz_base+i < n_size) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } else { + // CHECK both boundaries. + for (int i = 0; i < 4; i++) { + if (horiz_base+i < n_size) { + if (lhs_vert < m_size) + output(lhs_vert, horiz_base + i) = results[i].x; + if (lhs_vert + 1 < m_size) + output(lhs_vert + 1, horiz_base + i) = results[i].y; + if (lhs_vert + 2 < m_size) + output(lhs_vert + 2, horiz_base + i) = results[i].z; + if (lhs_vert + 3 < m_size) + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } +} + + +template<typename Index, typename LhsMapper, + typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY, + bool CHECK_RHS_BOUNDARY> +__device__ EIGEN_ALWAYS_INLINE void +EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, float2 lhs_shmem2[][32], + float2 rhs_shmem2[][8], const Index m_size, + const Index n_size, const Index k_size, + const Index base_m, const Index base_n) { + typedef float Scalar; + + // prefetch registers + float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3; + float4 rhs_pf0, rhs_pf1; + + float4 results[8]; + for (int i=0; i < 8; i++) { + results[i].x = results[i].y = results[i].z = results[i].w = 0; + } + + + Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32; + for (Index k = 0; k < k_size; k += 32) { + lhs_pf0 = internal::pset1<float4>(0); + lhs_pf1 = internal::pset1<float4>(0); + lhs_pf2 = internal::pset1<float4>(0); + lhs_pf3 = internal::pset1<float4>(0); + + rhs_pf0 = internal::pset1<float4>(0); + rhs_pf1 = internal::pset1<float4>(0); + + if (!CHECK_LHS_BOUNDARY) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + } + } else { + // just CHECK_LHS_BOUNDARY + if (lhs_vert + 3 < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + } + } else if (lhs_vert + 2 < m_size) { + if 
((threadIdx.y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); + lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24)); + lhs_pf3.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + } + } else if (lhs_vert + 1 < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); + lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + } + } else if (lhs_vert < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + 
lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + } + } + } + __syncthreads(); + Index rhs_vert = k+threadIdx.x*4; + Index rhs_horiz0 = threadIdx.y*2+base_n; + Index rhs_horiz1 = threadIdx.y*2+1+base_n; + if (!CHECK_RHS_BOUNDARY) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + } else if (rhs_vert + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + } + } else { + if (rhs_horiz1 < n_size) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + } else if (k+threadIdx.x*4 + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + } else if (k+threadIdx.x*4 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + } + } else if (rhs_horiz0 < n_size) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + } else if ((rhs_vert + 2) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if ((rhs_vert + 1) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } + } + __syncthreads(); + // Loaded. Do computation + // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1. + // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3. + // .. + // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63 + rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x); + // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1. + // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3. + // .. + rhs_shmem2[threadIdx.y+32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y); + // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1. + // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3. + rhs_shmem2[threadIdx.y+64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z); + // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1. + // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3. + rhs_shmem2[threadIdx.y+96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w); + + // LHS. 
+ // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) + // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) + // ... + // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) + // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) + + +#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\ + results[0].x += a_feat1.x * f1.x;\ + results[1].x += a_feat1.x * f1.y;\ + results[2].x += a_feat1.x * f2.x;\ + results[3].x += a_feat1.x * f2.y;\ + results[4].x += a_feat1.x * f3.x;\ + results[5].x += a_feat1.x * f3.y;\ + results[6].x += a_feat1.x * f4.x;\ + results[7].x += a_feat1.x * f4.y;\ +\ + results[0].y += a_feat1.y * f1.x;\ + results[1].y += a_feat1.y * f1.y;\ + results[2].y += a_feat1.y * f2.x;\ + results[3].y += a_feat1.y * f2.y;\ + results[4].y += a_feat1.y * f3.x;\ + results[5].y += a_feat1.y * f3.y;\ + results[6].y += a_feat1.y * f4.x;\ + results[7].y += a_feat1.y * f4.y;\ +\ + results[0].z += a_feat2.x * f1.x;\ + results[1].z += a_feat2.x * f1.y;\ + results[2].z += a_feat2.x * f2.x;\ + results[3].z += a_feat2.x * f2.y;\ + results[4].z += a_feat2.x * f3.x;\ + results[5].z += a_feat2.x * f3.y;\ + results[6].z += a_feat2.x * f4.x;\ + results[7].z += a_feat2.x * f4.y;\ +\ + results[0].w += a_feat2.y * f1.x;\ + results[1].w += a_feat2.y * f1.y;\ + results[2].w += a_feat2.y * f2.x;\ + results[3].w += a_feat2.y * f2.y;\ + results[4].w += a_feat2.y * f3.x;\ + results[5].w += a_feat2.y * f3.y;\ + results[6].w += a_feat2.y * f4.x;\ + results[7].w += a_feat2.y * f4.y;\ + + lhs_shmem2[threadIdx.y/4][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y); + lhs_shmem2[threadIdx.y/4+8][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y); + lhs_shmem2[threadIdx.y/4+16][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y); + lhs_shmem2[threadIdx.y/4+24][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y); + + lhs_shmem2[threadIdx.y/4 + 32][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w); + lhs_shmem2[threadIdx.y/4 + 40][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w); + lhs_shmem2[threadIdx.y/4 + 48][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w); + lhs_shmem2[threadIdx.y/4 + 56][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w); + + __syncthreads(); + + // Do the multiplies. + #pragma unroll + for (int koff = 0; koff < 32; koff ++) { + float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8]; + float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8]; + + // first feature is at (threadIdx.y/4) * 8 last is at start + 8. 
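+    // (Equivalently: this thread accumulates output features start_feature
+    //  .. start_feature + 7 for its four output rows; br1..br4 below fetch
+    //  those eight features from rhs_shmem2 two at a time.)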
+ int start_feature = (threadIdx.y / 4) * 8; + + float2 br1 = rhs_shmem2[start_feature/2 + (koff % 4) * 32][koff/4]; + float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4]; + float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4]; + float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4]; + + add_vals(a3, a4, br1, br2, br3, br4) + } + __syncthreads(); + } // end loop over k + + + __syncthreads(); + Index horiz_base = (threadIdx.y/4)*8+base_n; + if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { + #pragma unroll + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (!CHECK_RHS_BOUNDARY) { + if (lhs_vert + 3 < m_size) { + #pragma unroll + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (lhs_vert + 2 < m_size) { + #pragma unroll + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + } + } else if (lhs_vert + 1 < m_size) { + #pragma unroll + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + } + } else if (lhs_vert < m_size) { + #pragma unroll + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + } + } + } else if (!CHECK_LHS_BOUNDARY) { + // CHECK BOUNDARY_B + #pragma unroll + for (int i = 0; i < 8; i++) { + if (horiz_base + i < n_size) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } else { + // CHECK both boundaries. 
+ #pragma unroll + for (int i = 0; i < 8; i++) { + if (horiz_base + i < n_size) { + if (lhs_vert < m_size) + output(lhs_vert, horiz_base + i) = results[i].x; + if (lhs_vert + 1 < m_size) + output(lhs_vert + 1, horiz_base + i) = results[i].y; + if (lhs_vert + 2 < m_size) + output(lhs_vert + 2, horiz_base + i) = results[i].z; + if (lhs_vert + 3 < m_size) + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } +} + + +template<typename Index, typename LhsMapper, + typename RhsMapper, typename OutputMapper> +__global__ void +__launch_bounds__(256) +EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ float2 lhs_shmem[64*32]; + __shared__ float2 rhs_shmem[128*8]; + + typedef float2 LHS_MEM[64][32]; + typedef float2 RHS_MEM[128][8]; + + typedef float2 LHS_MEM16x16[32][16]; + typedef float2 RHS_MEM16x16[64][8]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 128 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + const bool check_rhs = (base_n + 63) >= n_size; + const bool check_lhs128 = (base_m + 127) >= m_size; + + if (!check_rhs) { + if (!check_lhs128) { + // >= 128 rows left + EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, false>( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, false>( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } + } else { + if (!check_lhs128) { + // >= 128 rows left + EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, true>( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, true>( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } + } +} + +template<typename Index, typename LhsMapper, + typename RhsMapper, typename OutputMapper> +__global__ void +__launch_bounds__(256) +EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ float2 lhs_shmem[32][16]; + __shared__ float2 rhs_shmem[64][8]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size) { + if (base_n + 63 < n_size) { + EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } + } else { + if (base_n + 63 < n_size) { + EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(lhs, rhs, output, 
lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } + } +} + + +template<typename Indices, typename LeftArgType, typename RightArgType> +struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> : + public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> > { + + typedef GpuDevice Device; + + typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self; + typedef TensorContractionEvaluatorBase<Self> Base; + + typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType; + typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType; + + enum { + Layout = TensorEvaluator<LeftArgType, Device>::Layout, + }; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef typename internal::conditional< + static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value; + static const int RDims = + internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value; + static const int ContractDims = internal::array_size<Indices>::value; + + typedef array<Index, LDims> left_dim_mapper_t; + typedef array<Index, RDims> right_dim_mapper_t; + + typedef array<Index, ContractDims> contract_t; + typedef array<Index, LDims - ContractDims> left_nocontract_t; + typedef array<Index, RDims - ContractDims> right_nocontract_t; + + static const int NumDims = LDims + RDims - 2 * ContractDims; + + typedef DSizes<Index, NumDims> Dimensions; + + // typedefs needed in evalTo + typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar; + typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar; + + typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator; + typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator; + + typedef typename LeftEvaluator::Dimensions LeftDimensions; + typedef typename RightEvaluator::Dimensions RightDimensions; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) {} + + // We need to redefine this method to make nvcc happy + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + this->m_leftImpl.evalSubExprsIfNeeded(NULL); + this->m_rightImpl.evalSubExprsIfNeeded(NULL); + if (data) { + evalTo(data); + return false; + } else { + this->m_result = static_cast<Scalar *>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar))); + evalTo(this->m_result); + return true; + } + } + + void evalTo(Scalar* buffer) const { + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped<true, true, true, Unaligned>(buffer); + } + else { + evalTyped<true, true, false, 
Unaligned>(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped<true, false, true, Unaligned>(buffer); + } + else { + evalTyped<true, false, false, Unaligned>(buffer); + } + } + } + else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped<false, true, true, Unaligned>(buffer); + } + else { + evalTyped<false, true, false, Unaligned>(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped<false, false, true, Unaligned>(buffer); + } + else { + evalTyped<false, false, false, Unaligned>(buffer); + } + } + } + } + + template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> + void evalTyped(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, + LeftEvaluator, left_nocontract_t, + contract_t, 4, + lhs_inner_dim_contiguous, + false, Unaligned> LhsMapper; + + typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, + RightEvaluator, right_nocontract_t, + contract_t, 4, + rhs_inner_dim_contiguous, + rhs_inner_dim_reordered, Unaligned> RhsMapper; + + typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper; + + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + setCudaSharedMemConfig(cudaSharedMemBankSizeEightByte); + if (internal::is_same<LhsScalar, float>::value && + internal::is_same<RhsScalar, float>::value) { + if (m < 768 || n < 768) { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(16, 16, 1); + LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k); + } else { + const Index m_blocks = (m + 127) / 128; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 32, 1); + LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k); + } + } else { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 8, 8); + LAUNCH_CUDA_KERNEL((EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k); + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_USE_GPU and __CUDACC__ +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMappers.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMappers.h new file mode 100644 index 0000000000..b5b09bf41e --- 
/dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMappers.h @@ -0,0 +1,383 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Eric Martin <eric@ericmart.in> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPERS_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPERS_H + +// NOTE: The file has strong column major bias/assumptions, which is pointed out +// in comments. As of right now, this code will only work the column major packing +// routines. + +/* + * A tensor contraction can be represented by a matrix multiplication. We don't + * want to actually reshape the tensor into a matrix (because this involves a + * full copy of the tensor), so the reshaping operation is implicit in a sense. + * This means we need a collection of methods take a matrix index and return + * the element of the tensor that would be at that index if we were to actually + * reshape the matrix. This file consists of these methods. + */ + +namespace Eigen { +namespace internal { + +enum { + Rhs = 0, + Lhs = 1, +}; + +/* + * Used to lookup the tensor index when working with the left and right + * arguments to a tensor contraction. + */ +template<typename Scalar, typename Index, int side, + typename Tensor, + typename nocontract_t, typename contract_t, + size_t packet_size, bool inner_dim_contiguous> +class SimpleTensorContractionMapper { + public: + EIGEN_DEVICE_FUNC + SimpleTensorContractionMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) : + m_tensor(tensor), + m_nocontract_strides(nocontract_strides), + m_ij_strides(ij_strides), + m_contract_strides(contract_strides), + m_k_strides(k_strides) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void prefetch(int i) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar operator()(Index row) const { + // column major assumption + return operator()(row, 0); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const { + return m_tensor.coeff(computeIndex(row, col)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const { + const bool left = (side == Lhs); + Index nocontract_val = left ? row : col; + Index linidx = 0; + for (int i = array_size<nocontract_t>::value - 1; i > 0; i--) { + const Index idx = nocontract_val / m_ij_strides[i]; + linidx += idx * m_nocontract_strides[i]; + nocontract_val -= idx * m_ij_strides[i]; + } + if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) { + if (side == Lhs && inner_dim_contiguous) { + eigen_assert(m_nocontract_strides[0] == 1); + linidx += nocontract_val; + } else { + linidx += nocontract_val * m_nocontract_strides[0]; + } + } + + Index contract_val = left ? 
col : row; + for (int i = array_size<contract_t>::value - 1; i > 0; i--) { + const Index idx = contract_val / m_k_strides[i]; + linidx += idx * m_contract_strides[i]; + contract_val -= idx * m_k_strides[i]; + } + EIGEN_STATIC_ASSERT(array_size<contract_t>::value > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + if (side == Rhs && inner_dim_contiguous) { + eigen_assert(m_contract_strides[0] == 1); + linidx += contract_val; + } else { + linidx += contract_val * m_contract_strides[0]; + } + + return linidx; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE IndexPair<Index> computeIndexPair(Index row, Index col, const Index distance) const { + const bool left = (side == Lhs); + Index nocontract_val[2] = {left ? row : col, left ? row + distance : col}; + Index linidx[2] = {0, 0}; + for (int i = array_size<nocontract_t>::value - 1; i > 0; i--) { + const Index idx0 = nocontract_val[0] / m_ij_strides[i]; + const Index idx1 = nocontract_val[1] / m_ij_strides[i]; + linidx[0] += idx0 * m_nocontract_strides[i]; + linidx[1] += idx1 * m_nocontract_strides[i]; + nocontract_val[0] -= idx0 * m_ij_strides[i]; + nocontract_val[1] -= idx1 * m_ij_strides[i]; + } + if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) { + if (side == Lhs && inner_dim_contiguous) { + eigen_assert(m_nocontract_strides[0] == 1); + linidx[0] += nocontract_val[0]; + linidx[1] += nocontract_val[1]; + } else { + linidx[0] += nocontract_val[0] * m_nocontract_strides[0]; + linidx[1] += nocontract_val[1] * m_nocontract_strides[0]; + } + } + + Index contract_val[2] = {left ? col : row, left ? col : row + distance}; + for (int i = array_size<contract_t>::value - 1; i > 0; i--) { + const Index idx0 = contract_val[0] / m_k_strides[i]; + const Index idx1 = contract_val[1] / m_k_strides[i]; + linidx[0] += idx0 * m_contract_strides[i]; + linidx[1] += idx1 * m_contract_strides[i]; + contract_val[0] -= idx0 * m_k_strides[i]; + contract_val[1] -= idx1 * m_k_strides[i]; + } + EIGEN_STATIC_ASSERT(array_size<contract_t>::value > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + if (side == Rhs && inner_dim_contiguous) { + eigen_assert(m_contract_strides[0] == 1); + linidx[0] += contract_val[0]; + linidx[1] += contract_val[1]; + } else { + linidx[0] += contract_val[0] * m_contract_strides[0]; + linidx[1] += contract_val[1] * m_contract_strides[0]; + } + return IndexPair<Index>(linidx[0], linidx[1]); + } + + Index firstAligned(Index size) const { + return size; + } + Index stride() const { + return 1; + } + + protected: + const Tensor m_tensor; + const nocontract_t m_nocontract_strides; + const nocontract_t m_ij_strides; + const contract_t m_contract_strides; + const contract_t m_k_strides; +}; + + + +template<typename Scalar, typename Index, int side, + typename Tensor, + typename nocontract_t, typename contract_t, + size_t packet_size, bool inner_dim_contiguous, + bool inner_dim_reordered, int Alignment> + class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous> +{ + public: + typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous> ParentMapper; + + EIGEN_DEVICE_FUNC + BaseTensorContractionMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) : + ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + typedef typename 
packet_traits<Scalar>::type Packet; + typedef typename packet_traits<Scalar>::half HalfPacket; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { + // whole method makes column major assumption + + // don't need to add offsets for now (because operator handles that) + // current code assumes packet size must be a multiple of 2 + EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + + if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) { + const Index index = this->computeIndex(i, j); + eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1); + return this->m_tensor.template packet<Alignment>(index); + } + + const IndexPair<Index> indexPair = this->computeIndexPair(i, j, packet_size - 1); + const Index first = indexPair.first; + const Index last = indexPair.second; + + // We can always do optimized packet reads from left hand side right now, because + // the vertical matrix dimension on the left hand side is never contracting. + // On the right hand side we need to check if the contracting dimensions may have + // been shuffled first. + if (Tensor::PacketAccess && + (side == Lhs || internal::array_size<contract_t>::value <= 1 || !inner_dim_reordered) && + (last - first) == (packet_size - 1)) { + + return this->m_tensor.template packet<Alignment>(first); + } + + EIGEN_ALIGN_DEFAULT Scalar data[packet_size]; + + data[0] = this->m_tensor.coeff(first); + for (Index k = 1; k < packet_size - 1; k += 2) { + const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1); + data[k] = this->m_tensor.coeff(internal_pair.first); + data[k + 1] = this->m_tensor.coeff(internal_pair.second); + } + data[packet_size - 1] = this->m_tensor.coeff(last); + + return pload<Packet>(data); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { + // whole method makes column major assumption + + // don't need to add offsets for now (because operator handles that) + const Index half_packet_size = unpacket_traits<HalfPacket>::size; + if (half_packet_size == packet_size) { + return loadPacket(i, j); + } + EIGEN_ALIGN_DEFAULT Scalar data[half_packet_size]; + for (Index k = 0; k < half_packet_size; k++) { + data[k] = operator()(i + k, j); + } + return pload<HalfPacket>(data); + } +}; + + +template<typename Scalar, typename Index, int side, + typename Tensor, + typename nocontract_t, typename contract_t, + bool inner_dim_contiguous, + bool inner_dim_reordered, int Alignment> +class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment> : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous> +{ + public: + typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous> ParentMapper; + + EIGEN_DEVICE_FUNC + BaseTensorContractionMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) : + ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + typedef typename packet_traits<Scalar>::type Packet; + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { + EIGEN_ALIGN_DEFAULT Scalar data[1]; + data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); + return pload<typename 
packet_traits<Scalar>::type>(data); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const { + return loadPacket(i, j); + } +}; + +template<typename Scalar, typename Index, int side, + typename Tensor, + typename nocontract_t, typename contract_t, + size_t packet_size, + bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment> +class TensorContractionInputMapper; + +template<typename Scalar, typename Index, int side, + typename Tensor, + typename nocontract_t, typename contract_t, + size_t packet_size, + bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment> +class TensorContractionSubMapper { + public: + typedef typename packet_traits<Scalar>::type Packet; + typedef typename packet_traits<Scalar>::half HalfPacket; + + typedef TensorContractionInputMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> ParentMapper; + typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Self; + typedef Self LinearMapper; + + EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset) + : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) { } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + return m_base_mapper(i + m_vert_offset, m_horiz_offset); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const { + return m_base_mapper(i + m_vert_offset, j + m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + return m_base_mapper.loadPacket(i + m_vert_offset, m_horiz_offset); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { + return m_base_mapper.loadPacket(i + m_vert_offset, j + m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { + return m_base_mapper.loadHalfPacket(i + m_vert_offset, m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { + m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset); + } + + template <typename PacketT, int AlignmentType> + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const { + EIGEN_STATIC_ASSERT((internal::is_same<PacketT, Packet>::value), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((AlignmentType == Aligned || Alignment == Unaligned), YOU_MADE_A_PROGRAMMING_MISTAKE); + return loadPacket(i); + } + + template <typename Packet> + EIGEN_DEVICE_FUNC bool aligned(Index i) const { + return false; + } + + private: + const ParentMapper& m_base_mapper; + const Index m_vert_offset; + const Index m_horiz_offset; +}; + + +template<typename Scalar, typename Index, int side, + typename Tensor, + typename nocontract_t, typename contract_t, + size_t packet_size, + bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment> +class TensorContractionInputMapper + : public BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> { + + public: + typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, 
contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Base; + typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper; + typedef SubMapper VectorMapper; + + EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) + : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { + return SubMapper(*this, i, j); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { + return VectorMapper(*this, i, j); + } +}; + + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPERS_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h new file mode 100644 index 0000000000..c335086902 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -0,0 +1,713 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H + +namespace Eigen { +namespace internal { + +// Specify blocking strategy for thread pool by cols +template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index> +struct ComputeGemmByColBlockingSizes { + void operator()(Index& k, Index& m, Index& n, Index num_threads = 1) + { + computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n, num_threads); + } +}; + +// Specify blocking strategy for thread pool by rows +template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index> +struct ComputeGemmByRowBlockingSizes { + void operator()(Index& k, Index& m, Index& n, Index num_threads = 1) + { + if (!k || !m || !n) { + return; + } + m = (((m / num_threads) + 15) / 16) * 16; + } +}; + +} // namespace internal +} // namespace Eigen + +// evaluator for thread pool device +#ifdef EIGEN_USE_THREADS + +namespace Eigen { +namespace internal { + +template<typename LhsScalar, typename LhsMapper, typename Index> +struct packLhsArg { + LhsScalar* blockA; + const LhsMapper& lhs; + const Index m_start; + const Index k_start; + const Index mc; + const Index kc; +}; + +template<typename LhsScalar, typename RhsScalar, typename RhsMapper, typename OutputMapper, typename Index> +struct packRhsAndKernelArg { + const FixedSizeVector<LhsScalar*>* blockAs; + RhsScalar* blockB; + const RhsMapper& rhs; + OutputMapper& output; + const Index m; + const Index k; + const Index n; + const Index mc; + const Index kc; + const Index nc; + const Index num_threads; + const Index num_blockAs; + const Index max_m; + const Index k_block_idx; + const Index m_block_idx; + const Index n_block_idx; + const Index m_blocks; + const Index n_blocks; + FixedSizeVector<Notification*>* kernel_notifications; + 
const FixedSizeVector<Notification*>* lhs_notifications; + const bool need_to_pack; +}; + +template<typename RhsScalar, typename RhsMapper, typename Index> +struct packRhsArg { + RhsScalar* blockB; + const RhsMapper& rhs; + const Index n_start; + const Index k_start; + const Index nc; + const Index kc; +}; + +template<typename LhsScalar, typename RhsScalar, typename LhsMapper, typename OutputMapper, typename Index> +struct packLhsAndKernelArg { + const FixedSizeVector<RhsScalar*>* blockBs; + LhsScalar* blockA; + const LhsMapper& lhs; + OutputMapper& output; + const Index m; + const Index k; + const Index n; + const Index mc; + const Index kc; + const Index nc; + const Index num_threads; + const Index num_blockBs; + const Index max_n; + const Index k_block_idx; + const Index m_block_idx; + const Index n_block_idx; + const Index m_blocks; + const Index n_blocks; + FixedSizeVector<Notification*>* kernel_notifications; + const FixedSizeVector<Notification*>* rhs_notifications; + const bool need_to_pack; +}; + +} // end namespace internal + + +template<typename Indices, typename LeftArgType, typename RightArgType> +struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> : + public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> > { + + typedef ThreadPoolDevice Device; + + typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self; + typedef TensorContractionEvaluatorBase<Self> Base; + + typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType; + typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, ThreadPoolDevice>::type PacketReturnType; + + enum { + Layout = TensorEvaluator<LeftArgType, Device>::Layout, + }; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. 
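The RowMajor/ColMajor "cheat" described in the comment above can be checked with a small standalone sketch (illustration only, written against the dense Eigen API rather than this evaluator): reinterpreting a RowMajor result buffer as ColMajor yields the transpose, and since (A*B)^T = B^T * A^T, running the ColMajor code path with the operands swapped writes exactly the RowMajor result.

#include <Eigen/Dense>
#include <cassert>

int main() {
  typedef Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> RowM;
  typedef Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor> ColM;
  RowM A = RowM::Random(3, 4);
  RowM B = RowM::Random(4, 5);
  RowM C = A * B;  // the RowMajor result we actually want

  // "Cheat" as the contraction code does: pretend B is the LHS and A is the
  // RHS and evaluate in ColMajor. The product equals C^T.
  ColM D = ColM(B).transpose() * ColM(A).transpose();

  // The ColMajor memory of C^T is exactly the RowMajor memory of C.
  assert(Eigen::Map<ColM>(C.data(), 5, 3).isApprox(D));
  return 0;
}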
+ typedef typename internal::conditional< + static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value; + static const int RDims = + internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value; + static const int ContractDims = internal::array_size<Indices>::value; + + typedef array<Index, LDims> left_dim_mapper_t; + typedef array<Index, RDims> right_dim_mapper_t; + + typedef array<Index, ContractDims> contract_t; + typedef array<Index, LDims - ContractDims> left_nocontract_t; + typedef array<Index, RDims - ContractDims> right_nocontract_t; + + static const int NumDims = LDims + RDims - 2 * ContractDims; + + typedef DSizes<Index, NumDims> Dimensions; + + // typedefs needed in evalTo + typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar; + typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar; + typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits; + + typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator; + typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator; + + TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) {} + + template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> + void evalProduct(Scalar* buffer) const { + // Disable Gemv on ARM/AVX or if multiple threads are in use +#if !defined(EIGEN_VECTORIZE_NEON) && !defined(EIGEN_VECTORIZE_AVX) + if (this->m_j_size == 1 && this->m_device.numThreads() == 1) { + this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer); + return; + } +#endif + + if (this->m_j_size / this->m_device.numThreads() < Traits::nr && + this->m_i_size / this->m_device.numThreads() >= Traits::mr) { + evalGemmByRows<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer); + } else { + evalGemmByCols<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer); + } + } + + template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> + void evalGemmByCols(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + + const int lhs_packet_size = PacketType<LhsScalar, Device>::size; + const int rhs_packet_size = PacketType<RhsScalar, Device>::size; + + typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, + LeftEvaluator, left_nocontract_t, + contract_t, lhs_packet_size, + lhs_inner_dim_contiguous, + false, Unaligned> LhsMapper; + + typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, + RightEvaluator, right_nocontract_t, + contract_t, rhs_packet_size, + rhs_inner_dim_contiguous, + rhs_inner_dim_reordered, Unaligned> RhsMapper; + + typedef internal::blas_data_mapper<Scalar, 
Index, ColMajor> OutputMapper; + + // TODO: packing could be faster sometimes if we supported row major tensor mappers + typedef internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, Traits::mr, + Traits::LhsProgress, ColMajor> LhsPacker; + typedef internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor> RhsPacker; + + // TODO: replace false, false with conjugate values? + typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, + Traits::mr, Traits::nr, false, false> GebpKernel; + + typedef internal::packLhsArg<LhsScalar, LhsMapper, Index> packLArg; + typedef internal::packRhsAndKernelArg<LhsScalar, RhsScalar, RhsMapper, OutputMapper, Index> packRKArg; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + LhsPacker pack_lhs; + + // compute block sizes (which depend on number of threads) + const Index num_threads = this->m_device.numThreads(); + Index mc = m; + Index nc = n; + Index kc = k; + internal::ComputeGemmByColBlockingSizes<LhsScalar,RhsScalar,1,Index> block; + block(kc, mc, nc, num_threads); + eigen_assert(mc <= m); + eigen_assert(nc <= n); + eigen_assert(kc <= k); + +#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b)) + const Index k_blocks = CEIL_DIV(k, kc); + const Index n_blocks = CEIL_DIV(n, nc); + const Index m_blocks = CEIL_DIV(m, mc); +#undef CEIL_DIV + + const int sizeA = mc * kc; + const int sizeB = kc * nc; + + /* cout << "m: " << m << " n: " << n << " k: " << k << endl; + cout << "mc: " << mc << " nc: " << nc << " kc: " << kc << endl; + cout << "m_blocks: " << m_blocks << " n_blocks: " << n_blocks << " k_blocks: " << k_blocks << endl; + cout << "num threads: " << num_threads << endl; + */ + + // note: m_device.allocate should return 16 byte aligned pointers, but if blockA and blockB + // aren't 16 byte aligned segfaults will happen due to SIMD instructions + // note: You can get away with allocating just a single blockA and offsets and meet the + // the alignment requirements with the assumption that + // (Traits::mr * sizeof(ResScalar)) % 16 == 0 + const Index numBlockAs = (std::min)(num_threads, m_blocks); + FixedSizeVector<LhsScalar *> blockAs(num_threads); + for (int i = 0; i < num_threads; i++) { + blockAs.push_back(static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar)))); + } + + // To circumvent alignment issues, I'm just going to separately allocate the memory for each thread + // TODO: is this too much memory to allocate? This simplifies coding a lot, but is wasteful. + // Other options: (1) reuse memory when a thread finishes. con: tricky + // (2) allocate block B memory in each thread. 
con: overhead + FixedSizeVector<RhsScalar *> blockBs(n_blocks); + for (int i = 0; i < n_blocks; i++) { + blockBs.push_back(static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar)))); + } + + // lhs_notifications starts with all null Notifications + FixedSizeVector<Notification*> lhs_notifications(num_threads, nullptr); + + // this should really be numBlockAs * n_blocks; + const Index num_kernel_notifications = num_threads * n_blocks; + FixedSizeVector<Notification*> kernel_notifications(num_kernel_notifications, + nullptr); + + for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) { + const Index k_start = k_block_idx * kc; + // make sure we don't overshoot right edge of left matrix + const Index actual_kc = (std::min)(k_start + kc, k) - k_start; + + for (Index m_block_idx = 0; m_block_idx < m_blocks; m_block_idx += numBlockAs) { + const int num_blocks = (std::min)(m_blocks-m_block_idx, numBlockAs); + + for (Index mt_block_idx = m_block_idx; mt_block_idx < m_block_idx+num_blocks; mt_block_idx++) { + const Index m_start = mt_block_idx * mc; + const Index actual_mc = (std::min)(m_start + mc, m) - m_start; + eigen_assert(actual_mc > 0); + + int blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads; + + // Wait for previous RHS kernels to complete. + for (int i = 0; i < n_blocks; ++i) { + int notification_id = (blockAId * n_blocks + i); + + // Wait for any current kernels using this slot to complete + // before using it. + if (kernel_notifications[notification_id]) { + wait_until_ready(kernel_notifications[notification_id]); + delete kernel_notifications[notification_id]; + } + kernel_notifications[notification_id] = new Notification(); + } + const packLArg arg = { + blockAs[blockAId], // blockA + lhs, // lhs + m_start, // m + k_start, // k + actual_mc, // mc + actual_kc, // kc + }; + + // Delete any existing notification since we may be + // replacing it. The algorithm should ensure that there are + // no existing waiters on this notification. + delete lhs_notifications[blockAId]; + lhs_notifications[blockAId] = + this->m_device.enqueue(&Self::packLhs<packLArg, LhsPacker>, arg); + } + + // now start kernels. + const Index m_base_start = m_block_idx * mc; + const bool need_to_pack = m_block_idx == 0; + + for (Index n_block_idx = 0; n_block_idx < n_blocks; n_block_idx++) { + const Index n_start = n_block_idx * nc; + const Index actual_nc = (std::min)(n_start + nc, n) - n_start; + + // first make sure the previous kernels are all done before overwriting rhs. Also wait if + // we're going to start new k. In both cases need_to_pack is true. + if (need_to_pack) { + for (int i = num_blocks; i < num_threads; ++i) { + Index blockAId = (k_block_idx * m_blocks + i + m_block_idx) % num_threads; + Index future_id = (blockAId * n_blocks + n_block_idx); + wait_until_ready(kernel_notifications[future_id]); + } + } + + packRKArg arg = { + &blockAs, // blockA + blockBs[n_block_idx], // blockB + rhs, // rhs + output, // output + m_base_start, // m + k_start, // k + n_start, // n + mc, // mc + actual_kc, // kc + actual_nc, // nc + num_threads, + numBlockAs, + m, + k_block_idx, + m_block_idx, + n_block_idx, // n_block_idx + m_blocks, // m_blocks + n_blocks, // n_blocks + &kernel_notifications, // kernel_notifications + &lhs_notifications, // lhs_notifications + need_to_pack, // need_to_pack + }; + + // We asynchronously kick off this function, which ends up + // notifying the appropriate kernel_notifications objects, + // which this thread waits on before exiting. 
+ // + // The wait for kernel_notifications below ensures that we + // don't have to keep track of the launch of this work. + this->m_device.enqueue_and_forget(&Self::packRhsAndKernel<packRKArg, RhsPacker, GebpKernel>, arg); + } + } + } + + // Make sure all the kernels are done. + for (int i = 0; i < kernel_notifications.size(); ++i) { + wait_until_ready(kernel_notifications[i]); + delete kernel_notifications[i]; + } + + // No need to wait for lhs notifications since they should have + // already been waited on. Just clean them up. + for (int i = 0; i < lhs_notifications.size(); ++i) { + delete lhs_notifications[i]; + } + + // deallocate all of the memory for both A and B's + for (int i = 0; i < blockAs.size(); i++) { + this->m_device.deallocate(blockAs[i]); + } + for (int i = 0; i < blockBs.size(); i++) { + this->m_device.deallocate(blockBs[i]); + } + } + + /* + * Packs a LHS block of size (mt, kc) starting at lhs(m, k). Before packing + * the LHS block, check that all of the kernels that worked on the same + * mt_block_idx in the previous m_block are done. + */ + template <typename packLArg, typename LhsPacker> + static void packLhs(const packLArg arg) { + // perform actual packing + LhsPacker pack_lhs; + pack_lhs(arg.blockA, arg.lhs.getSubMapper(arg.m_start, arg.k_start), arg.kc, arg.mc); + } + + /* + * Packs a RHS block of size (kc, nc) starting at (k, n) after checking that + * all kernels in the previous block are done. + * Then for each LHS future, we wait on the future and then call GEBP + * on the area packed by the future (which starts at + * blockA + future_idx * mt * kc) on the LHS and with the full packed + * RHS block. + * The output of this GEBP is written to output(m + i * mt, n). + */ + template <typename packRKArg, typename RhsPacker, typename GebpKernel> + static void packRhsAndKernel(packRKArg arg) { + if (arg.need_to_pack) { + RhsPacker pack_rhs; + pack_rhs(arg.blockB, arg.rhs.getSubMapper(arg.k, arg.n), arg.kc, arg.nc); + } + + GebpKernel gebp; + for (Index mt_block_idx = 0; mt_block_idx < arg.num_blockAs; mt_block_idx++) { + const Index m_base_start = arg.m + arg.mc*mt_block_idx; + if (m_base_start < arg.max_m) { + int blockAId = (arg.k_block_idx * arg.m_blocks + mt_block_idx + arg.m_block_idx) % arg.num_threads; + wait_until_ready((*arg.lhs_notifications)[blockAId]); + const Index actual_mc = (std::min)(m_base_start + arg.mc, arg.max_m) - m_base_start; + gebp(arg.output.getSubMapper(m_base_start, arg.n), + (*arg.blockAs)[blockAId], arg.blockB, + actual_mc, arg.kc, arg.nc, Scalar(1), -1, -1, 0, 0); + + // Notify that the kernel is done. 
+ const Index set_idx = blockAId * arg.n_blocks + arg.n_block_idx; + (*arg.kernel_notifications)[set_idx]->Notify(); + } + } + } + + template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> + void evalGemmByRows(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + const int lhs_packet_size = PacketType<LhsScalar, ThreadPoolDevice>::size; + const int rhs_packet_size = PacketType<RhsScalar, ThreadPoolDevice>::size; + + typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, + LeftEvaluator, left_nocontract_t, + contract_t, lhs_packet_size, + lhs_inner_dim_contiguous, + false, Unaligned> LhsMapper; + + typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, + RightEvaluator, right_nocontract_t, + contract_t, rhs_packet_size, + rhs_inner_dim_contiguous, + rhs_inner_dim_reordered, Unaligned> RhsMapper; + + typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper; + + // TODO: packing could be faster sometimes if we supported row major tensor mappers + typedef internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, Traits::mr, + Traits::LhsProgress, ColMajor> LhsPacker; + typedef internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor> RhsPacker; + + // TODO: replace false, false with conjugate values? + typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, + Traits::mr, Traits::nr, false, false> GebpKernel; + + typedef internal::packRhsArg<RhsScalar, RhsMapper, Index> packRArg; + typedef internal::packLhsAndKernelArg<LhsScalar, RhsScalar, LhsMapper, OutputMapper, Index> packLKArg; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + RhsPacker pack_rhs; + + // compute block sizes (which depend on number of threads) + const Index num_threads = this->m_device.numThreads(); + Index mc = m; + Index nc = n; + Index kc = k; + internal::ComputeGemmByRowBlockingSizes<LhsScalar,RhsScalar,1,Index> block; + block(kc, mc, nc, num_threads); + eigen_assert(mc <= m); + eigen_assert(nc <= n); + eigen_assert(kc <= k); + +#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b)) + const Index k_blocks = CEIL_DIV(k, kc); + const Index n_blocks = CEIL_DIV(n, nc); + const Index m_blocks = CEIL_DIV(m, mc); +#undef CEIL_DIV + + + const int sizeA = mc * kc; + const int sizeB = kc * nc; + + const Index numBlockBs = (std::min)(num_threads, n_blocks); + FixedSizeVector<RhsScalar *> blockBs(num_threads); + for (int i = 0; i < num_threads; i++) { + blockBs.push_back(static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar)))); + } + + FixedSizeVector<LhsScalar *> blockAs(m_blocks); + for (int i = 0; i < m_blocks; i++) { + blockAs.push_back(static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar)))); + } + + // lhs_notifications starts with all null Notifications + 
FixedSizeVector<Notification*> rhs_notifications(num_threads, nullptr); + + // this should really be numBlockBs * m_blocks; + const Index num_kernel_notifications = num_threads * m_blocks; + FixedSizeVector<Notification*> kernel_notifications(num_kernel_notifications, + nullptr); + + for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) { + const Index k_start = k_block_idx * kc; + // make sure we don't overshoot right edge of left matrix + const Index actual_kc = (std::min)(k_start + kc, k) - k_start; + + for (Index n_block_idx = 0; n_block_idx < n_blocks; n_block_idx += numBlockBs) { + const int num_blocks = (std::min)(n_blocks-n_block_idx, numBlockBs); + + for (Index nt_block_idx = n_block_idx; nt_block_idx < n_block_idx+num_blocks; nt_block_idx++) { + const Index n_start = nt_block_idx * nc; + const Index actual_nc = (std::min)(n_start + nc, n) - n_start; + eigen_assert(actual_nc > 0); + + int blockBId = (k_block_idx * n_blocks + nt_block_idx) % num_threads; + // Wait for previous RHS kernels to complete. + for (int i = 0; i < m_blocks; ++i) { + int notification_id = (blockBId * m_blocks + i); + + // Wait for any current kernels using this slot to complete + // before using it. + if (kernel_notifications[notification_id]) { + wait_until_ready(kernel_notifications[notification_id]); + delete kernel_notifications[notification_id]; + } + kernel_notifications[notification_id] = new Notification(); + } + const packRArg arg = { + blockBs[blockBId], // blockB + rhs, // rhs + n_start, // n + k_start, // k + actual_nc, // nc + actual_kc, // kc + }; + + // Delete any existing notification since we may be + // replacing it. The algorithm should ensure that there are + // no existing waiters on this notification. + delete rhs_notifications[blockBId]; + rhs_notifications[blockBId] = + this->m_device.enqueue(&Self::packRhs<packRArg, RhsPacker>, arg); + } + + // now start kernels. + const Index n_base_start = n_block_idx * nc; + const bool need_to_pack = n_block_idx == 0; + + for (Index m_block_idx = 0; m_block_idx < m_blocks; m_block_idx++) { + const Index m_start = m_block_idx * mc; + const Index actual_mc = (std::min)(m_start + mc, m) - m_start; + + // first make sure the previous kernels are all done before overwriting rhs. Also wait if + // we're going to start new k. In both cases need_to_pack is true. + if (need_to_pack) { + for (int i = num_blocks; i < num_threads; ++i) { + Index blockBId = (k_block_idx * n_blocks + i + n_block_idx) % num_threads; + Index future_id = (blockBId * m_blocks + m_block_idx); + wait_until_ready(kernel_notifications[future_id]); + } + } + + packLKArg arg = { + &blockBs, // blockB + blockAs[m_block_idx], // blockA + lhs, // lhs + output, // output + m_start, // m + k_start, // k + n_base_start, // n + actual_mc, // mc + actual_kc, // kc + nc, // nc + num_threads, + numBlockBs, + n, + k_block_idx, + m_block_idx, + n_block_idx, + m_blocks, + n_blocks, + &kernel_notifications, + &rhs_notifications, + need_to_pack, + }; + + // We asynchronously kick off this function, which ends up + // notifying the appropriate kernel_notifications objects, + // which this thread waits on before exiting. + // + // The wait for kernel_notifications below ensures that we + // don't have to keep track of the launch of this work. + this->m_device.enqueue_and_forget(&Self::packLhsAndKernel<packLKArg, LhsPacker, GebpKernel>, arg); + } + } + } + + // Make sure all the kernels are done. 
+ for (int i = 0; i < kernel_notifications.size(); ++i) { + wait_until_ready(kernel_notifications[i]); + delete kernel_notifications[i]; + } + + // No need to wait for lhs notifications since they should have + // already been waited on. Just clean them up. + for (int i = 0; i < rhs_notifications.size(); ++i) { + delete rhs_notifications[i]; + } + + // deallocate all of the memory for both A and B's + for (int i = 0; i < blockAs.size(); i++) { + this->m_device.deallocate(blockAs[i]); + } + for (int i = 0; i < blockBs.size(); i++) { + this->m_device.deallocate(blockBs[i]); + } + } + + template <typename packRArg, typename RhsPacker> + static void packRhs(const packRArg arg) { + // perform actual packing + RhsPacker pack_rhs; + pack_rhs(arg.blockB, arg.rhs.getSubMapper(arg.k_start, arg.n_start), arg.kc, arg.nc); + } + + template <typename packLKArg, typename LhsPacker, typename GebpKernel> + static void packLhsAndKernel(packLKArg arg) { + if (arg.need_to_pack) { + LhsPacker pack_lhs; + pack_lhs(arg.blockA, arg.lhs.getSubMapper(arg.m, arg.k), arg.kc, arg.mc); + } + + GebpKernel gebp; + for (Index nt_block_idx = 0; nt_block_idx < arg.num_blockBs; nt_block_idx++) { + const Index n_base_start = arg.n + arg.nc*nt_block_idx; + if (n_base_start < arg.max_n) { + int blockBId = (arg.k_block_idx * arg.n_blocks + nt_block_idx + arg.n_block_idx) % arg.num_threads; + wait_until_ready((*arg.rhs_notifications)[blockBId]); + const Index actual_nc = (std::min)(n_base_start + arg.nc, arg.max_n) - n_base_start; + gebp(arg.output.getSubMapper(arg.m, n_base_start), + arg.blockA, (*arg.blockBs)[blockBId], + arg.mc, arg.kc, actual_nc, Scalar(1), -1, -1, 0, 0); + + // Notify that the kernel is done. + const Index set_idx = blockBId * arg.m_blocks + arg.m_block_idx; + (*arg.kernel_notifications)[set_idx]->Notify(); + } + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_USE_THREADS +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h new file mode 100644 index 0000000000..d54091fa1c --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -0,0 +1,226 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H + +namespace Eigen { + +/** \class TensorConversionOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor conversion class. This class makes it possible to vectorize + * type casting operations when the number of scalars per packet in the source + * and the destination type differ + */ +namespace internal { +template<typename TargetType, typename XprType> +struct traits<TensorConversionOp<TargetType, XprType> > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. 
+ typedef TargetType Scalar; + typedef typename traits<XprType>::StorageKind StorageKind; + typedef typename traits<XprType>::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = traits<XprType>::NumDimensions; + static const int Layout = traits<XprType>::Layout; + enum { Flags = 0 }; +}; + +template<typename TargetType, typename XprType> +struct eval<TensorConversionOp<TargetType, XprType>, Eigen::Dense> +{ + typedef const TensorConversionOp<TargetType, XprType>& type; +}; + +template<typename TargetType, typename XprType> +struct nested<TensorConversionOp<TargetType, XprType>, 1, typename eval<TensorConversionOp<TargetType, XprType> >::type> +{ + typedef TensorConversionOp<TargetType, XprType> type; +}; + +} // end namespace internal + + +template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket, int SrcCoeffRatio, int TgtCoeffRatio> +struct PacketConverter { + PacketConverter(const TensorEvaluator& impl) + : m_impl(impl) {} + + template<int LoadMode, typename Index> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + return internal::pcast<SrcPacket, TgtPacket>(m_impl.template packet<LoadMode>(index)); + } + + private: + const TensorEvaluator& m_impl; +}; + + +template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket> +struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 2, 1> { + PacketConverter(const TensorEvaluator& impl) + : m_impl(impl) {} + + template<int LoadMode, typename Index> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size; + + SrcPacket src1 = m_impl.template packet<LoadMode>(index); + SrcPacket src2 = m_impl.template packet<LoadMode>(index + SrcPacketSize); + TgtPacket result = internal::pcast<SrcPacket, TgtPacket>(src1, src2); + return result; + } + + private: + const TensorEvaluator& m_impl; +}; + +template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket> +struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 4, 1> { + PacketConverter(const TensorEvaluator& impl) + : m_impl(impl) {} + + template<int LoadMode, typename Index> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size; + + SrcPacket src1 = m_impl.template packet<LoadMode>(index); + SrcPacket src2 = m_impl.template packet<LoadMode>(index + SrcPacketSize); + SrcPacket src3 = m_impl.template packet<LoadMode>(index + 2 * SrcPacketSize); + SrcPacket src4 = m_impl.template packet<LoadMode>(index + 3 * SrcPacketSize); + TgtPacket result = internal::pcast<SrcPacket, TgtPacket>(src1, src2, src3, src4); + return result; + } + + private: + const TensorEvaluator& m_impl; +}; + + +template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket> +struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 1, 2> { + PacketConverter(const TensorEvaluator& impl) + : m_impl(impl), m_maxIndex(impl.dimensions().TotalSize()) {} + + template<int LoadMode, typename Index> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size; + if (index + SrcPacketSize < m_maxIndex) { + return internal::pcast<SrcPacket, TgtPacket>(m_impl.template packet<LoadMode>(index)); + } else { + const int TgtPacketSize = internal::unpacket_traits<TgtPacket>::size; + 
EIGEN_ALIGN_DEFAULT typename internal::unpacket_traits<TgtPacket>::type values[TgtPacketSize]; + for (int i = 0; i < TgtPacketSize; ++i) { + values[i] = m_impl.coeff(index+i); + } + TgtPacket rslt = internal::pload<TgtPacket>(values); + return rslt; + } + } + + private: + const TensorEvaluator& m_impl; + const typename TensorEvaluator::Index m_maxIndex; +}; + +template<typename TargetType, typename XprType> +class TensorConversionOp : public TensorBase<TensorConversionOp<TargetType, XprType>, ReadOnlyAccessors> +{ + public: + typedef typename internal::traits<TensorConversionOp>::Scalar Scalar; + typedef typename internal::traits<TensorConversionOp>::StorageKind StorageKind; + typedef typename internal::traits<TensorConversionOp>::Index Index; + typedef typename internal::nested<TensorConversionOp>::type Nested; + typedef Scalar CoeffReturnType; + typedef typename NumTraits<Scalar>::Real RealScalar; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConversionOp(const XprType& xpr) + : m_xpr(xpr) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; +}; + + + + +// Eval as rvalue +template<typename TargetType, typename ArgType, typename Device> +struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device> +{ + typedef TensorConversionOp<TargetType, ArgType> XprType; + typedef typename XprType::Index Index; + typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; + typedef TargetType Scalar; + typedef TargetType CoeffReturnType; + typedef typename internal::remove_all<typename internal::traits<ArgType>::Scalar>::type SrcType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef typename PacketType<SrcType, Device>::type PacketSourceType; + + enum { + IsAligned = false, + PacketAccess = + TensorEvaluator<ArgType, Device>::PacketAccess && + internal::type_casting_traits<SrcType, TargetType>::VectorizedCast, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) + { + if (internal::is_same<TargetType, SrcType>::value) { + return m_impl.evalSubExprsIfNeeded((SrcType*)data); + } + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() + { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + internal::scalar_cast_op<SrcType, TargetType> converter; + return converter(m_impl.coeff(index)); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int SrcCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio; + const int TgtCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio; + PacketConverter<TensorEvaluator<ArgType, Device>, PacketSourceType, PacketReturnType, + SrcCoeffRatio, TgtCoeffRatio> converter(m_impl); + return converter.template packet<LoadMode>(index); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + TensorEvaluator<ArgType, Device> m_impl; +}; + +} // end namespace Eigen + 
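A minimal usage sketch for the conversion evaluator above, assuming the standard unsupported include path: TensorBase's cast<T>() is the usual way to create a TensorConversionOp, and the PacketConverter specializations handle the cases where a source packet holds a different number of scalars than a destination packet.

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> t(4, 4);
  t.setConstant(1.5f);

  // cast<double>() wraps t in a TensorConversionOp; evaluating the expression
  // converts each coefficient (vectorized when type_casting_traits allows it).
  Eigen::Tensor<double, 2> d = t.cast<double>();

  return d(3, 3) == 1.5 ? 0 : 1;
}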
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h new file mode 100644 index 0000000000..58cae7162c --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -0,0 +1,1076 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H + +namespace Eigen { + +/** \class TensorConvolution + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor convolution class. + * + * + */ +namespace internal { + +template <typename Index, typename InputDims, size_t NumKernelDims, int Layout> +class IndexMapper { + public: + IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims, + const array<Index, NumKernelDims>& indices) { + + array<Index, NumDims> dimensions = input_dims; + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = indices[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + dimensions[index] = result_dim; + } + + array<Index, NumDims> inputStrides; + array<Index, NumDims> outputStrides; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + inputStrides[0] = 1; + outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + inputStrides[i] = inputStrides[i-1] * input_dims[i-1]; + outputStrides[i] = outputStrides[i-1] * dimensions[i-1]; + } + } else { + inputStrides[NumDims - 1] = 1; + outputStrides[NumDims - 1] = 1; + for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) { + inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1]; + outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1]; + } + } + + array<Index, NumDims> cudaInputDimensions; + array<Index, NumDims> cudaOutputDimensions; + array<Index, NumDims> tmp = dimensions; + array<Index, NumDims> ordering; + const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 0 + : NumDims - NumKernelDims; + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = i + offset; + ordering[index] = indices[i]; + tmp[indices[i]] = -1; + cudaInputDimensions[index] = input_dims[indices[i]]; + cudaOutputDimensions[index] = dimensions[indices[i]]; + } + + int written = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 
NumKernelDims + : 0; + for (int i = 0; i < NumDims; ++i) { + if (tmp[i] >= 0) { + ordering[written] = i; + cudaInputDimensions[written] = input_dims[i]; + cudaOutputDimensions[written] = dimensions[i]; + ++written; + } + } + + for (int i = 0; i < NumDims; ++i) { + m_inputStrides[i] = inputStrides[ordering[i]]; + m_outputStrides[i] = outputStrides[ordering[i]]; + } + + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = 0; i < NumDims; ++i) { + if (i > NumKernelDims) { + m_cudaInputStrides[i] = + m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1]; + m_cudaOutputStrides[i] = + m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1]; + } else { + m_cudaInputStrides[i] = 1; + m_cudaOutputStrides[i] = 1; + } + } + } else { + for (int i = NumDims - 1; i >= 0; --i) { + if (i + 1 < offset) { + m_cudaInputStrides[i] = + m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1]; + m_cudaOutputStrides[i] = + m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1]; + } else { + m_cudaInputStrides[i] = 1; + m_cudaOutputStrides[i] = 1; + } + } + } + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const { + Index inputIndex = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int d = NumDims - 1; d > NumKernelDims; --d) { + const Index idx = p / m_cudaInputStrides[d]; + inputIndex += idx * m_inputStrides[d]; + p -= idx * m_cudaInputStrides[d]; + } + inputIndex += p * m_inputStrides[NumKernelDims]; + } else { + int limit = 0; + if (NumKernelDims < NumDims) { + limit = NumDims - NumKernelDims - 1; + } + for (int d = 0; d < limit; ++d) { + const Index idx = p / m_cudaInputStrides[d]; + inputIndex += idx * m_inputStrides[d]; + p -= idx * m_cudaInputStrides[d]; + } + inputIndex += p * m_inputStrides[limit]; + } + return inputIndex; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const { + Index outputIndex = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int d = NumDims - 1; d > NumKernelDims; --d) { + const Index idx = p / m_cudaOutputStrides[d]; + outputIndex += idx * m_outputStrides[d]; + p -= idx * m_cudaOutputStrides[d]; + } + outputIndex += p * m_outputStrides[NumKernelDims]; + } else { + int limit = 0; + if (NumKernelDims < NumDims) { + limit = NumDims - NumKernelDims - 1; + } + for (int d = 0; d < limit; ++d) { + const Index idx = p / m_cudaOutputStrides[d]; + outputIndex += idx * m_outputStrides[d]; + p -= idx * m_cudaOutputStrides[d]; + } + outputIndex += p * m_outputStrides[limit]; + } + return outputIndex; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const { + const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_inputStrides[offset]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const { + const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_outputStrides[offset]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const { + const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 
0 + : NumDims - NumKernelDims; + return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const { + const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const { + const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] + + k * m_inputStrides[offset + 2]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const { + const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] + + k * m_outputStrides[offset + 2]; + } + + private: + static const size_t NumDims = internal::array_size<InputDims>::value; + array<Index, NumDims> m_inputStrides; + array<Index, NumDims> m_outputStrides; + array<Index, NumDims> m_cudaInputStrides; + array<Index, NumDims> m_cudaOutputStrides; +}; + + + +template<typename Dimensions, typename InputXprType, typename KernelXprType> +struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename promote_storage_type<typename InputXprType::Scalar, + typename KernelXprType::Scalar>::ret Scalar; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind, + typename traits<KernelXprType>::StorageKind>::ret StorageKind; + typedef typename promote_index_type<typename traits<InputXprType>::Index, + typename traits<KernelXprType>::Index>::type Index; + typedef typename InputXprType::Nested LhsNested; + typedef typename KernelXprType::Nested RhsNested; + typedef typename remove_reference<LhsNested>::type _LhsNested; + typedef typename remove_reference<RhsNested>::type _RhsNested; + static const int NumDimensions = traits<InputXprType>::NumDimensions; + static const int Layout = traits<InputXprType>::Layout; + + enum { + Flags = 0, + }; +}; + +template<typename Dimensions, typename InputXprType, typename KernelXprType> +struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense> +{ + typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type; +}; + +template<typename Dimensions, typename InputXprType, typename KernelXprType> +struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type> +{ + typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type; +}; + +} // end namespace internal + + + +template<typename Indices, typename InputXprType, typename KernelXprType> +class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorConvolutionOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename 
internal::promote_storage_type<typename InputXprType::CoeffReturnType, + typename KernelXprType::CoeffReturnType>::ret CoeffReturnType; + typedef typename internal::promote_storage_type<typename InputXprType::PacketReturnType, + typename KernelXprType::PacketReturnType>::ret PacketReturnType; + typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested; + typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims) + : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Indices& indices() const { return m_indices; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const typename internal::remove_all<typename InputXprType::Nested>::type& + inputExpression() const { return m_input_xpr; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const typename internal::remove_all<typename KernelXprType::Nested>::type& + kernelExpression() const { return m_kernel_xpr; } + + protected: + typename InputXprType::Nested m_input_xpr; + typename KernelXprType::Nested m_kernel_xpr; + const Indices m_indices; +}; + + +template<typename Indices, typename InputArgType, typename KernelArgType, typename Device> +struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device> +{ + typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType; + + static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value; + static const int NumKernelDims = internal::array_size<Indices>::value; + typedef typename XprType::Index Index; + typedef DSizes<Index, NumDims> Dimensions; + + enum { + IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & + TensorEvaluator<KernelArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & + TensorEvaluator<KernelArgType, Device>::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator<InputArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device) + { + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_inputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1]; + } + } else { + m_inputStride[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1]; + } + } + + m_dimensions = m_inputImpl.dimensions(); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index 
input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + if (i > 0) { + m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1]; + } else { + m_kernelStride[0] = 1; + } + m_indexStride[i] = m_inputStride[index]; + } + + m_outputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1]; + } + } else { + for (int i = NumKernelDims - 1; i >= 0; --i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + if (i < NumKernelDims - 1) { + m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1]; + } else { + m_kernelStride[NumKernelDims - 1] = 1; + } + m_indexStride[i] = m_inputStride[index]; + } + + m_outputStride[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1]; + } + } + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_inputImpl.evalSubExprsIfNeeded(NULL); + preloadKernel(); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_local_kernel) { + m_device.deallocate((void*)m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + + void evalTo(typename XprType::Scalar* buffer) { + evalSubExprsIfNeeded(NULL); + for (int i = 0; i < dimensions().TotalSize(); ++i) { + buffer[i] += coeff(i); + } + cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + CoeffReturnType result = CoeffReturnType(0); + convolve(firstInput(index), 0, NumKernelDims-1, result); + return result; + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const + { + const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; + Index indices[2] = {index, index+PacketSize-1}; + Index startInputs[2] = {0, 0}; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStride[i]; + const Index idx1 = indices[1] / m_outputStride[i]; + startInputs[0] += idx0 * m_inputStride[i]; + startInputs[1] += idx1 * m_inputStride[i]; + indices[0] -= idx0 * m_outputStride[i]; + indices[1] -= idx1 * m_outputStride[i]; + } + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / m_outputStride[i]; + const Index idx1 = indices[1] / m_outputStride[i]; + startInputs[0] += idx0 * m_inputStride[i]; + startInputs[1] += idx1 * m_inputStride[i]; + indices[0] -= idx0 * m_outputStride[i]; + indices[1] -= idx1 * m_outputStride[i]; + } + } + startInputs[0] += indices[0]; + startInputs[1] += indices[1]; + + if (startInputs[1]-startInputs[0] == PacketSize-1) { + PacketReturnType result = internal::pset1<PacketReturnType>(0); + convolvePacket(startInputs[0], 0, NumKernelDims-1, result); + return result; + } else { + EIGEN_ALIGN_DEFAULT Scalar data[PacketSize]; + data[0] = Scalar(0); + convolve(startInputs[0], 0, NumKernelDims-1, data[0]); + for (int i = 1; i < 
PacketSize-1; ++i) { + data[i] = Scalar(0); + convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]); + } + data[PacketSize-1] = Scalar(0); + convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]); + return internal::pload<PacketReturnType>(data); + } + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + private: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { + Index startInput = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStride[i]; + startInput += idx * m_inputStride[i]; + index -= idx * m_outputStride[i]; + } + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStride[i]; + startInput += idx * m_inputStride[i]; + index -= idx * m_outputStride[i]; + } + } + startInput += index; + return startInput; + } + + EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const { + for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { + const Index input = firstIndex + j * m_indexStride[DimIndex]; + const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; + if (DimIndex > 0) { + convolve(input, kernel, DimIndex-1, accum); + } else { + accum += m_inputImpl.coeff(input) * m_kernel[kernel]; + } + } + } + + template <typename Packet> + EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const { + for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { + const Index input = firstIndex + j * m_indexStride[DimIndex]; + const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; + if (DimIndex > 0) { + convolvePacket(input, kernel, DimIndex-1, accum); + } else { + accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. it's an + // expression that needs to be evaluated) + const Scalar* in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + Scalar* local = (Scalar*)m_device.allocate(kernel_sz); + typedef TensorEvalToOp<const KernelArgType> EvalTo; + EvalTo evalToTmp(local, m_kernelArg); + const bool PacketAccess = internal::IsVectorizable<Device, KernelArgType>::value; + const bool BlockAccess = false; + internal::TensorExecutor<const EvalTo, Device, PacketAccess, BlockAccess>::run(evalToTmp, m_device); + + m_kernel = local; + m_local_kernel = true; + } + } + + array<Index, NumDims> m_inputStride; + array<Index, NumDims> m_outputStride; + + array<Index, NumKernelDims> m_indexStride; + array<Index, NumKernelDims> m_kernelStride; + TensorEvaluator<InputArgType, Device> m_inputImpl; + TensorEvaluator<KernelArgType, Device> m_kernelImpl; + Dimensions m_dimensions; + + KernelArgType m_kernelArg; + const Scalar* m_kernel; + bool m_local_kernel; + const Device& m_device; +}; + + + + +// Use an optimized implementation of the evaluation code for GPUs whenever possible. 
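The CPU evaluator above sizes every convolved dimension as result_dim = input_dim - kernel_dim + 1 and then accumulates each output coefficient by recursing over the kernel dimensions in convolve(). A minimal, hedged usage sketch of the expression this evaluator serves, via TensorBase::convolve(); the tensor names, shapes and include path are illustrative and not taken from this diff:

    #include <unsupported/Eigen/CXX11/Tensor>

    void convolution_sketch() {
      Eigen::Tensor<float, 2> input(20, 30);   // 20 x 30 input
      Eigen::Tensor<float, 1> kernel(3);       // 1-D kernel of length 3
      input.setRandom();
      kernel.setRandom();

      // Convolve along dimension 0 only.
      Eigen::array<ptrdiff_t, 1> dims;
      dims[0] = 0;

      // The result is (20 - 3 + 1) x 30 = 18 x 30, per the
      // result_dim = input_dim - kernel_dim + 1 rule used above.
      Eigen::Tensor<float, 2> output = input.convolve(kernel, dims);
    }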
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) + +template <int StaticKernelSize> +struct GetKernelSize { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const { + return StaticKernelSize; + } +}; +template <> +struct GetKernelSize<Dynamic> { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const { + return kernelSize; + } +}; + +template <typename InputEvaluator, typename Index, typename InputDims, + int StaticKernelSize> +__global__ void EigenConvolutionKernel1D( + InputEvaluator eval, + const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout> + indexMapper, + const float* __restrict kernel, const int numPlanes, const int numX, + const int maxX, const int kernelSize, float* buffer) { + extern __shared__ float s[]; + + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize); + const int num_x_output = last_x - first_x + 1; + + const int first_plane = blockIdx.y * blockDim.y; + const int plane_stride = blockDim.y * gridDim.y; + + for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) { + // Load inputs to shared memory + const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = threadIdx.y * num_x_input; + #pragma unroll + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { + const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x); + s[i + plane_kernel_offset] = eval.coeff(tensor_index); + } + + __syncthreads(); + + // Compute the convolution + const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); + + #pragma unroll + for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { + const int kernel_offset = plane_kernel_offset + i; + float result = 0.0f; + #pragma unroll + for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) { + result += s[k + kernel_offset] * kernel[k]; + } + const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x); + buffer[tensor_index] = result; + } + __syncthreads(); + } +}; + +template <typename InputEvaluator, typename Index, typename InputDims, + int StaticKernelSizeX, int StaticKernelSizeY> +__global__ __launch_bounds__(1024, 1) void EigenConvolutionKernel2D( + InputEvaluator eval, + const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout> + indexMapper, + const float* __restrict kernel, const int numPlanes, const int numX, + const int maxX, const int numY, const int maxY, const int kernelSizeX, + const int kernelSizeY, float* buffer) { + extern __shared__ float s[]; + + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX); + const int num_x_output = last_x - first_x + 1; + + const int first_y = blockIdx.y * maxY; + const int last_y = (first_y + maxY < numY ? 
first_y + maxY : numY) - 1; + const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY); + const int num_y_output = last_y - first_y + 1; + + const int first_plane = blockIdx.z * blockDim.z; + const int plane_stride = blockDim.z * gridDim.z; + + for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) { + + const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = threadIdx.z * num_y_input; + + // Load inputs to shared memory + #pragma unroll + for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { + const int input_offset = num_x_input * (j + plane_kernel_offset); + #pragma unroll + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { + const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y); + s[i + input_offset] = eval.coeff(tensor_index); + } + } + + __syncthreads(); + + // Convolution + const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); + + #pragma unroll + for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { + #pragma unroll + for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { + float result = 0.0f; + #pragma unroll + for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) { + const int kernel_offset = kernelSizeX * l; + const int input_offset = i + num_x_input * (j + l + plane_kernel_offset); + #pragma unroll + for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) { + result += s[k + input_offset] * kernel[k + kernel_offset]; + } + } + const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y); + buffer[tensor_index] = result; + } + } + + __syncthreads(); + } +}; + +template <typename InputEvaluator, typename Index, typename InputDims> +__global__ void EigenConvolutionKernel3D( + InputEvaluator eval, + const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout> + indexMapper, + const float* __restrict kernel, const size_t numPlanes, const size_t numX, + const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ, + const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, + const size_t kernelSizeZ, float* buffer) { + extern __shared__ float s[]; + + // Load inputs to shared memory + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + kernelSizeX; + + const int first_y = blockIdx.y * maxY; + const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1; + const int num_y_input = last_y - first_y + kernelSizeY; + + const int first_z = blockIdx.z * maxZ; + const int last_z = (first_z + maxZ < numZ ? 
first_z + maxZ : numZ) - 1; + const int num_z_input = last_z - first_z + kernelSizeZ; + + for (int p = 0; p < numPlanes; ++p) { + + const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = 0; + + for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) { + for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { + const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z); + s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index); + } + } + } + + __syncthreads(); + + // Convolution + const int num_z_output = last_z - first_z + 1; + const int num_y_output = last_y - first_y + 1; + const int num_x_output = last_x - first_x + 1; + const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); + + for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) { + for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { + for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { + float result = 0.0f; + for (int n = 0; n < kernelSizeZ; ++n) { + for (int m = 0; m < kernelSizeY; ++m) { + for (int l = 0; l < kernelSizeX; ++l) { + result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)]; + } + } + } + const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z); + buffer[tensor_index] = result; + } + } + } + __syncthreads(); + } +}; + + + +template<typename Indices, typename InputArgType, typename KernelArgType> +struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice> +{ + typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType; + + static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value; + static const int NumKernelDims = internal::array_size<Indices>::value; + typedef typename XprType::Index Index; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions; + + enum { + IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & + TensorEvaluator<KernelArgType, GpuDevice>::IsAligned, + PacketAccess = false, + BlockAccess = false, + Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device) + : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device) + { + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + + m_dimensions = m_inputImpl.dimensions(); + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const 
Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + } + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename InputArgType::Scalar Scalar; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + preloadKernel(); + m_inputImpl.evalSubExprsIfNeeded(NULL); + if (data) { + executeEval(data); + return false; + } else { + m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)); + executeEval(m_buf); + return true; + } + } + + EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_buf) { + m_device.deallocate(m_buf); + m_buf = NULL; + } + if (m_local_kernel) { + m_device.deallocate((void*)m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + + EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. it's an + // expression that needs to be evaluated) + const Scalar* in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + Scalar* local = (Scalar*)m_device.allocate(kernel_sz); + typedef TensorEvalToOp<const KernelArgType> EvalTo; + EvalTo evalToTmp(local, m_kernelArg); + const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value; + const bool BlockAccess = false; + internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess, BlockAccess>::run(evalToTmp, m_device); + + m_kernel = local; + m_local_kernel = true; + } + } + + static unsigned int ceil(unsigned int num, unsigned int denom) { + const unsigned int rounded_toward_zero = num / denom; + if (num > rounded_toward_zero * denom) { + return rounded_toward_zero + 1; + } + return rounded_toward_zero; + } + + void executeEval(Scalar* data) const { + typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims; + + const int maxSharedMem = m_device.sharedMemPerBlock(); + const int maxThreadsPerBlock = m_device.maxCudaThreadsPerBlock(); + const int maxBlocksPerProcessor = m_device.maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock; + const int numMultiProcessors = m_device.getNumCudaMultiProcessors(); + const int warpSize = 32; + + switch (NumKernelDims) { + case 1: { + const int kernel_size = m_kernelImpl.dimensions().TotalSize(); + + const int numX = dimensions()[m_indices[0]]; + const int numP = dimensions().TotalSize() / numX; + int maxX; + dim3 block_size; + + const int single_stride_dim = + static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 
0 + : m_inputImpl.dimensions().rank() - 1; + if (m_indices[0] == single_stride_dim) { + // Maximum the reuse + const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32; + maxX = (std::min<int>)(inner_dim, numX); + const int maxP = (std::min<int>)(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP); + block_size.x = numext::mini(maxThreadsPerBlock, maxX); + block_size.y = (std::min<int>)(maxThreadsPerBlock / block_size.x, maxP); + } + else { + // Read as much as possible alongside the inner most dimension, that is the plane + const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar)); + const int maxP = (std::min<int>)(inner_dim, numP); + maxX = (std::min<int>)(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX); + + block_size.x = numext::mini(warpSize, maxX); + block_size.y = (std::min<int>)(maxThreadsPerBlock/block_size.x, maxP); + } + + const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar); + assert(shared_mem <= maxSharedMem); + + const int num_x_blocks = ceil(numX, maxX); + const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem); + const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks); + + dim3 num_blocks(num_x_blocks, std::min<int>(num_y_blocks, ceil(numP, block_size.y))); + + + //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + + const array<Index, 1> indices(m_indices[0]); + const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]); + internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper( + m_inputImpl.dimensions(), kernel_dims, indices); + switch(kernel_size) { + case 4: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data); + break; + } + case 7: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data); + break; + } + default: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data); + } + } + break; + } + + case 2: { + const int idxX = + static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1; + const int idxY = + static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 
1 : 0; + const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; + const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; + + const int numX = dimensions()[m_indices[idxX]]; + const int numY = dimensions()[m_indices[idxY]]; + const int numP = dimensions().TotalSize() / (numX*numY); + + const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x)); + + // Snap maxX to warp size + int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32; + const int maxX = (std::min<int>)(inner_dim, numX); + const int maxY = (std::min<int>)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY); + const int maxP = (std::min<int>)(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP); + + dim3 block_size; + block_size.x = numext::mini(1024, maxX); + block_size.y = (std::min<int>)(1024/block_size.x, maxY); + block_size.z = (std::min<int>)(1024/(block_size.x*block_size.y), maxP); + + const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar); + assert(shared_mem <= maxSharedMem); + + const int num_x_blocks = ceil(numX, maxX); + const int num_y_blocks = ceil(numY, maxY); + const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem); + const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks); + + dim3 num_blocks(num_x_blocks, num_y_blocks, std::min<int>(num_z_blocks, ceil(numP, block_size.z))); + + + //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + + const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]); + const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX], + m_kernelImpl.dimensions()[idxY]); + internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper( + m_inputImpl.dimensions(), kernel_dims, indices); + switch (kernel_size_x) { + case 4: { + switch (kernel_size_y) { + case 7: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data); + break; + } + default: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data); + break; + } + } + break; + } + case 7: { + switch (kernel_size_y) { + case 4: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data); + break; + } + default: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data); + break; + } + } + break; + } + default: { + 
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data); + break; + } + } + break; + } + + case 3: { + const int idxX = + static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2; + const int idxY = + static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1; + const int idxZ = + static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0; + + const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; + const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; + const int kernel_size_z = m_kernelImpl.dimensions()[idxZ]; + + const int numX = dimensions()[m_indices[idxX]]; + const int numY = dimensions()[m_indices[idxY]]; + const int numZ = dimensions()[m_indices[idxZ]]; + const int numP = dimensions().TotalSize() / (numX*numY*numZ); + + const int maxX = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX)); + const int maxY = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY)); + const int maxZ = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ)); + + dim3 block_size; + block_size.x = numext::mini(32, maxX); + block_size.y = numext::mini(32, maxY); + block_size.z = (std::min<int>)(1024/(block_size.x*block_size.y), maxZ); + dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ)); + + const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar); + assert(shared_mem <= maxSharedMem); + + //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + const array<Index, 3> indices(m_indices[idxX], m_indices[idxY], + m_indices[idxZ]); + const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX], + m_kernelImpl.dimensions()[idxY], + m_kernelImpl.dimensions()[idxZ]); + internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper( + m_inputImpl.dimensions(), kernel_dims, indices); + + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); + break; + } + + default: { + EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + eigen_assert(m_buf); + eigen_assert(index < m_dimensions.TotalSize()); + return m_buf[index]; + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const + { + eigen_assert(m_buf); + eigen_assert(index < m_dimensions.TotalSize()); + return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index); + } + + private: + // No assignment (copies are needed by the kernels) + TensorEvaluator& operator = 
(const TensorEvaluator&); + + TensorEvaluator<InputArgType, GpuDevice> m_inputImpl; + TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl; + KernelArgType m_kernelArg; + Indices m_indices; + Dimensions m_dimensions; + Scalar* m_buf; + const Scalar* m_kernel; + bool m_local_kernel; + + const GpuDevice& m_device; +}; +#endif + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h new file mode 100644 index 0000000000..dc39565d6b --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h @@ -0,0 +1,302 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H +#define EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H + +namespace Eigen { + +/** \class TensorCustomUnaryOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor custom class. + * + * + */ +namespace internal { +template<typename CustomUnaryFunc, typename XprType> +struct traits<TensorCustomUnaryOp<CustomUnaryFunc, XprType> > +{ + typedef typename XprType::Scalar Scalar; + typedef typename XprType::StorageKind StorageKind; + typedef typename XprType::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = traits<XprType>::NumDimensions; + static const int Layout = traits<XprType>::Layout; +}; + +template<typename CustomUnaryFunc, typename XprType> +struct eval<TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Eigen::Dense> +{ + typedef const TensorCustomUnaryOp<CustomUnaryFunc, XprType>& type; +}; + +template<typename CustomUnaryFunc, typename XprType> +struct nested<TensorCustomUnaryOp<CustomUnaryFunc, XprType>, 1, typename eval<TensorCustomUnaryOp<CustomUnaryFunc, XprType> >::type> +{ + typedef TensorCustomUnaryOp<CustomUnaryFunc, XprType> type; +}; + +} // end namespace internal + + + +template<typename CustomUnaryFunc, typename XprType> +class TensorCustomUnaryOp : public TensorBase<TensorCustomUnaryOp<CustomUnaryFunc, XprType>, ReadOnlyAccessors> +{ + public: + typedef typename internal::traits<TensorCustomUnaryOp>::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename internal::nested<TensorCustomUnaryOp>::type Nested; + typedef typename internal::traits<TensorCustomUnaryOp>::StorageKind StorageKind; + typedef typename internal::traits<TensorCustomUnaryOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomUnaryOp(const XprType& expr, const CustomUnaryFunc& func) + : m_expr(expr), m_func(func) {} + + EIGEN_DEVICE_FUNC + const CustomUnaryFunc& func() const { return m_func; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_expr; } + + protected: + typename XprType::Nested m_expr; + const CustomUnaryFunc m_func; +}; + + +// Eval as rvalue +template<typename CustomUnaryFunc, typename XprType, typename Device> +struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, 
XprType>, Device> +{ + typedef TensorCustomUnaryOp<CustomUnaryFunc, XprType> ArgType; + typedef typename internal::traits<ArgType>::Index Index; + static const int NumDims = internal::traits<ArgType>::NumDimensions; + typedef DSizes<Index, NumDims> Dimensions; + typedef + typename internal::remove_const<typename ArgType::Scalar>::type Scalar; + + enum { + IsAligned = false, + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + BlockAccess = false, + Layout = TensorEvaluator<XprType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device) + : m_op(op), m_device(device), m_result(NULL) + { + m_dimensions = op.func().dimensions(op.expression()); + } + + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + if (data) { + evalTo(data); + return false; + } else { + m_result = static_cast<CoeffReturnType*>( + m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + evalTo(m_result); + return true; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + if (m_result != NULL) { + m_device.deallocate(m_result); + m_result = NULL; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_result[index]; + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + return internal::ploadt<PacketReturnType, LoadMode>(m_result + index); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; } + + protected: + EIGEN_DEVICE_FUNC void evalTo(Scalar* data) { + TensorMap<Tensor<CoeffReturnType, NumDims, Layout, Index> > result( + data, m_dimensions); + m_op.func().eval(m_op.expression(), result, m_device); + } + + Dimensions m_dimensions; + const ArgType m_op; + const Device& m_device; + CoeffReturnType* m_result; +}; + + + +/** \class TensorCustomBinaryOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor custom class. 
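Neither doc comment spells out what the custom functor must provide, but the evaluators make the contract clear: a dimensions() method that reports the output shape from the input expression(s), and an eval() method that writes into a preallocated output on the supplied device (the binary variant, evaluated below, simply takes two input expressions). A hedged sketch of a unary functor meeting that contract; the functor name and the customOp() call in the usage comment are illustrative assumptions, not taken from this header:

    // Illustrative only: the operation is trivial; the two required members are the point.
    struct ScaleByTwo {
      // Output shape equals the input shape.
      Eigen::DSizes<Eigen::DenseIndex, 2> dimensions(
          const Eigen::Tensor<float, 2>& input) const {
        return input.dimensions();
      }

      // Fill the preallocated output on the given device.
      template <typename Output, typename Device>
      void eval(const Eigen::Tensor<float, 2>& input, Output& output,
                const Device& device) const {
        output.device(device) = input * 2.0f;
      }
    };

    // Hypothetical usage, assuming TensorBase::customOp() is present in this snapshot:
    //   Eigen::Tensor<float, 2> a(30, 40);
    //   a.setRandom();
    //   Eigen::Tensor<float, 2> b = a.customOp(ScaleByTwo());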
+ * + * + */ +namespace internal { +template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> +struct traits<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> > +{ + typedef typename internal::promote_storage_type<typename LhsXprType::Scalar, + typename RhsXprType::Scalar>::ret Scalar; + typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType, + typename RhsXprType::CoeffReturnType>::ret CoeffReturnType; + typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind, + typename traits<RhsXprType>::StorageKind>::ret StorageKind; + typedef typename promote_index_type<typename traits<LhsXprType>::Index, + typename traits<RhsXprType>::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference<LhsNested>::type _LhsNested; + typedef typename remove_reference<RhsNested>::type _RhsNested; + static const int NumDimensions = traits<LhsXprType>::NumDimensions; + static const int Layout = traits<LhsXprType>::Layout; +}; + +template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> +struct eval<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Eigen::Dense> +{ + typedef const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>& type; +}; + +template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> +struct nested<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, 1, typename eval<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> >::type> +{ + typedef TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> type; +}; + +} // end namespace internal + + + +template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> +class TensorCustomBinaryOp : public TensorBase<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, ReadOnlyAccessors> +{ + public: + typedef typename internal::traits<TensorCustomBinaryOp>::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename internal::traits<TensorCustomBinaryOp>::CoeffReturnType CoeffReturnType; + typedef typename internal::nested<TensorCustomBinaryOp>::type Nested; + typedef typename internal::traits<TensorCustomBinaryOp>::StorageKind StorageKind; + typedef typename internal::traits<TensorCustomBinaryOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const CustomBinaryFunc& func) + + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_func(func) {} + + EIGEN_DEVICE_FUNC + const CustomBinaryFunc& func() const { return m_func; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename LhsXprType::Nested>::type& + lhsExpression() const { return m_lhs_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename RhsXprType::Nested>::type& + rhsExpression() const { return m_rhs_xpr; } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const CustomBinaryFunc m_func; +}; + + +// Eval as rvalue +template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType, typename Device> +struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Device> +{ + typedef TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> XprType; + typedef typename internal::traits<XprType>::Index Index; + static const int NumDims = internal::traits<XprType>::NumDimensions; + typedef 
DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + + enum { + IsAligned = false, + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + BlockAccess = false, + Layout = TensorEvaluator<LhsXprType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_op(op), m_device(device), m_result(NULL) + { + m_dimensions = op.func().dimensions(op.lhsExpression(), op.rhsExpression()); + } + + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + if (data) { + evalTo(data); + return false; + } else { + m_result = static_cast<Scalar *>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + evalTo(m_result); + return true; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + if (m_result != NULL) { + m_device.deallocate(m_result); + m_result = NULL; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_result[index]; + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + return internal::ploadt<PacketReturnType, LoadMode>(m_result + index); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; } + + protected: + EIGEN_DEVICE_FUNC void evalTo(Scalar* data) { + TensorMap<Tensor<Scalar, NumDims, Layout> > result(data, m_dimensions); + m_op.func().eval(m_op.lhsExpression(), m_op.rhsExpression(), result, m_device); + } + + Dimensions m_dimensions; + const XprType m_op; + const Device& m_device; + CoeffReturnType* m_result; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h new file mode 100644 index 0000000000..3c33015bc4 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h @@ -0,0 +1,154 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H + +namespace Eigen { + +/** \class TensorDevice + * \ingroup CXX11_Tensor_Module + * + * \brief Pseudo expression providing an operator = that will evaluate its argument + * on the specified computing 'device' (GPU, thread pool, ...) + * + * Example: + * C.device(EIGEN_GPU) = A + B; + * + * Todo: thread pools. + * Todo: operator +=, -=, *= and so on. 
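The one-line example above only hints at the mechanics, so here is a slightly fuller, hedged sketch of the thread-pool path. It assumes EIGEN_USE_THREADS (and EIGEN_USE_CUSTOM_THREAD_POOL, so the ThreadPool class defined later in TensorDeviceType.h is available); the pool size, tensor shapes and include path are illustrative:

    #define EIGEN_USE_THREADS
    #define EIGEN_USE_CUSTOM_THREAD_POOL
    #include <unsupported/Eigen/CXX11/Tensor>

    void thread_pool_assign() {
      Eigen::Tensor<float, 2> A(200, 200), B(200, 200), C(200, 200);
      A.setRandom();
      B.setRandom();

      Eigen::ThreadPool pool(4);                 // pool ownership stays with the caller
      Eigen::ThreadPoolDevice device(&pool, 4);  // (ThreadPoolInterface*, num_cores), see TensorDeviceType.h

      C.device(device) = A + B;                  // assignment evaluated on the pool
      C.device(device) += A;                     // operator+= is provided by this class
    }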
+ */ + +template <typename ExpressionType, typename DeviceType> class TensorDevice { + public: + TensorDevice(const DeviceType& device, ExpressionType& expression) : m_device(device), m_expression(expression) {} + + template<typename OtherDerived> + EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { + typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign; + Assign assign(m_expression, other); + internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device); + return *this; + } + + template<typename OtherDerived> + EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ExpressionType, const OtherDerived> Sum; + Sum sum(m_expression, other); + typedef TensorAssignOp<ExpressionType, const Sum> Assign; + Assign assign(m_expression, sum); + internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device); + return *this; + } + + template<typename OtherDerived> + EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp<internal::scalar_difference_op<Scalar>, const ExpressionType, const OtherDerived> Difference; + Difference difference(m_expression, other); + typedef TensorAssignOp<ExpressionType, const Difference> Assign; + Assign assign(m_expression, difference); + internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device); + return *this; + } + + protected: + const DeviceType& m_device; + ExpressionType& m_expression; +}; + + +#ifdef EIGEN_USE_THREADS +template <typename ExpressionType> class TensorDevice<ExpressionType, ThreadPoolDevice> { + public: + TensorDevice(const ThreadPoolDevice& device, ExpressionType& expression) : m_device(device), m_expression(expression) {} + + template<typename OtherDerived> + EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { + typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign; + Assign assign(m_expression, other); + internal::TensorExecutor<const Assign, ThreadPoolDevice>::run(assign, m_device); + return *this; + } + + template<typename OtherDerived> + EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ExpressionType, const OtherDerived> Sum; + Sum sum(m_expression, other); + typedef TensorAssignOp<ExpressionType, const Sum> Assign; + Assign assign(m_expression, sum); + internal::TensorExecutor<const Assign, ThreadPoolDevice>::run(assign, m_device); + return *this; + } + + template<typename OtherDerived> + EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp<internal::scalar_difference_op<Scalar>, const ExpressionType, const OtherDerived> Difference; + Difference difference(m_expression, other); + typedef TensorAssignOp<ExpressionType, const Difference> Assign; + Assign assign(m_expression, difference); + internal::TensorExecutor<const Assign, ThreadPoolDevice>::run(assign, m_device); + return *this; + } + + protected: + const ThreadPoolDevice& m_device; + ExpressionType& m_expression; +}; +#endif + +#if defined(EIGEN_USE_GPU) +template <typename ExpressionType> class TensorDevice<ExpressionType, GpuDevice> +{ + public: + TensorDevice(const GpuDevice& device, ExpressionType& expression) : 
m_device(device), m_expression(expression) {} + + template<typename OtherDerived> + EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { + typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign; + Assign assign(m_expression, other); + internal::TensorExecutor<const Assign, GpuDevice>::run(assign, m_device); + return *this; + } + + template<typename OtherDerived> + EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ExpressionType, const OtherDerived> Sum; + Sum sum(m_expression, other); + typedef TensorAssignOp<ExpressionType, const Sum> Assign; + Assign assign(m_expression, sum); + internal::TensorExecutor<const Assign, GpuDevice>::run(assign, m_device); + return *this; + } + + template<typename OtherDerived> + EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp<internal::scalar_difference_op<Scalar>, const ExpressionType, const OtherDerived> Difference; + Difference difference(m_expression, other); + typedef TensorAssignOp<ExpressionType, const Difference> Assign; + Assign assign(m_expression, difference); + internal::TensorExecutor<const Assign, GpuDevice>::run(assign, m_device); + return *this; + } + + protected: + const GpuDevice& m_device; + ExpressionType& m_expression; +}; +#endif + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h new file mode 100644 index 0000000000..b6eeb73832 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h @@ -0,0 +1,920 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
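For completeness, a hedged GPU counterpart of the sketch above. CudaStreamDevice is the StreamInterface implementation defined further down in this file; a GpuDevice constructor taking a StreamInterface pointer is assumed from this same file (it is not visible in this hunk), and the buffers are presumed to already live in device memory:

    #define EIGEN_USE_GPU
    #include <unsupported/Eigen/CXX11/Tensor>

    // Compiled with nvcc; gpu_a/gpu_b/gpu_c point to preallocated device buffers.
    void gpu_assign(float* gpu_a, float* gpu_b, float* gpu_c, int rows, int cols) {
      Eigen::TensorMap<Eigen::Tensor<float, 2> > A(gpu_a, rows, cols);
      Eigen::TensorMap<Eigen::Tensor<float, 2> > B(gpu_b, rows, cols);
      Eigen::TensorMap<Eigen::Tensor<float, 2> > C(gpu_c, rows, cols);

      Eigen::CudaStreamDevice stream;   // default CUDA stream on the current device
      Eigen::GpuDevice device(&stream); // assumed constructor, see the rest of this file

      C.device(device) = A + B;         // kernel launched on the device's stream
    }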
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H + +namespace Eigen { + +// Default device for the machine (typically a single cpu core) +struct DefaultDevice { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return internal::aligned_malloc(num_bytes); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + internal::aligned_free(buffer); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { + ::memcpy(dst, src, n); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { + memcpy(dst, src, n); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { + memcpy(dst, src, n); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { + ::memset(buffer, c, n); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { +#ifndef __CUDA_ARCH__ + // Running on the host CPU + return 1; +#else + // Running on a CUDA device + return 32; +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t memcpyThreshold() const { + return 2 * numThreads(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { +#ifndef __CUDA_ARCH__ + // Running on the host CPU + return l1CacheSize(); +#else + // Running on a CUDA device, return the amount of shared memory available. + return 48*1024; +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { +#ifndef __CUDA_ARCH__ + // Running single threaded on the host CPU + return l3CacheSize(); +#else + // Running on a CUDA device + return firstLevelCacheSize(); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { +#ifndef __CUDA_ARCH__ + // Running single threaded on the host CPU + // Should return an enum that encodes the ISA supported by the CPU + return 1; +#else + // Running on a CUDA device + return __CUDA_ARCH__ / 100; +#endif + } +}; + +// Multiple cpu cores +#ifdef EIGEN_USE_THREADS + +#if __cplusplus > 199711 +// This defines an interface that ThreadPoolDevice can take to use +// custom thread pools underneath. +class ThreadPoolInterface { + public: + virtual void Schedule(std::function<void()> fn) = 0; + + virtual ~ThreadPoolInterface() {} +}; +#endif + +// The implementation of the ThreadPool type ensures that the Schedule method +// runs the functions it is provided in FIFO order when the scheduling is done +// by a single thread. +#ifdef EIGEN_USE_CUSTOM_THREAD_POOL +class ThreadPool : public ThreadPoolInterface { + public: + // Construct a pool that contains "num_threads" threads. + explicit ThreadPool(int num_threads) : threads_(num_threads), waiters_(num_threads) { + for (int i = 0; i < num_threads; i++) { + threads_.push_back(new std::thread([this]() { WorkerLoop(); })); + } + } + + // Wait until all scheduled work has finished and then destroy the + // set of threads. + ~ThreadPool() { + { + // Wait for all work to get done. + std::unique_lock<std::mutex> l(mu_); + while (!pending_.empty()) { + empty_.wait(l); + } + exiting_ = true; + + // Wakeup all waiters. + for (auto w : waiters_) { + w->ready = true; + w->work = nullptr; + w->cv.notify_one(); + } + } + + // Wait for threads to finish. + for (auto t : threads_) { + t->join(); + delete t; + } + } + + // Schedule fn() for execution in the pool of threads. 
The functions are + // executed in the order in which they are scheduled. + void Schedule(std::function<void()> fn) final { + std::unique_lock<std::mutex> l(mu_); + if (waiters_.empty()) { + pending_.push_back(fn); + } else { + Waiter* w = waiters_.back(); + waiters_.pop_back(); + w->ready = true; + w->work = fn; + w->cv.notify_one(); + } + } + + protected: + void WorkerLoop() { + std::unique_lock<std::mutex> l(mu_); + Waiter w; + while (!exiting_) { + std::function<void()> fn; + if (pending_.empty()) { + // Wait for work to be assigned to me + w.ready = false; + waiters_.push_back(&w); + while (!w.ready) { + w.cv.wait(l); + } + fn = w.work; + w.work = nullptr; + } else { + // Pick up pending work + fn = pending_.front(); + pending_.pop_front(); + if (pending_.empty()) { + empty_.notify_all(); + } + } + if (fn) { + mu_.unlock(); + fn(); + mu_.lock(); + } + } + } + + private: + struct Waiter { + std::condition_variable cv; + std::function<void()> work; + bool ready; + }; + + std::mutex mu_; + FixedSizeVector<std::thread*> threads_; // All threads + FixedSizeVector<Waiter*> waiters_; // Stack of waiting threads. + std::deque<std::function<void()>> pending_; // Queue of pending work + std::condition_variable empty_; // Signaled on pending_.empty() + bool exiting_ = false; +}; + + +// Notification is an object that allows a user to to wait for another +// thread to signal a notification that an event has occurred. +// +// Multiple threads can wait on the same Notification object. +// but only one caller must call Notify() on the object. +class Notification { + public: + Notification() : notified_(false) {} + ~Notification() {} + + void Notify() { + std::unique_lock<std::mutex> l(mu_); + eigen_assert(!notified_); + notified_ = true; + cv_.notify_all(); + } + + void WaitForNotification() { + std::unique_lock<std::mutex> l(mu_); + while (!notified_) { + cv_.wait(l); + } + } + + private: + std::mutex mu_; + std::condition_variable cv_; + bool notified_; +}; + +#else + +// Notification is an object that allows a user to to wait for another +// thread to signal a notification that an event has occurred. +// +// Multiple threads can wait on the same Notification object. +// but only one caller must call Notify() on the object. +class Notification { + public: + Notification() : notified_(false) {} + ~Notification() {} + + void Notify() { + tensorflow::mutex_lock l(mu_); + eigen_assert(!notified_); + notified_ = true; + cv_.notify_all(); + } + + void WaitForNotification() { + tensorflow::mutex_lock l(mu_); + while (!notified_) { + cv_.wait(l); + } + } + + private: + tensorflow::mutex mu_; + tensorflow::condition_variable cv_; + bool notified_; +}; +#endif + +// Runs an arbitrary function and then calls Notify() on the passed in +// Notification. +template <typename Function, typename... Args> struct FunctionWrapper +{ + static void run(Notification* n, Function f, Args... 
args) { + f(args...); + n->Notify(); + } +}; + +static EIGEN_STRONG_INLINE void wait_until_ready(Notification* n) { + if (n) { + n->WaitForNotification(); + } +} + + +struct MemcpyExecutor { + typedef MemcpyExecutor Self; + + MemcpyExecutor(void *dst, const void *src) : + m_dst(static_cast<char *>(dst)), m_src(static_cast<const char *>(src)) { } + + static EIGEN_STRONG_INLINE void run(const MemcpyExecutor* exec, size_t idx, size_t block_size) { + ::memcpy(&(exec->m_dst[idx]), &(exec->m_src[idx]), block_size); + } + + private: + char* m_dst; + const char* m_src; +}; + +struct MemsetExecutor { + typedef MemsetExecutor Self; + + MemsetExecutor(void *buffer, int val) : + m_buffer(static_cast<char *>(buffer)), m_val(val) { } + + static EIGEN_STRONG_INLINE void run(const MemsetExecutor* exec, size_t idx, size_t block_size) { + ::memset(&(exec->m_buffer[idx]), exec->m_val, block_size); + } + + private: + char* m_buffer; + const int m_val; +}; + + +struct ThreadPoolDevice { + // The ownership of the thread pool remains with the caller. + ThreadPoolDevice(ThreadPoolInterface* pool, size_t num_cores) + : pool_(pool), num_threads_(num_cores) {} + + EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return internal::aligned_malloc(num_bytes); + } + + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + internal::aligned_free(buffer); + } + + EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { +#ifdef __ANDROID__ + ::memcpy(dst, src, n); +#else + if (n <= 32768) { + ::memcpy(dst, src, n); + } else { + MemcpyExecutor memcpy_executor(dst, src); + execute(memcpy_executor, n); + } +#endif + } + + EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { + memcpy(dst, src, n); + } + + EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { + memcpy(dst, src, n); + } + + EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { +#ifdef __ANDROID__ + ::memset(buffer, c, n); +#else + if (n <= 32768) { + ::memset(buffer, c, n); + } else { + MemsetExecutor memset_executor(buffer, c); + execute(memset_executor, n); + } +#endif + } + + EIGEN_STRONG_INLINE size_t numThreads() const { + return num_threads_; + } + + EIGEN_STRONG_INLINE size_t memcpyThreshold() const { + return 2 * numThreads(); + } + + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { + return l1CacheSize(); + } + + EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { + // The l3 cache size is shared between all the cores. + return l3CacheSize() / num_threads_; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { + // Should return an enum that encodes the ISA supported by the CPU + return 1; + } + + template <class Function, class... Args> + EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const { + Notification* n = new Notification(); + std::function<void()> func = + std::bind(&FunctionWrapper<Function, Args...>::run, n, f, args...); + pool_->Schedule(func); + return n; + } + + template <class Function, class... Args> + EIGEN_STRONG_INLINE void enqueue_and_forget(Function&& f, Args&&... 
args) const { + std::function<void()> func = std::bind(f, args...); + pool_->Schedule(func); + } + + private: + template<typename Executor> + EIGEN_STRONG_INLINE void execute(const Executor& exec, size_t n) const { + // don't spawn a thread to process fewer than 1024 bytes (chosen by small amount of + // experimentation) + // TODO: make block_size a multiple of packet_size and align everything + const size_t block_size = numext::maxi(static_cast<size_t>(1024), n / numThreads()); + const size_t block_count = n / block_size; + eigen_assert(block_count <= numThreads()); + + FixedSizeVector<Notification*> results(block_count); + for (size_t block_idx = 0; block_idx < block_count; block_idx++) { + results.push_back(enqueue(&Executor::run, &exec, block_idx * block_size, block_size)); + } + + if (block_count * block_size < n) { + Executor::run(&exec, block_count * block_size, n - block_count * block_size); + } + + // wait for threads to finish + for (size_t block_idx = 0; block_idx < block_count; block_idx++) { + results[block_idx]->WaitForNotification(); + delete results[block_idx]; + } + } + + // todo: NUMA, ... + size_t num_threads_; + ThreadPoolInterface* pool_; +}; +#endif + + +// GPU offloading +#ifdef EIGEN_USE_GPU + +// An interface abstracting away device specific memory allocator. +class Allocator { + public: + virtual ~Allocator() {} + EIGEN_DEVICE_FUNC virtual void* allocate(size_t num_bytes) const = 0; + EIGEN_DEVICE_FUNC virtual void deallocate(void* buffer) const = 0; +}; + +#if !defined(__GCUDACC__) && !defined(__GCUDACC_HOST__) + +// This defines an interface that GPUDevice can take to use +// CUDA streams underneath. +class StreamInterface { + public: + virtual ~StreamInterface() {} + + virtual const cudaStream_t& stream() const = 0; + virtual const cudaDeviceProp& deviceProperties() const = 0; + + // Allocate memory on the actual device where the computation will run + virtual void* allocate(size_t num_bytes) const = 0; + virtual void deallocate(void* buffer) const = 0; +}; + +static cudaDeviceProp* m_deviceProperties; +static bool m_devicePropInitialized = false; +static tensorflow::mutex m_devicePropInitMutex(tensorflow::LINKER_INITIALIZED); + +static void initializeDeviceProp() { + if (!m_devicePropInitialized) { + tensorflow::mutex_lock l(m_devicePropInitMutex); + if (!m_devicePropInitialized) { + int num_devices; + cudaError_t status = cudaGetDeviceCount(&num_devices); + eigen_check(status == cudaSuccess); + m_deviceProperties = new cudaDeviceProp[num_devices]; + for (int i = 0; i < num_devices; ++i) { + status = cudaGetDeviceProperties(&m_deviceProperties[i], i); + eigen_check(status == cudaSuccess); + } + m_devicePropInitialized = true; + } + } +} + +static const cudaStream_t default_stream = cudaStreamDefault; + +class CudaStreamDevice : public StreamInterface { + public: + // Use the default stream on the current device + CudaStreamDevice() : stream_(&default_stream) { + cudaGetDevice(&device_); + initializeDeviceProp(); + } + // Use the default stream on the specified device + CudaStreamDevice(int device) : stream_(&default_stream), device_(device) { + initializeDeviceProp(); + } + // Use the specified stream. Note that it's the + // caller responsibility to ensure that the stream can run on + // the specified device. If no device is specified the code + // assumes that the stream is associated to the current gpu device. 
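+  //
+  // Illustrative usage sketch (assumes the CUDA runtime is initialized and
+  // that `my_stream` is a hypothetical stream created with cudaStreamCreate;
+  // GpuDevice is defined further below in this file):
+  //
+  //   cudaStream_t my_stream;
+  //   cudaStreamCreate(&my_stream);
+  //   Eigen::CudaStreamDevice stream_device(&my_stream, /*device=*/0);
+  //   Eigen::GpuDevice gpu_device(&stream_device);
+  //   void* buf = gpu_device.allocate(1024);  // allocated on device 0
+  //   gpu_device.deallocate(buf);
+  //   cudaStreamDestroy(my_stream);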
+ CudaStreamDevice(const cudaStream_t* stream, int device = -1) + : stream_(stream), device_(device) { + if (device < 0) { + cudaGetDevice(&device_); + } else { + int num_devices; + cudaError_t err = cudaGetDeviceCount(&num_devices); + eigen_check(err == cudaSuccess); + eigen_check(device < num_devices); + device_ = device; + } + initializeDeviceProp(); + } + + const cudaStream_t& stream() const { return *stream_; } + const cudaDeviceProp& deviceProperties() const { + return m_deviceProperties[device_]; + } + virtual void* allocate(size_t num_bytes) const { + cudaError_t err = cudaSetDevice(device_); + eigen_check(err == cudaSuccess); + void* result; + err = cudaMalloc(&result, num_bytes); + eigen_check(err == cudaSuccess); + eigen_check(result != NULL); + return result; + } + virtual void deallocate(void* buffer) const { + cudaError_t err = cudaSetDevice(device_); + eigen_check(err == cudaSuccess); + assert(buffer != NULL); + err = cudaFree(buffer); + assert(err == cudaSuccess); + } + + private: + const cudaStream_t* stream_; + int device_; +}; + +static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) { + cudaError_t status = cudaDeviceSetSharedMemConfig(config); + eigen_check(status == cudaSuccess); +} + +struct GpuDevice { + // Neither the cudastream nor the allocator is not owned: the caller is + // responsible for their initialization and eventual destruction. + explicit GpuDevice(const StreamInterface* stream) : stream_(stream) { + eigen_assert(stream); + } + + // TODO(bsteiner): This is an internal API, we should not expose it. + EIGEN_STRONG_INLINE const cudaStream_t& stream() const { + return stream_->stream(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { +#ifndef __CUDA_ARCH__ + return stream_->allocate(num_bytes); +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); + return NULL; +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { +#ifndef __CUDA_ARCH__ + stream_->deallocate(buffer); +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { +#ifndef __CUDA_ARCH__ + cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, + stream_->stream()); + assert(err == cudaSuccess); +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { +#ifndef __CUDA_ARCH__ + cudaError_t err = + cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream()); + assert(err == cudaSuccess); +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { +#ifndef __CUDA_ARCH__ + cudaError_t err = + cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream()); + assert(err == cudaSuccess); +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { +#ifndef __CUDA_ARCH__ + cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream()); + assert(err == cudaSuccess); +#else + eigen_assert(false && "The 
default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { + // FIXME + return 32; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t memcpyThreshold() const { + return 4 * 1024 * 1024; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { + // FIXME + return 48*1024; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { + // We won't try to take advantage of the l2 cache for the time being, and + // there is no l3 cache on cuda devices. + return firstLevelCacheSize(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { +#ifndef __CUDA_ARCH__ + cudaError_t err = cudaStreamSynchronize(stream_->stream()); + assert(err == cudaSuccess); +#else + assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + inline int getNumCudaMultiProcessors() const { + return stream_->deviceProperties().multiProcessorCount; + } + inline int maxCudaThreadsPerBlock() const { + return stream_->deviceProperties().maxThreadsPerBlock; + } + inline int maxCudaThreadsPerMultiProcessor() const { + return stream_->deviceProperties().maxThreadsPerMultiProcessor; + } + inline int sharedMemPerBlock() const { + return stream_->deviceProperties().sharedMemPerBlock; + } + inline int majorDeviceVersion() const { + return stream_->deviceProperties().major; + } + + // This function checks if the CUDA runtime recorded an error for the + // underlying stream device. + inline bool ok() const { + cudaError_t error = cudaStreamQuery(stream_->stream()); + return (error == cudaSuccess) || (error == cudaErrorNotReady); + } + + private: + const StreamInterface* stream_; +}; + +inline void assertCudaOk() { + cudaError_t err = cudaGetLastError(); + + assert(err != cudaErrorMissingConfiguration); + assert(err != cudaErrorMemoryAllocation); + assert(err != cudaErrorInitializationError); + assert(err != cudaErrorLaunchFailure); + assert(err != cudaErrorPriorLaunchFailure); + assert(err != cudaErrorLaunchTimeout); + assert(err != cudaErrorLaunchOutOfResources); + assert(err != cudaErrorInvalidDeviceFunction); + assert(err != cudaErrorInvalidConfiguration); + assert(err != cudaErrorInvalidDevice); + assert(err != cudaErrorInvalidValue); + assert(err != cudaErrorInvalidPitchValue); + assert(err != cudaErrorInvalidSymbol); + assert(err != cudaErrorMapBufferObjectFailed); + assert(err != cudaErrorUnmapBufferObjectFailed); + assert(err != cudaErrorInvalidHostPointer); + assert(err != cudaErrorInvalidDevicePointer); + assert(err != cudaErrorInvalidTexture); + assert(err != cudaErrorInvalidTextureBinding); + assert(err != cudaErrorInvalidChannelDescriptor); + assert(err != cudaErrorInvalidMemcpyDirection); + assert(err != cudaErrorAddressOfConstant); + assert(err != cudaErrorTextureFetchFailed); + assert(err != cudaErrorTextureNotBound); + assert(err != cudaErrorSynchronizationError); + assert(err != cudaErrorInvalidFilterSetting); + assert(err != cudaErrorInvalidNormSetting); + assert(err != cudaErrorMixedDeviceExecution); + assert(err != cudaErrorCudartUnloading); + assert(err != cudaErrorUnknown); + assert(err != cudaErrorNotYetImplemented); + assert(err != cudaErrorMemoryValueTooLarge); + assert(err != cudaErrorInvalidResourceHandle); + assert(err != cudaErrorNotReady); + assert(err != cudaErrorInsufficientDriver); + assert(err != cudaErrorSetOnActiveProcess); + assert(err != cudaErrorInvalidSurface); + assert(err != cudaErrorNoDevice); + 
assert(err != cudaErrorECCUncorrectable); + assert(err != cudaErrorSharedObjectSymbolNotFound); + assert(err != cudaErrorSharedObjectInitFailed); + assert(err != cudaErrorUnsupportedLimit); + assert(err != cudaErrorDuplicateVariableName); + assert(err != cudaErrorDuplicateTextureName); + assert(err != cudaErrorDuplicateSurfaceName); + assert(err != cudaErrorDevicesUnavailable); + assert(err != cudaErrorInvalidKernelImage); + assert(err != cudaErrorNoKernelImageForDevice); + assert(err != cudaErrorIncompatibleDriverContext); + assert(err != cudaErrorPeerAccessAlreadyEnabled); + assert(err != cudaErrorPeerAccessNotEnabled); + assert(err != cudaErrorDeviceAlreadyInUse); + assert(err != cudaErrorProfilerDisabled); + assert(err != cudaErrorProfilerNotInitialized); + assert(err != cudaErrorProfilerAlreadyStarted); + assert(err != cudaErrorProfilerAlreadyStopped); + assert(err != cudaErrorAssert); + assert(err != cudaErrorTooManyPeers); + assert(err != cudaErrorHostMemoryAlreadyRegistered); + assert(err != cudaErrorHostMemoryNotRegistered); + assert(err != cudaErrorOperatingSystem); + assert(err != cudaErrorStartupFailure); + assert(err != cudaErrorApiFailureBase); + + // catch errors types introduced after this function was written + assert(err == cudaSuccess); +} + +#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, \ + ...) \ + do { \ + (kernel)<<<(gridsize), (blocksize), (sharedmem), (device).stream()>>>( \ + __VA_ARGS__); \ + assertCudaOk(); \ + } while (false) + +#else // __GCUDACC__ + +// The following is the version of GpuDevice for StreamExecutor +// (go/gpuexecutor) a GPU runtime that supports both CUDA and OpenCL. +// StreamExecutor is being developed as an open-source replacement for the CUDA +// runtime and is the runtime used when compiling with gcudacc. Differences +// between the CUDA runtime and StreamExecutor are abstracted away behind +// GpuDevice. + +// TODO(jpienaar): Temporary workaround until b/18409724 is addressed. 
+enum cudaSharedMemConfig +{ + cudaSharedMemBankSizeDefault = 0, + cudaSharedMemBankSizeFourByte = 1, + cudaSharedMemBankSizeEightByte = 2 +}; + +static inline void setCudaSharedMemConfig(cudaSharedMemConfig cache_config) { + // TODO(jpienaar): fix when implemented (b/18409724) +} + +struct GpuDevice { + GpuDevice() + : stream_(perftools::gputools::MachineManager::singleton()->stream_for_device(0)), + allocator_(nullptr), + stream_exec_(stream_->parent()) {} + + GpuDevice(perftools::gputools::Stream* stream, + const Allocator* alloc = nullptr) + : stream_(stream), allocator_(alloc), stream_exec_(stream_->parent()) { } + + EIGEN_STRONG_INLINE perftools::gputools::Stream* stream() const { + return stream_; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + if (allocator_ != nullptr) return allocator_->allocate(num_bytes); +#ifndef __CUDA_ARCH__ + perftools::gputools::DeviceMemory<char> mem = + stream_exec_->AllocateArray<char>(num_bytes); + return mem.opaque(); +#else + assert(false && + "The default device should be used instead to generate kernel code"); + return nullptr; +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + if (allocator_ != nullptr) { + allocator_->deallocate(buffer); + return; + } +#ifndef __CUDA_ARCH__ + perftools::gputools::DeviceMemoryBase gpu_mem(buffer); + stream_exec_->Deallocate(&gpu_mem); +#else + assert(false && + "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, + size_t n) const { +#ifndef __CUDA_ARCH__ + perftools::gputools::DeviceMemoryBase gpu_to(dst); + if (!stream_->ThenMemcpy(&gpu_to, perftools::gputools::DeviceMemoryBase( + const_cast<void*>(src)), + n).ok()) { + assert(false && + "failed during enqueue of 'copy perftools::gputools to " + "perftools::gputools'"); + } +#else + assert(false && + "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { +#ifndef __CUDA_ARCH__ + perftools::gputools::DeviceMemoryBase gpu_to(dst); + if (!stream_->ThenMemcpy(&gpu_to, src, n).ok()) { + assert(false && "failed while enqueuing memcpy from host to device"); + } +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { +#ifndef __CUDA_ARCH__ + if (!stream_->ThenMemcpy(dst, perftools::gputools::DeviceMemoryBase( + const_cast<void*>(src)), + n).ok()) { + assert(false && "failed while enqueuing memcpy from device to host"); + } +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { +#ifndef __CUDA_ARCH__ + perftools::gputools::DeviceMemoryBase gpu_buffer{buffer}; + if (!stream_exec_->Memset32(stream_, &gpu_buffer, c, n)) { + assert(false && "GPU memset failed."); + } +#else + assert(false && + "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { + // FIXME + return 32; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t memcpyThreshold() const { + return 4 * 1024 * 1024; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { 
+ // FIXME + return 48*1024; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { + // We won't try to take advantage of the l2 cache for the time being, and + // there is no l3 cache on cuda devices. + return firstLevelCacheSize(); + } + + EIGEN_STRONG_INLINE void synchronize() const { + stream_->BlockHostUntilDone(); + } + + // A gpu::DeviceDescription is cached inside a StreamExecutor, so these calls + // aren't expensive/wasteful. + EIGEN_DEVICE_FUNC inline int getNumCudaMultiProcessors() const { + return stream_exec_->GetDeviceDescription().core_count(); + } + + EIGEN_DEVICE_FUNC inline int maxCudaThreadsPerBlock() const { + return stream_exec_->GetDeviceDescription().threads_per_block_limit(); + } + + EIGEN_DEVICE_FUNC inline int maxCudaThreadsPerMultiProcessor() const { + return stream_exec_->GetDeviceDescription().threads_per_core_limit(); + } + + EIGEN_DEVICE_FUNC inline int sharedMemPerBlock() const { + return stream_exec_->GetDeviceDescription().shared_memory_per_block(); + } + + EIGEN_DEVICE_FUNC inline int majorDeviceVersion() const { + int major, minor; + if (stream_exec_->GetDeviceDescription().cuda_compute_capability(&major, + &minor)) { + return major; + } else { + return 0; + } + } + + inline bool ok() const { return stream_->ok(); } + + private: + perftools::gputools::Stream* stream_; + perftools::gputools::StreamExecutor* stream_exec_; + const Allocator* allocator_; +}; + +#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)\ + (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ + CHECK((device).stream()->ok()); +#endif // __GCUDACC__ + +#endif // EIGEN_USE_GPU +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h new file mode 100644 index 0000000000..19e922f92f --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h @@ -0,0 +1,235 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H +#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H + +namespace Eigen { + +/** \internal + * + * \class TensorDimensionList + * \ingroup CXX11_Tensor_Module + * + * \brief Special case of tensor index list used to list all the dimensions of a tensor of rank n. 
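+ *
+ * Illustrative sketch: DimensionList simply maps every index to itself, so for
+ * a rank-3 tensor
+ * \code
+ * DimensionList<DenseIndex, 3> dims;
+ * // dims[0] == 0, dims[1] == 1, dims[2] == 2, i.e. every dimension is listed.
+ * \endcode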
+ * + * \sa Tensor + */ + +template <typename Index, std::size_t Rank> struct DimensionList { + const Index operator[] (const Index i) const { return i; } +}; + +namespace internal { + +template<typename Index, std::size_t Rank> struct array_size<DimensionList<Index, Rank> > { + static const size_t value = Rank; +}; +template<typename Index, std::size_t Rank> struct array_size<const DimensionList<Index, Rank> > { + static const size_t value = Rank; +}; + +template<DenseIndex n, typename Index, std::size_t Rank> const Index array_get(DimensionList<Index, Rank>& a) { + return n; +} +template<DenseIndex n, typename Index, std::size_t Rank> const Index array_get(const DimensionList<Index, Rank>& a) { + return n; +} + + +#if defined(EIGEN_HAS_CONSTEXPR) +template <typename Index, std::size_t Rank> +struct index_known_statically<DimensionList<Index, Rank> > { + constexpr bool operator() (const DenseIndex) const { + return true; + } +}; +template <typename Index, std::size_t Rank> +struct index_known_statically<const DimensionList<Index, Rank> > { + constexpr bool operator() (const DenseIndex) const { + return true; + } +}; + +template <typename Index, std::size_t Rank> +struct all_indices_known_statically<DimensionList<Index, Rank> > { + constexpr bool operator() () const { + return true; + } +}; +template <typename Index, std::size_t Rank> +struct all_indices_known_statically<const DimensionList<Index, Rank> > { + constexpr bool operator() () const { + return true; + } +}; + +template <typename Index, std::size_t Rank> +struct indices_statically_known_to_increase<DimensionList<Index, Rank> > { + constexpr bool operator() () const { + return true; + } +}; +template <typename Index, std::size_t Rank> +struct indices_statically_known_to_increase<const DimensionList<Index, Rank> > { + constexpr bool operator() () const { + return true; + } +}; + +template <typename Index, std::size_t Rank> +struct index_statically_eq<DimensionList<Index, Rank> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return i == value; + } +}; +template <typename Index, std::size_t Rank> +struct index_statically_eq<const DimensionList<Index, Rank> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return i == value; + } +}; + +template <typename Index, std::size_t Rank> +struct index_statically_ne<DimensionList<Index, Rank> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return i != value; + } +}; +template <typename Index, std::size_t Rank> +struct index_statically_ne<const DimensionList<Index, Rank> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return i != value; + } +}; + +template <typename Index, std::size_t Rank> +struct index_statically_gt<DimensionList<Index, Rank> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return i > value; + } +}; +template <typename Index, std::size_t Rank> +struct index_statically_gt<const DimensionList<Index, Rank> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return i > value; + } +}; + +template <typename Index, std::size_t Rank> +struct index_statically_lt<DimensionList<Index, Rank> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return i < value; + } +}; +template <typename Index, std::size_t Rank> +struct index_statically_lt<const DimensionList<Index, Rank> > { + constexpr bool operator() (const DenseIndex i, const 
DenseIndex value) const { + return i < value; + } +}; + +#else +template <typename Index, std::size_t Rank> +struct index_known_statically<DimensionList<Index, Rank> > { + EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex) const { + return true; + } +}; +template <typename Index, std::size_t Rank> +struct index_known_statically<const DimensionList<Index, Rank> > { + EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex) const { + return true; + } +}; + +template <typename Index, std::size_t Rank> +struct all_indices_known_statically<DimensionList<Index, Rank> > { + EIGEN_ALWAYS_INLINE bool operator() () const { + return true; + } +}; +template <typename Index, std::size_t Rank> +struct all_indices_known_statically<const DimensionList<Index, Rank> > { + EIGEN_ALWAYS_INLINE bool operator() () const { + return true; + } +}; + +template <typename Index, std::size_t Rank> +struct indices_statically_known_to_increase<DimensionList<Index, Rank> > { + EIGEN_ALWAYS_INLINE bool operator() () const { + return true; + } +}; +template <typename Index, std::size_t Rank> +struct indices_statically_known_to_increase<const DimensionList<Index, Rank> > { + EIGEN_ALWAYS_INLINE bool operator() () const { + return true; + } +}; + +template <typename Index, std::size_t Rank> +struct index_statically_eq<DimensionList<Index, Rank> > { + EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex i, const DenseIndex value) const { + return false; + } +}; +template <typename Index, std::size_t Rank> +struct index_statically_eq<const DimensionList<Index, Rank> > { + EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex i, const DenseIndex value) const { + return false; + } +}; + +template <typename Index, std::size_t Rank> +struct index_statically_ne<DimensionList<Index, Rank> > { + EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex i, const DenseIndex value) const { + return false; + } +}; +template <typename Index, std::size_t Rank> +struct index_statically_ne<const DimensionList<Index, Rank> > { + EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex i, const DenseIndex value) const { + return false; + } +}; + +template <typename Index, std::size_t Rank> +struct index_statically_gt<DimensionList<Index, Rank> > { + EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex i, const DenseIndex value) const { + return false; + } +}; +template <typename Index, std::size_t Rank> +struct index_statically_gt<const DimensionList<Index, Rank> > { + EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex i, const DenseIndex value) const { + return false; + } +}; + +template <typename Index, std::size_t Rank> +struct index_statically_lt<DimensionList<Index, Rank> > { + EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex i, const DenseIndex value) const { + return false; + } +}; +template <typename Index, std::size_t Rank> +struct index_statically_lt<const DimensionList<Index, Rank> > { + EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex i, const DenseIndex value) const { + return false; + } +}; +#endif + +} // end namespace internal +} // end namespace Eigen + + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h new file mode 100644 index 0000000000..8bf5272ec8 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -0,0 +1,597 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H +#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H + + +namespace Eigen { + +/** \internal + * + * \class TensorDimensions + * \ingroup CXX11_Tensor_Module + * + * \brief Set of classes used to encode and store the dimensions of a Tensor. + * + * The Sizes class encodes as part of the type the number of dimensions and the + * sizes corresponding to each dimension. It uses no storage space since it is + * entirely known at compile time. + * The DSizes class is its dynamic sibling: the number of dimensions is known + * at compile time but the sizes are set during execution. + * + * \sa Tensor + */ + +// Can't use std::pairs on cuda devices +template <typename Index> struct IndexPair { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) { } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Index f, Index s) : first(f), second(s) { } + Index first; + Index second; +}; + +// Boilerplate code +namespace internal { + +template<std::size_t n, typename Dimension> struct dget { + static const std::size_t value = get<n, typename Dimension::Base>::value; +}; + + +template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor> +struct fixed_size_tensor_index_linearization_helper +{ + template <typename Dimensions> EIGEN_DEVICE_FUNC + static inline Index run(array<Index, NumIndices> const& indices, + const Dimensions& dimensions) + { + return array_get<RowMajor ? n - 1 : (NumIndices - n)>(indices) + + dget<RowMajor ? n - 1 : (NumIndices - n), Dimensions>::value * + fixed_size_tensor_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions); + } +}; + +template<typename Index, std::size_t NumIndices, bool RowMajor> +struct fixed_size_tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor> +{ + template <typename Dimensions> EIGEN_DEVICE_FUNC + static inline Index run(array<Index, NumIndices> const& indices, + const Dimensions&) + { + return 0; + } +}; + +template<typename Index, std::size_t n> +struct fixed_size_tensor_index_extraction_helper +{ + template <typename Dimensions> EIGEN_DEVICE_FUNC + static inline Index run(const Index index, + const Dimensions& dimensions) + { + const Index mult = (index == n) ? 1 : 0; + return array_get<n>(dimensions) * mult + + fixed_size_tensor_index_extraction_helper<Index, n - 1>::run(index, dimensions); + } +}; + +template<typename Index> +struct fixed_size_tensor_index_extraction_helper<Index, 0> +{ + template <typename Dimensions> EIGEN_DEVICE_FUNC + static inline Index run(const Index index, + const Dimensions& dimensions) + { + const Index mult = (index == 0) ? 1 : 0; + return array_get<0>(dimensions) * mult; + } +}; + +} // end namespace internal + + +// Fixed size +#ifndef EIGEN_EMULATE_CXX11_META_H +template <typename std::size_t... 
Indices> +struct Sizes : internal::numeric_list<std::size_t, Indices...> { + typedef internal::numeric_list<std::size_t, Indices...> Base; + static const std::size_t total_size = internal::arg_prod(Indices...); + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const { + return Base::count; + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t TotalSize() { + return internal::arg_prod(Indices...); + } + + Sizes() { } + template <typename DenseIndex> + explicit Sizes(const array<DenseIndex, Base::count>& /*indices*/) { + // todo: add assertion + } +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template <typename... DenseIndex> Sizes(DenseIndex...) { } + explicit Sizes(std::initializer_list<std::size_t> /*l*/) { + // todo: add assertion + } +#endif + + template <typename T> Sizes& operator = (const T& /*other*/) { + // add assertion failure if the size of other is different + return *this; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const int index) const { + return internal::fixed_size_tensor_index_extraction_helper<std::ptrdiff_t, Base::count - 1>::run(index, *this); + } + + template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + size_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const { + return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, *static_cast<const Base*>(this)); + } + template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + size_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const { + return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, *static_cast<const Base*>(this)); + } +}; + +namespace internal { +template <typename std::size_t... Indices> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes<Indices...>&) { + return Sizes<Indices...>::total_size; +} +} + +#else + +template <std::size_t n> +struct non_zero_size { + typedef internal::type2val<std::size_t, n> type; +}; +template <> +struct non_zero_size<0> { + typedef internal::null_type type; +}; + +template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0, std::size_t V5=0> struct Sizes { + typedef typename internal::make_type_list<typename non_zero_size<V1>::type, typename non_zero_size<V2>::type, typename non_zero_size<V3>::type, typename non_zero_size<V4>::type, typename non_zero_size<V5>::type >::type Base; + static const size_t count = Base::count; + static const std::size_t total_size = internal::arg_prod<Base>::value; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const { + return count; + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() { + return internal::arg_prod<Base>::value; + } + + Sizes() { } + template <typename DenseIndex> + explicit Sizes(const array<DenseIndex, Base::count>& indices) { + // todo: add assertion + } +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template <typename... DenseIndex> Sizes(DenseIndex... 
indices) { } + explicit Sizes(std::initializer_list<std::size_t> l) { + // todo: add assertion + } +#else + EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0) { + } + EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1) { + } + EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) { + } + EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) { + } + EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) { + } +#endif + + template <typename T> Sizes& operator = (const T& other) { + // to do: check the size of other + return *this; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t operator[] (const int index) const { + switch (index) { + case 0: + return internal::get<0, Base>::value; + case 1: + return internal::get<1, Base>::value; + case 2: + return internal::get<2, Base>::value; + case 3: + return internal::get<3, Base>::value; + case 4: + return internal::get<4, Base>::value; + default: + eigen_assert(false && "index overflow"); + return static_cast<std::size_t>(-1); + } + } + + template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + size_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const { + return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, *this); + } + template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + size_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const { + return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, *this); + } +}; + +namespace internal { +template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes<V1, V2, V3, V4, V5>&) { + return Sizes<V1, V2, V3, V4, V5>::total_size; +} +} + +#endif + +// Boilerplate +namespace internal { +template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor> +struct tensor_index_linearization_helper +{ + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index run(array<Index, NumIndices> const& indices, array<Index, NumIndices> const& dimensions) + { + return array_get<RowMajor ? n : (NumIndices - n - 1)>(indices) + + array_get<RowMajor ? n : (NumIndices - n - 1)>(dimensions) * + tensor_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions); + } +}; + +template<typename Index, std::size_t NumIndices, bool RowMajor> +struct tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor> +{ + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index run(array<Index, NumIndices> const& indices, array<Index, NumIndices> const&) + { + return array_get<RowMajor ? 
0 : NumIndices - 1>(indices); + } +}; +} // end namespace internal + + + +// Dynamic size +template <typename DenseIndex, std::size_t NumDims> +struct DSizes : array<DenseIndex, NumDims> { + typedef array<DenseIndex, NumDims> Base; + static const std::size_t count = NumDims; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const { + return NumDims; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() const { + return internal::array_prod(*static_cast<const Base*>(this)); + } + + EIGEN_DEVICE_FUNC DSizes() { + for (int i = 0 ; i < NumDims; ++i) { + (*this)[i] = 0; + } + } + EIGEN_DEVICE_FUNC DSizes(const array<DenseIndex, NumDims>& a) : Base(a) { } + + EIGEN_DEVICE_FUNC DSizes(const DimensionList<DenseIndex, NumDims>& a) { + for (int i = 0 ; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + } + +#ifndef EIGEN_EMULATE_CXX11_META_H + template <typename std::size_t... Indices> + EIGEN_DEVICE_FUNC DSizes(const Sizes<Indices...>& a) { + for (int i = 0 ; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + } +#else + template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> + EIGEN_DEVICE_FUNC DSizes(const Sizes<V1, V2, V3, V4, V5>& a) { + for (int i = 0 ; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + } +#endif + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, IndexTypes... otherDimensions) { + EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE) + (*this) = array<DenseIndex, NumDims>{{firstDimension, otherDimensions...}}; + } +#else + EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0) { + eigen_assert(NumDims == 1); + (*this)[0] = i0; + } + EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1) { + eigen_assert(NumDims == 2); + (*this)[0] = i0; + (*this)[1] = i1; + } + EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) { + eigen_assert(NumDims == 3); + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + } + EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) { + eigen_assert(NumDims == 4); + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + (*this)[3] = i3; + } + EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) { + eigen_assert(NumDims == 5); + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + (*this)[3] = i3; + (*this)[4] = i4; + } +#endif + + EIGEN_DEVICE_FUNC DSizes& operator = (const array<DenseIndex, NumDims>& other) { + *static_cast<Base*>(this) = other; + return *this; + } + + // A constexpr would be so much better here + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfColMajor(const array<DenseIndex, NumDims>& indices) const { + return internal::tensor_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, false>::run(indices, *static_cast<const Base*>(this)); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfRowMajor(const array<DenseIndex, NumDims>& indices) const { + return internal::tensor_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, true>::run(indices, *static_cast<const Base*>(this)); + } +}; + + + + +// Boilerplate +namespace internal { +template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor> +struct tensor_vsize_index_linearization_helper +{ + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + 
Index run(array<Index, NumIndices> const& indices, std::vector<DenseIndex> const& dimensions) + { + return array_get<RowMajor ? n : (NumIndices - n - 1)>(indices) + + array_get<RowMajor ? n : (NumIndices - n - 1)>(dimensions) * + tensor_vsize_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions); + } +}; + +template<typename Index, std::size_t NumIndices, bool RowMajor> +struct tensor_vsize_index_linearization_helper<Index, NumIndices, 0, RowMajor> +{ + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index run(array<Index, NumIndices> const& indices, std::vector<DenseIndex> const&) + { + return array_get<RowMajor ? 0 : NumIndices - 1>(indices); + } +}; +} // end namespace internal + + +template <typename DenseIndex> +struct VSizes : std::vector<DenseIndex> { + typedef std::vector<DenseIndex> Base; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const { + return Base::size(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() const { + return internal::array_prod(*static_cast<const Base*>(this)); + } + + EIGEN_DEVICE_FUNC VSizes() { } + EIGEN_DEVICE_FUNC explicit VSizes(const std::vector<DenseIndex>& a) : Base(a) { } + + template <std::size_t NumDims> + EIGEN_DEVICE_FUNC explicit VSizes(const array<DenseIndex, NumDims>& a) { + this->resize(NumDims); + for (int i = 0; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + } + template <std::size_t NumDims> + EIGEN_DEVICE_FUNC explicit VSizes(const DSizes<DenseIndex, NumDims>& a) { + this->resize(NumDims); + for (int i = 0; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + } + + EIGEN_DEVICE_FUNC explicit VSizes(const DenseIndex i0) { + this->resize(1); + (*this)[0] = i0; + } + EIGEN_DEVICE_FUNC explicit VSizes(const DenseIndex i0, const DenseIndex i1) { + this->resize(2); + (*this)[0] = i0; + (*this)[1] = i1; + } + EIGEN_DEVICE_FUNC explicit VSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) { + this->resize(3); + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + } + EIGEN_DEVICE_FUNC explicit VSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) { + this->resize(4); + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + (*this)[3] = i3; + } + EIGEN_DEVICE_FUNC explicit VSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) { + this->resize(5); + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + (*this)[3] = i3; + (*this)[4] = i4; + } + + EIGEN_DEVICE_FUNC VSizes& operator = (const std::vector<DenseIndex>& other) { + *static_cast<Base*>(this) = other; + return *this; + } + template <std::size_t NumDims> + EIGEN_DEVICE_FUNC VSizes& operator = (const array<DenseIndex, NumDims>& a) { + this->resize(NumDims); + for (int i = 0; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + return *this; + } + template <std::size_t NumDims> + EIGEN_DEVICE_FUNC VSizes& operator = (const DSizes<DenseIndex, NumDims>& a) { + this->resize(NumDims); + for (int i = 0; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + return *this; + } + + // A constexpr would be so much better here + template <std::size_t NumDims> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfColMajor(const array<DenseIndex, NumDims>& indices) const { + return internal::tensor_vsize_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, false>::run(indices, *static_cast<const Base*>(this)); + } + template <std::size_t NumDims> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfRowMajor(const array<DenseIndex, 
NumDims>& indices) const { + return internal::tensor_vsize_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, true>::run(indices, *static_cast<const Base*>(this)); + } +}; + + +// Boilerplate +namespace internal { +template <typename DenseIndex> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex array_prod(const VSizes<DenseIndex>& sizes) { + DenseIndex total_size = 1; + for (int i = 0; i < sizes.size(); ++i) { + total_size *= sizes[i]; + } + return total_size; +}; +} + +namespace internal { + +template <typename DenseIndex, std::size_t NumDims> struct array_size<const DSizes<DenseIndex, NumDims> > { + static const size_t value = NumDims; +}; +template <typename DenseIndex, std::size_t NumDims> struct array_size<DSizes<DenseIndex, NumDims> > { + static const size_t value = NumDims; +}; +template <typename DenseIndex> +struct array_size<VSizes<DenseIndex> > { + static const ptrdiff_t value = -1; +}; +#ifndef EIGEN_EMULATE_CXX11_META_H +template <typename std::size_t... Indices> struct array_size<const Sizes<Indices...> > { +static const size_t value = Sizes<Indices...>::count; +}; +template <typename std::size_t... Indices> struct array_size<Sizes<Indices...> > { +static const size_t value = Sizes<Indices...>::count; +}; +template <std::size_t n, typename std::size_t... Indices> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<Indices...>&) { + return get<n, internal::numeric_list<std::size_t, Indices...> >::value; +} +#else +template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> struct array_size<const Sizes<V1,V2,V3,V4,V5> > { + static const size_t value = Sizes<V1,V2,V3,V4,V5>::count; +}; +template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> struct array_size<Sizes<V1,V2,V3,V4,V5> > { + static const size_t value = Sizes<V1,V2,V3,V4,V5>::count; +}; +template <std::size_t n, std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<V1,V2,V3,V4,V5>& a) { + return get<n, typename Sizes<V1,V2,V3,V4,V5>::Base>::value; +} + +#endif + + +template <typename Dims1, typename Dims2, size_t n, size_t m> +struct sizes_match_below_dim { + static inline bool run(Dims1& dims1, Dims2& dims2) { + return false; + } +}; +template <typename Dims1, typename Dims2, size_t n> +struct sizes_match_below_dim<Dims1, Dims2, n, n> { + static inline bool run(Dims1& dims1, Dims2& dims2) { + return (array_get<n-1>(dims1) == array_get<n-1>(dims2)) & + sizes_match_below_dim<Dims1, Dims2, n-1, n-1>::run(dims1, dims2); + } +}; +template <typename Dims1, typename Dims2> +struct sizes_match_below_dim<Dims1, Dims2, 0, 0> { + static inline bool run(Dims1& dims1, Dims2& dims2) { + return true; + } +}; + +} // end namespace internal + + +template <typename Dims1, typename Dims2> +bool dimensions_match(Dims1& dims1, Dims2& dims2) { + return internal::sizes_match_below_dim<Dims1, Dims2, internal::array_size<Dims1>::value, internal::array_size<Dims2>::value>::run(dims1, dims2); +} + +template <typename IndexType, typename Dims2> +bool dimensions_match(const VSizes<IndexType>& dims1, Dims2& dims2) { + if (dims1.size() != internal::array_size<Dims2>::value) { + return false; + } + for (int i = 0; i < internal::array_size<Dims2>::value; ++i) { + if (dims1[i] != dims2[i]) { + return false; + } + } + return true; +} + +template <typename Dims1, typename IndexType> +bool dimensions_match(Dims1& dims1, const VSizes<IndexType>& dims2) { + if 
(internal::array_size<Dims1>::value != dims2.size()) { + return false; + } + for (int i = 0; i < internal::array_size<Dims1>::value; ++i) { + if (dims1[i] != dims2[i]) { + return false; + } + } + return true; +} + +template <typename IndexType> +bool dimensions_match(const VSizes<IndexType>& dims1, const VSizes<IndexType>& dims2) { + return dims1 == dims2; +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h new file mode 100644 index 0000000000..4ad431abae --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -0,0 +1,151 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H +#define EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H + +namespace Eigen { + +/** \class TensorForcedEval + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. + * + * + */ +namespace internal { +template<typename XprType> +struct traits<TensorEvalToOp<XprType> > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + + enum { + Flags = 0, + }; +}; + +template<typename XprType> +struct eval<TensorEvalToOp<XprType>, Eigen::Dense> +{ + typedef const TensorEvalToOp<XprType>& type; +}; + +template<typename XprType> +struct nested<TensorEvalToOp<XprType>, 1, typename eval<TensorEvalToOp<XprType> >::type> +{ + typedef TensorEvalToOp<XprType> type; +}; + +} // end namespace internal + + + + +template<typename XprType> +class TensorEvalToOp : public TensorBase<TensorEvalToOp<XprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorEvalToOp>::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename Eigen::internal::nested<TensorEvalToOp>::type Nested; + typedef typename Eigen::internal::traits<TensorEvalToOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorEvalToOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(CoeffReturnType* buffer, const XprType& expr) + : m_xpr(expr), m_buffer(buffer) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC CoeffReturnType* buffer() const { return m_buffer; } + + protected: + typename XprType::Nested m_xpr; + CoeffReturnType* m_buffer; +}; + + + +template<typename ArgType, typename Device> +struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device> +{ + typedef TensorEvalToOp<ArgType> XprType; + typedef typename ArgType::Scalar Scalar; + typedef typename 
TensorEvaluator<ArgType, Device>::Dimensions Dimensions; + + enum { + IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_device(device), m_buffer(op.buffer()) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() { + } + + typedef typename XprType::Index Index; + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* scalar) { + assert(scalar == NULL); + return m_impl.evalSubExprsIfNeeded(m_buffer); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { + m_buffer[i] = m_impl.coeff(i); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { + internal::pstoret<CoeffReturnType, PacketReturnType, Aligned>(m_buffer + i, m_impl.template packet<TensorEvaluator<ArgType, Device>::IsAligned ? Aligned : Unaligned>(i)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_buffer[index]; + } + + template<int LoadMode> + EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_buffer; } + + private: + TensorEvaluator<ArgType, Device> m_impl; + const Device& m_device; + CoeffReturnType* m_buffer; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h new file mode 100644 index 0000000000..f2ef2d85c1 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -0,0 +1,505 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H + +namespace Eigen { + +/** \class TensorEvaluator + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor evaluator classes. + * + * These classes are responsible for the evaluation of the tensor expression. + * + * TODO: add support for more types of expressions, in particular expressions + * leading to lvalues (slicing, reshaping, etc...) 
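+ *
+ * Illustrative sketch of the evaluation protocol defined below (Expr, expr and
+ * use() are placeholders for an arbitrary tensor expression and its consumer):
+ * \code
+ * Eigen::DefaultDevice device;
+ * TensorEvaluator<const Expr, Eigen::DefaultDevice> eval(expr, device);
+ * if (eval.evalSubExprsIfNeeded(NULL)) {
+ *   for (DenseIndex i = 0; i < eval.dimensions().TotalSize(); ++i) {
+ *     use(eval.coeff(i));  // coefficient-wise access
+ *   }
+ * }
+ * eval.cleanup();
+ * \endcode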
+ */ + +// Generic evaluator +template<typename Derived, typename Device> +struct TensorEvaluator +{ + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + + // NumDimensions is -1 for variable dim tensors + static const int NumCoords = internal::traits<Derived>::NumDimensions; + static const int SafeNumCoords = NumCoords >= 0 ? NumCoords : 0; + + enum { + IsAligned = Derived::IsAligned, + PacketAccess = Derived::PacketAccess, + BlockAccess = internal::is_arithmetic< + typename internal::remove_const<Scalar>::type>::value && + NumCoords >= 0, + Layout = Derived::Layout, + CoordAccess = NumCoords >= 0, + }; + + typedef typename internal::TensorBlock< + Index, typename internal::remove_const<Scalar>::type, SafeNumCoords, Layout> + TensorBlock; + typedef typename internal::TensorBlockReader< + Index, typename internal::remove_const<Scalar>::type, SafeNumCoords, Layout, + PacketAccess> TensorBlockReader; + typedef typename internal::TensorBlockWriter< + Index, typename internal::remove_const<Scalar>::type, SafeNumCoords, Layout, + PacketAccess> TensorBlockWriter; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorEvaluator(const Derived& m, const Device& device) + : m_data(const_cast<Scalar*>(m.data())), + m_dims(m.dimensions()), + m_device(device) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* dest) { + if (dest) { + m_device.memcpy((void*)dest, m_data, sizeof(Scalar) * m_dims.TotalSize()); + return false; + } + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(m_data); + return m_data[index]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + eigen_assert(m_data); + return m_data[index]; + } + + template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketReturnType packet(Index index) const + { + return internal::ploadt<PacketReturnType, LoadMode>(m_data + index); + } + + template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + return internal::pstoret<Scalar, PacketReturnType, StoreMode>(m_data + index, x); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, SafeNumCoords>& coords) const { + eigen_assert(m_data); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + return m_data[m_dims.IndexOfColMajor(coords)]; + } else { + return m_data[m_dims.IndexOfRowMajor(coords)]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, SafeNumCoords>& coords) { + eigen_assert(m_data); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + return m_data[m_dims.IndexOfColMajor(coords)]; + } else { + return m_data[m_dims.IndexOfRowMajor(coords)]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector<internal::TensorOpResourceRequirements>* resources) const { + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const { + assert(m_data != NULL); + TensorBlockReader::Run(block, m_data); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlock& block) { + 
assert(m_data != NULL); + TensorBlockWriter::Run(block, m_data); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; } + + protected: + Scalar* m_data; + Dimensions m_dims; + const Device& m_device; +}; + + +namespace { +template <typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T loadConstant(const T* address) { + return *address; + +} +// Use the texture cache on CUDA devices whenever possible +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 +template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float loadConstant(const float* address) { + return __ldg(address); +} +template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double loadConstant(const double* address) { + return __ldg(address); + + +} +#endif +} + + +// Default evaluator for rvalues +template<typename Derived, typename Device> +struct TensorEvaluator<const Derived, Device> +{ + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + + // NumDimensions is -1 for variable dim tensors + static const int NumCoords = internal::traits<Derived>::NumDimensions; + static const int SafeNumCoords = NumCoords >= 0 ? NumCoords : 0; + + enum { + IsAligned = Derived::IsAligned, + PacketAccess = Derived::PacketAccess, + BlockAccess = internal::is_arithmetic< + typename internal::remove_const<Scalar>::type>::value && + NumCoords >= 0, + Layout = Derived::Layout, + CoordAccess = NumCoords >= 0, + }; + + // TODO(andydavis) Add block/writeBlock accessors to Tensor and TensorMap so + // we can default BlockAccess to true above. + typedef typename internal::TensorBlock< + Index, typename internal::remove_const<Scalar>::type, SafeNumCoords, Layout> + TensorBlock; + typedef typename internal::TensorBlockReader< + Index, typename internal::remove_const<Scalar>::type, SafeNumCoords, Layout, + PacketAccess> TensorBlockReader; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) + : m_data(m.data()), m_dims(m.dimensions()), m_device(device) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + if (internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value && data) { + m_device.memcpy((void*)data, m_data, m_dims.TotalSize() * sizeof(Scalar)); + return false; + } + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(m_data); + return loadConstant(m_data+index); + } + + template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketReturnType packet(Index index) const + { + return internal::ploadt_ro<PacketReturnType, LoadMode>(m_data + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, SafeNumCoords>& coords) const { + eigen_assert(m_data); + const Index index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 
m_dims.IndexOfColMajor(coords) + : m_dims.IndexOfRowMajor(coords); + return loadConstant(m_data+index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector<internal::TensorOpResourceRequirements>* resources) const { + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const { + assert(m_data != NULL); + TensorBlockReader::Run(block, m_data); + } + + EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; } + + protected: + const Scalar* m_data; + Dimensions m_dims; + const Device& m_device; +}; + + + + +// -------------------- CwiseNullaryOp -------------------- + +template<typename NullaryOp, typename ArgType, typename Device> +struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device> +{ + typedef TensorCwiseNullaryOp<NullaryOp, ArgType> XprType; + + enum { + IsAligned = true, + PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC + TensorEvaluator(const XprType& op, const Device& device) + : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::traits<XprType>::Scalar CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { return true; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_functor(index); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_functor.packetOp(index); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + + private: + const NullaryOp m_functor; + TensorEvaluator<ArgType, Device> m_argImpl; +}; + + + +// -------------------- CwiseUnaryOp -------------------- + +template<typename UnaryOp, typename ArgType, typename Device> +struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device> +{ + typedef TensorCwiseUnaryOp<UnaryOp, ArgType> XprType; + + enum { + IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess & + internal::functor_traits<UnaryOp>::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_functor(op.functor()), + m_argImpl(op.nestedExpression(), device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::traits<XprType>::Scalar CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_argImpl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + 
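+    // The unary evaluator owns no storage of its own; simply release any
+    // temporary buffers the nested argument evaluator may have acquired in
+    // evalSubExprsIfNeeded().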
m_argImpl.cleanup(); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_functor(m_argImpl.coeff(index)); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_functor.packetOp(m_argImpl.template packet<LoadMode>(index)); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + + private: + const UnaryOp m_functor; + TensorEvaluator<ArgType, Device> m_argImpl; +}; + + +// -------------------- CwiseBinaryOp -------------------- + +template<typename BinaryOp, typename LeftArgType, typename RightArgType, typename Device> +struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType>, Device> +{ + typedef TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType> XprType; + + enum { + IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & + TensorEvaluator<RightArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & + TensorEvaluator<RightArgType, Device>::PacketAccess & + internal::functor_traits<BinaryOp>::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator<LeftArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_functor(op.functor()), + m_leftImpl(op.lhsExpression(), device), + m_rightImpl(op.rhsExpression(), device) + { + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout) || internal::traits<XprType>::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::traits<XprType>::Scalar CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const + { + // TODO: use right impl instead if right impl dimensions are known at compile time. 
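+    // The constructor asserted that both operands have matching dimensions,
+    // so reporting the left-hand side's dimensions is sufficient here.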
+ return m_leftImpl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + m_leftImpl.evalSubExprsIfNeeded(NULL); + m_rightImpl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_functor(m_leftImpl.coeff(index), m_rightImpl.coeff(index)); + } + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_functor.packetOp(m_leftImpl.template packet<LoadMode>(index), m_rightImpl.template packet<LoadMode>(index)); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + + private: + const BinaryOp m_functor; + TensorEvaluator<LeftArgType, Device> m_leftImpl; + TensorEvaluator<RightArgType, Device> m_rightImpl; +}; + + +// -------------------- SelectOp -------------------- + +template<typename IfArgType, typename ThenArgType, typename ElseArgType, typename Device> +struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>, Device> +{ + typedef TensorSelectOp<IfArgType, ThenArgType, ElseArgType> XprType; + typedef typename XprType::Scalar Scalar; + + enum { + IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned & + TensorEvaluator<ElseArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & + TensorEvaluator<ElseArgType, Device>::PacketAccess & + internal::packet_traits<Scalar>::HasBlend, + BlockAccess = false, + Layout = TensorEvaluator<IfArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_condImpl(op.ifExpression(), device), + m_thenImpl(op.thenExpression(), device), + m_elseImpl(op.elseExpression(), device) + { + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<IfArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<ThenArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<IfArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<ElseArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(dimensions_match(m_condImpl.dimensions(), m_thenImpl.dimensions())); + eigen_assert(dimensions_match(m_thenImpl.dimensions(), m_elseImpl.dimensions())); + } + + typedef typename XprType::Index Index; + typedef typename internal::traits<XprType>::Scalar CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef typename TensorEvaluator<IfArgType, Device>::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const + { + // TODO: use then or else impl instead if they happen to be known at compile time. + return m_condImpl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + m_condImpl.evalSubExprsIfNeeded(NULL); + m_thenImpl.evalSubExprsIfNeeded(NULL); + m_elseImpl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_condImpl.cleanup(); + m_thenImpl.cleanup(); + m_elseImpl.cleanup(); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_condImpl.coeff(index) ? 
m_thenImpl.coeff(index) : m_elseImpl.coeff(index); + } + template<int LoadMode> + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + { + const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; + internal::Selector<PacketSize> select; + for (Index i = 0; i < PacketSize; ++i) { + select.select[i] = m_condImpl.coeff(index+i); + } + return internal::pblend(select, + m_thenImpl.template packet<LoadMode>(index), + m_elseImpl.template packet<LoadMode>(index)); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + + private: + TensorEvaluator<IfArgType, Device> m_condImpl; + TensorEvaluator<ThenArgType, Device> m_thenImpl; + TensorEvaluator<ElseArgType, Device> m_elseImpl; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h new file mode 100644 index 0000000000..863c28ab43 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -0,0 +1,461 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H + +namespace Eigen { + +/** \class TensorExecutor + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor executor class. + * + * This class is responsible for launch the evaluation of the expression on + * the specified computing device. + */ +namespace internal { + +// Default strategy: the expression is evaluated with a single cpu thread. +template <typename Expression, typename Device, + bool Vectorizable, bool Tileable> +class TensorExecutor { + public: + typedef typename Expression::Index Index; + EIGEN_DEVICE_FUNC static inline void run(const Expression& expr, const Device& device = Device()) + { + TensorEvaluator<Expression, Device> evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const Index size = array_prod(evaluator.dimensions()); + for (Index i = 0; i < size; ++i) { + evaluator.evalScalar(i); + } + } + evaluator.cleanup(); + } +}; + +template <typename Expression> +class TensorExecutor<Expression, DefaultDevice, true, false> { + public: + typedef typename Expression::Index Index; + EIGEN_DEVICE_FUNC + static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) + { + TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const Index size = array_prod(evaluator.dimensions()); + const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size; + + // Manually unroll this loop since compilers don't do it. 
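+      // Four packets are processed per iteration, so UnrolledSize is `size`
+      // rounded down to a multiple of 4*PacketSize. For instance, with
+      // PacketSize == 4 and size == 103 the unrolled loop covers [0, 96),
+      // the single-packet loop covers [96, 100), and the scalar loop below
+      // handles the remaining 3 coefficients.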
+ const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize; + for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) { + evaluator.evalPacket(i); + evaluator.evalPacket(i+PacketSize); + evaluator.evalPacket(i+2*PacketSize); + evaluator.evalPacket(i+3*PacketSize); + } + const Index VectorizedSize = (size / PacketSize) * PacketSize; + for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) { + evaluator.evalPacket(i); + } + for (Index i = VectorizedSize; i < size; ++i) { + evaluator.evalScalar(i); + } + } + evaluator.cleanup(); + } +}; + +template <typename Expression, bool Vectorizable> +class TensorExecutor<Expression, DefaultDevice, Vectorizable, true> { + public: + typedef typename Expression::Index Index; + EIGEN_DEVICE_FUNC + static inline void run(const Expression& expr, + const DefaultDevice& device = DefaultDevice()) { + typedef TensorEvaluator<Expression, DefaultDevice> Evaluator; + typedef typename traits<Expression>::Scalar Scalar; + typedef typename traits<Expression>::Index Index; + const std::size_t NumDims = traits<Expression>::NumDimensions; + + typedef TensorBlockMapper<Index, + typename internal::remove_const<Scalar>::type, + NumDims, Evaluator::Layout> TensorBlockMapper; + typedef TensorBlock<Index, typename internal::remove_const<Scalar>::type, + NumDims, Evaluator::Layout> TensorBlock; + + Evaluator evaluator(expr, device); + std::size_t total_size = array_prod(evaluator.dimensions()); + std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar); + if (total_size < cache_size) { + // TODO(andydavis) Reduce block management overhead for small tensors. + internal::TensorExecutor<Expression, DefaultDevice, Vectorizable, + false>::run(expr, device); + return; + } + + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + // Size tensor blocks to fit in cache (or requested target block size). + size_t block_total_size = numext::mini(cache_size, total_size); + TensorBlockShapeType block_shape = kUniformAllDims; + // Query expression tree for desired block size/shape. + std::vector<internal::TensorOpResourceRequirements> resources; + evaluator.getResourceRequirements(&resources); + if (!resources.empty()) { + // TODO(andydavis) Implement different policies (i.e. revert to a + // default policy if block shapes/sizes conflict). 
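+        // For now the requirements reported by the first expression node win:
+        // they override both the default uniform block shape and the
+        // cache-derived block size chosen above.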
+ block_shape = resources[0].block_shape; + block_total_size = resources[0].block_total_size; + } + + TensorBlockMapper block_mapper(evaluator.dimensions(), + block_shape, + block_total_size); + + Scalar* data = static_cast<Scalar*>(device.allocate( + block_total_size * sizeof(Scalar))); + + const Index total_block_count = block_mapper.total_block_count(); + for (Index i = 0; i < total_block_count; ++i) { + TensorBlock block = block_mapper.GetBlockForIndex(i, data); + evaluator.evalBlock(&block); + } + device.deallocate(data); + } + evaluator.cleanup(); + } +}; + +// Multicore strategy: the index space is partitioned and each partition is executed on a single core +#ifdef EIGEN_USE_THREADS +template <typename Evaluator, typename Index, bool Vectorizable> +struct EvalRange { + static void run(Evaluator evaluator, const Index first, const Index last) { + eigen_assert(last > first); + for (Index i = first; i < last; ++i) { + evaluator.evalScalar(i); + } + } +}; + +template <typename Evaluator, typename Index> +struct EvalRange<Evaluator, Index, true> { + static void run(Evaluator evaluator, const Index first, const Index last) { + eigen_assert(last > first); + + Index i = first; + static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size; + if (last - first >= PacketSize) { + eigen_assert(first % PacketSize == 0); + Index lastPacket = last - (last % PacketSize); + for (; i < lastPacket; i += PacketSize) { + evaluator.evalPacket(i); + } + } + + for (; i < last; ++i) { + evaluator.evalScalar(i); + } + } +}; + +template <typename Expression, bool Vectorizable, bool Tileable> +class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> { + public: + typedef typename Expression::Index Index; + static inline void run(const Expression& expr, const ThreadPoolDevice& device) + { + if (device.numThreads() <= 1) { + DefaultDevice dd; + TensorExecutor<Expression, DefaultDevice, Vectorizable, Tileable>::run(expr, dd); + return; + } + + typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator; + Evaluator evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const Index size = array_prod(evaluator.dimensions()); + + static const Index PacketSize = Vectorizable ? 
unpacket_traits<typename Evaluator::PacketReturnType>::size : 1; + Index blocksz = std::ceil<Index>(static_cast<float>(size)/device.numThreads()) + PacketSize - 1; + const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize))); + const Index numblocks = size / blocksize; + + Index i = 0; + FixedSizeVector<Notification*> results(numblocks); + for (int i = 0; i < numblocks; ++i) { + results.push_back(device.enqueue(&EvalRange<Evaluator, Index, Vectorizable>::run, evaluator, i*blocksize, (i+1)*blocksize)); + } + + if (numblocks * blocksize < size) { + EvalRange<Evaluator, Index, Vectorizable>::run(evaluator, numblocks * blocksize, size); + } + + for (int i = 0; i < numblocks; ++i) { + wait_until_ready(results[i]); + delete results[i]; + } + } + evaluator.cleanup(); + } +}; + +template <typename Index, typename Scalar> +struct BlockRange { + BlockRange(Index s, Index l, Scalar* d) + : index_start(s), index_limit(l), data(d) {} + const Index index_start; + const Index index_limit; + Scalar* data; +}; + +template <typename Evaluator, typename Index, typename Scalar, + std::size_t NumDims> +struct EvalBlockRange { + typedef TensorBlockMapper<Index, Scalar, NumDims, Evaluator::Layout> + BlockMapper; + + static void run(Evaluator evaluator, const BlockMapper& block_mapper, + BlockRange<Index, Scalar> block_range) { + typedef TensorBlock<Index, Scalar, NumDims, Evaluator::Layout> + TensorBlock; + eigen_assert(block_range.index_limit > block_range.index_start); + + for (Index i = block_range.index_start; i < block_range.index_limit; ++i) { + TensorBlock block = block_mapper.GetBlockForIndex(i, block_range.data); + evaluator.evalBlock(&block); + } + } +}; + +template <typename Expression, bool Vectorizable> +class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, true> { + public: + typedef typename Expression::Index Index; + static inline void run(const Expression& expr, + const ThreadPoolDevice& device) { + typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator; + typedef typename internal::remove_const< + typename traits<Expression>::Scalar>::type Scalar; + typedef typename traits<Expression>::Index Index; + static const std::size_t NumDims = traits<Expression>::NumDimensions; + typedef TensorBlockMapper<Index, Scalar, NumDims, Evaluator::Layout> + TensorBlockMapper; + typedef TensorBlock<Index, Scalar, NumDims, Evaluator::Layout> + TensorBlock; + typedef BlockRange<Index, Scalar> BlockRange; + + Evaluator evaluator(expr, device); + std::size_t total_size = array_prod(evaluator.dimensions()); + std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar); + if (total_size < cache_size || device.numThreads() <= 1) { + // TODO(andydavis) Reduce block management overhead for small tensors. + DefaultDevice dd; + internal::TensorExecutor<Expression, DefaultDevice, Vectorizable, false>::run(expr, dd); + return; + } + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + TensorBlockShapeType block_shape = kUniformAllDims; + size_t block_total_size = 0; + // Query expression tree for desired block size/shape. + std::vector<internal::TensorOpResourceRequirements> resources; + evaluator.getResourceRequirements(&resources); + if (!resources.empty()) { + // TODO(andydavis) Implement different shape/size policies. 
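+        // As in the single-threaded tiled executor, the first reported
+        // requirement wins; if it leaves block_total_size at 0, the code
+        // below falls back to a block size derived from the last-level cache.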
+ block_shape = resources[0].block_shape; + block_total_size = resources[0].block_total_size; + } + + // Divide the tensor coefficients across the number of threads, subject + // to min/max block size constraints. + const size_t min_block_size = + device.firstLevelCacheSize() / sizeof(Scalar); + const size_t max_block_size = block_total_size > 0 ? block_total_size : + device.lastLevelCacheSize() / sizeof(Scalar); + const size_t target_block_size = numext::maxi( + min_block_size, + numext::mini(static_cast<size_t>(array_prod(evaluator.dimensions())) / device.numThreads(), + max_block_size)); + + TensorBlockMapper block_mapper(evaluator.dimensions(), + block_shape, + target_block_size); + + const Index block_partition_size = + (block_mapper.total_block_count() + device.numThreads() - 1) / + device.numThreads(); + const Index block_partition_count = + (block_mapper.total_block_count() + block_partition_size - 1) / + block_partition_size; + + if (block_partition_count == 1) { + // Avoid thread hop if no parallelism is possible. + Scalar* data = static_cast<Scalar*>( + device.allocate(target_block_size * sizeof(Scalar))); + EvalBlockRange<Evaluator, Index, Scalar, NumDims>::run( + evaluator, block_mapper, + BlockRange(0, block_mapper.total_block_count(), data)); + device.deallocate(data); + } else { + // Multi-threaded case. + struct ThreadState { + Notification* done; + Scalar* data; + }; + FixedSizeVector<ThreadState> thread_state(block_partition_count, + ThreadState()); + + // Dispatch threads. + for (int i = 0; i < block_partition_count; ++i) { + thread_state[i].data = static_cast<Scalar*>( + device.allocate(target_block_size * sizeof(Scalar))); + thread_state[i].done = device.enqueue( + &EvalBlockRange<Evaluator, Index, Scalar, NumDims>::run, + evaluator, block_mapper, + BlockRange(i * block_partition_size, + numext::mini((i + 1) * block_partition_size, + block_mapper.total_block_count()), + thread_state[i].data)); + } + + // Join threads. + for (int i = 0; i < block_partition_count; ++i) { + wait_until_ready(thread_state[i].done); + delete thread_state[i].done; + device.deallocate(thread_state[i].data); + } + } + } + evaluator.cleanup(); + } +}; + +#endif + + +// GPU: the evaluation of the expression is offloaded to a GPU. +#if defined(EIGEN_USE_GPU) + +template <typename Expression, bool Tileable> +class TensorExecutor<Expression, GpuDevice, false, Tileable> { + public: + typedef typename Expression::Index Index; + static void run(const Expression& expr, const GpuDevice& device); +}; + +template <typename Expression, bool Tileable> +class TensorExecutor<Expression, GpuDevice, true, Tileable> { + public: + typedef typename Expression::Index Index; + static void run(const Expression& expr, const GpuDevice& device); +}; + +#if defined(__CUDACC__) +template <typename Evaluator, typename Index> +__global__ void +__launch_bounds__(1024) + EigenMetaKernel_NonVectorizable(Evaluator memcopied_eval, Index size) { + + const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; + const Index step_size = blockDim.x * gridDim.x; + + // Cuda memcopies the kernel arguments. That's fine for POD, but for more + // complex types such as evaluators we should really conform to the C++ + // standard and call a proper copy constructor. 
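+  // Invoking the copy constructor here rebuilds the evaluator from the
+  // bit-copied kernel argument. The loop below then walks the index space
+  // with a grid-stride pattern: each thread starts at its global index and
+  // advances by blockDim.x * gridDim.x.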
+ Evaluator eval(memcopied_eval); + + // Use the scalar path + for (Index i = first_index; i < size; i += step_size) { + eval.evalScalar(i); + } +} + +template <typename Evaluator, typename Index> +__global__ void +__launch_bounds__(1024) + EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) { + + const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; + const Index step_size = blockDim.x * gridDim.x; + + // Cuda memcopies the kernel arguments. That's fine for POD, but for more + // complex types such as evaluators we should really conform to the C++ + // standard and call a proper copy constructor. + Evaluator eval(memcopied_eval); + + // Use the vector path + const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size; + const Index vectorized_step_size = step_size * PacketSize; + const Index vectorized_size = (size / PacketSize) * PacketSize; + for (Index i = first_index * PacketSize; i < vectorized_size; + i += vectorized_step_size) { + eval.evalPacket(i); + } + for (Index i = vectorized_size + first_index; i < size; i += step_size) { + eval.evalScalar(i); + } +} + +/*static*/ +template <typename Expression, bool Tileable> +inline void TensorExecutor<Expression, GpuDevice, false, Tileable>::run( + const Expression& expr, const GpuDevice& device) { + TensorEvaluator<Expression, GpuDevice> evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + const int num_blocks = device.getNumCudaMultiProcessors() * + device.maxCudaThreadsPerMultiProcessor() / + device.maxCudaThreadsPerBlock(); + const int block_size = device.maxCudaThreadsPerBlock(); + const Index size = array_prod(evaluator.dimensions()); + LAUNCH_CUDA_KERNEL( + (EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, + Index>), + num_blocks, block_size, 0, device, evaluator, size); + } + evaluator.cleanup(); +} + +/*static*/ +template <typename Expression, bool Tileable> +inline void TensorExecutor<Expression, GpuDevice, true, Tileable>::run( + const Expression& expr, const GpuDevice& device) { + TensorEvaluator<Expression, GpuDevice> evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + const int num_blocks = device.getNumCudaMultiProcessors() * + device.maxCudaThreadsPerMultiProcessor() / + device.maxCudaThreadsPerBlock(); + const int block_size = device.maxCudaThreadsPerBlock(); + const Index size = array_prod(evaluator.dimensions()); + LAUNCH_CUDA_KERNEL( + (EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, + Index>), + num_blocks, block_size, 0, device, evaluator, size); + } + evaluator.cleanup(); +} + +#endif // __CUDACC__ +#endif // EIGEN_USE_GPU + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h new file mode 100644 index 0000000000..49d849e233 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -0,0 +1,291 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXPR_H +#define EIGEN_CXX11_TENSOR_TENSOR_EXPR_H + +namespace Eigen { + +/** \class TensorExpr + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor expression classes. + * + * The TensorCwiseNullaryOp class applies a nullary operators to an expression. + * This is typically used to generate constants. + * + * The TensorCwiseUnaryOp class represents an expression where a unary operator + * (e.g. cwiseSqrt) is applied to an expression. + * + * The TensorCwiseBinaryOp class represents an expression where a binary + * operator (e.g. addition) is applied to a lhs and a rhs expression. + * + */ +namespace internal { +template<typename NullaryOp, typename XprType> +struct traits<TensorCwiseNullaryOp<NullaryOp, XprType> > + : traits<XprType> +{ + typedef traits<XprType> XprTraits; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::Nested XprTypeNested; + typedef typename remove_reference<XprTypeNested>::type _XprTypeNested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + + enum { + Flags = 0, + }; +}; + +} // end namespace internal + + + +template<typename NullaryOp, typename XprType> +class TensorCwiseNullaryOp : public TensorBase<TensorCwiseNullaryOp<NullaryOp, XprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef TensorCwiseNullaryOp<NullaryOp, XprType> Nested; + typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseNullaryOp(const XprType& xpr, const NullaryOp& func = NullaryOp()) + : m_xpr(xpr), m_functor(func) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + nestedExpression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + const NullaryOp& functor() const { return m_functor; } + + protected: + typename XprType::Nested m_xpr; + const NullaryOp m_functor; +}; + + + +namespace internal { +template<typename UnaryOp, typename XprType> +struct traits<TensorCwiseUnaryOp<UnaryOp, XprType> > + : traits<XprType> +{ + // TODO(phli): Add InputScalar, InputPacket. Check references to + // current Scalar/Packet to see if the intent is Input or Output. + typedef typename result_of<UnaryOp(typename XprType::Scalar)>::type Scalar; + typedef traits<XprType> XprTraits; + typedef typename XprType::Nested XprTypeNested; + typedef typename remove_reference<XprTypeNested>::type _XprTypeNested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename UnaryOp, typename XprType> +struct eval<TensorCwiseUnaryOp<UnaryOp, XprType>, Eigen::Dense> +{ + typedef const TensorCwiseUnaryOp<UnaryOp, XprType>& type; +}; + +template<typename UnaryOp, typename XprType> +struct nested<TensorCwiseUnaryOp<UnaryOp, XprType>, 1, typename eval<TensorCwiseUnaryOp<UnaryOp, XprType> >::type> +{ + typedef TensorCwiseUnaryOp<UnaryOp, XprType> type; +}; + +} // end namespace internal + + + +template<typename UnaryOp, typename XprType> +class TensorCwiseUnaryOp : public TensorBase<TensorCwiseUnaryOp<UnaryOp, XprType>, ReadOnlyAccessors> +{ + public: + // TODO(phli): Add InputScalar, InputPacket. 
Check references to + // current Scalar/Packet to see if the intent is Input or Output. + typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef Scalar CoeffReturnType; + typedef typename Eigen::internal::nested<TensorCwiseUnaryOp>::type Nested; + typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp()) + : m_xpr(xpr), m_functor(func) {} + + EIGEN_DEVICE_FUNC + const UnaryOp& functor() const { return m_functor; } + + /** \returns the nested expression */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + nestedExpression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const UnaryOp m_functor; +}; + + +namespace internal { +template<typename BinaryOp, typename LhsXprType, typename RhsXprType> +struct traits<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > +{ + // Type promotion to handle the case where the types of the lhs and the rhs + // are different. + // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to + // current Scalar/Packet to see if the intent is Inputs or Output. + typedef typename result_of< + BinaryOp(typename LhsXprType::Scalar, + typename RhsXprType::Scalar)>::type Scalar; + typedef traits<LhsXprType> XprTraits; + typedef typename promote_storage_type< + typename traits<LhsXprType>::StorageKind, + typename traits<RhsXprType>::StorageKind>::ret StorageKind; + typedef typename promote_index_type< + typename traits<LhsXprType>::Index, + typename traits<RhsXprType>::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference<LhsNested>::type _LhsNested; + typedef typename remove_reference<RhsNested>::type _RhsNested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + + enum { + Flags = 0, + }; +}; + +template<typename BinaryOp, typename LhsXprType, typename RhsXprType> +struct eval<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, Eigen::Dense> +{ + typedef const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>& type; +}; + +template<typename BinaryOp, typename LhsXprType, typename RhsXprType> +struct nested<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, 1, typename eval<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> >::type> +{ + typedef TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> type; +}; + +} // end namespace internal + + + +template<typename BinaryOp, typename LhsXprType, typename RhsXprType> +class TensorCwiseBinaryOp : public TensorBase<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, ReadOnlyAccessors> +{ + public: + // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to + // current Scalar/Packet to see if the intent is Inputs or Output. 
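+  // The Scalar/CoeffReturnType typedefs below come from
+  // result_of<BinaryOp(lhs Scalar, rhs Scalar)> (see the traits above), so
+  // expressions mixing scalar types yield the functor's promoted result type.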
+ typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef Scalar CoeffReturnType; + typedef typename Eigen::internal::nested<TensorCwiseBinaryOp>::type Nested; + typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const BinaryOp& func = BinaryOp()) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_functor(func) {} + + EIGEN_DEVICE_FUNC + const BinaryOp& functor() const { return m_functor; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename LhsXprType::Nested>::type& + lhsExpression() const { return m_lhs_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename RhsXprType::Nested>::type& + rhsExpression() const { return m_rhs_xpr; } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const BinaryOp m_functor; +}; + + +namespace internal { +template<typename IfXprType, typename ThenXprType, typename ElseXprType> +struct traits<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> > + : traits<ThenXprType> +{ + typedef typename traits<ThenXprType>::Scalar Scalar; + typedef traits<ThenXprType> XprTraits; + typedef typename promote_storage_type<typename traits<ThenXprType>::StorageKind, + typename traits<ElseXprType>::StorageKind>::ret StorageKind; + typedef typename promote_index_type<typename traits<ElseXprType>::Index, + typename traits<ThenXprType>::Index>::type Index; + typedef typename IfXprType::Nested IfNested; + typedef typename ThenXprType::Nested ThenNested; + typedef typename ElseXprType::Nested ElseNested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename IfXprType, typename ThenXprType, typename ElseXprType> +struct eval<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, Eigen::Dense> +{ + typedef const TensorSelectOp<IfXprType, ThenXprType, ElseXprType>& type; +}; + +template<typename IfXprType, typename ThenXprType, typename ElseXprType> +struct nested<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, 1, typename eval<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >::type> +{ + typedef TensorSelectOp<IfXprType, ThenXprType, ElseXprType> type; +}; + +} // end namespace internal + + +template<typename IfXprType, typename ThenXprType, typename ElseXprType> +class TensorSelectOp : public TensorBase<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorSelectOp>::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename internal::promote_storage_type<typename ThenXprType::CoeffReturnType, + typename ElseXprType::CoeffReturnType>::ret CoeffReturnType; + typedef typename Eigen::internal::nested<TensorSelectOp>::type Nested; + typedef typename Eigen::internal::traits<TensorSelectOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorSelectOp>::Index Index; + + EIGEN_DEVICE_FUNC + TensorSelectOp(const IfXprType& a_condition, + const ThenXprType& a_then, + const ElseXprType& a_else) + : m_condition(a_condition), m_then(a_then), m_else(a_else) + { } + + EIGEN_DEVICE_FUNC + const IfXprType& ifExpression() const { return m_condition; } + + EIGEN_DEVICE_FUNC + 
const ThenXprType& thenExpression() const { return m_then; } + + EIGEN_DEVICE_FUNC + const ElseXprType& elseExpression() const { return m_else; } + + protected: + typename IfXprType::Nested m_condition; + typename ThenXprType::Nested m_then; + typename ElseXprType::Nested m_else; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EXPR_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h new file mode 100644 index 0000000000..ac73366762 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -0,0 +1,846 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Jianwei Cui <thucjw@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FFT_H +#define EIGEN_CXX11_TENSOR_TENSOR_FFT_H +namespace Eigen { + +/** \class TensorFFT + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor FFT class. + * + * TODO: + * Vectorize the Cooley Tukey and the Bluestein algorithm + * Add support for multithreaded evaluation + * Improve the performance on GPU + */ + +template <bool NeedUprade> struct MakeComplex { + template <typename T> + #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && !defined(__GCUDACC__) + EIGEN_DEVICE_FUNC + #endif + T operator() (const T& val) const { return val; } +}; + +template <> struct MakeComplex<true> { + template <typename T> + #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && !defined(__GCUDACC__) + EIGEN_DEVICE_FUNC + #endif + std::complex<T> operator() (const T& val) const { return std::complex<T>(val, 0); } +}; + +template <> struct MakeComplex<false> { + template <typename T> + #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && !defined(__GCUDACC__) + EIGEN_DEVICE_FUNC + #endif + std::complex<T> operator() (const std::complex<T>& val) const { return val; } +}; + +template <int ResultType> struct PartOf { + template <typename T> T operator() (const T& val) const { return val; } +}; + +template <> struct PartOf<RealPart> { + template <typename T> T operator() (const std::complex<T>& val) const { return val.real(); } +}; + +template <> struct PartOf<ImagPart> { + template <typename T> T operator() (const std::complex<T>& val) const { return val.imag(); } +}; + +namespace internal { +template <typename FFT, typename XprType, int FFTResultType, int FFTDir> +struct traits<TensorFFTOp<FFT, XprType, FFTResultType, FFTDir> > : public traits<XprType> { + typedef traits<XprType> XprTraits; + typedef typename NumTraits<typename XprTraits::Scalar>::Real RealScalar; + typedef typename std::complex<RealScalar> ComplexScalar; + typedef typename XprTraits::Scalar InputScalar; + typedef typename conditional<FFTResultType == RealPart || FFTResultType == ImagPart, RealScalar, ComplexScalar>::type OutputScalar; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template <typename FFT, typename XprType, int FFTResultType, int FFTDirection> +struct eval<TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection>, Eigen::Dense> { 
+ typedef const TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection>& type; +}; + +template <typename FFT, typename XprType, int FFTResultType, int FFTDirection> +struct nested<TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection>, 1, typename eval<TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection> >::type> { + typedef TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection> type; +}; + +} // end namespace internal + +template <typename FFT, typename XprType, int FFTResultType, int FFTDir> +class TensorFFTOp : public TensorBase<TensorFFTOp<FFT, XprType, FFTResultType, FFTDir>, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits<TensorFFTOp>::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename std::complex<RealScalar> ComplexScalar; + typedef typename internal::conditional<FFTResultType == RealPart || FFTResultType == ImagPart, RealScalar, ComplexScalar>::type OutputScalar; + typedef OutputScalar CoeffReturnType; + typedef typename Eigen::internal::nested<TensorFFTOp>::type Nested; + typedef typename Eigen::internal::traits<TensorFFTOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorFFTOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFFTOp(const XprType& expr, const FFT& fft) + : m_xpr(expr), m_fft(fft) {} + + EIGEN_DEVICE_FUNC + const FFT& fft() const { return m_fft; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& expression() const { + return m_xpr; + } + + protected: + typename XprType::Nested m_xpr; + const FFT m_fft; +}; + +// Eval as rvalue +template <typename FFT, typename ArgType, typename Device, int FFTResultType, int FFTDir> +struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, Device> { + typedef TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename std::complex<RealScalar> ComplexScalar; + typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions; + typedef internal::traits<XprType> XprTraits; + typedef typename XprTraits::Scalar InputScalar; + typedef typename internal::conditional<FFTResultType == RealPart || FFTResultType == ImagPart, RealScalar, ComplexScalar>::type OutputScalar; + typedef OutputScalar CoeffReturnType; + typedef typename PacketType<OutputScalar, Device>::type PacketReturnType; + + enum { + IsAligned = false, + PacketAccess = true, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_data(NULL), m_impl(op.expression(), device), m_fft(op.fft()), m_device(device) { + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + eigen_assert(input_dims[i] > 0); + m_dimensions[i] = input_dims[i]; + } + + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; + } + } else { + m_strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_strides[i] = m_strides[i + 1] * m_dimensions[i 
+ 1]; + } + } + m_size = m_dimensions.TotalSize(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { + return m_dimensions; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(OutputScalar* data) { + m_impl.evalSubExprsIfNeeded(NULL); + if (data) { + evalToBuf(data); + return false; + } else { + m_data = (CoeffReturnType*)m_device.allocate(sizeof(CoeffReturnType) * m_size); + evalToBuf(m_data); + return true; + } + } + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + if (m_data) { + m_device.deallocate(m_data); + m_data = NULL; + } + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const { + return m_data[index]; + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const { + return internal::ploadt<PacketReturnType, LoadMode>(m_data + index); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; } + + + private: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(OutputScalar* data) { + const bool write_to_out = internal::is_same<OutputScalar, ComplexScalar>::value; + ComplexScalar* buf = write_to_out ? (ComplexScalar*)data : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * m_size); + + for (int i = 0; i < m_size; ++i) { + buf[i] = MakeComplex<internal::is_same<InputScalar, RealScalar>::value>()(m_impl.coeff(i)); + } + + for (int i = 0; i < m_fft.size(); ++i) { + int dim = m_fft[i]; + eigen_assert(dim >= 0 && dim < NumDims); + Index line_len = m_dimensions[dim]; + eigen_assert(line_len >= 1); + ComplexScalar* line_buf = (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * line_len); + const bool is_power_of_two = isPowerOfTwo(line_len); + const int good_composite = is_power_of_two ? 0 : findGoodComposite(line_len); + const int log_len = is_power_of_two ? getLog2(line_len) : getLog2(good_composite); + + ComplexScalar* a = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite); + ComplexScalar* b = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite); + ComplexScalar* pos_j_base_powered = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * (line_len + 1)); + if (!is_power_of_two) { + ComplexScalar pos_j_base = ComplexScalar(std::cos(M_PI/line_len), std::sin(M_PI/line_len)); + for (int i = 0; i < line_len + 1; ++i) { + pos_j_base_powered[i] = std::pow(pos_j_base, i * i); + } + } + + for (Index partial_index = 0; partial_index < m_size / line_len; ++partial_index) { + Index base_offset = getBaseOffsetFromIndex(partial_index, dim); + + // get data into line_buf + for (int j = 0; j < line_len; ++j) { + Index offset = getIndexFromOffset(base_offset, dim, j); + line_buf[j] = buf[offset]; + } + + // processs the line + if (is_power_of_two) { + processDataLineCooleyTukey(line_buf, line_len, log_len); + } + else { + processDataLineBluestein(line_buf, line_len, good_composite, log_len, a, b, pos_j_base_powered); + } + + // write back + for (int j = 0; j < line_len; ++j) { + const ComplexScalar div_factor = (FFTDir == FFT_FORWARD) ? 
ComplexScalar(1, 0) : ComplexScalar(line_len, 0); + Index offset = getIndexFromOffset(base_offset, dim, j); + buf[offset] = line_buf[j] / div_factor; + } + } + m_device.deallocate(line_buf); + if (!pos_j_base_powered) { + m_device.deallocate(a); + m_device.deallocate(b); + m_device.deallocate(pos_j_base_powered); + } + } + + if(!write_to_out) { + for (int i = 0; i < m_size; ++i) { + data[i] = PartOf<FFTResultType>()(buf[i]); + } + m_device.deallocate(buf); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool isPowerOfTwo(int x) { + eigen_assert(x > 0); + return !(x & (x - 1)); + } + + //the composite number for padding, used in Bluestein's FFT algorithm + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int findGoodComposite(int n) { + int i = 2; + while (i < 2 * n - 1) i *= 2; + return i; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int getLog2(int m) { + int log2m = 0; + while (m >>= 1) log2m++; + return log2m; + } + + // Call Cooley Tukey algorithm directly, data length must be power of 2 + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineCooleyTukey(ComplexScalar* line_buf, int line_len, int log_len) { + eigen_assert(isPowerOfTwo(line_len)); + scramble_FFT(line_buf, line_len); + compute_1D_Butterfly<FFTDir>(line_buf, line_len, log_len); + } + + // Call Bluestein's FFT algorithm, m is a good composite number greater than (2 * n - 1), used as the padding length + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineBluestein(ComplexScalar* line_buf, int line_len, int good_composite, int log_len, ComplexScalar* a, ComplexScalar* b, const ComplexScalar* pos_j_base_powered) { + int n = line_len; + int m = good_composite; + ComplexScalar* data = line_buf; + + for (int i = 0; i < n; ++i) { + if(FFTDir == FFT_FORWARD) { + a[i] = data[i] * std::conj(pos_j_base_powered[i]); + } + else { + a[i] = data[i] * pos_j_base_powered[i]; + } + } + for (int i = n; i < m; ++i) { + a[i] = ComplexScalar(0, 0); + } + + for (int i = 0; i < n; ++i) { + if(FFTDir == FFT_FORWARD) { + b[i] = pos_j_base_powered[i]; + } + else { + b[i] = std::conj(pos_j_base_powered[i]); + } + } + for (int i = n; i < m - n; ++i) { + b[i] = ComplexScalar(0, 0); + } + for (int i = m - n; i < m; ++i) { + if(FFTDir == FFT_FORWARD) { + b[i] = pos_j_base_powered[m-i]; + } + else { + b[i] = std::conj(pos_j_base_powered[m-i]); + } + } + + scramble_FFT(a, m); + compute_1D_Butterfly<FFT_FORWARD>(a, m, log_len); + + scramble_FFT(b, m); + compute_1D_Butterfly<FFT_FORWARD>(b, m, log_len); + + for (int i = 0; i < m; ++i) { + a[i] *= b[i]; + } + + scramble_FFT(a, m); + compute_1D_Butterfly<FFT_REVERSE>(a, m, log_len); + + //Do the scaling after ifft + for (int i = 0; i < m; ++i) { + a[i] /= m; + } + + for (int i = 0; i < n; ++i) { + if(FFTDir == FFT_FORWARD) { + data[i] = a[i] * std::conj(pos_j_base_powered[i]); + } + else { + data[i] = a[i] * pos_j_base_powered[i]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void scramble_FFT(ComplexScalar* data, int n) { + eigen_assert(isPowerOfTwo(n)); + int j = 1; + for (int i = 1; i < n; ++i){ + if (j > i) { + std::swap(data[j-1], data[i-1]); + } + int m = n >> 1; + while (m >= 2 && j > m) { + j -= m; + m >>= 1; + } + j += m; + } + } + + template<int Dir> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_1D_Butterfly(ComplexScalar* data, int n, int n_power_of_2) { + eigen_assert(isPowerOfTwo(n)); + if (n == 1) { + return; + } + else if (n == 2) { + ComplexScalar tmp = data[1]; + data[1] = data[0] - data[1]; + data[0] += tmp; + return; + } + else if (n == 4) { + 
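+      // Hand-unrolled 4-point butterfly: two 2-point DFTs over
+      // (data[0], data[1]) and (data[2], data[3]) are combined below, with
+      // ComplexScalar(0, -1) as the forward twiddle factor exp(-i*pi/2) = -i
+      // (the inverse direction uses its conjugate, (0, 1)).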
ComplexScalar tmp[4]; + tmp[0] = data[0] + data[1]; + tmp[1] = data[0] - data[1]; + tmp[2] = data[2] + data[3]; + if(Dir == FFT_FORWARD) { + tmp[3] = ComplexScalar(0.0, -1.0) * (data[2] - data[3]); + } + else { + tmp[3] = ComplexScalar(0.0, 1.0) * (data[2] - data[3]); + } + data[0] = tmp[0] + tmp[2]; + data[1] = tmp[1] + tmp[3]; + data[2] = tmp[0] - tmp[2]; + data[3] = tmp[1] - tmp[3]; + return; + } + else if (n == 8) { + ComplexScalar tmp_1[8]; + ComplexScalar tmp_2[8]; + + tmp_1[0] = data[0] + data[1]; + tmp_1[1] = data[0] - data[1]; + tmp_1[2] = data[2] + data[3]; + if (Dir == FFT_FORWARD) { + tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, -1); + } + else { + tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, 1); + } + tmp_1[4] = data[4] + data[5]; + tmp_1[5] = data[4] - data[5]; + tmp_1[6] = data[6] + data[7]; + if (Dir == FFT_FORWARD) { + tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, -1); + } + else { + tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, 1); + } + tmp_2[0] = tmp_1[0] + tmp_1[2]; + tmp_2[1] = tmp_1[1] + tmp_1[3]; + tmp_2[2] = tmp_1[0] - tmp_1[2]; + tmp_2[3] = tmp_1[1] - tmp_1[3]; + tmp_2[4] = tmp_1[4] + tmp_1[6]; + // SQRT2DIV2 = sqrt(2)/2 + #define SQRT2DIV2 0.7071067811865476 + if (Dir == FFT_FORWARD) { + tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, -SQRT2DIV2); + tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, -1); + tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, -SQRT2DIV2); + } + else { + tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, SQRT2DIV2); + tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, 1); + tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, SQRT2DIV2); + } + data[0] = tmp_2[0] + tmp_2[4]; + data[1] = tmp_2[1] + tmp_2[5]; + data[2] = tmp_2[2] + tmp_2[6]; + data[3] = tmp_2[3] + tmp_2[7]; + data[4] = tmp_2[0] - tmp_2[4]; + data[5] = tmp_2[1] - tmp_2[5]; + data[6] = tmp_2[2] - tmp_2[6]; + data[7] = tmp_2[3] - tmp_2[7]; + + return; + } + else { + compute_1D_Butterfly<Dir>(data, n/2, n_power_of_2 - 1); + compute_1D_Butterfly<Dir>(data + n/2, n/2, n_power_of_2 - 1); + //Original code: + //RealScalar wtemp = std::sin(M_PI/n); + //RealScalar wpi = -std::sin(2 * M_PI/n); + RealScalar wtemp = m_sin_PI_div_n_LUT[n_power_of_2]; + RealScalar wpi; + if (Dir == FFT_FORWARD) { + wpi = m_minus_sin_2_PI_div_n_LUT[n_power_of_2]; + } + else { + wpi = 0 - m_minus_sin_2_PI_div_n_LUT[n_power_of_2]; + } + + const ComplexScalar wp(wtemp, wpi); + ComplexScalar w(1.0, 0.0); + for(int i = 0; i < n/2; i++) { + ComplexScalar temp(data[i + n/2] * w); + data[i + n/2] = data[i] - temp; + data[i] += temp; + w += w * wp; + } + return; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getBaseOffsetFromIndex(Index index, Index omitted_dim) const { + Index result = 0; + + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > omitted_dim; --i) { + const Index partial_m_stride = m_strides[i] / m_dimensions[omitted_dim]; + const Index idx = index / partial_m_stride; + index -= idx * partial_m_stride; + result += idx * m_strides[i]; + } + result += index; + } + else { + for (int i = 0; i < omitted_dim; ++i) { + const Index partial_m_stride = m_strides[i] / m_dimensions[omitted_dim]; + const Index idx = index / partial_m_stride; + index -= idx * partial_m_stride; + result += idx * m_strides[i]; + } + result += index; + } + // Value of index_coords[omitted_dim] is not determined to this step + return result; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getIndexFromOffset(Index base, 
Index omitted_dim, Index offset) const { + Index result = base + offset * m_strides[omitted_dim] ; + return result; + } + + protected: + int m_size; + const FFT& m_fft; + Dimensions m_dimensions; + array<Index, NumDims> m_strides; + TensorEvaluator<ArgType, Device> m_impl; + CoeffReturnType* m_data; + const Device& m_device; + + // This will support a maximum FFT size of 2^32 for each dimension + // m_sin_PI_div_n_LUT[i] = (-2) * std::sin(M_PI / std::pow(2,i)) ^ 2; + RealScalar m_sin_PI_div_n_LUT[32] = { + 0.0, + -2, + -0.999999999999999, + -0.292893218813453, + -0.0761204674887130, + -0.0192147195967696, + -0.00481527332780311, + -0.00120454379482761, + -3.01181303795779e-04, + -7.52981608554592e-05, + -1.88247173988574e-05, + -4.70619042382852e-06, + -1.17654829809007e-06, + -2.94137117780840e-07, + -7.35342821488550e-08, + -1.83835707061916e-08, + -4.59589268710903e-09, + -1.14897317243732e-09, + -2.87243293150586e-10, + -7.18108232902250e-11, + -1.79527058227174e-11, + -4.48817645568941e-12, + -1.12204411392298e-12, + -2.80511028480785e-13, + -7.01277571201985e-14, + -1.75319392800498e-14, + -4.38298482001247e-15, + -1.09574620500312e-15, + -2.73936551250781e-16, + -6.84841378126949e-17, + -1.71210344531737e-17, + -4.28025861329343e-18 + }; + + // m_minus_sin_2_PI_div_n_LUT[i] = -std::sin(2 * M_PI / std::pow(2,i)); + RealScalar m_minus_sin_2_PI_div_n_LUT[32] = { + 0.0, + 0.0, + -1.00000000000000e+00, + -7.07106781186547e-01, + -3.82683432365090e-01, + -1.95090322016128e-01, + -9.80171403295606e-02, + -4.90676743274180e-02, + -2.45412285229123e-02, + -1.22715382857199e-02, + -6.13588464915448e-03, + -3.06795676296598e-03, + -1.53398018628477e-03, + -7.66990318742704e-04, + -3.83495187571396e-04, + -1.91747597310703e-04, + -9.58737990959773e-05, + -4.79368996030669e-05, + -2.39684498084182e-05, + -1.19842249050697e-05, + -5.99211245264243e-06, + -2.99605622633466e-06, + -1.49802811316901e-06, + -7.49014056584716e-07, + -3.74507028292384e-07, + -1.87253514146195e-07, + -9.36267570730981e-08, + -4.68133785365491e-08, + -2.34066892682746e-08, + -1.17033446341373e-08, + -5.85167231706864e-09, + -2.92583615853432e-09 + }; +}; + +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && !defined(__GCUDACC__) + +template<typename OutputScalar, typename RealScalar, typename ComplexScalar, int ResultType> +struct writeToDeviceData { + void operator()(OutputScalar* d_data, ComplexScalar* data_buf, size_t size) { + } +}; + +template<typename OutputScalar, typename RealScalar, typename ComplexScalar> +struct writeToDeviceData<OutputScalar, RealScalar, ComplexScalar, Eigen::BothParts> { + void operator()(OutputScalar* d_data, ComplexScalar* data_buf, size_t size) { + cudaMemcpy(d_data, data_buf, size * sizeof(ComplexScalar), cudaMemcpyDeviceToDevice); + } +}; + +template<typename OutputScalar, typename RealScalar, typename ComplexScalar> +struct writeToDeviceData<OutputScalar, RealScalar, ComplexScalar, Eigen::RealPart> { + void operator()(OutputScalar* d_data, ComplexScalar* data_buf, size_t size) { + cudaMemcpy2D(d_data, sizeof(RealScalar), (RealScalar*) data_buf, 2 * sizeof(RealScalar), sizeof(RealScalar), size, cudaMemcpyDeviceToDevice); + } +}; + +template<typename OutputScalar, typename RealScalar, typename ComplexScalar> +struct writeToDeviceData<OutputScalar, RealScalar, ComplexScalar, Eigen::ImagPart> { + void operator()(OutputScalar* d_data, ComplexScalar* data_buf, size_t size) { + RealScalar* data_buf_offset = &(((RealScalar*) data_buf)[1]); + cudaMemcpy2D(d_data, sizeof(RealScalar), 
data_buf_offset, 2 * sizeof(RealScalar), sizeof(RealScalar), size, cudaMemcpyDeviceToDevice); + } +}; + +template <typename InputScalar, typename RealScalar, typename ComplexScalar, typename InputEvaluator> +__global__ void copyValues(ComplexScalar* d_data, InputEvaluator eval, int total_size) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < total_size) { + d_data[i] = MakeComplex<internal::is_same<InputScalar, RealScalar>::value>()(eval.coeff(i)); + } +} + +template<typename Scalar, typename Index, int NumDims> +__global__ void fillLineBuf(Scalar* line_buf, Scalar* data_buf, int line_len, + array<Index, NumDims> coords, array<Index, NumDims> m_strides, int dim) { + int j = blockIdx.x * blockDim.x + threadIdx.x; + if(j < line_len) { + coords[dim] = j; + Index index = 0; + for (int i = 0; i < NumDims; ++i) { + index += coords[i] * m_strides[i]; + } + line_buf[j] = data_buf[index]; + } +} + +template<typename ComplexScalar, typename RealScalar, typename Index, int NumDims> +__global__ void writebackLineBuf(ComplexScalar* line_buf, ComplexScalar* data_buf, int line_len, + array<Index, NumDims> coords, array<Index, NumDims> m_strides, int dim, RealScalar div_factor) { + int j = blockIdx.x * blockDim.x + threadIdx.x; + if(j < line_len) { + coords[dim] = j; + Index index = 0; + for (int i = 0; i < NumDims; ++i) { + index += coords[i] * m_strides[i]; + } + + data_buf[index] = line_buf[j]; + ((RealScalar*) data_buf)[2*index] /= div_factor; + ((RealScalar*) data_buf)[2*index + 1] /= div_factor; + } +} + +template <typename FFT, typename ArgType, int FFTResultType, int FFTDir> +struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, GpuDevice> { + typedef TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, GpuDevice>::Dimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::Scalar InputScalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename std::complex<RealScalar> ComplexScalar; + typedef typename internal::conditional<FFTResultType == Eigen::BothParts, std::complex<RealScalar>, RealScalar>::type OutputScalar; + typedef typename TensorEvaluator<ArgType, GpuDevice>::Dimensions InputDimensions; + typedef OutputScalar CoeffReturnType; + typedef typename PacketType<OutputScalar, GpuDevice>::type PacketReturnType; + + enum { + IsAligned = false, + PacketAccess = false, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, GpuDevice>::Layout, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const GpuDevice& device) : m_data_buf(NULL), m_impl(op.expression(), device), m_fft(op.fft()) { + const typename TensorEvaluator<ArgType, GpuDevice>::Dimensions& input_dims = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + eigen_assert(input_dims[i] > 0); + m_dimensions[i] = input_dims[i]; + } + + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; + } + } else { + m_strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1]; + } + } + m_size = m_dimensions.TotalSize(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { + return m_dimensions; + } + + EIGEN_STRONG_INLINE bool 
evalSubExprsIfNeeded(OutputScalar* d_data) { + m_impl.evalSubExprsIfNeeded(NULL); + if (d_data) { + evalToDeviceData(d_data); + return false; + } else { + evalToSelfDataBuf(); + return true; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getIndexFromCoords(const array<Index, NumDims> & coords) const { + Index result = 0; + for (int i = 0; i < NumDims; ++i) { + result += coords[i] * m_strides[i]; + } + return result; + } + + EIGEN_STRONG_INLINE array<Index, NumDims> getPartialCoordsFromIndex(Index index, Index omitted_dim) const { + array<Index, NumDims> partial_m_strides = m_strides; + array<Index, NumDims> index_coords; + + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (Index i = omitted_dim + 1; i < NumDims; ++i) { + partial_m_strides[i] /= m_dimensions[omitted_dim]; + } + for (int i = NumDims - 1; i > 0; --i) { + if(omitted_dim == i) { + } + else { + const Index idx = index / partial_m_strides[i]; + index -= idx * partial_m_strides[i]; + index_coords[i] = idx; + } + } + index_coords[0] = index; + } + else { + for (Index i = omitted_dim - 1; i >= 0; --i) { + partial_m_strides[i] /= m_dimensions[omitted_dim]; + } + for (int i = 0; i < NumDims - 1; ++i) { + if(omitted_dim == i) { + } + else { + const Index idx = index / partial_m_strides[i]; + index -= idx * partial_m_strides[i]; + index_coords[i] = idx; + } + } + index_coords[NumDims - 1] = index; + } + // Value of index_coords[omitted_dim] is not determined to this step + return index_coords; + } + + void evalToSelfDataBuf() { + cudaMalloc((void**) &m_data_buf, sizeof(OutputScalar) * m_size); + evalToDeviceData(m_data_buf); + } + + EIGEN_STRONG_INLINE void evalToDeviceData(OutputScalar* d_data) { + ComplexScalar* data_buf; + cudaMalloc((void**) &data_buf, sizeof(ComplexScalar) * m_size); + + int block_size = 128; + int grid_size = m_size / block_size + 1; + + copyValues<InputScalar, RealScalar, ComplexScalar, TensorEvaluator<ArgType, GpuDevice> > <<<grid_size, block_size>>>(data_buf, m_impl, m_size); + + for (int i = 0; i < m_fft.size(); ++i) { + int dim = m_fft[i]; + eigen_assert(dim >= 0 && dim < NumDims); + int line_len = m_dimensions[dim]; + ComplexScalar* line_buf; + cudaMalloc((void**) &line_buf, sizeof(ComplexScalar) * line_len); + + cufftHandle plan; + cufftPlan1d(&plan, line_len, CUFFT_C2C, 1); + + for (Index partial_index = 0; partial_index < m_size/line_len; ++partial_index) { + array<Index, NumDims> coords = getPartialCoordsFromIndex(partial_index, dim); + // get data into line_buf + int block_size = 128; + int grid_size = line_len / block_size + 1; + fillLineBuf<ComplexScalar, Index, NumDims> <<<grid_size, block_size>>>(line_buf, data_buf, line_len, coords, m_strides, dim); + + if(FFTDir == Eigen::FFT_FORWARD) { + cufftExecC2C(plan, reinterpret_cast<cufftComplex *>(line_buf), reinterpret_cast<cufftComplex*>(line_buf), CUFFT_FORWARD); + } + else { + cufftExecC2C(plan, reinterpret_cast<cufftComplex*>(line_buf), reinterpret_cast<cufftComplex*>(line_buf), CUFFT_INVERSE); + } + // write back + RealScalar div_factor = (FFTDir == FFT_FORWARD) ? 
1.0 : line_len; + writebackLineBuf<ComplexScalar, RealScalar, Index, NumDims> <<<grid_size, block_size>>>(line_buf, data_buf, line_len, coords, m_strides, dim, div_factor); + cudaDeviceSynchronize(); + + } + cufftDestroy(plan); + cudaFree(line_buf); + } + writeToDeviceData<OutputScalar, RealScalar, ComplexScalar, FFTResultType>()(d_data, data_buf, m_size); + cudaFree(data_buf); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + if(m_data_buf != NULL) cudaFree(m_data_buf); + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const { + return m_data_buf[index]; + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const { + return internal::ploadt<PacketReturnType, LoadMode>(m_data_buf + index); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return m_data_buf; } + + protected: + int m_size; + const FFT& m_fft; + Dimensions m_dimensions; + array<Index, NumDims> m_strides; + TensorEvaluator<ArgType, GpuDevice> m_impl; + OutputScalar* m_data_buf; + +}; +#endif + +} // end namespace Eigen +#endif //EIGEN_CXX11_TENSOR_TENSOR_FFT_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h new file mode 100644 index 0000000000..a7af67230f --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -0,0 +1,277 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H +#define EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H + +namespace Eigen { + +/** \class TensorFixedSize + * \ingroup CXX11_Tensor_Module + * + * \brief The fixed sized version of the tensor class. + * + * The fixed sized equivalent of + * Eigen::Tensor<float, 3> t(3, 5, 7); + * is + * Eigen::TensorFixedSize<float, Sizes<3,5,7>> t; + */ + +template<typename Scalar_, typename Dimensions_, int Options_, typename IndexType> +class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, Options_, IndexType> > +{ + public: + typedef TensorFixedSize<Scalar_, Dimensions_, Options_, IndexType> Self; + typedef TensorBase<TensorFixedSize<Scalar_, Dimensions_, Options_, IndexType> > Base; + typedef typename Eigen::internal::nested<Self>::type Nested; + typedef typename internal::traits<Self>::StorageKind StorageKind; + typedef typename internal::traits<Self>::Index Index; + typedef Scalar_ Scalar; + typedef typename internal::packet_traits<Scalar>::type Packet; + typedef typename NumTraits<Scalar>::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + + static const int Options = Options_; + + enum { + IsAligned = bool(EIGEN_ALIGN), + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + BlockAccess = false, + Layout = Options_ & RowMajor ? 
RowMajor : ColMajor, + CoordAccess = true, + }; + + typedef Dimensions_ Dimensions; + static const std::size_t NumIndices = Dimensions::count; + + protected: + TensorStorage<Scalar, Dimensions, Options> m_storage; + + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } + + // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + // work, because that uses base().coeffRef() - and we don't yet + // implement a similar class hierarchy + inline Self& base() { return *this; } + inline const Self& base() const { return *this; } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + EIGEN_DEVICE_FUNC inline const Scalar& coeff(Index firstIndex, IndexTypes... otherIndices) const + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeff(array<Index, NumIndices>{{firstIndex, otherIndices...}}); + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& coeff(const array<Index, NumIndices>& indices) const + { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& coeff() const + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return m_storage.data()[0]; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + inline Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices) + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(array<Index, NumIndices>{{firstIndex, otherIndices...}}); + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices) + { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef() + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return m_storage.data()[0]; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) + { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + inline const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
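+      // For example, a TensorFixedSize<float, Sizes<3,5,7>> must be addressed with
+      // exactly three indices: t(1, 2, 3) is valid, while t(1, 2) trips this assertion.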
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return this->operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}}); + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const + { + eigen_assert(checkIndexRange(indices)); + return coeff(indices); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()() const + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeff(); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return coeff(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const + { + // The bracket operator is only for vectors, use the parenthesis operator instead. + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeff(index); + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + inline Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}}); + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices) + { + eigen_assert(checkIndexRange(indices)); + return coeffRef(indices); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()() + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeffRef(); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index index) + { + eigen_assert(index >= 0 && index < size()); + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator[](Index index) + { + // The bracket operator is only for vectors, use the parenthesis operator instead + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize() { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorFixedSize(const Self& other) + : m_storage(other.m_storage) + { + } + +#ifdef EIGEN_HAVE_RVALUE_REFERENCES + inline TensorFixedSize(Self&& other) + : m_storage(other.m_storage) + { + } +#endif + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorFixedSize(const TensorBase<OtherDerived, ReadOnlyAccessors>& other) + { + typedef TensorAssignOp<TensorFixedSize, const OtherDerived> Assign; + Assign assign(*this, other.derived()); + internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); + } + + template<typename Other> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorFixedSize& operator=(const Other& other) + { + // FIXME: check that the dimensions of other match the dimensions of *this. + // Unfortunately this isn't possible yet when the rhs is an expression. 
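+      // The assignment is expressed as a TensorAssignOp and handed to the
+      // TensorExecutor, which evaluates the right-hand side directly into this
+      // tensor's fixed-size storage on the default (single-threaded) device.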
+ typedef TensorAssignOp<Self, const Other> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); + return *this; + } + + protected: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE bool checkIndexRange(const array<Index, NumIndices>& /*indices*/) const + { + using internal::array_apply_and_reduce; + using internal::array_zip_and_reduce; + using internal::greater_equal_zero_op; + using internal::logical_and_op; + using internal::lesser_op; + + return true; + // check whether the indices are all >= 0 + /* array_apply_and_reduce<logical_and_op, greater_equal_zero_op>(indices) && + // check whether the indices fit in the dimensions + array_zip_and_reduce<logical_and_op, lesser_op>(indices, m_storage.dimensions());*/ + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index linearizedIndex(const array<Index, NumIndices>& indices) const + { + if (Options&RowMajor) { + return m_storage.dimensions().IndexOfRowMajor(indices); + } else { + return m_storage.dimensions().IndexOfColMajor(indices); + } + } +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h new file mode 100644 index 0000000000..1d1ce47174 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -0,0 +1,150 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H +#define EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H + +namespace Eigen { + +/** \class TensorForcedEval + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. + * + * + */ +namespace internal { +template<typename XprType> +struct traits<TensorForcedEvalOp<XprType> > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. 
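+  // In practice these traits simply forward the scalar type, storage kind, index
+  // type, rank and layout of the wrapped expression: forcing evaluation changes
+  // where the result lives, not what it looks like.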
+ typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename traits<XprType>::StorageKind StorageKind; + typedef typename traits<XprType>::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + + enum { + Flags = 0, + }; +}; + +template<typename XprType> +struct eval<TensorForcedEvalOp<XprType>, Eigen::Dense> +{ + typedef const TensorForcedEvalOp<XprType>& type; +}; + +template<typename XprType> +struct nested<TensorForcedEvalOp<XprType>, 1, typename eval<TensorForcedEvalOp<XprType> >::type> +{ + typedef TensorForcedEvalOp<XprType> type; +}; + +} // end namespace internal + + + +template<typename XprType> +class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename Eigen::internal::nested<TensorForcedEvalOp>::type Nested; + typedef typename Eigen::internal::traits<TensorForcedEvalOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorForcedEvalOp(const XprType& expr) + : m_xpr(expr) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; +}; + + +template<typename ArgType, typename Device> +struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device> +{ + typedef TensorForcedEvalOp<ArgType> XprType; + typedef typename ArgType::Scalar Scalar; + typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; + + enum { + IsAligned = true, + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + m_impl.evalSubExprsIfNeeded(NULL); + const Index numValues = m_impl.dimensions().TotalSize(); + m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType)); + // Should initialize the memory in case we're dealing with non POD types. 
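+    // Non-arithmetic coefficient types are default-constructed in place below,
+    // since the buffer returned by the device allocator is raw, uninitialized memory.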
+ if (!internal::is_arithmetic<CoeffReturnType>::value) { + for (Index i = 0; i < numValues; ++i) { + new(m_buffer+i) CoeffReturnType(); + } + } + typedef TensorEvalToOp<const ArgType> EvalTo; + EvalTo evalToTmp(m_buffer, m_op); + const bool PacketAccess = internal::IsVectorizable<Device, ArgType>::value; + const bool BlockAccess = false; + internal::TensorExecutor<const EvalTo, Device, PacketAccess, BlockAccess>::run(evalToTmp, m_device); + m_impl.cleanup(); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_device.deallocate(m_buffer); + m_buffer = NULL; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_buffer[index]; + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return m_buffer; } + + private: + TensorEvaluator<ArgType, Device> m_impl; + const ArgType m_op; + const Device& m_device; + CoeffReturnType* m_buffer; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h new file mode 100644 index 0000000000..e11d5ed22e --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -0,0 +1,104 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
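+// Editorial note (not part of the original source): TensorForcedEvalOp, defined in
+// the header above, is normally reached through TensorBase::eval(). A minimal
+// usage sketch, assuming that entry point and the dynamic-size Tensor class:
+//
+//   Eigen::Tensor<float, 2> a(100, 100), b(100, 100);
+//   a.setRandom(); b.setRandom();
+//   // Materialize a + b into a temporary buffer before the scaling is evaluated.
+//   Eigen::Tensor<float, 2> c = (a + b).eval() * 0.5f;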
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H +#define EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H + +namespace Eigen { + +template<typename Scalar_, std::size_t NumIndices_, int Options_ = 0, typename IndexType = DenseIndex> class Tensor; +template<typename Scalar_, typename Dimensions, int Options_ = 0, typename IndexType = DenseIndex> class TensorFixedSize; +template<typename Scalar_, int Options_ = 0, typename IndexType = DenseIndex> class TensorVarDim; +template<typename PlainObjectType, int Options_ = Unaligned> class TensorMap; +template<typename PlainObjectType> class TensorRef; +template<typename Derived, int AccessLevel = internal::accessors_level<Derived>::value> class TensorBase; + +template<typename NullaryOp, typename PlainObjectType> class TensorCwiseNullaryOp; +template<typename UnaryOp, typename XprType> class TensorCwiseUnaryOp; +template<typename BinaryOp, typename LeftXprType, typename RightXprType> class TensorCwiseBinaryOp; +template<typename IfXprType, typename ThenXprType, typename ElseXprType> class TensorSelectOp; +template<typename Op, typename Dims, typename XprType> class TensorReductionOp; +template<typename XprType> class TensorIndexTupleOp; +template<typename ReduceOp, typename Dims, typename XprType> class TensorTupleReducerOp; +template<typename Axis, typename LeftXprType, typename RightXprType> class TensorConcatenationOp; +template<typename Dimensions, typename LeftXprType, typename RightXprType> class TensorContractionOp; +template<typename TargetType, typename XprType> class TensorConversionOp; +template<typename Dimensions, typename InputXprType, typename KernelXprType> class TensorConvolutionOp; +template<typename Dimensions, typename InputXprType, typename KernelXprType> class TensorConvolutionByFFTOp; +template<typename FFT, typename XprType, int FFTDataType, int FFTDirection> class TensorFFTOp; +template<typename IFFT, typename XprType, int ResultType> class TensorIFFTOp; +template<typename DFT, typename XprType, int ResultType> class TensorDFTOp; +template<typename IDFT, typename XprType, int ResultType> class TensorIDFTOp; +template<typename PatchDim, typename XprType> class TensorPatchOp; +template<DenseIndex Rows, DenseIndex Cols, typename XprType> class TensorImagePatchOp; +template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType> class TensorVolumePatchOp; +template<typename Broadcast, typename XprType> class TensorBroadcastingOp; +template<DenseIndex DimId, typename XprType> class TensorChippingOp; +template<typename NewDimensions, typename XprType> class TensorReshapingOp; +template<typename XprType> class TensorLayoutSwapOp; +template<typename StartIndices, typename Sizes, typename XprType> class TensorSlicingOp; +template<typename ReverseDimensions, typename XprType> class TensorReverseOp; +template<typename XprType> class TensorTrueIndicesOp; +template<typename PaddingDimensions, typename XprType> class TensorPaddingOp; +template<typename Shuffle, typename XprType> class TensorShufflingOp; +template<typename Strides, typename XprType> class TensorStridingOp; +template<typename Strides, typename XprType> class TensorInflationOp; +template<typename Generator, typename XprType> class TensorGeneratorOp; +template<typename LeftXprType, typename RightXprType> class TensorAssignOp; + +template<typename CustomUnaryFunc, typename XprType> class TensorCustomUnaryOp; +template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> class TensorCustomBinaryOp; + +template<typename XprType> class 
TensorEvalToOp; +template<typename XprType> class TensorForcedEvalOp; + +template<typename ExpressionType, typename DeviceType> class TensorDevice; +template<typename Derived, typename Device> struct TensorEvaluator; + +class DefaultDevice; +class ThreadPoolDevice; +class GpuDevice; + +enum DFTResultType { + RealPart = 0, + ImagPart = 1, + BothParts = 2 +}; + +enum FFTDirection { + FFT_FORWARD = 0, + FFT_REVERSE = 1 +}; + +namespace internal { +template <typename Device, typename Expression> +struct IsVectorizable { + static const bool value = TensorEvaluator<Expression, Device>::PacketAccess; +}; + +template <typename Expression> +struct IsVectorizable<GpuDevice, Expression> { + static const bool value = TensorEvaluator<Expression, GpuDevice>::PacketAccess && + TensorEvaluator<Expression, GpuDevice>::IsAligned; +}; + +template <typename Device, typename Expression> +struct IsTileable { + static const bool value = TensorEvaluator<Expression, Device>::BlockAccess; +}; + +template <typename Expression, typename Device, + bool Vectorizable = IsVectorizable<Device, Expression>::value, + bool Tileable = IsTileable<Device, Expression>::value> +class TensorExecutor; +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h new file mode 100644 index 0000000000..526301ad5b --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -0,0 +1,706 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
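+// Editorial note (not part of the original source): the reducers defined below share
+// one protocol, driven by the reduction evaluator. A rough sketch of the scalar path:
+//
+//   Eigen::internal::SumReducer<float> reducer;
+//   float accum = reducer.initialize();              // 0 for a sum
+//   for (float v : values) reducer.reduce(v, &accum);
+//   float result = reducer.finalize(accum);          // identity for a sum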
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H +#define EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H + +namespace Eigen { +namespace internal { + +namespace { +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && defined(__CUDA_ARCH__) +__device__ int get_random_seed() { + return clock(); +} +#else +int get_random_seed() { +#ifdef _WIN32 + SYSTEMTIME st; + GetSystemTime(&st); + return st.wSecond + 1000 * st.wMilliseconds; +#elif __APPLE__ + return mach_absolute_time(); +#else + timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + return ts.tv_nsec; +#endif +} +#endif +} + + +// Standard reduction functors +template <typename T> struct SumReducer +{ + static const bool PacketAccess = true; + static const bool IsStateful = false; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + (*accum) += t; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = padd<Packet>(*accum, p); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return static_cast<T>(0); + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1<Packet>(0); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + return saccum + predux(vaccum); + } +}; + +template <typename T> struct MeanReducer +{ + static const bool PacketAccess = true; + static const bool IsStateful = true; + + MeanReducer() : scalarCount_(0), packetCount_(0) { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) { + (*accum) += t; + scalarCount_++; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) { + (*accum) = padd<Packet>(*accum, p); + packetCount_++; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return static_cast<T>(0); + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1<Packet>(0); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum / scalarCount_; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return pdiv(vaccum, pset1<Packet>(packetCount_)); + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + return (saccum + predux(vaccum)) / (scalarCount_ + packetCount_ * unpacket_traits<Packet>::size); + } + + protected: + int scalarCount_; + int packetCount_; +}; + +struct AndReducer +{ + static const bool PacketAccess = false; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const { + *accum = *accum && t; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const { + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const { + return accum; + } +}; + +struct OrReducer { + static const bool PacketAccess = false; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const { + *accum = *accum || t; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const { + return false; + } + 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const { + return accum; + } +}; + +template <typename T> struct MaxReducer +{ + static const bool PacketAccess = true; + static const bool IsStateful = false; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + if (t > *accum) { *accum = t; } + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = pmax<Packet>(*accum, p); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return Eigen::NumTraits<T>::lowest(); + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1<Packet>(initialize()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + return numext::maxi(saccum, predux_max(vaccum)); + } +}; + +template <typename T> struct MinReducer +{ + static const bool PacketAccess = true; + static const bool IsStateful = false; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + if (t < *accum) { *accum = t; } + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = pmin<Packet>(*accum, p); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return Eigen::NumTraits<T>::highest(); + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1<Packet>(initialize()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + return numext::mini(saccum, predux_min(vaccum)); + } +}; + + +template <typename T> struct ProdReducer +{ + static const bool PacketAccess = true; + static const bool IsStateful = false; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + (*accum) *= t; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = pmul<Packet>(*accum, p); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return static_cast<T>(1); + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1<Packet>(1); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + return saccum * predux_mul(vaccum); + } +}; + +#if !defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__) +// We're not compiling a cuda kernel +template <typename T> class UniformRandomGenerator { + + public: + static const bool PacketAccess 
= true; + + // Uses the given "seed" if non-zero, otherwise uses a random seed. + UniformRandomGenerator(unsigned int seed = 0) : m_seed(seed) { + seed = seed ? seed : get_random_seed(); + srand(seed); + } + UniformRandomGenerator(const UniformRandomGenerator& other) { + m_seed = other.m_seed; + } + + template<typename Index> + T operator()(Index, Index = 0) const { + return random<T>(); + } + template<typename Index> + typename internal::packet_traits<T>::type packetOp(Index i, Index j = 0) const { + const int packetSize = internal::packet_traits<T>::size; + EIGEN_ALIGN_DEFAULT T values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = random<T>(); + } + return internal::pload<typename internal::packet_traits<T>::type>(values); + } + + private: + unsigned int m_seed; +}; + +#if __cplusplus > 199711 +template <> class UniformRandomGenerator<float> { + public: + static const bool PacketAccess = true; + + // Uses the given "seed" if non-zero, otherwise uses a random seed. + UniformRandomGenerator(unsigned int seed = 0) : m_seed(seed) { + seed = seed ? seed : get_random_seed(); + m_generator.seed(seed); + } + UniformRandomGenerator(const UniformRandomGenerator<float>& other) { + m_generator.seed(other(0, 0) * UINT_MAX); + m_seed = other.m_seed; + } + + template<typename Index> + float operator()(Index, Index = 0) const { + return m_distribution(m_generator); + } + template<typename Index> + typename internal::packet_traits<float>::type packetOp(Index i, Index j = 0) const { + const int packetSize = internal::packet_traits<float>::size; + EIGEN_ALIGN_DEFAULT float values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = this->operator()(i, j); + } + return internal::pload<typename internal::packet_traits<float>::type>(values); + } + + private: + UniformRandomGenerator& operator = (const UniformRandomGenerator&); + // Make sure m_seed comes first to match the layout of the cpu + // version of the code. + unsigned int m_seed; + mutable std::mt19937 m_generator; + mutable std::uniform_real_distribution<float> m_distribution; +}; + +template <> class UniformRandomGenerator<double> { + public: + static const bool PacketAccess = true; + + // Uses the given "seed" if non-zero, otherwise uses a random seed. + UniformRandomGenerator(unsigned int seed = 0) : m_seed(seed) { + seed = seed ? seed : get_random_seed(); + m_generator.seed(seed); + } + UniformRandomGenerator(const UniformRandomGenerator<double>& other) { + m_generator.seed(other(0, 0) * UINT_MAX); + m_seed = other.m_seed; + } + + template<typename Index> + double operator()(Index, Index = 0) const { + return m_distribution(m_generator); + } + template<typename Index> + typename internal::packet_traits<double>::type packetOp(Index i, Index j = 0) const { + const int packetSize = internal::packet_traits<double>::size; + EIGEN_ALIGN_DEFAULT double values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = this->operator()(i, j); + } + return internal::pload<typename internal::packet_traits<double>::type>(values); + } + + private: + UniformRandomGenerator& operator = (const UniformRandomGenerator&); + // Make sure m_seed comes first to match the layout of the cpu + // version of the code. 
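+  // (Presumably so that the generator object can be copied bit-for-bit between the
+  // host and device instantiations of this template.)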
+ unsigned int m_seed; + mutable std::mt19937 m_generator; + mutable std::uniform_real_distribution<double> m_distribution; +}; +#endif + +#else + +// We're compiling a cuda kernel +template <typename T> class UniformRandomGenerator; + +template <> class UniformRandomGenerator<float> { + public: + static const bool PacketAccess = true; + + // Uses the given "seed" if non-zero, otherwise uses a random seed. + __device__ UniformRandomGenerator(unsigned int seed = 0) : m_seed(seed) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + seed = seed ? seed : get_random_seed(); + curand_init(seed, tid, 0, &m_state); + } + + __device__ UniformRandomGenerator(const UniformRandomGenerator& other) { + m_seed = other.m_seed; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int seed = m_seed ? m_seed : get_random_seed(); + curand_init(seed, tid, 0, &m_state); + } + + template<typename Index> + __device__ float operator()(Index, Index = 0) const { + return curand_uniform(&m_state); + } + template<typename Index> + __device__ float4 packetOp(Index, Index = 0) const { + return curand_uniform4(&m_state); + } + + private: + unsigned int m_seed; + mutable curandStatePhilox4_32_10_t m_state; +}; + +template <> class UniformRandomGenerator<double> { + public: + static const bool PacketAccess = true; + + // Uses the given "seed" if non-zero, otherwise uses a random seed. + __device__ UniformRandomGenerator(unsigned int seed = 0) : m_seed(seed) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + seed = seed ? seed : get_random_seed(); + curand_init(seed, tid, 0, &m_state); + } + __device__ UniformRandomGenerator(const UniformRandomGenerator& other) { + m_seed = other.m_seed; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int seed = m_seed ? m_seed : get_random_seed(); + curand_init(seed, tid, 0, &m_state); + } + template<typename Index> + __device__ double operator()(Index, Index = 0) const { + return curand_uniform_double(&m_state); + } + template<typename Index> + __device__ double2 packetOp(Index, Index = 0) const { + return curand_uniform2_double(&m_state); + } + + private: + unsigned int m_seed; + mutable curandStatePhilox4_32_10_t m_state; +}; + +template <> class UniformRandomGenerator<std::complex<float> > { + public: + static const bool PacketAccess = false; + + // Uses the given "seed" if non-zero, otherwise uses a random seed. + __device__ UniformRandomGenerator(unsigned int seed = 0) : m_seed(seed) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + seed = seed ? seed : get_random_seed(); + curand_init(seed, tid, 0, &m_state); + } + __device__ UniformRandomGenerator(const UniformRandomGenerator& other) { + m_seed = other.m_seed; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int seed = m_seed ? m_seed : get_random_seed(); + curand_init(seed, tid, 0, &m_state); + } + template<typename Index> + __device__ std::complex<float> operator()(Index, Index = 0) const { + float4 vals = curand_uniform4(&m_state); + return std::complex<float>(vals.x, vals.y); + } + + private: + unsigned int m_seed; + mutable curandStatePhilox4_32_10_t m_state; +}; + +template <> class UniformRandomGenerator<std::complex<double> > { + public: + static const bool PacketAccess = false; + + // Uses the given "seed" if non-zero, otherwise uses a random seed. + __device__ UniformRandomGenerator(unsigned int seed = 0) : m_seed(seed) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + seed = seed ? 
seed : get_random_seed(); + curand_init(seed, tid, 0, &m_state); + } + __device__ UniformRandomGenerator(const UniformRandomGenerator& other) { + m_seed = other.m_seed; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int seed = m_seed ? m_seed : get_random_seed(); + curand_init(seed, tid, 0, &m_state); + } + template<typename Index> + __device__ std::complex<double> operator()(Index, Index = 0) const { + double2 vals = curand_uniform2_double(&m_state); + return std::complex<double>(vals.x, vals.y); + } + + private: + unsigned int m_seed; + mutable curandStatePhilox4_32_10_t m_state; +}; + +#endif + + +#if (!defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)) && __cplusplus > 199711 +// We're not compiling a cuda kernel +template <typename T> class NormalRandomGenerator { + public: + static const bool PacketAccess = true; + + // Uses the given "seed" if non-zero, otherwise uses a random seed. + NormalRandomGenerator(unsigned int seed = 0) : m_distribution(0, 1), m_seed(seed) { + seed = seed ? seed : get_random_seed(); + m_generator.seed(seed); + } + NormalRandomGenerator(const NormalRandomGenerator& other) + : m_distribution(other.m_distribution), m_seed(other.m_seed) { + m_generator.seed(other(0, 0) * UINT_MAX); + } + + template<typename Index> + T operator()(Index, Index = 0) const { + return m_distribution(m_generator); + } + template<typename Index> + typename internal::packet_traits<T>::type packetOp(Index, Index = 0) const { + const int packetSize = internal::packet_traits<T>::size; + EIGEN_ALIGN_DEFAULT T values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = m_distribution(m_generator); + } + return internal::pload<typename internal::packet_traits<T>::type>(values); + } + + private: + unsigned int m_seed; + mutable std::normal_distribution<T> m_distribution; + mutable std::mt19937 m_generator; +}; + +#elif defined (EIGEN_USE_GPU) && defined(__CUDACC__) && defined(__CUDA_ARCH__) + +// We're compiling a cuda kernel +template <typename T> class NormalRandomGenerator; + +template <> class NormalRandomGenerator<float> { + public: + static const bool PacketAccess = true; + + // Uses the given "seed" if non-zero, otherwise uses a random seed. + __device__ NormalRandomGenerator(unsigned int seed = 0) : m_seed(seed) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + seed = seed ? seed : get_random_seed(); + curand_init(seed, tid, 0, &m_state); + } + __device__ NormalRandomGenerator(const NormalRandomGenerator<float>& other) { + m_seed = other.m_seed; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int seed = m_seed ? m_seed : get_random_seed(); + curand_init(seed, tid, 0, &m_state); + } + template<typename Index> + __device__ float operator()(Index, Index = 0) const { + return curand_normal(&m_state); + } + template<typename Index> + __device__ float4 packetOp(Index, Index = 0) const { + return curand_normal4(&m_state); + } + + private: + unsigned int m_seed; + mutable curandStatePhilox4_32_10_t m_state; +}; + +template <> class NormalRandomGenerator<double> { + public: + static const bool PacketAccess = true; + + // Uses the given "seed" if non-zero, otherwise uses a random seed. + __device__ NormalRandomGenerator(unsigned int seed = 0) : m_seed(seed) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + seed = seed ? 
seed : get_random_seed(); + curand_init(seed, tid, 0, &m_state); + } + __device__ NormalRandomGenerator(const NormalRandomGenerator<double>& other) { + m_seed = other.m_seed; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int seed = m_seed ? m_seed : get_random_seed(); + curand_init(seed, tid, 0, &m_state); + } + template<typename Index> + __device__ double operator()(Index, Index = 0) const { + return curand_normal_double(&m_state); + } + template<typename Index> + __device__ double2 packetOp(Index, Index = 0) const { + return curand_normal2_double(&m_state); + } + + private: + unsigned int m_seed; + mutable curandStatePhilox4_32_10_t m_state; +}; + + +template <> class NormalRandomGenerator<std::complex<float> > { + public: + static const bool PacketAccess = false; + + // Uses the given "seed" if non-zero, otherwise uses a random seed. + __device__ NormalRandomGenerator(unsigned int seed = 0) : m_seed(seed) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + seed = seed ? seed : get_random_seed(); + curand_init(seed, tid, 0, &m_state); + } + __device__ NormalRandomGenerator(const NormalRandomGenerator& other) { + m_seed = other.m_seed; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int seed = m_seed ? m_seed : get_random_seed(); + curand_init(seed, tid, 0, &m_state); + } + template<typename Index> + __device__ std::complex<float> operator()(Index, Index = 0) const { + float4 vals = curand_normal4(&m_state); + return std::complex<float>(vals.x, vals.y); + } + + private: + unsigned int m_seed; + mutable curandStatePhilox4_32_10_t m_state; +}; + +template <> class NormalRandomGenerator<std::complex<double> > { + public: + static const bool PacketAccess = false; + + // Uses the given "seed" if non-zero, otherwise uses a random seed. + __device__ NormalRandomGenerator(unsigned int seed = 0) : m_seed(seed) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + seed = seed ? seed : get_random_seed(); + curand_init(seed, tid, 0, &m_state); + } + __device__ NormalRandomGenerator(const NormalRandomGenerator& other) { + m_seed = other.m_seed; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int seed = m_seed ? m_seed : get_random_seed(); + curand_init(seed, tid, 0, &m_state); + } + template<typename Index> + __device__ std::complex<double> operator()(Index, Index = 0) const { + double2 vals = curand_normal2_double(&m_state); + return std::complex<double>(vals.x, vals.y); + } + + private: + unsigned int m_seed; + mutable curandStatePhilox4_32_10_t m_state; +}; +#else + +template <typename T> class NormalRandomGenerator { + public: + // Uses the given "seed" if non-zero, otherwise uses a random seed. 
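+  // Note that this fallback only records the seed: without C++11 <random> and
+  // outside CUDA kernels no distribution is available, so this generator is a
+  // placeholder rather than a usable source of normally distributed values.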
+ NormalRandomGenerator(unsigned int seed = 0) : m_seed(seed) {} + + private: + unsigned int m_seed; +}; + +#endif + + +template <typename T, typename Index, size_t NumDims> +class GaussianGenerator { + public: + static const bool PacketAccess = false; + + EIGEN_DEVICE_FUNC GaussianGenerator(const array<T, NumDims>& means, + const array<T, NumDims>& std_devs) + : m_means(means) { + for (int i = 0; i < NumDims; ++i) { + m_two_sigmas[i] = std_devs[i] * std_devs[i] * 2; + } + } + + T operator()(const array<Index, NumDims>& coordinates) const { + T tmp = T(0); + for (int i = 0; i < NumDims; ++i) { + T offset = coordinates[i] - m_means[i]; + tmp += offset * offset / m_two_sigmas[i]; + } + return std::exp(-tmp); + } + + private: + array<T, NumDims> m_means; + array<T, NumDims> m_two_sigmas; +}; + +template <typename T> struct ArgMaxTupleReducer +{ + static const bool PacketAccess = false; + static const bool IsStateful = false; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + if (t.second > accum->second) { *accum = t; } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return T(0, NumTraits<typename T::second_type>::lowest()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const { + return accum; + } +}; + +template <typename T> struct ArgMinTupleReducer +{ + static const bool PacketAccess = false; + static const bool IsStateful = false; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T& t, T* accum) const { + if (t.second < accum->second) { *accum = t; } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return T(0, NumTraits<typename T::second_type>::highest()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const { + return accum; + } +}; + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h new file mode 100644 index 0000000000..91a73669a4 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -0,0 +1,185 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H + +namespace Eigen { + +/** \class TensorGenerator + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor generator class. 
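+ *
+ * Editorial illustration (assuming the generate() entry point exposed by the tensor
+ * base class): fill a tensor from a functor that is handed the coordinates of each
+ * coefficient.
+ * \code
+ * struct Iota {
+ *   float operator()(const Eigen::array<Eigen::DenseIndex, 2>& coords) const {
+ *     return static_cast<float>(coords[0] * 10 + coords[1]);
+ *   }
+ * };
+ * Eigen::Tensor<float, 2> t(10, 10);
+ * Eigen::Tensor<float, 2> filled = t.generate(Iota());
+ * \endcode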
+ * + * + */ +namespace internal { +template<typename Generator, typename XprType> +struct traits<TensorGeneratorOp<Generator, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename Generator, typename XprType> +struct eval<TensorGeneratorOp<Generator, XprType>, Eigen::Dense> +{ + typedef const TensorGeneratorOp<Generator, XprType>& type; +}; + +template<typename Generator, typename XprType> +struct nested<TensorGeneratorOp<Generator, XprType>, 1, typename eval<TensorGeneratorOp<Generator, XprType> >::type> +{ + typedef TensorGeneratorOp<Generator, XprType> type; +}; + +} // end namespace internal + + + +template<typename Generator, typename XprType> +class TensorGeneratorOp : public TensorBase<TensorGeneratorOp<Generator, XprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorGeneratorOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorGeneratorOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorGeneratorOp>::type Nested; + typedef typename Eigen::internal::traits<TensorGeneratorOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorGeneratorOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorGeneratorOp(const XprType& expr, const Generator& generator) + : m_xpr(expr), m_generator(generator) {} + + EIGEN_DEVICE_FUNC + const Generator& generator() const { return m_generator; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const Generator m_generator; +}; + + +// Eval as rvalue +template<typename Generator, typename ArgType, typename Device> +struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device> +{ + typedef TensorGeneratorOp<Generator, ArgType> XprType; + typedef typename XprType::Index Index; + typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; + static const int NumDims = internal::array_size<Dimensions>::value; + typedef typename XprType::Scalar Scalar; + + enum { + IsAligned = false, + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_generator(op.generator()) + { + TensorEvaluator<ArgType, Device> impl(op.expression(), device); + m_dimensions = impl.dimensions(); + + if (NumDims > 0) { + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; + } + } else { + m_strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1]; + } + } + } + } + + typedef typename 
XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + array<Index, NumDims> coords; + extract_coordinates(index, coords); + return m_generator(coords); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void extract_coordinates(Index index, array<Index, NumDims>& coords) const { + if (NumDims > 0) { + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_strides[i]; + index -= idx * m_strides[i]; + coords[i] = idx; + } + coords[0] = index; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_strides[i]; + index -= idx * m_strides[i]; + coords[i] = idx; + } + coords[NumDims-1] = index; + } + } + } + + Dimensions m_dimensions; + array<Index, NumDims> m_strides; + Generator m_generator; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h new file mode 100644 index 0000000000..53dc0b04aa --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h @@ -0,0 +1,56 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
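// A brief usage sketch for the stream operator defined just below: a minimal
// example, assuming a small fixed-size float tensor (setValues() needs
// variadic-template support, and the caller includes <iostream>).
//
//   Eigen::Tensor<float, 2> t(2, 3);
//   t.setValues({{0.f, 1.f, 2.f}, {3.f, 4.f, 5.f}});
//   std::cout << t << std::endl;        // rank 2: printed as a 2x3 matrix
//   std::cout << t.sum() << std::endl;  // expressions are force-evaluated first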
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_IO_H +#define EIGEN_CXX11_TENSOR_TENSOR_IO_H + +namespace Eigen { + +namespace internal { +template<> +struct significant_decimals_impl<std::string> + : significant_decimals_default_impl<std::string, true> +{}; +} + + +template <typename T> +std::ostream& operator << (std::ostream& os, const TensorBase<T, ReadOnlyAccessors>& expr) { + // Evaluate the expression if needed + TensorForcedEvalOp<const T> eval = expr.eval(); + TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice> tensor(eval, DefaultDevice()); + tensor.evalSubExprsIfNeeded(NULL); + + typedef typename internal::remove_const<typename T::Scalar>::type Scalar; + typedef typename T::Index Index; + typedef typename TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice>::Dimensions Dimensions; + const Index total_size = internal::array_prod(tensor.dimensions()); + + // Print the tensor as a 1d vector or a 2d matrix. + static const int rank = internal::array_size<Dimensions>::value; + if (rank == 0) { + os << tensor.coeff(0); + } else if (rank == 1) { + Map<const Array<Scalar, Dynamic, 1> > array(const_cast<Scalar*>(tensor.data()), total_size); + os << array; + } else { + const Index first_dim = tensor.dimensions()[0]; + static const int layout = TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice>::Layout; + Map<const Array<Scalar, Dynamic, Dynamic, layout> > matrix(const_cast<Scalar*>(tensor.data()), first_dim, total_size/first_dim); + os << matrix; + } + + // Cleanup. + tensor.cleanup(); + return os; +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_IO_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h new file mode 100644 index 0000000000..a1d33d964e --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -0,0 +1,757 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H +#define EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H + +namespace Eigen { + +/** \class TensorImagePatch + * \ingroup CXX11_Tensor_Module + * + * \brief Patch extraction specialized for image processing. + * This assumes that the input has at least 3 dimensions ordered as follows: + * 1st dimension: channels (of size d) + * 2nd dimension: rows (of size r) + * 3rd dimension: columns (of size c) + * There can be additional dimensions such as time (for video) or batch (for + * bulk processing) after the first 3. + * Calling the image patch code with patch_rows and patch_cols is equivalent + * to calling the regular patch extraction code with parameters d, patch_rows, + * patch_cols, and 1 for all the additional dimensions. 
+ */ +namespace internal { +template<DenseIndex Rows, DenseIndex Cols, typename XprType> +struct traits<TensorImagePatchOp<Rows, Cols, XprType> > : public traits<XprType> +{ + typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions + 1; + static const int Layout = XprTraits::Layout; +}; + +template<DenseIndex Rows, DenseIndex Cols, typename XprType> +struct eval<TensorImagePatchOp<Rows, Cols, XprType>, Eigen::Dense> +{ + typedef const TensorImagePatchOp<Rows, Cols, XprType>& type; +}; + +template<DenseIndex Rows, DenseIndex Cols, typename XprType> +struct nested<TensorImagePatchOp<Rows, Cols, XprType>, 1, typename eval<TensorImagePatchOp<Rows, Cols, XprType> >::type> +{ + typedef TensorImagePatchOp<Rows, Cols, XprType> type; +}; + +template <typename Self, bool Vectorizable> +struct ImagePatchCopyOp { + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::Impl Impl; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const Self& self, const Index num_coeff_to_copy, const Index dst_index, + Scalar* dst_data, const Index src_index) { + const Impl& impl = self.impl(); + for (Index i = 0; i < num_coeff_to_copy; ++i) { + dst_data[dst_index + i] = impl.coeff(src_index + i); + } + } +}; + +template <typename Self> +struct ImagePatchCopyOp<Self, true> { + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::Impl Impl; + typedef typename packet_traits<Scalar>::type Packet; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const Self& self, const Index num_coeff_to_copy, const Index dst_index, + Scalar* dst_data, const Index src_index) { + const Impl& impl = self.impl(); + const Index packet_size = internal::unpacket_traits<Packet>::size; + const Index vectorized_size = (num_coeff_to_copy / packet_size) * + packet_size; + for (Index i = 0; i < vectorized_size; i += packet_size) { + Packet p = impl.template packet<Unaligned>(src_index + i); + internal::pstoret<Scalar, Packet, Unaligned>(dst_data + dst_index + i, p); + } + for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) { + dst_data[dst_index + i] = impl.coeff(src_index + i); + } + } +}; + +template <typename Self> +struct ImagePatchPaddingOp { + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename packet_traits<Scalar>::type Packet; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const Index num_coeff_to_pad, const Scalar padding_value, + const Index dst_index, Scalar* dst_data) { + const Index packet_size = internal::unpacket_traits<Packet>::size; + const Packet padded_packet = internal::pset1<Packet>(padding_value); + const Index vectorized_size = (num_coeff_to_pad / packet_size) * + packet_size; + for (Index i = 0; i < vectorized_size; i += packet_size) { + internal::pstoret<Scalar, Packet, Unaligned>(dst_data + dst_index + i, + padded_packet); + } + for (Index i = vectorized_size; i < num_coeff_to_pad; ++i) { + dst_data[dst_index + i] = padding_value; + } + } +}; + +} // end namespace internal + +template<DenseIndex Rows, DenseIndex Cols, typename XprType> +class TensorImagePatchOp : public 
TensorBase<TensorImagePatchOp<Rows, Cols, XprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorImagePatchOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorImagePatchOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorImagePatchOp>::type Nested; + typedef typename Eigen::internal::traits<TensorImagePatchOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorImagePatchOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols, + DenseIndex row_strides, DenseIndex col_strides, + DenseIndex in_row_strides, DenseIndex in_col_strides, + DenseIndex row_inflate_strides, DenseIndex col_inflate_strides, + PaddingType padding_type, Scalar padding_value) + : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols), + m_row_strides(row_strides), m_col_strides(col_strides), + m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), + m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides), + m_padding_explicit(false), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0), + m_padding_type(padding_type), m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols, + DenseIndex row_strides, DenseIndex col_strides, + DenseIndex in_row_strides, DenseIndex in_col_strides, + DenseIndex row_inflate_strides, DenseIndex col_inflate_strides, + DenseIndex padding_top, DenseIndex padding_bottom, + DenseIndex padding_left, DenseIndex padding_right, + Scalar padding_value) + : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols), + m_row_strides(row_strides), m_col_strides(col_strides), + m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), + m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides), + m_padding_explicit(true), m_padding_top(padding_top), m_padding_bottom(padding_bottom), + m_padding_left(padding_left), m_padding_right(padding_right), + m_padding_type(PADDING_VALID), m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC + DenseIndex patch_rows() const { return m_patch_rows; } + EIGEN_DEVICE_FUNC + DenseIndex patch_cols() const { return m_patch_cols; } + EIGEN_DEVICE_FUNC + DenseIndex row_strides() const { return m_row_strides; } + EIGEN_DEVICE_FUNC + DenseIndex col_strides() const { return m_col_strides; } + EIGEN_DEVICE_FUNC + DenseIndex in_row_strides() const { return m_in_row_strides; } + EIGEN_DEVICE_FUNC + DenseIndex in_col_strides() const { return m_in_col_strides; } + EIGEN_DEVICE_FUNC + DenseIndex row_inflate_strides() const { return m_row_inflate_strides; } + EIGEN_DEVICE_FUNC + DenseIndex col_inflate_strides() const { return m_col_inflate_strides; } + EIGEN_DEVICE_FUNC + bool padding_explicit() const { return m_padding_explicit; } + EIGEN_DEVICE_FUNC + DenseIndex padding_top() const { return m_padding_top; } + EIGEN_DEVICE_FUNC + DenseIndex padding_bottom() const { return m_padding_bottom; } + EIGEN_DEVICE_FUNC + DenseIndex padding_left() const { return m_padding_left; } + EIGEN_DEVICE_FUNC + DenseIndex padding_right() const { return m_padding_right; } + EIGEN_DEVICE_FUNC + PaddingType padding_type() 
const { return m_padding_type; } + EIGEN_DEVICE_FUNC + Scalar padding_value() const { return m_padding_value; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const DenseIndex m_patch_rows; + const DenseIndex m_patch_cols; + const DenseIndex m_row_strides; + const DenseIndex m_col_strides; + const DenseIndex m_in_row_strides; + const DenseIndex m_in_col_strides; + const DenseIndex m_row_inflate_strides; + const DenseIndex m_col_inflate_strides; + const bool m_padding_explicit; + const DenseIndex m_padding_top; + const DenseIndex m_padding_bottom; + const DenseIndex m_padding_left; + const DenseIndex m_padding_right; + const PaddingType m_padding_type; + const Scalar m_padding_value; +}; + +// Eval as rvalue +template<DenseIndex Rows, DenseIndex Cols, typename ArgType, typename Device> +struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> +{ + typedef TensorImagePatchOp<Rows, Cols, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + static const int NumDims = NumInputDims + 1; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; + typedef TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, + Device> Self; + typedef TensorEvaluator<ArgType, Device> Impl; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = true, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = NumDims == 5, + }; + + typedef typename internal::TensorBlock<Index, Scalar, NumDims, Layout> + OutputTensorBlock; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + EIGEN_STATIC_ASSERT(NumDims >= 4, YOU_MADE_A_PROGRAMMING_MISTAKE); + + m_paddingValue = op.padding_value(); + + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + + // Caches a few variables. + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_inputDepth = input_dims[0]; + m_inputRows = input_dims[1]; + m_inputCols = input_dims[2]; + } else { + m_inputDepth = input_dims[NumInputDims-1]; + m_inputRows = input_dims[NumInputDims-2]; + m_inputCols = input_dims[NumInputDims-3]; + } + + m_row_strides = op.row_strides(); + m_col_strides = op.col_strides(); + + // Input strides and effective input/patch size + m_in_row_strides = op.in_row_strides(); + m_in_col_strides = op.in_col_strides(); + m_row_inflate_strides = op.row_inflate_strides(); + m_col_inflate_strides = op.col_inflate_strides(); + // The "effective" input rows and input cols are the input rows and cols + // after inflating them with zeros. + // For examples, a 2x3 matrix with row_inflate_strides and + // col_inflate_strides of 2 comes from: + // A B C + // D E F + // + // to a matrix is 3 x 5: + // + // A . B . C + // . . . . . + // D . E . 
F + + m_input_rows_eff = (m_inputRows - 1) * m_row_inflate_strides + 1; + m_input_cols_eff = (m_inputCols - 1) * m_col_inflate_strides + 1; + m_patch_rows_eff = op.patch_rows() + (op.patch_rows() - 1) * (m_in_row_strides - 1); + m_patch_cols_eff = op.patch_cols() + (op.patch_cols() - 1) * (m_in_col_strides - 1); + + if (op.padding_explicit()) { + m_outputRows = ceil((m_input_rows_eff + op.padding_top() + op.padding_bottom() - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides)); + m_outputCols = ceil((m_input_cols_eff + op.padding_left() + op.padding_right() - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides)); + m_rowPaddingTop = op.padding_top(); + m_colPaddingLeft = op.padding_left(); + } else { + // Computing padding from the type + switch (op.padding_type()) { + case PADDING_VALID: + m_outputRows = ceil((m_input_rows_eff - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides)); + m_outputCols = ceil((m_input_cols_eff - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides)); + // Calculate the padding + m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2; + m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2; + break; + case PADDING_SAME: + m_outputRows = ceil(m_input_rows_eff / static_cast<float>(m_row_strides)); + m_outputCols = ceil(m_input_cols_eff / static_cast<float>(m_col_strides)); + // Calculate the padding + m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2; + m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2; + break; + default: + eigen_assert(false && "unexpected padding"); + } + } + eigen_assert(m_outputRows > 0); + eigen_assert(m_outputCols > 0); + + // Dimensions for result of extraction. + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + // ColMajor + // 0: depth + // 1: patch_rows + // 2: patch_cols + // 3: number of patches + // 4 and beyond: anything else (such as batch). + m_dimensions[0] = input_dims[0]; + m_dimensions[1] = op.patch_rows(); + m_dimensions[2] = op.patch_cols(); + m_dimensions[3] = m_outputRows * m_outputCols; + for (int i = 4; i < NumDims; ++i) { + m_dimensions[i] = input_dims[i-1]; + } + } else { + // RowMajor + // NumDims-1: depth + // NumDims-2: patch_rows + // NumDims-3: patch_cols + // NumDims-4: number of patches + // NumDims-5 and beyond: anything else (such as batch). + m_dimensions[NumDims-1] = input_dims[NumInputDims-1]; + m_dimensions[NumDims-2] = op.patch_rows(); + m_dimensions[NumDims-3] = op.patch_cols(); + m_dimensions[NumDims-4] = m_outputRows * m_outputCols; + for (int i = NumDims-5; i >= 0; --i) { + m_dimensions[i] = input_dims[i]; + } + } + + // Strides for moving the patch in various dimensions. + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_colStride = m_dimensions[1]; + m_patchStride = m_colStride * m_dimensions[2] * m_dimensions[0]; + m_otherStride = m_patchStride * m_dimensions[3]; + } else { + m_colStride = m_dimensions[NumDims-2]; + m_patchStride = m_colStride * m_dimensions[NumDims-3] * m_dimensions[NumDims-1]; + m_otherStride = m_patchStride * m_dimensions[NumDims-4]; + } + + // Strides for navigating through the input tensor. + m_rowInputStride = m_inputDepth; + m_colInputStride = m_inputDepth * m_inputRows; + m_patchInputStride = m_inputDepth * m_inputRows * m_inputCols; + + // Fast representations of different variables. 
+ m_fastOtherStride = internal::TensorIntDivisor<Index>(m_otherStride); + m_fastPatchStride = internal::TensorIntDivisor<Index>(m_patchStride); + m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride); + m_fastInputRowStride = internal::TensorIntDivisor<Index>(m_row_inflate_strides); + m_fastInputColStride = internal::TensorIntDivisor<Index>(m_col_inflate_strides); + m_fastInputColsEff = internal::TensorIntDivisor<Index>(m_input_cols_eff); + + // Number of patches in the width dimension. + m_fastOutputRows = internal::TensorIntDivisor<Index>(m_outputRows); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[0]); + } else { + m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[NumDims-1]); + } + + m_block_total_size_max = numext::maxi(static_cast<std::size_t>(1), + device.lastLevelCacheSize() / + sizeof(Scalar)); + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + // Patch index corresponding to the passed in index. + const Index patchIndex = index / m_fastPatchStride; + // Find the offset of the element wrt the location of the first element. + const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastOutputDepth; + + // Other ways to index this element. + const Index otherIndex = (NumDims == 4) ? 0 : index / m_fastOtherStride; + const Index patch2DIndex = (NumDims == 4) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride; + + // Calculate col index in the input original tensor. + const Index colIndex = patch2DIndex / m_fastOutputRows; + const Index colOffset = patchOffset / m_fastColStride; + const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft; + const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); + if (inputCol < 0 || inputCol >= m_input_cols_eff || + ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) { + return Scalar(m_paddingValue); + } + + // Calculate row index in the original input tensor. + const Index rowIndex = patch2DIndex - colIndex * m_outputRows; + const Index rowOffset = patchOffset - colOffset * m_colStride; + const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop; + const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0); + if (inputRow < 0 || inputRow >= m_input_rows_eff || + ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) { + return Scalar(m_paddingValue); + } + + const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 
0 : NumDims - 1; + const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; + + const Index inputIndex = depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex * m_patchInputStride; + return m_impl.coeff(inputIndex); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const Index packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1) { + return packetWithPossibleZero(index); + } + + const Index indices[2] = {index, index + packetSize - 1}; + const Index patchIndex = indices[0] / m_fastPatchStride; + if (patchIndex != indices[1] / m_fastPatchStride) { + return packetWithPossibleZero(index); + } + const Index otherIndex = (NumDims == 4) ? 0 : indices[0] / m_fastOtherStride; + eigen_assert(otherIndex == indices[1] / m_fastOtherStride); + + // Find the offset of the element wrt the location of the first element. + const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastOutputDepth, + (indices[1] - patchIndex * m_patchStride) / m_fastOutputDepth}; + + const Index patch2DIndex = (NumDims == 4) ? patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride; + eigen_assert(patch2DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride); + + const Index colIndex = patch2DIndex / m_fastOutputRows; + const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, patchOffsets[1] / m_fastColStride}; + + // Calculate col indices in the original input tensor. + const Index inputCols[2] = {colIndex * m_col_strides + colOffsets[0] - + m_colPaddingLeft, colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft}; + if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) { + return internal::pset1<PacketReturnType>(Scalar(m_paddingValue)); + } + + if (inputCols[0] == inputCols[1]) { + const Index rowIndex = patch2DIndex - colIndex * m_outputRows; + const Index rowOffsets[2] = {patchOffsets[0] - colOffsets[0]*m_colStride, patchOffsets[1] - colOffsets[1]*m_colStride}; + eigen_assert(rowOffsets[0] <= rowOffsets[1]); + // Calculate col indices in the original input tensor. + const Index inputRows[2] = {rowIndex * m_row_strides + rowOffsets[0] - + m_rowPaddingTop, rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop}; + + if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) { + return internal::pset1<PacketReturnType>(Scalar(m_paddingValue)); + } + + if (inputRows[0] >= 0 && inputRows[1] < m_inputRows) { + // no padding + const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 
0 : NumDims - 1; + const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; + const Index inputIndex = depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex * m_patchInputStride; + return m_impl.template packet<Unaligned>(inputIndex); + } + } + + return packetWithPossibleZero(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector<internal::TensorOpResourceRequirements>* resources) const { + resources->push_back(internal::TensorOpResourceRequirements( + internal::kSkewedInnerDims, m_block_total_size_max)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + OutputTensorBlock* output_block) const { + typedef typename internal::ImagePatchCopyOp<Self, PacketAccess> + ImagePatchCopyOp; + typedef typename internal::ImagePatchPaddingOp<Self> ImagePatchPaddingOp; + + // Calculate loop limits and various input/output dim sizes. + const DSizes<Index, NumDims>& block_sizes = output_block->block_sizes(); + const bool col_major = + static_cast<int>(Layout) == static_cast<int>(ColMajor); + const Index depth_dim_size = block_sizes[col_major ? 0 : NumDims - 1]; + const Index output_depth_dim_size = m_dimensions[ + col_major ? 0 : NumDims - 1]; + const Index row_dim_size = block_sizes[col_major ? 1 : NumDims - 2]; + const Index output_row_dim_size = m_dimensions[col_major ? 1 : NumDims - 2]; + const Index col_dim_size = block_sizes[col_major ? 2 : NumDims - 3]; + const Index block_col_stride = row_dim_size * depth_dim_size; + const Index patch_index_dim_size = block_sizes[col_major ? 3 : NumDims - 4]; + const Index outer_dim_size = block_sizes.TotalSize() / + (depth_dim_size * row_dim_size * col_dim_size * patch_index_dim_size); + + const Index patch_size = row_dim_size * col_dim_size * depth_dim_size; + const Index batch_size = patch_size * patch_index_dim_size; + + Index output_index = output_block->first_coeff_index(); + + // Loop through outer dimensions. + for (Index outer_dim_index = 0; + outer_dim_index < outer_dim_size; + ++outer_dim_index) { + const Index outer_output_base_index = outer_dim_index * batch_size; + // Find the offset of the element wrt the location of the first element. + const Index patchIndexStart = output_index / m_fastPatchStride; + const Index patchOffset = + (output_index - patchIndexStart * m_patchStride) / m_fastOutputDepth; + const Index colOffsetStart = patchOffset / m_fastColStride; + // Other ways to index this element. + const Index otherIndex = (NumDims == 4) ? + 0 : output_index / m_fastOtherStride; + const Index patch2DIndexStart = (NumDims == 4) ? + 0 : (output_index - otherIndex * m_otherStride) / m_fastPatchStride; + // Calculate starting depth index. + const Index depth = output_index - (output_index / m_fastOutputDepth) * + output_depth_dim_size; + const Index patch_input_base_index = depth + otherIndex * + m_patchInputStride; + + // Loop through patches. + for (Index patch_index_dim_index = 0; + patch_index_dim_index < patch_index_dim_size; + ++patch_index_dim_index) { + const Index patch_output_base_index = outer_output_base_index + + patch_index_dim_index * patch_size; + // Patch index corresponding to the passed in index. + const Index patchIndex = patchIndexStart + patch_index_dim_index; + const Index patch2DIndex = (NumDims == 4) ? 
+ patchIndex : patch2DIndexStart + patch_index_dim_index; + const Index colIndex = patch2DIndex / m_fastOutputRows; + const Index input_col_base = colIndex * m_col_strides; + const Index row_offset_base = (patch2DIndex - colIndex * m_outputRows) * + m_row_strides - m_rowPaddingTop; + + // Loop through columns. + for (Index col_dim_index = 0; + col_dim_index < col_dim_size; + ++col_dim_index) { + const Index col_output_base_index = patch_output_base_index + + col_dim_index * block_col_stride; + + // Calculate col index in the input original tensor. + Index colOffset = colOffsetStart + col_dim_index; + Index inputCol = input_col_base + colOffset * m_in_col_strides - + m_colPaddingLeft; + Index origInputCol = (m_col_inflate_strides == 1) ? + inputCol : ((inputCol >= 0) ? + (inputCol / m_fastInputColStride) : 0); + + bool pad_column = false; + if (inputCol < 0 || inputCol >= m_input_cols_eff || + ((m_col_inflate_strides != 1) && + (inputCol != origInputCol * m_col_inflate_strides))) { + pad_column = true; + } + + const Index col_input_base_index = patch_input_base_index + + origInputCol * m_colInputStride; + const Index input_row_base = row_offset_base + + ((patchOffset + col_dim_index * output_row_dim_size) - + colOffset * m_colStride) * m_in_row_strides; + // Loop through rows. + for (Index row_dim_index = 0; + row_dim_index < row_dim_size; + ++row_dim_index) { + const Index output_base_index = col_output_base_index + + row_dim_index * depth_dim_size; + bool pad_row = false; + Index inputIndex; + if (!pad_column) { + Index inputRow = input_row_base + row_dim_index * + m_in_row_strides; + Index origInputRow = (m_row_inflate_strides == 1) ? + inputRow : ((inputRow >= 0) ? + (inputRow / m_fastInputRowStride) : 0); + if (inputRow < 0 || inputRow >= m_input_rows_eff || + ((m_row_inflate_strides != 1) && + (inputRow != origInputRow * m_row_inflate_strides))) { + pad_row = true; + } else { + inputIndex = col_input_base_index + origInputRow * + m_rowInputStride; + } + } + // Copy (or pad) along depth dimension. + if (pad_column || pad_row) { + ImagePatchPaddingOp::Run(depth_dim_size, Scalar(m_paddingValue), + output_base_index, output_block->data()); + } else { + ImagePatchCopyOp::Run(*this, depth_dim_size, + output_base_index, output_block->data(), + inputIndex); + } + } + } + } + output_index += m_otherStride; + } + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + + Index rowPaddingTop() const { return m_rowPaddingTop; } + Index colPaddingLeft() const { return m_colPaddingLeft; } + Index outputRows() const { return m_outputRows; } + Index outputCols() const { return m_outputCols; } + Index userRowStride() const { return m_row_strides; } + Index userColStride() const { return m_col_strides; } + Index userInRowStride() const { return m_in_row_strides; } + Index userInColStride() const { return m_in_col_strides; } + Index rowInflateStride() const { return m_row_inflate_strides; } + Index colInflateStride() const { return m_col_inflate_strides; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const + { + // Location of the first element of the patch. + // ColMajor + // 0: d, 1: patch_rows, 2: patch_cols, 3: number of patches, 4: number of batches + // RowMajor + // 0: number of batches, 1: number of patches, 2: patch_cols , 3: patch_rows, 4: d + const Index patch2DIndex = coords[static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 
3 : 1]; + + array<Index, NumDims-1> inputCoords; + Index input_col_idx = patch2DIndex / m_fastInputColsEff; + Index inputCol = input_col_idx + coords[1] * m_in_row_strides - m_rowPaddingTop; + Index inputRow = patch2DIndex - input_col_idx * m_input_cols_eff + coords[2] * m_in_col_strides - m_colPaddingLeft; + const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); + const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + inputCoords[0] = coords[0]; // depth + inputCoords[1] = origInputCol; + inputCoords[2] = origInputRow; + inputCoords[3] = coords[4]; // batch + } else { + inputCoords[3] = coords[4]; // depth + inputCoords[2] = origInputCol; + inputCoords[1] = origInputRow; + inputCoords[0] = coords[0]; // batch + } + // If the computed coordinates are outside the original image perimeter, return 0. + if (inputCol < 0 || inputCol >= m_input_cols_eff || inputRow < 0 || inputRow >= m_input_rows_eff || + ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides)) || + ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) { + return Scalar(m_paddingValue); + } + if (TensorEvaluator<ArgType, Device>::CoordAccess) { + return m_impl.coeff(inputCoords); + } else { + Index inputIndex; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + inputIndex = + inputCoords[3] * m_patchInputStride + + inputCoords[2] * m_colInputStride + + inputCoords[1] * m_rowInputStride + + inputCoords[0]; + } else { + inputIndex = + inputCoords[1] * m_patchInputStride + + inputCoords[2] * m_colInputStride + + inputCoords[3] * m_rowInputStride + + inputCoords[4]; + } + return m_impl.coeff(inputIndex); + } + } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + + Dimensions m_dimensions; + + Index m_otherStride; + Index m_patchStride; + Index m_colStride; + Index m_row_strides; + Index m_col_strides; + + Index m_in_row_strides; + Index m_in_col_strides; + Index m_row_inflate_strides; + Index m_col_inflate_strides; + + Index m_input_rows_eff; + Index m_input_cols_eff; + Index m_patch_rows_eff; + Index m_patch_cols_eff; + + internal::TensorIntDivisor<Index> m_fastOtherStride; + internal::TensorIntDivisor<Index> m_fastPatchStride; + internal::TensorIntDivisor<Index> m_fastColStride; + internal::TensorIntDivisor<Index> m_fastInputRowStride; + internal::TensorIntDivisor<Index> m_fastInputColStride; + internal::TensorIntDivisor<Index> m_fastInputColsEff; + + Index m_rowInputStride; + Index m_colInputStride; + Index m_patchInputStride; + + Index m_inputDepth; + Index m_inputRows; + Index m_inputCols; + + Index m_outputRows; + Index m_outputCols; + + Index m_rowPaddingTop; + Index m_colPaddingLeft; + + internal::TensorIntDivisor<Index> m_fastOutputRows; + internal::TensorIntDivisor<Index> m_fastOutputDepth; + + Scalar m_paddingValue; + std::size_t m_block_total_size_max; + + TensorEvaluator<ArgType, Device> m_impl; +}; + + +} // end namespace Eigen + +#endif // 
EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h new file mode 100644 index 0000000000..7631b54f2f --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -0,0 +1,421 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H +#define EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H + +#if defined(EIGEN_HAS_CONSTEXPR) && defined(EIGEN_HAS_VARIADIC_TEMPLATES) + +#define EIGEN_HAS_INDEX_LIST + +namespace Eigen { + +/** \internal + * + * \class TensorIndexList + * \ingroup CXX11_Tensor_Module + * + * \brief Set of classes used to encode a set of Tensor dimensions/indices. + * + * The indices in the list can be known at compile time or at runtime. A mix + * of static and dynamic indices can also be provided if needed. The tensor + * code will attempt to take advantage of the indices that are known at + * compile time to optimize the code it generates. + * + * This functionality requires a c++11 compliant compiler. If your compiler + * is older you need to use arrays of indices instead. + * + * Several examples are provided in the cxx11_tensor_index_list.cpp file. + * + * \sa Tensor + */ + +template <DenseIndex n> +struct type2index { + static const DenseIndex value = n; + constexpr operator DenseIndex() const { return n; } + void set(DenseIndex val) { + eigen_assert(val == n); + } +}; + +namespace internal { +template <typename T> +void update_value(T& val, DenseIndex new_val) { + val = new_val; +} +template <DenseIndex n> +void update_value(type2index<n>& val, DenseIndex new_val) { + val.set(new_val); +} + +template <typename T> +struct is_compile_time_constant { + static constexpr bool value = false; +}; + +template <DenseIndex idx> +struct is_compile_time_constant<type2index<idx> > { + static constexpr bool value = true; +}; +template <DenseIndex idx> +struct is_compile_time_constant<const type2index<idx> > { + static constexpr bool value = true; +}; +template <DenseIndex idx> +struct is_compile_time_constant<type2index<idx>& > { + static constexpr bool value = true; +}; +template <DenseIndex idx> +struct is_compile_time_constant<const type2index<idx>& > { + static constexpr bool value = true; +}; + +template <DenseIndex Idx> +struct tuple_coeff { + template <typename... T> + static constexpr DenseIndex get(const DenseIndex i, const std::tuple<T...>& t) { + return std::get<Idx>(t) * (i == Idx) + tuple_coeff<Idx-1>::get(i, t) * (i != Idx); + } + template <typename... T> + static void set(const DenseIndex i, std::tuple<T...>& t, const DenseIndex value) { + if (i == Idx) { + update_value(std::get<Idx>(t), value); + } else { + tuple_coeff<Idx-1>::set(i, t, value); + } + } + + template <typename... T> + static constexpr bool value_known_statically(const DenseIndex i, const std::tuple<T...>& t) { + return ((i == Idx) & is_compile_time_constant<typename std::tuple_element<Idx, std::tuple<T...> >::type>::value) || + tuple_coeff<Idx-1>::value_known_statically(i, t); + } + + template <typename... 
T> + static constexpr bool values_up_to_known_statically(const std::tuple<T...>& t) { + return is_compile_time_constant<typename std::tuple_element<Idx, std::tuple<T...> >::type>::value && + tuple_coeff<Idx-1>::values_up_to_known_statically(t); + } + + template <typename... T> + static constexpr bool values_up_to_statically_known_to_increase(const std::tuple<T...>& t) { + return is_compile_time_constant<typename std::tuple_element<Idx, std::tuple<T...> >::type>::value && + is_compile_time_constant<typename std::tuple_element<Idx-1, std::tuple<T...> >::type>::value && + std::get<Idx>(t) > std::get<Idx-1>(t) && + tuple_coeff<Idx-1>::values_up_to_statically_known_to_increase(t); + } +}; + +template <> +struct tuple_coeff<0> { + template <typename... T> + static constexpr DenseIndex get(const DenseIndex i, const std::tuple<T...>& t) { + // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr + return std::get<0>(t) * (i == 0); + } + template <typename... T> + static void set(const DenseIndex i, std::tuple<T...>& t, const DenseIndex value) { + eigen_assert (i == 0); + update_value(std::get<0>(t), value); + } + template <typename... T> + static constexpr bool value_known_statically(const DenseIndex i, const std::tuple<T...>& t) { + // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr + return is_compile_time_constant<typename std::tuple_element<0, std::tuple<T...> >::type>::value & (i == 0); + } + + template <typename... T> + static constexpr bool values_up_to_known_statically(const std::tuple<T...>& t) { + return is_compile_time_constant<typename std::tuple_element<0, std::tuple<T...> >::type>::value; + } + + template <typename... T> + static constexpr bool values_up_to_statically_known_to_increase(const std::tuple<T...>& t) { + return true; + } +}; +} // namespace internal + + +template<typename FirstType, typename... OtherTypes> +struct IndexList : std::tuple<FirstType, OtherTypes...> { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex operator[] (const DenseIndex i) const { + return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1>::get(i, *this); + } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const DenseIndex value) { + return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1>::set(i, *this, value); + } + + constexpr IndexList(const std::tuple<FirstType, OtherTypes...>& other) : std::tuple<FirstType, OtherTypes...>(other) { } + constexpr IndexList() : std::tuple<FirstType, OtherTypes...>() { } + + constexpr bool value_known_statically(const DenseIndex i) const { + return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1>::value_known_statically(i, *this); + } + constexpr bool all_values_known_statically() const { + return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1>::values_up_to_known_statically(*this); + } + + constexpr bool values_statically_known_to_increase() const { + return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1>::values_up_to_statically_known_to_increase(*this); + } +}; + + +template<typename FirstType, typename... OtherTypes> +constexpr IndexList<FirstType, OtherTypes...> make_index_list(FirstType val1, OtherTypes... other_vals) { + return std::make_tuple(val1, other_vals...); +} + + +namespace internal { + +template<typename FirstType, typename... 
OtherTypes> size_t array_prod(const IndexList<FirstType, OtherTypes...>& sizes) { + size_t result = 1; + for (int i = 0; i < array_size<IndexList<FirstType, OtherTypes...> >::value; ++i) { + result *= sizes[i]; + } + return result; +}; + +template<typename FirstType, typename... OtherTypes> struct array_size<IndexList<FirstType, OtherTypes...> > { + static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value; +}; +template<typename FirstType, typename... OtherTypes> struct array_size<const IndexList<FirstType, OtherTypes...> > { + static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value; +}; + +template<DenseIndex n, typename FirstType, typename... OtherTypes> constexpr DenseIndex array_get(IndexList<FirstType, OtherTypes...>& a) { + return std::get<n>(a); +} +template<DenseIndex n, typename FirstType, typename... OtherTypes> constexpr DenseIndex array_get(const IndexList<FirstType, OtherTypes...>& a) { + return std::get<n>(a); +} + +template <typename T> +struct index_known_statically { + constexpr bool operator() (DenseIndex) const { + return false; + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_known_statically<IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i); + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_known_statically<const IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i); + } +}; + +template <typename T> +struct all_indices_known_statically { + constexpr bool operator() () const { + return false; + } +}; + +template <typename FirstType, typename... OtherTypes> +struct all_indices_known_statically<IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() () const { + return IndexList<FirstType, OtherTypes...>().all_values_known_statically(); + } +}; + +template <typename FirstType, typename... OtherTypes> +struct all_indices_known_statically<const IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() () const { + return IndexList<FirstType, OtherTypes...>().all_values_known_statically(); + } +}; + +template <typename T> +struct indices_statically_known_to_increase { + constexpr bool operator() () const { + return false; + } +}; + +template <typename FirstType, typename... OtherTypes> +struct indices_statically_known_to_increase<IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() () const { + return IndexList<FirstType, OtherTypes...>().values_statically_known_to_increase(); + } +}; + +template <typename FirstType, typename... OtherTypes> +struct indices_statically_known_to_increase<const IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() () const { + return IndexList<FirstType, OtherTypes...>().values_statically_known_to_increase(); + } +}; + +template <typename Tx> +struct index_statically_eq { + constexpr bool operator() (DenseIndex, DenseIndex) const { + return false; + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_statically_eq<IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i) && + IndexList<FirstType, OtherTypes...>()[i] == value; + } +}; + +template <typename FirstType, typename... 
OtherTypes> +struct index_statically_eq<const IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i) && + IndexList<FirstType, OtherTypes...>()[i] == value; + } +}; + +template <typename T> +struct index_statically_ne { + constexpr bool operator() (DenseIndex, DenseIndex) const { + return false; + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_statically_ne<IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i) && + IndexList<FirstType, OtherTypes...>()[i] != value; + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_statically_ne<const IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i) && + IndexList<FirstType, OtherTypes...>()[i] != value; + } +}; + + +template <typename T> +struct index_statically_gt { + constexpr bool operator() (DenseIndex, DenseIndex) const { + return false; + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_statically_gt<IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i) && + IndexList<FirstType, OtherTypes...>()[i] > value; + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_statically_gt<const IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i) && + IndexList<FirstType, OtherTypes...>()[i] > value; + } +}; + +template <typename T> +struct index_statically_lt { + constexpr bool operator() (DenseIndex, DenseIndex) const { + return false; + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_statically_lt<IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i) && + IndexList<FirstType, OtherTypes...>()[i] < value; + } +}; + +template <typename FirstType, typename... 
OtherTypes> +struct index_statically_lt<const IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i) && + IndexList<FirstType, OtherTypes...>()[i] < value; + } +}; + +} // end namespace internal +} // end namespace Eigen + +#else + +namespace Eigen { +namespace internal { + +// No C++11 support +template <typename T> +struct index_known_statically { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex) const{ + return false; + } +}; + +template <typename T> +struct all_indices_known_statically { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() () const { + return false; + } +}; + +template <typename T> +struct indices_statically_known_to_increase { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() () const { + return false; + } +}; + +template <typename T> +struct index_statically_eq { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{ + return false; + } +}; + +template <typename T> +struct index_statically_ne { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{ + return false; + } +}; + +template <typename T> +struct index_statically_gt { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{ + return false; + } +}; + +template <typename T> +struct index_statically_lt { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{ + return false; + } +}; + +} // end namespace internal +} // end namespace Eigen + +#endif + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h new file mode 100644 index 0000000000..40a50e4662 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -0,0 +1,219 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Ke Yang <yangke@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H +#define EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H + +namespace Eigen { + +/** \class TensorInflation + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor inflation class. 
+ * + * + */ +namespace internal { +template<typename Strides, typename XprType> +struct traits<TensorInflationOp<Strides, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename Strides, typename XprType> +struct eval<TensorInflationOp<Strides, XprType>, Eigen::Dense> +{ + typedef const TensorInflationOp<Strides, XprType>& type; +}; + +template<typename Strides, typename XprType> +struct nested<TensorInflationOp<Strides, XprType>, 1, typename eval<TensorInflationOp<Strides, XprType> >::type> +{ + typedef TensorInflationOp<Strides, XprType> type; +}; + +} // end namespace internal + +template<typename Strides, typename XprType> +class TensorInflationOp : public TensorBase<TensorInflationOp<Strides, XprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorInflationOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorInflationOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorInflationOp>::type Nested; + typedef typename Eigen::internal::traits<TensorInflationOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorInflationOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorInflationOp(const XprType& expr, const Strides& strides) + : m_xpr(expr), m_strides(strides) {} + + EIGEN_DEVICE_FUNC + const Strides& strides() const { return m_strides; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const Strides m_strides; +}; + +// Eval as rvalue +template<typename Strides, typename ArgType, typename Device> +struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device> +{ + typedef TensorInflationOp<Strides, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + + enum { + IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_strides(op.strides()) + { + m_dimensions = m_impl.dimensions(); + // Expand each dimension to the inflated dimension. + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] = (m_dimensions[i] - 1) * op.strides()[i] + 1; + } + + // Remember the strides for fast division. 
+ for (int i = 0; i < NumDims; ++i) { + m_fastStrides[i] = internal::TensorIntDivisor<Index>(m_strides[i]); + } + + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_outputStrides[0] = 1; + m_inputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + } + } else { // RowMajor + m_outputStrides[NumDims-1] = 1; + m_inputStrides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + } + } + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + // Computes the input index given the output index. Returns true if the output + // index doesn't fall into a hole. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool getInputIndex(Index index, Index* inputIndex) const + { + eigen_assert(index < dimensions().TotalSize()); + *inputIndex = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (idx != idx / m_fastStrides[i] * m_strides[i]) { + return false; + } + *inputIndex += idx / m_strides[i] * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (index != index / m_fastStrides[0] * m_strides[0]) { + return false; + } + *inputIndex += index / m_strides[0]; + return true; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + if (idx != idx / m_fastStrides[i] * m_strides[i]) { + return false; + } + *inputIndex += idx / m_strides[i] * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (index != index / m_fastStrides[NumDims-1] * m_strides[NumDims-1]) { + return false; + } + *inputIndex += index / m_strides[NumDims - 1]; + } + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + Index inputIndex = 0; + if (getInputIndex(index, &inputIndex)) { + return m_impl.coeff(inputIndex); + } else { + return Scalar(0); + } + } + + // TODO(yangke): optimize this function so that we can detect and produce + // all-zero packets + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array<Index, NumDims> m_outputStrides; + array<Index, NumDims> m_inputStrides; + 
TensorEvaluator<ArgType, Device> m_impl; + const Strides m_strides; + array<internal::TensorIntDivisor<Index>, NumDims> m_fastStrides; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h new file mode 100644 index 0000000000..375c763152 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h @@ -0,0 +1,82 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H +#define EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + +#include <initializer_list> + +namespace Eigen { + +/** \class TensorInitializer + * \ingroup CXX11_Tensor_Module + * + * \brief Helper template to initialize Tensors from std::initializer_lists. + */ +namespace internal { + +template <typename Derived, int N> +struct Initializer { + typedef std::initializer_list< + typename Initializer<Derived, N - 1>::InitList> InitList; + + static void run(TensorEvaluator<Derived, DefaultDevice>& tensor, + Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>* indices, + const InitList& vals) { + int i = 0; + for (auto v : vals) { + (*indices)[traits<Derived>::NumDimensions - N] = i++; + Initializer<Derived, N - 1>::run(tensor, indices, v); + } + } +}; + +template <typename Derived> +struct Initializer<Derived, 1> { + typedef std::initializer_list<typename traits<Derived>::Scalar> InitList; + + static void run(TensorEvaluator<Derived, DefaultDevice>& tensor, + Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>* indices, + const InitList& vals) { + int i = 0; + // There is likely a faster way to do that than iterating. + for (auto v : vals) { + (*indices)[traits<Derived>::NumDimensions - 1] = i++; + tensor.coeffRef(*indices) = v; + } + } +}; + +template <typename Derived> +struct Initializer<Derived, Dynamic> { + typedef std::initializer_list<typename traits<Derived>::Scalar> InitList; + + static void run(TensorEvaluator<Derived, DefaultDevice>& tensor, + Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>* indices, + const InitList& vals) { + // Static initialization not implemented for VarDims tensors. 
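+ // The nesting depth of the braced initializer has to match the rank at
+ // compile time, which is impossible for dynamic-rank (VarDims) tensors, so
+ // this specialization only asserts. Fixed-rank tensors can be initialized
+ // this way, e.g. (rough sketch, assuming the usual TensorBase::setValues()
+ // entry point that forwards to initialize_tensor() below):
+ //   Eigen::Tensor<float, 2> t(2, 3);
+ //   t.setValues({{1.f, 2.f, 3.f}, {4.f, 5.f, 6.f}});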
+ eigen_assert(false); + } +}; + +template <typename Derived, int N> +void initialize_tensor(TensorEvaluator<Derived, DefaultDevice>& tensor, + const typename Initializer<Derived, traits<Derived>::NumDimensions>::InitList& vals) { + Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions> indices; + Initializer<Derived, traits<Derived>::NumDimensions>::run(tensor, &indices, vals); +} + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_HAS_VARIADIC_TEMPLATES + +#endif // EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h new file mode 100644 index 0000000000..3e90b08c99 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -0,0 +1,357 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H +#define EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H + + +namespace Eigen { + +/** \internal + * + * \class TensorIntDiv + * \ingroup CXX11_Tensor_Module + * + * \brief Fast integer division by a constant. + * + * See the paper from Granlund and Montgomery for explanation. + * (at http://dx.doi.org/10.1145/773473.178249) + * + * \sa Tensor + */ + +namespace internal { + +#if !defined(__GCUDACC__) && !defined(__GCUDACC_HOST__) + +namespace { + // Note: result is undefined if val == 0 + template <typename T> + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int count_leading_zeros(const T val) + { +#ifdef __CUDA_ARCH__ + if (sizeof(T) == 8) { + return __clzll(val); + } + return __clz(val); +#elif EIGEN_COMP_MSVC + DWORD leading_zeros = 0; + if (sizeof(T) == 8) { + _BitScanReverse64(&leading_zero, val); + } + else { + _BitScanReverse(&leading_zero, val); + } +#else + if (sizeof(T) == 8) { + return __builtin_clzl(static_cast<uint64_t>(val)); + } + return __builtin_clz(static_cast<uint32_t>(val)); +#endif + } + + + template <typename T> + struct DividerTraits { +#if defined(__SIZEOF_INT128__) && !defined(__CUDACC__) + typedef typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type type; + static const int N = sizeof(T) * 8; +#else + typedef uint32_t type; + static const int N = 32; +#endif + }; + + + template <typename T> + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) { +#if defined(__CUDA_ARCH__) + return __umulhi(a, b); +#else + return (static_cast<uint64_t>(a) * b) >> 32; +#endif + } + +#if defined(__CUDA_ARCH__) + template <typename T> + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) { + return __umul64hi(a, b); + } +#else + template <typename T> + EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) { +#if defined(__SIZEOF_INT128__) && !defined(__CUDACC__) + __uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b); + return static_cast<uint64_t>(v >> 64); +#else + EIGEN_STATIC_ASSERT(sizeof(T) == 4, YOU_MADE_A_PROGRAMMING_MISTAKE); + return (a * b) >> 32; +#endif + } +#endif + + template <int N, typename T> + struct DividerHelper { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t computeMultiplier (const int log_div, const T divider) { + 
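+ // Granlund-Montgomery style multiplier: with N-bit operands and a shift of
+ // log_div bits, m = floor(2^(N+log_div) / divider) - 2^N + 1 fits in N bits,
+ // and divide() below recovers numerator / divider from the high N bits of
+ // m * numerator plus one addition and two shifts.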
EIGEN_STATIC_ASSERT(N == 32, YOU_MADE_A_PROGRAMMING_MISTAKE); + return (static_cast<uint64_t>(1) << (N+log_div)) / divider - (static_cast<uint64_t>(1) << N) + 1; + } + }; + +#if defined(__SIZEOF_INT128__) && !defined(__CUDACC__) + template <typename T> + struct DividerHelper<64, T> { + static EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) { + return ((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1); + } + }; +#endif +} + + +template <typename T> +struct TensorIntDivisor { + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() { + multiplier = 0; + shift1 = 0; + shift2 = 0; + } + + // Must have 0 < divider < 2^31. This is relaxed to + // 0 < divider < 2^63 when using 64-bit indices on platforms that support + // the __uint128_t type. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor(const T divider) { + const int N = DividerTraits<T>::N; + eigen_assert(divider < NumTraits<UnsignedType>::highest()/2); + eigen_assert(divider > 0); + + // fast ln2 + const int leading_zeros = count_leading_zeros(static_cast<UnsignedType>(divider)); + int log_div = N - leading_zeros; + // if divider is a power of two then log_div is 1 more than it should be. + if ((1ull << (log_div-1)) == divider) + log_div--; + + multiplier = DividerHelper<N, T>::computeMultiplier(log_div, divider); + shift1 = log_div > 1 ? 1 : log_div; + shift2 = log_div > 1 ? log_div-1 : 0; + } + + // Must have 0 <= numerator. On platforms that dont support the __uint128_t + // type numerator should also be less than 2^32-1. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const { + eigen_assert(numerator < NumTraits<UnsignedType>::highest()/2); + eigen_assert(numerator >= 0); + + UnsignedType t1 = muluh(multiplier, numerator); + UnsignedType t = (static_cast<UnsignedType>(numerator) - t1) >> shift1; + return (t1 + t) >> shift2; + } + + private: + typedef typename DividerTraits<T>::type UnsignedType; + UnsignedType multiplier; + int32_t shift1; + int32_t shift2; +}; + + +// Optimized version for signed 32 bit integers. +// Derived from Hacker's Delight. +template <> +class TensorIntDivisor<int32_t> { + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() { + magic = 0; + shift = 0; + } + // Must have 2 <= divider + EIGEN_DEVICE_FUNC TensorIntDivisor(int32_t divider) { + eigen_assert(divider >= 2); + calcMagic(divider); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const { +#ifdef __CUDA_ARCH__ + return (__umulhi(magic, n) >> shift); +#else + uint64_t v = static_cast<uint64_t>(magic) * static_cast<uint64_t>(n); + return (static_cast<uint32_t>(v >> 32) >> shift); +#endif + } + +private: + // Compute the magic numbers. See Hacker's Delight section 10 for an in + // depth explanation. + EIGEN_DEVICE_FUNC void calcMagic(int32_t d) { + const unsigned two31 = 0x80000000; // 2**31. + unsigned ad = d; + unsigned t = two31 + (ad >> 31); + unsigned anc = t - 1 - t%ad; // Absolute value of nc. + int p = 31; // Init. p. + unsigned q1 = two31/anc; // Init. q1 = 2**p/|nc|. + unsigned r1 = two31 - q1*anc; // Init. r1 = rem(2**p, |nc|). + unsigned q2 = two31/ad; // Init. q2 = 2**p/|d|. + unsigned r2 = two31 - q2*ad; // Init. r2 = rem(2**p, |d|). + unsigned delta = 0; + do { + p = p + 1; + q1 = 2*q1; // Update q1 = 2**p/|nc|. + r1 = 2*r1; // Update r1 = rem(2**p, |nc|). + if (r1 >= anc) { // (Must be an unsigned + q1 = q1 + 1; // comparison here). 
+ r1 = r1 - anc;} + q2 = 2*q2; // Update q2 = 2**p/|d|. + r2 = 2*r2; // Update r2 = rem(2**p, |d|). + if (r2 >= ad) { // (Must be an unsigned + q2 = q2 + 1; // comparison here). + r2 = r2 - ad;} + delta = ad - r2; + } while (q1 < delta || (q1 == delta && r1 == 0)); + + magic = (unsigned)(q2 + 1); + shift = p - 32; + } + + uint32_t magic; + int32_t shift; +}; + + +template <typename T> +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor<T>& divisor) { + return divisor.divide(numerator); +} + + +#else +// Reverse to the old code since gcudacc doesn't support the code above. +template <typename T> +struct TensorIntDivisor { + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() { + multiplier = 0; + shift1 = 0; + shift2 = 0; + } + + // Must have 1 <= divider <= 2^31-1 + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor(const T divider) { + const int N = 32; + eigen_assert(divider > 0); + eigen_assert(divider < (1ull<<(N-1))); + + // fast ln2 +#ifndef __CUDA_ARCH__ + const int leading_zeros = __builtin_clz(divider); +#else + const int leading_zeros = __clz(divider); +#endif + int log_div = N - leading_zeros; + // if divider is a power of two then log_div is 1 more than it should be. + if ((1ull << (log_div-1)) == divider) + log_div--; + + multiplier = (static_cast<uint64_t>(1) << (N+log_div)) / divider - (static_cast<uint64_t>(1) << N) + 1; + shift1 = log_div > 1 ? 1 : log_div; + shift2 = log_div > 1 ? log_div-1 : 0; + } + + // Must have 0 <= numerator <= 2^32-1 + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const { + const int N = 32; + eigen_assert(numerator >= 0); + eigen_assert(static_cast<uint64_t>(numerator) < 1ull<<N); + + uint32_t t1 = (multiplier * numerator) >> N; + uint32_t t = (static_cast<uint32_t>(numerator) - t1) >> shift1; + return (t1 + t) >> shift2; + } + + private: + uint64_t multiplier; + int32_t shift1; + int32_t shift2; +}; + + +// Optimized version for signed 32 bit integers. +// Derived from Hacker's Delight. +template <> +class TensorIntDivisor<int> { + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() { + magic = 0; + shift = 0; + } + // Must have 2 <= divider + EIGEN_DEVICE_FUNC TensorIntDivisor(int divider) { + eigen_assert(divider >= 2); + calcMagic(divider); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int n) const { +#ifdef __CUDA_ARCH__ + return (__umulhi(magic, n) >> shift); +#else + uint64_t v = static_cast<uint64_t>(magic) * static_cast<uint64_t>(n); + return (static_cast<unsigned int>(v >> 32) >> shift); +#endif + } + +private: + // Compute the magic numbers. See Hacker's Delight section 10 for an in + // depth explanation. + EIGEN_DEVICE_FUNC void calcMagic(int d) { + const unsigned two31 = 0x80000000; // 2**31. + unsigned ad = d; + unsigned t = two31 + (ad >> 31); + unsigned anc = t - 1 - t%ad; // Absolute value of nc. + int p = 31; // Init. p. + unsigned q1 = two31/anc; // Init. q1 = 2**p/|nc|. + unsigned r1 = two31 - q1*anc; // Init. r1 = rem(2**p, |nc|). + unsigned q2 = two31/ad; // Init. q2 = 2**p/|d|. + unsigned r2 = two31 - q2*ad; // Init. r2 = rem(2**p, |d|). + unsigned delta = 0; + do { + p = p + 1; + q1 = 2*q1; // Update q1 = 2**p/|nc|. + r1 = 2*r1; // Update r1 = rem(2**p, |nc|). + if (r1 >= anc) { // (Must be an unsigned + q1 = q1 + 1; // comparison here). + r1 = r1 - anc;} + q2 = 2*q2; // Update q2 = 2**p/|d|. + r2 = 2*r2; // Update r2 = rem(2**p, |d|). 
+ if (r2 >= ad) { // (Must be an unsigned + q2 = q2 + 1; // comparison here). + r2 = r2 - ad;} + delta = ad - r2; + } while (q1 < delta || (q1 == delta && r1 == 0)); + + magic = (unsigned)(q2 + 1); + shift = p - 32; + } + + unsigned int magic; + int shift; +}; + + +template <typename T> +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor<T>& divisor) { + return divisor.divide(numerator); +} + +#endif + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h new file mode 100644 index 0000000000..bd795d54b0 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -0,0 +1,217 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H +#define EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H + +namespace Eigen { + +/** \class TensorLayoutSwap + * \ingroup CXX11_Tensor_Module + * + * \brief Swap the layout from col-major to row-major, or row-major + * to col-major, and invert the order of the dimensions. + * + * Beware: the dimensions are reversed by this operation. If you want to + * preserve the ordering of the dimensions, you need to combine this + * operation with a shuffle. + * + * \example: + * Tensor<float, 2, ColMajor> input(2, 4); + * Tensor<float, 2, RowMajor> output = input.swap_layout(); + * eigen_assert(output.dimension(0) == 4); + * eigen_assert(output.dimension(1) == 2); + * + * array<int, 2> shuffle(1, 0); + * output = input.swap_layout().shuffle(shuffle); + * eigen_assert(output.dimension(0) == 2); + * eigen_assert(output.dimension(1) == 4); + * + */ +namespace internal { +template<typename XprType> +struct traits<TensorLayoutSwapOp<XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = traits<XprType>::NumDimensions; + static const int Layout = (static_cast<int>(traits<XprType>::Layout) == static_cast<int>(ColMajor)) ? 
RowMajor : ColMajor; +}; + +template<typename XprType> +struct eval<TensorLayoutSwapOp<XprType>, Eigen::Dense> +{ + typedef const TensorLayoutSwapOp<XprType>& type; +}; + +template<typename XprType> +struct nested<TensorLayoutSwapOp<XprType>, 1, typename eval<TensorLayoutSwapOp<XprType> >::type> +{ + typedef TensorLayoutSwapOp<XprType> type; +}; + +} // end namespace internal + + + +template<typename XprType> +class TensorLayoutSwapOp : public TensorBase<TensorLayoutSwapOp<XprType>, WriteAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType; + typedef typename Eigen::internal::nested<TensorLayoutSwapOp>::type Nested; + typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp(const XprType& expr) + : m_xpr(expr) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const TensorLayoutSwapOp& other) + { + typedef TensorAssignOp<TensorLayoutSwapOp, const TensorLayoutSwapOp> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run( + assign, DefaultDevice()); + return *this; + } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp<TensorLayoutSwapOp, const OtherDerived> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run( + assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; +}; + + +// Eval as rvalue +template<typename ArgType, typename Device> +struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device> +{ + typedef TensorLayoutSwapOp<ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + + enum { + IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, + Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == + static_cast<int>(ColMajor)) + ? 
RowMajor + : ColMajor, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + for(int i = 0; i < NumDims; ++i) { + m_dimensions[i] = m_impl.dimensions()[NumDims-1-i]; + } + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + return m_impl.evalSubExprsIfNeeded(data); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(index); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_impl.template packet<LoadMode>(index); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return m_impl.data(); } + + const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + + protected: + TensorEvaluator<ArgType, Device> m_impl; + Dimensions m_dimensions; +}; + + +// Eval as lvalue +template<typename ArgType, typename Device> + struct TensorEvaluator<TensorLayoutSwapOp<ArgType>, Device> + : public TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device> +{ + typedef TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device> Base; + typedef TensorLayoutSwapOp<ArgType> XprType; + + enum { + IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, + Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == + static_cast<int>(ColMajor)) + ? RowMajor + : ColMajor, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) + { + return this->m_impl.coeffRef(index); + } + template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + this->m_impl.template writePacket<StoreMode>(index, x); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h new file mode 100644 index 0000000000..908bdc38ad --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -0,0 +1,320 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
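+
+// The TensorMap class defined below wraps a caller-owned buffer in the tensor
+// expression interface without taking ownership of or copying the data. Rough
+// usage sketch (assuming the default Unaligned value of the Options_ template
+// argument supplied by the forward declarations):
+//   float data[6] = {0, 1, 2, 3, 4, 5};
+//   Eigen::TensorMap<Eigen::Tensor<float, 2> > m(data, 2, 3);
+//   m(1, 2) = 42.0f;  // writes through to data[5] (column-major layout)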
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_MAP_H +#define EIGEN_CXX11_TENSOR_TENSOR_MAP_H + +namespace Eigen { + +/** \class TensorMap + * \ingroup CXX11_Tensor_Module + * + * \brief A tensor expression mapping an existing array of data. + * + */ + +template<typename PlainObjectType, int Options_> class TensorMap : public TensorBase<TensorMap<PlainObjectType, Options_> > +{ + public: + typedef TensorMap<PlainObjectType, Options_> Self; + typedef typename PlainObjectType::Base Base; + typedef typename Eigen::internal::nested<Self>::type Nested; + typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind; + typedef typename internal::traits<PlainObjectType>::Index Index; + typedef typename internal::traits<PlainObjectType>::Scalar Scalar; + typedef typename internal::packet_traits<Scalar>::type Packet; + typedef typename NumTraits<Scalar>::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + + /* typedef typename internal::conditional< + bool(internal::is_lvalue<PlainObjectType>::value), + Scalar *, + const Scalar *>::type + PointerType;*/ + typedef Scalar* PointerType; + typedef PointerType PointerArgType; + + static const int Options = Options_; + + static const Index NumIndices = PlainObjectType::NumIndices; + typedef typename PlainObjectType::Dimensions Dimensions; + + enum { + IsAligned = ((int(Options_) & Aligned) == Aligned), + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + BlockAccess = false, + Layout = PlainObjectType::Layout, + CoordAccess = true, + }; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr) : m_data(dataPtr), m_dimensions() { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT((0 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. 
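+ // Without variadic template support only ranks up to 5 get a dedicated
+ // constructor; each one statically checks that its argument count matches
+ // NumIndices (or that NumIndices is Dynamic).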
+ EIGEN_STATIC_ASSERT((1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) { + EIGEN_STATIC_ASSERT(2 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) { + EIGEN_STATIC_ASSERT(3 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) { + EIGEN_STATIC_ASSERT(4 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4, dim5) { + EIGEN_STATIC_ASSERT(5 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const array<Index, NumIndices>& dimensions) + : m_data(dataPtr), m_dimensions(dimensions) + { } + + template <typename Dimensions> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const Dimensions& dimensions) + : m_data(dataPtr), m_dimensions(dimensions) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PlainObjectType& tensor) + : m_data(tensor.data()), m_dimensions(tensor.dimensions()) + { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index rank() const { return m_dimensions.rank(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_dimensions[n]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar* data() { return m_data; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar* data() const { return m_data; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const + { + // eigen_assert(checkIndexRange(indices)); + if (PlainObjectType::Options&RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(indices); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(indices); + return m_data[index]; + } + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()() const + { + EIGEN_STATIC_ASSERT(NumIndices == 0 || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + eigen_assert(rank() == 0); + return m_data[0]; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes... 
otherIndices) const + { + static_assert(sizeof...(otherIndices) + 1 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + if (PlainObjectType::Options&RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumIndices>{{firstIndex, otherIndices...}}); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(array<Index, NumIndices>{{firstIndex, otherIndices...}}); + return m_data[index]; + } + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return m_data[index]; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i1 + i0 * m_dimensions[0]; + return m_data[index]; + } else { + const Index index = i0 + i1 * m_dimensions[0]; + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i2 + m_dimensions[1] * (i1 + m_dimensions[0] * i0); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2); + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3)); + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0))); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4))); + return m_data[index]; + } + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices) + { + // eigen_assert(checkIndexRange(indices)); + if (PlainObjectType::Options&RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(indices); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(indices); + return m_data[index]; + } + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()() + { + static_assert(NumIndices == 0 || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + eigen_internal_assert(rank() == 0); + return m_data[0]; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... 
otherIndices) + { + static_assert(sizeof...(otherIndices) + 1 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + const std::size_t NumDims = sizeof...(otherIndices) + 1; + if (PlainObjectType::Options&RowMajor) { + const array<Index, NumDims> dims = {firstIndex, otherIndices...}; + const Index index = m_dimensions.IndexOfRowMajor(dims); + return m_data[index]; + } else { + const array<Index, NumDims> dims = {firstIndex, otherIndices...}; + const Index index = m_dimensions.IndexOfColMajor(dims); + return m_data[index]; + } + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index index) + { + eigen_internal_assert(index >= 0 && index < size()); + return m_data[index]; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1) + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i1 + i0 * m_dimensions[0]; + return m_data[index]; + } else { + const Index index = i0 + i1 * m_dimensions[0]; + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2) + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i2 + m_dimensions[1] * (i1 + m_dimensions[0] * i0); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2); + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3)); + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0))); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4))); + return m_data[index]; + } + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Self& operator=(const Self& other) + { + typedef TensorAssignOp<Self, const Self> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); + return *this; + } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Self& operator=(const OtherDerived& other) + { + typedef TensorAssignOp<Self, const OtherDerived> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); + return *this; + } + + private: + Scalar* m_data; + Dimensions m_dimensions; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_MAP_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h new file mode 100644 index 0000000000..4dd9af6f92 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -0,0 +1,103 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_H +#define EIGEN_CXX11_TENSOR_TENSOR_META_H + +namespace Eigen { + +template<bool cond> struct Cond {}; + +template<typename T1, typename T2> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +const T1& choose(Cond<true>, const T1& first, const T2&) { + return first; +} + +template<typename T1, typename T2> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +const T2& choose(Cond<false>, const T1&, const T2& second) { + return second; +} + + +// Default packet types +template <typename Scalar, typename Device> +struct PacketType { + typedef typename internal::packet_traits<Scalar>::type type; + static const int size = internal::unpacket_traits<type>::size; +}; + +// For CUDA packet types when using a GpuDevice +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +template <> +struct PacketType<float, GpuDevice> { + typedef float4 type; + static const int size = 4; +}; +template <> +struct PacketType<double, GpuDevice> { + typedef double2 type; + static const int size = 2; +}; +#endif + + +#if defined(EIGEN_HAS_CONSTEXPR) +#define EIGEN_CONSTEXPR constexpr +#else +#define EIGEN_CONSTEXPR +#endif + +// Tuple mimics std::pair but works on e.g. nvcc. +template <typename U, typename V> struct Tuple { + public: + U first; + V second; + + typedef U first_type; + typedef V second_type; + + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Tuple() : first(), second() {} + + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Tuple(const U& f, const V& s) : first(f), second(s) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Tuple& operator= (const Tuple& rhs) { + if (&rhs == this) return *this; + first = rhs.first; + second = rhs.second; + return *this; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void swap(Tuple& rhs) { + using numext::swap; + swap(first, rhs.first); + swap(second, rhs.second); + } +}; + +template <typename U, typename V> +EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +bool operator==(const Tuple<U, V>& x, const Tuple<U, V>& y) { + return (x.first == y.first && x.second == y.second); +} + +template <typename U, typename V> +EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +bool operator!=(const Tuple<U, V>& x, const Tuple<U, V>& y) { + return !(x == y); +} + +#undef EIGEN_CONSTEXPR + +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_META_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h new file mode 100644 index 0000000000..e67f3da31a --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -0,0 +1,817 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H +#define EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H + +namespace Eigen { + +/** \class TensorReshaping + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. 
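+ *
+ * The result reinterprets the coefficients of the input expression with a new
+ * set of dimensions; the total number of coefficients must be preserved.
+ * Rough usage sketch:
+ * \code
+ * Eigen::Tensor<float, 2> input(4, 3);
+ * input.setRandom();
+ * Eigen::DSizes<Eigen::DenseIndex, 3> new_dims(2, 2, 3);
+ * Eigen::Tensor<float, 3> output = input.reshape(new_dims);
+ * \endcode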
+ * + * + */ +namespace internal { +template<typename NewDimensions, typename XprType> +struct traits<TensorReshapingOp<NewDimensions, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = array_size<NewDimensions>::value; + static const int Layout = XprTraits::Layout; +}; + +template<typename NewDimensions, typename XprType> +struct eval<TensorReshapingOp<NewDimensions, XprType>, Eigen::Dense> +{ + typedef const TensorReshapingOp<NewDimensions, XprType>& type; +}; + +template<typename NewDimensions, typename XprType> +struct nested<TensorReshapingOp<NewDimensions, XprType>, 1, typename eval<TensorReshapingOp<NewDimensions, XprType> >::type> +{ + typedef TensorReshapingOp<NewDimensions, XprType> type; +}; + +} // end namespace internal + + + +template<typename NewDimensions, typename XprType> +class TensorReshapingOp : public TensorBase<TensorReshapingOp<NewDimensions, XprType>, WriteAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorReshapingOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorReshapingOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType; + typedef typename Eigen::internal::nested<TensorReshapingOp>::type Nested; + typedef typename Eigen::internal::traits<TensorReshapingOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorReshapingOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReshapingOp(const XprType& expr, const NewDimensions& dims) + : m_xpr(expr), m_dims(dims) {} + + EIGEN_DEVICE_FUNC + const NewDimensions& dimensions() const { return m_dims; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const TensorReshapingOp& other) + { + typedef TensorAssignOp<TensorReshapingOp, const TensorReshapingOp> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run( + assign, DefaultDevice()); + return *this; + } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp<TensorReshapingOp, const OtherDerived> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run( + assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + const NewDimensions m_dims; +}; + + +// Eval as rvalue +template<typename NewDimensions, typename ArgType, typename Device> +struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> +{ + typedef TensorReshapingOp<NewDimensions, ArgType> XprType; + typedef NewDimensions Dimensions; + + enum { + IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + // TODO(andydavis) Re-enable BlockAccess when the performance issue + // with 
block-based reshape is resolved. + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_dimensions(op.dimensions()) + { + // The total size of the reshaped tensor must be equal to the total size + // of the input tensor. + eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions())); + + if (BlockAccess) { + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = + m_impl.dimensions(); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_outputStrides[0] = 1; + for (int i = 1; i < NumOutputDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + } + m_inputStrides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1]; + } + } else { +#ifdef __CUDACC__ + // TODO(andydavis) Remove the following line of code when associated + // nvcc bug b/22973013 is fixed. + for (int i = 0; i < 1; ++i) {} +#endif + m_outputStrides[NumOutputDims - 1] = 1; + for (int i = NumOutputDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + } + m_inputStrides[NumInputDims - 1] = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; + } + } + } + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + static const std::size_t NumOutputDims = + internal::array_size<Dimensions>::value; + static const std::size_t NumInputDims = internal::array_size< + typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + typedef typename internal::TensorBlock< + Index, typename internal::remove_const<Scalar>::type, NumOutputDims, Layout> + OutputTensorBlock; + typedef typename internal::TensorBlock< + Index, typename internal::remove_const<Scalar>::type, NumInputDims, Layout> + InputTensorBlock; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + return m_impl.evalSubExprsIfNeeded(data); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(index); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_impl.template packet<LoadMode>(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector<internal::TensorOpResourceRequirements>* resources) const { + m_impl.getResourceRequirements(resources); + } + + // TODO(andydavis) Reduce the overhead of this function. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + OutputTensorBlock* output_block) const { + // Calculate output block unit-stride inner dimension length. + const DSizes<Index, NumOutputDims>& output_block_sizes = + output_block->block_sizes(); + Index output_inner_dim_size = 1; + Index output_outer_dim_start = NumOutputDims; + for (Index i = 0; i < NumOutputDims; ++i) { + const Index dim = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 
i : NumOutputDims - i - 1; + output_inner_dim_size *= output_block_sizes[dim]; + if (output_block_sizes[dim] < m_dimensions[dim]) { + output_outer_dim_start = i + 1; + break; + } + } + + // Initialize output block iterator state. + struct BlockIteratorState { + Index stride; + Index span; + Index size; + Index count; + }; + array<BlockIteratorState, NumOutputDims> block_iter_state; + + for (Index i = 0; i < NumOutputDims; ++i) { + const Index dim = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? i : NumOutputDims - i - 1; + block_iter_state[i].size = output_block_sizes[dim]; + block_iter_state[i].stride = m_outputStrides[dim]; + block_iter_state[i].span = + block_iter_state[i].stride * (block_iter_state[i].size - 1); + block_iter_state[i].count = 0; + } + + const Index output_outer_dim_size = output_block_sizes.TotalSize() / + output_inner_dim_size; + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = + m_impl.dimensions(); + + Index index = output_block->first_coeff_index(); + for (Index outer_idx = 0; outer_idx < output_outer_dim_size; ++outer_idx) { + Index inner_idx = 0; + while (inner_idx < output_inner_dim_size) { + // Calculate input coords based on 'index'. + array<Index, NumInputDims> input_coords; + Index idx = index; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumInputDims - 1; i > 0; --i) { + input_coords[i] = idx / m_inputStrides[i]; + idx -= input_coords[i] * m_inputStrides[i]; + } + input_coords[0] = idx; + } else { + for (int i = 0; i < NumInputDims - 1; ++i) { + input_coords[i] = idx / m_inputStrides[i]; + idx -= input_coords[i] * m_inputStrides[i]; + } + input_coords[NumInputDims - 1] = idx; + } + + // Calculate target input block shape, using at most + // 'output_inner_dim_size' coefficients along the input block's inner + // dimensions. + DSizes<Index, NumInputDims> input_block_sizes; + Index num_to_allocate = output_inner_dim_size - inner_idx; + for (Index i = 0; i < NumInputDims; ++i) { + const Index dim = + static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? i : NumInputDims - i - 1; + input_block_sizes[dim] = numext::mini( + num_to_allocate, (static_cast<Index>(input_dims[dim]) - + input_coords[dim])); + if (input_coords[dim] == 0) { + num_to_allocate /= input_block_sizes[dim]; + } else { + num_to_allocate = 1; + } + } + + // Calculate input block strides. + DSizes<Index, NumInputDims> input_block_strides; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + input_block_strides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + input_block_strides[i] = input_block_strides[i - 1] * + input_block_sizes[i - 1]; + } + } else { + input_block_strides[NumInputDims - 1] = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + input_block_strides[i] = input_block_strides[i + 1] * + input_block_sizes[i + 1]; + } + } + + // Instantiate and read input block from input tensor. + InputTensorBlock input_block(index, input_block_sizes, + input_block_strides, m_inputStrides, + output_block->data() + outer_idx * + output_inner_dim_size + inner_idx); + + m_impl.block(&input_block); + + const Index input_block_total_size = input_block_sizes.TotalSize(); + index += input_block_total_size; + inner_idx += input_block_total_size; + } + eigen_assert(inner_idx == output_inner_dim_size); + index -= output_inner_dim_size; + // Update index. 
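+ // Advance the block iterator state like an odometer: bump the first outer
+ // dimension that still has room, and rewind every faster-varying dimension
+ // that has wrapped around.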
+ for (Index i = output_outer_dim_start; i < NumOutputDims; ++i) { + if (++block_iter_state[i].count < block_iter_state[i].size) { + index += block_iter_state[i].stride; + break; + } + block_iter_state[i].count = 0; + index -= block_iter_state[i].span; + } + } + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast<Scalar*>(m_impl.data()); } + + EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + + protected: + TensorEvaluator<ArgType, Device> m_impl; + NewDimensions m_dimensions; + DSizes<Index, NumOutputDims> m_outputStrides; + DSizes<Index, NumInputDims> m_inputStrides; +}; + + +// Eval as lvalue +template<typename NewDimensions, typename ArgType, typename Device> + struct TensorEvaluator<TensorReshapingOp<NewDimensions, ArgType>, Device> + : public TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> + +{ + typedef TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> Base; + typedef TensorReshapingOp<NewDimensions, ArgType> XprType; + typedef NewDimensions Dimensions; + + enum { + IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(index); + } + template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + this->m_impl.template writePacket<StoreMode>(index, x); + } +}; + + +/** \class TensorSlicing + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor slicing class. 
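+ *
+ * The result is a view of a rectangular region of the input expression,
+ * described by one start index and one extent per dimension. Rough usage
+ * sketch:
+ * \code
+ * Eigen::Tensor<float, 2> input(4, 5);
+ * input.setRandom();
+ * Eigen::DSizes<Eigen::DenseIndex, 2> offsets(1, 2);
+ * Eigen::DSizes<Eigen::DenseIndex, 2> extents(2, 3);
+ * Eigen::Tensor<float, 2> output = input.slice(offsets, extents);
+ * \endcode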
+ * + * + */ +namespace internal { +template<typename StartIndices, typename Sizes, typename XprType> +struct traits<TensorSlicingOp<StartIndices, Sizes, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = array_size<StartIndices>::value; + static const int Layout = XprTraits::Layout; +}; + +template<typename StartIndices, typename Sizes, typename XprType> +struct eval<TensorSlicingOp<StartIndices, Sizes, XprType>, Eigen::Dense> +{ + typedef const TensorSlicingOp<StartIndices, Sizes, XprType>& type; +}; + +template<typename StartIndices, typename Sizes, typename XprType> +struct nested<TensorSlicingOp<StartIndices, Sizes, XprType>, 1, typename eval<TensorSlicingOp<StartIndices, Sizes, XprType> >::type> +{ + typedef TensorSlicingOp<StartIndices, Sizes, XprType> type; +}; + +} // end namespace internal + + + +template<typename StartIndices, typename Sizes, typename XprType> +class TensorSlicingOp : public TensorBase<TensorSlicingOp<StartIndices, Sizes, XprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorSlicingOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorSlicingOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorSlicingOp>::type Nested; + typedef typename Eigen::internal::traits<TensorSlicingOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorSlicingOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSlicingOp(const XprType& expr, const StartIndices& indices, const Sizes& sizes) + : m_xpr(expr), m_indices(indices), m_sizes(sizes) {} + + EIGEN_DEVICE_FUNC + const StartIndices& startIndices() const { return m_indices; } + EIGEN_DEVICE_FUNC + const Sizes& sizes() const { return m_sizes; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const TensorSlicingOp& other) + { + typedef TensorAssignOp<TensorSlicingOp, const TensorSlicingOp> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run( + assign, DefaultDevice()); + return *this; + } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp<TensorSlicingOp, const OtherDerived> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run( + assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + const StartIndices m_indices; + const Sizes m_sizes; +}; + + +// Eval as rvalue +template<typename StartIndices, typename Sizes, typename ArgType, typename Device> +struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> +{ + typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType; + static const int NumDims = internal::array_size<Sizes>::value; + + enum { + // Alignment can't be guaranteed at compile time 
since it depends on the + // slice offsets and sizes. + IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) + { + for (int i = 0; i < internal::array_size<Dimensions>::value; ++i) { + eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]); + } + + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + const Sizes& output_dims = op.sizes(); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_inputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + } + + // Don't initialize m_fastOutputStrides[0] since it won't ever be accessed. + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]); + } + } else { + m_inputStrides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + } + + m_outputStrides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]); + } + } + + m_block_total_size_max = numext::maxi(static_cast<std::size_t>(1), + device.lastLevelCacheSize() / + sizeof(Scalar)); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const<Scalar>::type ScalarNonConst; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef Sizes Dimensions; + typedef internal::TensorBlock<Index, ScalarNonConst, NumDims, Layout> + TensorBlock; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + m_impl.evalSubExprsIfNeeded(NULL); + if (internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value && data && m_impl.data()) { + Index contiguous_values = 1; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = 0; i < NumDims; ++i) { + contiguous_values *= dimensions()[i]; + if (dimensions()[i] != m_impl.dimensions()[i]) { + break; + } + } + } else { + for (int i = NumDims-1; i >= 0; --i) { + contiguous_values *= dimensions()[i]; + if (dimensions()[i] != m_impl.dimensions()[i]) { + break; + } + } + } + // Use memcpy if it's going to be faster than using the regular evaluation. 
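+ // contiguous_values is the length of the run of coefficients that is laid out
+ // contiguously in both the slice and its input. When that run is long enough,
+ // block-copying each run with memcpy and returning false (meaning the result
+ // has already been written into `data`) beats evaluating the slice
+ // coefficient by coefficient.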
+ if (contiguous_values > m_device.memcpyThreshold()) { + Scalar* src = (Scalar*)m_impl.data(); + for (int i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) { + Index offset = srcCoeff(i); + m_device.memcpy((void*)(data+i), src+offset, contiguous_values * sizeof(Scalar)); + } + return false; + } + } + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(srcCoeff(index)); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < internal::array_prod(dimensions())); + + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_fastOutputStrides[i]; + const Index idx1 = indices[1] / m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; + inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + m_offsets[0]); + inputIndices[1] += (indices[1] + m_offsets[0]); + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / m_fastOutputStrides[i]; + const Index idx1 = indices[1] / m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; + inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + m_offsets[NumDims-1]); + inputIndices[1] += (indices[1] + m_offsets[NumDims-1]); + } + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]); + return rslt; + } + else { + typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + values[0] = m_impl.coeff(inputIndices[0]); + values[packetSize-1] = m_impl.coeff(inputIndices[1]); + for (int i = 1; i < packetSize-1; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) + { + array<Index, NumDims> inputCoords; + for (int i = 0; i < NumDims; ++i) { + inputCoords = coords[i] + this->m_offsets[i]; + } + return m_impl.coeff(inputCoords); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector<internal::TensorOpResourceRequirements>* resources) const { + resources->push_back(internal::TensorOpResourceRequirements( + internal::kSkewedInnerDims, m_block_total_size_max)); + m_impl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + TensorBlock* output_block) const { + TensorBlock input_block(srcCoeff(output_block->first_coeff_index()), + output_block->block_sizes(), + output_block->block_strides(), + m_inputStrides, + output_block->data()); + m_impl.block(&input_block); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { + Scalar* result = m_impl.data(); + if (result) { + Index 
offset = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = 0; i < NumDims; ++i) { + if (m_dimensions[i] != m_impl.dimensions()[i]) { + offset += m_offsets[i] * m_inputStrides[i]; + for (int j = i+1; j < NumDims; ++j) { + if (m_dimensions[j] > 1) { + return NULL; + } + offset += m_offsets[j] * m_inputStrides[j]; + } + break; + } + } + } else { + for (int i = NumDims - 1; i >= 0; --i) { + if (m_dimensions[i] != m_impl.dimensions()[i]) { + offset += m_offsets[i] * m_inputStrides[i]; + for (int j = i-1; j >= 0; --j) { + if (m_dimensions[j] > 1) { + return NULL; + } + offset += m_offsets[j] * m_inputStrides[j]; + } + break; + } + } + } + return result + offset; + } + return NULL; + } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const + { + Index inputIndex = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += (index + m_offsets[0]); + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += (index + m_offsets[NumDims-1]); + } + return inputIndex; + } + + array<Index, NumDims> m_outputStrides; + array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides; + array<Index, NumDims> m_inputStrides; + TensorEvaluator<ArgType, Device> m_impl; + const Device& m_device; + Dimensions m_dimensions; + const StartIndices m_offsets; + std::size_t m_block_total_size_max; +}; + + +// Eval as lvalue +template<typename StartIndices, typename Sizes, typename ArgType, typename Device> +struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> + : public TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> +{ + typedef TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> Base; + typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType; + static const int NumDims = internal::array_size<Sizes>::value; + + enum { + IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const<Scalar>::type ScalarNonConst; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef Sizes Dimensions; + typedef internal::TensorBlock<Index, ScalarNonConst, NumDims, Layout> + TensorBlock; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + if 
(static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; + const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i]; + inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + this->m_offsets[0]); + inputIndices[1] += (indices[1] + this->m_offsets[0]); + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; + const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i]; + inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + this->m_offsets[NumDims-1]); + inputIndices[1] += (indices[1] + this->m_offsets[NumDims-1]); + } + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + this->m_impl.template writePacket<StoreMode>(inputIndices[0], x); + } + else { + EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; + internal::pstore<CoeffReturnType, PacketReturnType>(values, x); + this->m_impl.coeffRef(inputIndices[0]) = values[0]; + this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1]; + for (int i = 1; i < packetSize-1; ++i) { + this->coeffRef(index+i) = values[i]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(const array<Index, NumDims>& coords) + { + array<Index, NumDims> inputCoords; + for (int i = 0; i < NumDims; ++i) { + inputCoords = coords[i] + this->m_offsets[i]; + } + return this->m_impl.coeffRef(inputCoords); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlock& block) { + this->m_impl.writeBlock( + TensorBlock(this->srcCoeff(block.first_coeff_index()), + block.block_sizes(), + block.block_strides(), + this->m_inputStrides, + const_cast<ScalarNonConst*>(block.data()))); + + } +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h new file mode 100644 index 0000000000..d1dff3f38b --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -0,0 +1,388 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_PADDING_H +#define EIGEN_CXX11_TENSOR_TENSOR_PADDING_H + +namespace Eigen { + +/** \class TensorPadding + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor padding class. + * At the moment only padding with a constant value is supported. 
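+ *
+ * A minimal usage sketch (assuming the pad() helper declared on TensorBase
+ * elsewhere in this module, with a padding value of zero):
+ * \code
+ * Eigen::Tensor<float, 2> input(3, 4);
+ * input.setRandom();
+ * Eigen::array<std::pair<int, int>, 2> paddings;
+ * paddings[0] = std::make_pair(1, 2);  // 1 value before, 2 after dimension 0
+ * paddings[1] = std::make_pair(0, 1);  // 1 value after dimension 1
+ * Eigen::Tensor<float, 2> padded = input.pad(paddings);
+ * // padded is 6 x 5; coefficients outside the original 3 x 4 block are 0.
+ * \endcode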
+ * + */ +namespace internal { +template<typename PaddingDimensions, typename XprType> +struct traits<TensorPaddingOp<PaddingDimensions, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename PaddingDimensions, typename XprType> +struct eval<TensorPaddingOp<PaddingDimensions, XprType>, Eigen::Dense> +{ + typedef const TensorPaddingOp<PaddingDimensions, XprType>& type; +}; + +template<typename PaddingDimensions, typename XprType> +struct nested<TensorPaddingOp<PaddingDimensions, XprType>, 1, typename eval<TensorPaddingOp<PaddingDimensions, XprType> >::type> +{ + typedef TensorPaddingOp<PaddingDimensions, XprType> type; +}; + +} // end namespace internal + + + +template<typename PaddingDimensions, typename XprType> +class TensorPaddingOp : public TensorBase<TensorPaddingOp<PaddingDimensions, XprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorPaddingOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorPaddingOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorPaddingOp>::type Nested; + typedef typename Eigen::internal::traits<TensorPaddingOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorPaddingOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPaddingOp(const XprType& expr, const PaddingDimensions& padding_dims, + const Scalar padding_value) + : m_xpr(expr), m_padding_dims(padding_dims), m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC + const PaddingDimensions& padding() const { return m_padding_dims; } + EIGEN_DEVICE_FUNC + Scalar padding_value() const { return m_padding_value; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const PaddingDimensions m_padding_dims; + const Scalar m_padding_value; +}; + + +// Eval as rvalue +template<typename PaddingDimensions, typename ArgType, typename Device> +struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device> +{ + typedef TensorPaddingOp<PaddingDimensions, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<PaddingDimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = true, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()) + { + // Compute dimensions + m_dimensions = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] += m_padding[i].first + m_padding[i].second; + } + const typename TensorEvaluator<ArgType, 
Device>::Dimensions& input_dims = m_impl.dimensions(); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_outputStrides[0] = 1; + if (NumDims > 0) { + m_inputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + m_outputStrides[NumDims] = m_outputStrides[NumDims-1] * m_dimensions[NumDims-1]; + } + } else { + m_outputStrides[NumDims] = 1; + if (NumDims > 0) { + m_inputStrides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + m_outputStrides[i+1] = m_outputStrides[i+2] * m_dimensions[i+1]; + } + m_outputStrides[0] = m_outputStrides[1] * m_dimensions[0]; + } + } + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + eigen_assert(index < dimensions().TotalSize()); + Index inputIndex = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { + return m_paddingValue; + } + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (NumDims > 0) { + if (index < m_padding[0].first || index >= m_dimensions[0] - m_padding[0].second) { + return m_paddingValue; + } + inputIndex += (index - m_padding[0].first); + } + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i+1]; + if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { + return m_paddingValue; + } + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i+1]; + } + if (NumDims > 0) { + if (index < m_padding[NumDims-1].first || + index >= m_dimensions[NumDims-1] - m_padding[NumDims-1].second) { + return m_paddingValue; + } + inputIndex += (index - m_padding[NumDims-1].first); + } + } + return m_impl.coeff(inputIndex); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + return packetColMajor(index); + } + return packetRowMajor(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const + { + Index inputIndex = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + if (NumDims > 0) { + const Index idx = coords[0]; + if (idx < m_padding[0].first || idx >= m_dimensions[0] - m_padding[0].second) { + return m_paddingValue; + } + inputIndex = idx - m_padding[0].first; + } + for (int i = 1; i < NumDims; ++i) { + const Index idx = coords[i]; + if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { + return m_paddingValue; + } + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + } + } else { + if (NumDims > 0) { + const Index idx = 
coords[NumDims-1]; + if (idx < m_padding[NumDims-1].first || idx >= m_dimensions[NumDims-1] - m_padding[NumDims-1].second) { + return m_paddingValue; + } + inputIndex = idx - m_padding[NumDims-1].first; + } + for (int i = NumDims - 2; i >= 0; --i) { + const Index idx = coords[i]; + if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { + return m_paddingValue; + } + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + } + } + return m_impl.coeff(inputIndex); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + const Index initialIndex = index; + Index inputIndex = 0; + for (int i = NumDims - 1; i > 0; --i) { + const Index first = index; + const Index last = index + packetSize - 1; + const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i]; + const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i]; + const Index lastPaddedRight = m_outputStrides[i+1]; + + if (last < lastPaddedLeft) { + // all the coefficient are in the padding zone. + return internal::pset1<PacketReturnType>(m_paddingValue); + } + else if (first >= firstPaddedRight && last < lastPaddedRight) { + // all the coefficient are in the padding zone. + return internal::pset1<PacketReturnType>(m_paddingValue); + } + else if (first >= lastPaddedLeft && last < firstPaddedRight) { + // all the coefficient are between the 2 padding zones. + const Index idx = index / m_outputStrides[i]; + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + else { + // Every other case + return packetWithPossibleZero(initialIndex); + } + } + + const Index last = index + packetSize - 1; + const Index first = index; + + if (NumDims > 0) { + const Index lastPaddedLeft = m_padding[0].first; + const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second); + const Index lastPaddedRight = m_outputStrides[1]; + + if (last < lastPaddedLeft) { + // all the coefficient are in the padding zone. + return internal::pset1<PacketReturnType>(m_paddingValue); + } + else if (first >= firstPaddedRight && last < lastPaddedRight) { + // all the coefficient are in the padding zone. + return internal::pset1<PacketReturnType>(m_paddingValue); + } + else if (first >= lastPaddedLeft && last < firstPaddedRight) { + // all the coefficient are between the 2 padding zones. 
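+      // Example (illustration): padding a 1-D tensor of size 10 with
+      // (2 before, 3 after) yields 15 outputs; output indices 2..11 map back
+      // to inputs 0..9, so a packet whose last lane stays below index 12 is
+      // loaded directly from input index (index - 2).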
+ inputIndex += (index - m_padding[0].first); + return m_impl.template packet<Unaligned>(inputIndex); + } + } + + // Every other case + return packetWithPossibleZero(initialIndex); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + const Index initialIndex = index; + Index inputIndex = 0; + + for (int i = 0; i < NumDims - 1; ++i) { + const Index first = index; + const Index last = index + packetSize - 1; + const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1]; + const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1]; + const Index lastPaddedRight = m_outputStrides[i]; + + if (last < lastPaddedLeft) { + // all the coefficient are in the padding zone. + return internal::pset1<PacketReturnType>(m_paddingValue); + } + else if (first >= firstPaddedRight && last < lastPaddedRight) { + // all the coefficient are in the padding zone. + return internal::pset1<PacketReturnType>(m_paddingValue); + } + else if (first >= lastPaddedLeft && last < firstPaddedRight) { + // all the coefficient are between the 2 padding zones. + const Index idx = index / m_outputStrides[i+1]; + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i+1]; + } + else { + // Every other case + return packetWithPossibleZero(initialIndex); + } + } + + const Index last = index + packetSize - 1; + const Index first = index; + + if (NumDims > 0) { + const Index lastPaddedLeft = m_padding[NumDims-1].first; + const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second); + const Index lastPaddedRight = m_outputStrides[NumDims-1]; + + if (last < lastPaddedLeft) { + // all the coefficient are in the padding zone. + return internal::pset1<PacketReturnType>(m_paddingValue); + } + else if (first >= firstPaddedRight && last < lastPaddedRight) { + // all the coefficient are in the padding zone. + return internal::pset1<PacketReturnType>(m_paddingValue); + } + else if (first >= lastPaddedLeft && last < firstPaddedRight) { + // all the coefficient are between the 2 padding zones. 
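+      // Example (illustration): this mirrors the column-major case above; the
+      // unpadded band now lives in the last dimension, and a packet entirely
+      // inside it is read from input index (index - m_padding[NumDims-1].first).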
+ inputIndex += (index - m_padding[NumDims-1].first); + return m_impl.template packet<Unaligned>(inputIndex); + } + } + + // Every other case + return packetWithPossibleZero(initialIndex); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + + Dimensions m_dimensions; + array<Index, NumDims+1> m_outputStrides; + array<Index, NumDims> m_inputStrides; + TensorEvaluator<ArgType, Device> m_impl; + PaddingDimensions m_padding; + + Scalar m_paddingValue; +}; + + + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_PADDING_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h new file mode 100644 index 0000000000..c89022ab8e --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -0,0 +1,314 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_PATCH_H +#define EIGEN_CXX11_TENSOR_TENSOR_PATCH_H + +namespace Eigen { + +/** \class TensorPatch + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor patch class. 
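+ *
+ * A rough usage sketch (assuming the extract_patches() helper declared on
+ * TensorBase elsewhere in this module; for a column-major input the patch
+ * dimensions come first and the patch index is the last output dimension):
+ * \code
+ * Eigen::Tensor<float, 2> input(3, 3);
+ * input.setRandom();
+ * Eigen::array<ptrdiff_t, 2> patch_dims;
+ * patch_dims[0] = 2;
+ * patch_dims[1] = 2;
+ * // Every 2x2 patch of the 3x3 input: (3-2+1) * (3-2+1) = 4 patches.
+ * Eigen::Tensor<float, 3> patches = input.extract_patches(patch_dims);
+ * // patches is 2 x 2 x 4 for the default column-major layout.
+ * \endcode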
+ * + * + */ +namespace internal { +template<typename PatchDim, typename XprType> +struct traits<TensorPatchOp<PatchDim, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions + 1; + static const int Layout = XprTraits::Layout; +}; + +template<typename PatchDim, typename XprType> +struct eval<TensorPatchOp<PatchDim, XprType>, Eigen::Dense> +{ + typedef const TensorPatchOp<PatchDim, XprType>& type; +}; + +template<typename PatchDim, typename XprType> +struct nested<TensorPatchOp<PatchDim, XprType>, 1, typename eval<TensorPatchOp<PatchDim, XprType> >::type> +{ + typedef TensorPatchOp<PatchDim, XprType> type; +}; + +} // end namespace internal + + + +template<typename PatchDim, typename XprType> +class TensorPatchOp : public TensorBase<TensorPatchOp<PatchDim, XprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorPatchOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorPatchOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorPatchOp>::type Nested; + typedef typename Eigen::internal::traits<TensorPatchOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorPatchOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPatchOp(const XprType& expr, const PatchDim& patch_dims) + : m_xpr(expr), m_patch_dims(patch_dims) {} + + EIGEN_DEVICE_FUNC + const PatchDim& patch_dims() const { return m_patch_dims; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const PatchDim m_patch_dims; +}; + + +// Eval as rvalue +template<typename PatchDim, typename ArgType, typename Device> +struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> +{ + typedef TensorPatchOp<PatchDim, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value + 1; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = true, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + Index num_patches = 1; + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + const PatchDim& patch_dims = op.patch_dims(); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = 0; i < NumDims-1; ++i) { + m_dimensions[i] = patch_dims[i]; + num_patches *= (input_dims[i] - patch_dims[i] + 1); + } + m_dimensions[NumDims-1] = num_patches; + + m_inputStrides[0] = 1; + m_patchStrides[0] = 1; + for (int i = 1; i < NumDims-1; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * 
input_dims[i-1]; + m_patchStrides[i] = m_patchStrides[i-1] * (input_dims[i-1] - patch_dims[i-1] + 1); + } + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + } else { + for (int i = 0; i < NumDims-1; ++i) { + m_dimensions[i+1] = patch_dims[i]; + num_patches *= (input_dims[i] - patch_dims[i] + 1); + } + m_dimensions[0] = num_patches; + + m_inputStrides[NumDims-2] = 1; + m_patchStrides[NumDims-2] = 1; + for (int i = NumDims-3; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + m_patchStrides[i] = m_patchStrides[i+1] * (input_dims[i+1] - patch_dims[i+1] + 1); + } + m_outputStrides[NumDims-1] = 1; + for (int i = NumDims-2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; + } + } + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + Index output_stride_index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? NumDims - 1 : 0; + // Find the location of the first element of the patch. + Index patchIndex = index / m_outputStrides[output_stride_index]; + // Find the offset of the element wrt the location of the first element. + Index patchOffset = index - patchIndex * m_outputStrides[output_stride_index]; + Index inputIndex = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 2; i > 0; --i) { + const Index patchIdx = patchIndex / m_patchStrides[i]; + patchIndex -= patchIdx * m_patchStrides[i]; + const Index offsetIdx = patchOffset / m_outputStrides[i]; + patchOffset -= offsetIdx * m_outputStrides[i]; + inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; + } + } else { + for (int i = 0; i < NumDims - 2; ++i) { + const Index patchIdx = patchIndex / m_patchStrides[i]; + patchIndex -= patchIdx * m_patchStrides[i]; + const Index offsetIdx = patchOffset / m_outputStrides[i+1]; + patchOffset -= offsetIdx * m_outputStrides[i+1]; + inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; + } + } + inputIndex += (patchIndex + patchOffset); + return m_impl.coeff(inputIndex); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + Index output_stride_index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 
NumDims - 1 : 0; + Index indices[2] = {index, index + packetSize - 1}; + Index patchIndices[2] = {indices[0] / m_outputStrides[output_stride_index], + indices[1] / m_outputStrides[output_stride_index]}; + Index patchOffsets[2] = {indices[0] - patchIndices[0] * m_outputStrides[output_stride_index], + indices[1] - patchIndices[1] * m_outputStrides[output_stride_index]}; + + Index inputIndices[2] = {0, 0}; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 2; i > 0; --i) { + const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], + patchIndices[1] / m_patchStrides[i]}; + patchIndices[0] -= patchIdx[0] * m_patchStrides[i]; + patchIndices[1] -= patchIdx[1] * m_patchStrides[i]; + + const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i], + patchOffsets[1] / m_outputStrides[i]}; + patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i]; + patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i]; + + inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i]; + inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i]; + } + } else { + for (int i = 0; i < NumDims - 2; ++i) { + const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], + patchIndices[1] / m_patchStrides[i]}; + patchIndices[0] -= patchIdx[0] * m_patchStrides[i]; + patchIndices[1] -= patchIdx[1] * m_patchStrides[i]; + + const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i+1], + patchOffsets[1] / m_outputStrides[i+1]}; + patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i+1]; + patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i+1]; + + inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i]; + inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i]; + } + } + inputIndices[0] += (patchIndices[0] + patchOffsets[0]); + inputIndices[1] += (patchIndices[1] + patchOffsets[1]); + + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]); + return rslt; + } + else { + EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; + values[0] = m_impl.coeff(inputIndices[0]); + values[packetSize-1] = m_impl.coeff(inputIndices[1]); + for (int i = 1; i < packetSize-1; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const + { + Index patch_coord_idx = Layout == ColMajor ? NumDims - 1 : 0; + // Location of the first element of the patch. + const Index patchIndex = coords[patch_coord_idx]; + + if (TensorEvaluator<ArgType, Device>::CoordAccess) { + array<Index, NumDims-1> inputCoords; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 2; i > 0; --i) { + const Index patchIdx = patchIndex / m_patchStrides[i]; + patchIndex -= patchIdx * m_patchStrides[i]; + const Index offsetIdx = coords[i]; + inputCoords[i] = coords[i] + patchIdx; + } + } else { + for (int i = 0; i < NumDims - 2; ++i) { + const Index patchIdx = patchIndex / m_patchStrides[i]; + patchIndex -= patchIdx * m_patchStrides[i]; + const Index offsetIdx = coords[i+1]; + inputCoords[i] = coords[i+1] + patchIdx; + } + } + Index coords_idx = Layout == ColMajor ? 
0 : NumDims - 1; + inputCoords[0] = (patchIndex + coords[coords_idx]); + return m_impl.coeff(inputCoords); + } + else { + Index inputIndex = 0; + if (Layout == ColMajor) { + for (int i = NumDims - 2; i > 0; --i) { + const Index patchIdx = patchIndex / m_patchStrides[i]; + patchIndex -= patchIdx * m_patchStrides[i]; + const Index offsetIdx = coords[i]; + inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; + } + } else { + for (int i = 0; i < NumDims - 2; ++i) { + const Index patchIdx = patchIndex / m_patchStrides[i]; + patchIndex -= patchIdx * m_patchStrides[i]; + const Index offsetIdx = coords[i+1]; + inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; + } + } + Index coords_idx = Layout == ColMajor ? 0 : NumDims - 1; + inputIndex += (patchIndex + coords[coords_idx]); + return m_impl.coeff(inputIndex); + } + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array<Index, NumDims> m_outputStrides; + array<Index, NumDims-1> m_inputStrides; + array<Index, NumDims-1> m_patchStrides; + + TensorEvaluator<ArgType, Device> m_impl; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_PATCH_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h new file mode 100644 index 0000000000..a70d5ae1f0 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -0,0 +1,1141 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H + +namespace Eigen { + +/** \class TensorReduction + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reduction class. 
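+ *
+ * A minimal usage sketch (assuming the sum() helper declared on TensorBase
+ * elsewhere in this module; reduce() with an explicit reducer follows the
+ * same pattern):
+ * \code
+ * Eigen::Tensor<float, 3> input(2, 3, 4);
+ * input.setRandom();
+ * Eigen::array<ptrdiff_t, 1> dims;
+ * dims[0] = 1;  // reduce over the middle dimension
+ * Eigen::Tensor<float, 2> summed = input.sum(dims);
+ * // summed is 2 x 4; summed(i, k) accumulates input(i, j, k) over j.
+ * \endcode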
+ * + */ + +namespace internal { +template<typename Op, typename Dims, typename XprType> +struct traits<TensorReductionOp<Op, Dims, XprType> > + : traits<XprType> +{ + typedef typename traits<XprType>::Scalar Scalar; + typedef typename traits<XprType>::StorageKind StorageKind; + typedef typename traits<XprType>::Index Index; + typedef typename XprType::Nested Nested; +}; + +template<typename Op, typename Dims, typename XprType> +struct eval<TensorReductionOp<Op, Dims, XprType>, Eigen::Dense> +{ + typedef const TensorReductionOp<Op, Dims, XprType>& type; +}; + +template<typename Op, typename Dims, typename XprType> +struct nested<TensorReductionOp<Op, Dims, XprType>, 1, typename eval<TensorReductionOp<Op, Dims, XprType> >::type> +{ + typedef TensorReductionOp<Op, Dims, XprType> type; +}; + + + +template <typename InputDims, typename OutputDims, typename ReducedDims> EIGEN_DEVICE_FUNC +static void partition_dims(const InputDims& input_dims, + const array<bool, internal::array_size<InputDims>::value>& reduced, + OutputDims* output_dims, ReducedDims* reduced_dims) { + const int NumInputDims = internal::array_size<InputDims>::value; + int outputIndex = 0; + int reduceIndex = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (OutputDims::count == 0 || reduced[i]) { + (*reduced_dims)[reduceIndex] = input_dims[i]; + ++reduceIndex; + } else { + (*output_dims)[outputIndex] = input_dims[i]; + ++outputIndex; + } + } +} + + + +template <typename ReducedDims, int NumTensorDims, int Layout> +struct are_inner_most_dims { + static const bool value = false; +}; +template <typename ReducedDims, int NumTensorDims, int Layout> +struct preserve_inner_most_dims { + static const bool value = false; +}; + +#if defined(EIGEN_HAS_CONSTEXPR) && defined(EIGEN_HAS_VARIADIC_TEMPLATES) +// The use of the tmp1, tmp2, tmp3 intermediate variables is needed for nvcc 7 +// to compile the code below. NVidia is working on a fix. 
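+// Illustration of what these traits compute: for a column-major tensor,
+// reducing the statically known index list {0, 1} touches only the innermost
+// dimensions, so are_inner_most_dims evaluates to true and the reduction can
+// walk the input linearly; a list such as {0, 2} does not qualify and the
+// generic per-dimension path is used instead.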
+template <typename ReducedDims, int NumTensorDims> +struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{ + static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>()(); + static const bool tmp2 = index_statically_eq<ReducedDims>()(0, 0); + static const bool tmp3 = index_statically_eq<ReducedDims>()(array_size<ReducedDims>::value-1, array_size<ReducedDims>::value-1); + static const bool value = tmp1 & tmp2 & tmp3; +}; +template <typename ReducedDims, int NumTensorDims> +struct are_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{ + static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>()(); + static const bool tmp2 = index_statically_eq<ReducedDims>()(0, NumTensorDims - array_size<ReducedDims>::value); + static const bool tmp3 = index_statically_eq<ReducedDims>()(array_size<ReducedDims>::value - 1, NumTensorDims - 1); + static const bool value = tmp1 & tmp2 & tmp3; + +}; +template <typename ReducedDims, int NumTensorDims> +struct preserve_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{ + static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>()(); + static const bool tmp2 = index_statically_gt<ReducedDims>()(0, 0); + static const bool value = tmp1 & tmp2; + +}; +template <typename ReducedDims, int NumTensorDims> +struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{ + static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>()(); + static const bool tmp2 = index_statically_lt<ReducedDims>()(array_size<ReducedDims>::value - 1, NumTensorDims - 1); + static const bool value = tmp1 & tmp2; +}; +#endif + + +template <int DimIndex, typename Self, typename Op> +struct GenericDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) { + EIGEN_STATIC_ASSERT(DimIndex >= 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex]; + GenericDimReducer<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum); + } + } +}; +template <typename Self, typename Op> +struct GenericDimReducer<-1, Self, Op> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) { + reducer.reduce(self.m_impl.coeff(firstIndex), accum); + } +}; + +template <typename Self, typename Op, bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)> +struct InnerMostDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { + typename Self::CoeffReturnType accum = reducer.initialize(); + for (typename Self::Index j = 0; j < numValuesToReduce; ++j) { + reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); + } + return reducer.finalize(accum); + } +}; + +template <typename Self, typename Op> +struct InnerMostDimReducer<Self, Op, true> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { + const int packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size; + const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize; + typename 
Self::PacketReturnType p = reducer.template initializePacket<typename Self::PacketReturnType>(); + for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) { + reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &p); + } + typename Self::CoeffReturnType accum = reducer.initialize(); + for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) { + reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); + } + return reducer.finalizeBoth(accum, p); + } +}; + +template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess & Op::PacketAccess)> +struct InnerMostDimPreserver { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) { + eigen_assert(false && "should never be called"); + } +}; + +template <int DimIndex, typename Self, typename Op> +struct InnerMostDimPreserver<DimIndex, Self, Op, true> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) { + EIGEN_STATIC_ASSERT(DimIndex >= 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex]; + InnerMostDimPreserver<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum); + } + } +}; + +template <typename Self, typename Op> +struct InnerMostDimPreserver<-1, Self, Op, true> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) { + reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex), accum); + } +}; + +// Default full reducer +template <typename Self, typename Op, typename Device, bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)> +struct FullReducer { + static const bool HasOptimizedImplementation = false; + + static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) { + const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions()); + *output = InnerMostDimReducer<Self, Op>::reduce(self, 0, num_coeffs, reducer); + } +}; + + +#ifdef EIGEN_USE_THREADS +// Multithreaded full reducers +template <typename Eval, typename Op, bool Vectorizable = (Eval::InputPacketAccess & Op::PacketAccess)> +struct FullReducerShard { + static void run(const Eval& eval, typename Eval::Index firstIndex, typename Eval::Index numValuesToReduce, Op& reducer, FullReducerShard* shard) { + + shard->saccum = reducer.initialize(); + for (typename Eval::Index j = 0; j < numValuesToReduce; ++j) { + reducer.reduce(eval.m_impl.coeff(firstIndex + j), &shard->saccum); + } + } + + typename Eval::CoeffReturnType saccum; +}; + +template <typename Eval, typename Op> +struct FullReducerShard<Eval, Op, true> { + static void run(const Eval& eval, typename Eval::Index firstIndex, typename Eval::Index numValuesToReduce, Op& reducer, FullReducerShard* shard) { + + const int packetSize = internal::unpacket_traits<typename Eval::PacketReturnType>::size; + const typename Eval::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize; + + shard->paccum = reducer.template initializePacket<typename Eval::PacketReturnType>(); + for (typename Eval::Index j = 0; j < VectorizedSize; j += packetSize) { + 
reducer.reducePacket(eval.m_impl.template packet<Unaligned>(firstIndex + j), &shard->paccum); + } + shard->saccum = reducer.initialize(); + for (typename Eval::Index j = VectorizedSize; j < numValuesToReduce; ++j) { + reducer.reduce(eval.m_impl.coeff(firstIndex + j), &shard->saccum); + } + } + + typename Eval::PacketReturnType paccum; + typename Eval::CoeffReturnType saccum; +}; + + +template <typename Self, typename Op> +struct FullReducer<Self, Op, ThreadPoolDevice, false> { + static const bool HasOptimizedImplementation = !Op::IsStateful; + + // launch one reducer per thread and accumulate the result. + static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device, typename Self::CoeffReturnType* output) { + typedef typename Self::Index Index; + const Index num_coeffs = array_prod(self.m_impl.dimensions()); + const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs)/device.numThreads()); + const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0; + eigen_assert(num_coeffs >= numblocks * blocksize); + + FixedSizeVector<Notification*> results(numblocks); + FixedSizeVector<FullReducerShard<Self, Op, false> > shards(numblocks, FullReducerShard<Self, Op, false>()); + for (Index i = 0; i < numblocks; ++i) { + results.push_back(device.enqueue(&FullReducerShard<Self, Op, false>::run, self, i*blocksize, blocksize, reducer, &shards[i])); + } + + FullReducerShard<Self, Op, false> finalShard; + if (numblocks * blocksize < num_coeffs) { + FullReducerShard<Self, Op, false>::run(self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer, &finalShard); + } else { + finalShard.saccum = reducer.initialize(); + } + + for (Index i = 0; i < numblocks; ++i) { + wait_until_ready(results[i]); + delete results[i]; + } + + for (Index i = 0; i < numblocks; ++i) { + reducer.reduce(shards[i].saccum, &finalShard.saccum); + } + *output = reducer.finalize(finalShard.saccum); + } +}; + +template <typename Self, typename Op> +struct FullReducer<Self, Op, ThreadPoolDevice, true> { + static const bool HasOptimizedImplementation = !Op::IsStateful; + + // launch one reducer per thread and accumulate the result. + static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device, typename Self::CoeffReturnType* output) { + typedef typename Self::Index Index; + const Index num_coeffs = array_prod(self.m_impl.dimensions()); + const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs)/device.numThreads()); + const Index numblocks = blocksize > 0 ? 
num_coeffs / blocksize : 0; + eigen_assert(num_coeffs >= numblocks * blocksize); + + FixedSizeVector<Notification*> results(numblocks); + FixedSizeVector<FullReducerShard<Self, Op, true> > shards(numblocks, FullReducerShard<Self, Op, true>()); + for (Index i = 0; i < numblocks; ++i) { + results.push_back(device.enqueue(&FullReducerShard<Self, Op, true>::run, self, i*blocksize, blocksize, reducer, &shards[i])); + } + + FullReducerShard<Self, Op, true> finalShard; + if (numblocks * blocksize < num_coeffs) { + FullReducerShard<Self, Op, true>::run(self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer, &finalShard); + } else { + finalShard.paccum = reducer.template initializePacket<typename Self::PacketReturnType>(); + finalShard.saccum = reducer.initialize(); + } + + for (Index i = 0; i < numblocks; ++i) { + wait_until_ready(results[i]); + delete results[i]; + } + + for (Index i = 0; i < numblocks; ++i) { + reducer.reducePacket(shards[i].paccum, &finalShard.paccum); + reducer.reduce(shards[i].saccum, &finalShard.saccum); + } + + *output = reducer.finalizeBoth(finalShard.saccum, finalShard.paccum); + } +}; +#endif + + +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +// Full reducers for GPU, don't vectorize for now + +// Reducer function that enables multiple cuda thread to safely accumulate at the same +// output address. It basically reads the current value of the output variable, and +// attempts to update it with the new value. If in the meantime another cuda thread +// updated the content of the output address it will try again. +template <typename T, typename R> +__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) { +#if __CUDA_ARCH__ >= 300 + if (sizeof(T) == 4) + { + unsigned int oldval = *reinterpret_cast<unsigned int*>(output); + unsigned int newval = oldval; + reducer.reduce(accum, reinterpret_cast<T*>(&newval)); + if (newval == oldval) { + return; + } + unsigned int readback; + while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reduce(accum, reinterpret_cast<T*>(&newval)); + if (newval == oldval) { + return; + } + } + } + else if (sizeof(T) == 8) { + unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output); + unsigned long long newval = oldval; + reducer.reduce(accum, reinterpret_cast<T*>(&newval)); + if (newval == oldval) { + return; + } + unsigned long long readback; + while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reduce(accum, reinterpret_cast<T*>(&newval)); + if (newval == oldval) { + return; + } + } + } + else { + assert(0 && "Wordsize not supported"); + } +#else + assert(0 && "Shouldn't be called on unsupported device"); +#endif +} + +template <typename T> +__device__ inline void atomicReduce(T* output, T accum, SumReducer<T>&) { +#if __CUDA_ARCH__ >= 300 + atomicAdd(output, accum); +#else + assert(0 && "Shouldn't be called on unsupported device"); +#endif +} + +template <int BlockSize, int NumPerThread, typename Self, + typename Reducer, typename Index> +__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs, + typename Self::CoeffReturnType* output) { + const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x; + + if (first_index == 0) { + *output = reducer.initialize(); + } + + typename Self::CoeffReturnType accum = reducer.initialize(); + for (Index i = 0; i < NumPerThread; ++i) { 
+ const Index index = first_index + i * BlockSize; + if (index >= num_coeffs) { + break; + } + typename Self::CoeffReturnType val = input.m_impl.coeff(index); + reducer.reduce(val, &accum); + } + + for (int offset = warpSize/2; offset > 0; offset /= 2) { + reducer.reduce(__shfl_down(accum, offset), &accum); + } + + if ((threadIdx.x & (warpSize - 1)) == 0) { + atomicReduce(output, accum, reducer); + } +} + + +template <typename Self, typename Op, bool Vectorizable> +struct FullReducer<Self, Op, GpuDevice, Vectorizable> { + // Unfortunately nvidia doesn't support well exotic types such as complex, + // so reduce the scope of the optimized version of the code to the simple case + // of floats. + static const bool HasOptimizedImplementation = !Op::IsStateful && + internal::is_same<typename Self::CoeffReturnType, float>::value; + + template <typename OutputType> + static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { + assert(false && "Should only be called on floats"); + } + + static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { + typedef typename Self::Index Index; + + const Index num_coeffs = array_prod(self.m_impl.dimensions()); + const int block_size = 256; + const int num_per_thread = 128; + const int num_blocks = std::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread)); + LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread>), + num_blocks, block_size, 0, device, reducer, self, num_coeffs, output); + } +}; + +#endif + + +template <typename Self, typename Op, + bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)> +class BlockReducer { + public: + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::CoeffReturnType CoeffReturnType; + typedef typename Self::PacketReturnType PacketReturnType; + explicit BlockReducer(const Op& reducer) : op_(reducer) { + accum_ = op_.initialize(); + } + void Reduce(Index index, Index num_values_to_reduce, Scalar* data) { + for (Index i = 0; i < num_values_to_reduce; ++i) { + op_.reduce(data[index + i], &accum_); + } + } + CoeffReturnType Finalize() { + return op_.finalize(accum_); + } + PacketReturnType FinalizePacket() { + // TODO(andydavis) This function should not be called for Scalar + // reductions: clean this up or add an assert here. 
+ return PacketReturnType(); + } + + private: + CoeffReturnType accum_; + Op op_; +}; + +template <typename Self, typename Op> +class BlockReducer<Self, Op, true> { + public: + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::CoeffReturnType CoeffReturnType; + typedef typename Self::PacketReturnType PacketReturnType; + explicit BlockReducer(const Op& reducer) : op_(reducer) { + vaccum_ = op_.template initializePacket<PacketReturnType>(); + accum_ = op_.initialize(); + } + void Reduce(Index index, Index num_values_to_reduce, Scalar* data) { + const int packet_size = internal::unpacket_traits<PacketReturnType>::size; + const Index vectorized_size = (num_values_to_reduce / packet_size) * + packet_size; + for (Index i = 0; i < vectorized_size; i += packet_size) { + op_.reducePacket(internal::ploadt<PacketReturnType, Unaligned>( + &data[index + i]), &vaccum_); + } + for (Index i = vectorized_size; i < num_values_to_reduce; ++i) { + op_.reduce(data[index + i], &accum_); + } + } + CoeffReturnType Finalize() { + return op_.finalizeBoth(accum_, vaccum_); + } + PacketReturnType FinalizePacket() { + return op_.finalizePacket(vaccum_); + } + + private: + PacketReturnType vaccum_; + CoeffReturnType accum_; + Op op_; +}; + +} // end namespace internal + + +template <typename Op, typename Dims, typename XprType> +class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType>, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits<TensorReductionOp>::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename Eigen::internal::nested<TensorReductionOp>::type Nested; + typedef typename Eigen::internal::traits<TensorReductionOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorReductionOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReductionOp(const XprType& expr, const Dims& dims) : m_expr(expr), m_dims(dims) + { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer) : m_expr(expr), m_dims(dims), m_reducer(reducer) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const XprType& expression() const { return m_expr; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Dims& dims() const { return m_dims; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Op& reducer() const { return m_reducer; } + + protected: + typename XprType::Nested m_expr; + const Dims m_dims; + const Op m_reducer; +}; + + +// Eval as rvalue +template<typename Op, typename Dims, typename ArgType, typename Device> +struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> +{ + typedef TensorReductionOp<Op, Dims, ArgType> XprType; + typedef typename XprType::Index Index; + typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions; + static const int NumInputDims = internal::array_size<InputDimensions>::value; + static const int NumReducedDims = internal::array_size<Dims>::value; + EIGEN_STATIC_ASSERT(NumInputDims >= NumReducedDims, YOU_MADE_A_PROGRAMMING_MISTAKE) + static const int NumOutputDims = NumInputDims - NumReducedDims; + typedef DSizes<Index, NumOutputDims> Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const<Scalar>::type ScalarNonConst; + typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> Self; + 
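+  // Note (illustration): InputPacketAccess records whether the wrapped
+  // expression itself can be read packet-wise; vectorized reduction is only
+  // enabled when both the input and the reducer support packets (see
+  // PacketAccess in the enum below).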
static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess; + + enum { + IsAligned = false, + PacketAccess = Self::InputPacketAccess && Op::PacketAccess, + BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + typedef typename internal::TensorBlock<Index, ScalarNonConst, NumOutputDims, + Layout> OutputTensorBlock; + typedef typename internal::TensorBlock<Index, ScalarNonConst, NumInputDims, + Layout> InputTensorBlock; + + static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value; + static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value; + static const bool RunningFullReduction = (NumInputDims==NumReducedDims); + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device) + { + EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + for (int i = 0; i < NumInputDims; ++i) { + m_reduced_dim[i] = false; + } + for (int i = 0; i < NumReducedDims; ++i) { + eigen_assert(op.dims()[i] >= 0); + eigen_assert(op.dims()[i] < NumInputDims); + m_reduced_dim[op.dims()[i]] = true; + } + + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + internal::partition_dims(input_dims, m_reduced_dim, &m_dimensions, &m_reducedDims); + + // Precompute output strides. + if (NumOutputDims > 0) { + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_outputStrides[0] = 1; + for (int i = 1; i < NumOutputDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]); + } + } else { + m_outputStrides[NumOutputDims - 1] = 1; + for (int i = NumOutputDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]); + } + } + } + + // Precompute input strides. + if (NumInputDims > 0) { + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_inputStrides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + } + } else { + m_inputStrides[NumInputDims - 1] = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; + } + } + } + + int outputIndex = 0; + int reduceIndex = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (m_reduced_dim[i]) { + m_reducedStrides[reduceIndex] = m_inputStrides[i]; + ++reduceIndex; + } else { + m_preservedStrides[outputIndex] = m_inputStrides[i]; + m_output_to_input_dim_map[outputIndex] = i; + ++outputIndex; + } + } + + m_numValuesToReduce + = NumOutputDims == 0 ? internal::array_prod(input_dims) + : (static_cast<int>(Layout) == static_cast<int>(ColMajor)) + ? 
m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1]; + + m_block_total_size_max = numext::maxi(static_cast<std::size_t>(1), + device.lastLevelCacheSize() / + sizeof(Scalar)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + m_impl.evalSubExprsIfNeeded(NULL); + + // Use the FullReducer if possible. + if (RunningFullReduction && internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation && + ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || + (internal::array_prod(m_impl.dimensions()) > 1024 * 1024))) { + + bool need_assign = false; + if (!data) { + m_result = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType))); + data = m_result; + need_assign = true; + } + + Op reducer(m_reducer); + internal::FullReducer<Self, Op, Device>::run(*this, reducer, m_device, data); + return need_assign; + } + + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + + if (m_result) { + m_device.deallocate(m_result); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + if (RunningFullReduction && m_result) { + return *m_result; + } + Op reducer(m_reducer); + if (ReducingInnerMostDims) { + return internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstInput(index), + m_numValuesToReduce, reducer); + } else { + typename Self::CoeffReturnType accum = reducer.initialize(); + internal::GenericDimReducer<NumReducedDims-1, Self, Op>::reduce(*this, firstInput(index), reducer, &accum); + return reducer.finalize(accum); + } + } + + // TODO(bsteiner): provide a more efficient implementation. + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); + + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + if (ReducingInnerMostDims) { + const Index num_values_to_reduce = m_numValuesToReduce; + const Index firstIndex = firstInput(index); + for (Index i = 0; i < packetSize; ++i) { + Op reducer(m_reducer); + values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce, + num_values_to_reduce, reducer); + } + } else if (PreservingInnerMostDims) { + const Index firstIndex = firstInput(index); + const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumOutputDims - 1; + // TBD: extend this the the n innermost dimensions that we preserve. 
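+      // The vectorized path below is only taken when the whole packet lies
+      // inside the innermost preserved dimension; otherwise we fall back to
+      // per-coefficient calls to coeff().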
+ if (((firstIndex % m_dimensions[innermost_dim]) + packetSize - 1) < m_dimensions[innermost_dim]) { + Op reducer(m_reducer); + typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>(); + internal::InnerMostDimPreserver<NumReducedDims-1, Self, Op>::reduce(*this, firstIndex, reducer, &accum); + return reducer.finalizePacket(accum); + } else { + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index + i); + } + } + } else { + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index + i); + } + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector<internal::TensorOpResourceRequirements>* resources) const { + resources->push_back(internal::TensorOpResourceRequirements( + internal::kSkewedInnerDims, m_block_total_size_max)); + m_impl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + OutputTensorBlock* output_block) const { + // Special case full reductions to avoid input block copy below. + if (NumInputDims == NumReducedDims) { + eigen_assert(output_block->first_coeff_index() == 0); + eigen_assert(output_block->block_sizes().TotalSize() == 1); + Op reducer(m_reducer); + output_block->data()[0] = internal::InnerMostDimReducer<Self, Op>::reduce( + *this, 0, m_numValuesToReduce, reducer); + return; + } + + // Calculate input tensor 'slice' required to reduce output block coeffs. + DSizes<Index, NumInputDims> input_slice_sizes(m_impl.dimensions()); + for (int i = 0; i < NumOutputDims; ++i) { + // Clip preserved input dimensions by output block size. + input_slice_sizes[m_output_to_input_dim_map[i]] = + output_block->block_sizes()[i]; + } + + // Shard input tensor slice into blocks (because it could be large if we + // need to reduce along several dimensions to calculate required output + // coefficients). + const Index max_coeff_count = + numext::mini(((m_device.firstLevelCacheSize()) / sizeof(Scalar)), + input_slice_sizes.TotalSize()); + + // Calculate max output shard size needed to keep working set of reducers + // in L1, while leaving enough space for reducer overhead and 'packet_size' + // reductions. + DSizes<Index, NumInputDims> target_input_block_sizes; + CalculateTargetInputBlockShape(max_coeff_count, input_slice_sizes, + &target_input_block_sizes); + // Calculate indices for first preserved dimension. + const Index first_preserved_dim_output_index = + static_cast<int>(Layout) == static_cast<int>(ColMajor) ? + 0 : NumOutputDims - 1; + const Index first_preserved_dim_input_index = m_output_to_input_dim_map[ + first_preserved_dim_output_index]; + const bool inner_most_dim_preserved = first_preserved_dim_input_index == + (static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : + NumInputDims - 1) | PreservingInnerMostDims; + + // Calculate output block inner/outer dimension sizes. + const Index output_block_inner_dim_size = output_block->block_sizes()[ + first_preserved_dim_output_index]; + const Index output_block_outer_dim_size = + output_block->block_sizes().TotalSize() / output_block_inner_dim_size; + // Calculate shard size for first preserved dimension. + const Index output_shard_size = target_input_block_sizes[ + first_preserved_dim_input_index]; + const Index num_output_shards = + (output_block_inner_dim_size + output_shard_size - 1) / + output_shard_size; + + // Initialize 'tensor_slice_offsets' from input coords of output index. 
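+    // 'tensor_slice_offsets' holds, for each input dimension, the coordinate
+    // of this output block's first coefficient (reduced dimensions start at 0);
+    // it is advanced below as output shards are processed.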
+ DSizes<Index, NumInputDims> tensor_slice_offsets; + GetInputCoordsForOutputIndex(output_block->first_coeff_index(), + &tensor_slice_offsets); + + // Store tensor slice offset in first preserved dimension to be used + // to update tensor slice extents in loop below. + const Index first_preserved_dim_offset_start = tensor_slice_offsets[ + first_preserved_dim_input_index]; + + array<BlockIteratorState, NumOutputDims> block_iter_state; + + // Initialize state used to iterate through output coefficients + // and update 'tensor_slice_offsets' in outer preserved dims. + for (int i = 0; i < NumOutputDims - 1; ++i) { + const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? i + 1 : NumOutputDims - i - 2; + block_iter_state[i].input_dim = m_output_to_input_dim_map[dim]; + block_iter_state[i].output_size = output_block->block_sizes()[dim]; + block_iter_state[i].output_count = 0; + } + + // Allocate input block memory. + ScalarNonConst* input_block_data = static_cast<ScalarNonConst*>( + m_device.allocate(max_coeff_count * sizeof(Scalar))); + // Allocate reducer memory. + const bool packet_reductions_enabled = (Self::InputPacketAccess & + Op::PacketAccess); + const Index packet_size = internal::unpacket_traits<PacketReturnType>::size; + const Index num_reducers = + (inner_most_dim_preserved && packet_reductions_enabled) ? + (output_shard_size / packet_size + output_shard_size % packet_size + + packet_size) : output_shard_size; + typedef internal::BlockReducer<Self, Op> BlockReducer; + BlockReducer* reducers = static_cast<BlockReducer*>( + m_device.allocate(num_reducers * sizeof(BlockReducer))); + + InputDimensions input_tensor_dims(m_impl.dimensions()); + for (Index output_outer_index = 0; + output_outer_index < output_block_outer_dim_size; + ++output_outer_index) { + for (Index output_shard_index = 0; + output_shard_index < num_output_shards; + ++output_shard_index) { + // Initialize 'tensor_slice_extents' for this output shard. + DSizes<Index, NumInputDims> tensor_slice_extents(input_slice_sizes); + for (int i = 0; i < NumInputDims; ++i) { + if (i == first_preserved_dim_input_index) { + // Clip first preserved dim size to output shard size. + tensor_slice_extents[i] = numext::mini( + output_shard_size, + input_slice_sizes[i] - (tensor_slice_offsets[i] - + first_preserved_dim_offset_start)); + + } else if (!m_reduced_dim[i]) { + // Clip outer preserved dims to size 1, so that we reduce a + // contiguous set of output coefficients. + tensor_slice_extents[i] = 1; + } + } + + // Intialize output coefficient reducers. + for (int i = 0; i < num_reducers; ++i) { + new (&reducers[i]) BlockReducer(m_reducer); + } + + typedef internal::TensorSliceBlockMapper< + Index, ScalarNonConst, NumInputDims, Layout> TensorSliceBlockMapper; + + // TODO(andydavis) Consider removing 'input_block_stride_order' if we + // find that scattered reads are not worth supporting in + // TensorSliceBlockMapper. + TensorSliceBlockMapper block_mapper( + input_tensor_dims, tensor_slice_offsets, tensor_slice_extents, + target_input_block_sizes, DimensionList<Index, NumInputDims>()); + + const Index num_outputs_to_update = tensor_slice_extents[ + first_preserved_dim_input_index]; + const Index preserved_dim_vector_reducer_count = + (inner_most_dim_preserved && packet_reductions_enabled) ? + num_outputs_to_update / packet_size: 0; + const Index preserved_dim_vector_coeff_count = + inner_most_dim_preserved ? 
preserved_dim_vector_reducer_count * + packet_size : 0; + const Index preserved_dim_reducer_limit = + (inner_most_dim_preserved && packet_reductions_enabled) ? + (preserved_dim_vector_reducer_count + + num_outputs_to_update % packet_size) : num_outputs_to_update; + + const Index total_block_count = block_mapper.total_block_count(); + for (Index b = 0; b < total_block_count; ++b) { + InputTensorBlock input_block = block_mapper.GetBlockForIndex( + b, input_block_data); + // Read. + m_impl.block(&input_block); + + Index num_values_to_reduce = 1; + for (Index i = 0; i < NumInputDims; ++i) { + if (m_reduced_dim[i]) { + num_values_to_reduce *= input_block.block_sizes()[i]; + } + } + // Reduce. + if (inner_most_dim_preserved) { + const Index input_outer_dim_size = + input_block.block_sizes().TotalSize() / num_outputs_to_update; + for (Index input_outer_dim_index = 0; + input_outer_dim_index < input_outer_dim_size; + ++input_outer_dim_index) { + const Index input_outer_dim_base = input_outer_dim_index * + num_outputs_to_update; + for (Index i = 0; i < preserved_dim_vector_reducer_count; ++i) { + reducers[i].Reduce(input_outer_dim_base + i * packet_size, + packet_size, input_block.data()); + } + const Index scalar_reducer_base = input_outer_dim_base + + preserved_dim_vector_coeff_count; + for (Index i = preserved_dim_vector_reducer_count; + i < preserved_dim_reducer_limit; ++i) { + reducers[i].Reduce(scalar_reducer_base + i - + preserved_dim_vector_reducer_count, + 1, + input_block.data()); + } + } + } else { + for (Index i = 0; i < num_outputs_to_update; ++i) { + reducers[i].Reduce(i * num_values_to_reduce, + num_values_to_reduce, + input_block.data()); + } + } + } + + // Finalize all reducers for this output shard. + const Index output_base_index = + output_outer_index * output_block_inner_dim_size + + output_shard_index * output_shard_size; + if (inner_most_dim_preserved) { + EIGEN_ALIGN_DEFAULT CoeffReturnType values[packet_size]; + for (Index i = 0; i < preserved_dim_vector_reducer_count; ++i) { + const Index reducer_base = output_base_index + i * packet_size; + internal::pstore<CoeffReturnType, PacketReturnType>( + values, reducers[i].FinalizePacket()); + for (Index j = 0; j < packet_size; ++j) { + output_block->data()[reducer_base + j] = values[j]; + } + } + const Index scalar_reducer_base = output_base_index + + preserved_dim_vector_coeff_count; + + for (Index i = preserved_dim_vector_reducer_count; + i < preserved_dim_reducer_limit; ++i) { + output_block->data()[ + scalar_reducer_base + i - preserved_dim_vector_reducer_count] = + reducers[i].Finalize(); + } + } else { + for (int i = 0; i < num_outputs_to_update; ++i) { + output_block->data()[output_base_index + i] = + reducers[i].Finalize(); + } + } + + // Update 'tensor_slice_offsets' by num outputs for this output shard. + tensor_slice_offsets[first_preserved_dim_input_index] += + num_outputs_to_update; + } + // Update slice offset for inner preserved dim. + tensor_slice_offsets[first_preserved_dim_input_index] -= + output_block_inner_dim_size; + // Update slice offsets for remaining output dims. + for (int i = 0; i < NumOutputDims - 1; ++i) { + BlockIteratorState& b = block_iter_state[i]; + if (++b.output_count < b.output_size) { + ++tensor_slice_offsets[b.input_dim]; + break; + } + b.output_count = 0; + tensor_slice_offsets[b.input_dim] -= b.output_size - 1; + } + } + + // Free memory. 
+ m_device.deallocate(input_block_data); + m_device.deallocate(reducers); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + private: + template <int, typename, typename> friend struct internal::GenericDimReducer; + template <typename, typename, bool> friend struct internal::InnerMostDimReducer; + template <int, typename, typename, bool> friend struct internal::InnerMostDimPreserver; + template <typename S, typename O, typename D, bool V> friend struct internal::FullReducer; +#ifdef EIGEN_USE_THREADS + template <typename S, typename O, bool V> friend struct internal::FullReducerShard; +#endif +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) + template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*); +#endif + + struct BlockIteratorState { + Index input_dim; + Index output_size; + Index output_count; + }; + + // Returns the Index in the input tensor of the first value that needs to be + // used to compute the reduction at output index "index". + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { + if (ReducingInnerMostDims) { + return index * m_numValuesToReduce; + } + Index startInput = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumOutputDims - 1; i > 0; --i) { + // This is index_i in the output tensor. + const Index idx = index / m_fastOutputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + } else { + for (int i = 0; i < NumOutputDims - 1; ++i) { + // This is index_i in the output tensor. + const Index idx = index / m_fastOutputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + } + if (PreservingInnerMostDims) { + eigen_assert(m_numValuesToReduce == 1); + startInput += index; + } else { + startInput += index * m_numValuesToReduce; + } + return startInput; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void GetInputCoordsForOutputIndex( + Index index, + DSizes<Index, NumInputDims>* coords) const { + for (int i = 0; i < NumInputDims; ++i) { + (*coords)[i] = 0; + } + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumOutputDims - 1; i > 0; --i) { + const Index idx = index / m_fastOutputStrides[i]; + (*coords)[m_output_to_input_dim_map[i]] = idx; + index -= idx * m_outputStrides[i]; + } + (*coords)[m_output_to_input_dim_map[0]] = index; + } else { + for (int i = 0; i < NumOutputDims - 1; ++i) { + const Index idx = index / m_fastOutputStrides[i]; + (*coords)[m_output_to_input_dim_map[i]] = idx; + index -= idx * m_outputStrides[i]; + } + (*coords)[m_output_to_input_dim_map[NumOutputDims-1]] = index; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void CalculateTargetInputBlockShape( + const Index max_coeff_count, + const DSizes<Index, NumInputDims>& input_slice_sizes, + DSizes<Index, NumInputDims>* target_input_block_sizes) const { + typedef typename internal::packet_traits<Scalar>::type Packet; + const Index packet_size = internal::unpacket_traits<Packet>::size; + typedef internal::BlockReducer<Self, Op> BlockReducer; + // TODO(andydavis) Compute reducer overhead correctly for the case where + // we are preserving the inner most dimension, and a single reducer + // reduces a packet's worth of output coefficients. 
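+    // Approximate cost of one BlockReducer expressed in units of Scalar, used
+    // below to trade reducer storage against input coefficients when sizing
+    // blocks to fit the L1 working set.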
+ const Index reducer_overhead = sizeof(BlockReducer) / sizeof(Scalar); + + Index coeff_to_allocate = max_coeff_count; + bool first_preserved_dim_allocated = false; + bool first_reduced_dim_allocated = false; + for (int i = 0; i < NumInputDims; ++i) { + const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? i : NumInputDims - i - 1; + (*target_input_block_sizes)[dim] = 1; + if (m_reduced_dim[dim]) { + // TODO(andydavis) Consider allocating to multiple reduced dimensions. + // Watch out for cases where reduced dimensions are not contiguous, + // which induces scattered reads. + if (!first_reduced_dim_allocated) { + (*target_input_block_sizes)[dim] = numext::mini(input_slice_sizes[dim], + coeff_to_allocate); + coeff_to_allocate /= (*target_input_block_sizes)[dim]; + first_reduced_dim_allocated = true; + } + } else if (!first_preserved_dim_allocated) { + // TODO(andydavis) Include output block size in this L1 working set + // calculation. + const Index allocated = max_coeff_count - coeff_to_allocate; + const Index alloc_size = numext::maxi(static_cast<Index>(1), + coeff_to_allocate / + reducer_overhead); + (*target_input_block_sizes)[dim] = numext::mini(input_slice_sizes[dim], + alloc_size); + coeff_to_allocate = numext::maxi( + static_cast<Index>(1), + coeff_to_allocate / ((*target_input_block_sizes)[dim] * + reducer_overhead)); + first_preserved_dim_allocated = true; + } + } + } + + // Bitmap indicating if an input dimension is reduced or not. + array<bool, NumInputDims> m_reduced_dim; + // Dimensions of the output of the operation. + Dimensions m_dimensions; + // Precomputed strides for the input tensor. + array<Index, NumInputDims> m_inputStrides; + // Precomputed strides for the output tensor. + array<Index, NumOutputDims> m_outputStrides; + array<internal::TensorIntDivisor<Index>, NumOutputDims> m_fastOutputStrides; + // Subset of strides of the input tensor for the non-reduced dimensions. + // Indexed by output dimensions. + array<Index, NumOutputDims> m_preservedStrides; + // Map from output to input dimension index. + array<Index, NumOutputDims> m_output_to_input_dim_map; + // How many values go into each reduction + Index m_numValuesToReduce; + + // Subset of strides of the input tensor for the reduced dimensions. + // Indexed by reduced dimensions. + array<Index, NumReducedDims> m_reducedStrides; + // Size of the input dimensions that are reduced. + // Indexed by reduced dimensions. + array<Index, NumReducedDims> m_reducedDims; + + // Evaluator for the input expression. + TensorEvaluator<ArgType, Device> m_impl; + + // Operation to apply for computing the reduction. + Op m_reducer; + + // For full reductions +#ifdef EIGEN_USE_GPU + static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value; +#else + static const bool RunningOnGPU = false; +#endif + CoeffReturnType* m_result; + std::size_t m_block_total_size_max; + + const Device& m_device; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h new file mode 100644 index 0000000000..d052dcdf69 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -0,0 +1,642 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2015 Manjunath Kudlur <keveman@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H +#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H + +#if defined(EIGEN_USE_GPU) + +namespace Eigen { +namespace internal { + +template <typename OutExpr, typename InExpr, typename Op, typename Indices, + bool Tileable> +class TensorExecutor< + const TensorAssignOp< + OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>, + GpuDevice, false, Tileable> { + public: + typedef const TensorAssignOp< + OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const> + Expression; + static void run(const Expression& expr, const GpuDevice& device); +}; + +template <typename OutExpr, typename InExpr, typename Op, typename Indices, + bool Tileable> +class TensorExecutor< + const TensorAssignOp< + OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>, + GpuDevice, true, Tileable> { + public: + typedef const TensorAssignOp< + OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const> + Expression; + static void run(const Expression& expr, const GpuDevice& device); +}; + +template <typename InExpr, typename Op, typename Indices, bool Tileable> +class TensorExecutor<const TensorEvalToOp<const TensorReductionOp< + Op, const Indices, const InExpr> >, + GpuDevice, false, Tileable> { + public: + typedef const TensorEvalToOp< + const TensorReductionOp<Op, const Indices, const InExpr> > Expression; + static void run(const Expression& expr, const GpuDevice& device); +}; + +template <typename InExpr, typename Op, typename Indices, bool Tileable> +class TensorExecutor<const TensorEvalToOp<const TensorReductionOp< + Op, const Indices, const InExpr> >, + GpuDevice, true, Tileable> { + public: + typedef const TensorEvalToOp< + const TensorReductionOp<Op, const Indices, const InExpr> > Expression; + static void run(const Expression& expr, const GpuDevice& device); +}; + +} // end namespace internal +} // end namespace Eigen + +#if defined(__CUDACC__) + +namespace Eigen { + +namespace internal { + +namespace { + +#define DIVUP(x, y) (((x) + (y)-1) / (y)) + +// Initialize output[0..size-1] with val +template <typename Output> +__global__ void InitVector(const float val, int size, Output output) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + for (int i = idx; i < size; i += gridDim.x * blockDim.x) { + output.coeffRef(i) = val; + } +} + +// ----------------------------------------------------------------------------- +// Column Reduction kernels +// ----------------------------------------------------------------------------- +template <int GRID_DIM, int BLOCK_DIM, int NUM_PER_THREAD, typename Input, + typename Output, typename Reducer> +__global__ void ColumnReduceKernel(Reducer reducer, const Input input, int rows, + int cols, Output output) { + assert(blockDim.x == BLOCK_DIM); + assert(blockDim.y == 1); + assert(blockDim.z == 1); + + assert(gridDim.x == GRID_DIM); + assert(gridDim.y == 1); + assert(gridDim.z == 1); + + typedef typename Input::Index Index; + + const Index num_input_points = DIVUP(rows, NUM_PER_THREAD) * cols; + const int bx = blockIdx.x; + const int tx = threadIdx.x; + + for (Index i = bx * BLOCK_DIM + tx; i < num_input_points; + i += BLOCK_DIM * GRID_DIM) { + const Index input_col = i % cols; + const Index input_row_begin = + ((i / 
cols) % DIVUP(rows, NUM_PER_THREAD)) * NUM_PER_THREAD; + float reduced_val = reducer.bottom_value(); + for (int j = 0; j < NUM_PER_THREAD; ++j) { + float val = ((input_col < cols) && (input_row_begin + j < rows)) + ? input.coeff((input_row_begin + j) * cols + input_col) + : reducer.bottom_value(); + reduced_val = reducer(reduced_val, val); + } +#if __CUDA_ARCH__ >= 300 + reducer.atomic_reduce(&output.coeffRef(input_col), reduced_val); +#endif + } +} + +// ----------------------------------------------------------------------------- +// Row Reduction kernels +// ----------------------------------------------------------------------------- +template <int GRID_DIM, int BLOCK_DIM, int NUM_PER_THREAD, typename Input, + typename Output, typename Reducer> +__global__ void RowReduceKernel(Reducer reducer, const Input input, int rows, + int cols, Output output) { + assert(BLOCK_DIM % 32 == 0); + assert(blockDim.x == BLOCK_DIM); + assert(blockDim.y == 1); + assert(blockDim.z == 1); + + assert(gridDim.x == GRID_DIM); + assert(gridDim.y == 1); + assert(gridDim.z == 1); + + const int unroll_times = 16; + assert(NUM_PER_THREAD % unroll_times == 0); + + typedef typename Input::Index Index; + + __shared__ float temp[BLOCK_DIM]; + + const Index input_col_blocks = DIVUP(cols, BLOCK_DIM * NUM_PER_THREAD); + const Index num_input_blocks = input_col_blocks * rows; + + const int bx = blockIdx.x; + const int tx = threadIdx.x; + + for (Index i = bx; i < num_input_blocks; i += GRID_DIM) { + const Index col_block = i % input_col_blocks; + const Index row_block = i / input_col_blocks; + const Index col_begin = col_block * BLOCK_DIM * NUM_PER_THREAD + tx; + const Index row = row_block; + float reduced_val = reducer.bottom_value(); + if (row < rows) { + for (Index j = 0; j < NUM_PER_THREAD; j += unroll_times) { + const Index last_col = col_begin + BLOCK_DIM * (j + unroll_times - 1); + if (last_col >= cols) { + // We can skip the last iteration of the loop since we know + // that col >= cols there. +#pragma unroll + for (int k = 0; k < unroll_times - 1; ++k) { + const Index col = col_begin + BLOCK_DIM * (j + k); + const float val = (col < cols ? input.coeff(row * cols + col) + : reducer.bottom_value()); + reduced_val = reducer(reduced_val, val); + } + break; // col < cols for all later iterations. + } else { + // Faster version of the loop with no branches after unrolling. 
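+          // Here last_col < cols, so every column touched by this unrolled
+          // run is in range and no per-element bounds check is needed.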
+#pragma unroll + for (int k = 0; k < unroll_times; ++k) { + const Index col = col_begin + BLOCK_DIM * (j + k); + reduced_val = reducer(reduced_val, input.coeff(row * cols + col)); + } + } + } + } + temp[tx] = reduced_val; + + __syncthreads(); + const int warp_id = tx & 31; + if (warp_id < 16) temp[tx] = reducer(temp[tx], temp[tx + 16]); + if (warp_id < 8) temp[tx] = reducer(temp[tx], temp[tx + 8]); + if (warp_id < 4) temp[tx] = reducer(temp[tx], temp[tx + 4]); + if (warp_id < 2) temp[tx] = reducer(temp[tx], temp[tx + 2]); + if (warp_id < 1) temp[tx] = reducer(temp[tx], temp[tx + 1]); + + if (warp_id == 0) { + if (row < rows) { +#if __CUDA_ARCH__ >= 300 + reducer.atomic_reduce(&output.coeffRef(row), temp[tx]); +#endif + } + } + + __syncthreads(); + } +} + +template <typename Input, typename Output, typename Reducer> +void ColumnReduceCuda(Reducer reducer, const GpuDevice& device, + const Input input, int rows, int cols, Output output) { + const int block_size = 256; + const int grid_size = 128; + const int num_per_thread = 16; + LAUNCH_CUDA_KERNEL(InitVector, 32, 1024, 0, device, reducer.bottom_value(), + cols, output); + LAUNCH_CUDA_KERNEL( + (ColumnReduceKernel<grid_size, block_size, num_per_thread>), grid_size, + block_size, 0, device, reducer, input, rows, cols, output); +} + +template <typename Input, typename Output, typename Reducer> +void RowReduceCuda(Reducer reducer, const GpuDevice& device, const Input input, + int rows, int cols, Output output) { + const int block_size = 256; + const int grid_size = 32; + const int num_per_thread = 128; + LAUNCH_CUDA_KERNEL(InitVector, 32, 1024, 0, device, reducer.bottom_value(), + rows, output); + LAUNCH_CUDA_KERNEL((RowReduceKernel<grid_size, block_size, num_per_thread>), + grid_size, block_size, 0, device, reducer, input, rows, + cols, output); +} + +// Provides arbitrary sum reductions, applying a function across the +// right argument being reduced prior to summing +template <typename F> +struct FnSumReducer { + __host__ __device__ FnSumReducer(F f) : f_(f) {} + __host__ __device__ float bottom_value() { return 0.0f; } + __device__ float operator()(float x, float y) const { return x + f_(y); } + __device__ void atomic_reduce(float* x, float y) const { atomicAdd(x, y); } + + F f_; +}; + +// Identity is used for the basic SumReduction +struct Identity { + __device__ float operator()(float x) const { return x; } +}; + +struct CudaSumReducer : FnSumReducer<Identity> { + __host__ __device__ CudaSumReducer() : FnSumReducer(Identity()) {} +}; + +struct CudaMaxReducer { + // nvcc doesn't recognize numeric_limits<float>::lowest for some reason. + CudaMaxReducer() { + bottom_value_ = -3.40282347E+38F; // std::numeric_limits<float>::lowest(); + } + __host__ __device__ float bottom_value() { return bottom_value_; } + __device__ float operator()(float x, float y) const { return fmax(x, y); } + + // This is equivalent to atomicMax(x, y), but CUDA does not have atomicMax for + // float data type. Instead, this atomically compares-and-swaps the old value + // at x with y. If the old value returned by the CAS operation was already + // larger than y, or what was read before, it declares success and finishes, + // otherwise repeats the procedure. 
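+  // In pseudo-code, roughly:  do { old = *x; } while (old < y && !CAS(x, old, y));
+  // The float is reinterpreted as an unsigned int because atomicCAS has no
+  // float overload.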
+ __device__ void atomic_reduce(float* x, float y) { + unsigned int old_val = *reinterpret_cast<unsigned int*>(x); + while (*reinterpret_cast<float*>(&old_val) < y) { + unsigned int current_val = + atomicCAS(reinterpret_cast<unsigned int*>(x), old_val, + *reinterpret_cast<unsigned int*>(&y)); + if (old_val == current_val) { + break; + } + old_val = current_val; + } + } + float bottom_value_; +}; + +} // end namespace + +template <typename Op> +struct IsFloatSumReduction { + static const bool value = false; +}; + +template <> +struct IsFloatSumReduction<SumReducer<float> > { + static const bool value = true; +}; + +template <typename Op> +struct IsFloatMaxReduction { + static const bool value = false; +}; + +template <> +struct IsFloatMaxReduction<MaxReducer<float> > { + static const bool value = true; +}; + +template <typename Op> +struct SumOrMaxOfFloat { + static const bool value = + IsFloatSumReduction<Op>::value || IsFloatMaxReduction<Op>::value; +}; + +enum ReductionType { ROW_REDUCE, COL_REDUCE, UNOPTIMIZED }; + +template <typename Op, typename Expr, typename ReductionExpr> +ReductionType GetReductionType(const Expr& expr, + const ReductionExpr& reduction_expr, + const GpuDevice& device, std::size_t* rows, + std::size_t* cols) { + typedef TensorEvaluator<const Expr, GpuDevice> EvalExpr; + typedef TensorEvaluator<const ReductionExpr, GpuDevice> ReductionEvalExpr; + + if (device.majorDeviceVersion() < 3) { + return UNOPTIMIZED; + } + const EvalExpr eval_expr(expr, device); + + // We only have fast reductions for sum/max of float. + if (!SumOrMaxOfFloat<Op>::value) { + return UNOPTIMIZED; + } + + // For sum/max of float, if we are doing a full reduction, we can + // use the ROW_REDUCE optimization. + if (ReductionEvalExpr::NumReducedDims == ReductionEvalExpr::NumInputDims) { + *rows = 1; + *cols = array_prod(eval_expr.dimensions()); + return ROW_REDUCE; + } + + if (ReductionEvalExpr::NumReducedDims > 1) { + return UNOPTIMIZED; + } + + const int dim = reduction_expr.dims()[0]; + if (static_cast<int>(ReductionEvalExpr::Layout) == + static_cast<int>(RowMajor)) { + if (dim == ReductionEvalExpr::NumInputDims - 1) { + *rows = array_prod(eval_expr.dimensions()) / + eval_expr.dimensions()[ReductionEvalExpr::NumInputDims - 1]; + *cols = eval_expr.dimensions()[ReductionEvalExpr::NumInputDims - 1]; + if (*cols < 32) return UNOPTIMIZED; + return ROW_REDUCE; + } else if (dim == 0) { + *rows = eval_expr.dimensions()[0]; + *cols = array_prod(eval_expr.dimensions()) / eval_expr.dimensions()[0]; + if (*rows < 32) return UNOPTIMIZED; + return COL_REDUCE; + } + } else if (static_cast<int>(ReductionEvalExpr::Layout) == + static_cast<int>(ColMajor)) { + if (dim == ReductionEvalExpr::NumInputDims - 1) { + *rows = eval_expr.dimensions()[ReductionEvalExpr::NumInputDims - 1]; + *cols = array_prod(eval_expr.dimensions()) / + eval_expr.dimensions()[ReductionEvalExpr::NumInputDims - 1]; + if (*rows < 32) return UNOPTIMIZED; + return COL_REDUCE; + } else if (dim == 0) { + *rows = array_prod(eval_expr.dimensions()) / eval_expr.dimensions()[0]; + *cols = eval_expr.dimensions()[0]; + if (*cols < 32) return UNOPTIMIZED; + return ROW_REDUCE; + } + } + return UNOPTIMIZED; +} + +template <typename Expression, typename Index, bool Vectorizable> +struct LaunchKernel; + +template <typename Expression, typename Index> +struct LaunchKernel<Expression, Index, true> { + static void launch(int num_blocks, int block_size, const GpuDevice& device, + const TensorEvaluator<Expression, GpuDevice>& evaluator, + Index size) { + 
LAUNCH_CUDA_KERNEL( + (EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, + Index>), + num_blocks, block_size, 0, device, evaluator, size); + } +}; + +template <typename Expression, typename Index> +struct LaunchKernel<Expression, Index, false> { + static void launch(int num_blocks, int block_size, const GpuDevice& device, + const TensorEvaluator<Expression, GpuDevice>& evaluator, + Index size) { + LAUNCH_CUDA_KERNEL( + (EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, + Index>), + num_blocks, block_size, 0, device, evaluator, size); + } +}; + +template <typename F, typename LHS, typename RHS, bool Compatible> +struct LaunchRowReduce; + +template <typename F, typename LHS, typename RHS> +struct LaunchRowReduce<F, LHS, RHS, true> { + static void launch(const GpuDevice& device, RHS input, std::size_t rows, + std::size_t cols, LHS output) { + RowReduceCuda(F(), device, input, rows, cols, output); + } +}; + +template <typename F, typename LHS, typename RHS> +struct LaunchRowReduce<F, LHS, RHS, false> { + static void launch(const GpuDevice& device, RHS input, std::size_t rows, + std::size_t cols, LHS output) {} +}; + +template <typename F, typename LHS, typename RHS, bool Compatible> +struct LaunchColReduce; + +template <typename F, typename LHS, typename RHS> +struct LaunchColReduce<F, LHS, RHS, true> { + static void launch(const GpuDevice& device, RHS input, std::size_t rows, + std::size_t cols, LHS output) { + ColumnReduceCuda(F(), device, input, rows, cols, output); + } +}; + +template <typename F, typename LHS, typename RHS> +struct LaunchColReduce<F, LHS, RHS, false> { + static void launch(const GpuDevice& device, RHS input, std::size_t rows, + std::size_t cols, LHS output) {} +}; + +template <typename Expression, typename Device, bool Vectorizable> +class TensorAssignExecutorHelper; + +template <typename OutExpr, typename InExpr, typename Op, typename Indices, + bool Vectorizable> +class TensorAssignExecutorHelper< + const TensorAssignOp< + OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>, + GpuDevice, Vectorizable> { + public: + typedef const TensorAssignOp< + OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const> + Expression; + + typedef typename Expression::Index Index; + typedef TensorEvaluator<OutExpr, GpuDevice> LHSEval; + typedef TensorEvaluator<const InExpr, GpuDevice> RHSEval; + static inline void run(const Expression& expr, const GpuDevice& device) { + std::size_t rows, cols; + const ReductionType reduction_type = + GetReductionType<Op>(expr.rhsExpression().expression(), + expr.rhsExpression(), device, &rows, &cols); + if (reduction_type == UNOPTIMIZED) { + TensorEvaluator<Expression, GpuDevice> evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + const int num_blocks = device.getNumCudaMultiProcessors() * + device.maxCudaThreadsPerMultiProcessor() / + device.maxCudaThreadsPerBlock(); + const int block_size = device.maxCudaThreadsPerBlock(); + const Index size = array_prod(evaluator.dimensions()); + LaunchKernel<Expression, Index, Vectorizable>::launch( + num_blocks, block_size, device, evaluator, size); + } + evaluator.cleanup(); + } else { + LHSEval output(expr.lhsExpression(), device); + RHSEval input(expr.rhsExpression().expression(), device); + bool lhs_needs_assign = output.evalSubExprsIfNeeded(NULL); + bool rhs_needs_assign = input.evalSubExprsIfNeeded(NULL); + if (lhs_needs_assign && rhs_needs_assign) { + const bool Compatible = + 
IsFloatSumReduction<Op>::value || IsFloatMaxReduction<Op>::value; + if (reduction_type == ROW_REDUCE) { + if (IsFloatSumReduction<Op>::value) { + LaunchRowReduce<CudaSumReducer, LHSEval, RHSEval, + Compatible>::launch(device, input, rows, cols, + output); + } else if (IsFloatMaxReduction<Op>::value) { + LaunchRowReduce<CudaMaxReducer, LHSEval, RHSEval, + Compatible>::launch(device, input, rows, cols, + output); + } else { + // Unsupported reduction type + assert(false && "Unsupported reduction function for ROW_REDUCE"); + } + } else { + if (IsFloatSumReduction<Op>::value) { + LaunchColReduce<CudaSumReducer, LHSEval, RHSEval, + Compatible>::launch(device, input, rows, cols, + output); + } else if (IsFloatMaxReduction<Op>::value) { + LaunchColReduce<CudaMaxReducer, LHSEval, RHSEval, + Compatible>::launch(device, input, rows, cols, + output); + } else { + // Unsupported reduction type + assert(false && "Unsupported reduction function for COL_REDUCE"); + } + } + } + input.cleanup(); + output.cleanup(); + } + } +}; + +template <typename OutExpr, typename InExpr, typename Op, typename Indices, + bool Tileable> +inline void TensorExecutor< + const TensorAssignOp< + OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>, + GpuDevice, false, Tileable>::run(const Expression& expr, + const GpuDevice& device) { + TensorAssignExecutorHelper< + const TensorAssignOp< + OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>, + GpuDevice, false>::run(expr, device); +} + +template <typename OutExpr, typename InExpr, typename Op, typename Indices, + bool Tileable> +inline void TensorExecutor< + const TensorAssignOp< + OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>, + GpuDevice, true, Tileable>::run(const Expression& expr, + const GpuDevice& device) { + TensorAssignExecutorHelper< + const TensorAssignOp< + OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>, + GpuDevice, true>::run(expr, device); +} + +template <typename T, typename Index> +struct PtrWrapper { + EIGEN_DEVICE_FUNC PtrWrapper(T* ptr) : m_ptr(ptr) {} + EIGEN_DEVICE_FUNC T& coeffRef(Index i) { return *(m_ptr + i); } + T* m_ptr; +}; + +template <typename Expression, typename Device, bool Vectorizable> +class TensorEvalToExecutorHelper; + +template <typename InExpr, typename Op, typename Indices, bool Vectorizable> +class TensorEvalToExecutorHelper<const TensorEvalToOp<const TensorReductionOp< + Op, const Indices, const InExpr> >, + GpuDevice, Vectorizable> { + public: + typedef const TensorEvalToOp<const TensorReductionOp< + Op, const Indices, const InExpr> > Expression; + typedef typename Expression::Index Index; + typedef TensorEvaluator<const InExpr, GpuDevice> RHSEval; + + static inline void run(const Expression& expr, const GpuDevice& device) { + std::size_t rows, cols; + const ReductionType reduction_type = + GetReductionType<Op>(expr.expression().expression(), expr.expression(), + device, &rows, &cols); + if (reduction_type == UNOPTIMIZED) { + TensorEvaluator<Expression, GpuDevice> evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + const int num_blocks = device.getNumCudaMultiProcessors() * + device.maxCudaThreadsPerMultiProcessor() / + device.maxCudaThreadsPerBlock(); + const int block_size = device.maxCudaThreadsPerBlock(); + const Index size = array_prod(evaluator.dimensions()); + LaunchKernel<Expression, Index, Vectorizable>::launch( + num_blocks, block_size, device, evaluator, size); + } + evaluator.cleanup(); + } 
else { + typedef typename internal::remove_const<typename Expression::Scalar>::type Scalar; + PtrWrapper<Scalar, Index> output(expr.buffer()); + TensorEvaluator<const InExpr, GpuDevice> input( + expr.expression().expression(), device); + typedef PtrWrapper<Scalar, Index> LHSEval; + typedef TensorEvaluator<const InExpr, GpuDevice> RHSEval; + bool rhs_needs_assign = input.evalSubExprsIfNeeded(NULL); + if (rhs_needs_assign) { + const bool Compatible = + IsFloatSumReduction<Op>::value || IsFloatMaxReduction<Op>::value; + if (reduction_type == ROW_REDUCE) { + if (IsFloatSumReduction<Op>::value) { + LaunchRowReduce<CudaSumReducer, LHSEval, RHSEval, + Compatible>::launch(device, input, rows, cols, + output); + } else if (IsFloatMaxReduction<Op>::value) { + LaunchRowReduce<CudaMaxReducer, LHSEval, RHSEval, + Compatible>::launch(device, input, rows, cols, + output); + } + } else { + if (IsFloatSumReduction<Op>::value) { + LaunchColReduce<CudaSumReducer, LHSEval, RHSEval, + Compatible>::launch(device, input, rows, cols, + output); + } else if (IsFloatMaxReduction<Op>::value) { + LaunchColReduce<CudaMaxReducer, LHSEval, RHSEval, + Compatible>::launch(device, input, rows, cols, + output); + } + } + } + input.cleanup(); + } + } +}; + +template <typename InExpr, typename Op, typename Indices, bool Tileable> +inline void +TensorExecutor<const TensorEvalToOp< + const TensorReductionOp<Op, const Indices, const InExpr> >, + GpuDevice, false, Tileable>::run(const Expression& expr, + const GpuDevice& device) { + TensorEvalToExecutorHelper<const TensorEvalToOp<const TensorReductionOp< + Op, const Indices, const InExpr> >, + GpuDevice, false>::run(expr, device); +} + +template <typename InExpr, typename Op, typename Indices, bool Tileable> +inline void +TensorExecutor<const TensorEvalToOp< + const TensorReductionOp<Op, const Indices, const InExpr> >, + GpuDevice, true, Tileable>::run(const Expression& expr, + const GpuDevice& device) { + TensorEvalToExecutorHelper<const TensorEvalToOp<const TensorReductionOp< + Op, const Indices, const InExpr> >, + GpuDevice, true>::run(expr, device); +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // __CUDACC__ +#endif // EIGEN_USE_GPU +#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h new file mode 100644 index 0000000000..fb8ba09dd3 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h @@ -0,0 +1,442 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REF_H +#define EIGEN_CXX11_TENSOR_TENSOR_REF_H + +namespace Eigen { + +namespace internal { + +template <typename Dimensions, typename Scalar> +class TensorLazyBaseEvaluator { + public: + TensorLazyBaseEvaluator() : m_refcount(0) { } + virtual ~TensorLazyBaseEvaluator() { } + + EIGEN_DEVICE_FUNC virtual const Dimensions& dimensions() const = 0; + EIGEN_DEVICE_FUNC virtual const Scalar* data() const = 0; + + EIGEN_DEVICE_FUNC virtual const Scalar coeff(DenseIndex index) const = 0; + EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex index) = 0; + + void incrRefCount() { ++m_refcount; } + void decrRefCount() { --m_refcount; } + int refCount() const { return m_refcount; } + + private: + // No copy, no assigment; + TensorLazyBaseEvaluator(const TensorLazyBaseEvaluator& other); + TensorLazyBaseEvaluator& operator = (const TensorLazyBaseEvaluator& other); + + int m_refcount; +}; + + +template <typename Dimensions, typename Expr, typename Device> +class TensorLazyEvaluatorReadOnly : public TensorLazyBaseEvaluator<Dimensions, typename TensorEvaluator<Expr, Device>::Scalar> { + public: + // typedef typename TensorEvaluator<Expr, Device>::Dimensions Dimensions; + typedef typename TensorEvaluator<Expr, Device>::Scalar Scalar; + + TensorLazyEvaluatorReadOnly(const Expr& expr, const Device& device) : m_impl(expr, device), m_dummy(Scalar(0)) { + m_dims = m_impl.dimensions(); + m_impl.evalSubExprsIfNeeded(NULL); + } + virtual ~TensorLazyEvaluatorReadOnly() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC virtual const Dimensions& dimensions() const { + return m_dims; + } + EIGEN_DEVICE_FUNC virtual const Scalar* data() const { + return m_impl.data(); + } + + EIGEN_DEVICE_FUNC virtual const Scalar coeff(DenseIndex index) const { + return m_impl.coeff(index); + } + EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex /*index*/) { + eigen_assert(false && "can't reference the coefficient of a rvalue"); + return m_dummy; + }; + + protected: + TensorEvaluator<Expr, Device> m_impl; + Dimensions m_dims; + Scalar m_dummy; +}; + +template <typename Dimensions, typename Expr, typename Device> +class TensorLazyEvaluatorWritable : public TensorLazyEvaluatorReadOnly<Dimensions, Expr, Device> { + public: + typedef TensorLazyEvaluatorReadOnly<Dimensions, Expr, Device> Base; + typedef typename Base::Scalar Scalar; + + TensorLazyEvaluatorWritable(const Expr& expr, const Device& device) : Base(expr, device) { + } + virtual ~TensorLazyEvaluatorWritable() { + } + + EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex index) { + return this->m_impl.coeffRef(index); + } +}; + +template <typename Dimensions, typename Expr, typename Device> +class TensorLazyEvaluator : public internal::conditional<bool(internal::is_lvalue<Expr>::value), + TensorLazyEvaluatorWritable<Dimensions, Expr, Device>, + TensorLazyEvaluatorReadOnly<Dimensions, const Expr, Device> >::type { + public: + typedef typename internal::conditional<bool(internal::is_lvalue<Expr>::value), + TensorLazyEvaluatorWritable<Dimensions, Expr, Device>, + TensorLazyEvaluatorReadOnly<Dimensions, const Expr, Device> >::type Base; + typedef typename Base::Scalar Scalar; + + TensorLazyEvaluator(const Expr& expr, const Device& device) : Base(expr, device) { + } + virtual ~TensorLazyEvaluator() { + } +}; + +} // namespace internal + + +/** \class TensorRef + * \ingroup CXX11_Tensor_Module + * + * \brief A reference to a tensor expression + * The expression will be evaluated lazily (as much as possible). 
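+ *
+ * A TensorRef provides a cheap, copyable handle to an expression and, where
+ * possible, computes coefficients on demand instead of materializing the full
+ * result. A minimal usage sketch (illustrative only, names chosen for the
+ * example):
+ * \code
+ * Eigen::Tensor<float, 3> input(3, 5, 7);
+ * input.setRandom();
+ * Eigen::TensorRef<Eigen::Tensor<float, 3> > ref = input * 2.0f;
+ * float x = ref(0, 1, 2);  // evaluates only what this coefficient requires
+ * \endcode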
+ * + */ +template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef<PlainObjectType> > +{ + public: + typedef TensorRef<PlainObjectType> Self; + typedef typename PlainObjectType::Base Base; + typedef typename Eigen::internal::nested<Self>::type Nested; + typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind; + typedef typename internal::traits<PlainObjectType>::Index Index; + typedef typename internal::traits<PlainObjectType>::Scalar Scalar; + typedef typename internal::packet_traits<Scalar>::type Packet; + typedef typename NumTraits<Scalar>::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + typedef Scalar* PointerType; + typedef PointerType PointerArgType; + + static const Index NumIndices = PlainObjectType::NumIndices; + typedef typename PlainObjectType::Dimensions Dimensions; + + enum { + IsAligned = false, + PacketAccess = false, + BlockAccess = false, + Layout = PlainObjectType::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_STRONG_INLINE TensorRef() : m_evaluator(NULL) { + } + + template <typename Expression> + EIGEN_STRONG_INLINE TensorRef(Expression& expr) : m_evaluator(new internal::TensorLazyEvaluator<Dimensions, Expression, DefaultDevice>(expr, DefaultDevice())) { + m_evaluator->incrRefCount(); + } + + template <typename Expression> + EIGEN_STRONG_INLINE TensorRef(const Expression& expr) : m_evaluator(new internal::TensorLazyEvaluator<Dimensions, const Expression, DefaultDevice>(expr, DefaultDevice())) { + m_evaluator->incrRefCount(); + } + + template <typename Expression> + EIGEN_STRONG_INLINE TensorRef& operator = (const Expression& expr) { + unrefEvaluator(); + m_evaluator = new internal::TensorLazyEvaluator<Dimensions, Expression, DefaultDevice>(expr, DefaultDevice()); + m_evaluator->incrRefCount(); + return *this; + } + + ~TensorRef() { + unrefEvaluator(); + } + + TensorRef(const TensorRef& other) : m_evaluator(other.m_evaluator) { + eigen_assert(m_evaluator->refCount() > 0); + m_evaluator->incrRefCount(); + } + + TensorRef(TensorRef& other) : m_evaluator(other.m_evaluator) { + eigen_assert(m_evaluator->refCount() > 0); + m_evaluator->incrRefCount(); + } + + TensorRef& operator = (const TensorRef& other) { + if (this != &other) { + unrefEvaluator(); + m_evaluator = other.m_evaluator; + eigen_assert(m_evaluator->refCount() > 0); + m_evaluator->incrRefCount(); + } + return *this; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index rank() const { return m_evaluator->dimensions().size(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_evaluator->dimensions()[n]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_evaluator->dimensions(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index size() const { return m_evaluator->dimensions().TotalSize(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar* data() const { return m_evaluator->data(); } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index index) const + { + return m_evaluator->coeff(index); + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index firstIndex, IndexTypes... otherIndices) const + { + const std::size_t NumIndices = (sizeof...(otherIndices) + 1); + const array<Index, NumIndices> indices{{firstIndex, otherIndices...}}; + return coeff(indices); + } + template<typename... 
IndexTypes> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices) + { + const std::size_t NumIndices = (sizeof...(otherIndices) + 1); + const array<Index, NumIndices> indices{{firstIndex, otherIndices...}}; + return coeffRef(indices); + } +#else + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1) const + { + array<Index, 2> indices; + indices[0] = i0; + indices[1] = i1; + return coeff(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2) const + { + array<Index, 3> indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + return coeff(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2, Index i3) const + { + array<Index, 4> indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + indices[3] = i3; + return coeff(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const + { + array<Index, 5> indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + indices[3] = i3; + indices[4] = i4; + return coeff(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1) + { + array<Index, 2> indices; + indices[0] = i0; + indices[1] = i1; + return coeffRef(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1, Index i2) + { + array<Index, 3> indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + return coeffRef(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) + { + array<Index, 4> indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + indices[3] = i3; + return coeffRef(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1, Index i2, Index i3, Index i4) + { + array<Index, 5> indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + indices[3] = i3; + indices[4] = i4; + return coeffRef(indices); + } +#endif + + template <std::size_t NumIndices> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar coeff(const array<Index, NumIndices>& indices) const + { + const Dimensions& dims = this->dimensions(); + Index index = 0; + if (PlainObjectType::Options & RowMajor) { + index += indices[0]; + for (int i = 1; i < NumIndices; ++i) { + index = index * dims[i] + indices[i]; + } + } else { + index += indices[NumIndices-1]; + for (int i = NumIndices-2; i >= 0; --i) { + index = index * dims[i] + indices[i]; + } + } + return m_evaluator->coeff(index); + } + template <std::size_t NumIndices> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices) + { + const Dimensions& dims = this->dimensions(); + Index index = 0; + if (PlainObjectType::Options & RowMajor) { + index += indices[0]; + for (int i = 1; i < NumIndices; ++i) { + index = index * dims[i] + indices[i]; + } + } else { + index += indices[NumIndices-1]; + for (int i = NumIndices-2; i >= 0; --i) { + index = index * dims[i] + indices[i]; + } + } + return m_evaluator->coeffRef(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar coeff(Index index) const + { + return m_evaluator->coeff(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) + { + return m_evaluator->coeffRef(index); + } + + private: + EIGEN_STRONG_INLINE void unrefEvaluator() { + if (m_evaluator) { + 
m_evaluator->decrRefCount(); + if (m_evaluator->refCount() == 0) { + delete m_evaluator; + } + } + } + + internal::TensorLazyBaseEvaluator<Dimensions, Scalar>* m_evaluator; +}; + + +// evaluator for rvalues +template<typename Derived, typename Device> +struct TensorEvaluator<const TensorRef<Derived>, Device> +{ + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Packet Packet; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename Derived::Packet PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + + enum { + IsAligned = false, + PacketAccess = false, + BlockAccess = false, + Layout = TensorRef<Derived>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef<Derived>& m, const Device&) + : m_ref(m) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_ref.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_ref.coeff(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + return m_ref.coeffRef(index); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return m_ref.data(); } + + protected: + TensorRef<Derived> m_ref; +}; + + +// evaluator for lvalues +template<typename Derived, typename Device> +struct TensorEvaluator<TensorRef<Derived>, Device> : public TensorEvaluator<const TensorRef<Derived>, Device> +{ + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Packet Packet; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename Derived::Packet PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + + typedef TensorEvaluator<const TensorRef<Derived>, Device> Base; + + enum { + IsAligned = false, + PacketAccess = false, + BlockAccess = false, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(TensorRef<Derived>& m, const Device& d) : Base(m, d) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + return this->m_ref.coeffRef(index); + } +}; + + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_REF_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h new file mode 100644 index 0000000000..44e147de3e --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -0,0 +1,278 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com> +// Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H +#define EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H +namespace Eigen { + +/** \class TensorReverse + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reverse elements class. 
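+ *
+ * The dimensions to flip are selected with an array of booleans, one entry per
+ * dimension. A minimal usage sketch (illustrative only):
+ * \code
+ * Eigen::Tensor<float, 2> t(2, 3);
+ * t.setRandom();
+ * Eigen::array<bool, 2> flip;
+ * flip[0] = true;   // reverse along dimension 0
+ * flip[1] = false;  // keep dimension 1 as is
+ * Eigen::Tensor<float, 2> r = t.reverse(flip);
+ * \endcode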
+ * + */ +namespace internal { +template<typename ReverseDimensions, typename XprType> +struct traits<TensorReverseOp<ReverseDimensions, + XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename ReverseDimensions, typename XprType> +struct eval<TensorReverseOp<ReverseDimensions, XprType>, Eigen::Dense> +{ + typedef const TensorReverseOp<ReverseDimensions, XprType>& type; +}; + +template<typename ReverseDimensions, typename XprType> +struct nested<TensorReverseOp<ReverseDimensions, XprType>, 1, + typename eval<TensorReverseOp<ReverseDimensions, XprType> >::type> +{ + typedef TensorReverseOp<ReverseDimensions, XprType> type; +}; + +} // end namespace internal + +template<typename ReverseDimensions, typename XprType> +class TensorReverseOp : public TensorBase<TensorReverseOp<ReverseDimensions, + XprType>, WriteAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorReverseOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorReverseOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorReverseOp>::type Nested; + typedef typename Eigen::internal::traits<TensorReverseOp>::StorageKind + StorageKind; + typedef typename Eigen::internal::traits<TensorReverseOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp( + const XprType& expr, const ReverseDimensions& reverse_dims) + : m_xpr(expr), m_reverse_dims(reverse_dims) {} + + EIGEN_DEVICE_FUNC + const ReverseDimensions& reverse() const { return m_reverse_dims; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorReverseOp& operator = (const TensorReverseOp& other) + { + typedef TensorAssignOp<TensorReverseOp, const TensorReverseOp> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run( + assign, DefaultDevice()); + return *this; + } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorReverseOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp<TensorReverseOp, const OtherDerived> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run( + assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + const ReverseDimensions m_reverse_dims; +}; + +// Eval as rvalue +template<typename ReverseDimensions, typename ArgType, typename Device> +struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device> +{ + typedef TensorReverseOp<ReverseDimensions, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<ReverseDimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, + 
Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, + const Device& device) + : m_impl(op.expression(), device), m_reverse(op.reverse()) + { + // Compute strides + m_dimensions = m_impl.dimensions(); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_strides[i] = m_strides[i-1] * m_dimensions[i-1]; + } + } else { + m_strides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_strides[i] = m_strides[i+1] * m_dimensions[i+1]; + } + } + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index reverseIndex( + Index index) const { + eigen_assert(index < dimensions().TotalSize()); + Index inputIndex = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + Index idx = index / m_strides[i]; + index -= idx * m_strides[i]; + if (m_reverse[i]) { + idx = m_dimensions[i] - idx - 1; + } + inputIndex += idx * m_strides[i] ; + } + if (m_reverse[0]) { + inputIndex += (m_dimensions[0] - index - 1); + } else { + inputIndex += index; + } + } else { + for (int i = 0; i < NumDims - 1; ++i) { + Index idx = index / m_strides[i]; + index -= idx * m_strides[i]; + if (m_reverse[i]) { + idx = m_dimensions[i] - idx - 1; + } + inputIndex += idx * m_strides[i] ; + } + if (m_reverse[NumDims-1]) { + inputIndex += (m_dimensions[NumDims-1] - index - 1); + } else { + inputIndex += index; + } + } + return inputIndex; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff( + Index index) const { + return m_impl.coeff(reverseIndex(index)); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + // TODO(ndjaitly): write a better packing routine that uses + // local structure. 
+ EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type + values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array<Index, NumDims> m_strides; + TensorEvaluator<ArgType, Device> m_impl; + ReverseDimensions m_reverse; +}; + +// Eval as lvalue + +template <typename ReverseDimensions, typename ArgType, typename Device> +struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device> + : public TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, + Device> { + typedef TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, + Device> Base; + typedef TensorReverseOp<ReverseDimensions, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<ReverseDimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, + const Device& device) + : Base(op, device) {} + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Dimensions& dimensions() const { return this->m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + return this->m_impl.coeffRef(Base::reverseIndex(index)); + } + + template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + // This code is pilfered from TensorMorphing.h + EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; + internal::pstore<CoeffReturnType, PacketReturnType>(values, x); + for (int i = 0; i < packetSize; ++i) { + this->coeffRef(index+i) = values[i]; + } + } + +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h new file mode 100644 index 0000000000..2e59a147bc --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -0,0 +1,412 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H +#define EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H + +namespace Eigen { + +/** \class TensorShuffling + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor shuffling class. 
+ * + * + */ +namespace internal { +template<typename Shuffle, typename XprType> +struct traits<TensorShufflingOp<Shuffle, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename Shuffle, typename XprType> +struct eval<TensorShufflingOp<Shuffle, XprType>, Eigen::Dense> +{ + typedef const TensorShufflingOp<Shuffle, XprType>& type; +}; + +template<typename Shuffle, typename XprType> +struct nested<TensorShufflingOp<Shuffle, XprType>, 1, typename eval<TensorShufflingOp<Shuffle, XprType> >::type> +{ + typedef TensorShufflingOp<Shuffle, XprType> type; +}; + +} // end namespace internal + + + +template<typename Shuffle, typename XprType> +class TensorShufflingOp : public TensorBase<TensorShufflingOp<Shuffle, XprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorShufflingOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorShufflingOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorShufflingOp>::type Nested; + typedef typename Eigen::internal::traits<TensorShufflingOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorShufflingOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shuffle) + : m_xpr(expr), m_shuffle(shuffle) {} + + EIGEN_DEVICE_FUNC + const Shuffle& shufflePermutation() const { return m_shuffle; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorShufflingOp& operator = (const TensorShufflingOp& other) + { + typedef TensorAssignOp<TensorShufflingOp, const TensorShufflingOp> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run( + assign, DefaultDevice()); + return *this; + } + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorShufflingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp<TensorShufflingOp, const OtherDerived> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run( + assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + const Shuffle m_shuffle; +}; + + +// Eval as rvalue +template<typename Shuffle, typename ArgType, typename Device> +struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> +{ + typedef TensorShufflingOp<Shuffle, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const<Scalar>::type ScalarNonConst; + + enum { + IsAligned = false, + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + BlockAccess = 
TensorEvaluator<ArgType, Device>::BlockAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + typedef typename internal::TensorBlock< + Index, typename internal::remove_const<Scalar>::type, NumDims, + TensorEvaluator<ArgType, Device>::Layout> TensorBlock; + typedef typename internal::TensorBlockReader< + Index, typename internal::remove_const<Scalar>::type, NumDims, + TensorEvaluator<ArgType, Device>::Layout, PacketAccess> TensorBlockReader; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_shuffle(op.shufflePermutation()), m_impl(op.expression(), device) + { + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] = input_dims[m_shuffle[i]]; + m_inverseShuffle[m_shuffle[i]] = i; + } + + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_unshuffledInputStrides[0] = 1; + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_unshuffledInputStrides[i] = + m_unshuffledInputStrides[i - 1] * input_dims[i - 1]; + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + } + } else { + m_unshuffledInputStrides[NumDims - 1] = 1; + m_outputStrides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_unshuffledInputStrides[i] = + m_unshuffledInputStrides[i + 1] * input_dims[i + 1]; + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + } + } + + for (int i = 0; i < NumDims; ++i) { + m_inputStrides[i] = m_unshuffledInputStrides[m_shuffle[i]]; + } + + m_block_total_size_max = numext::maxi(static_cast<std::size_t>(1), + device.firstLevelCacheSize() / + sizeof(Scalar)); + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(srcCoeff(index)); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector<internal::TensorOpResourceRequirements>* resources) const { + resources->push_back(internal::TensorOpResourceRequirements( + internal::kUniformAllDims, m_block_total_size_max)); + m_impl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + TensorBlock* output_block) const { + if (m_impl.data() != NULL) { + // Fast path: we have direct access to the data, so shuffle as we read. 
+ TensorBlockReader::Run(output_block, + srcCoeff(output_block->first_coeff_index()), + m_inverseShuffle, + m_unshuffledInputStrides, + m_impl.data()); + return; + } + + // Slow path: read unshuffled block from the input and shuffle in-place. + // Initialize input block sizes using input-to-output shuffle map. + DSizes<Index, NumDims> input_block_sizes; + for (Index i = 0; i < NumDims; ++i) { + input_block_sizes[i] = output_block->block_sizes()[m_inverseShuffle[i]]; + } + + // Calculate input block strides. + DSizes<Index, NumDims> input_block_strides; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + input_block_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + input_block_strides[i] = input_block_strides[i - 1] * + input_block_sizes[i - 1]; + } + } else { + input_block_strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + input_block_strides[i] = input_block_strides[i + 1] * + input_block_sizes[i + 1]; + } + } + + // Read input block. + TensorBlock input_block(srcCoeff(output_block->first_coeff_index()), + input_block_sizes, + input_block_strides, + m_unshuffledInputStrides, + output_block->data()); + + m_impl.block(&input_block); + + // Naive In-place shuffle: random IO but block size is O(L1 cache size). + // TODO(andydavis) Improve the performance of this in-place shuffle. + const Index total_size = input_block_sizes.TotalSize(); + std::vector<bool> bitmap(total_size, false); + ScalarNonConst* data = const_cast<ScalarNonConst*>(output_block->data()); + const DSizes<Index, NumDims>& output_block_strides = + output_block->block_strides(); + for (Index input_index = 0; input_index < total_size; ++input_index) { + if (bitmap[input_index]) { + // Coefficient at this index has already been shuffled. + continue; + } + + Index output_index = GetBlockOutputIndex(input_index, + input_block_strides, + output_block_strides); + if (output_index == input_index) { + // Coefficient already in place. + bitmap[output_index] = true; + continue; + } + + // The following loop starts at 'input_index', and shuffles + // coefficients into their shuffled location at 'output_index'. + // It skips through the array shuffling coefficients by following + // the shuffle cycle starting and ending a 'start_index'. 
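+      // For example, if the block permutation forms the cycle 0 -> 2 -> 5 -> 0,
+      // the loop below places data[0] at offset 2, the evicted data[2] at
+      // offset 5, and finally the evicted data[5] back at offset 0, marking
+      // each visited offset in 'bitmap' so the cycle is traversed only once.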
+ ScalarNonConst evicted_value; + ScalarNonConst shuffled_value = data[input_index]; + do { + evicted_value = data[output_index]; + data[output_index] = shuffled_value; + shuffled_value = evicted_value; + bitmap[output_index] = true; + output_index = GetBlockOutputIndex(output_index, + input_block_strides, + output_block_strides); + } while (output_index != input_index); + + data[output_index] = shuffled_value; + bitmap[output_index] = true; + } + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex( + Index input_index, + const DSizes<Index, NumDims>& input_block_strides, + const DSizes<Index, NumDims>& output_block_strides) const { + Index output_index = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = input_index / input_block_strides[i]; + output_index += idx * output_block_strides[m_inverseShuffle[i]]; + input_index -= idx * input_block_strides[i]; + } + return output_index + input_index * + output_block_strides[m_inverseShuffle[0]]; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = input_index / input_block_strides[i]; + output_index += idx * output_block_strides[m_inverseShuffle[i]]; + input_index -= idx * input_block_strides[i]; + } + return output_index + input_index * + output_block_strides[m_inverseShuffle[NumDims - 1]]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { + Index inputIndex = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + return inputIndex + index * m_inputStrides[0]; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + return inputIndex + index * m_inputStrides[NumDims - 1]; + } + } + + const Shuffle& m_shuffle; + Dimensions m_dimensions; + array<Index, NumDims> m_inverseShuffle; + array<Index, NumDims> m_outputStrides; + array<Index, NumDims> m_inputStrides; + array<Index, NumDims> m_unshuffledInputStrides; + TensorEvaluator<ArgType, Device> m_impl; + std::size_t m_block_total_size_max; +}; + + +// Eval as lvalue +template<typename Shuffle, typename ArgType, typename Device> +struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device> + : public TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> +{ + typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Base; + + typedef TensorShufflingOp<Shuffle, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + + enum { + IsAligned = false, + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + }; + + typedef typename internal::TensorBlock< + Index, typename internal::remove_const<Scalar>::type, NumDims, + TensorEvaluator<ArgType, Device>::Layout> TensorBlock; + typedef typename internal::TensorBlockWriter< + Index, typename internal::remove_const<Scalar>::type, NumDims, + TensorEvaluator<ArgType, 
Device>::Layout, PacketAccess> TensorBlockWriter; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + template <int StoreMode> EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + static const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + internal::pstore<CoeffReturnType, PacketReturnType>(values, x); + for (int i = 0; i < packetSize; ++i) { + this->coeffRef(index+i) = values[i]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlock& block) { + eigen_assert(this->m_impl.data() != NULL); + TensorBlockWriter::Run(block, this->srcCoeff(block.first_coeff_index()), + this->m_inverseShuffle, + this->m_unshuffledInputStrides, this->m_impl.data()); + } +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h new file mode 100644 index 0000000000..cfde4fdc72 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -0,0 +1,247 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler <christian@iwakd.de> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSORSTORAGE_H +#define EIGEN_CXX11_TENSOR_TENSORSTORAGE_H + +#ifdef EIGEN_TENSOR_STORAGE_CTOR_PLUGIN + #define EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN EIGEN_TENSOR_STORAGE_CTOR_PLUGIN; +#else + #define EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN +#endif + +namespace Eigen { + +/** \internal + * + * \class TensorStorage + * \ingroup CXX11_Tensor_Module + * + * \brief Stores the data of a tensor + * + * This class stores the data of fixed-size, dynamic-size or mixed tensors + * in a way as compact as possible. 
+ * + * \sa Tensor + */ +template<typename T, typename Dimensions, int Options_> class TensorStorage; + + +// Pure fixed-size storage +template<typename T, int Options_, typename FixedDimensions> +class TensorStorage<T, FixedDimensions, Options_> +{ + private: + static const std::size_t Size = FixedDimensions::total_size; + + EIGEN_ALIGN_DEFAULT T m_data[Size]; + FixedDimensions m_dimensions; + + public: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorStorage() { + EIGEN_STATIC_ASSERT(Size == FixedDimensions::total_size, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T *data() { return m_data; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T *data() const { return m_data; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const FixedDimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE DenseIndex size() const { return m_dimensions.TotalSize(); } +}; + + +// pure dynamic +template<typename T, int Options_, typename IndexType, std::size_t NumIndices_> +class TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_> +{ + public: + typedef IndexType Index; + typedef DSizes<IndexType, NumIndices_> Dimensions; + typedef TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_> Self; + + EIGEN_DEVICE_FUNC TensorStorage() + : m_data(NumIndices_ ? 0 : internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(1)) + , m_dimensions() {} + + EIGEN_DEVICE_FUNC TensorStorage(internal::constructor_without_unaligned_array_assert) + : m_data(NumIndices_ ? 0 : internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(1)) + , m_dimensions(internal::template repeat<NumIndices_, Index>(0)) {} + + EIGEN_DEVICE_FUNC TensorStorage(Index size, const array<Index, NumIndices_>& dimensions) + : m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size)), m_dimensions(dimensions) + { EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN } + + EIGEN_DEVICE_FUNC TensorStorage(const Self& other) + : m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(internal::array_prod(other.m_dimensions))) + , m_dimensions(other.m_dimensions) + { + internal::smart_copy(other.m_data, other.m_data+internal::array_prod(other.m_dimensions), m_data); + } + EIGEN_DEVICE_FUNC Self& operator=(const Self& other) + { + if (this != &other) { + Self tmp(other); + this->swap(tmp); + } + return *this; + } + + EIGEN_DEVICE_FUNC ~TensorStorage() { internal::conditional_aligned_delete_auto<T,(Options_&DontAlign)==0>(m_data, internal::array_prod(m_dimensions)); } + EIGEN_DEVICE_FUNC void swap(Self& other) + { numext::swap(m_data,other.m_data); numext::swap(m_dimensions,other.m_dimensions); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {return m_dimensions;} + + EIGEN_DEVICE_FUNC void resize(Index size, const array<Index, NumIndices_>& nbDimensions) + { + const Index currentSz = internal::array_prod(m_dimensions); + if(size != currentSz) + { + internal::conditional_aligned_delete_auto<T,(Options_&DontAlign)==0>(m_data, currentSz); + if (size) + m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size); + else + m_data = 0; + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + } + m_dimensions = nbDimensions; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T *data() { return m_data; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T *data() const { return m_data; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); } + + private: + T *m_data; + 
Dimensions m_dimensions; +}; + + +// pure dynamic +template<typename T, int Options_> +class TensorStorage<T, VSizes<DenseIndex>, Options_> +{ + T* m_data; + VSizes<DenseIndex> m_dimensions; + typedef TensorStorage<T, VSizes<DenseIndex>, Options_> Self_; + + public: + EIGEN_DEVICE_FUNC TensorStorage() : m_data(0), m_dimensions() {} + + template <DenseIndex NumDims> + EIGEN_DEVICE_FUNC TensorStorage(const array<DenseIndex, NumDims>& dimensions) + { + m_dimensions.resize(NumDims); + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] = dimensions[i]; + } + const DenseIndex size = array_prod(dimensions); + m_data = internal::conditional_managed_new_auto<T,(Options_&DontAlign)==0,(Options_&AllocateUVM)>(size); + EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN + } + + EIGEN_DEVICE_FUNC TensorStorage(const std::vector<DenseIndex>& dimensions) + : m_dimensions(dimensions) + { + const DenseIndex size = internal::array_prod(dimensions); + m_data = internal::conditional_managed_new_auto<T,(Options_&DontAlign)==0,(Options_&AllocateUVM)>(size); + EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> EIGEN_DEVICE_FUNC + TensorStorage(IndexTypes... dimensions) { + const int NumDims = sizeof...(dimensions); + m_dimensions.resize(NumDims); + const array<DenseIndex, NumDims> dim{{dimensions...}}; + DenseIndex size = 1; + for (int i = 0; i < NumDims; ++i) { + size *= dim[i]; + m_dimensions[i] = dim[i]; + } + m_data = internal::conditional_managed_new_auto<T,(Options_&DontAlign)==0,(Options_&AllocateUVM)>(size); + EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN + } +#endif + + EIGEN_DEVICE_FUNC TensorStorage(const Self_& other) + : m_data(internal::conditional_managed_new_auto<T,(Options_&DontAlign)==0,(Options_&AllocateUVM)>(internal::array_prod(other.m_dimensions))) + , m_dimensions(other.m_dimensions) + { + internal::smart_copy(other.m_data, other.m_data+internal::array_prod(other.m_dimensions), m_data); + } + + EIGEN_DEVICE_FUNC Self_& operator=(const Self_& other) + { + if (this != &other) { + Self_ tmp(other); + this->swap(tmp); + } + return *this; + } + + EIGEN_DEVICE_FUNC ~TensorStorage() + { + internal::conditional_managed_delete_auto<T,(Options_&DontAlign)==0,(Options_&AllocateUVM)>(m_data, internal::array_prod(m_dimensions)); + } + + EIGEN_DEVICE_FUNC void swap(Self_& other) + { std::swap(m_data,other.m_data); std::swap(m_dimensions,other.m_dimensions); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const VSizes<DenseIndex>& dimensions() const { return m_dimensions; } + + template <typename NewDimensions> EIGEN_DEVICE_FUNC + void resize(DenseIndex size, const NewDimensions& nbDimensions) + { + const DenseIndex currentSz = internal::array_prod(m_dimensions); + if(size != currentSz) + { + internal::conditional_managed_delete_auto<T,(Options_&DontAlign)==0,(Options_&AllocateUVM)>(m_data, currentSz); + if (size) + m_data = internal::conditional_managed_new_auto<T,(Options_&DontAlign)==0,(Options_&AllocateUVM)>(size); + else + m_data = 0; + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + } + m_dimensions.resize(internal::array_size<NewDimensions>::value); + for (int i = 0; i < internal::array_size<NewDimensions>::value; ++i) { + m_dimensions[i] = nbDimensions[i]; + } + } + EIGEN_DEVICE_FUNC void resize(DenseIndex size, const std::vector<DenseIndex>& nbDimensions) + { + const DenseIndex currentSz = internal::array_prod(m_dimensions); + if(size != currentSz) + { + 
internal::conditional_managed_delete_auto<T,(Options_&DontAlign)==0,(Options_&AllocateUVM)>(m_data, currentSz); + if (size) + m_data = internal::conditional_managed_new_auto<T,(Options_&DontAlign)==0,(Options_&AllocateUVM)>(size); + else + m_data = 0; + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + } + m_dimensions = nbDimensions; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T *data() { return m_data; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T *data() const { return m_data; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex size() const { return m_dimensions.TotalSize(); } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSORSTORAGE_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h new file mode 100644 index 0000000000..8abe5ea8e4 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -0,0 +1,329 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H +#define EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H + +namespace Eigen { + +/** \class TensorStriding + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor striding class. + * + * + */ +namespace internal { +template<typename Strides, typename XprType> +struct traits<TensorStridingOp<Strides, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename Strides, typename XprType> +struct eval<TensorStridingOp<Strides, XprType>, Eigen::Dense> +{ + typedef const TensorStridingOp<Strides, XprType>& type; +}; + +template<typename Strides, typename XprType> +struct nested<TensorStridingOp<Strides, XprType>, 1, typename eval<TensorStridingOp<Strides, XprType> >::type> +{ + typedef TensorStridingOp<Strides, XprType> type; +}; + +} // end namespace internal + + + +template<typename Strides, typename XprType> +class TensorStridingOp : public TensorBase<TensorStridingOp<Strides, XprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorStridingOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorStridingOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorStridingOp>::type Nested; + typedef typename Eigen::internal::traits<TensorStridingOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorStridingOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingOp(const XprType& expr, const Strides& dims) + : m_xpr(expr), m_dims(dims) {} + + EIGEN_DEVICE_FUNC + const Strides& strides() const { return m_dims; } + + 
EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorStridingOp& operator = (const TensorStridingOp& other) + { + typedef TensorAssignOp<TensorStridingOp, const TensorStridingOp> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run( + assign, DefaultDevice()); + return *this; + } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorStridingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp<TensorStridingOp, const OtherDerived> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run( + assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + const Strides m_dims; +}; + + +// Eval as rvalue +template<typename Strides, typename ArgType, typename Device> +struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> +{ + typedef TensorStridingOp<Strides, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + + enum { + IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + m_dimensions = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] = ceilf(static_cast<float>(m_dimensions[i]) / op.strides()[i]); + } + + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_outputStrides[0] = 1; + m_inputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_inputStrides[i-1] *= op.strides()[i-1]; + } + m_inputStrides[NumDims-1] *= op.strides()[NumDims-1]; + } else { // RowMajor + m_outputStrides[NumDims-1] = 1; + m_inputStrides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + m_inputStrides[i+1] *= op.strides()[i+1]; + } + m_inputStrides[0] *= op.strides()[0]; + } + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(srcCoeff(index)); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + 
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStrides[i]; + const Index idx1 = indices[1] / m_outputStrides[i]; + inputIndices[0] += idx0 * m_inputStrides[i]; + inputIndices[1] += idx1 * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += indices[0] * m_inputStrides[0]; + inputIndices[1] += indices[1] * m_inputStrides[0]; + } else { // RowMajor + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / m_outputStrides[i]; + const Index idx1 = indices[1] / m_outputStrides[i]; + inputIndices[0] += idx0 * m_inputStrides[i]; + inputIndices[1] += idx1 * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += indices[0] * m_inputStrides[NumDims-1]; + inputIndices[1] += indices[1] * m_inputStrides[NumDims-1]; + } + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]); + return rslt; + } + else { + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + values[0] = m_impl.coeff(inputIndices[0]); + values[packetSize-1] = m_impl.coeff(inputIndices[1]); + for (int i = 1; i < packetSize-1; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const + { + Index inputIndex = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += index * m_inputStrides[0]; + } else { // RowMajor + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += index * m_inputStrides[NumDims-1]; + } + return inputIndex; + } + + Dimensions m_dimensions; + array<Index, NumDims> m_outputStrides; + array<Index, NumDims> m_inputStrides; + TensorEvaluator<ArgType, Device> m_impl; +}; + + +// Eval as lvalue +template<typename Strides, typename ArgType, typename Device> +struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device> + : public TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> +{ + typedef TensorStridingOp<Strides, ArgType> XprType; + typedef TensorEvaluator<const XprType, Device> Base; + // typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + // typedef DSizes<Index, NumDims> Dimensions; + + enum { + IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : 
Base(op, device) { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) + { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < this->dimensions().TotalSize()); + + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / this->m_outputStrides[i]; + const Index idx1 = indices[1] / this->m_outputStrides[i]; + inputIndices[0] += idx0 * this->m_inputStrides[i]; + inputIndices[1] += idx1 * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += indices[0] * this->m_inputStrides[0]; + inputIndices[1] += indices[1] * this->m_inputStrides[0]; + } else { // RowMajor + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / this->m_outputStrides[i]; + const Index idx1 = indices[1] / this->m_outputStrides[i]; + inputIndices[0] += idx0 * this->m_inputStrides[i]; + inputIndices[1] += idx1 * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += indices[0] * this->m_inputStrides[NumDims-1]; + inputIndices[1] += indices[1] * this->m_inputStrides[NumDims-1]; + } + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + this->m_impl.template writePacket<Unaligned>(inputIndices[0], x); + } + else { + EIGEN_ALIGN_DEFAULT Scalar values[packetSize]; + internal::pstore<Scalar, PacketReturnType>(values, x); + this->m_impl.coeffRef(inputIndices[0]) = values[0]; + this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1]; + for (int i = 1; i < packetSize-1; ++i) { + this->coeffRef(index+i) = values[i]; + } + } + } +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h new file mode 100644 index 0000000000..b8c1eadfc3 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h @@ -0,0 +1,294 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H +#define EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H + +namespace Eigen { +namespace internal { + + +template<typename Scalar, int Options> +class compute_tensor_flags +{ + enum { + is_dynamic_size_storage = 1, + + aligned_bit = + ( + ((Options&DontAlign)==0) && ( +#if EIGEN_ALIGN_STATICALLY + (!is_dynamic_size_storage) +#else + 0 +#endif + || +#if EIGEN_ALIGN + is_dynamic_size_storage +#else + 0 +#endif + ) + ) ? 
AlignedBit : 0, + packet_access_bit = packet_traits<Scalar>::Vectorizable && aligned_bit ? PacketAccessBit : 0 + }; + + public: + enum { ret = packet_access_bit | aligned_bit}; +}; + + +template<typename Scalar_, std::size_t NumIndices_, int Options_, typename IndexType_> +struct traits<Tensor<Scalar_, NumIndices_, Options_, IndexType_> > +{ + typedef Scalar_ Scalar; + typedef Dense StorageKind; + typedef IndexType_ Index; + static const int NumDimensions = NumIndices_; + static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor; + enum { + Options = Options_, + Flags = compute_tensor_flags<Scalar_, Options_>::ret | (is_const<Scalar_>::value ? 0 : LvalueBit), + }; +}; + + +template<typename Scalar_, typename Dimensions, int Options_, typename IndexType_> +struct traits<TensorFixedSize<Scalar_, Dimensions, Options_, IndexType_> > +{ + typedef Scalar_ Scalar; + typedef Dense StorageKind; + typedef IndexType_ Index; + static const int NumDimensions = array_size<Dimensions>::value; + static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor; + enum { + Options = Options_, + Flags = compute_tensor_flags<Scalar_, Options_>::ret | (is_const<Scalar_>::value ? 0: LvalueBit), + }; +}; + + +template<typename Scalar_, int Options_, typename IndexType_> +struct traits<TensorVarDim<Scalar_, Options_, IndexType_> > +{ + typedef Scalar_ Scalar; + typedef Dense StorageKind; + typedef IndexType_ Index; + static const int NumDimensions = -1; + static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor; + enum { + Options = Options_, + Flags = compute_tensor_flags<Scalar_, Options_>::ret | (is_const<Scalar_>::value ? 0 : LvalueBit), + }; +}; + +template<typename PlainObjectType, int Options_> +struct traits<TensorMap<PlainObjectType, Options_> > + : public traits<PlainObjectType> +{ + typedef traits<PlainObjectType> BaseTraits; + typedef typename BaseTraits::Scalar Scalar; + typedef typename BaseTraits::StorageKind StorageKind; + typedef typename BaseTraits::Index Index; + static const int NumDimensions = BaseTraits::NumDimensions; + static const int Layout = BaseTraits::Layout; + enum { + Options = Options_, + Flags = (BaseTraits::Flags & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0), + }; +}; + +template<typename PlainObjectType> +struct traits<TensorRef<PlainObjectType> > + : public traits<PlainObjectType> +{ + typedef traits<PlainObjectType> BaseTraits; + typedef typename BaseTraits::Scalar Scalar; + typedef typename BaseTraits::StorageKind StorageKind; + typedef typename BaseTraits::Index Index; + static const int NumDimensions = BaseTraits::NumDimensions; + static const int Layout = BaseTraits::Layout; + enum { + Options = BaseTraits::Options, + Flags = (BaseTraits::Flags & ~AlignedBit) | (Options&Aligned ? 
AlignedBit : 0), + }; +}; + + +template<typename _Scalar, std::size_t NumIndices_, int Options, typename IndexType_> +struct eval<Tensor<_Scalar, NumIndices_, Options, IndexType_>, Eigen::Dense> +{ + typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>& type; +}; + +template<typename _Scalar, std::size_t NumIndices_, int Options, typename IndexType_> +struct eval<const Tensor<_Scalar, NumIndices_, Options, IndexType_>, Eigen::Dense> +{ + typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>& type; +}; + +template<typename Scalar_, typename Dimensions, int Options, typename IndexType_> +struct eval<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, Eigen::Dense> +{ + typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type; +}; + +template<typename Scalar_, typename Dimensions, int Options, typename IndexType_> +struct eval<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, Eigen::Dense> +{ + typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type; +}; + +template<typename Scalar_, int Options, typename IndexType_> +struct eval<TensorVarDim<Scalar_, Options, IndexType_>, Eigen::Dense> +{ + typedef const TensorVarDim<Scalar_, Options, IndexType_>& type; +}; + +template<typename Scalar_, int Options, typename IndexType_> +struct eval<const TensorVarDim<Scalar_, Options, IndexType_>, Eigen::Dense> +{ + typedef const TensorVarDim<Scalar_, Options, IndexType_>& type; +}; + +template<typename PlainObjectType, int Options> +struct eval<TensorMap<PlainObjectType, Options>, Eigen::Dense> +{ + typedef const TensorMap<PlainObjectType, Options>& type; +}; + +template<typename PlainObjectType, int Options> +struct eval<const TensorMap<PlainObjectType, Options>, Eigen::Dense> +{ + typedef const TensorMap<PlainObjectType, Options>& type; +}; + +template<typename PlainObjectType> +struct eval<TensorRef<PlainObjectType>, Eigen::Dense> +{ + typedef const TensorRef<PlainObjectType>& type; +}; + +template<typename PlainObjectType> +struct eval<const TensorRef<PlainObjectType>, Eigen::Dense> +{ + typedef const TensorRef<PlainObjectType>& type; +}; + + +template <typename Scalar_, std::size_t NumIndices_, int Options_, typename IndexType_> +struct nested<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, 1, typename eval<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >::type> +{ + typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>& type; +}; + +template <typename Scalar_, std::size_t NumIndices_, int Options_, typename IndexType_> +struct nested<const Tensor<Scalar_, NumIndices_, Options_, IndexType_>, 1, typename eval<const Tensor<Scalar_, NumIndices_, Options_, IndexType_> >::type> +{ + typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>& type; +}; + +template <typename Scalar_, typename Dimensions, int Options, typename IndexType_> +struct nested<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, 1, typename eval<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> >::type> +{ + typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type; +}; + +template <typename Scalar_, typename Dimensions, int Options, typename IndexType_> +struct nested<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, 1, typename eval<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> >::type> +{ + typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type; +}; + +template <typename Scalar_, int Options> +struct nested<TensorVarDim<Scalar_, 
Options>, 1, typename eval<TensorVarDim<Scalar_, Options> >::type> +{ + typedef const TensorVarDim<Scalar_, Options>& type; +}; + +template <typename Scalar_, int Options> +struct nested<const TensorVarDim<Scalar_, Options>, 1, typename eval<const TensorVarDim<Scalar_, Options> >::type> +{ + typedef const TensorVarDim<Scalar_, Options>& type; +}; + + +template <typename PlainObjectType, int Options> +struct nested<TensorMap<PlainObjectType, Options>, 1, typename eval<TensorMap<PlainObjectType, Options> >::type> +{ + typedef const TensorMap<PlainObjectType, Options>& type; +}; + +template <typename PlainObjectType, int Options> +struct nested<const TensorMap<PlainObjectType, Options>, 1, typename eval<TensorMap<PlainObjectType, Options> >::type> +{ + typedef const TensorMap<PlainObjectType, Options>& type; +}; + +template <typename PlainObjectType> +struct nested<TensorRef<PlainObjectType>, 1, typename eval<TensorRef<PlainObjectType> >::type> +{ + typedef const TensorRef<PlainObjectType>& type; +}; + +template <typename PlainObjectType> +struct nested<const TensorRef<PlainObjectType>, 1, typename eval<TensorRef<PlainObjectType> >::type> +{ + typedef const TensorRef<PlainObjectType>& type; +}; + +} // end namespace internal + +// Convolutional layers take in an input tensor of shape (D, R, C, B), or (D, C, +// R, B), and convolve it with a set of filters, which can also be presented as +// a tensor (D, K, K, M), where M is the number of filters, K is the filter +// size, and each 3-dimensional tensor of size (D, K, K) is a filter. For +// simplicity we assume that we always use square filters (which is usually the +// case in images), hence the two Ks in the tensor dimension. It also takes in +// a few additional parameters: +// Stride (S): The convolution stride is the offset between locations where we +// apply the filters. A larger stride means that the output will be +// spatially smaller. +// Padding (P): The padding we apply to the input tensor along the R and C +// dimensions. This is usually used to make sure that the spatial +// dimensions of the output matches our intention. +// +// Two types of padding are often used: +// SAME: The pad value is computed so that the output will have size +// R/S and C/S. +// VALID: no padding is carried out. +// When we do padding, the padded values at the padded locations are usually +// zero. +// +// The output dimensions for convolution, when given all the parameters above, +// are as follows: +// When Padding = SAME: the output size is (B, R', C', M), where +// R' = ceil(float(R) / float(S)) +// C' = ceil(float(C) / float(S)) +// where ceil is the ceiling function. The input tensor is padded with 0 as +// needed. The number of padded rows and columns are computed as: +// Pr = ((R' - 1) * S + K - R) / 2 +// Pc = ((C' - 1) * S + K - C) / 2 +// when the stride is 1, we have the simplified case R'=R, C'=C, Pr=Pc=(K-1)/2. +// This is where SAME comes from - the output has the same size as the input has. +// When Padding = VALID: the output size is computed as +// R' = ceil(float(R - K + 1) / float(S)) +// C' = ceil(float(C - K + 1) / float(S)) +// and the number of padded rows and columns are computed in the same way as in +// the SAME case. +// When the stride is 1, we have the simplified case R'=R-K+1, C'=C-K+1, Pr=0, +// Pc=0. 
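The padding arithmetic described in the comment above is easy to get wrong in integer code. The following minimal sketch is illustrative only and is not part of the Eigen sources; the struct and helper name are hypothetical. It computes, for one spatial dimension, the output extent and the total zero padding for the SAME and VALID schemes using the formulas given above.

#include <algorithm>
#include <cassert>

struct PaddedShape {
  long out;  // output extent R' (or C')
  long pad;  // total zero padding along this dimension (per side: pad / 2)
};

// SAME:  out = ceil(R / S),           pad = max((out - 1) * S + K - R, 0)
// VALID: out = ceil((R - K + 1) / S), pad = 0
inline PaddedShape ComputePaddedShape(long R, long K, long S, bool same_padding) {
  assert(R > 0 && K > 0 && S > 0);
  PaddedShape result;
  if (same_padding) {
    result.out = (R + S - 1) / S;                           // ceil(R / S)
    result.pad = std::max((result.out - 1) * S + K - R, 0L);
  } else {
    result.out = (R - K + 1 + S - 1) / S;                   // ceil((R - K + 1) / S)
    result.pad = 0;
  }
  return result;
}

// With R = 5, K = 3, S = 1 and SAME padding this yields out = 5 and pad = 2,
// i.e. Pr = pad / 2 = 1 = (K - 1) / 2, matching the simplified stride-1 case
// described above.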
+typedef enum { + PADDING_VALID = 1, + PADDING_SAME = 2, +} PaddingType; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorTrueIndices.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorTrueIndices.h new file mode 100644 index 0000000000..ec1d44e6a6 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorTrueIndices.h @@ -0,0 +1,250 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Eugene Brevdo <ebrevdo@google.com> +// Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRUE_INDICES_H +#define EIGEN_CXX11_TENSOR_TENSOR_TRUE_INDICES_H +namespace Eigen { + +/** \class TensorTrueIndices + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor provide indices of true values class. + * + */ +namespace internal { +template<typename XprType> +struct traits<TensorTrueIndicesOp<XprType> > : public traits<XprType> +{ + typedef DenseIndex Scalar; + typedef DenseIndex CoeffReturnType; + typedef traits<XprType> XprTraits; + //typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = 2; // XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename XprType> +struct eval<TensorTrueIndicesOp<XprType>, Eigen::Dense> +{ + typedef const TensorTrueIndicesOp<XprType>& type; +}; + +template<typename XprType> +struct nested<TensorTrueIndicesOp<XprType>, 1, + typename eval<TensorTrueIndicesOp<XprType> >::type> +{ + typedef TensorTrueIndicesOp<XprType> type; +}; + +} // end namespace internal + +template<typename XprType> +class TensorTrueIndicesOp : public TensorBase<TensorTrueIndicesOp<XprType>, WriteAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorTrueIndicesOp>::Scalar Scalar; + //typedef typename Eigen::internal::traits<TensorTrueIndicesOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename Eigen::internal::traits<TensorTrueIndicesOp>::CoeffReturnType CoeffReturnType; + typedef typename internal::packet_traits<CoeffReturnType>::type PacketReturnType; + typedef typename Eigen::internal::nested<TensorTrueIndicesOp>::type Nested; + typedef typename Eigen::internal::traits<TensorTrueIndicesOp>::StorageKind + StorageKind; + typedef typename Eigen::internal::traits<TensorTrueIndicesOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTrueIndicesOp( + const XprType& expr, const CoeffReturnType& not_found = -1) + : m_xpr(expr), m_not_found(not_found) { + } + + EIGEN_DEVICE_FUNC + const CoeffReturnType& not_found() const { return m_not_found; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorTrueIndicesOp& operator = (const TensorTrueIndicesOp& other) + { + typedef TensorAssignOp<TensorTrueIndicesOp, const TensorTrueIndicesOp> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, 
DefaultDevice>::run( + assign, DefaultDevice()); + return *this; + } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorTrueIndicesOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp<TensorTrueIndicesOp, const OtherDerived> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run( + assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + CoeffReturnType m_not_found; +}; + +// Eval as rvalue +template<typename ArgType, typename Device> +struct TensorEvaluator<const TensorTrueIndicesOp<ArgType>, Device> +{ + typedef TensorTrueIndicesOp<ArgType> XprType; + typedef typename XprType::Index InputIndex; + typedef typename XprType::Index Index; + static const int NumDims = 2; + typedef DSizes<Index, 2> Dimensions; + typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions; + static const int NumInputDims = internal::array_size<InputDimensions>::value; + + enum { + IsAligned = true, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, + const Device& device) + : m_impl(op.expression(), device), m_not_found(op.not_found()) + { + // Store original dimensions + m_orig_dimensions = m_impl.dimensions(); + + // Calculate output dimensions + m_dimensions[0] = m_orig_dimensions.TotalSize(); + m_dimensions[1] = NumInputDims; + + // Calculate strides of input expression + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_strides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + m_strides[i] = m_strides[i-1] * m_orig_dimensions[i-1]; + } + } else { + m_strides[NumInputDims-1] = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + m_strides[i] = m_strides[i+1] * m_orig_dimensions[i+1]; + } + } + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE InputIndex origIndices( + Index index) const { + eigen_assert(index < dimensions().TotalSize()); + Index inputIndex = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + inputIndex = index % m_dimensions[0]; + } else { + inputIndex = index / m_dimensions[1]; + } + return inputIndex; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int whichDim( + Index index) const { + eigen_assert(index < dimensions().TotalSize()); + int inputDim = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + inputDim = index / m_dimensions[0]; + } else { + inputDim = index % m_dimensions[1]; + } + return inputDim; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType origDim( + int dim, InputIndex index) const { + eigen_assert(index < m_orig_dimensions.TotalSize()); + eigen_assert(dim > -1 && dim < m_orig_dimensions.size()); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumInputDims - 1; i > 0; --i) { + Index idx = index / m_strides[i]; + if (i == dim) return idx; 
// Found our dimension + index -= idx * m_strides[i]; + } + return index; + } else { + for (int i = 0; i < NumInputDims - 1; ++i) { + Index idx = index / m_strides[i]; + if (i == dim) return idx; // Found our dimension + index -= idx * m_strides[i]; + } + return index; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff( + Index index) const { + InputIndex orig_index = origIndices(index); + if (m_impl.coeff(orig_index)) + return origDim(whichDim(index), orig_index); + else { + return m_not_found; + } + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + // TODO(ndjaitly): write a better packing routine that uses + // local structure. + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type + values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + InputDimensions m_orig_dimensions; + Dimensions m_dimensions; + TensorEvaluator<ArgType, Device> m_impl; + array<Index, NumInputDims> m_strides; + CoeffReturnType m_not_found; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_TRUE_INDICES_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorVarDim.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorVarDim.h new file mode 100644 index 0000000000..49954b955e --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorVarDim.h @@ -0,0 +1,315 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_VAR_DIM_H +#define EIGEN_CXX11_TENSOR_TENSOR_VAR_DIM_H + +namespace Eigen { + +/** \class Tensor + * \ingroup CXX11_Tensor_Module + * + * \brief A version of the tensor class that supports a variable number of dimensions. + * + * The variable equivalent of + * Eigen::Tensor<float, 3> t(3, 5, 7); + * is + * Eigen::TensorVarDim<float> t(3, 5, 7); + */ + +template<typename Scalar_, int Options_, typename IndexType_> +class TensorVarDim : public TensorBase<TensorVarDim<Scalar_, Options_, IndexType_> > +{ + public: + typedef TensorVarDim<Scalar_, Options_, IndexType_> Self; + typedef TensorBase<TensorVarDim<Scalar_, Options_, IndexType_> > Base; + typedef typename Eigen::internal::nested<Self>::type Nested; + typedef typename internal::traits<Self>::StorageKind StorageKind; + typedef typename internal::traits<Self>::Index Index; + typedef Scalar_ Scalar; + typedef typename internal::packet_traits<Scalar>::type Packet; + typedef typename NumTraits<Scalar>::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + typedef typename Base::PacketReturnType PacketReturnType; + + enum { + IsAligned = bool(EIGEN_ALIGN) & !(Options_ & DontAlign), + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + BlockAccess = false, + Layout = Options_ & RowMajor ? 
RowMajor : ColMajor, + // disabled for now as the number of coefficients is not known by the + // caller at compile time. + CoordAccess = false, + }; + + static const int Options = Options_; + + static const Index NumIndices = Dynamic; + + typedef VSizes<Index> Dimensions; + + protected: + TensorStorage<Scalar, VSizes<Index>, Options_> m_storage; + + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return m_storage.dimensions().size(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } + + // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + // work, because that uses base().coeffRef() - and we don't yet + // implement a similar class hierarchy + inline Self& base() { return *this; } + inline const Self& base() const { return *this; } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + EIGEN_DEVICE_FUNC inline const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + static const std::size_t NumIndices = sizeof...(otherIndices) + 2; + return coeff(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}}); + } +#endif + + template <std::size_t NumIndices> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(const array<Index, NumIndices>& indices) const + { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + inline Scalar& coeffRef(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) + { + static const std::size_t NumIndices = sizeof...(otherIndices) + 2; + return coeffRef(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}}); + } +#endif + + template <std::size_t NumIndices> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices) + { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) + { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + inline const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
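+      // The indices are packed into a fixed-size array and dispatched to the
+      // array-based operator() overload defined below.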
+ static const std::size_t NumIndices = sizeof...(otherIndices) + 2; + return this->operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}}); + } +#endif + + template <std::size_t NumIndices> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const + { + eigen_assert(checkIndexRange(indices)); + return coeff(indices); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return coeff(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const + { + return coeff(index); + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + inline Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + static const size_t NumIndices = sizeof...(otherIndices) + 1; + return operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}}); + } +#endif + + template <std::size_t NumIndices> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices) + { + eigen_assert(checkIndexRange(indices)); + return coeffRef(indices); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index index) + { + eigen_assert(index >= 0 && index < size()); + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator[](Index index) + { + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorVarDim() + : m_storage() + { + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorVarDim(const Self& other) + : m_storage(other.m_storage) + { + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + EIGEN_STRONG_INLINE TensorVarDim(Index firstDimension, IndexTypes... otherDimensions) + : m_storage(firstDimension, otherDimensions...) 
+ { + } +#endif + + EIGEN_STRONG_INLINE explicit TensorVarDim(const std::vector<Index>& dimensions) + : m_storage(dimensions) + { + EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorVarDim(const TensorBase<OtherDerived, ReadOnlyAccessors>& other) + { + typedef TensorAssignOp<TensorVarDim, const OtherDerived> Assign; + Assign assign(*this, other.derived()); + resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions()); + internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); + } + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorVarDim(const TensorBase<OtherDerived, WriteAccessors>& other) + { + typedef TensorAssignOp<TensorVarDim, const OtherDerived> Assign; + Assign assign(*this, other.derived()); + resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions()); + internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorVarDim& operator=(const TensorVarDim& other) + { + typedef TensorAssignOp<TensorVarDim, const TensorVarDim> Assign; + Assign assign(*this, other); + resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions()); + internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); + return *this; + } + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorVarDim& operator=(const OtherDerived& other) + { + typedef TensorAssignOp<TensorVarDim, const OtherDerived> Assign; + Assign assign(*this, other); + resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions()); + internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); + return *this; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + void resize(Index firstDimension, IndexTypes... otherDimensions) + { + // The number of dimensions used to resize a tensor must be equal to the rank of the tensor. 
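+      // The dimensions are packed into a fixed-size array and forwarded to the
+      // array-based resize() overload defined below.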
+ EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + static const std::size_t NumIndices = sizeof...(otherDimensions) + 1; + resize(array<Index, NumIndices>{{firstDimension, otherDimensions...}}); + } +#endif + + template <size_t NumIndices> + void resize(const array<Index, NumIndices>& dimensions) + { + Index size = Index(1); + for (std::size_t i = 0; i < NumIndices; i++) { + internal::check_rows_cols_for_overflow<Dynamic>::run(size, dimensions[i]); + size *= dimensions[i]; + } + #ifdef EIGEN_INITIALIZE_COEFFS + bool size_changed = size != this->size(); + m_storage.resize(size, dimensions); + if(size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + #else + m_storage.resize(size, dimensions); + #endif + } + void resize(const std::vector<Index>& dimensions) + { + Index size = Index(1); + for (std::size_t i = 0; i < dimensions.size(); i++) { + internal::check_rows_cols_for_overflow<Dynamic>::run(size, dimensions[i]); + size *= dimensions[i]; + } + #ifdef EIGEN_INITIALIZE_COEFFS + bool size_changed = size != this->size(); + m_storage.resize(size, dimensions); + if(size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + #else + m_storage.resize(size, dimensions); + #endif + } + + protected: + template <std::size_t NumIndices> + bool checkIndexRange(const array<Index, NumIndices>& indices) const + { + /* using internal::array_apply_and_reduce; + using internal::array_zip_and_reduce; + using internal::greater_equal_zero_op; + using internal::logical_and_op; + using internal::lesser_op; + + return + // check whether the indices are all >= 0 + array_apply_and_reduce<logical_and_op, greater_equal_zero_op>(indices) && + // check whether the indices fit in the dimensions + array_zip_and_reduce<logical_and_op, lesser_op>(indices, m_storage.dimensions()); + */ + return true; + } + + template <std::size_t NumIndices> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index linearizedIndex(const array<Index, NumIndices>& indices) const + { + if (Options&RowMajor) { + return m_storage.dimensions().IndexOfRowMajor(indices); + } else { + return m_storage.dimensions().IndexOfColMajor(indices); + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_VAR_DIM_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h new file mode 100644 index 0000000000..de86c57f11 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -0,0 +1,677 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H +#define EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H + +namespace Eigen { + +/** \class TensorVolumePatch + * \ingroup CXX11_Tensor_Module + * + * \brief Patch extraction specialized for processing of volumetric data. + * This assumes that the input has a least 4 dimensions ordered as follows: + * - channels + * - planes + * - rows + * - columns + * - (optional) additional dimensions such as time or batch size. + * Calling the volume patch code with patch_planes, patch_rows, and patch_cols + * is equivalent to calling the regular patch extraction code with parameters + * d, patch_planes, patch_rows, patch_cols, and 1 for all the additional + * dimensions. 
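+ *
+ * Illustrative sketch (assuming this op is reached through the
+ * extract_volume_patches() entry point of TensorBase): extracting 2x2x2
+ * patches from a ColMajor input of shape (depth, planes, rows, cols, batch)
+ * yields a tensor of shape (depth, 2, 2, 2, number_of_patches, batch).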
+ */ +namespace internal { +template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType> +struct traits<TensorVolumePatchOp<Planes, Rows, Cols, XprType> > : public traits<XprType> +{ + typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions + 1; + static const int Layout = XprTraits::Layout; +}; + +template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType> +struct eval<TensorVolumePatchOp<Planes, Rows, Cols, XprType>, Eigen::Dense> +{ + typedef const TensorVolumePatchOp<Planes, Rows, Cols, XprType>& type; +}; + +template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType> +struct nested<TensorVolumePatchOp<Planes, Rows, Cols, XprType>, 1, typename eval<TensorVolumePatchOp<Planes, Rows, Cols, XprType> >::type> +{ + typedef TensorVolumePatchOp<Planes, Rows, Cols, XprType> type; +}; + +} // end namespace internal + +template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType> +class TensorVolumePatchOp : public TensorBase<TensorVolumePatchOp<Planes, Rows, Cols, XprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorVolumePatchOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorVolumePatchOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorVolumePatchOp>::type Nested; + typedef typename Eigen::internal::traits<TensorVolumePatchOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorVolumePatchOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp(const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols, + DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides, + DenseIndex in_plane_strides, DenseIndex in_row_strides, DenseIndex in_col_strides, + DenseIndex plane_inflate_strides, DenseIndex row_inflate_strides, DenseIndex col_inflate_strides, + PaddingType padding_type, Scalar padding_value) + : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols), + m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides), + m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), + m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides), + m_padding_explicit(false), m_padding_top_z(0), m_padding_bottom_z(0), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0), + m_padding_type(padding_type), m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp(const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols, + DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides, + DenseIndex in_plane_strides, DenseIndex in_row_strides, DenseIndex in_col_strides, + DenseIndex plane_inflate_strides, DenseIndex 
row_inflate_strides, DenseIndex col_inflate_strides, + DenseIndex padding_top_z, DenseIndex padding_bottom_z, + DenseIndex padding_top, DenseIndex padding_bottom, + DenseIndex padding_left, DenseIndex padding_right, + Scalar padding_value) + : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols), + m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides), + m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), + m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides), + m_padding_explicit(true), m_padding_top_z(padding_top_z), m_padding_bottom_z(padding_bottom_z), m_padding_top(padding_top), m_padding_bottom(padding_bottom), + m_padding_left(padding_left), m_padding_right(padding_right), + m_padding_type(PADDING_VALID), m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC + DenseIndex patch_planes() const { return m_patch_planes; } + EIGEN_DEVICE_FUNC + DenseIndex patch_rows() const { return m_patch_rows; } + EIGEN_DEVICE_FUNC + DenseIndex patch_cols() const { return m_patch_cols; } + EIGEN_DEVICE_FUNC + DenseIndex plane_strides() const { return m_plane_strides; } + EIGEN_DEVICE_FUNC + DenseIndex row_strides() const { return m_row_strides; } + EIGEN_DEVICE_FUNC + DenseIndex col_strides() const { return m_col_strides; } + EIGEN_DEVICE_FUNC + DenseIndex in_plane_strides() const { return m_in_plane_strides; } + EIGEN_DEVICE_FUNC + DenseIndex in_row_strides() const { return m_in_row_strides; } + EIGEN_DEVICE_FUNC + DenseIndex in_col_strides() const { return m_in_col_strides; } + EIGEN_DEVICE_FUNC + DenseIndex plane_inflate_strides() const { return m_plane_inflate_strides; } + EIGEN_DEVICE_FUNC + DenseIndex row_inflate_strides() const { return m_row_inflate_strides; } + EIGEN_DEVICE_FUNC + DenseIndex col_inflate_strides() const { return m_col_inflate_strides; } + EIGEN_DEVICE_FUNC + bool padding_explicit() const { return m_padding_explicit; } + EIGEN_DEVICE_FUNC + DenseIndex padding_top_z() const { return m_padding_top_z; } + EIGEN_DEVICE_FUNC + DenseIndex padding_bottom_z() const { return m_padding_bottom_z; } + EIGEN_DEVICE_FUNC + DenseIndex padding_top() const { return m_padding_top; } + EIGEN_DEVICE_FUNC + DenseIndex padding_bottom() const { return m_padding_bottom; } + EIGEN_DEVICE_FUNC + DenseIndex padding_left() const { return m_padding_left; } + EIGEN_DEVICE_FUNC + DenseIndex padding_right() const { return m_padding_right; } + EIGEN_DEVICE_FUNC + PaddingType padding_type() const { return m_padding_type; } + EIGEN_DEVICE_FUNC + Scalar padding_value() const { return m_padding_value; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const DenseIndex m_patch_planes; + const DenseIndex m_patch_rows; + const DenseIndex m_patch_cols; + const DenseIndex m_plane_strides; + const DenseIndex m_row_strides; + const DenseIndex m_col_strides; + const DenseIndex m_in_plane_strides; + const DenseIndex m_in_row_strides; + const DenseIndex m_in_col_strides; + const DenseIndex m_plane_inflate_strides; + const DenseIndex m_row_inflate_strides; + const DenseIndex m_col_inflate_strides; + const bool m_padding_explicit; + const DenseIndex m_padding_top_z; + const DenseIndex m_padding_bottom_z; + const DenseIndex m_padding_top; + const DenseIndex m_padding_bottom; + const 
DenseIndex m_padding_left; + const DenseIndex m_padding_right; + const PaddingType m_padding_type; + const Scalar m_padding_value; +}; + + +// Eval as rvalue +template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename ArgType, typename Device> +struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, Device> +{ + typedef TensorVolumePatchOp<Planes, Rows, Cols, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + static const int NumDims = NumInputDims + 1; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = NumDims == 6, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + EIGEN_STATIC_ASSERT(NumDims >= 5, YOU_MADE_A_PROGRAMMING_MISTAKE); + + m_paddingValue = op.padding_value(); + + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + + // Cache a few variables. + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_inputDepth = input_dims[0]; + m_inputPlanes = input_dims[1]; + m_inputRows = input_dims[2]; + m_inputCols = input_dims[3]; + } else { + m_inputDepth = input_dims[NumInputDims-1]; + m_inputPlanes = input_dims[NumInputDims-2]; + m_inputRows = input_dims[NumInputDims-3]; + m_inputCols = input_dims[NumInputDims-4]; + } + + m_plane_strides = op.plane_strides(); + m_row_strides = op.row_strides(); + m_col_strides = op.col_strides(); + + // Input strides and effective input/patch size + m_in_plane_strides = op.in_plane_strides(); + m_in_row_strides = op.in_row_strides(); + m_in_col_strides = op.in_col_strides(); + m_plane_inflate_strides = op.plane_inflate_strides(); + m_row_inflate_strides = op.row_inflate_strides(); + m_col_inflate_strides = op.col_inflate_strides(); + + // The "effective" spatial size after inflating data with zeros. 
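+    // Illustrative numbers: with 5 input planes and a plane inflate stride of 2,
+    // the effective input spans (5 - 1) * 2 + 1 = 9 planes, while a patch of 3
+    // planes sampled with an input plane stride of 2 covers 3 + (3 - 1) * (2 - 1)
+    // = 5 effective planes. The same formulas apply to rows and columns.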
+ m_input_planes_eff = (m_inputPlanes - 1) * m_plane_inflate_strides + 1; + m_input_rows_eff = (m_inputRows - 1) * m_row_inflate_strides + 1; + m_input_cols_eff = (m_inputCols - 1) * m_col_inflate_strides + 1; + m_patch_planes_eff = op.patch_planes() + (op.patch_planes() - 1) * (m_in_plane_strides - 1); + m_patch_rows_eff = op.patch_rows() + (op.patch_rows() - 1) * (m_in_row_strides - 1); + m_patch_cols_eff = op.patch_cols() + (op.patch_cols() - 1) * (m_in_col_strides - 1); + + if (op.padding_explicit()) { + m_outputPlanes = ceil((m_input_planes_eff + op.padding_top_z() + op.padding_bottom_z() - m_patch_planes_eff + 1.f) / static_cast<float>(m_plane_strides)); + m_outputRows = ceil((m_input_rows_eff + op.padding_top() + op.padding_bottom() - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides)); + m_outputCols = ceil((m_input_cols_eff + op.padding_left() + op.padding_right() - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides)); + m_planePaddingTop = op.padding_top_z(); + m_rowPaddingTop = op.padding_top(); + m_colPaddingLeft = op.padding_left(); + } else { + // Computing padding from the type + switch (op.padding_type()) { + case PADDING_VALID: + m_outputPlanes = ceil((m_input_planes_eff - m_patch_planes_eff + 1.f) / static_cast<float>(m_plane_strides)); + m_outputRows = ceil((m_input_rows_eff - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides)); + m_outputCols = ceil((m_input_cols_eff - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides)); + m_planePaddingTop = 0; + m_rowPaddingTop = 0; + m_colPaddingLeft = 0; + break; + case PADDING_SAME: { + m_outputPlanes = ceil(m_input_planes_eff / static_cast<float>(m_plane_strides)); + m_outputRows = ceil(m_input_rows_eff / static_cast<float>(m_row_strides)); + m_outputCols = ceil(m_input_cols_eff / static_cast<float>(m_col_strides)); + const Index dz = m_outputPlanes * m_plane_strides + m_patch_planes_eff - 1 - m_input_planes_eff; + const Index dy = m_outputRows * m_row_strides + m_patch_rows_eff - 1 - m_input_rows_eff; + const Index dx = m_outputCols * m_col_strides + m_patch_cols_eff - 1 - m_input_cols_eff; + m_planePaddingTop = dz - dz / 2; + m_rowPaddingTop = dy - dy / 2; + m_colPaddingLeft = dx - dx / 2; + break; + } + default: + eigen_assert(false && "unexpected padding"); + } + } + eigen_assert(m_outputRows > 0); + eigen_assert(m_outputCols > 0); + eigen_assert(m_outputPlanes > 0); + + // Dimensions for result of extraction. + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + // ColMajor + // 0: depth + // 1: patch_planes + // 2: patch_rows + // 3: patch_cols + // 4: number of patches + // 5 and beyond: anything else (such as batch). + m_dimensions[0] = input_dims[0]; + m_dimensions[1] = op.patch_planes(); + m_dimensions[2] = op.patch_rows(); + m_dimensions[3] = op.patch_cols(); + m_dimensions[4] = m_outputPlanes * m_outputRows * m_outputCols; + for (int i = 5; i < NumDims; ++i) { + m_dimensions[i] = input_dims[i-1]; + } + } else { + // RowMajor + // NumDims-1: depth + // NumDims-2: patch_planes + // NumDims-3: patch_rows + // NumDims-4: patch_cols + // NumDims-5: number of patches + // NumDims-6 and beyond: anything else (such as batch). 
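+      // Illustrative example: a RowMajor input of shape (batch, cols, rows,
+      // planes, depth) = (10, 16, 16, 8, 3) with 2x2x2 patches produces
+      // m_dimensions = (10, number_of_patches, 2, 2, 2, 3).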
+ m_dimensions[NumDims-1] = input_dims[NumInputDims-1]; + m_dimensions[NumDims-2] = op.patch_planes(); + m_dimensions[NumDims-3] = op.patch_rows(); + m_dimensions[NumDims-4] = op.patch_cols(); + m_dimensions[NumDims-5] = m_outputPlanes * m_outputRows * m_outputCols; + for (int i = NumDims-6; i >= 0; --i) { + m_dimensions[i] = input_dims[i]; + } + } + + // Strides for the output tensor. + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_rowStride = m_dimensions[1]; + m_colStride = m_dimensions[2] * m_rowStride; + m_patchStride = m_colStride * m_dimensions[3] * m_dimensions[0]; + m_otherStride = m_patchStride * m_dimensions[4]; + } else { + m_rowStride = m_dimensions[NumDims-2]; + m_colStride = m_dimensions[NumDims-3] * m_rowStride; + m_patchStride = m_colStride * m_dimensions[NumDims-4] * m_dimensions[NumDims-1]; + m_otherStride = m_patchStride * m_dimensions[NumDims-5]; + } + + // Strides for navigating through the input tensor. + m_planeInputStride = m_inputDepth; + m_rowInputStride = m_inputDepth * m_inputPlanes; + m_colInputStride = m_inputDepth * m_inputRows * m_inputPlanes; + m_otherInputStride = m_inputDepth * m_inputRows * m_inputCols * m_inputPlanes; + + m_outputPlanesRows = m_outputPlanes * m_outputRows; + + // Fast representations of different variables. + m_fastOtherStride = internal::TensorIntDivisor<Index>(m_otherStride); + m_fastPatchStride = internal::TensorIntDivisor<Index>(m_patchStride); + m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride); + m_fastRowStride = internal::TensorIntDivisor<Index>(m_rowStride); + m_fastInputRowStride = internal::TensorIntDivisor<Index>(m_row_inflate_strides); + m_fastInputColStride = internal::TensorIntDivisor<Index>(m_col_inflate_strides); + m_fastInputPlaneStride = internal::TensorIntDivisor<Index>(m_plane_inflate_strides); + m_fastInputColsEff = internal::TensorIntDivisor<Index>(m_input_cols_eff); + m_fastOutputPlanes = internal::TensorIntDivisor<Index>(m_outputPlanes); + m_fastOutputPlanesRows = internal::TensorIntDivisor<Index>(m_outputPlanesRows); + + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[0]); + } else { + m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[NumDims-1]); + } + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + // Patch index corresponding to the passed in index. + const Index patchIndex = index / m_fastPatchStride; + + // Spatial offset within the patch. This has to be translated into 3D + // coordinates within the patch. + const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastOutputDepth; + + // Batch, etc. + const Index otherIndex = (NumDims == 5) ? 0 : index / m_fastOtherStride; + const Index patch3DIndex = (NumDims == 5) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride; + + // Calculate column index in the input original tensor. 
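+    // (The same pattern repeats below for rows and planes: decode the output
+    // coordinate with the precomputed fast divisors, map it back to an input
+    // coordinate, and return the padding value for positions that fall outside
+    // the input or on an inflated, i.e. zero-inserted, location.)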
+ const Index colIndex = patch3DIndex / m_fastOutputPlanesRows; + const Index colOffset = patchOffset / m_fastColStride; + const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft; + const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); + if (inputCol < 0 || inputCol >= m_input_cols_eff || + ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) { + return Scalar(m_paddingValue); + } + + // Calculate row index in the original input tensor. + const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes; + const Index rowOffset = (patchOffset - colOffset * m_colStride) / m_fastRowStride; + const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop; + const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0); + if (inputRow < 0 || inputRow >= m_input_rows_eff || + ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) { + return Scalar(m_paddingValue); + } + + // Calculate plane index in the original input tensor. + const Index planeIndex = (patch3DIndex - m_outputPlanes * (colIndex * m_outputRows + rowIndex)); + const Index planeOffset = patchOffset - colOffset * m_colStride - rowOffset * m_rowStride; + const Index inputPlane = planeIndex * m_plane_strides + planeOffset * m_in_plane_strides - m_planePaddingTop; + const Index origInputPlane = (m_plane_inflate_strides == 1) ? inputPlane : ((inputPlane >= 0) ? (inputPlane / m_fastInputPlaneStride) : 0); + if (inputPlane < 0 || inputPlane >= m_input_planes_eff || + ((m_plane_inflate_strides != 1) && (inputPlane != origInputPlane * m_plane_inflate_strides))) { + return Scalar(m_paddingValue); + } + + const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - 1; + const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; + + const Index inputIndex = depth + + origInputRow * m_rowInputStride + + origInputCol * m_colInputStride + + origInputPlane * m_planeInputStride + + otherIndex * m_otherInputStride; + + return m_impl.coeff(inputIndex); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const Index packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1 || + m_in_plane_strides != 1 || m_plane_inflate_strides != 1) { + return packetWithPossibleZero(index); + } + + const Index indices[2] = {index, index + packetSize - 1}; + const Index patchIndex = indices[0] / m_fastPatchStride; + if (patchIndex != indices[1] / m_fastPatchStride) { + return packetWithPossibleZero(index); + } + const Index otherIndex = (NumDims == 5) ? 0 : indices[0] / m_fastOtherStride; + eigen_assert(otherIndex == indices[1] / m_fastOtherStride); + + // Find the offset of the element wrt the location of the first element. + const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastOutputDepth, + (indices[1] - patchIndex * m_patchStride) / m_fastOutputDepth}; + + const Index patch3DIndex = (NumDims == 5) ? 
patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride; + eigen_assert(patch3DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride); + + const Index colIndex = patch3DIndex / m_fastOutputPlanesRows; + const Index colOffsets[2] = { + patchOffsets[0] / m_fastColStride, + patchOffsets[1] / m_fastColStride}; + + // Calculate col indices in the original input tensor. + const Index inputCols[2] = { + colIndex * m_col_strides + colOffsets[0] - m_colPaddingLeft, + colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft}; + if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) { + return internal::pset1<PacketReturnType>(Scalar(m_paddingValue)); + } + + if (inputCols[0] != inputCols[1]) { + return packetWithPossibleZero(index); + } + + const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes; + const Index rowOffsets[2] = { + (patchOffsets[0] - colOffsets[0] * m_colStride) / m_fastRowStride, + (patchOffsets[1] - colOffsets[1] * m_colStride) / m_fastRowStride}; + eigen_assert(rowOffsets[0] <= rowOffsets[1]); + // Calculate col indices in the original input tensor. + const Index inputRows[2] = { + rowIndex * m_row_strides + rowOffsets[0] - m_rowPaddingTop, + rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop}; + + if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) { + return internal::pset1<PacketReturnType>(Scalar(m_paddingValue)); + } + + if (inputRows[0] != inputRows[1]) { + return packetWithPossibleZero(index); + } + + const Index planeIndex = (patch3DIndex - m_outputPlanes * (colIndex * m_outputRows + rowIndex)); + const Index planeOffsets[2] = { + patchOffsets[0] - colOffsets[0] * m_colStride - rowOffsets[0] * m_rowStride, + patchOffsets[1] - colOffsets[1] * m_colStride - rowOffsets[1] * m_rowStride}; + eigen_assert(planeOffsets[0] <= planeOffsets[1]); + const Index inputPlanes[2] = { + planeIndex * m_plane_strides + planeOffsets[0] - m_planePaddingTop, + planeIndex * m_plane_strides + planeOffsets[1] - m_planePaddingTop}; + + if (inputPlanes[1] < 0 || inputPlanes[0] >= m_inputPlanes) { + return internal::pset1<PacketReturnType>(Scalar(m_paddingValue)); + } + + if (inputPlanes[0] >= 0 && inputPlanes[1] < m_inputPlanes) { + // no padding + const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 
0 : NumDims - 1; + const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; + const Index inputIndex = depth + + inputRows[0] * m_rowInputStride + + inputCols[0] * m_colInputStride + + m_planeInputStride * inputPlanes[0] + + otherIndex * m_otherInputStride; + return m_impl.template packet<Unaligned>(inputIndex); + } + + return packetWithPossibleZero(index); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + + Index planePaddingTop() const { return m_planePaddingTop; } + Index rowPaddingTop() const { return m_rowPaddingTop; } + Index colPaddingLeft() const { return m_colPaddingLeft; } + Index outputPlanes() const { return m_outputPlanes; } + Index outputRows() const { return m_outputRows; } + Index outputCols() const { return m_outputCols; } + Index userPlaneStride() const { return m_plane_strides; } + Index userRowStride() const { return m_row_strides; } + Index userColStride() const { return m_col_strides; } + Index userInPlaneStride() const { return m_in_plane_strides; } + Index userInRowStride() const { return m_in_row_strides; } + Index userInColStride() const { return m_in_col_strides; } + Index planeInflateStride() const { return m_plane_inflate_strides; } + Index rowInflateStride() const { return m_row_inflate_strides; } + Index colInflateStride() const { return m_col_inflate_strides; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const + { + // ColMajor + // 0: depth, 1: patch_planes, 2: patch_rows, 3: patch_cols, 4: number of patches, 5: batches + // RowMajor + // 0: batches, 1: number of patches, 2: patch_cols , 3: patch_rows, 4: patch_planes, 5: depth + const Index patch3DIndex = coords[static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 4 : 1]; + const Index colOffset = coords[static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 3 : 2]; + const Index rowOffset= coords[static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 3]; + const Index planeOffset = coords[static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 4]; + + array<Index, NumDims-1> inputCoords; + + const Index colIndex = patch3DIndex / m_fastOutputPlanesRows; + const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft; + const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); + if (inputCol < 0 || inputCol >= m_input_cols_eff || + ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) { + return Scalar(m_paddingValue); + } + + const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes; + const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop; + const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0); + if (inputRow < 0 || inputRow >= m_input_rows_eff || + ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) { + return Scalar(m_paddingValue); + } + + const Index planeIndex = patch3DIndex - colIndex * m_outputPlanesRows - rowIndex * m_outputRows; + const Index inputPlane = planeIndex * m_plane_strides + planeOffset * m_in_plane_strides - m_planePaddingTop; + const Index origInputPlane = (m_plane_inflate_strides == 1) ? inputPlane : ((inputPlane >= 0) ? 
(inputPlane / m_fastInputPlaneStride) : 0); + if (inputPlane < 0 || inputPlane >= m_input_planes_eff || + ((m_plane_inflate_strides != 1) && (inputPlane != origInputPlane * m_plane_inflate_strides))) { + return Scalar(m_paddingValue); + } + + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + inputCoords[0] = coords[0]; // depth + inputCoords[1] = origInputPlane; + inputCoords[2] = origInputRow; + inputCoords[3] = origInputCol; + inputCoords[4] = coords[5]; // batch + } else { + inputCoords[4] = coords[5]; // depth + inputCoords[3] = origInputPlane; + inputCoords[2] = origInputRow; + inputCoords[1] = origInputCol; + inputCoords[0] = coords[0]; // batch + } + if (TensorEvaluator<ArgType, Device>::CoordAccess) { + return m_impl.coeff(inputCoords); + } else { + Index inputIndex; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + inputIndex = + inputCoords[4] * m_otherInputStride + + inputCoords[3] * m_colInputStride + + inputCoords[2] * m_rowInputStride + + inputCoords[1] * m_planeInputStride + + inputCoords[0]; + } else { + inputIndex = + inputCoords[0] * m_otherInputStride + + inputCoords[1] * m_colInputStride + + inputCoords[2] * m_rowInputStride + + inputCoords[3] * m_planeInputStride + + inputCoords[4]; + } + return m_impl.coeff(inputIndex); + } + } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + + Dimensions m_dimensions; + + // Parameters passed to the costructor. + Index m_plane_strides; + Index m_row_strides; + Index m_col_strides; + + Index m_outputPlanes; + Index m_outputRows; + Index m_outputCols; + + Index m_planePaddingTop; + Index m_rowPaddingTop; + Index m_colPaddingLeft; + + Index m_in_plane_strides; + Index m_in_row_strides; + Index m_in_col_strides; + + Index m_plane_inflate_strides; + Index m_row_inflate_strides; + Index m_col_inflate_strides; + + // Cached input size. + Index m_inputDepth; + Index m_inputPlanes; + Index m_inputRows; + Index m_inputCols; + + // Other cached variables. + Index m_outputPlanesRows; + + // Effective input/patch post-inflation size. + Index m_input_planes_eff; + Index m_input_rows_eff; + Index m_input_cols_eff; + Index m_patch_planes_eff; + Index m_patch_rows_eff; + Index m_patch_cols_eff; + + // Strides for the output tensor. + Index m_otherStride; + Index m_patchStride; + Index m_rowStride; + Index m_colStride; + + // Strides for the input tensor. 
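+  // Expressed in elements of the unpatched input: plane stride = depth,
+  // row stride = depth * planes, and col stride = depth * planes * rows,
+  // as computed once in the constructor above.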
+ Index m_planeInputStride; + Index m_rowInputStride; + Index m_colInputStride; + Index m_otherInputStride; + + internal::TensorIntDivisor<Index> m_fastOtherStride; + internal::TensorIntDivisor<Index> m_fastPatchStride; + internal::TensorIntDivisor<Index> m_fastColStride; + internal::TensorIntDivisor<Index> m_fastRowStride; + internal::TensorIntDivisor<Index> m_fastInputPlaneStride; + internal::TensorIntDivisor<Index> m_fastInputRowStride; + internal::TensorIntDivisor<Index> m_fastInputColStride; + internal::TensorIntDivisor<Index> m_fastInputColsEff; + internal::TensorIntDivisor<Index> m_fastOutputPlanesRows; + internal::TensorIntDivisor<Index> m_fastOutputPlanes; + internal::TensorIntDivisor<Index> m_fastOutputDepth; + + Scalar m_paddingValue; + + TensorEvaluator<ArgType, Device> m_impl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/g3doc/README.md b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/g3doc/README.md new file mode 100644 index 0000000000..1c3fe32f9b --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/g3doc/README.md @@ -0,0 +1,1792 @@ +# Eigen Tensors + +Tensors are multidimensional arrays of elements. Elements are typically scalars, +but more complex types such as strings are also supported. + +[TOC] + +## Tensor Classes + +You can manipulate a tensor with one of the following classes. They all are in +the namespace ```::Eigen.``` + + +### Class Tensor<data_type, rank> + +This is the class to use to create a tensor and allocate memory for it. The +class is templatized with the tensor datatype, such as float or int, and the +tensor rank. The rank is the number of dimensions, for example rank 2 is a +matrix. + +Tensors of this class are resizable. For example, if you assign a tensor of a +different size to a Tensor, that tensor is resized to match its new value. + +#### Constructor Tensor<data_type, rank>(size0, size1, ...) + +Constructor for a Tensor. The constructor must be passed ```rank``` integers +indicating the sizes of the instance along each of the the ```rank``` +dimensions. + + // Create a tensor of rank 3 of sizes 2, 3, 4. This tensor owns + // memory to hold 24 floating point values (24 = 2 x 3 x 4). + Tensor<float, 3> t_3d(2, 3, 4); + + // Resize t_3d by assigning a tensor of different sizes, but same rank. + t_3d = Tensor<float, 3>(3, 4, 3); + +#### Constructor Tensor<data_type, rank>(size_array) + +Constructor where the sizes for the constructor are specified as an array of +values instead of an explicitly list of parameters. The array type to use is +```Eigen::array<Eigen::Index>```. The array can be constructed automatically +from an initializer list. + + // Create a tensor of strings of rank 2 with sizes 5, 7. + Tensor<string, 2> t_2d({5, 7}); + + +### Class TensorFixedSize<data_type, Sizes<size0, size1, ...>> + +Class to use for tensors of fixed size, where the size is known at compile +time. Fixed sized tensors can provide very fast computations because all their +dimensions are known by the compiler. FixedSize tensors are not resizable. + +If the total number of elements in a fixed size tensor is small enough the +tensor data is held onto the stack and does not cause heap allocation and free. + + // Create a 4 x 3 tensor of floats. 
+ TensorFixedSize<float, Sizes<4, 3>> t_4x3; + +### Class TensorMap<Tensor<data_type, rank>> + +This is the class to use to create a tensor on top of memory allocated and +owned by another part of your code. It allows to view any piece of allocated +memory as a Tensor. Instances of this class do not own the memory where the +data are stored. + +A TensorMap is not resizable because it does not own the memory where its data +are stored. + +#### Constructor TensorMap<Tensor<data_type, rank>>(data, size0, size1, ...) + +Constructor for a Tensor. The constructor must be passed a pointer to the +storage for the data, and "rank" size attributes. The storage has to be +large enough to hold all the data. + + // Map a tensor of ints on top of stack-allocated storage. + int storage[128]; // 2 x 4 x 2 x 8 = 128 + TensorMap<int, 4> t_4d(storage, 2, 4, 2, 8); + + // The same storage can be viewed as a different tensor. + // You can also pass the sizes as an array. + TensorMap<int, 2> t_2d(storage, 16, 8); + + // You can also map fixed-size tensors. Here we get a 1d view of + // the 2d fixed-size tensor. + TensorFixedSize<float, Sizes<4, 5>> t_4x3; + TensorMap<float, 1> t_12(t_4x3, 12); + + +#### Class TensorRef + +See Assigning to a TensorRef below. + +## Accessing Tensor Elements + +#### <data_type> tensor(index0, index1...) + +Return the element at position ```(index0, index1...)``` in tensor +```tensor```. You must pass as many parameters as the rank of ```tensor```. +The expression can be used as an l-value to set the value of the element at the +specified position. The value returned is of the datatype of the tensor. + + // Set the value of the element at position (0, 1, 0); + Tensor<float, 3> t_3d(2, 3, 4); + t_3d(0, 1, 0) = 12.0f; + + // Initialize all elements to random values. + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 4; ++k) { + t_3d(i, j, k) = ...some random value...; + } + } + } + + // Print elements of a tensor. + for (int i = 0; i < 2; ++i) { + LOG(INFO) << t_3d(i, 0, 0); + } + + +## TensorLayout + +The tensor library supports 2 layouts: ```ColMajor``` (the default) and +```RowMajor```. Only the default column major layout is currently fully +supported, and it is therefore not recommended to attempt to use the row major +layout at the moment. + +The layout of a tensor is optionally specified as part of its type. If not +specified explicitly column major is assumed. + + Tensor<float, 3, ColMajor> col_major; // equivalent to Tensor<float, 3> + TensorMap<Tensor<float, 3, RowMajor> > row_major(data, ...); + +All the arguments to an expression must use the same layout. Attempting to mix +different layouts will result in a compilation error. + +It is possible to change the layout of a tensor or an expression using the +```swap_layout()``` method. Note that this will also reverse the order of the +dimensions. 
+ + Tensor<float, 2, ColMajor> col_major(2, 4); + Tensor<float, 2, RowMajor> row_major(2, 4); + + Tensor<float, 2> col_major_result = col_major; // ok, layouts match + Tensor<float, 2> col_major_result = row_major; // will not compile + + // Simple layout swap + col_major_result = row_major.swap_layout(); + eigen_assert(col_major_result.dimension(0) == 4); + eigen_assert(col_major_result.dimension(1) == 2); + + // Swap the layout and preserve the order of the dimensions + array<int, 2> shuffle(1, 0); + col_major_result = row_major.swap_layout().shuffle(shuffle); + eigen_assert(col_major_result.dimension(0) == 2); + eigen_assert(col_major_result.dimension(1) == 4); + + +## Tensor Operations + +The Eigen Tensor library provides a vast library of operations on Tensors: +numerical operations such as addition and multiplication, geometry operations +such as slicing and shuffling, etc. These operations are available as methods +of the Tensor classes, and in some cases as operator overloads. For example +the following code computes the elementwise addition of two tensors: + + Tensor<float, 3> t1(2, 3, 4); + ...set some values in t1... + Tensor<float, 3> t2(2, 3, 4); + ...set some values in t2... + // Set t3 to the element wise sum of t1 and t2 + Tensor<float, 3> t3 = t1 + t2; + +While the code above looks easy enough, it is important to understand that the +expression ```t1 + t2``` is not actually adding the values of the tensors. The +expression instead constructs a "tensor operator" object of the class +TensorCwiseBinaryOp<scalar_sum>, which has references to the tensors +```t1``` and ```t2```. This is a small C++ object that knows how to add +```t1``` and ```t2```. It is only when the value of the expression is assigned +to the tensor ```t3``` that the addition is actually performed. Technically, +this happens through the overloading of ```operator=()``` in the Tensor class. + +This mechanism for computing tensor expressions allows for lazy evaluation and +optimizations which are what make the tensor library very fast. + +Of course, the tensor operators do nest, and the expression ```t1 + t2 * +0.3f``` is actually represented with the (approximate) tree of operators: + + TensorCwiseBinaryOp<scalar_sum>(t1, TensorCwiseUnaryOp<scalar_mul>(t2, 0.3f)) + + +### Tensor Operations and C++ "auto" + +Because Tensor operations create tensor operators, the C++ ```auto``` keyword +does not have its intuitive meaning. Consider these 2 lines of code: + + Tensor<float, 3> t3 = t1 + t2; + auto t4 = t1 + t2; + +In the first line we allocate the tensor ```t3``` and it will contain the +result of the addition of ```t1``` and ```t2```. In the second line, ```t4``` +is actually the tree of tensor operators that will compute the addition of +```t1``` and ```t2```. In fact, ```t4``` is *not* a tensor and you cannot get +the values of its elements: + + Tensor<float, 3> t3 = t1 + t2; + cout << t3(0, 0, 0); // OK prints the value of t1(0, 0, 0) + t2(0, 0, 0) + + auto t4 = t1 + t2; + cout << t4(0, 0, 0); // Compilation error! + +When you use ```auto``` you do not get a Tensor as a result but instead a +non-evaluated expression. So only use ```auto``` to delay evaluation. + +Unfortunately, there is no single underlying concrete type for holding +non-evaluated expressions, hence you have to use auto in the case when you do +want to hold non-evaluated expressions. + +When you need the results of a set of tensor computations you have to assign the +result to a Tensor that will be capable of holding them. 
This can be +either a normal Tensor, a fixed size Tensor, or a TensorMap on an existing +piece of memory. All the following will work: + + auto t4 = t1 + t2; + + Tensor<float, 3> result = t4; // Could also be: result(t4); + cout << result(0, 0, 0); + + TensorMap<float, 4> result(<a float* with enough space>, <size0>, ...) = t4; + cout << result(0, 0, 0); + + TensorFixedSize<float, Sizes<size0, ...>> result = t4; + cout << result(0, 0, 0); + +Until you need the results, you can keep the operation around, and even reuse +it for additional operations. As long as you keep the expression as an +operation, no computation is performed. + + // One way to compute exp((t1 + t2) * 0.2f); + auto t3 = t1 + t2; + auto t4 = t3 * 0.2f; + auto t5 = t4.exp(); + Tensor<float, 3> result = t5; + + // Another way, exactly as efficient as the previous one: + Tensor<float, 3> result = ((t1 + t2) * 0.2f).exp(); + +### Controlling When Expression are Evaluated + +There are several ways to control when expressions are evaluated: + +* Assignment to a Tensor, TensorFixedSize, or TensorMap. +* Use of the eval() method. +* Assignment to a TensorRef. + +#### Assigning to a Tensor, TensorFixedSize, or TensorMap. + +The most common way to evaluate an expression is to assign it to a Tensor. In +the example below, the ```auto``` declarations make the intermediate values +"Operations", not Tensors, and do not cause the expressions to be evaluated. +The assignment to the Tensor ```result``` causes the evaluation of all the +operations. + + auto t3 = t1 + t2; // t3 is an Operation. + auto t4 = t3 * 0.2f; // t4 is an Operation. + auto t5 = t4.exp(); // t5 is an Operation. + Tensor<float, 3> result = t5; // The operations are evaluated. + +If you know the ranks and sizes of the Operation value you can assign the +Operation to a TensorFixedSize instead of a Tensor, which is a bit more +efficient. + + // We know that the result is a 4x4x2 tensor! + TensorFixedSize<float, Sizes<4, 4, 2>> result = t5; + +Simiarly, assigning an expression to a TensorMap causes its evaluation. Like +tensors of type TensorFixedSize, TensorMaps cannot be resized so they have to +have the rank and sizes of the expression that are assigned to them. + +#### Calling eval(). + +When you compute large composite expressions, you sometimes want to tell Eigen +that an intermediate value in the expression tree is worth evaluating ahead of +time. This is done by inserting a call to the ```eval()``` method of the +expression Operation. + + // The previous example could have been written: + Tensor<float, 3> result = ((t1 + t2) * 0.2f).exp(); + + // If you want to compute (t1 + t2) once ahead of time you can write: + Tensor<float, 3> result = ((t1 + t2).eval() * 0.2f).exp(); + +Semantically, calling ```eval()``` is equivalent to materializing the value of +the expression in a temporary Tensor of the right size. The code above in +effect does: + + // .eval() knows the size! + TensorFixedSize<float, Sizes<4, 4, 2>> tmp = t1 + t2; + Tensor<float, 3> result = (tmp * 0.2f).exp(); + +Note that the return value of ```eval()``` is itself an Operation, so the +following code does not do what you may think: + + // Here t3 is an evaluation Operation. t3 has not been evaluated yet. + auto t3 = (t1 + t2).eval(); + + // You can use t3 in another expression. Still no evaluation. 
+ auto t4 = (t3 * 0.2f).exp(); + + // The value is evaluated when you assign the Operation to a Tensor, using + // an intermediate tensor to represent t3.x + Tensor<float, 3> result = t4; + +While in the examples above calling ```eval()``` does not make a difference in +performance, in other cases it can make a huge difference. In the expression +below the ```broadcast()``` expression causes the ```X.maximum()``` expression +to be evaluated many times: + + Tensor<...> X ...; + Tensor<...> Y = ((X - X.maximum(depth_dim).reshape(dims2d).broadcast(bcast)) + * beta).exp(); + +Inserting a call to ```eval()``` between the ```maximum()``` and +```reshape()``` calls guarantees that maximum() is only computed once and +greatly speeds-up execution: + + Tensor<...> Y = + ((X - X.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)) + * beta).exp(); + +In the other example below, the tensor ```Y``` is both used in the expression +and its assignment. This is an aliasing problem and if the evaluation is not +done in the right order Y will be updated incrementally during the evaluation +resulting in bogus results: + + Tensor<...> Y ...; + Y = Y / (Y.sum(depth_dim).reshape(dims2d).broadcast(bcast)); + +Inserting a call to ```eval()``` between the ```sum()``` and ```reshape()``` +expressions ensures that the sum is computed before any updates to ```Y``` are +done. + + Y = Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast)); + +Note that an eval around the full right hand side expression is not needed +because the generated has to compute the i-th value of the right hand side +before assigning it to the left hand side. + +However, if you were assigning the expression value to a shuffle of ```Y``` +then you would need to force an eval for correctness by adding an ```eval()``` +call for the right hand side: + + Y.shuffle(...) = + (Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast))).eval(); + + +#### Assigning to a TensorRef. + +If you need to access only a few elements from the value of an expression you +can avoid materializing the value in a full tensor by using a TensorRef. + +A TensorRef is a small wrapper class for any Eigen Operation. It provides +overloads for the ```()``` operator that let you access individual values in +the expression. TensorRef is convenient, because the Operation themselves do +not provide a way to access individual elements. + + // Create a TensorRef for the expression. The expression is not + // evaluated yet. + TensorRef<Tensor<float, 3> > ref = ((t1 + t2) * 0.2f).exp(); + + // Use "ref" to access individual elements. The expression is evaluated + // on the fly. + float at_0 = ref(0, 0, 0); + cout << ref(0, 1, 0); + +Only use TensorRef when you need a subset of the values of the expression. +TensorRef only computes the values you access. However note that if you are +going to access all the values it will be much faster to materialize the +results in a Tensor first. + +In some cases, if the full Tensor result would be very large, you may save +memory by accessing it as a TensorRef. But not always. So don't count on it. + + +### Controlling How Expressions Are Evaluated + +The tensor library provides several implementations of the various operations +such as contractions and convolutions. The implementations are optimized for +different environments: single threaded on CPU, multi threaded on CPU, or on a +GPU using cuda. Additional implementations may be added later. + +You can choose which implementation to use with the ```device()``` call. 
If +you do not choose an implementation explicitly the default implementation that +uses a single thread on the CPU is used. + +The default implementation has been optimized for recent Intel CPUs, taking +advantage of SSE, AVX, and FMA instructions. Work is ongoing to tune the +library on ARM CPUs. Note that you need to pass compiler-dependent flags +to enable the use of SSE, AVX, and other instructions. + +For example, the following code adds two tensors using the default +single-threaded CPU implementation: + + Tensor<float, 2> a(30, 40); + Tensor<float, 2> b(30, 40); + Tensor<float, 2> c = a + b; + +To choose a different implementation you have to insert a ```device()``` call +before the assignment of the result. For technical C++ reasons this requires +that the Tensor for the result be declared on its own. This means that you +have to know the size of the result. + + Eigen::Tensor<float, 2> c(30, 40); + c.device(...) = a + b; + +The call to ```device()``` must be the last call on the left of the operator=. + +You must pass to the ```device()``` call an Eigen device object. There are +presently three devices you can use: DefaultDevice, ThreadPoolDevice and +GpuDevice. + + +#### Evaluating With the DefaultDevice + +This is exactly the same as not inserting a ```device()``` call. + + DefaultDevice my_device; + c.device(my_device) = a + b; + +#### Evaluating with a Thread Pool + + #include "thread/threadpool.h" + + // Create a threadpool and start the threads. This is the Google way, + // other environments use different mechanism to create a thread pool. + ThreadPool my_pool(4 /* number of threads in the pool */); + my_pool.StartWorkers(); + + // Create the Eigen ThreadPoolDevice. + // You typically use up to all the available threads in the pool. + Eigen::ThreadPoolDevice my_device(&my_pool, 4 /* number of threads to use */); + + // Now just use the device when evaluating expressions. + Eigen::Tensor<float, 2> c(30, 50); + c.device(my_device) = a.contract(b, dot_product_dims); + + +#### Evaluating On GPU + +This is presently a bit more complicated than just using a thread pool device. +You need to create a GPU device but you also need to explicitly allocate the +memory for tensors with cuda. + + +## API Reference + +### Datatypes + +In the documentation of the tensor methods and Operation we mention datatypes +that are tensor-type specific: + +#### <Tensor-Type>::Dimensions + +Acts like an array of ints. Has an ```int size``` attribute, and can be +indexed like an array to access individual values. Used to represent the +dimensions of a tensor. See ```dimensions()```. + +#### <Tensor-Type>::Index + +Acts like an ```int```. Used for indexing tensors along their dimensions. See +```operator()```, ```dimension()```, and ```size()```. + +#### <Tensor-Type>::Scalar + +Represents the datatype of individual tensor elements. For example, for a +```Tensor<float>```, ```Scalar``` is the type ```float```. See +```setConstant()```. + +#### <Operation> + +We use this pseudo type to indicate that a tensor Operation is returned by a +method. We indicate in the text the type and dimensions of the tensor that the +Operation returns after evaluation. + +The Operation will have to be evaluated, for example by assigning it to a +tensor, before you can access the values of the resulting tensor. You can also +access the values through a TensorRef. + + +## Built-in Tensor Methods + +These are usual C++ methods that act on tensors immediately. 
They are not +Operations which provide delayed evaluation of their results. Unless specified +otherwise, all the methods listed below are available on all tensor classes: +Tensor, TensorFixedSize, and TensorMap. + +## Metadata + +### int NumDimensions + +Constant value indicating the number of dimensions of a Tensor. This is also +known as the tensor "rank". + + Eigen::Tensor<float, 2> a(3, 4); + cout << "Dims " << a.NumDimensions; + => Dims 2 + +### Dimensions dimensions() + +Returns an array-like object representing the dimensions of the tensor. +The actual type of the dimensions() result is <Tensor-Type>::Dimensions. + + Eigen::Tensor<float, 2> a(3, 4); + const Eigen::Tensor<float, 2>::Dimensions& d = a.dimensions(); + cout << "Dim size: " << d.size << ", dim 0: " << d[0] + << ", dim 1: " << d[1]; + => Dim size: 2, dim 0: 3, dim 1: 4 + +If you use a C++11 compiler, you can use ```auto``` to simplify the code: + + const auto& d = a.dimensions(); + cout << "Dim size: " << d.size << ", dim 0: " << d[0] + << ", dim 1: " << d[1]; + => Dim size: 2, dim 0: 3, dim 1: 4 + +### Index dimension(Index n) + +Returns the n-th dimension of the tensor. The actual type of the +```dimension()``` result is ```<Tensor-Type>::Index```, but you can +always use it like an int. + + Eigen::Tensor<float, 2> a(3, 4); + int dim1 = a.dimension(1); + cout << "Dim 1: " << dim1; + => Dim 1: 4 + +### Index size() + +Returns the total number of elements in the tensor. This is the product of all +the tensor dimensions. The actual type of the ```size()``` result is +```<Tensor-Type>::Index```, but you can always use it like an int. + + Eigen::Tensor<float, 2> a(3, 4); + cout << "Size: " << a.size(); + => Size: 12 + + +### Getting Dimensions From An Operation + +A few operations provide ```dimensions()``` directly, +e.g. ```TensorReslicingOp```. Most operations defer calculating dimensions +until the operation is being evaluated. If you need access to the dimensions +of a deferred operation, you can wrap it in a TensorRef (see Assigning to a +TensorRef above), which provides ```dimensions()``` and ```dimension()``` as +above. + +TensorRef can also wrap the plain Tensor types, so this is a useful idiom in +templated contexts where the underlying object could be either a raw Tensor +or some deferred operation (e.g. a slice of a Tensor). In this case, the +template code can wrap the object in a TensorRef and reason about its +dimensionality while remaining agnostic to the underlying type. + + +## Constructors + +### Tensor + +Creates a tensor of the specified size. The number of arguments must be equal +to the rank of the tensor. The content of the tensor is not initialized. + + Eigen::Tensor<float, 2> a(3, 4); + cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl; + => NumRows: 3 NumCols: 4 + +### TensorFixedSize + +Creates a tensor of the specified size. The number of arguments in the Size<> +template parameter determines the rank of the tensor. The content of the tensor +is not initialized. + + Eigen::TensorFixedSize<float, Sizes<3, 4>> a; + cout << "Rank: " << a.rank() << endl; + => Rank: 2 + cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl; + => NumRows: 3 NumCols: 4 + +### TensorMap + +Creates a tensor mapping an existing array of data. The data must not be freed +until the TensorMap is discarded, and the size of the data must be large enough +to accomodate the coefficients of the tensor. 
+ + float data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + Eigen::TensorMap<float, 2> a(data, 3, 4); + cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl; + => NumRows: 3 NumCols: 4 + cout << "a(1, 2): " << a(1, 2) << endl; + => a(1, 2): 9 + + +## Contents Initialization + +When a new Tensor or a new TensorFixedSize are created, memory is allocated to +hold all the tensor elements, but the memory is not initialized. Similarly, +when a new TensorMap is created on top of non-initialized memory, its +contents are not initialized. + +You can use one of the methods below to initialize the tensor memory. These +have an immediate effect on the tensor and return the tensor itself as a +result. These are not tensor Operations which delay evaluation. + +### <Tensor-Type> setConstant(const Scalar& val) + +Sets all elements of the tensor to the constant value ```val```. ```Scalar``` +is the type of data stored in the tensor. You can pass any value that is +convertible to that type. + +Returns the tensor itself in case you want to chain another call. + + a.setConstant(12.3f); + cout << "Constant: " << endl << a << endl << endl; + => + Constant: + 12.3 12.3 12.3 12.3 + 12.3 12.3 12.3 12.3 + 12.3 12.3 12.3 12.3 + +Note that ```setConstant()``` can be used on any tensor where the element type +has a copy constructor and an ```operator=()```: + + Eigen::Tensor<string, 2> a(2, 3); + a.setConstant("yolo"); + cout << "String tensor: " << endl << a << endl << endl; + => + String tensor: + yolo yolo yolo + yolo yolo yolo + + +### <Tensor-Type> setZero() + +Fills the tensor with zeros. Equivalent to ```setConstant(Scalar(0))```. +Returns the tensor itself in case you want to chain another call. + + a.setZero(); + cout << "Zeros: " << endl << a << endl << endl; + => + Zeros: + 0 0 0 0 + 0 0 0 0 + 0 0 0 0 + + +### <Tensor-Type> setValues({..initializer_list}) + +Fills the tensor with explicit values specified in a std::initializer_list. +The type of the initializer list depends on the type and rank of the tensor. + +If the tensor has rank N, the initializer list must be nested N times. The +most deeply nested lists must contains P scalars of the Tensor type where P is +the size of the last dimension of the Tensor. + +For example, for a ```TensorFixedSize<float, Sizes<2, 3>>``` the initializer list must +contains 2 lists of 3 floats each. + +```setValues()``` returns the tensor itself in case you want to chain another +call. + + Eigen::Tensor<float, 2> a(2, 3); + a.setValues({{0.0f, 1.0f, 2.0f}, {3.0f, 4.0f, 5.0f}}); + cout << "a" << endl << a << endl << endl; + => + a + 0 1 2 + 3 4 5 + +If a list is too short, the corresponding elements of the tensor will not be +changed. This is valid at each level of nesting. For example the following +code only sets the values of the first row of the tensor. + + Eigen::Tensor<int, 2> a(2, 3); + a.setConstant(1000); + a.setValues({{10, 20, 30}}); + cout << "a" << endl << a << endl << endl; + => + a + 10 20 30 + 1000 1000 1000 + +### <Tensor-Type> setRandom() + +Fills the tensor with random values. Returns the tensor itself in case you +want to chain another call. 
+ + a.setRandom(); + cout << "Random: " << endl << a << endl << endl; + => + Random: + 0.680375 0.59688 -0.329554 0.10794 + -0.211234 0.823295 0.536459 -0.0452059 + 0.566198 -0.604897 -0.444451 0.257742 + +You can customize ```setRandom()``` by providing your own random number +generator as a template argument: + + a.setRandom<MyRandomGenerator>(); + +Here, ```MyRandomGenerator``` must be a struct with the following member +functions, where Scalar and Index are the same as ```<Tensor-Type>::Scalar``` +and ```<Tensor-Type>::Index```. + +See ```struct UniformRandomGenerator``` in TensorFunctors.h for an example. + + // Custom number generator for use with setRandom(). + struct MyRandomGenerator { + // Default and copy constructors. Both are needed + MyRandomGenerator() { } + MyRandomGenerator(const MyRandomGenerator& ) { } + + // Return a random value to be used. "element_location" is the + // location of the entry to set in the tensor, it can typically + // be ignored. + Scalar operator()(Eigen::DenseIndex element_location, + Eigen::DenseIndex /*unused*/ = 0) const { + return <randomly generated value of type T>; + } + + // Same as above but generates several numbers at a time. + typename internal::packet_traits<Scalar>::type packetOp( + Eigen::DenseIndex packet_location, Eigen::DenseIndex /*unused*/ = 0) const { + return <a packet of randomly generated values>; + } + }; + +You can also use one of the 2 random number generators that are part of the +tensor library: +* UniformRandomGenerator +* NormalRandomGenerator + + +## Data Access + +The Tensor, TensorFixedSize, and TensorRef classes provide the following +accessors to access the tensor coefficients: + + const Scalar& operator()(const array<Index, NumIndices>& indices) + const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) + Scalar& operator()(const array<Index, NumIndices>& indices) + Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) + +The number of indices must be equal to the rank of the tensor. Moreover, these +accessors are not available on tensor expressions. In order to access the +values of a tensor expression, the expression must either be evaluated or +wrapped in a TensorRef. + + +### Scalar* data() and const Scalar* data() const + +Returns a pointer to the storage for the tensor. The pointer is const if the +tensor was const. This allows direct access to the data. The layout of the +data depends on the tensor layout: RowMajor or ColMajor. + +This access is usually only needed for special cases, for example when mixing +Eigen Tensor code with other libraries. + +Scalar is the type of data stored in the tensor. + + Eigen::Tensor<float, 2> a(3, 4); + float* a_data = a.data(); + a_data[0] = 123.45f; + cout << "a(0, 0): " << a(0, 0); + => a(0, 0): 123.45 + + +## Tensor Operations + +All the methods documented below return non evaluated tensor ```Operations```. +These can be chained: you can apply another Tensor Operation to the value +returned by the method. + +The chain of Operation is evaluated lazily, typically when it is assigned to a +tensor. See "Controlling when Expressions are Evaluated" for more details about +their evaluation. + +### <Operation> constant(const Scalar& val) + +Returns a tensor of the same type and dimensions as the original tensor but +where all elements have the value ```val```. + +This is useful, for example, when you want to add or subtract a constant from a +tensor, or multiply every element of a tensor by a scalar. 
+ + Eigen::Tensor<float, 2> a(2, 3); + a.setConstant(1.0f); + Eigen::Tensor<float, 2> b = a + a.constant(2.0f); + Eigen::Tensor<float, 2> c = b * b.constant(0.2f); + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + cout << "c" << endl << c << endl << endl; + => + a + 1 1 1 + 1 1 1 + + b + 3 3 3 + 3 3 3 + + c + 0.6 0.6 0.6 + 0.6 0.6 0.6 + +### <Operation> random() + +Returns a tensor of the same type and dimensions as the current tensor +but where all elements have random values. + +This is for example useful to add random values to an existing tensor. +The generation of random values can be customized in the same manner +as for ```setRandom()```. + + Eigen::Tensor<float, 2> a(2, 3); + a.setConstant(1.0f); + Eigen::Tensor<float, 2> b = a + a.random(); + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + => + a + 1 1 1 + 1 1 1 + + b + 1.68038 1.5662 1.82329 + 0.788766 1.59688 0.395103 + + +## Unary Element Wise Operations + +All these operations take a single input tensor as argument and return a tensor +of the same type and dimensions as the tensor to which they are applied. The +requested operations are applied to each element independently. + +### <Operation> operator-() + +Returns a tensor of the same type and dimensions as the original tensor +containing the opposite values of the original tensor. + + Eigen::Tensor<float, 2> a(2, 3); + a.setConstant(1.0f); + Eigen::Tensor<float, 2> b = -a; + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + => + a + 1 1 1 + 1 1 1 + + b + -1 -1 -1 + -1 -1 -1 + +### <Operation> sqrt() + +Returns a tensor of the same type and dimensions as the original tensor +containing the square roots of the original tensor. + +### <Operation> rsqrt() + +Returns a tensor of the same type and dimensions as the original tensor +containing the inverse square roots of the original tensor. + +### <Operation> square() + +Returns a tensor of the same type and dimensions as the original tensor +containing the squares of the original tensor values. + +### <Operation> inverse() + +Returns a tensor of the same type and dimensions as the original tensor +containing the inverse of the original tensor values. + +### <Operation> exp() + +Returns a tensor of the same type and dimensions as the original tensor +containing the exponential of the original tensor. + +### <Operation> log() + +Returns a tensor of the same type and dimensions as the original tensor +containing the natural logarithms of the original tensor. + +### <Operation> abs() + +Returns a tensor of the same type and dimensions as the original tensor +containing the absolute values of the original tensor. + +### <Operation> pow(Scalar exponent) + +Returns a tensor of the same type and dimensions as the original tensor +containing the coefficients of the original tensor to the power of the +exponent. + +The type of the exponent, Scalar, is always the same as the type of the +tensor coefficients. For example, only integer exponents can be used in +conjuntion with tensors of integer values. + +You can use cast() to lift this restriction. 
For example this computes +cubic roots of an int Tensor: + + Eigen::Tensor<int, 2> a(2, 3); + a.setValues({{0, 1, 8}, {27, 64, 125}}); + Eigen::Tensor<double, 2> b = a.cast<double>().pow(1.0 / 3.0); + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + => + a + 0 1 8 + 27 64 125 + + b + 0 1 2 + 3 4 5 + +### <Operation> operator * (Scalar scale) + +Multiplies all the coefficients of the input tensor by the provided scale. + +### <Operation> cwiseMax(Scalar threshold) +TODO + +### <Operation> cwiseMin(Scalar threshold) +TODO + +### <Operation> unaryExpr(const CustomUnaryOp& func) +TODO + + +## Binary Element Wise Operations + +These operations take two input tensors as arguments. The 2 input tensors should +be of the same type and dimensions. The result is a tensor of the same +dimensions as the tensors to which they are applied, and unless otherwise +specified it is also of the same type. The requested operations are applied to +each pair of elements independently. + +### <Operation> operator+(const OtherDerived& other) + +Returns a tensor of the same type and dimensions as the input tensors +containing the coefficient wise sums of the inputs. + +### <Operation> operator-(const OtherDerived& other) + +Returns a tensor of the same type and dimensions as the input tensors +containing the coefficient wise differences of the inputs. + +### <Operation> operator*(const OtherDerived& other) + +Returns a tensor of the same type and dimensions as the input tensors +containing the coefficient wise products of the inputs. + +### <Operation> operator/(const OtherDerived& other) + +Returns a tensor of the same type and dimensions as the input tensors +containing the coefficient wise quotients of the inputs. + +This operator is not supported for integer types. + +### <Operation> cwiseMax(const OtherDerived& other) + +Returns a tensor of the same type and dimensions as the input tensors +containing the coefficient wise maximums of the inputs. + +### <Operation> cwiseMin(const OtherDerived& other) + +Returns a tensor of the same type and dimensions as the input tensors +containing the coefficient wise mimimums of the inputs. + +### <Operation> Logical operators + +The following logical operators are supported as well: + +* operator&&(const OtherDerived& other) +* operator||(const OtherDerived& other) +* operator<(const OtherDerived& other) +* operator<=(const OtherDerived& other) +* operator>(const OtherDerived& other) +* operator>=(const OtherDerived& other) +* operator==(const OtherDerived& other) +* operator!=(const OtherDerived& other) + +They all return a tensor of boolean values. + + +## Selection (select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) + +Selection is a coefficient-wise ternary operator that is the tensor equivalent +to the if-then-else operation. + + Tensor<bool, 3> if = ...; + Tensor<float, 3> then = ...; + Tensor<float, 3> else = ...; + Tensor<float, 3> result = if.select(then, else); + +The 3 arguments must be of the same dimensions, which will also be the dimension +of the result. The 'if' tensor must be of type boolean, the 'then' and the +'else' tensor must be of the same type, which will also be the type of the +result. + +Each coefficient in the result is equal to the corresponding coefficient in the +'then' tensor if the corresponding value in the 'if' tensor is true. If not, the +resulting coefficient will come from the 'else' tensor. 
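+
+Since ```if```, ```then``` and ```else``` are C++ keywords, the snippet above
+is only illustrative. A minimal sketch that compiles, using ordinary variable
+names chosen here purely for illustration, looks like this:
+
+    Eigen::Tensor<bool, 2> choose(2, 3);
+    choose.setConstant(true);
+    Eigen::Tensor<float, 2> first(2, 3);
+    Eigen::Tensor<float, 2> second(2, 3);
+    first.setConstant(1.0f);
+    second.setConstant(-1.0f);
+    // Every coefficient of "choose" is true, so "result" is a copy of "first".
+    Eigen::Tensor<float, 2> result = choose.select(first, second);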
+
+
+## Contraction
+
+Tensor *contractions* are a generalization of the matrix product to the
+multidimensional case.
+
+    // Create 2 matrices using tensors of rank 2
+    Eigen::Tensor<int, 2> a(2, 3);
+    a.setValues({{1, 2, 3}, {6, 5, 4}});
+    Eigen::Tensor<int, 2> b(3, 2);
+    b.setValues({{1, 2}, {4, 5}, {5, 6}});
+
+    // Compute the traditional matrix product
+    array<IndexPair<int>, 1> product_dims = { IndexPair<int>(1, 0) };
+    Eigen::Tensor<int, 2> AB = a.contract(b, product_dims);
+
+    // Compute the product of the transposes of the matrices
+    array<IndexPair<int>, 1> transpose_product_dims = { IndexPair<int>(0, 1) };
+    Eigen::Tensor<int, 2> AtBt = a.contract(b, transpose_product_dims);
+
+
+## Reduction Operations
+
+A *Reduction* operation returns a tensor with fewer dimensions than the
+original tensor. The values in the returned tensor are computed by applying a
+*reduction operator* to slices of values from the original tensor. You specify
+the dimensions along which the slices are made.
+
+The Eigen Tensor library provides a set of predefined reduction operators such
+as ```maximum()``` and ```sum()``` and lets you define additional operators by
+implementing a few methods from a reductor template.
+
+### Reduction Dimensions
+
+All reduction operations take a single parameter of type
+```<TensorType>::Dimensions``` which can always be specified as an array of
+ints. These are called the "reduction dimensions." The values are the indices
+of the dimensions of the input tensor over which the reduction is done. The
+parameter can have at most as many elements as the rank of the input tensor;
+each element must be less than the tensor rank, as it indicates one of the
+dimensions to reduce.
+
+Each dimension of the input tensor should occur at most once in the reduction
+dimensions as the implementation does not remove duplicates.
+
+The order of the values in the reduction dimensions does not affect the
+results, but the code may execute faster if you list the dimensions in
+increasing order.
+
+Example: Reduction along one dimension.
+
+    // Create a tensor of 2 dimensions
+    Eigen::Tensor<int, 2> a(2, 3);
+    a.setValues({{1, 2, 3}, {6, 5, 4}});
+    // Reduce it along the second dimension (1)...
+    Eigen::array<int, 1> dims({1 /* dimension to reduce */});
+    // ...using the "maximum" operator.
+    // The result is a tensor with one dimension. The size of
+    // that dimension is the same as the first (non-reduced) dimension of a.
+    Eigen::Tensor<int, 1> b = a.maximum(dims);
+    cout << "a" << endl << a << endl << endl;
+    cout << "b" << endl << b << endl << endl;
+    =>
+    a
+    1 2 3
+    6 5 4
+
+    b
+    3
+    6
+
+Example: Reduction along two dimensions.
+
+    Eigen::Tensor<float, 3, Eigen::ColMajor> a(2, 3, 4);
+    a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f},
+                  {7.0f, 6.0f, 5.0f, 4.0f},
+                  {8.0f, 9.0f, 10.0f, 11.0f}},
+                 {{12.0f, 13.0f, 14.0f, 15.0f},
+                  {19.0f, 18.0f, 17.0f, 16.0f},
+                  {20.0f, 21.0f, 22.0f, 23.0f}}});
+    // The tensor a has 3 dimensions. We reduce along the
+    // first 2, resulting in a tensor with a single dimension
+    // of size 4 (the last dimension of a.)
+    // Note that we pass the array of reduction dimensions
+    // directly to the maximum() call.
+    Eigen::Tensor<float, 1, Eigen::ColMajor> b =
+        a.maximum(Eigen::array<int, 2>({0, 1}));
+    cout << "b" << endl << b << endl << endl;
+    =>
+    b
+    20
+    21
+    22
+    23
+
+#### Reduction along all dimensions
+
+As a special case, if you pass no parameter to a reduction operation the
+original tensor is reduced along *all* its dimensions.
The result is a +one-dimension tensor with a single value. + + Eigen::Tensor<float, 3> a(2, 3, 4); + a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f}, + {7.0f, 6.0f, 5.0f, 4.0f}, + {8.0f, 9.0f, 10.0f, 11.0f}}, + {{12.0f, 13.0f, 14.0f, 15.0f}, + {19.0f, 18.0f, 17.0f, 16.0f}, + {20.0f, 21.0f, 22.0f, 23.0f}}}); + // Reduce along all dimensions using the sum() operator. + Eigen::Tensor<float, 1> b = a.sum(); + cout << "b" << endl << b << endl << endl; + => + b + 276 + + +### <Operation> sum(const Dimensions& new_dims) +### <Operation> sum() + +Reduce a tensor using the sum() operator. The resulting values +are the sum of the reduced values. + +### <Operation> mean(const Dimensions& new_dims) +### <Operation> mean() + +Reduce a tensor using the mean() operator. The resulting values +are the mean of the reduced values. + +### <Operation> maximum(const Dimensions& new_dims) +### <Operation> maximum() + +Reduce a tensor using the maximum() operator. The resulting values are the +largest of the reduced values. + +### <Operation> minimum(const Dimensions& new_dims) +### <Operation> minimum() + +Reduce a tensor using the minimum() operator. The resulting values +are the smallest of the reduced values. + +### <Operation> prod(const Dimensions& new_dims) +### <Operation> prod() + +Reduce a tensor using the prod() operator. The resulting values +are the product of the reduced values. + +### <Operation> all(const Dimensions& new_dims) +### <Operation> all() +Reduce a tensor using the all() operator. Casts tensor to bool and then checks +whether all elements are true. Runs through all elements rather than +short-circuiting, so may be significantly inefficient. + +### <Operation> any(const Dimensions& new_dims) +### <Operation> any() +Reduce a tensor using the any() operator. Casts tensor to bool and then checks +whether any element is true. Runs through all elements rather than +short-circuiting, so may be significantly inefficient. + +### <Operation> reduce(const Dimensions& new_dims, const Reducer& reducer) + +Reduce a tensor using a user-defined reduction operator. See ```SumReducer``` +in TensorFunctors.h for information on how to implement a reduction operator. + + +## Convolutions + +### <Operation> convolve(const KernelDerived& kernel, const Dimensions& dims) + +Returns a tensor that is the output of the convolution of the of the input tensor with the kernel, +along the specified dimensions of the input tensor. The dimension size for dimensions of the output tensor +which were part of the convolution will be reduced by the formula: +output_dim_size = input_dim_size - kernel_dim_size + 1 (requires: input_dim_size >= kernel_dim_size). +The dimension sizes for dimensions that were not part of the convolution will remain the same. +Performance of the convolution can depend on the length of the stride(s) of the input tensor dimension(s) along which the +convolution is computed (the first dimension has the shortest stride for ColMajor, whereas RowMajor's shortest stride is +for the last dimension). + + // Compute convolution along the second and third dimension. + Tensor<float, 4, DataLayout> input(3, 3, 7, 11); + Tensor<float, 2, DataLayout> kernel(2, 2); + Tensor<float, 4, DataLayout> output(3, 2, 6, 11); + input.setRandom(); + kernel.setRandom(); + + Eigen::array<ptrdiff_t, 2> dims({1, 2}); // Specify second and third dimension for convolution. 
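+    // Along each convolved dimension the output size is
+    // input_dim_size - kernel_dim_size + 1: here 3 - 2 + 1 = 2 and
+    // 7 - 2 + 1 = 6, matching the declared dimensions of "output".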
+ output = input.convolve(kernel, dims); + + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 6; ++k) { + for (int l = 0; l < 11; ++l) { + const float result = output(i,j,k,l); + const float expected = input(i,j+0,k+0,l) * kernel(0,0) + + input(i,j+1,k+0,l) * kernel(1,0) + + input(i,j+0,k+1,l) * kernel(0,1) + + input(i,j+1,k+1,l) * kernel(1,1); + VERIFY_IS_APPROX(result, expected); + } + } + } + } + + + +## Geometrical Operations + +These operations return a Tensor with different dimensions than the original +Tensor. They can be used to access slices of tensors, see them with different +dimensions, or pad tensors with additional data. + +### <Operation> reshape(const Dimensions& new_dims) + +Returns a view of the input tensor that has been reshaped to the specified +new dimensions. The argument new_dims is an array of Index values. The +rank of the resulting tensor is equal to the number of elements in new_dims. + +The product of all the sizes in the new dimension array must be equal to +the number of elements in the input tensor. + + // Increase the rank of the input tensor by introducing a new dimension + // of size 1. + Tensor<float, 2> input(7, 11); + array<int, 3> three_dims{{7, 11, 1}}; + Tensor<float, 3> result = input.reshape(three_dims); + + // Decrease the rank of the input tensor by merging 2 dimensions; + array<int, 1> one_dim{{7 * 11}}; + Tensor<float, 1> result = input.reshape(one_dim); + +This operation does not move any data in the input tensor, so the resulting +contents of a reshaped Tensor depend on the data layout of the original Tensor. + +For example this is what happens when you ```reshape()``` a 2D ColMajor tensor +to one dimension: + + Eigen::Tensor<float, 2, Eigen::ColMajor> a(2, 3); + a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}}); + Eigen::array<Eigen::DenseIndex, 1> one_dim({3 * 2}); + Eigen::Tensor<float, 1, Eigen::ColMajor> b = a.reshape(one_dim); + cout << "b" << endl << b << endl; + => + b + 0 + 300 + 100 + 400 + 200 + 500 + +This is what happens when the 2D Tensor is RowMajor: + + Eigen::Tensor<float, 2, Eigen::RowMajor> a(2, 3); + a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}}); + Eigen::array<Eigen::DenseIndex, 1> one_dim({3 * 2}); + Eigen::Tensor<float, 1, Eigen::RowMajor> b = a.reshape(one_dim); + cout << "b" << endl << b << endl; + => + b + 0 + 100 + 200 + 300 + 400 + 500 + +The reshape operation is a lvalue. In other words, it can be used on the left +side of the assignment operator. + +The previous example can be rewritten as follow: + + Eigen::Tensor<float, 2, Eigen::ColMajor> a(2, 3); + a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}}); + Eigen::array<Eigen::DenseIndex, 2> two_dim({2, 3}); + Eigen::Tensor<float, 1, Eigen::ColMajor> b; + b.reshape(two_dim) = a; + cout << "b" << endl << b << endl; + => + b + 0 + 300 + 100 + 400 + 200 + 500 + +Note that "b" itself was not reshaped but that instead the assignment is done to +the reshape view of b. + + +### <Operation> shuffle(const Shuffle& shuffle) + +Returns a copy of the input tensor whose dimensions have been +reordered according to the specified permutation. The argument shuffle +is an array of Index values. Its size is the rank of the input +tensor. It must contain a permutation of 0, 1, ..., rank - 1. The i-th +dimension of the output tensor equals to the size of the shuffle[i]-th +dimension of the input tensor. For example: + + // Shuffle all dimensions to the left by 1. 
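+    // Output dimension i takes the size of input dimension shuffle[i]:
+    // (30, 50, 20) for the permutation {1, 2, 0}.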
+ Tensor<float, 3> input(20, 30, 50); + // ... set some values in input. + Tensor<float, 3> output = input.shuffle({1, 2, 0}) + + eigen_assert(output.dimension(0) == 30); + eigen_assert(output.dimension(1) == 50); + eigen_assert(output.dimension(2) == 20); + +Indices into the output tensor are shuffled accordingly to formulate +indices into the input tensor. For example, one can assert in the above +code snippet that: + + eigen_assert(output(3, 7, 11) == input(11, 3, 7)); + +In general, one can assert that + + eigen_assert(output(..., indices[shuffle[i]], ...) == + input(..., indices[i], ...)) + +The shuffle operation results in a lvalue, which means that it can be assigned +to. In other words, it can be used on the left side of the assignment operator. + +Let's rewrite the previous example to take advantage of this feature: + + // Shuffle all dimensions to the left by 1. + Tensor<float, 3> input(20, 30, 50); + // ... set some values in input. + Tensor<float, 3> output(30, 50, 20); + output.shuffle({2, 0, 1}) = input; + + +### <Operation> stride(const Strides& strides) + +Returns a view of the input tensor that strides (skips stride-1 +elements) along each of the dimensions. The argument strides is an +array of Index values. The dimensions of the resulting tensor are +ceil(input_dimensions[i] / strides[i]). + +For example this is what happens when you ```stride()``` a 2D tensor: + + Eigen::Tensor<int, 2> a(4, 3); + a.setValues({{0, 100, 200}, {300, 400, 500}, {600, 700, 800}, {900, 1000, 1100}}); + Eigen::array<Eigen::DenseIndex, 2> strides({3, 2}); + Eigen::Tensor<int, 2> b = a.stride(strides); + cout << "b" << endl << b << endl; + => + b + 0 200 + 900 1100 + +It is possible to assign a tensor to a stride: + Tensor<float, 3> input(20, 30, 50); + // ... set some values in input. + Tensor<float, 3> output(40, 90, 200); + output.stride({2, 3, 4}) = input; + +### <Operation> inflate(const Strides& strides) + +Returns a view of an "inflated" tensor of the input tensor by inserting zeros +between the original elements in the input tensor. The argument strides is an +array of Index values, indicating how much "inflation" there is. The dimensions + of the resulting tensor are (input_dimensions[i] - 1) * strides[i] + 1. In +some sense it is the inverse of the ```stride()``` operation. + +For example this is what happens when you ```inflate()``` a 2D tensor: + + Eigen::Tensor<int, 2> a(2, 3); + a.setValues({{0, 100, 200}, {300, 400, 500}}); + Eigen::array<Eigen::DenseIndex, 2> strides({3, 2}); + Eigen::Tensor<int, 2> b = a.inflate(strides); + cout << "b" << endl << b << endl; + => + b + 0 0 0 100 0 0 200 + 0 0 0 0 0 0 0 + 300 0 0 400 0 0 500 + +The ```inflate()``` operation is an r-value only operation as it doesn't make +sense to assign a value to an inflated tensor in positions where the values are +hardwired to zero. + +### <Operation> slice(const StartIndices& offsets, const Sizes& extents) + +Returns a sub-tensor of the given tensor. For each dimension i, the slice is +made of the coefficients stored between offset[i] and offset[i] + extents[i] in +the input tensor. 
+
+    Eigen::Tensor<int, 2> a(4, 3);
+    a.setValues({{0, 100, 200}, {300, 400, 500},
+                 {600, 700, 800}, {900, 1000, 1100}});
+    Eigen::array<int, 2> offsets = {1, 0};
+    Eigen::array<int, 2> extents = {2, 2};
+    Eigen::Tensor<int, 2> slice = a.slice(offsets, extents);
+    cout << "a" << endl << a << endl;
+    =>
+    a
+    0 100 200
+    300 400 500
+    600 700 800
+    900 1000 1100
+    cout << "slice" << endl << slice << endl;
+    =>
+    slice
+    300 400
+    600 700
+
+
+### <Operation> chip(const Index offset, const Index dim)
+
+A chip is a special kind of slice. It is the subtensor at the given offset in
+the dimension dim. The returned tensor has one fewer dimension than the input
+tensor: the dimension dim is removed.
+
+For example, a matrix chip would be either a row or a column of the input
+matrix.
+
+    Eigen::Tensor<int, 2> a(4, 3);
+    a.setValues({{0, 100, 200}, {300, 400, 500},
+                 {600, 700, 800}, {900, 1000, 1100}});
+    Eigen::Tensor<int, 1> row_3 = a.chip(2, 0);
+    Eigen::Tensor<int, 1> col_2 = a.chip(1, 1);
+    cout << "a" << endl << a << endl;
+    =>
+    a
+    0 100 200
+    300 400 500
+    600 700 800
+    900 1000 1100
+    cout << "row_3" << endl << row_3 << endl;
+    =>
+    row_3
+    600 700 800
+    cout << "col_2" << endl << col_2 << endl;
+    =>
+    col_2
+    100 400 700 1000
+
+It is possible to assign values to a tensor chip since the chip operation is an
+lvalue. For example:
+
+    Eigen::Tensor<int, 1> a(3);
+    a.setValues({{100, 200, 300}});
+    Eigen::Tensor<int, 2> b(2, 3);
+    b.setZero();
+    b.chip(0, 0) = a;
+    cout << "a" << endl << a << endl;
+    =>
+    a
+    100
+    200
+    300
+    cout << "b" << endl << b << endl;
+    =>
+    b
+    100 200 300
+    0 0 0
+
+
+### <Operation> reverse(const ReverseDimensions& reverse)
+
+Returns a view of the input tensor that reverses the order of the coefficients
+along a subset of the dimensions. The argument reverse is an array of boolean
+values that indicates whether or not the order of the coefficients should be
+reversed along each of the dimensions. This operation preserves the dimensions
+of the input tensor.
+
+For example this is what happens when you ```reverse()``` the first dimension
+of a 2D tensor:
+
+    Eigen::Tensor<int, 2> a(4, 3);
+    a.setValues({{0, 100, 200}, {300, 400, 500},
+                 {600, 700, 800}, {900, 1000, 1100}});
+    Eigen::array<bool, 2> reverse({true, false});
+    Eigen::Tensor<int, 2> b = a.reverse(reverse);
+    cout << "a" << endl << a << endl << "b" << endl << b << endl;
+    =>
+    a
+    0 100 200
+    300 400 500
+    600 700 800
+    900 1000 1100
+    b
+    900 1000 1100
+    600 700 800
+    300 400 500
+    0 100 200
+
+
+### <Operation> broadcast(const Broadcast& broadcast)
+
+Returns a view of the input tensor in which the input is replicated one or
+more times along each of its dimensions.
+The broadcast argument specifies how many copies of the input tensor need to be
+made in each of the dimensions.
+
+    Eigen::Tensor<int, 2> a(2, 3);
+    a.setValues({{0, 100, 200}, {300, 400, 500}});
+    Eigen::array<int, 2> bcast({3, 2});
+    Eigen::Tensor<int, 2> b = a.broadcast(bcast);
+    cout << "a" << endl << a << endl << "b" << endl << b << endl;
+    =>
+    a
+    0 100 200
+    300 400 500
+    b
+    0 100 200 0 100 200
+    300 400 500 300 400 500
+    0 100 200 0 100 200
+    300 400 500 300 400 500
+    0 100 200 0 100 200
+    300 400 500 300 400 500
+
+### <Operation> concatenate(const OtherDerived& other, Axis axis)
+
+TODO
+
+### <Operation> pad(const PaddingDimensions& padding)
+
+Returns a view of the input tensor in which the input is padded with zeros.
+ + Eigen::Tensor<int, 2> a(2, 3); + a.setValues({{0, 100, 200}, {300, 400, 500}}); + Eigen::array<std::pair<int, int>, 2> paddings; + paddings[0] = make_pair(0, 1); + paddings[1] = make_pair(2, 3); + Eigen::Tensor<int, 2> b = a.pad(paddings); + cout << "a" << endl << a << endl << "b" << endl << b << endl; + => + a + 0 100 200 + 300 400 500 + b + 0 0 0 0 + 0 0 0 0 + 0 100 200 0 + 300 400 500 0 + 0 0 0 0 + 0 0 0 0 + 0 0 0 0 + + +### <Operation> extract_patches(const PatchDims& patch_dims) + +Returns a tensor of coefficient patches extracted from the input tensor, where +each patch is of dimension specified by 'patch_dims'. The returned tensor has +one greater dimension than the input tensor, which is used to index each patch. +The patch index in the output tensor depends on the data layout of the input +tensor: the patch index is the last dimension ColMajor layout, and the first +dimension in RowMajor layout. + +For example, given the following input tensor: + + Eigen::Tensor<float, 2, DataLayout> tensor(3,4); + tensor.setValues({{0.0f, 1.0f, 2.0f, 3.0f}, + {4.0f, 5.0f, 6.0f, 7.0f}, + {8.0f, 9.0f, 10.0f, 11.0f}}); + + cout << "tensor: " << endl << tensor << endl; + => + tensor: + 0 1 2 3 + 4 5 6 7 + 8 9 10 11 + +Six 2x2 patches can be extracted and indexed using the following code: + + Eigen::Tensor<float, 3, DataLayout> patch; + Eigen::array<ptrdiff_t, 2> patch_dims; + patch_dims[0] = 2; + patch_dims[1] = 2; + patch = tensor.extract_patches(patch_dims); + for (int k = 0; k < 6; ++k) { + cout << "patch index: " << k << endl; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + if (DataLayout == ColMajor) { + cout << patch(i, j, k) << " "; + } else { + cout << patch(k, i, j) << " "; + } + } + cout << endl; + } + } + +This code results in the following output when the data layout is ColMajor: + + patch index: 0 + 0 1 + 4 5 + patch index: 1 + 4 5 + 8 9 + patch index: 2 + 1 2 + 5 6 + patch index: 3 + 5 6 + 9 10 + patch index: 4 + 2 3 + 6 7 + patch index: 5 + 6 7 + 10 11 + +This code results in the following output when the data layout is RowMajor: +(NOTE: the set of patches is the same as in ColMajor, but are indexed differently). + + patch index: 0 + 0 1 + 4 5 + patch index: 1 + 1 2 + 5 6 + patch index: 2 + 2 3 + 6 7 + patch index: 3 + 4 5 + 8 9 + patch index: 4 + 5 6 + 9 10 + patch index: 5 + 6 7 + 10 11 + +### <Operation> extract_image_patches(const Index patch_rows, const Index patch_cols, + const Index row_stride, const Index col_stride, + const Index in_row_stride, const Index in_col_stride, + const Index row_inflate_stride, const Index col_inflate_stride, + const PaddingType padding_type, const Scalar padding_value) + +Returns a tensor of coefficient image patches extracted from the input tensor, +which is expected to have dimensions ordered as follows (depending on the data +layout of the input tensor, and the number of additional dimensions 'N'): + +* ColMajor + * 1st dimension: channels (of size d) + * 2nd dimension: rows (of size r) + * 3rd dimension: columns (of size c) + * 4th-Nth dimension: time (for video) or batch (for bulk processing). + +* RowMajor (reverse order of ColMajor) + * 1st-Nth dimension: time (for video) or batch (for bulk processing). + * N+1'th dimension: columns (of size c) + * N+2'th dimension: rows (of size r) + * N+3'th dimension: channels (of size d) + +The returned tensor has one greater dimension than the input tensor, which is +used to index each patch. 
The patch index in the output tensor depends on the +data layout of the input tensor: the patch index is the 4'th dimension in +ColMajor layout, and the 4'th from the last dimension in RowMajor layout. + +For example, given the following input tensor with the following dimension +sizes: + +* depth: 2 +* rows: 3 +* columns: 5 +* batch: 7 + + Tensor<float, 4> tensor(2,3,5,7); + Tensor<float, 4, RowMajor> tensor_row_major = tensor.swap_layout(); + +2x2 image patches can be extracted and indexed using the following code: + +* 2D patch: ColMajor (patch indexed by second-to-last dimension) + + Tensor<float, 5> twod_patch; + twod_patch = tensor.extract_image_patches<2, 2>(); + // twod_patch.dimension(0) == 2 + // twod_patch.dimension(1) == 2 + // twod_patch.dimension(2) == 2 + // twod_patch.dimension(3) == 3*5 + // twod_patch.dimension(4) == 7 + +* 2D patch: RowMajor (patch indexed by the second dimension) + + Tensor<float, 5, RowMajor> twod_patch_row_major; + twod_patch_row_major = tensor_row_major.extract_image_patches<2, 2>(); + // twod_patch_row_major.dimension(0) == 7 + // twod_patch_row_major.dimension(1) == 3*5 + // twod_patch_row_major.dimension(2) == 2 + // twod_patch_row_major.dimension(3) == 2 + // twod_patch_row_major.dimension(4) == 2 + +Input parameters: + +* patch_rows, patch_cols: Spatial extent of the extracted patches. +* row_stride, col_stride: Image Displacement (in pixels) between the + upper-left coordinates of consecutive patches. +* in_row_stride, in_col_stride: Image displacement (in pixels) between + two consecutive patch samples. If larger than 1 (default), they allow + for sparsely sampling the input image. +* row_inflate_stride, col_inflate_stride: If larger than 1 (default), "inflates" + the inputs by inserting zeros between the original elements. This is useful + for backward convolution. +* padding_type: Boundary conditions. Either PADDING_SAME (default) + or PADDING_VALID. +* padding_value: the value used in padding, defaults to 0. + +## Special Operations + +### <Operation> cast<T>() + +Returns a tensor of type T with the same dimensions as the original tensor. +The returned tensor contains the values of the original tensor converted to +type T. + + Eigen::Tensor<float, 2> a(2, 3); + Eigen::Tensor<int, 2> b = a.cast<int>(); + +This can be useful for example if you need to do element-wise division of +Tensors of integers. This is not currently supported by the Tensor library +but you can easily cast the tensors to floats to do the division: + + Eigen::Tensor<int, 2> a(2, 3); + a.setValues({{0, 1, 2}, {3, 4, 5}}); + Eigen::Tensor<int, 2> b = + (a.cast<float>() / a.constant(2).cast<float>()).cast<int>(); + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + => + a + 0 1 2 + 3 4 5 + + b + 0 0 1 + 1 2 2 + + +### <Operation> eval() + +TODO + + +## Representation of scalar values + +Scalar values are often represented by tensors of size 1 and rank 1. It would be +more logical and user friendly to use tensors of rank 0 instead. For example +Tensor<T, N>::maximum() currently returns a Tensor<T, 1>. Similarly, the inner +product of 2 1d tensors (through contractions) returns a 1d tensor. In the +future these operations might be updated to return 0d tensors instead. + +## GPU Support + +NVidia GPU support can be enabled using: + + #define EIGEN_USE_GPU + +To speedup operations on GPU, it is also recommended to use 32 bit indices. 
This +prevents Eigen from using 64 bit loop indices, which have to be emulated in +software and make any operation extremely slow. + +This can be achieved globally by using the EIGEN_DEFAULT_DENSE_INDEX_TYPE define +as follow: + + #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int + +This can also be done individually for each tensor by using the Index32Bit +option as follow: + + Eigen::Tensor<DataType, Rank, Eigen::Index32Bit> t; + Eigen::TensorMap<Eigen::Tensor<DataType, Rank, Eigen::Index32Bit> > t_map; + + +## Limitations + +* The number of tensor dimensions is currently limited to 250 when using a + compiler that supports cxx11. It is limited to only 5 for older compilers. +* The IndexList class requires a cxx11 compliant compiler. You can use an + array of indices instead if you don't have access to a modern compiler. +* TensorVarDims are only partially supported +* On GPUs only floating point values are properly tested and optimized for. +* Complex and integer values are known to be broken on GPUs. If you try to use + them you'll most likely end up triggering a static assertion failure such as + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h new file mode 100644 index 0000000000..13cb2157f2 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h @@ -0,0 +1,293 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler <christian@iwakd.de> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H +#define EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H + +namespace Eigen { + +class DynamicSGroup +{ + public: + inline explicit DynamicSGroup() : m_numIndices(1), m_elements(), m_generators(), m_globalFlags(0) { m_elements.push_back(ge(Generator(0, 0, 0))); } + inline DynamicSGroup(const DynamicSGroup& o) : m_numIndices(o.m_numIndices), m_elements(o.m_elements), m_generators(o.m_generators), m_globalFlags(o.m_globalFlags) { } + inline DynamicSGroup(DynamicSGroup&& o) : m_numIndices(o.m_numIndices), m_elements(), m_generators(o.m_generators), m_globalFlags(o.m_globalFlags) { std::swap(m_elements, o.m_elements); } + inline DynamicSGroup& operator=(const DynamicSGroup& o) { m_numIndices = o.m_numIndices; m_elements = o.m_elements; m_generators = o.m_generators; m_globalFlags = o.m_globalFlags; return *this; } + inline DynamicSGroup& operator=(DynamicSGroup&& o) { m_numIndices = o.m_numIndices; std::swap(m_elements, o.m_elements); m_generators = o.m_generators; m_globalFlags = o.m_globalFlags; return *this; } + + void add(int one, int two, int flags = 0); + + template<typename Gen_> + inline void add(Gen_) { add(Gen_::One, Gen_::Two, Gen_::Flags); } + inline void addSymmetry(int one, int two) { add(one, two, 0); } + inline void addAntiSymmetry(int one, int two) { add(one, two, NegationFlag); } + inline void addHermiticity(int one, int two) { add(one, two, ConjugationFlag); } + inline void addAntiHermiticity(int one, int two) { add(one, two, NegationFlag | ConjugationFlag); } + + template<typename Op, typename RV, typename Index, std::size_t N, typename... 
Args> + inline RV apply(const std::array<Index, N>& idx, RV initial, Args&&... args) const + { + eigen_assert(N >= m_numIndices && "Can only apply symmetry group to objects that have at least the required number of indices."); + for (std::size_t i = 0; i < size(); i++) + initial = Op::run(h_permute(i, idx, typename internal::gen_numeric_list<int, N>::type()), m_elements[i].flags, initial, std::forward<Args>(args)...); + return initial; + } + + template<typename Op, typename RV, typename Index, typename... Args> + inline RV apply(const std::vector<Index>& idx, RV initial, Args&&... args) const + { + eigen_assert(idx.size() >= m_numIndices && "Can only apply symmetry group to objects that have at least the required number of indices."); + for (std::size_t i = 0; i < size(); i++) + initial = Op::run(h_permute(i, idx), m_elements[i].flags, initial, std::forward<Args>(args)...); + return initial; + } + + inline int globalFlags() const { return m_globalFlags; } + inline std::size_t size() const { return m_elements.size(); } + + template<typename Tensor_, typename... IndexTypes> + inline internal::tensor_symmetry_value_setter<Tensor_, DynamicSGroup> operator()(Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... otherIndices) const + { + static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + return operator()(tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices>{{firstIndex, otherIndices...}}); + } + + template<typename Tensor_> + inline internal::tensor_symmetry_value_setter<Tensor_, DynamicSGroup> operator()(Tensor_& tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices> const& indices) const + { + return internal::tensor_symmetry_value_setter<Tensor_, DynamicSGroup>(tensor, *this, indices); + } + private: + struct GroupElement { + std::vector<int> representation; + int flags; + bool isId() const + { + for (std::size_t i = 0; i < representation.size(); i++) + if (i != (size_t)representation[i]) + return false; + return true; + } + }; + struct Generator { + int one; + int two; + int flags; + constexpr inline Generator(int one_, int two_, int flags_) : one(one_), two(two_), flags(flags_) {} + }; + + std::size_t m_numIndices; + std::vector<GroupElement> m_elements; + std::vector<Generator> m_generators; + int m_globalFlags; + + template<typename Index, std::size_t N, int... n> + inline std::array<Index, N> h_permute(std::size_t which, const std::array<Index, N>& idx, internal::numeric_list<int, n...>) const + { + return std::array<Index, N>{{ idx[n >= m_numIndices ? n : m_elements[which].representation[n]]... 
}}; + } + + template<typename Index> + inline std::vector<Index> h_permute(std::size_t which, std::vector<Index> idx) const + { + std::vector<Index> result; + result.reserve(idx.size()); + for (auto k : m_elements[which].representation) + result.push_back(idx[k]); + for (std::size_t i = m_numIndices; i < idx.size(); i++) + result.push_back(idx[i]); + return result; + } + + inline GroupElement ge(Generator const& g) const + { + GroupElement result; + result.representation.reserve(m_numIndices); + result.flags = g.flags; + for (std::size_t k = 0; k < m_numIndices; k++) { + if (k == (std::size_t)g.one) + result.representation.push_back(g.two); + else if (k == (std::size_t)g.two) + result.representation.push_back(g.one); + else + result.representation.push_back(int(k)); + } + return result; + } + + GroupElement mul(GroupElement, GroupElement) const; + inline GroupElement mul(Generator g1, GroupElement g2) const + { + return mul(ge(g1), g2); + } + + inline GroupElement mul(GroupElement g1, Generator g2) const + { + return mul(g1, ge(g2)); + } + + inline GroupElement mul(Generator g1, Generator g2) const + { + return mul(ge(g1), ge(g2)); + } + + inline int findElement(GroupElement e) const + { + for (auto ee : m_elements) { + if (ee.representation == e.representation) + return ee.flags ^ e.flags; + } + return -1; + } + + void updateGlobalFlags(int flagDiffOfSameGenerator); +}; + +// dynamic symmetry group that auto-adds the template parameters in the constructor +template<typename... Gen> +class DynamicSGroupFromTemplateArgs : public DynamicSGroup +{ + public: + inline DynamicSGroupFromTemplateArgs() : DynamicSGroup() + { + add_all(internal::type_list<Gen...>()); + } + inline DynamicSGroupFromTemplateArgs(DynamicSGroupFromTemplateArgs const& other) : DynamicSGroup(other) { } + inline DynamicSGroupFromTemplateArgs(DynamicSGroupFromTemplateArgs&& other) : DynamicSGroup(other) { } + inline DynamicSGroupFromTemplateArgs<Gen...>& operator=(const DynamicSGroupFromTemplateArgs<Gen...>& o) { DynamicSGroup::operator=(o); return *this; } + inline DynamicSGroupFromTemplateArgs<Gen...>& operator=(DynamicSGroupFromTemplateArgs<Gen...>&& o) { DynamicSGroup::operator=(o); return *this; } + + private: + template<typename Gen1, typename... GenNext> + inline void add_all(internal::type_list<Gen1, GenNext...>) + { + add(Gen1()); + add_all(internal::type_list<GenNext...>()); + } + + inline void add_all(internal::type_list<>) + { + } +}; + +inline DynamicSGroup::GroupElement DynamicSGroup::mul(GroupElement g1, GroupElement g2) const +{ + eigen_internal_assert(g1.representation.size() == m_numIndices); + eigen_internal_assert(g2.representation.size() == m_numIndices); + + GroupElement result; + result.representation.reserve(m_numIndices); + for (std::size_t i = 0; i < m_numIndices; i++) { + int v = g2.representation[g1.representation[i]]; + eigen_assert(v >= 0); + result.representation.push_back(v); + } + result.flags = g1.flags ^ g2.flags; + return result; +} + +inline void DynamicSGroup::add(int one, int two, int flags) +{ + eigen_assert(one >= 0); + eigen_assert(two >= 0); + eigen_assert(one != two); + + if ((std::size_t)one >= m_numIndices || (std::size_t)two >= m_numIndices) { + std::size_t newNumIndices = (one > two) ? 
one : two + 1; + for (auto& gelem : m_elements) { + gelem.representation.reserve(newNumIndices); + for (std::size_t i = m_numIndices; i < newNumIndices; i++) + gelem.representation.push_back(i); + } + m_numIndices = newNumIndices; + } + + Generator g{one, two, flags}; + GroupElement e = ge(g); + + /* special case for first generator */ + if (m_elements.size() == 1) { + while (!e.isId()) { + m_elements.push_back(e); + e = mul(e, g); + } + + if (e.flags > 0) + updateGlobalFlags(e.flags); + + // only add in case we didn't have identity + if (m_elements.size() > 1) + m_generators.push_back(g); + return; + } + + int p = findElement(e); + if (p >= 0) { + updateGlobalFlags(p); + return; + } + + std::size_t coset_order = m_elements.size(); + m_elements.push_back(e); + for (std::size_t i = 1; i < coset_order; i++) + m_elements.push_back(mul(m_elements[i], e)); + m_generators.push_back(g); + + std::size_t coset_rep = coset_order; + do { + for (auto g : m_generators) { + e = mul(m_elements[coset_rep], g); + p = findElement(e); + if (p < 0) { + // element not yet in group + m_elements.push_back(e); + for (std::size_t i = 1; i < coset_order; i++) + m_elements.push_back(mul(m_elements[i], e)); + } else if (p > 0) { + updateGlobalFlags(p); + } + } + coset_rep += coset_order; + } while (coset_rep < m_elements.size()); +} + +inline void DynamicSGroup::updateGlobalFlags(int flagDiffOfSameGenerator) +{ + switch (flagDiffOfSameGenerator) { + case 0: + default: + // nothing happened + break; + case NegationFlag: + // every element is it's own negative => whole tensor is zero + m_globalFlags |= GlobalZeroFlag; + break; + case ConjugationFlag: + // every element is it's own conjugate => whole tensor is real + m_globalFlags |= GlobalRealFlag; + break; + case (NegationFlag | ConjugationFlag): + // every element is it's own negative conjugate => whole tensor is imaginary + m_globalFlags |= GlobalImagFlag; + break; + /* NOTE: + * since GlobalZeroFlag == GlobalRealFlag | GlobalImagFlag, if one generator + * causes the tensor to be real and the next one to be imaginary, this will + * trivially give the correct result + */ + } +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H + +/* + * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; + */ diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h new file mode 100644 index 0000000000..942293bd71 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h @@ -0,0 +1,236 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler <christian@iwakd.de> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H +#define EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H + +namespace Eigen { + +namespace internal { + +template<typename list> struct tensor_static_symgroup_permutate; + +template<int... 
nn> +struct tensor_static_symgroup_permutate<numeric_list<int, nn...>> +{ + constexpr static std::size_t N = sizeof...(nn); + + template<typename T> + constexpr static inline std::array<T, N> run(const std::array<T, N>& indices) + { + return {{indices[nn]...}}; + } +}; + +template<typename indices_, int flags_> +struct tensor_static_symgroup_element +{ + typedef indices_ indices; + constexpr static int flags = flags_; +}; + +template<typename Gen, int N> +struct tensor_static_symgroup_element_ctor +{ + typedef tensor_static_symgroup_element< + typename gen_numeric_list_swapped_pair<int, N, Gen::One, Gen::Two>::type, + Gen::Flags + > type; +}; + +template<int N> +struct tensor_static_symgroup_identity_ctor +{ + typedef tensor_static_symgroup_element< + typename gen_numeric_list<int, N>::type, + 0 + > type; +}; + +template<typename iib> +struct tensor_static_symgroup_multiply_helper +{ + template<int... iia> + constexpr static inline numeric_list<int, get<iia, iib>::value...> helper(numeric_list<int, iia...>) { + return numeric_list<int, get<iia, iib>::value...>(); + } +}; + +template<typename A, typename B> +struct tensor_static_symgroup_multiply +{ + private: + typedef typename A::indices iia; + typedef typename B::indices iib; + constexpr static int ffa = A::flags; + constexpr static int ffb = B::flags; + + public: + static_assert(iia::count == iib::count, "Cannot multiply symmetry elements with different number of indices."); + + typedef tensor_static_symgroup_element< + decltype(tensor_static_symgroup_multiply_helper<iib>::helper(iia())), + ffa ^ ffb + > type; +}; + +template<typename A, typename B> +struct tensor_static_symgroup_equality +{ + typedef typename A::indices iia; + typedef typename B::indices iib; + constexpr static int ffa = A::flags; + constexpr static int ffb = B::flags; + static_assert(iia::count == iib::count, "Cannot compare symmetry elements with different number of indices."); + + constexpr static bool value = is_same<iia, iib>::value; + + private: + /* this should be zero if they are identical, or else the tensor + * will be forced to be pure real, pure imaginary or even pure zero + */ + constexpr static int flags_cmp_ = ffa ^ ffb; + + /* either they are not equal, then we don't care whether the flags + * match, or they are equal, and then we have to check + */ + constexpr static bool is_zero = value && flags_cmp_ == NegationFlag; + constexpr static bool is_real = value && flags_cmp_ == ConjugationFlag; + constexpr static bool is_imag = value && flags_cmp_ == (NegationFlag | ConjugationFlag); + + public: + constexpr static int global_flags = + (is_real ? GlobalRealFlag : 0) | + (is_imag ? GlobalImagFlag : 0) | + (is_zero ? GlobalZeroFlag : 0); +}; + +template<std::size_t NumIndices, typename... Gen> +struct tensor_static_symgroup +{ + typedef StaticSGroup<Gen...> type; + constexpr static std::size_t size = type::static_size; +}; + +template<typename Index, std::size_t N, int... ii, int... jj> +constexpr static inline std::array<Index, N> tensor_static_symgroup_index_permute(std::array<Index, N> idx, internal::numeric_list<int, ii...>, internal::numeric_list<int, jj...>) +{ + return {{ idx[ii]..., idx[jj]... }}; +} + +template<typename Index, int... ii> +static inline std::vector<Index> tensor_static_symgroup_index_permute(std::vector<Index> idx, internal::numeric_list<int, ii...>) +{ + std::vector<Index> result{{ idx[ii]... 
}}; + std::size_t target_size = idx.size(); + for (std::size_t i = result.size(); i < target_size; i++) + result.push_back(idx[i]); + return result; +} + +template<typename T> struct tensor_static_symgroup_do_apply; + +template<typename first, typename... next> +struct tensor_static_symgroup_do_apply<internal::type_list<first, next...>> +{ + template<typename Op, typename RV, std::size_t SGNumIndices, typename Index, std::size_t NumIndices, typename... Args> + static inline RV run(const std::array<Index, NumIndices>& idx, RV initial, Args&&... args) + { + static_assert(NumIndices >= SGNumIndices, "Can only apply symmetry group to objects that have at least the required amount of indices."); + typedef typename internal::gen_numeric_list<int, NumIndices - SGNumIndices, SGNumIndices>::type remaining_indices; + initial = Op::run(tensor_static_symgroup_index_permute(idx, typename first::indices(), remaining_indices()), first::flags, initial, std::forward<Args>(args)...); + return tensor_static_symgroup_do_apply<internal::type_list<next...>>::template run<Op, RV, SGNumIndices>(idx, initial, args...); + } + + template<typename Op, typename RV, std::size_t SGNumIndices, typename Index, typename... Args> + static inline RV run(const std::vector<Index>& idx, RV initial, Args&&... args) + { + eigen_assert(idx.size() >= SGNumIndices && "Can only apply symmetry group to objects that have at least the required amount of indices."); + initial = Op::run(tensor_static_symgroup_index_permute(idx, typename first::indices()), first::flags, initial, std::forward<Args>(args)...); + return tensor_static_symgroup_do_apply<internal::type_list<next...>>::template run<Op, RV, SGNumIndices>(idx, initial, args...); + } +}; + +template<EIGEN_TPL_PP_SPEC_HACK_DEF(typename, empty)> +struct tensor_static_symgroup_do_apply<internal::type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>> +{ + template<typename Op, typename RV, std::size_t SGNumIndices, typename Index, std::size_t NumIndices, typename... Args> + static inline RV run(const std::array<Index, NumIndices>&, RV initial, Args&&...) + { + // do nothing + return initial; + } + + template<typename Op, typename RV, std::size_t SGNumIndices, typename Index, typename... Args> + static inline RV run(const std::vector<Index>&, RV initial, Args&&...) + { + // do nothing + return initial; + } +}; + +} // end namespace internal + +template<typename... Gen> +class StaticSGroup +{ + constexpr static std::size_t NumIndices = internal::tensor_symmetry_num_indices<Gen...>::value; + typedef internal::group_theory::enumerate_group_elements< + internal::tensor_static_symgroup_multiply, + internal::tensor_static_symgroup_equality, + typename internal::tensor_static_symgroup_identity_ctor<NumIndices>::type, + internal::type_list<typename internal::tensor_static_symgroup_element_ctor<Gen, NumIndices>::type...> + > group_elements; + typedef typename group_elements::type ge; + public: + constexpr inline StaticSGroup() {} + constexpr inline StaticSGroup(const StaticSGroup<Gen...>&) {} + constexpr inline StaticSGroup(StaticSGroup<Gen...>&&) {} + + template<typename Op, typename RV, typename Index, std::size_t N, typename... Args> + static inline RV apply(const std::array<Index, N>& idx, RV initial, Args&&... args) + { + return internal::tensor_static_symgroup_do_apply<ge>::template run<Op, RV, NumIndices>(idx, initial, args...); + } + + template<typename Op, typename RV, typename Index, typename... Args> + static inline RV apply(const std::vector<Index>& idx, RV initial, Args&&... 
args) + { + eigen_assert(idx.size() == NumIndices); + return internal::tensor_static_symgroup_do_apply<ge>::template run<Op, RV, NumIndices>(idx, initial, args...); + } + + constexpr static std::size_t static_size = ge::count; + + constexpr static inline std::size_t size() { + return ge::count; + } + constexpr static inline int globalFlags() { return group_elements::global_flags; } + + template<typename Tensor_, typename... IndexTypes> + inline internal::tensor_symmetry_value_setter<Tensor_, StaticSGroup<Gen...>> operator()(Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... otherIndices) const + { + static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + return operator()(tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices>{{firstIndex, otherIndices...}}); + } + + template<typename Tensor_> + inline internal::tensor_symmetry_value_setter<Tensor_, StaticSGroup<Gen...>> operator()(Tensor_& tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices> const& indices) const + { + return internal::tensor_symmetry_value_setter<Tensor_, StaticSGroup<Gen...>>(tensor, *this, indices); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H + +/* + * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; + */ diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h new file mode 100644 index 0000000000..879d6cd77b --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h @@ -0,0 +1,338 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler <christian@iwakd.de> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H +#define EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H + +namespace Eigen { + +enum { + NegationFlag = 0x01, + ConjugationFlag = 0x02 +}; + +enum { + GlobalRealFlag = 0x01, + GlobalImagFlag = 0x02, + GlobalZeroFlag = 0x03 +}; + +namespace internal { + +template<std::size_t NumIndices, typename... Sym> struct tensor_symmetry_pre_analysis; +template<std::size_t NumIndices, typename... Sym> struct tensor_static_symgroup; +template<bool instantiate, std::size_t NumIndices, typename... Sym> struct tensor_static_symgroup_if; +template<typename Tensor_> struct tensor_symmetry_calculate_flags; +template<typename Tensor_> struct tensor_symmetry_assign_value; +template<typename... 
Sym> struct tensor_symmetry_num_indices; + +} // end namespace internal + +template<int One_, int Two_> +struct Symmetry +{ + static_assert(One_ != Two_, "Symmetries must cover distinct indices."); + constexpr static int One = One_; + constexpr static int Two = Two_; + constexpr static int Flags = 0; +}; + +template<int One_, int Two_> +struct AntiSymmetry +{ + static_assert(One_ != Two_, "Symmetries must cover distinct indices."); + constexpr static int One = One_; + constexpr static int Two = Two_; + constexpr static int Flags = NegationFlag; +}; + +template<int One_, int Two_> +struct Hermiticity +{ + static_assert(One_ != Two_, "Symmetries must cover distinct indices."); + constexpr static int One = One_; + constexpr static int Two = Two_; + constexpr static int Flags = ConjugationFlag; +}; + +template<int One_, int Two_> +struct AntiHermiticity +{ + static_assert(One_ != Two_, "Symmetries must cover distinct indices."); + constexpr static int One = One_; + constexpr static int Two = Two_; + constexpr static int Flags = ConjugationFlag | NegationFlag; +}; + +/** \class DynamicSGroup + * \ingroup TensorSymmetry_Module + * + * \brief Dynamic symmetry group + * + * The %DynamicSGroup class represents a symmetry group that need not be known at + * compile time. It is useful if one wants to support arbitrary run-time defineable + * symmetries for tensors, but it is also instantiated if a symmetry group is defined + * at compile time that would be either too large for the compiler to reasonably + * generate (using templates to calculate this at compile time is very inefficient) + * or that the compiler could generate the group but that it wouldn't make sense to + * unroll the loop for setting coefficients anymore. + */ +class DynamicSGroup; + +/** \internal + * + * \class DynamicSGroupFromTemplateArgs + * \ingroup TensorSymmetry_Module + * + * \brief Dynamic symmetry group, initialized from template arguments + * + * This class is a child class of DynamicSGroup. It uses the template arguments + * specified to initialize itself. + */ +template<typename... Gen> +class DynamicSGroupFromTemplateArgs; + +/** \class StaticSGroup + * \ingroup TensorSymmetry_Module + * + * \brief Static symmetry group + * + * This class represents a symmetry group that is known and resolved completely + * at compile time. Ideally, no run-time penalty is incurred compared to the + * manual unrolling of the symmetry. + * + * <b><i>CAUTION:</i></b> + * + * Do not use this class directly for large symmetry groups. The compiler + * may run into a limit, or segfault or in the very least will take a very, + * very, very long time to compile the code. Use the SGroup class instead + * if you want a static group. That class contains logic that will + * automatically select the DynamicSGroup class instead if the symmetry + * group becomes too large. (In that case, unrolling may not even be + * beneficial.) + */ +template<typename... Gen> +class StaticSGroup; + +/** \class SGroup + * \ingroup TensorSymmetry_Module + * + * \brief Symmetry group, initialized from template arguments + * + * This class represents a symmetry group whose generators are already + * known at compile time. It may or may not be resolved at compile time, + * depending on the estimated size of the group. + * + * \sa StaticSGroup + * \sa DynamicSGroup + */ +template<typename... 
Gen> +class SGroup : public internal::tensor_symmetry_pre_analysis<internal::tensor_symmetry_num_indices<Gen...>::value, Gen...>::root_type +{ + public: + constexpr static std::size_t NumIndices = internal::tensor_symmetry_num_indices<Gen...>::value; + typedef typename internal::tensor_symmetry_pre_analysis<NumIndices, Gen...>::root_type Base; + + // make standard constructors + assignment operators public + inline SGroup() : Base() { } + inline SGroup(const SGroup<Gen...>& other) : Base(other) { } + inline SGroup(SGroup<Gen...>&& other) : Base(other) { } + inline SGroup<Gen...>& operator=(const SGroup<Gen...>& other) { Base::operator=(other); return *this; } + inline SGroup<Gen...>& operator=(SGroup<Gen...>&& other) { Base::operator=(other); return *this; } + + // all else is defined in the base class +}; + +namespace internal { + +template<typename... Sym> struct tensor_symmetry_num_indices +{ + constexpr static std::size_t value = 1; +}; + +template<int One_, int Two_, typename... Sym> struct tensor_symmetry_num_indices<Symmetry<One_, Two_>, Sym...> +{ +private: + constexpr static std::size_t One = static_cast<std::size_t>(One_); + constexpr static std::size_t Two = static_cast<std::size_t>(Two_); + constexpr static std::size_t Three = tensor_symmetry_num_indices<Sym...>::value; + + // don't use std::max, since it's not constexpr until C++14... + constexpr static std::size_t maxOneTwoPlusOne = ((One > Two) ? One : Two) + 1; +public: + constexpr static std::size_t value = (maxOneTwoPlusOne > Three) ? maxOneTwoPlusOne : Three; +}; + +template<int One_, int Two_, typename... Sym> struct tensor_symmetry_num_indices<AntiSymmetry<One_, Two_>, Sym...> + : public tensor_symmetry_num_indices<Symmetry<One_, Two_>, Sym...> {}; +template<int One_, int Two_, typename... Sym> struct tensor_symmetry_num_indices<Hermiticity<One_, Two_>, Sym...> + : public tensor_symmetry_num_indices<Symmetry<One_, Two_>, Sym...> {}; +template<int One_, int Two_, typename... Sym> struct tensor_symmetry_num_indices<AntiHermiticity<One_, Two_>, Sym...> + : public tensor_symmetry_num_indices<Symmetry<One_, Two_>, Sym...> {}; + +/** \internal + * + * \class tensor_symmetry_pre_analysis + * \ingroup TensorSymmetry_Module + * + * \brief Pre-select whether to use a static or dynamic symmetry group + * + * When a symmetry group could in principle be determined at compile time, + * this template implements the logic whether to actually do that or whether + * to rather defer that to runtime. + * + * The logic is as follows: + * <dl> + * <dt><b>No generators (trivial symmetry):</b></dt> + * <dd>Use a trivial static group. Ideally, this has no performance impact + * compared to not using symmetry at all. In practice, this might not + * be the case.</dd> + * <dt><b>More than 4 generators:</b></dt> + * <dd>Calculate the group at run time, it is likely far too large for the + * compiler to be able to properly generate it in a realistic time.</dd> + * <dt><b>Up to and including 4 generators:</b></dt> + * <dd>Actually enumerate all group elements, but then check how many there + * are. If there are more than 16, it is unlikely that unrolling the + * loop (as is done in the static compile-time case) is sensible, so + * use a dynamic group instead. If there are at most 16 elements, actually + * use that static group. 
Note that the largest group with 4 generators + * still compiles with reasonable resources.</dd> + * </dl> + * + * Note: Example compile time performance with g++-4.6 on an Intenl Core i5-3470 + * with 16 GiB RAM (all generators non-redundant and the subgroups don't + * factorize): + * + * # Generators -O0 -ggdb -O2 + * ------------------------------------------------------------------- + * 1 0.5 s / 250 MiB 0.45s / 230 MiB + * 2 0.5 s / 260 MiB 0.5 s / 250 MiB + * 3 0.65s / 310 MiB 0.62s / 310 MiB + * 4 2.2 s / 860 MiB 1.7 s / 770 MiB + * 5 130 s / 13000 MiB 120 s / 11000 MiB + * + * It is clear that everything is still very efficient up to 4 generators, then + * the memory and CPU requirements become unreasonable. Thus we only instantiate + * the template group theory logic if the number of generators supplied is 4 or + * lower, otherwise this will be forced to be done during runtime, where the + * algorithm is reasonably fast. + */ +template<std::size_t NumIndices> +struct tensor_symmetry_pre_analysis<NumIndices> +{ + typedef StaticSGroup<> root_type; +}; + +template<std::size_t NumIndices, typename Gen_, typename... Gens_> +struct tensor_symmetry_pre_analysis<NumIndices, Gen_, Gens_...> +{ + constexpr static std::size_t max_static_generators = 4; + constexpr static std::size_t max_static_elements = 16; + typedef tensor_static_symgroup_if<(sizeof...(Gens_) + 1 <= max_static_generators), NumIndices, Gen_, Gens_...> helper; + constexpr static std::size_t possible_size = helper::size; + + typedef typename conditional< + possible_size == 0 || possible_size >= max_static_elements, + DynamicSGroupFromTemplateArgs<Gen_, Gens_...>, + typename helper::type + >::type root_type; +}; + +template<bool instantiate, std::size_t NumIndices, typename... Gens> +struct tensor_static_symgroup_if +{ + constexpr static std::size_t size = 0; + typedef void type; +}; + +template<std::size_t NumIndices, typename... 
Gens> +struct tensor_static_symgroup_if<true, NumIndices, Gens...> : tensor_static_symgroup<NumIndices, Gens...> {}; + +template<typename Tensor_> +struct tensor_symmetry_assign_value +{ + typedef typename Tensor_::Index Index; + typedef typename Tensor_::Scalar Scalar; + constexpr static std::size_t NumIndices = Tensor_::NumIndices; + + static inline int run(const std::array<Index, NumIndices>& transformed_indices, int transformation_flags, int dummy, Tensor_& tensor, const Scalar& value_) + { + Scalar value(value_); + if (transformation_flags & ConjugationFlag) + value = numext::conj(value); + if (transformation_flags & NegationFlag) + value = -value; + tensor.coeffRef(transformed_indices) = value; + return dummy; + } +}; + +template<typename Tensor_> +struct tensor_symmetry_calculate_flags +{ + typedef typename Tensor_::Index Index; + constexpr static std::size_t NumIndices = Tensor_::NumIndices; + + static inline int run(const std::array<Index, NumIndices>& transformed_indices, int transform_flags, int current_flags, const std::array<Index, NumIndices>& orig_indices) + { + if (transformed_indices == orig_indices) { + if (transform_flags & (ConjugationFlag | NegationFlag)) + return current_flags | GlobalImagFlag; // anti-hermitian diagonal + else if (transform_flags & ConjugationFlag) + return current_flags | GlobalRealFlag; // hermitian diagonal + else if (transform_flags & NegationFlag) + return current_flags | GlobalZeroFlag; // anti-symmetric diagonal + } + return current_flags; + } +}; + +template<typename Tensor_, typename Symmetry_, int Flags = 0> +class tensor_symmetry_value_setter +{ + public: + typedef typename Tensor_::Index Index; + typedef typename Tensor_::Scalar Scalar; + constexpr static std::size_t NumIndices = Tensor_::NumIndices; + + inline tensor_symmetry_value_setter(Tensor_& tensor, Symmetry_ const& symmetry, std::array<Index, NumIndices> const& indices) + : m_tensor(tensor), m_symmetry(symmetry), m_indices(indices) { } + + inline tensor_symmetry_value_setter<Tensor_, Symmetry_, Flags>& operator=(Scalar const& value) + { + doAssign(value); + return *this; + } + private: + Tensor_& m_tensor; + Symmetry_ m_symmetry; + std::array<Index, NumIndices> m_indices; + + inline void doAssign(Scalar const& value) + { + #ifdef EIGEN_TENSOR_SYMMETRY_CHECK_VALUES + int value_flags = m_symmetry.template apply<internal::tensor_symmetry_calculate_flags<Tensor_>, int>(m_indices, m_symmetry.globalFlags(), m_indices); + if (value_flags & GlobalRealFlag) + eigen_assert(numext::imag(value) == 0); + if (value_flags & GlobalImagFlag) + eigen_assert(numext::real(value) == 0); + #endif + m_symmetry.template apply<internal::tensor_symmetry_assign_value<Tensor_>, int>(m_indices, 0, m_tensor, value); + } +}; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H + +/* + * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; + */ diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h new file mode 100644 index 0000000000..0fe0b7c46d --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h @@ -0,0 +1,666 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2013 Christian Seiler <christian@iwakd.de> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSORSYMMETRY_TEMPLATEGROUPTHEORY_H +#define EIGEN_CXX11_TENSORSYMMETRY_TEMPLATEGROUPTHEORY_H + +namespace Eigen { + +namespace internal { + +namespace group_theory { + +/** \internal + * \file CXX11/Tensor/util/TemplateGroupTheory.h + * This file contains C++ templates that implement group theory algorithms. + * + * The algorithms allow for a compile-time analysis of finite groups. + * + * Currently only Dimino's algorithm is implemented, which returns a list + * of all elements in a group given a set of (possibly redundant) generators. + * (One could also do that with the so-called orbital algorithm, but that + * is much more expensive and usually has no advantages.) + */ + +/********************************************************************** + * "Ok kid, here is where it gets complicated." + * - Amelia Pond in the "Doctor Who" episode + * "The Big Bang" + * + * Dimino's algorithm + * ================== + * + * The following is Dimino's algorithm in sequential form: + * + * Input: identity element, list of generators, equality check, + * multiplication operation + * Output: list of group elements + * + * 1. add identity element + * 2. remove identities from list of generators + * 3. add all powers of first generator that aren't the + * identity element + * 4. go through all remaining generators: + * a. if generator is already in the list of elements + * -> do nothing + * b. otherwise + * i. remember current # of elements + * (i.e. the size of the current subgroup) + * ii. add all current elements (which includes + * the identity) each multiplied from right + * with the current generator to the group + * iii. add all remaining cosets that are generated + * by products of the new generator with itself + * and all other generators seen so far + * + * In functional form, this is implemented as a long set of recursive + * templates that have a complicated relationship. + * + * The main interface for Dimino's algorithm is the template + * enumerate_group_elements. All lists are implemented as variadic + * type_list<typename...> and numeric_list<typename = int, int...> + * templates. + * + * 'Calling' templates is usually done via typedefs. + * + * This algorithm is an extended version of the basic version. The + * extension consists in the fact that each group element has a set + * of flags associated with it. Multiplication of two group elements + * with each other results in a group element whose flags are the + * XOR of the flags of the previous elements. Each time the algorithm + * notices that a group element it just calculated is already in the + * list of current elements, the flags of both will be compared and + * added to the so-called 'global flags' of the group. + * + * The rationale behind this extension is that this allows not only + * for the description of symmetries between tensor indices, but + * also allows for the description of hermiticity, antisymmetry and + * antihermiticity. Negation and conjugation each are specific bit + * in the flags value and if two different ways to reach a group + * element lead to two different flags, this poses a constraint on + * the allowed values of the resulting tensor. 
For example, if a + * group element is reach both with and without the conjugation + * flags, it is clear that the resulting tensor has to be real. + * + * Note that this flag mechanism is quite generic and may have other + * uses beyond tensor properties. + * + * IMPORTANT: + * This algorithm assumes the group to be finite. If you try to + * run it with a group that's infinite, the algorithm will only + * terminate once you hit a compiler limit (max template depth). + * Also note that trying to use this implementation to create a + * very large group will probably either make you hit the same + * limit, cause the compiler to segfault or at the very least + * take a *really* long time (hours, days, weeks - sic!) to + * compile. It is not recommended to plug in more than 4 + * generators, unless they are independent of each other. + */ + +/** \internal + * + * \class strip_identities + * \ingroup CXX11_TensorSymmetry_Module + * + * \brief Cleanse a list of group elements of the identity element + * + * This template is used to make a first pass through all initial + * generators of Dimino's algorithm and remove the identity + * elements. + * + * \sa enumerate_group_elements + */ +template<template<typename, typename> class Equality, typename id, typename L> struct strip_identities; + +template< + template<typename, typename> class Equality, + typename id, + typename t, + typename... ts +> +struct strip_identities<Equality, id, type_list<t, ts...>> +{ + typedef typename conditional< + Equality<id, t>::value, + typename strip_identities<Equality, id, type_list<ts...>>::type, + typename concat<type_list<t>, typename strip_identities<Equality, id, type_list<ts...>>::type>::type + >::type type; + constexpr static int global_flags = Equality<id, t>::global_flags | strip_identities<Equality, id, type_list<ts...>>::global_flags; +}; + +template< + template<typename, typename> class Equality, + typename id + EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, ts) +> +struct strip_identities<Equality, id, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(ts)>> +{ + typedef type_list<> type; + constexpr static int global_flags = 0; +}; + +/** \internal + * + * \class dimino_first_step_elements_helper + * \ingroup CXX11_TensorSymmetry_Module + * + * \brief Recursive template that adds powers of the first generator to the list of group elements + * + * This template calls itself recursively to add powers of the first + * generator to the list of group elements. It stops if it reaches + * the identity element again. 
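A sketch of the recursion for a hypothetical group built from a single pair-swap generator g (so g*g == id); dimino_first_step_elements seeds it with elements = type_list<id> and current_element = g:

    elements = {id}        current = g      (g is not the identity, so it is added)
    elements = {id, g}     current = g*g    (g*g == id, so the recursion terminates)
    resulting type:  type_list<id, g>

For a generator of order n the same recursion yields {id, g, g^2, ..., g^(n-1)}.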
+ * + * \sa enumerate_group_elements, dimino_first_step_elements + */ +template< + template<typename, typename> class Multiply, + template<typename, typename> class Equality, + typename id, + typename g, + typename current_element, + typename elements, + bool dont_add_current_element // = false +> +struct dimino_first_step_elements_helper : + public dimino_first_step_elements_helper< + Multiply, + Equality, + id, + g, + typename Multiply<current_element, g>::type, + typename concat<elements, type_list<current_element>>::type, + Equality<typename Multiply<current_element, g>::type, id>::value + > {}; + +template< + template<typename, typename> class Multiply, + template<typename, typename> class Equality, + typename id, + typename g, + typename current_element, + typename elements +> +struct dimino_first_step_elements_helper<Multiply, Equality, id, g, current_element, elements, true> +{ + typedef elements type; + constexpr static int global_flags = Equality<current_element, id>::global_flags; +}; + +/** \internal + * + * \class dimino_first_step_elements + * \ingroup CXX11_TensorSymmetry_Module + * + * \brief Add all powers of the first generator to the list of group elements + * + * This template takes the first non-identity generator and generates the initial + * list of elements which consists of all powers of that generator. For a group + * with just one generated, it would be enumerated after this. + * + * \sa enumerate_group_elements + */ +template< + template<typename, typename> class Multiply, + template<typename, typename> class Equality, + typename id, + typename generators +> +struct dimino_first_step_elements +{ + typedef typename get<0, generators>::type first_generator; + typedef typename skip<1, generators>::type next_generators; + typedef type_list<first_generator> generators_done; + + typedef dimino_first_step_elements_helper< + Multiply, + Equality, + id, + first_generator, + first_generator, + type_list<id>, + false + > helper; + typedef typename helper::type type; + constexpr static int global_flags = helper::global_flags; +}; + +/** \internal + * + * \class dimino_get_coset_elements + * \ingroup CXX11_TensorSymmetry_Module + * + * \brief Generate all elements of a specific coset + * + * This template generates all the elements of a specific coset by + * multiplying all elements in the given subgroup with the new + * coset representative. Note that the first element of the + * subgroup is always the identity element, so the first element of + * ther result of this template is going to be the coset + * representative itself. + * + * Note that this template accepts an additional boolean parameter + * that specifies whether to actually generate the coset (true) or + * just return an empty list (false). 
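As a small worked case, for a hypothetical subgroup {id, s} and new coset representative r: with generate_coset == true the result is {id*r, s*r} = {r, s*r}, i.e. every subgroup element multiplied by r from the right, which is why the representative itself comes first; with generate_coset == false the result is simply type_list<>.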
+ * + * \sa enumerate_group_elements, dimino_add_cosets_for_rep + */ +template< + template<typename, typename> class Multiply, + typename sub_group_elements, + typename new_coset_rep, + bool generate_coset // = true +> +struct dimino_get_coset_elements +{ + typedef typename apply_op_from_right<Multiply, new_coset_rep, sub_group_elements>::type type; +}; + +template< + template<typename, typename> class Multiply, + typename sub_group_elements, + typename new_coset_rep +> +struct dimino_get_coset_elements<Multiply, sub_group_elements, new_coset_rep, false> +{ + typedef type_list<> type; +}; + +/** \internal + * + * \class dimino_add_cosets_for_rep + * \ingroup CXX11_TensorSymmetry_Module + * + * \brief Recursive template for adding coset spaces + * + * This template multiplies the coset representative with a generator + * from the list of previous generators. If the new element is not in + * the group already, it adds the corresponding coset. Finally it + * proceeds to call itself with the next generator from the list. + * + * \sa enumerate_group_elements, dimino_add_all_coset_spaces + */ +template< + template<typename, typename> class Multiply, + template<typename, typename> class Equality, + typename id, + typename sub_group_elements, + typename elements, + typename generators, + typename rep_element, + int sub_group_size +> +struct dimino_add_cosets_for_rep; + +template< + template<typename, typename> class Multiply, + template<typename, typename> class Equality, + typename id, + typename sub_group_elements, + typename elements, + typename g, + typename... gs, + typename rep_element, + int sub_group_size +> +struct dimino_add_cosets_for_rep<Multiply, Equality, id, sub_group_elements, elements, type_list<g, gs...>, rep_element, sub_group_size> +{ + typedef typename Multiply<rep_element, g>::type new_coset_rep; + typedef contained_in_list_gf<Equality, new_coset_rep, elements> _cil; + constexpr static bool add_coset = !_cil::value; + + typedef typename dimino_get_coset_elements< + Multiply, + sub_group_elements, + new_coset_rep, + add_coset + >::type coset_elements; + + typedef dimino_add_cosets_for_rep< + Multiply, + Equality, + id, + sub_group_elements, + typename concat<elements, coset_elements>::type, + type_list<gs...>, + rep_element, + sub_group_size + > _helper; + + typedef typename _helper::type type; + constexpr static int global_flags = _cil::global_flags | _helper::global_flags; + + /* Note that we don't have to update global flags here, since + * we will only add these elements if they are not part of + * the group already. But that only happens if the coset rep + * is not already in the group, so the check for the coset rep + * will catch this. 
+ */ +}; + +template< + template<typename, typename> class Multiply, + template<typename, typename> class Equality, + typename id, + typename sub_group_elements, + typename elements + EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, empty), + typename rep_element, + int sub_group_size +> +struct dimino_add_cosets_for_rep<Multiply, Equality, id, sub_group_elements, elements, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>, rep_element, sub_group_size> +{ + typedef elements type; + constexpr static int global_flags = 0; +}; + +/** \internal + * + * \class dimino_add_all_coset_spaces + * \ingroup CXX11_TensorSymmetry_Module + * + * \brief Recursive template for adding all coset spaces for a new generator + * + * This template tries to go through the list of generators (with + * the help of the dimino_add_cosets_for_rep template) as long as + * it still finds elements that are not part of the group and add + * the corresponding cosets. + * + * \sa enumerate_group_elements, dimino_add_cosets_for_rep + */ +template< + template<typename, typename> class Multiply, + template<typename, typename> class Equality, + typename id, + typename sub_group_elements, + typename elements, + typename generators, + int sub_group_size, + int rep_pos, + bool stop_condition // = false +> +struct dimino_add_all_coset_spaces +{ + typedef typename get<rep_pos, elements>::type rep_element; + typedef dimino_add_cosets_for_rep< + Multiply, + Equality, + id, + sub_group_elements, + elements, + generators, + rep_element, + sub_group_elements::count + > _ac4r; + typedef typename _ac4r::type new_elements; + + constexpr static int new_rep_pos = rep_pos + sub_group_elements::count; + constexpr static bool new_stop_condition = new_rep_pos >= new_elements::count; + + typedef dimino_add_all_coset_spaces< + Multiply, + Equality, + id, + sub_group_elements, + new_elements, + generators, + sub_group_size, + new_rep_pos, + new_stop_condition + > _helper; + + typedef typename _helper::type type; + constexpr static int global_flags = _helper::global_flags | _ac4r::global_flags; +}; + +template< + template<typename, typename> class Multiply, + template<typename, typename> class Equality, + typename id, + typename sub_group_elements, + typename elements, + typename generators, + int sub_group_size, + int rep_pos +> +struct dimino_add_all_coset_spaces<Multiply, Equality, id, sub_group_elements, elements, generators, sub_group_size, rep_pos, true> +{ + typedef elements type; + constexpr static int global_flags = 0; +}; + +/** \internal + * + * \class dimino_add_generator + * \ingroup CXX11_TensorSymmetry_Module + * + * \brief Enlarge the group by adding a new generator. + * + * It accepts a boolean parameter that determines if the generator is redundant, + * i.e. was already seen in the group. In that case, it reduces to a no-op. 
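A sketch with hypothetical 3-index generators: if the elements found so far are {id, (01)} and the new, non-redundant generator is (12), this template first appends the right-multiplied copies {(12), (01)(12)}, giving four elements, and then hands off to dimino_add_all_coset_spaces at rep_pos == 2 to pick up the cosets that are still missing, so the final list holds all six permutations of S3.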
+ * + * \sa enumerate_group_elements, dimino_add_all_coset_spaces + */ +template< + template<typename, typename> class Multiply, + template<typename, typename> class Equality, + typename id, + typename elements, + typename generators_done, + typename current_generator, + bool redundant // = false +> +struct dimino_add_generator +{ + /* this template is only called if the generator is not redundant + * => all elements of the group multiplied with the new generator + * are going to be new elements of the most trivial coset space + */ + typedef typename apply_op_from_right<Multiply, current_generator, elements>::type multiplied_elements; + typedef typename concat<elements, multiplied_elements>::type new_elements; + + constexpr static int rep_pos = elements::count; + + typedef dimino_add_all_coset_spaces< + Multiply, + Equality, + id, + elements, // elements of previous subgroup + new_elements, + typename concat<generators_done, type_list<current_generator>>::type, + elements::count, // size of previous subgroup + rep_pos, + false // don't stop (because rep_pos >= new_elements::count is always false at this point) + > _helper; + typedef typename _helper::type type; + constexpr static int global_flags = _helper::global_flags; +}; + +template< + template<typename, typename> class Multiply, + template<typename, typename> class Equality, + typename id, + typename elements, + typename generators_done, + typename current_generator +> +struct dimino_add_generator<Multiply, Equality, id, elements, generators_done, current_generator, true> +{ + // redundant case + typedef elements type; + constexpr static int global_flags = 0; +}; + +/** \internal + * + * \class dimino_add_remaining_generators + * \ingroup CXX11_TensorSymmetry_Module + * + * \brief Recursive template that adds all remaining generators to a group + * + * Loop through the list of generators that remain and successively + * add them to the group. 
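For a hypothetical generator list {g1, g2, g3}: dimino_first_step_elements has already consumed g1, so this template runs with remaining_generators = {g2, g3}; g2 is passed to dimino_add_generator (a no-op when the contained_in_list_gf check marks it as redundant), and the recursion then repeats for g3 until the type_list<> specialization below ends it.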
+ * + * \sa enumerate_group_elements, dimino_add_generator + */ +template< + template<typename, typename> class Multiply, + template<typename, typename> class Equality, + typename id, + typename generators_done, + typename remaining_generators, + typename elements +> +struct dimino_add_remaining_generators +{ + typedef typename get<0, remaining_generators>::type first_generator; + typedef typename skip<1, remaining_generators>::type next_generators; + + typedef contained_in_list_gf<Equality, first_generator, elements> _cil; + + typedef dimino_add_generator< + Multiply, + Equality, + id, + elements, + generators_done, + first_generator, + _cil::value + > _helper; + + typedef typename _helper::type new_elements; + + typedef dimino_add_remaining_generators< + Multiply, + Equality, + id, + typename concat<generators_done, type_list<first_generator>>::type, + next_generators, + new_elements + > _next_iter; + + typedef typename _next_iter::type type; + constexpr static int global_flags = + _cil::global_flags | + _helper::global_flags | + _next_iter::global_flags; +}; + +template< + template<typename, typename> class Multiply, + template<typename, typename> class Equality, + typename id, + typename generators_done, + typename elements +> +struct dimino_add_remaining_generators<Multiply, Equality, id, generators_done, type_list<>, elements> +{ + typedef elements type; + constexpr static int global_flags = 0; +}; + +/** \internal + * + * \class enumerate_group_elements_noid + * \ingroup CXX11_TensorSymmetry_Module + * + * \brief Helper template that implements group element enumeration + * + * This is a helper template that implements the actual enumeration + * of group elements. This has been split so that the list of + * generators can be cleansed of the identity element before + * performing the actual operation. + * + * \sa enumerate_group_elements + */ +template< + template<typename, typename> class Multiply, + template<typename, typename> class Equality, + typename id, + typename generators, + int initial_global_flags = 0 +> +struct enumerate_group_elements_noid +{ + typedef dimino_first_step_elements<Multiply, Equality, id, generators> first_step; + typedef typename first_step::type first_step_elements; + + typedef dimino_add_remaining_generators< + Multiply, + Equality, + id, + typename first_step::generators_done, + typename first_step::next_generators, // remaining_generators + typename first_step::type // first_step elements + > _helper; + + typedef typename _helper::type type; + constexpr static int global_flags = + initial_global_flags | + first_step::global_flags | + _helper::global_flags; +}; + +// in case when no generators are specified +template< + template<typename, typename> class Multiply, + template<typename, typename> class Equality, + typename id, + int initial_global_flags +> +struct enumerate_group_elements_noid<Multiply, Equality, id, type_list<>, initial_global_flags> +{ + typedef type_list<id> type; + constexpr static int global_flags = initial_global_flags; +}; + +/** \internal + * + * \class enumerate_group_elements + * \ingroup CXX11_TensorSymmetry_Module + * + * \brief Enumerate all elements in a finite group + * + * This template enumerates all elements in a finite group. It accepts + * the following template parameters: + * + * \tparam Multiply The multiplication operation that multiplies two group elements + * with each other. + * \tparam Equality The equality check operation that checks if two group elements + * are equal to another. 
+ * \tparam id The identity element + * \tparam _generators A list of (possibly redundant) generators of the group + */ +template< + template<typename, typename> class Multiply, + template<typename, typename> class Equality, + typename id, + typename _generators +> +struct enumerate_group_elements + : public enumerate_group_elements_noid< + Multiply, + Equality, + id, + typename strip_identities<Equality, id, _generators>::type, + strip_identities<Equality, id, _generators>::global_flags + > +{ +}; + +} // end namespace group_theory + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSORSYMMETRY_TEMPLATEGROUPTHEORY_H + +/* + * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; + */ |
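The classes defined in these headers are normally driven through SGroup's operator() on a Tensor. The following is a minimal usage sketch, assuming the TensorSymmetry umbrella header that usually accompanies these files and the upstream std::array-based Tensor interface; include paths and the array emulation in this vendored tree may differ.

#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>
#include <unsupported/Eigen/CXX11/TensorSymmetry>  // assumed umbrella header

int main() {
  // Rank-2 antisymmetric tensor: t(i,j) == -t(j,i).
  Eigen::Tensor<double, 2> t(3, 3);
  t.setZero();

  // A single generator, so the group is {id, swap(0,1) carrying NegationFlag};
  // every assignment through the group also writes the negated transposed entry.
  Eigen::SGroup<Eigen::AntiSymmetry<0, 1>> sym;
  sym(t, 0, 1) = 4.0;  // sets t(0,1) =  4 and t(1,0) = -4
  sym(t, 1, 2) = 7.0;  // sets t(1,2) =  7 and t(2,1) = -7

  std::cout << t(1, 0) << " " << t(2, 1) << std::endl;  // prints: -4 -7
  return 0;
}

The number of index arguments passed to sym(...) must equal the tensor's rank (the static_assert in operator() enforces this). With more than four generators, or when the enumerated group would be too large to be worth unrolling (the max_static_elements limit), the pre-analysis above silently swaps SGroup's base class to DynamicSGroup, which enumerates the same group at run time using the add() logic shown earlier.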