aboutsummaryrefslogtreecommitdiffhomepage
path: root/unsupported/Eigen
diff options
context:
space:
mode:
Diffstat (limited to 'unsupported/Eigen')
-rw-r--r--unsupported/Eigen/CXX11/CMakeLists.txt2
-rw-r--r--unsupported/Eigen/CXX11/Core51
-rw-r--r--unsupported/Eigen/CXX11/Tensor12
-rw-r--r--unsupported/Eigen/CXX11/TensorSymmetry2
-rw-r--r--unsupported/Eigen/CXX11/ThreadPool65
-rw-r--r--unsupported/Eigen/CXX11/src/CMakeLists.txt3
-rw-r--r--unsupported/Eigen/CXX11/src/Core/CMakeLists.txt1
-rw-r--r--unsupported/Eigen/CXX11/src/Core/util/CMakeLists.txt6
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h5
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h25
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorBase.h6
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h51
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h63
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h15
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h192
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h4
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h17
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h47
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h214
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h25
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h6
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h20
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h157
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h17
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h42
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h192
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h12
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h14
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h21
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h8
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h30
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h31
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h4
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h12
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h9
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h62
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h30
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h135
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h13
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h40
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h36
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h53
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h4
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h26
-rw-r--r--unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt6
-rw-r--r--unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h234
-rw-r--r--unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h232
-rw-r--r--unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h210
-rw-r--r--unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h127
-rw-r--r--unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h38
-rw-r--r--unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h22
-rw-r--r--unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h26
-rw-r--r--unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h20
-rw-r--r--unsupported/Eigen/CXX11/src/util/CMakeLists.txt6
-rw-r--r--unsupported/Eigen/CXX11/src/util/CXX11Meta.h (renamed from unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h)18
-rw-r--r--unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h (renamed from unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h)0
-rw-r--r--unsupported/Eigen/CXX11/src/util/EmulateArray.h (renamed from unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h)6
-rw-r--r--unsupported/Eigen/CXX11/src/util/EmulateCXX11Meta.h (renamed from unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h)2
-rw-r--r--unsupported/Eigen/CXX11/src/util/MaxSizeVector.h (renamed from unsupported/Eigen/CXX11/src/Core/util/MaxSizeVector.h)0
59 files changed, 2053 insertions, 674 deletions
diff --git a/unsupported/Eigen/CXX11/CMakeLists.txt b/unsupported/Eigen/CXX11/CMakeLists.txt
index f1d9f0482..a40bc4715 100644
--- a/unsupported/Eigen/CXX11/CMakeLists.txt
+++ b/unsupported/Eigen/CXX11/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(Eigen_CXX11_HEADERS Core Tensor TensorSymmetry)
+set(Eigen_CXX11_HEADERS Tensor TensorSymmetry ThreadPool)
install(FILES
${Eigen_CXX11_HEADERS}
diff --git a/unsupported/Eigen/CXX11/Core b/unsupported/Eigen/CXX11/Core
deleted file mode 100644
index 946145f5a..000000000
--- a/unsupported/Eigen/CXX11/Core
+++ /dev/null
@@ -1,51 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_CORE_MODULE
-#define EIGEN_CXX11_CORE_MODULE
-
-#include <Eigen/Core>
-
-#include <Eigen/src/Core/util/DisableStupidWarnings.h>
-
-/** \defgroup CXX11_Core_Module C++11 Core Module
- *
- * This module provides common core features for all modules that
- * explicitly depend on C++11. Currently, this is only the Tensor
- * module. Note that at this stage, you should not need to include
- * this module directly.
- *
- * It also provides a limited fallback for compilers that don't support
- * CXX11 yet, such as nvcc.
- *
- * \code
- * #include <Eigen/CXX11/Core>
- * \endcode
- */
-
-#include <vector>
-
-#include "src/Core/util/EmulateArray.h"
-#include "src/Core/util/MaxSizeVector.h"
-
-// Emulate the cxx11 functionality that we need if the compiler doesn't support it.
-// Visual studio 2015 doesn't advertise itself as cxx11 compliant, although it
-// supports enough of the standard for our needs
-#if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900
-#include "src/Core/util/CXX11Workarounds.h"
-#include "src/Core/util/CXX11Meta.h"
-#else
-#include "src/Core/util/EmulateCXX11Meta.h"
-#endif
-
-#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
-
-#endif // EIGEN_CXX11_CORE_MODULE
-
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 16132398d..1e97ad3c0 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -11,10 +11,12 @@
//#ifndef EIGEN_CXX11_TENSOR_MODULE
//#define EIGEN_CXX11_TENSOR_MODULE
-#include "Core"
+#include "../../../Eigen/Core"
#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+#include "src/util/CXX11Meta.h"
+#include "src/util/MaxSizeVector.h"
/** \defgroup CXX11_Tensor_Module Tensor Module
*
@@ -26,6 +28,7 @@
* \endcode
*/
+#include <cmath>
#include <cstddef>
#include <cstring>
@@ -51,11 +54,7 @@ typedef unsigned __int64 uint64_t;
#endif
#ifdef EIGEN_USE_THREADS
-#include <atomic>
-#include <condition_variable>
-#include <deque>
-#include <mutex>
-#include <thread>
+#include "ThreadPool"
#endif
#ifdef EIGEN_USE_GPU
@@ -84,6 +83,7 @@ typedef unsigned __int64 uint64_t;
#include "src/Tensor/TensorBase.h"
+#include "src/Tensor/TensorCostModel.h"
#include "src/Tensor/TensorEvaluator.h"
#include "src/Tensor/TensorExpr.h"
#include "src/Tensor/TensorReduction.h"
diff --git a/unsupported/Eigen/CXX11/TensorSymmetry b/unsupported/Eigen/CXX11/TensorSymmetry
index f1dc25fea..fb1b0c0fb 100644
--- a/unsupported/Eigen/CXX11/TensorSymmetry
+++ b/unsupported/Eigen/CXX11/TensorSymmetry
@@ -14,6 +14,8 @@
#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+#include "src/util/CXX11Meta.h"
+
/** \defgroup CXX11_TensorSymmetry_Module Tensor Symmetry Module
*
* This module provides a classes that allow for the definition of
diff --git a/unsupported/Eigen/CXX11/ThreadPool b/unsupported/Eigen/CXX11/ThreadPool
new file mode 100644
index 000000000..09d637e9a
--- /dev/null
+++ b/unsupported/Eigen/CXX11/ThreadPool
@@ -0,0 +1,65 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_MODULE
+#define EIGEN_CXX11_THREADPOOL_MODULE
+
+#include "../../../Eigen/Core"
+
+#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+
+/** \defgroup CXX11_ThreadPool_Module C++11 ThreadPool Module
+ *
+ * This module provides 2 threadpool implementations
+ * - a simple reference implementation
+ * - a faster non blocking implementation
+ *
+ * This module requires C++11.
+ *
+ * \code
+ * #include <Eigen/CXX11/ThreadPool>
+ * \endcode
+ */
+
+
+// The code depends on CXX11, so only include the module if the
+// compiler supports it.
+#if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900
+#include <cstddef>
+#include <cstring>
+#include <stdint.h>
+#include <time.h>
+
+#include <vector>
+#include <atomic>
+#include <condition_variable>
+#include <deque>
+#include <mutex>
+#include <thread>
+#include <functional>
+#include <memory>
+
+#include "src/util/CXX11Meta.h"
+#include "src/util/MaxSizeVector.h"
+
+#include "src/ThreadPool/ThreadLocal.h"
+#include "src/ThreadPool/ThreadYield.h"
+#include "src/ThreadPool/EventCount.h"
+#include "src/ThreadPool/RunQueue.h"
+#include "src/ThreadPool/ThreadPoolInterface.h"
+#include "src/ThreadPool/ThreadEnvironment.h"
+#include "src/ThreadPool/SimpleThreadPool.h"
+#include "src/ThreadPool/NonBlockingThreadPool.h"
+
+#endif
+
+#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
+
+#endif // EIGEN_CXX11_THREADPOOL_MODULE
+
diff --git a/unsupported/Eigen/CXX11/src/CMakeLists.txt b/unsupported/Eigen/CXX11/src/CMakeLists.txt
index d90ee1b0f..1734262bb 100644
--- a/unsupported/Eigen/CXX11/src/CMakeLists.txt
+++ b/unsupported/Eigen/CXX11/src/CMakeLists.txt
@@ -1,3 +1,4 @@
-add_subdirectory(Core)
+add_subdirectory(util)
+add_subdirectory(ThreadPool)
add_subdirectory(Tensor)
add_subdirectory(TensorSymmetry)
diff --git a/unsupported/Eigen/CXX11/src/Core/CMakeLists.txt b/unsupported/Eigen/CXX11/src/Core/CMakeLists.txt
deleted file mode 100644
index 28571dcb9..000000000
--- a/unsupported/Eigen/CXX11/src/Core/CMakeLists.txt
+++ /dev/null
@@ -1 +0,0 @@
-add_subdirectory(util)
diff --git a/unsupported/Eigen/CXX11/src/Core/util/CMakeLists.txt b/unsupported/Eigen/CXX11/src/Core/util/CMakeLists.txt
deleted file mode 100644
index 1e3b14712..000000000
--- a/unsupported/Eigen/CXX11/src/Core/util/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_CXX11_Core_util_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_CXX11_Core_util_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/Core/util COMPONENT Devel
- )
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
index f1ec04c49..babafe108 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
@@ -112,6 +112,11 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
return CoeffReturnType(index, m_impl.coeff(index));
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, 1);
+ }
+
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
protected:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
index 199d2ce41..5abff0800 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
@@ -89,6 +89,12 @@ template<typename LeftArgType, typename RightArgType, typename Device>
struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
{
typedef TensorAssignOp<LeftArgType, RightArgType> XprType;
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned,
@@ -104,12 +110,6 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
}
- typedef typename XprType::Index Index;
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions;
-
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
{
// The dimensions of the lhs and the rhs tensors should be equal to prevent
@@ -150,6 +150,19 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
return m_leftImpl.template packet<LoadMode>(index);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ // We assume that evalPacket or evalScalar is called to perform the
+ // assignment and account for the cost of the write here, but reduce left
+ // cost by one load because we are using m_leftImpl.coeffRef.
+ TensorOpCost left = m_leftImpl.costPerCoeff(vectorized);
+ return m_rightImpl.costPerCoeff(vectorized) +
+ TensorOpCost(
+ numext::maxi(0.0, left.bytes_loaded() - sizeof(CoeffReturnType)),
+ left.bytes_stored(), left.compute_cycles()) +
+ TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
+ }
+
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_leftImpl.data(); }
private:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index 69d1802d5..1a34f3ccc 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -334,6 +334,12 @@ class TensorBase<Derived, ReadOnlyAccessors>
return binaryExpr(other.derived(), internal::scalar_boolean_or_op());
}
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<internal::scalar_boolean_xor_op, const Derived, const OtherDerived>
+ operator^(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), internal::scalar_boolean_xor_op());
+ }
+
// Comparisons and tests.
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LT>, const Derived, const OtherDerived>
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index b6e6db12a..c771496e2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -101,6 +101,9 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
typedef DSizes<Index, NumDims> Dimensions;
typedef typename XprType::Scalar Scalar;
typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = false,
@@ -140,9 +143,6 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
}
}
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
@@ -247,9 +247,8 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
{
- const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+ EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
const Index originalIndex = index;
@@ -284,12 +283,12 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
// Todo: this could be extended to the second dimension if we're not
// broadcasting alongside the first dimension, and so on.
- if (innermostLoc + packetSize <= m_impl.dimensions()[0]) {
+ if (innermostLoc + PacketSize <= m_impl.dimensions()[0]) {
return m_impl.template packet<Unaligned>(inputIndex);
} else {
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
values[0] = m_impl.coeff(inputIndex);
- for (int i = 1; i < packetSize; ++i) {
+ for (int i = 1; i < PacketSize; ++i) {
values[i] = coeffColMajor(originalIndex+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
@@ -300,9 +299,8 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
{
- const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+ EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
const Index originalIndex = index;
@@ -337,12 +335,12 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
// Todo: this could be extended to the second dimension if we're not
// broadcasting alongside the first dimension, and so on.
- if (innermostLoc + packetSize <= m_impl.dimensions()[NumDims-1]) {
+ if (innermostLoc + PacketSize <= m_impl.dimensions()[NumDims-1]) {
return m_impl.template packet<Unaligned>(inputIndex);
} else {
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
values[0] = m_impl.coeff(inputIndex);
- for (int i = 1; i < packetSize; ++i) {
+ for (int i = 1; i < PacketSize; ++i) {
values[i] = coeffRowMajor(originalIndex+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
@@ -350,6 +348,29 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
}
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ double compute_cost = TensorOpCost::AddCost<Index>();
+ if (NumDims > 0) {
+ for (int i = NumDims - 1; i > 0; --i) {
+ compute_cost += TensorOpCost::DivCost<Index>();
+ if (internal::index_statically_eq<Broadcast>()(i, 1)) {
+ compute_cost +=
+ TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
+ } else {
+ if (!internal::index_statically_eq<InputDimensions>()(i, 1)) {
+ compute_cost += TensorOpCost::MulCost<Index>() +
+ TensorOpCost::ModCost<Index>() +
+ TensorOpCost::AddCost<Index>();
+ }
+ }
+ compute_cost +=
+ TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
+ }
+ }
+ return m_impl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
+ }
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index c21a98fe0..2742dbb95 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -134,6 +134,10 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
typedef typename XprType::Index Index;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
enum {
// Alignment can't be guaranteed at compile time since it depends on the
@@ -180,9 +184,6 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
m_inputOffset = m_stride * op.offset();
}
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
@@ -202,17 +203,16 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+ EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) ||
(static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) {
// m_stride is equal to 1, so let's avoid the integer division.
eigen_assert(m_stride == 1);
Index inputIndex = index * m_inputStride + m_inputOffset;
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
- for (int i = 0; i < packetSize; ++i) {
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ for (int i = 0; i < PacketSize; ++i) {
values[i] = m_impl.coeff(inputIndex);
inputIndex += m_inputStride;
}
@@ -226,13 +226,13 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
} else {
const Index idx = index / m_stride;
const Index rem = index - idx * m_stride;
- if (rem + packetSize <= m_stride) {
+ if (rem + PacketSize <= m_stride) {
Index inputIndex = idx * m_inputStride + m_inputOffset + rem;
return m_impl.template packet<LoadMode>(inputIndex);
} else {
// Cross the stride boundary. Fallback to slow path.
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
- for (int i = 0; i < packetSize; ++i) {
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ for (int i = 0; i < PacketSize; ++i) {
values[i] = coeff(index);
++index;
}
@@ -242,6 +242,28 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
}
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ double cost = 0;
+ if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) &&
+ m_dim.actualDim() == 0) ||
+ (static_cast<int>(Layout) == static_cast<int>(RowMajor) &&
+ m_dim.actualDim() == NumInputDims - 1)) {
+ cost += TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
+ } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) &&
+ m_dim.actualDim() == NumInputDims - 1) ||
+ (static_cast<int>(Layout) == static_cast<int>(RowMajor) &&
+ m_dim.actualDim() == 0)) {
+ cost += TensorOpCost::AddCost<Index>();
+ } else {
+ cost += 3 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>() +
+ 3 * TensorOpCost::AddCost<Index>();
+ }
+
+ return m_impl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, cost, vectorized, PacketSize);
+ }
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const {
CoeffReturnType* result = const_cast<CoeffReturnType*>(m_impl.data());
if (((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumDims) ||
@@ -298,6 +320,9 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
typedef typename XprType::Index Index;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = false,
@@ -309,9 +334,6 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
: Base(op, device)
{ }
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
{
return this->m_impl.coeffRef(this->srcCoeff(index));
@@ -320,17 +342,16 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
{
- static const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == 0) ||
(static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) {
// m_stride is equal to 1, so let's avoid the integer division.
eigen_assert(this->m_stride == 1);
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
Index inputIndex = index * this->m_inputStride + this->m_inputOffset;
- for (int i = 0; i < packetSize; ++i) {
+ for (int i = 0; i < PacketSize; ++i) {
this->m_impl.coeffRef(inputIndex) = values[i];
inputIndex += this->m_inputStride;
}
@@ -342,14 +363,14 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
} else {
const Index idx = index / this->m_stride;
const Index rem = index - idx * this->m_stride;
- if (rem + packetSize <= this->m_stride) {
+ if (rem + PacketSize <= this->m_stride) {
const Index inputIndex = idx * this->m_inputStride + this->m_inputOffset + rem;
this->m_impl.template writePacket<StoreMode>(inputIndex, x);
} else {
// Cross stride boundary. Fallback to slow path.
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
- for (int i = 0; i < packetSize; ++i) {
+ for (int i = 0; i < PacketSize; ++i) {
this->coeffRef(index) = values[i];
++index;
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
index 7738f18fb..839c6e3e5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
@@ -260,6 +260,21 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
return rslt;
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ const double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
+ 2 * TensorOpCost::MulCost<Index>() +
+ TensorOpCost::DivCost<Index>() +
+ TensorOpCost::ModCost<Index>());
+ const double lhs_size = m_leftImpl.dimensions().TotalSize();
+ const double rhs_size = m_rightImpl.dimensions().TotalSize();
+ return (lhs_size / (lhs_size + rhs_size)) *
+ m_leftImpl.costPerCoeff(vectorized) +
+ (rhs_size / (lhs_size + rhs_size)) *
+ m_rightImpl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, compute_cost);
+ }
+
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
protected:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index f070ba61e..97182258d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -426,6 +426,99 @@ struct TensorContractionEvaluatorBase
buffer, resIncr, alpha);
}
+ template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+ EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const {
+ // columns in left side, rows in right side
+ const Index k = this->m_k_size;
+
+ // rows in left side
+ const Index m = this->m_i_size;
+
+ // columns in right side
+ const Index n = this->m_j_size;
+
+ // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
+ this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
+
+ // define mr, nr, and all of my data mapper types
+ typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
+ typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
+ typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
+
+ const Index nr = Traits::nr;
+ const Index mr = Traits::mr;
+
+ typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
+ typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
+
+ const Index lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size;
+ const Index rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size;
+
+ typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
+ LeftEvaluator, left_nocontract_t,
+ contract_t, lhs_packet_size,
+ lhs_inner_dim_contiguous,
+ false, Unaligned> LhsMapper;
+
+ typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
+ RightEvaluator, right_nocontract_t,
+ contract_t, rhs_packet_size,
+ rhs_inner_dim_contiguous,
+ rhs_inner_dim_reordered, Unaligned> RhsMapper;
+
+ typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+
+ // Declare GEBP packing and kernel structs
+ internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, mr, Traits::LhsProgress, ColMajor> pack_lhs;
+ internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, nr, ColMajor> pack_rhs;
+
+ internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, mr, nr, false, false> gebp;
+
+ // initialize data mappers
+ LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
+ this->m_left_contracting_strides, this->m_k_strides);
+
+ RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
+ this->m_right_contracting_strides, this->m_k_strides);
+
+ OutputMapper output(buffer, m);
+
+ // Sizes of the blocks to load in cache. See the Goto paper for details.
+ internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, 1);
+ const Index kc = blocking.kc();
+ const Index mc = numext::mini(m, blocking.mc());
+ const Index nc = numext::mini(n, blocking.nc());
+ const Index sizeA = mc * kc;
+ const Index sizeB = kc * nc;
+
+ LhsScalar* blockA = static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar)));
+ RhsScalar* blockB = static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar)));
+
+ for(Index i2=0; i2<m; i2+=mc)
+ {
+ const Index actual_mc = numext::mini(i2+mc,m)-i2;
+ for (Index k2 = 0; k2 < k; k2 += kc) {
+ // make sure we don't overshoot right edge of left matrix, then pack vertical panel
+ const Index actual_kc = numext::mini(k2 + kc, k) - k2;
+ pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc, 0, 0);
+
+ // series of horizontal blocks
+ for (Index j2 = 0; j2 < n; j2 += nc) {
+ // make sure we don't overshoot right edge of right matrix, then pack block
+ const Index actual_nc = numext::mini(j2 + nc, n) - j2;
+ pack_rhs(blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc, 0, 0);
+
+ // call gebp (matrix kernel)
+ // The parameters here are copied from Eigen's GEMM implementation
+ gebp(output.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, 1.0, -1, -1, 0, 0);
+ }
+ }
+ }
+
+ this->m_device.deallocate(blockA);
+ this->m_device.deallocate(blockB);
+ }
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
m_leftImpl.cleanup();
m_rightImpl.cleanup();
@@ -440,6 +533,10 @@ struct TensorContractionEvaluatorBase
return m_result[index];
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
+ }
+
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
@@ -529,100 +626,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
return;
}
- evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
- }
-
- template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
- EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const {
- // columns in left side, rows in right side
- const Index k = this->m_k_size;
-
- // rows in left side
- const Index m = this->m_i_size;
-
- // columns in right side
- const Index n = this->m_j_size;
-
- // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
- this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
-
- // define mr, nr, and all of my data mapper types
- typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
- typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
- typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
-
- const Index nr = Traits::nr;
- const Index mr = Traits::mr;
-
- typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
- typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
-
- const Index lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size;
- const Index rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size;
-
- typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
- LeftEvaluator, left_nocontract_t,
- contract_t, lhs_packet_size,
- lhs_inner_dim_contiguous,
- false, Unaligned> LhsMapper;
-
- typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
- RightEvaluator, right_nocontract_t,
- contract_t, rhs_packet_size,
- rhs_inner_dim_contiguous,
- rhs_inner_dim_reordered, Unaligned> RhsMapper;
-
- typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
-
- // Declare GEBP packing and kernel structs
- internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, mr, Traits::LhsProgress, ColMajor> pack_lhs;
- internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, nr, ColMajor> pack_rhs;
-
- internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, mr, nr, false, false> gebp;
-
- // initialize data mappers
- LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
- this->m_left_contracting_strides, this->m_k_strides);
-
- RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
- this->m_right_contracting_strides, this->m_k_strides);
-
- OutputMapper output(buffer, m);
-
- // Sizes of the blocks to load in cache. See the Goto paper for details.
- internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, 1);
- const Index kc = blocking.kc();
- const Index mc = numext::mini(m, blocking.mc());
- const Index nc = numext::mini(n, blocking.nc());
- const Index sizeA = mc * kc;
- const Index sizeB = kc * nc;
-
- LhsScalar* blockA = static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar)));
- RhsScalar* blockB = static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar)));
-
- for(Index i2=0; i2<m; i2+=mc)
- {
- const Index actual_mc = numext::mini(i2+mc,m)-i2;
- for (Index k2 = 0; k2 < k; k2 += kc) {
- // make sure we don't overshoot right edge of left matrix, then pack vertical panel
- const Index actual_kc = numext::mini(k2 + kc, k) - k2;
- pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc, 0, 0);
-
- // series of horizontal blocks
- for (Index j2 = 0; j2 < n; j2 += nc) {
- // make sure we don't overshoot right edge of right matrix, then pack block
- const Index actual_nc = numext::mini(j2 + nc, n) - j2;
- pack_rhs(blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc, 0, 0);
-
- // call gebp (matrix kernel)
- // The parameters here are copied from Eigen's GEMM implementation
- gebp(output.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, 1.0, -1, -1, 0, 0);
- }
- }
- }
-
- this->m_device.deallocate(blockA);
- this->m_device.deallocate(blockB);
+ this->template evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
}
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
index 3d3f6904f..5cf7b4f71 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
@@ -35,9 +35,7 @@ class TensorContractionBlocking {
computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, mc_, nc_, num_threads);
}
else {
- if (kc_ && mc_ && nc_) {
- mc_ = (((m / num_threads) + 15) / 16) * 16;
- }
+ computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, nc_, mc_, num_threads);
}
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
index a96776a77..a2f1f71f5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
@@ -177,7 +177,6 @@ template <typename Eval, typename Scalar> struct ConversionSubExprEval<true, Eva
};
-
// Eval as rvalue
template<typename TargetType, typename ArgType, typename Device>
struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
@@ -190,6 +189,7 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
typedef typename internal::remove_all<typename internal::traits<ArgType>::Scalar>::type SrcType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef typename PacketType<SrcType, Device>::type PacketSourceType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = false,
@@ -231,6 +231,21 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
return converter.template packet<LoadMode>(index);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ const double cast_cost = TensorOpCost::CastCost<SrcType, TargetType>();
+ if (vectorized) {
+ const double SrcCoeffRatio =
+ internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio;
+ const double TgtCoeffRatio =
+ internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio;
+ return m_impl.costPerCoeff(vectorized) * (SrcCoeffRatio / PacketSize) +
+ TensorOpCost(0, 0, TgtCoeffRatio * (cast_cost / PacketSize));
+ } else {
+ return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, cast_cost);
+ }
+ }
+
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
protected:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 4fe1fb943..ff3c5662d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -297,6 +297,11 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
typedef typename XprType::Index Index;
typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
enum {
IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
@@ -367,10 +372,6 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
}
}
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
@@ -405,7 +406,6 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
template<int LoadMode>
EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
{
- const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
Index indices[2] = {index, index+PacketSize-1};
Index startInputs[2] = {0, 0};
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -448,6 +448,23 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
}
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ const double kernel_size = m_kernelImpl.dimensions().TotalSize();
+ // We ignore the use of fused multiply-add.
+ const double convolve_compute_cost =
+ TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
+ const double firstIndex_compute_cost =
+ NumDims *
+ (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
+ TensorOpCost::DivCost<Index>());
+ return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
+ kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
+ m_kernelImpl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, convolve_compute_cost, vectorized,
+ PacketSize));
+ }
+
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
private:
@@ -773,6 +790,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
typedef typename InputArgType::Scalar Scalar;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
@@ -1044,6 +1062,25 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
+ // model.
+ const double kernel_size = m_kernelImpl.dimensions().TotalSize();
+ // We ignore the use of fused multiply-add.
+ const double convolve_compute_cost =
+ TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
+ const double firstIndex_compute_cost =
+ NumDims *
+ (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
+ TensorOpCost::DivCost<Index>());
+ return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
+ kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
+ m_kernelImpl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, convolve_compute_cost, vectorized,
+ PacketSize));
+ }
+
private:
// No assignment (copies are needed by the kernels)
TensorEvaluator& operator = (const TensorEvaluator&);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
new file mode 100644
index 000000000..4e8f86674
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
@@ -0,0 +1,214 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Rasmus Munk Larsen <rmlarsen@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
+#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
+
+//#if !defined(EIGEN_USE_GPU)
+//#define EIGEN_USE_COST_MODEL
+//#endif
+
+namespace Eigen {
+
+/** \class TensorEvaluator
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief A cost model used to limit the number of threads used for evaluating
+ * tensor expression.
+ *
+ */
+
+// Class storing the cost of evaluating a tensor expression in terms of the
+// estimated number of operand bytes loads, bytes stored, and compute cycles.
+class TensorOpCost {
+ public:
+ // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple
+ // model based on minimal reciprocal throughput numbers from Intel or
+ // Agner Fog's tables would be better than what is there now.
+ template <typename ArgType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int MulCost() {
+ return internal::functor_traits<
+ internal::scalar_product_op<ArgType, ArgType>>::Cost;
+ }
+ template <typename ArgType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int AddCost() {
+ return internal::functor_traits<internal::scalar_sum_op<ArgType>>::Cost;
+ }
+ template <typename ArgType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int DivCost() {
+ return internal::functor_traits<
+ internal::scalar_quotient_op<ArgType, ArgType>>::Cost;
+ }
+ template <typename ArgType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int ModCost() {
+ return internal::functor_traits<internal::scalar_mod_op<ArgType>>::Cost;
+ }
+ template <typename SrcType, typename TargetType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int CastCost() {
+ return internal::functor_traits<
+ internal::scalar_cast_op<SrcType, TargetType>>::Cost;
+ }
+
+ TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
+ TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
+ : bytes_loaded_(bytes_loaded),
+ bytes_stored_(bytes_stored),
+ compute_cycles_(compute_cycles) {}
+
+ TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles,
+ bool vectorized, double packet_size)
+ : bytes_loaded_(bytes_loaded),
+ bytes_stored_(bytes_stored),
+ compute_cycles_(vectorized ? compute_cycles / packet_size
+ : compute_cycles) {
+ using std::isfinite;
+ eigen_assert(bytes_loaded >= 0 && (isfinite)(bytes_loaded));
+ eigen_assert(bytes_stored >= 0 && (isfinite)(bytes_stored));
+ eigen_assert(compute_cycles >= 0 && (isfinite)(compute_cycles));
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const {
+ return bytes_loaded_;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const {
+ return bytes_stored_;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const {
+ return compute_cycles_;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost(
+ double load_cost, double store_cost, double compute_cost) const {
+ return load_cost * bytes_loaded_ + store_cost * bytes_stored_ +
+ compute_cost * compute_cycles_;
+ }
+
+ // Drop memory access component. Intended for cases when memory accesses are
+ // sequential or are completely masked by computations.
+ EIGEN_DEVICE_FUNC void dropMemoryCost() {
+ bytes_loaded_ = 0;
+ bytes_stored_ = 0;
+ }
+
+ // TODO(rmlarsen): Define min in terms of total cost, not elementwise.
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMin(
+ const TensorOpCost& rhs) {
+ bytes_loaded_ = numext::mini(bytes_loaded_, rhs.bytes_loaded());
+ bytes_stored_ = numext::mini(bytes_stored_, rhs.bytes_stored());
+ compute_cycles_ = numext::mini(compute_cycles_, rhs.compute_cycles());
+ return *this;
+ }
+
+ // TODO(rmlarsen): Define max in terms of total cost, not elementwise.
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMax(
+ const TensorOpCost& rhs) {
+ bytes_loaded_ = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
+ bytes_stored_ = numext::maxi(bytes_stored_, rhs.bytes_stored());
+ compute_cycles_ = numext::maxi(compute_cycles_, rhs.compute_cycles());
+ return *this;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(
+ const TensorOpCost& rhs) {
+ bytes_loaded_ += rhs.bytes_loaded();
+ bytes_stored_ += rhs.bytes_stored();
+ compute_cycles_ += rhs.compute_cycles();
+ return *this;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs) {
+ bytes_loaded_ *= rhs;
+ bytes_stored_ *= rhs;
+ compute_cycles_ *= rhs;
+ return *this;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+(
+ TensorOpCost lhs, const TensorOpCost& rhs) {
+ lhs += rhs;
+ return lhs;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
+ TensorOpCost lhs, double rhs) {
+ lhs *= rhs;
+ return lhs;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
+ double lhs, TensorOpCost rhs) {
+ rhs *= lhs;
+ return rhs;
+ }
+
+ friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) {
+ return os << "[bytes_loaded = " << tc.bytes_loaded()
+ << ", bytes_stored = " << tc.bytes_stored()
+ << ", compute_cycles = " << tc.compute_cycles() << "]";
+ }
+
+ private:
+ double bytes_loaded_;
+ double bytes_stored_;
+ double compute_cycles_;
+};
+
+// TODO(rmlarsen): Implement a policy that chooses an "optimal" number of theads
+// in [1:max_threads] instead of just switching multi-threading off for small
+// work units.
+template <typename Device>
+class TensorCostModel {
+ public:
+ // Scaling from Eigen compute cost to device cycles.
+ static const int kDeviceCyclesPerComputeCycle = 1;
+
+ // Costs in device cycles.
+ static const int kStartupCycles = 100000;
+ static const int kPerThreadCycles = 100000;
+ static const int kTaskSize = 40000;
+
+ // Returns the number of threads in [1:max_threads] to use for
+ // evaluating an expression with the given output size and cost per
+ // coefficient.
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(
+ double output_size, const TensorOpCost& cost_per_coeff, int max_threads) {
+ double cost = totalCost(output_size, cost_per_coeff);
+ int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
+ return numext::mini(max_threads, numext::maxi(1, threads));
+ }
+
+ // taskSize assesses parallel task size.
+ // Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task
+ // granularity needs to be increased to mitigate parallelization overheads.
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(
+ double output_size, const TensorOpCost& cost_per_coeff) {
+ return totalCost(output_size, cost_per_coeff) / kTaskSize;
+ }
+
+ private:
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(
+ double output_size, const TensorOpCost& cost_per_coeff) {
+ // Cost of memory fetches from L2 cache. 64 is typical cache line size.
+ // 11 is L2 cache latency on Haswell.
+ // We don't know whether data is in L1, L2 or L3. But we are most interested
+ // in single-threaded computational time around 100us-10ms (smaller time
+ // is too small for parallelization, larger time is not intersting
+ // either because we are probably using all available threads already).
+ // And for the target time range, L2 seems to be what matters. Data set
+ // fitting into L1 is too small to take noticeable time. Data set fitting
+ // only into L3 presumably will take more than 10ms to load and process.
+ const double kLoadCycles = 1.0 / 64 * 11;
+ const double kStoreCycles = 1.0 / 64 * 11;
+ // Scaling from Eigen compute cost to device cycles.
+ return output_size *
+ cost_per_coeff.total_cost(kLoadCycles, kStoreCycles,
+ kDeviceCyclesPerComputeCycle);
+ }
+};
+
+} // namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
index b58e513b4..e020d076f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
@@ -83,8 +83,10 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
typedef typename internal::traits<ArgType>::Index Index;
static const int NumDims = internal::traits<ArgType>::NumDimensions;
typedef DSizes<Index, NumDims> Dimensions;
- typedef
- typename internal::remove_const<typename ArgType::Scalar>::type Scalar;
+ typedef typename internal::remove_const<typename ArgType::Scalar>::type Scalar;
+ typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = false,
@@ -101,9 +103,6 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
m_dimensions = op.func().dimensions(op.expression());
}
- typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
@@ -134,6 +133,11 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ // TODO(rmlarsen): Extend CustomOp API to return its cost estimate.
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
+ }
+
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; }
protected:
@@ -236,6 +240,9 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
static const int NumDims = internal::traits<XprType>::NumDimensions;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename XprType::Scalar Scalar;
+ typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = false,
@@ -252,9 +259,6 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
m_dimensions = op.func().dimensions(op.lhsExpression(), op.rhsExpression());
}
- typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
@@ -284,6 +288,11 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ // TODO(rmlarsen): Extend CustomOp API to return its cost estimate.
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
+ }
+
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; }
protected:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index 821835cf3..1d2d162dc 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -291,15 +291,9 @@ struct GpuDevice {
int max_blocks_;
};
-#ifndef __CUDA_ARCH__
#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
(kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
assert(cudaGetLastError() == cudaSuccess);
-#else
-#define LAUNCH_CUDA_KERNEL(kernel, ...) \
- { const auto __attribute__((__unused__)) __makeTheKernelInstantiate = &(kernel); } \
- eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__);
-#endif
// FIXME: Should be device and kernel specific.
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
index 267f6f8e3..9d141395b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
@@ -44,6 +44,26 @@ struct DefaultDevice {
#endif
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
+#ifndef __CUDA_ARCH__
+ // Running on the host CPU
+ return l1CacheSize();
+#else
+ // Running on a CUDA device, return the amount of shared memory available.
+ return 48*1024;
+#endif
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+#ifndef __CUDA_ARCH__
+ // Running single threaded on the host CPU
+ return l3CacheSize();
+#else
+ // Running on a CUDA device
+ return firstLevelCacheSize();
+#endif
+ }
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
#ifndef __CUDA_ARCH__
// Running single threaded on the host CPU
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index cd3dd214b..c02891465 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -12,145 +12,15 @@
namespace Eigen {
-// This defines an interface that ThreadPoolDevice can take to use
-// custom thread pools underneath.
-class ThreadPoolInterface {
- public:
- virtual void Schedule(std::function<void()> fn) = 0;
-
- virtual ~ThreadPoolInterface() {}
-};
-
-// The implementation of the ThreadPool type ensures that the Schedule method
-// runs the functions it is provided in FIFO order when the scheduling is done
-// by a single thread.
-// Environment provides a way to create threads and also allows to intercept
-// task submission and execution.
-template <typename Environment>
-class ThreadPoolTempl : public ThreadPoolInterface {
- public:
- // Construct a pool that contains "num_threads" threads.
- explicit ThreadPoolTempl(int num_threads, Environment env = Environment())
- : env_(env), threads_(num_threads), waiters_(num_threads) {
- for (int i = 0; i < num_threads; i++) {
- threads_.push_back(env.CreateThread([this]() { WorkerLoop(); }));
- }
- }
-
- // Wait until all scheduled work has finished and then destroy the
- // set of threads.
- ~ThreadPoolTempl() {
- {
- // Wait for all work to get done.
- std::unique_lock<std::mutex> l(mu_);
- while (!pending_.empty()) {
- empty_.wait(l);
- }
- exiting_ = true;
-
- // Wakeup all waiters.
- for (auto w : waiters_) {
- w->ready = true;
- w->task.f = nullptr;
- w->cv.notify_one();
- }
- }
-
- // Wait for threads to finish.
- for (auto t : threads_) {
- delete t;
- }
- }
-
- // Schedule fn() for execution in the pool of threads. The functions are
- // executed in the order in which they are scheduled.
- void Schedule(std::function<void()> fn) {
- Task t = env_.CreateTask(std::move(fn));
- std::unique_lock<std::mutex> l(mu_);
- if (waiters_.empty()) {
- pending_.push_back(std::move(t));
- } else {
- Waiter* w = waiters_.back();
- waiters_.pop_back();
- w->ready = true;
- w->task = std::move(t);
- w->cv.notify_one();
- }
- }
-
- protected:
- void WorkerLoop() {
- std::unique_lock<std::mutex> l(mu_);
- Waiter w;
- Task t;
- while (!exiting_) {
- if (pending_.empty()) {
- // Wait for work to be assigned to me
- w.ready = false;
- waiters_.push_back(&w);
- while (!w.ready) {
- w.cv.wait(l);
- }
- t = w.task;
- w.task.f = nullptr;
- } else {
- // Pick up pending work
- t = std::move(pending_.front());
- pending_.pop_front();
- if (pending_.empty()) {
- empty_.notify_all();
- }
- }
- if (t.f) {
- mu_.unlock();
- env_.ExecuteTask(t);
- t.f = nullptr;
- mu_.lock();
- }
- }
- }
-
- private:
- typedef typename Environment::Task Task;
- typedef typename Environment::EnvThread Thread;
-
- struct Waiter {
- std::condition_variable cv;
- Task task;
- bool ready;
- };
-
- Environment env_;
- std::mutex mu_;
- MaxSizeVector<Thread*> threads_; // All threads
- MaxSizeVector<Waiter*> waiters_; // Stack of waiting threads.
- std::deque<Task> pending_; // Queue of pending work
- std::condition_variable empty_; // Signaled on pending_.empty()
- bool exiting_ = false;
-};
-
-struct StlThreadEnvironment {
- struct Task {
- std::function<void()> f;
- };
-
- // EnvThread constructor must start the thread,
- // destructor must join the thread.
- class EnvThread {
- public:
- EnvThread(std::function<void()> f) : thr_(f) {}
- ~EnvThread() { thr_.join(); }
-
- private:
- std::thread thr_;
- };
-
- EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(f); }
- Task CreateTask(std::function<void()> f) { return Task{std::move(f)}; }
- void ExecuteTask(const Task& t) { t.f(); }
-};
-
-typedef ThreadPoolTempl<StlThreadEnvironment> ThreadPool;
+// Use the SimpleThreadPool by default. We'll switch to the new non blocking
+// thread pool later.
+#ifdef EIGEN_USE_NONBLOCKING_THREAD_POOL
+template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>;
+typedef NonBlockingThreadPool ThreadPool;
+#else
+template <typename Env> using ThreadPoolTempl = SimpleThreadPoolTempl<Env>;
+typedef SimpleThreadPool ThreadPool;
+#endif
// Barrier is an object that allows one or more threads to wait until
@@ -264,6 +134,15 @@ struct ThreadPoolDevice {
return num_threads_;
}
+ EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
+ return l1CacheSize();
+ }
+
+ EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+ // The l3 cache size is shared between all the cores.
+ return l3CacheSize() / num_threads_;
+ }
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
// Should return an enum that encodes the ISA supported by the CPU
return 1;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index 1fb27a65b..5c6748a43 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -88,10 +88,14 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
typedef TensorEvalToOp<ArgType> XprType;
typedef typename ArgType::Scalar Scalar;
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+ typedef typename XprType::Index Index;
+ typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = true,
- PacketAccess = true,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = true
@@ -104,10 +108,6 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {
}
- typedef typename XprType::Index Index;
- typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* scalar) {
@@ -138,6 +138,13 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ // We assume that evalPacket or evalScalar is called to perform the
+ // assignment and account for the cost of the write here.
+ return m_impl.costPerCoeff(vectorized) +
+ TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
+ }
+
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_buffer; }
private:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index 947a8ed88..ae4ce3c90 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -101,6 +101,11 @@ struct TensorEvaluator
}
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
+ internal::unpacket_traits<PacketReturnType>::size);
+ }
+
EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; }
protected:
@@ -184,6 +189,11 @@ struct TensorEvaluator<const Derived, Device>
return loadConstant(m_data+index);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
+ internal::unpacket_traits<PacketReturnType>::size);
+ }
+
EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; }
protected:
@@ -219,6 +229,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
typedef typename XprType::Scalar Scalar;
typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
@@ -237,6 +248,12 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
return m_functor.template packetOp<Index, PacketReturnType>(index);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
+ internal::unpacket_traits<PacketReturnType>::size);
+ }
+
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
private:
@@ -270,6 +287,7 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
typedef typename XprType::Scalar Scalar;
typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
@@ -293,6 +311,12 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
return m_functor.packetOp(m_argImpl.template packet<LoadMode>(index));
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ const double functor_cost = internal::functor_traits<UnaryOp>::Cost;
+ return m_argImpl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
+ }
+
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
private:
@@ -330,6 +354,7 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
typedef typename XprType::Scalar Scalar;
typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions;
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
@@ -358,6 +383,14 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
return m_functor.packetOp(m_leftImpl.template packet<LoadMode>(index), m_rightImpl.template packet<LoadMode>(index));
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ const double functor_cost = internal::functor_traits<BinaryOp>::Cost;
+ return m_leftImpl.costPerCoeff(vectorized) +
+ m_rightImpl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
+ }
+
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
private:
@@ -398,6 +431,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
typedef typename XprType::Index Index;
typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
typedef typename TensorEvaluator<IfArgType, Device>::Dimensions Dimensions;
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
@@ -425,7 +459,6 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
template<int LoadMode>
EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
{
- const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
internal::Selector<PacketSize> select;
for (Index i = 0; i < PacketSize; ++i) {
select.select[i] = m_condImpl.coeff(index+i);
@@ -435,6 +468,13 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
m_elseImpl.template packet<LoadMode>(index));
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ return m_condImpl.costPerCoeff(vectorized) +
+ m_thenImpl.costPerCoeff(vectorized)
+ .cwiseMax(m_elseImpl.costPerCoeff(vectorized));
+ }
+
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
private:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 4f4e07aaf..5c3d4d630 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -59,9 +59,16 @@ class TensorExecutor<Expression, DefaultDevice, true>
{
const Index size = array_prod(evaluator.dimensions());
const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
+ // Manually unroll this loop since compilers don't do it.
+ const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
+ for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
+ evaluator.evalPacket(i);
+ evaluator.evalPacket(i+PacketSize);
+ evaluator.evalPacket(i+2*PacketSize);
+ evaluator.evalPacket(i+3*PacketSize);
+ }
const Index VectorizedSize = (size / PacketSize) * PacketSize;
-
- for (Index i = 0; i < VectorizedSize; i += PacketSize) {
+ for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
evaluator.evalPacket(i);
}
for (Index i = VectorizedSize; i < size; ++i) {
@@ -78,8 +85,9 @@ class TensorExecutor<Expression, DefaultDevice, true>
#ifdef EIGEN_USE_THREADS
template <typename Evaluator, typename Index, bool Vectorizable>
struct EvalRange {
- static void run(Evaluator evaluator, const Index first, const Index last) {
- eigen_assert(last > first);
+ static void run(Evaluator* evaluator_in, const Index first, const Index last) {
+ Evaluator evaluator = *evaluator_in;
+ eigen_assert(last >= first);
for (Index i = first; i < last; ++i) {
evaluator.evalScalar(i);
}
@@ -88,28 +96,34 @@ struct EvalRange {
template <typename Evaluator, typename Index>
struct EvalRange<Evaluator, Index, true> {
- static void run(Evaluator evaluator, const Index first, const Index last) {
- eigen_assert(last > first);
-
+ static void run(Evaluator* evaluator_in, const Index first, const Index last) {
+ Evaluator evaluator = *evaluator_in;
+ eigen_assert(last >= first);
Index i = first;
- static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+ const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
if (last - first >= PacketSize) {
eigen_assert(first % PacketSize == 0);
- Index lastPacket = last - (last % PacketSize);
- for (; i < lastPacket; i += PacketSize) {
+ Index last_chunk_offset = last - 4 * PacketSize;
+ // Manually unroll this loop since compilers don't do it.
+ for (; i <= last_chunk_offset; i += 4*PacketSize) {
+ evaluator.evalPacket(i);
+ evaluator.evalPacket(i+PacketSize);
+ evaluator.evalPacket(i+2*PacketSize);
+ evaluator.evalPacket(i+3*PacketSize);
+ }
+ last_chunk_offset = last - PacketSize;
+ for (; i <= last_chunk_offset; i += PacketSize) {
evaluator.evalPacket(i);
}
}
-
for (; i < last; ++i) {
evaluator.evalScalar(i);
}
}
};
-template<typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
-{
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
public:
typedef typename Expression::Index Index;
static inline void run(const Expression& expr, const ThreadPoolDevice& device)
@@ -119,24 +133,34 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign)
{
+ const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
const Index size = array_prod(evaluator.dimensions());
-
- static const int PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
-
- int blocksz = std::ceil<int>(static_cast<float>(size)/device.numThreads()) + PacketSize - 1;
- const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
- const unsigned int numblocks = static_cast<unsigned int>(size / blocksize);
-
- Barrier barrier(numblocks);
- for (unsigned int i = 0; i < numblocks; ++i) {
- device.enqueue_with_barrier(&barrier, &EvalRange<Evaluator, Index, Vectorizable>::run, evaluator, i*blocksize, (i+1)*blocksize);
+ size_t num_threads = device.numThreads();
+#ifdef EIGEN_USE_COST_MODEL
+ if (num_threads > 1) {
+ num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+ size, evaluator.costPerCoeff(Vectorizable), num_threads);
}
-
- if (static_cast<Index>(numblocks) * blocksize < size) {
- EvalRange<Evaluator, Index, Vectorizable>::run(evaluator, numblocks * blocksize, size);
+#endif
+ if (num_threads == 1) {
+ EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
+ } else {
+ Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1;
+ const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
+ const Index numblocks = size / blocksize;
+
+ Barrier barrier(numblocks);
+ for (int i = 0; i < numblocks; ++i) {
+ device.enqueue_with_barrier(
+ &barrier, &EvalRange<Evaluator, Index, Vectorizable>::run,
+ &evaluator, i * blocksize, (i + 1) * blocksize);
+ }
+ if (numblocks * blocksize < size) {
+ EvalRange<Evaluator, Index, Vectorizable>::run(
+ &evaluator, numblocks * blocksize, size);
+ }
+ barrier.Wait();
}
-
- barrier.Wait();
}
evaluator.cleanup();
}
@@ -147,98 +171,78 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
// GPU: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU)
-template <typename Expression>
-class TensorExecutor<Expression, GpuDevice, false> {
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, GpuDevice, Vectorizable> {
public:
typedef typename Expression::Index Index;
- static EIGEN_DEVICE_FUNC void run(const Expression& expr, const GpuDevice& device);
+ static void run(const Expression& expr, const GpuDevice& device);
};
-template <typename Expression>
-class TensorExecutor<Expression, GpuDevice, true> {
- public:
- typedef typename Expression::Index Index;
- static EIGEN_DEVICE_FUNC void run(const Expression& expr, const GpuDevice& device);
-};
#if defined(__CUDACC__)
+template <typename Evaluator, typename Index, bool Vectorizable>
+struct EigenMetaKernelEval {
+ static __device__ EIGEN_ALWAYS_INLINE
+ void run(Evaluator& eval, Index first, Index last, Index step_size) {
+ for (Index i = first; i < last; i += step_size) {
+ eval.evalScalar(i);
+ }
+ }
+};
+
+template <typename Evaluator, typename Index>
+struct EigenMetaKernelEval<Evaluator, Index, true> {
+ static __device__ EIGEN_ALWAYS_INLINE
+ void run(Evaluator& eval, Index first, Index last, Index step_size) {
+ const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+ const Index vectorized_size = (last / PacketSize) * PacketSize;
+ const Index vectorized_step_size = step_size * PacketSize;
+
+ // Use the vector path
+ for (Index i = first * PacketSize; i < vectorized_size;
+ i += vectorized_step_size) {
+ eval.evalPacket(i);
+ }
+ for (Index i = vectorized_size + first; i < last; i += step_size) {
+ eval.evalScalar(i);
+ }
+ }
+};
template <typename Evaluator, typename Index>
__global__ void
__launch_bounds__(1024)
-EigenMetaKernel_NonVectorizable(Evaluator memcopied_eval, Index size) {
- // Cuda memcopies the kernel arguments. That's fine for POD, but for more
- // complex types such as evaluators we should really conform to the C++
- // standard and call a proper copy constructor.
- Evaluator eval(memcopied_eval);
+EigenMetaKernel(Evaluator memcopied_eval, Index size) {
const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
const Index step_size = blockDim.x * gridDim.x;
- // Use the scalar path
- for (Index i = first_index; i < size; i += step_size) {
- eval.evalScalar(i);
- }
-}
-
-template <typename Evaluator, typename Index>
-__global__ void
-__launch_bounds__(1024)
-EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) {
// Cuda memcopies the kernel arguments. That's fine for POD, but for more
// complex types such as evaluators we should really conform to the C++
// standard and call a proper copy constructor.
Evaluator eval(memcopied_eval);
- const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
- const Index step_size = blockDim.x * gridDim.x;
-
- // Use the vector path
- const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
- const Index vectorized_step_size = step_size * PacketSize;
- const Index vectorized_size = (size / PacketSize) * PacketSize;
- for (Index i = first_index * PacketSize; i < vectorized_size;
- i += vectorized_step_size) {
- eval.evalPacket(i);
- }
- for (Index i = vectorized_size + first_index; i < size; i += step_size) {
- eval.evalScalar(i);
- }
+ const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
+ EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size);
}
/*static*/
-template <typename Expression>
-EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, false>::run(const Expression& expr, const GpuDevice& device)
-{
+template <typename Expression, bool Vectorizable>
+inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
+ const Expression& expr, const GpuDevice& device) {
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
- if (needs_assign)
- {
+ if (needs_assign) {
const int block_size = device.maxCudaThreadsPerBlock();
- const int max_blocks = numext::mini<int>(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size);
+ const int max_blocks = device.getNumCudaMultiProcessors() *
+ device.maxCudaThreadsPerMultiProcessor() / block_size;
const Index size = array_prod(evaluator.dimensions());
- // Create a least one block to ensure we won't crash if we're called with tensors of size 0.
- const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
- LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
- }
- evaluator.cleanup();
-}
-
+ // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
+ const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
-/*static*/
-template<typename Expression>
-EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, true>::run(const Expression& expr, const GpuDevice& device)
-{
- TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
- const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
- if (needs_assign)
- {
- const int block_size = device.maxCudaThreadsPerBlock();
- const int max_blocks = numext::mini<int>(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size);
- const Index size = array_prod(evaluator.dimensions());
- // Create a least one block to ensure we won't crash if we're called with tensors of size 0.
- const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
- LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
+ LAUNCH_CUDA_KERNEL(
+ (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
+ num_blocks, block_size, 0, device, evaluator, size);
}
evaluator.cleanup();
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
index d6db45ade..ece2ed91b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
@@ -129,6 +129,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
typedef typename internal::conditional<FFTResultType == RealPart || FFTResultType == ImagPart, RealScalar, ComplexScalar>::type OutputScalar;
typedef OutputScalar CoeffReturnType;
typedef typename PacketType<OutputScalar, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = false,
@@ -176,7 +177,6 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
}
}
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
if (m_data) {
m_device.deallocate(m_data);
@@ -189,11 +189,17 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
return m_data[index];
}
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const {
+ template <int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType
+ packet(Index index) const {
return internal::ploadt<PacketReturnType, LoadMode>(m_data + index);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
+ }
+
EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index 14f480901..1ce53ad69 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -83,10 +83,14 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
typedef TensorForcedEvalOp<ArgType> XprType;
typedef typename ArgType::Scalar Scalar;
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+ typedef typename XprType::Index Index;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = true,
- PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+ PacketAccess = (PacketSize > 1),
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = true
};
@@ -95,10 +99,6 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
: m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL)
{ }
- typedef typename XprType::Index Index;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
@@ -132,6 +132,10 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
+ }
+
EIGEN_DEVICE_FUNC Scalar* data() const { return m_buffer; }
private:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index b7c13f67f..33cd00391 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -64,7 +64,7 @@ struct scalar_sigmoid_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const {
const T one = T(1);
- return one / (one + std::exp(-x));
+ return one / (one + numext::exp(-x));
}
template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -158,8 +158,8 @@ template <typename T> struct MeanReducer
}
protected:
- int scalarCount_;
- int packetCount_;
+ DenseIndex scalarCount_;
+ DenseIndex packetCount_;
};
template <typename T> struct MaxReducer
@@ -594,6 +594,8 @@ template <> class UniformRandomGenerator<std::complex<double> > {
template <typename Scalar>
struct functor_traits<UniformRandomGenerator<Scalar> > {
enum {
+ // Rough estimate.
+ Cost = 100 * NumTraits<Scalar>::MulCost,
PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
};
};
@@ -774,6 +776,8 @@ template <typename T> class NormalRandomGenerator {
template <typename Scalar>
struct functor_traits<NormalRandomGenerator<Scalar> > {
enum {
+ // Rough estimate.
+ Cost = 100 * NumTraits<Scalar>::MulCost,
PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
};
};
@@ -799,7 +803,7 @@ class GaussianGenerator {
T offset = coordinates[i] - m_means[i];
tmp += offset * offset / m_two_sigmas[i];
}
- return std::exp(-tmp);
+ return numext::exp(-tmp);
}
private:
@@ -807,6 +811,15 @@ class GaussianGenerator {
array<T, NumDims> m_two_sigmas;
};
+template <typename T, typename Index, size_t NumDims>
+struct functor_traits<GaussianGenerator<T, Index, NumDims> > {
+ enum {
+ Cost = NumDims * (2 * NumTraits<T>::AddCost + NumTraits<T>::MulCost +
+ functor_traits<scalar_quotient_op<T, T> >::Cost) +
+ functor_traits<scalar_exp_op<T> >::Cost,
+ PacketAccess = GaussianGenerator<T, Index, NumDims>::PacketAccess
+ };
+};
} // end namespace internal
} // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
index e4154bd0b..8ff7d5815 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
@@ -145,6 +145,14 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
return rslt;
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool) const {
+ // TODO(rmlarsen): This is just a placeholder. Define interface to make
+ // generators return their cost.
+ return TensorOpCost(0, 0, TensorOpCost::AddCost<Scalar>() +
+ TensorOpCost::MulCost<Scalar>());
+ }
+
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
protected:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
index 72594a05c..bafcc67bd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
@@ -159,6 +159,9 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
typedef TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>,
Device> Self;
typedef TensorEvaluator<ArgType, Device> Impl;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = false,
@@ -307,9 +310,6 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
}
}
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
@@ -362,15 +362,14 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- const Index packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+ EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1) {
return packetWithPossibleZero(index);
}
- const Index indices[2] = {index, index + packetSize - 1};
+ const Index indices[2] = {index, index + PacketSize - 1};
const Index patchIndex = indices[0] / m_fastPatchStride;
if (patchIndex != indices[1] / m_fastPatchStride) {
return packetWithPossibleZero(index);
@@ -434,12 +433,23 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
Index rowInflateStride() const { return m_row_inflate_strides; }
Index colInflateStride() const { return m_col_inflate_strides; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ // We conservatively estimate the cost for the code path where the computed
+ // index is inside the original image and
+ // TensorEvaluator<ArgType, Device>::CoordAccess is false.
+ const double compute_cost = 3 * TensorOpCost::DivCost<Index>() +
+ 6 * TensorOpCost::MulCost<Index>() +
+ 8 * TensorOpCost::MulCost<Index>();
+ return m_impl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
+ }
+
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
{
- const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
- for (int i = 0; i < packetSize; ++i) {
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ for (int i = 0; i < PacketSize; ++i) {
values[i] = coeff(index+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
index 368e6f685..de2f67d74 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
@@ -81,6 +81,10 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
typedef typename XprType::Index Index;
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
@@ -123,11 +127,6 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
}
}
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
@@ -190,18 +189,30 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+ EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
- for (int i = 0; i < packetSize; ++i) {
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ for (int i = 0; i < PacketSize; ++i) {
values[i] = coeff(index+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ const double compute_cost = NumDims * (3 * TensorOpCost::DivCost<Index>() +
+ 3 * TensorOpCost::MulCost<Index>() +
+ 2 * TensorOpCost::AddCost<Index>());
+ const double input_size = m_impl.dimensions().TotalSize();
+ const double output_size = m_dimensions.TotalSize();
+ if (output_size == 0)
+ return TensorOpCost();
+ return m_impl.costPerCoeff(vectorized) +
+ TensorOpCost(sizeof(CoeffReturnType) * input_size / output_size, 0,
+ compute_cost, vectorized, PacketSize);
+ }
+
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
protected:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
index 9b85914ff..63a8476ef 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
@@ -155,6 +155,10 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
return m_impl.template packet<LoadMode>(index);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ return m_impl.costPerCoeff(vectorized);
+ }
+
EIGEN_DEVICE_FUNC Scalar* data() const { return m_impl.data(); }
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index 6af2d45d4..cd04716bd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -24,9 +24,17 @@ const T2& choose(Cond<false>, const T1&, const T2& second) {
return second;
}
-template <typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+
+template <typename T, typename X, typename Y>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T divup(const X x, const Y y) {
+ return static_cast<T>((x + y - 1) / y);
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
T divup(const T x, const T y) {
- return (x + y - 1) / y;
+ return static_cast<T>((x + y - 1) / y);
}
template <size_t n> struct max_n_1 {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index a9c222ea0..bfa65a607 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -142,6 +142,10 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
return m_impl.template packet<LoadMode>(index);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ return m_impl.costPerCoeff(vectorized);
+ }
+
EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast<Scalar*>(m_impl.data()); }
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
@@ -449,6 +453,11 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
}
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims);
+ }
+
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const {
Scalar* result = m_impl.data();
if (result) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index a595a0175..88b838b27 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -87,6 +87,10 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
typedef typename XprType::Index Index;
static const int NumDims = internal::array_size<PaddingDimensions>::value;
typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = false,
@@ -129,10 +133,6 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
}
}
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
@@ -224,21 +224,51 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
return m_impl.coeff(inputIndex);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ TensorOpCost cost = m_impl.costPerCoeff(vectorized);
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = 0; i < NumDims; ++i)
+ updateCostPerDimension(cost, i, i == 0);
+ } else {
+ for (int i = NumDims - 1; i >= 0; --i)
+ updateCostPerDimension(cost, i, i == NumDims - 1);
+ }
+ return cost;
+ }
+
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+ private:
+ void updateCostPerDimension(TensorOpCost& cost, int i, bool first) const {
+ const double in = static_cast<double>(m_impl.dimensions()[i]);
+ const double out = in + m_padding[i].first + m_padding[i].second;
+ if (out == 0)
+ return;
+ const double reduction = in / out;
+ cost *= reduction;
+ if (first) {
+ cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost<Index>() +
+ reduction * (1 * TensorOpCost::AddCost<Index>()));
+ } else {
+ cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost<Index>() +
+ 2 * TensorOpCost::MulCost<Index>() +
+ reduction * (2 * TensorOpCost::MulCost<Index>() +
+ 1 * TensorOpCost::DivCost<Index>()));
+ }
+ }
+
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
{
- const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+ EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
const Index initialIndex = index;
Index inputIndex = 0;
for (int i = NumDims - 1; i > 0; --i) {
const Index first = index;
- const Index last = index + packetSize - 1;
+ const Index last = index + PacketSize - 1;
const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i];
const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i];
const Index lastPaddedRight = m_outputStrides[i+1];
@@ -263,7 +293,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
}
}
- const Index last = index + packetSize - 1;
+ const Index last = index + PacketSize - 1;
const Index first = index;
const Index lastPaddedLeft = m_padding[0].first;
const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second);
@@ -288,16 +318,15 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
{
- const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+ EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
const Index initialIndex = index;
Index inputIndex = 0;
for (int i = 0; i < NumDims - 1; ++i) {
const Index first = index;
- const Index last = index + packetSize - 1;
+ const Index last = index + PacketSize - 1;
const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1];
const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1];
const Index lastPaddedRight = m_outputStrides[i];
@@ -322,7 +351,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
}
}
- const Index last = index + packetSize - 1;
+ const Index last = index + PacketSize - 1;
const Index first = index;
const Index lastPaddedLeft = m_padding[NumDims-1].first;
const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second);
@@ -347,9 +376,8 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
{
- const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
- for (int i = 0; i < packetSize; ++i) {
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ for (int i = 0; i < PacketSize; ++i) {
values[i] = coeff(index+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
index 0bf460f4e..a87e45330 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
@@ -85,6 +85,10 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value + 1;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
enum {
IsAligned = false,
@@ -137,9 +141,6 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
}
}
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
@@ -183,12 +184,11 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+ EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
Index output_stride_index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? NumDims - 1 : 0;
- Index indices[2] = {index, index + packetSize - 1};
+ Index indices[2] = {index, index + PacketSize - 1};
Index patchIndices[2] = {indices[0] / m_outputStrides[output_stride_index],
indices[1] / m_outputStrides[output_stride_index]};
Index patchOffsets[2] = {indices[0] - patchIndices[0] * m_outputStrides[output_stride_index],
@@ -229,15 +229,15 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
inputIndices[0] += (patchIndices[0] + patchOffsets[0]);
inputIndices[1] += (patchIndices[1] + patchOffsets[1]);
- if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
+ if (inputIndices[1] - inputIndices[0] == PacketSize - 1) {
PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
return rslt;
}
else {
- EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
+ EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize];
values[0] = m_impl.coeff(inputIndices[0]);
- values[packetSize-1] = m_impl.coeff(inputIndices[1]);
- for (int i = 1; i < packetSize-1; ++i) {
+ values[PacketSize-1] = m_impl.coeff(inputIndices[1]);
+ for (int i = 1; i < PacketSize-1; ++i) {
values[i] = coeff(index+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
@@ -245,6 +245,14 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
}
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ const double compute_cost = NumDims * (TensorOpCost::DivCost<Index>() +
+ TensorOpCost::MulCost<Index>() +
+ 2 * TensorOpCost::AddCost<Index>());
+ return m_impl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
+ }
+
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
protected:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 00f870328..885295f0a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -214,7 +214,7 @@ struct FullReducer {
static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) {
const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
- *output = InnerMostDimReducer<Self, Op>::reduce(self, 0, num_coeffs, reducer);
+ *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
}
};
@@ -222,18 +222,19 @@ struct FullReducer {
#ifdef EIGEN_USE_THREADS
// Multithreaded full reducers
template <typename Self, typename Op,
- bool vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
+ bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
struct FullReducerShard {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex,
typename Self::Index numValuesToReduce, Op& reducer,
typename Self::CoeffReturnType* output) {
- *output = InnerMostDimReducer<Self, Op, vectorizable>::reduce(
+ *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
self, firstIndex, numValuesToReduce, reducer);
}
};
-template <typename Self, typename Op>
-struct FullReducer<Self, Op, ThreadPoolDevice, false> {
+// Multithreaded full reducer
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
static const bool HasOptimizedImplementation = !Op::IsStateful;
static const int PacketSize =
unpacket_traits<typename Self::PacketReturnType>::size;
@@ -247,79 +248,44 @@ struct FullReducer<Self, Op, ThreadPoolDevice, false> {
*output = reducer.finalize(reducer.initialize());
return;
}
- const std::size_t num_threads = device.numThreads();
- if (num_threads == 1) {
- *output = InnerMostDimReducer<Self, Op, false>::reduce(self, 0, num_coeffs, reducer);
- return;
- } else {
- const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
- const unsigned int numblocks = blocksize > 0 ? static_cast<unsigned int>(num_coeffs / blocksize) : 0;
- eigen_assert(num_coeffs >= static_cast<Index>(numblocks) * blocksize);
-
- Barrier barrier(numblocks);
- MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
- for (unsigned int i = 0; i < numblocks; ++i) {
- device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, false>::run, self,
- i * blocksize, blocksize, reducer, &shards[i]);
- }
-
- typename Self::CoeffReturnType finalShard;
- if (static_cast<Index>(numblocks) * blocksize < num_coeffs) {
- finalShard = InnerMostDimReducer<Self, Op, false>::reduce(
- self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer);
- } else {
- finalShard = reducer.initialize();
- }
- barrier.Wait();
- for (unsigned int i = 0; i < numblocks; ++i) {
- reducer.reduce(shards[i], &finalShard);
- }
- *output = reducer.finalize(finalShard);
- }
- }
-};
-
-template <typename Self, typename Op>
-struct FullReducer<Self, Op, ThreadPoolDevice, true> {
- static const bool HasOptimizedImplementation = !Op::IsStateful;
- static const int PacketSize =
- unpacket_traits<typename Self::PacketReturnType>::size;
-
- // launch one reducer per thread and accumulate the result.
- static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device,
- typename Self::CoeffReturnType* output) {
- typedef typename Self::Index Index;
- const Index num_coeffs = array_prod(self.m_impl.dimensions());
- if (num_coeffs == 0) {
- *output = reducer.finalize(reducer.initialize());
- return;
- }
- const std::size_t num_threads = device.numThreads();
+#ifdef EIGEN_USE_COST_MODEL
+ const TensorOpCost cost =
+ self.m_impl.costPerCoeff(Vectorizable) +
+ TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable,
+ PacketSize);
+ const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+ num_coeffs, cost, device.numThreads());
+#else
+ const int num_threads = device.numThreads();
+#endif
if (num_threads == 1) {
- *output = InnerMostDimReducer<Self, Op, true>::reduce(self, 0, num_coeffs, reducer);
+ *output =
+ InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
return;
}
- const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
- const unsigned int numblocks = blocksize > 0 ? static_cast<unsigned int>(num_coeffs / blocksize) : 0;
- eigen_assert(num_coeffs >= static_cast<Index>(numblocks) * blocksize);
+ const Index blocksize =
+ std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
+ const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
+ eigen_assert(num_coeffs >= numblocks * blocksize);
Barrier barrier(numblocks);
MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
- for (unsigned int i = 0; i < numblocks; ++i) {
- device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, true>::run,
+ for (Index i = 0; i < numblocks; ++i) {
+ device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run,
self, i * blocksize, blocksize, reducer,
&shards[i]);
}
typename Self::CoeffReturnType finalShard;
- if (static_cast<Index>(numblocks) * blocksize < num_coeffs) {
- finalShard = InnerMostDimReducer<Self, Op, true>::reduce(
- self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer);
+ if (numblocks * blocksize < num_coeffs) {
+ finalShard = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
+ self, numblocks * blocksize, num_coeffs - numblocks * blocksize,
+ reducer);
} else {
finalShard = reducer.initialize();
}
-
barrier.Wait();
- for (unsigned int i = 0; i < numblocks; ++i) {
+
+ for (Index i = 0; i < numblocks; ++i) {
reducer.reduce(shards[i], &finalShard);
}
*output = reducer.finalize(finalShard);
@@ -411,6 +377,9 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
typedef typename XprType::Scalar Scalar;
typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> Self;
static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;
+ typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = false,
@@ -495,8 +464,13 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
- typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static bool size_large_enough(Index total_size) {
+#ifndef EIGEN_USE_COST_MODEL
+ return total_size > 1024 * 1024;
+#else
+ return true || total_size;
+#endif
+ }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) {
m_impl.evalSubExprsIfNeeded(NULL);
@@ -504,7 +478,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
// Use the FullReducer if possible.
if (RunningFullReduction && internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
- (!RunningOnGPU && (internal::array_prod(m_impl.dimensions()) > 1024 * 1024)))) {
+ (!RunningOnGPU && size_large_enough(internal::array_prod(m_impl.dimensions()))))) {
bool need_assign = false;
if (!data) {
@@ -584,16 +558,15 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index + packetSize - 1 < dimensions().TotalSize());
+ EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
if (ReducingInnerMostDims) {
const Index num_values_to_reduce =
(static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1];
const Index firstIndex = firstInput(index);
- for (Index i = 0; i < packetSize; ++i) {
+ for (Index i = 0; i < PacketSize; ++i) {
Op reducer(m_reducer);
values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce,
num_values_to_reduce, reducer);
@@ -602,18 +575,18 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
const Index firstIndex = firstInput(index);
const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumOutputDims - 1;
// TBD: extend this the the n innermost dimensions that we preserve.
- if (((firstIndex % m_dimensions[innermost_dim]) + packetSize - 1) < m_dimensions[innermost_dim]) {
+ if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) {
Op reducer(m_reducer);
typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>();
internal::InnerMostDimPreserver<NumReducedDims-1, Self, Op>::reduce(*this, firstIndex, reducer, &accum);
return reducer.finalizePacket(accum);
} else {
- for (int i = 0; i < packetSize; ++i) {
+ for (int i = 0; i < PacketSize; ++i) {
values[i] = coeff(index + i);
}
}
} else {
- for (int i = 0; i < packetSize; ++i) {
+ for (int i = 0; i < PacketSize; ++i) {
values[i] = coeff(index + i);
}
}
@@ -621,6 +594,18 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
return rslt;
}
+ // Must be called after evalSubExprsIfNeeded().
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ if (RunningFullReduction && m_result) {
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
+ } else {
+ const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
+ const double compute_cost = num_values_to_reduce * internal::functor_traits<Op>::Cost;
+ return m_impl.costPerCoeff(vectorized) * num_values_to_reduce +
+ TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
+ }
+ }
+
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
private:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index c33d54d6e..fd2587dd5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -130,13 +130,18 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
assert(false && "Should only be called on floats");
}
- static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) {
+ static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) {
typedef typename Self::Index Index;
const Index num_coeffs = array_prod(self.m_impl.dimensions());
+ // Don't crash when we're called with an input tensor of size 0.
+ if (num_coeffs == 0) {
+ return;
+ }
+
const int block_size = 256;
const int num_per_thread = 128;
- const int num_blocks = std::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread));
+ const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
if (num_blocks > 1) {
// We initialize the outputs outside the reduction kernel when we can't be sure that there
@@ -231,7 +236,7 @@ struct InnerReducer<Self, Op, GpuDevice> {
return true;
}
- static EIGEN_DEVICE_FUNC bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+ static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
typedef typename Self::Index Index;
// It's faster to use the usual code.
@@ -310,7 +315,7 @@ struct OuterReducer<Self, Op, GpuDevice> {
return true;
}
- static EIGEN_DEVICE_FUNC bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+ static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
typedef typename Self::Index Index;
// It's faster to use the usual code.
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index 96d92038c..1a59cc8f7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -104,6 +104,10 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
typedef typename XprType::Index Index;
static const int NumDims = internal::array_size<ReverseDimensions>::value;
typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = false,
@@ -135,10 +139,6 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
}
}
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const Dimensions& dimensions() const { return m_dimensions; }
@@ -195,21 +195,33 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
PacketReturnType packet(Index index) const
{
- const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+ EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
// TODO(ndjaitly): write a better packing routine that uses
// local structure.
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type
- values[packetSize];
- for (int i = 0; i < packetSize; ++i) {
+ values[PacketSize];
+ for (int i = 0; i < PacketSize; ++i) {
values[i] = coeff(index+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
+ 2 * TensorOpCost::MulCost<Index>() +
+ TensorOpCost::DivCost<Index>());
+ for (int i = 0; i < NumDims; ++i) {
+ if (m_reverse[i]) {
+ compute_cost += 2 * TensorOpCost::AddCost<Index>();
+ }
+ }
+ return m_impl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
+ }
+
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
protected:
@@ -246,6 +258,7 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const Dimensions& dimensions() const { return this->m_dimensions; }
@@ -256,14 +269,13 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x) {
- const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+ EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
// This code is pilfered from TensorMorphing.h
- EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
+ EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize];
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
- for (int i = 0; i < packetSize; ++i) {
+ for (int i = 0; i < PacketSize; ++i) {
this->coeffRef(index+i) = values[i];
}
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index c19833ea5..e76533710 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -104,6 +104,9 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = false,
@@ -145,9 +148,6 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
}
}
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
@@ -166,18 +166,25 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+ EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
- for (int i = 0; i < packetSize; ++i) {
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ for (int i = 0; i < PacketSize; ++i) {
values[i] = coeff(index+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ const double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
+ 2 * TensorOpCost::MulCost<Index>() +
+ TensorOpCost::DivCost<Index>());
+ return m_impl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
+ }
+
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
protected:
@@ -219,6 +226,9 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = false,
@@ -230,9 +240,6 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
: Base(op, device)
{ }
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
{
return this->m_impl.coeffRef(this->srcCoeff(index));
@@ -241,12 +248,11 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
template <int StoreMode> EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
{
- static const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
- for (int i = 0; i < packetSize; ++i) {
+ for (int i = 0; i < PacketSize; ++i) {
this->coeffRef(index+i) = values[i];
}
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
index 085f8fd3d..52b7d216a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
@@ -103,6 +103,10 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
typedef typename XprType::Index Index;
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
@@ -142,10 +146,6 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
}
}
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
@@ -164,12 +164,11 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+ EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
Index inputIndices[] = {0, 0};
- Index indices[] = {index, index + packetSize - 1};
+ Index indices[] = {index, index + PacketSize - 1};
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > 0; --i) {
const Index idx0 = indices[0] / m_outputStrides[i];
@@ -193,15 +192,15 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
inputIndices[0] += indices[0] * m_inputStrides[NumDims-1];
inputIndices[1] += indices[1] * m_inputStrides[NumDims-1];
}
- if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
+ if (inputIndices[1] - inputIndices[0] == PacketSize - 1) {
PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
return rslt;
}
else {
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
values[0] = m_impl.coeff(inputIndices[0]);
- values[packetSize-1] = m_impl.coeff(inputIndices[1]);
- for (int i = 1; i < packetSize-1; ++i) {
+ values[PacketSize-1] = m_impl.coeff(inputIndices[1]);
+ for (int i = 1; i < PacketSize-1; ++i) {
values[i] = coeff(index+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
@@ -209,6 +208,20 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
}
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ double compute_cost = (NumDims - 1) * (TensorOpCost::AddCost<Index>() +
+ TensorOpCost::MulCost<Index>() +
+ TensorOpCost::DivCost<Index>()) +
+ TensorOpCost::MulCost<Index>();
+ if (vectorized) {
+ compute_cost *= 2; // packet() computes two indices
+ }
+ const int innerDim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : (NumDims - 1);
+ return m_impl.costPerCoeff(vectorized && m_inputStrides[innerDim] == 1) +
+ // Computation is not vectorized per se, but it is done once per packet.
+ TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
+ }
+
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
protected:
@@ -266,6 +279,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
{
@@ -275,12 +289,11 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
{
- const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+packetSize-1 < this->dimensions().TotalSize());
+ EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < this->dimensions().TotalSize());
Index inputIndices[] = {0, 0};
- Index indices[] = {index, index + packetSize - 1};
+ Index indices[] = {index, index + PacketSize - 1};
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > 0; --i) {
const Index idx0 = indices[0] / this->m_outputStrides[i];
@@ -304,15 +317,15 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
inputIndices[0] += indices[0] * this->m_inputStrides[NumDims-1];
inputIndices[1] += indices[1] * this->m_inputStrides[NumDims-1];
}
- if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
+ if (inputIndices[1] - inputIndices[0] == PacketSize - 1) {
this->m_impl.template writePacket<Unaligned>(inputIndices[0], x);
}
else {
- EIGEN_ALIGN_MAX Scalar values[packetSize];
+ EIGEN_ALIGN_MAX Scalar values[PacketSize];
internal::pstore<Scalar, PacketReturnType>(values, x);
this->m_impl.coeffRef(inputIndices[0]) = values[0];
- this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1];
- for (int i = 1; i < packetSize-1; ++i) {
+ this->m_impl.coeffRef(inputIndices[1]) = values[PacketSize-1];
+ for (int i = 1; i < PacketSize-1; ++i) {
this->coeffRef(index+i) = values[i];
}
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
index 3e56589c3..5950f38e2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
@@ -53,9 +53,7 @@ struct TensorUInt128
template<typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
explicit TensorUInt128(const T& x) : high(0), low(x) {
- typedef typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type UnsignedT;
- typedef typename conditional<sizeof(LOW) == 8, uint64_t, uint32_t>::type UnsignedLow;
- eigen_assert(static_cast<UnsignedT>(x) <= static_cast<UnsignedLow>(NumTraits<LOW>::highest()));
+ eigen_assert((static_cast<typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type>(x) <= static_cast<typename conditional<sizeof(LOW) == 8, uint64_t, uint32_t>::type>(NumTraits<LOW>::highest())));
eigen_assert(x >= 0);
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
index 5bdfbad46..e735fc76f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
@@ -171,6 +171,9 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
static const int NumDims = NumInputDims + 1;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = false,
@@ -336,9 +339,6 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
}
}
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
@@ -408,16 +408,15 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- const Index packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+ EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1 ||
m_in_plane_strides != 1 || m_plane_inflate_strides != 1) {
return packetWithPossibleZero(index);
}
- const Index indices[2] = {index, index + packetSize - 1};
+ const Index indices[2] = {index, index + PacketSize - 1};
const Index patchIndex = indices[0] / m_fastPatchStride;
if (patchIndex != indices[1] / m_fastPatchStride) {
return packetWithPossibleZero(index);
@@ -495,6 +494,14 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
return packetWithPossibleZero(index);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ const double compute_cost =
+ 10 * TensorOpCost::DivCost<Index>() + 21 * TensorOpCost::MulCost<Index>() +
+ 8 * TensorOpCost::AddCost<Index>();
+ return TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
+ }
+
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
@@ -518,9 +525,8 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
{
- const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
- for (int i = 0; i < packetSize; ++i) {
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ for (int i = 0; i < PacketSize; ++i) {
values[i] = coeff(index+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt b/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt
new file mode 100644
index 000000000..88fef50c6
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt
@@ -0,0 +1,6 @@
+FILE(GLOB Eigen_CXX11_ThreadPool_SRCS "*.h")
+
+INSTALL(FILES
+ ${Eigen_CXX11_ThreadPool_SRCS}
+ DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/ThreadPool COMPONENT Devel
+ )
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
new file mode 100644
index 000000000..6dd64f185
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
@@ -0,0 +1,234 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_
+#define EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_
+
+namespace Eigen {
+
+// EventCount allows to wait for arbitrary predicates in non-blocking
+// algorithms. Think of condition variable, but wait predicate does not need to
+// be protected by a mutex. Usage:
+// Waiting thread does:
+//
+// if (predicate)
+// return act();
+// EventCount::Waiter& w = waiters[my_index];
+// ec.Prewait(&w);
+// if (predicate) {
+// ec.CancelWait(&w);
+// return act();
+// }
+// ec.CommitWait(&w);
+//
+// Notifying thread does:
+//
+// predicate = true;
+// ec.Notify(true);
+//
+// Notify is cheap if there are no waiting threads. Prewait/CommitWait are not
+// cheap, but they are executed only if the preceeding predicate check has
+// failed.
+//
+// Algorihtm outline:
+// There are two main variables: predicate (managed by user) and state_.
+// Operation closely resembles Dekker mutual algorithm:
+// https://en.wikipedia.org/wiki/Dekker%27s_algorithm
+// Waiting thread sets state_ then checks predicate, Notifying thread sets
+// predicate then checks state_. Due to seq_cst fences in between these
+// operations it is guaranteed than either waiter will see predicate change
+// and won't block, or notifying thread will see state_ change and will unblock
+// the waiter, or both. But it can't happen that both threads don't see each
+// other changes, which would lead to deadlock.
+class EventCount {
+ public:
+ class Waiter;
+
+ EventCount(std::vector<Waiter>& waiters) : waiters_(waiters) {
+ eigen_assert(waiters.size() < (1 << kWaiterBits) - 1);
+ // Initialize epoch to something close to overflow to test overflow.
+ state_ = kStackMask | (kEpochMask - kEpochInc * waiters.size() * 2);
+ }
+
+ ~EventCount() {
+ // Ensure there are no waiters.
+ eigen_assert((state_.load() & (kStackMask | kWaiterMask)) == kStackMask);
+ }
+
+ // Prewait prepares for waiting.
+ // After calling this function the thread must re-check the wait predicate
+ // and call either CancelWait or CommitWait passing the same Waiter object.
+ void Prewait(Waiter* w) {
+ w->epoch = state_.fetch_add(kWaiterInc, std::memory_order_relaxed);
+ std::atomic_thread_fence(std::memory_order_seq_cst);
+ }
+
+ // CommitWait commits waiting.
+ void CommitWait(Waiter* w) {
+ w->state = Waiter::kNotSignaled;
+ // Modification epoch of this waiter.
+ uint64_t epoch =
+ (w->epoch & kEpochMask) +
+ (((w->epoch & kWaiterMask) >> kWaiterShift) << kEpochShift);
+ uint64_t state = state_.load(std::memory_order_seq_cst);
+ for (;;) {
+ if (int64_t((state & kEpochMask) - epoch) < 0) {
+ // The preceeding waiter has not decided on its fate. Wait until it
+ // calls either CancelWait or CommitWait, or is notified.
+ EIGEN_THREAD_YIELD();
+ state = state_.load(std::memory_order_seq_cst);
+ continue;
+ }
+ // We've already been notified.
+ if (int64_t((state & kEpochMask) - epoch) > 0) return;
+ // Remove this thread from prewait counter and add it to the waiter list.
+ eigen_assert((state & kWaiterMask) != 0);
+ uint64_t newstate = state - kWaiterInc + kEpochInc;
+ newstate = (newstate & ~kStackMask) | (w - &waiters_[0]);
+ if ((state & kStackMask) == kStackMask)
+ w->next.store(nullptr, std::memory_order_relaxed);
+ else
+ w->next.store(&waiters_[state & kStackMask], std::memory_order_relaxed);
+ if (state_.compare_exchange_weak(state, newstate,
+ std::memory_order_release))
+ break;
+ }
+ Park(w);
+ }
+
+ // CancelWait cancels effects of the previous Prewait call.
+ void CancelWait(Waiter* w) {
+ uint64_t epoch =
+ (w->epoch & kEpochMask) +
+ (((w->epoch & kWaiterMask) >> kWaiterShift) << kEpochShift);
+ uint64_t state = state_.load(std::memory_order_relaxed);
+ for (;;) {
+ if (int64_t((state & kEpochMask) - epoch) < 0) {
+ // The preceeding waiter has not decided on its fate. Wait until it
+ // calls either CancelWait or CommitWait, or is notified.
+ EIGEN_THREAD_YIELD();
+ state = state_.load(std::memory_order_relaxed);
+ continue;
+ }
+ // We've already been notified.
+ if (int64_t((state & kEpochMask) - epoch) > 0) return;
+ // Remove this thread from prewait counter.
+ eigen_assert((state & kWaiterMask) != 0);
+ if (state_.compare_exchange_weak(state, state - kWaiterInc + kEpochInc,
+ std::memory_order_relaxed))
+ return;
+ }
+ }
+
+ // Notify wakes one or all waiting threads.
+ // Must be called after changing the associated wait predicate.
+ void Notify(bool all) {
+ std::atomic_thread_fence(std::memory_order_seq_cst);
+ uint64_t state = state_.load(std::memory_order_acquire);
+ for (;;) {
+ // Easy case: no waiters.
+ if ((state & kStackMask) == kStackMask && (state & kWaiterMask) == 0)
+ return;
+ uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
+ uint64_t newstate;
+ if (all) {
+ // Reset prewait counter and empty wait list.
+ newstate = (state & kEpochMask) + (kEpochInc * waiters) + kStackMask;
+ } else if (waiters) {
+ // There is a thread in pre-wait state, unblock it.
+ newstate = state + kEpochInc - kWaiterInc;
+ } else {
+ // Pop a waiter from list and unpark it.
+ Waiter* w = &waiters_[state & kStackMask];
+ Waiter* wnext = w->next.load(std::memory_order_relaxed);
+ uint64_t next = kStackMask;
+ if (wnext != nullptr) next = wnext - &waiters_[0];
+ // Note: we don't add kEpochInc here. ABA problem on the lock-free stack
+ // can't happen because a waiter is re-pushed onto the stack only after
+ // it was in the pre-wait state which inevitably leads to epoch
+ // increment.
+ newstate = (state & kEpochMask) + next;
+ }
+ if (state_.compare_exchange_weak(state, newstate,
+ std::memory_order_acquire)) {
+ if (!all && waiters) return; // unblocked pre-wait thread
+ if ((state & kStackMask) == kStackMask) return;
+ Waiter* w = &waiters_[state & kStackMask];
+ if (!all) w->next.store(nullptr, std::memory_order_relaxed);
+ Unpark(w);
+ return;
+ }
+ }
+ }
+
+ class Waiter {
+ friend class EventCount;
+ std::atomic<Waiter*> next;
+ std::mutex mu;
+ std::condition_variable cv;
+ uint64_t epoch;
+ unsigned state;
+ enum {
+ kNotSignaled,
+ kWaiting,
+ kSignaled,
+ };
+ // Prevent false sharing with other Waiter objects in the same vector.
+ char pad_[128];
+ };
+
+ private:
+ // State_ layout:
+ // - low kStackBits is a stack of waiters committed wait.
+ // - next kWaiterBits is count of waiters in prewait state.
+ // - next kEpochBits is modification counter.
+ static const uint64_t kStackBits = 16;
+ static const uint64_t kStackMask = (1ull << kStackBits) - 1;
+ static const uint64_t kWaiterBits = 16;
+ static const uint64_t kWaiterShift = 16;
+ static const uint64_t kWaiterMask = ((1ull << kWaiterBits) - 1)
+ << kWaiterShift;
+ static const uint64_t kWaiterInc = 1ull << kWaiterBits;
+ static const uint64_t kEpochBits = 32;
+ static const uint64_t kEpochShift = 32;
+ static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift;
+ static const uint64_t kEpochInc = 1ull << kEpochShift;
+ std::atomic<uint64_t> state_;
+ std::vector<Waiter>& waiters_;
+
+ void Park(Waiter* w) {
+ std::unique_lock<std::mutex> lock(w->mu);
+ while (w->state != Waiter::kSignaled) {
+ w->state = Waiter::kWaiting;
+ w->cv.wait(lock);
+ }
+ }
+
+ void Unpark(Waiter* waiters) {
+ Waiter* next = nullptr;
+ for (Waiter* w = waiters; w; w = next) {
+ next = w->next.load(std::memory_order_relaxed);
+ unsigned state;
+ {
+ std::unique_lock<std::mutex> lock(w->mu);
+ state = w->state;
+ w->state = Waiter::kSignaled;
+ }
+ // Avoid notifying if it wasn't waiting.
+ if (state == Waiter::kWaiting) w->cv.notify_one();
+ }
+ }
+
+ EventCount(const EventCount&) = delete;
+ void operator=(const EventCount&) = delete;
+};
+
+} // namespace Eigen
+
+#endif // EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
new file mode 100644
index 000000000..1c471a19f
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
@@ -0,0 +1,232 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
+#define EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
+
+
+namespace Eigen {
+
+template <typename Environment>
+class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
+ public:
+ typedef typename Environment::Task Task;
+ typedef RunQueue<Task, 1024> Queue;
+
+ NonBlockingThreadPoolTempl(int num_threads, Environment env = Environment())
+ : env_(env),
+ threads_(num_threads),
+ queues_(num_threads),
+ waiters_(num_threads),
+ blocked_(),
+ spinning_(),
+ done_(),
+ ec_(waiters_) {
+ for (int i = 0; i < num_threads; i++) queues_.push_back(new Queue());
+ for (int i = 0; i < num_threads; i++)
+ threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); }));
+ }
+
+ ~NonBlockingThreadPoolTempl() {
+ done_.store(true, std::memory_order_relaxed);
+ // Now if all threads block without work, they will start exiting.
+ // But note that threads can continue to work arbitrary long,
+ // block, submit new work, unblock and otherwise live full life.
+ ec_.Notify(true);
+
+ // Join threads explicitly to avoid destruction order issues.
+ for (size_t i = 0; i < threads_.size(); i++) delete threads_[i];
+ for (size_t i = 0; i < threads_.size(); i++) delete queues_[i];
+ }
+
+ void Schedule(std::function<void()> fn) {
+ Task t = env_.CreateTask(std::move(fn));
+ PerThread* pt = GetPerThread();
+ if (pt->pool == this) {
+ // Worker thread of this pool, push onto the thread's queue.
+ Queue* q = queues_[pt->index];
+ t = q->PushFront(std::move(t));
+ } else {
+ // A free-standing thread (or worker of another pool), push onto a random
+ // queue.
+ Queue* q = queues_[Rand(&pt->rand) % queues_.size()];
+ t = q->PushBack(std::move(t));
+ }
+ // Note: below we touch this after making w available to worker threads.
+ // Strictly speaking, this can lead to a racy-use-after-free. Consider that
+ // Schedule is called from a thread that is neither main thread nor a worker
+ // thread of this pool. Then, execution of w directly or indirectly
+ // completes overall computations, which in turn leads to destruction of
+ // this. We expect that such scenario is prevented by program, that is,
+ // this is kept alive while any threads can potentially be in Schedule.
+ if (!t.f)
+ ec_.Notify(false);
+ else
+ env_.ExecuteTask(t); // Push failed, execute directly.
+ }
+
+ private:
+ typedef typename Environment::EnvThread Thread;
+
+ struct PerThread {
+ bool inited;
+ NonBlockingThreadPoolTempl* pool; // Parent pool, or null for normal threads.
+ unsigned index; // Worker thread index in pool.
+ unsigned rand; // Random generator state.
+ };
+
+ Environment env_;
+ MaxSizeVector<Thread*> threads_;
+ MaxSizeVector<Queue*> queues_;
+ std::vector<EventCount::Waiter> waiters_;
+ std::atomic<unsigned> blocked_;
+ std::atomic<bool> spinning_;
+ std::atomic<bool> done_;
+ EventCount ec_;
+
+ // Main worker thread loop.
+ void WorkerLoop(unsigned index) {
+ PerThread* pt = GetPerThread();
+ pt->pool = this;
+ pt->index = index;
+ Queue* q = queues_[index];
+ EventCount::Waiter* waiter = &waiters_[index];
+ std::vector<Task> stolen;
+ for (;;) {
+ Task t;
+ if (!stolen.empty()) {
+ t = std::move(stolen.back());
+ stolen.pop_back();
+ }
+ if (!t.f) t = q->PopFront();
+ if (!t.f) {
+ if (Steal(&stolen)) {
+ t = std::move(stolen.back());
+ stolen.pop_back();
+ while (stolen.size()) {
+ Task t1 = q->PushFront(std::move(stolen.back()));
+ stolen.pop_back();
+ if (t1.f) {
+ // There is not much we can do in this case. Just execute the
+ // remaining directly.
+ stolen.push_back(std::move(t1));
+ break;
+ }
+ }
+ }
+ }
+ if (t.f) {
+ env_.ExecuteTask(t);
+ continue;
+ }
+ // Leave one thread spinning. This reduces latency.
+ if (!spinning_ && !spinning_.exchange(true)) {
+ bool nowork = true;
+ for (int i = 0; i < 1000; i++) {
+ if (!OutOfWork()) {
+ nowork = false;
+ break;
+ }
+ }
+ spinning_ = false;
+ if (!nowork) continue;
+ }
+ if (!WaitForWork(waiter)) return;
+ }
+ }
+
+ // Steal tries to steal work from other worker threads in best-effort manner.
+ bool Steal(std::vector<Task>* stolen) {
+ if (queues_.size() == 1) return false;
+ PerThread* pt = GetPerThread();
+ unsigned lastq = pt->index;
+ for (unsigned i = queues_.size(); i > 0; i--) {
+ unsigned victim = Rand(&pt->rand) % queues_.size();
+ if (victim == lastq && queues_.size() > 2) {
+ i++;
+ continue;
+ }
+ // Steal half of elements from a victim queue.
+ // It is typical to steal just one element, but that assumes that work is
+ // recursively subdivided in halves so that the stolen element is exactly
+ // half of work. If work elements are equally-sized, then is makes sense
+ // to steal half of elements at once and then work locally for a while.
+ if (queues_[victim]->PopBackHalf(stolen)) return true;
+ lastq = victim;
+ }
+ // Just to make sure that we did not miss anything.
+ for (unsigned i = queues_.size(); i > 0; i--)
+ if (queues_[i - 1]->PopBackHalf(stolen)) return true;
+ return false;
+ }
+
+ // WaitForWork blocks until new work is available, or if it is time to exit.
+ bool WaitForWork(EventCount::Waiter* waiter) {
+ // We already did best-effort emptiness check in Steal, so prepare blocking.
+ ec_.Prewait(waiter);
+ // Now do reliable emptiness check.
+ if (!OutOfWork()) {
+ ec_.CancelWait(waiter);
+ return true;
+ }
+ // Number of blocked threads is used as termination condition.
+ // If we are shutting down and all worker threads blocked without work,
+ // that's we are done.
+ blocked_++;
+ if (done_ && blocked_ == threads_.size()) {
+ ec_.CancelWait(waiter);
+ // Almost done, but need to re-check queues.
+ // Consider that all queues are empty and all worker threads are preempted
+ // right after incrementing blocked_ above. Now a free-standing thread
+ // submits work and calls destructor (which sets done_). If we don't
+ // re-check queues, we will exit leaving the work unexecuted.
+ if (!OutOfWork()) {
+ // Note: we must not pop from queues before we decrement blocked_,
+ // otherwise the following scenario is possible. Consider that instead
+ // of checking for emptiness we popped the only element from queues.
+ // Now other worker threads can start exiting, which is bad if the
+ // work item submits other work. So we just check emptiness here,
+ // which ensures that all worker threads exit at the same time.
+ blocked_--;
+ return true;
+ }
+ // Reached stable termination state.
+ ec_.Notify(true);
+ return false;
+ }
+ ec_.CommitWait(waiter);
+ blocked_--;
+ return true;
+ }
+
+ bool OutOfWork() {
+ for (unsigned i = 0; i < queues_.size(); i++)
+ if (!queues_[i]->Empty()) return false;
+ return true;
+ }
+
+ PerThread* GetPerThread() {
+ EIGEN_THREAD_LOCAL PerThread per_thread_;
+ PerThread* pt = &per_thread_;
+ if (pt->inited) return pt;
+ pt->inited = true;
+ pt->rand = std::hash<std::thread::id>()(std::this_thread::get_id());
+ return pt;
+ }
+
+ static unsigned Rand(unsigned* state) {
+ return *state = *state * 1103515245 + 12345;
+ }
+};
+
+typedef NonBlockingThreadPoolTempl<StlThreadEnvironment> NonBlockingThreadPool;
+
+} // namespace Eigen
+
+#endif // EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
new file mode 100644
index 000000000..0544a6e15
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
@@ -0,0 +1,210 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_
+#define EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_
+
+
+namespace Eigen {
+
+// RunQueue is a fixed-size, partially non-blocking deque or Work items.
+// Operations on front of the queue must be done by a single thread (owner),
+// operations on back of the queue can be done by multiple threads concurrently.
+//
+// Algorithm outline:
+// All remote threads operating on the queue back are serialized by a mutex.
+// This ensures that at most two threads access state: owner and one remote
+// thread (Size aside). The algorithm ensures that the occupied region of the
+// underlying array is logically continuous (can wraparound, but no stray
+// occupied elements). Owner operates on one end of this region, remote thread
+// operates on the other end. Synchronization between these threads
+// (potential consumption of the last element and take up of the last empty
+// element) happens by means of state variable in each element. States are:
+// empty, busy (in process of insertion of removal) and ready. Threads claim
+// elements (empty->busy and ready->busy transitions) by means of a CAS
+// operation. The finishing transition (busy->empty and busy->ready) are done
+// with plain store as the element is exclusively owned by the current thread.
+//
+// Note: we could permit only pointers as elements, then we would not need
+// separate state variable as null/non-null pointer value would serve as state,
+// but that would require malloc/free per operation for large, complex values
+// (and this is designed to store std::function<()>).
+template <typename Work, unsigned kSize>
+class RunQueue {
+ public:
+ RunQueue() : front_(), back_() {
+ // require power-of-two for fast masking
+ eigen_assert((kSize & (kSize - 1)) == 0);
+ eigen_assert(kSize > 2); // why would you do this?
+ eigen_assert(kSize <= (64 << 10)); // leave enough space for counter
+ for (unsigned i = 0; i < kSize; i++)
+ array_[i].state.store(kEmpty, std::memory_order_relaxed);
+ }
+
+ ~RunQueue() { eigen_assert(Size() == 0); }
+
+ // PushFront inserts w at the beginning of the queue.
+ // If queue is full returns w, otherwise returns default-constructed Work.
+ Work PushFront(Work w) {
+ unsigned front = front_.load(std::memory_order_relaxed);
+ Elem* e = &array_[front & kMask];
+ uint8_t s = e->state.load(std::memory_order_relaxed);
+ if (s != kEmpty ||
+ !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
+ return w;
+ front_.store(front + 1 + (kSize << 1), std::memory_order_relaxed);
+ e->w = std::move(w);
+ e->state.store(kReady, std::memory_order_release);
+ return Work();
+ }
+
+ // PopFront removes and returns the first element in the queue.
+ // If the queue was empty returns default-constructed Work.
+ Work PopFront() {
+ unsigned front = front_.load(std::memory_order_relaxed);
+ Elem* e = &array_[(front - 1) & kMask];
+ uint8_t s = e->state.load(std::memory_order_relaxed);
+ if (s != kReady ||
+ !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
+ return Work();
+ Work w = std::move(e->w);
+ e->state.store(kEmpty, std::memory_order_release);
+ front = ((front - 1) & kMask2) | (front & ~kMask2);
+ front_.store(front, std::memory_order_relaxed);
+ return w;
+ }
+
+ // PushBack adds w at the end of the queue.
+ // If queue is full returns w, otherwise returns default-constructed Work.
+ Work PushBack(Work w) {
+ std::unique_lock<std::mutex> lock(mutex_);
+ unsigned back = back_.load(std::memory_order_relaxed);
+ Elem* e = &array_[(back - 1) & kMask];
+ uint8_t s = e->state.load(std::memory_order_relaxed);
+ if (s != kEmpty ||
+ !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
+ return w;
+ back = ((back - 1) & kMask2) | (back & ~kMask2);
+ back_.store(back, std::memory_order_relaxed);
+ e->w = std::move(w);
+ e->state.store(kReady, std::memory_order_release);
+ return Work();
+ }
+
+ // PopBack removes and returns the last elements in the queue.
+ // Can fail spuriously.
+ Work PopBack() {
+ if (Empty()) return 0;
+ std::unique_lock<std::mutex> lock(mutex_, std::try_to_lock);
+ if (!lock) return Work();
+ unsigned back = back_.load(std::memory_order_relaxed);
+ Elem* e = &array_[back & kMask];
+ uint8_t s = e->state.load(std::memory_order_relaxed);
+ if (s != kReady ||
+ !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
+ return Work();
+ Work w = std::move(e->w);
+ e->state.store(kEmpty, std::memory_order_release);
+ back_.store(back + 1 + (kSize << 1), std::memory_order_relaxed);
+ return w;
+ }
+
+ // PopBackHalf removes and returns half last elements in the queue.
+ // Returns number of elements removed. But can also fail spuriously.
+ unsigned PopBackHalf(std::vector<Work>* result) {
+ if (Empty()) return 0;
+ std::unique_lock<std::mutex> lock(mutex_, std::try_to_lock);
+ if (!lock) return 0;
+ unsigned back = back_.load(std::memory_order_relaxed);
+ unsigned size = Size();
+ unsigned mid = back;
+ if (size > 1) mid = back + (size - 1) / 2;
+ unsigned n = 0;
+ unsigned start = 0;
+ for (; static_cast<int>(mid - back) >= 0; mid--) {
+ Elem* e = &array_[mid & kMask];
+ uint8_t s = e->state.load(std::memory_order_relaxed);
+ if (n == 0) {
+ if (s != kReady ||
+ !e->state.compare_exchange_strong(s, kBusy,
+ std::memory_order_acquire))
+ continue;
+ start = mid;
+ } else {
+ // Note: no need to store temporal kBusy, we exclusively own these
+ // elements.
+ eigen_assert(s == kReady);
+ }
+ result->push_back(std::move(e->w));
+ e->state.store(kEmpty, std::memory_order_release);
+ n++;
+ }
+ if (n != 0)
+ back_.store(start + 1 + (kSize << 1), std::memory_order_relaxed);
+ return n;
+ }
+
+ // Size returns current queue size.
+ // Can be called by any thread at any time.
+ unsigned Size() const {
+ // Emptiness plays critical role in thread pool blocking. So we go to great
+ // effort to not produce false positives (claim non-empty queue as empty).
+ for (;;) {
+ // Capture a consistent snapshot of front/tail.
+ unsigned front = front_.load(std::memory_order_acquire);
+ unsigned back = back_.load(std::memory_order_acquire);
+ unsigned front1 = front_.load(std::memory_order_relaxed);
+ if (front != front1) continue;
+ int size = (front & kMask2) - (back & kMask2);
+ // Fix overflow.
+ if (size < 0) size += 2 * kSize;
+ // Order of modification in push/pop is crafted to make the queue look
+ // larger than it is during concurrent modifications. E.g. pop can
+ // decrement size before the corresponding push has incremented it.
+ // So the computed size can be up to kSize + 1, fix it.
+ if (size > static_cast<int>(kSize)) size = kSize;
+ return size;
+ }
+ }
+
+ // Empty tests whether container is empty.
+ // Can be called by any thread at any time.
+ bool Empty() const { return Size() == 0; }
+
+ private:
+ static const unsigned kMask = kSize - 1;
+ static const unsigned kMask2 = (kSize << 1) - 1;
+ struct Elem {
+ std::atomic<uint8_t> state;
+ Work w;
+ };
+ enum {
+ kEmpty,
+ kBusy,
+ kReady,
+ };
+ std::mutex mutex_;
+ // Low log(kSize) + 1 bits in front_ and back_ contain rolling index of
+ // front/back, repsectively. The remaining bits contain modification counters
+ // that are incremented on Push operations. This allows us to (1) distinguish
+ // between empty and full conditions (if we would use log(kSize) bits for
+ // position, these conditions would be indistinguishable); (2) obtain
+ // consistent snapshot of front_/back_ for Size operation using the
+ // modification counters.
+ std::atomic<unsigned> front_;
+ std::atomic<unsigned> back_;
+ Elem array_[kSize];
+
+ RunQueue(const RunQueue&) = delete;
+ void operator=(const RunQueue&) = delete;
+};
+
+} // namespace Eigen
+
+#endif // EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
new file mode 100644
index 000000000..17fd1658b
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
@@ -0,0 +1,127 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_SIMPLE_THREAD_POOL_H
+#define EIGEN_CXX11_THREADPOOL_SIMPLE_THREAD_POOL_H
+
+namespace Eigen {
+
+// The implementation of the ThreadPool type ensures that the Schedule method
+// runs the functions it is provided in FIFO order when the scheduling is done
+// by a single thread.
+// Environment provides a way to create threads and also allows to intercept
+// task submission and execution.
+template <typename Environment>
+class SimpleThreadPoolTempl : public ThreadPoolInterface {
+ public:
+ // Construct a pool that contains "num_threads" threads.
+ explicit SimpleThreadPoolTempl(int num_threads, Environment env = Environment())
+ : env_(env), threads_(num_threads), waiters_(num_threads) {
+ for (int i = 0; i < num_threads; i++) {
+ threads_.push_back(env.CreateThread([this]() { WorkerLoop(); }));
+ }
+ }
+
+ // Wait until all scheduled work has finished and then destroy the
+ // set of threads.
+ ~SimpleThreadPoolTempl() {
+ {
+ // Wait for all work to get done.
+ std::unique_lock<std::mutex> l(mu_);
+ while (!pending_.empty()) {
+ empty_.wait(l);
+ }
+ exiting_ = true;
+
+ // Wakeup all waiters.
+ for (auto w : waiters_) {
+ w->ready = true;
+ w->task.f = nullptr;
+ w->cv.notify_one();
+ }
+ }
+
+ // Wait for threads to finish.
+ for (auto t : threads_) {
+ delete t;
+ }
+ }
+
+ // Schedule fn() for execution in the pool of threads. The functions are
+ // executed in the order in which they are scheduled.
+ void Schedule(std::function<void()> fn) {
+ Task t = env_.CreateTask(std::move(fn));
+ std::unique_lock<std::mutex> l(mu_);
+ if (waiters_.empty()) {
+ pending_.push_back(std::move(t));
+ } else {
+ Waiter* w = waiters_.back();
+ waiters_.pop_back();
+ w->ready = true;
+ w->task = std::move(t);
+ w->cv.notify_one();
+ }
+ }
+
+ protected:
+ void WorkerLoop() {
+ std::unique_lock<std::mutex> l(mu_);
+ Waiter w;
+ Task t;
+ while (!exiting_) {
+ if (pending_.empty()) {
+ // Wait for work to be assigned to me
+ w.ready = false;
+ waiters_.push_back(&w);
+ while (!w.ready) {
+ w.cv.wait(l);
+ }
+ t = w.task;
+ w.task.f = nullptr;
+ } else {
+ // Pick up pending work
+ t = std::move(pending_.front());
+ pending_.pop_front();
+ if (pending_.empty()) {
+ empty_.notify_all();
+ }
+ }
+ if (t.f) {
+ mu_.unlock();
+ env_.ExecuteTask(t);
+ t.f = nullptr;
+ mu_.lock();
+ }
+ }
+ }
+
+ private:
+ typedef typename Environment::Task Task;
+ typedef typename Environment::EnvThread Thread;
+
+ struct Waiter {
+ std::condition_variable cv;
+ Task task;
+ bool ready;
+ };
+
+ Environment env_;
+ std::mutex mu_;
+ MaxSizeVector<Thread*> threads_; // All threads
+ MaxSizeVector<Waiter*> waiters_; // Stack of waiting threads.
+ std::deque<Task> pending_; // Queue of pending work
+ std::condition_variable empty_; // Signaled on pending_.empty()
+ bool exiting_ = false;
+};
+
+typedef SimpleThreadPoolTempl<StlThreadEnvironment> SimpleThreadPool;
+
+} // namespace Eigen
+
+#endif // EIGEN_CXX11_THREADPOOL_SIMPLE_THREAD_POOL_H
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
new file mode 100644
index 000000000..d2204ad5b
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
@@ -0,0 +1,38 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H
+
+namespace Eigen {
+
+struct StlThreadEnvironment {
+ struct Task {
+ std::function<void()> f;
+ };
+
+ // EnvThread constructor must start the thread,
+ // destructor must join the thread.
+ class EnvThread {
+ public:
+ EnvThread(std::function<void()> f) : thr_(f) {}
+ ~EnvThread() { thr_.join(); }
+
+ private:
+ std::thread thr_;
+ };
+
+ EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(f); }
+ Task CreateTask(std::function<void()> f) { return Task{std::move(f)}; }
+ void ExecuteTask(const Task& t) { t.f(); }
+};
+
+} // namespace Eigen
+
+#endif // EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h
new file mode 100644
index 000000000..cfa221732
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h
@@ -0,0 +1,22 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
+
+// Try to come up with a portable implementation of thread local variables
+#if EIGEN_COMP_GNUC && EIGEN_GNUC_AT_MOST(4, 7)
+#define EIGEN_THREAD_LOCAL static __thread
+#elif EIGEN_COMP_CLANG
+#define EIGEN_THREAD_LOCAL static __thread
+#else
+#define EIGEN_THREAD_LOCAL static thread_local
+#endif
+
+#endif // EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
new file mode 100644
index 000000000..38b40aceb
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
@@ -0,0 +1,26 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H
+
+namespace Eigen {
+
+// This defines an interface that ThreadPoolDevice can take to use
+// custom thread pools underneath.
+class ThreadPoolInterface {
+ public:
+ virtual void Schedule(std::function<void()> fn) = 0;
+
+ virtual ~ThreadPoolInterface() {}
+};
+
+} // namespace Eigen
+
+#endif // EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h
new file mode 100644
index 000000000..a859c7ba3
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h
@@ -0,0 +1,20 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
+
+// Try to come up with a portable way to yield
+#if EIGEN_COMP_GNUC && EIGEN_GNUC_AT_MOST(4, 7)
+#define EIGEN_THREAD_YIELD() sched_yield()
+#else
+#define EIGEN_THREAD_YIELD() std::this_thread::yield()
+#endif
+
+#endif // EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
diff --git a/unsupported/Eigen/CXX11/src/util/CMakeLists.txt b/unsupported/Eigen/CXX11/src/util/CMakeLists.txt
new file mode 100644
index 000000000..7eab492d6
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/util/CMakeLists.txt
@@ -0,0 +1,6 @@
+FILE(GLOB Eigen_CXX11_util_SRCS "*.h")
+
+INSTALL(FILES
+ ${Eigen_CXX11_util_SRCS}
+ DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/util COMPONENT Devel
+ )
diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
index c582e21f5..f479590b9 100644
--- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h
+++ b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
@@ -10,12 +10,22 @@
#ifndef EIGEN_CXX11META_H
#define EIGEN_CXX11META_H
+#include <vector>
+#include "EmulateArray.h"
+
+// Emulate the cxx11 functionality that we need if the compiler doesn't support it.
+// Visual studio 2015 doesn't advertise itself as cxx11 compliant, although it
+// supports enough of the standard for our needs
+#if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900
+
+#include "CXX11Workarounds.h"
+
namespace Eigen {
namespace internal {
/** \internal
- * \file CXX11/Core/util/CXX11Meta.h
+ * \file CXX11/util/CXX11Meta.h
* This file contains generic metaprogramming classes which are not specifically related to Eigen.
* This file expands upon Core/util/Meta.h and adds support for C++11 specific features.
*/
@@ -523,4 +533,10 @@ InstType instantiate_by_c_array(ArrType* arr)
} // end namespace Eigen
+#else // Non C++11, fallback to emulation mode
+
+#include "src/Core/util/EmulateCXX11Meta.h"
+
+#endif
+
#endif // EIGEN_CXX11META_H
diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h
index fe4d22803..fe4d22803 100644
--- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h
+++ b/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h
diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
index 579519b04..24159e54c 100644
--- a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h
+++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
@@ -222,7 +222,7 @@ template<class T, std::size_t N> struct array_size<const array<T,N>& > {
#else
-// The compiler supports c++11, and we're not targetting cuda: use std::array as Eigen array
+// The compiler supports c++11, and we're not targetting cuda: use std::array as Eigen::array
#include <array>
namespace Eigen {
@@ -264,8 +264,4 @@ template<class T, std::size_t N> struct array_size<std::array<T,N> > {
#endif
-
-
-
-
#endif // EIGEN_EMULATE_ARRAY_H
diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/util/EmulateCXX11Meta.h
index d685d4f9d..f3aa1b144 100644
--- a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h
+++ b/unsupported/Eigen/CXX11/src/util/EmulateCXX11Meta.h
@@ -17,7 +17,7 @@ namespace Eigen {
namespace internal {
/** \internal
- * \file CXX11/Core/util/EmulateCXX11Meta.h
+ * \file CXX11/util/EmulateCXX11Meta.h
* This file emulates a subset of the functionality provided by CXXMeta.h for
* compilers that don't yet support cxx11 such as nvcc.
*/
diff --git a/unsupported/Eigen/CXX11/src/Core/util/MaxSizeVector.h b/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h
index 551124bae..551124bae 100644
--- a/unsupported/Eigen/CXX11/src/Core/util/MaxSizeVector.h
+++ b/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h