-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h |  4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h             |  3
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h              |  8
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h             | 18
4 files changed, 6 insertions(+), 27 deletions(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 88d485f38..98fe6f542 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -568,10 +568,6 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
(parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)) +
nm_ * nn_;
if (k < nk_) {
- // It is important to copy out nm_ and nn_, because once we kick off
- // the last packing operation this and device_ can be destroyed.
- Index nm = nm_;
- Index nn = nn_;
// Issue lhs/rhs packing. Their completion will in turn kick off
// kernels.
if (parallel_pack_) {
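
For context, the comment deleted above described a common lifetime hazard: once the last packing task is enqueued, its completion may destroy the enclosing evaluator, so any member still needed must be copied into locals first. A minimal, self-contained sketch of that pattern, assuming a hypothetical EvalCtx type (none of these names are Eigen API):

#include <functional>

struct EvalCtx {
  int nm_ = 3, nn_ = 5;

  // Stand-in for the device enqueue; in the real code the completion of the
  // last packing task may delete the context that issued it.
  void enqueue(std::function<void()> task) { task(); }

  void kickOffLastPack() {
    // Copy members out BEFORE the enqueue that might destroy *this.
    const int nm = nm_;
    const int nn = nn_;
    enqueue([nm, nn] {
      // Safe: works on the copies, never dereferences the (possibly dead) object.
      volatile int kernels = nm * nn;
      (void)kernels;
    });
    // From here on, only the nm/nn locals may be used, not this->nm_/nn_.
  }
};

int main() { EvalCtx().kickOffLastPack(); }

The change above drops the copies because the removed lines were no longer needed at that point in the packing logic, not because the pattern itself is wrong.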
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
index cb6fb4626..a76c8ca35 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
@@ -10,9 +10,6 @@
#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
-// Turn on the cost model by default
-#define EIGEN_USE_COST_MODEL
-
namespace Eigen {
/** \class TensorEvaluator
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 868398753..2f1acd321 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -150,9 +150,8 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign)
{
- const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
const Index size = array_prod(evaluator.dimensions());
-#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL) && defined(EIGEN_USE_COST_MODEL)
+#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
EvalRange<Evaluator, Index, Vectorizable>::alignBlockSize,
[&evaluator](Index first, Index last) {
@@ -160,15 +159,14 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
});
#else
size_t num_threads = device.numThreads();
-#ifdef EIGEN_USE_COST_MODEL
if (num_threads > 1) {
num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
size, evaluator.costPerCoeff(Vectorizable), num_threads);
}
-#endif
if (num_threads == 1) {
EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
} else {
+ const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1;
const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
const Index numblocks = size / blocksize;
@@ -185,7 +183,7 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
}
barrier.Wait();
}
-#endif // defined(EIGEN_USE_NONBLOCKING_THREAD_POOL) && defined(EIGEN_USE_COST_MODEL)
+#endif
}
evaluator.cleanup();
}
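
With the cost-model guard gone, the EIGEN_USE_SIMPLE_THREAD_POOL fallback above always asks the cost model for a thread count and then rounds the per-thread block size to a multiple of PacketSize. A standalone sketch of that block-size arithmetic with illustrative numbers (the variable names mirror the diff, but nothing here is Eigen API):

#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
  using Index = long;
  const Index size = 1000;     // total number of coefficients (example value)
  const Index PacketSize = 8;  // SIMD packet width when Vectorizable (example value)
  std::size_t num_threads = 4; // what the cost model might return (example value)

  // Same arithmetic as in TensorExecutor: split `size` across threads, then
  // force every block to be a multiple of PacketSize (at least one packet).
  Index blocksz = static_cast<Index>(
      std::ceil(static_cast<float>(size) / num_threads)) + PacketSize - 1;
  const Index blocksize =
      std::max<Index>(PacketSize, blocksz - (blocksz % PacketSize));
  const Index numblocks = size / blocksize;

  // size=1000, threads=4, PacketSize=8 -> blocksz=257, blocksize=256, numblocks=3;
  // the tail of 1000 - 3*256 = 232 coefficients is evaluated outside the
  // enqueued blocks, as in the barrier-protected loop shown in the diff.
  std::printf("blocksize=%ld numblocks=%ld tail=%ld\n",
              blocksize, numblocks, size - numblocks * blocksize);
  return 0;
}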
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 2a8047b7d..8b10c1120 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -248,16 +248,12 @@ struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
*output = reducer.finalize(reducer.initialize());
return;
}
-#ifdef EIGEN_USE_COST_MODEL
const TensorOpCost cost =
self.m_impl.costPerCoeff(Vectorizable) +
TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable,
PacketSize);
const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
num_coeffs, cost, device.numThreads());
-#else
- const int num_threads = device.numThreads();
-#endif
if (num_threads == 1) {
*output =
InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
@@ -472,22 +468,14 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
- static bool size_large_enough(Index total_size) {
-#ifndef EIGEN_USE_COST_MODEL
- return total_size > 1024 * 1024;
-#else
- return true || total_size;
-#endif
- }
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) {
m_impl.evalSubExprsIfNeeded(NULL);
// Use the FullReducer if possible.
- if (RunningFullReduction && internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
+ if (RunningFullReduction &&
+ internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
- (!RunningOnGPU && size_large_enough(internal::array_prod(m_impl.dimensions()))))) {
-
+ !RunningOnGPU)) {
bool need_assign = false;
if (!data) {
m_result = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType)));
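
With size_large_enough removed, the threaded FullReducer path in the first hunk decides purely from a TensorOpCost whether more than one thread pays off. A rough standalone sketch of that kind of decision; the cost fields and the threshold heuristic are illustrative only, not Eigen's actual TensorCostModel:

#include <algorithm>
#include <cstdio>

// Hypothetical per-coefficient cost, loosely modeled on TensorOpCost:
// bytes loaded/stored plus compute cycles, all per coefficient.
struct CoeffCost {
  double bytes_loaded, bytes_stored, compute_cycles;
  double total() const { return bytes_loaded + bytes_stored + compute_cycles; }
};

// Illustrative thread-count heuristic: spawn only as many threads as can each
// be given at least `min_cost_per_thread` worth of work, capped at max_threads.
int numThreads(long num_coeffs, const CoeffCost& cost, int max_threads,
               double min_cost_per_thread = 20000.0) {
  const double total_cost = num_coeffs * cost.total();
  const int wanted = static_cast<int>(total_cost / min_cost_per_thread);
  return std::max(1, std::min(max_threads, wanted));
}

int main() {
  // Per-coefficient cost of reading the input plus applying the reducer,
  // mirroring `self.m_impl.costPerCoeff(...) + TensorOpCost(0, 0, functor cost)`.
  const CoeffCost cost{/*loaded=*/8.0, /*stored=*/0.0, /*compute=*/1.0};
  std::printf("small reduction -> %d thread(s)\n", numThreads(1000, cost, 16));
  std::printf("large reduction -> %d thread(s)\n", numThreads(10000000, cost, 16));
  return 0;
}

Under this kind of model a small reduction falls back to the single-threaded InnerMostDimReducer path, while a large one fans out across the pool, which is why the separate 1024*1024-element size check is no longer needed.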