| author | Benoit Steiner <benoit.steiner.goog@gmail.com> | 2016-03-08 16:37:27 -0800 |
|---|---|---|
| committer | Benoit Steiner <benoit.steiner.goog@gmail.com> | 2016-03-08 16:37:27 -0800 |
| commit | 46177c8d648a27d82d34cebed7e2b5bc59d441fc (patch) | |
| tree | 97a356d04f124ea1ad32eda38e76c607e6b33e5e /unsupported/Eigen/CXX11/src/Tensor | |
| parent | 6d6413f76832a094d0835770af2adfaabba24738 (diff) | |
Replace std::vector with our own implementation, as using the STL when compiling with nvcc and AVX enabled leads to many issues.
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor')
3 files changed, 12 insertions, 17 deletions
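The MaxSizeVector the patch switches to is not itself shown in this diff. As context, here is a minimal sketch of what such a fixed-capacity container can look like, mirroring only the two constructors and the push_back the patch relies on. The class name MaxSizeVectorSketch and its layout are illustrative assumptions, not the actual Eigen header.

```cpp
#include <cassert>
#include <cstddef>
#include <cstdlib>
#include <new>

// Illustrative sketch only: the real MaxSizeVector lives in Eigen's
// CXX11 utility headers. This mirrors just the parts the diff uses.
template <typename T>
class MaxSizeVectorSketch {
 public:
  // Capacity-only constructor: storage for up to n elements, size 0.
  // Replaces the "std::vector + reserve(n)" pattern in the diff.
  explicit MaxSizeVectorSketch(std::size_t n)
      : data_(static_cast<T*>(std::malloc(n * sizeof(T)))),
        size_(0),
        capacity_(n) {}

  // Fill constructor: n copies of init, like std::vector(n, init).
  MaxSizeVectorSketch(std::size_t n, const T& init)
      : data_(static_cast<T*>(std::malloc(n * sizeof(T)))),
        size_(n),
        capacity_(n) {
    for (std::size_t i = 0; i < n; ++i) new (data_ + i) T(init);
  }

  ~MaxSizeVectorSketch() {
    for (std::size_t i = 0; i < size_; ++i) data_[i].~T();
    std::free(data_);
  }

  // Appends without ever growing: capacity is fixed at construction,
  // so no allocator or reallocation machinery runs afterwards -- the
  // property that sidesteps the nvcc/AVX trouble the commit message
  // describes.
  void push_back(const T& t) {
    assert(size_ < capacity_);
    new (data_ + size_++) T(t);
  }

  T& operator[](std::size_t i) { return data_[i]; }
  const T& operator[](std::size_t i) const { return data_[i]; }
  std::size_t size() const { return size_; }

 private:
  MaxSizeVectorSketch(const MaxSizeVectorSketch&);  // non-copyable
  T* data_;
  std::size_t size_;
  std::size_t capacity_;
};
```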
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 02b3c6dea..9044454fd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -28,7 +28,7 @@ struct packLhsArg {
 
 template<typename LhsScalar, typename RhsScalar, typename RhsMapper, typename OutputMapper, typename Index>
 struct packRhsAndKernelArg {
-  const std::vector<LhsScalar*>* blockAs;
+  const MaxSizeVector<LhsScalar*>* blockAs;
   RhsScalar* blockB;
   const RhsMapper& rhs;
   OutputMapper& output;
@@ -46,8 +46,8 @@ struct packRhsAndKernelArg {
   const Index n_block_idx;
   const Index m_blocks;
   const Index n_blocks;
-  std::vector<Notification*>* kernel_notifications;
-  const std::vector<Notification*>* lhs_notifications;
+  MaxSizeVector<Notification*>* kernel_notifications;
+  const MaxSizeVector<Notification*>* lhs_notifications;
   const bool need_to_pack;
 };
 
@@ -202,8 +202,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     // the alignment requirements with the assumption that
     // (Traits::mr * sizeof(ResScalar)) % 16 == 0
     const Index numBlockAs = numext::mini(num_threads, m_blocks);
-    std::vector<LhsScalar *> blockAs;
-    blockAs.reserve(num_threads);
+    MaxSizeVector<LhsScalar *> blockAs(num_threads);
     for (int i = 0; i < num_threads; i++) {
       blockAs.push_back(static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar))));
     }
@@ -212,18 +211,17 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     // TODO: is this too much memory to allocate? This simplifies coding a lot, but is wasteful.
     // Other options: (1) reuse memory when a thread finishes. con: tricky
     //                (2) allocate block B memory in each thread. con: overhead
-    std::vector<RhsScalar *> blockBs;
-    blockBs.reserve(n_blocks);
+    MaxSizeVector<RhsScalar *> blockBs(n_blocks);
     for (int i = 0; i < n_blocks; i++) {
       blockBs.push_back(static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar))));
     }
 
     // lhs_notifications starts with all null Notifications
-    std::vector<Notification*> lhs_notifications(num_threads, nullptr);
+    MaxSizeVector<Notification*> lhs_notifications(num_threads, nullptr);
 
     // this should really be numBlockAs * n_blocks;
     const Index num_kernel_notifications = num_threads * n_blocks;
-    std::vector<Notification*> kernel_notifications(num_kernel_notifications,
+    MaxSizeVector<Notification*> kernel_notifications(num_kernel_notifications,
                                                     nullptr);
 
     for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index fd9919829..54da77bcf 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -127,8 +127,7 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
       const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
       const Index numblocks = size / blocksize;
 
-      std::vector<Notification*> results;
-      results.reserve(numblocks);
+      MaxSizeVector<Notification*> results(numblocks);
       for (int i = 0; i < numblocks; ++i) {
         results.push_back(device.enqueue(&EvalRange<Evaluator, Index, Vectorizable>::run, evaluator, i*blocksize, (i+1)*blocksize));
       }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 875155243..2d7fb80d4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -256,9 +256,8 @@ struct FullReducer<Self, Op, ThreadPoolDevice, false> {
     const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
     eigen_assert(num_coeffs >= numblocks * blocksize);
 
-    std::vector<Notification*> results;
-    results.reserve(numblocks);
-    std::vector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
+    MaxSizeVector<Notification*> results(numblocks);
+    MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
     for (Index i = 0; i < numblocks; ++i) {
       results.push_back(
           device.enqueue(&FullReducerShard<Self, Op, false>::run, self,
@@ -308,9 +307,8 @@ struct FullReducer<Self, Op, ThreadPoolDevice, true> {
     const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
     eigen_assert(num_coeffs >= numblocks * blocksize);
 
-    std::vector<Notification*> results;
-    results.reserve(numblocks);
-    std::vector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
+    MaxSizeVector<Notification*> results(numblocks);
+    MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
     for (Index i = 0; i < numblocks; ++i) {
       results.push_back(device.enqueue(&FullReducerShard<Self, Op, true>::run,
                                        self, i * blocksize, blocksize, reducer,
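The patch uses two distinct replacement patterns, and the sketch above has a constructor for each. A brief hypothetical usage example (int* stands in for the Notification* used in the real code):

```cpp
#include <cassert>
// Assumes the MaxSizeVectorSketch template from the sketch above.

int main() {
  // Pattern 1 in the diff: "std::vector<T> v; v.reserve(n);" becomes
  // the capacity-only constructor -- the container starts empty.
  MaxSizeVectorSketch<int*> results(4);   // capacity 4, size 0
  results.push_back(nullptr);             // filled by push_back, as before
  assert(results.size() == 1);

  // Pattern 2 in the diff: "std::vector<T> v(n, nullptr);" keeps its
  // shape and becomes the fill constructor -- size n, all slots null.
  MaxSizeVectorSketch<int*> notifications(4, nullptr);
  assert(notifications.size() == 4);
  return 0;
}
```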