| author | Benoit Steiner <benoit.steiner.goog@gmail.com> | 2016-03-08 16:37:27 -0800 |
|---|---|---|
| committer | Benoit Steiner <benoit.steiner.goog@gmail.com> | 2016-03-08 16:37:27 -0800 |
| commit | 46177c8d648a27d82d34cebed7e2b5bc59d441fc (patch) | |
| tree | 97a356d04f124ea1ad32eda38e76c607e6b33e5e /unsupported/Eigen/CXX11/src/Tensor | |
| parent | 6d6413f76832a094d0835770af2adfaabba24738 (diff) | |
Replace std::vector with our own implementation, as using the STL when compiling with nvcc and AVX enabled leads to many issues.
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor')
3 files changed, 12 insertions, 17 deletions
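The MaxSizeVector the patch switches to is not itself shown in this diff. As context, here is a minimal sketch of what such a fixed-capacity container can look like, mirroring only the two constructors and the push_back the patch relies on. The class name MaxSizeVectorSketch and its layout are illustrative assumptions, not the actual Eigen header.

```cpp
#include <cassert>
#include <cstddef>
#include <cstdlib>
#include <new>

// Illustrative sketch only: the real MaxSizeVector lives in Eigen's
// CXX11 utility headers. This mirrors just the parts the diff uses.
template <typename T>
class MaxSizeVectorSketch {
 public:
  // Capacity-only constructor: storage for up to n elements, size 0.
  // Replaces the "std::vector + reserve(n)" pattern in the diff.
  explicit MaxSizeVectorSketch(std::size_t n)
      : data_(static_cast<T*>(std::malloc(n * sizeof(T)))),
        size_(0),
        capacity_(n) {}

  // Fill constructor: n copies of init, like std::vector(n, init).
  MaxSizeVectorSketch(std::size_t n, const T& init)
      : data_(static_cast<T*>(std::malloc(n * sizeof(T)))),
        size_(n),
        capacity_(n) {
    for (std::size_t i = 0; i < n; ++i) new (data_ + i) T(init);
  }

  ~MaxSizeVectorSketch() {
    for (std::size_t i = 0; i < size_; ++i) data_[i].~T();
    std::free(data_);
  }

  // Appends without ever growing: capacity is fixed at construction,
  // so no allocator or reallocation machinery runs afterwards -- the
  // property that sidesteps the nvcc/AVX trouble the commit message
  // describes.
  void push_back(const T& t) {
    assert(size_ < capacity_);
    new (data_ + size_++) T(t);
  }

  T& operator[](std::size_t i) { return data_[i]; }
  const T& operator[](std::size_t i) const { return data_[i]; }
  std::size_t size() const { return size_; }

 private:
  MaxSizeVectorSketch(const MaxSizeVectorSketch&);  // non-copyable
  T* data_;
  std::size_t size_;
  std::size_t capacity_;
};
```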
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 02b3c6dea..9044454fd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -28,7 +28,7 @@ struct packLhsArg {
 
 template<typename LhsScalar, typename RhsScalar, typename RhsMapper, typename OutputMapper, typename Index>
 struct packRhsAndKernelArg {
-  const std::vector<LhsScalar*>* blockAs;
+  const MaxSizeVector<LhsScalar*>* blockAs;
   RhsScalar* blockB;
   const RhsMapper& rhs;
   OutputMapper& output;
@@ -46,8 +46,8 @@ struct packRhsAndKernelArg {
   const Index n_block_idx;
   const Index m_blocks;
   const Index n_blocks;
-  std::vector<Notification*>* kernel_notifications;
-  const std::vector<Notification*>* lhs_notifications;
+  MaxSizeVector<Notification*>* kernel_notifications;
+  const MaxSizeVector<Notification*>* lhs_notifications;
   const bool need_to_pack;
 };
 
@@ -202,8 +202,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     // the alignment requirements with the assumption that
     // (Traits::mr * sizeof(ResScalar)) % 16 == 0
     const Index numBlockAs = numext::mini(num_threads, m_blocks);
-    std::vector<LhsScalar *> blockAs;
-    blockAs.reserve(num_threads);
+    MaxSizeVector<LhsScalar *> blockAs(num_threads);
     for (int i = 0; i < num_threads; i++) {
       blockAs.push_back(static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar))));
     }
@@ -212,18 +211,17 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     // TODO: is this too much memory to allocate? This simplifies coding a lot, but is wasteful.
     // Other options: (1) reuse memory when a thread finishes. con: tricky
     //                (2) allocate block B memory in each thread. con: overhead
-    std::vector<RhsScalar *> blockBs;
-    blockBs.reserve(n_blocks);
+    MaxSizeVector<RhsScalar *> blockBs(n_blocks);
     for (int i = 0; i < n_blocks; i++) {
       blockBs.push_back(static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar))));
     }
 
     // lhs_notifications starts with all null Notifications
-    std::vector<Notification*> lhs_notifications(num_threads, nullptr);
+    MaxSizeVector<Notification*> lhs_notifications(num_threads, nullptr);
 
     // this should really be numBlockAs * n_blocks;
     const Index num_kernel_notifications = num_threads * n_blocks;
-    std::vector<Notification*> kernel_notifications(num_kernel_notifications,
+    MaxSizeVector<Notification*> kernel_notifications(num_kernel_notifications,
                                                     nullptr);
 
     for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index fd9919829..54da77bcf 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -127,8 +127,7 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
       const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
       const Index numblocks = size / blocksize;
 
-      std::vector<Notification*> results;
-      results.reserve(numblocks);
+      MaxSizeVector<Notification*> results(numblocks);
       for (int i = 0; i < numblocks; ++i) {
         results.push_back(device.enqueue(&EvalRange<Evaluator, Index, Vectorizable>::run, evaluator, i*blocksize, (i+1)*blocksize));
       }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 875155243..2d7fb80d4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -256,9 +256,8 @@ struct FullReducer<Self, Op, ThreadPoolDevice, false> {
     const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
     eigen_assert(num_coeffs >= numblocks * blocksize);
 
-    std::vector<Notification*> results;
-    results.reserve(numblocks);
-    std::vector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
+    MaxSizeVector<Notification*> results(numblocks);
+    MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
     for (Index i = 0; i < numblocks; ++i) {
       results.push_back(
           device.enqueue(&FullReducerShard<Self, Op, false>::run, self,
@@ -308,9 +307,8 @@ struct FullReducer<Self, Op, ThreadPoolDevice, true> {
     const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
     eigen_assert(num_coeffs >= numblocks * blocksize);
 
-    std::vector<Notification*> results;
-    results.reserve(numblocks);
-    std::vector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
+    MaxSizeVector<Notification*> results(numblocks);
+    MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
     for (Index i = 0; i < numblocks; ++i) {
       results.push_back(device.enqueue(&FullReducerShard<Self, Op, true>::run,
                                        self, i * blocksize, blocksize, reducer,
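The patch uses two distinct replacement patterns, and the sketch above has a constructor for each. A brief hypothetical usage example (int* stands in for the Notification* used in the real code):

```cpp
#include <cassert>
// Assumes the MaxSizeVectorSketch template from the sketch above.

int main() {
  // Pattern 1 in the diff: "std::vector<T> v; v.reserve(n);" becomes
  // the capacity-only constructor -- the container starts empty.
  MaxSizeVectorSketch<int*> results(4);   // capacity 4, size 0
  results.push_back(nullptr);             // filled by push_back, as before
  assert(results.size() == 1);

  // Pattern 2 in the diff: "std::vector<T> v(n, nullptr);" keeps its
  // shape and becomes the fill constructor -- size n, all slots null.
  MaxSizeVectorSketch<int*> notifications(4, nullptr);
  assert(notifications.size() == 4);
  return 0;
}
```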