| author | Benoit Steiner <benoit.steiner.goog@gmail.com> | 2015-01-14 15:34:50 -0800 |
|---|---|---|
| committer | Benoit Steiner <benoit.steiner.goog@gmail.com> | 2015-01-14 15:34:50 -0800 |
| commit | 6559d09c60fb4acfc7ee5197284f576ac14926f1 (patch) | |
| tree | 4663ffeff5e690ed0259e412221420e895d1d272 /unsupported/Eigen | |
| parent | 8a382aa119274efd2eb73b822ae7cd2afa128cc5 (diff) | |
Ensured that each thread has its own copy of the TensorEvaluator: this avoids race conditions when the evaluator calls a non-thread-safe functor, e.g. when generating random numbers.
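The change is easiest to see in miniature. The sketch below is plain C++11, not Eigen code: `RandomEvaluator`, `evalRange`, and the `std::async` scheduling are hypothetical stand-ins for `TensorEvaluator`, `EvalRange::run`, and the thread-pool device. It shows why passing the evaluator by value matters: each task then mutates its own copy of the stateful functor instead of racing on a shared one.

```cpp
#include <future>
#include <random>
#include <vector>

// Hypothetical stand-in for a TensorEvaluator that wraps a stateful,
// non-thread-safe functor (here: a Mersenne Twister engine).
struct RandomEvaluator {
  std::mt19937 gen;
  float evalScalar(long /*index*/) {
    return std::uniform_real_distribution<float>(0.f, 1.f)(gen);  // mutates gen
  }
};

// Mirrors the patched EvalRange::run: the evaluator is taken by value,
// so each worker advances its own private copy of the RNG state.
void evalRange(RandomEvaluator evaluator, long first, long last) {
  for (long i = first; i < last; ++i) {
    evaluator.evalScalar(i);
  }
}

int main() {
  RandomEvaluator evaluator;
  const long size = 1 << 20;
  const long numblocks = 4;
  const long blocksize = size / numblocks;

  std::vector<std::future<void>> results;
  results.reserve(numblocks);
  for (long i = 0; i < numblocks; ++i) {
    // std::async decay-copies `evaluator` into each task. With the old
    // pointer-based signature, every task would share one engine and the
    // unsynchronized calls to std::mt19937::operator() would race.
    results.push_back(std::async(std::launch::async, evalRange,
                                 evaluator, i * blocksize, (i + 1) * blocksize));
  }
  for (auto& r : results) r.get();
  return 0;
}
```

The trade-off is one evaluator copy per task. Note that in this sketch every copy starts from identical engine state, so each block would draw the same sequence; per-block seeding is a separate concern. The point illustrated here is only the memory-safety one: no two threads ever touch the same generator.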
Diffstat (limited to 'unsupported/Eigen')
| -rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 27 |

1 file changed, 14 insertions, 13 deletions
```diff
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index f27f643c1..d93fdd907 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -77,17 +77,17 @@ class TensorExecutor<Expression, DefaultDevice, true>
 #ifdef EIGEN_USE_THREADS
 template <typename Evaluator, typename Index, bool Vectorizable = Evaluator::PacketAccess>
 struct EvalRange {
-  static void run(Evaluator* evaluator, const Index first, const Index last) {
+  static void run(Evaluator evaluator, const Index first, const Index last) {
     eigen_assert(last > first);
     for (Index i = first; i < last; ++i) {
-      evaluator->evalScalar(i);
+      evaluator.evalScalar(i);
     }
   }
 };
 
 template <typename Evaluator, typename Index>
 struct EvalRange<Evaluator, Index, true> {
-  static void run(Evaluator* evaluator, const Index first, const Index last) {
+  static void run(Evaluator evaluator, const Index first, const Index last) {
     eigen_assert(last > first);
 
     Index i = first;
@@ -96,12 +96,12 @@ struct EvalRange<Evaluator, Index, true> {
       eigen_assert(first % PacketSize == 0);
       Index lastPacket = last - (last % PacketSize);
       for (; i < lastPacket; i += PacketSize) {
-        evaluator->evalPacket(i);
+        evaluator.evalPacket(i);
       }
     }
 
     for (; i < last; ++i) {
-      evaluator->evalScalar(i);
+      evaluator.evalScalar(i);
     }
   }
 };
@@ -130,16 +130,17 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
       std::vector<Future> results;
       results.reserve(numblocks);
       for (int i = 0; i < numblocks; ++i) {
-        results.push_back(device.enqueue(&EvalRange<Evaluator, Index>::run, &evaluator, i*blocksize, (i+1)*blocksize));
+        results.push_back(device.enqueue(&EvalRange<Evaluator, Index>::run, evaluator, i*blocksize, (i+1)*blocksize));
       }
 
-      for (int i = 0; i < numblocks; ++i) {
-        results[i].get();
+      if (numblocks * blocksize < size) {
+        EvalRange<Evaluator, Index>::run(evaluator, numblocks * blocksize, size);
       }
 
-      if (numblocks * blocksize < size) {
-        EvalRange<Evaluator, Index>::run(&evaluator, numblocks * blocksize, size);
+      for (int i = 0; i < numblocks; ++i) {
+        get_when_ready(&results[i]);
       }
+
     }
     evaluator.cleanup();
   }
@@ -168,7 +169,8 @@ __launch_bounds__(1024)
   const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
   const Index vectorized_step_size = step_size * PacketSize;
   const Index vectorized_size = (size / PacketSize) * PacketSize;
-  for (Index i = first_index * PacketSize; i < vectorized_size; i += vectorized_step_size) {
+  for (Index i = first_index * PacketSize; i < vectorized_size;
+       i += vectorized_step_size) {
     eval.evalPacket(i);
   }
   for (Index i = vectorized_size + first_index; i < size; i += step_size) {
@@ -192,8 +194,7 @@ class TensorExecutor<Expression, GpuDevice, Vectorizable>
       const int block_size = maxCudaThreadsPerBlock();
      const Index size = array_prod(evaluator.dimensions());
 
-      EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index><<<num_blocks, block_size, 0, device.stream()>>>(evaluator, size);
-      assert(cudaGetLastError() == cudaSuccess);
+      LAUNCH_CUDA_KERNEL((EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
     }
     evaluator.cleanup();
   }
```
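On the GPU path, the launch-plus-error-check boilerplate moves behind `LAUNCH_CUDA_KERNEL`. The macro's definition is not part of this diff, so the sketch below is only an illustration of the general pattern such a wrapper follows, not Eigen's actual macro: `LAUNCH_KERNEL_SKETCH`, `FakeDevice`, and `fill` are all hypothetical names. It launches on the device's stream and then asserts that the launch was accepted, the same check the removed inline `assert(cudaGetLastError() == cudaSuccess)` performed.

```cpp
#include <cassert>
#include <cuda_runtime.h>

// Hypothetical stand-in for Eigen's GpuDevice: exposes the stream to launch on.
struct FakeDevice {
  cudaStream_t stream() const { return 0; }  // default stream
};

// Sketch of the wrapper pattern: launch the kernel on the device's stream,
// then assert that the launch itself succeeded. The parentheses around
// `kernel` keep commas inside template arguments from splitting macro
// parameters, which is why the diff passes (EigenMetaKernel<...>) wrapped
// in parentheses.
#define LAUNCH_KERNEL_SKETCH(kernel, gridsize, blocksize, sharedmem, device, ...)       \
  do {                                                                                  \
    (kernel)<<<(gridsize), (blocksize), (sharedmem), (device).stream()>>>(__VA_ARGS__); \
    assert(cudaGetLastError() == cudaSuccess);                                          \
  } while (false)

// Trivial grid-stride kernel used only to exercise the macro.
__global__ void fill(float* out, int n, float value) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    out[i] = value;
  }
}

int main() {
  float* d_out = nullptr;
  cudaMalloc(&d_out, 1024 * sizeof(float));
  FakeDevice device;
  LAUNCH_KERNEL_SKETCH(fill, 32, 128, 0, device, d_out, 1024, 1.0f);
  cudaDeviceSynchronize();
  cudaFree(d_out);
  return 0;
}
```

Centralizing the launch in one macro means every kernel launch gets the same error check, instead of relying on each call site to remember the assert.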