Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h')
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h  36
1 file changed, 33 insertions, 3 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index 413b94579..07e6b273e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -77,7 +77,35 @@ struct ThreadPoolDevice {
   }
   EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+#ifdef __ANDROID__
     ::memcpy(dst, src, n);
+#else
+    // TODO(rmlarsen): Align blocks on cache lines.
+    // We have observed that going beyond 4 threads usually just wastes
+    // CPU cycles due to the threads competing for memory bandwidth, so we
+    // statically schedule at most 4 block copies here.
+    const size_t kMinBlockSize = 32768;
+    typedef TensorCostModel<ThreadPoolDevice> CostModel;
+    const size_t num_threads = CostModel::numThreads(n, TensorOpCost(1.0, 1.0, 0), 4);
+    if (n <= kMinBlockSize || num_threads == 1) {
+      ::memcpy(dst, src, n);
+    } else {
+      const char* src_ptr = static_cast<const char*>(src);
+      char* dst_ptr = static_cast<char*>(dst);
+      const size_t blocksize = (n + (num_threads - 1)) / num_threads;
+      Barrier barrier(num_threads - 1);
+      // Launch the last 3 blocks on worker threads.
+      for (size_t i = 1; i < num_threads; ++i) {
+        enqueue_with_barrier(&barrier, [n, i, src_ptr, dst_ptr, blocksize] {
+          ::memcpy(dst_ptr + i * blocksize, src_ptr + i * blocksize,
+                   numext::mini(blocksize, n - (i * blocksize)));
+        });
+      }
+      // Launch the first block on the main thread.
+      ::memcpy(dst_ptr, src_ptr, blocksize);
+      barrier.Wait();
+    }
+#endif
   }
   EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
     memcpy(dst, src, n);
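
The hunk above splits a large memcpy into up to four contiguous block copies: the first block runs on the calling thread, the rest are enqueued on pool threads, and a Barrier synchronizes completion. Below is a minimal standalone sketch of the same block-splitting arithmetic; it substitutes std::thread, std::min and explicit joins for the pool, enqueue_with_barrier, Barrier and numext::mini used in the patch, so it illustrates the idea rather than the actual implementation.

#include <algorithm>
#include <cstring>
#include <thread>
#include <vector>

// Sketch only: split a copy of n bytes into num_threads contiguous blocks of
// ceil(n / num_threads) bytes, copy block 0 on the calling thread, the rest
// on helper threads, then join.
void parallel_memcpy_sketch(void* dst, const void* src, std::size_t n,
                            std::size_t num_threads) {
  if (num_threads == 0) num_threads = 1;
  const char* src_ptr = static_cast<const char*>(src);
  char* dst_ptr = static_cast<char*>(dst);
  const std::size_t blocksize = (n + num_threads - 1) / num_threads;
  std::vector<std::thread> workers;
  for (std::size_t i = 1; i < num_threads; ++i) {
    const std::size_t start = i * blocksize;
    if (start >= n) break;  // only possible when n is tiny relative to num_threads
    const std::size_t len = std::min(blocksize, n - start);
    workers.emplace_back([=] { std::memcpy(dst_ptr + start, src_ptr + start, len); });
  }
  // Block 0 is copied on the calling thread, mirroring the patch.
  std::memcpy(dst_ptr, src_ptr, std::min(blocksize, n));
  for (std::thread& t : workers) t.join();
}

The patch itself additionally falls back to a plain ::memcpy when n <= kMinBlockSize (32 KiB) or when the cost model decides a single thread suffices, so small copies never pay the enqueue overhead.
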
@@ -171,7 +199,7 @@ struct ThreadPoolDevice {
     const Index max_block_size = numext::mini(n, 2 * block_size);
     if (block_align) {
       Index new_block_size = block_align(block_size);
-      eigen_assert(new_block_size >= block_size);
+      eigen_plain_assert(new_block_size >= block_size);
       block_size = numext::mini(n, new_block_size);
     }
     Index block_count = divup(n, block_size);
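
The block_align callback used above is supplied by the caller and is not defined in this hunk; the assert only requires that it never shrinks a block. A purely hypothetical example of such a callback, rounding block sizes up to a multiple of a SIMD packet width (the name and the constant are illustrative, not taken from the patch):

#include <cstdint>

// Hypothetical block_align callback: round a block size up to a multiple of
// the packet size. Rounding up never shrinks the block, so the invariant
// checked above (new_block_size >= block_size) holds by construction.
std::int64_t round_up_to_packet(std::int64_t block_size) {
  const std::int64_t kPacketSize = 8;  // e.g. 8 floats per 256-bit packet
  return ((block_size + kPacketSize - 1) / kPacketSize) * kPacketSize;
}
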
@@ -189,7 +217,7 @@ struct ThreadPoolDevice {
       Index coarser_block_size = divup(n, prev_block_count - 1);
       if (block_align) {
         Index new_block_size = block_align(coarser_block_size);
-        eigen_assert(new_block_size >= coarser_block_size);
+        eigen_plain_assert(new_block_size >= coarser_block_size);
         coarser_block_size = numext::mini(n, new_block_size);
       }
       if (coarser_block_size > max_block_size) {
@@ -197,7 +225,7 @@ struct ThreadPoolDevice {
       }
       // Recalculate parallel efficiency.
       const Index coarser_block_count = divup(n, coarser_block_size);
-      eigen_assert(coarser_block_count < prev_block_count);
+      eigen_plain_assert(coarser_block_count < prev_block_count);
       prev_block_count = coarser_block_count;
       const double coarser_efficiency =
           static_cast<double>(coarser_block_count) /
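
The two hunks above sit inside the block-coarsening loop of parallelFor: each iteration redistributes the n items over one fewer block and recomputes the block count with divup (ceiling division). A small self-contained example of that arithmetic with made-up numbers follows; the efficiency denominator lies outside this hunk and is not reproduced here.

#include <cstdio>

// Ceiling division, matching the divup helper used in the hunks above.
long divup(long x, long y) { return (x + y - 1) / y; }

int main() {
  const long n = 1000;         // made-up total iteration count
  const long block_size = 30;  // made-up initial block size
  long prev_block_count = divup(n, block_size);                   // 34 blocks
  // One coarsening step: spread n over one fewer block.
  const long coarser_block_size = divup(n, prev_block_count - 1); // 31
  const long coarser_block_count = divup(n, coarser_block_size);  // 33
  // Invariant checked by eigen_plain_assert in the patch: 33 < 34.
  std::printf("%ld -> %ld blocks\n", prev_block_count, coarser_block_count);
  return 0;
}

Because divup(n, divup(n, k - 1)) <= k - 1, the recomputed count is always strictly smaller than the previous one (block_align can only make the blocks larger still), which is exactly the invariant the assert checks.
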
@@ -240,6 +268,8 @@ struct ThreadPoolDevice {
     barrier.Wait();
   }
+
+
   // Convenience wrapper for parallelFor that does not align blocks.
   void parallelFor(Index n, const TensorOpCost& cost,
                    std::function<void(Index, Index)> f) const {
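
For context, this convenience wrapper hands the functor half-open [first, last) index ranges to process. A hedged usage sketch follows; the surrounding setup (EIGEN_USE_THREADS, Eigen::ThreadPool and the ThreadPoolDevice constructor) is standard Tensor-module boilerplate rather than part of this patch, and the per-element cost figures are rough guesses.

#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include <vector>

int main() {
  Eigen::ThreadPool pool(4);                 // 4 worker threads
  Eigen::ThreadPoolDevice device(&pool, 4);  // device backed by that pool

  std::vector<float> x(1 << 20, 1.0f), y(1 << 20, 0.0f);
  // Rough per-element cost: one float loaded, one float stored, one cycle.
  const Eigen::TensorOpCost cost(sizeof(float), sizeof(float), 1);

  // The functor is called with disjoint [first, last) ranges, possibly from
  // several pool threads at once; parallelFor returns once all are done.
  device.parallelFor(static_cast<Eigen::Index>(x.size()), cost,
                     [&](Eigen::Index first, Eigen::Index last) {
                       for (Eigen::Index i = first; i < last; ++i) {
                         y[i] = 2.0f * x[i];
                       }
                     });
  return 0;
}
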