author    Rasmus Munk Larsen <rmlarsen@google.com>    2018-11-13 17:10:30 +0000
committer Rasmus Munk Larsen <rmlarsen@google.com>    2018-11-13 17:10:30 +0000
commit    72928a2c8afefd4d3eccf636cfd7b1d6aba3fd02 (patch)
tree      c78d340ff0f7f8922053cc179d3d98fe7f8b5309
parent    f67b19a884768df88107d44e28542ae5dde677d2 (diff)
parent    cda479d626d13b8c55f27fc462de2a85d668d190 (diff)
Merged in rmlarsen/eigen2 (pull request PR-543)
Add parallel memcpy to TensorThreadPoolDevice in Eigen, but limit the number
of threads to 4, beyond which we just seem to be wasting CPU cycles as the
threads contend for memory bandwidth.

Approved-by: Eugene Zhulenev <ezhulenev@google.com>
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h  28
1 file changed, 28 insertions(+), 0 deletions(-)
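To make the scheme in the commit message concrete, the following is a minimal standalone sketch of the same block-splitting idea, written against plain std::thread rather than Eigen's ThreadPoolDevice, TensorCostModel and Barrier. The function name parallel_copy, the thread-count heuristic based on n / kMinBlockSize, and the constants are illustrative assumptions, not part of the patch.

// Sketch: split a large memcpy into at most 4 roughly equal blocks.
// The first block runs on the calling thread; the rest run on workers.
#include <algorithm>
#include <cstring>
#include <thread>
#include <vector>

void parallel_copy(void* dst, const void* src, size_t n) {
  const size_t kMinBlockSize = 32768;  // small copies stay single-threaded
  const size_t kMaxThreads = 4;        // beyond this, memory bandwidth saturates
  const size_t num_threads =
      std::min(kMaxThreads, std::max<size_t>(1, n / kMinBlockSize));
  if (n <= kMinBlockSize || num_threads == 1) {
    std::memcpy(dst, src, n);
    return;
  }
  const char* src_ptr = static_cast<const char*>(src);
  char* dst_ptr = static_cast<char*>(dst);
  // Ceil division so num_threads blocks always cover all n bytes.
  const size_t blocksize = (n + num_threads - 1) / num_threads;
  std::vector<std::thread> workers;
  for (size_t i = 1; i < num_threads; ++i) {
    workers.emplace_back([=] {
      // Clamp the last block so the copy never runs past the buffer.
      std::memcpy(dst_ptr + i * blocksize, src_ptr + i * blocksize,
                  std::min(blocksize, n - i * blocksize));
    });
  }
  std::memcpy(dst_ptr, src_ptr, blocksize);  // first block on the caller
  for (auto& t : workers) t.join();
}

The actual patch below makes the same split, but sizes num_threads with Eigen's TensorCostModel and hands the worker blocks to the device's thread pool behind a Barrier instead of joining std::threads.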
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index 413b94579..3b87b114d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -77,7 +77,35 @@ struct ThreadPoolDevice {
   }
 
   EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+#ifdef __ANDROID__
     ::memcpy(dst, src, n);
+#else
+    // TODO(rmlarsen): Align blocks on cache lines.
+    // We have observed that going beyond 4 threads usually just wastes
+    // CPU cycles due to the threads competing for memory bandwidth, so we
+    // statically schedule at most 4 block copies here.
+    const size_t kMinBlockSize = 32768;
+    typedef TensorCostModel<ThreadPoolDevice> CostModel;
+    const size_t num_threads = CostModel::numThreads(n, TensorOpCost(1.0, 1.0, 0), 4);
+    if (n <= kMinBlockSize || num_threads == 1) {
+      ::memcpy(dst, src, n);
+    } else {
+      const char* src_ptr = static_cast<const char*>(src);
+      char* dst_ptr = static_cast<char*>(dst);
+      const size_t blocksize = (n + (num_threads - 1)) / num_threads;
+      Barrier barrier(num_threads - 1);
+      // Launch the last 3 blocks on worker threads.
+      for (size_t i = 1; i < num_threads; ++i) {
+        enqueue_with_barrier(&barrier, [n, i, src_ptr, dst_ptr, blocksize] {
+          ::memcpy(dst_ptr + i * blocksize, src_ptr + i * blocksize,
+                   numext::mini(blocksize, n - (i * blocksize)));
+        });
+      }
+      // Launch the first block on the main thread.
+      ::memcpy(dst_ptr, src_ptr, blocksize);
+      barrier.Wait();
+    }
+#endif
   }
   EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
     memcpy(dst, src, n);
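For context, a hedged usage sketch of how a copy reaches this code path through the device API of the unsupported Tensor module; the pool size of 4 and the buffer size are arbitrary choices for illustration.

// Usage sketch: route a large copy through ThreadPoolDevice so the
// parallel path added above is taken for sizes well beyond kMinBlockSize.
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include <vector>

int main() {
  Eigen::ThreadPool pool(4);                 // worker threads
  Eigen::ThreadPoolDevice device(&pool, 4);  // device backed by the pool
  std::vector<float> src(1 << 20, 1.0f), dst(1 << 20);
  // Dispatched as up to 4 block copies: 3 on the pool, 1 on the caller.
  device.memcpy(dst.data(), src.data(), src.size() * sizeof(float));
  return 0;
}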