diff options
author | 2016-05-10 10:55:24 -0800 | |
---|---|---|
committer | 2016-05-10 12:02:05 -0700 | |
commit | 8e37ef50c73d6b3f3ec530a3393fe2cba5ad3a30 (patch) | |
tree | 40cb51260b94963072dfd3102c2e70d8a5476cc0 /tensorflow/core/util/work_sharder.cc | |
parent | aec09b6cb61c63a28e01f6b413499602e224da2f (diff) |
tensorflow: finer-grained Shard parallelization
Provide finer-grained Shard parallelization for the new non-blocking thread pool.
This closely resembles the parallel-for algorithm in Eigen's executors:
we choose a good block size based on the amount of work and parallel efficiency,
and then recursively split the range in halves.
Benchmark                Time(ns): old       new     diff   CPU(ns): old        new     diff
============================================================================================
cpu_RandomUniform/1M            647541    301220  -53.48%        9576553   10553619  +10.20%
cpu_RandomUniform/2M           1116118    495724  -55.58%       18285896   19635580   +7.38%
cpu_RandomUniform/8M           2691384   1671594  -37.89%       67830397   72105713   +6.30%
cpu_RandomNormal/1M            2126780   1269039  -40.33%       46887528   53197040  +13.46%
cpu_RandomNormal/2M            3529118   2350399  -33.40%       94337705  104481933  +10.75%
cpu_RandomNormal/8M           12429704   8984079  -27.72%      383278086  410900286   +7.21%
cpu_TruncatedNormal/1M         2513508   1504161  -40.16%       59181937   66096798  +11.68%
cpu_TruncatedNormal/2M         4012258   2890855  -27.95%      122164300  129760843   +6.22%
cpu_TruncatedNormal/8M        17628696  11159204  -36.70%      465946492  513345503  +10.17%
TESTED:
- passed opensource_build
http://ci.tensorflow.org/view/Internal/job/tensorflow-cl-presubmit-multijob/281/
Change: 121971279
Diffstat (limited to 'tensorflow/core/util/work_sharder.cc')
-rw-r--r-- | tensorflow/core/util/work_sharder.cc | 4 |
1 files changed, 4 insertions, 0 deletions
diff --git a/tensorflow/core/util/work_sharder.cc b/tensorflow/core/util/work_sharder.cc
index 046d69a939..38346d1716 100644
--- a/tensorflow/core/util/work_sharder.cc
+++ b/tensorflow/core/util/work_sharder.cc
@@ -22,6 +22,9 @@ namespace tensorflow {
 
 void Shard(int num_workers, thread::ThreadPool* workers, int64 total,
            int64 cost_per_unit, std::function<void(int64, int64)> work) {
+#ifdef EIGEN_USE_NONBLOCKING_THREAD_POOL
+  workers->ParallelFor(total, cost_per_unit, work);
+#else
   CHECK_GE(total, 0);
   if (total == 0) {
     return;
@@ -68,6 +71,7 @@ void Shard(int num_workers, thread::ThreadPool* workers, int64 total,
   // Inline execute the 1st shard.
   work(0, std::min(block_size, total));
   counter.Wait();
+#endif
 }
 
 }  // end namespace tensorflow