From 3b784e52ef8dc08977e07300aa316d2d01cb4e3b Mon Sep 17 00:00:00 2001 From: Jonathan Hseu Date: Thu, 8 Sep 2016 13:45:24 -0800 Subject: Adjust the cost_per_unit for Shard() in SparseFeatureCrossOp to reduce Context overhead for work where few crosses are created. Document the cost_per_unit argument of Shard() to mention the impact of passing an incorrect estimate. Without this fix, Shard()/eigen's ParallelFor splits the work into shards of size 1. Change: 132603143 --- tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc | 5 ++--- tensorflow/core/lib/core/threadpool.h | 6 ++++++ tensorflow/core/util/work_sharder.h | 6 ++++++ 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc b/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc index e528e75c2e..a81ccff431 100644 --- a/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc +++ b/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc @@ -328,9 +328,8 @@ class SparseFeatureCrossOp : public OpKernel { }; auto* worker_threads = context->device()->tensorflow_cpu_worker_threads(); - // TODO(zakaria): optimize kCostPerUnit cross on column id should be - // treated cheaper. - const int kCostPerUnit = 50000 * indices_list_in.size(); + // TODO(zakaria): optimize kCostPerUnit + const int kCostPerUnit = 5000 * indices_list_in.size(); Shard(worker_threads->num_threads, worker_threads->workers, batch_size, kCostPerUnit, do_work); } diff --git a/tensorflow/core/lib/core/threadpool.h b/tensorflow/core/lib/core/threadpool.h index ffdb25d674..371339598d 100644 --- a/tensorflow/core/lib/core/threadpool.h +++ b/tensorflow/core/lib/core/threadpool.h @@ -51,6 +51,12 @@ class ThreadPool { // having roughly "cost_per_unit" cost, in cycles. Each unit of work is // indexed 0, 1, ..., total - 1. Each shard contains 1 or more units of work // and the total cost of each shard is roughly the same. 
+ // + // "cost_per_unit" is an estimate of the number of CPU cycles (or nanoseconds + if not CPU-bound) to complete a unit of work. Overestimating creates too + many shards and CPU time will be dominated by per-shard overhead, such as + Context creation. Underestimating may not fully make use of the specified + parallelism. void ParallelFor(int64 total, int64 cost_per_unit, std::function<void(int64, int64)> fn); diff --git a/tensorflow/core/util/work_sharder.h b/tensorflow/core/util/work_sharder.h index d2e3caad64..451da98b6b 100644 --- a/tensorflow/core/util/work_sharder.h +++ b/tensorflow/core/util/work_sharder.h @@ -31,6 +31,12 @@ namespace tensorflow { // limit). A common configuration is that "workers" is a thread pool // with at least "max_parallelism" threads. // +// "cost_per_unit" is an estimate of the number of CPU cycles (or nanoseconds +// if not CPU-bound) to complete a unit of work. Overestimating creates too +// many shards and CPU time will be dominated by per-shard overhead, such as +// Context creation. Underestimating may not fully make use of the specified +// parallelism. +// // "work" should be a callable taking (int64, int64) arguments. // work(start, limit) computes the work units from [start, // limit), i.e., [start, limit) is a shard. -- cgit v1.2.3