author    Jonathan Hseu <jhseu@google.com>    2016-09-08 13:45:24 -0800
committer TensorFlower Gardener <gardener@tensorflow.org>    2016-09-08 14:48:00 -0700
commit    3b784e52ef8dc08977e07300aa316d2d01cb4e3b (patch)
tree      3e3e22776c7c3093063a865cf0b942f7257b23e3
parent    5fe484d31b518b6ed172158e4ed2c4d769997c0f (diff)
Adjust the cost_per_unit for Shard() in SparseFeatureCrossOp to reduce Context overhead for work where few crosses are created. Document the cost_per_unit argument of Shard() to mention the impact of passing an incorrect estimate. Without this fix, Shard()/Eigen's ParallelFor splits the work into shards of size 1.

Change: 132603143
-rw-r--r--  tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc  5
-rw-r--r--  tensorflow/core/lib/core/threadpool.h  6
-rw-r--r--  tensorflow/core/util/work_sharder.h  6
3 files changed, 14 insertions, 3 deletions
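For context on the "shards of size 1" claim, here is a minimal sketch of a cost-based sharding heuristic. This is a simplified model only: PickShardSize and the kMinTaskCost constant are illustrative inventions, not the actual Eigen cost model, which is more involved.

#include <algorithm>
#include <cstdint>

// Simplified model: a shard should carry at least kMinTaskCost cycles of
// work before it is worth dispatching as its own task.
constexpr int64_t kMinTaskCost = 40000;  // illustrative constant

int64_t PickShardSize(int64_t total, int64_t cost_per_unit) {
  // Units per shard so that a single shard costs at least kMinTaskCost.
  const int64_t units = std::max<int64_t>(1, kMinTaskCost / cost_per_unit);
  return std::min(total, units);
}

Under this model, the old estimate of 50000 * indices_list_in.size() makes every single unit look more expensive than the minimum task cost, so each row of the batch becomes its own shard and per-shard setup such as Context creation dominates. The smaller 5000 * indices_list_in.size() estimate lets cheap batches run in fewer, larger shards.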
diff --git a/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc b/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc
index e528e75c2e..a81ccff431 100644
--- a/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc
+++ b/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc
@@ -328,9 +328,8 @@ class SparseFeatureCrossOp : public OpKernel {
};
auto* worker_threads = context->device()->tensorflow_cpu_worker_threads();
- // TODO(zakaria): optimize kCostPerUnit cross on column id should be
- // treated cheaper.
- const int kCostPerUnit = 50000 * indices_list_in.size();
+ // TODO(zakaria): optimize kCostPerUnit
+ const int kCostPerUnit = 5000 * indices_list_in.size();
Shard(worker_threads->num_threads, worker_threads->workers, batch_size,
kCostPerUnit, do_work);
}
diff --git a/tensorflow/core/lib/core/threadpool.h b/tensorflow/core/lib/core/threadpool.h
index ffdb25d674..371339598d 100644
--- a/tensorflow/core/lib/core/threadpool.h
+++ b/tensorflow/core/lib/core/threadpool.h
@@ -51,6 +51,12 @@ class ThreadPool {
// having roughly "cost_per_unit" cost, in cycles. Each unit of work is
// indexed 0, 1, ..., total - 1. Each shard contains 1 or more units of work
// and the total cost of each shard is roughly the same.
+ //
+ // "cost_per_unit" is an estimate of the number of CPU cycles (or nanoseconds
+ // if not CPU-bound) to complete a unit of work. Overestimating creates too
+ // many shards and CPU time will be dominated by per-shard overhead, such as
+ // Context creation. Underestimating may not fully make use of the specified
+ // parallelism.
void ParallelFor(int64 total, int64 cost_per_unit,
std::function<void(int64, int64)> fn);
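A usage sketch of the contract documented above (SquareAll, the pool name, the thread count, and the cost figure are all illustrative, not from this commit):

#include <vector>

#include "tensorflow/core/lib/core/threadpool.h"
#include "tensorflow/core/platform/env.h"

// Square every element in parallel. kCostPerUnit is a rough guess of the
// cycles per element (a multiply and a store); wildly overestimating it
// would cut the range into needlessly tiny shards, as the comment warns.
void SquareAll(std::vector<double>* v) {
  tensorflow::thread::ThreadPool pool(tensorflow::Env::Default(),
                                      "square_demo", /*num_threads=*/4);
  const tensorflow::int64 kCostPerUnit = 10;  // assumed estimate
  pool.ParallelFor(static_cast<tensorflow::int64>(v->size()), kCostPerUnit,
                   [v](tensorflow::int64 start, tensorflow::int64 limit) {
                     for (tensorflow::int64 i = start; i < limit; ++i) {
                       (*v)[i] *= (*v)[i];
                     }
                   });
}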
diff --git a/tensorflow/core/util/work_sharder.h b/tensorflow/core/util/work_sharder.h
index d2e3caad64..451da98b6b 100644
--- a/tensorflow/core/util/work_sharder.h
+++ b/tensorflow/core/util/work_sharder.h
@@ -31,6 +31,12 @@ namespace tensorflow {
// limit). A common configuration is that "workers" is a thread pool
// with at least "max_parallelism" threads.
//
+// "cost_per_unit" is an estimate of the number of CPU cycles (or nanoseconds
+// if not CPU-bound) to complete a unit of work. Overestimating creates too
+// many shards and CPU time will be dominated by per-shard overhead, such as
+// Context creation. Underestimating may not fully make use of the specified
+// parallelism.
+//
// "work" should be a callable taking (int64, int64) arguments.
// work(start, limit) computes the work units from [start,
// limit), i.e., [start, limit) is a shard.
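And a corresponding sketch for Shard() itself, mirroring how the kernel above calls it (ProcessBatch and the per-row cost of 500 are hypothetical, for illustration only):

#include "tensorflow/core/lib/core/threadpool.h"
#include "tensorflow/core/util/work_sharder.h"

// Process a batch row by row. The 500-cycle estimate is a guess at the
// per-row cost; per the comment above, a large overestimate would fragment
// the batch into size-1 shards whose setup outweighs the work itself.
void ProcessBatch(int max_parallelism, tensorflow::thread::ThreadPool* workers,
                  tensorflow::int64 batch_size) {
  auto do_work = [](tensorflow::int64 start, tensorflow::int64 limit) {
    for (tensorflow::int64 row = start; row < limit; ++row) {
      // ... per-row work goes here ...
    }
  };
  tensorflow::Shard(max_parallelism, workers, batch_size,
                    /*cost_per_unit=*/500, do_work);
}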