From 3b784e52ef8dc08977e07300aa316d2d01cb4e3b Mon Sep 17 00:00:00 2001 From: Jonathan Hseu Date: Thu, 8 Sep 2016 13:45:24 -0800 Subject: Adjust the cost_per_unit for Shard() in SparseFeatureCrossOp to reduce Context overhead for work where few crosses are created. Document the cost_per_unit argument of Shard() to mention the impact of passing an incorrect estimate. Without this fix, Shard()/eigen's ParallelFor splits the work into shards of size 1. Change: 132603143 --- tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc | 5 ++--- tensorflow/core/lib/core/threadpool.h | 6 ++++++ tensorflow/core/util/work_sharder.h | 6 ++++++ 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc b/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc index e528e75c2e..a81ccff431 100644 --- a/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc +++ b/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc @@ -328,9 +328,8 @@ class SparseFeatureCrossOp : public OpKernel { }; auto* worker_threads = context->device()->tensorflow_cpu_worker_threads(); - // TODO(zakaria): optimize kCostPerUnit cross on column id should be - // treated cheaper. - const int kCostPerUnit = 50000 * indices_list_in.size(); + // TODO(zakaria): optimize kCostPerUnit + const int kCostPerUnit = 5000 * indices_list_in.size(); Shard(worker_threads->num_threads, worker_threads->workers, batch_size, kCostPerUnit, do_work); } diff --git a/tensorflow/core/lib/core/threadpool.h b/tensorflow/core/lib/core/threadpool.h index ffdb25d674..371339598d 100644 --- a/tensorflow/core/lib/core/threadpool.h +++ b/tensorflow/core/lib/core/threadpool.h @@ -51,6 +51,12 @@ class ThreadPool { // having roughly "cost_per_unit" cost, in cycles. Each unit of work is // indexed 0, 1, ..., total - 1. Each shard contains 1 or more units of work // and the total cost of each shard is roughly the same. 
+ // + // "cost_per_unit" is an estimate of the number of CPU cycles (or nanoseconds + if not CPU-bound) to complete a unit of work. Overestimating creates too + many shards and CPU time will be dominated by per-shard overhead, such as + Context creation. Underestimating may not fully make use of the specified + parallelism. void ParallelFor(int64 total, int64 cost_per_unit, std::function<void(int64, int64)> fn); diff --git a/tensorflow/core/util/work_sharder.h b/tensorflow/core/util/work_sharder.h index d2e3caad64..451da98b6b 100644 --- a/tensorflow/core/util/work_sharder.h +++ b/tensorflow/core/util/work_sharder.h @@ -31,6 +31,12 @@ namespace tensorflow { // limit). A common configuration is that "workers" is a thread pool // with at least "max_parallelism" threads. // +// "cost_per_unit" is an estimate of the number of CPU cycles (or nanoseconds +// if not CPU-bound) to complete a unit of work. Overestimating creates too +// many shards and CPU time will be dominated by per-shard overhead, such as +// Context creation. Underestimating may not fully make use of the specified +// parallelism. +// // "work" should be a callable taking (int64, int64) arguments. // work(start, limit) computes the work units from [start, // limit), i.e., [start, limit) is a shard. -- cgit v1.2.3