author    Jonathan Hseu <jhseu@google.com>    2016-09-08 13:45:24 -0800
committer TensorFlower Gardener <gardener@tensorflow.org>    2016-09-08 14:48:00 -0700
commit    3b784e52ef8dc08977e07300aa316d2d01cb4e3b (patch)
tree      3e3e22776c7c3093063a865cf0b942f7257b23e3
parent    5fe484d31b518b6ed172158e4ed2c4d769997c0f (diff)
Adjust the cost_per_unit for Shard() in SparseFeatureCrossOp to reduce Context overhead for work where few crosses are created. Document the cost_per_unit argument of Shard() to mention the impact of passing an incorrect estimate. Without this fix, Shard()/Eigen's ParallelFor splits the work into shards of size 1.

Change: 132603143
-rw-r--r--  tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc  5
-rw-r--r--  tensorflow/core/lib/core/threadpool.h  6
-rw-r--r--  tensorflow/core/util/work_sharder.h  6
3 files changed, 14 insertions, 3 deletions
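For context on the "shards of size 1" claim, here is a minimal sketch of a cost-based sharding heuristic. This is a simplified model only: PickShardSize and the kMinTaskCost constant are illustrative inventions, not the actual Eigen cost model, which is more involved.

#include <algorithm>
#include <cstdint>

// Simplified model: a shard should carry at least kMinTaskCost cycles of
// work before it is worth dispatching as its own task.
constexpr int64_t kMinTaskCost = 40000;  // illustrative constant

int64_t PickShardSize(int64_t total, int64_t cost_per_unit) {
  // Units per shard so that a single shard costs at least kMinTaskCost.
  const int64_t units = std::max<int64_t>(1, kMinTaskCost / cost_per_unit);
  return std::min(total, units);
}

Under this model, the old estimate of 50000 * indices_list_in.size() makes every single unit look more expensive than the minimum task cost, so each row of the batch becomes its own shard and per-shard setup such as Context creation dominates. The smaller 5000 * indices_list_in.size() estimate lets cheap batches run in fewer, larger shards.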
diff --git a/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc b/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc
index e528e75c2e..a81ccff431 100644
--- a/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc
+++ b/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc
@@ -328,9 +328,8 @@ class SparseFeatureCrossOp : public OpKernel {
};
auto* worker_threads = context->device()->tensorflow_cpu_worker_threads();
- // TODO(zakaria): optimize kCostPerUnit cross on column id should be
- // treated cheaper.
- const int kCostPerUnit = 50000 * indices_list_in.size();
+ // TODO(zakaria): optimize kCostPerUnit
+ const int kCostPerUnit = 5000 * indices_list_in.size();
Shard(worker_threads->num_threads, worker_threads->workers, batch_size,
kCostPerUnit, do_work);
}
diff --git a/tensorflow/core/lib/core/threadpool.h b/tensorflow/core/lib/core/threadpool.h
index ffdb25d674..371339598d 100644
--- a/tensorflow/core/lib/core/threadpool.h
+++ b/tensorflow/core/lib/core/threadpool.h
@@ -51,6 +51,12 @@ class ThreadPool {
// having roughly "cost_per_unit" cost, in cycles. Each unit of work is
// indexed 0, 1, ..., total - 1. Each shard contains 1 or more units of work
// and the total cost of each shard is roughly the same.
+ //
+ // "cost_per_unit" is an estimate of the number of CPU cycles (or nanoseconds
+ // if not CPU-bound) to complete a unit of work. Overestimating creates too
+ // many shards and CPU time will be dominated by per-shard overhead, such as
+ // Context creation. Underestimating may not fully make use of the specified
+ // parallelism.
void ParallelFor(int64 total, int64 cost_per_unit,
std::function<void(int64, int64)> fn);
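A usage sketch of the contract documented above (SquareAll, the pool name, the thread count, and the cost figure are all illustrative, not from this commit):

#include <vector>

#include "tensorflow/core/lib/core/threadpool.h"
#include "tensorflow/core/platform/env.h"

// Square every element in parallel. kCostPerUnit is a rough guess of the
// cycles per element (a multiply and a store); wildly overestimating it
// would cut the range into needlessly tiny shards, as the comment warns.
void SquareAll(std::vector<double>* v) {
  tensorflow::thread::ThreadPool pool(tensorflow::Env::Default(),
                                      "square_demo", /*num_threads=*/4);
  const tensorflow::int64 kCostPerUnit = 10;  // assumed estimate
  pool.ParallelFor(static_cast<tensorflow::int64>(v->size()), kCostPerUnit,
                   [v](tensorflow::int64 start, tensorflow::int64 limit) {
                     for (tensorflow::int64 i = start; i < limit; ++i) {
                       (*v)[i] *= (*v)[i];
                     }
                   });
}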
diff --git a/tensorflow/core/util/work_sharder.h b/tensorflow/core/util/work_sharder.h
index d2e3caad64..451da98b6b 100644
--- a/tensorflow/core/util/work_sharder.h
+++ b/tensorflow/core/util/work_sharder.h
@@ -31,6 +31,12 @@ namespace tensorflow {
// limit). A common configuration is that "workers" is a thread pool
// with at least "max_parallelism" threads.
//
+// "cost_per_unit" is an estimate of the number of CPU cycles (or nanoseconds
+// if not CPU-bound) to complete a unit of work. Overestimating creates too
+// many shards and CPU time will be dominated by per-shard overhead, such as
+// Context creation. Underestimating may not fully make use of the specified
+// parallelism.
+//
// "work" should be a callable taking (int64, int64) arguments.
// work(start, limit) computes the work units from [start,
// limit), i.e., [start, limit) is a shard.
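And a corresponding sketch for Shard() itself, mirroring how the kernel above calls it (ProcessBatch and the per-row cost of 500 are hypothetical, for illustration only):

#include "tensorflow/core/lib/core/threadpool.h"
#include "tensorflow/core/util/work_sharder.h"

// Process a batch row by row. The 500-cycle estimate is a guess at the
// per-row cost; per the comment above, a large overestimate would fragment
// the batch into size-1 shards whose setup outweighs the work itself.
void ProcessBatch(int max_parallelism, tensorflow::thread::ThreadPool* workers,
                  tensorflow::int64 batch_size) {
  auto do_work = [](tensorflow::int64 start, tensorflow::int64 limit) {
    for (tensorflow::int64 row = start; row < limit; ++row) {
      // ... per-row work goes here ...
    }
  };
  tensorflow::Shard(max_parallelism, workers, batch_size,
                    /*cost_per_unit=*/500, do_work);
}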