Avoid allocation of a std::function<> object per node executed, as well as a hash-table lookup per allocated output.

Instead, we now pre-compute the AllocatorAttributes for every output tensor in the graph into an array (indexed by a base number per node + output index), and have changed OpKernelContext::Params to provide a pointer to the base of the array for the node, rather than providing a std::function<>. Updated test code to avoid so much code duplication when setting up the OpKernelContext::Params object in various places. Used gtl::InlinedVector<...> instead of std::vector<...> in a few places in tensorflow/core/kernels/reduction_ops_common.h. This didn't make a measurable change in overall performance, but allocations and time spent in the std::function destructor code were significantly reduced. Change: 112103260
author: A. Unique TensorFlower <nobody@tensorflow.org> 2016-01-13 17:30:08 -0800
committer: Vijay Vasudevan <vrv@google.com> 2016-01-13 17:30:08 -0800
commit: 6dbfb95100b73ad26ebebb9be9c0429dc0cece8a (patch)
tree: 83a15ef5538529dd3a4fece1276388c20f9260bb /tensorflow/core/kernels/segment_reduction_ops_test.cc
parent: cbdf278dbd998aab57898763605f6863e9b42b76 (diff)
1 files changed, 3 insertions, 7 deletions
diff --git a/tensorflow/core/kernels/segment_reduction_ops_test.cc b/tensorflow/core/kernels/segment_reduction_ops_test.cc
index 00d8e13338..4121ec232b 100644
--- a/tensorflow/core/kernels/segment_reduction_ops_test.cc
+++ b/tensorflow/core/kernels/segment_reduction_ops_test.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/graph/testlib.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/public/tensor.h"
@@ -72,13 +73,8 @@ static void BM_SegmentReduction(int iters, string reduction, Index num_rows,
   params.frame_iter = FrameAndIter(0, 0);
   params.inputs = &reduction_inputs;
   params.op_kernel = reduction_op.get();
-  params.output_alloc_attr = [&device, &reduction_op, &params](int index) {
-    AllocatorAttributes attr;
-    const bool on_host =
-        (reduction_op->output_memory_types()[index] == HOST_MEMORY);
-    attr.set_on_host(on_host);
-    return attr;
-  };
+  std::vector<AllocatorAttributes> attrs;
+  test::SetOutputAttrs(&params, &attrs);
 
   std::unique_ptr<OpKernelContext> reduction_context(
       new OpKernelContext(params));
author	A. Unique TensorFlower <nobody@tensorflow.org>	2016-01-13 17:30:08 -0800
committer	Vijay Vasudevan <vrv@google.com>	2016-01-13 17:30:08 -0800
commit	6dbfb95100b73ad26ebebb9be9c0429dc0cece8a (patch)
tree	83a15ef5538529dd3a4fece1276388c20f9260bb /tensorflow/core/kernels/segment_reduction_ops_test.cc
parent	cbdf278dbd998aab57898763605f6863e9b42b76 (diff)