diff options
author | 2016-05-04 08:14:06 -0800 | |
---|---|---|
committer | 2016-05-04 09:21:36 -0700 | |
commit | f7a662e595f1631c12d15173344ee0f50d2cd9f8 (patch) | |
tree | ab87e6527dadcc2b7b9367ce70c62052bc343cd5 /tensorflow | |
parent | a1bc10f9e28cd3d33e91bfaedd8199e4590f2893 (diff) |
Reduce overhead of ExecutorState::ProcessOutputs:
- Cache result of LogMemory::IsEnabled.
- Init Tensor for logging only if logging is enabled.
- Add move constructor to Tensor and use it in ProcessOutputs.
Also added benchmark for copy vs move in tensor_test.cc
Change: 121487202
Diffstat (limited to 'tensorflow')
-rw-r--r-- | tensorflow/core/BUILD | 1 | ||||
-rw-r--r-- | tensorflow/core/common_runtime/executor.cc | 40 | ||||
-rw-r--r-- | tensorflow/core/framework/tensor.cc | 5 | ||||
-rw-r--r-- | tensorflow/core/framework/tensor.h | 26 | ||||
-rw-r--r-- | tensorflow/core/framework/tensor_shape.h | 28 | ||||
-rw-r--r-- | tensorflow/core/framework/tensor_shape_test.cc | 9 | ||||
-rw-r--r-- | tensorflow/core/framework/tensor_test.cc | 48 |
7 files changed, 135 insertions(+), 22 deletions(-)
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 963ea1fc4d..792142af62 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -786,6 +786,7 @@ cc_library( "lib/gtl/edit_distance.h", "lib/gtl/int_type.h", "lib/gtl/iterator_range.h", + "lib/gtl/manual_constructor.h", "lib/gtl/top_n.h", "lib/io/iterator.h", "lib/io/match.h", diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index 0c322cac76..b73f7c2aa9 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -44,6 +44,7 @@ limitations under the License. #include "tensorflow/core/lib/core/notification.h" #include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/lib/gtl/manual_constructor.h" #include "tensorflow/core/lib/gtl/stl_util.h" #include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/lib/strings/str_util.h" @@ -644,6 +645,9 @@ class ExecutorState { const bool vlog_; // true if VLOG_IS_ON(1). Used to check vlog cheaply. + // true if LogMemory::IsEnabled(). Used to check memory enabled cheaply. + const bool log_memory_; + int64 step_id_; // Not owned. Rendezvous* rendezvous_; @@ -796,6 +800,7 @@ class ExecutorState { ExecutorState::ExecutorState(const Executor::Args& args, ExecutorImpl* impl) : vlog_(VLOG_IS_ON(1)), + log_memory_(LogMemory::IsEnabled()), step_id_(args.step_id), rendezvous_(args.rendezvous), session_state_(args.session_state), @@ -1260,9 +1265,6 @@ Status ExecutorState::ProcessOutputs(const NodeItem& item, OpKernelContext* ctx, Entry* out = &((*outputs)[i]); out->has_value = true; - // This value is filled in below if LogMemory::IsEnabled. - Tensor value_to_log; - // Set the device context of the output entry. 
out->device_context = device_context; @@ -1273,23 +1275,31 @@ Status ExecutorState::ProcessOutputs(const NodeItem& item, OpKernelContext* ctx, DataType dtype = val->dtype(); if (val.is_ref()) dtype = MakeRefType(dtype); if (dtype == item.output_type(i)) { + if (stats_collector_ && val.tensor->IsInitialized()) { + nodestats::SetOutput(stats, i, val.tensor); + } if (val.is_ref()) { out->ref = val.tensor; out->ref_mu = val.mutex_if_ref; - if (LogMemory::IsEnabled()) { - // Dereference the tensor under the lock. - mutex_lock l(*out->ref_mu); - value_to_log = *out->ref; + if (log_memory_) { + Tensor to_log; + { + // Dereference the tensor under the lock. + mutex_lock l(*out->ref_mu); + to_log = *out->ref; + } + LogMemory::RecordTensorOutput(ctx->op_kernel().name(), + ctx->step_id(), i, to_log); } } else { - out->val = *val.tensor; - if (LogMemory::IsEnabled()) { - value_to_log = out->val; + // NOTE that std::move is used here, so val.tensor goes to + // uninitialized state (val.tensor->IsInitialized return false). 
+ out->val = std::move(*val.tensor); + if (log_memory_) { + LogMemory::RecordTensorOutput(ctx->op_kernel().name(), + ctx->step_id(), i, out->val); } } - if (stats_collector_ && val.tensor->IsInitialized()) { - nodestats::SetOutput(stats, i, val.tensor); - } } else { s.Update(errors::Internal("Output ", i, " of type ", DataTypeString(dtype), @@ -1297,10 +1307,6 @@ Status ExecutorState::ProcessOutputs(const NodeItem& item, OpKernelContext* ctx, DataTypeString(item.output_type(i)), " for node ", SummarizeNodeDef(node->def()))); } - if (LogMemory::IsEnabled()) { - LogMemory::RecordTensorOutput(ctx->op_kernel().name(), ctx->step_id(), - i, value_to_log); - } } if (!val.is_ref()) { // If OpKernelContext returns outputs via pass-by-value, we diff --git a/tensorflow/core/framework/tensor.cc b/tensorflow/core/framework/tensor.cc index 1291fae3ae..ea528b958b 100644 --- a/tensorflow/core/framework/tensor.cc +++ b/tensorflow/core/framework/tensor.cc @@ -389,11 +389,6 @@ Tensor::Tensor() : Tensor(DT_FLOAT) {} Tensor::Tensor(DataType type) : shape_({0}), buf_(nullptr) { set_dtype(type); } -Tensor::Tensor(const Tensor& other) : shape_(other.shape()), buf_(other.buf_) { - set_dtype(other.dtype()); - RefIfNonNull(buf_); -} - Tensor::Tensor(DataType type, const TensorShape& shape, TensorBuffer* buf) : shape_(shape), buf_(buf) { set_dtype(type); diff --git a/tensorflow/core/framework/tensor.h b/tensorflow/core/framework/tensor.h index a8578e789d..f183a0aa67 100644 --- a/tensorflow/core/framework/tensor.h +++ b/tensorflow/core/framework/tensor.h @@ -84,6 +84,11 @@ class Tensor { Tensor(const Tensor& other); /// Copy constructor. + // Move constructor. After this call, <other> is safely destructible and can + // be assigned to, but other calls on it (e.g. shape manipulation) are not + // valid. + Tensor(Tensor&& other); + ~Tensor(); /// Returns the data type. @@ -137,6 +142,9 @@ class Tensor { return *this; } + /// Move operator. See move constructor for details. 
+ Tensor& operator=(Tensor&& other); + /// \brief Copy the other tensor into this tensor and reshape it. /// /// This tensor shares other's underlying storage. Returns `true` @@ -543,6 +551,24 @@ typename TTypes<T, NDIMS>::ConstTensor Tensor::flat_outer_dims() const { return shaped<T, NDIMS>(ComputeFlatOuterDims(NDIMS)); } +inline Tensor::Tensor(const Tensor& other) + : shape_(other.shape()), buf_(other.buf_) { + if (buf_) buf_->Ref(); +} + +inline Tensor::Tensor(Tensor&& other) + : shape_(std::move(other.shape())), buf_(other.buf_) { + other.buf_ = nullptr; +} + +inline Tensor& Tensor::operator=(Tensor&& other) { + shape_ = std::move(other.shape_); + if (buf_) buf_->Unref(); + buf_ = other.buf_; + other.buf_ = nullptr; + return *this; +} + } // namespace tensorflow #endif // TENSORFLOW_CORE_FRAMEWORK_TENSOR_H_ diff --git a/tensorflow/core/framework/tensor_shape.h b/tensorflow/core/framework/tensor_shape.h index 84947e308a..7cc8223016 100644 --- a/tensorflow/core/framework/tensor_shape.h +++ b/tensorflow/core/framework/tensor_shape.h @@ -54,6 +54,13 @@ class TensorShape { TensorShape(const TensorShape& b); void operator=(const TensorShape& b); + /// Move the specified shape. After moving, <b> is safe for destruction and + // can be reassigned into, but its dimensions and number of elements can be + // nonsensical (e.g., negative dimension sizes, or number of elements not + // properly recomputed). + TensorShape(TensorShape&& b); + void operator=(TensorShape&& b); + /// Returns `true` iff `proto` is a valid tensor shape. static bool IsValid(const TensorShapeProto& proto); @@ -308,6 +315,15 @@ inline TensorShape::TensorShape(const TensorShape& b) { } } +inline TensorShape::TensorShape(TensorShape&& b) { + num_elements_ = b.num_elements_; + memcpy(buf(), b.buf(), sizeof(u_.buf)); + // memcpy above Implicitly does: + // set_ndims_byte(b.ndims_byte()); + // set_tag(b.tag()); + b.set_tag(REP16); // other shape no longer owns out-of-line data, if any. 
+} + inline TensorShape::~TensorShape() { if (tag() == REP_OUT_OF_LINE) { DestructorOutOfLine(); @@ -326,6 +342,18 @@ inline void TensorShape::operator=(const TensorShape& b) { } } +inline void TensorShape::operator=(TensorShape&& b) { + if (tag() == REP_OUT_OF_LINE) { + DestructorOutOfLine(); + } + num_elements_ = b.num_elements_; + memcpy(buf(), b.buf(), sizeof(u_.buf)); + // memcpy above Implicitly does: + // set_ndims_byte(b.ndims_byte()); + // set_tag(b.tag()); + b.set_tag(REP16); // other shape no longer owns out-of-line data, if any. +} + } // namespace tensorflow #endif // TENSORFLOW_CORE_FRAMEWORK_TENSOR_SHAPE_H_ diff --git a/tensorflow/core/framework/tensor_shape_test.cc b/tensorflow/core/framework/tensor_shape_test.cc index 5eeaeb61da..f3e1a7cbcb 100644 --- a/tensorflow/core/framework/tensor_shape_test.cc +++ b/tensorflow/core/framework/tensor_shape_test.cc @@ -473,6 +473,15 @@ TEST(TensorShapeTest, Randomized) { fprintf(stderr, "ITERATION %d: %s\n", i, sp.DebugString().c_str()); } EXPECT_EQ(s.num_elements(), sold.num_elements()); + + // Test moves. 
+ TensorShape copy = s; + TensorShape moved(std::move(copy)); + EXPECT_EQ(s, moved); + copy = s; + moved = std::move(copy); + EXPECT_EQ(s, moved); + int64 ne = sold.num_elements(); int r = gen.Uniform(100); if (r < 10) { diff --git a/tensorflow/core/framework/tensor_test.cc b/tensorflow/core/framework/tensor_test.cc index f13c85c188..eed7417d36 100644 --- a/tensorflow/core/framework/tensor_test.cc +++ b/tensorflow/core/framework/tensor_test.cc @@ -112,6 +112,22 @@ void TestCopies(const Tensor& t) { Tensor t2 = test::AsTensor(values, t.shape()); test::ExpectTensorEqual<T>(t, t2); } + { + LOG(INFO) << "Move constructor"; + Tensor t2 = t; + Tensor t3(std::move(t2)); + test::ExpectTensorEqual<T>(t, t3); + EXPECT_TRUE(t3.IsInitialized()); + EXPECT_FALSE(t2.IsInitialized()); + } + { + LOG(INFO) << "Move assignment"; + Tensor t2 = t; + Tensor t3 = std::move(t2); + test::ExpectTensorEqual<T>(t, t3); + EXPECT_TRUE(t3.IsInitialized()); + EXPECT_FALSE(t2.IsInitialized()); + } } TEST(Tensor_Float, Simple) { @@ -791,4 +807,36 @@ TEST(Tensor, EmptyTensorData) { EXPECT_EQ(empty.tensor_data().size(), 0); } +// Benchmark create and destroy a tensor, with an allocated buffer. +static void BM_CreateAndDestroyWithBuf(int iters) { + TensorShape shape({10, 20}); + Allocator* allocator = cpu_allocator(); + while (--iters) { + Tensor a(allocator, DT_FLOAT, shape); + } +} +BENCHMARK(BM_CreateAndDestroyWithBuf); + +// Benchmark create+copy a tensor, with an allocated buffer. +static void BM_CreateAndCopyCtrWithBuf(int iters) { + TensorShape shape({10, 20}); + Allocator* allocator = cpu_allocator(); + while (--iters) { + Tensor a(allocator, DT_FLOAT, shape); + Tensor b(a); + } +} +BENCHMARK(BM_CreateAndCopyCtrWithBuf); + +// Benchmark create+move a tensor, with an allocated buffer. 
+static void BM_CreateAndMoveCtrWithBuf(int iters) { + TensorShape shape({10, 20}); + Allocator* allocator = cpu_allocator(); + while (--iters) { + Tensor a(allocator, DT_FLOAT, shape); + Tensor b(std::move(a)); + } +} +BENCHMARK(BM_CreateAndMoveCtrWithBuf); + } // namespace tensorflow |