author    | 2018-01-13 16:21:55 -0800
committer | 2018-01-13 16:25:51 -0800
commit    | e6ff665dbe4888aa5fdff8f34c44405acca2ddd1 (patch)
tree      | a290fe97b76bebec82ef3b3a76c906acf6b55f41 /tensorflow/contrib
parent    | a4973345351a14a786987cd7f648a99c029fdc1d (diff)
Clean up the allocation logic in the interpreter.
PiperOrigin-RevId: 181865795
Diffstat (limited to 'tensorflow/contrib')
-rw-r--r-- | tensorflow/contrib/lite/BUILD                   |  49
-rw-r--r-- | tensorflow/contrib/lite/arena_planner.cc        | 247
-rw-r--r-- | tensorflow/contrib/lite/arena_planner.h         | 107
-rw-r--r-- | tensorflow/contrib/lite/arena_planner_test.cc   | 472
-rw-r--r-- | tensorflow/contrib/lite/graph_info.h            |  53
-rw-r--r-- | tensorflow/contrib/lite/interpreter.cc          | 304
-rw-r--r-- | tensorflow/contrib/lite/interpreter.h           |  40
-rw-r--r-- | tensorflow/contrib/lite/memory_planner.h        |  45
-rw-r--r-- | tensorflow/contrib/lite/simple_memory_arena.h   |   4
9 files changed, 1096 insertions, 225 deletions
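At a high level, this commit replaces the interpreter's inline arena bookkeeping (AllocateTensorsWhoseSizesAreKnown and the per-tensor refcount vector) with a standalone MemoryPlanner interface and an ArenaPlanner implementation: the allocation/deallocation schedule is planned once from the graph structure, and the interpreter then executes slices of that plan as nodes are prepared. The following is a minimal sketch of the new call sequence, reconstructed from the interpreter changes below; it assumes a GraphInfo adapter such as the InterpreterInfo class added in this commit and a hypothetical helper name, so treat it as an illustration rather than the exact interpreter code.

```cpp
#include <memory>

#include "tensorflow/contrib/lite/arena_planner.h"
#include "tensorflow/contrib/lite/graph_info.h"

// Hypothetical helper showing the ArenaPlanner lifecycle: plan once from the
// graph, then reserve arena space for a range of nodes and resolve pointers.
TfLiteStatus PlanAndAllocate(TfLiteContext* context,
                             std::unique_ptr<tflite::GraphInfo> graph_info,
                             int num_nodes) {
  tflite::ArenaPlanner planner(context, std::move(graph_info));
  // Builds the chronological queue of tensor allocations/deallocations from
  // the graph inputs, outputs, and per-node input refcounts.
  TF_LITE_ENSURE_STATUS(planner.PlanAllocations());
  // Reserves space in the rw/persistent arenas for all tensors touched by
  // nodes [0, num_nodes - 1], commits the arenas, and rewires tensor.data.raw.
  TF_LITE_ENSURE_STATUS(planner.ExecuteAllocations(0, num_nodes - 1));
  return kTfLiteOk;
}
```

In the interpreter itself this is driven incrementally: PrepareOpsAndTensors() calls ExecuteAllocations(next_node_to_prepare_, last_node_prepared) each time op preparation stops at a node with dynamic output tensors, so the plan is consumed piece by piece during Invoke().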
diff --git a/tensorflow/contrib/lite/BUILD b/tensorflow/contrib/lite/BUILD index 3f1b0be1a7..13350c5a43 100644 --- a/tensorflow/contrib/lite/BUILD +++ b/tensorflow/contrib/lite/BUILD @@ -35,6 +35,28 @@ cc_library( hdrs = ["version.h"], ) +cc_library( + name = "arena_planner", + srcs = ["arena_planner.cc"], + hdrs = ["arena_planner.h"], + deps = [ + ":context", + ":graph_info", + ":memory_planner", + ":simple_memory_arena", + ], +) + +cc_test( + name = "arena_planner_test", + size = "small", + srcs = ["arena_planner_test.cc"], + deps = [ + ":arena_planner", + "@com_google_googletest//:gtest", + ], +) + # Main library. No ops are included here. # TODO(aselle): Resolve problems preventing C99 usage. cc_library( @@ -44,6 +66,25 @@ cc_library( ) cc_library( + name = "graph_info", + hdrs = ["graph_info.h"], + deps = [":context"], +) + +cc_library( + name = "memory_planner", + hdrs = ["memory_planner.h"], + deps = [":context"], +) + +cc_library( + name = "simple_memory_arena", + srcs = ["simple_memory_arena.cc"], + hdrs = ["simple_memory_arena.h"], + deps = [":context"], +) + +cc_library( name = "builtin_op_data", hdrs = [ "builtin_op_data.h", @@ -70,7 +111,6 @@ cc_library( "model.cc", "nnapi_delegate.cc", "optional_debug_tools.cc", - "simple_memory_arena.cc", ], hdrs = [ "allocation.h", @@ -80,13 +120,16 @@ cc_library( "model.h", "nnapi_delegate.h", "optional_debug_tools.h", - "simple_memory_arena.h", ], copts = tflite_copts(), deps = [ + ":arena_planner", ":builtin_op_data", ":context", + ":graph_info", + ":memory_planner", ":schema_fbs_version", + ":simple_memory_arena", "//tensorflow/contrib/lite/kernels:gemm_support", "//tensorflow/contrib/lite/nnapi:nnapi_lib", "//tensorflow/contrib/lite/schema:schema_fbs", @@ -134,7 +177,7 @@ cc_test( size = "small", srcs = ["simple_memory_arena_test.cc"], deps = [ - ":framework", + ":simple_memory_arena", "//tensorflow/contrib/lite/testing:util", "@com_google_googletest//:gtest", ], diff --git a/tensorflow/contrib/lite/arena_planner.cc b/tensorflow/contrib/lite/arena_planner.cc new file mode 100644 index 0000000000..bf1bcdd1a7 --- /dev/null +++ b/tensorflow/contrib/lite/arena_planner.cc @@ -0,0 +1,247 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/contrib/lite/arena_planner.h" + +namespace tflite { + +namespace { + +// Memory allocation tuning +constexpr const int kDefaultArenaAlignment = 64; +constexpr const int kDefaultTensorAlignment = 4; + +} // namespace + +struct AllocationInfo { + // The node index requesting this allocation. + int node; + // The tensor index to be allocated or deallocated. 
+ int tensor; + // Whether to allocate or deallocate + enum { ALLOC, DEALLOC } type; +}; + +ArenaPlanner::ArenaPlanner(TfLiteContext* context, + std::unique_ptr<GraphInfo> graph_info) + : context_(context), + graph_info_(std::move(graph_info)), + arena_(kDefaultArenaAlignment), + persistent_arena_(kDefaultArenaAlignment) {} + +ArenaPlanner::~ArenaPlanner() {} + +int64_t ArenaPlanner::BasePointer(TfLiteAllocationType type) { + if (type == kTfLiteArenaRwPersistent) { + return persistent_arena_.BasePointer(); + } + if (type == kTfLiteArenaRw) { + return arena_.BasePointer(); + } + return 0; +} + +TfLiteStatus ArenaPlanner::ResetAllocations() { + TF_LITE_ENSURE_STATUS(arena_.Clear()); + TF_LITE_ENSURE_STATUS(persistent_arena_.Clear()); + allocs_.clear(); + allocs_.resize(graph_info_->num_tensors()); + return kTfLiteOk; +} + +TfLiteStatus ArenaPlanner::PlanAllocations() { + // Invalidate any existing data. + TF_LITE_ENSURE_STATUS(ResetAllocations()); + + // Keeps track of references to each tensor. + std::vector<int> refcounts(graph_info_->num_tensors(), 0); + + // There will be an entry in alloc_queue_ for the allocation of each tensor + // and another for their deallocation. + alloc_queue_.reserve(2 * graph_info_->num_tensors()); + + // We must make sure the output tensors are never overwritten. We do that by + // artificially adding one to their ref-counts so they are never selected + // for deallocation. + for (int tensor_index : graph_info_->outputs()) { + refcounts[tensor_index]++; + } + + // Count references to node input tensors. + for (int i = 0; i < graph_info_->num_nodes(); ++i) { + const TfLiteNode& node = graph_info_->node(i); + TfLiteIntArray* node_inputs = node.inputs; + for (int j = 0; j < node_inputs->size; ++j) { + int tensor_index = node_inputs->data[j]; + if (tensor_index != kOptionalTensor) { + refcounts[tensor_index]++; + } + } + } + + // Queue all graph inputs for allocation. + for (int tensor_index : graph_info_->inputs()) { + if (tensor_index != kOptionalTensor) { + alloc_queue_.push_back({0, tensor_index, AllocationInfo::ALLOC}); + } + } + + // Go through the graph in execution order. + for (int i = 0; i < graph_info_->num_nodes(); ++i) { + const TfLiteNode& node = graph_info_->node(i); + + // First queue output tensors for allocation. + TfLiteIntArray* node_outputs = node.outputs; + for (int j = 0; j < node_outputs->size; ++j) { + int tensor_index = node_outputs->data[j]; + alloc_queue_.push_back({i, tensor_index, AllocationInfo::ALLOC}); + } + + // Then update the ref-counts of the node's inputs, and if necessary queue + // them for deallocation. + TfLiteIntArray* node_inputs = node.inputs; + for (int j = 0; j < node_inputs->size; ++j) { + int tensor_index = node_inputs->data[j]; + if (tensor_index != kOptionalTensor) { + refcounts[tensor_index]--; + if (refcounts[tensor_index] == 0) { + alloc_queue_.push_back({i, tensor_index, AllocationInfo::DEALLOC}); + } + } + } + } + + // Note that graph outputs will never be scheduled for deallocation. We + // could do that here for completeness, but it won't have any effect. + return kTfLiteOk; +} + +TfLiteStatus ArenaPlanner::ExecuteAllocations(int first_node, int last_node) { + TF_LITE_ENSURE_STATUS(CalculateAllocations(first_node, last_node)); + TF_LITE_ENSURE_STATUS(Commit()); + + for (int i = 0; i < graph_info_->num_tensors(); ++i) { + // TODO(ahentz): we could do this only for the tensors that were modified + // in CalculateAllocations(), instead of redoing it for tensors that + // already had proper pointers. 
However we must be very careful, because + // SimpleMemoryArena::Commit() could move the base pointer. + TF_LITE_ENSURE_STATUS(ResolveTensorAllocation(i)); + } + + return kTfLiteOk; +} + +TfLiteStatus ArenaPlanner::Commit() { + TF_LITE_ENSURE_STATUS(arena_.Commit(context_)); + TF_LITE_ENSURE_STATUS(persistent_arena_.Commit(context_)); + return kTfLiteOk; +} + +TfLiteStatus ArenaPlanner::CalculateAllocations(int first_node, int last_node) { + int active_node = first_node; + // When dynamic tensors are present this method is called multiple times. + // The items in the alloc_queue_ referring to nodes before first_node were + // processed previously and should be skipped. Entries after last_node are + // not yet ready to be handled. + for (const auto& alloc_info : alloc_queue_) { + if (alloc_info.node < first_node) continue; + if (alloc_info.node > last_node) break; + if (alloc_info.node == active_node) { + // This is the first allocation/deallocation for a given node. It is + // time to deallocate the previous temporaries and allocate new ones. + if (active_node != first_node) { + TF_LITE_ENSURE_STATUS( + CalculateDeallocationOfInternalTensors(active_node - 1)); + } + TF_LITE_ENSURE_STATUS(CalculateAllocationOfInternalTensors(active_node)); + ++active_node; + } + // Handle the current item. + if (alloc_info.type == AllocationInfo::ALLOC) { + TF_LITE_ENSURE_STATUS(CalculateTensorAllocation(alloc_info.tensor)); + } else { + TF_LITE_ENSURE_STATUS(CalculateTensorDeallocation(alloc_info.tensor)); + } + } + + // Don't forget to deallocate temporaries of last node. + TF_LITE_ENSURE_STATUS( + CalculateDeallocationOfInternalTensors(active_node - 1)); + + return kTfLiteOk; +} + +TfLiteStatus ArenaPlanner::ResolveTensorAllocation(int tensor_index) { + TfLiteTensor& tensor = *graph_info_->tensor(tensor_index); + if (tensor.allocation_type == kTfLiteArenaRw) { + TF_LITE_ENSURE_STATUS( + arena_.ResolveAlloc(context_, allocs_[tensor_index], &tensor.data.raw)); + } + if (tensor.allocation_type == kTfLiteArenaRwPersistent) { + TF_LITE_ENSURE_STATUS(persistent_arena_.ResolveAlloc( + context_, allocs_[tensor_index], &tensor.data.raw)); + } + return kTfLiteOk; +} + +TfLiteStatus ArenaPlanner::CalculateTensorAllocation(int tensor_index) { + TfLiteTensor& tensor = *graph_info_->tensor(tensor_index); + if (tensor.allocation_type == kTfLiteArenaRw) { + TF_LITE_ENSURE_STATUS(arena_.Allocate(context_, kDefaultTensorAlignment, + tensor.bytes, + &allocs_[tensor_index])); + } + if (tensor.allocation_type == kTfLiteArenaRwPersistent) { + TF_LITE_ENSURE_STATUS( + persistent_arena_.Allocate(context_, kDefaultTensorAlignment, + tensor.bytes, &allocs_[tensor_index])); + } + return kTfLiteOk; +} + +TfLiteStatus ArenaPlanner::CalculateTensorDeallocation(int tensor_index) { + TfLiteTensor& tensor = *graph_info_->tensor(tensor_index); + if (tensor.allocation_type == kTfLiteArenaRw) { + TF_LITE_ENSURE_STATUS(arena_.Deallocate(context_, allocs_[tensor_index])); + } + return kTfLiteOk; +} + +TfLiteStatus ArenaPlanner::CalculateAllocationOfInternalTensors( + int node_index) { + if (node_index < graph_info_->num_nodes()) { + const TfLiteNode& node = graph_info_->node(node_index); + TfLiteIntArray* node_temporaries = node.temporaries; + for (int i = 0; i < node_temporaries->size; ++i) { + int tensor_index = node_temporaries->data[i]; + TF_LITE_ENSURE_STATUS(CalculateTensorAllocation(tensor_index)); + } + } + return kTfLiteOk; +} + +TfLiteStatus ArenaPlanner::CalculateDeallocationOfInternalTensors( + int node_index) { + if (node_index 
< graph_info_->num_nodes()) { + const TfLiteNode& node = graph_info_->node(node_index); + TfLiteIntArray* node_temporaries = node.temporaries; + for (int i = 0; i < node_temporaries->size; ++i) { + int tensor_index = node_temporaries->data[i]; + TF_LITE_ENSURE_STATUS(CalculateTensorDeallocation(tensor_index)); + } + } + return kTfLiteOk; +} + +} // namespace tflite diff --git a/tensorflow/contrib/lite/arena_planner.h b/tensorflow/contrib/lite/arena_planner.h new file mode 100644 index 0000000000..bd87414ec3 --- /dev/null +++ b/tensorflow/contrib/lite/arena_planner.h @@ -0,0 +1,107 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_ARENA_PLANNER_H_ +#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_ARENA_PLANNER_H_ + +#include <memory> +#include <vector> + +#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/graph_info.h" +#include "tensorflow/contrib/lite/memory_planner.h" +#include "tensorflow/contrib/lite/simple_memory_arena.h" + +namespace tflite { + +class AllocationInfo; + +// A memory planner that makes all the allocations using arenas. +// +// Before a model is executed by the interpreter, this class determines when +// each tensor needs to be allocated and deallocated, and preallocates all the +// necessary memory (the PlanAllocations phase). It then assigns portions of +// this memory buffer to each tensor (the ExecuteAllocations phase). Tensors may +// share some of the bufer if a tensor B is to be allocated after another tensor +// A has been deallocated. +// +// If dynamic tensors are used the planning steps can be repeated during model +// execution. Since dynamic tensors don't have sizes until after the +// corresponding operation is executed, this class supports incremental +// planning. +class ArenaPlanner : public MemoryPlanner { + public: + // Ownership of 'context' is not taken and it must remain util the + // ArenaPlanner is destroyed. + ArenaPlanner(TfLiteContext* context, std::unique_ptr<GraphInfo> graph_info); + ~ArenaPlanner() override; + ArenaPlanner(const ArenaPlanner&) = delete; + ArenaPlanner& operator=(const ArenaPlanner&) = delete; + + TfLiteStatus ResetAllocations() override; + TfLiteStatus PlanAllocations() override; + TfLiteStatus ExecuteAllocations(int first_node, int last_node) override; + + // Returns the base arena location for a given allocation type. + int64_t BasePointer(TfLiteAllocationType type); + + private: + // Make sure all the arenas have reserved enough memory to store all their + // tensors. + TfLiteStatus Commit(); + + // Traverse the allocation queue and reserve space in the appropriate arena + // for all tensors affected by ops in the interval [first_node, last_node]. 
+ TfLiteStatus CalculateAllocations(int first_node, int last_node); + + // Assign absolute memory location to a tensor, based on its relative + // position inside the corresponding arena buffer. + TfLiteStatus ResolveTensorAllocation(int tensor_index); + + // Register an allocation for the given tensor. + TfLiteStatus CalculateTensorAllocation(int tensor_index); + + // Register a deallocation for the given tensor. + TfLiteStatus CalculateTensorDeallocation(int tensor_index); + + // Register an allocation for all internal (temporary) tensors of + // 'node_index'. + TfLiteStatus CalculateAllocationOfInternalTensors(int node_index); + + // Register a deallocation for all internal (temporary) tensors of + // 'node_index'. + TfLiteStatus CalculateDeallocationOfInternalTensors(int node_index); + + TfLiteContext* context_; + std::unique_ptr<GraphInfo> graph_info_; + + // Stores allocation data for all tensors. + std::vector<ArenaAlloc> allocs_; + + // A chronological list of instructions to allocated and deallocate tensors, + // reflecting the way they are used in the graph. + std::vector<AllocationInfo> alloc_queue_; + + // Raw memory buffer that is allocated for all temporary and graph outputs. + // that are declared kTfLiteArenaRw. + SimpleMemoryArena arena_; + + // Raw memory buffer that is allocated for persistent tensors that are + // declared as kTfLiteArenaRwPersistent. + SimpleMemoryArena persistent_arena_; +}; + +} // namespace tflite + +#endif // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_ARENA_PLANNER_H_ diff --git a/tensorflow/contrib/lite/arena_planner_test.cc b/tensorflow/contrib/lite/arena_planner_test.cc new file mode 100644 index 0000000000..c27c327abc --- /dev/null +++ b/tensorflow/contrib/lite/arena_planner_test.cc @@ -0,0 +1,472 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/contrib/lite/arena_planner.h" + +#include <cstdarg> + +#include <gmock/gmock.h> +#include <gtest/gtest.h> + +namespace tflite { +namespace { + +// A simple op to be used in tests, as syntactic sugar. +class TestOp { + public: + TestOp(std::initializer_list<int> inputs, std::initializer_list<int> outputs, + std::initializer_list<int> temporaries) + : inputs_(inputs), outputs_(outputs), temporaries_(temporaries) {} + + const std::vector<int>& inputs() const { return inputs_; } + const std::vector<int>& outputs() const { return outputs_; } + const std::vector<int>& temporaries() const { return temporaries_; } + + private: + std::vector<int> inputs_; + std::vector<int> outputs_; + std::vector<int> temporaries_; +}; + +// A test graph where inputs are processed by the given nodes to produce +// outputs. 
+class TestGraph { + public: + TestGraph(std::initializer_list<int> inputs, + std::initializer_list<TestOp> nodes, + std::initializer_list<int> outputs) + : inputs_(inputs), outputs_(outputs) { + int max_tensor_index = 0; + + for (int t : inputs) { + max_tensor_index = std::max(max_tensor_index, t); + } + for (int t : outputs) { + max_tensor_index = std::max(max_tensor_index, t); + } + for (const auto& node : nodes) { + auto int_array = [](const std::vector<int>& x) { + TfLiteIntArray* lite = TfLiteIntArrayCreate(x.size()); + for (size_t i = 0; i < x.size(); i++) lite->data[i] = x[i]; + return lite; + }; + + nodes_.push_back(TfLiteNode()); + nodes_.back().inputs = int_array(node.inputs()); + for (int t : node.inputs()) { + max_tensor_index = std::max(max_tensor_index, t); + } + nodes_.back().outputs = int_array(node.outputs()); + for (int t : node.outputs()) { + max_tensor_index = std::max(max_tensor_index, t); + } + nodes_.back().temporaries = int_array(node.temporaries()); + for (int t : node.temporaries()) { + max_tensor_index = std::max(max_tensor_index, t); + } + } + + for (int i = 0; i <= max_tensor_index; ++i) { + tensors_.push_back(TfLiteTensor()); + // Set some default values for allocation_type and bytes, which are the + // only fields used by the arena planner. + tensors_.back().allocation_type = kTfLiteArenaRw; + tensors_.back().bytes = (i + 1) * 3; + } + } + + ~TestGraph() { + for (auto node : nodes_) { + TfLiteIntArrayFree(node.inputs); + TfLiteIntArrayFree(node.outputs); + TfLiteIntArrayFree(node.temporaries); + } + } + + const std::vector<TfLiteNode>& nodes() { return nodes_; } + std::vector<TfLiteTensor>* tensors() { return &tensors_; } + const std::vector<int>& inputs() { return inputs_; } + const std::vector<int>& outputs() { return outputs_; } + + private: + std::vector<TfLiteNode> nodes_; + std::vector<TfLiteTensor> tensors_; + std::vector<int> inputs_; + std::vector<int> outputs_; +}; + +// The GraphInfo for a TestGraph. +class TestGraphInfo : public GraphInfo { + public: + explicit TestGraphInfo(TestGraph* graph) : graph_(graph) {} + + size_t num_tensors() const override { return graph_->tensors()->size(); } + TfLiteTensor* tensor(size_t index) override { + return &graph_->tensors()->at(index); + } + size_t num_nodes() const override { return graph_->nodes().size(); } + const TfLiteNode& node(size_t index) const override { + return graph_->nodes()[index]; + } + const std::vector<int>& inputs() const override { return graph_->inputs(); } + const std::vector<int>& outputs() const override { return graph_->outputs(); } + + private: + TestGraph* graph_; +}; + +void ReportError(TfLiteContext* context, const char* format, ...) { + const size_t kBufferSize = 1024; + char temp_buffer[kBufferSize]; + + va_list args; + va_start(args, format); + vsnprintf(temp_buffer, kBufferSize, format, args); + va_end(args); + + LOG(INFO) << temp_buffer; +} + +class ArenaPlannerTest : public ::testing::Test { + protected: + void SetGraph(TestGraph* graph) { + graph_ = graph; + context_.ReportError = ReportError; + planner_.reset(new ArenaPlanner( + &context_, std::unique_ptr<GraphInfo>(new TestGraphInfo(graph)))); + CHECK(planner_->ResetAllocations() == kTfLiteOk); + CHECK(planner_->PlanAllocations() == kTfLiteOk); + } + + void Execute(int start, int end) { + CHECK(planner_->ExecuteAllocations(start, end) == kTfLiteOk); + } + + // Returns the actual offset of a given tensor, relative to the start of its + // arena. 
+ int64_t GetOffset(int tensor_index) { + const TfLiteTensor& tensor = (*graph_->tensors())[tensor_index]; + return reinterpret_cast<int64_t>(tensor.data.raw) - + planner_->BasePointer(tensor.allocation_type); + } + + // Returns the first aligned offset after a given tensor. + int64_t GetOffsetAfter(int tensor_index) { + const TfLiteTensor& tensor = (*graph_->tensors())[tensor_index]; + int64_t offset = GetOffset(tensor_index) + tensor.bytes; + // We must make sure the offset is aligned to kDefaultArenaAlignment. + if (offset % 4 != 0) { + offset += 4 - offset % 4; + } + return offset; + }; + + TfLiteContext context_; + TestGraph* graph_; + std::unique_ptr<ArenaPlanner> planner_; +}; + +TEST_F(ArenaPlannerTest, EmptyGraph) { + TestGraph graph({}, {}, {}); + SetGraph(&graph); + Execute(0, 10); +} + +TEST_F(ArenaPlannerTest, GraphWithNoOps) { + TestGraph graph({0, 10}, {}, {5, 11}); + SetGraph(&graph); + Execute(0, 10); + EXPECT_EQ(GetOffset(0), 0); + EXPECT_EQ(GetOffset(10), GetOffsetAfter(0)); + // The outputs are never allocated because they are not connected to any + // inputs. + EXPECT_EQ(GetOffset(5), 0); + EXPECT_EQ(GetOffset(11), 0); +} + +TEST_F(ArenaPlannerTest, GraphWithOneOp) { + TestGraph graph({1}, {{{1}, {2}, {}}}, {2}); + SetGraph(&graph); + Execute(0, 10); + EXPECT_EQ(GetOffset(1), 0); + EXPECT_EQ(GetOffset(2), GetOffsetAfter(1)); +} + +TEST_F(ArenaPlannerTest, ZeroSizedTensors) { + TestGraph graph({1}, {{{1}, {2}, {}}}, {2}); + (*graph.tensors())[1].bytes = 0; + SetGraph(&graph); + // TODO(ahentz): this is currently broken because the arena finds two + // allocations with the same offset and returns an error. + ASSERT_FALSE(planner_->ExecuteAllocations(0, 10) == kTfLiteOk); + // EXPECT_EQ(GetOffset(1), 0); + // EXPECT_EQ(GetOffset(2), GetOffsetAfter(1)); +} + +TEST_F(ArenaPlannerTest, SimpleGraph) { + TestGraph graph({0, 1}, + { + /* in, out, tmp */ + {{0, 1}, {2}, {}}, // First op + {{2, 0}, {4, 5}, {}}, // Second op + {{4, 5}, {3}, {}} // Third op + }, + {3}); + SetGraph(&graph); + Execute(0, 10); + + // Alloc(+) and dealloc(-) order: +0 +1 +2 -1 +4 +5 -2 -0 +3 -4 -5 + EXPECT_EQ(GetOffset(0), 0); + EXPECT_EQ(GetOffset(1), GetOffsetAfter(0)); + EXPECT_EQ(GetOffset(2), GetOffsetAfter(1)); + EXPECT_EQ(GetOffset(4), GetOffsetAfter(2)); + EXPECT_EQ(GetOffset(5), GetOffsetAfter(4)); + EXPECT_EQ(GetOffset(3), 0); +} + +TEST_F(ArenaPlannerTest, SimpleGraphWithTemporary) { + TestGraph graph({0, 1}, + { + /* in, out, tmp */ + {{0, 1}, {2}, {}}, // First op + {{2, 0}, {4}, {5}}, // Second op, with temporary + {{4}, {3}, {}} // Third op + }, + {3}); + SetGraph(&graph); + Execute(0, 10); + + // Alloc(+) and dealloc(-) order: +0 +1 +2 -1 +5 +4 -2 -0 -5 +3 -4 + EXPECT_EQ(GetOffset(0), 0); + EXPECT_EQ(GetOffset(1), GetOffsetAfter(0)); + EXPECT_EQ(GetOffset(2), GetOffsetAfter(1)); + EXPECT_EQ(GetOffset(5), GetOffsetAfter(2)); + EXPECT_EQ(GetOffset(4), GetOffsetAfter(5)); + EXPECT_EQ(GetOffset(3), 0); +} + +TEST_F(ArenaPlannerTest, SimpleGraphWithOptionals) { + TestGraph graph({0, -1, 1}, + { + /* in, out, tmp */ + {{0, 1}, {2}, {}}, // First op + {{2, 0}, {4, 5}, {}}, // Second op + {{4, -1, 5}, {3}, {}} // Third op, with optional + }, + {3}); + SetGraph(&graph); + Execute(0, 10); + + // Alloc(+) and dealloc(-) order: +0 +1 +2 -1 +4 +5 -2 -0 +3 -4 -5 + EXPECT_EQ(GetOffset(0), 0); + EXPECT_EQ(GetOffset(1), GetOffsetAfter(0)); + EXPECT_EQ(GetOffset(2), GetOffsetAfter(1)); + EXPECT_EQ(GetOffset(4), GetOffsetAfter(2)); + EXPECT_EQ(GetOffset(5), GetOffsetAfter(4)); + EXPECT_EQ(GetOffset(3), 
0); +} + +TEST_F(ArenaPlannerTest, SimpleGraphWithLargeTensor) { + TestGraph graph({0, -1, 1}, + { + /* in, out, tmp */ + {{0, 1}, {2}, {}}, // First op + {{2, 0}, {4}, {5}}, // Second op, with temporary + {{4, -1}, {3}, {}} // Third op, with optional + }, + {3}); + + // Make #1 very large so its vacancy can be filled with #5 and #4. + (*graph.tensors())[1].bytes = 40; + + SetGraph(&graph); + Execute(0, 10); + + // Alloc(+) and dealloc(-) order: +0 +1 +2 -1 +5 +4 -2 -0 -5 +3 -4 + EXPECT_EQ(GetOffset(0), 0); + EXPECT_EQ(GetOffset(1), GetOffsetAfter(0)); + EXPECT_EQ(GetOffset(2), GetOffsetAfter(1)); + EXPECT_EQ(GetOffset(5), GetOffsetAfter(0)); + EXPECT_EQ(GetOffset(4), GetOffsetAfter(5)); + EXPECT_EQ(GetOffset(3), 0); +} + +TEST_F(ArenaPlannerTest, SimpleGraphWithPersistentTensor) { + TestGraph graph({0, -1, 1}, + { + /* in, out, tmp */ + {{0, 1}, {2}, {}}, // First op + {{2, 0}, {4}, {5}}, // Second op, with temporary + {{4, -1}, {3}, {}} // Third op, with optional + }, + {3}); + + // Make #1 persistent so it goes into its own arena. + (*graph.tensors())[1].allocation_type = kTfLiteArenaRwPersistent; + + SetGraph(&graph); + Execute(0, 10); + + // Make sure #0 and #1 were given different memory locations (because they + // will both have offset=0, in different arenas.) + EXPECT_NE((*graph.tensors())[0].data.raw, (*graph.tensors())[1].data.raw); + + // Alloc(+) and dealloc(-) order: +0 +1 +2 -1 +5 +4 -2 -0 -5 +3 -4 + EXPECT_EQ(GetOffset(0), 0); + EXPECT_EQ(GetOffset(1), 0); + EXPECT_EQ(GetOffset(2), GetOffsetAfter(0)); + EXPECT_EQ(GetOffset(5), GetOffsetAfter(2)); + EXPECT_EQ(GetOffset(4), GetOffsetAfter(5)); + EXPECT_EQ(GetOffset(3), 0); +} + +TEST_F(ArenaPlannerTest, SimpleGraphWithDynamicTensor) { + TestGraph graph({0, -1, 1}, + { + /* in, out, tmp */ + {{0, 1}, {2}, {}}, // First op + {{2, 0}, {4}, {5}}, // Second op, with temporary + {{4, -1}, {3}, {}} // Third op, with optional + }, + {3}); + + // Make #1 dynaic so it does not get allocated. + (*graph.tensors())[1].allocation_type = kTfLiteDynamic; + + SetGraph(&graph); + Execute(0, 10); + + EXPECT_EQ((*graph.tensors())[1].data.raw, nullptr); + + // Alloc(+) and dealloc(-) order: +0 +1 +2 -1 +5 +4 -2 -0 -5 +3 -4 + EXPECT_EQ(GetOffset(0), 0); + EXPECT_EQ(GetOffset(2), GetOffsetAfter(0)); + EXPECT_EQ(GetOffset(5), GetOffsetAfter(2)); + EXPECT_EQ(GetOffset(4), GetOffsetAfter(5)); + EXPECT_EQ(GetOffset(3), 0); +} + +TEST_F(ArenaPlannerTest, LargerGraphAndStepwiseAllocation) { + TestGraph graph({0, 1}, + { + /* in, out, tmp */ + {{0, 1}, {2, 3}, {}}, + {{2, 0}, {4, 5}, {6}}, + {{1, -1}, {7}, {}}, + {{7, 3}, {8}, {9}}, + {{4, 5, 8}, {10}, {}}, + }, + {10}); + SetGraph(&graph); + + auto is_unallocated = [&](int tensor_index) { + // TODO(ahentz): We'd to use nullptr to represent unallocated tensors, but + // the current code still points them all to the beginning fo the alloc + // (that is, zero offset). + // return (*graph.tensors())[tensor_index].data.raw == nullptr; + return GetOffset(tensor_index) == 0; + }; + + // The allocation plan is made at the beginning and is independent of + // the execution steps. 
Here's the allocation order: + // Op0: +0 +1 +2 +3 + // Op1: +6 +4 +5 -6 -0 -2 + // Op2: +7 -1 + // Op3: +9 +8 -9 -3 -7 + // Op4: +10 -4 -5 -8 + + Execute(0, 0); + EXPECT_EQ(GetOffset(0), 0); + EXPECT_EQ(GetOffset(1), GetOffsetAfter(0)); + EXPECT_EQ(GetOffset(2), GetOffsetAfter(1)); + EXPECT_EQ(GetOffset(3), GetOffsetAfter(2)); + EXPECT_TRUE(is_unallocated(6)); + EXPECT_TRUE(is_unallocated(4)); + EXPECT_TRUE(is_unallocated(5)); + EXPECT_TRUE(is_unallocated(7)); + EXPECT_TRUE(is_unallocated(9)); + EXPECT_TRUE(is_unallocated(8)); + EXPECT_TRUE(is_unallocated(10)); + + Execute(1, 1); + EXPECT_EQ(GetOffset(0), 0); + EXPECT_EQ(GetOffset(1), GetOffsetAfter(0)); + EXPECT_EQ(GetOffset(2), GetOffsetAfter(1)); + EXPECT_EQ(GetOffset(3), GetOffsetAfter(2)); + EXPECT_EQ(GetOffset(6), GetOffsetAfter(3)); + EXPECT_EQ(GetOffset(4), GetOffsetAfter(6)); + EXPECT_EQ(GetOffset(5), GetOffsetAfter(4)); + EXPECT_TRUE(is_unallocated(7)); + EXPECT_TRUE(is_unallocated(9)); + EXPECT_TRUE(is_unallocated(8)); + EXPECT_TRUE(is_unallocated(10)); + + Execute(2, 2); + EXPECT_EQ(GetOffset(0), 0); + EXPECT_EQ(GetOffset(1), GetOffsetAfter(0)); + EXPECT_EQ(GetOffset(2), GetOffsetAfter(1)); + EXPECT_EQ(GetOffset(3), GetOffsetAfter(2)); + EXPECT_EQ(GetOffset(6), GetOffsetAfter(3)); + EXPECT_EQ(GetOffset(4), GetOffsetAfter(6)); + EXPECT_EQ(GetOffset(5), GetOffsetAfter(4)); + // Here's an interesting allocation. Even though #6 requires only 21 bytes, + // its deallocation freed up 24 bytes due to the alignment requirements in + // the arena. That means we can fit #7 in the same space! + EXPECT_EQ(GetOffset(7), GetOffsetAfter(3)); + EXPECT_TRUE(is_unallocated(9)); + EXPECT_TRUE(is_unallocated(8)); + EXPECT_TRUE(is_unallocated(10)); + + Execute(3, 3); + EXPECT_EQ(GetOffset(0), 0); + EXPECT_EQ(GetOffset(1), GetOffsetAfter(0)); + EXPECT_EQ(GetOffset(2), GetOffsetAfter(1)); + EXPECT_EQ(GetOffset(3), GetOffsetAfter(2)); + EXPECT_EQ(GetOffset(6), GetOffsetAfter(3)); + EXPECT_EQ(GetOffset(4), GetOffsetAfter(6)); + EXPECT_EQ(GetOffset(5), GetOffsetAfter(4)); + EXPECT_EQ(GetOffset(7), GetOffsetAfter(3)); + // The deallocation of #0, #1 and #2 freed up 24 bytes but that's not enough + // for #9, so it goes at the end. + EXPECT_EQ(GetOffset(9), GetOffsetAfter(5)); + EXPECT_EQ(GetOffset(8), GetOffsetAfter(9)); + EXPECT_TRUE(is_unallocated(10)); + + Execute(4, 4); + EXPECT_EQ(GetOffset(0), 0); + EXPECT_EQ(GetOffset(1), GetOffsetAfter(0)); + EXPECT_EQ(GetOffset(2), GetOffsetAfter(1)); + EXPECT_EQ(GetOffset(3), GetOffsetAfter(2)); + EXPECT_EQ(GetOffset(6), GetOffsetAfter(3)); + EXPECT_EQ(GetOffset(4), GetOffsetAfter(6)); + EXPECT_EQ(GetOffset(5), GetOffsetAfter(4)); + EXPECT_EQ(GetOffset(7), GetOffsetAfter(3)); + EXPECT_EQ(GetOffset(9), GetOffsetAfter(5)); + EXPECT_EQ(GetOffset(8), GetOffsetAfter(9)); + // There's just enough space at the beginning for #10 due to the + // deallocation of #0, #1, #2 and #3 (total 36 bytes, #10 needs + // only 33.) + EXPECT_EQ(GetOffset(10), 0); +} + +} // namespace +} // namespace tflite + +int main(int argc, char** argv) { + // ::tflite::LogToStderr(); + FLAGS_logtostderr = true; + + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/contrib/lite/graph_info.h b/tensorflow/contrib/lite/graph_info.h new file mode 100644 index 0000000000..5481aede60 --- /dev/null +++ b/tensorflow/contrib/lite/graph_info.h @@ -0,0 +1,53 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_GRAPH_INFO_H_ +#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_GRAPH_INFO_H_ + +#include <vector> + +#include "tensorflow/contrib/lite/context.h" + +namespace tflite { + +// Basic information about an inference graph, where execution nodes +// are connected via tensors. +class GraphInfo { + public: + virtual ~GraphInfo() {} + + // Total number of tensors in the graph. + virtual size_t num_tensors() const = 0; + + // Returns a tensor given its index which is expected to be between 0 and + // num_tensors(). + virtual TfLiteTensor* tensor(size_t index) = 0; + + // Total number of nodes in the graph. + virtual size_t num_nodes() const = 0; + + // Returns a node given its index which is expected to be between 0 and + // num_nodes(). + virtual const TfLiteNode& node(size_t index) const = 0; + + // Returns the indices of the input tensors. + virtual const std::vector<int>& inputs() const = 0; + + // Returns the indices of the output tensors. + virtual const std::vector<int>& outputs() const = 0; +}; + +} // namespace tflite + +#endif // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_GRAPH_INFO_H_ diff --git a/tensorflow/contrib/lite/interpreter.cc b/tensorflow/contrib/lite/interpreter.cc index 954e236ac8..5f5981e45a 100644 --- a/tensorflow/contrib/lite/interpreter.cc +++ b/tensorflow/contrib/lite/interpreter.cc @@ -18,16 +18,16 @@ limitations under the License. #include <cstdarg> #include <cstdint> #include <cstring> +#include "tensorflow/contrib/lite/arena_planner.h" #include "tensorflow/contrib/lite/context.h" #include "tensorflow/contrib/lite/error_reporter.h" +#include "tensorflow/contrib/lite/graph_info.h" #include "tensorflow/contrib/lite/kernels/gemm_support.h" +#include "tensorflow/contrib/lite/memory_planner.h" #include "tensorflow/contrib/lite/nnapi_delegate.h" namespace { -// Memory allocation tuning -constexpr const int kDefaultArenaAlignment = 64; -constexpr const int kDefaultTensorAlignment = 4; // std::vector preallocation tuning. constexpr const int kSlotsToReserve = 128; @@ -35,10 +35,33 @@ constexpr const int kSlotsToReserve = 128; namespace tflite { +// A trivial implementation of GraphInfo around the Interpreter. 
+class InterpreterInfo : public GraphInfo { + public: + explicit InterpreterInfo(Interpreter* interpreter) + : interpreter_(interpreter) {} + + size_t num_tensors() const override { return interpreter_->tensors_size(); } + TfLiteTensor* tensor(size_t index) override { + return interpreter_->tensor(index); + } + size_t num_nodes() const override { return interpreter_->nodes_size(); } + const TfLiteNode& node(size_t index) const override { + return interpreter_->node_and_registration(index)->first; + } + const std::vector<int>& inputs() const override { + return interpreter_->inputs(); + } + const std::vector<int>& outputs() const override { + return interpreter_->outputs(); + } + + public: + Interpreter* interpreter_; +}; + Interpreter::Interpreter(ErrorReporter* error_reporter) - : arena_(kDefaultArenaAlignment), - persistent_arena_(kDefaultArenaAlignment), - error_reporter_(error_reporter ? error_reporter + : error_reporter_(error_reporter ? error_reporter : DefaultErrorReporter()) { context_.impl_ = static_cast<void*>(this); context_.ResizeTensor = ResizeTensor; @@ -50,7 +73,7 @@ Interpreter::Interpreter(ErrorReporter* error_reporter) // Reserve some space for the tensors to avoid excessive resizing. tensors_.reserve(kSlotsToReserve); nodes_and_registration_.reserve(kSlotsToReserve); - next_allocate_node_id_ = 0; + next_node_to_prepare_ = 0; UseNNAPI(false); } @@ -128,181 +151,6 @@ TfLiteStatus Interpreter::BytesRequired(TfLiteType type, const int* dims, return kTfLiteOk; } -TfLiteStatus Interpreter::AllocateTensorsWhoseSizesAreKnown() { - if (!consistent_) { - ReportError(&context_, "AllocateTensors() called on inconsistent model."); - return kTfLiteError; - } - if (next_allocate_node_id_ == nodes_and_registration_.size() && invokable_) { - return kTfLiteOk; - } - allocs_and_refcounts_.resize(context_.tensors_size); - - int new_next_allocate_node_id = next_allocate_node_id_; - invokable_ = false; - - // Allocate graph input nodes. - if (next_allocate_node_id_ == 0) { - for (int i = 0; i < inputs_.size(); ++i) { - int tensor_index = inputs_[i]; - if (tensor_index == kOptionalTensor) { - continue; - } - TfLiteTensor& tensor = context_.tensors[tensor_index]; - if (tensor.allocation_type == kTfLiteArenaRw) { - TF_LITE_ENSURE_OK( - &context_, - arena_.Allocate(&context_, kDefaultTensorAlignment, tensor.bytes, - &allocs_and_refcounts_[tensor_index].alloc)); - } - } - // Add 1 to output tensors, so they will not get overwritten. - for (int i = 0; i < outputs_.size(); ++i) { - allocs_and_refcounts_[outputs_[i]].count++; - } - } - - // Count references to node input tensors, and resize node-referenced tensors - // until we encounter a node that has a dynamic output tensor. - for (int k = next_allocate_node_id_; k < nodes_and_registration_.size(); - k++) { - new_next_allocate_node_id++; - TfLiteNode& node = nodes_and_registration_[k].first; - const TfLiteRegistration& registration = nodes_and_registration_[k].second; - if (OpPrepare(registration, &node) == kTfLiteError) { - return kTfLiteError; - } - - TfLiteIntArray* node_inputs = node.inputs; - for (int i = 0; i < node_inputs->size; ++i) { - int tensor_index = node_inputs->data[i]; - if (tensor_index != kOptionalTensor) { - allocs_and_refcounts_[node_inputs->data[i]].count++; - } - } - - // Discontinue if the node has dynamic outputs. 
- bool has_unallocated_dynamic_tensor = false; - TfLiteIntArray* node_outputs = node.outputs; - for (int i = 0; i < node_outputs->size; ++i) { - TfLiteTensor& tensor = context_.tensors[node_outputs->data[i]]; - if (tensor.allocation_type == kTfLiteDynamic) { - has_unallocated_dynamic_tensor = true; - break; - } - } - if (has_unallocated_dynamic_tensor) { - break; - } - } - - // Allocate graph persistent outputs, e.g. RNN cell states, etc. - for (int k = next_allocate_node_id_; k < new_next_allocate_node_id; k++) { - TfLiteNode& node = nodes_and_registration_[k].first; - - // Go through output tensors and allocate the persistent ones first. - TfLiteIntArray* node_outputs = node.outputs; - for (int i = 0; i < node_outputs->size; ++i) { - int tensor_index = node_outputs->data[i]; - TfLiteTensor& tensor = context_.tensors[tensor_index]; - if (tensor.allocation_type == kTfLiteArenaRwPersistent) { - TF_LITE_ENSURE_OK(&context_, - persistent_arena_.Allocate( - &context_, kDefaultTensorAlignment, tensor.bytes, - &allocs_and_refcounts_[tensor_index].alloc)); - } - } - } - - // Go through the graph in execution order. - for (int k = next_allocate_node_id_; k < new_next_allocate_node_id; k++) { - TfLiteNode& node = nodes_and_registration_[k].first; - - // First allocate output tensors. - TfLiteIntArray* node_outputs = node.outputs; - for (int i = 0; i < node_outputs->size; ++i) { - int tensor_index = node_outputs->data[i]; - TfLiteTensor& tensor = context_.tensors[tensor_index]; - if (tensor.allocation_type == kTfLiteArenaRw) { - TF_LITE_ENSURE_OK( - &context_, - arena_.Allocate(&context_, kDefaultTensorAlignment, tensor.bytes, - &allocs_and_refcounts_[tensor_index].alloc)); - } - } - // Then the temporaries, in two passes. First allocate them all, them - // deallocate them. - TfLiteIntArray* node_temporaries = node.temporaries; - for (int i = 0; i < node_temporaries->size; ++i) { - int tensor_index = node_temporaries->data[i]; - TfLiteTensor& tensor = context_.tensors[tensor_index]; - if (tensor.allocation_type == kTfLiteArenaRw) { - TF_LITE_ENSURE_OK( - &context_, - arena_.Allocate(&context_, kDefaultTensorAlignment, tensor.bytes, - &allocs_and_refcounts_[tensor_index].alloc)); - } - } - for (int i = 0; i < node_temporaries->size; ++i) { - int tensor_index = node_temporaries->data[i]; - TfLiteTensor& tensor = context_.tensors[tensor_index]; - allocs_and_refcounts_[tensor_index].count--; - if (tensor.allocation_type == kTfLiteArenaRw && - allocs_and_refcounts_[tensor_index].count == 0) { - TF_LITE_ENSURE_OK( - &context_, - arena_.Deallocate(&context_, - allocs_and_refcounts_[tensor_index].alloc)); - } - } - - // Then process the node's inputs. - TfLiteIntArray* node_inputs = node.inputs; - for (int i = 0; i < node_inputs->size; ++i) { - int tensor_index = node_inputs->data[i]; - if (tensor_index == kOptionalTensor) { - continue; - } - TfLiteTensor& tensor = context_.tensors[tensor_index]; - - // Decrease reference count and deallocate if not needed anymore. - allocs_and_refcounts_[tensor_index].count--; - if (tensor.allocation_type == kTfLiteArenaRw && - allocs_and_refcounts_[tensor_index].count == 0) { - TF_LITE_ENSURE_OK( - &context_, - arena_.Deallocate(&context_, - allocs_and_refcounts_[tensor_index].alloc)); - } - } - } - - // Resize the buffer and commit the arena. - TF_LITE_ENSURE_OK(&context_, arena_.Commit(&context_)); - TF_LITE_ENSURE_OK(&context_, persistent_arena_.Commit(&context_)); - - // Rewire the tensors to use the underlying arena buffer. 
- for (int i = 0; i < context_.tensors_size; ++i) { - TfLiteTensor& tensor = context_.tensors[i]; - if (tensor.allocation_type == kTfLiteArenaRw) { - TF_LITE_ENSURE_OK( - &context_, - arena_.ResolveAlloc(&context_, allocs_and_refcounts_[i].alloc, - &tensor.data.raw)); - } - if (tensor.allocation_type == kTfLiteArenaRwPersistent) { - TF_LITE_ENSURE_OK( - &context_, - persistent_arena_.ResolveAlloc( - &context_, allocs_and_refcounts_[i].alloc, &tensor.data.raw)); - } - } - - invokable_ = true; - next_allocate_node_id_ = new_next_allocate_node_id; - return kTfLiteOk; -} - namespace { TfLiteIntArray* convertVectorToTfLiteIntArray(const std::vector<int>& x) { TfLiteIntArray* lite = TfLiteIntArrayCreate(x.size()); @@ -312,11 +160,19 @@ TfLiteIntArray* convertVectorToTfLiteIntArray(const std::vector<int>& x) { } // namespace TfLiteStatus Interpreter::AllocateTensors() { - next_allocate_node_id_ = 0; - TF_LITE_ENSURE_OK(&context_, arena_.Clear()); - TF_LITE_ENSURE_OK(&context_, persistent_arena_.Clear()); - allocs_and_refcounts_.clear(); - return AllocateTensorsWhoseSizesAreKnown(); + next_node_to_prepare_ = 0; + if (memory_planner_) { + TF_LITE_ENSURE_STATUS(memory_planner_->ResetAllocations()); + } + + if (!consistent_) { + ReportError(&context_, "AllocateTensors() called on inconsistent model."); + return kTfLiteError; + } + + TF_LITE_ENSURE_STATUS(PrepareOpsAndTensors()); + invokable_ = true; + return kTfLiteOk; } TfLiteStatus Interpreter::AddNodeWithParameters( @@ -372,6 +228,57 @@ TfLiteStatus Interpreter::ResizeInputTensor(int tensor_index, return ResizeTensorImpl(&context_.tensors[tensor_index], dims_lite); } +// Returns true if at least one tensor in the given list is kTfLiteDynamic. +bool HasDynamicTensor(const TfLiteContext& context, + const TfLiteIntArray* tensors) { + for (int i = 0; i < tensors->size; ++i) { + const TfLiteTensor& tensor = context.tensors[tensors->data[i]]; + if (tensor.allocation_type == kTfLiteDynamic) { + return true; + } + } + return false; +} + +TfLiteStatus Interpreter::PrepareOpsStartingAt(int first_node, + int* last_node_prepared) { + for (int i = first_node; i < nodes_and_registration_.size(); i++) { + TfLiteNode& node = nodes_and_registration_[i].first; + const TfLiteRegistration& registration = nodes_and_registration_[i].second; + if (OpPrepare(registration, &node) == kTfLiteError) { + return kTfLiteError; + } + + *last_node_prepared = i; + + // Discontinue if the node has dynamic outputs. Note that we don't + // stop for dynamic temporary tensors since they won't affect the + // sizes of other tensors in the graph. 
+ if (HasDynamicTensor(context_, node.outputs)) { + break; + } + } + return kTfLiteOk; +} + +TfLiteStatus Interpreter::PrepareOpsAndTensors() { + if (!memory_planner_) { + memory_planner_.reset(new ArenaPlanner( + &context_, std::unique_ptr<GraphInfo>(new InterpreterInfo(this)))); + memory_planner_->PlanAllocations(); + } + + int last_node_prepared = 0; + + TF_LITE_ENSURE_STATUS( + PrepareOpsStartingAt(next_node_to_prepare_, &last_node_prepared)); + TF_LITE_ENSURE_STATUS(memory_planner_->ExecuteAllocations( + next_node_to_prepare_, last_node_prepared)); + + next_node_to_prepare_ = last_node_prepared + 1; + return kTfLiteOk; +} + TfLiteStatus Interpreter::Invoke() { if (!consistent_) { ReportError(&context_, "Invoke called on model that is not consistent."); @@ -384,10 +291,8 @@ TfLiteStatus Interpreter::Invoke() { TfLiteStatus status = kTfLiteOk; if (nnapi_delegate_) { - if (AllocateTensorsWhoseSizesAreKnown() == kTfLiteError) { - return kTfLiteError; - } - if (next_allocate_node_id_ == nodes_and_registration_.size()) { + TF_LITE_ENSURE_STATUS(PrepareOpsAndTensors()); + if (next_node_to_prepare_ == nodes_and_registration_.size()) { TF_LITE_ENSURE_OK(&context_, nnapi_delegate_->Invoke(this)); return kTfLiteOk; } else { @@ -400,14 +305,17 @@ TfLiteStatus Interpreter::Invoke() { } } + // Invocations are always done in node order. + // Note that calling Invoke repeatedly will cause the original memory plan to + // be reused, unless either ResizeInputTensor() or AllocateTensors() has been + // called. + // TODO(b/71913981): we should force recalculation in the presence of dynamic + // tensors, because they may have new value which in turn may affect shapes + // and allocations. for (int i = 0; i < nodes_and_registration_.size(); i++) { - // Ensure we have allocated up to this node. The point of this is to - // allocate as much as possible before running any evaluation, but - // dynamic shapes can prevent this from being possible. - if (i >= next_allocate_node_id_) { - if (AllocateTensorsWhoseSizesAreKnown() == kTfLiteError) { - return kTfLiteError; - } + if (i == next_node_to_prepare_) { + TF_LITE_ENSURE_STATUS(PrepareOpsAndTensors()); + TF_LITE_ENSURE(&context_, next_node_to_prepare_ >= i); } TfLiteNode& node = nodes_and_registration_[i].first; const TfLiteRegistration& registration = nodes_and_registration_[i].second; diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h index 65c61e44be..38dd402e8a 100644 --- a/tensorflow/contrib/lite/interpreter.h +++ b/tensorflow/contrib/lite/interpreter.h @@ -23,7 +23,7 @@ limitations under the License. #include "tensorflow/contrib/lite/allocation.h" #include "tensorflow/contrib/lite/context.h" #include "tensorflow/contrib/lite/error_reporter.h" -#include "tensorflow/contrib/lite/simple_memory_arena.h" +#include "tensorflow/contrib/lite/memory_planner.h" namespace tflite { @@ -49,13 +49,6 @@ constexpr TfLiteType typeToTfLiteType<unsigned char>() { return kTfLiteUInt8; } -struct ArenaAllocRefCount { - ArenaAllocRefCount() : alloc(), count(0) {} - - ArenaAlloc alloc; - int count; -}; - // Forward declare since NNAPIDelegate uses Interpreter. class NNAPIDelegate; @@ -276,9 +269,17 @@ class Interpreter { return op_reg.invoke(&context_, node); } - // Allocate tensors whose sizes are known in order of nodes. Discontinue when - // we encounter a node that has a dynamic output tensor. - TfLiteStatus AllocateTensorsWhoseSizesAreKnown(); + // Call OpPrepare() for as many ops as possible, allocating memory for their + // tensors. 
If an op containing dynamic tensors is found, preparation will be + // postponed until this function is called again. This allows the interpreter + // to wait until Invoke() to resolve the sizes of dynamic tensors. + TfLiteStatus PrepareOpsAndTensors(); + + // Call OpPrepare() for all ops starting at 'first_node'. Stop when a + // dynamic tensors is found or all ops have been prepared. Fill + // 'last_node_prepared' with the id of the op containing dynamic tensors, or + // the last in the graph. + TfLiteStatus PrepareOpsStartingAt(int first_node, int* last_node_prepared); // Tensors needed by the interpreter. Use `AddTensors` to add more blank // tensor entries. Note, `tensors_.data()` needs to be synchronized to the @@ -325,17 +326,6 @@ class Interpreter { std::vector<std::pair<TfLiteNode, TfLiteRegistration>> nodes_and_registration_; - // Raw memory buffer that is allocated for all temporary and graph outputs. - // that are declared kTfLiteArenaRw. - SimpleMemoryArena arena_; - - // Raw memory buffer that is allocated for persistent tensors that are - // declared as kTfLiteArenaRwPersistent. - SimpleMemoryArena persistent_arena_; - - // Stores allocation and reference counts of all tensors. - std::vector<ArenaAllocRefCount> allocs_and_refcounts_; - // Whether the model is consistent. That is to say if the inputs and outputs // of every node and the global inputs and outputs are valid indexes into // the tensor array. @@ -356,7 +346,7 @@ class Interpreter { // The error reporter delegate that tflite will forward queries errors to. ErrorReporter* error_reporter_; - // Next node to allocate output tensors. + // Index of the next node to prepare. // During Invoke(), Interpreter will allocate input tensors first, which are // known to be fixed size. Then it will allocate outputs from nodes as many // as possible. When there is a node that produces dynamic sized tensor. @@ -364,10 +354,12 @@ class Interpreter { // node id, and execute the node to generate the output tensor before continue // to allocate successors. This process repeats until all nodes are executed. // NOTE: this relies on the order of nodes that is in topological order. - int next_allocate_node_id_; + int next_node_to_prepare_; // Whether to delegate to NN API std::unique_ptr<NNAPIDelegate> nnapi_delegate_; + + std::unique_ptr<MemoryPlanner> memory_planner_; }; } // namespace tflite diff --git a/tensorflow/contrib/lite/memory_planner.h b/tensorflow/contrib/lite/memory_planner.h new file mode 100644 index 0000000000..b11d86c375 --- /dev/null +++ b/tensorflow/contrib/lite/memory_planner.h @@ -0,0 +1,45 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_MEMORY_PLANNER_H_ +#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_MEMORY_PLANNER_H_ + +#include "tensorflow/contrib/lite/context.h" + +namespace tflite { + +// A MemoryPlanner is responsible for planning and executing a number of +// memory-related operations that are necessary in TF Lite. +class MemoryPlanner { + public: + virtual ~MemoryPlanner() {} + + // Plans the necessary memory allocations. This is the MemoryPlanner's + // pre-processing step and is called when the graph structure is known but + // actual size of the tensors is not. + virtual TfLiteStatus PlanAllocations() = 0; + + // Allocates the necessary memory to execute all nodes in the interval + // [first_node, last_node]. + virtual TfLiteStatus ExecuteAllocations(int first_node, int last_node) = 0; + + // Invalidates allocations made earliers. This is called when tensors sizes + // have change. All planned allocations remain, but can't be used until + // ExecuteAllocations() is called. + virtual TfLiteStatus ResetAllocations() = 0; +}; + +} // namespace tflite + +#endif // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_MEMORY_PLANNER_H_ diff --git a/tensorflow/contrib/lite/simple_memory_arena.h b/tensorflow/contrib/lite/simple_memory_arena.h index 0d0b7f9ff7..07a38c4243 100644 --- a/tensorflow/contrib/lite/simple_memory_arena.h +++ b/tensorflow/contrib/lite/simple_memory_arena.h @@ -68,6 +68,10 @@ class SimpleMemoryArena { TfLiteStatus Clear(); + int64_t BasePointer() const { + return reinterpret_cast<int64_t>(underlying_buffer_aligned_ptr_); + } + private: bool commited_; size_t arena_alignment_; |
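One detail worth calling out from arena_planner_test.cc above: tensor sizes in the test graph are (index + 1) * 3 bytes, and each arena allocation is rounded up to kDefaultTensorAlignment (4 bytes). That is why, in the stepwise-allocation test, deallocating tensor #6 (21 bytes) frees a 24-byte slot that tensor #7 (24 bytes) can reuse exactly. A small self-contained sketch of that rounding follows; the helper name is illustrative and not part of the commit.

```cpp
#include <cstdint>
#include <iostream>

// Round an allocation size up to the next multiple of `alignment`, mirroring
// the 4-byte tensor alignment the arena planner applies to each allocation.
int64_t AlignedSize(int64_t bytes, int64_t alignment) {
  const int64_t remainder = bytes % alignment;
  return remainder == 0 ? bytes : bytes + (alignment - remainder);
}

int main() {
  std::cout << AlignedSize(21, 4) << "\n";  // tensor #6: 21 bytes -> 24-byte slot
  std::cout << AlignedSize(24, 4) << "\n";  // tensor #7: 24 bytes -> fits the freed slot
  return 0;
}
```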