author    | 2018-08-10 16:03:54 -0700
committer | 2018-08-10 16:13:31 -0700
commit    | bdc3b303f1ba5b22cac1df0b605ad0e0c45421f1 (patch)
tree      | b41d0ddbc4ad15d24a731957db90c9457f602fe6 /tensorflow/contrib/lite/delegates/nnapi
parent    | be3f9abf85cb8f0a80aa034ec8bfb6b5844fd3e6 (diff)
Use shared memory for NNAPI input and output.
PiperOrigin-RevId: 208283489
Diffstat (limited to 'tensorflow/contrib/lite/delegates/nnapi')
-rw-r--r-- | tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.cc | 81
1 file changed, 77 insertions, 4 deletions
```diff
diff --git a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.cc
index 17fa120cf9..e6cc3dd99c 100644
--- a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.cc
+++ b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.cc
@@ -27,7 +27,9 @@ limitations under the License.
 #include "tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h"
 
 #ifdef __ANDROID__
+#include <sys/mman.h>
 #include <sys/system_properties.h>
+#include <unistd.h>
 #endif
 
 namespace tflite {
@@ -80,6 +82,44 @@ struct NNFreeCompilation {
   }
 };
 
+// Manage NNAPI shared memory handle
+class NNMemory {
+ public:
+  NNMemory(const char* name, size_t size) {
+#ifdef __ANDROID__
+    byte_size_ = size;
+    fd_ = ASharedMemory_create(name, size);
+    data_ptr_ = reinterpret_cast<uint8_t*>(
+        mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd_, 0));
+    ANeuralNetworksMemory_createFromFd(size, PROT_READ | PROT_WRITE, fd_, 0,
+                                       &nn_memory_handle_);
+#endif
+  }
+
+  ~NNMemory() {
+#ifdef __ANDROID__
+    if (data_ptr_) {
+      munmap(data_ptr_, byte_size_);
+    }
+    if (nn_memory_handle_) {
+      ANeuralNetworksMemory_free(nn_memory_handle_);
+    }
+    if (fd_ > 0) close(fd_);
+#endif
+  }
+
+  ANeuralNetworksMemory* get_handle() { return nn_memory_handle_; }
+  uint8_t* get_data_ptr() { return data_ptr_; }
+
+ private:
+#ifdef __ANDROID__
+  int fd_ = 0;
+  size_t byte_size_ = 0;
+#endif
+  uint8_t* data_ptr_ = nullptr;
+  ANeuralNetworksMemory* nn_memory_handle_ = nullptr;
+};  // namespace
+
 // Track tensor indices to NN API tensor indices mapping.
 class OperandMapping {
  public:
@@ -911,6 +951,8 @@ class NNAPIDelegateKernel {
     // absolute indices but NN api indices inputs by relative indices.
     int relative_input_index = 0;
     int num_optional_tensors = 0;
+
+    size_t input_offset = 0;
     for (auto absolute_input_index : TfLiteIntArrayView(node->inputs)) {
       if (absolute_input_index == kOptionalTensor) {
         num_optional_tensors++;
@@ -920,20 +962,28 @@
       // TODO(miaowang): make sure the delegation works with dequantized weights
      // as intermediate tensors.
       if (tensor->allocation_type != kTfLiteMmapRo) {
-        CHECK_NN(context, ANeuralNetworksExecution_setInput(
+        // copy data to pre-allocated shared memory.
+        memcpy(nn_input_memory_->get_data_ptr() + input_offset,
+               tensor->data.raw, tensor->bytes);
+        CHECK_NN(context, ANeuralNetworksExecution_setInputFromMemory(
                               execution, relative_input_index, nullptr,
-                              tensor->data.raw, tensor->bytes));
+                              nn_input_memory_->get_handle(), input_offset,
+                              tensor->bytes));
+        input_offset += tensor->bytes;
         relative_input_index++;
       }
     }
 
     // Set the output tensor buffers.
     int relative_output_index = 0;
+    size_t output_offset = 0;
     for (auto output_index : TfLiteIntArrayView(node->outputs)) {
       TfLiteTensor* tensor = &context->tensors[output_index];
-      CHECK_NN(context, ANeuralNetworksExecution_setOutput(
+      CHECK_NN(context, ANeuralNetworksExecution_setOutputFromMemory(
                             execution, relative_output_index, nullptr,
-                            tensor->data.raw, tensor->bytes));
+                            nn_output_memory_->get_handle(), output_offset,
+                            tensor->bytes));
+      output_offset += tensor->bytes;
       relative_output_index++;
     }
 
@@ -957,6 +1007,15 @@ class NNAPIDelegateKernel {
     ANeuralNetworksEvent_free(event);
     ANeuralNetworksExecution_free(execution);
 
+    // copy results from shared memory to the destination.
+    output_offset = 0;
+    for (auto output_index : TfLiteIntArrayView(node->outputs)) {
+      TfLiteTensor* tensor = &context->tensors[output_index];
+      memcpy(tensor->data.raw,
+             nn_output_memory_->get_data_ptr() + output_offset, tensor->bytes);
+      output_offset += tensor->bytes;
+    }
+
     return kTfLiteOk;
   }
 
@@ -974,6 +1033,9 @@ class NNAPIDelegateKernel {
   std::vector<int> model_state_inputs_;
   std::vector<int> model_state_tfl_outputs_;
 
+  std::unique_ptr<NNMemory> nn_input_memory_;
+  std::unique_ptr<NNMemory> nn_output_memory_;
+
   TfLiteStatus AddOpsAndTensors(TfLiteContext* context) {
     // The operand builder allows creating a single op. We create it at this
     // reduced power position rather than in the for loop to avoid reallocating
@@ -1024,21 +1086,27 @@ class NNAPIDelegateKernel {
     inputs.reserve(input_tensors->size);
     std::vector<uint32_t> outputs;
     outputs.reserve(output_tensors->size);
+
+    size_t total_input_byte_size = 0;
     // Make the TensorFlow lite inputs and outputs to ann_indices.
     for (int i : TfLiteIntArrayView(input_tensors)) {
       // Constant tensors are not NNAPI inputs.
       if (i != kOptionalTensor &&
           context->tensors[i].allocation_type != kTfLiteMmapRo) {
         inputs.push_back(operand_mapping_.lite_index_to_ann(i));
+        total_input_byte_size += context->tensors[i].bytes;
       }
     }
+
     // Add state input tensors as model inputs
     for (int i : model_state_inputs_) {
       inputs.push_back(i);
     }
 
+    size_t total_output_byte_size = 0;
     for (int i : TfLiteIntArrayView(output_tensors)) {
       outputs.push_back(operand_mapping_.lite_index_to_ann(i));
+      total_output_byte_size += context->tensors[i].bytes;
     }
 
     // Tell ANN to declare inputs/outputs
@@ -1048,6 +1116,11 @@
     // Finalize the model
     CHECK_NN(context, ANeuralNetworksModel_finish(nn_model_.get()));
 
+    // Create shared memory pool for inputs and outputs.
+    nn_input_memory_.reset(new NNMemory("input_pool", total_input_byte_size));
+    nn_output_memory_.reset(
+        new NNMemory("output_pool", total_output_byte_size));
+
    return kTfLiteOk;
  }
 };
```
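Taken together, the change replaces per-tensor `ANeuralNetworksExecution_setInput`/`setOutput` calls with two ashmem-backed pools that are bound once by `(memory, offset, length)`. The sketch below is not the delegate code itself; it is a minimal, self-contained illustration of the same NNAPI pattern, assuming an Android NDK build with `<android/sharedmem.h>` and `<android/NeuralNetworks.h>`, an already-compiled `ANeuralNetworksCompilation` with a single input and a single output, and illustrative names (`RunWithSharedMemory`, `in_data`, `out_data`, the byte sizes) that do not appear in the commit.

```cpp
// Minimal sketch of the NNAPI shared-memory pattern used above.
#include <android/NeuralNetworks.h>
#include <android/sharedmem.h>
#include <sys/mman.h>
#include <unistd.h>

#include <cstdint>
#include <cstring>

bool RunWithSharedMemory(ANeuralNetworksCompilation* compilation,
                         const uint8_t* in_data, size_t input_bytes,
                         uint8_t* out_data, size_t output_bytes) {
  // One ashmem pool per direction, mirroring nn_input_memory_ / nn_output_memory_.
  int in_fd = ASharedMemory_create("input_pool", input_bytes);
  int out_fd = ASharedMemory_create("output_pool", output_bytes);
  if (in_fd < 0 || out_fd < 0) return false;

  // Map the pools so the CPU side can stage tensor data directly.
  uint8_t* in_ptr = static_cast<uint8_t*>(mmap(
      nullptr, input_bytes, PROT_READ | PROT_WRITE, MAP_SHARED, in_fd, 0));
  uint8_t* out_ptr = static_cast<uint8_t*>(mmap(
      nullptr, output_bytes, PROT_READ | PROT_WRITE, MAP_SHARED, out_fd, 0));

  // Wrap the fds in NNAPI memory objects the runtime and driver can use.
  ANeuralNetworksMemory* in_mem = nullptr;
  ANeuralNetworksMemory* out_mem = nullptr;
  ANeuralNetworksMemory_createFromFd(input_bytes, PROT_READ | PROT_WRITE,
                                     in_fd, 0, &in_mem);
  ANeuralNetworksMemory_createFromFd(output_bytes, PROT_READ | PROT_WRITE,
                                     out_fd, 0, &out_mem);

  // Stage the input, then bind both tensors by (memory, offset, length)
  // instead of raw pointers.
  std::memcpy(in_ptr, in_data, input_bytes);
  ANeuralNetworksExecution* execution = nullptr;
  ANeuralNetworksExecution_create(compilation, &execution);
  ANeuralNetworksExecution_setInputFromMemory(execution, 0, nullptr, in_mem,
                                              0, input_bytes);
  ANeuralNetworksExecution_setOutputFromMemory(execution, 0, nullptr, out_mem,
                                               0, output_bytes);

  // Run, wait, and copy the result back out of the shared pool.
  ANeuralNetworksEvent* event = nullptr;
  bool ok = ANeuralNetworksExecution_startCompute(execution, &event) ==
                ANEURALNETWORKS_NO_ERROR &&
            ANeuralNetworksEvent_wait(event) == ANEURALNETWORKS_NO_ERROR;
  if (ok) std::memcpy(out_data, out_ptr, output_bytes);

  // Cleanup (error handling for mmap/createFromFd omitted for brevity).
  ANeuralNetworksEvent_free(event);
  ANeuralNetworksExecution_free(execution);
  ANeuralNetworksMemory_free(in_mem);
  ANeuralNetworksMemory_free(out_mem);
  munmap(in_ptr, input_bytes);
  munmap(out_ptr, output_bytes);
  close(in_fd);
  close(out_fd);
  return ok;
}
```

An `ANeuralNetworksMemory` wraps a file descriptor that the NNAPI runtime can hand to the driver, so staging inputs and outputs in a shared pool (as the delegate now does with `NNMemory`) avoids pushing each tensor buffer through `setInput`/`setOutput` individually; the delegate still pays one `memcpy` per tensor into and out of the pools.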