15 files changed, 694 insertions, 150 deletions
diff --git a/tensorflow/contrib/hvx/hvx_ops_support_checker/hvx_ops_support_checker_main.cc b/tensorflow/contrib/hvx/hvx_ops_support_checker/hvx_ops_support_checker_main.cc
index 6ae7c4a742..6af608396a 100644
--- a/tensorflow/contrib/hvx/hvx_ops_support_checker/hvx_ops_support_checker_main.cc
+++ b/tensorflow/contrib/hvx/hvx_ops_support_checker/hvx_ops_support_checker_main.cc
@@ -33,10 +33,15 @@ limitations under the License.
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
+
 namespace {
-static int ParseFlags(int argc, char* argv[], string* in_graph) {
+static int ParseFlags(int argc, char* argv[], string* in_graph,
+                      bool* dump_all_nodes, bool* dump_shape_and_type) {
   std::vector<Flag> flag_list = {
-      Flag("in_graph", in_graph, "input graph file name"),
+      Flag("in_graph", in_graph, "Input graph file name to check hvx support."),
+      Flag("dump_all_nodes", dump_all_nodes, "Dump all nodes in the model."),
+      Flag("dump_shape_and_type", dump_shape_and_type,
+           "Dump shape and type of nodes"),
   };
   CHECK(Flags::Parse(&argc, argv, flag_list));
   // We need to call this to set up global state for TensorFlow.
@@ -48,12 +53,25 @@ static int ParseFlags(int argc, char* argv[], string* in_graph) {
   return 0;
 }
 
-static void SummarizeNode(const NodeDef& node_def) {
+static void SummarizeNode(const NodeDef& node_def,
+                          const bool dump_shape_and_type) {
   LOG(INFO) << "Node(" << node_def.name() << ")";
   LOG(INFO) << "  op: " << node_def.op();
   for (const string& input : node_def.input()) {
     LOG(INFO) << " Input: " << input;
   }
+  // Shape/type details are opt-in; skip the attribute lookup otherwise.
+  if (!dump_shape_and_type) return;
+  std::vector<DataType> data_types;
+  std::vector<TensorShape> shapes;
+  const Status status = RemoteFusedGraphExecuteUtils::GetOutputTensorShapeType(
+      node_def, &data_types, &shapes);
+  if (!status.ok() || data_types.empty() || shapes.empty()) return;
+  CHECK_EQ(data_types.size(), shapes.size());
+  for (size_t i = 0; i < data_types.size(); ++i) {
+    LOG(INFO) << " Output(" << i << "): " << DataType_Name(data_types.at(i))
+              << ", " << shapes.at(i).DebugString();
+  }
 }
 
 static void DumpRemoteFusedGraph(const NodeDef& node_def) {
@@ -89,10 +107,14 @@ static void DumpRemoteFusedGraph(const NodeDef& node_def) {
   }
 }
 
-static void CheckOpsSupport(const GraphDef& graph_def) {
+static void CheckOpsSupport(const GraphDef& graph_def,
+                            const bool dump_all_nodes,
+                            const bool dump_shape_and_type) {
   const IGraphTransferOpsDefinitions& ops_definition =
       HexagonOpsDefinitions::getInstance();
   LOG(INFO) << "Checking " << graph_def.node_size() << " nodes";
+  LOG(INFO) << "dump_all_nodes = " << dump_all_nodes
+            << ", dump_shape_and_type = " << dump_shape_and_type;
 
   std::unordered_set<string> unsupported_ops;
   bool all_supported = true;
@@ -125,9 +147,9 @@ static void CheckOpsSupport(const GraphDef& graph_def) {
     LOG(INFO) << count << " ops are not supported.";
   }
 
-  if (contains_remote_graph) {
+  if (contains_remote_graph || dump_all_nodes) {
     for (const NodeDef& node : graph_def.node()) {
-      SummarizeNode(node);
+      SummarizeNode(node, dump_shape_and_type);
     }
   }
 }
@@ -137,7 +159,10 @@ static void CheckOpsSupport(const GraphDef& graph_def) {
 
 int main(int argc, char** argv) {
   tensorflow::string in_graph;
-  const int ret = tensorflow::ParseFlags(argc, argv, &in_graph);
+  bool dump_all_nodes = false;       // Stays off unless the flag is passed.
+  bool dump_shape_and_type = false;  // Stays off unless the flag is passed.
+  const int ret = tensorflow::ParseFlags(argc, argv, &in_graph, &dump_all_nodes,
+                                         &dump_shape_and_type);
   if (ret != 0) {
     return ret;
   }
@@ -146,6 +171,6 @@ int main(int argc, char** argv) {
   TF_CHECK_OK(tensorflow::graph_transforms::LoadTextOrBinaryGraphFile(
       in_graph, &graph_def));
 
-  tensorflow::CheckOpsSupport(graph_def);
+  tensorflow::CheckOpsSupport(graph_def, dump_all_nodes, dump_shape_and_type);
   return 0;
 }
diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile
index 305ed0d11e..2150cfe9ea 100644
--- a/tensorflow/contrib/makefile/Makefile
+++ b/tensorflow/contrib/makefile/Makefile
@@ -279,6 +279,16 @@ ifeq ($(TARGET),ANDROID)
 		LIBS += -lhexagon_controller
 		LDFLAGS += -L$(HEXAGON_LIBS)
 		CXXFLAGS += -DUSE_HEXAGON_LIBS
+
+# CAVEAT: We should define TENSORFLOW_DISABLE_META while running
+# quantized_matmul on Android because it crashes in
+# MultiThreadGemm in tensorflow/core/kernels/meta_support.cc
+# See http://b/33270149
+# TODO(satok): Remove once it's fixed
+		CXXFLAGS += -DTENSORFLOW_DISABLE_META
+
+# Declare __ANDROID_TYPES_FULL__ to enable required types for hvx
+		CXXFLAGS += -D__ANDROID_TYPES_FULL__
 	endif
 
 	ifdef ENABLE_EXPERIMENTAL_HEXNN_OPS
@@ -500,6 +510,18 @@ tensorflow/core/util/reporter.cc \
 tensorflow/tools/benchmark/benchmark_model.cc \
 tensorflow/tools/benchmark/benchmark_model_main.cc
 
+ifdef HEXAGON_LIBS
+	TF_CC_SRCS += \
+tensorflow/cc/framework/scope.cc \
+tensorflow/cc/framework/ops.cc \
+tensorflow/cc/ops/const_op.cc \
+tensorflow/core/kernels/hexagon/graph_transfer_utils.cc \
+tensorflow/core/kernels/hexagon/graph_transferer.cc \
+tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc \
+tensorflow/core/kernels/hexagon/hexagon_ops_definitions.cc \
+tensorflow/core/kernels/hexagon/hexagon_remote_fused_graph_executor_build.cc
+endif
+
 # File names of the intermediate files target compilation generates.
 TF_CC_OBJS := $(addprefix $(OBJDIR), $(TF_CC_SRCS:.cc=.o))
 PBT_GEN_FILES := $(addprefix $(PBTGENDIR), $(PBT_CC_SRCS))
diff --git a/tensorflow/contrib/makefile/sub_makefiles/hexagon_graph_execution/Makefile.in b/tensorflow/contrib/makefile/sub_makefiles/hexagon_graph_execution/Makefile.in
index 2a6f66edcb..9aa81144fd 100644
--- a/tensorflow/contrib/makefile/sub_makefiles/hexagon_graph_execution/Makefile.in
+++ b/tensorflow/contrib/makefile/sub_makefiles/hexagon_graph_execution/Makefile.in
@@ -34,27 +34,7 @@ $(wildcard $(GTEST_DIR)/src/*.cc) \
 $(wildcard $(GTEST_DIR)/src/*.h) \
 $(GTEST_HEADERS)
 
-# CAVEAT: We should disable TENSORFLOW_DISABLE_META while running
-# quantized_matmul on Android because it crashes in
-# MultiThreadGemm in tensorflow/core/kernels/meta_support.cc
-# TODO(satok): Remove once it's fixed
-CXXFLAGS += -DTENSORFLOW_DISABLE_META
-
-# Declare __ANDROID_TYPES_FULL__ to enable required types for hvx
-CXXFLAGS += -D__ANDROID_TYPES_FULL__
-
 GRAPH_TRANSFER_SRCS := \
-tensorflow/cc/framework/scope.cc \
-tensorflow/cc/framework/ops.cc \
-tensorflow/cc/ops/const_op.cc \
-tensorflow/core/kernels/hexagon/graph_transfer_utils.cc \
-tensorflow/core/kernels/hexagon/graph_transferer.cc \
-tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc \
-tensorflow/core/kernels/hexagon/hexagon_ops_definitions.cc \
-tensorflow/core/kernels/hexagon/hexagon_remote_fused_graph_executor_build.cc \
-tensorflow/core/kernels/remote_fused_graph_execute_op.cc \
-tensorflow/core/kernels/remote_fused_graph_execute_utils.cc \
-tensorflow/core/ops/remote_fused_graph_ops.cc \
 tensorflow/core/platform/posix/test.cc
 
 GRAPH_EXECUTION_SRCS := \
diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt
index 857d6fa21b..c73ec0305b 100644
--- a/tensorflow/contrib/makefile/tf_op_files.txt
+++ b/tensorflow/contrib/makefile/tf_op_files.txt
@@ -202,12 +202,15 @@ tensorflow/core/kernels/quantized_reshape_op.cc
 tensorflow/core/kernels/quantized_resize_bilinear_op.cc
 tensorflow/core/kernels/requantization_range_op.cc
 tensorflow/core/kernels/requantize.cc
+tensorflow/core/kernels/remote_fused_graph_execute_op.cc
+tensorflow/core/kernels/remote_fused_graph_execute_utils.cc
 tensorflow/core/ops/training_ops.cc
 tensorflow/core/ops/string_ops.cc
 tensorflow/core/ops/state_ops.cc
 tensorflow/core/ops/sparse_ops.cc
 tensorflow/core/ops/sendrecv_ops.cc
 tensorflow/core/ops/script_ops.cc
+tensorflow/core/ops/remote_fused_graph_ops.cc
 tensorflow/core/ops/random_ops.cc
 tensorflow/core/ops/random_grad.cc
 tensorflow/core/ops/parsing_ops.cc
diff --git a/tensorflow/core/kernels/hexagon/graph_transferer.cc b/tensorflow/core/kernels/hexagon/graph_transferer.cc
index d927ef3efa..055108cd00 100644
--- a/tensorflow/core/kernels/hexagon/graph_transferer.cc
+++ b/tensorflow/core/kernels/hexagon/graph_transferer.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/session.h"
@@ -43,10 +44,14 @@ const char INPUTS_NODE_PREFIX[] = "inputs_for_";
 const char OUTPUTS_NODE_PREFIX[] = "outputs_for_";
 const char DATA_NODE_PREFIX[] = "data_for_op_";
 const char CONST_SHAPE_PREFIX[] = "const_shape_";
+const char CONST_VAL_PREFIX[] = "const_val_";
+const char CONST_TENSOR_PREFIX[] = "const_tensor_";
 const char PADDING_ATTR_NAME[] = "padding";
 const char STRIDES_ATTR_NAME[] = "strides";
+const char KEEP_DIMS_ATTR_NAME[] = "keep_dims";
 const char KSIZE_ATTR_NAME[] = "ksize";
 const char NULL_OUTPUT_NAME[] = "NULL";
+const char AGGREGATED_INPUT_NODE_NAME[] = "graph_transfer_aggregated_input";
 const int PADDING_NA_ID = 0;  // VALID = 1, SAME = 2
 
 // This is a temporary workaround to support android build
@@ -58,6 +63,16 @@ static string ToString(T val) {
   return stream.str();
 }
 
+// Returns the node named ParseTensorName(name).first, or nullptr if none.
+static Node* FindMutableNodeByName(const string& name, Graph* graph) {
+  const TensorId tensor_id = ParseTensorName(name);
+  for (Node* candidate : graph->nodes()) {
+    const bool hit = candidate && candidate->name() == tensor_id.first;
+    if (hit) return candidate;
+  }
+  return nullptr;
+}
+
 /**
  * graph loading functions
  * - LoadGraphFromProto
@@ -86,13 +101,22 @@ Status GraphTransferer::LoadGraphFromProto(
     }
   }
 
+  TF_RETURN_IF_ERROR(TransformGraphToAddAggregatedInputNode(
+      input_node_info_list, &graph, &shape_refiner));
+
   std::unordered_multimap<string, const Node*> op_name_to_node_multimap(
       graph.num_nodes());
   for (const Node* const node : graph.nodes()) {
+    if (node == nullptr) {
+      continue;
+    }
     CacheNode(*node);
   }
 
   for (const Node* const node : graph.nodes()) {
+    if (node == nullptr) {
+      continue;
+    }
     VLOG(1) << "<Node> " << node->name();
     for (const Node* const input_node : node->in_nodes()) {
       const string& name = input_node->name();
@@ -102,6 +126,9 @@ Status GraphTransferer::LoadGraphFromProto(
   }
 
   for (const Node* const node : graph.nodes()) {
+    if (node == nullptr) {
+      continue;
+    }
     status = RegisterNodeIfAllInputsAreCached(
         ops_definitions, shape_refiner, *node, false, input_node_info_list,
         output_node_names);
@@ -265,19 +292,16 @@ GraphTransferInfo& GraphTransferer::GetMutableGraphTransferInfo() {
   return graph_transfer_info_;
 }
 
-int GraphTransferer::CacheNode(const Node& node) {
+void GraphTransferer::CacheNode(const Node& node) {
   if (node_name_to_id_cache_map_.count(node.name()) > 0) {
-    VLOG(1) << "Emplace node to cache failed";
-    // TODO(satok): check here?
-    return -1;
+    return;
   }
-  VLOG(1) << "Cache node: " << node.name() << ", " << node.op_def().name();
   node_name_cache_list_.emplace_back(&node);
+  const int node_id = node_name_cache_list_.size() - 1;
   bool emplace_succeeded = false;
-  std::tie(std::ignore, emplace_succeeded) = node_name_to_id_cache_map_.emplace(
-      node.name(), node_name_cache_list_.size() - 1);
+  std::tie(std::ignore, emplace_succeeded) =
+      node_name_to_id_cache_map_.emplace(node.name(), node_id);
   CHECK(emplace_succeeded);
-  return node_name_cache_list_.size() - 1;
 }
 
 bool GraphTransferer::AreAllInputsCached(const Node& node) const {
@@ -291,22 +315,124 @@ bool GraphTransferer::AreAllInputsCached(const Node& node) const {
   return true;
 }
 
+Status GraphTransferer::TransformGraphToAddAggregatedInputNode(
+    const std::vector<std::pair<string, Tensor>>& input_node_info_list,
+    Graph* graph, ShapeRefiner* shape_refiner) {
+  // Adds one aggregated input node with an output per entry of
+  // |input_node_info_list|, then replaces each original input by an identity.
+  DataTypeVector input_data_types;
+  std::vector<DataType> data_types;
+  std::vector<TensorShape> shapes;
+  std::vector<string> input_nodes;
+  for (const std::pair<string, Tensor>& input_info : input_node_info_list) {
+    Node* node = FindMutableNodeByName(input_info.first, graph);
+    CHECK_NOTNULL(node);
+    input_nodes.emplace_back(node->name());
+    input_data_types.emplace_back(input_info.second.dtype());
+    data_types.emplace_back(input_info.second.dtype());
+    shapes.emplace_back(input_info.second.shape());
+  }
+
+  auto builder =
+      NodeBuilder(AGGREGATED_INPUT_NODE_NAME, "RemoteFusedGraphExecute")
+          .Input(std::vector<NodeBuilder::NodeOut>{})
+          .Attr("Tinputs", DataTypeVector{})
+          .Attr("Toutputs", input_data_types)
+          .Attr("serialized_remote_fused_graph_execute_info", "")
+          .Attr(RemoteFusedGraphExecuteUtils::ATTR_OUTPUT_DATA_TYPES,
+                data_types)
+          .Attr(RemoteFusedGraphExecuteUtils::ATTR_OUTPUT_SHAPES, shapes);
+
+  Node* input_node = nullptr;
+  TF_RETURN_IF_ERROR(builder.Finalize(graph, &input_node));
+  CHECK_NOTNULL(input_node);
+
+  bool refined = false;
+  TF_RETURN_IF_ERROR(shape_refiner->UpdateNode(input_node, &refined));
+
+  shape_inference::InferenceContext* context =
+      shape_refiner->GetContext(input_node);
+  CHECK_NOTNULL(context);
+  for (int i = 0; i < input_node_info_list.size(); ++i) {
+    shape_inference::ShapeHandle handle;
+    TF_RETURN_IF_ERROR(context->MakeShapeFromTensorShape(
+        input_node_info_list.at(i).second.shape(), &handle));
+    TF_RETURN_IF_ERROR(shape_refiner->SetShape(input_node, i, handle));
+  }
+
+  // Cache the aggregate input node first as it's consumed first.
+  CacheNode(*input_node);
+
+  std::vector<Node*> original_input_nodes(input_nodes.size());
+
+  for (int i = 0; i < input_nodes.size(); ++i) {
+    const string& node_name = input_nodes.at(i);
+    Node* original_input_node = FindMutableNodeByName(node_name, graph);
+    CHECK_NOTNULL(original_input_node);
+    CHECK_EQ(1, original_input_node->num_outputs());  // replaced by identity.
+    Node* created_node = nullptr;
+    TF_RETURN_IF_ERROR(RemoteFusedGraphExecuteUtils::BuildIdentityOpNode(
+        node_name, AGGREGATED_INPUT_NODE_NAME, i, data_types.at(i), graph,
+        &created_node));
+    CHECK_NOTNULL(created_node);
+    std::vector<DataType> orig_types;
+    std::vector<TensorShape> orig_shapes;
+    Status status = RemoteFusedGraphExecuteUtils::GetOutputTensorShapeType(
+        original_input_node->def(), &orig_types, &orig_shapes);
+    if (status.ok()) {
+      created_node->AddAttr(
+          RemoteFusedGraphExecuteUtils::ATTR_OUTPUT_DATA_TYPES, orig_types);
+      created_node->AddAttr(RemoteFusedGraphExecuteUtils::ATTR_OUTPUT_SHAPES,
+                            orig_shapes);
+    }
+    for (const Edge* out_edge : original_input_node->out_edges()) {
+      Node* dst = out_edge->dst();
+      int dst_port = out_edge->dst_input();
+      // Unused edge will be removed when removing node.
+      graph->AddEdge(created_node, 0, dst, dst_port);
+    }
+    original_input_nodes[i] = original_input_node;
+
+    TF_RETURN_IF_ERROR(shape_refiner->UpdateNode(created_node, &refined));
+
+    shape_inference::InferenceContext* created_context =
+        shape_refiner->GetContext(created_node);
+    CHECK_NOTNULL(created_context);
+
+    // Cache replaced input node next to the aggregated input node.
+    CacheNode(*created_node);
+  }
+
+  // Remove original input nodes after adding new input nodes to avoid
+  // reusing same pointer in Graph.
+  for (Node* original_input_node : original_input_nodes) {
+    graph->RemoveNode(original_input_node);
+  }
+
+  return Status::OK();
+}
+
 Status GraphTransferer::RegisterNode(
     const IGraphTransferOpsDefinitions& ops_definitions,
     const ShapeRefiner& shape_refiner, const Node& node,
     const std::vector<std::pair<string, Tensor>>& input_node_info_list,
     const std::vector<string>& output_node_names) {
-  VLOG(1) << "Register node: " << node.name();
+  VLOG(1) << "Register node: " << node.name() << ", " << std::hex
+          << node_name_to_id_cache_map_.at(node.name());
   if (node.name() == SOURCE_NODE_NAME || node.name() == SINK_NODE_NAME) {
     // Just ignore sink and source
-    return Status();
-  } else if (RemoteFusedGraphExecuteUtils::IsInputNode(input_node_info_list,
-                                                       node.name())) {
+    return Status::OK();
+  } else if (node.name() == AGGREGATED_INPUT_NODE_NAME) {
     RegisterInputNode(ops_definitions, shape_refiner, node);
+    return Status::OK();
   } else if (node.IsConstant()) {
     RegisterConstantNode(shape_refiner, node);
+  } else if (IsPadNode(node)) {
+    RegisterPadNode(ops_definitions, shape_refiner, node);
   } else if (HasPaddingAndStrides(node)) {
     RegisterNodeWithPaddingAndStrides(ops_definitions, shape_refiner, node);
+  } else if (NeedsToAddRank(node)) {
+    RegisterNodeWithRank(ops_definitions, shape_refiner, node);
   } else if (IsNodeFlattenReshape(node, shape_refiner)) {
     RegisterFlattenNode(ops_definitions, shape_refiner, node);
   } else if (ops_definitions.GetOpIdFor(node.type_string(), {}) !=
@@ -318,7 +444,7 @@ Status GraphTransferer::RegisterNode(
                                    " has not been implemented yet.");
   }
 
-  return Status();
+  return Status::OK();
 }
 
 void GraphTransferer::RegisterConstantNode(const ShapeRefiner& shape_refiner,
@@ -361,8 +487,7 @@ void GraphTransferer::RegisterConstantNode(const ShapeRefiner& shape_refiner,
   const TensorProto* proto = nullptr;
   TF_CHECK_OK(GetNodeAttr(node.attrs(), "value", &proto));
   Tensor const_tensor;
-  // TODO(b/32704451): Don't just ignore this status!
-  MakeTensorFromProto(*proto, &const_tensor).IgnoreError();
+  TF_CHECK_OK(MakeTensorFromProto(*proto, &const_tensor));
 
   const_node_info.set_dtype(const_tensor.dtype());
   if (data_size > 0) {
@@ -394,12 +519,82 @@ int GraphTransferer::RegisterConstantShape(const std::vector<int>& shape) {
   return node_name_to_id_cache_map_[shape_name];
 }
 
+// Registers |tensor| as a const node named from |suffix| (once per name) and
+// returns its node id; the shape is left-padded with 1s to SHAPE_ARRAY_SIZE.
+int GraphTransferer::RegisterConstTensor(const Tensor& tensor,
+                                         const string& suffix) {
+  VLOG(1) << "Cache const tensor.";
+  const int dims = tensor.shape().dims();
+  CHECK(dims <= 4);
+  const string node_name = strings::StrCat(CONST_TENSOR_PREFIX, "_", suffix);
+  if (node_name_to_id_cache_map_.count(node_name) > 0) {
+    // Already registered under this name: reuse the cached id.
+    return node_name_to_id_cache_map_[node_name];
+  }
+  node_name_cache_list_.emplace_back(nullptr);
+  const int id = node_name_cache_list_.size() - 1;
+  node_name_to_id_cache_map_.emplace(node_name, id);
+  GraphTransferInfo::ConstNodeInfo& info =
+      *graph_transfer_info_.add_const_node_info();
+  info.set_name(node_name);
+  info.set_node_id(id);
+  CHECK_EQ(4, SHAPE_ARRAY_SIZE);
+  const int pad = SHAPE_ARRAY_SIZE - dims;  // Leading dimensions set to 1.
+  for (int i = 0; i < SHAPE_ARRAY_SIZE; ++i) {
+    info.add_shape(i < pad ? 1 : tensor.shape().dim_size(i - pad));
+  }
+  info.set_dtype(tensor.dtype());
+  info.set_data(tensor.tensor_data().data(),
+                tensor.tensor_data().size());
+  return id;
+}
+
+// Registers a [1,1,1,1] const node whose data is the bytes of |val|.
+int GraphTransferer::RegisterConstScalar(const DataType dt, const int val,
+                                         const int dst_id,
+                                         const int dst_input_count) {
+  VLOG(1) << "Cache const.";
+  // |val| is an int, so never copy more bytes than it actually holds.
+  CHECK_LE(DataTypeSize(dt), static_cast<int>(sizeof(val)));
+  const string val_name =
+      CONST_VAL_PREFIX + ToString(dst_id) + '_' + ToString(dst_input_count);
+  if (node_name_to_id_cache_map_.count(val_name) <= 0) {
+    node_name_cache_list_.emplace_back(nullptr);
+    const int id = node_name_cache_list_.size() - 1;
+    node_name_to_id_cache_map_.emplace(val_name, id);
+    GraphTransferInfo::ConstNodeInfo& const_node_info =
+        *graph_transfer_info_.add_const_node_info();
+    const_node_info.set_name(val_name);
+    const_node_info.set_node_id(id);
+    // TODO(satok): Do not assume rank is 4 here.
+    for (int i = 0; i < 4; ++i) const_node_info.add_shape(1);
+    const_node_info.set_data(&val, DataTypeSize(dt));
+  }
+  return node_name_to_id_cache_map_[val_name];
+}
+
 bool GraphTransferer::HasPaddingAndStrides(const Node& node) {
   auto attrs = node.attrs();
   return attrs.Find(PADDING_ATTR_NAME) != nullptr &&
          attrs.Find(STRIDES_ATTR_NAME) != nullptr;
 }
 
+// Returns true iff |node| is a "Transpose" or "ExpandDims" op, i.e. an op
+// that is registered through RegisterNodeWithRank().
+bool GraphTransferer::NeedsToAddRank(const Node& node) {
+  const string& op = node.def().op();
+  const bool is_rank_op = (op == "Transpose") || (op == "ExpandDims");
+  return is_rank_op;
+}
+
+// Returns true iff |node| is a "Pad" op; such nodes are registered through
+// RegisterPadNode().
+bool GraphTransferer::IsPadNode(const Node& node) {
+  const string& op = node.def().op();
+  const bool is_pad = (op == "Pad");
+  return is_pad;
+}
+
 bool GraphTransferer::IsNodeFlattenReshape(const Node& node,
                                            const ShapeRefiner& shape_refiner) {
   // Check if node is reshape op
@@ -473,15 +668,123 @@ void GraphTransferer::RegisterNodeWithPaddingAndStrides(
       node.num_outputs(), true /* append_input */, true /* append_output */);
 }
 
-void GraphTransferer::RegisterInputNode(
+void GraphTransferer::RegisterNodeWithRank(
     const IGraphTransferOpsDefinitions& ops_definitions,
     const ShapeRefiner& shape_refiner, const Node& node) {
-  VLOG(1) << "Register input node: " << node.name();
   CHECK_EQ(node_name_to_id_cache_map_.count(node.name()), 1);
   const int id = node_name_to_id_cache_map_[node.name()];
+  shape_inference::InferenceContext* context = shape_refiner.GetContext(&node);
+  const Node* input0_node = nullptr;
+  TF_CHECK_OK(node.input_node(0, &input0_node));
+  CHECK_NOTNULL(input0_node);
+  std::vector<TensorShape> shapes;
+  const Status status = RemoteFusedGraphExecuteUtils::GetOutputTensorShapeType(
+      input0_node->def(), nullptr, &shapes);
+  CHECK_EQ(1, shapes.size()) << "Output size should be 1. " << status;
+  const std::vector<int> extra_inputs{RegisterConstScalar(
+      DT_INT32, shapes.at(0).dims(), id, node.num_inputs())};
+  // TODO(satok): Set correct data type if it's given.
+  const int op_type_id = ops_definitions.GetOpIdFor(node.type_string(), {});
+  CHECK(op_type_id >= 0 && op_type_id < ops_definitions.GetTotalOpsCount())
+      << "Op " << node.type_string() << " not found in map(id = " << op_type_id
+      << ")";
+  bool keep_dims = false;
+  int padding_id = PADDING_NA_ID;
+  // Without an inference context there is no keep_dims attribute to read.
+  if (context != nullptr &&
+      context->GetAttr(KEEP_DIMS_ATTR_NAME, &keep_dims).ok()) {
+    padding_id = keep_dims ? Padding::SAME : Padding::VALID;
+  }
+  AppendNodeParamsWithIoParams(
+      shape_refiner, node, node.name(), id, node.type_string(), op_type_id,
+      padding_id, node.num_inputs(), extra_inputs, node.num_outputs(),
+      true /* append_input */, true /* append_output */);
+}
+
+// Registers a "Pad" node. Input 0 is wired as-is. Input 1 (paddings) must be
+// a constant [n, 2] tensor: it is wired as-is when n == PAD_WIDTH, and is
+// replaced by a zero-front-padded [PAD_WIDTH, 2] const when n < PAD_WIDTH.
+void GraphTransferer::RegisterPadNode(
+    const IGraphTransferOpsDefinitions& ops_definitions,
+    const ShapeRefiner& shape_refiner, const Node& node) {
+  static constexpr int PAD_WIDTH = 4;
+  static constexpr int PAD_HEIGHT = 2;
+  VLOG(1) << "Register pad node: " << node.name();
+  CHECK_EQ(node_name_to_id_cache_map_.count(node.name()), 1);
+  const int id = node_name_to_id_cache_map_[node.name()];
+
+  // TODO(satok): Set correct data type if it's given.
+  const int op_type_id = ops_definitions.GetOpIdFor(node.type_string(), {});
+  CHECK(op_type_id >= 0 && op_type_id < ops_definitions.GetTotalOpsCount());
+
+  CHECK_EQ(2, node.num_inputs());
+
+  GraphTransferInfo::NodeInputInfo& node_input_info =
+      *graph_transfer_info_.add_node_input_info();
+  node_input_info.set_node_id(id);
+
+  AddNodeInputByInputIndex(node, 0, &node_input_info);
+
+  const Edge* edge = nullptr;
+  TF_CHECK_OK(node.input_edge(1, &edge));
+  const Node* input_node = edge->src();
+  CHECK_NOTNULL(input_node);
+  CHECK(input_node->IsConstant());
+
+  const TensorProto* tensor_proto = nullptr;
+  TF_CHECK_OK(GetNodeAttr(input_node->def(), "value", &tensor_proto));
+  CHECK_NOTNULL(tensor_proto);
+  Tensor const_tensor;
+  TF_CHECK_OK(MakeTensorFromProto(*tensor_proto, &const_tensor));
+  CHECK_EQ(2, const_tensor.shape().dims());
+  CHECK_EQ(PAD_HEIGHT, const_tensor.shape().dim_size(1));
+  if (const_tensor.shape().dim_size(0) == PAD_WIDTH) {
+    AddNodeInputByInputIndex(node, 1, &node_input_info);
+  } else if (const_tensor.shape().dim_size(0) < PAD_WIDTH) {
+    const int width = const_tensor.shape().dim_size(0);
+    CHECK_EQ(DT_INT32, const_tensor.dtype());
+    // reshape tensor input to be rank 4.
+    // TODO(satok): Never assume rank is 4.
+    Tensor new_const_tensor(const_tensor.dtype(), TensorShape{4, 2});
+    auto src = const_tensor.matrix<int32>();
+    auto dst = new_const_tensor.matrix<int32>();
+    for (int i = 0; i < PAD_HEIGHT; ++i) {
+      for (int j = 0; j < PAD_WIDTH; ++j) {
+        if (j < PAD_WIDTH - width) {
+          dst(j, i) = 0;
+        } else {
+          dst(j, i) = src(j - (PAD_WIDTH - width), i);
+        }
+      }
+    }
+
+    const int const_tensor_id = RegisterConstTensor(
+        new_const_tensor,
+        strings::StrCat(input_node->name(), "_", node.name(), "_1"));
+
+    GraphTransferInfo::NodeInput& node_input =
+        *node_input_info.add_node_input();
+    node_input.set_node_id(const_tensor_id);
+    node_input.set_output_port(0);
+  } else {
+    LOG(FATAL) << "Pad paddings has more than " << PAD_WIDTH << " rows.";
+  }
+
+  AppendNodeParamsWithIoParams(
+      shape_refiner, node, node.name(), id, node.type_string(), op_type_id,
+      PADDING_NA_ID, node.num_inputs(), {}, node.num_outputs(),
+      false /* append_input */, true /* append_output */);
+}
+
+void GraphTransferer::RegisterInputNode(
+    const IGraphTransferOpsDefinitions& ops_definitions,
+    const ShapeRefiner& shape_refiner, const Node& node) {
   const string op_type = node.type_string();
+  VLOG(1) << "Register input node: " << node.name() << ", " << op_type;
+  CHECK_EQ(node_name_to_id_cache_map_.count(node.name()), 1);
+  const int id = node_name_to_id_cache_map_[node.name()];
   // TODO(satok): Set correct data type if it's given.
-  const int op_type_id = ops_definitions.GetOpIdFor(op_type, {});
+  const int op_type_id = ops_definitions.GetOpIdFor("INPUT", {});
   CHECK(op_type_id >= 0 && op_type_id < ops_definitions.GetTotalOpsCount())
       << "Op" << node.name() << ", " << op_type << " is not supported,"
       << op_type_id;
@@ -546,7 +849,6 @@ void GraphTransferer::AppendNodeParams(const string& name, const int id,
                                        const int padding, const int inputs_size,
                                        const std::vector<int>& extra_inputs,
                                        const int outputs_size) {
-  VLOG(1) << "Append node params: " << name;
   GraphTransferInfo::NodeInfo& node_info =
       *graph_transfer_info_.add_node_info();
   node_info.set_name(name);
@@ -559,6 +861,23 @@ void GraphTransferer::AppendNodeParams(const string& name, const int id,
   node_info.set_output_count(static_cast<int>(outputs_size));
 }
 
+// Appends to |node_input_info| the (cached node id, output port) pair that
+// feeds input |idx| of |node|.
+void GraphTransferer::AddNodeInputByInputIndex(
+    const Node& node, const int idx,
+    GraphTransferInfo::NodeInputInfo* node_input_info) {
+  const Edge* edge = nullptr;
+  TF_CHECK_OK(node.input_edge(idx, &edge));
+  const Node* input_node = edge->src();
+  CHECK_NOTNULL(input_node);
+
+  const std::string& op_name = input_node->name();
+  CHECK_GT(node_name_to_id_cache_map_.count(op_name), 0) << op_name;
+  GraphTransferInfo::NodeInput& entry = *node_input_info->add_node_input();
+  entry.set_node_id(node_name_to_id_cache_map_[op_name]);
+  entry.set_output_port(edge->src_output());
+}
+
 void GraphTransferer::AppendNodeInputParams(
     const int id, const Node& node, const std::vector<int>& extra_inputs) {
   VLOG(1) << "Append input params: " << node.name() << ", " << node.num_inputs()
@@ -567,18 +886,7 @@ void GraphTransferer::AppendNodeInputParams(
       *graph_transfer_info_.add_node_input_info();
   node_input_info.set_node_id(id);
   for (int i = 0; i < node.num_inputs(); ++i) {
-    const Edge* edge = nullptr;
-    TF_CHECK_OK(node.input_edge(i, &edge));
-    const Node* input_node = edge->src();
-    const int port = edge->src_output();
-
-    const std::string& op_name = input_node->name();
-    CHECK_GT(node_name_to_id_cache_map_.count(op_name), 0) << op_name;
-    const int src_id = node_name_to_id_cache_map_[op_name];
-    GraphTransferInfo::NodeInput& node_input =
-        *node_input_info.add_node_input();
-    node_input.set_node_id(src_id);
-    node_input.set_output_port(port);
+    AddNodeInputByInputIndex(node, i, &node_input_info);
   }
   for (const int extra_input : extra_inputs) {
     GraphTransferInfo::NodeInput& node_input =
@@ -596,9 +904,10 @@ void GraphTransferer::AppendNodeOutputParams(const ShapeRefiner& shape_refiner,
       *graph_transfer_info_.add_node_output_info();
   node_output_info.set_node_id(id);
 
+  std::vector<DataType> data_types;
   std::vector<TensorShape> shapes;
   Status status = RemoteFusedGraphExecuteUtils::GetOutputTensorShapeType(
-      node.attrs(), nullptr, &shapes);
+      node.attrs(), &data_types, &shapes);
 
   for (int i = 0; i < node.num_outputs(); ++i) {
     int data_size = -1;
@@ -608,16 +917,20 @@ void GraphTransferer::AppendNodeOutputParams(const ShapeRefiner& shape_refiner,
 
     shape_inference::InferenceContext* context =
         shape_refiner.GetContext(&node);
-    shape_inference::ShapeHandle shape_handle = context->output(output_index);
-    const shape_inference::DimensionHandle num_elements_dim =
-        context->NumElements(shape_handle);
-    if (context->ValueKnown(num_elements_dim)) {
+
+    if (context != nullptr && context->ValueKnown(context->NumElements(
+                                  context->output(output_index)))) {
+      const shape_inference::DimensionHandle num_elements_dim =
+          context->NumElements(context->output(output_index));
       const int64 num_output_elements = context->Value(num_elements_dim);
       data_size = max_bytes_per_data * num_output_elements;
+      if (status.ok()) {
+        // Cross-check the inferred element count against the shape attribute.
+        CHECK_EQ(shapes.at(i).num_elements(), num_output_elements);
+      }
     } else {
       TF_CHECK_OK(status);
       // Use attribute attached to node
-      CHECK_EQ(node.num_outputs(), shapes.size()) << node.name();
       data_size = max_bytes_per_data * shapes.at(i).num_elements();
     }
     CHECK_GE(data_size, 0);
@@ -722,11 +1035,11 @@ bool GraphTransferer::TransferParamsComparator::operator()(
   const int node_id0 = obj0.node_id();
   const int node_id1 = obj1.node_id();
   bool obj0_uses_obj1 = false;
-  if (dependency_map_.count(node_id0)) {
+  if (dependency_map_.count(node_id0) > 0) {
     obj0_uses_obj1 = dependency_map_.at(node_id0).count(node_id1) > 0;
   }
   bool obj1_uses_obj0 = false;
-  if (dependency_map_.count(node_id1)) {
+  if (dependency_map_.count(node_id1) > 0) {
     obj1_uses_obj0 = dependency_map_.at(node_id1).count(node_id0) > 0;
   }
   CHECK(!obj0_uses_obj1 || !obj1_uses_obj0);
@@ -735,7 +1048,9 @@ bool GraphTransferer::TransferParamsComparator::operator()(
   } else if (obj1_uses_obj0) {
     return true;
   }
-  return node_id0 > node_id1;
+  // If there is no dependency between the two nodes, order them by node id
+  // so that the execution order follows node id order.
+  return node_id0 < node_id1;
 }
 
 /* static */ void GraphTransferer::FillDependencyRec(
diff --git a/tensorflow/core/kernels/hexagon/graph_transferer.h b/tensorflow/core/kernels/hexagon/graph_transferer.h
index fa12b22d75..64c60b87c6 100644
--- a/tensorflow/core/kernels/hexagon/graph_transferer.h
+++ b/tensorflow/core/kernels/hexagon/graph_transferer.h
@@ -88,6 +88,9 @@ class GraphTransferer {
   // Dump verification string of parameters to verify with offline tools
   void DumpVerificationStringOfNodeTransferParams() const;
 
+  static std::array<int64, SHAPE_ARRAY_SIZE> ToTensorShapeArray(
+      const TensorShape& shape);
+
  private:
   class TransferParamsComparator {
    public:
@@ -98,10 +101,16 @@ class GraphTransferer {
     const std::unordered_map<int, std::unordered_set<int>>& dependency_map_;
   };
 
-  int CacheNode(const Node& node);
+  void CacheNode(const Node& node);
 
   bool AreAllInputsCached(const Node& node) const;
 
+  // Transform a remote fused graph to add an aggregated input node which takes
+  // all inputs of the remote graph.
+  Status TransformGraphToAddAggregatedInputNode(
+      const std::vector<std::pair<string, Tensor>>& input_node_info_list,
+      Graph* graph, ShapeRefiner* shape_refiner);
+
   Status RegisterNode(
       const IGraphTransferOpsDefinitions& ops_definitions,
       const ShapeRefiner& shape_refiner, const Node& node,
@@ -113,8 +122,17 @@ class GraphTransferer {
 
   int RegisterConstantShape(const std::vector<int>& shape);
 
+  int RegisterConstTensor(const Tensor& tensor, const string& suffix);
+
+  int RegisterConstScalar(const DataType dt, const int val, const int dst_id,
+                          const int dst_input_count);
+
   bool HasPaddingAndStrides(const Node& node);
 
+  bool NeedsToAddRank(const Node& node);
+
+  bool IsPadNode(const Node& node);
+
   // Return true if the node is a reshape op which just flattens input
   // TODO(satok): Remove this method once generic reshape op is implemented in
   // SOC
@@ -125,6 +143,13 @@ class GraphTransferer {
       const IGraphTransferOpsDefinitions& ops_definitions,
       const ShapeRefiner& shape_refiner, const Node& node);
 
+  void RegisterNodeWithRank(const IGraphTransferOpsDefinitions& ops_definitions,
+                            const ShapeRefiner& shape_refiner,
+                            const Node& node);
+
+  void RegisterPadNode(const IGraphTransferOpsDefinitions& ops_definitions,
+                       const ShapeRefiner& shape_refiner, const Node& node);
+
   void RegisterInputNode(const IGraphTransferOpsDefinitions& ops_definitions,
                          const ShapeRefiner& shape_refiner,
                          const Node& node);
@@ -150,6 +175,10 @@ class GraphTransferer {
                         const std::vector<int>& extra_inputs,
                         const int outputs_size);
 
+  void AddNodeInputByInputIndex(
+      const Node& node, const int idx,
+      GraphTransferInfo::NodeInputInfo* node_input_info);
+
   void AppendNodeInputParams(const int id, const Node& node,
                              const std::vector<int>& extra_inputs);
 
@@ -167,9 +196,6 @@ class GraphTransferer {
       const int outputs_size, const bool append_input_params,
       const bool append_output_params);
 
-  static std::array<int64, SHAPE_ARRAY_SIZE> ToTensorShapeArray(
-      const TensorShape& shape);
-
   static string ToPaddingDebugString(int padding);
 
   // Create dependency map
diff --git a/tensorflow/core/kernels/hexagon/graph_transferer_test.cc b/tensorflow/core/kernels/hexagon/graph_transferer_test.cc
index ebd4a90330..74ffc026f7 100644
--- a/tensorflow/core/kernels/hexagon/graph_transferer_test.cc
+++ b/tensorflow/core/kernels/hexagon/graph_transferer_test.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/hexagon/i_graph_transfer_ops_definitions.h"
 #include "tensorflow/core/kernels/i_remote_fused_graph_executor.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session.h"
@@ -47,21 +48,19 @@ class GraphTransfererTest : public ::testing::Test {
   GraphTransferer gt_;
 };
 
-static const std::vector<string> OP_TYPES{
-    "INPUT", "OUTPUT", "Conv2D", "MaxPool", "NoOp", "Add", "Const", "Softmax"};
 const RemoteFusedGraphExecuteUtils::TensorShapeMap EMPTY_OUTPUT_TENSOR_MAP;
 
 class TestGraphTransferOpsDefinitions : public IGraphTransferOpsDefinitions {
  public:
-  int GetTotalOpsCount() const final { return OP_TYPES.size(); }
+  int GetTotalOpsCount() const final { return op_types_.size(); }
 
-int GetOpIdFor(const string& op_type, const DataTypeVector&) const final {
-  for (int i = 0; i < OP_TYPES.size(); ++i) {
-    if (OP_TYPES[i] == op_type) {
-      return i;
+  int GetOpIdFor(const string& op_type, const DataTypeVector&) const final {
+    for (int i = 0; i < op_types_.size(); ++i) {
+      if (op_types_[i] == op_type) {
+        return i;
+      }
     }
-  }
-  return -1;
+    return -1;
 }
 
 GraphTransferInfo::Destination GetTransferDestination() const final {
@@ -69,6 +68,9 @@ GraphTransferInfo::Destination GetTransferDestination() const final {
   }
 
  private:
+  const std::vector<string> op_types_{"INPUT",   "OUTPUT",  "Conv2D",
+                                      "MaxPool", "NoOp",    "Add",
+                                      "Const",   "Softmax", "Identity"};
 } TEST_GRAPH_TRANSFER_OPS_DEFINITIONS;
 
 static Output BuildAddOps(const Scope& scope, const Input& x, const Input& y) {
@@ -312,7 +314,7 @@ TEST_F(GraphTransfererTest, LoadAddGraphWithOutputTensorMap) {
   const std::vector<string> output_node_names = {NAME_A_PLUS_B};
   status = gt_.LoadGraphFromProto(TEST_GRAPH_TRANSFER_OPS_DEFINITIONS, def,
                                   inputs, output_node_names, false);
-  ASSERT_TRUE(status.ok());
+  TF_ASSERT_OK(status);
 }
 
 TEST_F(GraphTransfererTest, LoadConvGraph) {
@@ -330,7 +332,7 @@ TEST_F(GraphTransfererTest, LoadConvGraph) {
       gt_.GetGraphTransferInfo().const_node_info_size();
   ASSERT_EQ(2, const_node_count);
   const int op_node_count = gt_.GetGraphTransferInfo().node_info_size();
-  ASSERT_EQ(3, op_node_count);
+  ASSERT_EQ(4, op_node_count);
   const GraphTransferInfo::NodeInfo* params_conv = FindNodeInfo(gt_, "conv");
   ASSERT_TRUE(params_conv != nullptr);
   const int id = params_conv->node_id();
@@ -356,7 +358,7 @@ TEST_F(GraphTransfererTest, LoadMaxPoolGraph) {
       gt_.GetGraphTransferInfo().const_node_info_size();
   ASSERT_EQ(2, const_node_count);
   const int op_node_count = gt_.GetGraphTransferInfo().node_info_size();
-  ASSERT_EQ(3, op_node_count);
+  ASSERT_EQ(4, op_node_count);
   const GraphTransferInfo::NodeInfo* params_max_pool =
       FindNodeInfo(gt_, "maxpool");
   ASSERT_TRUE(params_max_pool != nullptr);
diff --git a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc
index 518b399c37..660ffd268d 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc
+++ b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc
@@ -27,6 +27,8 @@ namespace tensorflow {
 constexpr const char* const INPUT_OP_NAME = "INPUT";
 constexpr const char* const OUTPUT_OP_NAME = "OUTPUT";
 
+constexpr int ALIGNMENT_BYTES = 16;
+
 const bool DBG_DUMP_VERIFICATION_STRING = false;
 const int DBG_LEVEL = 0;  // -2: verbose, -1: debug, 0: info
 const bool DBG_USE_DUMMY_INPUT = false;
@@ -34,6 +36,22 @@ const bool DBG_USE_SAMPLE_INPUT = false;
 const int64 FLAG_ENABLE_PANDA_BINARY_INPUT = 0x01;
 const bool DBG_DUMP_INPUT_TENSOR_AS_FLOAT_DATA = false;
 
+// Returns node_name unchanged when it already contains a ':' separator;
+// otherwise returns node_name with ":0" appended.
+static string AddPort(const string& node_name) {
+  const bool has_port = node_name.find(':') != string::npos;
+  if (has_port) return node_name;
+  return strings::StrCat(node_name, ":", 0);
+}
+
+// Returns the first address at or after ptr that is a multiple of
+// ALIGNMENT_BYTES.  The result is at most ALIGNMENT_BYTES - 1 bytes past ptr.
+static uint8* FindAlignedPointer(uint8* ptr) {
+  const uintptr_t address = reinterpret_cast<uintptr_t>(ptr);
+  const int misalignment = static_cast<int>(address % ALIGNMENT_BYTES);
+  return misalignment == 0 ? ptr : ptr + (ALIGNMENT_BYTES - misalignment);
+}
+
 /* static */ GraphTransferInfo::NodeInfo* HexagonControlWrapper::FindNodeInfo(
     const string& name, GraphTransferInfo* graph_transfer_info) {
   for (GraphTransferInfo::NodeInfo& node_info :
@@ -60,18 +78,57 @@ bool HexagonControlWrapper::Init(const RemoteFusedGraphExecuteInfo& info) {
     std::vector<string> outputs;
     RemoteFusedGraphExecuteUtils::BuildRemoteGraphInputsAndOutputsFromProto(
         info, &inputs, &outputs);
-    graph_transferer_.LoadGraphFromProto(
+    Status status = graph_transferer_.LoadGraphFromProto(
         HexagonOpsDefinitions::getInstance(), info.remote_graph(), inputs,
         outputs,
         false  // shape_inference_for_unknown_shape
-        );
+    );
+    TF_CHECK_OK(status) << status;
   } else {
     // If graph transfer info is attached, just import it.
     graph_transferer_.SetSerializedGraphTransferInfo(
         info.serialized_executor_parameters());
   }
   execute_info_ = &info;
-  return soc_interface_Init();
+  bool success = soc_interface_Init();
+  if (!success) {
+    LOG(ERROR) << "Hexagon initialization was failed.  See log output.";
+    return false;
+  }
+  const GraphTransferInfo& gt_info = graph_transferer_.GetGraphTransferInfo();
+  std::vector<int> input_sizes;
+  std::vector<int> output_sizes;
+  CHECK_NOTNULL(execute_info_);
+  for (int i = 0; i < execute_info_->graph_input_node_name_size(); ++i) {
+    const string& input = execute_info_->graph_input_node_name(i);
+    LOG(INFO) << "Add input: " << input << ", " << i;
+    CHECK(input_port_map_.emplace(AddPort(input), i).second);
+    const RemoteFusedGraphExecuteInfo::TensorShapeTypeProto& shape_type =
+        execute_info_->default_graph_input_tensor_shape(i);
+    int64 buf_size = DataTypeSize(shape_type.dtype());
+    for (const TensorShapeProto::Dim& dim : shape_type.shape().dim()) {
+      buf_size *= dim.size();
+    }
+    input_sizes.emplace_back(static_cast<int>(buf_size));
+  }
+  for (int i = 0; i < execute_info_->graph_output_node_name_size(); ++i) {
+    const string& output = execute_info_->graph_output_node_name(i);
+    CHECK(output_port_map_.emplace(AddPort(output), i).second);
+    const RemoteFusedGraphExecuteInfo::TensorShapeTypeProto& shape_type =
+        execute_info_->default_graph_output_tensor_shape(i);
+
+    int64 buf_size = DataTypeSize(shape_type.dtype());
+    for (const TensorShapeProto::Dim& dim : shape_type.shape().dim()) {
+      buf_size *= dim.size();
+    }
+    output_sizes.emplace_back(static_cast<int>(buf_size));
+  }
+
+  LOG(INFO) << "Allocate inout buffer";
+  success &= soc_interface_AllocateInOutNodeBuffers(
+      input_sizes.size(), input_sizes.data(), output_sizes.size(),
+      output_sizes.data());
+  return success;
 }
 
 bool HexagonControlWrapper::Finalize() { return soc_interface_Finalize(); }
@@ -86,9 +143,6 @@ bool HexagonControlWrapper::SetupGraph() {
     GraphTransferInfo::NodeInfo* node_info =
         FindNodeInfo(graph_input.name(), &graph_transfer_info);
     CHECK_NE(node_info, nullptr);
-    node_info->set_type_name(INPUT_OP_NAME);
-    node_info->set_soc_op_id(
-        HexagonOpsDefinitions::getInstance().GetOpIdFor(INPUT_OP_NAME, {}));
   }
 
   // Generate a new output node which is connected to graph output node
@@ -202,12 +256,8 @@ bool HexagonControlWrapper::SetupGraph() {
     auto data = dummy_const_data_.emplace(
         std::piecewise_construct, std::make_tuple(node_id), std::make_tuple());
     CHECK(data.second);
-    const int additional_bytes_for_alignment = 16;
-    data.first->second.resize(data_size + additional_bytes_for_alignment - 1);
-    const uintptr_t data_ptr_int =
-        reinterpret_cast<uintptr_t>(data.first->second.data());
-    const int shift_count = (16 - data_ptr_int % 16) % 16;
-    uint8* data_ptr = data.first->second.data() + shift_count;
+    data.first->second.resize(data_size + ALIGNMENT_BYTES - 1);
+    uint8* data_ptr = FindAlignedPointer(data.first->second.data());
     std::memcpy(data_ptr, params.data().data(), data_size);
     soc_interface_AppendConstNode(params.name().c_str(),
                                   node_id + NODE_ID_OFFSET, shape_0, shape_1,
@@ -267,27 +317,37 @@ bool HexagonControlWrapper::TeardownGraph() {
   return soc_interface_TeardownGraph();
 }
 
-bool HexagonControlWrapper::FillInputNode(const string& node_name,
-                                          const ConstByteArray bytes) {
-  uint64 byte_size;
-  const int x = 1;
-  const int y = 299;
-  const int z = 299;
-  const int d = 3;
+// Stages the input bytes in an aligned buffer owned by the input port mapped
+// to node_name and passes that buffer to soc_interface_FillInputNodeWithPort.
+bool HexagonControlWrapper::FillInputNode(
+    const string& node_name,
+    const std::array<int64, GraphTransferer::SHAPE_ARRAY_SIZE>& shape,
+    const ConstByteArray bytes) {
+  const string tensor_name = AddPort(node_name);
+  CHECK(input_port_map_.count(tensor_name) > 0);
+  const int port = input_port_map_.at(tensor_name);
+  // operator[] creates the per-port staging buffer on first use.
+  std::vector<uint8>& input_tensor_data = input_tensor_data_[port];
+  // hexagon only supports 32bit dimension
+  const int x = static_cast<int>(shape[0]);
+  const int y = static_cast<int>(shape[1]);
+  const int z = static_cast<int>(shape[2]);
+  const int d = static_cast<int>(shape[3]);
+  // Multiply in 64 bits; the product of four ints can overflow int.
+  const uint64 byte_size =
+      static_cast<uint64>(x) * y * z * d * DataTypeSize(std::get<2>(bytes));
+  CHECK_EQ(byte_size, std::get<1>(bytes));
+  // Over-allocate so that an aligned pointer always fits inside the buffer.
+  input_tensor_data.resize(byte_size + ALIGNMENT_BYTES);
+  uint8* data_ptr = FindAlignedPointer(input_tensor_data.data());
   if (DBG_USE_DUMMY_INPUT) {
-    const int array_length = x * y * z * d;
-    byte_size = array_length * sizeof(float);
-    dummy_input_float_.resize(array_length);
-    std::memset(dummy_input_float_.data(), 0, byte_size);
+    std::memset(data_ptr, 0, byte_size);
   } else {
-    CHECK(std::get<2>(bytes) == DT_FLOAT);
-    byte_size = std::get<1>(bytes);
-    dummy_input_float_.resize(byte_size / sizeof(float));
-    std::memcpy(dummy_input_float_.data(), std::get<0>(bytes), byte_size);
+    std::memcpy(data_ptr, std::get<0>(bytes), byte_size);
   }
-  return soc_interface_FillInputNodeFloat(
-      x, y, z, d, reinterpret_cast<uint8*>(dummy_input_float_.data()),
-      byte_size);
+
+  return soc_interface_FillInputNodeWithPort(port, x, y, z, d, data_ptr,
+                                             byte_size);
 }
 
 bool HexagonControlWrapper::ReadOutputNode(
@@ -304,26 +364,28 @@ bool HexagonControlWrapper::ReadOutputNode(
       break;
     }
   }
-  std::vector<IRemoteFusedGraphExecutor::ByteArray> outputs;
+  std::vector<ByteArray> outputs;
   ReadOutputNode(node_name, &outputs);
   CHECK_EQ(1, outputs.size());
-  IRemoteFusedGraphExecutor::ByteArray& output = outputs[0];
+  ByteArray& output = outputs[0];
   Tensor* output_tensor = tensor_allocator(output_shape);
   CHECK(output_tensor->TotalBytes() >= std::get<1>(output))
       << output_tensor->TotalBytes() << ", " << std::get<1>(output);
-  // TODO(satok): Avoid specifying float
-  std::memcpy(output_tensor->flat<float>().data(), std::get<0>(output),
-              std::get<1>(output));
+  TF_CHECK_OK(RemoteFusedGraphExecuteUtils::CopyByteArrayToTensor(
+      std::get<0>(output), std::get<1>(output), output_tensor));
 }
 
 bool HexagonControlWrapper::ReadOutputNode(
     const string& node_name, std::vector<ByteArray>* const outputs) {
   CHECK(outputs != nullptr);
   ByteArray output;
-  soc_interface_ReadOutputNodeFloat(node_name.c_str(), &std::get<0>(output),
-                                    &std::get<1>(output));
+  const string tensor_name = AddPort(node_name);
+  CHECK(output_port_map_.count(tensor_name) > 0);
+  const int port = output_port_map_.at(tensor_name);
+  const bool success = soc_interface_ReadOutputNodeWithPort(
+      port, &std::get<0>(output), &std::get<1>(output));
   // TODO: Accept all results
-  std::get<2>(output) = DT_FLOAT;
+  // The SOC call reports no element type; get<2>(output) keeps its default.
   outputs->emplace_back(output);
-  return true;
+  return success;
 }
@@ -347,7 +409,9 @@ bool HexagonControlWrapper::FillInputNode(const string& node_name,
       }
     }
   }
-  FillInputNode(node_name, ba);
+  const std::array<int64, GraphTransferer::SHAPE_ARRAY_SIZE> shape =
+      GraphTransferer::ToTensorShapeArray(tensor.shape());
+  FillInputNode(node_name, shape, ba);
   return true;
 }
 
@@ -360,7 +424,9 @@ bool HexagonControlWrapper::Finalize() { return false; }
 bool HexagonControlWrapper::SetupGraph() { return false; }
 bool HexagonControlWrapper::ExecuteGraph() { return false; }
 bool HexagonControlWrapper::TeardownGraph() { return false; }
-bool HexagonControlWrapper::FillInputNode(const string&, const ConstByteArray) {
+bool HexagonControlWrapper::FillInputNode(
+    const string&, const std::array<int64, GraphTransferer::SHAPE_ARRAY_SIZE>&,
+    const ConstByteArray) {
   return false;
 }
 bool HexagonControlWrapper::FillInputNode(const string&, const Tensor&) {
diff --git a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h
index 97448884e1..209ac9dbf4 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h
+++ b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_CONTROL_WRAPPER_H_
 #define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_CONTROL_WRAPPER_H_
 
+#include <unordered_map>
 #include <vector>
 
 #include "tensorflow/core/framework/types.h"
@@ -32,6 +33,9 @@ namespace tensorflow {
  */
 class HexagonControlWrapper final : public IRemoteFusedGraphExecutor {
  public:
+  using ByteArray =
+      std::tuple<uint8* /* data */, uint64 /* size */, DataType /* type */>;
+
   HexagonControlWrapper() = default;
   int GetVersion() final;
   bool Init(const RemoteFusedGraphExecuteInfo& info) final;
@@ -45,7 +49,13 @@ class HexagonControlWrapper final : public IRemoteFusedGraphExecutor {
   bool ReadOutputNode(const string& node_name, std::vector<ByteArray>* outputs);
 
  private:
-  bool FillInputNode(const string& node_name, const ConstByteArray bytes);
+  using ConstByteArray = std::tuple<const uint8* /* data */, uint64 /* size */,
+                                    DataType /* type */>;
+
+  bool FillInputNode(
+      const string& node_name,
+      const std::array<int64, GraphTransferer::SHAPE_ARRAY_SIZE>& shape,
+      const ConstByteArray bytes);
 
   // CAVEAT: Need offset as HVX library reserves some ids
   static constexpr int NODE_ID_OFFSET = 0x10000;
@@ -57,11 +67,15 @@ class HexagonControlWrapper final : public IRemoteFusedGraphExecutor {
   GraphTransferer graph_transferer_{};
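-  // Dummy float array for input node.
-  // TODO(satok): Use actual data passed by FillInputNode and remove
-  std::vector<float> dummy_input_float_{};
+  // Buffers holding the data passed to FillInputNode, keyed by input port.
+  // Each buffer is over-allocated so that an aligned pointer can be found
+  // inside it before the data is handed to the SOC interface.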
+  std::unordered_map<int, std::vector<uint8>> input_tensor_data_{};
-  // Dummy byte array for cosnt node.
+  // Dummy byte array for const node.
   // TODO(satok): Remove
   std::unordered_map<int, std::vector<uint8>> dummy_const_data_{};
 
+  std::unordered_map<string, int> input_port_map_{};
+  std::unordered_map<string, int> output_port_map_{};
+
   TF_DISALLOW_COPY_AND_ASSIGN(HexagonControlWrapper);
 };
 
diff --git a/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc b/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc
index 54ba101501..cb9091e29f 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc
+++ b/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc
@@ -46,8 +46,7 @@ adb push /tmp/imagenet_comp_graph_label_strings.txt /data/local/tmp
 
 namespace tensorflow {
 
-using ByteArray = IRemoteFusedGraphExecutor::ByteArray;
-using ConstByteArray = IRemoteFusedGraphExecutor::ConstByteArray;
+using ByteArray = HexagonControlWrapper::ByteArray;
 
 constexpr const char* const IMAGE_FILENAME = "/data/local/tmp/img_299x299.bmp";
 constexpr const char* const MODEL_FILENAME =
@@ -87,8 +86,7 @@ static void DumpTop10Results(const int byte_size,
       10 /* show top_n results */);
 }
 
-static void DumpTop10Results(
-    const std::vector<IRemoteFusedGraphExecutor::ByteArray>& outputs) {
+static void DumpTop10Results(const std::vector<ByteArray>& outputs) {
   CHECK(outputs.size() == 1);
   const int byte_size = std::get<1>(outputs.at(0));
   const float* float_array =
@@ -96,9 +94,8 @@ static void DumpTop10Results(
   DumpTop10Results(byte_size, float_array);
 }
 
-static void CheckFirstResult(
-    const std::vector<IRemoteFusedGraphExecutor::ByteArray>& outputs,
-    const int expected_first_id) {
+static void CheckFirstResult(const std::vector<ByteArray>& outputs,
+                             const int expected_first_id) {
   EXPECT_GE(outputs.size(), 1);
   const int byte_size = std::get<1>(outputs.at(0));
   const int element_count = byte_size / sizeof(float);
@@ -240,7 +237,7 @@ static void RunInferenceByHexagonControlWrapper(
   }
 
   // 5-1. Read output node's outputs
-  std::vector<IRemoteFusedGraphExecutor::ByteArray> outputs;
+  std::vector<ByteArray> outputs;
   hexagon_control_wrapper.ReadOutputNode("softmax", &outputs);
 
   // 5-2. Dump results
diff --git a/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.cc b/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.cc
index a4b79e6ec4..2b7585aed1 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.cc
+++ b/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.cc
@@ -350,6 +350,8 @@ HexagonOpsDefinitions::BuildOpNameToSocOpTypeMap() {
 #ifdef ENABLE_EXPERIMENTAL_HEXNN_OPS
   EmplaceOpType("QuantizedMul", {}, SupportedOpType::QUANTIZED_MUL_8x8to32,
                 &op_map);
+  EmplaceOpType("QuantizedAdd", {}, SupportedOpType::QUANTIZED_ADD_8p8to32,
+                &op_map);
   EmplaceOpType("Pad", {}, SupportedOpType::PAD_F, &op_map);
   EmplaceOpType("SpaceToBatchND", {}, SupportedOpType::SPACE_TO_BATCH_ND_F,
-                &op_map),
+                &op_map);
@@ -359,6 +361,11 @@ HexagonOpsDefinitions::BuildOpNameToSocOpTypeMap() {
                 &op_map);
   EmplaceOpType("ConcatV2", {}, SupportedOpType::CONCAT_V2_F, &op_map);
   EmplaceOpType("Conv2DBackpropInput", {}, SupportedOpType::DECONV_F, &op_map);
+
+  EmplaceOpType("Tanh", {}, SupportedOpType::TANH_F, &op_map);
+  EmplaceOpType("Split", {}, SupportedOpType::SPLIT_F, &op_map);
+  EmplaceOpType("Transpose", {}, SupportedOpType::TRANSPOSE_F, &op_map);
+  EmplaceOpType("Concat", {}, SupportedOpType::CONCAT_F, &op_map);
 #endif
   return op_map;
 };
diff --git a/tensorflow/core/kernels/i_remote_fused_graph_executor.h b/tensorflow/core/kernels/i_remote_fused_graph_executor.h
index fe62a259de..09d1f43ff1 100644
--- a/tensorflow/core/kernels/i_remote_fused_graph_executor.h
+++ b/tensorflow/core/kernels/i_remote_fused_graph_executor.h
@@ -25,10 +25,6 @@ namespace tensorflow {
 
 class IRemoteFusedGraphExecutor {
  public:
-  using ByteArray =
-      std::tuple<uint8* /* data */, uint64 /* size */, DataType /* type */>;
-  using ConstByteArray = std::tuple<const uint8* /* data */, uint64 /* size */,
-                                    DataType /* type */>;
   using TensorAllocatorFunc = std::function<Tensor*(const TensorShape& shape)>;
 
   IRemoteFusedGraphExecutor() = default;
diff --git a/tensorflow/core/kernels/remote_fused_graph_execute_utils.cc b/tensorflow/core/kernels/remote_fused_graph_execute_utils.cc
index 103b2be691..dd9839d245 100644
--- a/tensorflow/core/kernels/remote_fused_graph_execute_utils.cc
+++ b/tensorflow/core/kernels/remote_fused_graph_execute_utils.cc
@@ -1280,6 +1280,69 @@ RemoteFusedGraphExecuteUtils::FuseRemoteGraphByPlacedArguments(
   return true;
 }
 
+/* static */ Status RemoteFusedGraphExecuteUtils::CopyByteArrayToTensor(
+    const void* src_ptr, const int src_size, Tensor* tensor) {
+  CHECK(tensor->TotalBytes() >= src_size)
+      << tensor->TotalBytes() << ", " << src_size;
+  void* dst_ptr = nullptr;  // Set by the switch below for supported dtypes.
+  switch (tensor->dtype()) {
+    case DT_FLOAT:
+      dst_ptr = tensor->flat<float>().data();
+      break;
+    case DT_DOUBLE:
+      dst_ptr = tensor->flat<double>().data();
+      break;
+    case DT_INT32:
+      dst_ptr = tensor->flat<int32>().data();
+      break;
+    case DT_UINT8:
+      dst_ptr = tensor->flat<uint8>().data();
+      break;
+    case DT_INT16:
+      dst_ptr = tensor->flat<int16>().data();
+      break;
+    case DT_INT8:
+      dst_ptr = tensor->flat<int8>().data();
+      break;
+    case DT_STRING:
+      CHECK(false) << "string tensors can't be overwritten by raw bytes.";
+      break;
+    case DT_INT64:
+      dst_ptr = tensor->flat<int64>().data();
+      break;
+    case DT_BOOL:
+      dst_ptr = tensor->flat<bool>().data();
+      break;
+    case DT_QINT8:
+      dst_ptr = tensor->flat<qint8>().data();
+      break;
+    case DT_QUINT8:
+      dst_ptr = tensor->flat<quint8>().data();
+      break;
+    case DT_QINT32:
+      dst_ptr = tensor->flat<qint32>().data();
+      break;
+    case DT_BFLOAT16:
+      dst_ptr = tensor->flat<bfloat16>().data();
+      break;
+    case DT_QINT16:
+      dst_ptr = tensor->flat<qint16>().data();
+      break;
+    case DT_QUINT16:
+      dst_ptr = tensor->flat<quint16>().data();
+      break;
+    case DT_UINT16:
+      dst_ptr = tensor->flat<uint16>().data();
+      break;
+    default:
+      CHECK(false) << "type " << tensor->dtype() << " is not supported.";
+      break;
+  }
+  CHECK_NOTNULL(dst_ptr);
+  std::memcpy(dst_ptr, src_ptr, src_size);  // Untyped overwrite of the buffer.
+  return Status::OK();
+}
+
 /* static */ Status RemoteFusedGraphExecuteUtils::ReplaceInputNodeByPlaceHolder(
     const string& input, const DataType type, const TensorShape& shape,
     GraphDef* graph_def) {
diff --git a/tensorflow/core/kernels/remote_fused_graph_execute_utils.h b/tensorflow/core/kernels/remote_fused_graph_execute_utils.h
index a80fc79784..1d4423ed46 100644
--- a/tensorflow/core/kernels/remote_fused_graph_execute_utils.h
+++ b/tensorflow/core/kernels/remote_fused_graph_execute_utils.h
@@ -157,7 +157,7 @@ class RemoteFusedGraphExecuteUtils {
       const std::vector<std::pair<string, Tensor>>& input_tensors,
       const bool dry_run_inference, GraphDef* graph_def);
 
-  // Build remote fused graph execute info
+  // Build remote fused graph execute info.
   static Status BuildRemoteFusedGraphExecuteInfo(
       const string& executor_name, const GraphDef& subgraph_def,
       const std::vector<string>& inputs, const std::vector<string>& outputs,
@@ -165,31 +165,31 @@ class RemoteFusedGraphExecuteUtils {
       DataTypeVector* input_types, DataTypeVector* output_types);
 
   // Build remote fused graph execute op node by fusing specified subgraph
-  // as remote fused graph execute info
+  // as remote fused graph execute info.
   static Status BuildRemoteFusedGraphExecuteOpNode(
       const string& node_name, const string& executor_name,
       const GraphDef& subgraph_def, const std::vector<string>& inputs,
       const std::vector<string>& outputs, const bool require_shape_type,
       Graph* graph, Node** created_node);
 
-  // Build Identity node to forward remote graph node output
+  // Build Identity node to forward remote graph node output.
   static Status BuildIdentityOpNode(const string& node_name,
                                     const string& input_node_name,
                                     const int input_node_port,
                                     const DataType dt, Graph* graph,
                                     Node** created_node);
 
-  // Create clusters of given nodes
+  // Create clusters of given nodes.
   static Status ClusterizeNodes(const std::unordered_set<string>& node_names,
                                 const GraphDef& graph_def,
                                 std::vector<ClusterInfo>* cluster_infos);
 
-  // Build GraphDef of a given cluster
+  // Build GraphDef of a given cluster.
   static Status BuildClusterSubgraphDef(const ClusterInfo& cluster,
                                         const GraphDef& graph_def,
                                         GraphDef* subgraph_def);
 
-  // Build a cluster by given border
+  // Build a cluster by given border.
   // CAVEAT: The border must be consistent for one cluster.
   static Status BuildClusterByBorder(const std::vector<string>& border_inputs,
                                      const std::vector<string>& border_outputs,
@@ -211,7 +211,7 @@ class RemoteFusedGraphExecuteUtils {
                             const bool require_shape_type,
                             GraphDef* output_graph_def);
 
-  // Fuse subgraph of specified nodes
+  // Fuse subgraph of specified nodes.
   static Status FuseRemoteGraphByNodeNames(
       const GraphDef& input_graph_def, const std::vector<string>& inputs,
       const std::vector<string>& outputs,
@@ -220,7 +220,7 @@ class RemoteFusedGraphExecuteUtils {
       const string& remote_fused_graph_executor_name,
       const bool require_shape_type, GraphDef* output_graph_def);
 
-  // Fuse subgraph of specified border
+  // Fuse subgraph of specified border.
   static Status FuseRemoteGraphByBorder(
       const GraphDef& input_graph_def, const std::vector<string>& inputs,
       const std::vector<string>& outputs,
@@ -230,7 +230,7 @@ class RemoteFusedGraphExecuteUtils {
       const string& remote_graph_executor_name, const bool require_shape_type,
       GraphDef* output_graph_def);
 
-  // Place arguments to fuse remote graph
+  // Place arguments to fuse remote graph.
   static Status PlaceRemoteGraphArguments(
       const std::vector<string>& inputs, const std::vector<string>& outputs,
       const std::unordered_set<string>& fused_node_names,
@@ -239,7 +239,7 @@ class RemoteFusedGraphExecuteUtils {
       const string& remote_fused_graph_node_name,
       const string& remote_graph_executor_name, GraphDef* graph_def);
 
-  // Fuse remote graph by placed arguments
+  // Fuse remote graph by placed arguments.
   static Status FuseRemoteGraphByPlacedArguments(
       const GraphDef& input_graph_def,
       const std::vector<std::pair<string, Tensor>>& input_tensors,
@@ -249,6 +249,15 @@ class RemoteFusedGraphExecuteUtils {
       const GraphDef& input_graph_def,
       const std::vector<std::pair<string, Tensor>>& input_tensors);
 
+  // Copy a byte array into a tensor's data buffer.  Although tensor data
+  // should in general be updated with typed information, we can't guarantee
+  // that values returned from a remote processor carry type information,
+  // because the logic running on the remote processor may live in a separate
+  // binary that does not link the tensorflow libraries.  To deal with this
+  // situation, the remote fused graph overwrites the tensor data bytewise.
+  static Status CopyByteArrayToTensor(const void* src_ptr, const int src_size,
+                                      Tensor* tensor);
+
  private:
   static void EmplaceTensorShapeType(const string& name, const Tensor& tensor,
                                      TensorShapeMap* tensor_shape_map);
diff --git a/tensorflow/core/platform/hexagon/soc_interface.h b/tensorflow/core/platform/hexagon/soc_interface.h
index f4a3cdf4bd..ca37b63e2b 100644
--- a/tensorflow/core/platform/hexagon/soc_interface.h
+++ b/tensorflow/core/platform/hexagon/soc_interface.h
@@ -22,6 +22,8 @@ limitations under the License.
 // naming conflicts.
 #ifdef __cplusplus
 extern "C" {
+#else
+#include <stdbool.h>
 #endif  // __cplusplus
 // Returns the version of loaded hexagon wrapper shared library.
 // You should assert that the version matches the expected version before
@@ -39,13 +41,30 @@ bool soc_interface_Finalize();
 bool soc_interface_ExecuteGraph();
 // Teardown graph setup
 bool soc_interface_TeardownGraph();
+
+// Allocate buffers for input node and output node
+bool soc_interface_AllocateInOutNodeBuffers(int input_count, int* input_sizes,
+                                            int output_count,
+                                            int* output_sizes);
+
+// Send input data to SOC with port
+bool soc_interface_FillInputNodeWithPort(int port, int x, int y, int z, int d,
+                                         const uint8_t* const buf,
+                                         uint64_t buf_byte_size);
+
 // Send input data to SOC
 bool soc_interface_FillInputNodeFloat(int x, int y, int z, int d,
                                       const uint8_t* const buf,
-                                      uint64_t buf_size);
+                                      uint64_t buf_byte_size);
+
+// Load output data from SOC with port
+bool soc_interface_ReadOutputNodeWithPort(int port, uint8_t** buf,
+                                          uint64_t* buf_byte_size);
+
 // Load output data from SOC
 bool soc_interface_ReadOutputNodeFloat(const char* const node_name,
-                                       uint8_t** buf, uint64_t* buf_size);
+                                       uint8_t** buf, uint64_t* buf_byte_size);
+
 // Setup graph
 // TODO(satok): Remove and use runtime version
 bool soc_interface_setupDummyGraph(int version);