diff options
Diffstat (limited to 'tensorflow')
15 files changed, 694 insertions, 150 deletions
diff --git a/tensorflow/contrib/hvx/hvx_ops_support_checker/hvx_ops_support_checker_main.cc b/tensorflow/contrib/hvx/hvx_ops_support_checker/hvx_ops_support_checker_main.cc index 6ae7c4a742..6af608396a 100644 --- a/tensorflow/contrib/hvx/hvx_ops_support_checker/hvx_ops_support_checker_main.cc +++ b/tensorflow/contrib/hvx/hvx_ops_support_checker/hvx_ops_support_checker_main.cc @@ -33,10 +33,15 @@ limitations under the License. #include "tensorflow/tools/graph_transforms/transform_utils.h" namespace tensorflow { + namespace { -static int ParseFlags(int argc, char* argv[], string* in_graph) { +static int ParseFlags(int argc, char* argv[], string* in_graph, + bool* dump_all_nodes, bool* dump_shape_and_type) { std::vector<Flag> flag_list = { - Flag("in_graph", in_graph, "input graph file name"), + Flag("in_graph", in_graph, "Input graph file name to check hvx support."), + Flag("dump_all_nodes", dump_all_nodes, "Dump all nodes in the model."), + Flag("dump_shape_and_type", dump_shape_and_type, + "Dump shape and type of nodes"), }; CHECK(Flags::Parse(&argc, argv, flag_list)); // We need to call this to set up global state for TensorFlow. @@ -48,12 +53,25 @@ static int ParseFlags(int argc, char* argv[], string* in_graph) { return 0; } -static void SummarizeNode(const NodeDef& node_def) { +static void SummarizeNode(const NodeDef& node_def, + const bool dump_shape_and_type) { LOG(INFO) << "Node(" << node_def.name() << ")"; LOG(INFO) << " op: " << node_def.op(); for (const string& input : node_def.input()) { LOG(INFO) << " Input: " << input; } + std::vector<DataType> data_types; + std::vector<TensorShape> shapes; + const Status status = RemoteFusedGraphExecuteUtils::GetOutputTensorShapeType( + node_def, &data_types, &shapes); + if (data_types.empty() || shapes.empty()) { + return; + } + CHECK_EQ(data_types.size(), shapes.size()); + for (int i = 0; i < data_types.size(); ++i) { + LOG(INFO) << " Output(" << i << "): " << DataType_Name(data_types.at(i)) + << ", " << shapes.at(i).DebugString(); + } } static void DumpRemoteFusedGraph(const NodeDef& node_def) { @@ -89,10 +107,14 @@ static void DumpRemoteFusedGraph(const NodeDef& node_def) { } } -static void CheckOpsSupport(const GraphDef& graph_def) { +static void CheckOpsSupport(const GraphDef& graph_def, + const bool dump_all_nodes, + const bool dump_shape_and_type) { const IGraphTransferOpsDefinitions& ops_definition = HexagonOpsDefinitions::getInstance(); LOG(INFO) << "Checking " << graph_def.node_size() << " nodes"; + LOG(INFO) << "dump_all_nodes = " << dump_all_nodes + << ", dump_shape_and_tpye = " << dump_shape_and_type; std::unordered_set<string> unsupported_ops; bool all_supported = true; @@ -125,9 +147,9 @@ static void CheckOpsSupport(const GraphDef& graph_def) { LOG(INFO) << count << " ops are not supported."; } - if (contains_remote_graph) { + if (contains_remote_graph || dump_all_nodes) { for (const NodeDef& node : graph_def.node()) { - SummarizeNode(node); + SummarizeNode(node, dump_shape_and_type); } } } @@ -137,7 +159,10 @@ static void CheckOpsSupport(const GraphDef& graph_def) { int main(int argc, char** argv) { tensorflow::string in_graph; - const int ret = tensorflow::ParseFlags(argc, argv, &in_graph); + bool dump_all_nodes; + bool dump_shape_and_type; + const int ret = tensorflow::ParseFlags(argc, argv, &in_graph, &dump_all_nodes, + &dump_shape_and_type); if (ret != 0) { return ret; } @@ -146,6 +171,6 @@ int main(int argc, char** argv) { TF_CHECK_OK(tensorflow::graph_transforms::LoadTextOrBinaryGraphFile( in_graph, &graph_def)); - tensorflow::CheckOpsSupport(graph_def); + tensorflow::CheckOpsSupport(graph_def, dump_all_nodes, dump_shape_and_type); return 0; } diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile index 305ed0d11e..2150cfe9ea 100644 --- a/tensorflow/contrib/makefile/Makefile +++ b/tensorflow/contrib/makefile/Makefile @@ -279,6 +279,16 @@ ifeq ($(TARGET),ANDROID) LIBS += -lhexagon_controller LDFLAGS += -L$(HEXAGON_LIBS) CXXFLAGS += -DUSE_HEXAGON_LIBS + +# CAVEAT: We should disable TENSORFLOW_DISABLE_META while running +# quantized_matmul on Android because it crashes in +# MultiThreadGemm in tensorflow/core/kernels/meta_support.cc +# See http://b/33270149 +# TODO(satok): Remove once it's fixed + CXXFLAGS += -DTENSORFLOW_DISABLE_META + +# Declare __ANDROID_TYPES_FULL__ to enable required types for hvx + CXXFLAGS += -D__ANDROID_TYPES_FULL__ endif ifdef ENABLE_EXPERIMENTAL_HEXNN_OPS @@ -500,6 +510,18 @@ tensorflow/core/util/reporter.cc \ tensorflow/tools/benchmark/benchmark_model.cc \ tensorflow/tools/benchmark/benchmark_model_main.cc +ifdef HEXAGON_LIBS + TF_CC_SRCS += \ +tensorflow/cc/framework/scope.cc \ +tensorflow/cc/framework/ops.cc \ +tensorflow/cc/ops/const_op.cc \ +tensorflow/core/kernels/hexagon/graph_transfer_utils.cc \ +tensorflow/core/kernels/hexagon/graph_transferer.cc \ +tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc \ +tensorflow/core/kernels/hexagon/hexagon_ops_definitions.cc \ +tensorflow/core/kernels/hexagon/hexagon_remote_fused_graph_executor_build.cc +endif + # File names of the intermediate files target compilation generates. TF_CC_OBJS := $(addprefix $(OBJDIR), $(TF_CC_SRCS:.cc=.o)) PBT_GEN_FILES := $(addprefix $(PBTGENDIR), $(PBT_CC_SRCS)) diff --git a/tensorflow/contrib/makefile/sub_makefiles/hexagon_graph_execution/Makefile.in b/tensorflow/contrib/makefile/sub_makefiles/hexagon_graph_execution/Makefile.in index 2a6f66edcb..9aa81144fd 100644 --- a/tensorflow/contrib/makefile/sub_makefiles/hexagon_graph_execution/Makefile.in +++ b/tensorflow/contrib/makefile/sub_makefiles/hexagon_graph_execution/Makefile.in @@ -34,27 +34,7 @@ $(wildcard $(GTEST_DIR)/src/*.cc) \ $(wildcard $(GTEST_DIR)/src/*.h) \ $(GTEST_HEADERS) -# CAVEAT: We should disable TENSORFLOW_DISABLE_META while running -# quantized_matmul on Android because it crashes in -# MultiThreadGemm in tensorflow/core/kernels/meta_support.cc -# TODO(satok): Remove once it's fixed -CXXFLAGS += -DTENSORFLOW_DISABLE_META - -# Declare __ANDROID_TYPES_FULL__ to enable required types for hvx -CXXFLAGS += -D__ANDROID_TYPES_FULL__ - GRAPH_TRANSFER_SRCS := \ -tensorflow/cc/framework/scope.cc \ -tensorflow/cc/framework/ops.cc \ -tensorflow/cc/ops/const_op.cc \ -tensorflow/core/kernels/hexagon/graph_transfer_utils.cc \ -tensorflow/core/kernels/hexagon/graph_transferer.cc \ -tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc \ -tensorflow/core/kernels/hexagon/hexagon_ops_definitions.cc \ -tensorflow/core/kernels/hexagon/hexagon_remote_fused_graph_executor_build.cc \ -tensorflow/core/kernels/remote_fused_graph_execute_op.cc \ -tensorflow/core/kernels/remote_fused_graph_execute_utils.cc \ -tensorflow/core/ops/remote_fused_graph_ops.cc \ tensorflow/core/platform/posix/test.cc GRAPH_EXECUTION_SRCS := \ diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt index 857d6fa21b..c73ec0305b 100644 --- a/tensorflow/contrib/makefile/tf_op_files.txt +++ b/tensorflow/contrib/makefile/tf_op_files.txt @@ -202,12 +202,15 @@ tensorflow/core/kernels/quantized_reshape_op.cc tensorflow/core/kernels/quantized_resize_bilinear_op.cc tensorflow/core/kernels/requantization_range_op.cc tensorflow/core/kernels/requantize.cc +tensorflow/core/kernels/remote_fused_graph_execute_op.cc +tensorflow/core/kernels/remote_fused_graph_execute_utils.cc tensorflow/core/ops/training_ops.cc tensorflow/core/ops/string_ops.cc tensorflow/core/ops/state_ops.cc tensorflow/core/ops/sparse_ops.cc tensorflow/core/ops/sendrecv_ops.cc tensorflow/core/ops/script_ops.cc +tensorflow/core/ops/remote_fused_graph_ops.cc tensorflow/core/ops/random_ops.cc tensorflow/core/ops/random_grad.cc tensorflow/core/ops/parsing_ops.cc diff --git a/tensorflow/core/kernels/hexagon/graph_transferer.cc b/tensorflow/core/kernels/hexagon/graph_transferer.cc index d927ef3efa..055108cd00 100644 --- a/tensorflow/core/kernels/hexagon/graph_transferer.cc +++ b/tensorflow/core/kernels/hexagon/graph_transferer.cc @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/core/framework/op.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/graph_constructor.h" +#include "tensorflow/core/graph/node_builder.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/public/session.h" @@ -43,10 +44,14 @@ const char INPUTS_NODE_PREFIX[] = "inputs_for_"; const char OUTPUTS_NODE_PREFIX[] = "outputs_for_"; const char DATA_NODE_PREFIX[] = "data_for_op_"; const char CONST_SHAPE_PREFIX[] = "const_shape_"; +const char CONST_VAL_PREFIX[] = "const_val_"; +const char CONST_TENSOR_PREFIX[] = "const_tensor_"; const char PADDING_ATTR_NAME[] = "padding"; const char STRIDES_ATTR_NAME[] = "strides"; +const char KEEP_DIMS_ATTR_NAME[] = "keep_dims"; const char KSIZE_ATTR_NAME[] = "ksize"; const char NULL_OUTPUT_NAME[] = "NULL"; +const char AGGREGATED_INPUT_NODE_NAME[] = "graph_transfer_aggregated_input"; const int PADDING_NA_ID = 0; // VALID = 1, SAME = 2 // This is a temporary workaround to support android build @@ -58,6 +63,16 @@ static string ToString(T val) { return stream.str(); } +static Node* FindMutableNodeByName(const string& name, Graph* graph) { + const TensorId tid = ParseTensorName(name); + for (Node* node : graph->nodes()) { + if (node != nullptr && node->name() == tid.first) { + return node; + } + } + return nullptr; +} + /** * graph loading functions * - LoadGraphFromProto @@ -86,13 +101,22 @@ Status GraphTransferer::LoadGraphFromProto( } } + TF_RETURN_IF_ERROR(TransformGraphToAddAggregatedInputNode( + input_node_info_list, &graph, &shape_refiner)); + std::unordered_multimap<string, const Node*> op_name_to_node_multimap( graph.num_nodes()); for (const Node* const node : graph.nodes()) { + if (node == nullptr) { + continue; + } CacheNode(*node); } for (const Node* const node : graph.nodes()) { + if (node == nullptr) { + continue; + } VLOG(1) << "<Node> " << node->name(); for (const Node* const input_node : node->in_nodes()) { const string& name = input_node->name(); @@ -102,6 +126,9 @@ Status GraphTransferer::LoadGraphFromProto( } for (const Node* const node : graph.nodes()) { + if (node == nullptr) { + continue; + } status = RegisterNodeIfAllInputsAreCached( ops_definitions, shape_refiner, *node, false, input_node_info_list, output_node_names); @@ -265,19 +292,16 @@ GraphTransferInfo& GraphTransferer::GetMutableGraphTransferInfo() { return graph_transfer_info_; } -int GraphTransferer::CacheNode(const Node& node) { +void GraphTransferer::CacheNode(const Node& node) { if (node_name_to_id_cache_map_.count(node.name()) > 0) { - VLOG(1) << "Emplace node to cache failed"; - // TODO(satok): check here? - return -1; + return; } - VLOG(1) << "Cache node: " << node.name() << ", " << node.op_def().name(); node_name_cache_list_.emplace_back(&node); + const int node_id = node_name_cache_list_.size() - 1; bool emplace_succeeded = false; - std::tie(std::ignore, emplace_succeeded) = node_name_to_id_cache_map_.emplace( - node.name(), node_name_cache_list_.size() - 1); + std::tie(std::ignore, emplace_succeeded) = + node_name_to_id_cache_map_.emplace(node.name(), node_id); CHECK(emplace_succeeded); - return node_name_cache_list_.size() - 1; } bool GraphTransferer::AreAllInputsCached(const Node& node) const { @@ -291,22 +315,124 @@ bool GraphTransferer::AreAllInputsCached(const Node& node) const { return true; } +Status GraphTransferer::TransformGraphToAddAggregatedInputNode( + const std::vector<std::pair<string, Tensor>>& input_node_info_list, + Graph* graph, ShapeRefiner* shape_refiner) { + // Transform a remote fused graph to add an aggregated input node which takes + // all inputs of the remote graph. + DataTypeVector input_data_types; + std::vector<DataType> data_types; + std::vector<TensorShape> shapes; + std::vector<string> input_nodes; + for (int i = 0; i < input_node_info_list.size(); ++i) { + Node* node = FindMutableNodeByName(input_node_info_list.at(i).first, graph); + CHECK_NOTNULL(node); + input_nodes.emplace_back(node->name()); + input_data_types.emplace_back(input_node_info_list.at(i).second.dtype()); + data_types.emplace_back(input_node_info_list.at(i).second.dtype()); + shapes.emplace_back(input_node_info_list.at(i).second.shape()); + } + + NodeDef input_node_def; + auto builder = + NodeBuilder(AGGREGATED_INPUT_NODE_NAME, "RemoteFusedGraphExecute") + .Input(std::vector<NodeBuilder::NodeOut>{}) + .Attr("Tinputs", DataTypeVector{}) + .Attr("Toutputs", input_data_types) + .Attr("serialized_remote_fused_graph_execute_info", "") + .Attr(RemoteFusedGraphExecuteUtils::ATTR_OUTPUT_DATA_TYPES, + data_types) + .Attr(RemoteFusedGraphExecuteUtils::ATTR_OUTPUT_SHAPES, shapes); + + Node* input_node; + TF_RETURN_IF_ERROR(builder.Finalize(graph, &input_node)); + CHECK_NOTNULL(input_node); + + bool refined; + TF_RETURN_IF_ERROR(shape_refiner->UpdateNode(input_node, &refined)); + + shape_inference::InferenceContext* context = + shape_refiner->GetContext(input_node); + for (int i = 0; i < input_node_info_list.size(); ++i) { + shape_inference::ShapeHandle handle; + TF_RETURN_IF_ERROR(context->MakeShapeFromTensorShape( + input_node_info_list.at(i).second.shape(), &handle)); + TF_RETURN_IF_ERROR(shape_refiner->SetShape(input_node, i, handle)); + } + + // Cache the aggregate input node first as it's consumed first. + CacheNode(*input_node); + + std::vector<Node*> original_input_nodes(input_nodes.size()); + + for (int i = 0; i < input_nodes.size(); ++i) { + const string& node_name = input_nodes.at(i); + Node* original_input_node = FindMutableNodeByName(node_name, graph); + CHECK_NOTNULL(original_input_node); + CHECK_EQ(1, original_input_node->num_outputs()); // replaced by identity. + Node* created_node; + TF_RETURN_IF_ERROR(RemoteFusedGraphExecuteUtils::BuildIdentityOpNode( + node_name, AGGREGATED_INPUT_NODE_NAME, i, data_types.at(i), graph, + &created_node)); + CHECK_NOTNULL(created_node); + std::vector<DataType> data_types; + std::vector<TensorShape> shapes; + Status status = RemoteFusedGraphExecuteUtils::GetOutputTensorShapeType( + original_input_node->def(), &data_types, &shapes); + if (status.ok()) { + created_node->AddAttr( + RemoteFusedGraphExecuteUtils::ATTR_OUTPUT_DATA_TYPES, data_types); + created_node->AddAttr(RemoteFusedGraphExecuteUtils::ATTR_OUTPUT_SHAPES, + shapes); + } + for (const Edge* out_edge : original_input_node->out_edges()) { + Node* dst = out_edge->dst(); + int dst_port = out_edge->dst_input(); + // Unused edge will be removed when removing node. + graph->AddEdge(created_node, 0, dst, dst_port); + } + original_input_nodes[i] = original_input_node; + + TF_RETURN_IF_ERROR(shape_refiner->UpdateNode(created_node, &refined)); + + shape_inference::InferenceContext* context = + shape_refiner->GetContext(created_node); + CHECK_NOTNULL(context); + + // Cache replaced input node next to the aggregated input node. + CacheNode(*created_node); + } + + // Remove original input nodes after adding new input nodes to avoid + // reusing same pointer in Graph. + for (Node* original_input_node : original_input_nodes) { + graph->RemoveNode(original_input_node); + } + + return Status::OK(); +} + Status GraphTransferer::RegisterNode( const IGraphTransferOpsDefinitions& ops_definitions, const ShapeRefiner& shape_refiner, const Node& node, const std::vector<std::pair<string, Tensor>>& input_node_info_list, const std::vector<string>& output_node_names) { - VLOG(1) << "Register node: " << node.name(); + VLOG(1) << "Register node: " << node.name() << ", " << std::hex + << node_name_to_id_cache_map_.at(node.name()); if (node.name() == SOURCE_NODE_NAME || node.name() == SINK_NODE_NAME) { // Just ignore sink and source - return Status(); - } else if (RemoteFusedGraphExecuteUtils::IsInputNode(input_node_info_list, - node.name())) { + return Status::OK(); + } else if (node.name() == AGGREGATED_INPUT_NODE_NAME) { RegisterInputNode(ops_definitions, shape_refiner, node); + return Status::OK(); } else if (node.IsConstant()) { RegisterConstantNode(shape_refiner, node); + } else if (IsPadNode(node)) { + RegisterPadNode(ops_definitions, shape_refiner, node); } else if (HasPaddingAndStrides(node)) { RegisterNodeWithPaddingAndStrides(ops_definitions, shape_refiner, node); + } else if (NeedsToAddRank(node)) { + RegisterNodeWithRank(ops_definitions, shape_refiner, node); } else if (IsNodeFlattenReshape(node, shape_refiner)) { RegisterFlattenNode(ops_definitions, shape_refiner, node); } else if (ops_definitions.GetOpIdFor(node.type_string(), {}) != @@ -318,7 +444,7 @@ Status GraphTransferer::RegisterNode( " has not been implemented yet."); } - return Status(); + return Status::OK(); } void GraphTransferer::RegisterConstantNode(const ShapeRefiner& shape_refiner, @@ -361,8 +487,7 @@ void GraphTransferer::RegisterConstantNode(const ShapeRefiner& shape_refiner, const TensorProto* proto = nullptr; TF_CHECK_OK(GetNodeAttr(node.attrs(), "value", &proto)); Tensor const_tensor; - // TODO(b/32704451): Don't just ignore this status! - MakeTensorFromProto(*proto, &const_tensor).IgnoreError(); + TF_CHECK_OK(MakeTensorFromProto(*proto, &const_tensor)); const_node_info.set_dtype(const_tensor.dtype()); if (data_size > 0) { @@ -394,12 +519,82 @@ int GraphTransferer::RegisterConstantShape(const std::vector<int>& shape) { return node_name_to_id_cache_map_[shape_name]; } +int GraphTransferer::RegisterConstTensor(const Tensor& tensor, + const string& suffix) { + VLOG(1) << "Cache const tensor."; + const int dims = tensor.shape().dims(); + CHECK(dims <= 4); + const string node_name = strings::StrCat(CONST_TENSOR_PREFIX, "_", suffix); + if (node_name_to_id_cache_map_.count(node_name) <= 0) { + node_name_cache_list_.emplace_back(nullptr); + const int id = node_name_cache_list_.size() - 1; + node_name_to_id_cache_map_.emplace(node_name, id); + GraphTransferInfo::ConstNodeInfo& const_node_info = + *graph_transfer_info_.add_const_node_info(); + const_node_info.set_name(node_name); + const_node_info.set_node_id(id); + CHECK_EQ(4, SHAPE_ARRAY_SIZE); + for (int i = 0; i < SHAPE_ARRAY_SIZE; ++i) { + if (i < SHAPE_ARRAY_SIZE - dims) { + const_node_info.add_shape(1); + } else { + const_node_info.add_shape( + tensor.shape().dim_size(i - (SHAPE_ARRAY_SIZE - dims))); + } + } + const_node_info.set_dtype(tensor.dtype()); + const_node_info.set_data(tensor.tensor_data().data(), + tensor.tensor_data().size()); + } + return node_name_to_id_cache_map_[node_name]; +} + +int GraphTransferer::RegisterConstScalar(const DataType dt, const int val, + const int dst_id, + const int dst_input_count) { + VLOG(1) << "Cache const."; + const string val_name = + CONST_VAL_PREFIX + ToString(dst_id) + '_' + ToString(dst_input_count); + if (node_name_to_id_cache_map_.count(val_name) <= 0) { + node_name_cache_list_.emplace_back(nullptr); + const int id = node_name_cache_list_.size() - 1; + node_name_to_id_cache_map_.emplace(val_name, id); + GraphTransferInfo::ConstNodeInfo& const_node_info = + *graph_transfer_info_.add_const_node_info(); + const_node_info.set_name(val_name); + const_node_info.set_node_id(id); + // TODO(satok): Do not assume rank is 4 here. + const_node_info.add_shape(static_cast<int64>(1)); + const_node_info.add_shape(static_cast<int64>(1)); + const_node_info.add_shape(static_cast<int64>(1)); + const_node_info.add_shape(static_cast<int64>(1)); + const_node_info.set_data(&val, DataTypeSize(dt)); + } + return node_name_to_id_cache_map_[val_name]; +} + bool GraphTransferer::HasPaddingAndStrides(const Node& node) { auto attrs = node.attrs(); return attrs.Find(PADDING_ATTR_NAME) != nullptr && attrs.Find(STRIDES_ATTR_NAME) != nullptr; } +bool GraphTransferer::NeedsToAddRank(const Node& node) { + const string& op_type = node.def().op(); + if (op_type == "Transpose" || op_type == "ExpandDims") { + return true; + } + return false; +} + +bool GraphTransferer::IsPadNode(const Node& node) { + const string& op_type = node.def().op(); + if (op_type == "Pad") { + return true; + } + return false; +} + bool GraphTransferer::IsNodeFlattenReshape(const Node& node, const ShapeRefiner& shape_refiner) { // Check if node is reshape op @@ -473,15 +668,123 @@ void GraphTransferer::RegisterNodeWithPaddingAndStrides( node.num_outputs(), true /* append_input */, true /* append_output */); } -void GraphTransferer::RegisterInputNode( +void GraphTransferer::RegisterNodeWithRank( const IGraphTransferOpsDefinitions& ops_definitions, const ShapeRefiner& shape_refiner, const Node& node) { - VLOG(1) << "Register input node: " << node.name(); CHECK_EQ(node_name_to_id_cache_map_.count(node.name()), 1); const int id = node_name_to_id_cache_map_[node.name()]; + shape_inference::InferenceContext* context = shape_refiner.GetContext(&node); + const Node* input0_node; + TF_CHECK_OK(node.input_node(0, &input0_node)); + CHECK_NOTNULL(input0_node); + std::vector<TensorShape> shapes; + Status status = RemoteFusedGraphExecuteUtils::GetOutputTensorShapeType( + input0_node->def(), nullptr, &shapes); + CHECK_EQ(1, shapes.size()) << "Output size should be 1."; + const int const_val_id = + RegisterConstScalar(DT_INT32, shapes.at(0).dims(), id, node.num_inputs()); + std::vector<int> extra_inputs{const_val_id}; + // TODO(satok): Set correct data type if it's given. + const int op_type_id = ops_definitions.GetOpIdFor(node.type_string(), {}); + CHECK(op_type_id >= 0 && op_type_id < ops_definitions.GetTotalOpsCount()) + << "Op " << node.type_string() << " not found in map(id = " << op_type_id + << ")"; + bool keep_dims = false; + int padding_id = PADDING_NA_ID; + if (context->GetAttr(KEEP_DIMS_ATTR_NAME, &keep_dims).ok()) { + padding_id = keep_dims ? Padding::SAME : Padding::VALID; + } + + AppendNodeParamsWithIoParams( + shape_refiner, node, node.name(), id, node.type_string(), op_type_id, + padding_id, node.num_inputs(), extra_inputs, node.num_outputs(), + true /* append_input */, true /* append_output */); +} + +void GraphTransferer::RegisterPadNode( + const IGraphTransferOpsDefinitions& ops_definitions, + const ShapeRefiner& shape_refiner, const Node& node) { + static constexpr int PAD_WIDTH = 4; + static constexpr int PAD_HEIGHT = 2; + VLOG(1) << "Register generic node: " << node.name(); + CHECK_EQ(node_name_to_id_cache_map_.count(node.name()), 1); + const int id = node_name_to_id_cache_map_[node.name()]; + + // TODO(satok): Set correct data type if it's given. + const int op_type_id = ops_definitions.GetOpIdFor(node.type_string(), {}); + CHECK(op_type_id >= 0 && op_type_id < ops_definitions.GetTotalOpsCount()); + + CHECK_EQ(2, node.num_inputs()); + + GraphTransferInfo::NodeInputInfo& node_input_info = + *graph_transfer_info_.add_node_input_info(); + node_input_info.set_node_id(id); + + AddNodeInputByInputIndex(node, 0, &node_input_info); + + const Edge* edge = nullptr; + TF_CHECK_OK(node.input_edge(1, &edge)); + const Node* input_node = edge->src(); + CHECK_NOTNULL(input_node); + CHECK(input_node->IsConstant()); + + const TensorProto* tensor_proto = nullptr; + TF_CHECK_OK(GetNodeAttr(input_node->def(), "value", &tensor_proto)); + CHECK_NOTNULL(tensor_proto); + Tensor const_tensor; + TF_CHECK_OK(MakeTensorFromProto(*tensor_proto, &const_tensor)); + CHECK_EQ(2, const_tensor.shape().dims()); + CHECK_EQ(PAD_HEIGHT, const_tensor.shape().dim_size(1)); + if (const_tensor.shape().dim_size(0) == PAD_WIDTH) { + AddNodeInputByInputIndex(node, 1, &node_input_info); + } else if (const_tensor.shape().dim_size(0) < PAD_WIDTH) { + const int width = const_tensor.shape().dim_size(0); + const TensorProto* proto = nullptr; + TF_CHECK_OK(GetNodeAttr(input_node->def(), "value", &proto)); + Tensor const_tensor; + TF_CHECK_OK(MakeTensorFromProto(*proto, &const_tensor)); + CHECK_EQ(DT_INT32, const_tensor.dtype()); + // reshape tensor input to be rank 4. + // TODO(satok): Never assume rank is 4. + Tensor new_const_tensor(const_tensor.dtype(), TensorShape{4, 2}); + for (int i = 0; i < PAD_HEIGHT; ++i) { + for (int j = 0; j < PAD_WIDTH; ++j) { + if (j < PAD_WIDTH - width) { + new_const_tensor.matrix<int32>()(j, i) = 0; + } else { + new_const_tensor.matrix<int32>()(j, i) = + const_tensor.matrix<int32>()(j - (PAD_WIDTH - width), i); + } + } + } + + const int id = RegisterConstTensor( + new_const_tensor, + strings::StrCat(input_node->name(), "_", node.name(), "_1")); + + GraphTransferInfo::NodeInput& node_input = + *node_input_info.add_node_input(); + node_input.set_node_id(id); + node_input.set_output_port(0); + } else { + CHECK(false); + } + + AppendNodeParamsWithIoParams( + shape_refiner, node, node.name(), id, node.type_string(), op_type_id, + PADDING_NA_ID, node.num_inputs(), {}, node.num_outputs(), + false /* append_input */, true /* append_output */); +} + +void GraphTransferer::RegisterInputNode( + const IGraphTransferOpsDefinitions& ops_definitions, + const ShapeRefiner& shape_refiner, const Node& node) { const string op_type = node.type_string(); + VLOG(1) << "Register input node: " << node.name() << ", " << op_type; + CHECK_EQ(node_name_to_id_cache_map_.count(node.name()), 1); + const int id = node_name_to_id_cache_map_[node.name()]; // TODO(satok): Set correct data type if it's given. - const int op_type_id = ops_definitions.GetOpIdFor(op_type, {}); + const int op_type_id = ops_definitions.GetOpIdFor("INPUT", {}); CHECK(op_type_id >= 0 && op_type_id < ops_definitions.GetTotalOpsCount()) << "Op" << node.name() << ", " << op_type << " is not supported," << op_type_id; @@ -546,7 +849,6 @@ void GraphTransferer::AppendNodeParams(const string& name, const int id, const int padding, const int inputs_size, const std::vector<int>& extra_inputs, const int outputs_size) { - VLOG(1) << "Append node params: " << name; GraphTransferInfo::NodeInfo& node_info = *graph_transfer_info_.add_node_info(); node_info.set_name(name); @@ -559,6 +861,23 @@ void GraphTransferer::AppendNodeParams(const string& name, const int id, node_info.set_output_count(static_cast<int>(outputs_size)); } +void GraphTransferer::AddNodeInputByInputIndex( + const Node& node, const int idx, + GraphTransferInfo::NodeInputInfo* node_input_info) { + const Edge* edge = nullptr; + TF_CHECK_OK(node.input_edge(idx, &edge)); + const Node* input_node = edge->src(); + CHECK_NOTNULL(input_node); + const int port = edge->src_output(); + + const std::string& op_name = input_node->name(); + CHECK_GT(node_name_to_id_cache_map_.count(op_name), 0) << op_name; + const int src_id = node_name_to_id_cache_map_[op_name]; + GraphTransferInfo::NodeInput& node_input = *node_input_info->add_node_input(); + node_input.set_node_id(src_id); + node_input.set_output_port(port); +} + void GraphTransferer::AppendNodeInputParams( const int id, const Node& node, const std::vector<int>& extra_inputs) { VLOG(1) << "Append input params: " << node.name() << ", " << node.num_inputs() @@ -567,18 +886,7 @@ void GraphTransferer::AppendNodeInputParams( *graph_transfer_info_.add_node_input_info(); node_input_info.set_node_id(id); for (int i = 0; i < node.num_inputs(); ++i) { - const Edge* edge = nullptr; - TF_CHECK_OK(node.input_edge(i, &edge)); - const Node* input_node = edge->src(); - const int port = edge->src_output(); - - const std::string& op_name = input_node->name(); - CHECK_GT(node_name_to_id_cache_map_.count(op_name), 0) << op_name; - const int src_id = node_name_to_id_cache_map_[op_name]; - GraphTransferInfo::NodeInput& node_input = - *node_input_info.add_node_input(); - node_input.set_node_id(src_id); - node_input.set_output_port(port); + AddNodeInputByInputIndex(node, i, &node_input_info); } for (const int extra_input : extra_inputs) { GraphTransferInfo::NodeInput& node_input = @@ -596,9 +904,10 @@ void GraphTransferer::AppendNodeOutputParams(const ShapeRefiner& shape_refiner, *graph_transfer_info_.add_node_output_info(); node_output_info.set_node_id(id); + std::vector<DataType> data_types; std::vector<TensorShape> shapes; Status status = RemoteFusedGraphExecuteUtils::GetOutputTensorShapeType( - node.attrs(), nullptr, &shapes); + node.attrs(), &data_types, &shapes); for (int i = 0; i < node.num_outputs(); ++i) { int data_size = -1; @@ -608,16 +917,20 @@ void GraphTransferer::AppendNodeOutputParams(const ShapeRefiner& shape_refiner, shape_inference::InferenceContext* context = shape_refiner.GetContext(&node); - shape_inference::ShapeHandle shape_handle = context->output(output_index); - const shape_inference::DimensionHandle num_elements_dim = - context->NumElements(shape_handle); - if (context->ValueKnown(num_elements_dim)) { + + if (context != nullptr && context->ValueKnown(context->NumElements( + context->output(output_index)))) { + const shape_inference::DimensionHandle num_elements_dim = + context->NumElements(context->output(output_index)); const int64 num_output_elements = context->Value(num_elements_dim); data_size = max_bytes_per_data * num_output_elements; + if (status.ok()) { + TF_CHECK_OK(status); + CHECK_EQ(shapes.at(i).num_elements(), num_output_elements); + } } else { TF_CHECK_OK(status); // Use attribute attached to node - CHECK_EQ(node.num_outputs(), shapes.size()) << node.name(); data_size = max_bytes_per_data * shapes.at(i).num_elements(); } CHECK_GE(data_size, 0); @@ -722,11 +1035,11 @@ bool GraphTransferer::TransferParamsComparator::operator()( const int node_id0 = obj0.node_id(); const int node_id1 = obj1.node_id(); bool obj0_uses_obj1 = false; - if (dependency_map_.count(node_id0)) { + if (dependency_map_.count(node_id0) > 0) { obj0_uses_obj1 = dependency_map_.at(node_id0).count(node_id1) > 0; } bool obj1_uses_obj0 = false; - if (dependency_map_.count(node_id1)) { + if (dependency_map_.count(node_id1) > 0) { obj1_uses_obj0 = dependency_map_.at(node_id1).count(node_id0) > 0; } CHECK(!obj0_uses_obj1 || !obj1_uses_obj0); @@ -735,7 +1048,9 @@ bool GraphTransferer::TransferParamsComparator::operator()( } else if (obj1_uses_obj0) { return true; } - return node_id0 > node_id1; + // If there is no dependency between two nodes, it expects that + // the execution order follows node id order. + return node_id0 < node_id1; } /* static */ void GraphTransferer::FillDependencyRec( diff --git a/tensorflow/core/kernels/hexagon/graph_transferer.h b/tensorflow/core/kernels/hexagon/graph_transferer.h index fa12b22d75..64c60b87c6 100644 --- a/tensorflow/core/kernels/hexagon/graph_transferer.h +++ b/tensorflow/core/kernels/hexagon/graph_transferer.h @@ -88,6 +88,9 @@ class GraphTransferer { // Dump verification string of parameters to verify with offline tools void DumpVerificationStringOfNodeTransferParams() const; + static std::array<int64, SHAPE_ARRAY_SIZE> ToTensorShapeArray( + const TensorShape& shape); + private: class TransferParamsComparator { public: @@ -98,10 +101,16 @@ class GraphTransferer { const std::unordered_map<int, std::unordered_set<int>>& dependency_map_; }; - int CacheNode(const Node& node); + void CacheNode(const Node& node); bool AreAllInputsCached(const Node& node) const; + // Transform a remote fused graph to add an aggregated input node which takes + // all inputs of the remote graph. + Status TransformGraphToAddAggregatedInputNode( + const std::vector<std::pair<string, Tensor>>& input_node_info_list, + Graph* graph, ShapeRefiner* shape_refiner); + Status RegisterNode( const IGraphTransferOpsDefinitions& ops_definitions, const ShapeRefiner& shape_refiner, const Node& node, @@ -113,8 +122,17 @@ class GraphTransferer { int RegisterConstantShape(const std::vector<int>& shape); + int RegisterConstTensor(const Tensor& tensor, const string& suffix); + + int RegisterConstScalar(const DataType dt, const int val, const int dst_id, + const int dst_input_count); + bool HasPaddingAndStrides(const Node& node); + bool NeedsToAddRank(const Node& node); + + bool IsPadNode(const Node& node); + // Return true if the node is a reshape op which just flattens input // TODO(satok): Remove this method once generic reshape op is implemented in // SOC @@ -125,6 +143,13 @@ class GraphTransferer { const IGraphTransferOpsDefinitions& ops_definitions, const ShapeRefiner& shape_refiner, const Node& node); + void RegisterNodeWithRank(const IGraphTransferOpsDefinitions& ops_definitions, + const ShapeRefiner& shape_refiner, + const Node& node); + + void RegisterPadNode(const IGraphTransferOpsDefinitions& ops_definitions, + const ShapeRefiner& shape_refiner, const Node& node); + void RegisterInputNode(const IGraphTransferOpsDefinitions& ops_definitions, const ShapeRefiner& shape_refiner, const Node& node); @@ -150,6 +175,10 @@ class GraphTransferer { const std::vector<int>& extra_inputs, const int outputs_size); + void AddNodeInputByInputIndex( + const Node& node, const int idx, + GraphTransferInfo::NodeInputInfo* node_input_info); + void AppendNodeInputParams(const int id, const Node& node, const std::vector<int>& extra_inputs); @@ -167,9 +196,6 @@ class GraphTransferer { const int outputs_size, const bool append_input_params, const bool append_output_params); - static std::array<int64, SHAPE_ARRAY_SIZE> ToTensorShapeArray( - const TensorShape& shape); - static string ToPaddingDebugString(int padding); // Create dependency map diff --git a/tensorflow/core/kernels/hexagon/graph_transferer_test.cc b/tensorflow/core/kernels/hexagon/graph_transferer_test.cc index ebd4a90330..74ffc026f7 100644 --- a/tensorflow/core/kernels/hexagon/graph_transferer_test.cc +++ b/tensorflow/core/kernels/hexagon/graph_transferer_test.cc @@ -25,6 +25,7 @@ limitations under the License. #include "tensorflow/core/kernels/hexagon/i_graph_transfer_ops_definitions.h" #include "tensorflow/core/kernels/i_remote_fused_graph_executor.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/public/session.h" @@ -47,21 +48,19 @@ class GraphTransfererTest : public ::testing::Test { GraphTransferer gt_; }; -static const std::vector<string> OP_TYPES{ - "INPUT", "OUTPUT", "Conv2D", "MaxPool", "NoOp", "Add", "Const", "Softmax"}; const RemoteFusedGraphExecuteUtils::TensorShapeMap EMPTY_OUTPUT_TENSOR_MAP; class TestGraphTransferOpsDefinitions : public IGraphTransferOpsDefinitions { public: - int GetTotalOpsCount() const final { return OP_TYPES.size(); } + int GetTotalOpsCount() const final { return op_types_.size(); } -int GetOpIdFor(const string& op_type, const DataTypeVector&) const final { - for (int i = 0; i < OP_TYPES.size(); ++i) { - if (OP_TYPES[i] == op_type) { - return i; + int GetOpIdFor(const string& op_type, const DataTypeVector&) const final { + for (int i = 0; i < op_types_.size(); ++i) { + if (op_types_[i] == op_type) { + return i; + } } - } - return -1; + return -1; } GraphTransferInfo::Destination GetTransferDestination() const final { @@ -69,6 +68,9 @@ GraphTransferInfo::Destination GetTransferDestination() const final { } private: + const std::vector<string> op_types_{"INPUT", "OUTPUT", "Conv2D", + "MaxPool", "NoOp", "Add", + "Const", "Softmax", "Identity"}; } TEST_GRAPH_TRANSFER_OPS_DEFINITIONS; static Output BuildAddOps(const Scope& scope, const Input& x, const Input& y) { @@ -312,7 +314,7 @@ TEST_F(GraphTransfererTest, LoadAddGraphWithOutputTensorMap) { const std::vector<string> output_node_names = {NAME_A_PLUS_B}; status = gt_.LoadGraphFromProto(TEST_GRAPH_TRANSFER_OPS_DEFINITIONS, def, inputs, output_node_names, false); - ASSERT_TRUE(status.ok()); + TF_ASSERT_OK(status); } TEST_F(GraphTransfererTest, LoadConvGraph) { @@ -330,7 +332,7 @@ TEST_F(GraphTransfererTest, LoadConvGraph) { gt_.GetGraphTransferInfo().const_node_info_size(); ASSERT_EQ(2, const_node_count); const int op_node_count = gt_.GetGraphTransferInfo().node_info_size(); - ASSERT_EQ(3, op_node_count); + ASSERT_EQ(4, op_node_count); const GraphTransferInfo::NodeInfo* params_conv = FindNodeInfo(gt_, "conv"); ASSERT_TRUE(params_conv != nullptr); const int id = params_conv->node_id(); @@ -356,7 +358,7 @@ TEST_F(GraphTransfererTest, LoadMaxPoolGraph) { gt_.GetGraphTransferInfo().const_node_info_size(); ASSERT_EQ(2, const_node_count); const int op_node_count = gt_.GetGraphTransferInfo().node_info_size(); - ASSERT_EQ(3, op_node_count); + ASSERT_EQ(4, op_node_count); const GraphTransferInfo::NodeInfo* params_max_pool = FindNodeInfo(gt_, "maxpool"); ASSERT_TRUE(params_max_pool != nullptr); diff --git a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc index 518b399c37..660ffd268d 100644 --- a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc +++ b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc @@ -27,6 +27,8 @@ namespace tensorflow { constexpr const char* const INPUT_OP_NAME = "INPUT"; constexpr const char* const OUTPUT_OP_NAME = "OUTPUT"; +constexpr int ALIGNMENT_BYTES = 16; + const bool DBG_DUMP_VERIFICATION_STRING = false; const int DBG_LEVEL = 0; // -2: verbose, -1: debug, 0: info const bool DBG_USE_DUMMY_INPUT = false; @@ -34,6 +36,22 @@ const bool DBG_USE_SAMPLE_INPUT = false; const int64 FLAG_ENABLE_PANDA_BINARY_INPUT = 0x01; const bool DBG_DUMP_INPUT_TENSOR_AS_FLOAT_DATA = false; +static string AddPort(const string& node_name) { + if (node_name.find(':') != string::npos) { + return node_name; + } else { + return strings::StrCat(node_name, ":", 0); + } +} + +static uint8* FindAlignedPointer(uint8* ptr) { + const uintptr_t data_ptr_int = reinterpret_cast<uintptr_t>(ptr); + const int shift_count = + (ALIGNMENT_BYTES - data_ptr_int % ALIGNMENT_BYTES) % ALIGNMENT_BYTES; + uint8* data_ptr = ptr + shift_count; + return data_ptr; +} + /* static */ GraphTransferInfo::NodeInfo* HexagonControlWrapper::FindNodeInfo( const string& name, GraphTransferInfo* graph_transfer_info) { for (GraphTransferInfo::NodeInfo& node_info : @@ -60,18 +78,57 @@ bool HexagonControlWrapper::Init(const RemoteFusedGraphExecuteInfo& info) { std::vector<string> outputs; RemoteFusedGraphExecuteUtils::BuildRemoteGraphInputsAndOutputsFromProto( info, &inputs, &outputs); - graph_transferer_.LoadGraphFromProto( + Status status = graph_transferer_.LoadGraphFromProto( HexagonOpsDefinitions::getInstance(), info.remote_graph(), inputs, outputs, false // shape_inference_for_unknown_shape - ); + ); + TF_CHECK_OK(status) << status; } else { // If graph transfer info is attached, just import it. graph_transferer_.SetSerializedGraphTransferInfo( info.serialized_executor_parameters()); } execute_info_ = &info; - return soc_interface_Init(); + bool success = soc_interface_Init(); + if (!success) { + LOG(ERROR) << "Hexagon initialization was failed. See log output."; + return false; + } + const GraphTransferInfo& gt_info = graph_transferer_.GetGraphTransferInfo(); + std::vector<int> input_sizes; + std::vector<int> output_sizes; + CHECK_NOTNULL(execute_info_); + for (int i = 0; i < execute_info_->graph_input_node_name_size(); ++i) { + const string& input = execute_info_->graph_input_node_name(i); + LOG(INFO) << "Add input: " << input << ", " << i; + CHECK(input_port_map_.emplace(AddPort(input), i).second); + const RemoteFusedGraphExecuteInfo::TensorShapeTypeProto& shape_type = + execute_info_->default_graph_input_tensor_shape(i); + int64 buf_size = DataTypeSize(shape_type.dtype()); + for (const TensorShapeProto::Dim& dim : shape_type.shape().dim()) { + buf_size *= dim.size(); + } + input_sizes.emplace_back(static_cast<int>(buf_size)); + } + for (int i = 0; i < execute_info_->graph_output_node_name_size(); ++i) { + const string& output = execute_info_->graph_output_node_name(i); + CHECK(output_port_map_.emplace(AddPort(output), i).second); + const RemoteFusedGraphExecuteInfo::TensorShapeTypeProto& shape_type = + execute_info_->default_graph_output_tensor_shape(i); + + int64 buf_size = DataTypeSize(shape_type.dtype()); + for (const TensorShapeProto::Dim& dim : shape_type.shape().dim()) { + buf_size *= dim.size(); + } + output_sizes.emplace_back(static_cast<int>(buf_size)); + } + + LOG(INFO) << "Allocate inout buffer"; + success &= soc_interface_AllocateInOutNodeBuffers( + input_sizes.size(), input_sizes.data(), output_sizes.size(), + output_sizes.data()); + return success; } bool HexagonControlWrapper::Finalize() { return soc_interface_Finalize(); } @@ -86,9 +143,6 @@ bool HexagonControlWrapper::SetupGraph() { GraphTransferInfo::NodeInfo* node_info = FindNodeInfo(graph_input.name(), &graph_transfer_info); CHECK_NE(node_info, nullptr); - node_info->set_type_name(INPUT_OP_NAME); - node_info->set_soc_op_id( - HexagonOpsDefinitions::getInstance().GetOpIdFor(INPUT_OP_NAME, {})); } // Generate a new output node which is connected to graph output node @@ -202,12 +256,8 @@ bool HexagonControlWrapper::SetupGraph() { auto data = dummy_const_data_.emplace( std::piecewise_construct, std::make_tuple(node_id), std::make_tuple()); CHECK(data.second); - const int additional_bytes_for_alignment = 16; - data.first->second.resize(data_size + additional_bytes_for_alignment - 1); - const uintptr_t data_ptr_int = - reinterpret_cast<uintptr_t>(data.first->second.data()); - const int shift_count = (16 - data_ptr_int % 16) % 16; - uint8* data_ptr = data.first->second.data() + shift_count; + data.first->second.resize(data_size + ALIGNMENT_BYTES - 1); + uint8* data_ptr = FindAlignedPointer(data.first->second.data()); std::memcpy(data_ptr, params.data().data(), data_size); soc_interface_AppendConstNode(params.name().c_str(), node_id + NODE_ID_OFFSET, shape_0, shape_1, @@ -267,27 +317,37 @@ bool HexagonControlWrapper::TeardownGraph() { return soc_interface_TeardownGraph(); } -bool HexagonControlWrapper::FillInputNode(const string& node_name, - const ConstByteArray bytes) { - uint64 byte_size; - const int x = 1; - const int y = 299; - const int z = 299; - const int d = 3; +bool HexagonControlWrapper::FillInputNode( + const string& node_name, + const std::array<int64, GraphTransferer::SHAPE_ARRAY_SIZE>& shape, + const ConstByteArray bytes) { + const string tensor_name = AddPort(node_name); + CHECK(input_port_map_.count(tensor_name) > 0); + const int port = input_port_map_.at(tensor_name); + if (input_tensor_data_.count(port) <= 0) { + input_tensor_data_.emplace(port, std::vector<uint8>{}); + } + std::vector<uint8>& input_tensor_data = input_tensor_data_.at(port); + + // hexagon only supports 32bit dimension + const int x = static_cast<int>(shape[0]); + const int y = static_cast<int>(shape[1]); + const int z = static_cast<int>(shape[2]); + const int d = static_cast<int>(shape[3]); + + const uint64 byte_size = x * y * z * d * DataTypeSize(std::get<2>(bytes)); + CHECK_EQ(byte_size, std::get<1>(bytes)); + input_tensor_data.resize(byte_size + ALIGNMENT_BYTES); + uint8* data_ptr = FindAlignedPointer(input_tensor_data.data()); + if (DBG_USE_DUMMY_INPUT) { - const int array_length = x * y * z * d; - byte_size = array_length * sizeof(float); - dummy_input_float_.resize(array_length); - std::memset(dummy_input_float_.data(), 0, byte_size); + std::memset(data_ptr, 0, byte_size); } else { - CHECK(std::get<2>(bytes) == DT_FLOAT); - byte_size = std::get<1>(bytes); - dummy_input_float_.resize(byte_size / sizeof(float)); - std::memcpy(dummy_input_float_.data(), std::get<0>(bytes), byte_size); + std::memcpy(data_ptr, std::get<0>(bytes), byte_size); } - return soc_interface_FillInputNodeFloat( - x, y, z, d, reinterpret_cast<uint8*>(dummy_input_float_.data()), - byte_size); + + return soc_interface_FillInputNodeWithPort(port, x, y, z, d, data_ptr, + byte_size); } bool HexagonControlWrapper::ReadOutputNode( @@ -304,26 +364,28 @@ bool HexagonControlWrapper::ReadOutputNode( break; } } - std::vector<IRemoteFusedGraphExecutor::ByteArray> outputs; + std::vector<ByteArray> outputs; ReadOutputNode(node_name, &outputs); CHECK_EQ(1, outputs.size()); - IRemoteFusedGraphExecutor::ByteArray& output = outputs[0]; + ByteArray& output = outputs[0]; Tensor* output_tensor = tensor_allocator(output_shape); CHECK(output_tensor->TotalBytes() >= std::get<1>(output)) << output_tensor->TotalBytes() << ", " << std::get<1>(output); - // TODO(satok): Avoid specifying float - std::memcpy(output_tensor->flat<float>().data(), std::get<0>(output), - std::get<1>(output)); + TF_CHECK_OK(RemoteFusedGraphExecuteUtils::CopyByteArrayToTensor( + std::get<0>(output), std::get<1>(output), output_tensor)); } bool HexagonControlWrapper::ReadOutputNode( const string& node_name, std::vector<ByteArray>* const outputs) { CHECK(outputs != nullptr); ByteArray output; - soc_interface_ReadOutputNodeFloat(node_name.c_str(), &std::get<0>(output), - &std::get<1>(output)); + const string tensor_name = AddPort(node_name); + CHECK(output_port_map_.count(tensor_name) > 0); + const int port = output_port_map_.at(tensor_name); + soc_interface_ReadOutputNodeWithPort(port, &std::get<0>(output), + &std::get<1>(output)); // TODO: Accept all results - std::get<2>(output) = DT_FLOAT; + // std::get<2>(output) = DT_FLOAT; outputs->emplace_back(output); return true; } @@ -347,7 +409,9 @@ bool HexagonControlWrapper::FillInputNode(const string& node_name, } } } - FillInputNode(node_name, ba); + const std::array<int64, GraphTransferer::SHAPE_ARRAY_SIZE> shape = + GraphTransferer::ToTensorShapeArray(tensor.shape()); + FillInputNode(node_name, shape, ba); return true; } @@ -360,7 +424,9 @@ bool HexagonControlWrapper::Finalize() { return false; } bool HexagonControlWrapper::SetupGraph() { return false; } bool HexagonControlWrapper::ExecuteGraph() { return false; } bool HexagonControlWrapper::TeardownGraph() { return false; } -bool HexagonControlWrapper::FillInputNode(const string&, const ConstByteArray) { +bool HexagonControlWrapper::FillInputNode( + const string&, const std::array<int64, GraphTransferer::SHAPE_ARRAY_SIZE>&, + const ConstByteArray) { return false; } bool HexagonControlWrapper::FillInputNode(const string&, const Tensor&) { diff --git a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h index 97448884e1..209ac9dbf4 100644 --- a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h +++ b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_CONTROL_WRAPPER_H_ #define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_CONTROL_WRAPPER_H_ +#include <unordered_map> #include <vector> #include "tensorflow/core/framework/types.h" @@ -32,6 +33,9 @@ namespace tensorflow { */ class HexagonControlWrapper final : public IRemoteFusedGraphExecutor { public: + using ByteArray = + std::tuple<uint8* /* data */, uint64 /* size */, DataType /* type */>; + HexagonControlWrapper() = default; int GetVersion() final; bool Init(const RemoteFusedGraphExecuteInfo& info) final; @@ -45,7 +49,13 @@ class HexagonControlWrapper final : public IRemoteFusedGraphExecutor { bool ReadOutputNode(const string& node_name, std::vector<ByteArray>* outputs); private: - bool FillInputNode(const string& node_name, const ConstByteArray bytes); + using ConstByteArray = std::tuple<const uint8* /* data */, uint64 /* size */, + DataType /* type */>; + + bool FillInputNode( + const string& node_name, + const std::array<int64, GraphTransferer::SHAPE_ARRAY_SIZE>& shape, + const ConstByteArray bytes); // CAVEAT: Need offset as HVX library reserves some ids static constexpr int NODE_ID_OFFSET = 0x10000; @@ -57,11 +67,15 @@ class HexagonControlWrapper final : public IRemoteFusedGraphExecutor { GraphTransferer graph_transferer_{}; // Dummy float array for input node. // TODO(satok): Use actual data passed by FillInputNode and remove - std::vector<float> dummy_input_float_{}; + // std::vector<float> dummy_input_float_{}; + std::unordered_map<int, std::vector<uint8>> input_tensor_data_{}; // Dummy byte array for cosnt node. // TODO(satok): Remove std::unordered_map<int, std::vector<uint8>> dummy_const_data_{}; + std::unordered_map<string, int> input_port_map_{}; + std::unordered_map<string, int> output_port_map_{}; + TF_DISALLOW_COPY_AND_ASSIGN(HexagonControlWrapper); }; diff --git a/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc b/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc index 54ba101501..cb9091e29f 100644 --- a/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc +++ b/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc @@ -46,8 +46,7 @@ adb push /tmp/imagenet_comp_graph_label_strings.txt /data/local/tmp namespace tensorflow { -using ByteArray = IRemoteFusedGraphExecutor::ByteArray; -using ConstByteArray = IRemoteFusedGraphExecutor::ConstByteArray; +using ByteArray = HexagonControlWrapper::ByteArray; constexpr const char* const IMAGE_FILENAME = "/data/local/tmp/img_299x299.bmp"; constexpr const char* const MODEL_FILENAME = @@ -87,8 +86,7 @@ static void DumpTop10Results(const int byte_size, 10 /* show top_n results */); } -static void DumpTop10Results( - const std::vector<IRemoteFusedGraphExecutor::ByteArray>& outputs) { +static void DumpTop10Results(const std::vector<ByteArray>& outputs) { CHECK(outputs.size() == 1); const int byte_size = std::get<1>(outputs.at(0)); const float* float_array = @@ -96,9 +94,8 @@ static void DumpTop10Results( DumpTop10Results(byte_size, float_array); } -static void CheckFirstResult( - const std::vector<IRemoteFusedGraphExecutor::ByteArray>& outputs, - const int expected_first_id) { +static void CheckFirstResult(const std::vector<ByteArray>& outputs, + const int expected_first_id) { EXPECT_GE(outputs.size(), 1); const int byte_size = std::get<1>(outputs.at(0)); const int element_count = byte_size / sizeof(float); @@ -240,7 +237,7 @@ static void RunInferenceByHexagonControlWrapper( } // 5-1. Read output node's outputs - std::vector<IRemoteFusedGraphExecutor::ByteArray> outputs; + std::vector<ByteArray> outputs; hexagon_control_wrapper.ReadOutputNode("softmax", &outputs); // 5-2. Dump results diff --git a/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.cc b/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.cc index a4b79e6ec4..2b7585aed1 100644 --- a/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.cc +++ b/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.cc @@ -350,6 +350,8 @@ HexagonOpsDefinitions::BuildOpNameToSocOpTypeMap() { #ifdef ENABLE_EXPERIMENTAL_HEXNN_OPS EmplaceOpType("QuantizedMul", {}, SupportedOpType::QUANTIZED_MUL_8x8to32, &op_map); + EmplaceOpType("QuantizedAdd", {}, SupportedOpType::QUANTIZED_ADD_8p8to32, + &op_map); EmplaceOpType("Pad", {}, SupportedOpType::PAD_F, &op_map); EmplaceOpType("SpaceToBatchND", {}, SupportedOpType::SPACE_TO_BATCH_ND_F, &op_map), @@ -359,6 +361,11 @@ HexagonOpsDefinitions::BuildOpNameToSocOpTypeMap() { &op_map); EmplaceOpType("ConcatV2", {}, SupportedOpType::CONCAT_V2_F, &op_map); EmplaceOpType("Conv2DBackpropInput", {}, SupportedOpType::DECONV_F, &op_map); + + EmplaceOpType("Tanh", {}, SupportedOpType::TANH_F, &op_map); + EmplaceOpType("Split", {}, SupportedOpType::SPLIT_F, &op_map); + EmplaceOpType("Transpose", {}, SupportedOpType::TRANSPOSE_F, &op_map); + EmplaceOpType("Concat", {}, SupportedOpType::CONCAT_F, &op_map); #endif return op_map; }; diff --git a/tensorflow/core/kernels/i_remote_fused_graph_executor.h b/tensorflow/core/kernels/i_remote_fused_graph_executor.h index fe62a259de..09d1f43ff1 100644 --- a/tensorflow/core/kernels/i_remote_fused_graph_executor.h +++ b/tensorflow/core/kernels/i_remote_fused_graph_executor.h @@ -25,10 +25,6 @@ namespace tensorflow { class IRemoteFusedGraphExecutor { public: - using ByteArray = - std::tuple<uint8* /* data */, uint64 /* size */, DataType /* type */>; - using ConstByteArray = std::tuple<const uint8* /* data */, uint64 /* size */, - DataType /* type */>; using TensorAllocatorFunc = std::function<Tensor*(const TensorShape& shape)>; IRemoteFusedGraphExecutor() = default; diff --git a/tensorflow/core/kernels/remote_fused_graph_execute_utils.cc b/tensorflow/core/kernels/remote_fused_graph_execute_utils.cc index 103b2be691..dd9839d245 100644 --- a/tensorflow/core/kernels/remote_fused_graph_execute_utils.cc +++ b/tensorflow/core/kernels/remote_fused_graph_execute_utils.cc @@ -1280,6 +1280,69 @@ RemoteFusedGraphExecuteUtils::FuseRemoteGraphByPlacedArguments( return true; } +/* static */ Status RemoteFusedGraphExecuteUtils::CopyByteArrayToTensor( + const void* src_ptr, const int src_size, Tensor* tensor) { + CHECK(tensor->TotalBytes() >= src_size) + << tensor->TotalBytes() << ", " << src_size; + void* dst_ptr; + switch (tensor->dtype()) { + case DT_FLOAT: + dst_ptr = tensor->flat<float>().data(); + break; + case DT_DOUBLE: + dst_ptr = tensor->flat<double>().data(); + break; + case DT_INT32: + dst_ptr = tensor->flat<int32>().data(); + break; + case DT_UINT8: + dst_ptr = tensor->flat<uint8>().data(); + break; + case DT_INT16: + dst_ptr = tensor->flat<int16>().data(); + break; + case DT_INT8: + dst_ptr = tensor->flat<int8>().data(); + break; + case DT_STRING: + dst_ptr = tensor->flat<string>().data(); + break; + case DT_INT64: + dst_ptr = tensor->flat<int64>().data(); + break; + case DT_BOOL: + dst_ptr = tensor->flat<bool>().data(); + break; + case DT_QINT8: + dst_ptr = tensor->flat<qint8>().data(); + break; + case DT_QUINT8: + dst_ptr = tensor->flat<quint8>().data(); + break; + case DT_QINT32: + dst_ptr = tensor->flat<qint32>().data(); + break; + case DT_BFLOAT16: + dst_ptr = tensor->flat<bfloat16>().data(); + break; + case DT_QINT16: + dst_ptr = tensor->flat<qint16>().data(); + break; + case DT_QUINT16: + dst_ptr = tensor->flat<quint16>().data(); + break; + case DT_UINT16: + dst_ptr = tensor->flat<uint16>().data(); + break; + default: + CHECK(false) << "type " << tensor->dtype() << " is not supported."; + break; + } + CHECK_NOTNULL(dst_ptr); + std::memcpy(dst_ptr, src_ptr, src_size); + return Status::OK(); +} + /* static */ Status RemoteFusedGraphExecuteUtils::ReplaceInputNodeByPlaceHolder( const string& input, const DataType type, const TensorShape& shape, GraphDef* graph_def) { diff --git a/tensorflow/core/kernels/remote_fused_graph_execute_utils.h b/tensorflow/core/kernels/remote_fused_graph_execute_utils.h index a80fc79784..1d4423ed46 100644 --- a/tensorflow/core/kernels/remote_fused_graph_execute_utils.h +++ b/tensorflow/core/kernels/remote_fused_graph_execute_utils.h @@ -157,7 +157,7 @@ class RemoteFusedGraphExecuteUtils { const std::vector<std::pair<string, Tensor>>& input_tensors, const bool dry_run_inference, GraphDef* graph_def); - // Build remote fused graph execute info + // Build remote fused graph execute info. static Status BuildRemoteFusedGraphExecuteInfo( const string& executor_name, const GraphDef& subgraph_def, const std::vector<string>& inputs, const std::vector<string>& outputs, @@ -165,31 +165,31 @@ class RemoteFusedGraphExecuteUtils { DataTypeVector* input_types, DataTypeVector* output_types); // Build remote fused graph execute op node by fusing specified subgraph - // as remote fused graph execute info + // as remote fused graph execute info. static Status BuildRemoteFusedGraphExecuteOpNode( const string& node_name, const string& executor_name, const GraphDef& subgraph_def, const std::vector<string>& inputs, const std::vector<string>& outputs, const bool require_shape_type, Graph* graph, Node** created_node); - // Build Identity node to forward remote graph node output + // Build Identity node to forward remote graph node output. static Status BuildIdentityOpNode(const string& node_name, const string& input_node_name, const int input_node_port, const DataType dt, Graph* graph, Node** created_node); - // Create clusters of given nodes + // Create clusters of given nodes. static Status ClusterizeNodes(const std::unordered_set<string>& node_names, const GraphDef& graph_def, std::vector<ClusterInfo>* cluster_infos); - // Build GraphDef of a given cluster + // Build GraphDef of a given cluster. static Status BuildClusterSubgraphDef(const ClusterInfo& cluster, const GraphDef& graph_def, GraphDef* subgraph_def); - // Build a cluster by given border + // Build a cluster by given border. // CAVEAT: The border must be consistent for one cluster. static Status BuildClusterByBorder(const std::vector<string>& border_inputs, const std::vector<string>& border_outputs, @@ -211,7 +211,7 @@ class RemoteFusedGraphExecuteUtils { const bool require_shape_type, GraphDef* output_graph_def); - // Fuse subgraph of specified nodes + // Fuse subgraph of specified nodes. static Status FuseRemoteGraphByNodeNames( const GraphDef& input_graph_def, const std::vector<string>& inputs, const std::vector<string>& outputs, @@ -220,7 +220,7 @@ class RemoteFusedGraphExecuteUtils { const string& remote_fused_graph_executor_name, const bool require_shape_type, GraphDef* output_graph_def); - // Fuse subgraph of specified border + // Fuse subgraph of specified border. static Status FuseRemoteGraphByBorder( const GraphDef& input_graph_def, const std::vector<string>& inputs, const std::vector<string>& outputs, @@ -230,7 +230,7 @@ class RemoteFusedGraphExecuteUtils { const string& remote_graph_executor_name, const bool require_shape_type, GraphDef* output_graph_def); - // Place arguments to fuse remote graph + // Place arguments to fuse remote graph. static Status PlaceRemoteGraphArguments( const std::vector<string>& inputs, const std::vector<string>& outputs, const std::unordered_set<string>& fused_node_names, @@ -239,7 +239,7 @@ class RemoteFusedGraphExecuteUtils { const string& remote_fused_graph_node_name, const string& remote_graph_executor_name, GraphDef* graph_def); - // Fuse remote graph by placed arguments + // Fuse remote graph by placed arguments. static Status FuseRemoteGraphByPlacedArguments( const GraphDef& input_graph_def, const std::vector<std::pair<string, Tensor>>& input_tensors, @@ -249,6 +249,15 @@ class RemoteFusedGraphExecuteUtils { const GraphDef& input_graph_def, const std::vector<std::pair<string, Tensor>>& input_tensors); + // Copy a byte array to a tensor data. Though tensor data must be + // updated with typed information in general, we can't guarantee that + // returned values from a remote processor has typed information because + // a logic running in the remote processor possibly be in a separate binary + // which may not link tensorflow libraries. To deal with this situation, + // remote fused graph needs to overwrite the tensor data by a byte array. + static Status CopyByteArrayToTensor(const void* src_ptr, const int src_size, + Tensor* tensor); + private: static void EmplaceTensorShapeType(const string& name, const Tensor& tensor, TensorShapeMap* tensor_shape_map); diff --git a/tensorflow/core/platform/hexagon/soc_interface.h b/tensorflow/core/platform/hexagon/soc_interface.h index f4a3cdf4bd..ca37b63e2b 100644 --- a/tensorflow/core/platform/hexagon/soc_interface.h +++ b/tensorflow/core/platform/hexagon/soc_interface.h @@ -22,6 +22,8 @@ limitations under the License. // naming conflicts. #ifdef __cplusplus extern "C" { +#else +#include <stdbool.h> #endif // __cplusplus // Returns the version of loaded hexagon wrapper shared library. // You should assert that the version matches the expected version before @@ -39,13 +41,30 @@ bool soc_interface_Finalize(); bool soc_interface_ExecuteGraph(); // Teardown graph setup bool soc_interface_TeardownGraph(); + +// Allocate buffers for input node and output node +bool soc_interface_AllocateInOutNodeBuffers(int input_count, int* input_sizes, + int output_count, + int* output_sizes); + +// Send input data to SOC with port +bool soc_interface_FillInputNodeWithPort(int port, int x, int y, int z, int d, + const uint8_t* const buf, + uint64_t buf_byte_size); + // Send input data to SOC bool soc_interface_FillInputNodeFloat(int x, int y, int z, int d, const uint8_t* const buf, - uint64_t buf_size); + uint64_t buf_byte_size); + +// Load output data from SOC with port +bool soc_interface_ReadOutputNodeWithPort(int port, uint8_t** buf, + uint64_t* buf_byte_size); + // Load output data from SOC bool soc_interface_ReadOutputNodeFloat(const char* const node_name, - uint8_t** buf, uint64_t* buf_size); + uint8_t** buf, uint64_t* buf_byte_size); + // Setup graph // TODO(satok): Remove and use runtime version bool soc_interface_setupDummyGraph(int version); |