aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/examples/multibox_detector
diff options
context:
space:
mode:
authorGravatar Andrew Harp <andrewharp@google.com>2017-01-25 16:01:16 -0800
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2017-01-25 17:09:37 -0800
commitd7478ece1254e5a8f2382064213364f53ace1413 (patch)
treea5a5ec435991345e2b7d22049442c51bb3824045 /tensorflow/examples/multibox_detector
parent554e5f29878b2c89b4ff5357784027aa74dec72e (diff)
Adding MultiBox person detector standalone example.
Change: 145608840
Diffstat (limited to 'tensorflow/examples/multibox_detector')
-rw-r--r--tensorflow/examples/multibox_detector/BUILD41
-rw-r--r--tensorflow/examples/multibox_detector/README.md71
-rwxr-xr-xtensorflow/examples/multibox_detector/data/surfers.jpgbin0 -> 53834 bytes
-rw-r--r--tensorflow/examples/multibox_detector/main.cc428
4 files changed, 540 insertions, 0 deletions
diff --git a/tensorflow/examples/multibox_detector/BUILD b/tensorflow/examples/multibox_detector/BUILD
new file mode 100644
index 0000000000..5d0c769007
--- /dev/null
+++ b/tensorflow/examples/multibox_detector/BUILD
@@ -0,0 +1,41 @@
+# Description:
+# TensorFlow C++ inference example for detecting objects in images.
+
+# Package-wide settings: targets default to being visible to
+# "//tensorflow:internal", and the "layering_check" and "parse_headers"
+# features are listed with a leading "-" (i.e. turned off) for this package.
+package(
+ default_visibility = ["//tensorflow:internal"],
+ features = [
+ "-layering_check",
+ "-parse_headers",
+ ],
+)
+
+licenses(["notice"]) # Apache 2.0
+
+# Exports the LICENSE file so that other packages may reference it.
+exports_files(["LICENSE"])
+
+# The `detect_objects` command-line binary, built from main.cc. It is linked
+# with "-lm" and depends on the TensorFlow C++ ops library, the internal
+# framework target and the core TensorFlow target.
+cc_binary(
+ name = "detect_objects",
+ srcs = [
+ "main.cc",
+ ],
+ linkopts = ["-lm"],
+ deps = [
+ "//tensorflow/cc:cc_ops",
+ "//tensorflow/core:framework_internal",
+ "//tensorflow/core:tensorflow",
+ ],
+)
+
+# Every file in this package, except METADATA and OWNERS files and anything
+# under bin/ or gen/. Visible to //tensorflow and the packages beneath it.
+filegroup(
+ name = "all_files",
+ srcs = glob(
+ ["**/*"],
+ exclude = [
+ "**/METADATA",
+ "**/OWNERS",
+ "bin/**",
+ "gen/**",
+ ],
+ ),
+ visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/examples/multibox_detector/README.md b/tensorflow/examples/multibox_detector/README.md
new file mode 100644
index 0000000000..42c63a5167
--- /dev/null
+++ b/tensorflow/examples/multibox_detector/README.md
@@ -0,0 +1,71 @@
+# TensorFlow C++ MultiBox Object Detection Demo
+
+This example shows how you can load a pre-trained TensorFlow network and use it
+to detect objects in images in C++. For an alternate implementation see the
+[Android TensorFlow demo](https://tensorflow.org/tutorials/image_recognition/).
+
+## Description
+
+This demo uses a model based on [Scalable Object Detection using Deep Neural Networks](https://arxiv.org/abs/1312.2249) to detect people in images passed in from
+the command line. This is the same model also used in the Android TensorFlow
+demo for real-time person detection and tracking in the camera preview.
+
+## To build/install/run
+
+The TensorFlow `GraphDef` that contains the model definition and weights is not
+packaged in the repo because of its size. Instead, you must first download the
+file to the `data` directory in the source tree:
+
+```bash
+$ wget https://storage.googleapis.com/download.tensorflow.org/models/mobile_multibox_v1a.zip -O tensorflow/examples/multibox_detector/data/mobile_multibox_v1a.zip
+
+$ unzip tensorflow/examples/multibox_detector/data/mobile_multibox_v1a.zip -d tensorflow/examples/multibox_detector/data/
+```
+
+Then, as long as you've managed to build the main TensorFlow framework, you
+should have everything you need to run this example installed already.
+
+Once extracted, see the box priors file in the data directory. This file
+contains means and standard deviations for all 784 possible detections,
+normalized from 0-1 in left top right bottom order.
+
+To build it, run this command:
+
+```bash
+$ bazel build -c opt tensorflow/examples/multibox_detector/...
+```
+
+That should build a binary executable that you can then run like this:
+
+```bash
+$ bazel-bin/tensorflow/examples/multibox_detector/detect_objects --image_out=$HOME/surfers_labeled.png
+```
+
+This uses the default example image that ships with the framework, and should
+output something similar to this:
+
+```
+I0125 18:24:13.804047 8677 main.cc:293] ===== Top 5 Detections ======
+I0125 18:24:13.804058 8677 main.cc:307] Detection 0: L:324.542 T:76.5764 R:373.26 B:214.957 (635) score: 0.267425
+I0125 18:24:13.804077 8677 main.cc:307] Detection 1: L:332.896 T:76.2751 R:372.116 B:204.614 (523) score: 0.245334
+I0125 18:24:13.804087 8677 main.cc:307] Detection 2: L:306.605 T:76.2228 R:371.356 B:217.32 (634) score: 0.216121
+I0125 18:24:13.804096 8677 main.cc:307] Detection 3: L:143.918 T:86.0909 R:187.333 B:195.885 (387) score: 0.171368
+I0125 18:24:13.804104 8677 main.cc:307] Detection 4: L:144.915 T:86.2675 R:185.243 B:165.246 (219) score: 0.169244
+```
+
+In this case, we're using a public domain stock image of surfers walking on the
+beach, and the top few detections are of the two on the right. Adding more
+detections with --num_detections=N will also include the surfer on the left,
+and eventually non-person boxes below a certain threshold.
+
+You can visually inspect the detections by viewing the resulting png file
+'~/surfers_labeled.png'.
+
+Next, try it out on your own images by supplying the --image= argument, e.g.
+
+```bash
+$ bazel-bin/tensorflow/examples/multibox_detector/detect_objects --image=my_image.png
+```
+
+For another implementation of this work, you can check out the [Android
+TensorFlow demo](https://tensorflow.org/tutorials/image_recognition/).
diff --git a/tensorflow/examples/multibox_detector/data/surfers.jpg b/tensorflow/examples/multibox_detector/data/surfers.jpg
new file mode 100755
index 0000000000..940cf234d4
--- /dev/null
+++ b/tensorflow/examples/multibox_detector/data/surfers.jpg
Binary files differ
diff --git a/tensorflow/examples/multibox_detector/main.cc b/tensorflow/examples/multibox_detector/main.cc
new file mode 100644
index 0000000000..42972078e5
--- /dev/null
+++ b/tensorflow/examples/multibox_detector/main.cc
@@ -0,0 +1,428 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <setjmp.h>
+#include <stdio.h>
+#include <string.h>
+#include <fstream>
+#include <vector>
+
+#include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/cc/ops/image_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/graph/default_device.h"
+#include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/public/session.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+// These are all common classes it's handy to reference with no namespace.
+using tensorflow::Flag;
+using tensorflow::Tensor;
+using tensorflow::Status;
+using tensorflow::string;
+using tensorflow::int32;
+using tensorflow::uint8;
+
+// Reads box priors from the text file `file_name`. Each line holds a
+// comma-separated list of floats; the values of all lines are appended, in
+// file order, to `*result` (which is cleared first).
+//
+// On success `*found_label_count` receives the total number of floats read.
+// Returns NotFound if the file cannot be opened and InvalidArgument if a line
+// contains something that does not parse as a list of floats.
+Status ReadLocationsFile(const string& file_name, std::vector<float>* result,
+                         size_t* found_label_count) {
+  std::ifstream file(file_name);
+  if (!file) {
+    return tensorflow::errors::NotFound("Box priors file ", file_name,
+                                        " not found.");
+  }
+  result->clear();
+  string line;
+  while (std::getline(file, line)) {
+    std::vector<float> tokens;
+    // Report malformed input to the caller rather than aborting the process;
+    // every caller already handles a non-OK Status.
+    if (!tensorflow::str_util::SplitAndParseAsFloats(line, ',', &tokens)) {
+      return tensorflow::errors::InvalidArgument(
+          "Could not parse line '", line, "' of box priors file ", file_name,
+          " as comma-separated floats.");
+    }
+    result->insert(result->end(), tokens.begin(), tokens.end());
+  }
+  *found_label_count = result->size();
+  return Status::OK();
+}
+
+// Loads the image at `file_name`, decodes it, and fills `*out_tensors` with
+// two tensors:
+//   [0] the image cast to float, given a leading batch dimension of 1,
+//       bilinearly resized to input_height x input_width and normalized as
+//       (pixel - input_mean) / input_std;
+//   [1] the decoded image at its original size, so the caller can read the
+//       original dimensions and optionally draw on it and save it.
+// The decoder is chosen from the file extension: ".png" and ".gif" are handled
+// explicitly and anything else is assumed to be a JPEG.
+// Returns the first error hit while building or running the graph.
+Status ReadTensorFromImageFile(const string& file_name, const int input_height,
+                               const int input_width, const float input_mean,
+                               const float input_std,
+                               std::vector<Tensor>* out_tensors) {
+  auto root = tensorflow::Scope::NewRootScope();
+  using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
+
+  string input_name = "file_reader";
+  string original_name = "identity";
+  string output_name = "normalized";
+  auto file_reader =
+      tensorflow::ops::ReadFile(root.WithOpName(input_name), file_name);
+  // Now try to figure out what kind of file it is and decode it.
+  const int wanted_channels = 3;
+  tensorflow::Output image_reader;
+  if (tensorflow::StringPiece(file_name).ends_with(".png")) {
+    image_reader = DecodePng(root.WithOpName("png_reader"), file_reader,
+                             DecodePng::Channels(wanted_channels));
+  } else if (tensorflow::StringPiece(file_name).ends_with(".gif")) {
+    // DecodeGif produces a 4-D [num_frames, height, width, 3] tensor, unlike
+    // the 3-D output of the PNG and JPEG decoders. Squeeze away the frame
+    // dimension so the rest of the pipeline sees [height, width, 3];
+    // otherwise ExpandDims below would hand ResizeBilinear a 5-D tensor.
+    // NOTE: this only helps single-frame GIFs; animated ones are unsupported.
+    image_reader = Squeeze(root.WithOpName("squeeze_first_dim"),
+                           DecodeGif(root.WithOpName("gif_reader"),
+                                     file_reader));
+  } else {
+    // Assume if it's neither a PNG nor a GIF then it must be a JPEG.
+    image_reader = DecodeJpeg(root.WithOpName("jpeg_reader"), file_reader,
+                              DecodeJpeg::Channels(wanted_channels));
+  }
+
+  // Also return identity so that we can know the original dimensions and
+  // optionally save the image out with bounding boxes overlaid.
+  auto original_image = Identity(root.WithOpName(original_name), image_reader);
+
+  // Now cast the image data to float so we can do normal math on it.
+  auto float_caster = Cast(root.WithOpName("float_caster"), original_image,
+                           tensorflow::DT_FLOAT);
+  // The convention for image ops in TensorFlow is that all images are expected
+  // to be in batches, so that they're four-dimensional arrays with indices of
+  // [batch, height, width, channel]. Because we only have a single image, we
+  // have to add a batch dimension of 1 to the start with ExpandDims().
+  auto dims_expander = ExpandDims(root, float_caster, 0);
+
+  // Bilinearly resize the image to fit the required dimensions.
+  auto resized = ResizeBilinear(
+      root, dims_expander,
+      Const(root.WithOpName("size"), {input_height, input_width}));
+  // Subtract the mean and divide by the scale.
+  Div(root.WithOpName(output_name), Sub(root, resized, {input_mean}),
+      {input_std});
+
+  // This runs the GraphDef network definition that we've just constructed, and
+  // returns the results in the output tensor.
+  tensorflow::GraphDef graph;
+  TF_RETURN_IF_ERROR(root.ToGraphDef(&graph));
+
+  std::unique_ptr<tensorflow::Session> session(
+      tensorflow::NewSession(tensorflow::SessionOptions()));
+  TF_RETURN_IF_ERROR(session->Create(graph));
+  TF_RETURN_IF_ERROR(
+      session->Run({}, {output_name, original_name}, {}, out_tensors));
+  return Status::OK();
+}
+
+// Encodes `tensor` as a PNG and writes it to `file_path`.
+//
+// Only PNG output is supported: returns InvalidArgument when `file_path` does
+// not end in ".png". Otherwise returns the first error hit while building or
+// running the encode-and-write graph, or OK on success.
+Status SaveImage(const Tensor& tensor, const string& file_path) {
+  // Reject unsupported formats with an error the caller can report, instead
+  // of aborting the whole process.
+  if (!tensorflow::StringPiece(file_path).ends_with(".png")) {
+    return tensorflow::errors::InvalidArgument(
+        "Only saving of png files is supported, but got '", file_path, "'.");
+  }
+  LOG(INFO) << "Saving image to " << file_path;
+
+  auto root = tensorflow::Scope::NewRootScope();
+  using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
+
+  string encoder_name = "encode";
+  string output_name = "file_writer";
+
+  tensorflow::Output image_encoder =
+      EncodePng(root.WithOpName(encoder_name), tensor);
+  tensorflow::ops::WriteFile file_saver = tensorflow::ops::WriteFile(
+      root.WithOpName(output_name), file_path, image_encoder);
+
+  tensorflow::GraphDef graph;
+  TF_RETURN_IF_ERROR(root.ToGraphDef(&graph));
+
+  std::unique_ptr<tensorflow::Session> session(
+      tensorflow::NewSession(tensorflow::SessionOptions()));
+  TF_RETURN_IF_ERROR(session->Create(graph));
+  // The writer has no output tensor to fetch; it is run as a target node.
+  std::vector<Tensor> outputs;
+  TF_RETURN_IF_ERROR(session->Run({}, {}, {output_name}, &outputs));
+
+  return Status::OK();
+}
+
+// Loads the binary GraphDef stored at `graph_file_name` and creates a fresh
+// session for it, handing the session back through `*session`.
+// Returns NotFound when the proto cannot be read; otherwise returns whatever
+// status creating the graph in the session produced.
+Status LoadGraph(string graph_file_name,
+                 std::unique_ptr<tensorflow::Session>* session) {
+  tensorflow::GraphDef model_def;
+  const Status read_status = ReadBinaryProto(tensorflow::Env::Default(),
+                                             graph_file_name, &model_def);
+  if (!read_status.ok()) {
+    return tensorflow::errors::NotFound("Failed to load compute graph at '",
+                                        graph_file_name, "'");
+  }
+
+  session->reset(tensorflow::NewSession(tensorflow::SessionOptions()));
+  // The creation status is the function's result, whether OK or not.
+  return (*session)->Create(model_def);
+}
+
+// Runs a TopK op over outputs[0] (the score tensor of the MultiBox graph) to
+// find the `how_many_labels` highest scores. On success `*scores` holds those
+// scores and `*indices` their positions in the score tensor, which identify
+// the individual box detections.
+Status GetTopDetections(const std::vector<Tensor>& outputs, int how_many_labels,
+                        Tensor* indices, Tensor* scores) {
+  using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
+  auto top_k_scope = tensorflow::Scope::NewRootScope();
+
+  const string top_k_name = "top_k";
+  TopKV2(top_k_scope.WithOpName(top_k_name), outputs[0], how_many_labels);
+
+  // Turn the one-node graph built above into a GraphDef and run it.
+  tensorflow::GraphDef top_k_graph;
+  TF_RETURN_IF_ERROR(top_k_scope.ToGraphDef(&top_k_graph));
+
+  std::unique_ptr<tensorflow::Session> top_k_session(
+      tensorflow::NewSession(tensorflow::SessionOptions()));
+  TF_RETURN_IF_ERROR(top_k_session->Create(top_k_graph));
+
+  // TopK has two outputs: ":0" carries the scores and ":1" the indices they
+  // came from, so both have to be fetched by name.
+  const std::vector<string> fetch_names = {top_k_name + ":0",
+                                           top_k_name + ":1"};
+  std::vector<Tensor> fetched;
+  TF_RETURN_IF_ERROR(top_k_session->Run({}, fetch_names, {}, &fetched));
+  *scores = fetched[0];
+  *indices = fetched[1];
+  return Status::OK();
+}
+
+// Turns the 4 encoded box coordinates in `encoded_location` into actual
+// coordinates, written to `decoded_location`. `box_priors` holds 8 floats:
+// a (mean, standard deviation) pair for each coordinate. Every coordinate is
+// computed as encoding * std_dev + mean and then clamped to [0, 1].
+// Logs a warning when all 4 encodings are exactly zero.
+void DecodeLocation(const float* encoded_location, const float* box_priors,
+                    float* decoded_location) {
+  bool all_zero = true;
+  for (int coord = 0; coord < 4; ++coord) {
+    const float encoded = encoded_location[coord];
+    if (encoded != 0.0f) {
+      all_zero = false;
+    }
+
+    // Priors are interleaved: [mean, std_dev] for each coordinate.
+    const float* prior = box_priors + coord * 2;
+    const float unclamped = encoded * prior[1] + prior[0];
+    decoded_location[coord] = std::min(std::max(unclamped, 0.0f), 1.0f);
+  }
+
+  if (all_zero) {
+    LOG(WARNING) << "No non-zero encodings; check log for inference errors.";
+  }
+}
+
+// Converts a raw model score into a confidence value by applying the logistic
+// function 1 / (1 + e^-x).
+float DecodeScore(float encoded_score) {
+  const auto decay = exp(-encoded_score);
+  return 1 / (1 + decay);
+}
+
+// Draws the one-pixel outline of the box (left, top)-(right, bottom) into
+// `image`, a flattened buffer indexed as (y * image_width + x) * 3 + channel.
+// The box edges are first clamped to the image bounds. Outline pixels get 0
+// in channels 0 and 1 and 255 in channel 2 (blue, if the channels are RGB).
+void DrawBox(const int image_width, const int image_height, int left, int top,
+             int right, int bottom, tensorflow::TTypes<uint8>::Flat* image) {
+  tensorflow::TTypes<uint8>::Flat pixels = *image;
+
+  // Pull every edge back inside the image.
+  const int max_x = image_width - 1;
+  const int max_y = image_height - 1;
+  left = std::max(0, std::min(max_x, left));
+  right = std::max(0, std::min(max_x, right));
+  top = std::max(0, std::min(max_y, top));
+  bottom = std::max(0, std::min(max_y, bottom));
+
+  // Per-channel value of the outline.
+  const uint8 outline[3] = {0, 0, 255};
+
+  // Top and bottom edges.
+  for (int x = left; x <= right; ++x) {
+    for (int channel = 0; channel < 3; ++channel) {
+      pixels((top * image_width + x) * 3 + channel) = outline[channel];
+      pixels((bottom * image_width + x) * 3 + channel) = outline[channel];
+    }
+  }
+
+  // Left and right edges.
+  for (int y = top; y <= bottom; ++y) {
+    for (int channel = 0; channel < 3; ++channel) {
+      pixels((y * image_width + left) * 3 + channel) = outline[channel];
+      pixels((y * image_width + right) * 3 + channel) = outline[channel];
+    }
+  }
+}
+
+// Given the outputs of a model run (outputs[0] = scores, outputs[1] = encoded
+// box locations), logs the `num_detections` highest-scoring boxes, draws each
+// of them onto `original_tensor` and, if `image_file_name` is non-empty, saves
+// the annotated image to that path.
+//
+// `labels_file_name` names the box priors file. It must hold exactly 8 values
+// (a mean and a standard deviation for each of the 4 box edges) for each of
+// the `num_boxes` boxes. `original_tensor` is expected to be a
+// [height, width, 3] uint8 image.
+//
+// Returns a non-OK status if the priors file cannot be read or has the wrong
+// size, if the tensors do not have the expected layout, or if running TopK or
+// saving the image fails.
+Status PrintTopDetections(const std::vector<Tensor>& outputs,
+                          const string& labels_file_name,
+                          const int num_boxes,
+                          const int num_detections,
+                          const string& image_file_name,
+                          Tensor* original_tensor) {
+  if (outputs.size() < 2) {
+    return tensorflow::errors::InvalidArgument(
+        "Expected score and location tensors but got ", outputs.size(),
+        " output tensor(s).");
+  }
+
+  std::vector<float> locations;
+  size_t label_count = 0;
+  Status read_labels_status =
+      ReadLocationsFile(labels_file_name, &locations, &label_count);
+  if (!read_labels_status.ok()) {
+    LOG(ERROR) << read_labels_status;
+    return read_labels_status;
+  }
+  // Every box needs a (mean, std dev) pair for each of its 4 edges. Report a
+  // mismatch as an error rather than aborting the process.
+  if (num_boxes < 0 || label_count != static_cast<size_t>(num_boxes) * 8) {
+    return tensorflow::errors::InvalidArgument(
+        "Box priors file ", labels_file_name, " holds ", label_count,
+        " values, but ", num_boxes, " boxes need 8 values each.");
+  }
+
+  // There is one score per box, so never ask TopK for more than num_boxes
+  // entries (label_count counts prior values, which is 8x too many).
+  const int how_many_labels = std::min(num_detections, num_boxes);
+  Tensor indices;
+  Tensor scores;
+  TF_RETURN_IF_ERROR(
+      GetTopDetections(outputs, how_many_labels, &indices, &scores));
+
+  tensorflow::TTypes<float>::Flat scores_flat = scores.flat<float>();
+
+  tensorflow::TTypes<int32>::Flat indices_flat = indices.flat<int32>();
+
+  const Tensor& encoded_locations = outputs[1];
+  auto locations_encoded = encoded_locations.flat<float>();
+
+  LOG(INFO) << original_tensor->DebugString();
+  // DrawBox indexes the image as [height, width, 3]; refuse anything else.
+  if (original_tensor->dims() != 3 || original_tensor->dim_size(2) != 3) {
+    return tensorflow::errors::InvalidArgument(
+        "Expected a [height, width, 3] image tensor but got shape ",
+        original_tensor->shape().DebugString());
+  }
+  const int image_width = original_tensor->shape().dim_size(1);
+  const int image_height = original_tensor->shape().dim_size(0);
+
+  tensorflow::TTypes<uint8>::Flat image_flat = original_tensor->flat<uint8>();
+
+  LOG(INFO) << "===== Top " << how_many_labels << " Detections ======";
+  for (int pos = 0; pos < how_many_labels; ++pos) {
+    const int label_index = indices_flat(pos);
+    const float score = scores_flat(pos);
+
+    // The index is used for raw pointer arithmetic into both the priors and
+    // the encoded locations, so make sure it addresses a complete entry.
+    if (label_index < 0 || label_index >= num_boxes ||
+        (label_index + 1) * 4 > locations_encoded.size()) {
+      return tensorflow::errors::OutOfRange(
+          "Detection index ", label_index, " does not refer to one of the ",
+          num_boxes, " known boxes.");
+    }
+
+    float decoded_location[4];
+    DecodeLocation(&locations_encoded(label_index * 4),
+                   &locations[label_index * 8], decoded_location);
+
+    // Decoded coordinates are normalized to [0, 1]; scale them to pixels.
+    float left = decoded_location[0] * image_width;
+    float top = decoded_location[1] * image_height;
+    float right = decoded_location[2] * image_width;
+    float bottom = decoded_location[3] * image_height;
+
+    LOG(INFO) << "Detection " << pos << ": "
+              << "L:" << left << " "
+              << "T:" << top << " "
+              << "R:" << right << " "
+              << "B:" << bottom << " "
+              << "(" << label_index << ") score: " << DecodeScore(score);
+
+    DrawBox(image_width, image_height, static_cast<int>(left),
+            static_cast<int>(top), static_cast<int>(right),
+            static_cast<int>(bottom), &image_flat);
+  }
+
+  if (!image_file_name.empty()) {
+    return SaveImage(*original_tensor, image_file_name);
+  }
+  return Status::OK();
+}
+
+// Entry point: parses the command-line flags, loads the model graph, reads and
+// preprocesses the input image, runs the model on it and reports the top
+// detections. Returns 0 on success and -1 on any failure.
+int main(int argc, char* argv[]) {
+  // Default value of every flag. They describe where the graph and the input
+  // data live and what kind of input the model expects, so they need to be
+  // overridden when a different model is used.
+  string image_file = "tensorflow/examples/multibox_detector/data/surfers.jpg";
+  string graph_file =
+      "tensorflow/examples/multibox_detector/data/"
+      "multibox_model.pb";
+  string priors_file =
+      "tensorflow/examples/multibox_detector/data/"
+      "multibox_location_priors.txt";
+  int32 width = 224;
+  int32 height = 224;
+  int32 mean = 128;
+  int32 std_dev = 128;
+  int32 detection_count = 5;
+  int32 box_count = 784;
+  string input_node = "ResizeBilinear";
+  string location_node = "output_locations/Reshape";
+  string score_node = "output_scores/Reshape";
+  string base_dir = "";
+  string annotated_image_file = "";
+
+  std::vector<Flag> flag_list = {
+      Flag("image", &image_file, "image to be processed"),
+      Flag("image_out", &annotated_image_file,
+           "location to save output image, if desired"),
+      Flag("graph", &graph_file, "graph to be executed"),
+      Flag("box_priors", &priors_file, "name of file containing box priors"),
+      Flag("input_width", &width, "resize image to this width in pixels"),
+      Flag("input_height", &height, "resize image to this height in pixels"),
+      Flag("input_mean", &mean, "scale pixel values to this mean"),
+      Flag("input_std", &std_dev, "scale pixel values to this std deviation"),
+      Flag("num_detections", &detection_count,
+           "number of top detections to return"),
+      Flag("num_boxes", &box_count,
+           "number of boxes defined by the location file"),
+      Flag("input_layer", &input_node, "name of input layer"),
+      Flag("output_location_layer", &location_node,
+           "name of location output layer"),
+      Flag("output_score_layer", &score_node, "name of score output layer"),
+      Flag("root_dir", &base_dir,
+           "interpret image and graph file names relative to this directory"),
+  };
+
+  const string usage = tensorflow::Flags::Usage(argv[0], flag_list);
+  if (!tensorflow::Flags::Parse(&argc, argv, flag_list)) {
+    LOG(ERROR) << usage;
+    return -1;
+  }
+
+  // Sets up the global state TensorFlow needs. Any argument still left over
+  // after flag parsing is one this program does not know.
+  tensorflow::port::InitMain(argv[0], &argc, &argv);
+  if (argc > 1) {
+    LOG(ERROR) << "Unknown argument " << argv[1] << "\n" << usage;
+    return -1;
+  }
+
+  // Step 1: load the model and create a session for it.
+  std::unique_ptr<tensorflow::Session> session;
+  const string graph_path = tensorflow::io::JoinPath(base_dir, graph_file);
+  const Status graph_status = LoadGraph(graph_path, &session);
+  if (!graph_status.ok()) {
+    LOG(ERROR) << graph_status;
+    return -1;
+  }
+
+  // Step 2: read the image. Tensor 0 is the resized, normalized float input
+  // for the model; tensor 1 is the image at its original size.
+  std::vector<Tensor> image_tensors;
+  const string image_path = tensorflow::io::JoinPath(base_dir, image_file);
+  const Status image_status = ReadTensorFromImageFile(
+      image_path, height, width, mean, std_dev, &image_tensors);
+  if (!image_status.ok()) {
+    LOG(ERROR) << image_status;
+    return -1;
+  }
+  const Tensor& model_input = image_tensors[0];
+
+  // Step 3: run the model, fetching scores first and locations second.
+  std::vector<Tensor> outputs;
+  const Status run_status = session->Run({{input_node, model_input}},
+                                         {score_node, location_node}, {},
+                                         &outputs);
+  if (!run_status.ok()) {
+    LOG(ERROR) << "Running model failed: " << run_status;
+    return -1;
+  }
+
+  // Step 4: log the best detections and draw them onto the original image.
+  const Status print_status =
+      PrintTopDetections(outputs, priors_file, box_count, detection_count,
+                         annotated_image_file, &image_tensors[1]);
+  if (!print_status.ok()) {
+    LOG(ERROR) << "Running print failed: " << print_status;
+    return -1;
+  }
+  return 0;
+}