aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/core/util/example_proto_fast_parsing.h
blob: 055d9c2c305ba816cb0a6ac22ca4e1c65ae2d27d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_FAST_PARSING_H_
#define TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_FAST_PARSING_H_

#include <string>
#include <unordered_map>
#include <vector>

#include "tensorflow/core/example/example.pb.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/partial_tensor_shape.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/sparse/sparse_tensor.h"

namespace tensorflow {
namespace example {

// FastParseExampleConfig defines how to parse features in Example.
// Each sub-config is responsible for one feature identified with feature_name.
// FastParseExampleConfig can't have two sub-configs with the same feature_name.
// dtype identifies the type of output vector and the kind of Feature expected
// in Example.
//
// NOTE: Only `collect_feature_stats` has a default member initializer. The
// scalar members of `Dense` and `Sparse` (`dtype`, `variable_length`,
// `elements_per_stride`) have none, so callers are expected to set them
// explicitly when building a sub-config.
struct FastParseExampleConfig {
  // Sub-config for one feature that is described by a shape and a default
  // value (see the field comments below).
  struct Dense {
    // Name of the feature this sub-config is responsible for. Must be unique
    // across all sub-configs of the enclosing FastParseExampleConfig.
    string feature_name;
    // Type of the output vector and kind of Feature expected in the Example.
    DataType dtype;
    // These 2 fields correspond exactly to dense_shapes and dense_defaults in
    // ParseExample op.
    // Documentation is available in: tensorflow/core/ops/parsing_ops.cc
    PartialTensorShape shape;
    Tensor default_value;
    // NOTE(review): the semantics of the next two fields are not documented in
    // this header. Presumably `variable_length` marks a feature whose number
    // of values may differ between examples, and `elements_per_stride` is the
    // number of elements in one row of such a feature -- confirm against
    // example_proto_fast_parsing.cc.
    bool variable_length;
    std::size_t elements_per_stride;
  };

  // Sub-config for one feature that is described only by its name and type.
  struct Sparse {
    // Name of the feature this sub-config is responsible for. Must be unique
    // across all sub-configs of the enclosing FastParseExampleConfig.
    string feature_name;
    // Type of the output vector and kind of Feature expected in the Example.
    DataType dtype;
  };

  // The per-feature sub-configs. A given feature_name may appear at most once
  // across both vectors.
  std::vector<Dense> dense;
  std::vector<Sparse> sparse;

  // If `true`, `Result::feature_stats` will contain one
  // `PerExampleFeatureStats` for each serialized example in the input.
  bool collect_feature_stats = false;
};

// Statistics about the features in each example passed to
// `FastParse[Single]Example()`.
//
// One instance describes a single example; both counters start at zero.
//
// TODO(b/111553342): The gathered statistics currently have two limitations:
// * Feature names that appear more than once will be counted multiple times.
// * The feature values count only represents the counts for features that were
//   requested in the `FastParseExampleConfig`.
// These could be addressed with additional work at runtime.
struct PerExampleFeatureStats {
  // The number of feature names in an example. Per the limitation above, a
  // name that appears more than once is counted each time it appears.
  size_t features_count = 0;

  // The sum of the number of values in each feature that is parsed. Per the
  // limitation above, only features requested in the config contribute.
  size_t feature_values_count = 0;
};

// This is exactly the output of TF's ParseExample Op.
// Documentation is available in: tensorflow/core/ops/parsing_ops.cc
//
// Passed by pointer to the FastParse* functions declared below, which fill it
// in.
struct Result {
  // NOTE(review): presumably each of these three vectors holds one tensor per
  // entry in `FastParseExampleConfig::sparse`, in the same order -- confirm
  // against the ParseExample op documentation.
  std::vector<Tensor> sparse_indices;
  std::vector<Tensor> sparse_values;
  std::vector<Tensor> sparse_shapes;
  // NOTE(review): presumably one tensor per entry in
  // `FastParseExampleConfig::dense`, in the same order -- confirm.
  std::vector<Tensor> dense_values;

  // This vector will be populated with one element per example if
  // `FastParseExampleConfig::collect_feature_stats` is set to `true`.
  std::vector<PerExampleFeatureStats> feature_stats;
};

// Parses a batch of serialized Example protos and converts them into result
// according to given config.
// Given example names have to either be empty or the same size as serialized.
// example_names are used only for error messages.
//
// Args:
//   config:        describes which features to parse and how.
//   serialized:    the batch of serialized Example protos.
//   example_names: either empty or one name per element of `serialized`.
//   thread_pool:   NOTE(review): presumably used to parallelize parsing across
//                  the batch; whether nullptr is accepted is not visible from
//                  this header -- confirm.
//   result:        output; receives the parsed tensors.
// Returns a Status describing the outcome of the parse.
Status FastParseExample(const FastParseExampleConfig& config,
                        gtl::ArraySlice<string> serialized,
                        gtl::ArraySlice<string> example_names,
                        thread::ThreadPool* thread_pool, Result* result);

// TODO(mrry): Move the hash table construction into the config object.
// The single-example parser currently takes the very same config type as the
// batch parser; this alias only gives it a distinct name.
using FastParseSingleExampleConfig = FastParseExampleConfig;

// Single-input counterpart of FastParseExample: takes one serialized proto
// instead of a batch, and takes neither example names nor a thread pool.
//
// Args:
//   config:     describes which features to parse and how (same type as the
//               batch config, see FastParseSingleExampleConfig).
//   serialized: the serialized proto to parse.
//   result:     output; receives the parsed tensors.
// Returns a Status describing the outcome of the parse.
Status FastParseSingleExample(const FastParseSingleExampleConfig& config,
                              const string& serialized, Result* result);

// Parses a batch of serialized SequenceExample protos and converts them into
// result according to given config.
// Given example names have to either be empty or the same size as serialized.
// example_names are used only for error messages.
//
// Args:
//   context_config:        describes the features to parse for the context
//                          part; results go to `context_result`.
//   feature_list_config:   describes the features to parse for the feature
//                          list part; results go to `feature_list_result`.
//   serialized:            the batch of serialized SequenceExample protos.
//   example_names:         either empty or one name per element of
//                          `serialized`.
//   thread_pool:           NOTE(review): presumably used to parallelize
//                          parsing; nullptr handling is not visible from this
//                          header -- confirm.
//   context_result:        output for the context features.
//   feature_list_result:   output for the feature list features.
//   dense_feature_lengths: output. NOTE(review): presumably one tensor per
//                          dense feature list holding per-example lengths --
//                          confirm against the implementation.
// Returns a Status describing the outcome of the parse.
Status FastParseSequenceExample(
    const example::FastParseExampleConfig& context_config,
    const example::FastParseExampleConfig& feature_list_config,
    gtl::ArraySlice<string> serialized, gtl::ArraySlice<string> example_names,
    thread::ThreadPool* thread_pool, example::Result* context_result,
    example::Result* feature_list_result,
    std::vector<Tensor>* dense_feature_lengths);

// This function parses serialized Example and populates given example.
// It uses the same specialized parser as FastParseExample which is efficient.
// But then constructs Example which is relatively slow.
// It is exported here as a convenient API to test parser part separately.
//
// Args:
//   serialized: the serialized Example to parse.
//   example:    output; populated from the parsed data.
// NOTE(review): the bool return presumably reports whether parsing succeeded
// -- confirm against the implementation.
bool TestFastParse(const string& serialized, Example* example);

}  // namespace example
}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_FAST_PARSING_H_