aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar Derek Murray <mrry@google.com>2017-12-20 20:54:25 -0800
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2017-12-20 20:57:54 -0800
commit96f3023b6a8b154c3840776c5feff3e028860a36 (patch)
tree023052b527aaf486dc52aba70bbf4de84685527f
parent469e10274cb4e0c79cd314d3986917f322d4e0c1 (diff)
[tf.data] Add `tf.contrib.data.parse_single_example()`.
The new op is a fused implementation of the existing `tf.parse_single_example()`, which is more efficient when parsing a single Example at a time. PiperOrigin-RevId: 179768512
-rw-r--r--tensorflow/contrib/data/BUILD1
-rw-r--r--tensorflow/contrib/data/__init__.py3
-rw-r--r--tensorflow/core/api_def/base_api/api_def_ParseSingleExample.pbtxt78
-rw-r--r--tensorflow/core/kernels/example_parsing_ops.cc104
-rw-r--r--tensorflow/core/kernels/example_parsing_ops_test.cc83
-rw-r--r--tensorflow/core/ops/parsing_ops.cc82
-rw-r--r--tensorflow/core/ops/parsing_ops_test.cc2
-rw-r--r--tensorflow/core/util/example_proto_fast_parsing.cc398
-rw-r--r--tensorflow/core/util/example_proto_fast_parsing.h6
-rw-r--r--tensorflow/core/util/example_proto_helper.cc21
-rw-r--r--tensorflow/core/util/example_proto_helper.h70
-rw-r--r--tensorflow/python/kernel_tests/BUILD17
-rw-r--r--tensorflow/python/kernel_tests/parse_single_example_op_test.py930
-rw-r--r--tensorflow/python/ops/parsing_ops.py195
14 files changed, 1962 insertions, 28 deletions
diff --git a/tensorflow/contrib/data/BUILD b/tensorflow/contrib/data/BUILD
index 3b1c33063f..8ecc003348 100644
--- a/tensorflow/contrib/data/BUILD
+++ b/tensorflow/contrib/data/BUILD
@@ -20,6 +20,7 @@ py_library(
"//tensorflow/contrib/data/python/ops:readers",
"//tensorflow/contrib/data/python/ops:shuffle_ops",
"//tensorflow/contrib/data/python/ops:transformation_ops",
+ "//tensorflow/python:parsing_ops",
"//tensorflow/python:util",
"//tensorflow/python/data/ops:iterator_ops",
],
diff --git a/tensorflow/contrib/data/__init__.py b/tensorflow/contrib/data/__init__.py
index c9ad091bd4..46125a8875 100644
--- a/tensorflow/contrib/data/__init__.py
+++ b/tensorflow/contrib/data/__init__.py
@@ -38,6 +38,8 @@ See the @{$datasets$Importing Data} Programmer's Guide for an overview.
@@sloppy_interleave
@@get_single_element
+
+@@parse_single_example
"""
from __future__ import absolute_import
@@ -68,6 +70,7 @@ from tensorflow.contrib.data.python.ops.resampling import rejection_resample
from tensorflow.contrib.data.python.ops.scan_ops import scan
from tensorflow.contrib.data.python.ops.shuffle_ops import shuffle_and_repeat
from tensorflow.python.data.ops.iterator_ops import Iterator
+from tensorflow.python.ops.parsing_ops import parse_single_example_v2 as parse_single_example
# pylint: enable=unused-import
from tensorflow.python.util.all_util import remove_undocumented
diff --git a/tensorflow/core/api_def/base_api/api_def_ParseSingleExample.pbtxt b/tensorflow/core/api_def/base_api/api_def_ParseSingleExample.pbtxt
new file mode 100644
index 0000000000..476c01d0ad
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ParseSingleExample.pbtxt
@@ -0,0 +1,78 @@
+op {
+ graph_op_name: "ParseSingleExample"
+ in_arg {
+ name: "serialized"
+ description: <<END
+A scalar containing a binary serialized Example proto.
+END
+ }
+ in_arg {
+ name: "dense_defaults"
+ description: <<END
+A list of Tensors (some may be empty), whose length matches
+the length of `dense_keys`. dense_defaults[j] provides default values
+when the example's feature_map lacks dense_key[j]. If an empty Tensor is
+provided for dense_defaults[j], then the Feature dense_keys[j] is required.
+The input type is inferred from dense_defaults[j], even when it's empty.
+If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
+then the shape of dense_defaults[j] must match that of dense_shapes[j].
+If dense_shapes[j] has an undefined major dimension (variable strides dense
+feature), dense_defaults[j] must contain a single element:
+the padding element.
+END
+ }
+ attr {
+ name: "num_sparse"
+ description: <<END
+The number of sparse features to be parsed from the example. This
+must match the lengths of `sparse_keys` and `sparse_types`.
+END
+ }
+ attr {
+ name: "sparse_keys"
+ description: <<END
+A list of `num_sparse` strings.
+The keys expected in the Examples' features associated with sparse values.
+END
+ }
+ attr {
+ name: "dense_keys"
+ description: <<END
+The keys expected in the Examples' features associated with dense
+values.
+END
+ }
+ attr {
+ name: "sparse_types"
+ description: <<END
+A list of `num_sparse` types; the data types of data in each
+Feature given in sparse_keys.
+Currently the ParseSingleExample op supports DT_FLOAT (FloatList),
+DT_INT64 (Int64List), and DT_STRING (BytesList).
+END
+ }
+ attr {
+ name: "Tdense"
+ description: <<END
+The data types of data in each Feature given in dense_keys.
+The length of this list must match the length of `dense_keys`.
+Currently the ParseSingleExample op supports DT_FLOAT (FloatList),
+DT_INT64 (Int64List), and DT_STRING (BytesList).
+END
+ }
+ attr {
+ name: "dense_shapes"
+ description: <<END
+The shapes of data in each Feature given in dense_keys.
+The length of this list must match the length of `dense_keys`. The
+number of elements in the Feature corresponding to dense_key[j] must
+always equal dense_shapes[j].NumEntries(). If dense_shapes[j] ==
+(D0, D1, ..., DN) then the shape of output Tensor dense_values[j]
+will be (D0, D1, ..., DN): In the case dense_shapes[j] = (-1, D1,
+..., DN), the shape of the output Tensor dense_values[j] will be (M,
+D1, .., DN), where M is the number of blocks of elements of length
+D1 * .... * DN, in the input.
+END
+ }
+ summary: "Transforms a tf.Example proto (as a string) into typed tensors."
+}
diff --git a/tensorflow/core/kernels/example_parsing_ops.cc b/tensorflow/core/kernels/example_parsing_ops.cc
index 2db844e410..268a059275 100644
--- a/tensorflow/core/kernels/example_parsing_ops.cc
+++ b/tensorflow/core/kernels/example_parsing_ops.cc
@@ -34,9 +34,9 @@ limitations under the License.
namespace tensorflow {
-class ExampleParserOp : public OpKernel {
+class ParseExampleOp : public OpKernel {
public:
- explicit ExampleParserOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ explicit ParseExampleOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
OP_REQUIRES_OK(ctx, attrs_.Init(ctx));
}
@@ -162,11 +162,107 @@ class ExampleParserOp : public OpKernel {
}
protected:
- ParseSingleExampleAttrs attrs_;
+ ParseExampleAttrs attrs_;
};
REGISTER_KERNEL_BUILDER(Name("ParseExample").Device(DEVICE_CPU),
- ExampleParserOp);
+ ParseExampleOp);
+
+class ParseSingleExampleOp : public OpKernel {
+ public:
+ explicit ParseSingleExampleOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, attrs_.Init(ctx));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor* serialized;
+ OpInputList dense_defaults;
+
+ // Grab the input list arguments.
+ OP_REQUIRES_OK(ctx, ctx->input("serialized", &serialized));
+ OP_REQUIRES_OK(ctx, ctx->input_list("dense_defaults", &dense_defaults));
+
+ OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(serialized->shape()),
+ errors::InvalidArgument(
+ "Expected serialized to be a scalar, got shape: ",
+ serialized->shape().DebugString()));
+ OP_REQUIRES(ctx, dense_defaults.size() == attrs_.dense_keys.size(),
+ errors::InvalidArgument(
+ "Expected len(dense_defaults) == len(dense_keys) but got: ",
+ dense_defaults.size(), " vs. ", attrs_.dense_keys.size()));
+
+ for (size_t d = 0; d < attrs_.dense_keys.size(); ++d) {
+ const Tensor& def_value = dense_defaults[d];
+ if (attrs_.variable_length[d]) {
+ OP_REQUIRES(ctx, def_value.NumElements() == 1,
+ errors::InvalidArgument(
+ "dense_shape[", d, "] is a variable length shape: ",
+ attrs_.dense_shapes[d].DebugString(),
+ ", therefore "
+ "def_value[",
+ d,
+ "] must contain a single element ("
+ "the padding element). But its shape is: ",
+ def_value.shape().DebugString()));
+ } else if (def_value.NumElements() > 0) {
+ OP_REQUIRES(ctx,
+ attrs_.dense_shapes[d].IsCompatibleWith(def_value.shape()),
+ errors::InvalidArgument(
+ "def_value[", d,
+ "].shape() == ", def_value.shape().DebugString(),
+ " is not compatible with dense_shapes_[", d,
+ "] == ", attrs_.dense_shapes[d].DebugString()));
+ }
+ OP_REQUIRES(ctx, def_value.dtype() == attrs_.dense_types[d],
+ errors::InvalidArgument(
+ "dense_defaults[", d, "].dtype() == ",
+ DataTypeString(def_value.dtype()), " != dense_types_[", d,
+ "] == ", DataTypeString(attrs_.dense_types[d])));
+ }
+
+ example::Result result;
+
+ // TODO(mrry): Build the configuration once and cache it.
+ example::FastParseExampleConfig config;
+ for (int d = 0; d < attrs_.dense_keys.size(); ++d) {
+ config.dense.push_back({attrs_.dense_keys[d], attrs_.dense_types[d],
+ attrs_.dense_shapes[d], dense_defaults[d],
+ attrs_.variable_length[d],
+ attrs_.elements_per_stride[d]});
+ }
+ for (int d = 0; d < attrs_.sparse_keys.size(); ++d) {
+ config.sparse.push_back({attrs_.sparse_keys[d], attrs_.sparse_types[d]});
+ }
+
+ const string& serialized_proto = serialized->scalar<string>()();
+
+ OP_REQUIRES_OK(ctx,
+ FastParseSingleExample(config, serialized_proto, &result));
+
+ OpOutputList dense_values;
+ OpOutputList sparse_indices;
+ OpOutputList sparse_values;
+ OpOutputList sparse_shapes;
+ OP_REQUIRES_OK(ctx, ctx->output_list("dense_values", &dense_values));
+ OP_REQUIRES_OK(ctx, ctx->output_list("sparse_indices", &sparse_indices));
+ OP_REQUIRES_OK(ctx, ctx->output_list("sparse_values", &sparse_values));
+ OP_REQUIRES_OK(ctx, ctx->output_list("sparse_shapes", &sparse_shapes));
+ for (int d = 0; d < attrs_.dense_keys.size(); ++d) {
+ dense_values.set(d, result.dense_values[d]);
+ }
+ for (int d = 0; d < attrs_.sparse_keys.size(); ++d) {
+ sparse_indices.set(d, result.sparse_indices[d]);
+ sparse_values.set(d, result.sparse_values[d]);
+ sparse_shapes.set(d, result.sparse_shapes[d]);
+ }
+ }
+
+ protected:
+ ParseSingleExampleAttrs attrs_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("ParseSingleExample").Device(DEVICE_CPU),
+ ParseSingleExampleOp);
class SingleSequenceExampleParserOp : public OpKernel {
public:
diff --git a/tensorflow/core/kernels/example_parsing_ops_test.cc b/tensorflow/core/kernels/example_parsing_ops_test.cc
index 29dbfd3b1b..0a64a6c154 100644
--- a/tensorflow/core/kernels/example_parsing_ops_test.cc
+++ b/tensorflow/core/kernels/example_parsing_ops_test.cc
@@ -103,6 +103,9 @@ struct ExampleStore {
}
static ExampleTensorMap GetSerializedExamples() {
ExampleTensorMap examples;
+ AddExample(&examples, 10, 1, 1);
+ AddExample(&examples, 100, 1, 1);
+ AddExample(&examples, 1000, 1, 1);
AddExample(&examples, 10, 128, 1);
AddExample(&examples, 100, 128, 1);
AddExample(&examples, 1000, 128, 1);
@@ -186,6 +189,56 @@ static Graph* ParseExample(int batch_size, int num_keys, int feature_size) {
return g;
}
+template <typename Options>
+static Graph* ParseSingleExample(int num_keys, int feature_size) {
+ Graph* g = new Graph(OpRegistry::Global());
+ Tensor& serialized_batch_1 =
+ Options::Store::serialized_example[std::make_tuple(1, num_keys,
+ feature_size)];
+ Tensor serialized(DT_STRING, TensorShape());
+ serialized.scalar<string>()() = serialized_batch_1.vec<string>()(0);
+
+ std::vector<string> sparse_keys;
+ std::vector<string> dense_keys;
+ std::vector<NodeBuilder::NodeOut> dense_defaults;
+ std::vector<DataType> sparse_types;
+ std::vector<PartialTensorShape> dense_shapes;
+ Options opt;
+ for (int i = 0; i < num_keys; ++i) {
+ string key = strings::Printf("feature_%d", i);
+ switch (opt.benchmark_type) {
+ case kDense:
+ dense_keys.push_back(key),
+ dense_defaults.emplace_back(test::graph::Constant(
+ g, opt.filler.make_dense_default(feature_size)));
+ dense_shapes.push_back(PartialTensorShape({feature_size}));
+ break;
+ case kVarLenDense:
+ dense_keys.push_back(key),
+ dense_defaults.emplace_back(
+ test::graph::Constant(g, opt.filler.make_dense_default(1)));
+ dense_shapes.push_back(PartialTensorShape({-1}));
+ break;
+ case kSparse:
+ sparse_keys.push_back(key), sparse_types.push_back(opt.filler.dtype);
+ break;
+ }
+ }
+
+ Node* ret;
+ TF_EXPECT_OK(NodeBuilder(g->NewName("n"), "ParseSingleExample")
+ .Input(test::graph::Constant(g, serialized))
+ .Input(dense_defaults)
+ .Attr<int64>("num_sparse", sparse_keys.size())
+ .Attr("sparse_keys", sparse_keys)
+ .Attr("sparse_types", sparse_types)
+ .Attr("dense_keys", dense_keys)
+ .Attr("dense_shapes", dense_shapes)
+ .Finalize(g, &ret));
+
+ return g;
+}
+
// Benchmark settings (Sparse, Dense) X (Bytes, Int64, Float)
typedef BenchmarkOptions<ExampleStore<BytesFiller>, kSparse> SparseString;
typedef BenchmarkOptions<ExampleStore<BytesFiller>, kDense> DenseString;
@@ -212,10 +265,13 @@ typedef BenchmarkOptions<ExampleStore<FloatFiller>, kVarLenDense>
BENCHMARK(BM_ParseExample##_##TYPE##_##B##_##K##_##F);
#define BM_AllParseExample(Type) \
+ BM_ParseExample(Type, 1, 10, 1); \
BM_ParseExample(Type, 128, 10, 1); \
BM_ParseExample(Type, 512, 10, 1); \
+ BM_ParseExample(Type, 1, 100, 1); \
BM_ParseExample(Type, 128, 100, 1); \
BM_ParseExample(Type, 512, 100, 1); \
+ BM_ParseExample(Type, 1, 1000, 1); \
BM_ParseExample(Type, 128, 1000, 1); \
BM_ParseExample(Type, 512, 1000, 1); \
BM_ParseExample(Type, 1, 1, 1000000);
@@ -230,4 +286,31 @@ BM_AllParseExample(SparseFloat);
BM_AllParseExample(DenseFloat);
BM_AllParseExample(VarLenDenseFloat);
+// K == num_keys. F == feature_size.
+// K must be one of 1, 10, 100, 1000
+#define BM_ParseSingleExample(TYPE, K, F) \
+ static void BM_ParseSingleExample##_##TYPE##_1_##K##_##F(int iters) { \
+ int64 items_per_iter = K * F; \
+ testing::UseRealTime(); \
+ testing::ItemsProcessed(static_cast<int64>(iters) * items_per_iter); \
+ test::Benchmark("cpu", ParseSingleExample<TYPE>(K, F)).Run(iters); \
+ } \
+ BENCHMARK(BM_ParseSingleExample##_##TYPE##_1_##K##_##F);
+
+#define BM_AllParseSingleExample(Type) \
+ BM_ParseSingleExample(Type, 10, 1); \
+ BM_ParseSingleExample(Type, 100, 1); \
+ BM_ParseSingleExample(Type, 1000, 1); \
+ BM_ParseSingleExample(Type, 1, 1000000);
+
+BM_AllParseSingleExample(SparseString);
+BM_AllParseSingleExample(DenseString);
+BM_AllParseSingleExample(VarLenDenseString);
+BM_AllParseSingleExample(SparseInt64);
+BM_AllParseSingleExample(DenseInt64);
+BM_AllParseSingleExample(VarLenDenseInt64);
+BM_AllParseSingleExample(SparseFloat);
+BM_AllParseSingleExample(DenseFloat);
+BM_AllParseSingleExample(VarLenDenseFloat);
+
} // end namespace tensorflow
diff --git a/tensorflow/core/ops/parsing_ops.cc b/tensorflow/core/ops/parsing_ops.cc
index 40ec792ef8..11b28badc8 100644
--- a/tensorflow/core/ops/parsing_ops.cc
+++ b/tensorflow/core/ops/parsing_ops.cc
@@ -64,7 +64,7 @@ REGISTER_OP("ParseExample")
.Attr("Tdense: list({float,int64,string}) >= 0")
.Attr("dense_shapes: list(shape) >= 0")
.SetShapeFn([](InferenceContext* c) {
- ParseSingleExampleAttrs attrs;
+ ParseExampleAttrs attrs;
TF_RETURN_IF_ERROR(attrs.Init(c));
ShapeHandle input;
@@ -138,6 +138,86 @@ sparse_types: A list of Nsparse types; the data types of data in each Feature
DT_INT64 (Int64List), and DT_STRING (BytesList).
)doc");
+REGISTER_OP("ParseSingleExample")
+ .Input("serialized: string")
+ .Input("dense_defaults: Tdense")
+ .Output("sparse_indices: num_sparse * int64")
+ .Output("sparse_values: sparse_types")
+ .Output("sparse_shapes: num_sparse * int64")
+ .Output("dense_values: Tdense")
+ .Attr("num_sparse: int >= 0")
+ .Attr("sparse_keys: list(string) >= 0")
+ .Attr("dense_keys: list(string) >= 0")
+ .Attr("sparse_types: list({float,int64,string}) >= 0")
+ .Attr("Tdense: list({float,int64,string}) >= 0")
+ .Attr("dense_shapes: list(shape) >= 0")
+ .SetShapeFn([](InferenceContext* c) {
+ ParseSingleExampleAttrs attrs;
+ TF_RETURN_IF_ERROR(attrs.Init(c));
+
+ ShapeHandle input;
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &input));
+
+ // Output sparse_indices, sparse_values, and sparse_shapes.
+ int output_idx = 0;
+ for (int i = 0; i < attrs.sparse_keys.size(); ++i) {
+ c->set_output(output_idx++, c->Matrix(c->UnknownDim(), 1));
+ }
+ for (int i = 0; i < attrs.sparse_keys.size(); ++i) {
+ c->set_output(output_idx++, c->Vector(c->UnknownDim()));
+ }
+ for (int i = 0; i < attrs.sparse_keys.size(); ++i) {
+ c->set_output(output_idx++, c->Vector(1));
+ }
+
+ // Output dense_shapes.
+ for (int i = 0; i < attrs.dense_keys.size(); ++i) {
+ ShapeHandle dense;
+ TF_RETURN_IF_ERROR(
+ c->MakeShapeFromPartialTensorShape(attrs.dense_shapes[i], &dense));
+ c->set_output(output_idx++, dense);
+ }
+ return Status::OK();
+ })
+ .Doc(R"doc(
+Transforms a tf.Example proto (as a string) into typed tensors.
+
+serialized: A scalar containing a binary serialized Example proto.
+dense_keys: The keys expected in the Examples' features associated with dense
+ values.
+dense_defaults: A list of Tensors (some may be empty), whose length matches
+ the length of `dense_keys`. dense_defaults[j] provides default values
+ when the example's feature_map lacks dense_key[j]. If an empty Tensor is
+ provided for dense_defaults[j], then the Feature dense_keys[j] is required.
+ The input type is inferred from dense_defaults[j], even when it's empty.
+ If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
+ then the shape of dense_defaults[j] must match that of dense_shapes[j].
+ If dense_shapes[j] has an undefined major dimension (variable strides dense
+ feature), dense_defaults[j] must contain a single element:
+ the padding element.
+Tdense: The data types of data in each Feature given in dense_keys.
+ The length of this list must match the length of `dense_keys`.
+ Currently the ParseSingleExample op supports DT_FLOAT (FloatList),
+ DT_INT64 (Int64List), and DT_STRING (BytesList).
+dense_shapes: The shapes of data in each Feature given in dense_keys.
+ The length of this list must match the length of `dense_keys`. The
+ number of elements in the Feature corresponding to dense_key[j] must
+ always equal dense_shapes[j].NumEntries(). If dense_shapes[j] ==
+ (D0, D1, ..., DN) then the shape of output Tensor dense_values[j]
+ will be (D0, D1, ..., DN): In the case dense_shapes[j] = (-1, D1,
+ ..., DN), the shape of the output Tensor dense_values[j] will be (M,
+ D1, .., DN), where M is the number of blocks of elements of length
+ D1 * .... * DN, in the input.
+num_sparse: The number of sparse features to be parsed from the example. This
+ must match the lengths of `sparse_keys` and `sparse_types`.
+sparse_keys: A list of `num_sparse` strings.
+ The keys expected in the Examples' features associated with sparse values.
+sparse_types: A list of `num_sparse` types; the data types of data in each
+ Feature given in sparse_keys.
+ Currently the ParseSingleExample op supports DT_FLOAT (FloatList),
+ DT_INT64 (Int64List), and DT_STRING (BytesList).
+)doc");
+
REGISTER_OP("ParseSingleSequenceExample")
.Input("serialized: string")
.Input("feature_list_dense_missing_assumed_empty: string")
diff --git a/tensorflow/core/ops/parsing_ops_test.cc b/tensorflow/core/ops/parsing_ops_test.cc
index c6e521e33e..9121d7ae92 100644
--- a/tensorflow/core/ops/parsing_ops_test.cc
+++ b/tensorflow/core/ops/parsing_ops_test.cc
@@ -119,7 +119,7 @@ TEST(ParsingOpsTest, ParseExample_ShapeFn) {
("[?,2];[?,2];[?];[?];[2];[2];" // sparse outputs
"[d0_0,1];[d0_0,1,2];[d0_0,1,2,3]")); // dense outputs
- // Confirm an error from ParseSingleExampleAttrs.Init().
+ // Confirm an error from ParseExampleAttrs.Init().
set_outputs(2, 3, true /* add_extra_shape */);
INFER_ERROR("len(dense_keys) != len(dense_shapes)", op,
"?;?;?;?;?;?;?;?;?;?");
diff --git a/tensorflow/core/util/example_proto_fast_parsing.cc b/tensorflow/core/util/example_proto_fast_parsing.cc
index b9cf97195b..dd9ae46f88 100644
--- a/tensorflow/core/util/example_proto_fast_parsing.cc
+++ b/tensorflow/core/util/example_proto_fast_parsing.cc
@@ -94,9 +94,29 @@ class Feature {
return Status::OK();
}
+ bool GetNumElementsInBytesList(int* num_elements) {
+ protobuf::io::CodedInputStream stream(
+ reinterpret_cast<const uint8*>(serialized_.data()), serialized_.size());
+ EnableAliasing(&stream);
+ uint32 length = 0;
+ if (!stream.ReadVarint32(&length)) return false;
+ auto limit = stream.PushLimit(length);
+ *num_elements = 0;
+ while (!stream.ExpectAtEnd()) {
+ if (!stream.ExpectTag(kDelimitedTag(1))) return false;
+ uint32 bytes_length = 0;
+ if (!stream.ReadVarint32(&bytes_length)) return false;
+ if (!stream.Skip(bytes_length)) return false;
+ ++*num_elements;
+ }
+ stream.PopLimit(limit);
+ return true;
+ }
+
template <typename Result>
bool ParseBytesList(Result* bytes_list) {
DCHECK(bytes_list != nullptr);
+
protobuf::io::CodedInputStream stream(
reinterpret_cast<const uint8*>(serialized_.data()), serialized_.size());
@@ -111,9 +131,13 @@ class Feature {
// parse string
uint32 bytes_length;
if (!stream.ReadVarint32(&bytes_length)) return false;
- string bytes;
- if (!stream.ReadString(&bytes, bytes_length)) return false;
- bytes_list->push_back(std::move(bytes));
+ const void* buf_ptr = nullptr;
+ int size = 0;
+ if (!stream.GetDirectBufferPointer(&buf_ptr, &size)) return false;
+ if (size < bytes_length) return false;
+ bytes_list->push_back(
+ string(static_cast<const char*>(buf_ptr), bytes_length));
+ if (!stream.Skip(bytes_length)) return false;
}
stream.PopLimit(limit);
return true;
@@ -447,6 +471,28 @@ class LimitedArraySlice {
T* end_;
};
+void LogDenseFeatureDataLoss(StringPiece feature_name) {
+ LOG(WARNING) << "Data loss! Feature '" << feature_name
+ << "' is present in multiple concatenated "
+ "tf.Examples. Ignoring all but last one.";
+ static auto* duplicated_dense_feature = monitoring::Counter<0>::New(
+ "/tensorflow/core/util/example_proto_fast_parsing/"
+ "duplicated_dense_feature",
+ "Dense feature appears twice in a tf.Example");
+ duplicated_dense_feature->GetCell()->IncrementBy(1);
+}
+
+void LogSparseFeatureDataLoss(StringPiece feature_name) {
+ LOG(WARNING) << "Data loss! Feature '" << feature_name
+ << "' is present in multiple concatenated "
+ "tf.Examples. Ignoring all but last one.";
+ static auto* duplicated_sparse_feature = monitoring::Counter<0>::New(
+ "/tensorflow/core/util/example_proto_fast_parsing/"
+ "duplicated_sparse_feature",
+ "Sparse feature appears twice in a tf.Example");
+ duplicated_sparse_feature->GetCell()->IncrementBy(1);
+}
+
Status FastParseSerializedExample(
const string& serialized_example, const string& example_name,
const size_t example_index, const Config& config,
@@ -510,14 +556,7 @@ Status FastParseSerializedExample(
// If feature was already visited, skip.
// Compare comment at the beginning of the loop.
if (dense_feature_last_example[d] == example_index) {
- LOG(WARNING) << "Data loss! Feature '" << feature_name
- << "' in present in multiple concatenated "
- "tf.Examples. Ignoring all but last one.";
- static auto* duplicated_dense_feature = monitoring::Counter<0>::New(
- "/tensorflow/core/util/example_proto_fast_parsing/"
- "duplicated_dense_feature",
- "Dense feature appears twice in a tf.Example");
- duplicated_dense_feature->GetCell()->IncrementBy(1);
+ LogDenseFeatureDataLoss(feature_name);
continue;
}
dense_feature_last_example[d] = example_index;
@@ -639,14 +678,7 @@ Status FastParseSerializedExample(
// If feature was already visited, skip.
// Compare comment at the beginning of the loop.
if (sparse_feature_last_example[d] == example_index) {
- LOG(WARNING) << "Data loss! Feature '" << feature_name
- << "' in present in multiple concatenated "
- "tf.Examples. Ignoring all but last one.";
- static auto* duplicated_sparse_feature = monitoring::Counter<0>::New(
- "/tensorflow/core/util/example_proto_fast_parsing/"
- "duplicated_sparse_feature",
- "sparse feature appears twice in a tf.Example");
- duplicated_sparse_feature->GetCell()->IncrementBy(1);
+ LogSparseFeatureDataLoss(feature_name);
continue;
}
sparse_feature_last_example[d] = example_index;
@@ -1099,5 +1131,333 @@ Status FastParseExample(const Config& config,
return Status::OK();
}
+Status FastParseSingleExample(const Config& config, const string& serialized,
+ Result* result) {
+ DCHECK(result != nullptr);
+ // Check config so we can safely CHECK(false) in switches on config.*.dtype
+ for (auto& c : config.sparse) {
+ TF_RETURN_IF_ERROR(CheckConfigDataType(c.dtype));
+ }
+ for (auto& c : config.dense) {
+ TF_RETURN_IF_ERROR(CheckConfigDataType(c.dtype));
+ }
+
+ // TODO(mrry): Cache the construction of this map at Op construction time.
+ size_t config_size = config.dense.size() + config.sparse.size();
+ SeededHasher hasher;
+ // Build config index.
+ PresizedCuckooMap<std::pair<size_t, Type>> config_index(config_size);
+ bool ok = true;
+ for (size_t i = 0; i < 1000; ++i) {
+ for (size_t d = 0; d < config.dense.size(); ++d) {
+ ok &= config_index.InsertUnique(hasher(config.dense[d].feature_name),
+ {d, Type::Dense});
+ }
+ for (size_t d = 0; d < config.sparse.size(); ++d) {
+ ok &= config_index.InsertUnique(hasher(config.sparse[d].feature_name),
+ {d, Type::Sparse});
+ }
+ if (ok) break;
+ LOG(WARNING) << "Collision found. This should happen only if you have "
+ "around 2^32 entries in your config.";
+ hasher.seed++;
+ config_index.Clear(config_size);
+ }
+ if (!ok) {
+ return errors::Internal(
+ "Could not avoid collision. This should not happen.");
+ }
+
+ // Allocate dense output tensors.
+ for (size_t d = 0; d < config.dense.size(); ++d) {
+ if (!config.dense[d].variable_length) {
+ TensorShape values_shape;
+ if (!config.dense[d].shape.AsTensorShape(&values_shape)) {
+ return errors::Internal(
+ "Fixed-length shape was not a statically defined shape.");
+ }
+ result->dense_values.emplace_back(config.dense[d].dtype, values_shape);
+ } else {
+ // Variable-length tensor will be allocated later.
+ result->dense_values.emplace_back();
+ }
+ }
+
+ // Allocate sparse output tensors.
+ for (size_t d = 0; d < config.sparse.size(); ++d) {
+ // The dense_shape is always a vector of length 1.
+ result->sparse_shapes.emplace_back(DT_INT64, TensorShape({1}));
+ // Variable-length tensors will be allocated later.
+ result->sparse_indices.emplace_back();
+ result->sparse_values.emplace_back();
+ }
+
+ parsed::Example parsed_example;
+ if (!ParseExample(serialized, &parsed_example)) {
+ return errors::InvalidArgument("Could not parse example input, value: '",
+ serialized, "'");
+ }
+ std::vector<bool> sparse_feature_already_seen(config.sparse.size(), false);
+ std::vector<bool> dense_feature_already_seen(config.dense.size(), false);
+
+ // Handle features present in the example.
+ const size_t parsed_example_size = parsed_example.size();
+ for (size_t i = 0; i < parsed_example_size; ++i) {
+ // This is a logic that standard protobuf parsing is implementing.
+ // I.e. last entry in the map overwrites all the previous ones.
+ parsed::FeatureMapEntry& name_and_feature =
+ parsed_example[parsed_example_size - i - 1];
+
+ const StringPiece feature_name = name_and_feature.first;
+ parsed::Feature& feature = name_and_feature.second;
+
+ std::pair<size_t, Type> d_and_type;
+ uint64 h = hasher(feature_name);
+ if (!config_index.Find(h, &d_and_type)) continue;
+
+ size_t d = d_and_type.first;
+ bool is_dense = d_and_type.second == Type::Dense;
+
+ {
+ // Testing for PresizedCuckooMap collision.
+ // TODO(lew): Use dense_hash_map and avoid this and hasher creation.
+ const string& config_feature_name = is_dense
+ ? config.dense[d].feature_name
+ : config.sparse[d].feature_name;
+ if (feature_name != config_feature_name) continue;
+ }
+
+ auto example_error = [feature_name](StringPiece suffix) {
+ return errors::InvalidArgument("Key: ", feature_name, ". ", suffix);
+ };
+
+ auto parse_error = [feature_name] {
+ return errors::InvalidArgument("Key: ", feature_name,
+ ". Can't parse serialized Example.");
+ };
+
+ DataType example_dtype;
+ TF_RETURN_IF_ERROR(feature.ParseDataType(&example_dtype));
+ if (example_dtype == DT_INVALID) continue;
+
+ if (is_dense && !config.dense[d].variable_length) {
+ // If feature was already visited, skip.
+ // Compare comment at the beginning of the loop.
+ if (dense_feature_already_seen[d]) {
+ LogDenseFeatureDataLoss(feature_name);
+ continue;
+ }
+ dense_feature_already_seen[d] = true;
+
+ if (example_dtype != config.dense[d].dtype) {
+ return example_error(strings::StrCat(
+ "Data types don't match. Data type: ",
+ DataTypeString(example_dtype),
+ " but expected type: ", DataTypeString(config.dense[d].dtype)));
+ }
+
+ Tensor* out = &result->dense_values[d];
+ const std::size_t num_elements = config.dense[d].elements_per_stride;
+
+ switch (example_dtype) {
+ case DT_INT64: {
+ auto out_p = out->flat<int64>().data();
+ LimitedArraySlice<int64> slice(out_p, num_elements);
+ if (!feature.ParseInt64List(&slice)) return parse_error();
+ if (slice.EndDistance() != 0) {
+ return parse_error();
+ }
+ break;
+ }
+ case DT_FLOAT: {
+ auto out_p = out->flat<float>().data();
+ LimitedArraySlice<float> slice(out_p, num_elements);
+ if (!feature.ParseFloatList(&slice)) return parse_error();
+ if (slice.EndDistance() != 0) {
+ return parse_error();
+ }
+ break;
+ }
+ case DT_STRING: {
+ auto out_p = out->flat<string>().data();
+ LimitedArraySlice<string> slice(out_p, num_elements);
+ if (!feature.ParseBytesList(&slice)) return parse_error();
+ if (slice.EndDistance() != 0) {
+ return parse_error();
+ }
+ break;
+ }
+ default:
+ LOG(FATAL) << "Should not happen.";
+ }
+
+ } else { // if variable length
+ SparseBuffer out_temp;
+ const size_t num_elements_divisor =
+ is_dense ? config.dense[d].elements_per_stride : 1;
+ size_t num_elements;
+
+ if (is_dense) {
+ // If feature was already visited, skip.
+ // Compare comment at the beginning of the loop.
+ if (dense_feature_already_seen[d]) {
+ LogDenseFeatureDataLoss(feature_name);
+ continue;
+ }
+ dense_feature_already_seen[d] = true;
+ if (example_dtype != config.dense[d].dtype) {
+ return example_error(strings::StrCat(
+ "Data types don't match. Data type: ",
+ DataTypeString(example_dtype),
+ " but expected type: ", DataTypeString(config.dense[d].dtype)));
+ }
+ } else {
+ // If feature was already visited, skip.
+ // Compare comment at the beginning of the loop.
+ if (sparse_feature_already_seen[d]) {
+ LogSparseFeatureDataLoss(feature_name);
+ continue;
+ }
+ sparse_feature_already_seen[d] = true;
+
+ // Handle sparse features.
+ if (example_dtype != DT_INVALID &&
+ example_dtype != config.sparse[d].dtype) {
+ return example_error(strings::StrCat(
+ "Data types don't match. ",
+ "Expected type: ", DataTypeString(config.sparse[d].dtype),
+ ", Actual type: ", DataTypeString(example_dtype)));
+ }
+ }
+
+ switch (example_dtype) {
+ case DT_INT64: {
+ // TODO(mrry): Use the fact that the `int64_list` is packed to read
+ // out the length and pre-allocate the output tensor.
+ if (!feature.ParseInt64List(&out_temp.int64_list))
+ return parse_error();
+ num_elements = out_temp.int64_list.size();
+ break;
+ }
+ case DT_FLOAT: {
+ // TODO(mrry): Use the fact that the `float_list` is packed to read
+ // out the length and pre-allocate the output tensor.
+ if (!feature.ParseFloatList(&out_temp.float_list))
+ return parse_error();
+ num_elements = out_temp.float_list.size();
+ break;
+ }
+ case DT_STRING: {
+ int actual_num_elements = 0;
+ if (!feature.GetNumElementsInBytesList(&actual_num_elements)) {
+ return parse_error();
+ }
+ out_temp.bytes_list.reserve(actual_num_elements);
+ if (!feature.ParseBytesList(&out_temp.bytes_list))
+ return parse_error();
+ num_elements = out_temp.bytes_list.size();
+ break;
+ }
+ default:
+ LOG(FATAL) << "Should not happen. " << DataTypeString(example_dtype);
+ }
+
+ if (num_elements % num_elements_divisor != 0) {
+ return parse_error();
+ }
+
+ Tensor* out;
+ if (is_dense) {
+ TensorShape values_shape;
+ values_shape.AddDim(num_elements / num_elements_divisor);
+ for (int i = 1; i < config.dense[d].shape.dims(); ++i) {
+ values_shape.AddDim(config.dense[d].shape.dim_size(i));
+ }
+
+ out = &result->dense_values[d];
+ *out = Tensor(config.dense[d].dtype, values_shape);
+
+ } else {
+ Tensor* out_indices = &result->sparse_indices[d];
+ Tensor* out_dense_shape = &result->sparse_shapes[d];
+ out = &result->sparse_values[d];
+
+ // TODO(mrry): Investigate the possibility of not materializing
+ // the indices (and perhaps dense_shape) until they are needed.
+ *out_indices = Tensor(
+ DT_INT64, TensorShape({static_cast<int64>(num_elements), 1}));
+ auto indices_flat = out_indices->flat<int64>();
+ for (size_t i = 0; i < num_elements; ++i) {
+ indices_flat(i) = static_cast<int64>(i);
+ }
+
+ *out_dense_shape = Tensor(DT_INT64, TensorShape({1}));
+ auto shapes_shape_t = out_dense_shape->vec<int64>();
+ shapes_shape_t(0) = num_elements;
+
+ *out = Tensor(config.sparse[d].dtype,
+ TensorShape({static_cast<int64>(num_elements)}));
+ }
+
+ switch (example_dtype) {
+ case DT_INT64: {
+ CopyOrMoveBlock(out_temp.int64_list.begin(),
+ out_temp.int64_list.end(), out->flat<int64>().data());
+ break;
+ }
+ case DT_FLOAT: {
+ CopyOrMoveBlock(out_temp.float_list.begin(),
+ out_temp.float_list.end(), out->flat<float>().data());
+ break;
+ }
+ case DT_STRING: {
+ CopyOrMoveBlock(out_temp.bytes_list.begin(),
+ out_temp.bytes_list.end(),
+ out->flat<string>().data());
+ break;
+ }
+ default:
+ LOG(FATAL) << "Should not happen.";
+ }
+ }
+ }
+
+ // Handle missing dense features.
+ for (size_t d = 0; d < config.dense.size(); ++d) {
+ if (!dense_feature_already_seen[d]) {
+ if (!config.dense[d].variable_length) {
+ // Handle missing fixed-length dense feature.
+ if (config.dense[d].default_value.NumElements() == 0) {
+ return errors::InvalidArgument(
+ "Feature: ", config.dense[d].feature_name,
+ " (data type: ", DataTypeString(config.dense[d].dtype), ")",
+ " is required but could not be found.");
+ }
+ result->dense_values[d] = config.dense[d].default_value;
+ } else {
+ // Handle missing varlen dense feature.
+ TensorShape empty_shape;
+ empty_shape.AddDim(0);
+ for (int i = 1; i < config.dense[d].shape.dims(); ++i) {
+ empty_shape.AddDim(config.dense[d].shape.dim_size(i));
+ }
+ result->dense_values[d] = Tensor(config.dense[d].dtype, empty_shape);
+ }
+ }
+ }
+
+ // Handle missing sparse features.
+ for (size_t d = 0; d < config.sparse.size(); ++d) {
+ if (!sparse_feature_already_seen[d]) {
+ result->sparse_indices[d] = Tensor(DT_INT64, TensorShape({0, 1}));
+ result->sparse_values[d] =
+ Tensor(config.sparse[d].dtype, TensorShape({0}));
+ result->sparse_shapes[d].vec<int64>()(0) = 0;
+ }
+ }
+
+ return Status::OK();
+}
+
} // namespace example
} // namespace tensorflow
diff --git a/tensorflow/core/util/example_proto_fast_parsing.h b/tensorflow/core/util/example_proto_fast_parsing.h
index 20536cee16..fe59ec77ca 100644
--- a/tensorflow/core/util/example_proto_fast_parsing.h
+++ b/tensorflow/core/util/example_proto_fast_parsing.h
@@ -79,6 +79,12 @@ Status FastParseExample(const FastParseExampleConfig& config,
gtl::ArraySlice<string> example_names,
thread::ThreadPool* thread_pool, Result* result);
+// TODO(mrry): Move the hash table construction into the config object.
+typedef FastParseExampleConfig FastParseSingleExampleConfig;
+
+Status FastParseSingleExample(const FastParseSingleExampleConfig& config,
+ const string& serialized, Result* result);
+
// This function parses serialized Example and populates given example.
// It uses the same specialized parser as FastParseExample which is efficient.
// But then constructs Example which is relatively slow.
diff --git a/tensorflow/core/util/example_proto_helper.cc b/tensorflow/core/util/example_proto_helper.cc
index 4b5bf63112..41f56d2daa 100644
--- a/tensorflow/core/util/example_proto_helper.cc
+++ b/tensorflow/core/util/example_proto_helper.cc
@@ -400,7 +400,7 @@ Status BatchExampleProtoToTensors(
return Status::OK();
}
-Status ParseSingleExampleAttrs::FinishInit() {
+Status ParseExampleAttrs::FinishInit() {
if (static_cast<size_t>(num_sparse) != sparse_types.size()) {
return errors::InvalidArgument("len(sparse_keys) != len(sparse_types)");
}
@@ -422,6 +422,25 @@ Status ParseSingleExampleAttrs::FinishInit() {
return Status::OK();
}
+Status ParseSingleExampleAttrs::FinishInit() {
+ if (sparse_keys.size() != sparse_types.size()) {
+ return errors::InvalidArgument("len(sparse_keys) != len(sparse_types)");
+ }
+ if (dense_keys.size() != dense_types.size()) {
+ return errors::InvalidArgument("len(dense_keys) != len(dense_types)");
+ }
+ if (dense_keys.size() != dense_shapes.size()) {
+ return errors::InvalidArgument("len(dense_keys) != len(dense_shapes)");
+ }
+ for (const DataType& type : dense_types) {
+ TF_RETURN_IF_ERROR(CheckValidType(type));
+ }
+ for (const DataType& type : sparse_types) {
+ TF_RETURN_IF_ERROR(CheckValidType(type));
+ }
+ return Status::OK();
+}
+
Status ParseSingleSequenceExampleAttrs::FinishInit() {
if (static_cast<size_t>(num_context_sparse) != context_sparse_types.size()) {
return errors::InvalidArgument(
diff --git a/tensorflow/core/util/example_proto_helper.h b/tensorflow/core/util/example_proto_helper.h
index 7414d61e8b..8b3c6c5a3f 100644
--- a/tensorflow/core/util/example_proto_helper.h
+++ b/tensorflow/core/util/example_proto_helper.h
@@ -148,9 +148,9 @@ Tensor FeatureSparseCopy(const std::size_t batch, const string& key,
int64 CopyIntoSparseTensor(const Tensor& in, const int batch,
const int64 offset, Tensor* indices, Tensor* values);
-// Parses the attributes passed to ParseSingleExample.
+// Parses the attributes passed to ParseExample.
// REQUIRES: Init must be called after construction.
-class ParseSingleExampleAttrs {
+class ParseExampleAttrs {
public:
template <typename ContextType>
Status Init(ContextType* ctx) {
@@ -205,6 +205,72 @@ class ParseSingleExampleAttrs {
Status FinishInit(); // for context-independent parts of Init.
};
+// Parses the attributes passed to ParseSingleExample.
+// REQUIRES: Init must be called after construction.
+class ParseSingleExampleAttrs {
+ public:
+ template <typename ContextType>
+ Status Init(ContextType* ctx) {
+ TF_RETURN_IF_ERROR(ctx->GetAttr("sparse_keys", &sparse_keys));
+ TF_RETURN_IF_ERROR(ctx->GetAttr("sparse_types", &sparse_types));
+ TF_RETURN_IF_ERROR(ctx->GetAttr("dense_keys", &dense_keys));
+ TF_RETURN_IF_ERROR(ctx->GetAttr("Tdense", &dense_types));
+ TF_RETURN_IF_ERROR(ctx->GetAttr("dense_shapes", &dense_shapes));
+
+ int num_sparse;
+ TF_RETURN_IF_ERROR(ctx->GetAttr("num_sparse", &num_sparse));
+ if (num_sparse != sparse_keys.size() || num_sparse != sparse_types.size()) {
+ return errors::InvalidArgument(
+ "num_sparse (", num_sparse, ") must match the size of sparse_keys (",
+ sparse_keys.size(), ") and sparse_types (", sparse_types.size(), ")");
+ }
+
+ // Temporary check until we start allowing a variable length outer
+ // dimension.
+ for (int i = 0; i < dense_shapes.size(); ++i) {
+ bool shape_ok = true;
+ if (dense_shapes[i].dims() == -1) {
+ shape_ok = false;
+ } else {
+ for (int d = 1; d < dense_shapes[i].dims(); ++d) {
+ if (dense_shapes[i].dim_size(d) == -1) {
+ shape_ok = false;
+ }
+ }
+ }
+ if (!shape_ok) {
+ return errors::InvalidArgument(
+ "dense_shapes[", i,
+ "] has unknown rank or unknown inner dimensions: ",
+ dense_shapes[i].DebugString());
+ }
+ TensorShape dense_shape;
+ if (dense_shapes[i].dims() > 0 && dense_shapes[i].dim_size(0) == -1) {
+ variable_length.push_back(true);
+ for (int d = 1; d < dense_shapes[i].dims(); ++d) {
+ dense_shape.AddDim(dense_shapes[i].dim_size(d));
+ }
+ } else {
+ variable_length.push_back(false);
+ dense_shapes[i].AsTensorShape(&dense_shape);
+ }
+ elements_per_stride.push_back(dense_shape.num_elements());
+ }
+ return FinishInit();
+ }
+
+ std::vector<string> sparse_keys;
+ std::vector<DataType> sparse_types;
+ std::vector<string> dense_keys;
+ std::vector<DataType> dense_types;
+ std::vector<PartialTensorShape> dense_shapes;
+ std::vector<bool> variable_length;
+ std::vector<std::size_t> elements_per_stride;
+
+ private:
+ Status FinishInit(); // for context-independent parts of Init.
+};
+
// Parses the attributes passed to ParseSingleSequenceExample.
// REQUIRES: Init must be called after construction.
class ParseSingleSequenceExampleAttrs {
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 31d3bd1b74..d98bb0f8cc 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -604,6 +604,23 @@ tf_py_test(
)
tf_py_test(
+ name = "parse_single_example_op_test",
+ size = "small",
+ srcs = ["parse_single_example_op_test.py"],
+ additional_deps = [
+ "//third_party/py/numpy",
+ "//tensorflow/core:protos_all_py",
+ "//tensorflow/python:array_ops",
+ "//tensorflow/python:client_testlib",
+ "//tensorflow/python:errors",
+ "//tensorflow/python:framework",
+ "//tensorflow/python:framework_for_generated_wrappers",
+ "//tensorflow/python:parsing_ops",
+ "//tensorflow/python:platform",
+ ],
+)
+
+tf_py_test(
name = "partitioned_variables_test",
size = "small",
srcs = ["partitioned_variables_test.py"],
diff --git a/tensorflow/python/kernel_tests/parse_single_example_op_test.py b/tensorflow/python/kernel_tests/parse_single_example_op_test.py
new file mode 100644
index 0000000000..b5bd1b9bee
--- /dev/null
+++ b/tensorflow/python/kernel_tests/parse_single_example_op_test.py
@@ -0,0 +1,930 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.parse_single_example_v2` in tensorflow.python.ops.parsing_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+
+import numpy as np
+
+from tensorflow.core.example import example_pb2
+from tensorflow.core.example import feature_pb2
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+
+# Helpers for creating Example objects
+example = example_pb2.Example
+feature = feature_pb2.Feature
+features = lambda d: feature_pb2.Features(feature=d)
+bytes_feature = lambda v: feature(bytes_list=feature_pb2.BytesList(value=v))
+int64_feature = lambda v: feature(int64_list=feature_pb2.Int64List(value=v))
+float_feature = lambda v: feature(float_list=feature_pb2.FloatList(value=v))
+# Helpers for creating SequenceExample objects
+feature_list = lambda l: feature_pb2.FeatureList(feature=l)
+feature_lists = lambda d: feature_pb2.FeatureLists(feature_list=d)
+sequence_example = example_pb2.SequenceExample
+
+
+def empty_sparse(dtype, shape=None):
+ if shape is None:
+ shape = [0]
+ return (np.empty(shape=(0, len(shape)), dtype=np.int64),
+ np.array([], dtype=dtype), np.array(shape, dtype=np.int64))
+
+
+def flatten(list_of_lists):
+ """Flatten one level of nesting."""
+ return itertools.chain.from_iterable(list_of_lists)
+
+
+def flatten_values_tensors_or_sparse(tensors_list):
+ """Flatten each SparseTensor object into 3 Tensors for session.run()."""
+ return list(
+ flatten([[v.indices, v.values, v.dense_shape] if isinstance(
+ v, sparse_tensor.SparseTensor) else [v] for v in tensors_list]))
+
+
+def _compare_output_to_expected(tester, dict_tensors, expected_tensors,
+ flat_output):
+ tester.assertEqual(set(dict_tensors.keys()), set(expected_tensors.keys()))
+
+ i = 0 # Index into the flattened output of session.run()
+ for k, v in dict_tensors.items():
+ expected_v = expected_tensors[k]
+ tf_logging.info("Comparing key: %s", k)
+ if isinstance(v, sparse_tensor.SparseTensor):
+      # Three outputs for SparseTensor: indices, values, shape.
+ tester.assertEqual([k, len(expected_v)], [k, 3])
+ tester.assertAllEqual(expected_v[0], flat_output[i])
+ tester.assertAllEqual(expected_v[1], flat_output[i + 1])
+ tester.assertAllEqual(expected_v[2], flat_output[i + 2])
+ i += 3
+ else:
+ # One output for standard Tensor.
+ tester.assertAllEqual(expected_v, flat_output[i])
+ i += 1
+
+
+class ParseExampleTest(test.TestCase):
+
+ def _test(self, kwargs, expected_values=None, expected_err=None):
+ with self.test_session() as sess:
+ if expected_err:
+ with self.assertRaisesWithPredicateMatch(expected_err[0],
+ expected_err[1]):
+ out = parsing_ops.parse_single_example_v2(**kwargs)
+ sess.run(flatten_values_tensors_or_sparse(out.values()))
+ return
+ else:
+ # Returns dict w/ Tensors and SparseTensors.
+ out = parsing_ops.parse_single_example_v2(**kwargs)
+ result = flatten_values_tensors_or_sparse(out.values())
+ # Check values.
+ tf_result = sess.run(result)
+ _compare_output_to_expected(self, out, expected_values, tf_result)
+
+ for k, f in kwargs["features"].items():
+ if isinstance(f, parsing_ops.FixedLenFeature) and f.shape is not None:
+ self.assertEqual(tuple(out[k].get_shape().as_list()), f.shape)
+ elif isinstance(f, parsing_ops.VarLenFeature):
+ self.assertEqual(
+ tuple(out[k].indices.get_shape().as_list()), (None, 1))
+ self.assertEqual(tuple(out[k].values.get_shape().as_list()), (None,))
+ self.assertEqual(
+ tuple(out[k].dense_shape.get_shape().as_list()), (1,))
+
+ def testEmptySerializedWithAllDefaults(self):
+ sparse_name = "st_a"
+ a_name = "a"
+ b_name = "b"
+ c_name = "c:has_a_tricky_name"
+ a_default = [0, 42, 0]
+ b_default = np.random.rand(3, 3).astype(bytes)
+ c_default = np.random.rand(2).astype(np.float32)
+
+ expected_st_a = ( # indices, values, shape
+ np.empty((0, 1), dtype=np.int64), # indices
+ np.empty((0,), dtype=np.int64), # sp_a is DT_INT64
+ np.array([0], dtype=np.int64)) # max_elems = 0
+
+ expected_output = {
+ sparse_name: expected_st_a,
+ a_name: np.array([a_default]),
+ b_name: np.array(b_default),
+ c_name: np.array(c_default),
+ }
+
+ self._test({
+ "serialized": ops.convert_to_tensor(""),
+ "features": {
+ sparse_name:
+ parsing_ops.VarLenFeature(dtypes.int64),
+ a_name:
+ parsing_ops.FixedLenFeature(
+ (1, 3), dtypes.int64, default_value=a_default),
+ b_name:
+ parsing_ops.FixedLenFeature(
+ (3, 3), dtypes.string, default_value=b_default),
+ c_name:
+ parsing_ops.FixedLenFeature(
+ (2,), dtypes.float32, default_value=c_default),
+ }
+ }, expected_output)
+
+ def testEmptySerializedWithoutDefaultsShouldFail(self):
+ input_features = {
+ "st_a":
+ parsing_ops.VarLenFeature(dtypes.int64),
+ "a":
+ parsing_ops.FixedLenFeature(
+ (1, 3), dtypes.int64, default_value=[0, 42, 0]),
+ "b":
+ parsing_ops.FixedLenFeature(
+ (3, 3),
+ dtypes.string,
+ default_value=np.random.rand(3, 3).astype(bytes)),
+        # Feature "c" is missing a default; this gap will cause failure.
+ "c":
+ parsing_ops.FixedLenFeature(
+ (2,), dtype=dtypes.float32),
+ }
+
+ # Edge case where the key is there but the feature value is empty
+ original = example(features=features({"c": feature()}))
+ self._test(
+ {
+ "serialized": original.SerializeToString(),
+ "features": input_features,
+ },
+ expected_err=(errors_impl.OpError,
+ "Feature: c \\(data type: float\\) is required"))
+
+ # Standard case of missing key and value.
+ self._test(
+ {
+ "serialized": "",
+ "features": input_features,
+ },
+ expected_err=(errors_impl.OpError,
+ "Feature: c \\(data type: float\\) is required"))
+
+ def testDenseNotMatchingShapeShouldFail(self):
+ original = example(features=features({
+ "a": float_feature([-1, -1]),
+ }))
+
+ serialized = original.SerializeToString()
+
+ self._test(
+ {
+ "serialized": ops.convert_to_tensor(serialized),
+ "features": {
+ "a": parsing_ops.FixedLenFeature((1, 3), dtypes.float32)
+ }
+ },
+ # TODO(mrry): Consider matching the `tf.parse_example()` error message.
+ expected_err=(errors_impl.OpError, "Key: a."))
+
+ def testDenseDefaultNoShapeShouldFail(self):
+ original = example(features=features({
+ "a": float_feature([1, 1, 3]),
+ }))
+
+ serialized = original.SerializeToString()
+
+ self._test(
+ {
+ "serialized": ops.convert_to_tensor(serialized),
+ "features": {
+ "a": parsing_ops.FixedLenFeature(None, dtypes.float32)
+ }
+ },
+ expected_err=(ValueError, "Missing shape for feature a"))
+
+ def testSerializedContainingSparse(self):
+ original = [
+ example(features=features({
+ "st_c": float_feature([3, 4])
+ })),
+ example(features=features({
+ "st_c": float_feature([]), # empty float list
+ })),
+ example(features=features({
+ "st_d": feature(), # feature with nothing in it
+ })),
+ example(features=features({
+ "st_c": float_feature([1, 2, -1]),
+ "st_d": bytes_feature([b"hi"])
+ }))
+ ]
+
+ expected_outputs = [{
+ "st_c": (np.array([[0], [1]], dtype=np.int64),
+ np.array([3.0, 4.0], dtype=np.float32),
+ np.array([2], dtype=np.int64)),
+ "st_d":
+ empty_sparse(bytes)
+ }, {
+ "st_c": empty_sparse(np.float32),
+ "st_d": empty_sparse(bytes)
+ }, {
+ "st_c": empty_sparse(np.float32),
+ "st_d": empty_sparse(bytes)
+ }, {
+ "st_c": (np.array([[0], [1], [2]], dtype=np.int64),
+ np.array([1.0, 2.0, -1.0], dtype=np.float32),
+ np.array([3], dtype=np.int64)),
+ "st_d": (np.array([[0]], dtype=np.int64), np.array(["hi"], dtype=bytes),
+ np.array([1], dtype=np.int64))
+ }]
+
+ for proto, expected_output in zip(original, expected_outputs):
+ self._test({
+ "serialized": ops.convert_to_tensor(proto.SerializeToString()),
+ "features": {
+ "st_c": parsing_ops.VarLenFeature(dtypes.float32),
+ "st_d": parsing_ops.VarLenFeature(dtypes.string)
+ },
+ }, expected_output)
+
+ def testSerializedContainingSparseFeature(self):
+ original = [
+ example(features=features({
+ "val": float_feature([3, 4]),
+ "idx": int64_feature([5, 10])
+ })),
+ example(features=features({
+ "val": float_feature([]), # empty float list
+ "idx": int64_feature([])
+ })),
+ example(features=features({
+ "val": feature(), # feature with nothing in it
+ # missing idx feature
+ })),
+ example(features=features({
+ "val": float_feature([1, 2, -1]),
+ "idx":
+ int64_feature([0, 9, 3]) # unsorted
+ }))
+ ]
+
+ expected_outputs = [{
+ "sp": (np.array([[5], [10]], dtype=np.int64),
+ np.array([3.0, 4.0], dtype=np.float32),
+ np.array([13], dtype=np.int64))
+ }, {
+ "sp": empty_sparse(np.float32, shape=[13])
+ }, {
+ "sp": empty_sparse(np.float32, shape=[13])
+ }, {
+ "sp": (np.array([[0], [3], [9]], dtype=np.int64),
+ np.array([1.0, -1.0, 2.0], dtype=np.float32),
+ np.array([13], dtype=np.int64))
+ }]
+
+ for proto, expected_output in zip(original, expected_outputs):
+ self._test({
+ "serialized": ops.convert_to_tensor(proto.SerializeToString()),
+ "features": {
+ "sp":
+ parsing_ops.SparseFeature(["idx"], "val", dtypes.float32,
+ [13])
+ }
+ }, expected_output)
+
+ def testSerializedContainingSparseFeatureReuse(self):
+ original = [
+ example(features=features({
+ "val1": float_feature([3, 4]),
+ "val2": float_feature([5, 6]),
+ "idx": int64_feature([5, 10])
+ })),
+ example(features=features({
+ "val1": float_feature([]), # empty float list
+ "idx": int64_feature([])
+ })),
+ ]
+
+ expected_outputs = [{
+ "sp1": (np.array([[5], [10]], dtype=np.int64),
+ np.array([3.0, 4.0], dtype=np.float32),
+ np.array([13], dtype=np.int64)),
+ "sp2": (np.array([[5], [10]], dtype=np.int64),
+ np.array([5.0, 6.0], dtype=np.float32),
+ np.array([7], dtype=np.int64))
+ }, {
+ "sp1": empty_sparse(np.float32, shape=[13]),
+ "sp2": empty_sparse(np.float32, shape=[7])
+ }]
+
+ for proto, expected_output in zip(original, expected_outputs):
+ self._test({
+ "serialized": ops.convert_to_tensor(proto.SerializeToString()),
+ "features": {
+ "sp1":
+ parsing_ops.SparseFeature("idx", "val1", dtypes.float32, 13),
+ "sp2":
+ parsing_ops.SparseFeature(
+ "idx",
+ "val2",
+ dtypes.float32,
+ size=7,
+ already_sorted=True)
+ }
+ }, expected_output)
+
+ def testSerializedContaining3DSparseFeature(self):
+ original = [
+ example(features=features({
+ "val": float_feature([3, 4]),
+ "idx0": int64_feature([5, 10]),
+ "idx1": int64_feature([0, 2]),
+ })),
+ example(features=features({
+ "val": float_feature([]), # empty float list
+ "idx0": int64_feature([]),
+ "idx1": int64_feature([]),
+ })),
+ example(features=features({
+ "val": feature(), # feature with nothing in it
+ # missing idx feature
+ })),
+ example(features=features({
+ "val": float_feature([1, 2, -1]),
+ "idx0": int64_feature([0, 9, 3]), # unsorted
+ "idx1": int64_feature([1, 0, 2]),
+ }))
+ ]
+
+ expected_outputs = [{
+ "sp": (np.array([[5, 0], [10, 2]], dtype=np.int64),
+ np.array([3.0, 4.0], dtype=np.float32),
+ np.array([13, 3], dtype=np.int64))
+ }, {
+ "sp": empty_sparse(np.float32, shape=[13, 3])
+ }, {
+ "sp": empty_sparse(np.float32, shape=[13, 3])
+ }, {
+ "sp": (np.array([[0, 1], [3, 2], [9, 0]], dtype=np.int64),
+ np.array([1.0, -1.0, 2.0], dtype=np.float32),
+ np.array([13, 3], dtype=np.int64))
+ }]
+
+ for proto, expected_output in zip(original, expected_outputs):
+ self._test({
+ "serialized": ops.convert_to_tensor(proto.SerializeToString()),
+ "features": {
+ "sp":
+ parsing_ops.SparseFeature(["idx0", "idx1"], "val",
+ dtypes.float32, [13, 3])
+ }
+ }, expected_output)
+
+ def testSerializedContainingDense(self):
+ aname = "a"
+ bname = "b*has+a:tricky_name"
+ original = [
+ example(features=features({
+ aname: float_feature([1, 1]),
+ bname: bytes_feature([b"b0_str"]),
+ })), example(features=features({
+ aname: float_feature([-1, -1]),
+ bname: bytes_feature([b"b1"]),
+ }))
+ ]
+
+ expected_outputs = [{
+ aname: np.array([1, 1], dtype=np.float32).reshape(1, 2, 1),
+ bname: np.array(["b0_str"], dtype=bytes).reshape(1, 1, 1, 1)
+ }, {
+ aname: np.array([-1, -1], dtype=np.float32).reshape(1, 2, 1),
+ bname: np.array(["b1"], dtype=bytes).reshape(1, 1, 1, 1)
+ }]
+
+ for proto, expected_output in zip(original, expected_outputs):
+ # No defaults, values required
+ self._test({
+ "serialized": ops.convert_to_tensor(proto.SerializeToString()),
+ "features": {
+ aname:
+ parsing_ops.FixedLenFeature((1, 2, 1), dtype=dtypes.float32),
+ bname:
+ parsing_ops.FixedLenFeature(
+ (1, 1, 1, 1), dtype=dtypes.string),
+ }
+ }, expected_output)
+
+  # This test is identical to the previous one except
+  # for the creation of 'serialized'.
+ def testSerializedContainingDenseWithConcat(self):
+ aname = "a"
+ bname = "b*has+a:tricky_name"
+    # TODO(lew): Feature appearing twice should be an error in the future.
+ original = [
+ (example(features=features({
+ aname: float_feature([10, 10]),
+ })), example(features=features({
+ aname: float_feature([1, 1]),
+ bname: bytes_feature([b"b0_str"]),
+ }))),
+ (
+ example(features=features({
+ bname: bytes_feature([b"b100"]),
+ })),
+ example(features=features({
+ aname: float_feature([-1, -1]),
+ bname: bytes_feature([b"b1"]),
+ })),),
+ ]
+
+ expected_outputs = [{
+ aname: np.array([1, 1], dtype=np.float32).reshape(1, 2, 1),
+ bname: np.array(["b0_str"], dtype=bytes).reshape(1, 1, 1, 1)
+ }, {
+ aname: np.array([-1, -1], dtype=np.float32).reshape(1, 2, 1),
+ bname: np.array(["b1"], dtype=bytes).reshape(1, 1, 1, 1)
+ }]
+
+ for (m, n), expected_output in zip(original, expected_outputs):
+ # No defaults, values required
+ self._test({
+ "serialized":
+ ops.convert_to_tensor(
+ m.SerializeToString() + n.SerializeToString()),
+ "features": {
+ aname:
+ parsing_ops.FixedLenFeature((1, 2, 1), dtype=dtypes.float32),
+ bname:
+ parsing_ops.FixedLenFeature(
+ (1, 1, 1, 1), dtype=dtypes.string),
+ }
+ }, expected_output)
+
+ def testSerializedContainingDenseScalar(self):
+ original = [
+ example(features=features({
+ "a": float_feature([1]),
+ })), example(features=features({}))
+ ]
+
+ expected_outputs = [{
+ "a": np.array([1], dtype=np.float32)
+ }, {
+ "a": np.array([-1], dtype=np.float32)
+ }]
+
+ for proto, expected_output in zip(original, expected_outputs):
+ self._test({
+ "serialized": ops.convert_to_tensor(proto.SerializeToString()),
+ "features": {
+ "a":
+ parsing_ops.FixedLenFeature(
+ (1,), dtype=dtypes.float32, default_value=-1),
+ }
+ }, expected_output)
+
+ def testSerializedContainingDenseWithDefaults(self):
+ original = [
+ example(features=features({
+ "a": float_feature([1, 1]),
+ })),
+ example(features=features({
+ "b": bytes_feature([b"b1"]),
+ })),
+ example(features=features({
+ "b": feature()
+ })),
+ ]
+
+ expected_outputs = [{
+ "a": np.array([1, 1], dtype=np.float32).reshape(1, 2, 1),
+ "b": np.array("tmp_str", dtype=bytes).reshape(1, 1, 1, 1)
+ }, {
+ "a": np.array([3, -3], dtype=np.float32).reshape(1, 2, 1),
+ "b": np.array("b1", dtype=bytes).reshape(1, 1, 1, 1)
+ }, {
+ "a": np.array([3, -3], dtype=np.float32).reshape(1, 2, 1),
+ "b": np.array("tmp_str", dtype=bytes).reshape(1, 1, 1, 1)
+ }]
+
+ for proto, expected_output in zip(original, expected_outputs):
+ self._test({
+ "serialized": ops.convert_to_tensor(proto.SerializeToString()),
+ "features": {
+ "a":
+ parsing_ops.FixedLenFeature(
+ (1, 2, 1),
+ dtype=dtypes.float32,
+ default_value=[3.0, -3.0]),
+ "b":
+ parsing_ops.FixedLenFeature(
+ (1, 1, 1, 1),
+ dtype=dtypes.string,
+ default_value="tmp_str"),
+ }
+ }, expected_output)
+
+ def testSerializedContainingSparseAndSparseFeatureAndDenseWithNoDefault(self):
+ original = [
+ example(features=features({
+ "c": float_feature([3, 4]),
+ "val": bytes_feature([b"a", b"b"]),
+ "idx": int64_feature([0, 3])
+ })), example(features=features({
+ "c": float_feature([1, 2]),
+ "val": bytes_feature([b"c"]),
+ "idx": int64_feature([7])
+ }))
+ ]
+
+ a_default = np.array([[1, 2, 3]], dtype=np.int64)
+ b_default = np.random.rand(3, 3).astype(bytes)
+
+ expected_st_a = empty_sparse(np.int64)
+
+ expected_outputs = [{
+ "st_a":
+ expected_st_a,
+ "sp": (np.array([[0], [3]], dtype=np.int64),
+ np.array(["a", "b"], dtype=bytes), np.array(
+ [13], dtype=np.int64)),
+ "a":
+ a_default,
+ "b":
+ b_default,
+ "c":
+ np.array([3, 4], dtype=np.float32)
+ }, {
+ "st_a":
+ expected_st_a,
+ "sp": (np.array([[7]], dtype=np.int64), np.array(["c"], dtype=bytes),
+ np.array([13], dtype=np.int64)),
+ "a":
+ a_default,
+ "b":
+ b_default,
+ "c":
+ np.array([1, 2], dtype=np.float32)
+ }]
+
+ for proto, expected_output in zip(original, expected_outputs):
+ self._test(
+ {
+ "serialized": ops.convert_to_tensor(proto.SerializeToString()),
+ "features": {
+ "st_a":
+ parsing_ops.VarLenFeature(dtypes.int64),
+ "sp":
+ parsing_ops.SparseFeature("idx", "val", dtypes.string, 13
+ ),
+ "a":
+ parsing_ops.FixedLenFeature(
+ (1, 3), dtypes.int64, default_value=a_default),
+ "b":
+ parsing_ops.FixedLenFeature(
+ (3, 3), dtypes.string, default_value=b_default),
+ # Feature "c" must be provided, since it has no default_value.
+ "c":
+ parsing_ops.FixedLenFeature((2,), dtypes.float32),
+ }
+ },
+ expected_output)
+
+ def testSerializedContainingSparseAndSparseFeatureWithReuse(self):
+ original = [
+ example(features=features({
+ "val": bytes_feature([b"a", b"b"]),
+ "idx": int64_feature([0, 3])
+ })), example(features=features({
+ "val": bytes_feature([b"c", b"d"]),
+ "idx": int64_feature([7, 1])
+ }))
+ ]
+
+ expected_outputs = [{
+ "idx": (np.array([[0], [1]], dtype=np.int64),
+ np.array([0, 3], dtype=np.int64), np.array([2],
+ dtype=np.int64)),
+ "sp": (np.array([[0], [3]], dtype=np.int64),
+ np.array(["a", "b"], dtype=bytes), np.array(
+ [13], dtype=np.int64))
+ },
+ {
+ "idx": (np.array([[0], [1]], dtype=np.int64),
+ np.array([7, 1], dtype=np.int64),
+ np.array([2], dtype=np.int64)),
+ "sp": (np.array([[1], [7]], dtype=np.int64),
+ np.array(["d", "c"], dtype=bytes),
+ np.array([13], dtype=np.int64))
+ }]
+
+ for proto, expected_output in zip(original, expected_outputs):
+ self._test({
+ "serialized": ops.convert_to_tensor(proto.SerializeToString()),
+ "features": {
+ "idx":
+ parsing_ops.VarLenFeature(dtypes.int64),
+ "sp":
+ parsing_ops.SparseFeature(["idx"], "val", dtypes.string, [13]
+ ),
+ }
+ }, expected_output)
+
+ def testSerializedContainingVarLenDense(self):
+ aname = "a"
+ bname = "b"
+ cname = "c"
+ dname = "d"
+ original = [
+ example(features=features({
+ cname: int64_feature([2]),
+ })),
+ example(features=features({
+ aname: float_feature([1, 1]),
+ bname: bytes_feature([b"b0_str", b"b1_str"]),
+ })),
+ example(features=features({
+ aname: float_feature([-1, -1, 2, 2]),
+ bname: bytes_feature([b"b1"]),
+ })),
+ example(features=features({
+ aname: float_feature([]),
+ cname: int64_feature([3]),
+ })),
+ ]
+
+ expected_outputs = [
+ {
+ aname: np.empty(shape=(0, 2, 1), dtype=np.int64),
+ bname: np.empty(shape=(0, 1, 1, 1), dtype=bytes),
+ cname: np.array([2], dtype=np.int64),
+ dname: np.empty(shape=(0,), dtype=bytes)
+ },
+ {
+ aname:
+ np.array([[[1], [1]]], dtype=np.float32),
+ bname:
+ np.array(["b0_str", "b1_str"], dtype=bytes).reshape(2, 1, 1, 1),
+ cname:
+ np.empty(shape=(0,), dtype=np.int64),
+ dname:
+ np.empty(shape=(0,), dtype=bytes)
+ },
+ {
+ aname: np.array([[[-1], [-1]], [[2], [2]]], dtype=np.float32),
+ bname: np.array(["b1"], dtype=bytes).reshape(1, 1, 1, 1),
+ cname: np.empty(shape=(0,), dtype=np.int64),
+ dname: np.empty(shape=(0,), dtype=bytes)
+ },
+ {
+ aname: np.empty(shape=(0, 2, 1), dtype=np.int64),
+ bname: np.empty(shape=(0, 1, 1, 1), dtype=bytes),
+ cname: np.array([3], dtype=np.int64),
+ dname: np.empty(shape=(0,), dtype=bytes)
+ },
+ ]
+
+ for proto, expected_output in zip(original, expected_outputs):
+ self._test({
+ "serialized": ops.convert_to_tensor(proto.SerializeToString()),
+ "features": {
+ aname:
+ parsing_ops.FixedLenSequenceFeature(
+ (2, 1), dtype=dtypes.float32, allow_missing=True),
+ bname:
+ parsing_ops.FixedLenSequenceFeature(
+ (1, 1, 1), dtype=dtypes.string, allow_missing=True),
+ cname:
+ parsing_ops.FixedLenSequenceFeature(
+ shape=[], dtype=dtypes.int64, allow_missing=True),
+ dname:
+ parsing_ops.FixedLenSequenceFeature(
+ shape=[], dtype=dtypes.string, allow_missing=True),
+ }
+ }, expected_output)
+
+ # Test with padding values.
+ # NOTE(mrry): Since we parse a single example at a time, the fixed-length
+ # sequences will not be padded, and the padding value will be ignored.
+ for proto, expected_output in zip(original, expected_outputs):
+ self._test({
+ "serialized": ops.convert_to_tensor(proto.SerializeToString()),
+ "features": {
+ aname:
+ parsing_ops.FixedLenSequenceFeature(
+ (2, 1), dtype=dtypes.float32, allow_missing=True),
+ bname:
+ parsing_ops.FixedLenSequenceFeature(
+ (1, 1, 1), dtype=dtypes.string, allow_missing=True),
+ cname:
+ parsing_ops.FixedLenSequenceFeature(
+ shape=[], dtype=dtypes.int64, allow_missing=True),
+ dname:
+ parsing_ops.FixedLenSequenceFeature(
+ shape=[], dtype=dtypes.string, allow_missing=True),
+ }
+ }, expected_output)
+
+ # Change number of required values so the inputs are not a
+ # multiple of this size.
+ self._test(
+ {
+ "serialized":
+ ops.convert_to_tensor(original[2].SerializeToString()),
+ "features": {
+ aname:
+ parsing_ops.FixedLenSequenceFeature(
+ (2, 1), dtype=dtypes.float32, allow_missing=True),
+ bname:
+ parsing_ops.FixedLenSequenceFeature(
+ (2, 1, 1), dtype=dtypes.string, allow_missing=True),
+ }
+ },
+ # TODO(mrry): Consider matching the `tf.parse_example()` error message.
+ expected_err=(errors_impl.OpError, "Key: b."))
+
+ self._test(
+ {
+ "serialized": ops.convert_to_tensor(""),
+ "features": {
+ aname:
+ parsing_ops.FixedLenSequenceFeature(
+ (2, 1),
+ dtype=dtypes.float32,
+ allow_missing=True,
+ default_value=[]),
+ bname:
+ parsing_ops.FixedLenSequenceFeature(
+ (2, 1, 1), dtype=dtypes.string, allow_missing=True),
+ }
+ },
+ expected_err=(ValueError,
+ "Cannot reshape a tensor with 0 elements to shape"))
+
+ self._test(
+ {
+ "serialized": ops.convert_to_tensor(""),
+ "features": {
+ aname:
+ parsing_ops.FixedLenFeature(
+ (None, 2, 1), dtype=dtypes.float32),
+ bname:
+ parsing_ops.FixedLenSequenceFeature(
+ (2, 1, 1), dtype=dtypes.string, allow_missing=True),
+ }
+ },
+ expected_err=(ValueError,
+ "First dimension of shape for feature a unknown. "
+ "Consider using FixedLenSequenceFeature."))
+
+ self._test(
+ {
+ "serialized": ops.convert_to_tensor(""),
+ "features": {
+ cname:
+ parsing_ops.FixedLenFeature(
+ (1, None), dtype=dtypes.int64, default_value=[[1]]),
+ }
+ },
+ expected_err=(ValueError,
+ "All dimensions of shape for feature c need to be known "
+ r"but received \(1, None\)."))
+
+ self._test(
+ {
+ "serialized": ops.convert_to_tensor(""),
+ "features": {
+ aname:
+ parsing_ops.FixedLenSequenceFeature(
+ (2, 1), dtype=dtypes.float32, allow_missing=True),
+ bname:
+ parsing_ops.FixedLenSequenceFeature(
+ (1, 1, 1), dtype=dtypes.string, allow_missing=True),
+ cname:
+ parsing_ops.FixedLenSequenceFeature(
+ shape=[], dtype=dtypes.int64, allow_missing=False),
+ dname:
+ parsing_ops.FixedLenSequenceFeature(
+ shape=[], dtype=dtypes.string, allow_missing=True),
+ }
+ },
+ expected_err=(ValueError,
+ "Unsupported: FixedLenSequenceFeature requires "
+ "allow_missing to be True."))
+
+
+class ParseSingleExampleTest(test.TestCase):
+
+ def _test(self, kwargs, expected_values=None, expected_err=None):
+ with self.test_session() as sess:
+ if expected_err:
+ with self.assertRaisesWithPredicateMatch(expected_err[0],
+ expected_err[1]):
+ out = parsing_ops.parse_single_example_v2(**kwargs)
+ sess.run(flatten_values_tensors_or_sparse(out.values()))
+ else:
+ # Returns dict w/ Tensors and SparseTensors.
+ out = parsing_ops.parse_single_example_v2(**kwargs)
+ # Check values.
+ tf_result = sess.run(flatten_values_tensors_or_sparse(out.values()))
+ _compare_output_to_expected(self, out, expected_values, tf_result)
+
+ # Check shapes.
+ for k, f in kwargs["features"].items():
+ if isinstance(f, parsing_ops.FixedLenFeature) and f.shape is not None:
+ self.assertEqual(tuple(out[k].get_shape()),
+ tensor_shape.as_shape(f.shape))
+ elif isinstance(f, parsing_ops.VarLenFeature):
+ self.assertEqual(
+ tuple(out[k].indices.get_shape().as_list()), (None, 1))
+ self.assertEqual(tuple(out[k].values.get_shape().as_list()), (None,))
+ self.assertEqual(
+ tuple(out[k].dense_shape.get_shape().as_list()), (1,))
+
+ def testSingleExampleWithSparseAndSparseFeatureAndDense(self):
+ original = example(features=features({
+ "c": float_feature([3, 4]),
+ "d": float_feature([0.0, 1.0]),
+ "val": bytes_feature([b"a", b"b"]),
+ "idx": int64_feature([0, 3]),
+ "st_a": float_feature([3.0, 4.0])
+ }))
+
+ serialized = original.SerializeToString()
+
+ expected_st_a = (
+ np.array(
+ [[0], [1]], dtype=np.int64), # indices
+ np.array(
+ [3.0, 4.0], dtype=np.float32), # values
+ np.array(
+ [2], dtype=np.int64)) # shape: max_values = 2
+
+ expected_sp = ( # indices, values, shape
+ np.array(
+ [[0], [3]], dtype=np.int64), np.array(
+ ["a", "b"], dtype="|S"), np.array(
+ [13], dtype=np.int64)) # max_values = 13
+
+ a_default = [1, 2, 3]
+ b_default = np.random.rand(3, 3).astype(bytes)
+ expected_output = {
+ "st_a": expected_st_a,
+ "sp": expected_sp,
+ "a": [a_default],
+ "b": b_default,
+ "c": np.array([3, 4], dtype=np.float32),
+ "d": np.array([0.0, 1.0], dtype=np.float32),
+ }
+
+ self._test(
+ {
+ "serialized":
+ ops.convert_to_tensor(serialized),
+ "features": {
+ "st_a":
+ parsing_ops.VarLenFeature(dtypes.float32),
+ "sp":
+ parsing_ops.SparseFeature(
+ ["idx"], "val", dtypes.string, [13]),
+ "a":
+ parsing_ops.FixedLenFeature(
+ (1, 3), dtypes.int64, default_value=a_default),
+ "b":
+ parsing_ops.FixedLenFeature(
+ (3, 3), dtypes.string, default_value=b_default),
+ # Feature "c" must be provided, since it has no default_value.
+ "c":
+ parsing_ops.FixedLenFeature(2, dtypes.float32),
+ "d":
+ parsing_ops.FixedLenSequenceFeature([],
+ dtypes.float32,
+ allow_missing=True)
+ }
+ },
+ expected_output)
+
+
+if __name__ == "__main__":
+ test.main()
diff --git a/tensorflow/python/ops/parsing_ops.py b/tensorflow/python/ops/parsing_ops.py
index 14aef01dec..eba40c4f85 100644
--- a/tensorflow/python/ops/parsing_ops.py
+++ b/tensorflow/python/ops/parsing_ops.py
@@ -1205,3 +1205,198 @@ def decode_csv(records, record_defaults, field_delim=",",
field_delim=field_delim, use_quote_delim=use_quote_delim,
na_value=na_value, name=name)
# pylint: enable=protected-access
+
+
+# TODO(b/70890287): Combine the implementation of this op and
+# `parse_single_example()` after 1/10/2018.
+def parse_single_example_v2(serialized, features, name=None):
+ # pylint: disable=line-too-long
+ """Parses an `Example` proto into a `dict` of tensors.
+
+ Parses a serialized
+ [`Example`](https://www.tensorflow.org/code/tensorflow/core/example/example.proto)
+ proto given in `serialized`.
+
+ This op parses serialized examples into a dictionary mapping keys to `Tensor`
+ and `SparseTensor` objects. `features` is a dict from keys to `VarLenFeature`,
+ `SparseFeature`, and `FixedLenFeature` objects. Each `VarLenFeature`
+ and `SparseFeature` is mapped to a `SparseTensor`, and each
+ `FixedLenFeature` is mapped to a `Tensor`.
+
+ Each `VarLenFeature` maps to a `SparseTensor` of the specified type
+ representing a ragged matrix. Its indices are `[index]` where
+ `index` is the value's index in the list of values associated with
+ that feature and example.
+
+ Each `SparseFeature` maps to a `SparseTensor` of the specified type
+ representing a Tensor of `dense_shape` `SparseFeature.size`.
+ Its `values` come from the feature in the examples with key `value_key`.
+ A `values[i]` comes from a position `k` in the feature of an example at batch
+ entry `batch`. This positional information is recorded in `indices[i]` as
+ `[batch, index_0, index_1, ...]` where `index_j` is the `k-th` value of
+ the feature in the example at with key `SparseFeature.index_key[j]`.
+ In other words, we split the indices (except the first index indicating the
+ batch entry) of a `SparseTensor` by dimension into different features of the
+ `Example`. Due to its complexity a `VarLenFeature` should be preferred over a
+ `SparseFeature` whenever possible.
+
+ Each `FixedLenFeature` `df` maps to a `Tensor` of the specified type (or
+ `tf.float32` if not specified) and shape `df.shape`.
+
+ `FixedLenFeature` entries with a `default_value` are optional. With no default
+ value, we will fail if that `Feature` is missing from any example in
+ `serialized`.
+
+ Each `FixedLenSequenceFeature` `df` maps to a `Tensor` of the specified type
+ (or `tf.float32` if not specified) and shape `(None,) + df.shape`.
+
+ Args:
+ serialized: A scalar (0-D Tensor) string, a serialized `Example` proto.
+ features: A `dict` mapping feature keys to `FixedLenFeature`,
+ `VarLenFeature`, `SparseFeature`, and `FixedLenSequenceFeature` values.
+ name: A name for this operation (optional).
+
+ Returns:
+ A `dict` mapping feature keys to `Tensor` and `SparseTensor` values.
+
+ Raises:
+ ValueError: if any feature is invalid.
+ """
+ if not features:
+ raise ValueError("Missing: features was %s." % features)
+ # Expand FixedLenSequenceFeature shapes to carry a leading None dimension.
+ features = _prepend_none_dimension(features)
+ # Flatten the feature dict into the parallel key/type/shape lists that the
+ # raw fused op expects.
+ (sparse_keys, sparse_types, dense_keys, dense_types,
+ dense_defaults, dense_shapes) = _features_to_raw_params(
+ features,
+ [VarLenFeature, SparseFeature, FixedLenFeature, FixedLenSequenceFeature])
+ outputs = _parse_single_example_v2_raw(serialized, sparse_keys, sparse_types,
+ dense_keys, dense_types,
+ dense_defaults, dense_shapes, name)
+ # Reassemble SparseFeature outputs (parsed as separate index/value
+ # features) into the SparseTensors the caller asked for.
+ return _construct_sparse_tensors_for_sparse_features(features, outputs)
+
+
+def _parse_single_example_v2_raw(serialized, sparse_keys, sparse_types,
+ dense_keys, dense_types, dense_defaults,
+ dense_shapes, name):
+ """Parses `Example` protos.
+
+ Args:
+ serialized: A scalar (0-D Tensor) string, containing a binary
+ serialized `Example` proto.
+ sparse_keys: A list of string keys in the examples' features.
+ The results for these keys will be returned as `SparseTensor` objects.
+ sparse_types: A list of `DTypes` of the same length as `sparse_keys`.
+ Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`),
+ and `tf.string` (`BytesList`) are supported.
+ dense_keys: A list of string keys in the examples' features.
+ The results for these keys will be returned as `Tensor`s
+ dense_types: A list of DTypes of the same length as `dense_keys`.
+ Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`),
+ and `tf.string` (`BytesList`) are supported.
+ dense_defaults: A dict mapping string keys to `Tensor`s.
+ The keys of the dict must match the dense_keys of the feature.
+ dense_shapes: A list of tuples with the same length as `dense_keys`.
+ The shape of the data for each dense feature referenced by `dense_keys`.
+ Required for any input tensors identified by `dense_keys`. Must be
+ either fully defined, or may contain an unknown first dimension.
+ An unknown first dimension means the feature is treated as having
+ a variable number of blocks, and the output shape along this dimension
+ is considered unknown at graph build time. Padding is applied for
+ minibatch elements smaller than the maximum number of blocks for the
+ given feature along this dimension.
+ name: A name for this operation (optional).
+
+ Returns:
+ A `dict` mapping keys to `Tensor`s and `SparseTensor`s.
+
+ Raises:
+ ValueError: If sparse and dense key sets intersect, or input lengths do not
+ match up.
+ """
+ with ops.name_scope(name, "ParseSingleExample", [serialized]):
+ # Normalize every optional argument to an empty container so the
+ # length and disjointness checks below can assume non-None values.
+ dense_defaults = collections.OrderedDict(
+ ) if dense_defaults is None else dense_defaults
+ sparse_keys = [] if sparse_keys is None else sparse_keys
+ sparse_types = [] if sparse_types is None else sparse_types
+ dense_keys = [] if dense_keys is None else dense_keys
+ dense_types = [] if dense_types is None else dense_types
+ dense_shapes = ([[]] * len(dense_keys)
+ if dense_shapes is None else dense_shapes)
+
+ num_dense = len(dense_keys)
+ num_sparse = len(sparse_keys)
+
+ if len(dense_shapes) != num_dense:
+ raise ValueError("len(dense_shapes) != len(dense_keys): %d vs. %d" %
+ (len(dense_shapes), num_dense))
+ if len(dense_types) != num_dense:
+ raise ValueError("len(dense_types) != len(dense_keys): %d vs. %d" %
+ (len(dense_types), num_dense))
+ if len(sparse_types) != num_sparse:
+ raise ValueError("len(sparse_types) != len(sparse_keys): %d vs. %d" %
+ (len(sparse_types), num_sparse))
+ if num_dense + num_sparse == 0:
+ raise ValueError("Must provide at least one sparse key or dense key")
+ if not set(dense_keys).isdisjoint(set(sparse_keys)):
+ raise ValueError(
+ "Dense and sparse keys must not intersect; intersection: %s" %
+ set(dense_keys).intersection(set(sparse_keys)))
+
+ # Convert dense_shapes to TensorShape object.
+ dense_shapes = [tensor_shape.as_shape(shape) for shape in dense_shapes]
+
+ # Build one default tensor per dense key. Shapes with an unknown first
+ # dimension use a scalar padding value; fully-defined shapes use a
+ # default reshaped to that shape.
+ dense_defaults_vec = []
+ for i, key in enumerate(dense_keys):
+ default_value = dense_defaults.get(key)
+ dense_shape = dense_shapes[i]
+ if (dense_shape.ndims is not None and dense_shape.ndims > 0 and
+ dense_shape[0].value is None):
+ # Variable stride dense shape, the default value should be a
+ # scalar padding value
+ if default_value is None:
+ default_value = ops.convert_to_tensor(
+ "" if dense_types[i] == dtypes.string else 0,
+ dtype=dense_types[i])
+ else:
+ # Reshape to a scalar to ensure user gets an error if they
+ # provide a tensor that's not intended to be a padding value
+ # (0 or 2+ elements).
+ key_name = "padding_" + re.sub("[^A-Za-z0-9_.\\-/]", "_", key)
+ default_value = ops.convert_to_tensor(
+ default_value, dtype=dense_types[i], name=key_name)
+ default_value = array_ops.reshape(default_value, [])
+ else:
+ if default_value is None:
+ # No default: an empty tensor signals "required" to the kernel.
+ default_value = constant_op.constant([], dtype=dense_types[i])
+ elif not isinstance(default_value, ops.Tensor):
+ key_name = "key_" + re.sub("[^A-Za-z0-9_.\\-/]", "_", key)
+ default_value = ops.convert_to_tensor(
+ default_value, dtype=dense_types[i], name=key_name)
+ default_value = array_ops.reshape(default_value, dense_shape)
+
+ dense_defaults_vec.append(default_value)
+
+ # Finally, convert dense_shapes to TensorShapeProto
+ dense_shapes = [shape.as_proto() for shape in dense_shapes]
+
+ # Invoke the fused single-example parsing kernel.
+ # pylint: disable=protected-access
+ outputs = gen_parsing_ops.parse_single_example(
+ serialized=serialized,
+ dense_defaults=dense_defaults_vec,
+ num_sparse=len(sparse_keys),
+ sparse_keys=sparse_keys,
+ sparse_types=sparse_types,
+ dense_keys=dense_keys,
+ dense_shapes=dense_shapes,
+ name=name)
+ # pylint: enable=protected-access
+
+ (sparse_indices, sparse_values, sparse_shapes, dense_values) = outputs
+
+ # Package each (indices, values, shape) triple into a SparseTensor.
+ sparse_tensors = [
+ sparse_tensor.SparseTensor(ix, val, shape)
+ for (ix, val,
+ shape) in zip(sparse_indices, sparse_values, sparse_shapes)
+ ]
+
+ return dict(zip(sparse_keys + dense_keys, sparse_tensors + dense_values))