From 96f3023b6a8b154c3840776c5feff3e028860a36 Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Wed, 20 Dec 2017 20:54:25 -0800 Subject: [tf.data] Add `tf.contrib.data.parse_single_example()`. The new op is a fused implementation of the existing `tf.parse_single_example()`, which is more efficient when parsing a single Example at a time. PiperOrigin-RevId: 179768512 --- tensorflow/contrib/data/BUILD | 1 + tensorflow/contrib/data/__init__.py | 3 + .../base_api/api_def_ParseSingleExample.pbtxt | 78 ++ tensorflow/core/kernels/example_parsing_ops.cc | 104 ++- .../core/kernels/example_parsing_ops_test.cc | 83 ++ tensorflow/core/ops/parsing_ops.cc | 82 +- tensorflow/core/ops/parsing_ops_test.cc | 2 +- tensorflow/core/util/example_proto_fast_parsing.cc | 398 ++++++++- tensorflow/core/util/example_proto_fast_parsing.h | 6 + tensorflow/core/util/example_proto_helper.cc | 21 +- tensorflow/core/util/example_proto_helper.h | 70 +- tensorflow/python/kernel_tests/BUILD | 17 + .../kernel_tests/parse_single_example_op_test.py | 930 +++++++++++++++++++++ tensorflow/python/ops/parsing_ops.py | 195 +++++ 14 files changed, 1962 insertions(+), 28 deletions(-) create mode 100644 tensorflow/core/api_def/base_api/api_def_ParseSingleExample.pbtxt create mode 100644 tensorflow/python/kernel_tests/parse_single_example_op_test.py diff --git a/tensorflow/contrib/data/BUILD b/tensorflow/contrib/data/BUILD index 3b1c33063f..8ecc003348 100644 --- a/tensorflow/contrib/data/BUILD +++ b/tensorflow/contrib/data/BUILD @@ -20,6 +20,7 @@ py_library( "//tensorflow/contrib/data/python/ops:readers", "//tensorflow/contrib/data/python/ops:shuffle_ops", "//tensorflow/contrib/data/python/ops:transformation_ops", + "//tensorflow/python:parsing_ops", "//tensorflow/python:util", "//tensorflow/python/data/ops:iterator_ops", ], diff --git a/tensorflow/contrib/data/__init__.py b/tensorflow/contrib/data/__init__.py index c9ad091bd4..46125a8875 100644 --- a/tensorflow/contrib/data/__init__.py +++ 
b/tensorflow/contrib/data/__init__.py @@ -38,6 +38,8 @@ See the @{$datasets$Importing Data} Programmer's Guide for an overview. @@sloppy_interleave @@get_single_element + +@@parse_single_example """ from __future__ import absolute_import @@ -68,6 +70,7 @@ from tensorflow.contrib.data.python.ops.resampling import rejection_resample from tensorflow.contrib.data.python.ops.scan_ops import scan from tensorflow.contrib.data.python.ops.shuffle_ops import shuffle_and_repeat from tensorflow.python.data.ops.iterator_ops import Iterator +from tensorflow.python.ops.parsing_ops import parse_single_example_v2 as parse_single_example # pylint: enable=unused-import from tensorflow.python.util.all_util import remove_undocumented diff --git a/tensorflow/core/api_def/base_api/api_def_ParseSingleExample.pbtxt b/tensorflow/core/api_def/base_api/api_def_ParseSingleExample.pbtxt new file mode 100644 index 0000000000..476c01d0ad --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_ParseSingleExample.pbtxt @@ -0,0 +1,78 @@ +op { + graph_op_name: "ParseSingleExample" + in_arg { + name: "serialized" + description: <input("serialized", &serialized)); + OP_REQUIRES_OK(ctx, ctx->input_list("dense_defaults", &dense_defaults)); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(serialized->shape()), + errors::InvalidArgument( + "Expected serialized to be a scalar, got shape: ", + serialized->shape().DebugString())); + OP_REQUIRES(ctx, dense_defaults.size() == attrs_.dense_keys.size(), + errors::InvalidArgument( + "Expected len(dense_defaults) == len(dense_keys) but got: ", + dense_defaults.size(), " vs. 
", attrs_.dense_keys.size())); + + for (size_t d = 0; d < attrs_.dense_keys.size(); ++d) { + const Tensor& def_value = dense_defaults[d]; + if (attrs_.variable_length[d]) { + OP_REQUIRES(ctx, def_value.NumElements() == 1, + errors::InvalidArgument( + "dense_shape[", d, "] is a variable length shape: ", + attrs_.dense_shapes[d].DebugString(), + ", therefore " + "def_value[", + d, + "] must contain a single element (" + "the padding element). But its shape is: ", + def_value.shape().DebugString())); + } else if (def_value.NumElements() > 0) { + OP_REQUIRES(ctx, + attrs_.dense_shapes[d].IsCompatibleWith(def_value.shape()), + errors::InvalidArgument( + "def_value[", d, + "].shape() == ", def_value.shape().DebugString(), + " is not compatible with dense_shapes_[", d, + "] == ", attrs_.dense_shapes[d].DebugString())); + } + OP_REQUIRES(ctx, def_value.dtype() == attrs_.dense_types[d], + errors::InvalidArgument( + "dense_defaults[", d, "].dtype() == ", + DataTypeString(def_value.dtype()), " != dense_types_[", d, + "] == ", DataTypeString(attrs_.dense_types[d]))); + } + + example::Result result; + + // TODO(mrry): Build the configuration once and cache it. 
+ example::FastParseExampleConfig config; + for (int d = 0; d < attrs_.dense_keys.size(); ++d) { + config.dense.push_back({attrs_.dense_keys[d], attrs_.dense_types[d], + attrs_.dense_shapes[d], dense_defaults[d], + attrs_.variable_length[d], + attrs_.elements_per_stride[d]}); + } + for (int d = 0; d < attrs_.sparse_keys.size(); ++d) { + config.sparse.push_back({attrs_.sparse_keys[d], attrs_.sparse_types[d]}); + } + + const string& serialized_proto = serialized->scalar()(); + + OP_REQUIRES_OK(ctx, + FastParseSingleExample(config, serialized_proto, &result)); + + OpOutputList dense_values; + OpOutputList sparse_indices; + OpOutputList sparse_values; + OpOutputList sparse_shapes; + OP_REQUIRES_OK(ctx, ctx->output_list("dense_values", &dense_values)); + OP_REQUIRES_OK(ctx, ctx->output_list("sparse_indices", &sparse_indices)); + OP_REQUIRES_OK(ctx, ctx->output_list("sparse_values", &sparse_values)); + OP_REQUIRES_OK(ctx, ctx->output_list("sparse_shapes", &sparse_shapes)); + for (int d = 0; d < attrs_.dense_keys.size(); ++d) { + dense_values.set(d, result.dense_values[d]); + } + for (int d = 0; d < attrs_.sparse_keys.size(); ++d) { + sparse_indices.set(d, result.sparse_indices[d]); + sparse_values.set(d, result.sparse_values[d]); + sparse_shapes.set(d, result.sparse_shapes[d]); + } + } + + protected: + ParseSingleExampleAttrs attrs_; +}; + +REGISTER_KERNEL_BUILDER(Name("ParseSingleExample").Device(DEVICE_CPU), + ParseSingleExampleOp); class SingleSequenceExampleParserOp : public OpKernel { public: diff --git a/tensorflow/core/kernels/example_parsing_ops_test.cc b/tensorflow/core/kernels/example_parsing_ops_test.cc index 29dbfd3b1b..0a64a6c154 100644 --- a/tensorflow/core/kernels/example_parsing_ops_test.cc +++ b/tensorflow/core/kernels/example_parsing_ops_test.cc @@ -103,6 +103,9 @@ struct ExampleStore { } static ExampleTensorMap GetSerializedExamples() { ExampleTensorMap examples; + AddExample(&examples, 10, 1, 1); + AddExample(&examples, 100, 1, 1); + 
AddExample(&examples, 1000, 1, 1); AddExample(&examples, 10, 128, 1); AddExample(&examples, 100, 128, 1); AddExample(&examples, 1000, 128, 1); @@ -186,6 +189,56 @@ static Graph* ParseExample(int batch_size, int num_keys, int feature_size) { return g; } +template <typename Options> +static Graph* ParseSingleExample(int num_keys, int feature_size) { + Graph* g = new Graph(OpRegistry::Global()); + Tensor& serialized_batch_1 = + Options::Store::serialized_example[std::make_tuple(1, num_keys, + feature_size)]; + Tensor serialized(DT_STRING, TensorShape()); + serialized.scalar<string>()() = serialized_batch_1.vec<string>()(0); + + std::vector<string> sparse_keys; + std::vector<string> dense_keys; + std::vector<NodeBuilder::NodeOut> dense_defaults; + std::vector<DataType> sparse_types; + std::vector<PartialTensorShape> dense_shapes; + Options opt; + for (int i = 0; i < num_keys; ++i) { + string key = strings::Printf("feature_%d", i); + switch (opt.benchmark_type) { + case kDense: + dense_keys.push_back(key), + dense_defaults.emplace_back(test::graph::Constant( + g, opt.filler.make_dense_default(feature_size))); + dense_shapes.push_back(PartialTensorShape({feature_size})); + break; + case kVarLenDense: + dense_keys.push_back(key), + dense_defaults.emplace_back( + test::graph::Constant(g, opt.filler.make_dense_default(1))); + dense_shapes.push_back(PartialTensorShape({-1})); + break; + case kSparse: + sparse_keys.push_back(key), sparse_types.push_back(opt.filler.dtype); + break; + } + } + + Node* ret; + TF_EXPECT_OK(NodeBuilder(g->NewName("n"), "ParseSingleExample") + .Input(test::graph::Constant(g, serialized)) + .Input(dense_defaults) + .Attr("num_sparse", sparse_keys.size()) + .Attr("sparse_keys", sparse_keys) + .Attr("sparse_types", sparse_types) + .Attr("dense_keys", dense_keys) + .Attr("dense_shapes", dense_shapes) + .Finalize(g, &ret)); + + return g; +} + // Benchmark settings (Sparse, Dense) X (Bytes, Int64, Float) typedef BenchmarkOptions<ExampleStore<BytesFiller>, kSparse> SparseString; typedef BenchmarkOptions<ExampleStore<BytesFiller>, kDense> DenseString; @@ -212,10 +265,13 @@ typedef BenchmarkOptions, 
kVarLenDense> BENCHMARK(BM_ParseExample##_##TYPE##_##B##_##K##_##F); #define BM_AllParseExample(Type) \ + BM_ParseExample(Type, 1, 10, 1); \ BM_ParseExample(Type, 128, 10, 1); \ BM_ParseExample(Type, 512, 10, 1); \ + BM_ParseExample(Type, 1, 100, 1); \ BM_ParseExample(Type, 128, 100, 1); \ BM_ParseExample(Type, 512, 100, 1); \ + BM_ParseExample(Type, 1, 1000, 1); \ BM_ParseExample(Type, 128, 1000, 1); \ BM_ParseExample(Type, 512, 1000, 1); \ BM_ParseExample(Type, 1, 1, 1000000); @@ -230,4 +286,31 @@ BM_AllParseExample(SparseFloat); BM_AllParseExample(DenseFloat); BM_AllParseExample(VarLenDenseFloat); +// K == num_keys. F == feature_size. +// (K, F) must be a (num_keys, feature_size) pair present in the example +// store: K in {10, 100, 1000} with F == 1, or (K, F) == (1, 1000000). +#define BM_ParseSingleExample(TYPE, K, F) \ + static void BM_ParseSingleExample##_##TYPE##_1_##K##_##F(int iters) { \ + int64 items_per_iter = K * F; \ + testing::UseRealTime(); \ + testing::ItemsProcessed(static_cast<int64>(iters) * items_per_iter); \ + test::Benchmark("cpu", ParseSingleExample<TYPE>(K, F)).Run(iters); \ + } \ + BENCHMARK(BM_ParseSingleExample##_##TYPE##_1_##K##_##F); + +#define BM_AllParseSingleExample(Type) \ + BM_ParseSingleExample(Type, 10, 1); \ + BM_ParseSingleExample(Type, 100, 1); \ + BM_ParseSingleExample(Type, 1000, 1); \ + BM_ParseSingleExample(Type, 1, 1000000); + +BM_AllParseSingleExample(SparseString); +BM_AllParseSingleExample(DenseString); +BM_AllParseSingleExample(VarLenDenseString); +BM_AllParseSingleExample(SparseInt64); +BM_AllParseSingleExample(DenseInt64); +BM_AllParseSingleExample(VarLenDenseInt64); +BM_AllParseSingleExample(SparseFloat); +BM_AllParseSingleExample(DenseFloat); +BM_AllParseSingleExample(VarLenDenseFloat); + } // end namespace tensorflow diff --git a/tensorflow/core/ops/parsing_ops.cc b/tensorflow/core/ops/parsing_ops.cc index 40ec792ef8..11b28badc8 100644 --- a/tensorflow/core/ops/parsing_ops.cc +++ b/tensorflow/core/ops/parsing_ops.cc @@ -64,7 +64,7 @@ REGISTER_OP("ParseExample") .Attr("Tdense: list({float,int64,string}) >= 0") .Attr("dense_shapes: 
list(shape) >= 0") .SetShapeFn([](InferenceContext* c) { - ParseSingleExampleAttrs attrs; + ParseExampleAttrs attrs; TF_RETURN_IF_ERROR(attrs.Init(c)); ShapeHandle input; @@ -138,6 +138,86 @@ sparse_types: A list of Nsparse types; the data types of data in each Feature DT_INT64 (Int64List), and DT_STRING (BytesList). )doc"); +REGISTER_OP("ParseSingleExample") + .Input("serialized: string") + .Input("dense_defaults: Tdense") + .Output("sparse_indices: num_sparse * int64") + .Output("sparse_values: sparse_types") + .Output("sparse_shapes: num_sparse * int64") + .Output("dense_values: Tdense") + .Attr("num_sparse: int >= 0") + .Attr("sparse_keys: list(string) >= 0") + .Attr("dense_keys: list(string) >= 0") + .Attr("sparse_types: list({float,int64,string}) >= 0") + .Attr("Tdense: list({float,int64,string}) >= 0") + .Attr("dense_shapes: list(shape) >= 0") + .SetShapeFn([](InferenceContext* c) { + ParseSingleExampleAttrs attrs; + TF_RETURN_IF_ERROR(attrs.Init(c)); + + ShapeHandle input; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &input)); + + // Output sparse_indices, sparse_values, and sparse_shapes. + int output_idx = 0; + for (int i = 0; i < attrs.sparse_keys.size(); ++i) { + c->set_output(output_idx++, c->Matrix(c->UnknownDim(), 1)); + } + for (int i = 0; i < attrs.sparse_keys.size(); ++i) { + c->set_output(output_idx++, c->Vector(c->UnknownDim())); + } + for (int i = 0; i < attrs.sparse_keys.size(); ++i) { + c->set_output(output_idx++, c->Vector(1)); + } + + // Output dense_shapes. + for (int i = 0; i < attrs.dense_keys.size(); ++i) { + ShapeHandle dense; + TF_RETURN_IF_ERROR( + c->MakeShapeFromPartialTensorShape(attrs.dense_shapes[i], &dense)); + c->set_output(output_idx++, dense); + } + return Status::OK(); + }) + .Doc(R"doc( +Transforms a tf.Example proto (as a string) into typed tensors. + +serialized: A scalar containing a binary serialized Example proto. +dense_keys: The keys expected in the Examples' features associated with dense + values. 
+dense_defaults: A list of Tensors (some may be empty), whose length matches + the length of `dense_keys`. dense_defaults[j] provides default values + when the example's feature_map lacks dense_key[j]. If an empty Tensor is + provided for dense_defaults[j], then the Feature dense_keys[j] is required. + The input type is inferred from dense_defaults[j], even when it's empty. + If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined, + then the shape of dense_defaults[j] must match that of dense_shapes[j]. + If dense_shapes[j] has an undefined major dimension (variable strides dense + feature), dense_defaults[j] must contain a single element: + the padding element. +Tdense: The data types of data in each Feature given in dense_keys. + The length of this list must match the length of `dense_keys`. + Currently the ParseSingleExample op supports DT_FLOAT (FloatList), + DT_INT64 (Int64List), and DT_STRING (BytesList). +dense_shapes: The shapes of data in each Feature given in dense_keys. + The length of this list must match the length of `dense_keys`. The + number of elements in the Feature corresponding to dense_key[j] must + always equal dense_shapes[j].NumEntries(). If dense_shapes[j] == + (D0, D1, ..., DN) then the shape of output Tensor dense_values[j] + will be (D0, D1, ..., DN): In the case dense_shapes[j] = (-1, D1, + ..., DN), the shape of the output Tensor dense_values[j] will be (M, + D1, .., DN), where M is the number of blocks of elements of length + D1 * .... * DN, in the input. +num_sparse: The number of sparse features to be parsed from the example. This + must match the lengths of `sparse_keys` and `sparse_types`. +sparse_keys: A list of `num_sparse` strings. + The keys expected in the Examples' features associated with sparse values. +sparse_types: A list of `num_sparse` types; the data types of data in each + Feature given in sparse_keys. 
+ Currently the ParseSingleExample op supports DT_FLOAT (FloatList), + DT_INT64 (Int64List), and DT_STRING (BytesList). +)doc"); + REGISTER_OP("ParseSingleSequenceExample") .Input("serialized: string") .Input("feature_list_dense_missing_assumed_empty: string") diff --git a/tensorflow/core/ops/parsing_ops_test.cc b/tensorflow/core/ops/parsing_ops_test.cc index c6e521e33e..9121d7ae92 100644 --- a/tensorflow/core/ops/parsing_ops_test.cc +++ b/tensorflow/core/ops/parsing_ops_test.cc @@ -119,7 +119,7 @@ TEST(ParsingOpsTest, ParseExample_ShapeFn) { ("[?,2];[?,2];[?];[?];[2];[2];" // sparse outputs "[d0_0,1];[d0_0,1,2];[d0_0,1,2,3]")); // dense outputs - // Confirm an error from ParseSingleExampleAttrs.Init(). + // Confirm an error from ParseExampleAttrs.Init(). set_outputs(2, 3, true /* add_extra_shape */); INFER_ERROR("len(dense_keys) != len(dense_shapes)", op, "?;?;?;?;?;?;?;?;?;?"); diff --git a/tensorflow/core/util/example_proto_fast_parsing.cc b/tensorflow/core/util/example_proto_fast_parsing.cc index b9cf97195b..dd9ae46f88 100644 --- a/tensorflow/core/util/example_proto_fast_parsing.cc +++ b/tensorflow/core/util/example_proto_fast_parsing.cc @@ -94,9 +94,29 @@ class Feature { return Status::OK(); } + bool GetNumElementsInBytesList(int* num_elements) { + protobuf::io::CodedInputStream stream( + reinterpret_cast(serialized_.data()), serialized_.size()); + EnableAliasing(&stream); + uint32 length = 0; + if (!stream.ReadVarint32(&length)) return false; + auto limit = stream.PushLimit(length); + *num_elements = 0; + while (!stream.ExpectAtEnd()) { + if (!stream.ExpectTag(kDelimitedTag(1))) return false; + uint32 bytes_length = 0; + if (!stream.ReadVarint32(&bytes_length)) return false; + if (!stream.Skip(bytes_length)) return false; + ++*num_elements; + } + stream.PopLimit(limit); + return true; + } + template bool ParseBytesList(Result* bytes_list) { DCHECK(bytes_list != nullptr); + protobuf::io::CodedInputStream stream( reinterpret_cast(serialized_.data()), 
serialized_.size()); @@ -111,9 +131,13 @@ class Feature { // parse string uint32 bytes_length; if (!stream.ReadVarint32(&bytes_length)) return false; - string bytes; - if (!stream.ReadString(&bytes, bytes_length)) return false; - bytes_list->push_back(std::move(bytes)); + const void* buf_ptr = nullptr; + int size = 0; + if (!stream.GetDirectBufferPointer(&buf_ptr, &size)) return false; + if (size < bytes_length) return false; + bytes_list->push_back( + string(static_cast(buf_ptr), bytes_length)); + if (!stream.Skip(bytes_length)) return false; } stream.PopLimit(limit); return true; @@ -447,6 +471,28 @@ class LimitedArraySlice { T* end_; }; +void LogDenseFeatureDataLoss(StringPiece feature_name) { + LOG(WARNING) << "Data loss! Feature '" << feature_name + << "' is present in multiple concatenated " + "tf.Examples. Ignoring all but last one."; + static auto* duplicated_dense_feature = monitoring::Counter<0>::New( + "/tensorflow/core/util/example_proto_fast_parsing/" + "duplicated_dense_feature", + "Dense feature appears twice in a tf.Example"); + duplicated_dense_feature->GetCell()->IncrementBy(1); +} + +void LogSparseFeatureDataLoss(StringPiece feature_name) { + LOG(WARNING) << "Data loss! Feature '" << feature_name + << "' is present in multiple concatenated " + "tf.Examples. Ignoring all but last one."; + static auto* duplicated_sparse_feature = monitoring::Counter<0>::New( + "/tensorflow/core/util/example_proto_fast_parsing/" + "duplicated_sparse_feature", + "Sparse feature appears twice in a tf.Example"); + duplicated_sparse_feature->GetCell()->IncrementBy(1); +} + Status FastParseSerializedExample( const string& serialized_example, const string& example_name, const size_t example_index, const Config& config, @@ -510,14 +556,7 @@ Status FastParseSerializedExample( // If feature was already visited, skip. // Compare comment at the beginning of the loop. if (dense_feature_last_example[d] == example_index) { - LOG(WARNING) << "Data loss! 
Feature '" << feature_name - << "' in present in multiple concatenated " - "tf.Examples. Ignoring all but last one."; - static auto* duplicated_dense_feature = monitoring::Counter<0>::New( - "/tensorflow/core/util/example_proto_fast_parsing/" - "duplicated_dense_feature", - "Dense feature appears twice in a tf.Example"); - duplicated_dense_feature->GetCell()->IncrementBy(1); + LogDenseFeatureDataLoss(feature_name); continue; } dense_feature_last_example[d] = example_index; @@ -639,14 +678,7 @@ Status FastParseSerializedExample( // If feature was already visited, skip. // Compare comment at the beginning of the loop. if (sparse_feature_last_example[d] == example_index) { - LOG(WARNING) << "Data loss! Feature '" << feature_name - << "' in present in multiple concatenated " - "tf.Examples. Ignoring all but last one."; - static auto* duplicated_sparse_feature = monitoring::Counter<0>::New( - "/tensorflow/core/util/example_proto_fast_parsing/" - "duplicated_sparse_feature", - "sparse feature appears twice in a tf.Example"); - duplicated_sparse_feature->GetCell()->IncrementBy(1); + LogSparseFeatureDataLoss(feature_name); continue; } sparse_feature_last_example[d] = example_index; @@ -1099,5 +1131,333 @@ Status FastParseExample(const Config& config, return Status::OK(); } +Status FastParseSingleExample(const Config& config, const string& serialized, + Result* result) { + DCHECK(result != nullptr); + // Check config so we can safely CHECK(false) in switches on config.*.dtype + for (auto& c : config.sparse) { + TF_RETURN_IF_ERROR(CheckConfigDataType(c.dtype)); + } + for (auto& c : config.dense) { + TF_RETURN_IF_ERROR(CheckConfigDataType(c.dtype)); + } + + // TODO(mrry): Cache the construction of this map at Op construction time. + size_t config_size = config.dense.size() + config.sparse.size(); + SeededHasher hasher; + // Build config index. 
+ PresizedCuckooMap> config_index(config_size); + bool ok = true; + for (size_t i = 0; i < 1000; ++i) { + for (size_t d = 0; d < config.dense.size(); ++d) { + ok &= config_index.InsertUnique(hasher(config.dense[d].feature_name), + {d, Type::Dense}); + } + for (size_t d = 0; d < config.sparse.size(); ++d) { + ok &= config_index.InsertUnique(hasher(config.sparse[d].feature_name), + {d, Type::Sparse}); + } + if (ok) break; + LOG(WARNING) << "Collision found. This should happen only if you have " + "around 2^32 entries in your config."; + hasher.seed++; + config_index.Clear(config_size); + } + if (!ok) { + return errors::Internal( + "Could not avoid collision. This should not happen."); + } + + // Allocate dense output tensors. + for (size_t d = 0; d < config.dense.size(); ++d) { + if (!config.dense[d].variable_length) { + TensorShape values_shape; + if (!config.dense[d].shape.AsTensorShape(&values_shape)) { + return errors::Internal( + "Fixed-length shape was not a statically defined shape."); + } + result->dense_values.emplace_back(config.dense[d].dtype, values_shape); + } else { + // Variable-length tensor will be allocated later. + result->dense_values.emplace_back(); + } + } + + // Allocate sparse output tensors. + for (size_t d = 0; d < config.sparse.size(); ++d) { + // The dense_shape is always a vector of length 1. + result->sparse_shapes.emplace_back(DT_INT64, TensorShape({1})); + // Variable-length tensors will be allocated later. + result->sparse_indices.emplace_back(); + result->sparse_values.emplace_back(); + } + + parsed::Example parsed_example; + if (!ParseExample(serialized, &parsed_example)) { + return errors::InvalidArgument("Could not parse example input, value: '", + serialized, "'"); + } + std::vector sparse_feature_already_seen(config.sparse.size(), false); + std::vector dense_feature_already_seen(config.dense.size(), false); + + // Handle features present in the example. 
+ const size_t parsed_example_size = parsed_example.size(); + for (size_t i = 0; i < parsed_example_size; ++i) { + // This is a logic that standard protobuf parsing is implementing. + // I.e. last entry in the map overwrites all the previous ones. + parsed::FeatureMapEntry& name_and_feature = + parsed_example[parsed_example_size - i - 1]; + + const StringPiece feature_name = name_and_feature.first; + parsed::Feature& feature = name_and_feature.second; + + std::pair d_and_type; + uint64 h = hasher(feature_name); + if (!config_index.Find(h, &d_and_type)) continue; + + size_t d = d_and_type.first; + bool is_dense = d_and_type.second == Type::Dense; + + { + // Testing for PresizedCuckooMap collision. + // TODO(lew): Use dense_hash_map and avoid this and hasher creation. + const string& config_feature_name = is_dense + ? config.dense[d].feature_name + : config.sparse[d].feature_name; + if (feature_name != config_feature_name) continue; + } + + auto example_error = [feature_name](StringPiece suffix) { + return errors::InvalidArgument("Key: ", feature_name, ". ", suffix); + }; + + auto parse_error = [feature_name] { + return errors::InvalidArgument("Key: ", feature_name, + ". Can't parse serialized Example."); + }; + + DataType example_dtype; + TF_RETURN_IF_ERROR(feature.ParseDataType(&example_dtype)); + if (example_dtype == DT_INVALID) continue; + + if (is_dense && !config.dense[d].variable_length) { + // If feature was already visited, skip. + // Compare comment at the beginning of the loop. + if (dense_feature_already_seen[d]) { + LogDenseFeatureDataLoss(feature_name); + continue; + } + dense_feature_already_seen[d] = true; + + if (example_dtype != config.dense[d].dtype) { + return example_error(strings::StrCat( + "Data types don't match. 
Data type: ", + DataTypeString(example_dtype), + " but expected type: ", DataTypeString(config.dense[d].dtype))); + } + + Tensor* out = &result->dense_values[d]; + const std::size_t num_elements = config.dense[d].elements_per_stride; + + switch (example_dtype) { + case DT_INT64: { + auto out_p = out->flat().data(); + LimitedArraySlice slice(out_p, num_elements); + if (!feature.ParseInt64List(&slice)) return parse_error(); + if (slice.EndDistance() != 0) { + return parse_error(); + } + break; + } + case DT_FLOAT: { + auto out_p = out->flat().data(); + LimitedArraySlice slice(out_p, num_elements); + if (!feature.ParseFloatList(&slice)) return parse_error(); + if (slice.EndDistance() != 0) { + return parse_error(); + } + break; + } + case DT_STRING: { + auto out_p = out->flat().data(); + LimitedArraySlice slice(out_p, num_elements); + if (!feature.ParseBytesList(&slice)) return parse_error(); + if (slice.EndDistance() != 0) { + return parse_error(); + } + break; + } + default: + LOG(FATAL) << "Should not happen."; + } + + } else { // if variable length + SparseBuffer out_temp; + const size_t num_elements_divisor = + is_dense ? config.dense[d].elements_per_stride : 1; + size_t num_elements; + + if (is_dense) { + // If feature was already visited, skip. + // Compare comment at the beginning of the loop. + if (dense_feature_already_seen[d]) { + LogDenseFeatureDataLoss(feature_name); + continue; + } + dense_feature_already_seen[d] = true; + if (example_dtype != config.dense[d].dtype) { + return example_error(strings::StrCat( + "Data types don't match. Data type: ", + DataTypeString(example_dtype), + " but expected type: ", DataTypeString(config.dense[d].dtype))); + } + } else { + // If feature was already visited, skip. + // Compare comment at the beginning of the loop. + if (sparse_feature_already_seen[d]) { + LogSparseFeatureDataLoss(feature_name); + continue; + } + sparse_feature_already_seen[d] = true; + + // Handle sparse features. 
+ if (example_dtype != DT_INVALID && + example_dtype != config.sparse[d].dtype) { + return example_error(strings::StrCat( + "Data types don't match. ", + "Expected type: ", DataTypeString(config.sparse[d].dtype), + ", Actual type: ", DataTypeString(example_dtype))); + } + } + + switch (example_dtype) { + case DT_INT64: { + // TODO(mrry): Use the fact that the `int64_list` is packed to read + // out the length and pre-allocate the output tensor. + if (!feature.ParseInt64List(&out_temp.int64_list)) + return parse_error(); + num_elements = out_temp.int64_list.size(); + break; + } + case DT_FLOAT: { + // TODO(mrry): Use the fact that the `float_list` is packed to read + // out the length and pre-allocate the output tensor. + if (!feature.ParseFloatList(&out_temp.float_list)) + return parse_error(); + num_elements = out_temp.float_list.size(); + break; + } + case DT_STRING: { + int actual_num_elements = 0; + if (!feature.GetNumElementsInBytesList(&actual_num_elements)) { + return parse_error(); + } + out_temp.bytes_list.reserve(actual_num_elements); + if (!feature.ParseBytesList(&out_temp.bytes_list)) + return parse_error(); + num_elements = out_temp.bytes_list.size(); + break; + } + default: + LOG(FATAL) << "Should not happen. " << DataTypeString(example_dtype); + } + + if (num_elements % num_elements_divisor != 0) { + return parse_error(); + } + + Tensor* out; + if (is_dense) { + TensorShape values_shape; + values_shape.AddDim(num_elements / num_elements_divisor); + for (int i = 1; i < config.dense[d].shape.dims(); ++i) { + values_shape.AddDim(config.dense[d].shape.dim_size(i)); + } + + out = &result->dense_values[d]; + *out = Tensor(config.dense[d].dtype, values_shape); + + } else { + Tensor* out_indices = &result->sparse_indices[d]; + Tensor* out_dense_shape = &result->sparse_shapes[d]; + out = &result->sparse_values[d]; + + // TODO(mrry): Investigate the possibility of not materializing + // the indices (and perhaps dense_shape) until they are needed. 
+ *out_indices = Tensor( + DT_INT64, TensorShape({static_cast(num_elements), 1})); + auto indices_flat = out_indices->flat(); + for (size_t i = 0; i < num_elements; ++i) { + indices_flat(i) = static_cast(i); + } + + *out_dense_shape = Tensor(DT_INT64, TensorShape({1})); + auto shapes_shape_t = out_dense_shape->vec(); + shapes_shape_t(0) = num_elements; + + *out = Tensor(config.sparse[d].dtype, + TensorShape({static_cast(num_elements)})); + } + + switch (example_dtype) { + case DT_INT64: { + CopyOrMoveBlock(out_temp.int64_list.begin(), + out_temp.int64_list.end(), out->flat().data()); + break; + } + case DT_FLOAT: { + CopyOrMoveBlock(out_temp.float_list.begin(), + out_temp.float_list.end(), out->flat().data()); + break; + } + case DT_STRING: { + CopyOrMoveBlock(out_temp.bytes_list.begin(), + out_temp.bytes_list.end(), + out->flat().data()); + break; + } + default: + LOG(FATAL) << "Should not happen."; + } + } + } + + // Handle missing dense features. + for (size_t d = 0; d < config.dense.size(); ++d) { + if (!dense_feature_already_seen[d]) { + if (!config.dense[d].variable_length) { + // Handle missing fixed-length dense feature. + if (config.dense[d].default_value.NumElements() == 0) { + return errors::InvalidArgument( + "Feature: ", config.dense[d].feature_name, + " (data type: ", DataTypeString(config.dense[d].dtype), ")", + " is required but could not be found."); + } + result->dense_values[d] = config.dense[d].default_value; + } else { + // Handle missing varlen dense feature. + TensorShape empty_shape; + empty_shape.AddDim(0); + for (int i = 1; i < config.dense[d].shape.dims(); ++i) { + empty_shape.AddDim(config.dense[d].shape.dim_size(i)); + } + result->dense_values[d] = Tensor(config.dense[d].dtype, empty_shape); + } + } + } + + // Handle missing sparse features. 
+ for (size_t d = 0; d < config.sparse.size(); ++d) { + if (!sparse_feature_already_seen[d]) { + result->sparse_indices[d] = Tensor(DT_INT64, TensorShape({0, 1})); + result->sparse_values[d] = + Tensor(config.sparse[d].dtype, TensorShape({0})); + result->sparse_shapes[d].vec()(0) = 0; + } + } + + return Status::OK(); +} + } // namespace example } // namespace tensorflow diff --git a/tensorflow/core/util/example_proto_fast_parsing.h b/tensorflow/core/util/example_proto_fast_parsing.h index 20536cee16..fe59ec77ca 100644 --- a/tensorflow/core/util/example_proto_fast_parsing.h +++ b/tensorflow/core/util/example_proto_fast_parsing.h @@ -79,6 +79,12 @@ Status FastParseExample(const FastParseExampleConfig& config, gtl::ArraySlice example_names, thread::ThreadPool* thread_pool, Result* result); +// TODO(mrry): Move the hash table construction into the config object. +typedef FastParseExampleConfig FastParseSingleExampleConfig; + +Status FastParseSingleExample(const FastParseSingleExampleConfig& config, + const string& serialized, Result* result); + // This function parses serialized Example and populates given example. // It uses the same specialized parser as FastParseExample which is efficient. // But then constructs Example which is relatively slow. 
diff --git a/tensorflow/core/util/example_proto_helper.cc b/tensorflow/core/util/example_proto_helper.cc index 4b5bf63112..41f56d2daa 100644 --- a/tensorflow/core/util/example_proto_helper.cc +++ b/tensorflow/core/util/example_proto_helper.cc @@ -400,7 +400,7 @@ Status BatchExampleProtoToTensors( return Status::OK(); } -Status ParseSingleExampleAttrs::FinishInit() { +Status ParseExampleAttrs::FinishInit() { if (static_cast(num_sparse) != sparse_types.size()) { return errors::InvalidArgument("len(sparse_keys) != len(sparse_types)"); } @@ -422,6 +422,25 @@ Status ParseSingleExampleAttrs::FinishInit() { return Status::OK(); } +Status ParseSingleExampleAttrs::FinishInit() { + if (sparse_keys.size() != sparse_types.size()) { + return errors::InvalidArgument("len(sparse_keys) != len(sparse_types)"); + } + if (dense_keys.size() != dense_types.size()) { + return errors::InvalidArgument("len(dense_keys) != len(dense_types)"); + } + if (dense_keys.size() != dense_shapes.size()) { + return errors::InvalidArgument("len(dense_keys) != len(dense_shapes)"); + } + for (const DataType& type : dense_types) { + TF_RETURN_IF_ERROR(CheckValidType(type)); + } + for (const DataType& type : sparse_types) { + TF_RETURN_IF_ERROR(CheckValidType(type)); + } + return Status::OK(); +} + Status ParseSingleSequenceExampleAttrs::FinishInit() { if (static_cast(num_context_sparse) != context_sparse_types.size()) { return errors::InvalidArgument( diff --git a/tensorflow/core/util/example_proto_helper.h b/tensorflow/core/util/example_proto_helper.h index 7414d61e8b..8b3c6c5a3f 100644 --- a/tensorflow/core/util/example_proto_helper.h +++ b/tensorflow/core/util/example_proto_helper.h @@ -148,9 +148,9 @@ Tensor FeatureSparseCopy(const std::size_t batch, const string& key, int64 CopyIntoSparseTensor(const Tensor& in, const int batch, const int64 offset, Tensor* indices, Tensor* values); -// Parses the attributes passed to ParseSingleExample. +// Parses the attributes passed to ParseExample. 
// REQUIRES: Init must be called after construction. -class ParseSingleExampleAttrs { +class ParseExampleAttrs { public: template Status Init(ContextType* ctx) { @@ -205,6 +205,72 @@ class ParseSingleExampleAttrs { Status FinishInit(); // for context-independent parts of Init. }; +// Parses the attributes passed to ParseSingleExample. +// REQUIRES: Init must be called after construction. +class ParseSingleExampleAttrs { + public: + template + Status Init(ContextType* ctx) { + TF_RETURN_IF_ERROR(ctx->GetAttr("sparse_keys", &sparse_keys)); + TF_RETURN_IF_ERROR(ctx->GetAttr("sparse_types", &sparse_types)); + TF_RETURN_IF_ERROR(ctx->GetAttr("dense_keys", &dense_keys)); + TF_RETURN_IF_ERROR(ctx->GetAttr("Tdense", &dense_types)); + TF_RETURN_IF_ERROR(ctx->GetAttr("dense_shapes", &dense_shapes)); + + int num_sparse; + TF_RETURN_IF_ERROR(ctx->GetAttr("num_sparse", &num_sparse)); + if (num_sparse != sparse_keys.size() || num_sparse != sparse_types.size()) { + return errors::InvalidArgument( + "num_sparse (", num_sparse, ") must match the size of sparse_keys (", + sparse_keys.size(), ") and sparse_types (", sparse_types.size(), ")"); + } + + // Temporary check until we start allowing a variable length outer + // dimension. 
+ for (int i = 0; i < dense_shapes.size(); ++i) { + bool shape_ok = true; + if (dense_shapes[i].dims() == -1) { + shape_ok = false; + } else { + for (int d = 1; d < dense_shapes[i].dims(); ++d) { + if (dense_shapes[i].dim_size(d) == -1) { + shape_ok = false; + } + } + } + if (!shape_ok) { + return errors::InvalidArgument( + "dense_shapes[", i, + "] has unknown rank or unknown inner dimensions: ", + dense_shapes[i].DebugString()); + } + TensorShape dense_shape; + if (dense_shapes[i].dims() > 0 && dense_shapes[i].dim_size(0) == -1) { + variable_length.push_back(true); + for (int d = 1; d < dense_shapes[i].dims(); ++d) { + dense_shape.AddDim(dense_shapes[i].dim_size(d)); + } + } else { + variable_length.push_back(false); + dense_shapes[i].AsTensorShape(&dense_shape); + } + elements_per_stride.push_back(dense_shape.num_elements()); + } + return FinishInit(); + } + + std::vector sparse_keys; + std::vector sparse_types; + std::vector dense_keys; + std::vector dense_types; + std::vector dense_shapes; + std::vector variable_length; + std::vector elements_per_stride; + + private: + Status FinishInit(); // for context-independent parts of Init. +}; + // Parses the attributes passed to ParseSingleSequenceExample. // REQUIRES: Init must be called after construction. 
class ParseSingleSequenceExampleAttrs { diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD index 31d3bd1b74..d98bb0f8cc 100644 --- a/tensorflow/python/kernel_tests/BUILD +++ b/tensorflow/python/kernel_tests/BUILD @@ -603,6 +603,23 @@ tf_py_test( ], ) +tf_py_test( + name = "parse_single_example_op_test", + size = "small", + srcs = ["parse_single_example_op_test.py"], + additional_deps = [ + "//third_party/py/numpy", + "//tensorflow/core:protos_all_py", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:errors", + "//tensorflow/python:framework", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:parsing_ops", + "//tensorflow/python:platform", + ], +) + tf_py_test( name = "partitioned_variables_test", size = "small", diff --git a/tensorflow/python/kernel_tests/parse_single_example_op_test.py b/tensorflow/python/kernel_tests/parse_single_example_op_test.py new file mode 100644 index 0000000000..b5bd1b9bee --- /dev/null +++ b/tensorflow/python/kernel_tests/parse_single_example_op_test.py @@ -0,0 +1,930 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for tensorflow.ops.parsing_ops.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import itertools + +import numpy as np + +from tensorflow.core.example import example_pb2 +from tensorflow.core.example import feature_pb2 +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors_impl +from tensorflow.python.framework import ops +from tensorflow.python.framework import sparse_tensor +from tensorflow.python.framework import tensor_shape +from tensorflow.python.ops import parsing_ops +from tensorflow.python.platform import test +from tensorflow.python.platform import tf_logging + +# Helpers for creating Example objects +example = example_pb2.Example +feature = feature_pb2.Feature +features = lambda d: feature_pb2.Features(feature=d) +bytes_feature = lambda v: feature(bytes_list=feature_pb2.BytesList(value=v)) +int64_feature = lambda v: feature(int64_list=feature_pb2.Int64List(value=v)) +float_feature = lambda v: feature(float_list=feature_pb2.FloatList(value=v)) +# Helpers for creating SequenceExample objects +feature_list = lambda l: feature_pb2.FeatureList(feature=l) +feature_lists = lambda d: feature_pb2.FeatureLists(feature_list=d) +sequence_example = example_pb2.SequenceExample + + +def empty_sparse(dtype, shape=None): + if shape is None: + shape = [0] + return (np.empty(shape=(0, len(shape)), dtype=np.int64), + np.array([], dtype=dtype), np.array(shape, dtype=np.int64)) + + +def flatten(list_of_lists): + """Flatten one level of nesting.""" + return itertools.chain.from_iterable(list_of_lists) + + +def flatten_values_tensors_or_sparse(tensors_list): + """Flatten each SparseTensor object into 3 Tensors for session.run().""" + return list( + flatten([[v.indices, v.values, v.dense_shape] if isinstance( + v, sparse_tensor.SparseTensor) else [v] for v in tensors_list])) + 
+ +def _compare_output_to_expected(tester, dict_tensors, expected_tensors, + flat_output): + tester.assertEqual(set(dict_tensors.keys()), set(expected_tensors.keys())) + + i = 0 # Index into the flattened output of session.run() + for k, v in dict_tensors.items(): + expected_v = expected_tensors[k] + tf_logging.info("Comparing key: %s", k) + if isinstance(v, sparse_tensor.SparseTensor): + # Three outputs for SparseTensor : indices, values, shape. + tester.assertEqual([k, len(expected_v)], [k, 3]) + tester.assertAllEqual(expected_v[0], flat_output[i]) + tester.assertAllEqual(expected_v[1], flat_output[i + 1]) + tester.assertAllEqual(expected_v[2], flat_output[i + 2]) + i += 3 + else: + # One output for standard Tensor. + tester.assertAllEqual(expected_v, flat_output[i]) + i += 1 + + +class ParseExampleTest(test.TestCase): + + def _test(self, kwargs, expected_values=None, expected_err=None): + with self.test_session() as sess: + if expected_err: + with self.assertRaisesWithPredicateMatch(expected_err[0], + expected_err[1]): + out = parsing_ops.parse_single_example_v2(**kwargs) + sess.run(flatten_values_tensors_or_sparse(out.values())) + return + else: + # Returns dict w/ Tensors and SparseTensors. + out = parsing_ops.parse_single_example_v2(**kwargs) + result = flatten_values_tensors_or_sparse(out.values()) + # Check values. 
+ tf_result = sess.run(result) + _compare_output_to_expected(self, out, expected_values, tf_result) + + for k, f in kwargs["features"].items(): + if isinstance(f, parsing_ops.FixedLenFeature) and f.shape is not None: + self.assertEqual(tuple(out[k].get_shape().as_list()), f.shape) + elif isinstance(f, parsing_ops.VarLenFeature): + self.assertEqual( + tuple(out[k].indices.get_shape().as_list()), (None, 1)) + self.assertEqual(tuple(out[k].values.get_shape().as_list()), (None,)) + self.assertEqual( + tuple(out[k].dense_shape.get_shape().as_list()), (1,)) + + def testEmptySerializedWithAllDefaults(self): + sparse_name = "st_a" + a_name = "a" + b_name = "b" + c_name = "c:has_a_tricky_name" + a_default = [0, 42, 0] + b_default = np.random.rand(3, 3).astype(bytes) + c_default = np.random.rand(2).astype(np.float32) + + expected_st_a = ( # indices, values, shape + np.empty((0, 1), dtype=np.int64), # indices + np.empty((0,), dtype=np.int64), # sp_a is DT_INT64 + np.array([0], dtype=np.int64)) # max_elems = 0 + + expected_output = { + sparse_name: expected_st_a, + a_name: np.array([a_default]), + b_name: np.array(b_default), + c_name: np.array(c_default), + } + + self._test({ + "serialized": ops.convert_to_tensor(""), + "features": { + sparse_name: + parsing_ops.VarLenFeature(dtypes.int64), + a_name: + parsing_ops.FixedLenFeature( + (1, 3), dtypes.int64, default_value=a_default), + b_name: + parsing_ops.FixedLenFeature( + (3, 3), dtypes.string, default_value=b_default), + c_name: + parsing_ops.FixedLenFeature( + (2,), dtypes.float32, default_value=c_default), + } + }, expected_output) + + def testEmptySerializedWithoutDefaultsShouldFail(self): + input_features = { + "st_a": + parsing_ops.VarLenFeature(dtypes.int64), + "a": + parsing_ops.FixedLenFeature( + (1, 3), dtypes.int64, default_value=[0, 42, 0]), + "b": + parsing_ops.FixedLenFeature( + (3, 3), + dtypes.string, + default_value=np.random.rand(3, 3).astype(bytes)), + # Feature "c" is missing a default, this gap will cause 
failure. + "c": + parsing_ops.FixedLenFeature( + (2,), dtype=dtypes.float32), + } + + # Edge case where the key is there but the feature value is empty + original = example(features=features({"c": feature()})) + self._test( + { + "serialized": original.SerializeToString(), + "features": input_features, + }, + expected_err=(errors_impl.OpError, + "Feature: c \\(data type: float\\) is required")) + + # Standard case of missing key and value. + self._test( + { + "serialized": "", + "features": input_features, + }, + expected_err=(errors_impl.OpError, + "Feature: c \\(data type: float\\) is required")) + + def testDenseNotMatchingShapeShouldFail(self): + original = example(features=features({ + "a": float_feature([-1, -1]), + })) + + serialized = original.SerializeToString() + + self._test( + { + "serialized": ops.convert_to_tensor(serialized), + "features": { + "a": parsing_ops.FixedLenFeature((1, 3), dtypes.float32) + } + }, + # TODO(mrry): Consider matching the `tf.parse_example()` error message. 
+ expected_err=(errors_impl.OpError, "Key: a.")) + + def testDenseDefaultNoShapeShouldFail(self): + original = example(features=features({ + "a": float_feature([1, 1, 3]), + })) + + serialized = original.SerializeToString() + + self._test( + { + "serialized": ops.convert_to_tensor(serialized), + "features": { + "a": parsing_ops.FixedLenFeature(None, dtypes.float32) + } + }, + expected_err=(ValueError, "Missing shape for feature a")) + + def testSerializedContainingSparse(self): + original = [ + example(features=features({ + "st_c": float_feature([3, 4]) + })), + example(features=features({ + "st_c": float_feature([]), # empty float list + })), + example(features=features({ + "st_d": feature(), # feature with nothing in it + })), + example(features=features({ + "st_c": float_feature([1, 2, -1]), + "st_d": bytes_feature([b"hi"]) + })) + ] + + expected_outputs = [{ + "st_c": (np.array([[0], [1]], dtype=np.int64), + np.array([3.0, 4.0], dtype=np.float32), + np.array([2], dtype=np.int64)), + "st_d": + empty_sparse(bytes) + }, { + "st_c": empty_sparse(np.float32), + "st_d": empty_sparse(bytes) + }, { + "st_c": empty_sparse(np.float32), + "st_d": empty_sparse(bytes) + }, { + "st_c": (np.array([[0], [1], [2]], dtype=np.int64), + np.array([1.0, 2.0, -1.0], dtype=np.float32), + np.array([3], dtype=np.int64)), + "st_d": (np.array([[0]], dtype=np.int64), np.array(["hi"], dtype=bytes), + np.array([1], dtype=np.int64)) + }] + + for proto, expected_output in zip(original, expected_outputs): + self._test({ + "serialized": ops.convert_to_tensor(proto.SerializeToString()), + "features": { + "st_c": parsing_ops.VarLenFeature(dtypes.float32), + "st_d": parsing_ops.VarLenFeature(dtypes.string) + }, + }, expected_output) + + def testSerializedContainingSparseFeature(self): + original = [ + example(features=features({ + "val": float_feature([3, 4]), + "idx": int64_feature([5, 10]) + })), + example(features=features({ + "val": float_feature([]), # empty float list + "idx": 
int64_feature([]) + })), + example(features=features({ + "val": feature(), # feature with nothing in it + # missing idx feature + })), + example(features=features({ + "val": float_feature([1, 2, -1]), + "idx": + int64_feature([0, 9, 3]) # unsorted + })) + ] + + expected_outputs = [{ + "sp": (np.array([[5], [10]], dtype=np.int64), + np.array([3.0, 4.0], dtype=np.float32), + np.array([13], dtype=np.int64)) + }, { + "sp": empty_sparse(np.float32, shape=[13]) + }, { + "sp": empty_sparse(np.float32, shape=[13]) + }, { + "sp": (np.array([[0], [3], [9]], dtype=np.int64), + np.array([1.0, -1.0, 2.0], dtype=np.float32), + np.array([13], dtype=np.int64)) + }] + + for proto, expected_output in zip(original, expected_outputs): + self._test({ + "serialized": ops.convert_to_tensor(proto.SerializeToString()), + "features": { + "sp": + parsing_ops.SparseFeature(["idx"], "val", dtypes.float32, + [13]) + } + }, expected_output) + + def testSerializedContainingSparseFeatureReuse(self): + original = [ + example(features=features({ + "val1": float_feature([3, 4]), + "val2": float_feature([5, 6]), + "idx": int64_feature([5, 10]) + })), + example(features=features({ + "val1": float_feature([]), # empty float list + "idx": int64_feature([]) + })), + ] + + expected_outputs = [{ + "sp1": (np.array([[5], [10]], dtype=np.int64), + np.array([3.0, 4.0], dtype=np.float32), + np.array([13], dtype=np.int64)), + "sp2": (np.array([[5], [10]], dtype=np.int64), + np.array([5.0, 6.0], dtype=np.float32), + np.array([7], dtype=np.int64)) + }, { + "sp1": empty_sparse(np.float32, shape=[13]), + "sp2": empty_sparse(np.float32, shape=[7]) + }] + + for proto, expected_output in zip(original, expected_outputs): + self._test({ + "serialized": ops.convert_to_tensor(proto.SerializeToString()), + "features": { + "sp1": + parsing_ops.SparseFeature("idx", "val1", dtypes.float32, 13), + "sp2": + parsing_ops.SparseFeature( + "idx", + "val2", + dtypes.float32, + size=7, + already_sorted=True) + } + }, expected_output) 
+ + def testSerializedContaining3DSparseFeature(self): + original = [ + example(features=features({ + "val": float_feature([3, 4]), + "idx0": int64_feature([5, 10]), + "idx1": int64_feature([0, 2]), + })), + example(features=features({ + "val": float_feature([]), # empty float list + "idx0": int64_feature([]), + "idx1": int64_feature([]), + })), + example(features=features({ + "val": feature(), # feature with nothing in it + # missing idx feature + })), + example(features=features({ + "val": float_feature([1, 2, -1]), + "idx0": int64_feature([0, 9, 3]), # unsorted + "idx1": int64_feature([1, 0, 2]), + })) + ] + + expected_outputs = [{ + "sp": (np.array([[5, 0], [10, 2]], dtype=np.int64), + np.array([3.0, 4.0], dtype=np.float32), + np.array([13, 3], dtype=np.int64)) + }, { + "sp": empty_sparse(np.float32, shape=[13, 3]) + }, { + "sp": empty_sparse(np.float32, shape=[13, 3]) + }, { + "sp": (np.array([[0, 1], [3, 2], [9, 0]], dtype=np.int64), + np.array([1.0, -1.0, 2.0], dtype=np.float32), + np.array([13, 3], dtype=np.int64)) + }] + + for proto, expected_output in zip(original, expected_outputs): + self._test({ + "serialized": ops.convert_to_tensor(proto.SerializeToString()), + "features": { + "sp": + parsing_ops.SparseFeature(["idx0", "idx1"], "val", + dtypes.float32, [13, 3]) + } + }, expected_output) + + def testSerializedContainingDense(self): + aname = "a" + bname = "b*has+a:tricky_name" + original = [ + example(features=features({ + aname: float_feature([1, 1]), + bname: bytes_feature([b"b0_str"]), + })), example(features=features({ + aname: float_feature([-1, -1]), + bname: bytes_feature([b"b1"]), + })) + ] + + expected_outputs = [{ + aname: np.array([1, 1], dtype=np.float32).reshape(1, 2, 1), + bname: np.array(["b0_str"], dtype=bytes).reshape(1, 1, 1, 1) + }, { + aname: np.array([-1, -1], dtype=np.float32).reshape(1, 2, 1), + bname: np.array(["b1"], dtype=bytes).reshape(1, 1, 1, 1) + }] + + for proto, expected_output in zip(original, expected_outputs): + # No 
defaults, values required + self._test({ + "serialized": ops.convert_to_tensor(proto.SerializeToString()), + "features": { + aname: + parsing_ops.FixedLenFeature((1, 2, 1), dtype=dtypes.float32), + bname: + parsing_ops.FixedLenFeature( + (1, 1, 1, 1), dtype=dtypes.string), + } + }, expected_output) + + # This test is identical as the previous one except + # for the creation of 'serialized'. + def testSerializedContainingDenseWithConcat(self): + aname = "a" + bname = "b*has+a:tricky_name" + # TODO(lew): Feature appearing twice should be an error in future. + original = [ + (example(features=features({ + aname: float_feature([10, 10]), + })), example(features=features({ + aname: float_feature([1, 1]), + bname: bytes_feature([b"b0_str"]), + }))), + ( + example(features=features({ + bname: bytes_feature([b"b100"]), + })), + example(features=features({ + aname: float_feature([-1, -1]), + bname: bytes_feature([b"b1"]), + })),), + ] + + expected_outputs = [{ + aname: np.array([1, 1], dtype=np.float32).reshape(1, 2, 1), + bname: np.array(["b0_str"], dtype=bytes).reshape(1, 1, 1, 1) + }, { + aname: np.array([-1, -1], dtype=np.float32).reshape(1, 2, 1), + bname: np.array(["b1"], dtype=bytes).reshape(1, 1, 1, 1) + }] + + for (m, n), expected_output in zip(original, expected_outputs): + # No defaults, values required + self._test({ + "serialized": + ops.convert_to_tensor( + m.SerializeToString() + n.SerializeToString()), + "features": { + aname: + parsing_ops.FixedLenFeature((1, 2, 1), dtype=dtypes.float32), + bname: + parsing_ops.FixedLenFeature( + (1, 1, 1, 1), dtype=dtypes.string), + } + }, expected_output) + + def testSerializedContainingDenseScalar(self): + original = [ + example(features=features({ + "a": float_feature([1]), + })), example(features=features({})) + ] + + expected_outputs = [{ + "a": np.array([1], dtype=np.float32) + }, { + "a": np.array([-1], dtype=np.float32) + }] + + for proto, expected_output in zip(original, expected_outputs): + self._test({ + 
"serialized": ops.convert_to_tensor(proto.SerializeToString()), + "features": { + "a": + parsing_ops.FixedLenFeature( + (1,), dtype=dtypes.float32, default_value=-1), + } + }, expected_output) + + def testSerializedContainingDenseWithDefaults(self): + original = [ + example(features=features({ + "a": float_feature([1, 1]), + })), + example(features=features({ + "b": bytes_feature([b"b1"]), + })), + example(features=features({ + "b": feature() + })), + ] + + expected_outputs = [{ + "a": np.array([1, 1], dtype=np.float32).reshape(1, 2, 1), + "b": np.array("tmp_str", dtype=bytes).reshape(1, 1, 1, 1) + }, { + "a": np.array([3, -3], dtype=np.float32).reshape(1, 2, 1), + "b": np.array("b1", dtype=bytes).reshape(1, 1, 1, 1) + }, { + "a": np.array([3, -3], dtype=np.float32).reshape(1, 2, 1), + "b": np.array("tmp_str", dtype=bytes).reshape(1, 1, 1, 1) + }] + + for proto, expected_output in zip(original, expected_outputs): + self._test({ + "serialized": ops.convert_to_tensor(proto.SerializeToString()), + "features": { + "a": + parsing_ops.FixedLenFeature( + (1, 2, 1), + dtype=dtypes.float32, + default_value=[3.0, -3.0]), + "b": + parsing_ops.FixedLenFeature( + (1, 1, 1, 1), + dtype=dtypes.string, + default_value="tmp_str"), + } + }, expected_output) + + def testSerializedContainingSparseAndSparseFeatureAndDenseWithNoDefault(self): + original = [ + example(features=features({ + "c": float_feature([3, 4]), + "val": bytes_feature([b"a", b"b"]), + "idx": int64_feature([0, 3]) + })), example(features=features({ + "c": float_feature([1, 2]), + "val": bytes_feature([b"c"]), + "idx": int64_feature([7]) + })) + ] + + a_default = np.array([[1, 2, 3]], dtype=np.int64) + b_default = np.random.rand(3, 3).astype(bytes) + + expected_st_a = empty_sparse(np.int64) + + expected_outputs = [{ + "st_a": + expected_st_a, + "sp": (np.array([[0], [3]], dtype=np.int64), + np.array(["a", "b"], dtype=bytes), np.array( + [13], dtype=np.int64)), + "a": + a_default, + "b": + b_default, + "c": + 
np.array([3, 4], dtype=np.float32) + }, { + "st_a": + expected_st_a, + "sp": (np.array([[7]], dtype=np.int64), np.array(["c"], dtype=bytes), + np.array([13], dtype=np.int64)), + "a": + a_default, + "b": + b_default, + "c": + np.array([1, 2], dtype=np.float32) + }] + + for proto, expected_output in zip(original, expected_outputs): + self._test( + { + "serialized": ops.convert_to_tensor(proto.SerializeToString()), + "features": { + "st_a": + parsing_ops.VarLenFeature(dtypes.int64), + "sp": + parsing_ops.SparseFeature("idx", "val", dtypes.string, 13 + ), + "a": + parsing_ops.FixedLenFeature( + (1, 3), dtypes.int64, default_value=a_default), + "b": + parsing_ops.FixedLenFeature( + (3, 3), dtypes.string, default_value=b_default), + # Feature "c" must be provided, since it has no default_value. + "c": + parsing_ops.FixedLenFeature((2,), dtypes.float32), + } + }, + expected_output) + + def testSerializedContainingSparseAndSparseFeatureWithReuse(self): + original = [ + example(features=features({ + "val": bytes_feature([b"a", b"b"]), + "idx": int64_feature([0, 3]) + })), example(features=features({ + "val": bytes_feature([b"c", b"d"]), + "idx": int64_feature([7, 1]) + })) + ] + + expected_outputs = [{ + "idx": (np.array([[0], [1]], dtype=np.int64), + np.array([0, 3], dtype=np.int64), np.array([2], + dtype=np.int64)), + "sp": (np.array([[0], [3]], dtype=np.int64), + np.array(["a", "b"], dtype=bytes), np.array( + [13], dtype=np.int64)) + }, + { + "idx": (np.array([[0], [1]], dtype=np.int64), + np.array([7, 1], dtype=np.int64), + np.array([2], dtype=np.int64)), + "sp": (np.array([[1], [7]], dtype=np.int64), + np.array(["d", "c"], dtype=bytes), + np.array([13], dtype=np.int64)) + }] + + for proto, expected_output in zip(original, expected_outputs): + self._test({ + "serialized": ops.convert_to_tensor(proto.SerializeToString()), + "features": { + "idx": + parsing_ops.VarLenFeature(dtypes.int64), + "sp": + parsing_ops.SparseFeature(["idx"], "val", dtypes.string, [13] + ), + } + 
}, expected_output) + + def testSerializedContainingVarLenDense(self): + aname = "a" + bname = "b" + cname = "c" + dname = "d" + original = [ + example(features=features({ + cname: int64_feature([2]), + })), + example(features=features({ + aname: float_feature([1, 1]), + bname: bytes_feature([b"b0_str", b"b1_str"]), + })), + example(features=features({ + aname: float_feature([-1, -1, 2, 2]), + bname: bytes_feature([b"b1"]), + })), + example(features=features({ + aname: float_feature([]), + cname: int64_feature([3]), + })), + ] + + expected_outputs = [ + { + aname: np.empty(shape=(0, 2, 1), dtype=np.int64), + bname: np.empty(shape=(0, 1, 1, 1), dtype=bytes), + cname: np.array([2], dtype=np.int64), + dname: np.empty(shape=(0,), dtype=bytes) + }, + { + aname: + np.array([[[1], [1]]], dtype=np.float32), + bname: + np.array(["b0_str", "b1_str"], dtype=bytes).reshape(2, 1, 1, 1), + cname: + np.empty(shape=(0,), dtype=np.int64), + dname: + np.empty(shape=(0,), dtype=bytes) + }, + { + aname: np.array([[[-1], [-1]], [[2], [2]]], dtype=np.float32), + bname: np.array(["b1"], dtype=bytes).reshape(1, 1, 1, 1), + cname: np.empty(shape=(0,), dtype=np.int64), + dname: np.empty(shape=(0,), dtype=bytes) + }, + { + aname: np.empty(shape=(0, 2, 1), dtype=np.int64), + bname: np.empty(shape=(0, 1, 1, 1), dtype=bytes), + cname: np.array([3], dtype=np.int64), + dname: np.empty(shape=(0,), dtype=bytes) + }, + ] + + for proto, expected_output in zip(original, expected_outputs): + self._test({ + "serialized": ops.convert_to_tensor(proto.SerializeToString()), + "features": { + aname: + parsing_ops.FixedLenSequenceFeature( + (2, 1), dtype=dtypes.float32, allow_missing=True), + bname: + parsing_ops.FixedLenSequenceFeature( + (1, 1, 1), dtype=dtypes.string, allow_missing=True), + cname: + parsing_ops.FixedLenSequenceFeature( + shape=[], dtype=dtypes.int64, allow_missing=True), + dname: + parsing_ops.FixedLenSequenceFeature( + shape=[], dtype=dtypes.string, allow_missing=True), + } + }, 
expected_output) + + # Test with padding values. + # NOTE(mrry): Since we parse a single example at a time, the fixed-length + # sequences will not be padded, and the padding value will be ignored. + for proto, expected_output in zip(original, expected_outputs): + self._test({ + "serialized": ops.convert_to_tensor(proto.SerializeToString()), + "features": { + aname: + parsing_ops.FixedLenSequenceFeature( + (2, 1), dtype=dtypes.float32, allow_missing=True), + bname: + parsing_ops.FixedLenSequenceFeature( + (1, 1, 1), dtype=dtypes.string, allow_missing=True), + cname: + parsing_ops.FixedLenSequenceFeature( + shape=[], dtype=dtypes.int64, allow_missing=True), + dname: + parsing_ops.FixedLenSequenceFeature( + shape=[], dtype=dtypes.string, allow_missing=True), + } + }, expected_output) + + # Change number of required values so the inputs are not a + # multiple of this size. + self._test( + { + "serialized": + ops.convert_to_tensor(original[2].SerializeToString()), + "features": { + aname: + parsing_ops.FixedLenSequenceFeature( + (2, 1), dtype=dtypes.float32, allow_missing=True), + bname: + parsing_ops.FixedLenSequenceFeature( + (2, 1, 1), dtype=dtypes.string, allow_missing=True), + } + }, + # TODO(mrry): Consider matching the `tf.parse_example()` error message. 
+ expected_err=(errors_impl.OpError, "Key: b.")) + + self._test( + { + "serialized": ops.convert_to_tensor(""), + "features": { + aname: + parsing_ops.FixedLenSequenceFeature( + (2, 1), + dtype=dtypes.float32, + allow_missing=True, + default_value=[]), + bname: + parsing_ops.FixedLenSequenceFeature( + (2, 1, 1), dtype=dtypes.string, allow_missing=True), + } + }, + expected_err=(ValueError, + "Cannot reshape a tensor with 0 elements to shape")) + + self._test( + { + "serialized": ops.convert_to_tensor(""), + "features": { + aname: + parsing_ops.FixedLenFeature( + (None, 2, 1), dtype=dtypes.float32), + bname: + parsing_ops.FixedLenSequenceFeature( + (2, 1, 1), dtype=dtypes.string, allow_missing=True), + } + }, + expected_err=(ValueError, + "First dimension of shape for feature a unknown. " + "Consider using FixedLenSequenceFeature.")) + + self._test( + { + "serialized": ops.convert_to_tensor(""), + "features": { + cname: + parsing_ops.FixedLenFeature( + (1, None), dtype=dtypes.int64, default_value=[[1]]), + } + }, + expected_err=(ValueError, + "All dimensions of shape for feature c need to be known " + r"but received \(1, None\).")) + + self._test( + { + "serialized": ops.convert_to_tensor(""), + "features": { + aname: + parsing_ops.FixedLenSequenceFeature( + (2, 1), dtype=dtypes.float32, allow_missing=True), + bname: + parsing_ops.FixedLenSequenceFeature( + (1, 1, 1), dtype=dtypes.string, allow_missing=True), + cname: + parsing_ops.FixedLenSequenceFeature( + shape=[], dtype=dtypes.int64, allow_missing=False), + dname: + parsing_ops.FixedLenSequenceFeature( + shape=[], dtype=dtypes.string, allow_missing=True), + } + }, + expected_err=(ValueError, + "Unsupported: FixedLenSequenceFeature requires " + "allow_missing to be True.")) + + +class ParseSingleExampleTest(test.TestCase): + + def _test(self, kwargs, expected_values=None, expected_err=None): + with self.test_session() as sess: + if expected_err: + with self.assertRaisesWithPredicateMatch(expected_err[0], + 
expected_err[1]): + out = parsing_ops.parse_single_example_v2(**kwargs) + sess.run(flatten_values_tensors_or_sparse(out.values())) + else: + # Returns dict w/ Tensors and SparseTensors. + out = parsing_ops.parse_single_example_v2(**kwargs) + # Check values. + tf_result = sess.run(flatten_values_tensors_or_sparse(out.values())) + _compare_output_to_expected(self, out, expected_values, tf_result) + + # Check shapes. + for k, f in kwargs["features"].items(): + if isinstance(f, parsing_ops.FixedLenFeature) and f.shape is not None: + self.assertEqual(tuple(out[k].get_shape()), + tensor_shape.as_shape(f.shape)) + elif isinstance(f, parsing_ops.VarLenFeature): + self.assertEqual( + tuple(out[k].indices.get_shape().as_list()), (None, 1)) + self.assertEqual(tuple(out[k].values.get_shape().as_list()), (None,)) + self.assertEqual( + tuple(out[k].dense_shape.get_shape().as_list()), (1,)) + + def testSingleExampleWithSparseAndSparseFeatureAndDense(self): + original = example(features=features({ + "c": float_feature([3, 4]), + "d": float_feature([0.0, 1.0]), + "val": bytes_feature([b"a", b"b"]), + "idx": int64_feature([0, 3]), + "st_a": float_feature([3.0, 4.0]) + })) + + serialized = original.SerializeToString() + + expected_st_a = ( + np.array( + [[0], [1]], dtype=np.int64), # indices + np.array( + [3.0, 4.0], dtype=np.float32), # values + np.array( + [2], dtype=np.int64)) # shape: max_values = 2 + + expected_sp = ( # indices, values, shape + np.array( + [[0], [3]], dtype=np.int64), np.array( + ["a", "b"], dtype="|S"), np.array( + [13], dtype=np.int64)) # max_values = 13 + + a_default = [1, 2, 3] + b_default = np.random.rand(3, 3).astype(bytes) + expected_output = { + "st_a": expected_st_a, + "sp": expected_sp, + "a": [a_default], + "b": b_default, + "c": np.array([3, 4], dtype=np.float32), + "d": np.array([0.0, 1.0], dtype=np.float32), + } + + self._test( + { + "serialized": + ops.convert_to_tensor(serialized), + "features": { + "st_a": + 
parsing_ops.VarLenFeature(dtypes.float32), + "sp": + parsing_ops.SparseFeature( + ["idx"], "val", dtypes.string, [13]), + "a": + parsing_ops.FixedLenFeature( + (1, 3), dtypes.int64, default_value=a_default), + "b": + parsing_ops.FixedLenFeature( + (3, 3), dtypes.string, default_value=b_default), + # Feature "c" must be provided, since it has no default_value. + "c": + parsing_ops.FixedLenFeature(2, dtypes.float32), + "d": + parsing_ops.FixedLenSequenceFeature([], + dtypes.float32, + allow_missing=True) + } + }, + expected_output) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/python/ops/parsing_ops.py b/tensorflow/python/ops/parsing_ops.py index 14aef01dec..eba40c4f85 100644 --- a/tensorflow/python/ops/parsing_ops.py +++ b/tensorflow/python/ops/parsing_ops.py @@ -1205,3 +1205,198 @@ def decode_csv(records, record_defaults, field_delim=",", field_delim=field_delim, use_quote_delim=use_quote_delim, na_value=na_value, name=name) # pylint: enable=protected-access + + +# TODO(b/70890287): Combine the implementation of this op and +# `parse_single_example()` after 1/10/2018. +def parse_single_example_v2(serialized, features, name=None): + # pylint: disable=line-too-long + """Parses an `Example` proto into a `dict` of tensors. + + Parses a serialized + [`Example`](https://www.tensorflow.org/code/tensorflow/core/example/example.proto) + proto given in `serialized`. + + This op parses serialized examples into a dictionary mapping keys to `Tensor` + and `SparseTensor` objects. `features` is a dict from keys to `VarLenFeature`, + `SparseFeature`, and `FixedLenFeature` objects. Each `VarLenFeature` + and `SparseFeature` is mapped to a `SparseTensor`, and each + `FixedLenFeature` is mapped to a `Tensor`. + + Each `VarLenFeature` maps to a `SparseTensor` of the specified type + representing a ragged matrix. Its indices are `[index]` where + `index` is the value's index in the list of values associated with + that feature and example. 
+ + Each `SparseFeature` maps to a `SparseTensor` of the specified type + representing a Tensor of `dense_shape` `SparseFeature.size`. + Its `values` come from the feature in the examples with key `value_key`. + A `values[i]` comes from a position `k` in the feature of an example at batch + entry `batch`. This positional information is recorded in `indices[i]` as + `[batch, index_0, index_1, ...]` where `index_j` is the `k-th` value of + the feature in the example with key `SparseFeature.index_key[j]`. + In other words, we split the indices (except the first index indicating the + batch entry) of a `SparseTensor` by dimension into different features of the + `Example`. Due to its complexity a `VarLenFeature` should be preferred over a + `SparseFeature` whenever possible. + + Each `FixedLenFeature` `df` maps to a `Tensor` of the specified type (or + `tf.float32` if not specified) and shape `df.shape`. + + `FixedLenFeature` entries with a `default_value` are optional. With no default + value, we will fail if that `Feature` is missing from any example in + `serialized`. + + Each `FixedLenSequenceFeature` `df` maps to a `Tensor` of the specified type + (or `tf.float32` if not specified) and shape `(None,) + df.shape`. + + Args: + serialized: A scalar (0-D Tensor) string, a serialized `Example` proto. + features: A `dict` mapping feature keys to `FixedLenFeature`, + `VarLenFeature`, and `SparseFeature` values. + name: A name for this operation (optional). + + Returns: + A `dict` mapping feature keys to `Tensor` and `SparseTensor` values. + + Raises: + ValueError: if any feature is invalid. + """ + if not features: + raise ValueError("Missing: features was %s."
% features) + features = _prepend_none_dimension(features) + (sparse_keys, sparse_types, dense_keys, dense_types, + dense_defaults, dense_shapes) = _features_to_raw_params( + features, + [VarLenFeature, SparseFeature, FixedLenFeature, FixedLenSequenceFeature]) + outputs = _parse_single_example_v2_raw(serialized, sparse_keys, sparse_types, + dense_keys, dense_types, + dense_defaults, dense_shapes, name) + return _construct_sparse_tensors_for_sparse_features(features, outputs) + + +def _parse_single_example_v2_raw(serialized, sparse_keys, sparse_types, + dense_keys, dense_types, dense_defaults, + dense_shapes, name): + """Parses `Example` protos. + + Args: + serialized: A scalar (0-D Tensor) string, containing a binary + serialized `Example` proto. + sparse_keys: A list of string keys in the examples' features. + The results for these keys will be returned as `SparseTensor` objects. + sparse_types: A list of `DTypes` of the same length as `sparse_keys`. + Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`), + and `tf.string` (`BytesList`) are supported. + dense_keys: A list of string keys in the examples' features. + The results for these keys will be returned as `Tensor`s + dense_types: A list of DTypes of the same length as `dense_keys`. + Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`), + and `tf.string` (`BytesList`) are supported. + dense_defaults: A dict mapping string keys to `Tensor`s. + The keys of the dict must match the dense_keys of the feature. + dense_shapes: A list of tuples with the same length as `dense_keys`. + The shape of the data for each dense feature referenced by `dense_keys`. + Required for any input tensors identified by `dense_keys`. Must be + either fully defined, or may contain an unknown first dimension. + An unknown first dimension means the feature is treated as having + a variable number of blocks, and the output shape along this dimension + is considered unknown at graph build time. 
Padding is applied for + minibatch elements smaller than the maximum number of blocks for the + given feature along this dimension. + name: A name for this operation (optional). + + Returns: + A `dict` mapping keys to `Tensor`s and `SparseTensor`s. + + Raises: + ValueError: If sparse and dense key sets intersect, or input lengths do not + match up. + """ + with ops.name_scope(name, "ParseSingleExample", [serialized]): + dense_defaults = collections.OrderedDict( + ) if dense_defaults is None else dense_defaults + sparse_keys = [] if sparse_keys is None else sparse_keys + sparse_types = [] if sparse_types is None else sparse_types + dense_keys = [] if dense_keys is None else dense_keys + dense_types = [] if dense_types is None else dense_types + dense_shapes = ([[]] * len(dense_keys) + if dense_shapes is None else dense_shapes) + + num_dense = len(dense_keys) + num_sparse = len(sparse_keys) + + if len(dense_shapes) != num_dense: + raise ValueError("len(dense_shapes) != len(dense_keys): %d vs. %d" % + (len(dense_shapes), num_dense)) + if len(dense_types) != num_dense: + raise ValueError("len(dense_types) != len(num_dense): %d vs. %d" % + (len(dense_types), num_dense)) + if len(sparse_types) != num_sparse: + raise ValueError("len(sparse_types) != len(sparse_keys): %d vs. %d" % + (len(sparse_types), num_sparse)) + if num_dense + num_sparse == 0: + raise ValueError("Must provide at least one sparse key or dense key") + if not set(dense_keys).isdisjoint(set(sparse_keys)): + raise ValueError( + "Dense and sparse keys must not intersect; intersection: %s" % + set(dense_keys).intersection(set(sparse_keys))) + + # Convert dense_shapes to TensorShape object. 
+ dense_shapes = [tensor_shape.as_shape(shape) for shape in dense_shapes] + + dense_defaults_vec = [] + for i, key in enumerate(dense_keys): + default_value = dense_defaults.get(key) + dense_shape = dense_shapes[i] + if (dense_shape.ndims is not None and dense_shape.ndims > 0 and + dense_shape[0].value is None): + # Variable stride dense shape, the default value should be a + # scalar padding value + if default_value is None: + default_value = ops.convert_to_tensor( + "" if dense_types[i] == dtypes.string else 0, + dtype=dense_types[i]) + else: + # Reshape to a scalar to ensure user gets an error if they + # provide a tensor that's not intended to be a padding value + # (0 or 2+ elements). + key_name = "padding_" + re.sub("[^A-Za-z0-9_.\\-/]", "_", key) + default_value = ops.convert_to_tensor( + default_value, dtype=dense_types[i], name=key_name) + default_value = array_ops.reshape(default_value, []) + else: + if default_value is None: + default_value = constant_op.constant([], dtype=dense_types[i]) + elif not isinstance(default_value, ops.Tensor): + key_name = "key_" + re.sub("[^A-Za-z0-9_.\\-/]", "_", key) + default_value = ops.convert_to_tensor( + default_value, dtype=dense_types[i], name=key_name) + default_value = array_ops.reshape(default_value, dense_shape) + + dense_defaults_vec.append(default_value) + + # Finally, convert dense_shapes to TensorShapeProto + dense_shapes = [shape.as_proto() for shape in dense_shapes] + + # pylint: disable=protected-access + outputs = gen_parsing_ops.parse_single_example( + serialized=serialized, + dense_defaults=dense_defaults_vec, + num_sparse=len(sparse_keys), + sparse_keys=sparse_keys, + sparse_types=sparse_types, + dense_keys=dense_keys, + dense_shapes=dense_shapes, + name=name) + # pylint: enable=protected-access + + (sparse_indices, sparse_values, sparse_shapes, dense_values) = outputs + + sparse_tensors = [ + sparse_tensor.SparseTensor(ix, val, shape) + for (ix, val, + shape) in zip(sparse_indices, sparse_values, 
sparse_shapes) + ] + + return dict(zip(sparse_keys + dense_keys, sparse_tensors + dense_values)) -- cgit v1.2.3