diff options
-rw-r--r-- | tensorflow/core/api_def/base_api/api_def_ParseSequenceExample.pbtxt | 112 | ||||
-rw-r--r-- | tensorflow/core/api_def/python_api/api_def_ParseSequenceExample.pbtxt | 4 | ||||
-rw-r--r-- | tensorflow/core/kernels/example_parsing_ops.cc | 165 | ||||
-rw-r--r-- | tensorflow/core/ops/parsing_ops.cc | 93 | ||||
-rw-r--r-- | tensorflow/core/ops/parsing_ops_test.cc | 82 | ||||
-rw-r--r-- | tensorflow/core/util/example_proto_fast_parsing.cc | 228 | ||||
-rw-r--r-- | tensorflow/core/util/example_proto_fast_parsing.h | 3 | ||||
-rw-r--r-- | tensorflow/core/util/example_proto_helper.cc | 53 | ||||
-rw-r--r-- | tensorflow/core/util/example_proto_helper.h | 61 | ||||
-rw-r--r-- | tensorflow/python/kernel_tests/parsing_ops_test.py | 1158 | ||||
-rw-r--r-- | tensorflow/python/ops/parsing_ops.py | 346 | ||||
-rw-r--r-- | tensorflow/tools/api/golden/v1/tensorflow.io.pbtxt | 4 | ||||
-rw-r--r-- | tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt | 4 |
13 files changed, 1726 insertions, 587 deletions
diff --git a/tensorflow/core/api_def/base_api/api_def_ParseSequenceExample.pbtxt b/tensorflow/core/api_def/base_api/api_def_ParseSequenceExample.pbtxt new file mode 100644 index 0000000000..b1cb9a696d --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_ParseSequenceExample.pbtxt @@ -0,0 +1,112 @@ +op { + graph_op_name: "ParseSequenceExample" + in_arg { + name: "serialized" + description: <<END +A vector containing binary serialized SequenceExample protos. +END + } + in_arg { + name: "debug_name" + description: <<END +A vector containing the names of the serialized protos. +May contain, for example, table key (descriptive) name for the +corresponding serialized proto. This is purely useful for debugging +purposes, and the presence of values here has no effect on the output. +May also be an empty vector if no name is available. +END + } + in_arg { + name: "context_dense_defaults" + description: <<END +A list of Ncontext_dense Tensors (some may be empty). +context_dense_defaults[j] provides default values +when the SequenceExample's context map lacks context_dense_key[j]. +If an empty Tensor is provided for context_dense_defaults[j], +then the Feature context_dense_keys[j] is required. +The input type is inferred from context_dense_defaults[j], even when it's +empty. If context_dense_defaults[j] is not empty, its shape must match +context_dense_shapes[j]. +END + } + attr { + name: "feature_list_dense_missing_assumed_empty" + description: <<END +A vector listing the +FeatureList keys which may be missing from the SequenceExamples. If the +associated FeatureList is missing, it is treated as empty. By default, +any FeatureList not listed in this vector must exist in the SequenceExamples. +END + } + attr { + name: "context_sparse_keys" + description: <<END +A list of Ncontext_sparse string Tensors (scalars). +The keys expected in the Examples' features associated with context_sparse +values. 
+END + } + attr { + name: "context_dense_keys" + description: <<END +A list of Ncontext_dense string Tensors (scalars). +The keys expected in the SequenceExamples' context features associated with +dense values. +END + } + attr { + name: "feature_list_sparse_keys" + description: <<END +A list of Nfeature_list_sparse string Tensors +(scalars). The keys expected in the FeatureLists associated with sparse +values. +END + } + attr { + name: "feature_list_dense_keys" + description: <<END +A list of Nfeature_list_dense string Tensors (scalars). +The keys expected in the SequenceExamples' feature_lists associated +with lists of dense values. +END + } + attr { + name: "context_sparse_types" + description: <<END +A list of Ncontext_sparse types; the data types of data in +each context Feature given in context_sparse_keys. +Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList), +DT_INT64 (Int64List), and DT_STRING (BytesList). +END + } + attr { + name: "context_dense_shapes" + description: <<END +A list of Ncontext_dense shapes; the shapes of data in +each context Feature given in context_dense_keys. +The number of elements in the Feature corresponding to context_dense_key[j] +must always equal context_dense_shapes[j].NumEntries(). +The shape of context_dense_values[j] will match context_dense_shapes[j]. +END + } + attr { + name: "feature_list_sparse_types" + description: <<END +A list of Nfeature_list_sparse types; the data types +of data in each FeatureList given in feature_list_sparse_keys. +Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList), +DT_INT64 (Int64List), and DT_STRING (BytesList). +END + } + attr { + name: "feature_list_dense_shapes" + description: <<END +A list of Nfeature_list_dense shapes; the shapes of +data in each FeatureList given in feature_list_dense_keys. +The shape of each Feature in the FeatureList corresponding to +feature_list_dense_key[j] must always equal +feature_list_dense_shapes[j].NumEntries(). 
+END + } + summary: "Transforms a vector of brain.SequenceExample protos (as strings) into typed tensors." +} diff --git a/tensorflow/core/api_def/python_api/api_def_ParseSequenceExample.pbtxt b/tensorflow/core/api_def/python_api/api_def_ParseSequenceExample.pbtxt new file mode 100644 index 0000000000..4a7e75ba0e --- /dev/null +++ b/tensorflow/core/api_def/python_api/api_def_ParseSequenceExample.pbtxt @@ -0,0 +1,4 @@ +op { + graph_op_name: "ParseSequenceExample" + visibility: HIDDEN +} diff --git a/tensorflow/core/kernels/example_parsing_ops.cc b/tensorflow/core/kernels/example_parsing_ops.cc index 83cd0e9b47..528b3c6bf0 100644 --- a/tensorflow/core/kernels/example_parsing_ops.cc +++ b/tensorflow/core/kernels/example_parsing_ops.cc @@ -264,9 +264,168 @@ class ParseSingleExampleOp : public OpKernel { REGISTER_KERNEL_BUILDER(Name("ParseSingleExample").Device(DEVICE_CPU), ParseSingleExampleOp); -class SingleSequenceExampleParserOp : public OpKernel { +class ParseSequenceExampleOp : public OpKernel { public: - explicit SingleSequenceExampleParserOp(OpKernelConstruction* ctx) + explicit ParseSequenceExampleOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, attrs_.Init(ctx)); + } + + void Compute(OpKernelContext* ctx) override { + const Tensor* debug_name; + const Tensor* serialized; + OpInputList context_dense_defaults; + + OP_REQUIRES_OK(ctx, ctx->input("debug_name", &debug_name)); + OP_REQUIRES_OK(ctx, ctx->input("serialized", &serialized)); + OP_REQUIRES_OK(ctx, ctx->input_list("context_dense_defaults", + &context_dense_defaults)); + + bool has_debug_name = (debug_name->NumElements() > 0); + if (has_debug_name) { + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(debug_name->shape()), + errors::InvalidArgument( + "Expected debug_name to be a vector, got shape: ", + debug_name->shape().DebugString())); + } + + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(serialized->shape()), + errors::InvalidArgument( + "Expected serialized to be a vector, got shape: ", 
+ serialized->shape().DebugString())); + + OP_REQUIRES(ctx, context_dense_defaults.size() == attrs_.num_context_dense, + errors::InvalidArgument("Expected len(context_dense_defaults) " + "== len(context_dense_keys) but got: ", + context_dense_defaults.size(), " vs. ", + attrs_.num_context_dense)); + + std::vector<bool> required(attrs_.num_context_dense); + for (int d = 0; d < attrs_.num_context_dense; ++d) { + const Tensor& def_value = context_dense_defaults[d]; + required[d] = (def_value.NumElements() == 0); // No default provided. + + if (def_value.NumElements() > 0) { + OP_REQUIRES(ctx, def_value.shape() == attrs_.context_dense_shapes[d], + errors::InvalidArgument( + "default_value[", d, + "].shape() == ", def_value.shape().DebugString(), + " != context_dense_shapes[", d, + "] == ", attrs_.context_dense_shapes[d].DebugString())); + OP_REQUIRES( + ctx, def_value.dtype() == attrs_.context_dense_types[d], + errors::InvalidArgument( + "context_dense_defaults[", d, "].dtype() == ", + DataTypeString(def_value.dtype()), " != context_dense_types[", + d, "] == ", DataTypeString(attrs_.context_dense_types[d]))); + } + } + + example::Result context_result, feature_list_result; + std::vector<Tensor> dense_feature_lengths; + + example::FastParseExampleConfig context_config; + for (int d = 0; d < attrs_.num_context_dense; ++d) { + context_config.dense.push_back( + {attrs_.context_dense_keys[d], attrs_.context_dense_types[d], + attrs_.context_dense_shapes[d], context_dense_defaults[d], + false /* attrs_.context_variable_length[d] */, + 0 /*attrs_.context_elements_per_stride[d] */}); + } + for (int d = 0; d < attrs_.num_context_sparse; ++d) { + context_config.sparse.push_back( + {attrs_.context_sparse_keys[d], attrs_.context_sparse_types[d]}); + } + example::FastParseExampleConfig feature_list_config; + for (int d = 0; d < attrs_.num_feature_list_dense; ++d) { + DataType dtype = attrs_.feature_list_dense_types[d]; + Tensor default_value = Tensor(dtype, TensorShape({})); + 
feature_list_config.dense.push_back( + {attrs_.feature_list_dense_keys[d], dtype, + attrs_.feature_list_dense_shapes[d], default_value, + (attrs_.feature_list_dense_missing_assumed_empty.count( + attrs_.feature_list_dense_keys[d]) > 0), + 0 /*attrs_.context_elements_per_stride[d] */}); + } + for (int d = 0; d < attrs_.num_feature_list_sparse; ++d) { + feature_list_config.sparse.push_back( + {attrs_.feature_list_sparse_keys[d], + attrs_.feature_list_sparse_types[d]}); + } + + auto serialized_t = serialized->flat<string>(); + auto debug_name_t = debug_name->flat<string>(); + gtl::ArraySlice<string> slice(serialized_t.data(), serialized_t.size()); + gtl::ArraySlice<string> names_slice(debug_name_t.data(), + debug_name_t.size()); + + OP_REQUIRES_OK( + ctx, + FastParseSequenceExample( + context_config, feature_list_config, slice, names_slice, + ctx->device()->tensorflow_cpu_worker_threads()->workers, + &context_result, &feature_list_result, &dense_feature_lengths)); + + OpOutputList context_sparse_indices; + OpOutputList context_sparse_values; + OpOutputList context_sparse_shapes; + OpOutputList context_dense_values; + OpOutputList feature_list_sparse_indices; + OpOutputList feature_list_sparse_values; + OpOutputList feature_list_sparse_shapes; + OpOutputList feature_list_dense_values; + OpOutputList feature_list_dense_lengths; + + OP_REQUIRES_OK(ctx, ctx->output_list("context_sparse_indices", + &context_sparse_indices)); + OP_REQUIRES_OK( + ctx, ctx->output_list("context_sparse_values", &context_sparse_values)); + OP_REQUIRES_OK( + ctx, ctx->output_list("context_sparse_shapes", &context_sparse_shapes)); + OP_REQUIRES_OK( + ctx, ctx->output_list("context_dense_values", &context_dense_values)); + OP_REQUIRES_OK(ctx, ctx->output_list("context_sparse_indices", + &context_sparse_indices)); + OP_REQUIRES_OK(ctx, ctx->output_list("feature_list_sparse_indices", + &feature_list_sparse_indices)); + OP_REQUIRES_OK(ctx, ctx->output_list("feature_list_sparse_values", + 
&feature_list_sparse_values)); + OP_REQUIRES_OK(ctx, ctx->output_list("feature_list_sparse_shapes", + &feature_list_sparse_shapes)); + OP_REQUIRES_OK(ctx, ctx->output_list("feature_list_dense_values", + &feature_list_dense_values)); + OP_REQUIRES_OK(ctx, ctx->output_list("feature_list_dense_lengths", + &feature_list_dense_lengths)); + for (int d = 0; d < attrs_.num_context_dense; ++d) { + context_dense_values.set(d, context_result.dense_values[d]); + } + TensorShape lengths_shape; + lengths_shape.AddDim(serialized_t.size()); + for (int d = 0; d < attrs_.num_feature_list_dense; ++d) { + feature_list_dense_values.set(d, feature_list_result.dense_values[d]); + feature_list_dense_lengths.set(d, dense_feature_lengths[d]); + } + for (int d = 0; d < attrs_.num_context_sparse; ++d) { + context_sparse_indices.set(d, context_result.sparse_indices[d]); + context_sparse_values.set(d, context_result.sparse_values[d]); + context_sparse_shapes.set(d, context_result.sparse_shapes[d]); + } + for (int d = 0; d < attrs_.num_feature_list_sparse; ++d) { + feature_list_sparse_indices.set(d, feature_list_result.sparse_indices[d]); + feature_list_sparse_values.set(d, feature_list_result.sparse_values[d]); + feature_list_sparse_shapes.set(d, feature_list_result.sparse_shapes[d]); + } + } + + protected: + ParseSequenceExampleAttrs attrs_; +}; + +REGISTER_KERNEL_BUILDER(Name("ParseSequenceExample").Device(DEVICE_CPU), + ParseSequenceExampleOp); + +class ParseSingleSequenceExampleOp : public OpKernel { + public: + explicit ParseSingleSequenceExampleOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, attrs_.Init(ctx)); } @@ -658,7 +817,7 @@ class SingleSequenceExampleParserOp : public OpKernel { }; REGISTER_KERNEL_BUILDER(Name("ParseSingleSequenceExample").Device(DEVICE_CPU), - SingleSequenceExampleParserOp); + ParseSingleSequenceExampleOp); #ifndef IS_MOBILE_PLATFORM // when using lite protos on mobile, decoding JSON is not available. 
diff --git a/tensorflow/core/ops/parsing_ops.cc b/tensorflow/core/ops/parsing_ops.cc index ddb714b4e9..79ca96d249 100644 --- a/tensorflow/core/ops/parsing_ops.cc +++ b/tensorflow/core/ops/parsing_ops.cc @@ -132,6 +132,99 @@ REGISTER_OP("ParseSingleExample") return Status::OK(); }); +REGISTER_OP("ParseSequenceExample") + .Input("serialized: string") + .Input("debug_name: string") + .Input("context_dense_defaults: Tcontext_dense") + .Output("context_sparse_indices: Ncontext_sparse * int64") + .Output("context_sparse_values: context_sparse_types") + .Output("context_sparse_shapes: Ncontext_sparse * int64") + .Output("context_dense_values: Tcontext_dense") + .Output("feature_list_sparse_indices: Nfeature_list_sparse * int64") + .Output("feature_list_sparse_values: feature_list_sparse_types") + .Output("feature_list_sparse_shapes: Nfeature_list_sparse * int64") + .Output("feature_list_dense_values: feature_list_dense_types") + .Output("feature_list_dense_lengths: Nfeature_list_dense * int64") + .Attr("feature_list_dense_missing_assumed_empty: list(string) >= 0") + .Attr("context_sparse_keys: list(string) >= 0") + .Attr("context_dense_keys: list(string) >= 0") + .Attr("feature_list_sparse_keys: list(string) >= 0") + .Attr("feature_list_dense_keys: list(string) >= 0") + .Attr("Ncontext_sparse: int >= 0 = 0") + .Attr("Ncontext_dense: int >= 0 = 0") + .Attr("Nfeature_list_sparse: int >= 0 = 0") + .Attr("Nfeature_list_dense: int >= 0 = 0") + .Attr("context_sparse_types: list({float,int64,string}) >= 0 = []") + .Attr("Tcontext_dense: list({float,int64,string}) >= 0 = []") + .Attr("feature_list_dense_types: list({float,int64,string}) >= 0 = []") + .Attr("context_dense_shapes: list(shape) >= 0 = []") + .Attr("feature_list_sparse_types: list({float,int64,string}) >= 0 = []") + .Attr("feature_list_dense_shapes: list(shape) >= 0 = []") + .SetShapeFn([](InferenceContext* c) { + ParseSequenceExampleAttrs attrs; + TF_RETURN_IF_ERROR(attrs.Init(c)); + + // Verify that the input is a 
vector, and carry the shape if known. + ShapeHandle input; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &input)); + shape_inference::DimensionHandle num_examples = c->Dim(input, 0); + + ShapeHandle unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &unused)); // debug_name + + int output_idx = 0; + + // Output context_sparse_indices, context_sparse_values, and + // context_sparse_shapes. + for (int i = 0; i < attrs.num_context_sparse; ++i) { + c->set_output(output_idx++, c->Matrix(c->UnknownDim(), 2)); + } + for (int i = 0; i < attrs.num_context_sparse; ++i) { + c->set_output(output_idx++, c->Vector(c->UnknownDim())); + } + for (int i = 0; i < attrs.num_context_sparse; ++i) { + c->set_output(output_idx++, c->Vector(2)); + } + + // Output context_dense_values. + for (int i = 0; i < attrs.num_context_dense; ++i) { + ShapeHandle s; + TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape( + attrs.context_dense_shapes[i], &s)); + TF_RETURN_IF_ERROR(c->Concatenate(c->Vector(num_examples), s, &s)); + c->set_output(output_idx++, s); + } + + // Output feature_list_sparse_indices, feature_list_sparse_values, + // feature_list_sparse_shapes. + for (int i = 0; i < attrs.num_feature_list_sparse; ++i) { + c->set_output(output_idx++, c->Matrix(c->UnknownDim(), 3)); + } + for (int i = 0; i < attrs.num_feature_list_sparse; ++i) { + c->set_output(output_idx++, c->Vector(c->UnknownDim())); + } + for (int i = 0; i < attrs.num_feature_list_sparse; ++i) { + c->set_output(output_idx++, c->Vector(3)); + } + + // Output feature_list_dense_shapes. + for (int i = 0; i < attrs.num_feature_list_dense; ++i) { + ShapeHandle s; + TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape( + attrs.feature_list_dense_shapes[i], &s)); + TF_RETURN_IF_ERROR( + c->Concatenate(c->Matrix(num_examples, c->UnknownDim()), s, &s)); + c->set_output(output_idx++, s); + } + + // Output feature_list_dense_lengths. 
+ for (int i = 0; i < attrs.num_feature_list_dense; ++i) { + c->set_output(output_idx++, c->Vector(num_examples)); + } + + return Status::OK(); + }); + REGISTER_OP("ParseSingleSequenceExample") .Input("serialized: string") .Input("feature_list_dense_missing_assumed_empty: string") diff --git a/tensorflow/core/ops/parsing_ops_test.cc b/tensorflow/core/ops/parsing_ops_test.cc index 9121d7ae92..c65e66d1a8 100644 --- a/tensorflow/core/ops/parsing_ops_test.cc +++ b/tensorflow/core/ops/parsing_ops_test.cc @@ -143,6 +143,88 @@ TEST(ParsingOpsTest, ParseExample_ShapeFn) { "?;?;?;?;?;?;?;?;?;?"); } +TEST(ParsingOpsTest, ParseSequenceExample_ShapeFn) { + ShapeInferenceTestOp op("ParseSequenceExample"); + auto set_outputs = [&op](int num_context_sparse, int num_context_dense, + int num_feature_list_sparse, + int num_feature_list_dense, + bool add_extra_shape = false) { + using NodeOutList = std::vector<NodeDefBuilder::NodeOut>; + using DataTypeList = std::vector<DataType>; + string string_in("test"); + NodeDefBuilder::NodeOut node_in{"a", 0, DT_STRING}; + TF_ASSERT_OK( + NodeDefBuilder("test", "ParseSequenceExample") + .Input("serialized", 0, DT_STRING) + .Input("debug_name", 0, DT_STRING) + .Input(NodeOutList(num_context_dense, node_in)) + .Attr("Ncontext_sparse", num_context_sparse) + .Attr("Ncontext_dense", num_context_dense) + .Attr("Nfeature_list_sparse", num_feature_list_sparse) + .Attr("Nfeature_list_dense", num_feature_list_dense) + .Attr("feature_list_dense_missing_assumed_empty", + std::vector<string>(num_feature_list_dense, string_in)) + .Attr("context_sparse_keys", + std::vector<string>(num_context_sparse, string_in)) + .Attr("context_dense_keys", + std::vector<string>(num_context_dense, string_in)) + .Attr("feature_list_sparse_keys", + std::vector<string>(num_feature_list_sparse, string_in)) + .Attr("feature_list_dense_keys", + std::vector<string>(num_feature_list_dense, string_in)) + .Attr("context_sparse_types", + DataTypeList(num_context_sparse, DT_FLOAT)) + 
.Attr("context_dense_types", + DataTypeList(num_context_dense, DT_FLOAT)) + .Attr("context_dense_shapes", + MakeDenseShapes(num_context_dense, add_extra_shape, 0)) + .Attr("feature_list_sparse_types", + DataTypeList(num_feature_list_sparse, DT_FLOAT)) + .Attr("feature_list_dense_types", + DataTypeList(num_feature_list_dense, DT_FLOAT)) + .Attr("feature_list_dense_shapes", + MakeDenseShapes(num_feature_list_dense, add_extra_shape, 0)) + .Finalize(&op.node_def)); + }; + + // Verify inputs 'serialized' and 'debug_name'. + set_outputs(0, 0, 0, 0); + INFER_OK(op, "[?];[?]", ""); + INFER_OK(op, "[8];[8]", ""); + INFER_ERROR("must be rank 1", op, "[];[?]"); + INFER_ERROR("must be rank 1", op, "[?];[]"); + + // context inputs with no feature_list inputs. + set_outputs(2 /* num_context_sparse */, 3 /* num_context_dense */, 0, 0); + INFER_OK(op, "[?];[?];?;?;?", + ("[?,2];[?,2];[?];[?];[2];[2];" // context sparse + "[d0_0,1];[d0_0,1,2];[d0_0,1,2,3]")); // context dense + + // feature_list inputs with no context inputs. + set_outputs(0, 0, 2 /* num_feature_list_sparse */, + 3 /* num_feature_list_dense */); + INFER_OK(op, "[?];[?]", + ("[?,3];[?,3];[?];[?];[3];[3];" // feature_list sparse + "[d0_0,?,1];[d0_0,?,1,2];[d0_0,?,1,2,3];" // feature_list dense + "[d0_0];[d0_0];[d0_0]")); // feature_list length + + // Combine previous two test cases. + set_outputs(2, 3, 2, 3); + INFER_OK(op, "[7];[7];?;?;?", + ("[?,2];[?,2];[?];[?];[2];[2];" // context sparse + "[d0_0,1];[d0_0,1,2];[d0_0,1,2,3];" // context dense + "[?,3];[?,3];[?];[?];[3];[3];" // feature_list sparse + "[d0_0,?,1];[d0_0,?,1,2];[d0_0,?,1,2,3];" // feature_list dense + "[d0_0];[d0_0];[d0_0]")); // feature_list length + + // Confirm an error from ParseSequenceExampleAttrs.Init(). 
+ set_outputs(1, 1, 1, 1, true /* add_extra_shape */); + INFER_ERROR( + "num_context_dense (1) must match the size of context_dense_keys (1), " + "context_dense_types (1) and context_dense_shapes (2)", + op, "[?];[?];?"); +} + TEST(ParsingOpsTest, ParseSingleSequenceExample_ShapeFn) { ShapeInferenceTestOp op("ParseSingleSequenceExample"); auto set_outputs = [&op](int num_context_sparse, int num_context_dense, diff --git a/tensorflow/core/util/example_proto_fast_parsing.cc b/tensorflow/core/util/example_proto_fast_parsing.cc index a38cd1d09f..e52d55e2ff 100644 --- a/tensorflow/core/util/example_proto_fast_parsing.cc +++ b/tensorflow/core/util/example_proto_fast_parsing.cc @@ -1722,10 +1722,11 @@ Status FastParseSequenceExample( const FastParseExampleConfig& feature_list_config, gtl::ArraySlice<string> serialized, gtl::ArraySlice<string> example_names, thread::ThreadPool* thread_pool, Result* context_result, - Result* feature_list_result) { + Result* feature_list_result, std::vector<Tensor>* dense_feature_lengths) { int num_examples = serialized.size(); DCHECK(context_result != nullptr); DCHECK(feature_list_result != nullptr); + DCHECK(dense_feature_lengths != nullptr); std::map<StringPiece, bool> context_is_sparse; std::map<StringPiece, std::pair<DataType, size_t>> context_feature_type_and_lengths; @@ -1740,9 +1741,22 @@ Status FastParseSequenceExample( context_is_sparse[c.feature_name] = true; } for (auto& c : context_config.dense) { + if (context_is_sparse[c.feature_name]) { + return errors::InvalidArgument("Context feature " + c.feature_name + + " cannot be both dense and sparse"); + } TF_RETURN_IF_ERROR(CheckConfigDataType(c.dtype)); context_feature_type_and_lengths[c.feature_name] = - std::make_pair(c.dtype, 0); + std::make_pair(c.dtype, c.default_value.NumElements()); + if (c.default_value.NumElements() > 0) { + if (!c.shape.IsCompatibleWith(c.default_value.shape())) { + return errors::InvalidArgument("Default value for context feature ", + c.feature_name, + " 
has an incorrect shape: saw ", + c.default_value.shape().DebugString(), + " but expected ", c.shape.DebugString()); + } + } context_is_sparse[c.feature_name] = false; } std::map<StringPiece, bool> sequence_is_sparse; @@ -1755,6 +1769,10 @@ Status FastParseSequenceExample( sequence_is_sparse[c.feature_name] = true; } for (auto& c : feature_list_config.dense) { + if (sequence_is_sparse[c.feature_name]) { + return errors::InvalidArgument("Sequence feature " + c.feature_name + + " cannot be both dense and sparse"); + } TF_RETURN_IF_ERROR(CheckConfigDataType(c.dtype)); sequence_feature_type_and_lengths[c.feature_name] = std::make_pair(c.dtype, 0); @@ -1792,14 +1810,14 @@ Status FastParseSequenceExample( features = sequence_features; config = &sequence_feature_type_and_lengths; } else if (!SkipExtraneousTag(&stream)) { - return errors::InvalidArgument(strings::StrCat( - "Invalid protocol message input, example id: ", example_name)); + return errors::InvalidArgument( + "Invalid protocol message input, example id: ", example_name); } if (features != nullptr) { uint32 length; if (!stream.ReadVarint32(&length)) { - return errors::InvalidArgument(strings::StrCat( - "Invalid protocol message input, example id: ", example_name)); + return errors::InvalidArgument( + "Invalid protocol message input, example id: ", example_name); } auto limit = stream.PushLimit(length); while (!stream.ExpectAtEnd()) { @@ -1807,16 +1825,16 @@ Status FastParseSequenceExample( uint32 length; if (!stream.ExpectTag(kDelimitedTag(1)) || !stream.ReadVarint32(&length)) { - return errors::InvalidArgument(strings::StrCat( - "Invalid protocol message input, example id: ", example_name)); + return errors::InvalidArgument( + "Invalid protocol message input, example id: ", example_name); } auto limit = stream.PushLimit(length); if (!stream.ExpectTag(kDelimitedTag(1)) || !ParseString(&stream, &key) || !stream.ExpectTag(kDelimitedTag(2)) || !ParseString(&stream, &value) || !stream.ExpectAtEnd()) { - return 
errors::InvalidArgument(strings::StrCat( - "Invalid protocol message input, example id: ", example_name)); + return errors::InvalidArgument( + "Invalid protocol message input, example id: ", example_name); } stream.PopLimit(limit); // Only save if this feature was requested. @@ -1851,9 +1869,8 @@ Status FastParseSequenceExample( break; } if (num == -1) { - return errors::InvalidArgument( - strings::StrCat("Error in context feature ", c.first, - " in example ", example_name)); + return errors::InvalidArgument("Error in context feature ", c.first, + " in example ", example_name); } num_elements += num; } @@ -1876,9 +1893,9 @@ Status FastParseSequenceExample( uint32 feature_length; if (!stream.ExpectTag(kDelimitedTag(1)) || !stream.ReadVarint32(&feature_length)) { - return errors::InvalidArgument( - strings::StrCat("Error in sequence feature ", c.first, - " in example ", example_name)); + return errors::InvalidArgument("Error in sequence feature ", + c.first, " in example ", + example_name); } if (feature_length > 2) { auto limit = stream.PushLimit(feature_length); @@ -1898,22 +1915,22 @@ Status FastParseSequenceExample( break; } if (num == -1) { - return errors::InvalidArgument( - strings::StrCat("Error in sequence feature ", c.first, - " in example ", example_name)); + return errors::InvalidArgument("Error in sequence feature ", + c.first, " in example ", + example_name); } num_elements += num; stream.PopLimit(limit); } else if (feature_length == 2) { if (!SkipEmptyFeature(&stream, dtype)) { - return errors::InvalidArgument( - strings::StrCat("Error in sequence feature ", c.first, - " in example ", example_name)); + return errors::InvalidArgument("Error in sequence feature ", + c.first, " in example ", + example_name); } } else if (feature_length != 0) { - return errors::InvalidArgument( - strings::StrCat("Error in sequence feature ", c.first, - " in example ", example_name)); + return errors::InvalidArgument("Error in sequence feature ", + c.first, " in example ", 
+ example_name); } } } @@ -1936,15 +1953,19 @@ Status FastParseSequenceExample( feature_list_result->sparse_indices.resize(feature_list_config.sparse.size()); feature_list_result->sparse_shapes.resize(feature_list_config.sparse.size()); feature_list_result->dense_values.resize(feature_list_config.dense.size()); + dense_feature_lengths->resize(feature_list_config.dense.size()); + int t = 0; for (const auto& c : context_config.dense) { - TensorShape dense_shape; + TensorShape dense_shape, example_shape; DataType dtype = c.dtype; - size_t expected_max_elements = + const size_t expected_max_elements = context_feature_type_and_lengths[c.feature_name].second; - if (expected_max_elements != dense_shape.num_elements()) { - return errors::InvalidArgument(strings::StrCat( - "Inconsistent number of elements for feature ", c.feature_name)); + if (!c.shape.AsTensorShape(&example_shape) || + expected_max_elements != example_shape.num_elements()) { + return errors::InvalidArgument( + "Inconsistent number of elements for feature ", c.feature_name, ": ", + expected_max_elements, " vs ", dense_shape.num_elements()); } dense_shape.AddDim(num_examples); for (const int dim : c.shape.dim_sizes()) { @@ -1968,18 +1989,58 @@ Status FastParseSequenceExample( out_int64 = context_result->dense_values[t].flat<int64>().data(); break; default: - return errors::InvalidArgument(strings::StrCat( - "Unexpected dtype ", dtype, " in feature ", c.feature_name)); + return errors::InvalidArgument("Unexpected dtype ", dtype, + " in feature ", c.feature_name); } t++; // Fill in the values. for (int e = 0; e < num_examples; e++) { size_t num_elements = 0; - const auto& feature = all_context_features[e][c.feature_name]; + const auto feature_iter = all_context_features[e].find(c.feature_name); const string& example_name = example_names.empty() ? kUnknown : example_names[e]; - if (!feature.empty()) { + if (feature_iter == all_context_features[e].end()) { + // Copy the default value, if present. 
If not, return an error. + if (c.default_value.NumElements() == 0) { + return errors::InvalidArgument( + "Feature: ", c.feature_name, + " (data type: ", DataTypeString(c.dtype), ")", + " is required but could not be found."); + } + const string* in_bytes = nullptr; + const float* in_float = nullptr; + const int64* in_int64 = nullptr; + size_t num = 0; + switch (dtype) { + case DT_STRING: + in_bytes = c.default_value.flat<string>().data(); + num = c.default_value.NumElements(); + for (int p = 0; p < num; p++) { + *out_bytes++ = *in_bytes++; + } + break; + case DT_FLOAT: + in_float = c.default_value.flat<float>().data(); + num = c.default_value.NumElements(); + for (int p = 0; p < num; p++) { + *out_float++ = *in_float++; + } + break; + case DT_INT64: + in_int64 = c.default_value.flat<int64>().data(); + num = c.default_value.NumElements(); + for (int p = 0; p < num; p++) { + *out_int64++ = *in_int64++; + } + break; + default: + return errors::InvalidArgument("Unexpected dtype ", dtype, + " in example ", example_name); + } + num_elements += num; + } else if (!feature_iter->second.empty()) { + const auto& feature = feature_iter->second; protobuf::io::CodedInputStream stream( reinterpret_cast<const uint8*>(feature.data()), feature.size()); EnableAliasing(&stream); @@ -1998,14 +2059,14 @@ Status FastParseSequenceExample( out_int64 += num_added; break; default: - return errors::InvalidArgument(strings::StrCat( - "Unexpected dtype ", dtype, " in example ", example_name)); + return errors::InvalidArgument("Unexpected dtype ", dtype, + " in example ", example_name); } num_elements += num_added; } if (num_elements != expected_max_elements) { - return errors::InvalidArgument(strings::StrCat( - "Unexpected number of elements in example ", example_name)); + return errors::InvalidArgument( + "Unexpected number of elements in example ", example_name); } } } @@ -2037,8 +2098,8 @@ Status FastParseSequenceExample( out_int64 = context_result->sparse_values[t].flat<int64>().data(); 
break; default: - return errors::InvalidArgument(strings::StrCat( - "Unexpected dtype ", dtype, " in feature ", c.feature_name)); + return errors::InvalidArgument("Unexpected dtype ", dtype, + " in feature ", c.feature_name); } int64* out_indices = context_result->sparse_indices[t].flat<int64>().data(); auto out_shape = context_result->sparse_shapes[t].vec<int64>(); @@ -2070,8 +2131,8 @@ Status FastParseSequenceExample( out_int64 += num_added; break; default: - return errors::InvalidArgument(strings::StrCat( - "Unexpected dtype ", dtype, " in example ", example_name)); + return errors::InvalidArgument("Unexpected dtype ", dtype, + " in example ", example_name); } num_elements += num_added; max_num_cols = std::max(max_num_cols, num_added); @@ -2082,30 +2143,35 @@ Status FastParseSequenceExample( } } if (num_elements != expected_num_elements) { - return errors::InvalidArgument(strings::StrCat( - "Unexpected total number of elements in feature ", c.feature_name)); + return errors::InvalidArgument( + "Unexpected total number of elements in feature ", c.feature_name); } out_shape(0) = num_examples; out_shape(1) = max_num_cols; } t = 0; + TensorShape dense_length_shape({num_examples}); for (const auto& c : feature_list_config.dense) { TensorShape dense_shape, row_shape; DataType dtype = c.dtype; - size_t expected_max_elements = + const size_t expected_max_elements = sequence_feature_type_and_lengths[c.feature_name].second; - int64 expected_max_rows = expected_max_elements / row_shape.num_elements(); if (!c.shape.AsTensorShape(&row_shape) || - expected_max_elements != expected_max_rows * row_shape.num_elements()) { - return errors::InvalidArgument(strings::StrCat( - "Unexpected shape error in feature ", c.feature_name)); + expected_max_elements != + (expected_max_elements / row_shape.num_elements()) * + row_shape.num_elements()) { + return errors::InvalidArgument("Unexpected shape error in feature ", + c.feature_name); } + int64 expected_max_rows = expected_max_elements / 
row_shape.num_elements(); dense_shape.AddDim(num_examples); dense_shape.AddDim(expected_max_rows); for (const int dim : feature_list_config.dense[t].shape.dim_sizes()) { dense_shape.AddDim(dim); } feature_list_result->dense_values[t] = Tensor(dtype, dense_shape); + (*dense_feature_lengths)[t] = Tensor(DT_INT64, dense_length_shape); + int64* out_lengths = (*dense_feature_lengths)[t].flat<int64>().data(); string* out_bytes = nullptr; float* out_float = nullptr; @@ -2121,18 +2187,26 @@ Status FastParseSequenceExample( out_int64 = feature_list_result->dense_values[t].flat<int64>().data(); break; default: - return errors::InvalidArgument(strings::StrCat( - "Unexpected dtype ", dtype, " in feature ", c.feature_name)); + return errors::InvalidArgument("Unexpected dtype ", dtype, + " in feature ", c.feature_name); } t++; // Fill in the values. for (int e = 0; e < num_examples; e++) { - size_t num_elements = 0; - const auto& feature = all_sequence_features[e][c.feature_name]; + size_t num_elements = 0, num_rows = 0; + const auto feature_iter = all_sequence_features[e].find(c.feature_name); const string& example_name = example_names.empty() ? kUnknown : example_names[e]; - if (!feature.empty()) { + if (feature_iter == all_sequence_features[e].end()) { + // Return an error if this feature was not allowed to be missing. + // Otherwise, we'll pad as needed below. 
+ if (!c.variable_length) { + return errors::InvalidArgument("Missing feature ", c.feature_name, + " in example ", example_name); + } + } else if (!feature_iter->second.empty()) { + const auto& feature = feature_iter->second; protobuf::io::CodedInputStream stream( reinterpret_cast<const uint8*>(feature.data()), feature.size()); EnableAliasing(&stream); @@ -2140,9 +2214,9 @@ Status FastParseSequenceExample( uint32 feature_length; if (!stream.ExpectTag(kDelimitedTag(1)) || !stream.ReadVarint32(&feature_length)) { - return errors::InvalidArgument( - strings::StrCat("Error in sequence feature ", c.feature_name, - " in example ", example_name)); + return errors::InvalidArgument("Error in sequence feature ", + c.feature_name, " in example ", + example_name); } auto limit = stream.PushLimit(feature_length); size_t num_added; @@ -2160,10 +2234,11 @@ Status FastParseSequenceExample( out_int64 += num_added; break; default: - return errors::InvalidArgument(strings::StrCat( - "Unexpected dtype ", dtype, " in example ", example_name)); + return errors::InvalidArgument("Unexpected dtype ", dtype, + " in example ", example_name); } num_elements += num_added; + num_rows++; if (num_added != row_shape.num_elements()) { return errors::InvalidArgument( "Unexpected number of elements in feature ", c.feature_name, @@ -2172,6 +2247,7 @@ Status FastParseSequenceExample( stream.PopLimit(limit); } } + *out_lengths++ = num_rows; // Pad as necessary. 
int num_to_pad = expected_max_elements - num_elements; switch (dtype) { @@ -2187,8 +2263,8 @@ Status FastParseSequenceExample( out_int64 += num_to_pad; break; default: - return errors::InvalidArgument(strings::StrCat( - "Unexpected dtype ", dtype, " in example ", example_name)); + return errors::InvalidArgument("Unexpected dtype ", dtype, + " in example ", example_name); } } } @@ -2219,8 +2295,8 @@ Status FastParseSequenceExample( out_int64 = feature_list_result->sparse_values[t].flat<int64>().data(); break; default: - return errors::InvalidArgument(strings::StrCat( - "Unexpected dtype ", dtype, " in feature ", c.feature_name)); + return errors::InvalidArgument("Unexpected dtype ", dtype, + " in feature ", c.feature_name); } int64* out_indices = feature_list_result->sparse_indices[t].flat<int64>().data(); @@ -2244,9 +2320,9 @@ Status FastParseSequenceExample( uint32 feature_length; if (!stream.ExpectTag(kDelimitedTag(1)) || !stream.ReadVarint32(&feature_length)) { - return errors::InvalidArgument( - strings::StrCat("Error in sequence feature ", c.feature_name, - " in example ", example_name)); + return errors::InvalidArgument("Error in sequence feature ", + c.feature_name, " in example ", + example_name); } if (feature_length > 2) { auto limit = stream.PushLimit(feature_length); @@ -2265,8 +2341,8 @@ Status FastParseSequenceExample( out_int64 += num_added; break; default: - return errors::InvalidArgument(strings::StrCat( - "Unexpected dtype ", dtype, " in example ", example_name)); + return errors::InvalidArgument("Unexpected dtype ", dtype, + " in example ", example_name); } num_elements += num_added; max_num_cols = std::max(max_num_cols, num_added); @@ -2278,14 +2354,14 @@ Status FastParseSequenceExample( stream.PopLimit(limit); } else if (feature_length == 2) { if (!SkipEmptyFeature(&stream, dtype)) { - return errors::InvalidArgument( - strings::StrCat("Error in sequence feature ", c.feature_name, - " in example ", example_name)); + return 
errors::InvalidArgument("Error in sequence feature ", + c.feature_name, " in example ", + example_name); } } else if (feature_length != 0) { - return errors::InvalidArgument( - strings::StrCat("Error in sequence feature ", c.feature_name, - " in example ", example_name)); + return errors::InvalidArgument("Error in sequence feature ", + c.feature_name, " in example ", + example_name); } num_rows++; } @@ -2293,8 +2369,8 @@ Status FastParseSequenceExample( } } if (num_elements != expected_num_elements) { - return errors::InvalidArgument(strings::StrCat( - "Unexpected number of elements in feature ", c.feature_name)); + return errors::InvalidArgument( + "Unexpected number of elements in feature ", c.feature_name); } out_shape(0) = num_examples; out_shape(1) = max_num_rows; diff --git a/tensorflow/core/util/example_proto_fast_parsing.h b/tensorflow/core/util/example_proto_fast_parsing.h index db5b5ff929..055d9c2c30 100644 --- a/tensorflow/core/util/example_proto_fast_parsing.h +++ b/tensorflow/core/util/example_proto_fast_parsing.h @@ -118,7 +118,8 @@ Status FastParseSequenceExample( const example::FastParseExampleConfig& feature_list_config, gtl::ArraySlice<string> serialized, gtl::ArraySlice<string> example_names, thread::ThreadPool* thread_pool, example::Result* context_result, - example::Result* feature_list_result); + example::Result* feature_list_result, + std::vector<Tensor>* dense_feature_lengths); // This function parses serialized Example and populates given example. // It uses the same specialized parser as FastParseExample which is efficient. 
diff --git a/tensorflow/core/util/example_proto_helper.cc b/tensorflow/core/util/example_proto_helper.cc index e156a3bc8f..41fb20c00a 100644 --- a/tensorflow/core/util/example_proto_helper.cc +++ b/tensorflow/core/util/example_proto_helper.cc @@ -443,6 +443,59 @@ Status ParseSingleExampleAttrs::FinishInit() { return Status::OK(); } +Status ParseSequenceExampleAttrs::FinishInit() { + if (num_context_sparse != context_sparse_keys.size() || + num_context_sparse != context_sparse_types.size()) { + return errors::InvalidArgument( + "num_context_sparse (", num_context_sparse, + ") must match the size of context_sparse_keys (", + context_sparse_keys.size(), ") and context_sparse_types (", + context_sparse_types.size(), ")"); + } + if (num_context_dense != context_dense_keys.size() || + num_context_dense != context_dense_types.size() || + num_context_dense != context_dense_shapes.size()) { + return errors::InvalidArgument( + "num_context_dense (", num_context_dense, + ") must match the size of context_dense_keys (", + context_dense_keys.size(), "), context_dense_types (", + context_dense_types.size(), ") and context_dense_shapes (", + context_dense_shapes.size(), ")"); + } + if (num_feature_list_sparse != feature_list_sparse_keys.size() || + num_feature_list_sparse != feature_list_sparse_types.size()) { + return errors::InvalidArgument( + "num_feature_list_sparse (", num_feature_list_sparse, + ") must match the size of feature_list_sparse_keys (", + feature_list_sparse_keys.size(), ") and feature_list_sparse_types (", + feature_list_sparse_types.size(), ")"); + } + if (num_feature_list_dense != feature_list_dense_keys.size() || + num_feature_list_dense != feature_list_dense_types.size() || + num_feature_list_dense != feature_list_dense_shapes.size()) { + return errors::InvalidArgument( + "num_feature_list_dense (", num_feature_list_dense, + ") must match the size of feature_list_dense_keys (", + feature_list_dense_keys.size(), "), feature_list_dense_types (", + 
feature_list_dense_types.size(), ") and feature_list_dense_shapes (", + feature_list_dense_shapes.size(), ")"); + } + for (const DataType& type : context_dense_types) { + TF_RETURN_IF_ERROR(CheckValidType(type)); + } + for (const DataType& type : context_sparse_types) { + TF_RETURN_IF_ERROR(CheckValidType(type)); + } + for (const DataType& type : feature_list_dense_types) { + TF_RETURN_IF_ERROR(CheckValidType(type)); + } + for (const DataType& type : feature_list_sparse_types) { + TF_RETURN_IF_ERROR(CheckValidType(type)); + } + + return Status::OK(); +} + Status ParseSingleSequenceExampleAttrs::FinishInit() { if (static_cast<size_t>(num_context_sparse) != context_sparse_types.size()) { return errors::InvalidArgument( diff --git a/tensorflow/core/util/example_proto_helper.h b/tensorflow/core/util/example_proto_helper.h index e511704962..c183ee4d96 100644 --- a/tensorflow/core/util/example_proto_helper.h +++ b/tensorflow/core/util/example_proto_helper.h @@ -26,6 +26,7 @@ limitations under the License. #include "tensorflow/core/framework/partial_tensor_shape.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/sparse/sparse_tensor.h" @@ -271,6 +272,66 @@ class ParseSingleExampleAttrs { Status FinishInit(); // for context-independent parts of Init. }; +// Parses the attributes passed to ParseSequenceExample. +// REQUIRES: Init must be called after construction. 
+class ParseSequenceExampleAttrs { + public: + template <typename ContextType> + Status Init(ContextType* ctx) { + std::vector<string> feature_list_dense_missing_assumed_empty_tmp; + TF_RETURN_IF_ERROR( + ctx->GetAttr("feature_list_dense_missing_assumed_empty", + &feature_list_dense_missing_assumed_empty_tmp)); + for (const string& feature : feature_list_dense_missing_assumed_empty_tmp) { + feature_list_dense_missing_assumed_empty.insert(feature); + } + TF_RETURN_IF_ERROR( + ctx->GetAttr("context_sparse_keys", &context_sparse_keys)); + TF_RETURN_IF_ERROR(ctx->GetAttr("context_dense_keys", &context_dense_keys)); + TF_RETURN_IF_ERROR( + ctx->GetAttr("feature_list_sparse_keys", &feature_list_sparse_keys)); + TF_RETURN_IF_ERROR( + ctx->GetAttr("feature_list_dense_keys", &feature_list_dense_keys)); + TF_RETURN_IF_ERROR( + ctx->GetAttr("context_sparse_types", &context_sparse_types)); + TF_RETURN_IF_ERROR(ctx->GetAttr("Ncontext_dense", &num_context_dense)); + TF_RETURN_IF_ERROR( + ctx->GetAttr("Nfeature_list_dense", &num_feature_list_dense)); + TF_RETURN_IF_ERROR(ctx->GetAttr("Ncontext_sparse", &num_context_sparse)); + TF_RETURN_IF_ERROR(ctx->GetAttr("Tcontext_dense", &context_dense_types)); + TF_RETURN_IF_ERROR( + ctx->GetAttr("feature_list_sparse_types", &feature_list_sparse_types)); + TF_RETURN_IF_ERROR( + ctx->GetAttr("feature_list_dense_types", &feature_list_dense_types)); + TF_RETURN_IF_ERROR( + ctx->GetAttr("Nfeature_list_sparse", &num_feature_list_sparse)); + TF_RETURN_IF_ERROR( + ctx->GetAttr("context_dense_shapes", &context_dense_shapes)); + TF_RETURN_IF_ERROR( + ctx->GetAttr("feature_list_dense_shapes", &feature_list_dense_shapes)); + return FinishInit(); + } + + std::unordered_set<string> feature_list_dense_missing_assumed_empty; + int64 num_context_sparse; + int64 num_context_dense; + int64 num_feature_list_sparse; + int64 num_feature_list_dense; + std::vector<string> context_sparse_keys; + std::vector<string> context_dense_keys; + std::vector<string> 
feature_list_sparse_keys; + std::vector<string> feature_list_dense_keys; + std::vector<DataType> context_sparse_types; + std::vector<DataType> context_dense_types; + std::vector<TensorShape> context_dense_shapes; + std::vector<DataType> feature_list_sparse_types; + std::vector<DataType> feature_list_dense_types; + std::vector<TensorShape> feature_list_dense_shapes; + + private: + Status FinishInit(); // for context-independent parts of Init. +}; + // Parses the attributes passed to ParseSingleSequenceExample. // REQUIRES: Init must be called after construction. class ParseSingleSequenceExampleAttrs { diff --git a/tensorflow/python/kernel_tests/parsing_ops_test.py b/tensorflow/python/kernel_tests/parsing_ops_test.py index 59b3ee2013..7dff4501cc 100644 --- a/tensorflow/python/kernel_tests/parsing_ops_test.py +++ b/tensorflow/python/kernel_tests/parsing_ops_test.py @@ -60,8 +60,9 @@ def flatten(list_of_lists): def flatten_values_tensors_or_sparse(tensors_list): """Flatten each SparseTensor object into 3 Tensors for session.run().""" return list( - flatten([[v.indices, v.values, v.dense_shape] if isinstance( - v, sparse_tensor.SparseTensor) else [v] for v in tensors_list])) + flatten([[v.indices, v.values, v.dense_shape] + if isinstance(v, sparse_tensor.SparseTensor) else [v] + for v in tensors_list])) def _compare_output_to_expected(tester, dict_tensors, expected_tensors, @@ -106,8 +107,9 @@ class ParseExampleTest(test.TestCase): # Check shapes; if serialized is a Tensor we need its size to # properly check. 
serialized = kwargs["serialized"] - batch_size = (serialized.eval().size if isinstance(serialized, ops.Tensor) - else np.asarray(serialized).size) + batch_size = ( + serialized.eval().size if isinstance(serialized, ops.Tensor) else + np.asarray(serialized).size) for k, f in kwargs["features"].items(): if isinstance(f, parsing_ops.FixedLenFeature) and f.shape is not None: self.assertEqual( @@ -129,12 +131,9 @@ class ParseExampleTest(test.TestCase): c_default = np.random.rand(2).astype(np.float32) expected_st_a = ( # indices, values, shape - np.empty( - (0, 2), dtype=np.int64), # indices - np.empty( - (0,), dtype=np.int64), # sp_a is DT_INT64 - np.array( - [2, 0], dtype=np.int64)) # batch == 2, max_elems = 0 + np.empty((0, 2), dtype=np.int64), # indices + np.empty((0,), dtype=np.int64), # sp_a is DT_INT64 + np.array([2, 0], dtype=np.int64)) # batch == 2, max_elems = 0 expected_output = { sparse_name: expected_st_a, @@ -143,28 +142,23 @@ class ParseExampleTest(test.TestCase): c_name: np.array(2 * [c_default]), } - self._test( - { - "example_names": - np.empty( - (0,), dtype=bytes), - "serialized": - ops.convert_to_tensor(["", ""]), - "features": { - sparse_name: - parsing_ops.VarLenFeature(dtypes.int64), - a_name: - parsing_ops.FixedLenFeature( - (1, 3), dtypes.int64, default_value=a_default), - b_name: - parsing_ops.FixedLenFeature( - (3, 3), dtypes.string, default_value=b_default), - c_name: - parsing_ops.FixedLenFeature( - (2,), dtypes.float32, default_value=c_default), - } - }, - expected_output) + self._test({ + "example_names": np.empty((0,), dtype=bytes), + "serialized": ops.convert_to_tensor(["", ""]), + "features": { + sparse_name: + parsing_ops.VarLenFeature(dtypes.int64), + a_name: + parsing_ops.FixedLenFeature( + (1, 3), dtypes.int64, default_value=a_default), + b_name: + parsing_ops.FixedLenFeature( + (3, 3), dtypes.string, default_value=b_default), + c_name: + parsing_ops.FixedLenFeature( + (2,), dtypes.float32, default_value=c_default), + } + }, 
expected_output) def testEmptySerializedWithoutDefaultsShouldFail(self): input_features = { @@ -180,8 +174,7 @@ class ParseExampleTest(test.TestCase): default_value=np.random.rand(3, 3).astype(bytes)), # Feature "c" is missing a default, this gap will cause failure. "c": - parsing_ops.FixedLenFeature( - (2,), dtype=dtypes.float32), + parsing_ops.FixedLenFeature((2,), dtype=dtypes.float32), } # Edge case where the key is there but the feature value is empty @@ -211,7 +204,8 @@ class ParseExampleTest(test.TestCase): original = [ example(features=features({ "a": float_feature([1, 1, 3]), - })), example(features=features({ + })), + example(features=features({ "a": float_feature([-1, -1]), })) ] @@ -231,7 +225,11 @@ class ParseExampleTest(test.TestCase): "Name: failing, Key: a, Index: 1. Number of float val")) def testDenseDefaultNoShapeShouldFail(self): - original = [example(features=features({"a": float_feature([1, 1, 3]),})),] + original = [ + example(features=features({ + "a": float_feature([1, 1, 3]), + })), + ] serialized = [m.SerializeToString() for m in original] @@ -250,31 +248,31 @@ class ParseExampleTest(test.TestCase): example(features=features({ "st_c": float_feature([3, 4]) })), - example(features=features({ - "st_c": float_feature([]), # empty float list - })), - example(features=features({ - "st_d": feature(), # feature with nothing in it - })), - example(features=features({ - "st_c": float_feature([1, 2, -1]), - "st_d": bytes_feature([b"hi"]) - })) + example( + features=features({ + "st_c": float_feature([]), # empty float list + })), + example( + features=features({ + "st_d": feature(), # feature with nothing in it + })), + example( + features=features({ + "st_c": float_feature([1, 2, -1]), + "st_d": bytes_feature([b"hi"]) + })) ] serialized = [m.SerializeToString() for m in original] expected_st_c = ( # indices, values, shape - np.array( - [[0, 0], [0, 1], [3, 0], [3, 1], [3, 2]], dtype=np.int64), np.array( - [3.0, 4.0, 1.0, 2.0, -1.0], 
dtype=np.float32), np.array( - [4, 3], dtype=np.int64)) # batch == 2, max_elems = 3 + np.array([[0, 0], [0, 1], [3, 0], [3, 1], [3, 2]], dtype=np.int64), + np.array([3.0, 4.0, 1.0, 2.0, -1.0], dtype=np.float32), + np.array([4, 3], dtype=np.int64)) # batch == 2, max_elems = 3 expected_st_d = ( # indices, values, shape - np.array( - [[3, 0]], dtype=np.int64), np.array( - ["hi"], dtype=bytes), np.array( - [4, 1], dtype=np.int64)) # batch == 2, max_elems = 1 + np.array([[3, 0]], dtype=np.int64), np.array(["hi"], dtype=bytes), + np.array([4, 1], dtype=np.int64)) # batch == 2, max_elems = 1 expected_output = { "st_c": expected_st_c, @@ -291,70 +289,74 @@ class ParseExampleTest(test.TestCase): def testSerializedContainingSparseFeature(self): original = [ - example(features=features({ - "val": float_feature([3, 4]), - "idx": int64_feature([5, 10]) - })), - example(features=features({ - "val": float_feature([]), # empty float list - "idx": int64_feature([]) - })), - example(features=features({ - "val": feature(), # feature with nothing in it - # missing idx feature - })), - example(features=features({ - "val": float_feature([1, 2, -1]), - "idx": - int64_feature([0, 9, 3]) # unsorted - })) + example( + features=features({ + "val": float_feature([3, 4]), + "idx": int64_feature([5, 10]) + })), + example( + features=features({ + "val": float_feature([]), # empty float list + "idx": int64_feature([]) + })), + example( + features=features({ + "val": feature(), # feature with nothing in it + # missing idx feature + })), + example( + features=features({ + "val": float_feature([1, 2, -1]), + "idx": + int64_feature([0, 9, 3]) # unsorted + })) ] serialized = [m.SerializeToString() for m in original] expected_sp = ( # indices, values, shape - np.array( - [[0, 5], [0, 10], [3, 0], [3, 3], [3, 9]], dtype=np.int64), - np.array( - [3.0, 4.0, 1.0, -1.0, 2.0], dtype=np.float32), np.array( - [4, 13], dtype=np.int64)) # batch == 4, max_elems = 13 + np.array([[0, 5], [0, 10], [3, 0], [3, 3], 
[3, 9]], dtype=np.int64), + np.array([3.0, 4.0, 1.0, -1.0, 2.0], dtype=np.float32), + np.array([4, 13], dtype=np.int64)) # batch == 4, max_elems = 13 - expected_output = {"sp": expected_sp,} + expected_output = { + "sp": expected_sp, + } self._test({ "serialized": ops.convert_to_tensor(serialized), "features": { - "sp": parsing_ops.SparseFeature( - ["idx"], "val", dtypes.float32, [13]) + "sp": + parsing_ops.SparseFeature(["idx"], "val", dtypes.float32, [13]) } }, expected_output) def testSerializedContainingSparseFeatureReuse(self): original = [ - example(features=features({ - "val1": float_feature([3, 4]), - "val2": float_feature([5, 6]), - "idx": int64_feature([5, 10]) - })), - example(features=features({ - "val1": float_feature([]), # empty float list - "idx": int64_feature([]) - })), + example( + features=features({ + "val1": float_feature([3, 4]), + "val2": float_feature([5, 6]), + "idx": int64_feature([5, 10]) + })), + example( + features=features({ + "val1": float_feature([]), # empty float list + "idx": int64_feature([]) + })), ] serialized = [m.SerializeToString() for m in original] expected_sp1 = ( # indices, values, shape - np.array( - [[0, 5], [0, 10]], dtype=np.int64), np.array( - [3.0, 4.0], dtype=np.float32), np.array( - [2, 13], dtype=np.int64)) # batch == 2, max_elems = 13 + np.array([[0, 5], [0, 10]], dtype=np.int64), + np.array([3.0, 4.0], dtype=np.float32), np.array( + [2, 13], dtype=np.int64)) # batch == 2, max_elems = 13 expected_sp2 = ( # indices, values, shape - np.array( - [[0, 5], [0, 10]], dtype=np.int64), np.array( - [5.0, 6.0], dtype=np.float32), np.array( - [2, 7], dtype=np.int64)) # batch == 2, max_elems = 13 + np.array([[0, 5], [0, 10]], dtype=np.int64), + np.array([5.0, 6.0], dtype=np.float32), np.array( + [2, 7], dtype=np.int64)) # batch == 2, max_elems = 13 expected_output = { "sp1": expected_sp1, @@ -374,25 +376,29 @@ class ParseExampleTest(test.TestCase): def testSerializedContaining3DSparseFeature(self): original = [ - 
example(features=features({ - "val": float_feature([3, 4]), - "idx0": int64_feature([5, 10]), - "idx1": int64_feature([0, 2]), - })), - example(features=features({ - "val": float_feature([]), # empty float list - "idx0": int64_feature([]), - "idx1": int64_feature([]), - })), - example(features=features({ - "val": feature(), # feature with nothing in it - # missing idx feature - })), - example(features=features({ - "val": float_feature([1, 2, -1]), - "idx0": int64_feature([0, 9, 3]), # unsorted - "idx1": int64_feature([1, 0, 2]), - })) + example( + features=features({ + "val": float_feature([3, 4]), + "idx0": int64_feature([5, 10]), + "idx1": int64_feature([0, 2]), + })), + example( + features=features({ + "val": float_feature([]), # empty float list + "idx0": int64_feature([]), + "idx1": int64_feature([]), + })), + example( + features=features({ + "val": feature(), # feature with nothing in it + # missing idx feature + })), + example( + features=features({ + "val": float_feature([1, 2, -1]), + "idx0": int64_feature([0, 9, 3]), # unsorted + "idx1": int64_feature([1, 0, 2]), + })) ] serialized = [m.SerializeToString() for m in original] @@ -407,13 +413,16 @@ class ParseExampleTest(test.TestCase): # shape batch == 4, max_elems = 13 np.array([4, 13, 3], dtype=np.int64)) - expected_output = {"sp": expected_sp,} + expected_output = { + "sp": expected_sp, + } self._test({ "serialized": ops.convert_to_tensor(serialized), "features": { - "sp": parsing_ops.SparseFeature( - ["idx0", "idx1"], "val", dtypes.float32, [13, 3]) + "sp": + parsing_ops.SparseFeature(["idx0", "idx1"], "val", + dtypes.float32, [13, 3]) } }, expected_output) @@ -421,41 +430,37 @@ class ParseExampleTest(test.TestCase): aname = "a" bname = "b*has+a:tricky_name" original = [ - example(features=features({ - aname: float_feature([1, 1]), - bname: bytes_feature([b"b0_str"]), - })), example(features=features({ - aname: float_feature([-1, -1]), - bname: bytes_feature([b""]), - })) + example( + 
features=features({ + aname: float_feature([1, 1]), + bname: bytes_feature([b"b0_str"]), + })), + example( + features=features({ + aname: float_feature([-1, -1]), + bname: bytes_feature([b""]), + })) ] serialized = [m.SerializeToString() for m in original] expected_output = { aname: - np.array( - [[1, 1], [-1, -1]], dtype=np.float32).reshape(2, 1, 2, 1), + np.array([[1, 1], [-1, -1]], dtype=np.float32).reshape(2, 1, 2, 1), bname: - np.array( - ["b0_str", ""], dtype=bytes).reshape(2, 1, 1, 1, 1), + np.array(["b0_str", ""], dtype=bytes).reshape(2, 1, 1, 1, 1), } # No defaults, values required - self._test( - { - "serialized": - ops.convert_to_tensor(serialized), - "features": { - aname: - parsing_ops.FixedLenFeature( - (1, 2, 1), dtype=dtypes.float32), - bname: - parsing_ops.FixedLenFeature( - (1, 1, 1, 1), dtype=dtypes.string), - } - }, - expected_output) + self._test({ + "serialized": ops.convert_to_tensor(serialized), + "features": { + aname: + parsing_ops.FixedLenFeature((1, 2, 1), dtype=dtypes.float32), + bname: + parsing_ops.FixedLenFeature((1, 1, 1, 1), dtype=dtypes.string), + } + }, expected_output) # This test is identical as the previous one except # for the creation of 'serialized'. 
@@ -466,18 +471,22 @@ class ParseExampleTest(test.TestCase): original = [ (example(features=features({ aname: float_feature([10, 10]), - })), example(features=features({ - aname: float_feature([1, 1]), - bname: bytes_feature([b"b0_str"]), - }))), + })), + example( + features=features({ + aname: float_feature([1, 1]), + bname: bytes_feature([b"b0_str"]), + }))), ( example(features=features({ bname: bytes_feature([b"b100"]), })), - example(features=features({ - aname: float_feature([-1, -1]), - bname: bytes_feature([b"b1"]), - })),), + example( + features=features({ + aname: float_feature([-1, -1]), + bname: bytes_feature([b"b1"]), + })), + ), ] serialized = [ @@ -486,55 +495,45 @@ class ParseExampleTest(test.TestCase): expected_output = { aname: - np.array( - [[1, 1], [-1, -1]], dtype=np.float32).reshape(2, 1, 2, 1), + np.array([[1, 1], [-1, -1]], dtype=np.float32).reshape(2, 1, 2, 1), bname: - np.array( - ["b0_str", "b1"], dtype=bytes).reshape(2, 1, 1, 1, 1), + np.array(["b0_str", "b1"], dtype=bytes).reshape(2, 1, 1, 1, 1), } # No defaults, values required - self._test( - { - "serialized": - ops.convert_to_tensor(serialized), - "features": { - aname: - parsing_ops.FixedLenFeature( - (1, 2, 1), dtype=dtypes.float32), - bname: - parsing_ops.FixedLenFeature( - (1, 1, 1, 1), dtype=dtypes.string), - } - }, - expected_output) + self._test({ + "serialized": ops.convert_to_tensor(serialized), + "features": { + aname: + parsing_ops.FixedLenFeature((1, 2, 1), dtype=dtypes.float32), + bname: + parsing_ops.FixedLenFeature((1, 1, 1, 1), dtype=dtypes.string), + } + }, expected_output) def testSerializedContainingDenseScalar(self): original = [ example(features=features({ "a": float_feature([1]), - })), example(features=features({})) + })), + example(features=features({})) ] serialized = [m.SerializeToString() for m in original] expected_output = { "a": - np.array( - [[1], [-1]], dtype=np.float32) # 2x1 (column vector) + np.array([[1], [-1]], dtype=np.float32) # 2x1 (column 
vector) } - self._test( - { - "serialized": - ops.convert_to_tensor(serialized), - "features": { - "a": - parsing_ops.FixedLenFeature( - (1,), dtype=dtypes.float32, default_value=-1), - } - }, - expected_output) + self._test({ + "serialized": ops.convert_to_tensor(serialized), + "features": { + "a": + parsing_ops.FixedLenFeature( + (1,), dtype=dtypes.float32, default_value=-1), + } + }, expected_output) def testSerializedContainingDenseWithDefaults(self): original = [ @@ -553,58 +552,48 @@ class ParseExampleTest(test.TestCase): expected_output = { "a": - np.array( - [[1, 1], [3, -3], [3, -3]], dtype=np.float32).reshape(3, 1, 2, - 1), + np.array([[1, 1], [3, -3], [3, -3]], dtype=np.float32).reshape( + 3, 1, 2, 1), "b": - np.array( - ["tmp_str", "b1", "tmp_str"], dtype=bytes).reshape(3, 1, 1, 1, - 1), + np.array(["tmp_str", "b1", "tmp_str"], dtype=bytes).reshape( + 3, 1, 1, 1, 1), } - self._test( - { - "serialized": - ops.convert_to_tensor(serialized), - "features": { - "a": - parsing_ops.FixedLenFeature( - (1, 2, 1), - dtype=dtypes.float32, - default_value=[3.0, -3.0]), - "b": - parsing_ops.FixedLenFeature( - (1, 1, 1, 1), - dtype=dtypes.string, - default_value="tmp_str"), - } - }, - expected_output) + self._test({ + "serialized": ops.convert_to_tensor(serialized), + "features": { + "a": + parsing_ops.FixedLenFeature( + (1, 2, 1), dtype=dtypes.float32, default_value=[3.0, -3.0]), + "b": + parsing_ops.FixedLenFeature( + (1, 1, 1, 1), dtype=dtypes.string, default_value="tmp_str"), + } + }, expected_output) def testSerializedContainingSparseAndSparseFeatureAndDenseWithNoDefault(self): expected_st_a = ( # indices, values, shape - np.empty( - (0, 2), dtype=np.int64), # indices - np.empty( - (0,), dtype=np.int64), # sp_a is DT_INT64 - np.array( - [2, 0], dtype=np.int64)) # batch == 2, max_elems = 0 + np.empty((0, 2), dtype=np.int64), # indices + np.empty((0,), dtype=np.int64), # sp_a is DT_INT64 + np.array([2, 0], dtype=np.int64)) # batch == 2, max_elems = 0 expected_sp = 
( # indices, values, shape - np.array( - [[0, 0], [0, 3], [1, 7]], dtype=np.int64), np.array( - ["a", "b", "c"], dtype="|S"), np.array( - [2, 13], dtype=np.int64)) # batch == 4, max_elems = 13 + np.array([[0, 0], [0, 3], [1, 7]], dtype=np.int64), + np.array(["a", "b", "c"], dtype="|S"), np.array( + [2, 13], dtype=np.int64)) # batch == 4, max_elems = 13 original = [ - example(features=features({ - "c": float_feature([3, 4]), - "val": bytes_feature([b"a", b"b"]), - "idx": int64_feature([0, 3]) - })), example(features=features({ - "c": float_feature([1, 2]), - "val": bytes_feature([b"c"]), - "idx": int64_feature([7]) - })) + example( + features=features({ + "c": float_feature([3, 4]), + "val": bytes_feature([b"a", b"b"]), + "idx": int64_feature([0, 3]) + })), + example( + features=features({ + "c": float_feature([1, 2]), + "val": bytes_feature([b"c"]), + "idx": int64_feature([7]) + })) ] names = ["in1", "in2"] @@ -617,16 +606,13 @@ class ParseExampleTest(test.TestCase): "sp": expected_sp, "a": np.array(2 * [[a_default]]), "b": np.array(2 * [b_default]), - "c": np.array( - [[3, 4], [1, 2]], dtype=np.float32), + "c": np.array([[3, 4], [1, 2]], dtype=np.float32), } self._test( { - "example_names": - names, - "serialized": - ops.convert_to_tensor(serialized), + "example_names": names, + "serialized": ops.convert_to_tensor(serialized), "features": { "st_a": parsing_ops.VarLenFeature(dtypes.int64), @@ -647,25 +633,26 @@ class ParseExampleTest(test.TestCase): def testSerializedContainingSparseAndSparseFeatureWithReuse(self): expected_idx = ( # indices, values, shape - np.array( - [[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.int64), - np.array([0, 3, 7, 1]), np.array( - [2, 2], dtype=np.int64)) # batch == 4, max_elems = 2 + np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.int64), + np.array([0, 3, 7, 1]), + np.array([2, 2], dtype=np.int64)) # batch == 4, max_elems = 2 expected_sp = ( # indices, values, shape - np.array( - [[0, 0], [0, 3], [1, 1], [1, 7]], dtype=np.int64), 
np.array( - ["a", "b", "d", "c"], dtype="|S"), np.array( - [2, 13], dtype=np.int64)) # batch == 4, max_elems = 13 + np.array([[0, 0], [0, 3], [1, 1], [1, 7]], dtype=np.int64), + np.array(["a", "b", "d", "c"], dtype="|S"), + np.array([2, 13], dtype=np.int64)) # batch == 4, max_elems = 13 original = [ - example(features=features({ - "val": bytes_feature([b"a", b"b"]), - "idx": int64_feature([0, 3]) - })), example(features=features({ - "val": bytes_feature([b"c", b"d"]), - "idx": int64_feature([7, 1]) - })) + example( + features=features({ + "val": bytes_feature([b"a", b"b"]), + "idx": int64_feature([0, 3]) + })), + example( + features=features({ + "val": bytes_feature([b"c", b"d"]), + "idx": int64_feature([7, 1]) + })) ] names = ["in1", "in2"] @@ -680,9 +667,10 @@ class ParseExampleTest(test.TestCase): "example_names": names, "serialized": ops.convert_to_tensor(serialized), "features": { - "idx": parsing_ops.VarLenFeature(dtypes.int64), - "sp": parsing_ops.SparseFeature( - ["idx"], "val", dtypes.string, [13]), + "idx": + parsing_ops.VarLenFeature(dtypes.int64), + "sp": + parsing_ops.SparseFeature(["idx"], "val", dtypes.string, [13]), } }, expected_output) @@ -720,10 +708,11 @@ class ParseExampleTest(test.TestCase): } original = [ - example(features=features( - {"a": int64_feature([truth_int[i]]), - "b": bytes_feature(truth_str[i])})) - for i in range(batch_size) + example( + features=features({ + "a": int64_feature([truth_int[i]]), + "b": bytes_feature(truth_str[i]) + })) for i in range(batch_size) ] serialized = [m.SerializeToString() for m in original] @@ -731,12 +720,18 @@ class ParseExampleTest(test.TestCase): self._test({ "serialized": ops.convert_to_tensor(serialized, dtype=dtypes.string), "features": { - "a": parsing_ops.FixedLenSequenceFeature( - shape=(), dtype=dtypes.int64, allow_missing=True, - default_value=-1), - "b": parsing_ops.FixedLenSequenceFeature( - shape=[], dtype=dtypes.string, allow_missing=True, - default_value="default"), + "a": + 
parsing_ops.FixedLenSequenceFeature( + shape=(), + dtype=dtypes.int64, + allow_missing=True, + default_value=-1), + "b": + parsing_ops.FixedLenSequenceFeature( + shape=[], + dtype=dtypes.string, + allow_missing=True, + default_value="default"), } }, expected_output) @@ -755,18 +750,21 @@ class ParseExampleTest(test.TestCase): example(features=features({ cname: int64_feature([2]), })), - example(features=features({ - aname: float_feature([1, 1]), - bname: bytes_feature([b"b0_str", b"b1_str"]), - })), - example(features=features({ - aname: float_feature([-1, -1, 2, 2]), - bname: bytes_feature([b"b1"]), - })), - example(features=features({ - aname: float_feature([]), - cname: int64_feature([3]), - })), + example( + features=features({ + aname: float_feature([1, 1]), + bname: bytes_feature([b"b0_str", b"b1_str"]), + })), + example( + features=features({ + aname: float_feature([-1, -1, 2, 2]), + bname: bytes_feature([b"b1"]), + })), + example( + features=features({ + aname: float_feature([]), + cname: int64_feature([3]), + })), ] serialized = [m.SerializeToString() for m in original] @@ -827,7 +825,9 @@ class ParseExampleTest(test.TestCase): "features": { aname: parsing_ops.FixedLenSequenceFeature( - (2, 1), dtype=dtypes.float32, allow_missing=True, + (2, 1), + dtype=dtypes.float32, + allow_missing=True, default_value=-2.0), bname: parsing_ops.FixedLenSequenceFeature( @@ -867,7 +867,9 @@ class ParseExampleTest(test.TestCase): "features": { aname: parsing_ops.FixedLenSequenceFeature( - (2, 1), dtype=dtypes.float32, allow_missing=True, + (2, 1), + dtype=dtypes.float32, + allow_missing=True, default_value=[]), bname: parsing_ops.FixedLenSequenceFeature( @@ -908,26 +910,28 @@ class ParseExampleTest(test.TestCase): "All dimensions of shape for feature c need to be known " r"but received \(1, None\).")) - self._test({ - "example_names": example_names, - "serialized": ops.convert_to_tensor(serialized), - "features": { - aname: - parsing_ops.FixedLenSequenceFeature( - (2, 1), 
dtype=dtypes.float32, allow_missing=True), - bname: - parsing_ops.FixedLenSequenceFeature( - (1, 1, 1), dtype=dtypes.string, allow_missing=True), - cname: - parsing_ops.FixedLenSequenceFeature( - shape=[], dtype=dtypes.int64, allow_missing=False), - dname: - parsing_ops.FixedLenSequenceFeature( - shape=[], dtype=dtypes.string, allow_missing=True), - } - }, expected_err=(ValueError, - "Unsupported: FixedLenSequenceFeature requires " - "allow_missing to be True.")) + self._test( + { + "example_names": example_names, + "serialized": ops.convert_to_tensor(serialized), + "features": { + aname: + parsing_ops.FixedLenSequenceFeature( + (2, 1), dtype=dtypes.float32, allow_missing=True), + bname: + parsing_ops.FixedLenSequenceFeature( + (1, 1, 1), dtype=dtypes.string, allow_missing=True), + cname: + parsing_ops.FixedLenSequenceFeature( + shape=[], dtype=dtypes.int64, allow_missing=False), + dname: + parsing_ops.FixedLenSequenceFeature( + shape=[], dtype=dtypes.string, allow_missing=True), + } + }, + expected_err=(ValueError, + "Unsupported: FixedLenSequenceFeature requires " + "allow_missing to be True.")) class ParseSingleExampleTest(test.TestCase): @@ -949,8 +953,8 @@ class ParseSingleExampleTest(test.TestCase): # Check shapes. 
for k, f in kwargs["features"].items(): if isinstance(f, parsing_ops.FixedLenFeature) and f.shape is not None: - self.assertEqual(tuple(out[k].get_shape()), - tensor_shape.as_shape(f.shape)) + self.assertEqual( + tuple(out[k].get_shape()), tensor_shape.as_shape(f.shape)) elif isinstance(f, parsing_ops.VarLenFeature): self.assertEqual( tuple(out[k].indices.get_shape().as_list()), (None, 1)) @@ -959,29 +963,25 @@ class ParseSingleExampleTest(test.TestCase): tuple(out[k].dense_shape.get_shape().as_list()), (1,)) def testSingleExampleWithSparseAndSparseFeatureAndDense(self): - original = example(features=features({ - "c": float_feature([3, 4]), - "d": float_feature([0.0, 1.0]), - "val": bytes_feature([b"a", b"b"]), - "idx": int64_feature([0, 3]), - "st_a": float_feature([3.0, 4.0]) - })) + original = example( + features=features({ + "c": float_feature([3, 4]), + "d": float_feature([0.0, 1.0]), + "val": bytes_feature([b"a", b"b"]), + "idx": int64_feature([0, 3]), + "st_a": float_feature([3.0, 4.0]) + })) serialized = original.SerializeToString() expected_st_a = ( - np.array( - [[0], [1]], dtype=np.int64), # indices - np.array( - [3.0, 4.0], dtype=np.float32), # values - np.array( - [2], dtype=np.int64)) # shape: max_values = 2 + np.array([[0], [1]], dtype=np.int64), # indices + np.array([3.0, 4.0], dtype=np.float32), # values + np.array([2], dtype=np.int64)) # shape: max_values = 2 expected_sp = ( # indices, values, shape - np.array( - [[0], [3]], dtype=np.int64), np.array( - ["a", "b"], dtype="|S"), np.array( - [13], dtype=np.int64)) # max_values = 13 + np.array([[0], [3]], dtype=np.int64), np.array(["a", "b"], dtype="|S"), + np.array([13], dtype=np.int64)) # max_values = 13 a_default = [1, 2, 3] b_default = np.random.rand(3, 3).astype(bytes) @@ -996,16 +996,14 @@ class ParseSingleExampleTest(test.TestCase): self._test( { - "example_names": - ops.convert_to_tensor("in1"), - "serialized": - ops.convert_to_tensor(serialized), + "example_names": 
ops.convert_to_tensor("in1"), + "serialized": ops.convert_to_tensor(serialized), "features": { "st_a": parsing_ops.VarLenFeature(dtypes.float32), "sp": - parsing_ops.SparseFeature( - ["idx"], "val", dtypes.string, [13]), + parsing_ops.SparseFeature(["idx"], "val", dtypes.string, + [13]), "a": parsing_ops.FixedLenFeature( (1, 3), dtypes.int64, default_value=a_default), @@ -1016,9 +1014,8 @@ class ParseSingleExampleTest(test.TestCase): "c": parsing_ops.FixedLenFeature(2, dtypes.float32), "d": - parsing_ops.FixedLenSequenceFeature([], - dtypes.float32, - allow_missing=True) + parsing_ops.FixedLenSequenceFeature( + [], dtypes.float32, allow_missing=True) } }, expected_output) @@ -1050,43 +1047,71 @@ class ParseSequenceExampleTest(test.TestCase): kwargs, expected_context_values=None, expected_feat_list_values=None, - expected_err=None): + expected_length_values=None, + expected_err=None, + batch=False): expected_context_values = expected_context_values or {} expected_feat_list_values = expected_feat_list_values or {} + expected_length_values = expected_length_values or {} with self.test_session() as sess: if expected_err: with self.assertRaisesWithPredicateMatch(expected_err[0], expected_err[1]): - c_out, fl_out = parsing_ops.parse_single_sequence_example(**kwargs) + if batch: + c_out, fl_out, _ = parsing_ops.parse_sequence_example(**kwargs) + else: + c_out, fl_out = parsing_ops.parse_single_sequence_example(**kwargs) if c_out: sess.run(flatten_values_tensors_or_sparse(c_out.values())) if fl_out: sess.run(flatten_values_tensors_or_sparse(fl_out.values())) else: # Returns dicts w/ Tensors and SparseTensors. 
- context_out, feat_list_out = parsing_ops.parse_single_sequence_example( - **kwargs) + if batch: + (context_out, feat_list_out, + lengths_out) = parsing_ops.parse_sequence_example(**kwargs) + else: + (context_out, + feat_list_out) = parsing_ops.parse_single_sequence_example(**kwargs) + lengths_out = {} + context_result = sess.run( - flatten_values_tensors_or_sparse(context_out.values( - ))) if context_out else [] + flatten_values_tensors_or_sparse( + context_out.values())) if context_out else [] feat_list_result = sess.run( - flatten_values_tensors_or_sparse(feat_list_out.values( - ))) if feat_list_out else [] + flatten_values_tensors_or_sparse( + feat_list_out.values())) if feat_list_out else [] + lengths_result = sess.run( + flatten_values_tensors_or_sparse( + lengths_out.values())) if lengths_out else [] # Check values. _compare_output_to_expected(self, context_out, expected_context_values, context_result) _compare_output_to_expected(self, feat_list_out, expected_feat_list_values, feat_list_result) + _compare_output_to_expected(self, lengths_out, expected_length_values, + lengths_result) # Check shapes; if serialized is a Tensor we need its size to # properly check. 
if "context_features" in kwargs: for k, f in kwargs["context_features"].items(): if isinstance(f, parsing_ops.FixedLenFeature) and f.shape is not None: + if batch: + self.assertEqual( + tuple(context_out[k].get_shape().as_list()[1:]), f.shape) + else: + self.assertEqual( + tuple(context_out[k].get_shape().as_list()), f.shape) + elif isinstance(f, parsing_ops.VarLenFeature) and batch: self.assertEqual( - tuple(context_out[k].get_shape().as_list()), f.shape) - elif isinstance(f, parsing_ops.VarLenFeature): + tuple(context_out[k].indices.get_shape().as_list()), (None, 2)) + self.assertEqual( + tuple(context_out[k].values.get_shape().as_list()), (None,)) + self.assertEqual( + tuple(context_out[k].dense_shape.get_shape().as_list()), (2,)) + elif isinstance(f, parsing_ops.VarLenFeature) and not batch: self.assertEqual( tuple(context_out[k].indices.get_shape().as_list()), (None, 1)) self.assertEqual( @@ -1094,38 +1119,94 @@ class ParseSequenceExampleTest(test.TestCase): self.assertEqual( tuple(context_out[k].dense_shape.get_shape().as_list()), (1,)) + def _testBoth(self, + kwargs, + expected_context_values=None, + expected_feat_list_values=None, + expected_err=None): + # Test using tf.parse_single_sequence_example + self._test( + kwargs, + expected_context_values=expected_context_values, + expected_feat_list_values=expected_feat_list_values, + expected_err=expected_err, + batch=False) + + # Convert the input to a batch of size 1, and test using + # tf.parse_sequence_example. + + # Some replacements are needed for the batch version. + kwargs["serialized"] = [kwargs.pop("serialized")] + kwargs["example_names"] = [kwargs.pop("example_name") + ] if "example_name" in kwargs else None + # Disable error string matching; it's not consistent for batch mode. 
+ if expected_err: + expected_err = (expected_err[0], "") + + # Add a batch dimension to expected output + if expected_context_values: + new_values = {} + for k in expected_context_values: + v = expected_context_values[k] + if isinstance(kwargs["context_features"][k], + parsing_ops.FixedLenFeature): + new_values[k] = np.expand_dims(v, axis=0) + else: + # Sparse tensor. + new_values[k] = (np.insert(v[0], 0, 0, axis=1), v[1], + np.insert(v[2], 0, 1)) + expected_context_values = new_values + + expected_length_values = {} + if expected_feat_list_values: + new_values = {} + for k in expected_feat_list_values: + v = expected_feat_list_values[k] + if isinstance(kwargs["sequence_features"][k], + parsing_ops.FixedLenSequenceFeature): + expected_length_values[k] = [np.shape(v)[0]] + new_values[k] = np.expand_dims(v, axis=0) + else: + # Sparse tensor. + new_values[k] = (np.insert(v[0], 0, 0, axis=1), v[1], + np.insert(v[2], 0, 1)) + expected_feat_list_values = new_values + + self._test( + kwargs, + expected_context_values=expected_context_values, + expected_feat_list_values=expected_feat_list_values, + expected_length_values=expected_length_values, + expected_err=expected_err, + batch=True) + def testSequenceExampleWithSparseAndDenseContext(self): - original = sequence_example(context=features({ - "c": float_feature([3, 4]), - "st_a": float_feature([3.0, 4.0]) - })) + original = sequence_example( + context=features({ + "c": float_feature([3, 4]), + "st_a": float_feature([3.0, 4.0]) + })) serialized = original.SerializeToString() expected_st_a = ( - np.array( - [[0], [1]], dtype=np.int64), # indices - np.array( - [3.0, 4.0], dtype=np.float32), # values - np.array( - [2], dtype=np.int64)) # shape: num_features = 2 + np.array([[0], [1]], dtype=np.int64), # indices + np.array([3.0, 4.0], dtype=np.float32), # values + np.array([2], dtype=np.int64)) # shape: num_features = 2 - a_default = [1, 2, 3] + a_default = [[1, 2, 3]] b_default = np.random.rand(3, 3).astype(bytes) 
expected_context_output = { "st_a": expected_st_a, - "a": [a_default], + "a": a_default, "b": b_default, - "c": np.array( - [3, 4], dtype=np.float32), + "c": np.array([3, 4], dtype=np.float32), } - self._test( + self._testBoth( { - "example_name": - "in1", - "serialized": - ops.convert_to_tensor(serialized), + "example_name": "in1", + "serialized": ops.convert_to_tensor(serialized), "context_features": { "st_a": parsing_ops.VarLenFeature(dtypes.float32), @@ -1143,51 +1224,54 @@ class ParseSequenceExampleTest(test.TestCase): expected_context_values=expected_context_output) def testSequenceExampleWithMultipleSizeFeatureLists(self): - original = sequence_example(feature_lists=feature_lists({ - "a": - feature_list([ - int64_feature([-1, 0, 1]), - int64_feature([2, 3, 4]), - int64_feature([5, 6, 7]), - int64_feature([8, 9, 10]), - ]), - "b": - feature_list([bytes_feature([b"r00", b"r01", b"r10", b"r11"])]), - "c": - feature_list([float_feature([3, 4]), float_feature([-1, 2])]), - })) + original = sequence_example( + feature_lists=feature_lists({ + "a": + feature_list([ + int64_feature([-1, 0, 1]), + int64_feature([2, 3, 4]), + int64_feature([5, 6, 7]), + int64_feature([8, 9, 10]), + ]), + "b": + feature_list([bytes_feature([b"r00", b"r01", b"r10", b"r11"])]), + "c": + feature_list([float_feature([3, 4]), + float_feature([-1, 2])]), + })) serialized = original.SerializeToString() expected_feature_list_output = { - "a": np.array( - [ # outer dimension is time. - [[-1, 0, 1]], # inside are 1x3 matrices - [[2, 3, 4]], - [[5, 6, 7]], - [[8, 9, 10]] - ], - dtype=np.int64), - "b": np.array( - [ # outer dimension is time, inside are 2x2 matrices - [[b"r00", b"r01"], [b"r10", b"r11"]] - ], - dtype=bytes), - "c": np.array( - [ # outer dimension is time, inside are 2-vectors - [3, 4], [-1, 2] - ], - dtype=np.float32), - "d": np.empty( - shape=(0, 5), dtype=np.float32), # empty_allowed_missing + "a": + np.array( + [ # outer dimension is time. 
+ [[-1, 0, 1]], # inside are 1x3 matrices + [[2, 3, 4]], + [[5, 6, 7]], + [[8, 9, 10]] + ], + dtype=np.int64), + "b": + np.array( + [ # outer dimension is time, inside are 2x2 matrices + [[b"r00", b"r01"], [b"r10", b"r11"]] + ], + dtype=bytes), + "c": + np.array( + [ # outer dimension is time, inside are 2-vectors + [3, 4], [-1, 2] + ], + dtype=np.float32), + "d": + np.empty(shape=(0, 5), dtype=np.float32), # empty_allowed_missing } - self._test( + self._testBoth( { - "example_name": - "in1", - "serialized": - ops.convert_to_tensor(serialized), + "example_name": "in1", + "serialized": ops.convert_to_tensor(serialized), "sequence_features": { "a": parsing_ops.FixedLenSequenceFeature((1, 3), dtypes.int64), @@ -1203,56 +1287,51 @@ class ParseSequenceExampleTest(test.TestCase): expected_feat_list_values=expected_feature_list_output) def testSequenceExampleWithoutDebugName(self): - original = sequence_example(feature_lists=feature_lists({ - "a": - feature_list([int64_feature([3, 4]), int64_feature([1, 0])]), - "st_a": - feature_list([ - float_feature([3.0, 4.0]), float_feature([5.0]), - float_feature([]) - ]), - "st_b": - feature_list([ - bytes_feature([b"a"]), bytes_feature([]), bytes_feature([]), - bytes_feature([b"b", b"c"]) - ]) - })) + original = sequence_example( + feature_lists=feature_lists({ + "a": + feature_list([int64_feature([3, 4]), + int64_feature([1, 0])]), + "st_a": + feature_list([ + float_feature([3.0, 4.0]), + float_feature([5.0]), + float_feature([]) + ]), + "st_b": + feature_list([ + bytes_feature([b"a"]), + bytes_feature([]), + bytes_feature([]), + bytes_feature([b"b", b"c"]) + ]) + })) serialized = original.SerializeToString() expected_st_a = ( - np.array( - [[0, 0], [0, 1], [1, 0]], dtype=np.int64), # indices - np.array( - [3.0, 4.0, 5.0], dtype=np.float32), # values - np.array( - [3, 2], dtype=np.int64)) # shape: num_time = 3, max_feat = 2 + np.array([[0, 0], [0, 1], [1, 0]], dtype=np.int64), # indices + np.array([3.0, 4.0, 5.0], 
dtype=np.float32), # values + np.array([3, 2], dtype=np.int64)) # shape: num_time = 3, max_feat = 2 expected_st_b = ( - np.array( - [[0, 0], [3, 0], [3, 1]], dtype=np.int64), # indices - np.array( - ["a", "b", "c"], dtype="|S"), # values - np.array( - [4, 2], dtype=np.int64)) # shape: num_time = 4, max_feat = 2 + np.array([[0, 0], [3, 0], [3, 1]], dtype=np.int64), # indices + np.array(["a", "b", "c"], dtype="|S"), # values + np.array([4, 2], dtype=np.int64)) # shape: num_time = 4, max_feat = 2 expected_st_c = ( - np.empty( - (0, 2), dtype=np.int64), # indices - np.empty( - (0,), dtype=np.int64), # values - np.array( - [0, 0], dtype=np.int64)) # shape: num_time = 0, max_feat = 0 + np.empty((0, 2), dtype=np.int64), # indices + np.empty((0,), dtype=np.int64), # values + np.array([0, 0], dtype=np.int64)) # shape: num_time = 0, max_feat = 0 expected_feature_list_output = { - "a": np.array( - [[3, 4], [1, 0]], dtype=np.int64), + "a": np.array([[3, 4], [1, 0]], dtype=np.int64), "st_a": expected_st_a, "st_b": expected_st_b, "st_c": expected_st_c, } - self._test( + self._testBoth( { "serialized": ops.convert_to_tensor(serialized), "sequence_features": { @@ -1265,56 +1344,51 @@ class ParseSequenceExampleTest(test.TestCase): expected_feat_list_values=expected_feature_list_output) def testSequenceExampleWithSparseAndDenseFeatureLists(self): - original = sequence_example(feature_lists=feature_lists({ - "a": - feature_list([int64_feature([3, 4]), int64_feature([1, 0])]), - "st_a": - feature_list([ - float_feature([3.0, 4.0]), float_feature([5.0]), - float_feature([]) - ]), - "st_b": - feature_list([ - bytes_feature([b"a"]), bytes_feature([]), bytes_feature([]), - bytes_feature([b"b", b"c"]) - ]) - })) + original = sequence_example( + feature_lists=feature_lists({ + "a": + feature_list([int64_feature([3, 4]), + int64_feature([1, 0])]), + "st_a": + feature_list([ + float_feature([3.0, 4.0]), + float_feature([5.0]), + float_feature([]) + ]), + "st_b": + feature_list([ + 
bytes_feature([b"a"]), + bytes_feature([]), + bytes_feature([]), + bytes_feature([b"b", b"c"]) + ]) + })) serialized = original.SerializeToString() expected_st_a = ( - np.array( - [[0, 0], [0, 1], [1, 0]], dtype=np.int64), # indices - np.array( - [3.0, 4.0, 5.0], dtype=np.float32), # values - np.array( - [3, 2], dtype=np.int64)) # shape: num_time = 3, max_feat = 2 + np.array([[0, 0], [0, 1], [1, 0]], dtype=np.int64), # indices + np.array([3.0, 4.0, 5.0], dtype=np.float32), # values + np.array([3, 2], dtype=np.int64)) # shape: num_time = 3, max_feat = 2 expected_st_b = ( - np.array( - [[0, 0], [3, 0], [3, 1]], dtype=np.int64), # indices - np.array( - ["a", "b", "c"], dtype="|S"), # values - np.array( - [4, 2], dtype=np.int64)) # shape: num_time = 4, max_feat = 2 + np.array([[0, 0], [3, 0], [3, 1]], dtype=np.int64), # indices + np.array(["a", "b", "c"], dtype="|S"), # values + np.array([4, 2], dtype=np.int64)) # shape: num_time = 4, max_feat = 2 expected_st_c = ( - np.empty( - (0, 2), dtype=np.int64), # indices - np.empty( - (0,), dtype=np.int64), # values - np.array( - [0, 0], dtype=np.int64)) # shape: num_time = 0, max_feat = 0 + np.empty((0, 2), dtype=np.int64), # indices + np.empty((0,), dtype=np.int64), # values + np.array([0, 0], dtype=np.int64)) # shape: num_time = 0, max_feat = 0 expected_feature_list_output = { - "a": np.array( - [[3, 4], [1, 0]], dtype=np.int64), + "a": np.array([[3, 4], [1, 0]], dtype=np.int64), "st_a": expected_st_a, "st_b": expected_st_b, "st_c": expected_st_c, } - self._test( + self._testBoth( { "example_name": "in1", "serialized": ops.convert_to_tensor(serialized), @@ -1328,30 +1402,28 @@ class ParseSequenceExampleTest(test.TestCase): expected_feat_list_values=expected_feature_list_output) def testSequenceExampleWithEmptyFeatureInFeatureLists(self): - original = sequence_example(feature_lists=feature_lists({ - "st_a": - feature_list([ - float_feature([3.0, 4.0]), - feature(), - float_feature([5.0]), - ]), - })) + original = 
sequence_example( + feature_lists=feature_lists({ + "st_a": + feature_list([ + float_feature([3.0, 4.0]), + feature(), + float_feature([5.0]), + ]), + })) serialized = original.SerializeToString() expected_st_a = ( - np.array( - [[0, 0], [0, 1], [2, 0]], dtype=np.int64), # indices - np.array( - [3.0, 4.0, 5.0], dtype=np.float32), # values - np.array( - [3, 2], dtype=np.int64)) # shape: num_time = 3, max_feat = 2 + np.array([[0, 0], [0, 1], [2, 0]], dtype=np.int64), # indices + np.array([3.0, 4.0, 5.0], dtype=np.float32), # values + np.array([3, 2], dtype=np.int64)) # shape: num_time = 3, max_feat = 2 expected_feature_list_output = { "st_a": expected_st_a, } - self._test( + self._testBoth( { "example_name": "in1", "serialized": ops.convert_to_tensor(serialized), @@ -1362,13 +1434,15 @@ class ParseSequenceExampleTest(test.TestCase): expected_feat_list_values=expected_feature_list_output) def testSequenceExampleListWithInconsistentDataFails(self): - original = sequence_example(feature_lists=feature_lists({ - "a": feature_list([int64_feature([-1, 0]), float_feature([2, 3])]) - })) + original = sequence_example( + feature_lists=feature_lists({ + "a": feature_list([int64_feature([-1, 0]), + float_feature([2, 3])]) + })) serialized = original.SerializeToString() - self._test( + self._testBoth( { "example_name": "in1", "serialized": ops.convert_to_tensor(serialized), @@ -1380,13 +1454,14 @@ class ParseSequenceExampleTest(test.TestCase): " Data types don't match. 
Expected type: int64")) def testSequenceExampleListWithWrongDataTypeFails(self): - original = sequence_example(feature_lists=feature_lists({ - "a": feature_list([float_feature([2, 3])]) - })) + original = sequence_example( + feature_lists=feature_lists({ + "a": feature_list([float_feature([2, 3])]) + })) serialized = original.SerializeToString() - self._test( + self._testBoth( { "example_name": "in1", "serialized": ops.convert_to_tensor(serialized), @@ -1399,17 +1474,19 @@ class ParseSequenceExampleTest(test.TestCase): " Expected type: int64")) def testSequenceExampleListWithWrongSparseDataTypeFails(self): - original = sequence_example(feature_lists=feature_lists({ - "a": - feature_list([ - int64_feature([3, 4]), int64_feature([1, 2]), - float_feature([2.0, 3.0]) - ]) - })) + original = sequence_example( + feature_lists=feature_lists({ + "a": + feature_list([ + int64_feature([3, 4]), + int64_feature([1, 2]), + float_feature([2.0, 3.0]) + ]) + })) serialized = original.SerializeToString() - self._test( + self._testBoth( { "example_name": "in1", "serialized": ops.convert_to_tensor(serialized), @@ -1423,13 +1500,16 @@ class ParseSequenceExampleTest(test.TestCase): " Feature is: float_list")) def testSequenceExampleListWithWrongShapeFails(self): - original = sequence_example(feature_lists=feature_lists({ - "a": feature_list([int64_feature([2, 3]), int64_feature([2, 3, 4])]), - })) + original = sequence_example( + feature_lists=feature_lists({ + "a": + feature_list([int64_feature([2, 3]), + int64_feature([2, 3, 4])]), + })) serialized = original.SerializeToString() - self._test( + self._testBoth( { "example_name": "in1", "serialized": ops.convert_to_tensor(serialized), @@ -1446,7 +1526,7 @@ class ParseSequenceExampleTest(test.TestCase): # Test fails because we didn't add: # feature_list_dense_defaults = {"a": None} - self._test( + self._testBoth( { "example_name": "in1", "serialized": ops.convert_to_tensor(original.SerializeToString()), @@ -1461,6 +1541,67 @@ class 
ParseSequenceExampleTest(test.TestCase): " feature_list_dense_missing_assumed_empty or" " feature_list_dense_defaults?")) + def testSequenceExampleBatch(self): + first = sequence_example( + feature_lists=feature_lists({ + "a": + feature_list([ + int64_feature([-1, 0, 1]), + int64_feature([2, 3, 4]), + int64_feature([5, 6, 7]), + int64_feature([8, 9, 10]), + ]) + })) + second = sequence_example( + feature_lists=feature_lists({ + "a": feature_list([ + int64_feature([21, 2, 11]), + ]) + })) + + serialized = [first.SerializeToString(), second.SerializeToString()] + + expected_feature_list_output = { + "a": + np.array( + [ # outermost dimension is example id + [ # middle dimension is time. + [[-1, 0, 1]], # inside are 1x3 matrices + [[2, 3, 4]], + [[5, 6, 7]], + [[8, 9, 10]] + ], + [ # middle dimension is time. + [[21, 2, 11]], # inside are 1x3 matrices + [[0, 0, 0]], # additional entries are padded with 0 + [[0, 0, 0]], + [[0, 0, 0]] + ] + ], + dtype=np.int64), + "d": + np.empty(shape=(2, 0, 5), dtype=np.float32), # allowed_missing + } + + self._test( + { + "example_names": ops.convert_to_tensor(["in1", "in2"]), + "serialized": ops.convert_to_tensor(serialized), + "sequence_features": { + "a": + parsing_ops.FixedLenSequenceFeature((1, 3), dtypes.int64), + "d": + parsing_ops.FixedLenSequenceFeature( + (5,), dtypes.float32, allow_missing=True), + } + }, + expected_feat_list_values=expected_feature_list_output, + expected_length_values={ + "a": [4, 1], + "d": [0, 0] + }, + batch=True) + class DecodeJSONExampleTest(test.TestCase): @@ -1531,24 +1672,27 @@ class DecodeJSONExampleTest(test.TestCase): example(features=features({ "st_d": feature() })), - example(features=features({ - "st_c": float_feature([1, 2, -1]), - "st_d": bytes_feature([b"hi"]) - })), + example( + features=features({ + "st_c": float_feature([1, 2, -1]), + "st_d": bytes_feature([b"hi"]) + })), ]) def testSerializedContainingBytes(self): aname = "a" bname = "b*has+a:tricky_name" self._testRoundTrip([ - 
example(features=features({ - aname: float_feature([1, 1]), - bname: bytes_feature([b"b0_str"]) - })), - example(features=features({ - aname: float_feature([-1, -1]), - bname: bytes_feature([b"b1"]) - })), + example( + features=features({ + aname: float_feature([1, 1]), + bname: bytes_feature([b"b0_str"]) + })), + example( + features=features({ + aname: float_feature([-1, -1]), + bname: bytes_feature([b"b1"]) + })), ]) def testInvalidSyntax(self): diff --git a/tensorflow/python/ops/parsing_ops.py b/tensorflow/python/ops/parsing_ops.py index 6041e2a0c5..8224097ac4 100644 --- a/tensorflow/python/ops/parsing_ops.py +++ b/tensorflow/python/ops/parsing_ops.py @@ -897,6 +897,352 @@ def _parse_single_example_raw(serialized, return outputs +@tf_export("io.parse_sequence_example") +def parse_sequence_example(serialized, + context_features=None, + sequence_features=None, + example_names=None, + name=None): + # pylint: disable=line-too-long + """Parses a batch of `SequenceExample` protos. + + Parses a vector of serialized + [`SequenceExample`](https://www.tensorflow.org/code/tensorflow/core/example/example.proto) + protos given in `serialized`. + + This op parses serialized sequence examples into a tuple of dictionaries + mapping keys to `Tensor` and `SparseTensor` objects respectively. + The first dictionary contains mappings for keys appearing in + `context_features`, and the second dictionary contains mappings for keys + appearing in `sequence_features`. + + At least one of `context_features` and `sequence_features` must be provided + and non-empty. + + The `context_features` keys are associated with a `SequenceExample` as a + whole, independent of time / frame. In contrast, the `sequence_features` keys + provide a way to access variable-length data within the `FeatureList` section + of the `SequenceExample` proto. 
While the shapes of `context_features` values + are fixed with respect to frame, the frame dimension (the first dimension) + of `sequence_features` values may vary between `SequenceExample` protos, + and even between `feature_list` keys within the same `SequenceExample`. + + `context_features` contains `VarLenFeature` and `FixedLenFeature` objects. + Each `VarLenFeature` is mapped to a `SparseTensor`, and each `FixedLenFeature` + is mapped to a `Tensor`, of the specified type, shape, and default value. + + `sequence_features` contains `VarLenFeature` and `FixedLenSequenceFeature` + objects. Each `VarLenFeature` is mapped to a `SparseTensor`, and each + `FixedLenSequenceFeature` is mapped to a `Tensor`, each of the specified type. + The shape will be `(B,T,) + df.dense_shape` for `FixedLenSequenceFeature` + `df`, where `B` is the batch size, and `T` is the length of the associated + `FeatureList` in the `SequenceExample`. For instance, + `FixedLenSequenceFeature([])` yields a scalar 2-D `Tensor` of static shape + `[None, None]` and dynamic shape `[B, T]`, while + `FixedLenSequenceFeature([k])` (for `int k >= 1`) yields a 3-D matrix `Tensor` + of static shape `[None, None, k]` and dynamic shape `[B, T, k]`. + + Like the input, the resulting output tensors have a batch dimension. This + means that the original per-example shapes of `VarLenFeature`s and + `FixedLenSequenceFeature`s can be lost. To handle that situation, this op also + provides dicts of shape tensors as part of the output. There is one dict for + the context features, and one for the feature_list features. Context features + of type `FixedLenFeature`s will not be present, since their shapes are already + known by the caller. In situations where `FixedLenSequenceFeature`s are of + different lengths across examples, the shorter examples will be padded with + default datatype values: 0 for numeric types, and the empty string for string + types.
+ + Each `SparseTensor` corresponding to `sequence_features` represents a ragged + vector. Its indices are `[time, index]`, where `time` is the `FeatureList` + entry and `index` is the value's index in the list of values associated with + that time. + + `FixedLenFeature` entries with a `default_value` and `FixedLenSequenceFeature` + entries with `allow_missing=True` are optional; otherwise, we will fail if + that `Feature` or `FeatureList` is missing from any example in `serialized`. + + `example_names` may contain descriptive names for the corresponding serialized + protos. These may be useful for debugging purposes, but they have no effect + on the output. If not `None`, `example_names` must be a vector. + + Args: + serialized: A vector (1-D Tensor) of type string containing binary + serialized `SequenceExample` protos. + context_features: A `dict` mapping feature keys to `FixedLenFeature` or + `VarLenFeature` values. These features are associated with a + `SequenceExample` as a whole. + sequence_features: A `dict` mapping feature keys to + `FixedLenSequenceFeature` or `VarLenFeature` values. These features are + associated with data within the `FeatureList` section of the + `SequenceExample` proto. + example_names: A vector (1-D Tensor) of strings (optional), the names of the + serialized protos. + name: A name for this operation (optional). + + Returns: + A tuple of three `dict`s mapping keys to `Tensor`s and `SparseTensor`s: the + context key/values, the feature_list key/values, and the lengths of any + dense feature_list features. + + Raises: + ValueError: if any feature is invalid.
+ """ + if not (context_features or sequence_features): + raise ValueError("Missing features.") + (context_sparse_keys, context_sparse_types, context_dense_keys, + context_dense_types, + context_dense_defaults, context_dense_shapes) = _features_to_raw_params( + context_features, [VarLenFeature, FixedLenFeature]) + (feature_list_sparse_keys, feature_list_sparse_types, feature_list_dense_keys, + feature_list_dense_types, feature_list_dense_defaults, + feature_list_dense_shapes) = _features_to_raw_params( + sequence_features, [VarLenFeature, FixedLenSequenceFeature]) + return _parse_sequence_example_raw( + serialized, example_names, context_sparse_keys, context_sparse_types, + context_dense_keys, context_dense_types, context_dense_defaults, + context_dense_shapes, feature_list_sparse_keys, feature_list_sparse_types, + feature_list_dense_keys, feature_list_dense_types, + feature_list_dense_shapes, feature_list_dense_defaults, name) + + +def _parse_sequence_example_raw(serialized, + debug_name=None, + context_sparse_keys=None, + context_sparse_types=None, + context_dense_keys=None, + context_dense_types=None, + context_dense_defaults=None, + context_dense_shapes=None, + feature_list_sparse_keys=None, + feature_list_sparse_types=None, + feature_list_dense_keys=None, + feature_list_dense_types=None, + feature_list_dense_shapes=None, + feature_list_dense_defaults=None, + name=None): + """Parses a vector of `SequenceExample` protos. + + Args: + serialized: A vector (1-D Tensor) of type string, containing binary + serialized `SequenceExample` protos. + debug_name: A vector (1-D Tensor) of strings (optional), the names of the + serialized protos. + context_sparse_keys: A list of string keys in the `SequenceExample`'s + features. The results for these keys will be returned as `SparseTensor` + objects. + context_sparse_types: A list of `DTypes`, the same length as `sparse_keys`. 
+ Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`), and `tf.string` + (`BytesList`) are supported. + context_dense_keys: A list of string keys in the examples' features. The + results for these keys will be returned as `Tensor`s. + context_dense_types: A list of DTypes, same length as `context_dense_keys`. + Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`), and `tf.string` + (`BytesList`) are supported. + context_dense_defaults: A dict mapping string keys to `Tensor`s. The keys of + the dict must match the context_dense_keys of the feature. + context_dense_shapes: A list of tuples, same length as `context_dense_keys`. + The shape of the data for each context_dense feature referenced by + `context_dense_keys`. Required for any input tensors identified by + `context_dense_keys` whose shapes are anything other than `[]` or `[1]`. + feature_list_sparse_keys: A list of string keys in the `SequenceExample`'s + feature_lists. The results for these keys will be returned as + `SparseTensor` objects. + feature_list_sparse_types: A list of `DTypes`, same length as + `feature_list_sparse_keys`. Only `tf.float32` (`FloatList`), `tf.int64` + (`Int64List`), and `tf.string` (`BytesList`) are supported. + feature_list_dense_keys: A list of string keys in the `SequenceExample`'s + feature_lists. The results for these keys will be returned as `Tensor`s. + feature_list_dense_types: A list of `DTypes`, same length as + `feature_list_dense_keys`. Only `tf.float32` (`FloatList`), `tf.int64` + (`Int64List`), and `tf.string` (`BytesList`) are supported. + feature_list_dense_shapes: A list of tuples, same length as + `feature_list_dense_keys`. The shape of the data for each `FeatureList` + feature referenced by `feature_list_dense_keys`. + feature_list_dense_defaults: A dict mapping key strings to values. The only + currently allowed value is `None`. Any key appearing in this dict with + value `None` is allowed to be missing from the `SequenceExample`.
If + missing, the key is treated as zero-length. + name: A name for this operation (optional). + + Returns: + A tuple of three `dict`s, each mapping keys to `Tensor`s and + `SparseTensor`s. The first dict contains the context key/values, + the second dict contains the feature_list key/values, and the final dict + contains the lengths of any dense feature_list features. + + Raises: + ValueError: If context_sparse and context_dense key sets intersect, + if feature_list_sparse and feature_list_dense key sets intersect, + if input lengths do not match up, or if a value in + feature_list_dense_defaults is not None. + TypeError: if feature_list_dense_defaults is not either None or a dict. + """ + with ops.name_scope(name, "ParseSequenceExample", [serialized]): + context_dense_defaults = ({} if context_dense_defaults is None else + context_dense_defaults) + context_sparse_keys = ([] if context_sparse_keys is None else + context_sparse_keys) + context_sparse_types = ([] if context_sparse_types is None else + context_sparse_types) + context_dense_keys = ([] + if context_dense_keys is None else context_dense_keys) + context_dense_types = ([] if context_dense_types is None else + context_dense_types) + context_dense_shapes = ([[]] * len(context_dense_keys) + if context_dense_shapes is None else + context_dense_shapes) + feature_list_sparse_keys = ([] if feature_list_sparse_keys is None else + feature_list_sparse_keys) + feature_list_sparse_types = ([] if feature_list_sparse_types is None else + feature_list_sparse_types) + feature_list_dense_keys = ([] if feature_list_dense_keys is None else + feature_list_dense_keys) + feature_list_dense_types = ([] if feature_list_dense_types is None else + feature_list_dense_types) + feature_list_dense_shapes = ([[]] * len(feature_list_dense_keys) + if feature_list_dense_shapes is None else + feature_list_dense_shapes) + feature_list_dense_defaults = ( + dict() + if feature_list_dense_defaults is None else feature_list_dense_defaults) + 
debug_name = [] if debug_name is None else debug_name + + # Internal + feature_list_dense_missing_assumed_empty = [] + + num_context_dense = len(context_dense_keys) + num_feature_list_dense = len(feature_list_dense_keys) + num_context_sparse = len(context_sparse_keys) + num_feature_list_sparse = len(feature_list_sparse_keys) + + if len(context_dense_shapes) != num_context_dense: + raise ValueError( + "len(context_dense_shapes) != len(context_dense_keys): %d vs. %d" % + (len(context_dense_shapes), num_context_dense)) + if len(context_dense_types) != num_context_dense: + raise ValueError( + "len(context_dense_types) != len(num_context_dense): %d vs. %d" % + (len(context_dense_types), num_context_dense)) + if len(feature_list_dense_shapes) != num_feature_list_dense: + raise ValueError( + "len(feature_list_dense_shapes) != len(feature_list_dense_keys): " + "%d vs. %d" % (len(feature_list_dense_shapes), + num_feature_list_dense)) + if len(feature_list_dense_types) != num_feature_list_dense: + raise ValueError( + "len(feature_list_dense_types) != len(num_feature_list_dense):" + "%d vs. %d" % (len(feature_list_dense_types), num_feature_list_dense)) + if len(context_sparse_types) != num_context_sparse: + raise ValueError( + "len(context_sparse_types) != len(context_sparse_keys): %d vs. %d" % + (len(context_sparse_types), num_context_sparse)) + if len(feature_list_sparse_types) != num_feature_list_sparse: + raise ValueError( + "len(feature_list_sparse_types) != len(feature_list_sparse_keys): " + "%d vs. 
%d" % (len(feature_list_sparse_types), + num_feature_list_sparse)) + if (num_context_dense + num_context_sparse + num_feature_list_dense + + num_feature_list_sparse) == 0: + raise ValueError( + "Must provide at least one context_sparse key, context_dense key, " + ", feature_list_sparse key, or feature_list_dense key") + if not set(context_dense_keys).isdisjoint(set(context_sparse_keys)): + raise ValueError( + "context_dense and context_sparse keys must not intersect; " + "intersection: %s" % set(context_dense_keys).intersection( + set(context_sparse_keys))) + if not set(feature_list_dense_keys).isdisjoint( + set(feature_list_sparse_keys)): + raise ValueError( + "feature_list_dense and feature_list_sparse keys must not intersect; " + "intersection: %s" % set(feature_list_dense_keys).intersection( + set(feature_list_sparse_keys))) + if not isinstance(feature_list_dense_defaults, dict): + raise TypeError("feature_list_dense_defaults must be a dict") + for k, v in feature_list_dense_defaults.items(): + if v is not None: + raise ValueError( + "Value feature_list_dense_defaults[%s] must be None" % k) + feature_list_dense_missing_assumed_empty.append(k) + + context_dense_defaults_vec = [] + for i, key in enumerate(context_dense_keys): + default_value = context_dense_defaults.get(key) + if default_value is None: + default_value = constant_op.constant([], dtype=context_dense_types[i]) + elif not isinstance(default_value, ops.Tensor): + key_name = "key_" + re.sub("[^A-Za-z0-9_.\\-/]", "_", key) + default_value = ops.convert_to_tensor( + default_value, dtype=context_dense_types[i], name=key_name) + + context_dense_defaults_vec.append(default_value) + + context_dense_shapes = [ + tensor_shape.as_shape(shape).as_proto() + for shape in context_dense_shapes + ] + feature_list_dense_shapes = [ + tensor_shape.as_shape(shape).as_proto() + for shape in feature_list_dense_shapes + ] + + # pylint: disable=protected-access + outputs = gen_parsing_ops.parse_sequence_example( + 
serialized=serialized, + debug_name=debug_name, + Ncontext_sparse=num_context_sparse, + Ncontext_dense=num_context_dense, + Nfeature_list_sparse=num_feature_list_sparse, + Nfeature_list_dense=num_feature_list_dense, + context_dense_defaults=context_dense_defaults_vec, + context_sparse_keys=context_sparse_keys, + context_sparse_types=context_sparse_types, + context_dense_keys=context_dense_keys, + context_dense_shapes=context_dense_shapes, + feature_list_sparse_keys=feature_list_sparse_keys, + feature_list_sparse_types=feature_list_sparse_types, + feature_list_dense_keys=feature_list_dense_keys, + feature_list_dense_types=feature_list_dense_types, + feature_list_dense_shapes=feature_list_dense_shapes, + feature_list_dense_missing_assumed_empty=( + feature_list_dense_missing_assumed_empty), + name=name) + # pylint: enable=protected-access + + (context_sparse_indices, context_sparse_values, context_sparse_shapes, + context_dense_values, feature_list_sparse_indices, + feature_list_sparse_values, feature_list_sparse_shapes, + feature_list_dense_values, feature_list_dense_lengths) = outputs + + context_sparse_tensors = [ + sparse_tensor.SparseTensor(ix, val, shape) + for (ix, val, + shape) in zip(context_sparse_indices, context_sparse_values, + context_sparse_shapes) + ] + + feature_list_sparse_tensors = [ + sparse_tensor.SparseTensor(ix, val, shape) + for (ix, val, shape + ) in zip(feature_list_sparse_indices, feature_list_sparse_values, + feature_list_sparse_shapes) + ] + + context_output = dict( + zip(context_sparse_keys + context_dense_keys, + context_sparse_tensors + context_dense_values)) + feature_list_output = dict( + zip(feature_list_sparse_keys + feature_list_dense_keys, + feature_list_sparse_tensors + feature_list_dense_values)) + feature_list_lengths = dict( + zip(feature_list_dense_keys, feature_list_dense_lengths)) + + return (context_output, feature_list_output, feature_list_lengths) + + +# TODO(sundberg): rewrite this method to call the batch version, 
which is more +# efficient especially for large inputs. @tf_export("parse_single_sequence_example") def parse_single_sequence_example( serialized, context_features=None, sequence_features=None, diff --git a/tensorflow/tools/api/golden/v1/tensorflow.io.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.io.pbtxt index 3a36c168aa..8938cf217b 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.io.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.io.pbtxt @@ -25,6 +25,10 @@ tf_module { argspec: "args=[\'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { + name: "parse_sequence_example" + argspec: "args=[\'serialized\', \'context_features\', \'sequence_features\', \'example_names\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { name: "parse_tensor" argspec: "args=[\'serialized\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt index 3a36c168aa..8938cf217b 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt @@ -25,6 +25,10 @@ tf_module { argspec: "args=[\'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { + name: "parse_sequence_example" + argspec: "args=[\'serialized\', \'context_features\', \'sequence_features\', \'example_names\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { name: "parse_tensor" argspec: "args=[\'serialized\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } |