diff options
Diffstat (limited to 'tensorflow/core/kernels/example_parsing_ops.cc')
-rw-r--r-- | tensorflow/core/kernels/example_parsing_ops.cc | 444 |
1 files changed, 444 insertions, 0 deletions
diff --git a/tensorflow/core/kernels/example_parsing_ops.cc b/tensorflow/core/kernels/example_parsing_ops.cc new file mode 100644 index 0000000000..c217c18207 --- /dev/null +++ b/tensorflow/core/kernels/example_parsing_ops.cc @@ -0,0 +1,444 @@ +// See docs in ../ops/parsing_ops.cc. + +#include "tensorflow/core/example/example.pb.h" +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/util/sparse/sparse_tensor.h" + +#include "tensorflow/core/platform/logging.h" + +namespace tensorflow { + +namespace { + +Status CheckValidType(const DataType& dtype) { + switch (dtype) { + case DT_INT64: + case DT_FLOAT: + case DT_STRING: + return Status::OK(); + default: + return errors::InvalidArgument("Received input dtype: ", + DataTypeString(dtype)); + } +} + +Status CheckTypesMatch(const Feature& feature, const DataType& dtype, + bool* match) { + switch (dtype) { + case DT_INT64: + *match = (feature.kind_case() == Feature::kInt64List); + break; + case DT_FLOAT: + *match = (feature.kind_case() == Feature::kFloatList); + break; + case DT_STRING: + *match = (feature.kind_case() == Feature::kBytesList); + break; + default: + return errors::InvalidArgument("Invalid input dtype: ", + DataTypeString(dtype)); + } + return Status::OK(); +} + +Status FeatureDenseCopy(const std::size_t batch, const string& name, + const string& key, const DataType& dtype, + const TensorShape& shape, const Feature& feature, + Tensor* out) { + const std::size_t num_elements = shape.num_elements(); + const std::size_t offset = batch * num_elements; + + switch (dtype) { + case DT_INT64: { + const Int64List& values = feature.int64_list(); + if (static_cast<size_t>(values.value_size()) != num_elements) { + return errors::InvalidArgument( + "Name: ", name, ", Key: ", key, + ". Number of int64 values != expected. " + "values size: ", + values.value_size(), " but output shape: ", + shape.ShortDebugString()); + } + auto out_p = out->flat<int64>().data() + offset; + std::copy_n(values.value().data(), num_elements, out_p); + return Status::OK(); + } + case DT_FLOAT: { + const FloatList& values = feature.float_list(); + if (static_cast<size_t>(values.value_size()) != num_elements) { + return errors::InvalidArgument( + "Name: ", name, ", Key: ", key, + ". Number of float values != expected. " + "values size: ", + values.value_size(), " but output shape: ", + shape.ShortDebugString()); + } + auto out_p = out->flat<float>().data() + offset; + std::copy_n(values.value().data(), num_elements, out_p); + return Status::OK(); + } + case DT_STRING: { + const BytesList& values = feature.bytes_list(); + if (static_cast<size_t>(values.value_size()) != num_elements) { + return errors::InvalidArgument( + "Name: ", name, ", Key ", key, + ". number of bytes values != expected. " + "values size: ", + values.value_size(), " but output shape: ", + shape.ShortDebugString()); + } + auto out_p = out->flat<string>().data() + offset; + std::transform(values.value().data(), + values.value().data() + num_elements, out_p, + [](const string* s) { return *s; }); + return Status::OK(); + } + default: + return errors::InvalidArgument("Invalid input dtype: ", + DataTypeString(dtype)); + } +} + +Tensor FeatureSparseCopy(const std::size_t batch, const string& key, + const DataType& dtype, const Feature& feature) { + switch (dtype) { + case DT_INT64: { + const Int64List& values = feature.int64_list(); + const int64 num_elements = values.value_size(); + Tensor out(dtype, TensorShape({num_elements})); + auto out_p = out.flat<int64>().data(); + std::copy_n(values.value().data(), num_elements, out_p); + return out; + } + case DT_FLOAT: { + const FloatList& values = feature.float_list(); + const int64 num_elements = values.value_size(); + Tensor out(dtype, TensorShape({num_elements})); + auto out_p = out.flat<float>().data(); + std::copy_n(values.value().data(), num_elements, out_p); + return out; + } + case DT_STRING: { + const BytesList& values = feature.bytes_list(); + const int64 num_elements = values.value_size(); + Tensor out(dtype, TensorShape({num_elements})); + auto out_p = out.flat<string>().data(); + std::transform(values.value().data(), + values.value().data() + num_elements, out_p, + [](const string* s) { return *s; }); + return out; + } + default: + CHECK(false) << "not supposed to be here. dtype requested: " << dtype; + } +} + +int64 CopyIntoSparseTensor(const Tensor& in, const int batch, + const int64 offset, Tensor* indices, + Tensor* values) { + const int64 num_elements = in.shape().num_elements(); + const DataType& dtype = in.dtype(); + CHECK_EQ(dtype, values->dtype()); + + // Update indices + auto ix_t = indices->matrix<int64>(); + int64* ix_p = &ix_t(offset, 0); + for (int64 i = 0; i < num_elements; ++i, ix_p += 2) { + *ix_p = batch; // Column 0 stores the batch entry + *(ix_p + 1) = i; // Column 1 stores the index in the batch + } + + // Copy values over + switch (dtype) { + case DT_INT64: { + std::copy_n(in.flat<int64>().data(), num_elements, + values->flat<int64>().data() + offset); + break; + } + case DT_FLOAT: { + std::copy_n(in.flat<float>().data(), num_elements, + values->flat<float>().data() + offset); + break; + } + case DT_STRING: { + std::copy_n(in.flat<string>().data(), num_elements, + values->flat<string>().data() + offset); + break; + // auto values_t = values->flat<string>().data() + offset; + // auto in_t = in.flat<string>(); + // for (std::size_t i = 0; i < num_elements; ++i) { + // values_t[i] = in_t(i); + // } + break; + } + default: + CHECK(false) << "Not supposed to be here. Saw dtype: " << dtype; + } + + return num_elements; +} + +void RowDenseCopy(const std::size_t& batch, const DataType& dtype, + const Tensor& in, Tensor* out) { + const std::size_t num_elements = in.shape().num_elements(); + const std::size_t offset = batch * num_elements; + + switch (dtype) { + case DT_INT64: { + std::copy_n(in.flat<int64>().data(), num_elements, + out->flat<int64>().data() + offset); + break; + } + case DT_FLOAT: { + std::copy_n(in.flat<float>().data(), num_elements, + out->flat<float>().data() + offset); + break; + } + case DT_STRING: { + std::copy_n(in.flat<string>().data(), num_elements, + out->flat<string>().data() + offset); + break; + } + default: + CHECK(false) << "Not supposed to be here. Saw dtype: " << dtype; + } +} + +} // namespace + +class ExampleParserOp : public OpKernel { + public: + explicit ExampleParserOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("sparse_types", &sparse_types_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("Ndense", &num_dense_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("Nsparse", &num_sparse_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("Tdense", &dense_types_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("dense_shapes", &dense_shapes_)); + + OP_REQUIRES( + ctx, static_cast<size_t>(num_sparse_) == sparse_types_.size(), + errors::InvalidArgument("len(sparse_keys) != len(sparse_types")); + OP_REQUIRES(ctx, static_cast<size_t>(num_dense_) == dense_types_.size(), + errors::InvalidArgument("len(dense_keys) != len(dense_types")); + OP_REQUIRES(ctx, static_cast<size_t>(num_dense_) == dense_shapes_.size(), + errors::InvalidArgument("len(dense_keys) != len(dense_shapes")); + for (const DataType& type : dense_types_) { + OP_REQUIRES_OK(ctx, CheckValidType(type)); + } + for (const DataType& type : sparse_types_) { + OP_REQUIRES_OK(ctx, CheckValidType(type)); + } + } + + void Compute(OpKernelContext* ctx) override { + const Tensor* names; + const Tensor* serialized; + OpInputList dense_keys; + OpInputList sparse_keys; + OpInputList dense_defaults; + + OP_REQUIRES_OK(ctx, ctx->input("names", &names)); + OP_REQUIRES_OK(ctx, ctx->input("serialized", &serialized)); + OP_REQUIRES_OK(ctx, ctx->input_list("dense_keys", &dense_keys)); + OP_REQUIRES_OK(ctx, ctx->input_list("sparse_keys", &sparse_keys)); + OP_REQUIRES_OK(ctx, ctx->input_list("dense_defaults", &dense_defaults)); + + std::vector<string> dense_keys_t(num_dense_); + std::vector<string> sparse_keys_t(num_sparse_); + CHECK_EQ(dense_keys.size(), num_dense_); + CHECK_EQ(sparse_keys.size(), num_sparse_); + for (int di = 0; di < num_dense_; ++di) { + dense_keys_t[di] = dense_keys[di].scalar<string>()(); + } + for (int di = 0; di < num_sparse_; ++di) { + sparse_keys_t[di] = sparse_keys[di].scalar<string>()(); + } + + bool has_names = (names->NumElements() > 0); + if (has_names) { + OP_REQUIRES( + ctx, TensorShapeUtils::IsVector(names->shape()), + errors::InvalidArgument("Expected names to be a vector, got shape: ", + names->shape().ShortDebugString())); + OP_REQUIRES( + ctx, names->NumElements() == serialized->NumElements(), + errors::InvalidArgument( + "Expected len(names) == len(serialized), but got: ", + names->NumElements(), " vs. ", serialized->NumElements())); + } + auto names_t = names->flat<string>(); + + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(serialized->shape()), + errors::InvalidArgument( + "Expected serialized to be a vector, got shape: ", + serialized->shape().ShortDebugString())); + OP_REQUIRES(ctx, dense_defaults.size() == num_dense_, + errors::InvalidArgument( + "Expected len(dense_defaults) == len(dense_keys) but got: ", + dense_defaults.size(), " vs. ", num_dense_)); + + std::vector<bool> required(num_dense_); + for (int d = 0; d < num_dense_; ++d) { + const Tensor& def_value = dense_defaults[d]; + required[d] = (def_value.NumElements() == 0); // No default provided. + + if (def_value.NumElements() > 0) { + OP_REQUIRES( + ctx, def_value.shape() == dense_shapes_[d], + errors::InvalidArgument("def_value[", d, "].shape() == ", + def_value.shape().ShortDebugString(), + " != dense_shapes_[", d, "] == ", + dense_shapes_[d].ShortDebugString())); + OP_REQUIRES(ctx, def_value.dtype() == dense_types_[d], + errors::InvalidArgument( + "dense_defaults[", d, "].dtype() == ", + DataTypeString(def_value.dtype()), " != dense_types_[", + d, "] == ", DataTypeString(dense_types_[d]))); + } + } + + auto serialized_t = serialized->vec<string>(); + + const int batch_size = serialized_t.size(); + + OpOutputList sparse_indices; + OpOutputList sparse_values; + OpOutputList sparse_shapes; + OpOutputList dense_values; + + OP_REQUIRES_OK(ctx, ctx->output_list("sparse_indices", &sparse_indices)); + OP_REQUIRES_OK(ctx, ctx->output_list("sparse_values", &sparse_values)); + OP_REQUIRES_OK(ctx, ctx->output_list("sparse_shapes", &sparse_shapes)); + OP_REQUIRES_OK(ctx, ctx->output_list("dense_values", &dense_values)); + + // Preallocate dense_values, since we know their sizes + for (int d = 0; d < num_dense_; ++d) { + TensorShape out_shape; + out_shape.AddDim(batch_size); + for (const int dim : dense_shapes_[d].dim_sizes()) out_shape.AddDim(dim); + Tensor* out = nullptr; + dense_values.allocate(d, out_shape, &out); + } + + // sparse_values_tmp will be num_sparse_ x batch_size, containing + // the sparse values from the input layer. after these are all + // stored, we can allocate properly sized outputs and copy data over. + // Doing it this way saves us the trouble of either performing + // deserialization twice, or alternatively storing all copies of + // the full Example protos. + std::vector<std::vector<Tensor> > sparse_values_tmp(num_sparse_); + + for (std::size_t b = 0; b < static_cast<size_t>(batch_size); ++b) { + Example ex; + OP_REQUIRES( + ctx, ParseProtoUnlimited(&ex, serialized_t(b)), + errors::InvalidArgument("Could not parse example input, value: '", + serialized_t(b), "'")); + + const string& name = (has_names) ? names_t(b) : "<unknown>"; + const Features& features = ex.features(); + const auto& feature_dict = features.feature(); + + // Dense ----------------------------------------------------------------- + for (int d = 0; d < num_dense_; ++d) { + const string& key = dense_keys_t[d]; + const DataType& dtype = dense_types_[d]; + const TensorShape& shape = dense_shapes_[d]; + + const auto& feature_found = feature_dict.find(key); + OP_REQUIRES( + ctx, (feature_found != feature_dict.end()) || !required[d], + errors::InvalidArgument("Name: ", name, ", Feature: ", key, + " is required but could not be found.")); + if (feature_found != feature_dict.end()) { + const Feature& f = feature_found->second; + bool types_match; + OP_REQUIRES_OK(ctx, CheckTypesMatch(f, dtype, &types_match)); + OP_REQUIRES( + ctx, types_match, + errors::InvalidArgument("Name: ", name, ", Feature: ", key, + ". Data types don't match. ", + "Expected type: ", DataTypeString(dtype), + " Feature is: ", f.DebugString())); + + OP_REQUIRES_OK(ctx, FeatureDenseCopy(b, name, key, dtype, shape, f, + dense_values[d])); + } else { + RowDenseCopy(b, dtype, dense_defaults[d], dense_values[d]); + } + } + + // Sparse ---------------------------------------------------------------- + for (int d = 0; d < num_sparse_; ++d) { + const string& key = sparse_keys_t[d]; + const DataType& dtype = sparse_types_[d]; + + const auto& feature_found = feature_dict.find(key); + bool feature_has_data = // Found key & data type is set + (feature_found != feature_dict.end() && + (feature_found->second.kind_case() != Feature::KIND_NOT_SET)); + if (feature_has_data) { + const Feature& f = feature_found->second; + bool types_match; + OP_REQUIRES_OK(ctx, CheckTypesMatch(f, dtype, &types_match)); + OP_REQUIRES( + ctx, types_match, + errors::InvalidArgument("Name: ", name, ", Feature: ", key, + ". Data types don't match. ", + "Expected type: ", DataTypeString(dtype), + " Feature is: ", f.DebugString())); + sparse_values_tmp[d].push_back(FeatureSparseCopy(b, key, dtype, f)); + } else { + sparse_values_tmp[d].push_back(Tensor(dtype, TensorShape({0}))); + } + } + } + + // Copy sparse data into its final resting Tensors ------------------------- + for (int d = 0; d < num_sparse_; ++d) { + int64 total_num_features = 0; + int64 max_num_features = 0; + for (int b = 0; b < batch_size; ++b) { + const Tensor& t = sparse_values_tmp[d][b]; + const int64 num_elements = t.shape().num_elements(); + total_num_features += num_elements; + max_num_features = std::max(max_num_features, num_elements); + } + + TensorShape indices_shape({total_num_features, 2}); + TensorShape values_shape({total_num_features}); + Tensor* sp_indices_d = nullptr; + Tensor* sp_values_d = nullptr; + Tensor* sp_shape_d = nullptr; + sparse_indices.allocate(d, indices_shape, &sp_indices_d); + sparse_values.allocate(d, values_shape, &sp_values_d); + sparse_shapes.allocate(d, TensorShape({2}), &sp_shape_d); + + auto shape_t = sp_shape_d->vec<int64>(); + shape_t(0) = batch_size; + shape_t(1) = max_num_features; + + int64 offset = 0; + + for (int b = 0; b < batch_size; ++b) { + const int64 num_elements = CopyIntoSparseTensor( + sparse_values_tmp[d][b], b, offset, sp_indices_d, sp_values_d); + offset += num_elements; + } + } + } + + protected: + int64 num_sparse_; + int64 num_dense_; + std::vector<DataType> sparse_types_; + std::vector<DataType> dense_types_; + std::vector<TensorShape> dense_shapes_; +}; + +REGISTER_KERNEL_BUILDER(Name("ParseExample").Device(DEVICE_CPU), + ExampleParserOp); + +} // namespace tensorflow |