diff options
author | 2018-01-09 13:32:17 -0800 | |
---|---|---|
committer | 2018-01-09 13:36:12 -0800 | |
commit | 3e852d462aaba446f62f76007405c0794a6087b9 (patch) | |
tree | 790dc1747aa319facc98f18450a94015f83a9a89 /tensorflow/core/ops/string_ops.cc | |
parent | 55cd506ab8220c6a1075965eb7839cac4af1db3e (diff) |
Automated g4 rollback of changelist 180691955
PiperOrigin-RevId: 181365803
Diffstat (limited to 'tensorflow/core/ops/string_ops.cc')
-rw-r--r-- | tensorflow/core/ops/string_ops.cc | 263 |
1 file changed, 10 insertions, 253 deletions
diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc index aebd14c7e5..8beb28de0a 100644 --- a/tensorflow/core/ops/string_ops.cc +++ b/tensorflow/core/ops/string_ops.cc @@ -27,67 +27,20 @@ REGISTER_OP("StringToHashBucketFast") .Input("input: string") .Output("output: int64") .Attr("num_buckets: int >= 1") - .SetShapeFn(shape_inference::UnchangedShape) - .Doc(R"doc( -Converts each string in the input Tensor to its hash mod by a number of buckets. - -The hash function is deterministic on the content of the string within the -process and will never change. However, it is not suitable for cryptography. -This function may be used when CPU time is scarce and inputs are trusted or -unimportant. There is a risk of adversaries constructing inputs that all hash -to the same bucket. To prevent this problem, use a strong hash function with -`tf.string_to_hash_bucket_strong`. - -input: The strings to assign a hash bucket. -num_buckets: The number of buckets. -output: A Tensor of the same shape as the input `string_tensor`. -)doc"); + .SetShapeFn(shape_inference::UnchangedShape); REGISTER_OP("StringToHashBucketStrong") .Input("input: string") .Output("output: int64") .Attr("num_buckets: int >= 1") .Attr("key: list(int)") - .SetShapeFn(shape_inference::UnchangedShape) - .Doc(R"doc( -Converts each string in the input Tensor to its hash mod by a number of buckets. - -The hash function is deterministic on the content of the string within the -process. The hash function is a keyed hash function, where attribute `key` -defines the key of the hash function. `key` is an array of 2 elements. - -A strong hash is important when inputs may be malicious, e.g. URLs with -additional components. Adversaries could try to make their inputs hash to the -same bucket for a denial-of-service attack or to skew the results. A strong -hash prevents this by making it difficult, if not infeasible, to compute inputs -that hash to the same bucket. 
This comes at a cost of roughly 4x higher compute -time than `tf.string_to_hash_bucket_fast`. - -input: The strings to assign a hash bucket. -num_buckets: The number of buckets. -key: The key for the keyed hash function passed as a list of two uint64 - elements. -output: A Tensor of the same shape as the input `string_tensor`. -)doc"); + .SetShapeFn(shape_inference::UnchangedShape); REGISTER_OP("StringToHashBucket") .Input("string_tensor: string") .Output("output: int64") .Attr("num_buckets: int >= 1") - .SetShapeFn(shape_inference::UnchangedShape) - .Doc(R"doc( -Converts each string in the input Tensor to its hash mod by a number of buckets. - -The hash function is deterministic on the content of the string within the -process. - -Note that the hash function may change from time to time. -This functionality will be deprecated and it's recommended to use -`tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`. - -num_buckets: The number of buckets. -output: A Tensor of the same shape as the input `string_tensor`. -)doc"); + .SetShapeFn(shape_inference::UnchangedShape); REGISTER_OP("ReduceJoin") .Input("inputs: string") @@ -95,41 +48,7 @@ REGISTER_OP("ReduceJoin") .Attr("keep_dims: bool = false") .Attr("separator: string = ''") .Output("output: string") - .SetShapeFn(shape_inference::ReductionShape) - .Doc(R"doc( -Joins a string Tensor across the given dimensions. - -Computes the string join across dimensions in the given string Tensor of shape -`[d_0, d_1, ..., d_n-1]`. Returns a new Tensor created by joining the input -strings with the given separator (default: empty string). Negative indices are -counted backwards from the end, with `-1` being equivalent to `n - 1`. 
- -For example: - -```python -# tensor `a` is [["a", "b"], ["c", "d"]] -tf.reduce_join(a, 0) ==> ["ac", "bd"] -tf.reduce_join(a, 1) ==> ["ab", "cd"] -tf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> ["ac", "bd"] -tf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> ["ab", "cd"] -tf.reduce_join(a, 0, keep_dims=True) ==> [["ac", "bd"]] -tf.reduce_join(a, 1, keep_dims=True) ==> [["ab"], ["cd"]] -tf.reduce_join(a, 0, separator=".") ==> ["a.c", "b.d"] -tf.reduce_join(a, [0, 1]) ==> ["acbd"] -tf.reduce_join(a, [1, 0]) ==> ["abcd"] -tf.reduce_join(a, []) ==> ["abcd"] -``` - -inputs: The input to be joined. All reduced indices must have non-zero size. -reduction_indices: The dimensions to reduce over. Dimensions are reduced in the - order specified. Omitting `reduction_indices` is equivalent to passing - `[n-1, n-2, ..., 0]`. Negative indices from `-n` to `-1` are supported. -keep_dims: If `True`, retain reduced dimensions with length `1`. -separator: The separator to use when joining. - -output: Has shape equal to that of the input with reduced dimensions removed or - set to `1` depending on `keep_dims`. -)doc"); + .SetShapeFn(shape_inference::ReductionShape); REGISTER_OP("AsString") .Input("input: T") @@ -140,22 +59,7 @@ REGISTER_OP("AsString") .Attr("shortest: bool = false") .Attr("width: int = -1") .Attr("fill: string = ''") - .SetShapeFn(shape_inference::UnchangedShape) - .Doc(R"doc( -Converts each entry in the given tensor to strings. Supports many numeric -types and boolean. - -precision: The post-decimal precision to use for floating point numbers. - Only used if precision > -1. -scientific: Use scientific notation for floating point numbers. -shortest: Use shortest representation (either scientific or standard) for - floating point numbers. -width: Pad pre-decimal numbers to this width. - Applies to both floating point and integer numbers. - Only used if width > -1. -fill: The value to pad if width > -1. If empty, pads with spaces. - Another typical value is '0'. 
String cannot be longer than 1 character. -)doc"); + .SetShapeFn(shape_inference::UnchangedShape); REGISTER_OP("StringJoin") .Input("inputs: N * string") @@ -185,16 +89,7 @@ REGISTER_OP("StringJoin") } c->set_output(0, out); return Status::OK(); - }) - .Doc(R"doc( -Joins the strings in the given list of string tensors into one tensor; -with the given separator (default is an empty separator). - -inputs: A list of string tensors. The tensors must all have the same shape, - or be scalars. Scalars may be mixed in; these will be broadcast to the shape - of non-scalar inputs. -separator: string, an optional join separator. -)doc"); + }); REGISTER_OP("StringSplit") .Input("input: string") @@ -212,74 +107,18 @@ REGISTER_OP("StringSplit") c->set_output(1, c->Vector(InferenceContext::kUnknownDim)); c->set_output(2, c->Vector(2)); return Status::OK(); - }) - .Doc(R"doc( -Split elements of `input` based on `delimiter` into a `SparseTensor`. - -Let N be the size of source (typically N will be the batch size). Split each -element of `input` based on `delimiter` and return a `SparseTensor` -containing the splitted tokens. Empty tokens are ignored. - -`delimiter` can be empty, or a string of split characters. If `delimiter` is an - empty string, each element of `input` is split into individual single-byte - character strings, including splitting of UTF-8 multibyte sequences. Otherwise - every character of `delimiter` is a potential split point. - -For example: - N = 2, input[0] is 'hello world' and input[1] is 'a b c', then the output - will be - - indices = [0, 0; - 0, 1; - 1, 0; - 1, 1; - 1, 2] - shape = [2, 3] - values = ['hello', 'world', 'a', 'b', 'c'] - -input: 1-D. Strings to split. -delimiter: 0-D. Delimiter characters (bytes), or empty string. -skip_empty: A `bool`. If `True`, skip the empty strings from the result. -indices: A dense matrix of int64 representing the indices of the sparse tensor. -values: A vector of strings corresponding to the splited values. 
-shape: a length-2 vector of int64 representing the shape of the sparse - tensor, where the first value is N and the second value is the maximum number - of tokens in a single input entry. -)doc"); + }); REGISTER_OP("EncodeBase64") .Input("input: string") .Output("output: string") .Attr("pad: bool = false") - .SetShapeFn(shape_inference::UnchangedShape) - .Doc(R"doc( -Encode strings into web-safe base64 format. - -Refer to the following article for more information on base64 format: -en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '=' at the -end so that the encoded has length multiple of 4. See Padding section of the -link above. - -Web-safe means that the encoder uses - and _ instead of + and /. - -input: Strings to be encoded. -output: Input strings encoded in base64. -pad: Bool whether padding is applied at the ends. -)doc"); + .SetShapeFn(shape_inference::UnchangedShape); REGISTER_OP("DecodeBase64") .Input("input: string") .Output("output: string") - .SetShapeFn(shape_inference::UnchangedShape) - .Doc(R"doc( -Decode web-safe base64-encoded strings. - -Input may or may not have padding at the end. See EncodeBase64 for padding. -Web-safe means that input must use - and _ instead of + and /. - -input: Base64 strings to decode. -output: Decoded strings. -)doc"); + .SetShapeFn(shape_inference::UnchangedShape); REGISTER_OP("Substr") .Input("input: string") @@ -306,88 +145,6 @@ REGISTER_OP("Substr") // c->input(0) is the ShapeHandle to input strings // BroadcastBinaryOpShapeFn infers shape from c->input(0) and c->input(1). return shape_inference::BroadcastBinaryOpShapeFn(c); - }) - .Doc(R"doc( -Return substrings from `Tensor` of strings. - -For each string in the input `Tensor`, creates a substring starting at index -`pos` with a total length of `len`. - -If `len` defines a substring that would extend beyond the length of the input -string, then as many characters as possible are used. 
- -If `pos` is negative or specifies a character index larger than any of the input -strings, then an `InvalidArgumentError` is thrown. - -`pos` and `len` must have the same shape, otherwise a `ValueError` is thrown on -Op creation. - -*NOTE*: `Substr` supports broadcasting up to two dimensions. More about -broadcasting -[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) - ---- - -Examples - -Using scalar `pos` and `len`: - -```python -input = [b'Hello', b'World'] -position = 1 -length = 3 - -output = [b'ell', b'orl'] -``` - -Using `pos` and `len` with same shape as `input`: - -```python -input = [[b'ten', b'eleven', b'twelve'], - [b'thirteen', b'fourteen', b'fifteen'], - [b'sixteen', b'seventeen', b'eighteen']] -position = [[1, 2, 3], - [1, 2, 3], - [1, 2, 3]] -length = [[2, 3, 4], - [4, 3, 2], - [5, 5, 5]] - -output = [[b'en', b'eve', b'lve'], - [b'hirt', b'urt', b'te'], - [b'ixtee', b'vente', b'hteen']] -``` - -Broadcasting `pos` and `len` onto `input`: - -``` -input = [[b'ten', b'eleven', b'twelve'], - [b'thirteen', b'fourteen', b'fifteen'], - [b'sixteen', b'seventeen', b'eighteen'], - [b'nineteen', b'twenty', b'twentyone']] -position = [1, 2, 3] -length = [1, 2, 3] - -output = [[b'e', b'ev', b'lve'], - [b'h', b'ur', b'tee'], - [b'i', b've', b'hte'], - [b'i', b'en', b'nty']] -``` - -Broadcasting `input` onto `pos` and `len`: - -``` -input = b'thirteen' -position = [1, 5, 7] -length = [3, 2, 1] - -output = [b'hir', b'ee', b'n'] -``` - -input: Tensor of strings -pos: Scalar defining the position of first character in each substring -len: Scalar defining the number of characters to include in each substring -output: Tensor of substrings -)doc"); + }); } // namespace tensorflow |