diff options
author | 2018-09-27 10:31:36 -0700 | |
---|---|---|
committer | 2018-09-27 10:36:29 -0700 | |
commit | 334244be6864dd1dbec9bc8bb4996cc286a8e3e3 (patch) | |
tree | d379f72fe1d7a68e74d1ee27bed737775c09bfec /tensorflow/core | |
parent | 3002b10e29363854c6fc20d788bc65233fd5116f (diff) |
Add tf.strings.unicode_script, which detects the script of a unicode codepoint
based on standard ranges.
PiperOrigin-RevId: 214796357
Diffstat (limited to 'tensorflow/core')
-rw-r--r-- | tensorflow/core/api_def/base_api/api_def_UnicodeScript.pbtxt | 28 | ||||
-rw-r--r-- | tensorflow/core/api_def/python_api/api_def_UnicodeScript.pbtxt | 6 | ||||
-rw-r--r-- | tensorflow/core/kernels/BUILD | 12 | ||||
-rw-r--r-- | tensorflow/core/kernels/unicode_script_op.cc | 53 | ||||
-rw-r--r-- | tensorflow/core/ops/string_ops.cc | 5 |
5 files changed, 104 insertions, 0 deletions
diff --git a/tensorflow/core/api_def/base_api/api_def_UnicodeScript.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnicodeScript.pbtxt new file mode 100644 index 0000000000..7898fe8d6b --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_UnicodeScript.pbtxt @@ -0,0 +1,28 @@ +op { + graph_op_name: "UnicodeScript" + endpoint { + name: "UnicodeScript" + } + in_arg { + name: "input" + description: <<END +A Tensor of int32 Unicode code points. +END + } + out_arg { + name: "output" + description: <<END +A Tensor of int32 script codes corresponding to each input code point. +END + } + summary: <<END +Determine the script codes of a given tensor of Unicode integer code points. +END + description: <<END +This operation converts Unicode code points to script codes corresponding to +each code point. Script codes correspond to International Components for +Unicode (ICU) UScriptCode values. See http://icu-project.org/apiref/icu4c/uscript_8h.html. +Returns -1 (USCRIPT_INVALID_CODE) for invalid codepoints. Output shape will +match input shape. +END +} diff --git a/tensorflow/core/api_def/python_api/api_def_UnicodeScript.pbtxt b/tensorflow/core/api_def/python_api/api_def_UnicodeScript.pbtxt new file mode 100644 index 0000000000..a884a46143 --- /dev/null +++ b/tensorflow/core/api_def/python_api/api_def_UnicodeScript.pbtxt @@ -0,0 +1,6 @@ +op { + graph_op_name: "UnicodeScript" + endpoint { + name: "strings.unicode_script" + } +} diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 0534b1829d..0b8e9ec527 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -4431,6 +4431,7 @@ cc_library( ":string_strip_op", ":string_to_hash_bucket_op", ":substr_op", + ":unicode_script_op", ], ) @@ -5471,6 +5472,7 @@ filegroup( "batch_kernels.*", "regex_full_match_op.cc", "regex_replace_op.cc", + "unicode_script_op.cc", # Ops that are inherently incompatible with Android (e.g. tied to x86 platform). 
"mkl_*", "xsmm_*", @@ -6565,6 +6567,16 @@ tf_kernel_library( ], ) +tf_kernel_library( + name = "unicode_script_op", + srcs = ["unicode_script_op.cc"], + deps = [ + "//tensorflow/core:framework", + "//tensorflow/core:string_ops_op_lib", + "@icu//:common", + ], +) + # ----------------------------------------------------------------------------- # Google-internal targets. These must be at the end for syncrepo. diff --git a/tensorflow/core/kernels/unicode_script_op.cc b/tensorflow/core/kernels/unicode_script_op.cc new file mode 100644 index 0000000000..085e397eba --- /dev/null +++ b/tensorflow/core/kernels/unicode_script_op.cc @@ -0,0 +1,53 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#include "unicode/errorcode.h"  // TF:icu
+#include "unicode/uscript.h"  // TF:icu
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+// Kernel for "UnicodeScript": maps int32 code points to ICU script codes.
+class UnicodeScriptOp : public OpKernel {
+ public:
+  explicit UnicodeScriptOp(OpKernelConstruction* context) : OpKernel(context) {}
+  // Fills "output" (same shape as "input") with one script code per element.
+  void Compute(OpKernelContext* context) override {
+    const Tensor* input_tensor;
+    OP_REQUIRES_OK(context, context->input("input", &input_tensor));
+    const auto& input_flat = input_tensor->flat<int32>();
+
+    Tensor* output_tensor = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output("output", input_tensor->shape(),
+                                            &output_tensor));
+    auto output_flat = output_tensor->flat<int32>();
+    // size() returns Eigen's wide index type; an int counter could overflow.
+    icu::ErrorCode status;
+    for (int64 i = 0; i < input_flat.size(); ++i) {
+      UScriptCode script_code = uscript_getScript(input_flat(i), status);
+      if (status.isSuccess()) {
+        output_flat(i) = script_code;
+      } else {
+        output_flat(i) = -1;  // USCRIPT_INVALID_CODE
+        status.reset();  // Clear the failure before the next element.
+      }
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("UnicodeScript").Device(DEVICE_CPU),
+                        UnicodeScriptOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc
index da1d2a6432..b4fbde54d9 100644
--- a/tensorflow/core/ops/string_ops.cc
+++ b/tensorflow/core/ops/string_ops.cc
@@ -244,4 +244,9 @@ REGISTER_OP("Substr")
       return shape_inference::BroadcastBinaryOpShapeFn(c);
     });
 
+REGISTER_OP("UnicodeScript")
+    .Input("input: int32")
+    .Output("output: int32")
+    .SetShapeFn(shape_inference::UnchangedShape);
+
 }  // namespace tensorflow |