diff options
author | 2018-09-27 10:31:36 -0700 | |
---|---|---|
committer | 2018-09-27 10:36:29 -0700 | |
commit | 334244be6864dd1dbec9bc8bb4996cc286a8e3e3 (patch) | |
tree | d379f72fe1d7a68e74d1ee27bed737775c09bfec /tensorflow/core/kernels | |
parent | 3002b10e29363854c6fc20d788bc65233fd5116f (diff) |
Add tf.strings.unicode_script, which detects the script of a unicode codepoint
based on standard ranges.
PiperOrigin-RevId: 214796357
Diffstat (limited to 'tensorflow/core/kernels')
-rw-r--r-- | tensorflow/core/kernels/BUILD | 12 | ||||
-rw-r--r-- | tensorflow/core/kernels/unicode_script_op.cc | 53 |
2 files changed, 65 insertions, 0 deletions
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 0534b1829d..0b8e9ec527 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -4431,6 +4431,7 @@ cc_library(
         ":string_strip_op",
         ":string_to_hash_bucket_op",
         ":substr_op",
+        ":unicode_script_op",
     ],
 )
 
@@ -5471,6 +5472,7 @@ filegroup(
             "batch_kernels.*",
             "regex_full_match_op.cc",
             "regex_replace_op.cc",
+            "unicode_script_op.cc",
             # Ops that are inherently incompatible with Android (e.g. tied to x86 platform).
             "mkl_*",
             "xsmm_*",
@@ -6565,6 +6567,16 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "unicode_script_op",
+    srcs = ["unicode_script_op.cc"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:string_ops_op_lib",
+        "@icu//:common",
+    ],
+)
+
 # -----------------------------------------------------------------------------
 # Google-internal targets. These must be at the end for syncrepo.
 
diff --git a/tensorflow/core/kernels/unicode_script_op.cc b/tensorflow/core/kernels/unicode_script_op.cc
new file mode 100644
index 0000000000..085e397eba
--- /dev/null
+++ b/tensorflow/core/kernels/unicode_script_op.cc
@@ -0,0 +1,53 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "unicode/errorcode.h"  // TF:icu
+#include "unicode/uscript.h"  // TF:icu
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+
+// Kernel that maps each element of an int32 tensor through ICU's
+// uscript_getScript() and writes the resulting script code into an int32
+// output tensor of the same shape.
+class UnicodeScriptOp : public OpKernel {
+ public:
+  explicit UnicodeScriptOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  // Reads the "input" tensor, allocates "output" with the same shape, and
+  // fills it element by element. An element for which ICU reports an error is
+  // written as -1. Failure to fetch the input or to allocate the output is
+  // reported through OP_REQUIRES_OK.
+  void Compute(OpKernelContext* context) override {
+    const Tensor* input_tensor = nullptr;
+    OP_REQUIRES_OK(context, context->input("input", &input_tensor));
+    const auto& input_flat = input_tensor->flat<int32>();
+
+    Tensor* output_tensor = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output("output", input_tensor->shape(),
+                                            &output_tensor));
+    auto output_flat = output_tensor->flat<int32>();
+
+    icu::ErrorCode status;
+    // Index with Eigen's own index type rather than `int`: the flat size can
+    // exceed INT_MAX on 64-bit builds, and overflowing a signed `int` counter
+    // is undefined behaviour.
+    const Eigen::Index num_elements = input_flat.size();
+    for (Eigen::Index i = 0; i < num_elements; ++i) {
+      const UScriptCode script_code = uscript_getScript(input_flat(i), status);
+      if (status.isSuccess()) {
+        output_flat(i) = script_code;
+      } else {
+        // -1 is the value ICU names USCRIPT_INVALID_CODE.
+        output_flat(i) = -1;
+        // Clear the failure so it is not carried into the next element's call.
+        status.reset();
+      }
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("UnicodeScript").Device(DEVICE_CPU),
+                        UnicodeScriptOp);
+
+}  // namespace tensorflow