diff options
Diffstat (limited to 'tensorflow/core/kernels/string_util.h')
-rw-r--r-- | tensorflow/core/kernels/string_util.h | 89 |
1 files changed, 89 insertions, 0 deletions
diff --git a/tensorflow/core/kernels/string_util.h b/tensorflow/core/kernels/string_util.h new file mode 100644 index 0000000000..d40e93ea33 --- /dev/null +++ b/tensorflow/core/kernels/string_util.h @@ -0,0 +1,89 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_STRING_UTIL_H_ +#define TENSORFLOW_CORE_KERNELS_STRING_UTIL_H_ + +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +// Enumeration for unicode encodings. Used by ops such as +// tf.strings.unicode_encode and tf.strings.unicode_decode. +// TODO(edloper): Add support for: +// UTF16, UTF32, UTF16BE, UTF32BE, UTF16LE, UTF32LE +enum class UnicodeEncoding { UTF8 }; + +// Enumeration for character units. Used by string such as +// tf.strings.length and tf.substr. +// TODO(edloper): Add support for: UTF32_CHAR, etc. +enum class CharUnit { BYTE, UTF8_CHAR }; + +// Whether or not the given byte is the trailing byte of a UTF-8/16/32 char. +inline bool IsTrailByte(char x) { return static_cast<signed char>(x) < -0x40; } + +// Sets `encoding` based on `str`. +Status ParseUnicodeEncoding(const string& str, UnicodeEncoding* encoding); + +// Sets `unit` value based on `str`. +Status ParseCharUnit(const string& str, CharUnit* unit); + +// Returns the number of Unicode characters in a UTF-8 string. +// Result may be incorrect if the input string is not valid UTF-8. +int32 UTF8StrLen(const string& string); + +// Get the next UTF8 character position starting at the given position and +// skipping the given number of characters. Position is a byte offset, and +// should never be `null`. The function return true if successful. However, if +// the end of the string is reached before the requested characters, then the +// position will point to the end of string and this function will return false. +template <typename T> +bool ForwardNUTF8CharPositions(const StringPiece in, + const T num_utf8_chars_to_shift, T* pos) { + const size_t size = in.size(); + T utf8_chars_counted = 0; + while (utf8_chars_counted < num_utf8_chars_to_shift && *pos < size) { + // move forward one utf-8 character + do { + ++*pos; + } while (IsTrailByte(in[*pos]) && *pos < size); + ++utf8_chars_counted; + } + return utf8_chars_counted == num_utf8_chars_to_shift; +} + +// Get the previous UTF8 character position starting at the given position and +// skipping the given number of characters. Position is a byte offset with a +// positive value, relative to the beginning of the string, and should never be +// `null`. The function return true if successful. However, if the beginning of +// the string is reached before the requested character, then the position will +// point to the beginning of the string and this function will return false. +template <typename T> +bool BackNUTF8CharPositions(const StringPiece in, + const T num_utf8_chars_to_shift, T* pos) { + const size_t start = 0; + T utf8_chars_counted = 0; + while (utf8_chars_counted < num_utf8_chars_to_shift && (*pos > start)) { + // move back one utf-8 character + do { + --*pos; + } while (IsTrailByte(in[*pos]) && *pos > start); + ++utf8_chars_counted; + } + return utf8_chars_counted == num_utf8_chars_to_shift; +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_STRING_UTIL_H_ |