diff options
Diffstat (limited to 'src/utils')
-rw-r--r-- | src/utils/SkUTF.cpp | 253 | ||||
-rw-r--r-- | src/utils/SkUTF.h | 68 |
2 files changed, 321 insertions, 0 deletions
// =============================================================================
// Combined, re-formatted listing of the two files added by this change.
// The header is listed first so that the listing is a single self-contained
// translation unit; the original diff headers are preserved as comments.
// =============================================================================

// -----------------------------------------------------------------------------
// diff --git a/src/utils/SkUTF.h b/src/utils/SkUTF.h
// new file mode 100644
// index 0000000000..385102aadb
// -----------------------------------------------------------------------------
// Copyright 2018 Google LLC.
// Use of this source code is governed by a BSD-style license that can be found in the LICENSE file.
#ifndef SkUTF_DEFINED
#define SkUTF_DEFINED

#include <cstddef>
#include <cstdint>

typedef int32_t SkUnichar;

namespace SkUTF {

/** Given a sequence of UTF-8 bytes, return the number of unicode codepoints.
    If the sequence is invalid UTF-8 (bad leading byte, missing continuation
    byte, or a sequence truncated by `byteLength`), or if `utf8` is null,
    return -1.
*/
int CountUTF8(const char* utf8, size_t byteLength);

/** Given a sequence of aligned UTF-16 characters in machine-endian form,
    return the number of unicode codepoints.  If the sequence is invalid
    UTF-16 (unpaired surrogate), if `utf16` is null or not 2-byte aligned, or
    if `byteLength` is odd, return -1.
*/
int CountUTF16(const uint16_t* utf16, size_t byteLength);

/** Given a sequence of aligned UTF-32 characters in machine-endian form,
    return the number of unicode codepoints.  If any value has one of its top
    eight bits set, if `utf32` is not 4-byte aligned, if `byteLength` is not a
    multiple of four, if the count does not fit in an int, or if `utf32` is
    null while `byteLength` is non-zero, return -1.
*/
int CountUTF32(const int32_t* utf32, size_t byteLength);

/** Given a sequence of UTF-8 bytes, return the first unicode codepoint.
    The pointer will be incremented to point at the next codepoint's start.  If
    invalid UTF-8 is encountered (or *ptr is null or not before `end`), set
    *ptr to end and return -1.  If `ptr` or `end` is null, return -1 without
    writing through `ptr`.
*/
SkUnichar NextUTF8(const char** ptr, const char* end);

/** Given a sequence of aligned UTF-16 characters in machine-endian form,
    return the first unicode codepoint.  The pointer will be incremented to
    point at the next codepoint's start.  If invalid UTF-16 is encountered,
    set *ptr to end and return -1.  If `ptr` itself is null, return -1.
*/
SkUnichar NextUTF16(const uint16_t** ptr, const uint16_t* end);

/** Given a sequence of aligned UTF-32 characters in machine-endian form,
    return the first unicode codepoint.  The pointer will be incremented to
    point at the next codepoint's start.  If invalid UTF-32 is encountered,
    set *ptr to end and return -1.  If `ptr` or `end` is null, return -1
    without writing through `ptr`.
*/
SkUnichar NextUTF32(const int32_t** ptr, const int32_t* end);

constexpr unsigned kMaxBytesInUTF8Sequence = 4;

/** Convert the unicode codepoint into UTF-8.  If `utf8` is non-null, place the
    result in that array.  Return the number of bytes in the result.  If `utf8`
    is null, simply return the number of bytes that would be used.  For invalid
    unicode codepoints (negative or greater than 0x10FFFF), return 0.
*/
size_t ToUTF8(SkUnichar uni, char utf8[kMaxBytesInUTF8Sequence] = nullptr);

/** Convert the unicode codepoint into UTF-16.  If `utf16` is non-null, place
    the result in that array.  Return the number of UTF-16 code units in the
    result (1 or 2).  If `utf16` is null, simply return the number of code
    units that would be used.  For invalid unicode codepoints (negative or
    greater than 0x10FFFF), return 0.
*/
size_t ToUTF16(SkUnichar uni, uint16_t utf16[2] = nullptr);

}  // namespace SkUTF

#endif  // SkUTF_DEFINED

// -----------------------------------------------------------------------------
// diff --git a/src/utils/SkUTF.cpp b/src/utils/SkUTF.cpp
// new file mode 100644
// index 0000000000..0670ae02a4
// -----------------------------------------------------------------------------
// Copyright 2018 Google LLC.
// Use of this source code is governed by a BSD-style license that can be found in the LICENSE file.

// #include "SkUTF.h"   // The header's contents are listed directly above in this combined listing.

#include <climits>

/** Left-shift performed on the unsigned representation, so that shifting a bit
    into (or past) the sign position never operates on a signed value.
*/
static constexpr inline int32_t left_shift(int32_t value, int32_t shift) {
    return (int32_t) ((uint32_t) value << shift);
}

/** True iff the low bit of x is clear (x is a multiple of 2). */
template <typename T> static constexpr bool is_align2(T x) { return 0 == (x & 1); }

/** True iff the two low bits of x are clear (x is a multiple of 4). */
template <typename T> static constexpr bool is_align4(T x) { return 0 == (x & 3); }

/** True iff c is in [0xD800, 0xDBFF]. */
static constexpr inline bool utf16_is_high_surrogate(uint16_t c) { return (c & 0xFC00) == 0xD800; }

/** True iff c is in [0xDC00, 0xDFFF]. */
static constexpr inline bool utf16_is_low_surrogate(uint16_t c) { return (c & 0xFC00) == 0xDC00; }

/** @returns -1  iff invalid UTF8 byte,
              0  iff UTF8 continuation byte,
              1  iff ASCII byte,
              2  iff leading byte of 2-byte sequence,
              3  iff leading byte of 3-byte sequence, and
              4  iff leading byte of 4-byte sequence.
    I.e.: if return value > 0, then gives length of sequence.
*/
static int utf8_byte_type(uint8_t c) {
    if (c < 0x80) {
        return 1;
    }
    if (c < 0xC0) {
        return 0;
    }
    if (c >= 0xF5 || (c & 0xFE) == 0xC0) {  // "octet values c0, c1, f5 to ff never appear"
        return -1;
    }
    // Remaining leading bytes, classified by range rather than by shifting a
    // signed constant: 0xC2..0xDF -> 2, 0xE0..0xEF -> 3, 0xF0..0xF4 -> 4.
    if (c < 0xE0) {
        return 2;
    }
    if (c < 0xF0) {
        return 3;
    }
    return 4;
}

static bool utf8_type_is_valid_leading_byte(int type) { return type > 0; }

static bool utf8_byte_is_continuation(uint8_t c) { return utf8_byte_type(c) == 0; }

////////////////////////////////////////////////////////////////////////////////

// NOTE(review): only the leading-byte class and the presence of continuation
// bytes are checked here; overlong 3/4-byte forms and encoded surrogates are
// not rejected.
int SkUTF::CountUTF8(const char* utf8, size_t byteLength) {
    if (!utf8) {
        return -1;
    }
    int count = 0;
    const char* stop = utf8 + byteLength;
    while (utf8 < stop) {
        int type = utf8_byte_type(*(const uint8_t*)utf8);
        // Compare against the remaining length instead of forming `utf8 + type`,
        // which could point past one-past-the-end of the buffer.
        if (!utf8_type_is_valid_leading_byte(type) || type > stop - utf8) {
            return -1;  // Bad leading byte, or sequence extends beyond end.
        }
        while (type-- > 1) {
            ++utf8;
            if (!utf8_byte_is_continuation(*(const uint8_t*)utf8)) {
                return -1;
            }
        }
        ++utf8;
        ++count;
    }
    return count;
}

int SkUTF::CountUTF16(const uint16_t* utf16, size_t byteLength) {
    if (!utf16 || !is_align2(intptr_t(utf16)) || !is_align2(byteLength)) {
        return -1;
    }
    const uint16_t* src = utf16;
    const uint16_t* stop = src + (byteLength >> 1);
    int count = 0;
    while (src < stop) {
        uint16_t c = *src++;
        if (utf16_is_low_surrogate(c)) {
            return -1;  // A low surrogate may only follow a high surrogate.
        }
        if (utf16_is_high_surrogate(c)) {
            if (src >= stop) {
                return -1;  // Truncated surrogate pair.
            }
            c = *src++;
            if (!utf16_is_low_surrogate(c)) {
                return -1;
            }
        }
        count += 1;
    }
    return count;
}

int SkUTF::CountUTF32(const int32_t* utf32, size_t byteLength) {
    if (!is_align4(intptr_t(utf32)) || !is_align4(byteLength) || byteLength >> 2 > INT_MAX) {
        return -1;
    }
    // A null pointer passes the alignment test above; refuse to read through it.
    // (A null pointer with zero length still yields a count of 0.)
    if (!utf32 && byteLength != 0) {
        return -1;
    }
    const uint32_t kInvalidUnicharMask = 0xFF000000;  // unichar fits in 24 bits
    const uint32_t* ptr = (const uint32_t*)utf32;
    const uint32_t* stop = ptr + (byteLength >> 2);
    while (ptr < stop) {
        if (*ptr & kInvalidUnicharMask) {
            return -1;
        }
        ptr += 1;
    }
    return (int)(byteLength >> 2);
}

/** Shared failure path of the Next*() functions: park *ptr at `end` and return -1.
    `ptr` must be non-null.
*/
template <typename T>
static SkUnichar next_fail(const T** ptr, const T* end) {
    *ptr = end;
    return -1;
}

// NOTE(review): as with CountUTF8, overlong forms and encoded surrogates are
// decoded rather than rejected.
SkUnichar SkUTF::NextUTF8(const char** ptr, const char* end) {
    if (!ptr || !end) {
        return -1;
    }
    const uint8_t* p = (const uint8_t*)*ptr;
    if (!p || p >= (const uint8_t*)end) {
        return next_fail(ptr, end);
    }
    int c = *p;
    // Move the leading byte into the top byte so that its high bits can be
    // tested one at a time via the sign bit.
    int hic = left_shift(c, 24);

    if (!utf8_type_is_valid_leading_byte(utf8_byte_type(*p))) {
        return next_fail(ptr, end);
    }
    if (hic < 0) {
        uint32_t mask = (uint32_t)~0x3F;
        hic = left_shift(hic, 1);
        do {
            ++p;
            // check before reading off end of array.
            if (p >= (const uint8_t*)end) {
                return next_fail(ptr, end);
            }
            uint8_t nextByte = *p;
            if (!utf8_byte_is_continuation(nextByte)) {
                return next_fail(ptr, end);
            }
            c = (c << 6) | (nextByte & 0x3F);
            mask <<= 5;
        } while ((hic = left_shift(hic, 1)) < 0);
        c &= ~mask;  // Strip the length-marker bits contributed by the leading byte.
    }
    *ptr = (char*)p + 1;
    return c;
}

SkUnichar SkUTF::NextUTF16(const uint16_t** ptr, const uint16_t* end) {
    if (!ptr) {
        return -1;  // Nothing to update; next_fail() would dereference null.
    }
    if (!end) {
        return next_fail(ptr, end);
    }
    const uint16_t* src = *ptr;
    if (!src || src + 1 > end || !is_align2(intptr_t(src))) {
        return next_fail(ptr, end);
    }
    uint16_t c = *src++;
    SkUnichar result = c;
    if (utf16_is_low_surrogate(c)) {
        return next_fail(ptr, end);  // srcPtr should never point at low surrogate.
    }
    if (utf16_is_high_surrogate(c)) {
        if (src + 1 > end) {
            return next_fail(ptr, end);  // Truncated string.
        }
        uint16_t low = *src++;
        if (!utf16_is_low_surrogate(low)) {
            return next_fail(ptr, end);
        }
        /*
        [paraphrased from wikipedia]
        Take the high surrogate and subtract 0xD800, then multiply by 0x400.
        Take the low surrogate and subtract 0xDC00.  Add these two results
        together, and finally add 0x10000 to get the final decoded codepoint.

        unicode = (high - 0xD800) * 0x400 + low - 0xDC00 + 0x10000
        unicode = (high * 0x400) - (0xD800 * 0x400) + low - 0xDC00 + 0x10000
        unicode = (high << 10) - (0xD800 << 10) + low - 0xDC00 + 0x10000
        unicode = (high << 10) + low - ((0xD800 << 10) + 0xDC00 - 0x10000)
        */
        result = (result << 10) + (SkUnichar)low - ((0xD800 << 10) + 0xDC00 - 0x10000);
    }
    *ptr = src;
    return result;
}

SkUnichar SkUTF::NextUTF32(const int32_t** ptr, const int32_t* end) {
    if (!ptr || !end) {
        return -1;
    }
    const int32_t* s = *ptr;
    if (!s || s + 1 > end || !is_align4(intptr_t(s))) {
        return next_fail(ptr, end);
    }
    int32_t value = *s;
    const uint32_t kInvalidUnicharMask = 0xFF000000;  // unichar fits in 24 bits
    if (value & kInvalidUnicharMask) {
        return next_fail(ptr, end);
    }
    *ptr = s + 1;
    return value;
}

size_t SkUTF::ToUTF8(SkUnichar uni, char utf8[SkUTF::kMaxBytesInUTF8Sequence]) {
    if ((uint32_t)uni > 0x10FFFF) {  // Also rejects negative values.
        return 0;
    }
    if (uni <= 127) {
        if (utf8) {
            *utf8 = (char)uni;
        }
        return 1;
    }
    // Generate continuation bytes least-significant first into tmp, then
    // write them out in reverse behind the leading byte.
    char tmp[4];
    char* p = tmp;
    size_t count = 1;
    while (uni > 0x7F >> count) {
        *p++ = (char)(0x80 | (uni & 0x3F));
        uni >>= 6;
        count += 1;
    }
    if (utf8) {
        p = tmp;
        utf8 += count;
        while (p < tmp + count - 1) {
            *--utf8 = *p++;
        }
        // Leading byte: `count` high one-bits followed by the remaining payload.
        *--utf8 = (char)(~(0xFF >> count) | uni);
    }
    return count;
}

size_t SkUTF::ToUTF16(SkUnichar uni, uint16_t utf16[2]) {
    if ((uint32_t)uni > 0x10FFFF) {  // Also rejects negative values.
        return 0;
    }
    int extra = (uni > 0xFFFF);  // 1 iff a surrogate pair is required.
    if (utf16) {
        if (extra) {
            // (0xD800 - 64) + (uni >> 10) == 0xD800 + ((uni - 0x10000) >> 10)
            utf16[0] = (uint16_t)((0xD800 - 64) + (uni >> 10));
            utf16[1] = (uint16_t)(0xDC00 | (uni & 0x3FF));
        } else {
            utf16[0] = (uint16_t)uni;
        }
    }
    return 1 + extra;
}