about | summary | refs | log | tree | commit | diff | homepage
path: root/src/utils/SkUTF.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/utils/SkUTF.cpp')
-rw-r--r-- src/utils/SkUTF.cpp 253
1 files changed, 253 insertions, 0 deletions
diff --git a/src/utils/SkUTF.cpp b/src/utils/SkUTF.cpp
new file mode 100644
index 0000000000..0670ae02a4
--- /dev/null
+++ b/src/utils/SkUTF.cpp
@@ -0,0 +1,253 @@
+// Copyright 2018 Google LLC.
+// Use of this source code is governed by a BSD-style license that can be found in the LICENSE file.
+
+#include "SkUTF.h"
+
+#include <climits>
+
/** Shifts value left by shift bits.
    The shift is performed on the unsigned representation and the result is
    converted back to int32_t, so bits may move into or past the sign bit
    without a signed left shift ever taking place.
*/
static constexpr inline int32_t left_shift(int32_t value, int32_t shift) {
    return static_cast<int32_t>(static_cast<uint32_t>(value) << shift);
}
+
/** @returns true iff the lowest bit of x is clear (x is 2-aligned). */
template <typename T> static constexpr bool is_align2(T x) { return (x & 1) == 0; }
+
/** @returns true iff the two lowest bits of x are clear (x is 4-aligned). */
template <typename T> static constexpr bool is_align4(T x) { return (x & 3) == 0; }
+
/** @returns true iff c is a UTF-16 high (leading) surrogate, i.e. in 0xD800..0xDBFF. */
static constexpr inline bool utf16_is_high_surrogate(uint16_t c) {
    return 0xD800 <= c && c <= 0xDBFF;
}
+
/** @returns true iff c is a UTF-16 low (trailing) surrogate, i.e. in 0xDC00..0xDFFF. */
static constexpr inline bool utf16_is_low_surrogate(uint16_t c) {
    return 0xDC00 <= c && c <= 0xDFFF;
}
+
/** Classifies a single byte of a UTF-8 stream.
    @returns -1 iff c never appears in UTF-8 (0xC0, 0xC1, 0xF5..0xFF),
              0 iff c is a continuation byte (0x80..0xBF),
              1 iff c is an ASCII byte (0x00..0x7F),
              2 iff c leads a 2-byte sequence (0xC2..0xDF),
              3 iff c leads a 3-byte sequence (0xE0..0xEF), and
              4 iff c leads a 4-byte sequence (0xF0..0xF4).
    I.e.: a return value > 0 is the length of the sequence that c starts.
*/
static int utf8_byte_type(uint8_t c) {
    if (c <= 0x7F) {
        return 1;
    }
    if (c <= 0xBF) {
        return 0;
    }
    if (c <= 0xC1 || c >= 0xF5) {  // "octet values c0, c1, f5 to ff never appear"
        return -1;
    }
    // Only leading bytes 0xC2..0xF4 remain; the range decides the sequence length.
    if (c <= 0xDF) {
        return 2;
    }
    if (c <= 0xEF) {
        return 3;
    }
    return 4;
}
/** @returns true iff type, as produced by utf8_byte_type(), denotes a byte that may start
    a sequence (ASCII or a multi-byte leader), i.e. type is positive.
*/
static bool utf8_type_is_valid_leading_byte(int type) {
    return type >= 1;
}
+
/** @returns true iff c is a UTF-8 continuation byte, i.e. has the bit pattern 10xxxxxx
    (0x80..0xBF).
*/
static bool utf8_byte_is_continuation(uint8_t c) {
    return (c & 0xC0) == 0x80;
}
+
+////////////////////////////////////////////////////////////////////////////////
+
+int SkUTF::CountUTF8(const char* utf8, size_t byteLength) {
+ if (!utf8) {
+ return -1;
+ }
+ int count = 0;
+ const char* stop = utf8 + byteLength;
+ while (utf8 < stop) {
+ int type = utf8_byte_type(*(const uint8_t*)utf8);
+ if (!utf8_type_is_valid_leading_byte(type) || utf8 + type > stop) {
+ return -1; // Sequence extends beyond end.
+ }
+ while(type-- > 1) {
+ ++utf8;
+ if (!utf8_byte_is_continuation(*(const uint8_t*)utf8)) {
+ return -1;
+ }
+ }
+ ++utf8;
+ ++count;
+ }
+ return count;
+}
+
+int SkUTF::CountUTF16(const uint16_t* utf16, size_t byteLength) {
+ if (!utf16 || !is_align2(intptr_t(utf16)) || !is_align2(byteLength)) {
+ return -1;
+ }
+ const uint16_t* src = (const uint16_t*)utf16;
+ const uint16_t* stop = src + (byteLength >> 1);
+ int count = 0;
+ while (src < stop) {
+ unsigned c = *src++;
+ if (utf16_is_low_surrogate(c)) {
+ return -1;
+ }
+ if (utf16_is_high_surrogate(c)) {
+ if (src >= stop) {
+ return -1;
+ }
+ c = *src++;
+ if (!utf16_is_low_surrogate(c)) {
+ return -1;
+ }
+ }
+ count += 1;
+ }
+ return count;
+}
+
+int SkUTF::CountUTF32(const int32_t* utf32, size_t byteLength) {
+ if (!is_align4(intptr_t(utf32)) || !is_align4(byteLength) || byteLength >> 2 > INT_MAX) {
+ return -1;
+ }
+ const uint32_t kInvalidUnicharMask = 0xFF000000; // unichar fits in 24 bits
+ const uint32_t* ptr = (const uint32_t*)utf32;
+ const uint32_t* stop = ptr + (byteLength >> 2);
+ while (ptr < stop) {
+ if (*ptr & kInvalidUnicharMask) {
+ return -1;
+ }
+ ptr += 1;
+ }
+ return (int)(byteLength >> 2);
+}
+
+template <typename T>
+static SkUnichar next_fail(const T** ptr, const T* end) {
+ *ptr = end;
+ return -1;
+}
+
+SkUnichar SkUTF::NextUTF8(const char** ptr, const char* end) {
+ if (!ptr || !end ) {
+ return -1;
+ }
+ const uint8_t* p = (const uint8_t*)*ptr;
+ if (!p || p >= (const uint8_t*)end) {
+ return next_fail(ptr, end);
+ }
+ int c = *p;
+ int hic = c << 24;
+
+ if (!utf8_type_is_valid_leading_byte(utf8_byte_type(c))) {
+ return next_fail(ptr, end);
+ }
+ if (hic < 0) {
+ uint32_t mask = (uint32_t)~0x3F;
+ hic = left_shift(hic, 1);
+ do {
+ ++p;
+ if (p >= (const uint8_t*)end) {
+ return next_fail(ptr, end);
+ }
+ // check before reading off end of array.
+ uint8_t nextByte = *p;
+ if (!utf8_byte_is_continuation(nextByte)) {
+ return next_fail(ptr, end);
+ }
+ c = (c << 6) | (nextByte & 0x3F);
+ mask <<= 5;
+ } while ((hic = left_shift(hic, 1)) < 0);
+ c &= ~mask;
+ }
+ *ptr = (char*)p + 1;
+ return c;
+}
+
+SkUnichar SkUTF::NextUTF16(const uint16_t** ptr, const uint16_t* end) {
+ if (!ptr || !end ) {
+ return next_fail(ptr, end);
+ }
+ const uint16_t* src = *ptr;
+ if (!src || src + 1 > end || !is_align2(intptr_t(src))) {
+ return next_fail(ptr, end);
+ }
+ uint16_t c = *src++;
+ SkUnichar result = c;
+ if (utf16_is_low_surrogate(c)) {
+ return next_fail(ptr, end); // srcPtr should never point at low surrogate.
+ }
+ if (utf16_is_high_surrogate(c)) {
+ if (src + 1 > end) {
+ return next_fail(ptr, end); // Truncated string.
+ }
+ uint16_t low = *src++;
+ if (!utf16_is_low_surrogate(low)) {
+ return next_fail(ptr, end);
+ }
+ /*
+ [paraphrased from wikipedia]
+ Take the high surrogate and subtract 0xD800, then multiply by 0x400.
+ Take the low surrogate and subtract 0xDC00. Add these two results
+ together, and finally add 0x10000 to get the final decoded codepoint.
+
+ unicode = (high - 0xD800) * 0x400 + low - 0xDC00 + 0x10000
+ unicode = (high * 0x400) - (0xD800 * 0x400) + low - 0xDC00 + 0x10000
+ unicode = (high << 10) - (0xD800 << 10) + low - 0xDC00 + 0x10000
+ unicode = (high << 10) + low - ((0xD800 << 10) + 0xDC00 - 0x10000)
+ */
+ result = (result << 10) + (SkUnichar)low - ((0xD800 << 10) + 0xDC00 - 0x10000);
+ }
+ *ptr = src;
+ return result;
+}
+
+SkUnichar SkUTF::NextUTF32(const int32_t** ptr, const int32_t* end) {
+ if (!ptr || !end ) {
+ return -1;
+ }
+ const int32_t* s = *ptr;
+ if (!s || s + 1 > end || !is_align4(intptr_t(s))) {
+ return next_fail(ptr, end);
+ }
+ int32_t value = *s;
+ const uint32_t kInvalidUnicharMask = 0xFF000000; // unichar fits in 24 bits
+ if (value & kInvalidUnicharMask) {
+ return next_fail(ptr, end);
+ }
+ *ptr = s + 1;
+ return value;
+}
+
+size_t SkUTF::ToUTF8(SkUnichar uni, char utf8[SkUTF::kMaxBytesInUTF8Sequence]) {
+ if ((uint32_t)uni > 0x10FFFF) {
+ return 0;
+ }
+ if (uni <= 127) {
+ if (utf8) {
+ *utf8 = (char)uni;
+ }
+ return 1;
+ }
+ char tmp[4];
+ char* p = tmp;
+ size_t count = 1;
+ while (uni > 0x7F >> count) {
+ *p++ = (char)(0x80 | (uni & 0x3F));
+ uni >>= 6;
+ count += 1;
+ }
+ if (utf8) {
+ p = tmp;
+ utf8 += count;
+ while (p < tmp + count - 1) {
+ *--utf8 = *p++;
+ }
+ *--utf8 = (char)(~(0xFF >> count) | uni);
+ }
+ return count;
+}
+
+size_t SkUTF::ToUTF16(SkUnichar uni, uint16_t utf16[2]) {
+ if ((uint32_t)uni > 0x10FFFF) {
+ return 0;
+ }
+ int extra = (uni > 0xFFFF);
+ if (utf16) {
+ if (extra) {
+ utf16[0] = (uint16_t)((0xD800 - 64) + (uni >> 10));
+ utf16[1] = (uint16_t)(0xDC00 | (uni & 0x3FF));
+ } else {
+ utf16[0] = (uint16_t)uni;
+ }
+ }
+ return 1 + extra;
+}
+