2 files changed, 321 insertions, 0 deletions
diff --git a/src/utils/SkUTF.cpp b/src/utils/SkUTF.cpp
new file mode 100644
index 0000000000..0670ae02a4
--- /dev/null
+++ b/src/utils/SkUTF.cpp
@@ -0,0 +1,253 @@
+// Copyright 2018 Google LLC.
+// Use of this source code is governed by a BSD-style license that can be found in the LICENSE file.
+
+#include "SkUTF.h"
+
+#include <climits>
+
+static constexpr inline int32_t left_shift(int32_t bits, int32_t count) {
+    return static_cast<int32_t>(static_cast<uint32_t>(bits) << count);  // shift done as unsigned
+}
+
+template <typename T> static constexpr bool is_align2(T v) { return (v & 1) == 0; }  // low bit clear
+
+template <typename T> static constexpr bool is_align4(T v) { return (v & 3) == 0; }  // low 2 bits clear
+
+static constexpr inline bool utf16_is_high_surrogate(uint16_t u) { return u >= 0xD800 && u <= 0xDBFF; }
+
+static constexpr inline bool utf16_is_low_surrogate(uint16_t u) { return u >= 0xDC00 && u <= 0xDFFF; }
+
+/** Classifies one byte of UTF-8 by the role it can play in a sequence.
+    @returns   -1  iff invalid UTF8 byte (C0, C1, F5..FF),
+                0  iff UTF8 continuation byte (80..BF),
+                1  iff ASCII byte (00..7F),
+                2  iff leading byte of 2-byte sequence (C2..DF),
+                3  iff leading byte of 3-byte sequence (E0..EF), and
+                4  iff leading byte of 4-byte sequence (F0..F4).
+      I.e.: if return value > 0, then gives length of sequence. */
+static int utf8_byte_type(uint8_t octet) {
+    if (octet < 0x80) {
+        return 1;
+    }
+    if (octet < 0xC0) {
+        return 0;
+    }
+    // "octet values c0, c1, f5 to ff never appear"
+    if (octet == 0xC0 || octet == 0xC1 || octet > 0xF4) {
+        return -1;
+    }
+    return octet < 0xE0 ? 2 : (octet < 0xF0 ? 3 : 4);
+}
+static bool utf8_type_is_valid_leading_byte(int type) { return type >= 1; }  // positive: starts a sequence
+
+static bool utf8_byte_is_continuation(uint8_t c) { return (c & 0xC0) == 0x80; }  // 10xxxxxx
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Returns the UTF-8 buffer's codepoint count, or -1 if null/malformed.  Forms no pointer past `stop`.
+int SkUTF::CountUTF8(const char* utf8, size_t byteLength) {
+    if (!utf8) {
+        return -1;
+    }
+    int count = 0;
+    const char* const stop = utf8 + byteLength;
+    while (utf8 < stop) {
+        const int type = utf8_byte_type(*(const uint8_t*)utf8);
+        if (!utf8_type_is_valid_leading_byte(type) || type > stop - utf8) {
+            return -1;  // Invalid leading byte, or sequence extends beyond end.
+        }
+        for (int i = 1; i < type; ++i) {
+            if (!utf8_byte_is_continuation((uint8_t)utf8[i])) {
+                return -1;  // Leading byte promised more continuation bytes than follow.
+            }
+        }
+        utf8 += type;
+        ++count;
+    }
+    return count;
+}
+
+// Returns the number of codepoints in `byteLength` bytes of UTF-16, or -1 if the buffer is null,
+// misaligned, of odd byte length, or contains an unpaired surrogate.
+int SkUTF::CountUTF16(const uint16_t* utf16, size_t byteLength) {
+    if (!utf16 || !is_align2(intptr_t(utf16)) || !is_align2(byteLength)) {
+        return -1;
+    }
+    const uint16_t* const end = utf16 + (byteLength >> 1);
+    int total = 0;
+    for (const uint16_t* cur = utf16; cur < end; ++total) {
+        const uint16_t unit = *cur++;
+        if (utf16_is_low_surrogate(unit)) {
+            return -1;  // Low surrogate with no high surrogate before it.
+        }
+        if (!utf16_is_high_surrogate(unit)) {
+            continue;  // This code unit is a whole codepoint.
+        }
+        if (cur >= end) {
+            return -1;  // Buffer ends in the middle of a surrogate pair.
+        }
+        if (!utf16_is_low_surrogate(*cur++)) {
+            return -1;  // High surrogate must be followed by a low surrogate.
+        }
+    }
+    return total;
+}
+
+// Returns the codepoint count of `byteLength` bytes of UTF-32.  Returns -1 if `utf32` is null (and
+// byteLength != 0) or misaligned, byteLength is not 4*k with k <= INT_MAX, or a value exceeds 24 bits.
+int SkUTF::CountUTF32(const int32_t* utf32, size_t byteLength) {
+    if ((!utf32 && byteLength != 0) || !is_align4(intptr_t(utf32)) || !is_align4(byteLength) ||
+        byteLength >> 2 > INT_MAX) {
+        return -1;
+    }
+    const uint32_t kInvalidUnicharMask = 0xFF000000;    // unichar fits in 24 bits
+    for (size_t i = 0; i < (byteLength >> 2); ++i) {
+        if ((uint32_t)utf32[i] & kInvalidUnicharMask) {
+            return -1;
+        }
+    }
+    return (int)(byteLength >> 2);
+}
+
+// Shared failure path of the Next*() decoders: parks *cursor at `limit` and yields the -1 sentinel.
+template <typename T> static SkUnichar next_fail(const T** cursor, const T* limit) {
+    *cursor = limit;
+    return -1;
+}
+
+// Decodes the codepoint at *ptr and advances *ptr past it.  Returns -1 if `ptr` or `end` is null;
+// on a null *ptr, an empty range, or a malformed sequence, also sets *ptr to `end`.
+SkUnichar SkUTF::NextUTF8(const char** ptr, const char* end) {
+    if (!ptr || !end ) {
+        return -1;
+    }
+    const uint8_t*  p = (const uint8_t*)*ptr;
+    if (!p || p >= (const uint8_t*)end) {
+        return next_fail(ptr, end);
+    }
+    int             c = *p;
+    // Leading byte moved to the top 8 bits; left_shift() avoids shifting a 1 into an int's sign bit.
+    int             hic = left_shift(c, 24);
+    if (!utf8_type_is_valid_leading_byte(utf8_byte_type(c))) {
+        return next_fail(ptr, end);
+    }
+    if (hic < 0) {
+        uint32_t mask = (uint32_t)~0x3F;
+        hic = left_shift(hic, 1);
+        do {
+            if (++p >= (const uint8_t*)end) {  // check before reading off end of array.
+                return next_fail(ptr, end);
+            }
+            uint8_t nextByte = *p;
+            if (!utf8_byte_is_continuation(nextByte)) {
+                return next_fail(ptr, end);
+            }
+            c = (c << 6) | (nextByte & 0x3F);
+            mask <<= 5;  // net +5 payload bits: 6 new data bits, one more marker bit in the lead
+        } while ((hic = left_shift(hic, 1)) < 0);  // one pass per remaining leading 1 bit
+        c &= ~mask;  // strip the length-marker bits contributed by the leading byte
+    }
+    *ptr = (char*)p + 1;
+    return c;
+}
+
+// Decodes the codepoint at *ptr and advances *ptr past it.  Returns -1 if `ptr` or `end` is null;
+// on a null or misaligned *ptr, an empty range, or an unpaired surrogate, also sets *ptr to `end`.
+SkUnichar SkUTF::NextUTF16(const uint16_t** ptr, const uint16_t* end) {
+    if (!ptr || !end ) {
+        return -1;  // Not next_fail(): it writes through `ptr`, which may be the null argument.
+    }
+    const uint16_t* src = *ptr;
+    if (!src || src + 1 > end || !is_align2(intptr_t(src))) {
+        return next_fail(ptr, end);
+    }
+    uint16_t c = *src++;
+    SkUnichar result = c;
+    if (utf16_is_low_surrogate(c)) {
+        return next_fail(ptr, end);  // *ptr should never point at a low surrogate.
+    }
+    if (utf16_is_high_surrogate(c)) {
+        if (src + 1 > end) {
+            return next_fail(ptr, end);  // Truncated string.
+        }
+        uint16_t low = *src++;
+        if (!utf16_is_low_surrogate(low)) {
+            return next_fail(ptr, end);
+        }
+        /*
+        [paraphrased from wikipedia]
+        Take the high surrogate and subtract 0xD800, then multiply by 0x400.  Take the low
+        surrogate and subtract 0xDC00.  Add both results, then add 0x10000 to get the codepoint.
+        unicode = (high - 0xD800) * 0x400 + low - 0xDC00 + 0x10000
+        unicode = (high * 0x400) - (0xD800 * 0x400) + low - 0xDC00 + 0x10000
+        unicode = (high << 10) - (0xD800 << 10) + low - 0xDC00 + 0x10000
+        unicode = (high << 10) + low - ((0xD800 << 10) + 0xDC00 - 0x10000)
+        */
+        result = (result << 10) + (SkUnichar)low - ((0xD800 << 10) + 0xDC00 - 0x10000);
+    }
+    *ptr = src;
+    return result;
+}
+
+// Reads the codepoint at *ptr and advances *ptr by one.  Returns -1 if `ptr` or `end` is null; on a
+// null or misaligned *ptr, an empty range, or a value wider than 24 bits, also sets *ptr to `end`.
+SkUnichar SkUTF::NextUTF32(const int32_t** ptr, const int32_t* end) {
+    if (ptr == nullptr || end == nullptr) {
+        return -1;
+    }
+    const int32_t* const cur = *ptr;
+    const bool readable = cur && cur + 1 <= end && is_align4(intptr_t(cur));
+    const uint32_t kInvalidUnicharMask = 0xFF000000;    // unichar fits in 24 bits
+    if (!readable || (*cur & kInvalidUnicharMask) != 0) {
+        return next_fail(ptr, end);
+    }
+    const int32_t value = *cur;
+    *ptr = cur + 1;
+    return value;
+}
+
+// Encodes `uni` as UTF-8.
+// Returns the length of the encoding in bytes (1..4), or 0 if `uni` lies outside [0, 0x10FFFF].
+// The bytes are written only when `utf8` is non-null; exactly the returned number of bytes is
+// stored, starting at utf8[0].
+size_t SkUTF::ToUTF8(SkUnichar uni, char utf8[SkUTF::kMaxBytesInUTF8Sequence]) {
+    if ((uint32_t)uni > 0x10FFFF) {
+        return 0;  // Negative values land here too: the cast makes them huge.
+    }
+    if (uni < 0x80) {
+        if (utf8) {
+            utf8[0] = (char)uni;
+        }
+        return 1;
+    }
+    // 11 payload bits fit in two bytes, 16 in three, 21 in four.
+    const size_t length = uni < 0x800 ? 2 : (uni < 0x10000 ? 3 : 4);
+    if (!utf8) {
+        return length;
+    }
+    // Fill the continuation bytes back to front, six payload bits each.
+    for (size_t i = length - 1; i > 0; --i) {
+        utf8[i] = (char)(0x80 | (uni & 0x3F));
+        uni >>= 6;
+    }
+    // Leading byte: `length` one-bits, a zero bit, then the remaining payload.
+    utf8[0] = (char)(~(0xFF >> length) | uni);
+    return length;
+}
+
+// Encodes `uni` as UTF-16.  Returns the number of code units (1 or 2), or 0 if `uni` lies outside
+// [0, 0x10FFFF].  The units are written only when `utf16` is non-null.
+size_t SkUTF::ToUTF16(SkUnichar uni, uint16_t utf16[2]) {
+    if ((uint32_t)uni > 0x10FFFF) {
+        return 0;
+    }
+    const bool needsPair = uni > 0xFFFF;
+    if (utf16 && needsPair) {
+        utf16[0] = (uint16_t)((0xD800 - 64) + (uni >> 10));  // -64 == -(0x10000 >> 10)
+        utf16[1] = (uint16_t)(0xDC00 | (uni & 0x3FF));
+    } else if (utf16) {
+        utf16[0] = (uint16_t)uni;
+    }
+    return needsPair ? 2 : 1;
+}
+
diff --git a/src/utils/SkUTF.h b/src/utils/SkUTF.h
new file mode 100644
index 0000000000..385102aadb
--- /dev/null
+++ b/src/utils/SkUTF.h
@@ -0,0 +1,68 @@
+// Copyright 2018 Google LLC.
+// Use of this source code is governed by a BSD-style license that can be found in the LICENSE file.
+#ifndef SkUTF_DEFINED
+#define SkUTF_DEFINED
+
+#include <cstddef>
+#include <cstdint>
+
+using SkUnichar = int32_t;  // One unicode codepoint; the decoders return -1 on failure.
+
+namespace SkUTF {
+
+/** Given `byteLength` bytes of UTF-8, return the number of unicode codepoints.
+    If the sequence is invalid UTF-8 or `utf8` is null, return -1.
+*/
+int CountUTF8(const char* utf8, size_t byteLength);
+
+/** Given `byteLength` bytes of aligned UTF-16 characters in machine-endian
+    form, return the number of unicode codepoints.  If the sequence is invalid
+    UTF-16, `utf16` is null or misaligned, or `byteLength` is odd, return -1.
+*/
+int CountUTF16(const uint16_t* utf16, size_t byteLength);
+
+/** Given `byteLength` bytes of aligned UTF-32 characters in machine-endian
+    form, return the number of unicode codepoints.  If the sequence is invalid
+    UTF-32 or misaligned, or `byteLength` is not a multiple of 4, return -1.
+*/
+int CountUTF32(const int32_t* utf32, size_t byteLength);
+
+/** Given UTF-8 bytes in [*ptr, end), return the first unicode codepoint and
+    advance *ptr to the next codepoint's start.  If the range is empty or invalid
+    UTF-8 is found, set *ptr to end and return -1.  If `ptr` or `end` is null, just return -1.
+*/
+SkUnichar NextUTF8(const char** ptr, const char* end);
+
+/** Given aligned UTF-16 characters in machine-endian form in [*ptr, end),
+    return the first unicode codepoint.  The pointer will be incremented to
+    point at the next codepoint's start.  If the range is empty or invalid
+    UTF-16 is encountered, set *ptr to end and return -1.
+*/
+SkUnichar NextUTF16(const uint16_t** ptr, const uint16_t* end);
+
+/** Given aligned UTF-32 characters in machine-endian form in [*ptr, end),
+    return the first unicode codepoint and advance *ptr to the next one.  If
+    the range is empty or invalid UTF-32 is encountered, set *ptr to end and
+    return -1.  If `ptr` or `end` is null, just return -1.
+*/
+SkUnichar NextUTF32(const int32_t** ptr, const int32_t* end);
+
+constexpr unsigned kMaxBytesInUTF8Sequence = 4;  // Size of the buffer ToUTF8() may fill.
+
+/** Convert the unicode codepoint into UTF-8.  If `utf8` is non-null, place the
+    result in that array.  Return the number of bytes in the result.  If `utf8`
+    is null, simply return the number of bytes that would be used.  For
+    codepoints outside [0, 0x10FFFF], return 0.
+*/
+size_t ToUTF8(SkUnichar uni, char utf8[kMaxBytesInUTF8Sequence] = nullptr);
+
+/** Convert the unicode codepoint into UTF-16.  If `utf16` is non-null, place
+    the result in that array.  Return the number of UTF-16 code units in the
+    result (1 or 2).  If `utf16` is null, simply return the number of code
+    units that would be used.  For codepoints outside [0, 0x10FFFF], return 0.
+*/
+size_t ToUTF16(SkUnichar uni, uint16_t utf16[2] = nullptr);
+
+}  // namespace SkUTF
+
+#endif  // SkUTF_DEFINED