aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/core/SkUtils.cpp
diff options
context:
space:
mode:
authorGravatar Hal Canary <halcanary@google.com>2017-01-11 15:53:25 -0500
committerGravatar Skia Commit-Bot <skia-commit-bot@chromium.org>2017-01-11 21:27:29 +0000
commitd1c8e56423f4d1a879f3a7bcd24e2725d9b690a7 (patch)
tree7a054dc76e1818e3de7cfa553ff61ca511b13285 /src/core/SkUtils.cpp
parentf046e15347373c20e42b1a25ecd87cbdb84de146 (diff)
SkUTF8_CountUnichars(s,l) and SkUTF8_NextUnichar(s,l) now safe.
Theory: We will accept blobs of data as utf-8 text without validation, but when it comes time to process it (count code points or convert to code points), be careful to check for errors. TODO: SkTypeface::charsToGlyphs() needs to take a length. Change-Id: Id8110ab43dbffce96faffdda1e0bdaa39cad40e4 Reviewed-on: https://skia-review.googlesource.com/6849 Commit-Queue: Hal Canary <halcanary@google.com> Reviewed-by: Herb Derby <herb@google.com>
Diffstat (limited to 'src/core/SkUtils.cpp')
-rw-r--r--src/core/SkUtils.cpp88
1 files changed, 83 insertions, 5 deletions
diff --git a/src/core/SkUtils.cpp b/src/core/SkUtils.cpp
index 635d1b1736..85ebb3e29c 100644
--- a/src/core/SkUtils.cpp
+++ b/src/core/SkUtils.cpp
@@ -19,10 +19,19 @@
0xE5 << 24
*/
// Returns false only for byte values that can never occur anywhere in UTF-8
// text: 0xC0, 0xC1, and 0xF5 through 0xFF. Every other byte (ASCII,
// continuation, or leading) is reported as valid.
static bool utf8_byte_is_valid(uint8_t c) {
    if (c == 0xC0 || c == 0xC1) {
        return false;
    }
    return c <= 0xF4;
}
// Returns true when c has the bit pattern 10xxxxxx, i.e. lies in 0x80..0xBF,
// which is the form of every non-leading byte of a multi-byte sequence.
static bool utf8_byte_is_continuation(uint8_t c) {
    return c >= 0x80 && c <= 0xBF;
}
+static bool utf8_byte_is_leading_byte(uint8_t c) {
+ return utf8_byte_is_valid(c) && !utf8_byte_is_continuation(c);
+}
+
#ifdef SK_DEBUG
// Debug-build check that c is a byte which may legally start a UTF-8
// sequence (neither a continuation byte nor a value UTF-8 never uses).
static void assert_utf8_leadingbyte(unsigned c) {
    const uint8_t leadByte = SkToU8(c);
    SkASSERT(utf8_byte_is_leading_byte(leadByte));
}
int SkUTF8_LeadByteToCount(unsigned c) {
@@ -33,6 +42,29 @@
#define assert_utf8_leadingbyte(c)
#endif
/**
 *  Classifies a single byte of UTF-8 text.
 *
 *  @param c  the byte to classify.
 *  @returns -1 iff invalid UTF8 byte (0xC0, 0xC1, 0xF5..0xFF),
 *            0 iff UTF8 continuation byte (0x80..0xBF),
 *            1 iff ASCII byte (0x00..0x7F),
 *            2 iff leading byte of 2-byte sequence (0xC2..0xDF),
 *            3 iff leading byte of 3-byte sequence (0xE0..0xEF), and
 *            4 iff leading byte of 4-byte sequence (0xF0..0xF4).
 *
 *  I.e.: if return value > 0, then gives length of sequence.
 *
 *  The ranges are compared explicitly instead of using the
 *  (0xE5 << 24) >> n lookup trick: that expression shifts a signed int into
 *  its sign bit and then right-shifts a negative value, so its result is not
 *  portable.
 */
static int utf8_byte_type(uint8_t c) {
    if (c < 0x80) {
        return 1;
    }
    if (c < 0xC0) {
        return 0;
    }
    if (c < 0xC2 || c > 0xF4) {  // "octet values C0, C1, F5 to FF never appear"
        return -1;
    }
    if (c < 0xE0) {
        return 2;
    }
    if (c < 0xF0) {
        return 3;
    }
    return 4;
}
// Given a byte classification in the range -1..4, returns true only for the
// positive values, i.e. those that denote the first byte of a sequence and
// give its length. -1 (invalid) and 0 (continuation) yield false.
static bool utf8_type_is_valid_leading_byte(int type) {
    return 1 <= type;
}
+
int SkUTF8_CountUnichars(const char utf8[]) {
SkASSERT(utf8);
@@ -49,15 +81,28 @@ int SkUTF8_CountUnichars(const char utf8[]) {
return count;
}
-int SkUTF8_CountUnichars(const char utf8[], size_t byteLength) {
+// SAFE: returns -1 if invalid UTF-8
+int SkUTF8_CountUnicharsWithError(const char utf8[], size_t byteLength) {
SkASSERT(utf8 || 0 == byteLength);
int count = 0;
const char* stop = utf8 + byteLength;
while (utf8 < stop) {
- utf8 += SkUTF8_LeadByteToCount(*(const uint8_t*)utf8);
- count += 1;
+ int type = utf8_byte_type(*(const uint8_t*)utf8);
+ SkASSERT(type >= -1 && type <= 4);
+ if (!utf8_type_is_valid_leading_byte(type) ||
+ utf8 + type > stop) { // Sequence extends beyond end.
+ return -1;
+ }
+ while(type-- > 1) {
+ ++utf8;
+ if (!utf8_byte_is_continuation(*(const uint8_t*)utf8)) {
+ return -1;
+ }
+ }
+ ++utf8;
+ ++count;
}
return count;
}
@@ -83,6 +128,39 @@ SkUnichar SkUTF8_ToUnichar(const char utf8[]) {
return c;
}
+// SAFE: returns -1 on invalid UTF-8 sequence.
+SkUnichar SkUTF8_NextUnicharWithError(const char** ptr, const char* end) {
+ SkASSERT(ptr && *ptr);
+ SkASSERT(*ptr < end);
+ const uint8_t* p = (const uint8_t*)*ptr;
+ int c = *p;
+ int hic = c << 24;
+
+ if (!utf8_byte_is_leading_byte(c)) {
+ return -1;
+ }
+ if (hic < 0) {
+ uint32_t mask = (uint32_t)~0x3F;
+ hic = SkLeftShift(hic, 1);
+ do {
+ ++p;
+ if (p >= (const uint8_t*)end) {
+ return -1;
+ }
+ // check before reading off end of array.
+ uint8_t nextByte = *p;
+ if (!utf8_byte_is_continuation(nextByte)) {
+ return -1;
+ }
+ c = (c << 6) | (nextByte & 0x3F);
+ mask <<= 5;
+ } while ((hic = SkLeftShift(hic, 1)) < 0);
+ c &= ~mask;
+ }
+ *ptr = (char*)p + 1;
+ return c;
+}
+
SkUnichar SkUTF8_NextUnichar(const char** ptr) {
SkASSERT(ptr && *ptr);