diff options
author | Hal Canary <halcanary@google.com> | 2017-01-11 15:53:25 -0500 |
---|---|---|
committer | Skia Commit-Bot <skia-commit-bot@chromium.org> | 2017-01-11 21:27:29 +0000 |
commit | d1c8e56423f4d1a879f3a7bcd24e2725d9b690a7 (patch) | |
tree | 7a054dc76e1818e3de7cfa553ff61ca511b13285 | |
parent | f046e15347373c20e42b1a25ecd87cbdb84de146 (diff) |
SkUTF8_CountUnichars(s,l) and SkUTF8_NextUnichar(s,l) now safe.
Theory: We will accept blobs of data as UTF-8 text without validation,
but when it comes time to process it (count code points or convert to
code points), be careful to check for errors.
TODO: SkTypeface::charsToGlyphs() needs to take a length.
Change-Id: Id8110ab43dbffce96faffdda1e0bdaa39cad40e4
Reviewed-on: https://skia-review.googlesource.com/6849
Commit-Queue: Hal Canary <halcanary@google.com>
Reviewed-by: Herb Derby <herb@google.com>
-rw-r--r-- | src/core/SkPaint.cpp | 6 | ||||
-rw-r--r-- | src/core/SkUtils.cpp | 88 | ||||
-rw-r--r-- | src/core/SkUtils.h | 28 | ||||
-rw-r--r-- | src/pdf/SkPDFDevice.cpp | 11 | ||||
-rw-r--r-- | tests/UtilsTest.cpp | 61 |
5 files changed, 183 insertions, 11 deletions
diff --git a/src/core/SkPaint.cpp b/src/core/SkPaint.cpp index a5df8dd145..fb89a67127 100644 --- a/src/core/SkPaint.cpp +++ b/src/core/SkPaint.cpp @@ -440,7 +440,11 @@ int SkPaint::textToGlyphs(const void* textData, size_t byteLength, uint16_t glyp switch (this->getTextEncoding()) { case SkPaint::kUTF8_TextEncoding: while (text < stop) { - *gptr++ = cache->unicharToGlyph(SkUTF8_NextUnichar(&text)); + SkUnichar u = SkUTF8_NextUnicharWithError(&text, stop); + if (u < 0) { + return 0; // bad UTF-8 sequence + } + *gptr++ = cache->unicharToGlyph(u); } break; case SkPaint::kUTF16_TextEncoding: { diff --git a/src/core/SkUtils.cpp b/src/core/SkUtils.cpp index 635d1b1736..85ebb3e29c 100644 --- a/src/core/SkUtils.cpp +++ b/src/core/SkUtils.cpp @@ -19,10 +19,19 @@ 0xE5 << 24 */ +static bool utf8_byte_is_valid(uint8_t c) { + return c < 0xF5 && (c & 0xFE) != 0xC0; +} +static bool utf8_byte_is_continuation(uint8_t c) { + return (c & 0xC0) == 0x80; +} +static bool utf8_byte_is_leading_byte(uint8_t c) { + return utf8_byte_is_valid(c) && !utf8_byte_is_continuation(c); +} + #ifdef SK_DEBUG static void assert_utf8_leadingbyte(unsigned c) { - SkASSERT(c <= 0xF7); // otherwise leading byte is too big (more than 4 bytes) - SkASSERT((c & 0xC0) != 0x80); // can't begin with a middle char + SkASSERT(utf8_byte_is_leading_byte(SkToU8(c))); } int SkUTF8_LeadByteToCount(unsigned c) { @@ -33,6 +42,29 @@ #define assert_utf8_leadingbyte(c) #endif +/** + * @returns -1 iff invalid UTF8 byte, + * 0 iff UTF8 continuation byte, + * 1 iff ASCII byte, + * 2 iff leading byte of 2-byte sequence, + * 3 iff leading byte of 3-byte sequence, and + * 4 iff leading byte of 4-byte sequence. + * + * I.e.: if return value > 0, then gives length of sequence. 
+*/ +static int utf8_byte_type(uint8_t c) { + if (c < 0x80) { + return 1; + } else if (c < 0xC0) { + return 0; + } else if (c < 0xF5 && (c & 0xFE) != 0xC0) { // "octet values C0, C1, F5 to FF never appear" + return (((0xE5 << 24) >> ((unsigned)c >> 4 << 1)) & 3) + 1; + } else { + return -1; + } +} +static bool utf8_type_is_valid_leading_byte(int type) { return type > 0; } + int SkUTF8_CountUnichars(const char utf8[]) { SkASSERT(utf8); @@ -49,15 +81,28 @@ int SkUTF8_CountUnichars(const char utf8[]) { return count; } -int SkUTF8_CountUnichars(const char utf8[], size_t byteLength) { +// SAFE: returns -1 if invalid UTF-8 +int SkUTF8_CountUnicharsWithError(const char utf8[], size_t byteLength) { SkASSERT(utf8 || 0 == byteLength); int count = 0; const char* stop = utf8 + byteLength; while (utf8 < stop) { - utf8 += SkUTF8_LeadByteToCount(*(const uint8_t*)utf8); - count += 1; + int type = utf8_byte_type(*(const uint8_t*)utf8); + SkASSERT(type >= -1 && type <= 4); + if (!utf8_type_is_valid_leading_byte(type) || + utf8 + type > stop) { // Sequence extends beyond end. + return -1; + } + while(type-- > 1) { + ++utf8; + if (!utf8_byte_is_continuation(*(const uint8_t*)utf8)) { + return -1; + } + } + ++utf8; + ++count; } return count; } @@ -83,6 +128,39 @@ SkUnichar SkUTF8_ToUnichar(const char utf8[]) { return c; } +// SAFE: returns -1 on invalid UTF-8 sequence. +SkUnichar SkUTF8_NextUnicharWithError(const char** ptr, const char* end) { + SkASSERT(ptr && *ptr); + SkASSERT(*ptr < end); + const uint8_t* p = (const uint8_t*)*ptr; + int c = *p; + int hic = c << 24; + + if (!utf8_byte_is_leading_byte(c)) { + return -1; + } + if (hic < 0) { + uint32_t mask = (uint32_t)~0x3F; + hic = SkLeftShift(hic, 1); + do { + ++p; + if (p >= (const uint8_t*)end) { + return -1; + } + // check before reading off end of array. 
+ uint8_t nextByte = *p; + if (!utf8_byte_is_continuation(nextByte)) { + return -1; + } + c = (c << 6) | (nextByte & 0x3F); + mask <<= 5; + } while ((hic = SkLeftShift(hic, 1)) < 0); + c &= ~mask; + } + *ptr = (char*)p + 1; + return c; +} + SkUnichar SkUTF8_NextUnichar(const char** ptr) { SkASSERT(ptr && *ptr); diff --git a/src/core/SkUtils.h b/src/core/SkUtils.h index 26f19e6906..e24dd52f45 100644 --- a/src/core/SkUtils.h +++ b/src/core/SkUtils.h @@ -9,6 +9,7 @@ #define SkUtils_DEFINED #include "SkTypes.h" +#include "SkMath.h" /** Similar to memset(), but it assigns a 16bit value into the buffer. @param buffer The memory to have value copied into it @@ -58,7 +59,31 @@ inline int SkUTF8_CountUTF8Bytes(const char utf8[]) { } int SkUTF8_CountUnichars(const char utf8[]); -int SkUTF8_CountUnichars(const char utf8[], size_t byteLength); + +/** This function is safe: invalid UTF8 sequences will return -1; */ +int SkUTF8_CountUnicharsWithError(const char utf8[], size_t byteLength); + +/** This function is safe: invalid UTF8 sequences will return 0; */ +inline int SkUTF8_CountUnichars(const char utf8[], size_t byteLength) { + return SkClampPos(SkUTF8_CountUnicharsWithError(utf8, byteLength)); +} + +/** This function is safe: invalid UTF8 sequences will return -1 + * When -1 is returned, ptr is unchanged. + * Precondition: *ptr < end; + */ +SkUnichar SkUTF8_NextUnicharWithError(const char** ptr, const char* end); + +/** this version replaces invalid utf-8 sequences with code point U+FFFD. 
*/ +inline SkUnichar SkUTF8_NextUnichar(const char** ptr, const char* end) { + SkUnichar val = SkUTF8_NextUnicharWithError(ptr, end); + if (val < 0) { + *ptr = end; + return 0xFFFD; // REPLACEMENT CHARACTER + } + return val; +} + SkUnichar SkUTF8_ToUnichar(const char utf8[]); SkUnichar SkUTF8_NextUnichar(const char**); SkUnichar SkUTF8_PrevUnichar(const char**); @@ -99,5 +124,4 @@ inline bool SkUnichar_IsVariationSelector(SkUnichar uni) { } return true; } - #endif diff --git a/src/pdf/SkPDFDevice.cpp b/src/pdf/SkPDFDevice.cpp index 9ef3611142..6da4d3b3bd 100644 --- a/src/pdf/SkPDFDevice.cpp +++ b/src/pdf/SkPDFDevice.cpp @@ -1340,9 +1340,11 @@ void SkPDFDevice::internalDrawText( if (c.fUtf8Text) { // real cluster // Check if `/ActualText` needed. const char* textPtr = c.fUtf8Text; - // TODO(halcanary): validate utf8 input. - SkUnichar unichar = SkUTF8_NextUnichar(&textPtr); const char* textEnd = c.fUtf8Text + c.fTextByteLength; + SkUnichar unichar = SkUTF8_NextUnicharWithError(&textPtr, textEnd); + if (unichar < 0) { + return; + } if (textPtr < textEnd || // more characters left glyphLimit > index + 1 || // toUnicode wouldn't work unichar != map_glyph(glyphToUnicode, glyphs[index])) // test single Unichar map @@ -1353,7 +1355,10 @@ void SkPDFDevice::internalDrawText( // the BOM marks this text as UTF-16BE, not PDFDocEncoding. 
SkPDFUtils::WriteUTF16beHex(out, unichar); // first char while (textPtr < textEnd) { - unichar = SkUTF8_NextUnichar(&textPtr); + unichar = SkUTF8_NextUnicharWithError(&textPtr, textEnd); + if (unichar < 0) { + break; + } SkPDFUtils::WriteUTF16beHex(out, unichar); } out->writeText("> >> BDC\n"); // begin marked-content sequence diff --git a/tests/UtilsTest.cpp b/tests/UtilsTest.cpp index b6f90e03b5..ed861c69a0 100644 --- a/tests/UtilsTest.cpp +++ b/tests/UtilsTest.cpp @@ -219,3 +219,64 @@ DEF_TEST(Utils, reporter) { test_autounref(reporter); test_autostarray(reporter); } + +#define ASCII_BYTE "X" +#define CONTINUATION_BYTE "\x80" +#define LEADING_TWO_BYTE "\xC4" +#define LEADING_THREE_BYTE "\xE0" +#define LEADING_FOUR_BYTE "\xF0" +#define INVALID_BYTE "\xFC" +static bool valid_utf8(const char* p, size_t l) { + return SkUTF8_CountUnicharsWithError(p, l) >= 0; +} +DEF_TEST(Utils_UTF8_ValidLength, r) { + const char* goodTestcases[] = { + "", + ASCII_BYTE, + ASCII_BYTE ASCII_BYTE, + LEADING_TWO_BYTE CONTINUATION_BYTE, + ASCII_BYTE LEADING_TWO_BYTE CONTINUATION_BYTE, + ASCII_BYTE ASCII_BYTE LEADING_TWO_BYTE CONTINUATION_BYTE, + LEADING_THREE_BYTE CONTINUATION_BYTE CONTINUATION_BYTE, + ASCII_BYTE LEADING_THREE_BYTE CONTINUATION_BYTE CONTINUATION_BYTE, + ASCII_BYTE ASCII_BYTE LEADING_THREE_BYTE CONTINUATION_BYTE CONTINUATION_BYTE, + LEADING_FOUR_BYTE CONTINUATION_BYTE CONTINUATION_BYTE CONTINUATION_BYTE, + ASCII_BYTE LEADING_FOUR_BYTE CONTINUATION_BYTE CONTINUATION_BYTE CONTINUATION_BYTE, + ASCII_BYTE ASCII_BYTE LEADING_FOUR_BYTE CONTINUATION_BYTE CONTINUATION_BYTE + CONTINUATION_BYTE, + }; + for (const char* testcase : goodTestcases) { + REPORTER_ASSERT(r, valid_utf8(testcase, strlen(testcase))); + } + const char* badTestcases[] = { + INVALID_BYTE, + INVALID_BYTE CONTINUATION_BYTE, + INVALID_BYTE CONTINUATION_BYTE CONTINUATION_BYTE, + INVALID_BYTE CONTINUATION_BYTE CONTINUATION_BYTE CONTINUATION_BYTE, + LEADING_TWO_BYTE, + CONTINUATION_BYTE, + CONTINUATION_BYTE 
CONTINUATION_BYTE, + LEADING_THREE_BYTE CONTINUATION_BYTE, + CONTINUATION_BYTE CONTINUATION_BYTE CONTINUATION_BYTE, + LEADING_FOUR_BYTE CONTINUATION_BYTE, + CONTINUATION_BYTE CONTINUATION_BYTE CONTINUATION_BYTE CONTINUATION_BYTE, + + ASCII_BYTE INVALID_BYTE, + ASCII_BYTE INVALID_BYTE CONTINUATION_BYTE, + ASCII_BYTE INVALID_BYTE CONTINUATION_BYTE CONTINUATION_BYTE, + ASCII_BYTE INVALID_BYTE CONTINUATION_BYTE CONTINUATION_BYTE CONTINUATION_BYTE, + ASCII_BYTE LEADING_TWO_BYTE, + ASCII_BYTE CONTINUATION_BYTE, + ASCII_BYTE CONTINUATION_BYTE CONTINUATION_BYTE, + ASCII_BYTE LEADING_THREE_BYTE CONTINUATION_BYTE, + ASCII_BYTE CONTINUATION_BYTE CONTINUATION_BYTE CONTINUATION_BYTE, + ASCII_BYTE LEADING_FOUR_BYTE CONTINUATION_BYTE, + ASCII_BYTE CONTINUATION_BYTE CONTINUATION_BYTE CONTINUATION_BYTE CONTINUATION_BYTE, + + // LEADING_FOUR_BYTE LEADING_TWO_BYTE CONTINUATION_BYTE, + }; + for (const char* testcase : badTestcases) { + REPORTER_ASSERT(r, !valid_utf8(testcase, strlen(testcase))); + } + +} |