aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar kenton@google.com <kenton@google.com@630680e5-0e50-0410-840e-4b1c322b438d>2009-12-11 03:49:28 +0000
committerGravatar kenton@google.com <kenton@google.com@630680e5-0e50-0410-840e-4b1c322b438d>2009-12-11 03:49:28 +0000
commit b221008884cec52232b2c292f743cad0ed4d3dee (patch)
tree 43bbfce99e41c5cecf83f9497f9939a41198791e
parent de747794544996f460c6cf07e0e5c3b7757efc92 (diff)
Fix UTF-8 validity checks to not do unaligned reads.
-rw-r--r-- src/google/protobuf/stubs/structurally_valid.cc 81
-rw-r--r-- src/google/protobuf/stubs/structurally_valid_unittest.cc 14
2 files changed, 60 insertions, 35 deletions
diff --git a/src/google/protobuf/stubs/structurally_valid.cc b/src/google/protobuf/stubs/structurally_valid.cc
index e385a81e..0f6afe6d 100644
--- a/src/google/protobuf/stubs/structurally_valid.cc
+++ b/src/google/protobuf/stubs/structurally_valid.cc
@@ -371,36 +371,44 @@ int UTF8GenericScan(const UTF8ScanObj* st,
// Do state-table scan
int e = 0;
uint8 c;
-
- // Do fast for groups of 8 identity bytes.
- // This covers a lot of 7-bit ASCII ~8x faster then the 1-byte loop,
- // including slowing slightly on cr/lf/ht
- //----------------------------
const uint8* Tbl2 = &st->fast_state[0];
- uint32 losub = st->losub;
- uint32 hiadd = st->hiadd;
- while (src < srclimit8) {
- uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0];
- uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1];
- src += 8;
- // This is a fast range check for all bytes in [lowsub..0x80-hiadd)
- uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
- (s4567 - losub) | (s4567 + hiadd);
- if ((temp & 0x80808080) != 0) {
- // We typically end up here on cr/lf/ht; src was incremented
- int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
- (Tbl2[src[-6]] | Tbl2[src[-5]]);
- if (e0123 != 0) {
- src -= 8;
- break;
- } // Exit on Non-interchange
- e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
- (Tbl2[src[-2]] | Tbl2[src[-1]]);
- if (e0123 != 0) {
- src -= 4;
- break;
- } // Exit on Non-interchange
- // Else OK, go around again
+ const uint32 losub = st->losub;
+ const uint32 hiadd = st->hiadd;
+ // Check initial few bytes one at a time until 8-byte aligned
+ //----------------------------
+ while ((((uintptr_t)src & 0x07) != 0) &&
+ (src < srclimit) &&
+ Tbl2[src[0]] == 0) {
+ src++;
+ }
+ if (((uintptr_t)src & 0x07) == 0) {
+ // Do fast for groups of 8 identity bytes.
+ // This covers a lot of 7-bit ASCII ~8x faster than the 1-byte loop,
+ // including slowing slightly on cr/lf/ht
+ //----------------------------
+ while (src < srclimit8) {
+ uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0];
+ uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1];
+ src += 8;
+ // This is a fast range check for all bytes in [losub..0x80-hiadd)
+ uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
+ (s4567 - losub) | (s4567 + hiadd);
+ if ((temp & 0x80808080) != 0) {
+ // We typically end up here on cr/lf/ht; src was incremented
+ int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
+ (Tbl2[src[-6]] | Tbl2[src[-5]]);
+ if (e0123 != 0) {
+ src -= 8;
+ break;
+ } // Exit on Non-interchange
+ e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
+ (Tbl2[src[-2]] | Tbl2[src[-1]]);
+ if (e0123 != 0) {
+ src -= 4;
+ break;
+ } // Exit on Non-interchange
+ // Else OK, go around again
+ }
}
}
//----------------------------
@@ -470,10 +478,17 @@ int UTF8GenericScanFastAscii(const UTF8ScanObj* st,
int rest_consumed;
int exit_reason;
do {
- while ((src < srclimit8) &&
- (((reinterpret_cast<const uint32*>(src)[0] |
- reinterpret_cast<const uint32*>(src)[1]) & 0x80808080) == 0)) {
- src += 8;
+ // Check initial few bytes one at a time until 8-byte aligned
+ while ((((uintptr_t)src & 0x07) != 0) &&
+ (src < srclimit) && (src[0] < 0x80)) {
+ src++;
+ }
+ if (((uintptr_t)src & 0x07) == 0) {
+ while ((src < srclimit8) &&
+ (((reinterpret_cast<const uint32*>(src)[0] |
+ reinterpret_cast<const uint32*>(src)[1]) & 0x80808080) == 0)) {
+ src += 8;
+ }
}
while ((src < srclimit) && (src[0] < 0x80)) {
src++;
diff --git a/src/google/protobuf/stubs/structurally_valid_unittest.cc b/src/google/protobuf/stubs/structurally_valid_unittest.cc
index 22825516..90888885 100644
--- a/src/google/protobuf/stubs/structurally_valid_unittest.cc
+++ b/src/google/protobuf/stubs/structurally_valid_unittest.cc
@@ -13,15 +13,25 @@ TEST(StructurallyValidTest, ValidUTF8String) {
// On GCC, this string can be written as:
// "abcd 1234 - \u2014\u2013\u2212"
// MSVC seems to interpret \u differently.
- string valid_str("abcd 1234 - \342\200\224\342\200\223\342\210\222");
+ string valid_str("abcd 1234 - \342\200\224\342\200\223\342\210\222 - xyz789");
EXPECT_TRUE(IsStructurallyValidUTF8(valid_str.data(),
valid_str.size()));
+ // Additional check for pointer alignment
+ for (int i = 1; i < 8; ++i) {
+ EXPECT_TRUE(IsStructurallyValidUTF8(valid_str.data() + i,
+ valid_str.size() - i));
+ }
}
TEST(StructurallyValidTest, InvalidUTF8String) {
- string invalid_str("\xA0\xB0");
+ const string invalid_str("abcd\xA0\xB0\xA0\xB0\xA0\xB0 - xyz789");
EXPECT_FALSE(IsStructurallyValidUTF8(invalid_str.data(),
invalid_str.size()));
+ // Additional check for pointer alignment
+ for (int i = 1; i < 8; ++i) {
+ EXPECT_FALSE(IsStructurallyValidUTF8(invalid_str.data() + i,
+ invalid_str.size() - i));
+ }
}
} // namespace