diff options
author | Chris Mihelich <cmihelic@google.com> | 2024-06-26 15:13:52 -0700 |
---|---|---|
committer | Copybara-Service <copybara-worker@google.com> | 2024-06-26 15:14:45 -0700 |
commit | 16452e1418c1c2a8bcf4a99238e190ba901a20a6 (patch) | |
tree | 551f86735011199f26edd2a635683c33bcd80cd6 /absl/debugging/internal | |
parent | 63d4b2fe1e0d4bd6a7b916f398643db40c35624b (diff) |
Decoder for Rust-style Punycode encodings of bounded length.
PiperOrigin-RevId: 647093624
Change-Id: Ic76bfa4aa8fb616cb23095ce7bfa30c3812dcb21
Diffstat (limited to 'absl/debugging/internal')
-rw-r--r-- | absl/debugging/internal/decode_rust_punycode.cc | 258 | ||||
-rw-r--r-- | absl/debugging/internal/decode_rust_punycode.h | 55 | ||||
-rw-r--r-- | absl/debugging/internal/decode_rust_punycode_test.cc | 606 |
3 files changed, 919 insertions, 0 deletions
diff --git a/absl/debugging/internal/decode_rust_punycode.cc b/absl/debugging/internal/decode_rust_punycode.cc new file mode 100644 index 00000000..43b46bf9 --- /dev/null +++ b/absl/debugging/internal/decode_rust_punycode.cc @@ -0,0 +1,258 @@ +// Copyright 2024 The Abseil Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "absl/debugging/internal/decode_rust_punycode.h" + +#include <cstddef> +#include <cstdint> +#include <cstring> + +#include "absl/base/config.h" +#include "absl/base/nullability.h" +#include "absl/debugging/internal/bounded_utf8_length_sequence.h" +#include "absl/debugging/internal/utf8_for_code_point.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace debugging_internal { + +namespace { + +// Decoding Punycode requires repeated random-access insertion into a stream of +// variable-length UTF-8 code-point encodings. We need this to be tolerably +// fast (no N^2 slowdown for unfortunate inputs), and we can't allocate any data +// structures on the heap (async-signal-safety). +// +// It is pragmatic to impose a moderately low limit on the identifier length and +// bail out if we ever hit it. Then BoundedUtf8LengthSequence efficiently +// determines where to insert the next code point, and memmove efficiently makes +// room for it. 
+// +// The chosen limit is a round number several times larger than identifiers +// expected in practice, yet still small enough that a memmove of this many +// UTF-8 characters is not much more expensive than the division and modulus +// operations that Punycode decoding requires. +constexpr uint32_t kMaxChars = 256; + +// Constants from RFC 3492 section 5. +constexpr uint32_t kBase = 36, kTMin = 1, kTMax = 26, kSkew = 38, kDamp = 700; + +constexpr uint32_t kMaxCodePoint = 0x10ffff; + +// Overflow threshold in DecodeRustPunycode's inner loop; see comments there. +constexpr uint32_t kMaxI = 1 << 30; + +// If punycode_begin .. punycode_end begins with a prefix matching the regular +// expression [0-9a-zA-Z_]+_, removes that prefix, copies all but the final +// underscore into out_begin .. out_end, sets num_ascii_chars to the number of +// bytes copied, and returns true. (A prefix of this sort represents the +// nonempty subsequence of ASCII characters in the corresponding plaintext.) +// +// If punycode_begin .. punycode_end does not contain an underscore, sets +// num_ascii_chars to zero and returns true. (The encoding of a plaintext +// without any ASCII characters does not carry such a prefix.) +// +// Returns false and zeroes num_ascii_chars on failure (either parse error or +// not enough space in the output buffer). +bool ConsumeOptionalAsciiPrefix(const char*& punycode_begin, + const char* const punycode_end, + char* const out_begin, + char* const out_end, + uint32_t& num_ascii_chars) { + num_ascii_chars = 0; + + // Remember the last underscore if any. Also use the same string scan to + // reject any ASCII bytes that do not belong in an identifier, including NUL, + // as well as non-ASCII bytes, which should have been delta-encoded instead. 
+ int last_underscore = -1; + for (int i = 0; i < punycode_end - punycode_begin; ++i) { + const char c = punycode_begin[i]; + if (c == '_') { + last_underscore = i; + continue; + } + // We write out the meaning of absl::ascii_isalnum rather than call that + // function because its documentation does not promise it will remain + // async-signal-safe under future development. + if ('a' <= c && c <= 'z') continue; + if ('A' <= c && c <= 'Z') continue; + if ('0' <= c && c <= '9') continue; + return false; + } + + // If there was no underscore, that means there were no ASCII characters in + // the plaintext, so there is no prefix to consume. Our work is done. + if (last_underscore < 0) return true; + + // Otherwise there will be an underscore delimiter somewhere. It can't be + // initial because then there would be no ASCII characters to its left, and no + // delimiter would have been added in that case. + if (last_underscore == 0) return false; + + // Any other position is reasonable. Make sure there's room in the buffer. + if (last_underscore + 1 > out_end - out_begin) return false; + + // Consume and write out the ASCII characters. + num_ascii_chars = static_cast<uint32_t>(last_underscore); + std::memcpy(out_begin, punycode_begin, num_ascii_chars); + out_begin[num_ascii_chars] = '\0'; + punycode_begin += num_ascii_chars + 1; + return true; +} + +// Returns the value of `c` as a base-36 digit according to RFC 3492 section 5, +// or -1 if `c` is not such a digit. +int DigitValue(char c) { + if ('0' <= c && c <= '9') return c - '0' + 26; + if ('a' <= c && c <= 'z') return c - 'a'; + if ('A' <= c && c <= 'Z') return c - 'A'; + return -1; +} + +// Consumes the next delta encoding from punycode_begin .. punycode_end, +// updating i accordingly. Returns true on success. Returns false on parse +// failure or arithmetic overflow. 
+bool ScanNextDelta(const char*& punycode_begin, const char* const punycode_end, + uint32_t bias, uint32_t& i) { + uint64_t w = 1; // 64 bits to prevent overflow in w *= kBase - t + + // "for k = base to infinity in steps of base do begin ... end" in RFC 3492 + // section 6.2. Each loop iteration scans one digit of the delta. + for (uint32_t k = kBase; punycode_begin != punycode_end; k += kBase) { + const int digit_value = DigitValue(*punycode_begin++); + if (digit_value < 0) return false; + + // Compute this in 64-bit arithmetic so we can check for overflow afterward. + const uint64_t new_i = i + static_cast<uint64_t>(digit_value) * w; + + // Valid deltas are bounded by (#chars already emitted) * kMaxCodePoint, but + // invalid input could encode an arbitrarily large delta. Nip that in the + // bud here. + static_assert( + kMaxI >= kMaxChars * kMaxCodePoint, + "kMaxI is too small to prevent spurious failures on good input"); + if (new_i > kMaxI) return false; + + static_assert( + kMaxI < (uint64_t{1} << 32), + "Make kMaxI smaller or i 64 bits wide to prevent silent wraparound"); + i = static_cast<uint32_t>(new_i); + + // Compute the threshold that determines whether this is the last digit and + // (if not) what the next digit's place value will be. This logic from RFC + // 3492 section 6.2 is explained in section 3.3. + uint32_t t; + if (k <= bias + kTMin) { + t = kTMin; + } else if (k >= bias + kTMax) { + t = kTMax; + } else { + t = k - bias; + } + if (static_cast<uint32_t>(digit_value) < t) return true; + + // If this gets too large, the range check on new_i in the next iteration + // will catch it. We know this multiplication will not overwrap because w + // is 64 bits wide. 
+ w *= kBase - t; + } + return false; +} + +} // namespace + +absl::Nullable<char*> DecodeRustPunycode(DecodeRustPunycodeOptions options) { + const char* punycode_begin = options.punycode_begin; + const char* const punycode_end = options.punycode_end; + char* const out_begin = options.out_begin; + char* const out_end = options.out_end; + + // Write a NUL terminator first. Later memcpy calls will keep bumping it + // along to its new right place. + const size_t out_size = static_cast<size_t>(out_end - out_begin); + if (out_size == 0) return nullptr; + *out_begin = '\0'; + + // RFC 3492 section 6.2 begins here. We retain the names of integer variables + // appearing in that text. + uint32_t n = 128, i = 0, bias = 72, num_chars = 0; + + // If there are any ASCII characters, consume them and their trailing + // underscore delimiter. + if (!ConsumeOptionalAsciiPrefix(punycode_begin, punycode_end, + out_begin, out_end, num_chars)) { + return nullptr; + } + uint32_t total_utf8_bytes = num_chars; + + BoundedUtf8LengthSequence<kMaxChars> utf8_lengths; + + // "while the input is not exhausted do begin ... end" + while (punycode_begin != punycode_end) { + if (num_chars >= kMaxChars) return nullptr; + + const uint32_t old_i = i; + + if (!ScanNextDelta(punycode_begin, punycode_end, bias, i)) return nullptr; + + // Update bias as in RFC 3492 section 6.1. (We have inlined adapt.) + uint32_t delta = i - old_i; + delta /= (old_i == 0 ? kDamp : 2); + delta += delta/(num_chars + 1); + bias = 0; + while (delta > ((kBase - kTMin) * kTMax)/2) { + delta /= kBase - kTMin; + bias += kBase; + } + bias += ((kBase - kTMin + 1) * delta)/(delta + kSkew); + + // Back in section 6.2, compute the new code point and insertion index. 
+ static_assert( + kMaxI + kMaxCodePoint < (uint64_t{1} << 32), + "Make kMaxI smaller or n 64 bits wide to prevent silent wraparound"); + n += i/(num_chars + 1); + i %= num_chars + 1; + + // To actually insert, we need to convert the code point n to UTF-8 and the + // character index i to an index into the byte stream emitted so far. First + // prepare the UTF-8 encoding for n, rejecting surrogates, overlarge values, + // and anything that won't fit into the remaining output storage. + Utf8ForCodePoint utf8_for_code_point(n); + if (!utf8_for_code_point.ok()) return nullptr; + if (total_utf8_bytes + utf8_for_code_point.length + 1 > out_size) { + return nullptr; + } + + // Now insert the new character into both our length map and the output. + uint32_t n_index = + utf8_lengths.InsertAndReturnSumOfPredecessors( + i, utf8_for_code_point.length); + std::memmove( + out_begin + n_index + utf8_for_code_point.length, out_begin + n_index, + total_utf8_bytes + 1 - n_index); + std::memcpy(out_begin + n_index, utf8_for_code_point.bytes, + utf8_for_code_point.length); + total_utf8_bytes += utf8_for_code_point.length; + ++num_chars; + + // Finally, advance to the next state before continuing. + ++i; + } + + return out_begin + total_utf8_bytes; +} + +} // namespace debugging_internal +ABSL_NAMESPACE_END +} // namespace absl diff --git a/absl/debugging/internal/decode_rust_punycode.h b/absl/debugging/internal/decode_rust_punycode.h new file mode 100644 index 00000000..0ae53ff3 --- /dev/null +++ b/absl/debugging/internal/decode_rust_punycode.h @@ -0,0 +1,55 @@ +// Copyright 2024 The Abseil Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef ABSL_DEBUGGING_INTERNAL_DECODE_RUST_PUNYCODE_H_
#define ABSL_DEBUGGING_INTERNAL_DECODE_RUST_PUNYCODE_H_

#include "absl/base/config.h"
#include "absl/base/nullability.h"

namespace absl {
ABSL_NAMESPACE_BEGIN
namespace debugging_internal {

// Arguments to DecodeRustPunycode: a half-open input range holding the
// Punycode text and a half-open output range to receive the decoded UTF-8.
struct DecodeRustPunycodeOptions {
  const char* punycode_begin;  // first byte of the encoded input
  const char* punycode_end;    // one past the last byte of the encoded input
  char* out_begin;             // first byte of the output buffer
  char* out_end;               // one past the last byte of the output buffer
};

// Given Rust Punycode in `punycode_begin .. punycode_end`, writes the
// corresponding UTF-8 plaintext into `out_begin .. out_end`, followed by a NUL
// character, and returns a pointer to that final NUL on success.  On failure
// returns a null pointer, and the contents of `out_begin .. out_end` are
// unspecified.
//
// Failure occurs in precisely these cases:
//   - Any input byte does not match [0-9a-zA-Z_].
//   - The first input byte is an underscore, but no other underscore appears in
//     the input.
//   - The delta sequence does not represent a valid sequence of code-point
//     insertions.
//   - The plaintext would contain more than 256 code points.
//   - The output range `out_begin .. out_end` is empty or otherwise too small
//     to hold the plaintext together with its terminating NUL.
//
// DecodeRustPunycode is async-signal-safe with bounded runtime and a small
// stack footprint, making it suitable for use in demangling Rust symbol names
// from a signal handler.
+absl::Nullable<char*> DecodeRustPunycode(DecodeRustPunycodeOptions options); + +} // namespace debugging_internal +ABSL_NAMESPACE_END +} // namespace absl + +#endif // ABSL_DEBUGGING_INTERNAL_DECODE_RUST_PUNYCODE_H_ diff --git a/absl/debugging/internal/decode_rust_punycode_test.cc b/absl/debugging/internal/decode_rust_punycode_test.cc new file mode 100644 index 00000000..78d1c332 --- /dev/null +++ b/absl/debugging/internal/decode_rust_punycode_test.cc @@ -0,0 +1,606 @@ +// Copyright 2024 The Abseil Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "absl/debugging/internal/decode_rust_punycode.h" + +#include <cstddef> +#include <cstring> +#include <string> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/base/config.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace debugging_internal { +namespace { + +using ::testing::AllOf; +using ::testing::Eq; +using ::testing::IsNull; +using ::testing::Pointee; +using ::testing::ResultOf; +using ::testing::StrEq; + +class DecodeRustPunycodeTest : public ::testing::Test { + protected: + void FillBufferWithNonzeroBytes() { + // The choice of nonzero value to fill with is arbitrary. The point is just + // to fail tests if DecodeRustPunycode forgets to write the final NUL + // character. 
+ std::memset(buffer_storage_, 0xab, sizeof(buffer_storage_)); + } + + DecodeRustPunycodeOptions WithAmpleSpace() { + FillBufferWithNonzeroBytes(); + + DecodeRustPunycodeOptions options; + options.punycode_begin = punycode_.data(); + options.punycode_end = punycode_.data() + punycode_.size(); + options.out_begin = buffer_storage_; + options.out_end = buffer_storage_ + sizeof(buffer_storage_); + return options; + } + + DecodeRustPunycodeOptions WithJustEnoughSpace() { + FillBufferWithNonzeroBytes(); + + const size_t begin_offset = sizeof(buffer_storage_) - plaintext_.size() - 1; + DecodeRustPunycodeOptions options; + options.punycode_begin = punycode_.data(); + options.punycode_end = punycode_.data() + punycode_.size(); + options.out_begin = buffer_storage_ + begin_offset; + options.out_end = buffer_storage_ + sizeof(buffer_storage_); + return options; + } + + DecodeRustPunycodeOptions WithOneByteTooFew() { + FillBufferWithNonzeroBytes(); + + const size_t begin_offset = sizeof(buffer_storage_) - plaintext_.size(); + DecodeRustPunycodeOptions options; + options.punycode_begin = punycode_.data(); + options.punycode_end = punycode_.data() + punycode_.size(); + options.out_begin = buffer_storage_ + begin_offset; + options.out_end = buffer_storage_ + sizeof(buffer_storage_); + return options; + } + + // Matches a correct return value of DecodeRustPunycode when `golden` is the + // expected plaintext output. 
+ auto PointsToTheNulAfter(const std::string& golden) { + const size_t golden_size = golden.size(); + return AllOf( + Pointee(Eq('\0')), + ResultOf("preceding string body", + [golden_size](const char* p) { return p - golden_size; }, + StrEq(golden))); + } + + std::string punycode_; + std::string plaintext_; + char buffer_storage_[1024]; +}; + +TEST_F(DecodeRustPunycodeTest, MapsEmptyToEmpty) { + punycode_ = ""; + plaintext_ = ""; + + ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()), + PointsToTheNulAfter(plaintext_)); + ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()), + PointsToTheNulAfter(plaintext_)); + EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull()); +} + +TEST_F(DecodeRustPunycodeTest, + StripsTheTrailingDelimiterFromAPureRunOfBasicChars) { + punycode_ = "foo_"; + plaintext_ = "foo"; + + ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()), + PointsToTheNulAfter(plaintext_)); + ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()), + PointsToTheNulAfter(plaintext_)); + EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull()); +} + +TEST_F(DecodeRustPunycodeTest, TreatsTheLastUnderscoreAsTheDelimiter) { + punycode_ = "foo_bar_"; + plaintext_ = "foo_bar"; + + ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()), + PointsToTheNulAfter(plaintext_)); + ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()), + PointsToTheNulAfter(plaintext_)); + EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull()); +} + +TEST_F(DecodeRustPunycodeTest, AcceptsALeadingUnderscoreIfNotTheDelimiter) { + punycode_ = "_foo_"; + plaintext_ = "_foo"; + + ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()), + PointsToTheNulAfter(plaintext_)); + ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()), + PointsToTheNulAfter(plaintext_)); + EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull()); +} + +TEST_F(DecodeRustPunycodeTest, RejectsALeadingUnderscoreDelimiter) { + punycode_ = "_foo"; + + EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull()); +} + 
TEST_F(DecodeRustPunycodeTest, RejectsEmbeddedNul) {
  // The explicit length keeps the embedded NUL and the bytes after it inside
  // punycode_ instead of ending the string at the NUL.
  punycode_ = std::string("foo\0bar_", 8);

  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
}

// Each input places one byte outside [0-9a-zA-Z_] ahead of the delimiter.
TEST_F(DecodeRustPunycodeTest, RejectsAsciiCharsOtherThanIdentifierChars) {
  punycode_ = "foo\007_";
  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());

  punycode_ = "foo-_";
  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());

  punycode_ = "foo;_";
  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());

  punycode_ = "foo\177_";
  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
}

// Bytes with the high bit set are tried both with and without a trailing
// underscore delimiter.
TEST_F(DecodeRustPunycodeTest, RejectsRawNonAsciiChars) {
  punycode_ = "\x80";
  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());

  punycode_ = "\x80_";
  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());

  punycode_ = "\xff";
  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());

  punycode_ = "\xff_";
  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
}

TEST_F(DecodeRustPunycodeTest, RecognizesU0080) {
  // a encodes 0, so the output is the smallest non-ASCII code point standing
  // alone.  (U+0080 PAD is not an identifier character, but DecodeRustPunycode
  // does not check whether non-ASCII characters could belong to an identifier.)
  punycode_ = "a";
  plaintext_ = "\xc2\x80";

  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
              PointsToTheNulAfter(plaintext_));
  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
              PointsToTheNulAfter(plaintext_));
  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}

TEST_F(DecodeRustPunycodeTest, OneByteDeltaSequencesMustBeA) {
  // Because bias = 72 for the first code point, any digit but a/A is nonfinal
  // in one of the first two bytes of a delta sequence.
  punycode_ = "b";
  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());

  punycode_ = "z";
  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());

  punycode_ = "0";
  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());

  punycode_ = "9";
  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
}

TEST_F(DecodeRustPunycodeTest, AcceptsDeltaSequenceBA) {
  punycode_ = "ba";
  plaintext_ = "\xc2\x81";

  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
              PointsToTheNulAfter(plaintext_));
  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
              PointsToTheNulAfter(plaintext_));
  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}

// The first digit's value (letters 0-25, then decimal digits 26-35) is the
// offset of the decoded code point above U+0080.
TEST_F(DecodeRustPunycodeTest, AcceptsOtherDeltaSequencesWithSecondByteA) {
  punycode_ = "ca";
  plaintext_ = "\xc2\x82";
  EXPECT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
              PointsToTheNulAfter(plaintext_));

  punycode_ = "za";
  plaintext_ = "\xc2\x99";
  EXPECT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
              PointsToTheNulAfter(plaintext_));

  punycode_ = "0a";
  plaintext_ = "\xc2\x9a";
  EXPECT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
              PointsToTheNulAfter(plaintext_));

  punycode_ = "1a";
  plaintext_ = "\xc2\x9b";
  EXPECT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
              PointsToTheNulAfter(plaintext_));

  punycode_ = "9a";
  plaintext_ = "£";  // Pound sign, U+00A3
  EXPECT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
              PointsToTheNulAfter(plaintext_));
}

TEST_F(DecodeRustPunycodeTest, RejectsDeltaWhereTheSecondAndLastDigitIsNotA) {
  punycode_ = "bb";
  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());

  punycode_ = "zz";
  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());

  punycode_ = "00";
  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());

  punycode_ = "99";
  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
}

TEST_F(DecodeRustPunycodeTest, AcceptsDeltasWithSecondByteBFollowedByA) {
  punycode_ = "bba";
  plaintext_ = "¤";  // U+00A4
  EXPECT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
              PointsToTheNulAfter(plaintext_));

  punycode_ = "cba";
  plaintext_ = "¥";  // U+00A5
  EXPECT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
              PointsToTheNulAfter(plaintext_));

  punycode_ = "zba";
  plaintext_ = "¼";  // U+00BC
  EXPECT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
              PointsToTheNulAfter(plaintext_));

  punycode_ = "0ba";
  plaintext_ = "½";  // U+00BD
  EXPECT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
              PointsToTheNulAfter(plaintext_));

  punycode_ = "1ba";
  plaintext_ = "¾";  // U+00BE
  EXPECT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
              PointsToTheNulAfter(plaintext_));

  punycode_ = "9ba";
  plaintext_ = "Æ";  // U+00C6
  EXPECT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
              PointsToTheNulAfter(plaintext_));
}

// Tests beyond this point use characters allowed in identifiers, so you can
// prepend _RNvC1cu<decimal length><underscore if [0-9_] follows> to a test
// input and run it through another Rust demangler to verify that the
// corresponding golden output is correct.

+ +TEST_F(DecodeRustPunycodeTest, AcceptsTwoByteCharAlone) { + punycode_ = "0ca"; + plaintext_ = "à"; + + ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()), + PointsToTheNulAfter(plaintext_)); + ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()), + PointsToTheNulAfter(plaintext_)); + EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull()); +} + +TEST_F(DecodeRustPunycodeTest, AcceptsTwoByteCharBeforeBasicChars) { + punycode_ = "_la_mode_yya"; + plaintext_ = "à_la_mode"; + + ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()), + PointsToTheNulAfter(plaintext_)); + ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()), + PointsToTheNulAfter(plaintext_)); + EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull()); +} + +TEST_F(DecodeRustPunycodeTest, AcceptsTwoByteCharAmidBasicChars) { + punycode_ = "verre__vin_m4a"; + plaintext_ = "verre_à_vin"; + + ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()), + PointsToTheNulAfter(plaintext_)); + ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()), + PointsToTheNulAfter(plaintext_)); + EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull()); +} + +TEST_F(DecodeRustPunycodeTest, AcceptsTwoByteCharAfterBasicChars) { + punycode_ = "belt_3na"; + plaintext_ = "beltà"; + + ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()), + PointsToTheNulAfter(plaintext_)); + ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()), + PointsToTheNulAfter(plaintext_)); + EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull()); +} + +TEST_F(DecodeRustPunycodeTest, AcceptsRepeatedTwoByteChar) { + punycode_ = "0caaaa"; + plaintext_ = "àààà"; + + ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()), + PointsToTheNulAfter(plaintext_)); + ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()), + PointsToTheNulAfter(plaintext_)); + EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull()); +} + +TEST_F(DecodeRustPunycodeTest, AcceptsNearbyTwoByteCharsInOrder) { + punycode_ = "3camsuz"; + plaintext_ = "ãéïôù"; + + 
ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()), + PointsToTheNulAfter(plaintext_)); + ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()), + PointsToTheNulAfter(plaintext_)); + EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull()); +} + +TEST_F(DecodeRustPunycodeTest, AcceptsNearbyTwoByteCharsOutOfOrder) { + punycode_ = "3caltsx"; + plaintext_ = "ùéôãï"; + + ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()), + PointsToTheNulAfter(plaintext_)); + ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()), + PointsToTheNulAfter(plaintext_)); + EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull()); +} + +TEST_F(DecodeRustPunycodeTest, AcceptsThreeByteCharAlone) { + punycode_ = "fiq"; + plaintext_ = "中"; + + ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()), + PointsToTheNulAfter(plaintext_)); + ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()), + PointsToTheNulAfter(plaintext_)); + EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull()); +} + +TEST_F(DecodeRustPunycodeTest, AcceptsRepeatedThreeByteChar) { + punycode_ = "fiqaaaa"; + plaintext_ = "中中中中中"; + + ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()), + PointsToTheNulAfter(plaintext_)); + ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()), + PointsToTheNulAfter(plaintext_)); + EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull()); +} + +TEST_F(DecodeRustPunycodeTest, AcceptsThreeByteCharsInOrder) { + punycode_ = "fiq228c"; + plaintext_ = "中文"; + + ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()), + PointsToTheNulAfter(plaintext_)); + ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()), + PointsToTheNulAfter(plaintext_)); + EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull()); +} + +TEST_F(DecodeRustPunycodeTest, AcceptsNearbyThreeByteCharsOutOfOrder) { + punycode_ = "fiq128c"; + plaintext_ = "文中"; + + ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()), + PointsToTheNulAfter(plaintext_)); + ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()), + 
PointsToTheNulAfter(plaintext_)); + EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull()); +} + +TEST_F(DecodeRustPunycodeTest, AcceptsFourByteCharAlone) { + punycode_ = "uy7h"; + plaintext_ = "🂻"; + + ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()), + PointsToTheNulAfter(plaintext_)); + ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()), + PointsToTheNulAfter(plaintext_)); + EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull()); +} + +TEST_F(DecodeRustPunycodeTest, AcceptsFourByteCharBeforeBasicChars) { + punycode_ = "jack__uh63d"; + plaintext_ = "jack_🂻"; + + ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()), + PointsToTheNulAfter(plaintext_)); + ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()), + PointsToTheNulAfter(plaintext_)); + EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull()); +} + +TEST_F(DecodeRustPunycodeTest, AcceptsFourByteCharAmidBasicChars) { + punycode_ = "jack__of_hearts_ki37n"; + plaintext_ = "jack_🂻_of_hearts"; + + ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()), + PointsToTheNulAfter(plaintext_)); + ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()), + PointsToTheNulAfter(plaintext_)); + EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull()); +} + +TEST_F(DecodeRustPunycodeTest, AcceptsFourByteCharAfterBasicChars) { + punycode_ = "_of_hearts_kz45i"; + plaintext_ = "🂻_of_hearts"; + + ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()), + PointsToTheNulAfter(plaintext_)); + ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()), + PointsToTheNulAfter(plaintext_)); + EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull()); +} + +TEST_F(DecodeRustPunycodeTest, AcceptsRepeatedFourByteChar) { + punycode_ = "uy7haaaa"; + plaintext_ = "🂻🂻🂻🂻🂻"; + + ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()), + PointsToTheNulAfter(plaintext_)); + ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()), + PointsToTheNulAfter(plaintext_)); + EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull()); +} + 
TEST_F(DecodeRustPunycodeTest, AcceptsNearbyFourByteCharsInOrder) {
  punycode_ = "8x7hcjmf";
  plaintext_ = "🂦🂧🂪🂭🂮";

  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
              PointsToTheNulAfter(plaintext_));
  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
              PointsToTheNulAfter(plaintext_));
  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}

TEST_F(DecodeRustPunycodeTest, AcceptsNearbyFourByteCharsOutOfOrder) {
  punycode_ = "8x7hcild";
  plaintext_ = "🂮🂦🂭🂪🂧";

  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
              PointsToTheNulAfter(plaintext_));
  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
              PointsToTheNulAfter(plaintext_));
  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}

// Two-, three-, and four-byte characters together in one plaintext.
TEST_F(DecodeRustPunycodeTest, AcceptsAMixtureOfByteLengths) {
  punycode_ = "3caltsx2079ivf8aiuy7cja3a6ak";
  plaintext_ = "ùéôãï中文🂮🂦🂭🂪🂧";

  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
              PointsToTheNulAfter(plaintext_));
  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
              PointsToTheNulAfter(plaintext_));
  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}

// The digits 1 through 9 all have values above the largest threshold (26), so
// none of them can end the delta; the running total passes the decoder's
// overflow limit before the final 'a' is reached.
TEST_F(DecodeRustPunycodeTest, RejectsOverlargeDeltas) {
  punycode_ = "123456789a";

  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
}

// Finally, we test on a few prose and poetry snippets as a defense in depth.
// If our artificial short test inputs did not exercise a bug that is tickled by
// patterns typical of real human writing, maybe real human writing will catch
// that.
//
// These test inputs are extracted from texts old enough to be out of copyright
// that probe a variety of ranges of code-point space.  All are longer than 32
// code points, so they exercise the carrying of seminibbles from one uint64_t
// to the next higher one in BoundedUtf8LengthSequence.

// The first three lines of the Old English epic _Beowulf_, mostly ASCII with a
// few archaic two-byte letters interspersed.
TEST_F(DecodeRustPunycodeTest, Beowulf) {
  punycode_ = "hwt_we_gardena_in_geardagum_"
              "eodcyninga_rym_gefrunon_"
              "hu_a_elingas_ellen_fremedon_hxg9c70do9alau";
  plaintext_ = "hwæt_we_gardena_in_geardagum_"
               "þeodcyninga_þrym_gefrunon_"
               "hu_ða_æþelingas_ellen_fremedon";

  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
              PointsToTheNulAfter(plaintext_));
  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
              PointsToTheNulAfter(plaintext_));
  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}

// The whole of 過故人莊 by the 8th-century Chinese poet 孟浩然
// (Meng Haoran), exercising three-byte-character processing.
TEST_F(DecodeRustPunycodeTest, MengHaoran) {
  punycode_ = "gmq4ss0cfvao1e2wg8mcw8b0wkl9a7tt90a8riuvbk7t8kbv9a66ogofvzlf6"
              "3d01ybn1u28dyqi5q2cxyyxnk5d2gx1ks9ddvfm17bk6gbsd6wftrav60u4ta";
  plaintext_ = "故人具雞黍" "邀我至田家"
               "綠樹村邊合" "青山郭外斜"
               "開軒面場圃" "把酒話桑麻"
               "待到重陽日" "還來就菊花";

  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
              PointsToTheNulAfter(plaintext_));
  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
              PointsToTheNulAfter(plaintext_));
  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}

// A poem of the 8th-century Japanese poet 山上憶良 (Yamanoue no Okura).
// Japanese mixes two-byte and three-byte characters: a good workout for codecs.
TEST_F(DecodeRustPunycodeTest, YamanoueNoOkura) {
  punycode_ = "48jdaa3a6ccpepjrsmlb0q4bwcdtid8fg6c0cai9822utqeruk3om0u4f2wbp0"
              "em23do0op23cc2ff70mb6tae8aq759gja";
  plaintext_ = "瓜食めば"
               "子ども思ほゆ"
               "栗食めば"
               "まして偲はゆ"
               "何処より"
               "来りしものそ"
               "眼交に"
               "もとな懸りて"
               "安眠し寝さぬ";

  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
              PointsToTheNulAfter(plaintext_));
  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
              PointsToTheNulAfter(plaintext_));
  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}

// The first two lines of the Phoenician-language inscription on the sarcophagus
// of Eshmunazar II of Sidon, 6th century BCE.  Phoenician and many other
// archaic scripts are allocated in the Supplementary Multilingual Plane
// (U+10000 through U+1FFFF) and thus exercise four-byte-character processing.
TEST_F(DecodeRustPunycodeTest, EshmunazarSarcophagus) {
  punycode_ = "wj9caaabaabbaaohcacxvhdc7bgxbccbdcjeacddcedcdlddbdbddcdbdcknfcee"
              "ifel8del2a7inq9fhcpxikms7a4a9ac9ataaa0g";
  plaintext_ = "𐤁𐤉𐤓𐤇𐤁𐤋𐤁𐤔𐤍𐤕𐤏𐤎𐤓"
               "𐤅𐤀𐤓𐤁𐤏𐤗𐤖𐤖𐤖𐤖𐤋𐤌𐤋𐤊𐤉𐤌𐤋𐤊"
               "𐤀𐤔𐤌𐤍𐤏𐤆𐤓𐤌𐤋𐤊𐤑𐤃𐤍𐤌"
               "𐤁𐤍𐤌𐤋𐤊𐤕𐤁𐤍𐤕𐤌𐤋𐤊𐤑𐤃𐤍𐤌"
               "𐤃𐤁𐤓𐤌𐤋𐤊𐤀𐤔𐤌𐤍𐤏𐤆𐤓𐤌𐤋𐤊"
               "𐤑𐤃𐤍𐤌𐤋𐤀𐤌𐤓𐤍𐤂𐤆𐤋𐤕";

  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
              PointsToTheNulAfter(plaintext_));
  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
              PointsToTheNulAfter(plaintext_));
  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}

}  // namespace
}  // namespace debugging_internal
ABSL_NAMESPACE_END
}  // namespace absl