Decoder for Rust-style Punycode encodings of bounded length.

PiperOrigin-RevId: 647093624 Change-Id: Ic76bfa4aa8fb616cb23095ce7bfa30c3812dcb21
author: Chris Mihelich <cmihelic@google.com> 2024-06-26 15:13:52 -0700
committer: Copybara-Service <copybara-worker@google.com> 2024-06-26 15:14:45 -0700
commit: 16452e1418c1c2a8bcf4a99238e190ba901a20a6 (patch)
tree: 551f86735011199f26edd2a635683c33bcd80cd6 /absl/debugging
parent: 63d4b2fe1e0d4bd6a7b916f398643db40c35624b (diff)
5 files changed, 951 insertions, 0 deletions
diff --git a/absl/debugging/BUILD.bazel b/absl/debugging/BUILD.bazel
index 0cd8015c..6e867cef 100644
--- a/absl/debugging/BUILD.bazel
+++ b/absl/debugging/BUILD.bazel
@@ -220,12 +220,14 @@ cc_library(
 cc_library(
     name = "demangle_internal",
     srcs = [
+        "internal/decode_rust_punycode.cc",
         "internal/demangle.cc",
         "internal/demangle_rust.cc",
         "internal/utf8_for_code_point.cc",
     ],
     hdrs = [
         "internal/bounded_utf8_length_sequence.h",
+        "internal/decode_rust_punycode.h",
         "internal/demangle.h",
         "internal/demangle_rust.h",
         "internal/utf8_for_code_point.h",
@@ -240,6 +242,7 @@ cc_library(
         "//absl/base",
         "//absl/base:config",
         "//absl/base:core_headers",
+        "//absl/base:nullability",
         "//absl/numeric:bits",
     ],
 )
@@ -258,6 +261,19 @@ cc_test(
 )
 
 cc_test(
+    name = "decode_rust_punycode_test",
+    srcs = ["internal/decode_rust_punycode_test.cc"],
+    copts = ABSL_TEST_COPTS,
+    linkopts = ABSL_DEFAULT_LINKOPTS,
+    deps = [
+        ":demangle_internal",
+        "//absl/base:config",
+        "@com_google_googletest//:gtest",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_test(
     name = "demangle_rust_test",
     srcs = ["internal/demangle_rust_test.cc"],
     copts = ABSL_TEST_COPTS,
diff --git a/absl/debugging/CMakeLists.txt b/absl/debugging/CMakeLists.txt
index 738ae4e3..14707775 100644
--- a/absl/debugging/CMakeLists.txt
+++ b/absl/debugging/CMakeLists.txt
@@ -196,10 +196,12 @@ absl_cc_library(
     demangle_internal
   HDRS
     "internal/bounded_utf8_length_sequence.h"
+    "internal/decode_rust_punycode.h"
     "internal/demangle.h"
     "internal/demangle_rust.h"
     "internal/utf8_for_code_point.h"
   SRCS
+    "internal/decode_rust_punycode.cc"
     "internal/demangle.cc"
     "internal/demangle_rust.cc"
     "internal/utf8_for_code_point.cc"
@@ -209,6 +211,7 @@ absl_cc_library(
     absl::base
     absl::bits
     absl::core_headers
+    absl::nullability
   PUBLIC
 )
 
@@ -227,6 +230,19 @@ absl_cc_test(
 
 absl_cc_test(
   NAME
+    decode_rust_punycode_test
+  SRCS
+    "internal/decode_rust_punycode_test.cc"
+  COPTS
+    ${ABSL_TEST_COPTS}
+  DEPS
+    absl::demangle_internal
+    absl::config
+    GTest::gmock_main
+)
+
+absl_cc_test(
+  NAME
     demangle_rust_test
   SRCS
     "internal/demangle_rust_test.cc"
diff --git a/absl/debugging/internal/decode_rust_punycode.cc b/absl/debugging/internal/decode_rust_punycode.cc
new file mode 100644
index 00000000..43b46bf9
--- /dev/null
+++ b/absl/debugging/internal/decode_rust_punycode.cc
@@ -0,0 +1,258 @@
+// Copyright 2024 The Abseil Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "absl/debugging/internal/decode_rust_punycode.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "absl/base/config.h"
+#include "absl/base/nullability.h"
+#include "absl/debugging/internal/bounded_utf8_length_sequence.h"
+#include "absl/debugging/internal/utf8_for_code_point.h"
+
+namespace absl {
+ABSL_NAMESPACE_BEGIN
+namespace debugging_internal {
+
+namespace {
+
+// Decoding Punycode requires repeated random-access insertion into a stream of
+// variable-length UTF-8 code-point encodings.  We need this to be tolerably
+// fast (no N^2 slowdown for unfortunate inputs), and we can't allocate any data
+// structures on the heap (async-signal-safety).
+//
+// It is pragmatic to impose a moderately low limit on the identifier length and
+// bail out if we ever hit it.  Then BoundedUtf8LengthSequence efficiently
+// determines where to insert the next code point, and memmove efficiently makes
+// room for it.
+//
+// The chosen limit is a round number several times larger than identifiers
+// expected in practice, yet still small enough that a memmove of this many
+// UTF-8 characters is not much more expensive than the division and modulus
+// operations that Punycode decoding requires.
+constexpr uint32_t kMaxChars = 256;
+
+// Constants from RFC 3492 section 5.
+constexpr uint32_t kBase = 36, kTMin = 1, kTMax = 26, kSkew = 38, kDamp = 700;
+
+constexpr uint32_t kMaxCodePoint = 0x10ffff;
+
+// Overflow threshold in DecodeRustPunycode's inner loop; see comments there.
+constexpr uint32_t kMaxI = 1 << 30;
+
+// If punycode_begin .. punycode_end begins with a prefix matching the regular
+// expression [0-9a-zA-Z_]+_, removes that prefix, copies all but the final
+// underscore into out_begin .. out_end, sets num_ascii_chars to the number of
+// bytes copied, and returns true.  (A prefix of this sort represents the
+// nonempty subsequence of ASCII characters in the corresponding plaintext.)
+//
+// If punycode_begin .. punycode_end does not contain an underscore, sets
+// num_ascii_chars to zero and returns true.  (The encoding of a plaintext
+// without any ASCII characters does not carry such a prefix.)
+//
+// Returns false and zeroes num_ascii_chars on failure (either parse error or
+// not enough space in the output buffer).
+bool ConsumeOptionalAsciiPrefix(const char*& punycode_begin,
+                                const char* const punycode_end,
+                                char* const out_begin,
+                                char* const out_end,
+                                uint32_t& num_ascii_chars) {
+  num_ascii_chars = 0;
+
+  // Remember the last underscore if any.  Also use the same string scan to
+  // reject any ASCII bytes that do not belong in an identifier, including NUL,
+  // as well as non-ASCII bytes, which should have been delta-encoded instead.
+  int last_underscore = -1;
+  for (int i = 0; i < punycode_end - punycode_begin; ++i) {
+    const char c = punycode_begin[i];
+    if (c == '_') {
+      last_underscore = i;
+      continue;
+    }
+    // We write out the meaning of absl::ascii_isalnum rather than call that
+    // function because its documentation does not promise it will remain
+    // async-signal-safe under future development.
+    if ('a' <= c && c <= 'z') continue;
+    if ('A' <= c && c <= 'Z') continue;
+    if ('0' <= c && c <= '9') continue;
+    return false;
+  }
+
+  // If there was no underscore, that means there were no ASCII characters in
+  // the plaintext, so there is no prefix to consume.  Our work is done.
+  if (last_underscore < 0) return true;
+
+  // Otherwise there will be an underscore delimiter somewhere.  It can't be
+  // initial because then there would be no ASCII characters to its left, and no
+  // delimiter would have been added in that case.
+  if (last_underscore == 0) return false;
+
+  // Any other position is reasonable.  Make sure there's room in the buffer.
+  if (last_underscore + 1 > out_end - out_begin) return false;
+
+  // Consume and write out the ASCII characters.
+  num_ascii_chars = static_cast<uint32_t>(last_underscore);
+  std::memcpy(out_begin, punycode_begin, num_ascii_chars);
+  out_begin[num_ascii_chars] = '\0';
+  punycode_begin += num_ascii_chars + 1;
+  return true;
+}
+
+// Returns the value of `c` as a base-36 digit according to RFC 3492 section 5,
+// or -1 if `c` is not such a digit.
+int DigitValue(char c) {
+  if ('0' <= c && c <= '9') return c - '0' + 26;
+  if ('a' <= c && c <= 'z') return c - 'a';
+  if ('A' <= c && c <= 'Z') return c - 'A';
+  return -1;
+}
+
+// Consumes the next delta encoding from punycode_begin .. punycode_end,
+// updating i accordingly.  Returns true on success.  Returns false on parse
+// failure or arithmetic overflow.
+bool ScanNextDelta(const char*& punycode_begin, const char* const punycode_end,
+                   uint32_t bias, uint32_t& i) {
+  uint64_t w = 1;  // 64 bits to prevent overflow in w *= kBase - t
+
+  // "for k = base to infinity in steps of base do begin ... end" in RFC 3492
+  // section 6.2.  Each loop iteration scans one digit of the delta.
+  for (uint32_t k = kBase; punycode_begin != punycode_end; k += kBase) {
+    const int digit_value = DigitValue(*punycode_begin++);
+    if (digit_value < 0) return false;
+
+    // Compute this in 64-bit arithmetic so we can check for overflow afterward.
+    const uint64_t new_i = i + static_cast<uint64_t>(digit_value) * w;
+
+    // Valid deltas are bounded by (#chars already emitted) * kMaxCodePoint, but
+    // invalid input could encode an arbitrarily large delta.  Nip that in the
+    // bud here.
+    static_assert(
+        kMaxI >= kMaxChars * kMaxCodePoint,
+        "kMaxI is too small to prevent spurious failures on good input");
+    if (new_i > kMaxI) return false;
+
+    static_assert(
+        kMaxI < (uint64_t{1} << 32),
+        "Make kMaxI smaller or i 64 bits wide to prevent silent wraparound");
+    i = static_cast<uint32_t>(new_i);
+
+    // Compute the threshold that determines whether this is the last digit and
+    // (if not) what the next digit's place value will be.  This logic from RFC
+    // 3492 section 6.2 is explained in section 3.3.
+    uint32_t t;
+    if (k <= bias + kTMin) {
+      t = kTMin;
+    } else if (k >= bias + kTMax) {
+      t = kTMax;
+    } else {
+      t = k - bias;
+    }
+    if (static_cast<uint32_t>(digit_value) < t) return true;
+
+    // If this gets too large, the range check on new_i in the next iteration
+    // will catch it.  We know this multiplication will not overwrap because w
+    // is 64 bits wide.
+    w *= kBase - t;
+  }
+  return false;
+}
+
+}  // namespace
+
+absl::Nullable<char*> DecodeRustPunycode(DecodeRustPunycodeOptions options) {
+  const char* punycode_begin = options.punycode_begin;
+  const char* const punycode_end = options.punycode_end;
+  char* const out_begin = options.out_begin;
+  char* const out_end = options.out_end;
+
+  // Write a NUL terminator first.  The ASCII prefix copy and each memmove
+  // below will keep bumping it along to its new right place.
+  const size_t out_size = static_cast<size_t>(out_end - out_begin);
+  if (out_size == 0) return nullptr;
+  *out_begin = '\0';
+
+  // RFC 3492 section 6.2 begins here.  We retain the names of integer variables
+  // appearing in that text.
+  uint32_t n = 128, i = 0, bias = 72, num_chars = 0;
+
+  // If there are any ASCII characters, consume them and their trailing
+  // underscore delimiter.
+  if (!ConsumeOptionalAsciiPrefix(punycode_begin, punycode_end,
+                                  out_begin, out_end, num_chars)) {
+    return nullptr;
+  }
+  uint32_t total_utf8_bytes = num_chars;
+
+  BoundedUtf8LengthSequence<kMaxChars> utf8_lengths;
+
+  // "while the input is not exhausted do begin ... end"
+  while (punycode_begin != punycode_end) {
+    if (num_chars >= kMaxChars) return nullptr;
+
+    const uint32_t old_i = i;
+
+    if (!ScanNextDelta(punycode_begin, punycode_end, bias, i)) return nullptr;
+
+    // Update bias as in RFC 3492 section 6.1.  (We have inlined adapt.)
+    uint32_t delta = i - old_i;
+    delta /= (old_i == 0 ? kDamp : 2);
+    delta += delta/(num_chars + 1);
+    bias = 0;
+    while (delta > ((kBase - kTMin) * kTMax)/2) {
+      delta /= kBase - kTMin;
+      bias += kBase;
+    }
+    bias += ((kBase - kTMin + 1) * delta)/(delta + kSkew);
+
+    // Back in section 6.2, compute the new code point and insertion index.
+    static_assert(
+        kMaxI + kMaxCodePoint < (uint64_t{1} << 32),
+        "Make kMaxI smaller or n 64 bits wide to prevent silent wraparound");
+    n += i/(num_chars + 1);
+    i %= num_chars + 1;
+
+    // To actually insert, we need to convert the code point n to UTF-8 and the
+    // character index i to an index into the byte stream emitted so far.  First
+    // prepare the UTF-8 encoding for n, rejecting surrogates, overlarge values,
+    // and anything that won't fit into the remaining output storage.
+    Utf8ForCodePoint utf8_for_code_point(n);
+    if (!utf8_for_code_point.ok()) return nullptr;
+    if (total_utf8_bytes + utf8_for_code_point.length + 1 > out_size) {
+      return nullptr;
+    }
+
+    // Now insert the new character into both our length map and the output.
+    uint32_t n_index =
+        utf8_lengths.InsertAndReturnSumOfPredecessors(
+            i, utf8_for_code_point.length);
+    std::memmove(
+        out_begin + n_index + utf8_for_code_point.length, out_begin + n_index,
+        total_utf8_bytes + 1 - n_index);
+    std::memcpy(out_begin + n_index, utf8_for_code_point.bytes,
+                utf8_for_code_point.length);
+    total_utf8_bytes += utf8_for_code_point.length;
+    ++num_chars;
+
+    // Finally, advance to the next state before continuing.
+    ++i;
+  }
+
+  return out_begin + total_utf8_bytes;
+}
+
+}  // namespace debugging_internal
+ABSL_NAMESPACE_END
+}  // namespace absl
diff --git a/absl/debugging/internal/decode_rust_punycode.h b/absl/debugging/internal/decode_rust_punycode.h
new file mode 100644
index 00000000..0ae53ff3
--- /dev/null
+++ b/absl/debugging/internal/decode_rust_punycode.h
@@ -0,0 +1,55 @@
+// Copyright 2024 The Abseil Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ABSL_DEBUGGING_INTERNAL_DECODE_RUST_PUNYCODE_H_
+#define ABSL_DEBUGGING_INTERNAL_DECODE_RUST_PUNYCODE_H_
+
+#include "absl/base/config.h"
+#include "absl/base/nullability.h"
+
+namespace absl {
+ABSL_NAMESPACE_BEGIN
+namespace debugging_internal {
+
+struct DecodeRustPunycodeOptions {
+  const char* punycode_begin;
+  const char* punycode_end;
+  char* out_begin;
+  char* out_end;
+};
+
+// Given Rust Punycode in `punycode_begin .. punycode_end`, writes the
+// corresponding UTF-8 plaintext into `out_begin .. out_end`, followed by a NUL
+// character, and returns a pointer to that final NUL on success.  On failure
+// returns a null pointer, and the contents of `out_begin .. out_end` are
+// unspecified.
+//
+// Failure occurs in these cases:
+//   - Any input byte does not match [0-9a-zA-Z_].
+//   - The only underscore in the input is its first byte.
+//   - The deltas do not represent a valid sequence of code-point insertions.
+//   - A code point must be inserted, but the plaintext already holds 256 or
+//     more code points.
+//   - The output buffer is too small for the plaintext and its final NUL.
+//
+// DecodeRustPunycode is async-signal-safe with bounded runtime and a small
+// stack footprint, making it suitable for use in demangling Rust symbol names
+// from a signal handler.
+absl::Nullable<char*> DecodeRustPunycode(DecodeRustPunycodeOptions options);
+
+}  // namespace debugging_internal
+ABSL_NAMESPACE_END
+}  // namespace absl
+
+#endif  // ABSL_DEBUGGING_INTERNAL_DECODE_RUST_PUNYCODE_H_
diff --git a/absl/debugging/internal/decode_rust_punycode_test.cc b/absl/debugging/internal/decode_rust_punycode_test.cc
new file mode 100644
index 00000000..78d1c332
--- /dev/null
+++ b/absl/debugging/internal/decode_rust_punycode_test.cc
@@ -0,0 +1,606 @@
+// Copyright 2024 The Abseil Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "absl/debugging/internal/decode_rust_punycode.h"
+
+#include <cstddef>
+#include <cstring>
+#include <string>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/base/config.h"
+
+namespace absl {
+ABSL_NAMESPACE_BEGIN
+namespace debugging_internal {
+namespace {
+
+using ::testing::AllOf;
+using ::testing::Eq;
+using ::testing::IsNull;
+using ::testing::Pointee;
+using ::testing::ResultOf;
+using ::testing::StrEq;
+
+class DecodeRustPunycodeTest : public ::testing::Test {
+ protected:
+  void FillBufferWithNonzeroBytes() {
+    // The choice of nonzero value to fill with is arbitrary.  The point is just
+    // to fail tests if DecodeRustPunycode forgets to write the final NUL
+    // character.
+    std::memset(buffer_storage_, 0xab, sizeof(buffer_storage_));
+  }
+
+  DecodeRustPunycodeOptions WithAmpleSpace() {
+    FillBufferWithNonzeroBytes();
+
+    DecodeRustPunycodeOptions options;
+    options.punycode_begin = punycode_.data();
+    options.punycode_end = punycode_.data() + punycode_.size();
+    options.out_begin = buffer_storage_;
+    options.out_end = buffer_storage_ + sizeof(buffer_storage_);
+    return options;
+  }
+
+  DecodeRustPunycodeOptions WithJustEnoughSpace() {
+    FillBufferWithNonzeroBytes();
+
+    const size_t begin_offset = sizeof(buffer_storage_) - plaintext_.size() - 1;
+    DecodeRustPunycodeOptions options;
+    options.punycode_begin = punycode_.data();
+    options.punycode_end = punycode_.data() + punycode_.size();
+    options.out_begin = buffer_storage_ + begin_offset;
+    options.out_end = buffer_storage_ + sizeof(buffer_storage_);
+    return options;
+  }
+
+  DecodeRustPunycodeOptions WithOneByteTooFew() {
+    FillBufferWithNonzeroBytes();
+
+    const size_t begin_offset = sizeof(buffer_storage_) - plaintext_.size();
+    DecodeRustPunycodeOptions options;
+    options.punycode_begin = punycode_.data();
+    options.punycode_end = punycode_.data() + punycode_.size();
+    options.out_begin = buffer_storage_ + begin_offset;
+    options.out_end = buffer_storage_ + sizeof(buffer_storage_);
+    return options;
+  }
+
+  // Matches a correct return value of DecodeRustPunycode when `golden` is the
+  // expected plaintext output.
+  auto PointsToTheNulAfter(const std::string& golden) {
+    const size_t golden_size = golden.size();
+    return AllOf(
+        Pointee(Eq('\0')),
+        ResultOf("preceding string body",
+                 [golden_size](const char* p) { return p - golden_size; },
+                 StrEq(golden)));
+  }
+
+  std::string punycode_;
+  std::string plaintext_;
+  char buffer_storage_[1024];
+};
+
+TEST_F(DecodeRustPunycodeTest, MapsEmptyToEmpty) {
+  punycode_ = "";
+  plaintext_ = "";
+
+  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
+              PointsToTheNulAfter(plaintext_));
+  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
+}
+
+TEST_F(DecodeRustPunycodeTest,
+       StripsTheTrailingDelimiterFromAPureRunOfBasicChars) {
+  punycode_ = "foo_";
+  plaintext_ = "foo";
+
+  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
+              PointsToTheNulAfter(plaintext_));
+  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
+}
+
+TEST_F(DecodeRustPunycodeTest, TreatsTheLastUnderscoreAsTheDelimiter) {
+  punycode_ = "foo_bar_";
+  plaintext_ = "foo_bar";
+
+  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
+              PointsToTheNulAfter(plaintext_));
+  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
+}
+
+TEST_F(DecodeRustPunycodeTest, AcceptsALeadingUnderscoreIfNotTheDelimiter) {
+  punycode_ = "_foo_";
+  plaintext_ = "_foo";
+
+  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
+              PointsToTheNulAfter(plaintext_));
+  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
+}
+
+TEST_F(DecodeRustPunycodeTest, RejectsALeadingUnderscoreDelimiter) {
+  punycode_ = "_foo";
+
+  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
+}
+
+TEST_F(DecodeRustPunycodeTest, RejectsEmbeddedNul) {
+  punycode_ = std::string("foo\0bar_", 8);
+
+  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
+}
+
+TEST_F(DecodeRustPunycodeTest, RejectsAsciiCharsOtherThanIdentifierChars) {
+  punycode_ = "foo\007_";
+  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
+
+  punycode_ = "foo-_";
+  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
+
+  punycode_ = "foo;_";
+  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
+
+  punycode_ = "foo\177_";
+  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
+}
+
+TEST_F(DecodeRustPunycodeTest, RejectsRawNonAsciiChars) {
+  punycode_ = "\x80";
+  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
+
+  punycode_ = "\x80_";
+  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
+
+  punycode_ = "\xff";
+  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
+
+  punycode_ = "\xff_";
+  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
+}
+
+TEST_F(DecodeRustPunycodeTest, RecognizesU0080) {
+  // a encodes 0, so the output is the smallest non-ASCII code point standing
+  // alone.  (U+0080 PAD is not an identifier character, but DecodeRustPunycode
+  // does not check whether non-ASCII characters could belong to an identifier.)
+  punycode_ = "a";
+  plaintext_ = "\xc2\x80";
+
+  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
+              PointsToTheNulAfter(plaintext_));
+  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
+}
+
+TEST_F(DecodeRustPunycodeTest, OneByteDeltaSequencesMustBeA) {
+  // Because bias = 72 for the first code point, any digit but a/A is nonfinal
+  // in one of the first two bytes of a delta sequence.
+  punycode_ = "b";
+  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
+
+  punycode_ = "z";
+  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
+
+  punycode_ = "0";
+  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
+
+  punycode_ = "9";
+  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
+}
+
+TEST_F(DecodeRustPunycodeTest, AcceptsDeltaSequenceBA) {
+  punycode_ = "ba";
+  plaintext_ = "\xc2\x81";
+
+  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
+              PointsToTheNulAfter(plaintext_));
+  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
+}
+
+TEST_F(DecodeRustPunycodeTest, AcceptsOtherDeltaSequencesWithSecondByteA) {
+  punycode_ = "ca";
+  plaintext_ = "\xc2\x82";
+  EXPECT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+
+  punycode_ = "za";
+  plaintext_ = "\xc2\x99";
+  EXPECT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+
+  punycode_ = "0a";
+  plaintext_ = "\xc2\x9a";
+  EXPECT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+
+  punycode_ = "1a";
+  plaintext_ = "\xc2\x9b";
+  EXPECT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+
+  punycode_ = "9a";
+  plaintext_ = "£";  // Pound sign, U+00A3
+  EXPECT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+}
+
+TEST_F(DecodeRustPunycodeTest, RejectsDeltaWhereTheSecondAndLastDigitIsNotA) {
+  punycode_ = "bb";
+  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
+
+  punycode_ = "zz";
+  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
+
+  punycode_ = "00";
+  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
+
+  punycode_ = "99";
+  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
+}
+
+TEST_F(DecodeRustPunycodeTest, AcceptsDeltasWithSecondByteBFollowedByA) {
+  punycode_ = "bba";
+  plaintext_ = "¤";  // U+00A4
+  EXPECT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+
+  punycode_ = "cba";
+  plaintext_ = "¥";  // U+00A5
+  EXPECT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+
+  punycode_ = "zba";
+  plaintext_ = "¼";  // U+00BC
+  EXPECT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+
+  punycode_ = "0ba";
+  plaintext_ = "½";  // U+00BD
+  EXPECT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+
+  punycode_ = "1ba";
+  plaintext_ = "¾";  // U+00BE
+  EXPECT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+
+  punycode_ = "9ba";
+  plaintext_ = "Æ";  // U+00C6
+  EXPECT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+}
+
+// Tests beyond this point use characters allowed in identifiers, so you can
+// prepend _RNvC1cu<decimal length><underscore if [0-9_] follows> to a test
+// input and run it through another Rust demangler to verify that the
+// corresponding golden output is correct.
+
+TEST_F(DecodeRustPunycodeTest, AcceptsTwoByteCharAlone) {
+  punycode_ = "0ca";
+  plaintext_ = "à";
+
+  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
+              PointsToTheNulAfter(plaintext_));
+  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
+}
+
+TEST_F(DecodeRustPunycodeTest, AcceptsTwoByteCharBeforeBasicChars) {
+  punycode_ = "_la_mode_yya";
+  plaintext_ = "à_la_mode";
+
+  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
+              PointsToTheNulAfter(plaintext_));
+  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
+}
+
+TEST_F(DecodeRustPunycodeTest, AcceptsTwoByteCharAmidBasicChars) {
+  punycode_ = "verre__vin_m4a";
+  plaintext_ = "verre_à_vin";
+
+  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
+              PointsToTheNulAfter(plaintext_));
+  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
+}
+
+TEST_F(DecodeRustPunycodeTest, AcceptsTwoByteCharAfterBasicChars) {
+  punycode_ = "belt_3na";
+  plaintext_ = "beltà";
+
+  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
+              PointsToTheNulAfter(plaintext_));
+  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
+}
+
+TEST_F(DecodeRustPunycodeTest, AcceptsRepeatedTwoByteChar) {
+  punycode_ = "0caaaa";
+  plaintext_ = "àààà";
+
+  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
+              PointsToTheNulAfter(plaintext_));
+  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
+}
+
+TEST_F(DecodeRustPunycodeTest, AcceptsNearbyTwoByteCharsInOrder) {
+  punycode_ = "3camsuz";
+  plaintext_ = "ãéïôù";
+
+  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
+              PointsToTheNulAfter(plaintext_));
+  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
+}
+
+TEST_F(DecodeRustPunycodeTest, AcceptsNearbyTwoByteCharsOutOfOrder) {
+  punycode_ = "3caltsx";
+  plaintext_ = "ùéôãï";
+
+  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
+              PointsToTheNulAfter(plaintext_));
+  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
+}
+
+TEST_F(DecodeRustPunycodeTest, AcceptsThreeByteCharAlone) {
+  punycode_ = "fiq";
+  plaintext_ = "中";
+
+  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
+              PointsToTheNulAfter(plaintext_));
+  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
+}
+
+TEST_F(DecodeRustPunycodeTest, AcceptsRepeatedThreeByteChar) {
+  punycode_ = "fiqaaaa";
+  plaintext_ = "中中中中中";
+
+  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
+              PointsToTheNulAfter(plaintext_));
+  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
+}
+
+TEST_F(DecodeRustPunycodeTest, AcceptsThreeByteCharsInOrder) {
+  punycode_ = "fiq228c";
+  plaintext_ = "中文";
+
+  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
+              PointsToTheNulAfter(plaintext_));
+  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
+}
+
+TEST_F(DecodeRustPunycodeTest, AcceptsNearbyThreeByteCharsOutOfOrder) {
+  punycode_ = "fiq128c";
+  plaintext_ = "文中";
+
+  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
+              PointsToTheNulAfter(plaintext_));
+  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
+}
+
+TEST_F(DecodeRustPunycodeTest, AcceptsFourByteCharAlone) {
+  punycode_ = "uy7h";
+  plaintext_ = "🂻";
+
+  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
+              PointsToTheNulAfter(plaintext_));
+  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
+}
+
+TEST_F(DecodeRustPunycodeTest, AcceptsFourByteCharBeforeBasicChars) {
+  punycode_ = "jack__uh63d";
+  plaintext_ = "jack_🂻";
+
+  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
+              PointsToTheNulAfter(plaintext_));
+  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
+}
+
+TEST_F(DecodeRustPunycodeTest, AcceptsFourByteCharAmidBasicChars) {
+  punycode_ = "jack__of_hearts_ki37n";
+  plaintext_ = "jack_🂻_of_hearts";  // 🂻 sits between two runs of ASCII
+  // Expect success with ample or exact space (per helper names), null if short.
+  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
+              PointsToTheNulAfter(plaintext_));
+  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
+}
+
+TEST_F(DecodeRustPunycodeTest, AcceptsFourByteCharAfterBasicChars) {
+  punycode_ = "_of_hearts_kz45i";
+  plaintext_ = "🂻_of_hearts";  // NOTE(review): 🂻 is first here, despite "After"
+  // Expect success with ample or exact space (per helper names), null if short.
+  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
+              PointsToTheNulAfter(plaintext_));
+  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
+}
+
+TEST_F(DecodeRustPunycodeTest, AcceptsRepeatedFourByteChar) {
+  punycode_ = "uy7haaaa";
+  plaintext_ = "🂻🂻🂻🂻🂻";  // five copies of U+1F0BB
+  // Expect success with ample or exact space (per helper names), null if short.
+  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
+              PointsToTheNulAfter(plaintext_));
+  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
+}
+
+TEST_F(DecodeRustPunycodeTest, AcceptsNearbyFourByteCharsInOrder) {
+  punycode_ = "8x7hcjmf";
+  plaintext_ = "🂦🂧🂪🂭🂮";  // U+1F0A6..U+1F0AE in ascending order
+  // Expect success with ample or exact space (per helper names), null if short.
+  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
+              PointsToTheNulAfter(plaintext_));
+  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
+}
+
+TEST_F(DecodeRustPunycodeTest, AcceptsNearbyFourByteCharsOutOfOrder) {
+  punycode_ = "8x7hcild";
+  plaintext_ = "🂮🂦🂭🂪🂧";  // U+1F0A6..U+1F0AE, not in ascending order
+  // Expect success with ample or exact space (per helper names), null if short.
+  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
+              PointsToTheNulAfter(plaintext_));
+  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
+}
+
+TEST_F(DecodeRustPunycodeTest, AcceptsAMixtureOfByteLengths) {
+  punycode_ = "3caltsx2079ivf8aiuy7cja3a6ak";
+  plaintext_ = "ùéôãï中文🂮🂦🂭🂪🂧";  // 2-, 3-, and 4-byte UTF-8 characters
+  // Expect success with ample or exact space (per helper names), null if short.
+  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
+              PointsToTheNulAfter(plaintext_));
+  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
+}
+
+TEST_F(DecodeRustPunycodeTest, RejectsOverlargeDeltas) {
+  punycode_ = "123456789a";  // digits 1-9 carry values 27-35 (RFC 3492)
+  // Only the ample-space case is checked, and even it must yield null.
+  EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
+}
+
+// Finally, we test on a few prose and poetry snippets as a defense in depth.
+// If our artificial short test inputs did not exercise a bug that is tickled by
+// patterns typical of real human writing, maybe real human writing will catch
+// that.
+//
+// These test inputs are extracted from texts old enough to be out of copyright
+// that probe a variety of ranges of code-point space.  All are longer than 32
+// code points, so they exercise the carrying of seminibbles from one uint64_t
+// to the next higher one in BoundedUtf8LengthSequence.
+
+// The first three lines of the Old English epic _Beowulf_, mostly ASCII with a
+// few archaic two-byte letters interspersed.
+TEST_F(DecodeRustPunycodeTest, Beowulf) {
+  punycode_ = "hwt_we_gardena_in_geardagum_"
+              "eodcyninga_rym_gefrunon_"
+              "hu_a_elingas_ellen_fremedon_hxg9c70do9alau";
+  plaintext_ = "hwæt_we_gardena_in_geardagum_"
+               "þeodcyninga_þrym_gefrunon_"
+               "hu_ða_æþelingas_ellen_fremedon";
+  // Expect success with ample or exact space (per helper names), null if short.
+  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
+              PointsToTheNulAfter(plaintext_));
+  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
+}
+
+// The whole of 過故人莊 by the 8th-century Chinese poet 孟浩然
+// (Meng Haoran), exercising three-byte-character processing.
+TEST_F(DecodeRustPunycodeTest, MengHaoran) {
+  punycode_ = "gmq4ss0cfvao1e2wg8mcw8b0wkl9a7tt90a8riuvbk7t8kbv9a66ogofvzlf6"
+              "3d01ybn1u28dyqi5q2cxyyxnk5d2gx1ks9ddvfm17bk6gbsd6wftrav60u4ta";
+  plaintext_ = "故人具雞黍" "邀我至田家"
+               "綠樹村邊合" "青山郭外斜"
+               "開軒面場圃" "把酒話桑麻"
+               "待到重陽日" "還來就菊花";
+  // Expect success with ample or exact space (per helper names), null if short.
+  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
+              PointsToTheNulAfter(plaintext_));
+  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
+}
+
+// A poem of the 8th-century Japanese poet 山上憶良 (Yamanoue no Okura).
+// Japanese mixes kana with kanji, all three bytes in UTF-8: a codec workout.
+TEST_F(DecodeRustPunycodeTest, YamanoueNoOkura) {
+  punycode_ = "48jdaa3a6ccpepjrsmlb0q4bwcdtid8fg6c0cai9822utqeruk3om0u4f2wbp0"
+              "em23do0op23cc2ff70mb6tae8aq759gja";
+  plaintext_ = "瓜食めば"
+               "子ども思ほゆ"
+               "栗食めば"
+               "まして偲はゆ"
+               "何処より"
+               "来りしものそ"
+               "眼交に"
+               "もとな懸りて"
+               "安眠し寝さぬ";
+  // Expect success with ample or exact space (per helper names), null if short.
+  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
+              PointsToTheNulAfter(plaintext_));
+  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
+}
+
+// The first two lines of the Phoenician-language inscription on the sarcophagus
+// of Eshmunazar II of Sidon, 6th century BCE.  Phoenician and many other
+// archaic scripts are allocated in the Supplementary Multilingual Plane
+// (U+10000 through U+1FFFF) and thus exercise four-byte-character processing.
+TEST_F(DecodeRustPunycodeTest, EshmunazarSarcophagus) {
+  punycode_ = "wj9caaabaabbaaohcacxvhdc7bgxbccbdcjeacddcedcdlddbdbddcdbdcknfcee"
+              "ifel8del2a7inq9fhcpxikms7a4a9ac9ataaa0g";
+  plaintext_ = "𐤁𐤉𐤓𐤇𐤁𐤋𐤁𐤔𐤍𐤕𐤏𐤎𐤓"
+               "𐤅𐤀𐤓𐤁𐤏𐤗𐤖𐤖𐤖𐤖𐤋𐤌𐤋𐤊𐤉𐤌𐤋𐤊"
+               "𐤀𐤔𐤌𐤍𐤏𐤆𐤓𐤌𐤋𐤊𐤑𐤃𐤍𐤌"
+               "𐤁𐤍𐤌𐤋𐤊𐤕𐤁𐤍𐤕𐤌𐤋𐤊𐤑𐤃𐤍𐤌"
+               "𐤃𐤁𐤓𐤌𐤋𐤊𐤀𐤔𐤌𐤍𐤏𐤆𐤓𐤌𐤋𐤊"
+               "𐤑𐤃𐤍𐤌𐤋𐤀𐤌𐤓𐤍𐤂𐤆𐤋𐤕";
+  // Expect success with ample or exact space (per helper names), null if short.
+  ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
+              PointsToTheNulAfter(plaintext_));
+  ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
+              PointsToTheNulAfter(plaintext_));
+  EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
+}
+
+}  // namespace
+}  // namespace debugging_internal
+ABSL_NAMESPACE_END
+}  // namespace absl
author	Chris Mihelich <cmihelic@google.com>	2024-06-26 15:13:52 -0700
committer	Copybara-Service <copybara-worker@google.com>	2024-06-26 15:14:45 -0700
commit	16452e1418c1c2a8bcf4a99238e190ba901a20a6 (patch)
tree	551f86735011199f26edd2a635683c33bcd80cd6 /absl/debugging
parent	63d4b2fe1e0d4bd6a7b916f398643db40c35624b (diff)