diff options
author | Chris Mihelich <cmihelic@google.com> | 2024-06-10 12:17:24 -0700 |
---|---|---|
committer | Copybara-Service <copybara-worker@google.com> | 2024-06-10 12:18:15 -0700 |
commit | d30298a1b6f3dd8939910561e211fe990e4e2e8e (patch) | |
tree | 25c12d7fd3ed9fbacf74fde29914651d290ffe74 /absl | |
parent | 96cdf6cc87e7a21d92f9f96a72263a93d3929ec7 (diff) |
UTF-8 encoding library to support Rust Punycode decoding.
PiperOrigin-RevId: 641983507
Change-Id: Iad7933884aef6bfd90d159c049a1d698d19456c6
Diffstat (limited to 'absl')
-rw-r--r-- | absl/debugging/BUILD.bazel | 15 | ||||
-rw-r--r-- | absl/debugging/CMakeLists.txt | 15 | ||||
-rw-r--r-- | absl/debugging/internal/utf8_for_code_point.cc | 70 | ||||
-rw-r--r-- | absl/debugging/internal/utf8_for_code_point.h | 47 | ||||
-rw-r--r-- | absl/debugging/internal/utf8_for_code_point_test.cc | 175 |
5 files changed, 322 insertions, 0 deletions
diff --git a/absl/debugging/BUILD.bazel b/absl/debugging/BUILD.bazel index 22494554..670f7764 100644 --- a/absl/debugging/BUILD.bazel +++ b/absl/debugging/BUILD.bazel @@ -222,10 +222,12 @@ cc_library( srcs = [ "internal/demangle.cc", "internal/demangle_rust.cc", + "internal/utf8_for_code_point.cc", ], hdrs = [ "internal/demangle.h", "internal/demangle_rust.h", + "internal/utf8_for_code_point.h", ], copts = ABSL_DEFAULT_COPTS, linkopts = ABSL_DEFAULT_LINKOPTS, @@ -271,6 +273,19 @@ cc_test( ], ) +cc_test( + name = "utf8_for_code_point_test", + srcs = ["internal/utf8_for_code_point_test.cc"], + copts = ABSL_TEST_COPTS, + linkopts = ABSL_DEFAULT_LINKOPTS, + deps = [ + ":demangle_internal", + "//absl/base:config", + "@com_google_googletest//:gtest", + "@com_google_googletest//:gtest_main", + ], +) + cc_library( name = "leak_check", srcs = ["leak_check.cc"], diff --git a/absl/debugging/CMakeLists.txt b/absl/debugging/CMakeLists.txt index 027a6be2..8379b5b9 100644 --- a/absl/debugging/CMakeLists.txt +++ b/absl/debugging/CMakeLists.txt @@ -197,9 +197,11 @@ absl_cc_library( HDRS "internal/demangle.h" "internal/demangle_rust.h" + "internal/utf8_for_code_point.h" SRCS "internal/demangle.cc" "internal/demangle_rust.cc" + "internal/utf8_for_code_point.cc" COPTS ${ABSL_DEFAULT_COPTS} DEPS @@ -238,6 +240,19 @@ absl_cc_test( GTest::gmock_main ) +absl_cc_test( + NAME + utf8_for_code_point_test + SRCS + "internal/utf8_for_code_point_test.cc" + COPTS + ${ABSL_TEST_COPTS} + DEPS + absl::demangle_internal + absl::config + GTest::gmock_main +) + absl_cc_library( NAME leak_check diff --git a/absl/debugging/internal/utf8_for_code_point.cc b/absl/debugging/internal/utf8_for_code_point.cc new file mode 100644 index 00000000..658a3b51 --- /dev/null +++ b/absl/debugging/internal/utf8_for_code_point.cc @@ -0,0 +1,70 @@ +// Copyright 2024 The Abseil Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "absl/debugging/internal/utf8_for_code_point.h" + +#include <cstdint> + +#include "absl/base/config.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace debugging_internal { +namespace { + +// UTF-8 encoding bounds. +constexpr uint32_t kMinSurrogate = 0xd800, kMaxSurrogate = 0xdfff; +constexpr uint32_t kMax1ByteCodePoint = 0x7f; +constexpr uint32_t kMax2ByteCodePoint = 0x7ff; +constexpr uint32_t kMax3ByteCodePoint = 0xffff; +constexpr uint32_t kMaxCodePoint = 0x10ffff; + +} // namespace + +Utf8ForCodePoint::Utf8ForCodePoint(uint64_t code_point) { + if (code_point <= kMax1ByteCodePoint) { + length = 1; + bytes[0] = static_cast<char>(code_point); + return; + } + + if (code_point <= kMax2ByteCodePoint) { + length = 2; + bytes[0] = static_cast<char>(0xc0 | (code_point >> 6)); + bytes[1] = static_cast<char>(0x80 | (code_point & 0x3f)); + return; + } + + if (kMinSurrogate <= code_point && code_point <= kMaxSurrogate) return; + + if (code_point <= kMax3ByteCodePoint) { + length = 3; + bytes[0] = static_cast<char>(0xe0 | (code_point >> 12)); + bytes[1] = static_cast<char>(0x80 | ((code_point >> 6) & 0x3f)); + bytes[2] = static_cast<char>(0x80 | (code_point & 0x3f)); + return; + } + + if (code_point > kMaxCodePoint) return; + + length = 4; + bytes[0] = static_cast<char>(0xf0 | (code_point >> 18)); + bytes[1] = static_cast<char>(0x80 | ((code_point >> 12) & 0x3f)); + bytes[2] = static_cast<char>(0x80 | ((code_point >> 6) & 0x3f)); + bytes[3] = static_cast<char>(0x80 | (code_point & 0x3f)); +} + +} // namespace debugging_internal +ABSL_NAMESPACE_END +} // namespace absl diff --git a/absl/debugging/internal/utf8_for_code_point.h b/absl/debugging/internal/utf8_for_code_point.h new file mode 100644 index 00000000..f23cde6d --- /dev/null +++ b/absl/debugging/internal/utf8_for_code_point.h @@ -0,0 +1,47 @@ +// Copyright 2024 The Abseil Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ABSL_DEBUGGING_INTERNAL_UTF8_FOR_CODE_POINT_H_ +#define ABSL_DEBUGGING_INTERNAL_UTF8_FOR_CODE_POINT_H_ + +#include <cstdint> + +#include "absl/base/config.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace debugging_internal { + +struct Utf8ForCodePoint { + // Converts a Unicode code point to the corresponding UTF-8 byte sequence. + // Async-signal-safe to support use in symbolizing stack traces from a signal + // handler. + explicit Utf8ForCodePoint(uint64_t code_point); + + // Returns true if the constructor's code_point argument was valid. + bool ok() const { return length != 0; } + + // If code_point was in range, then 1 <= length <= 4, and the UTF-8 encoding + // is found in bytes[0 .. (length - 1)]. If code_point was invalid, then + // length == 0. In either case, the contents of bytes[length .. 3] are + // unspecified. + char bytes[4] = {}; + uint32_t length = 0; +}; + +} // namespace debugging_internal +ABSL_NAMESPACE_END +} // namespace absl + +#endif // ABSL_DEBUGGING_INTERNAL_UTF8_FOR_CODE_POINT_H_ diff --git a/absl/debugging/internal/utf8_for_code_point_test.cc b/absl/debugging/internal/utf8_for_code_point_test.cc new file mode 100644 index 00000000..d6817c77 --- /dev/null +++ b/absl/debugging/internal/utf8_for_code_point_test.cc @@ -0,0 +1,175 @@ +// Copyright 2024 The Abseil Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "absl/debugging/internal/utf8_for_code_point.h" + +#include <cstdint> + +#include "gtest/gtest.h" +#include "absl/base/config.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace debugging_internal { +namespace { + +TEST(Utf8ForCodePointTest, RecognizesTheSmallestCodePoint) { + Utf8ForCodePoint utf8(std::uint64_t{0}); + ASSERT_EQ(utf8.length, 1); + EXPECT_EQ(utf8.bytes[0], '\0'); +} + +TEST(Utf8ForCodePointTest, RecognizesAsciiSmallA) { + Utf8ForCodePoint utf8(std::uint64_t{'a'}); + ASSERT_EQ(utf8.length, 1); + EXPECT_EQ(utf8.bytes[0], 'a'); +} + +TEST(Utf8ForCodePointTest, RecognizesTheLargestOneByteCodePoint) { + Utf8ForCodePoint utf8(std::uint64_t{0x7f}); + ASSERT_EQ(utf8.length, 1); + EXPECT_EQ(utf8.bytes[0], '\x7f'); +} + +TEST(Utf8ForCodePointTest, RecognizesTheSmallestTwoByteCodePoint) { + Utf8ForCodePoint utf8(std::uint64_t{0x80}); + ASSERT_EQ(utf8.length, 2); + EXPECT_EQ(utf8.bytes[0], static_cast<char>(0xc2)); + EXPECT_EQ(utf8.bytes[1], static_cast<char>(0x80)); +} + +TEST(Utf8ForCodePointTest, RecognizesSmallNWithTilde) { + Utf8ForCodePoint utf8(std::uint64_t{0xf1}); + ASSERT_EQ(utf8.length, 2); + const char* want = "ñ"; + EXPECT_EQ(utf8.bytes[0], want[0]); + EXPECT_EQ(utf8.bytes[1], want[1]); +} + +TEST(Utf8ForCodePointTest, RecognizesCapitalPi) { + Utf8ForCodePoint utf8(std::uint64_t{0x3a0}); + ASSERT_EQ(utf8.length, 2); + const char* want = "Π"; + EXPECT_EQ(utf8.bytes[0], want[0]); + EXPECT_EQ(utf8.bytes[1], want[1]); +} + +TEST(Utf8ForCodePointTest, RecognizesTheLargestTwoByteCodePoint) { + Utf8ForCodePoint utf8(std::uint64_t{0x7ff}); + ASSERT_EQ(utf8.length, 2); + EXPECT_EQ(utf8.bytes[0], static_cast<char>(0xdf)); + EXPECT_EQ(utf8.bytes[1], static_cast<char>(0xbf)); +} + +TEST(Utf8ForCodePointTest, RecognizesTheSmallestThreeByteCodePoint) { + Utf8ForCodePoint utf8(std::uint64_t{0x800}); + ASSERT_EQ(utf8.length, 3); + EXPECT_EQ(utf8.bytes[0], static_cast<char>(0xe0)); + EXPECT_EQ(utf8.bytes[1], static_cast<char>(0xa0)); + EXPECT_EQ(utf8.bytes[2], static_cast<char>(0x80)); +} + +TEST(Utf8ForCodePointTest, RecognizesTheChineseCharacterZhong1AsInZhong1Wen2) { + Utf8ForCodePoint utf8(std::uint64_t{0x4e2d}); + ASSERT_EQ(utf8.length, 3); + const char* want = "中"; + EXPECT_EQ(utf8.bytes[0], want[0]); + EXPECT_EQ(utf8.bytes[1], want[1]); + EXPECT_EQ(utf8.bytes[2], want[2]); +} + +TEST(Utf8ForCodePointTest, RecognizesOneBeforeTheSmallestSurrogate) { + Utf8ForCodePoint utf8(std::uint64_t{0xd7ff}); + ASSERT_EQ(utf8.length, 3); + EXPECT_EQ(utf8.bytes[0], static_cast<char>(0xed)); + EXPECT_EQ(utf8.bytes[1], static_cast<char>(0x9f)); + EXPECT_EQ(utf8.bytes[2], static_cast<char>(0xbf)); +} + +TEST(Utf8ForCodePointTest, RejectsTheSmallestSurrogate) { + Utf8ForCodePoint utf8(std::uint64_t{0xd800}); + EXPECT_EQ(utf8.length, 0); +} + +TEST(Utf8ForCodePointTest, RejectsTheLargestSurrogate) { + Utf8ForCodePoint utf8(std::uint64_t{0xdfff}); + EXPECT_EQ(utf8.length, 0); +} + +TEST(Utf8ForCodePointTest, RecognizesOnePastTheLargestSurrogate) { + Utf8ForCodePoint utf8(std::uint64_t{0xe000}); + ASSERT_EQ(utf8.length, 3); + EXPECT_EQ(utf8.bytes[0], static_cast<char>(0xee)); + EXPECT_EQ(utf8.bytes[1], static_cast<char>(0x80)); + EXPECT_EQ(utf8.bytes[2], static_cast<char>(0x80)); +} + +TEST(Utf8ForCodePointTest, RecognizesTheLargestThreeByteCodePoint) { + Utf8ForCodePoint utf8(std::uint64_t{0xffff}); + ASSERT_EQ(utf8.length, 3); + EXPECT_EQ(utf8.bytes[0], static_cast<char>(0xef)); + EXPECT_EQ(utf8.bytes[1], static_cast<char>(0xbf)); + EXPECT_EQ(utf8.bytes[2], static_cast<char>(0xbf)); +} + +TEST(Utf8ForCodePointTest, RecognizesTheSmallestFourByteCodePoint) { + Utf8ForCodePoint utf8(std::uint64_t{0x10000}); + ASSERT_EQ(utf8.length, 4); + EXPECT_EQ(utf8.bytes[0], static_cast<char>(0xf0)); + EXPECT_EQ(utf8.bytes[1], static_cast<char>(0x90)); + EXPECT_EQ(utf8.bytes[2], static_cast<char>(0x80)); + EXPECT_EQ(utf8.bytes[3], static_cast<char>(0x80)); +} + +TEST(Utf8ForCodePointTest, RecognizesTheJackOfHearts) { + Utf8ForCodePoint utf8(std::uint64_t{0x1f0bb}); + ASSERT_EQ(utf8.length, 4); + const char* want = "🂻"; + EXPECT_EQ(utf8.bytes[0], want[0]); + EXPECT_EQ(utf8.bytes[1], want[1]); + EXPECT_EQ(utf8.bytes[2], want[2]); + EXPECT_EQ(utf8.bytes[3], want[3]); +} + +TEST(Utf8ForCodePointTest, RecognizesTheLargestFourByteCodePoint) { + Utf8ForCodePoint utf8(std::uint64_t{0x10ffff}); + ASSERT_EQ(utf8.length, 4); + EXPECT_EQ(utf8.bytes[0], static_cast<char>(0xf4)); + EXPECT_EQ(utf8.bytes[1], static_cast<char>(0x8f)); + EXPECT_EQ(utf8.bytes[2], static_cast<char>(0xbf)); + EXPECT_EQ(utf8.bytes[3], static_cast<char>(0xbf)); +} + +TEST(Utf8ForCodePointTest, RejectsTheSmallestOverlargeCodePoint) { + Utf8ForCodePoint utf8(std::uint64_t{0x110000}); + EXPECT_EQ(utf8.length, 0); +} + +TEST(Utf8ForCodePointTest, RejectsAThroughlyOverlargeCodePoint) { + Utf8ForCodePoint utf8(std::uint64_t{0xffffffff00000000}); + EXPECT_EQ(utf8.length, 0); +} + +TEST(Utf8ForCodePointTest, OkReturnsTrueForAValidCodePoint) { + EXPECT_TRUE(Utf8ForCodePoint(std::uint64_t{0}).ok()); +} + +TEST(Utf8ForCodePointTest, OkReturnsFalseForAnInvalidCodePoint) { + EXPECT_FALSE(Utf8ForCodePoint(std::uint64_t{0xffffffff00000000}).ok()); +} + +} // namespace +} // namespace debugging_internal +ABSL_NAMESPACE_END +} // namespace absl |