summaryrefslogtreecommitdiff
path: root/absl
diff options
context:
space:
mode:
author Chris Mihelich <cmihelic@google.com> 2024-06-10 12:17:24 -0700
committer Copybara-Service <copybara-worker@google.com> 2024-06-10 12:18:15 -0700
commit d30298a1b6f3dd8939910561e211fe990e4e2e8e (patch)
tree 25c12d7fd3ed9fbacf74fde29914651d290ffe74 /absl
parent 96cdf6cc87e7a21d92f9f96a72263a93d3929ec7 (diff)
UTF-8 encoding library to support Rust Punycode decoding.
PiperOrigin-RevId: 641983507 Change-Id: Iad7933884aef6bfd90d159c049a1d698d19456c6
Diffstat (limited to 'absl')
-rw-r--r-- absl/debugging/BUILD.bazel 15
-rw-r--r-- absl/debugging/CMakeLists.txt 15
-rw-r--r-- absl/debugging/internal/utf8_for_code_point.cc 70
-rw-r--r-- absl/debugging/internal/utf8_for_code_point.h 47
-rw-r--r-- absl/debugging/internal/utf8_for_code_point_test.cc 175
5 files changed, 322 insertions, 0 deletions
diff --git a/absl/debugging/BUILD.bazel b/absl/debugging/BUILD.bazel
index 22494554..670f7764 100644
--- a/absl/debugging/BUILD.bazel
+++ b/absl/debugging/BUILD.bazel
@@ -222,10 +222,12 @@ cc_library(
srcs = [
"internal/demangle.cc",
"internal/demangle_rust.cc",
+ "internal/utf8_for_code_point.cc",
],
hdrs = [
"internal/demangle.h",
"internal/demangle_rust.h",
+ "internal/utf8_for_code_point.h",
],
copts = ABSL_DEFAULT_COPTS,
linkopts = ABSL_DEFAULT_LINKOPTS,
@@ -271,6 +273,19 @@ cc_test(
],
)
+# Unit tests for the Utf8ForCodePoint code-point-to-UTF-8 encoder
+# (internal/utf8_for_code_point_test.cc).
+cc_test(
+ name = "utf8_for_code_point_test",
+ srcs = ["internal/utf8_for_code_point_test.cc"],
+ copts = ABSL_TEST_COPTS,
+ linkopts = ABSL_DEFAULT_LINKOPTS,
+ deps = [
+ ":demangle_internal",
+ "//absl/base:config",
+ "@com_google_googletest//:gtest",
+ "@com_google_googletest//:gtest_main",
+ ],
+)
+
cc_library(
name = "leak_check",
srcs = ["leak_check.cc"],
diff --git a/absl/debugging/CMakeLists.txt b/absl/debugging/CMakeLists.txt
index 027a6be2..8379b5b9 100644
--- a/absl/debugging/CMakeLists.txt
+++ b/absl/debugging/CMakeLists.txt
@@ -197,9 +197,11 @@ absl_cc_library(
HDRS
"internal/demangle.h"
"internal/demangle_rust.h"
+ "internal/utf8_for_code_point.h"
SRCS
"internal/demangle.cc"
"internal/demangle_rust.cc"
+ "internal/utf8_for_code_point.cc"
COPTS
${ABSL_DEFAULT_COPTS}
DEPS
@@ -238,6 +240,19 @@ absl_cc_test(
GTest::gmock_main
)
+# Unit tests for the Utf8ForCodePoint code-point-to-UTF-8 encoder
+# (internal/utf8_for_code_point_test.cc).
+absl_cc_test(
+ NAME
+ utf8_for_code_point_test
+ SRCS
+ "internal/utf8_for_code_point_test.cc"
+ COPTS
+ ${ABSL_TEST_COPTS}
+ DEPS
+ absl::demangle_internal
+ absl::config
+ GTest::gmock_main
+)
+
absl_cc_library(
NAME
leak_check
diff --git a/absl/debugging/internal/utf8_for_code_point.cc b/absl/debugging/internal/utf8_for_code_point.cc
new file mode 100644
index 00000000..658a3b51
--- /dev/null
+++ b/absl/debugging/internal/utf8_for_code_point.cc
@@ -0,0 +1,70 @@
+// Copyright 2024 The Abseil Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "absl/debugging/internal/utf8_for_code_point.h"
+
+#include <cstdint>
+
+#include "absl/base/config.h"
+
+namespace absl {
+ABSL_NAMESPACE_BEGIN
+namespace debugging_internal {
+namespace {
+
+// Boundaries that decide how a code point is encoded as UTF-8.
+//
+// Code points in [kMinSurrogate, kMaxSurrogate] are surrogates; the encoder
+// in this file rejects them.
+constexpr uint32_t kMinSurrogate = 0xd800;
+constexpr uint32_t kMaxSurrogate = 0xdfff;
+
+// Largest code points that fit in a one-, two-, and three-byte encoding.
+constexpr uint32_t kMax1ByteCodePoint = 0x7f;
+constexpr uint32_t kMax2ByteCodePoint = 0x7ff;
+constexpr uint32_t kMax3ByteCodePoint = 0xffff;
+
+// Largest code point the encoder accepts; it takes four bytes.
+constexpr uint32_t kMaxCodePoint = 0x10ffff;
+
+}  // namespace
+
+// Encodes code_point as UTF-8, storing the encoded bytes in bytes[] and the
+// number of bytes written in length.  Surrogates and values greater than
+// kMaxCodePoint are rejected: the constructor returns without touching
+// length or bytes.
+Utf8ForCodePoint::Utf8ForCodePoint(uint64_t code_point) {
+  // Builds a continuation byte (bit pattern 10xxxxxx) from the six bits of
+  // code_point that start at bit position `shift`.
+  const auto continuation = [code_point](int shift) {
+    return static_cast<char>(0x80 | ((code_point >> shift) & 0x3f));
+  };
+
+  // Reject everything that has no UTF-8 encoding before picking a length.
+  const bool is_surrogate =
+      code_point >= kMinSurrogate && code_point <= kMaxSurrogate;
+  if (is_surrogate || code_point > kMaxCodePoint) return;
+
+  if (code_point <= kMax1ByteCodePoint) {
+    // 0xxxxxxx
+    length = 1;
+    bytes[0] = static_cast<char>(code_point);
+  } else if (code_point <= kMax2ByteCodePoint) {
+    // 110xxxxx 10xxxxxx
+    length = 2;
+    bytes[0] = static_cast<char>(0xc0 | (code_point >> 6));
+    bytes[1] = continuation(0);
+  } else if (code_point <= kMax3ByteCodePoint) {
+    // 1110xxxx 10xxxxxx 10xxxxxx
+    length = 3;
+    bytes[0] = static_cast<char>(0xe0 | (code_point >> 12));
+    bytes[1] = continuation(6);
+    bytes[2] = continuation(0);
+  } else {
+    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+    length = 4;
+    bytes[0] = static_cast<char>(0xf0 | (code_point >> 18));
+    bytes[1] = continuation(12);
+    bytes[2] = continuation(6);
+    bytes[3] = continuation(0);
+  }
+}
+
+} // namespace debugging_internal
+ABSL_NAMESPACE_END
+} // namespace absl
diff --git a/absl/debugging/internal/utf8_for_code_point.h b/absl/debugging/internal/utf8_for_code_point.h
new file mode 100644
index 00000000..f23cde6d
--- /dev/null
+++ b/absl/debugging/internal/utf8_for_code_point.h
@@ -0,0 +1,47 @@
+// Copyright 2024 The Abseil Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ABSL_DEBUGGING_INTERNAL_UTF8_FOR_CODE_POINT_H_
+#define ABSL_DEBUGGING_INTERNAL_UTF8_FOR_CODE_POINT_H_
+
+#include <cstdint>
+
+#include "absl/base/config.h"
+
+namespace absl {
+ABSL_NAMESPACE_BEGIN
+namespace debugging_internal {
+
+// Holds the UTF-8 encoding of a single Unicode code point.
+struct Utf8ForCodePoint {
+  // Encodes code_point as a UTF-8 byte sequence.  Async-signal-safe, so it
+  // may be used while symbolizing stack traces from inside a signal handler.
+  explicit Utf8ForCodePoint(uint64_t code_point);
+
+  // Reports whether the code_point passed to the constructor was valid.
+  bool ok() const { return length > 0; }
+
+  // For a valid code_point, length is between 1 and 4 inclusive and the
+  // encoding occupies bytes[0 .. (length - 1)].  For an invalid code_point,
+  // length is 0.  Either way, the contents of bytes[length .. 3] are
+  // unspecified.
+  char bytes[4] = {};
+  uint32_t length = 0;
+};
+
+} // namespace debugging_internal
+ABSL_NAMESPACE_END
+} // namespace absl
+
+#endif // ABSL_DEBUGGING_INTERNAL_UTF8_FOR_CODE_POINT_H_
diff --git a/absl/debugging/internal/utf8_for_code_point_test.cc b/absl/debugging/internal/utf8_for_code_point_test.cc
new file mode 100644
index 00000000..d6817c77
--- /dev/null
+++ b/absl/debugging/internal/utf8_for_code_point_test.cc
@@ -0,0 +1,175 @@
+// Copyright 2024 The Abseil Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "absl/debugging/internal/utf8_for_code_point.h"
+
+#include <cstdint>
+
+#include "gtest/gtest.h"
+#include "absl/base/config.h"
+
+namespace absl {
+ABSL_NAMESPACE_BEGIN
+namespace debugging_internal {
+namespace {
+
+// U+0000, the lowest code point, must encode as the single byte 0x00.
+TEST(Utf8ForCodePointTest, RecognizesTheSmallestCodePoint) {
+  const Utf8ForCodePoint encoded(std::uint64_t{0});
+  ASSERT_EQ(encoded.length, 1);
+  EXPECT_EQ(encoded.bytes[0], '\0');
+}
+
+// An ordinary ASCII letter must encode as itself in one byte.
+TEST(Utf8ForCodePointTest, RecognizesAsciiSmallA) {
+  const Utf8ForCodePoint encoded(std::uint64_t{'a'});
+  ASSERT_EQ(encoded.length, 1);
+  EXPECT_EQ(encoded.bytes[0], 'a');
+}
+
+// U+007F is the upper boundary of the one-byte encoding range.
+TEST(Utf8ForCodePointTest, RecognizesTheLargestOneByteCodePoint) {
+  const Utf8ForCodePoint encoded(std::uint64_t{0x7f});
+  ASSERT_EQ(encoded.length, 1);
+  EXPECT_EQ(encoded.bytes[0], '\x7f');
+}
+
+// U+0080 is the lower boundary of the two-byte range: expect C2 80.
+TEST(Utf8ForCodePointTest, RecognizesTheSmallestTwoByteCodePoint) {
+  const Utf8ForCodePoint encoded(std::uint64_t{0x80});
+  ASSERT_EQ(encoded.length, 2);
+  EXPECT_EQ(encoded.bytes[0], '\xc2');
+  EXPECT_EQ(encoded.bytes[1], '\x80');
+}
+
+// U+00F1 (Latin small letter n with tilde) must encode as the two bytes C3 B1.
+TEST(Utf8ForCodePointTest, RecognizesSmallNWithTilde) {
+  Utf8ForCodePoint utf8(std::uint64_t{0xf1});
+  ASSERT_EQ(utf8.length, 2);
+  // The expected bytes are written as hex escapes rather than as a raw
+  // non-ASCII literal, so the expectation does not depend on the compiler's
+  // source or execution character set.
+  const char* want = "\xc3\xb1";
+  EXPECT_EQ(utf8.bytes[0], want[0]);
+  EXPECT_EQ(utf8.bytes[1], want[1]);
+}
+
+// U+03A0 (Greek capital letter pi) must encode as the two bytes CE A0.
+TEST(Utf8ForCodePointTest, RecognizesCapitalPi) {
+  Utf8ForCodePoint utf8(std::uint64_t{0x3a0});
+  ASSERT_EQ(utf8.length, 2);
+  // The expected bytes are written as hex escapes rather than as a raw
+  // non-ASCII literal, so the expectation does not depend on the compiler's
+  // source or execution character set.
+  const char* want = "\xce\xa0";
+  EXPECT_EQ(utf8.bytes[0], want[0]);
+  EXPECT_EQ(utf8.bytes[1], want[1]);
+}
+
+// U+07FF is the upper boundary of the two-byte range: expect DF BF.
+TEST(Utf8ForCodePointTest, RecognizesTheLargestTwoByteCodePoint) {
+  const Utf8ForCodePoint encoded(std::uint64_t{0x7ff});
+  ASSERT_EQ(encoded.length, 2);
+  EXPECT_EQ(encoded.bytes[0], '\xdf');
+  EXPECT_EQ(encoded.bytes[1], '\xbf');
+}
+
+// U+0800 is the lower boundary of the three-byte range: expect E0 A0 80.
+TEST(Utf8ForCodePointTest, RecognizesTheSmallestThreeByteCodePoint) {
+  const Utf8ForCodePoint encoded(std::uint64_t{0x800});
+  ASSERT_EQ(encoded.length, 3);
+  EXPECT_EQ(encoded.bytes[0], '\xe0');
+  EXPECT_EQ(encoded.bytes[1], '\xa0');
+  EXPECT_EQ(encoded.bytes[2], '\x80');
+}
+
+// U+4E2D (the Chinese character zhong1) must encode as the three bytes
+// E4 B8 AD.
+TEST(Utf8ForCodePointTest, RecognizesTheChineseCharacterZhong1AsInZhong1Wen2) {
+  Utf8ForCodePoint utf8(std::uint64_t{0x4e2d});
+  ASSERT_EQ(utf8.length, 3);
+  // The expected bytes are written as hex escapes rather than as a raw
+  // non-ASCII literal, so the expectation does not depend on the compiler's
+  // source or execution character set.
+  const char* want = "\xe4\xb8\xad";
+  EXPECT_EQ(utf8.bytes[0], want[0]);
+  EXPECT_EQ(utf8.bytes[1], want[1]);
+  EXPECT_EQ(utf8.bytes[2], want[2]);
+}
+
+// U+D7FF sits immediately below the surrogate range and is still valid:
+// expect ED 9F BF.
+TEST(Utf8ForCodePointTest, RecognizesOneBeforeTheSmallestSurrogate) {
+  const Utf8ForCodePoint encoded(std::uint64_t{0xd7ff});
+  ASSERT_EQ(encoded.length, 3);
+  EXPECT_EQ(encoded.bytes[0], '\xed');
+  EXPECT_EQ(encoded.bytes[1], '\x9f');
+  EXPECT_EQ(encoded.bytes[2], '\xbf');
+}
+
+// U+D800 is the first surrogate; it must be reported as invalid (length 0).
+TEST(Utf8ForCodePointTest, RejectsTheSmallestSurrogate) {
+  const Utf8ForCodePoint encoded(std::uint64_t{0xd800});
+  EXPECT_EQ(encoded.length, 0);
+}
+
+// U+DFFF is the last surrogate; it must be reported as invalid (length 0).
+TEST(Utf8ForCodePointTest, RejectsTheLargestSurrogate) {
+  const Utf8ForCodePoint encoded(std::uint64_t{0xdfff});
+  EXPECT_EQ(encoded.length, 0);
+}
+
+// U+E000 sits immediately above the surrogate range and is valid again:
+// expect EE 80 80.
+TEST(Utf8ForCodePointTest, RecognizesOnePastTheLargestSurrogate) {
+  const Utf8ForCodePoint encoded(std::uint64_t{0xe000});
+  ASSERT_EQ(encoded.length, 3);
+  EXPECT_EQ(encoded.bytes[0], '\xee');
+  EXPECT_EQ(encoded.bytes[1], '\x80');
+  EXPECT_EQ(encoded.bytes[2], '\x80');
+}
+
+// U+FFFF is the upper boundary of the three-byte range: expect EF BF BF.
+TEST(Utf8ForCodePointTest, RecognizesTheLargestThreeByteCodePoint) {
+  const Utf8ForCodePoint encoded(std::uint64_t{0xffff});
+  ASSERT_EQ(encoded.length, 3);
+  EXPECT_EQ(encoded.bytes[0], '\xef');
+  EXPECT_EQ(encoded.bytes[1], '\xbf');
+  EXPECT_EQ(encoded.bytes[2], '\xbf');
+}
+
+// U+10000 is the lower boundary of the four-byte range: expect F0 90 80 80.
+TEST(Utf8ForCodePointTest, RecognizesTheSmallestFourByteCodePoint) {
+  const Utf8ForCodePoint encoded(std::uint64_t{0x10000});
+  ASSERT_EQ(encoded.length, 4);
+  EXPECT_EQ(encoded.bytes[0], '\xf0');
+  EXPECT_EQ(encoded.bytes[1], '\x90');
+  EXPECT_EQ(encoded.bytes[2], '\x80');
+  EXPECT_EQ(encoded.bytes[3], '\x80');
+}
+
+// U+1F0BB (the jack of hearts playing card) must encode as the four bytes
+// F0 9F 82 BB.
+TEST(Utf8ForCodePointTest, RecognizesTheJackOfHearts) {
+  Utf8ForCodePoint utf8(std::uint64_t{0x1f0bb});
+  ASSERT_EQ(utf8.length, 4);
+  // The expected bytes are written as hex escapes rather than as a raw
+  // non-ASCII literal, so the expectation does not depend on the compiler's
+  // source or execution character set.
+  const char* want = "\xf0\x9f\x82\xbb";
+  EXPECT_EQ(utf8.bytes[0], want[0]);
+  EXPECT_EQ(utf8.bytes[1], want[1]);
+  EXPECT_EQ(utf8.bytes[2], want[2]);
+  EXPECT_EQ(utf8.bytes[3], want[3]);
+}
+
+// U+10FFFF is the highest valid code point: expect F4 8F BF BF.
+TEST(Utf8ForCodePointTest, RecognizesTheLargestFourByteCodePoint) {
+  const Utf8ForCodePoint encoded(std::uint64_t{0x10ffff});
+  ASSERT_EQ(encoded.length, 4);
+  EXPECT_EQ(encoded.bytes[0], '\xf4');
+  EXPECT_EQ(encoded.bytes[1], '\x8f');
+  EXPECT_EQ(encoded.bytes[2], '\xbf');
+  EXPECT_EQ(encoded.bytes[3], '\xbf');
+}
+
+// 0x110000 is one past the highest valid code point and must be rejected
+// (length 0).
+TEST(Utf8ForCodePointTest, RejectsTheSmallestOverlargeCodePoint) {
+  const Utf8ForCodePoint encoded(std::uint64_t{0x110000});
+  EXPECT_EQ(encoded.length, 0);
+}
+
+// A value whose low 32 bits are all zero but whose high 32 bits are set must
+// be rejected (length 0), not mistaken for U+0000.
+TEST(Utf8ForCodePointTest, RejectsAThoroughlyOverlargeCodePoint) {
+  Utf8ForCodePoint utf8(std::uint64_t{0xffffffff00000000});
+  EXPECT_EQ(utf8.length, 0);
+}
+
+// ok() must report success for a code point that has an encoding.
+TEST(Utf8ForCodePointTest, OkReturnsTrueForAValidCodePoint) {
+  const Utf8ForCodePoint encoded(std::uint64_t{0});
+  EXPECT_TRUE(encoded.ok());
+}
+
+// ok() must report failure for a value far outside the code point range.
+TEST(Utf8ForCodePointTest, OkReturnsFalseForAnInvalidCodePoint) {
+  const Utf8ForCodePoint encoded(std::uint64_t{0xffffffff00000000});
+  EXPECT_FALSE(encoded.ok());
+}
+
+} // namespace
+} // namespace debugging_internal
+ABSL_NAMESPACE_END
+} // namespace absl