From 77111e3d5b40df6019fedc85198f7376c120bffb Mon Sep 17 00:00:00 2001 From: Tsige Solomon Date: Mon, 12 Jun 2023 15:56:24 -0700 Subject: Functions added: FindLongestCommonSuffix, FindLongestCommonPrefix. PiperOrigin-RevId: 539784770 Change-Id: Ie224afa04af023bbddc89b967e8c8440f9e8a887 --- absl/strings/match.cc | 67 +++++++++++++++++++++++++- absl/strings/match.h | 10 ++++ absl/strings/match_test.cc | 117 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 193 insertions(+), 1 deletion(-) diff --git a/absl/strings/match.cc b/absl/strings/match.cc index b65cbc67..3b81b2c0 100644 --- a/absl/strings/match.cc +++ b/absl/strings/match.cc @@ -13,8 +13,13 @@ // limitations under the License. #include "absl/strings/match.h" -#include "absl/strings/ascii.h" +#include +#include + +#include "absl/base/internal/endian.h" +#include "absl/numeric/bits.h" +#include "absl/strings/ascii.h" #include "absl/strings/internal/memutil.h" namespace absl { @@ -61,5 +66,65 @@ bool EndsWithIgnoreCase(absl::string_view text, EqualsIgnoreCase(text.substr(text.size() - suffix.size()), suffix); } +absl::string_view FindLongestCommonPrefix(absl::string_view a, + absl::string_view b) { + const absl::string_view::size_type limit = std::min(a.size(), b.size()); + const char* const pa = a.data(); + const char* const pb = b.data(); + absl::string_view::size_type count = (unsigned) 0; + + if (ABSL_PREDICT_FALSE(limit < 8)) { + while (ABSL_PREDICT_TRUE(count + 2 <= limit)) { + uint16_t xor_bytes = absl::little_endian::Load16(pa + count) ^ + absl::little_endian::Load16(pb + count); + if (ABSL_PREDICT_FALSE(xor_bytes != 0)) { + if (ABSL_PREDICT_TRUE((xor_bytes & 0xff) == 0)) ++count; + return absl::string_view(pa, count); + } + count += 2; + } + if (ABSL_PREDICT_TRUE(count != limit)) { + if (ABSL_PREDICT_TRUE(pa[count] == pb[count])) ++count; + } + return absl::string_view(pa, count); + } + + do { + uint64_t xor_bytes = absl::little_endian::Load64(pa + count) ^ + absl::little_endian::Load64(pb + count); + if (ABSL_PREDICT_FALSE(xor_bytes != 0)) { + count += static_cast(absl::countr_zero(xor_bytes) >> 3); + return absl::string_view(pa, count); + } + count += 8; + } while (ABSL_PREDICT_TRUE(count + 8 < limit)); + + count = limit - 8; + uint64_t xor_bytes = absl::little_endian::Load64(pa + count) ^ + absl::little_endian::Load64(pb + count); + if (ABSL_PREDICT_TRUE(xor_bytes != 0)) { + count += static_cast(absl::countr_zero(xor_bytes) >> 3); + return absl::string_view(pa, count); + } + return absl::string_view(pa, limit); +} + +absl::string_view FindLongestCommonSuffix(absl::string_view a, + absl::string_view b) { + const absl::string_view::size_type limit = std::min(a.size(), b.size()); + if (limit == 0) return absl::string_view(); + + const char* pa = a.data() + a.size() - 1; + const char* pb = b.data() + b.size() - 1; + absl::string_view::size_type count = (unsigned) 0; + while (count < limit && *pa == *pb) { + --pa; + --pb; + ++count; + } + + return absl::string_view(++pa, count); +} + ABSL_NAMESPACE_END } // namespace absl diff --git a/absl/strings/match.h b/absl/strings/match.h index 1dc0beaf..1eeafbbf 100644 --- a/absl/strings/match.h +++ b/absl/strings/match.h @@ -103,6 +103,16 @@ bool StartsWithIgnoreCase(absl::string_view text, bool EndsWithIgnoreCase(absl::string_view text, absl::string_view suffix) noexcept; +// Yields the longest prefix in common between both input strings. +// Pointer-wise, the returned result is a subset of input "a". +absl::string_view FindLongestCommonPrefix(absl::string_view a, + absl::string_view b); + +// Yields the longest suffix in common between both input strings. +// Pointer-wise, the returned result is a subset of input "a". +absl::string_view FindLongestCommonSuffix(absl::string_view a, + absl::string_view b); + ABSL_NAMESPACE_END } // namespace absl diff --git a/absl/strings/match_test.cc b/absl/strings/match_test.cc index f063b4ea..71618f71 100644 --- a/absl/strings/match_test.cc +++ b/absl/strings/match_test.cc @@ -168,4 +168,121 @@ TEST(MatchTest, ContainsCharIgnoreCase) { EXPECT_FALSE(absl::StrContainsIgnoreCase("", '0')); } +TEST(MatchTest, FindLongestCommonPrefix) { + EXPECT_EQ(absl::FindLongestCommonPrefix("", ""), ""); + EXPECT_EQ(absl::FindLongestCommonPrefix("", "abc"), ""); + EXPECT_EQ(absl::FindLongestCommonPrefix("abc", ""), ""); + EXPECT_EQ(absl::FindLongestCommonPrefix("ab", "abc"), "ab"); + EXPECT_EQ(absl::FindLongestCommonPrefix("abc", "ab"), "ab"); + EXPECT_EQ(absl::FindLongestCommonPrefix("abc", "abd"), "ab"); + EXPECT_EQ(absl::FindLongestCommonPrefix("abc", "abcd"), "abc"); + EXPECT_EQ(absl::FindLongestCommonPrefix("abcd", "abcd"), "abcd"); + EXPECT_EQ(absl::FindLongestCommonPrefix("abcd", "efgh"), ""); + + // "abcde" v. "abc" but in the middle of other data + EXPECT_EQ(absl::FindLongestCommonPrefix( + absl::string_view("1234 abcdef").substr(5, 5), + absl::string_view("5678 abcdef").substr(5, 3)), + "abc"); +} + +// Since the little-endian implementation involves a bit of if-else and various +// return paths, the following tests aims to provide full test coverage of the +// implementation. +TEST(MatchTest, FindLongestCommonPrefixLoad16Mismatch) { + const std::string x1 = "abcdefgh"; + const std::string x2 = "abcde_"; + EXPECT_EQ(absl::FindLongestCommonPrefix(x1, x2), "abcde"); + EXPECT_EQ(absl::FindLongestCommonPrefix(x2, x1), "abcde"); +} + +TEST(MatchTest, FindLongestCommonPrefixLoad16MatchesNoLast) { + const std::string x1 = "abcdef"; + const std::string x2 = "abcdef"; + EXPECT_EQ(absl::FindLongestCommonPrefix(x1, x2), "abcdef"); + EXPECT_EQ(absl::FindLongestCommonPrefix(x2, x1), "abcdef"); +} + +TEST(MatchTest, FindLongestCommonPrefixLoad16MatchesLastCharMismatches) { + const std::string x1 = "abcdefg"; + const std::string x2 = "abcdef_h"; + EXPECT_EQ(absl::FindLongestCommonPrefix(x1, x2), "abcdef"); + EXPECT_EQ(absl::FindLongestCommonPrefix(x2, x1), "abcdef"); +} + +TEST(MatchTest, FindLongestCommonPrefixLoad16MatchesLastMatches) { + const std::string x1 = "abcde"; + const std::string x2 = "abcdefgh"; + EXPECT_EQ(absl::FindLongestCommonPrefix(x1, x2), "abcde"); + EXPECT_EQ(absl::FindLongestCommonPrefix(x2, x1), "abcde"); +} + +TEST(MatchTest, FindLongestCommonPrefixSize8Load64Mismatches) { + const std::string x1 = "abcdefghijk"; + const std::string x2 = "abcde_g_"; + EXPECT_EQ(absl::FindLongestCommonPrefix(x1, x2), "abcde"); + EXPECT_EQ(absl::FindLongestCommonPrefix(x2, x1), "abcde"); +} + +TEST(MatchTest, FindLongestCommonPrefixSize8Load64Matches) { + const std::string x1 = "abcdefgh"; + const std::string x2 = "abcdefgh"; + EXPECT_EQ(absl::FindLongestCommonPrefix(x1, x2), "abcdefgh"); + EXPECT_EQ(absl::FindLongestCommonPrefix(x2, x1), "abcdefgh"); +} + +TEST(MatchTest, FindLongestCommonPrefixSize15Load64Mismatches) { + const std::string x1 = "012345670123456"; + const std::string x2 = "0123456701_34_6"; + EXPECT_EQ(absl::FindLongestCommonPrefix(x1, x2), "0123456701"); + EXPECT_EQ(absl::FindLongestCommonPrefix(x2, x1), "0123456701"); +} + +TEST(MatchTest, FindLongestCommonPrefixSize15Load64Matches) { + const std::string x1 = "012345670123456"; + const std::string x2 = "0123456701234567"; + EXPECT_EQ(absl::FindLongestCommonPrefix(x1, x2), "012345670123456"); + EXPECT_EQ(absl::FindLongestCommonPrefix(x2, x1), "012345670123456"); +} + +TEST(MatchTest, FindLongestCommonPrefixSizeFirstByteOfLast8BytesMismatch) { + const std::string x1 = "012345670123456701234567"; + const std::string x2 = "0123456701234567_1234567"; + EXPECT_EQ(absl::FindLongestCommonPrefix(x1, x2), "0123456701234567"); + EXPECT_EQ(absl::FindLongestCommonPrefix(x2, x1), "0123456701234567"); +} + +TEST(MatchTest, FindLongestCommonPrefixLargeLastCharMismatches) { + const std::string x1(300, 'x'); + std::string x2 = x1; + x2.back() = '#'; + EXPECT_EQ(absl::FindLongestCommonPrefix(x1, x2), std::string(299, 'x')); + EXPECT_EQ(absl::FindLongestCommonPrefix(x2, x1), std::string(299, 'x')); +} + +TEST(MatchTest, FindLongestCommonPrefixLargeFullMatch) { + const std::string x1(300, 'x'); + const std::string x2 = x1; + EXPECT_EQ(absl::FindLongestCommonPrefix(x1, x2), std::string(300, 'x')); + EXPECT_EQ(absl::FindLongestCommonPrefix(x2, x1), std::string(300, 'x')); +} + +TEST(MatchTest, FindLongestCommonSuffix) { + EXPECT_EQ(absl::FindLongestCommonSuffix("", ""), ""); + EXPECT_EQ(absl::FindLongestCommonSuffix("", "abc"), ""); + EXPECT_EQ(absl::FindLongestCommonSuffix("abc", ""), ""); + EXPECT_EQ(absl::FindLongestCommonSuffix("bc", "abc"), "bc"); + EXPECT_EQ(absl::FindLongestCommonSuffix("abc", "bc"), "bc"); + EXPECT_EQ(absl::FindLongestCommonSuffix("abc", "dbc"), "bc"); + EXPECT_EQ(absl::FindLongestCommonSuffix("bcd", "abcd"), "bcd"); + EXPECT_EQ(absl::FindLongestCommonSuffix("abcd", "abcd"), "abcd"); + EXPECT_EQ(absl::FindLongestCommonSuffix("abcd", "efgh"), ""); + + // "abcde" v. "cde" but in the middle of other data + EXPECT_EQ(absl::FindLongestCommonSuffix( + absl::string_view("1234 abcdef").substr(5, 5), + absl::string_view("5678 abcdef").substr(7, 3)), + "cde"); +} + } // namespace -- cgit v1.2.3