diff options
author | Abseil Team <absl-team@google.com> | 2023-08-16 10:08:53 -0700 |
---|---|---|
committer | Copybara-Service <copybara-worker@google.com> | 2023-08-16 10:09:41 -0700 |
commit | 17a3ac4bcc9b16aa1ae020a2f7067008d627ad88 (patch) | |
tree | ad64797c7d86d2c2365303499faf6ad612e91061 /absl/strings | |
parent | 334aca32051ef6ede2711487acf45d959e9bdffc (diff) |
Implement `Cord::Find()` and `Cord::Contains()`
PiperOrigin-RevId: 557523229
Change-Id: I959c0b0b14a4a96bee396d4bc09b80328060287d
Diffstat (limited to 'absl/strings')
-rw-r--r-- | absl/strings/cord.cc | 201 | ||||
-rw-r--r-- | absl/strings/cord.h | 16 | ||||
-rw-r--r-- | absl/strings/cord_test.cc | 87 |
3 files changed, 297 insertions, 7 deletions
diff --git a/absl/strings/cord.cc b/absl/strings/cord.cc index 0c26e37e..08a165e1 100644 --- a/absl/strings/cord.cc +++ b/absl/strings/cord.cc @@ -31,6 +31,7 @@ #include <string> #include <utility> +#include "absl/base/attributes.h" #include "absl/base/config.h" #include "absl/base/internal/endian.h" #include "absl/base/internal/raw_logging.h" @@ -49,8 +50,10 @@ #include "absl/strings/internal/cord_rep_flat.h" #include "absl/strings/internal/cordz_update_tracker.h" #include "absl/strings/internal/resize_uninitialized.h" +#include "absl/strings/match.h" #include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" +#include "absl/strings/strip.h" #include "absl/types/optional.h" #include "absl/types/span.h" @@ -565,13 +568,9 @@ CordBuffer Cord::GetAppendBufferSlowPath(size_t block_size, size_t capacity, return CreateAppendBuffer(contents_.data_, block_size, capacity); } -void Cord::Append(const Cord& src) { - AppendImpl(src); -} +void Cord::Append(const Cord& src) { AppendImpl(src); } -void Cord::Append(Cord&& src) { - AppendImpl(std::move(src)); -} +void Cord::Append(Cord&& src) { AppendImpl(std::move(src)); } template <typename T, Cord::EnableIfString<T>> void Cord::Append(T&& src) { @@ -1157,6 +1156,194 @@ char Cord::operator[](size_t i) const { } } +namespace { + +// Tests whether the sequence of chunks beginning at `position` starts with +// `needle`. +// +// REQUIRES: remaining `absl::Cord` starting at `position` is greater than or +// equal to `needle.size()`. +bool IsSubstringInCordAt(absl::Cord::CharIterator position, + absl::string_view needle) { + auto haystack_chunk = absl::Cord::ChunkRemaining(position); + while (true) { + // Precondition is that `absl::Cord::ChunkRemaining(position)` is not + // empty. This assert will trigger if that is not true. + assert(!haystack_chunk.empty()); + auto min_length = std::min(haystack_chunk.size(), needle.size()); + if (!absl::ConsumePrefix(&needle, haystack_chunk.substr(0, min_length))) { + return false; + } + if (needle.empty()) { + return true; + } + absl::Cord::Advance(&position, min_length); + haystack_chunk = absl::Cord::ChunkRemaining(position); + } +} + +} // namespace + +// A few options how this could be implemented: +// (a) Flatten the Cord and find, i.e. +// haystack.Flatten().find(needle) +// For large 'haystack' (where Cord makes sense to be used), this copies +// the whole 'haystack' and can be slow. +// (b) Use std::search, i.e. +// std::search(haystack.char_begin(), haystack.char_end(), +// needle.begin(), needle.end()) +// This avoids the copy, but compares one byte at a time, and branches a +// lot every time it has to advance. It is also not possible to use +// std::search as is, because CharIterator is only an input iterator, not a +// forward iterator. +// (c) Use string_view::find in each fragment, and specifically handle fragment +// boundaries. +// +// This currently implements option (b). +absl::Cord::CharIterator absl::Cord::FindImpl(CharIterator it, + absl::string_view needle) const { + // Ensure preconditions are met by callers first. + + // Needle must not be empty. + assert(!needle.empty()); + // Haystack must be at least as large as needle. + assert(it.chunk_iterator_.bytes_remaining_ >= needle.size()); + + // Cord is a sequence of chunks. To find `needle` we go chunk by chunk looking + // for the first char of needle, up until we have advanced `N` defined as + // `haystack.size() - needle.size()`. If we find the first char of needle at + // `P` and `P` is less than `N`, we then call `IsSubstringInCordAt` to + // see if this is the needle. If not, we advance to `P + 1` and try again. + while (it.chunk_iterator_.bytes_remaining_ >= needle.size()) { + auto haystack_chunk = Cord::ChunkRemaining(it); + assert(!haystack_chunk.empty()); + // Look for the first char of `needle` in the current chunk. + auto idx = haystack_chunk.find(needle.front()); + if (idx == absl::string_view::npos) { + // No potential match in this chunk, advance past it. + Cord::Advance(&it, haystack_chunk.size()); + continue; + } + // We found the start of a potential match in the chunk. Advance the + // iterator and haystack chunk to the match the position. + Cord::Advance(&it, idx); + // Check if there is enough haystack remaining to actually have a match. + if (it.chunk_iterator_.bytes_remaining_ < needle.size()) { + break; + } + // Check if this is `needle`. + if (IsSubstringInCordAt(it, needle)) { + return it; + } + // No match, increment the iterator for the next attempt. + Cord::Advance(&it, 1); + } + // If we got here, we did not find `needle`. + return char_end(); +} + +absl::Cord::CharIterator absl::Cord::Find(absl::string_view needle) const { + if (needle.empty()) { + return char_begin(); + } + if (needle.size() > size()) { + return char_end(); + } + if (needle.size() == size()) { + return *this == needle ? char_begin() : char_end(); + } + return FindImpl(char_begin(), needle); +} + +namespace { + +// Tests whether the sequence of chunks beginning at `haystack` starts with the +// sequence of chunks beginning at `needle_begin` and extending to `needle_end`. +// +// REQUIRES: remaining `absl::Cord` starting at `position` is greater than or +// equal to `needle_end - needle_begin` and `advance`. +bool IsSubcordInCordAt(absl::Cord::CharIterator haystack, + absl::Cord::CharIterator needle_begin, + absl::Cord::CharIterator needle_end) { + while (needle_begin != needle_end) { + auto haystack_chunk = absl::Cord::ChunkRemaining(haystack); + assert(!haystack_chunk.empty()); + auto needle_chunk = absl::Cord::ChunkRemaining(needle_begin); + auto min_length = std::min(haystack_chunk.size(), needle_chunk.size()); + if (haystack_chunk.substr(0, min_length) != + needle_chunk.substr(0, min_length)) { + return false; + } + absl::Cord::Advance(&haystack, min_length); + absl::Cord::Advance(&needle_begin, min_length); + } + return true; +} + +// Tests whether the sequence of chunks beginning at `position` starts with the +// cord `needle`. +// +// REQUIRES: remaining `absl::Cord` starting at `position` is greater than or +// equal to `needle.size()`. +bool IsSubcordInCordAt(absl::Cord::CharIterator position, + const absl::Cord& needle) { + return IsSubcordInCordAt(position, needle.char_begin(), needle.char_end()); +} + +} // namespace + +absl::Cord::CharIterator absl::Cord::Find(const absl::Cord& needle) const { + if (needle.empty()) { + return char_begin(); + } + const auto needle_size = needle.size(); + if (needle_size > size()) { + return char_end(); + } + if (needle_size == size()) { + return *this == needle ? char_begin() : char_end(); + } + const auto needle_chunk = Cord::ChunkRemaining(needle.char_begin()); + auto haystack_it = char_begin(); + while (true) { + haystack_it = FindImpl(haystack_it, needle_chunk); + if (haystack_it == char_end() || + haystack_it.chunk_iterator_.bytes_remaining_ < needle_size) { + break; + } + // We found the first chunk of `needle` at `haystack_it` but not the entire + // subcord. Advance past the first chunk and check for the remainder. + auto haystack_advanced_it = haystack_it; + auto needle_it = needle.char_begin(); + Cord::Advance(&haystack_advanced_it, needle_chunk.size()); + Cord::Advance(&needle_it, needle_chunk.size()); + if (IsSubcordInCordAt(haystack_advanced_it, needle_it, needle.char_end())) { + return haystack_it; + } + Cord::Advance(&haystack_it, 1); + if (haystack_it.chunk_iterator_.bytes_remaining_ < needle_size) { + break; + } + if (haystack_it.chunk_iterator_.bytes_remaining_ == needle_size) { + // Special case, if there is exactly `needle_size` bytes remaining, the + // subcord is either at `haystack_it` or not at all. + if (IsSubcordInCordAt(haystack_it, needle)) { + return haystack_it; + } + break; + } + } + return char_end(); +} + +bool Cord::Contains(absl::string_view rhs) const { + return rhs.empty() || Find(rhs) != char_end(); +} + +bool Cord::Contains(const absl::Cord& rhs) const { + return rhs.empty() || Find(rhs) != char_end(); +} + absl::string_view Cord::FlattenSlowPath() { assert(contents_.is_tree()); size_t total_size = size(); @@ -1281,7 +1468,7 @@ static void DumpNode(CordRep* rep, bool include_data, std::ostream* os, *os << absl::CEscape(std::string(rep->flat()->Data(), rep->length)); *os << "]\n"; } else { - CordRepBtree::Dump(rep, /*label=*/ "", include_data, *os); + CordRepBtree::Dump(rep, /*label=*/"", include_data, *os); } } if (leaf) { diff --git a/absl/strings/cord.h b/absl/strings/cord.h index 8a37df96..0dede5c7 100644 --- a/absl/strings/cord.h +++ b/absl/strings/cord.h @@ -396,6 +396,12 @@ class Cord { bool EndsWith(absl::string_view rhs) const; bool EndsWith(const Cord& rhs) const; + // Cord::Contains() + // + // Determines whether the Cord contains the passed string data `rhs`. + bool Contains(absl::string_view rhs) const; + bool Contains(const Cord& rhs) const; + // Cord::operator std::string() // // Converts a Cord into a `std::string()`. This operator is marked explicit to @@ -747,6 +753,14 @@ class Cord { // If the cord was already flat, the contents are not modified. absl::string_view Flatten() ABSL_ATTRIBUTE_LIFETIME_BOUND; + // Cord::Find() + // + // Returns an iterator to the first occurrance of the substring `needle`. + // + // If the substring `needle` does not occur, `Cord::char_end()` is returned. + CharIterator Find(absl::string_view needle) const; + CharIterator Find(const absl::Cord& needle) const; + // Supports absl::Cord as a sink object for absl::Format(). friend void AbslFormatFlush(absl::Cord* cord, absl::string_view part) { cord->Append(part); @@ -1028,6 +1042,8 @@ class Cord { friend class CrcCord; void SetCrcCordState(crc_internal::CrcCordState state); const crc_internal::CrcCordState* MaybeGetCrcCordState() const; + + CharIterator FindImpl(CharIterator it, absl::string_view needle) const; }; ABSL_NAMESPACE_END diff --git a/absl/strings/cord_test.cc b/absl/strings/cord_test.cc index 51fef316..36478890 100644 --- a/absl/strings/cord_test.cc +++ b/absl/strings/cord_test.cc @@ -501,6 +501,93 @@ TEST_P(CordTest, StartsEndsWith) { ASSERT_TRUE(!empty.EndsWith("xyz")); } +TEST_P(CordTest, Contains) { + auto flat_haystack = absl::Cord("this is a flat cord"); + auto fragmented_haystack = absl::MakeFragmentedCord( + {"this", " ", "is", " ", "a", " ", "fragmented", " ", "cord"}); + + EXPECT_TRUE(flat_haystack.Contains("")); + EXPECT_TRUE(fragmented_haystack.Contains("")); + EXPECT_TRUE(flat_haystack.Contains(absl::Cord(""))); + EXPECT_TRUE(fragmented_haystack.Contains(absl::Cord(""))); + EXPECT_TRUE(absl::Cord("").Contains("")); + EXPECT_TRUE(absl::Cord("").Contains(absl::Cord(""))); + EXPECT_FALSE(absl::Cord("").Contains(flat_haystack)); + EXPECT_FALSE(absl::Cord("").Contains(fragmented_haystack)); + + EXPECT_FALSE(flat_haystack.Contains("z")); + EXPECT_FALSE(fragmented_haystack.Contains("z")); + EXPECT_FALSE(flat_haystack.Contains(absl::Cord("z"))); + EXPECT_FALSE(fragmented_haystack.Contains(absl::Cord("z"))); + + EXPECT_FALSE(flat_haystack.Contains("is an")); + EXPECT_FALSE(fragmented_haystack.Contains("is an")); + EXPECT_FALSE(flat_haystack.Contains(absl::Cord("is an"))); + EXPECT_FALSE(fragmented_haystack.Contains(absl::Cord("is an"))); + EXPECT_FALSE( + flat_haystack.Contains(absl::MakeFragmentedCord({"is", " ", "an"}))); + EXPECT_FALSE(fragmented_haystack.Contains( + absl::MakeFragmentedCord({"is", " ", "an"}))); + + EXPECT_TRUE(flat_haystack.Contains("is a")); + EXPECT_TRUE(fragmented_haystack.Contains("is a")); + EXPECT_TRUE(flat_haystack.Contains(absl::Cord("is a"))); + EXPECT_TRUE(fragmented_haystack.Contains(absl::Cord("is a"))); + EXPECT_TRUE( + flat_haystack.Contains(absl::MakeFragmentedCord({"is", " ", "a"}))); + EXPECT_TRUE( + fragmented_haystack.Contains(absl::MakeFragmentedCord({"is", " ", "a"}))); +} + +TEST_P(CordTest, Find) { + auto flat_haystack = absl::Cord("this is a flat cord"); + auto fragmented_haystack = absl::MakeFragmentedCord( + {"this", " ", "is", " ", "a", " ", "fragmented", " ", "cord"}); + auto empty_haystack = absl::Cord(""); + + EXPECT_EQ(flat_haystack.Find(""), flat_haystack.char_begin()); + EXPECT_EQ(fragmented_haystack.Find(""), fragmented_haystack.char_begin()); + EXPECT_EQ(flat_haystack.Find(absl::Cord("")), flat_haystack.char_begin()); + EXPECT_EQ(fragmented_haystack.Find(absl::Cord("")), + fragmented_haystack.char_begin()); + EXPECT_EQ(empty_haystack.Find(""), empty_haystack.char_begin()); + EXPECT_EQ(empty_haystack.Find(absl::Cord("")), empty_haystack.char_begin()); + EXPECT_EQ(empty_haystack.Find(flat_haystack), empty_haystack.char_end()); + EXPECT_EQ(empty_haystack.Find(fragmented_haystack), + empty_haystack.char_end()); + + EXPECT_EQ(flat_haystack.Find("z"), flat_haystack.char_end()); + EXPECT_EQ(fragmented_haystack.Find("z"), fragmented_haystack.char_end()); + EXPECT_EQ(flat_haystack.Find(absl::Cord("z")), flat_haystack.char_end()); + EXPECT_EQ(fragmented_haystack.Find(absl::Cord("z")), + fragmented_haystack.char_end()); + + EXPECT_EQ(flat_haystack.Find("is an"), flat_haystack.char_end()); + EXPECT_EQ(fragmented_haystack.Find("is an"), fragmented_haystack.char_end()); + EXPECT_EQ(flat_haystack.Find(absl::Cord("is an")), flat_haystack.char_end()); + EXPECT_EQ(fragmented_haystack.Find(absl::Cord("is an")), + fragmented_haystack.char_end()); + EXPECT_EQ(flat_haystack.Find(absl::MakeFragmentedCord({"is", " ", "an"})), + flat_haystack.char_end()); + EXPECT_EQ( + fragmented_haystack.Find(absl::MakeFragmentedCord({"is", " ", "an"})), + fragmented_haystack.char_end()); + + EXPECT_EQ(flat_haystack.Find("is a"), + std::next(flat_haystack.char_begin(), 5)); + EXPECT_EQ(fragmented_haystack.Find("is a"), + std::next(fragmented_haystack.char_begin(), 5)); + EXPECT_EQ(flat_haystack.Find(absl::Cord("is a")), + std::next(flat_haystack.char_begin(), 5)); + EXPECT_EQ(fragmented_haystack.Find(absl::Cord("is a")), + std::next(fragmented_haystack.char_begin(), 5)); + EXPECT_EQ(flat_haystack.Find(absl::MakeFragmentedCord({"is", " ", "a"})), + std::next(flat_haystack.char_begin(), 5)); + EXPECT_EQ( + fragmented_haystack.Find(absl::MakeFragmentedCord({"is", " ", "a"})), + std::next(fragmented_haystack.char_begin(), 5)); +} + TEST_P(CordTest, Subcord) { RandomEngine rng(GTEST_FLAG_GET(random_seed)); const std::string s = RandomLowercaseString(&rng, 1024); |