diff options
author | Abseil Team <absl-team@google.com> | 2022-10-17 01:09:18 -0700 |
---|---|---|
committer | Copybara-Service <copybara-worker@google.com> | 2022-10-17 01:09:54 -0700 |
commit | edf41d72381bf496c83b24b9dc242bc055c584dd (patch) | |
tree | ff991f5348838e496710ea51c98df58cacfeb222 /absl/strings | |
parent | 5fa65f28e46e86c44966a1ca8a727a329d9c1ff8 (diff) |
Implement function to calculate Damerau-Levenshtein distance between two strings.
PiperOrigin-RevId: 481568970
Change-Id: Icb132348f62fed4c0168aac4963b3313a060890b
Diffstat (limited to 'absl/strings')
-rw-r--r-- | absl/strings/BUILD.bazel | 15 | ||||
-rw-r--r-- | absl/strings/CMakeLists.txt | 15 | ||||
-rw-r--r-- | absl/strings/internal/damerau_levenshtein_distance.cc | 93 | ||||
-rw-r--r-- | absl/strings/internal/damerau_levenshtein_distance.h | 35 | ||||
-rw-r--r-- | absl/strings/internal/damerau_levenshtein_distance_test.cc | 99 |
5 files changed, 257 insertions, 0 deletions
diff --git a/absl/strings/BUILD.bazel b/absl/strings/BUILD.bazel index c6989816..809f566d 100644 --- a/absl/strings/BUILD.bazel +++ b/absl/strings/BUILD.bazel @@ -37,6 +37,7 @@ cc_library( "internal/charconv_bigint.h", "internal/charconv_parse.cc", "internal/charconv_parse.h", + "internal/damerau_levenshtein_distance.cc", "internal/memutil.cc", "internal/memutil.h", "internal/stl_type_traits.h", @@ -55,6 +56,7 @@ cc_library( "ascii.h", "charconv.h", "escaping.h", + "internal/damerau_levenshtein_distance.h", "internal/string_constant.h", "match.h", "numbers.h", @@ -180,6 +182,19 @@ cc_test( ) cc_test( + name = "damerau_levenshtein_distance_test", + size = "small", + srcs = [ + "internal/damerau_levenshtein_distance_test.cc", + ], + copts = ABSL_TEST_COPTS, + deps = [ + "//absl/strings", + "@com_google_googletest//:gtest_main", + ], +) + +cc_test( name = "memutil_benchmark", srcs = [ "internal/memutil.h", diff --git a/absl/strings/CMakeLists.txt b/absl/strings/CMakeLists.txt index fe82e1df..30dedcf5 100644 --- a/absl/strings/CMakeLists.txt +++ b/absl/strings/CMakeLists.txt @@ -21,6 +21,7 @@ absl_cc_library( "ascii.h" "charconv.h" "escaping.h" + "internal/damerau_levenshtein_distance.h" "internal/string_constant.h" "match.h" "numbers.h" @@ -39,6 +40,7 @@ absl_cc_library( "internal/charconv_bigint.h" "internal/charconv_parse.cc" "internal/charconv_parse.h" + "internal/damerau_levenshtein_distance.cc" "internal/memutil.cc" "internal/memutil.h" "internal/stringify_sink.h" @@ -135,6 +137,19 @@ absl_cc_test( absl_cc_test( NAME + damerau_levenshtein_distance_test + SRCS + "internal/damerau_levenshtein_distance_test.cc" + COPTS + ${ABSL_TEST_COPTS} + DEPS + absl::strings + absl::base + GTest::gmock_main +) + +absl_cc_test( + NAME memutil_test SRCS "internal/memutil.h" diff --git a/absl/strings/internal/damerau_levenshtein_distance.cc b/absl/strings/internal/damerau_levenshtein_distance.cc new file mode 100644 index 00000000..7cc23acd --- /dev/null +++ b/absl/strings/internal/damerau_levenshtein_distance.cc @@ -0,0 +1,93 @@ +// Copyright 2022 The Abseil Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "absl/strings/internal/damerau_levenshtein_distance.h" + +#include <algorithm> +#include <array> +#include <numeric> + +#include "absl/strings/string_view.h" +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace strings_internal { +// Calculate DamerauLevenshtein (adjacent transpositions) distance +// between two strings, +// https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance. The +// algorithm follows the condition that no substring is edited more than once. +// While this can reduce is larger distance, it's a) a much simpler algorithm +// and b) more realistic for the case that typographic mistakes should be +// detected. +// When the distance is larger than cutoff, or one of the strings has more +// than MAX_SIZE=100 characters, the code returns min(MAX_SIZE, cutoff) + 1. +size_t CappedDamerauLevenshteinDistance(absl::string_view s1, + absl::string_view s2, uint8_t cutoff) { + const uint8_t MAX_SIZE = 100; + const uint8_t _cutoff = std::min(MAX_SIZE, cutoff); + const uint8_t cutoff_plus_1 = static_cast<uint8_t>(_cutoff + 1); + + if (s1.size() > s2.size()) std::swap(s1, s2); + if (s1.size() + _cutoff < s2.size() || s2.size() > MAX_SIZE) + return cutoff_plus_1; + + if (s1.empty()) + return std::min(static_cast<size_t>(cutoff_plus_1), s2.size()); + + // Lower diagonal bound: y = x - lower_diag + const uint8_t lower_diag = + _cutoff - static_cast<uint8_t>(s2.size() - s1.size()); + // Upper diagonal bound: y = x + upper_diag + const uint8_t upper_diag = _cutoff; + + // d[i][j] is the number of edits required to convert s1[0, i] to s2[0, j] + std::array<std::array<uint8_t, MAX_SIZE + 2>, MAX_SIZE + 2> d; + std::iota(d[0].begin(), d[0].begin() + upper_diag + 1, 0); + d[0][cutoff_plus_1] = cutoff_plus_1; + for (size_t i = 1; i <= s1.size(); ++i) { + // Deduce begin of relevant window. + size_t j_begin = 1; + if (i > lower_diag) { + j_begin = i - lower_diag; + d[i][j_begin - 1] = cutoff_plus_1; + } else { + d[i][0] = static_cast<uint8_t>(i); + } + + // Deduce end of relevant window. + size_t j_end = i + upper_diag; + if (j_end > s2.size()) { + j_end = s2.size(); + } else { + d[i][j_end + 1] = cutoff_plus_1; + } + + for (size_t j = j_begin; j <= j_end; ++j) { + const uint8_t deletion_distance = d[i - 1][j] + 1; + const uint8_t insertion_distance = d[i][j - 1] + 1; + const uint8_t mismatched_tail_cost = s1[i - 1] == s2[j - 1] ? 0 : 1; + const uint8_t mismatch_distance = d[i - 1][j - 1] + mismatched_tail_cost; + uint8_t transposition_distance = _cutoff + 1; + if (i > 1 && j > 1 && s1[i - 1] == s2[j - 2] && s1[i - 2] == s2[j - 1]) + transposition_distance = d[i - 2][j - 2] + 1; + d[i][j] = std::min({cutoff_plus_1, deletion_distance, insertion_distance, + mismatch_distance, transposition_distance}); + } + } + return d[s1.size()][s2.size()]; +} + +} // namespace strings_internal + +ABSL_NAMESPACE_END +} // namespace absl diff --git a/absl/strings/internal/damerau_levenshtein_distance.h b/absl/strings/internal/damerau_levenshtein_distance.h new file mode 100644 index 00000000..b9bb6fe1 --- /dev/null +++ b/absl/strings/internal/damerau_levenshtein_distance.h @@ -0,0 +1,35 @@ +// Copyright 2022 The Abseil Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ABSL_STRINGS_INTERNAL_DAMERAU_LEVENSHTEIN_DISTANCE_H_ +#define ABSL_STRINGS_INTERNAL_DAMERAU_LEVENSHTEIN_DISTANCE_H_ + +#include <numeric> +#include <vector> + +#include "absl/strings/string_view.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace strings_internal { +// Calculate DamerauLevenshtein distance between two strings. +// When the distance is larger than cutoff, the code just returns cutoff + 1. +size_t CappedDamerauLevenshteinDistance(absl::string_view s1, + absl::string_view s2, uint8_t cutoff); + +} // namespace strings_internal +ABSL_NAMESPACE_END +} // namespace absl + +#endif // ABSL_STRINGS_INTERNAL_DAMERAU_LEVENSHTEIN_DISTANCE_H_ diff --git a/absl/strings/internal/damerau_levenshtein_distance_test.cc b/absl/strings/internal/damerau_levenshtein_distance_test.cc new file mode 100644 index 00000000..85871941 --- /dev/null +++ b/absl/strings/internal/damerau_levenshtein_distance_test.cc @@ -0,0 +1,99 @@ +// Copyright 2022 The Abseil Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "absl/strings/internal/damerau_levenshtein_distance.h" + +#include <cstdint> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +namespace { + +using absl::strings_internal::CappedDamerauLevenshteinDistance; + +TEST(Distance, TestDistances) { + EXPECT_THAT(CappedDamerauLevenshteinDistance("ab", "ab", 6), 0); + EXPECT_THAT(CappedDamerauLevenshteinDistance("a", "b", 6), 1); + EXPECT_THAT(CappedDamerauLevenshteinDistance("ca", "abc", 6), 3); + EXPECT_THAT(CappedDamerauLevenshteinDistance("abcd", "ad", 6), 2); + EXPECT_THAT(CappedDamerauLevenshteinDistance("abcd", "cadb", 6), 4); + EXPECT_THAT(CappedDamerauLevenshteinDistance("abcd", "bdac", 6), 4); + EXPECT_THAT(CappedDamerauLevenshteinDistance("ab", "ab", 0), 0); + EXPECT_THAT(CappedDamerauLevenshteinDistance("", "", 0), 0); + // combinations for 3-character strings: + // 1, 2, 3 removals, insertions or replacements and transpositions + EXPECT_THAT(CappedDamerauLevenshteinDistance("abc", "abc", 6), 0); + for (auto res : + {"", "ca", "efg", "ea", "ce", "ceb", "eca", "cae", "cea", "bea"}) { + EXPECT_THAT(CappedDamerauLevenshteinDistance("abc", res, 6), 3); + EXPECT_THAT(CappedDamerauLevenshteinDistance(res, "abc", 6), 3); + } + for (auto res : + {"a", "b", "c", "ba", "cb", "bca", "cab", "cba", "ace", + "efc", "ebf", "aef", "ae", "be", "eb", "ec", "ecb", "bec", + "bce", "cbe", "ace", "eac", "aeb", "bae", "eab", "eba"}) { + EXPECT_THAT(CappedDamerauLevenshteinDistance("abc", res, 6), 2); + EXPECT_THAT(CappedDamerauLevenshteinDistance(res, "abc", 6), 2); + } + for (auto res : {"ab", "ac", "bc", "acb", "bac", "ebc", "aec", "abe"}) { + EXPECT_THAT(CappedDamerauLevenshteinDistance("abc", res, 6), 1); + EXPECT_THAT(CappedDamerauLevenshteinDistance(res, "abc", 6), 1); + } +} + +TEST(Distance, TestCutoff) { + // Returing cutoff + 1 if the value is larger than cutoff or string longer + // than MAX_SIZE. + EXPECT_THAT(CappedDamerauLevenshteinDistance("abcd", "a", 3), 3); + EXPECT_THAT(CappedDamerauLevenshteinDistance("abcd", "a", 2), 3); + EXPECT_THAT(CappedDamerauLevenshteinDistance("abcd", "a", 1), 2); + EXPECT_THAT(CappedDamerauLevenshteinDistance("abcdefg", "a", 2), 3); + EXPECT_THAT(CappedDamerauLevenshteinDistance("a", "abcde", 2), 3); + EXPECT_THAT(CappedDamerauLevenshteinDistance(std::string(102, 'a'), + std::string(102, 'a'), 105), + 101); + EXPECT_THAT(CappedDamerauLevenshteinDistance(std::string(100, 'a'), + std::string(100, 'a'), 100), + 0); + EXPECT_THAT(CappedDamerauLevenshteinDistance(std::string(100, 'a'), + std::string(100, 'b'), 100), + 100); + EXPECT_THAT(CappedDamerauLevenshteinDistance(std::string(100, 'a'), + std::string(99, 'a'), 2), + 1); + EXPECT_THAT(CappedDamerauLevenshteinDistance(std::string(100, 'a'), + std::string(101, 'a'), 2), + 3); + EXPECT_THAT(CappedDamerauLevenshteinDistance(std::string(100, 'a'), + std::string(101, 'a'), 2), + 3); + EXPECT_THAT(CappedDamerauLevenshteinDistance(std::string(UINT8_MAX + 1, 'a'), + std::string(UINT8_MAX + 1, 'b'), + UINT8_MAX), + 101); + EXPECT_THAT(CappedDamerauLevenshteinDistance(std::string(UINT8_MAX - 1, 'a'), + std::string(UINT8_MAX - 1, 'b'), + UINT8_MAX), + 101); + EXPECT_THAT( + CappedDamerauLevenshteinDistance(std::string(UINT8_MAX, 'a'), + std::string(UINT8_MAX, 'b'), UINT8_MAX), + 101); + EXPECT_THAT(CappedDamerauLevenshteinDistance(std::string(UINT8_MAX - 1, 'a'), + std::string(UINT8_MAX - 1, 'a'), + UINT8_MAX), + 101); +} +} // namespace |