summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Abseil Team <absl-team@google.com>2022-10-17 01:09:18 -0700
committerGravatar Copybara-Service <copybara-worker@google.com>2022-10-17 01:09:54 -0700
commitedf41d72381bf496c83b24b9dc242bc055c584dd (patch)
treeff991f5348838e496710ea51c98df58cacfeb222
parent5fa65f28e46e86c44966a1ca8a727a329d9c1ff8 (diff)
Implement function to calculate Damerau-Levenshtein distance between two strings.
PiperOrigin-RevId: 481568970 Change-Id: Icb132348f62fed4c0168aac4963b3313a060890b
-rw-r--r--CMake/AbseilDll.cmake2
-rw-r--r--absl/strings/BUILD.bazel15
-rw-r--r--absl/strings/CMakeLists.txt15
-rw-r--r--absl/strings/internal/damerau_levenshtein_distance.cc93
-rw-r--r--absl/strings/internal/damerau_levenshtein_distance.h35
-rw-r--r--absl/strings/internal/damerau_levenshtein_distance_test.cc99
6 files changed, 259 insertions, 0 deletions
diff --git a/CMake/AbseilDll.cmake b/CMake/AbseilDll.cmake
index f18b54ad..30069da2 100644
--- a/CMake/AbseilDll.cmake
+++ b/CMake/AbseilDll.cmake
@@ -238,6 +238,8 @@ set(ABSL_INTERNAL_DLL_FILES
"strings/internal/cordz_statistics.h"
"strings/internal/cordz_update_scope.h"
"strings/internal/cordz_update_tracker.h"
+ "strings/internal/damerau_levenshtein_distance.h"
+ "strings/internal/damerau_levenshtein_distance.cc"
"strings/internal/stl_type_traits.h"
"strings/internal/string_constant.h"
"strings/internal/stringify_sink.h"
diff --git a/absl/strings/BUILD.bazel b/absl/strings/BUILD.bazel
index c6989816..809f566d 100644
--- a/absl/strings/BUILD.bazel
+++ b/absl/strings/BUILD.bazel
@@ -37,6 +37,7 @@ cc_library(
"internal/charconv_bigint.h",
"internal/charconv_parse.cc",
"internal/charconv_parse.h",
+ "internal/damerau_levenshtein_distance.cc",
"internal/memutil.cc",
"internal/memutil.h",
"internal/stl_type_traits.h",
@@ -55,6 +56,7 @@ cc_library(
"ascii.h",
"charconv.h",
"escaping.h",
+ "internal/damerau_levenshtein_distance.h",
"internal/string_constant.h",
"match.h",
"numbers.h",
@@ -180,6 +182,19 @@ cc_test(
)
cc_test(
+ name = "damerau_levenshtein_distance_test",
+ size = "small",
+ srcs = [
+ "internal/damerau_levenshtein_distance_test.cc",
+ ],
+ copts = ABSL_TEST_COPTS,
+ deps = [
+ "//absl/strings",
+ "@com_google_googletest//:gtest_main",
+ ],
+)
+
+cc_test(
name = "memutil_benchmark",
srcs = [
"internal/memutil.h",
diff --git a/absl/strings/CMakeLists.txt b/absl/strings/CMakeLists.txt
index fe82e1df..30dedcf5 100644
--- a/absl/strings/CMakeLists.txt
+++ b/absl/strings/CMakeLists.txt
@@ -21,6 +21,7 @@ absl_cc_library(
"ascii.h"
"charconv.h"
"escaping.h"
+ "internal/damerau_levenshtein_distance.h"
"internal/string_constant.h"
"match.h"
"numbers.h"
@@ -39,6 +40,7 @@ absl_cc_library(
"internal/charconv_bigint.h"
"internal/charconv_parse.cc"
"internal/charconv_parse.h"
+ "internal/damerau_levenshtein_distance.cc"
"internal/memutil.cc"
"internal/memutil.h"
"internal/stringify_sink.h"
@@ -135,6 +137,19 @@ absl_cc_test(
absl_cc_test(
NAME
+ damerau_levenshtein_distance_test
+ SRCS
+ "internal/damerau_levenshtein_distance_test.cc"
+ COPTS
+ ${ABSL_TEST_COPTS}
+ DEPS
+ absl::strings
+ absl::base
+ GTest::gmock_main
+)
+
+absl_cc_test(
+ NAME
memutil_test
SRCS
"internal/memutil.h"
diff --git a/absl/strings/internal/damerau_levenshtein_distance.cc b/absl/strings/internal/damerau_levenshtein_distance.cc
new file mode 100644
index 00000000..7cc23acd
--- /dev/null
+++ b/absl/strings/internal/damerau_levenshtein_distance.cc
@@ -0,0 +1,93 @@
+// Copyright 2022 The Abseil Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "absl/strings/internal/damerau_levenshtein_distance.h"
+
+#include <algorithm>
+#include <array>
+#include <numeric>
+
+#include "absl/strings/string_view.h"
+namespace absl {
+ABSL_NAMESPACE_BEGIN
+namespace strings_internal {
+// Calculate DamerauLevenshtein (adjacent transpositions) distance
+// between two strings,
+// https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance. The
+// algorithm follows the condition that no substring is edited more than once.
+// While this can reduce is larger distance, it's a) a much simpler algorithm
+// and b) more realistic for the case that typographic mistakes should be
+// detected.
+// When the distance is larger than cutoff, or one of the strings has more
+// than MAX_SIZE=100 characters, the code returns min(MAX_SIZE, cutoff) + 1.
+size_t CappedDamerauLevenshteinDistance(absl::string_view s1,
+ absl::string_view s2, uint8_t cutoff) {
+ const uint8_t MAX_SIZE = 100;
+ const uint8_t _cutoff = std::min(MAX_SIZE, cutoff);
+ const uint8_t cutoff_plus_1 = static_cast<uint8_t>(_cutoff + 1);
+
+ if (s1.size() > s2.size()) std::swap(s1, s2);
+ if (s1.size() + _cutoff < s2.size() || s2.size() > MAX_SIZE)
+ return cutoff_plus_1;
+
+ if (s1.empty())
+ return std::min(static_cast<size_t>(cutoff_plus_1), s2.size());
+
+ // Lower diagonal bound: y = x - lower_diag
+ const uint8_t lower_diag =
+ _cutoff - static_cast<uint8_t>(s2.size() - s1.size());
+ // Upper diagonal bound: y = x + upper_diag
+ const uint8_t upper_diag = _cutoff;
+
+ // d[i][j] is the number of edits required to convert s1[0, i] to s2[0, j]
+ std::array<std::array<uint8_t, MAX_SIZE + 2>, MAX_SIZE + 2> d;
+ std::iota(d[0].begin(), d[0].begin() + upper_diag + 1, 0);
+ d[0][cutoff_plus_1] = cutoff_plus_1;
+ for (size_t i = 1; i <= s1.size(); ++i) {
+ // Deduce begin of relevant window.
+ size_t j_begin = 1;
+ if (i > lower_diag) {
+ j_begin = i - lower_diag;
+ d[i][j_begin - 1] = cutoff_plus_1;
+ } else {
+ d[i][0] = static_cast<uint8_t>(i);
+ }
+
+ // Deduce end of relevant window.
+ size_t j_end = i + upper_diag;
+ if (j_end > s2.size()) {
+ j_end = s2.size();
+ } else {
+ d[i][j_end + 1] = cutoff_plus_1;
+ }
+
+ for (size_t j = j_begin; j <= j_end; ++j) {
+ const uint8_t deletion_distance = d[i - 1][j] + 1;
+ const uint8_t insertion_distance = d[i][j - 1] + 1;
+ const uint8_t mismatched_tail_cost = s1[i - 1] == s2[j - 1] ? 0 : 1;
+ const uint8_t mismatch_distance = d[i - 1][j - 1] + mismatched_tail_cost;
+ uint8_t transposition_distance = _cutoff + 1;
+ if (i > 1 && j > 1 && s1[i - 1] == s2[j - 2] && s1[i - 2] == s2[j - 1])
+ transposition_distance = d[i - 2][j - 2] + 1;
+ d[i][j] = std::min({cutoff_plus_1, deletion_distance, insertion_distance,
+ mismatch_distance, transposition_distance});
+ }
+ }
+ return d[s1.size()][s2.size()];
+}
+
+} // namespace strings_internal
+
+ABSL_NAMESPACE_END
+} // namespace absl
diff --git a/absl/strings/internal/damerau_levenshtein_distance.h b/absl/strings/internal/damerau_levenshtein_distance.h
new file mode 100644
index 00000000..b9bb6fe1
--- /dev/null
+++ b/absl/strings/internal/damerau_levenshtein_distance.h
@@ -0,0 +1,35 @@
+// Copyright 2022 The Abseil Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ABSL_STRINGS_INTERNAL_DAMERAU_LEVENSHTEIN_DISTANCE_H_
+#define ABSL_STRINGS_INTERNAL_DAMERAU_LEVENSHTEIN_DISTANCE_H_
+
+#include <numeric>
+#include <vector>
+
+#include "absl/strings/string_view.h"
+
+namespace absl {
+ABSL_NAMESPACE_BEGIN
+namespace strings_internal {
+// Calculate DamerauLevenshtein distance between two strings.
+// When the distance is larger than cutoff, the code just returns cutoff + 1.
+size_t CappedDamerauLevenshteinDistance(absl::string_view s1,
+ absl::string_view s2, uint8_t cutoff);
+
+} // namespace strings_internal
+ABSL_NAMESPACE_END
+} // namespace absl
+
+#endif // ABSL_STRINGS_INTERNAL_DAMERAU_LEVENSHTEIN_DISTANCE_H_
diff --git a/absl/strings/internal/damerau_levenshtein_distance_test.cc b/absl/strings/internal/damerau_levenshtein_distance_test.cc
new file mode 100644
index 00000000..85871941
--- /dev/null
+++ b/absl/strings/internal/damerau_levenshtein_distance_test.cc
@@ -0,0 +1,99 @@
+// Copyright 2022 The Abseil Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "absl/strings/internal/damerau_levenshtein_distance.h"
+
+#include <cstdint>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace {
+
+using absl::strings_internal::CappedDamerauLevenshteinDistance;
+
+TEST(Distance, TestDistances) {
+ EXPECT_THAT(CappedDamerauLevenshteinDistance("ab", "ab", 6), 0);
+ EXPECT_THAT(CappedDamerauLevenshteinDistance("a", "b", 6), 1);
+ EXPECT_THAT(CappedDamerauLevenshteinDistance("ca", "abc", 6), 3);
+ EXPECT_THAT(CappedDamerauLevenshteinDistance("abcd", "ad", 6), 2);
+ EXPECT_THAT(CappedDamerauLevenshteinDistance("abcd", "cadb", 6), 4);
+ EXPECT_THAT(CappedDamerauLevenshteinDistance("abcd", "bdac", 6), 4);
+ EXPECT_THAT(CappedDamerauLevenshteinDistance("ab", "ab", 0), 0);
+ EXPECT_THAT(CappedDamerauLevenshteinDistance("", "", 0), 0);
+ // combinations for 3-character strings:
+ // 1, 2, 3 removals, insertions or replacements and transpositions
+ EXPECT_THAT(CappedDamerauLevenshteinDistance("abc", "abc", 6), 0);
+ for (auto res :
+ {"", "ca", "efg", "ea", "ce", "ceb", "eca", "cae", "cea", "bea"}) {
+ EXPECT_THAT(CappedDamerauLevenshteinDistance("abc", res, 6), 3);
+ EXPECT_THAT(CappedDamerauLevenshteinDistance(res, "abc", 6), 3);
+ }
+ for (auto res :
+ {"a", "b", "c", "ba", "cb", "bca", "cab", "cba", "ace",
+ "efc", "ebf", "aef", "ae", "be", "eb", "ec", "ecb", "bec",
+ "bce", "cbe", "ace", "eac", "aeb", "bae", "eab", "eba"}) {
+ EXPECT_THAT(CappedDamerauLevenshteinDistance("abc", res, 6), 2);
+ EXPECT_THAT(CappedDamerauLevenshteinDistance(res, "abc", 6), 2);
+ }
+ for (auto res : {"ab", "ac", "bc", "acb", "bac", "ebc", "aec", "abe"}) {
+ EXPECT_THAT(CappedDamerauLevenshteinDistance("abc", res, 6), 1);
+ EXPECT_THAT(CappedDamerauLevenshteinDistance(res, "abc", 6), 1);
+ }
+}
+
+TEST(Distance, TestCutoff) {
+ // Returing cutoff + 1 if the value is larger than cutoff or string longer
+ // than MAX_SIZE.
+ EXPECT_THAT(CappedDamerauLevenshteinDistance("abcd", "a", 3), 3);
+ EXPECT_THAT(CappedDamerauLevenshteinDistance("abcd", "a", 2), 3);
+ EXPECT_THAT(CappedDamerauLevenshteinDistance("abcd", "a", 1), 2);
+ EXPECT_THAT(CappedDamerauLevenshteinDistance("abcdefg", "a", 2), 3);
+ EXPECT_THAT(CappedDamerauLevenshteinDistance("a", "abcde", 2), 3);
+ EXPECT_THAT(CappedDamerauLevenshteinDistance(std::string(102, 'a'),
+ std::string(102, 'a'), 105),
+ 101);
+ EXPECT_THAT(CappedDamerauLevenshteinDistance(std::string(100, 'a'),
+ std::string(100, 'a'), 100),
+ 0);
+ EXPECT_THAT(CappedDamerauLevenshteinDistance(std::string(100, 'a'),
+ std::string(100, 'b'), 100),
+ 100);
+ EXPECT_THAT(CappedDamerauLevenshteinDistance(std::string(100, 'a'),
+ std::string(99, 'a'), 2),
+ 1);
+ EXPECT_THAT(CappedDamerauLevenshteinDistance(std::string(100, 'a'),
+ std::string(101, 'a'), 2),
+ 3);
+ EXPECT_THAT(CappedDamerauLevenshteinDistance(std::string(100, 'a'),
+ std::string(101, 'a'), 2),
+ 3);
+ EXPECT_THAT(CappedDamerauLevenshteinDistance(std::string(UINT8_MAX + 1, 'a'),
+ std::string(UINT8_MAX + 1, 'b'),
+ UINT8_MAX),
+ 101);
+ EXPECT_THAT(CappedDamerauLevenshteinDistance(std::string(UINT8_MAX - 1, 'a'),
+ std::string(UINT8_MAX - 1, 'b'),
+ UINT8_MAX),
+ 101);
+ EXPECT_THAT(
+ CappedDamerauLevenshteinDistance(std::string(UINT8_MAX, 'a'),
+ std::string(UINT8_MAX, 'b'), UINT8_MAX),
+ 101);
+ EXPECT_THAT(CappedDamerauLevenshteinDistance(std::string(UINT8_MAX - 1, 'a'),
+ std::string(UINT8_MAX - 1, 'a'),
+ UINT8_MAX),
+ 101);
+}
+} // namespace