diff options
author | 2016-06-01 16:36:41 +0000 | |
---|---|---|
committer | 2016-06-01 18:18:14 +0000 | |
commit | 9bf2da49b552ef4d57e9342bab5bfd2d57c0ec8a (patch) | |
tree | f58838748b0a9d5f64f8c20557d9c890d3cf31cc /src | |
parent | 77903754ad27cfaf660e4dfebc2e23ed4d640e98 (diff) |
Add a SpellChecker class with edit distance function.
This will be used later to detect typos and provide suggestions.
--
MOS_MIGRATED_REVID=123761611
Diffstat (limited to 'src')
-rw-r--r-- | src/main/java/com/google/devtools/build/lib/util/SpellChecker.java | 74 | ||||
-rw-r--r-- | src/test/java/com/google/devtools/build/lib/util/SpellCheckerTest.java | 79 |
2 files changed, 153 insertions, 0 deletions
diff --git a/src/main/java/com/google/devtools/build/lib/util/SpellChecker.java b/src/main/java/com/google/devtools/build/lib/util/SpellChecker.java new file mode 100644 index 0000000000..e453ebcf19 --- /dev/null +++ b/src/main/java/com/google/devtools/build/lib/util/SpellChecker.java @@ -0,0 +1,74 @@ +// Copyright 2016 The Bazel Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package com.google.devtools.build.lib.util; + +/** + * Class that provides functions to do spell checking, i.e. detect typos + * and make suggestions. + */ +public final class SpellChecker { + /** + * Computes the edit distance between two strings. The edit distance is + * the minimum number of insertions, deletions and replacements to + * transform a string into the other string. + * + * maxEditDistance is the maximum distance the function can return. If + * it would be greater, the function returns -1. It is useful for + * speeding up the computations. + */ + public static int editDistance(String s1, String s2, int maxEditDistance) { + // This is the Levenshtein distance, as described here: + // http://en.wikipedia.org/wiki/Levenshtein_distance + // + // We don't need to keep the full matrix. To update a cell, we only + // need top-left, top, and left values. Using a single array is + // sufficient. Top value is still in row[j] from the last iteration. + // Top-left value is stored in 'previous'. Left value is row[j - 1]. + + if (s1.equals(s2)) { + return 0; + } + // Short-circuit based on string length. + if (Math.abs(s1.length() - s2.length()) > maxEditDistance) { + return -1; + } + + int[] row = new int[s2.length() + 1]; + for (int i = 0; i <= s2.length(); i++) { + row[i] = i; + } + + for (int i = 1; i <= s1.length(); i++) { + row[0] = i; + int bestInTheRow = row[0]; + int previous = i - 1; + + for (int j = 1; j <= s2.length(); j++) { + int old = row[j]; + + row[j] = Math.min( + previous + (s1.charAt(i - 1) == s2.charAt(j - 1) ? 0 : 1), + 1 + Math.min(row[j - 1], row[j])); + previous = old; + bestInTheRow = Math.min(bestInTheRow, row[j]); + } + if (bestInTheRow > maxEditDistance) { + return -1; + } + } + int result = row[s2.length()]; + return result <= maxEditDistance ? result : -1; + } +} diff --git a/src/test/java/com/google/devtools/build/lib/util/SpellCheckerTest.java b/src/test/java/com/google/devtools/build/lib/util/SpellCheckerTest.java new file mode 100644 index 0000000000..d0d84e38f6 --- /dev/null +++ b/src/test/java/com/google/devtools/build/lib/util/SpellCheckerTest.java @@ -0,0 +1,79 @@ +// Copyright 2016 The Bazel Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +package com.google.devtools.build.lib.util; + +import static com.google.common.truth.Truth.assertThat; + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** + * Tests for {@link SpellChecker}. + */ +@RunWith(JUnit4.class) +public class SpellCheckerTest { + + private void assertDistance(String s1, String s2, int distance) { + assertThat(SpellChecker.editDistance(s1, s2, 100)).isEqualTo(distance); + assertThat(SpellChecker.editDistance(s1, s2, distance)).isEqualTo(distance); + + // Symmetry + assertThat(SpellChecker.editDistance(s2, s1, 100)).isEqualTo(distance); + assertThat(SpellChecker.editDistance(s2, s1, distance)).isEqualTo(distance); + } + + @Test + public void editDistance_1() throws Exception { + // Deletion + assertDistance("abcdef", "abdef", 1); + assertDistance("abcdef", "abcde", 1); + assertDistance("abcdef", "bcdef", 1); + + // Replacement + assertDistance("abcdef", "_bcdef", 1); + assertDistance("abcdef", "abc_ef", 1); + assertDistance("abcdef", "abcde_", 1); + + // Insertion + assertDistance("abcdef", "_abcdef", 1); + assertDistance("abcdef", "abcd_ef", 1); + assertDistance("abcdef", "abcdef_", 1); + } + + @Test + public void editDistance_general() throws Exception { + assertDistance("", "", 0); + assertDistance("abcd", "abcd", 0); + assertDistance("abcde", "", 5); + assertDistance("abcde", "12345", 5); + assertDistance("ab", "ba", 2); + assertDistance("abba", "acca", 2); + assertDistance("abaa", "aaca", 2); + assertDistance("kitten", "sitting", 3); + assertDistance("kitten kitten", "sitting sitting", 6); + assertDistance("flaw", "lawn", 2); + } + + @Test + public void editDistance_maxDistance() throws Exception { + assertThat(SpellChecker.editDistance("kitten", "sitting", 0)).isEqualTo(-1); + assertThat(SpellChecker.editDistance("kitten", "sitting", 1)).isEqualTo(-1); + assertThat(SpellChecker.editDistance("kitten", "sitting", 2)).isEqualTo(-1); + assertThat(SpellChecker.editDistance("kitten", "sitting", 3)).isEqualTo(3); + assertThat(SpellChecker.editDistance("kitten", "sitting", 4)).isEqualTo(3); + + assertThat(SpellChecker.editDistance("abcdefg", "s", 2)).isEqualTo(-1); + } +} |