diff options
Diffstat (limited to 'tensorflow/core/lib/gtl/edit_distance.h')
-rw-r--r-- | tensorflow/core/lib/gtl/edit_distance.h | 82 |
1 files changed, 82 insertions, 0 deletions
diff --git a/tensorflow/core/lib/gtl/edit_distance.h b/tensorflow/core/lib/gtl/edit_distance.h new file mode 100644 index 0000000000..82b6c2299f --- /dev/null +++ b/tensorflow/core/lib/gtl/edit_distance.h @@ -0,0 +1,82 @@ +#ifndef TENSORFLOW_LIB_GTL_EDIT_DISTANCE_H_ +#define TENSORFLOW_LIB_GTL_EDIT_DISTANCE_H_ + +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" + +namespace tensorflow { +namespace gtl { + +// Calculate the Levenshtein Edit Distance between two contiguous +// sequences, s and t, of type T. +// +// The Levenshtein distance is a symmetric distance defined as the +// smallest number of insertions, deletions, and substitutions +// required to convert sequence s to t (and vice versa). +// Note, this distance does not consider transpositions. +// +// For more details and a reference implementation, see: +// https://en.wikipedia.org/wiki/Levenshtein_distance +// +// This implementation has time complexity O(|s|*|t|) +// and space complexity O(min(|s|, |t|)), where +// |x| := x.size() +// +// A simple call to LevenshteinDistance looks like: +// +// int64 dist = LevenshteinDistance("hi", "bye", std::equal_to<char>()); +// +template <typename T, typename Cmp> +inline int64 LevenshteinDistance(const gtl::ArraySlice<T>& s, + const gtl::ArraySlice<T>& t, const Cmp& cmp) { + const int64 s_size = s.size(); + const int64 t_size = t.size(); + + if (s_size == 0) return t_size; + if (t_size == 0) return s_size; + if (s == t) return 0; + if (t_size > s_size) return LevenshteinDistance(t, s, cmp); + + // Create work vectors + gtl::InlinedVector<int64, 32> scratch0(t_size + 1); + gtl::InlinedVector<int64, 32> scratch1(t_size + 1); + + int64* previous = scratch0.data(); + int64* current = scratch1.data(); + + // Initialize previous row of distances + std::iota(scratch0.begin(), scratch0.end(), 0); + + for (int64 i = 0; i < s_size; ++i) { + // Swap current and previous rows for next iteration + std::swap(previous, current); + + // Calculate current row distances from previous row + current[0] = i + 1; + + // Fill in the rest of the row + for (int64 j = 0; j < t_size; ++j) { + const int64 cost = cmp(s[i], t[j]) ? 0 : 1; + current[j + 1] = + std::min(current[j] + 1, // deletion cost + std::min(previous[j + 1] + 1, // insertion cost + previous[j] + cost)); // substitution cost + } + } + + return current[t_size]; +} + +template <typename Container1, typename Container2, typename Cmp> +inline int64 LevenshteinDistance(const Container1& s, const Container2& t, + const Cmp& cmp) { + return LevenshteinDistance( + gtl::ArraySlice<typename Container1::value_type>(s.data(), s.size()), + gtl::ArraySlice<typename Container1::value_type>(t.data(), t.size()), + cmp); +} + +} // namespace gtl +} // namespace tensorflow + +#endif // TENSORFLOW_LIB_GTL_EDIT_DISTANCE_H_ |