1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
|
#ifndef TENSORFLOW_LIB_GTL_EDIT_DISTANCE_H_
#define TENSORFLOW_LIB_GTL_EDIT_DISTANCE_H_
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h"
namespace tensorflow {
namespace gtl {
// Calculate the Levenshtein Edit Distance between two contiguous
// sequences, s and t, of type T.
//
// The Levenshtein distance is a symmetric distance defined as the
// smallest number of insertions, deletions, and substitutions
// required to convert sequence s to t (and vice versa).
// Note, this distance does not consider transpositions.
//
// For more details and a reference implementation, see:
// https://en.wikipedia.org/wiki/Levenshtein_distance
//
// This implementation has time complexity O(|s|*|t|)
// and space complexity O(min(|s|, |t|)), where
// |x| := x.size()
//
// A simple call to LevenshteinDistance looks like:
//
// int64 dist = LevenshteinDistance("hi", "bye", std::equal_to<char>());
//
template <typename T, typename Cmp>
inline int64 LevenshteinDistance(const gtl::ArraySlice<T>& s,
const gtl::ArraySlice<T>& t, const Cmp& cmp) {
const int64 s_size = s.size();
const int64 t_size = t.size();
if (s_size == 0) return t_size;
if (t_size == 0) return s_size;
if (s == t) return 0;
if (t_size > s_size) return LevenshteinDistance(t, s, cmp);
// Create work vectors
gtl::InlinedVector<int64, 32> scratch0(t_size + 1);
gtl::InlinedVector<int64, 32> scratch1(t_size + 1);
int64* previous = scratch0.data();
int64* current = scratch1.data();
// Initialize previous row of distances
std::iota(scratch0.begin(), scratch0.end(), 0);
for (int64 i = 0; i < s_size; ++i) {
// Swap current and previous rows for next iteration
std::swap(previous, current);
// Calculate current row distances from previous row
current[0] = i + 1;
// Fill in the rest of the row
for (int64 j = 0; j < t_size; ++j) {
const int64 cost = cmp(s[i], t[j]) ? 0 : 1;
current[j + 1] =
std::min(current[j] + 1, // deletion cost
std::min(previous[j + 1] + 1, // insertion cost
previous[j] + cost)); // substitution cost
}
}
return current[t_size];
}
template <typename Container1, typename Container2, typename Cmp>
inline int64 LevenshteinDistance(const Container1& s, const Container2& t,
const Cmp& cmp) {
return LevenshteinDistance(
gtl::ArraySlice<typename Container1::value_type>(s.data(), s.size()),
gtl::ArraySlice<typename Container1::value_type>(t.data(), t.size()),
cmp);
}
} // namespace gtl
} // namespace tensorflow
#endif // TENSORFLOW_LIB_GTL_EDIT_DISTANCE_H_
|