tensorflow/core/lib/gtl/edit_distance.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82

#ifndef TENSORFLOW_LIB_GTL_EDIT_DISTANCE_H_
#define TENSORFLOW_LIB_GTL_EDIT_DISTANCE_H_

#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h"

namespace tensorflow {
namespace gtl {

// Calculate the Levenshtein Edit Distance between two contiguous
// sequences, s and t, of type T.
//
// The Levenshtein distance is a symmetric distance defined as the
// smallest number of insertions, deletions, and substitutions
// required to convert sequence s to t (and vice versa).
// Note, this distance does not consider transpositions.
//
// For more details and a reference implementation, see:
//   https://en.wikipedia.org/wiki/Levenshtein_distance
//
// This implementation has time complexity O(|s|*|t|)
// and space complexity O(min(|s|, |t|)), where
//   |x| := x.size()
//
// A simple call to LevenshteinDistance looks like:
//
//  int64 dist = LevenshteinDistance("hi", "bye", std::equal_to<char>());
//
template <typename T, typename Cmp>
inline int64 LevenshteinDistance(const gtl::ArraySlice<T>& s,
                                 const gtl::ArraySlice<T>& t, const Cmp& cmp) {
  const int64 s_size = s.size();
  const int64 t_size = t.size();

  if (s_size == 0) return t_size;
  if (t_size == 0) return s_size;
  if (s == t) return 0;
  if (t_size > s_size) return LevenshteinDistance(t, s, cmp);

  // Create work vectors
  gtl::InlinedVector<int64, 32> scratch0(t_size + 1);
  gtl::InlinedVector<int64, 32> scratch1(t_size + 1);

  int64* previous = scratch0.data();
  int64* current = scratch1.data();

  // Initialize previous row of distances
  std::iota(scratch0.begin(), scratch0.end(), 0);

  for (int64 i = 0; i < s_size; ++i) {
    // Swap current and previous rows for next iteration
    std::swap(previous, current);

    // Calculate current row distances from previous row
    current[0] = i + 1;

    // Fill in the rest of the row
    for (int64 j = 0; j < t_size; ++j) {
      const int64 cost = cmp(s[i], t[j]) ? 0 : 1;
      current[j + 1] =
          std::min(current[j] + 1,                 // deletion cost
                   std::min(previous[j + 1] + 1,   // insertion cost
                            previous[j] + cost));  // substitution cost
    }
  }

  return current[t_size];
}

template <typename Container1, typename Container2, typename Cmp>
inline int64 LevenshteinDistance(const Container1& s, const Container2& t,
                                 const Cmp& cmp) {
  return LevenshteinDistance(
      gtl::ArraySlice<typename Container1::value_type>(s.data(), s.size()),
      gtl::ArraySlice<typename Container1::value_type>(t.data(), t.size()),
      cmp);
}

}  // namespace gtl
}  // namespace tensorflow

#endif  // TENSORFLOW_LIB_GTL_EDIT_DISTANCE_H_