From 55735379ccda8a64e49717e95e9e0915e7b8dc8e Mon Sep 17 00:00:00 2001 From: Yutaka Leon Date: Mon, 12 Dec 2016 13:51:14 -0800 Subject: Add string_to_index_table, which returns a lookup table that matches strings to indices. Indices are defined by a vocabulary file or a mapping tensor. It also supports the assignation of hash buckets to out-of-vocabulary terms. Deprecate string_to_index in favor of string_to_index_table. Change: 141806125 --- tensorflow/contrib/lookup/__init__.py | 3 + tensorflow/contrib/lookup/lookup_ops.py | 205 ++++++++++++++++++++++++--- tensorflow/contrib/lookup/lookup_ops_test.py | 159 +++++++++++++++++++++ 3 files changed, 351 insertions(+), 16 deletions(-) diff --git a/tensorflow/contrib/lookup/__init__.py b/tensorflow/contrib/lookup/__init__.py index 8717c3fa2b..99bf5fafb8 100644 --- a/tensorflow/contrib/lookup/__init__.py +++ b/tensorflow/contrib/lookup/__init__.py @@ -15,9 +15,12 @@ """Ops for lookup operations. @@string_to_index +@@string_to_index_table_from_file +@@string_to_index_table_from_tensor @@index_to_string @@LookupInterface @@InitializableLookupTableBase +@@IdTableWithHashBuckets @@HashTable @@MutableHashTable @@TableInitializerBase diff --git a/tensorflow/contrib/lookup/lookup_ops.py b/tensorflow/contrib/lookup/lookup_ops.py index 20d2627a67..48570dfeb9 100644 --- a/tensorflow/contrib/lookup/lookup_ops.py +++ b/tensorflow/contrib/lookup/lookup_ops.py @@ -32,6 +32,7 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops import string_ops from tensorflow.python.training.saver import BaseSaverBuilder from tensorflow.python.util import compat +from tensorflow.python.util.deprecation import deprecated class LookupInterface(object): @@ -775,6 +776,191 @@ class IdTableWithHashBuckets(LookupInterface): return ids +def string_to_index_table_from_file(vocabulary_file=None, + num_oov_buckets=0, + vocab_size=None, + default_value=-1, + hasher_spec=FastHashSpec, + name=None): + """Returns a lookup table that converts a string tensor into int64 IDs. + + This operation constructs a lookup table to convert tensor of strings into + int64 IDs. The mapping can be initialized from a vocabulary file specified in + `vocabulary_file`, where the whole line is the key and the zero-based line + number is the ID. + + Any lookup of an out-of-vocabulary token will return a bucket ID based on its + hash if `num_oov_buckets` is greater than zero. Otherwise it is assigned the + `default_value`. + The bucket ID range is `[vocabulary size, vocabulary size + num_oov_buckets]`. + + The underlying table must be initialized by calling + `tf.initialize_all_tables.run()` or `table.init.run()` once. + + Sample Usages: + + If we have a vocabulary file "test.txt" with the following content: + + ``` + emerson + lake + palmer + ``` + + ```python + features = tf.constant(["emerson", "lake", "and", "palmer"]) + table = tf.contrib.lookup.string_to_index_table_from_file( + vocabulary_file="test.txt", num_oov_buckets=1) + ids = table.lookup(features) + ... + tf.initialize_all_tables().run() + + ids.eval() ==> [0, 1, 3, 2] # where 3 is the out-of-vocabulary bucket + ``` + + Args: + vocabulary_file: The vocabulary filename. + num_oov_buckets: The number of out-of-vocabulary buckets. + vocab_size: Number of the elements in the vocabulary, if known. + default_value: The value to use for out-of-vocabulary feature values. + Defaults to -1. + hasher_spec: A `HasherSpec` to specify the hash function to use for + assignation of out-of-vocabulary buckets. + name: A name for this op (optional). + + Returns: + The lookup table to map a string `Tensor` to index `int64` `Tensor`. + + Raises: + ValueError: If `vocabulary_file` is not set. + ValueError: If `num_oov_buckets` is negative or `vocab_size` is not greater + than zero. + """ + if not vocabulary_file: + raise ValueError("vocabulary_file must be specified.") + if num_oov_buckets < 0: + raise ValueError("num_oov_buckets must be greater or equal than 0, got %d." + % num_oov_buckets) + if vocab_size is not None and vocab_size < 1: + raise ValueError("vocab_size must be greater than 0, got %d." % vocab_size) + + with ops.name_scope(name, "string_to_index") as feat_to_id_scope: + table = None + shared_name = "" + with ops.name_scope(None, "hash_table") as hash_table_scope: + if vocab_size: + # Keep the shared_name: + # ____ + shared_name = "hash_table_%s_%d_%s_%s" % (vocabulary_file, vocab_size, + TextFileIndex.WHOLE_LINE, + TextFileIndex.LINE_NUMBER) + else: + # Keep the shared_name + # ___ + shared_name = "hash_table_%s_%s_%s" % (vocabulary_file, + TextFileIndex.WHOLE_LINE, + TextFileIndex.LINE_NUMBER) + init = TextFileIdTableInitializer( + vocabulary_file, vocab_size=vocab_size, name="table_init") + + table = HashTable( + init, default_value, shared_name=shared_name, name=hash_table_scope) + if num_oov_buckets: + table = IdTableWithHashBuckets( + table, + num_oov_buckets=num_oov_buckets, + hasher_spec=hasher_spec, + name=feat_to_id_scope) + + return table + + +def string_to_index_table_from_tensor(mapping, + num_oov_buckets=0, + default_value=-1, + hasher_spec=FastHashSpec, + name=None): + """Returns a lookup table that converts a string tensor into int64 IDs. + + This operation constructs a lookup table to convert tensor of strings into + int64 IDs. The mapping can be initialized from a string `mapping` 1-D tensor + where each element is a key and corresponding index within the tensor is the + value. + + Any lookup of an out-of-vocabulary token will return a bucket ID based on its + hash if `num_oov_buckets` is greater than zero. Otherwise it is assigned the + `default_value`. + The bucket ID range is `[mapping size, mapping size + num_oov_buckets]`. + + The underlying table must be initialized by calling + `tf.initialize_all_tables.run()` or `table.init.run()` once. + + Elements in `mapping` cannot have duplicates, otherwise when executing the + table initializer op, it will throw a `FailedPreconditionError`. + + Sample Usages: + + ```python + mapping_strings = t.constant(["emerson", "lake", "palmer") + table = tf.contrib.lookup.string_to_index_table_from_tensor( + mapping=mapping_strings, num_oov_buckets=1, default_value=-1) + features = tf.constant(["emerson", "lake", "and", "palmer"]) + ids = table.lookup(features) + ... + tf.initialize_all_tables().run() + + ids.eval() ==> [0, 1, 4, 2] + ``` + + Args: + mapping: A 1-D string `Tensor` that specifies the mapping of strings to + indices. + num_oov_buckets: The number of out-of-vocabulary buckets. + default_value: The value to use for out-of-vocabulary feature values. + Defaults to -1. + hasher_spec: A `HasherSpec` to specify the hash function to use for + assignation of out-of-vocabulary buckets. + name: A name for this op (optional). + + Returns: + The lookup table to map a string `Tensor` to index `int64` `Tensor`. + + Raises: + ValueError: `mapping` is invalid. + ValueError: If `num_oov_buckets` is negative. + """ + if mapping is None: + raise ValueError("mapping must be specified.") + + if num_oov_buckets < 0: + raise ValueError("num_oov_buckets must be greater or equal than 0, got %d." + % num_oov_buckets) + + with ops.name_scope(name, "string_to_index") as feat_to_id_scope: + keys = ops.convert_to_tensor(mapping, dtypes.string) + num_elements = array_ops.size(keys) + values = math_ops.cast(math_ops.range(num_elements), dtypes.int64) + + shared_name = "" + with ops.name_scope(None, "hash_table") as hash_table_scope: + init = KeyValueTensorInitializer( + keys, values, dtypes.string, dtypes.int64, name="table_init") + table = HashTable( + init, default_value, shared_name=shared_name, name=hash_table_scope) + if num_oov_buckets: + table = IdTableWithHashBuckets( + table, + num_oov_buckets=num_oov_buckets, + hasher_spec=hasher_spec, + name=feat_to_id_scope) + + return table + + +@deprecated( + "2017-01-07", "This op will be removed after the deprecation date. " + "Please switch to string_to_index_table_from_tensor and call the lookup " + "method of the returned table.") def string_to_index(tensor, mapping, default_value=-1, name=None): """Maps `tensor` of strings into `int64` indices based on `mapping`. @@ -816,22 +1002,9 @@ def string_to_index(tensor, mapping, default_value=-1, name=None): The mapped indices. It has the same shape and tensor type (dense or sparse) as `tensor`. """ - with ops.name_scope(name, "string_to_index", [tensor]) as scope: - shared_name = "" - keys = ops.convert_to_tensor(mapping, dtypes.string) - vocab_size = array_ops.size(keys) - values = math_ops.cast(math_ops.range(vocab_size), dtypes.int64) - init = KeyValueTensorInitializer(keys, - values, - dtypes.string, - dtypes.int64, - name="table_init") - - t = HashTable(init, - default_value, - shared_name=shared_name, - name="hash_table") - return t.lookup(tensor, name=scope) + table = string_to_index_table_from_tensor( + mapping=mapping, default_value=default_value, name=name) + return table.lookup(tensor) def index_to_string(tensor, mapping, default_value="UNK", name=None): diff --git a/tensorflow/contrib/lookup/lookup_ops_test.py b/tensorflow/contrib/lookup/lookup_ops_test.py index 940b624161..40a6d4d70e 100644 --- a/tensorflow/contrib/lookup/lookup_ops_test.py +++ b/tensorflow/contrib/lookup/lookup_ops_test.py @@ -1123,6 +1123,165 @@ class MutableDenseHashTableOpTest(tf.test.TestCase): self.assertAllEqual(0, table2.size().eval()) +class StringToIndexTableFromFile(tf.test.TestCase): + + def _createVocabFile(self, basename): + vocabulary_file = os.path.join(self.get_temp_dir(), basename) + with open(vocabulary_file, "w") as f: + f.write("\n".join(["brain", "salad", "surgery"]) + "\n") + return vocabulary_file + + def test_string_to_index_table_from_file(self): + vocabulary_file = self._createVocabFile("f2i_vocab1.txt") + with self.test_session(): + table = tf.contrib.lookup.string_to_index_table_from_file( + vocabulary_file=vocabulary_file, num_oov_buckets=1) + ids = table.lookup(tf.constant(["salad", "surgery", "tarkus"])) + + self.assertRaises(tf.OpError, ids.eval) + tf.initialize_all_tables().run() + self.assertAllEqual((1, 2, 3), ids.eval()) + + def test_string_to_index_table_from_file_with_default_value(self): + default_value = -42 + vocabulary_file = self._createVocabFile("f2i_vocab2.txt") + with self.test_session(): + table = tf.contrib.lookup.string_to_index_table_from_file( + vocabulary_file=vocabulary_file, default_value=default_value) + ids = table.lookup(tf.constant(["salad", "surgery", "tarkus"])) + + self.assertRaises(tf.OpError, ids.eval) + tf.initialize_all_tables().run() + self.assertAllEqual((1, 2, default_value), ids.eval()) + + def test_string_to_index_table_from_file_with_oov_buckets(self): + vocabulary_file = self._createVocabFile("f2i_vocab3.txt") + with self.test_session(): + table = tf.contrib.lookup.string_to_index_table_from_file( + vocabulary_file=vocabulary_file, num_oov_buckets=1000) + ids = table.lookup(tf.constant(["salad", "surgery", "tarkus", "toccata"])) + + self.assertRaises(tf.OpError, ids.eval) + tf.initialize_all_tables().run() + self.assertAllEqual( + ( + 1, # From vocabulary file. + 2, # From vocabulary file. + 867, # 3 + fingerprint("tarkus") mod 300. + 860), # 3 + fingerprint("toccata") mod 300. + ids.eval()) + + def test_string_to_index_table_from_file_with_only_oov_buckets(self): + self.assertRaises( + ValueError, + tf.contrib.lookup.string_to_index_table_from_file, + vocabulary_file=None) + + def test_string_to_index_table_from_file_with_vocab_size_too_small(self): + vocabulary_file = self._createVocabFile("f2i_vocab5.txt") + with self.test_session(): + table = tf.contrib.lookup.string_to_index_table_from_file( + vocabulary_file=vocabulary_file, vocab_size=2) + ids = table.lookup(tf.constant(["salad", "surgery", "tarkus"])) + + self.assertRaises(tf.OpError, ids.eval) + tf.initialize_all_tables().run() + self.assertAllEqual((1, -1, -1), ids.eval()) + self.assertEqual(2, table.size().eval()) + + def test_string_to_index_table_from_file_with_vocab_size_too_large(self): + vocabulary_file = self._createVocabFile("f2i_vocab6.txt") + with self.test_session(): + table = tf.contrib.lookup.string_to_index_table_from_file( + vocabulary_file=vocabulary_file, vocab_size=4) + self.assertRaisesRegexp(tf.errors.InvalidArgumentError, + "Invalid vocab_size", table.init.run) + + def test_string_to_index_table_from_file_with_vocab_size(self): + vocabulary_file = self._createVocabFile("f2i_vocab7.txt") + + self.assertRaises( + ValueError, + tf.contrib.lookup.string_to_index_table_from_file, + vocabulary_file=vocabulary_file, + vocab_size=0) + + with self.test_session(): + table = tf.contrib.lookup.string_to_index_table_from_file( + vocabulary_file=vocabulary_file, vocab_size=3) + ids = table.lookup(tf.constant(["salad", "surgery", "tarkus"])) + + self.assertRaises(tf.OpError, ids.eval) + tf.initialize_all_tables().run() + self.assertAllEqual((1, 2, -1), ids.eval()) + self.assertEqual(3, table.size().eval()) + + def test_string_to_index_table_from_file_with_invalid_hashers(self): + vocabulary_file = self._createVocabFile("invalid_hasher.txt") + with self.test_session(): + with self.assertRaises(TypeError): + tf.contrib.lookup.string_to_index_table_from_file( + vocabulary_file=vocabulary_file, + vocab_size=3, + num_oov_buckets=1, + hasher_spec=1) + + table = tf.contrib.lookup.string_to_index_table_from_file( + vocabulary_file=vocabulary_file, + vocab_size=3, + num_oov_buckets=1, + hasher_spec=tf.contrib.lookup.HasherSpec("my-awesome-hash", None)) + + self.assertRaises(ValueError, table.lookup, + tf.constant(["salad", "surgery", "tarkus"])) + + +class StringToIndexTableFromTensor(tf.test.TestCase): + + def test_string_to_index_table_from_tensor_with_tensor_init(self): + with self.test_session(): + table = tf.contrib.lookup.string_to_index_table_from_tensor( + mapping=["brain", "salad", "surgery"], num_oov_buckets=1) + ids = table.lookup(tf.constant(["salad", "surgery", "tarkus"])) + + self.assertRaises(tf.OpError, ids.eval) + tf.initialize_all_tables().run() + self.assertAllEqual((1, 2, 3), ids.eval()) + + def test_string_to_index_table_from_tensor_with_default_value(self): + default_value = -42 + with self.test_session(): + table = tf.contrib.lookup.string_to_index_table_from_tensor( + mapping=["brain", "salad", "surgery"], default_value=default_value) + ids = table.lookup(tf.constant(["salad", "surgery", "tarkus"])) + + self.assertRaises(tf.OpError, ids.eval) + tf.initialize_all_tables().run() + self.assertAllEqual((1, 2, default_value), ids.eval()) + + def test_string_to_index_table_from_tensor_with_only_oov_buckets(self): + with self.test_session(): + with self.assertRaises(ValueError): + tf.contrib.lookup.string_to_index_table_from_tensor( + mapping=None, num_oov_buckets=1) + + def test_string_to_index_table_from_tensor_with_invalid_hashers(self): + with self.test_session(): + with self.assertRaises(TypeError): + tf.contrib.lookup.string_to_index_table_from_tensor( + mapping=["brain", "salad", "surgery"], + num_oov_buckets=1, + hasher_spec=1) + + table = tf.contrib.lookup.string_to_index_table_from_tensor( + mapping=["brain", "salad", "surgery"], + num_oov_buckets=1, + hasher_spec=tf.contrib.lookup.HasherSpec("my-awesome-hash", None)) + + self.assertRaises(ValueError, table.lookup, + tf.constant(["salad", "surgery", "tarkus"])) + + class StringToIndexTest(tf.test.TestCase): def test_string_to_index(self): -- cgit v1.2.3