author    2017-08-26 00:00:13 -0700
committer 2017-08-26 00:04:15 -0700
commit    ea945f55ffef13c92a3065baa234a4dc58983ea3 (patch)
tree      72a4cbd65b4e1d4956f69c038b9eb510c60ffe4e /tensorflow/examples/tutorials
parent    17f26f81bfaf8ee03e330b98f4297cb754676c35 (diff)
Adding some comments explaining type and meaning of variables.
PiperOrigin-RevId: 166564600
Diffstat (limited to 'tensorflow/examples/tutorials')
-rw-r--r--  tensorflow/examples/tutorials/word2vec/word2vec_basic.py  14
1 file changed, 12 insertions, 2 deletions
```diff
diff --git a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
index d73b1c6373..6d98c7b85d 100644
--- a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
+++ b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
@@ -86,6 +86,12 @@ def build_dataset(words, n_words):
   reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
   return data, count, dictionary, reversed_dictionary
 
+# Filling 4 global variables:
+# data - list of codes (integers from 0 to vocabulary_size-1).
+#   This is the original text but words are replaced by their codes
+# count - map of words(strings) to count of occurences
+# dictionary - map of words(strings) to their codes(integers)
+# reverse_dictionary - maps codes(integers) to words(strings)
 data, count, dictionary, reverse_dictionary = build_dataset(vocabulary,
                                                             vocabulary_size)
 del vocabulary  # Hint to reduce memory.
@@ -136,14 +142,16 @@ batch_size = 128
 embedding_size = 128  # Dimension of the embedding vector.
 skip_window = 1  # How many words to consider left and right.
 num_skips = 2  # How many times to reuse an input to generate a label.
+num_sampled = 64  # Number of negative examples to sample.
 
 # We pick a random validation set to sample nearest neighbors. Here we limit the
 # validation samples to the words that have a low numeric ID, which by
-# construction are also the most frequent.
+# construction are also the most frequent. These 3 variables are used only for
+# displaying model accuracy, they don't affect calculation.
 valid_size = 16  # Random set of words to evaluate similarity on.
 valid_window = 100  # Only pick dev samples in the head of the distribution.
 valid_examples = np.random.choice(valid_window, valid_size, replace=False)
-num_sampled = 64  # Number of negative examples to sample.
+
 
 graph = tf.Graph()
 
@@ -170,6 +178,8 @@ with graph.as_default():
   # Compute the average NCE loss for the batch.
   # tf.nce_loss automatically draws a new sample of the negative labels each
   # time we evaluate the loss.
+  # Explanation of the meaning of NCE loss:
+  #   http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
   loss = tf.reduce_mean(
       tf.nn.nce_loss(weights=nce_weights,
                      biases=nce_biases,
```
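For readers orienting themselves around the new comments, here is a small, self-contained sketch of what the four globals look like on a toy corpus. The simplified `build_dataset` body and the example sentence are illustrative assumptions; only the variable names and the return statement mirror word2vec_basic.py.

```python
import collections

def build_dataset(words, n_words):
  """Simplified stand-in for the tutorial's build_dataset (illustration only)."""
  # count: [word, occurrence_count] pairs for the n_words most frequent words,
  # with an 'UNK' bucket collecting everything else.
  count = [['UNK', -1]]
  count.extend(collections.Counter(words).most_common(n_words - 1))
  # dictionary: word (string) -> code (integer), assigned in frequency order.
  dictionary = {word: i for i, (word, _) in enumerate(count)}
  # data: the original text with every word replaced by its code (0 = 'UNK').
  data = [dictionary.get(word, 0) for word in words]
  count[0][1] = data.count(0)
  # reversed_dictionary: code (integer) -> word (string).
  reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
  return data, count, dictionary, reversed_dictionary

words = 'the quick brown fox jumps over the lazy dog the fox'.split()
data, count, dictionary, reverse_dictionary = build_dataset(words, n_words=5)
print(data)                                   # codes in frequency order, 0 = 'UNK'
print(dictionary['the'])                      # code of the most frequent word
print(reverse_dictionary[dictionary['fox']])  # 'fox'
```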
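The relocated `num_sampled` hyperparameter feeds directly into the `tf.nn.nce_loss` call shown at the end of the diff. The TF 1.x sketch below reconstructs that wiring; the variable names follow the diff context, but the surrounding shapes and defaults are assumptions taken from the tutorial rather than quoted from this commit.

```python
import math
import tensorflow as tf  # TF 1.x API, as used by the tutorial at this commit

vocabulary_size = 50000   # assumed tutorial default
batch_size = 128
embedding_size = 128      # Dimension of the embedding vector.
num_sampled = 64          # Number of negative examples to sample.

graph = tf.Graph()
with graph.as_default():
  train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
  train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])

  # Input-side embeddings: one row per vocabulary word.
  embeddings = tf.Variable(
      tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
  embed = tf.nn.embedding_lookup(embeddings, train_inputs)

  # Output-side weights and biases used by the NCE objective.
  nce_weights = tf.Variable(
      tf.truncated_normal([vocabulary_size, embedding_size],
                          stddev=1.0 / math.sqrt(embedding_size)))
  nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

  # tf.nn.nce_loss draws `num_sampled` fresh negative classes every time the
  # loss is evaluated and contrasts them with the true label.
  loss = tf.reduce_mean(
      tf.nn.nce_loss(weights=nce_weights,
                     biases=nce_biases,
                     labels=train_labels,
                     inputs=embed,
                     num_sampled=num_sampled,
                     num_classes=vocabulary_size))
```

Sampling 64 negatives per step instead of normalizing over the full vocabulary is what keeps the skip-gram objective cheap to train.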
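The note that the three validation variables are "used only for displaying model accuracy" refers to the nearest-neighbor readout built from cosine similarity further down the file. Below is a hedged, self-contained sketch of that readout, again reconstructed from the tutorial rather than quoted from this commit.

```python
import numpy as np
import tensorflow as tf  # TF 1.x API

vocabulary_size = 50000
embedding_size = 128
valid_size = 16      # Random set of words to evaluate similarity on.
valid_window = 100   # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

graph = tf.Graph()
with graph.as_default():
  embeddings = tf.Variable(
      tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

  # L2-normalize the embeddings so that a matmul yields cosine similarity.
  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
  normalized_embeddings = embeddings / norm
  valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                            valid_dataset)
  # `similarity` is only evaluated to print nearest neighbors during training;
  # it is not connected to the NCE loss, so it does not affect the learned
  # embeddings.
  similarity = tf.matmul(
      valid_embeddings, normalized_embeddings, transpose_b=True)
```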