author    A. Unique TensorFlower <gardener@tensorflow.org>    2017-08-26 00:00:13 -0700
committer TensorFlower Gardener <gardener@tensorflow.org>     2017-08-26 00:04:15 -0700
commit ea945f55ffef13c92a3065baa234a4dc58983ea3 (patch)
tree   72a4cbd65b4e1d4956f69c038b9eb510c60ffe4e /tensorflow/examples/tutorials
parent 17f26f81bfaf8ee03e330b98f4297cb754676c35 (diff)
Adding some comments explaining the type and meaning of variables.
PiperOrigin-RevId: 166564600
Diffstat (limited to 'tensorflow/examples/tutorials')
-rw-r--r--  tensorflow/examples/tutorials/word2vec/word2vec_basic.py  14
1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
index d73b1c6373..6d98c7b85d 100644
--- a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
+++ b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
@@ -86,6 +86,12 @@ def build_dataset(words, n_words):
  reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
  return data, count, dictionary, reversed_dictionary
+# Filling 4 global variables:
+# data - list of codes (integers from 0 to vocabulary_size - 1).
+#   This is the original text, but with words replaced by their codes.
+# count - map of words (strings) to their counts of occurrences.
+# dictionary - map of words (strings) to their codes (integers).
+# reverse_dictionary - map of codes (integers) to words (strings).
data, count, dictionary, reverse_dictionary = build_dataset(vocabulary,
                                                            vocabulary_size)
del vocabulary  # Hint to reduce memory.
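
To make the four structures concrete, here is a toy illustration (hypothetical corpus and values, assuming build_dataset assigns codes by descending frequency and reserves code 0 for 'UNK'):

words = ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'dog']
data, count, dictionary, reverse_dictionary = build_dataset(words, 5)
# count              -> [['UNK', 3], ('the', 2), ('quick', 1), ('brown', 1), ('fox', 1)]
# dictionary         -> {'UNK': 0, 'the': 1, 'quick': 2, 'brown': 3, 'fox': 4}
# reverse_dictionary -> {0: 'UNK', 1: 'the', 2: 'quick', 3: 'brown', 4: 'fox'}
# data               -> [1, 2, 3, 4, 0, 0, 1, 0]  (the corpus, word by word, as codes)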
@@ -136,14 +142,16 @@ batch_size = 128
embedding_size = 128 # Dimension of the embedding vector.
skip_window = 1 # How many words to consider left and right.
num_skips = 2 # How many times to reuse an input to generate a label.
+num_sampled = 64 # Number of negative examples to sample.
# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
-# construction are also the most frequent.
+# construction are also the most frequent. These 3 variables are used only for
+# displaying model accuracy; they do not affect the calculation.
valid_size = 16 # Random set of words to evaluate similarity on.
valid_window = 100 # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
-num_sampled = 64 # Number of negative examples to sample.
+
graph = tf.Graph()
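
To make skip_window and num_skips concrete, here is a minimal sketch of how skip-gram (input, label) pairs could be drawn for one center word. It is a simplified stand-in for the generate_batch function in this file, not its exact code:

import random

def skipgram_pairs(data, center, skip_window, num_skips):
  # Candidate context positions: up to skip_window words on either side.
  window = [i for i in range(center - skip_window, center + skip_window + 1)
            if i != center and 0 <= i < len(data)]
  # Reuse the center word num_skips times, each with a distinct context word.
  picks = random.sample(window, min(num_skips, len(window)))
  return [(data[center], data[i]) for i in picks]

# With skip_window=1 and num_skips=2, the center word of data=[5, 2, 7]
# (code 2) yields the pairs (2, 5) and (2, 7), in random order.
print(skipgram_pairs([5, 2, 7], center=1, skip_window=1, num_skips=2))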
@@ -170,6 +178,8 @@ with graph.as_default():
  # Compute the average NCE loss for the batch.
  # tf.nn.nce_loss automatically draws a new sample of the negative labels each
  # time we evaluate the loss.
+  # Explanation of the meaning of NCE loss:
+  # http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
  loss = tf.reduce_mean(
      tf.nn.nce_loss(weights=nce_weights,
                     biases=nce_biases,
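
For intuition only, a NumPy sketch of the quantity tf.nn.nce_loss approximates per example: a logistic loss that pushes the score of the true (center, context) pair up and the scores of num_sampled randomly drawn noise words down. This is a simplification; the real op also corrects each logit by the log-probability of the noise distribution:

import numpy as np

def sigmoid(x):
  return 1.0 / (1.0 + np.exp(-x))

def nce_loss_sketch(embed, nce_weights, nce_biases, true_word, noise_words):
  # Score of a candidate word's output vector against the center embedding.
  def logit(w):
    return np.dot(nce_weights[w], embed) + nce_biases[w]
  loss = -np.log(sigmoid(logit(true_word)))  # true context word: label 1
  for w in noise_words:
    loss -= np.log(sigmoid(-logit(w)))       # sampled noise word: label 0
  return loss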