author    A. Unique TensorFlower <gardener@tensorflow.org>    2017-08-26 00:00:13 -0700
committer TensorFlower Gardener <gardener@tensorflow.org>     2017-08-26 00:04:15 -0700
commit ea945f55ffef13c92a3065baa234a4dc58983ea3 (patch)
tree   72a4cbd65b4e1d4956f69c038b9eb510c60ffe4e /tensorflow/examples/tutorials
parent 17f26f81bfaf8ee03e330b98f4297cb754676c35 (diff)
Adding some comments explaining the type and meaning of variables.
PiperOrigin-RevId: 166564600
Diffstat (limited to 'tensorflow/examples/tutorials')
-rw-r--r--  tensorflow/examples/tutorials/word2vec/word2vec_basic.py  14
1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
index d73b1c6373..6d98c7b85d 100644
--- a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
+++ b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
@@ -86,6 +86,12 @@ def build_dataset(words, n_words):
  reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
  return data, count, dictionary, reversed_dictionary
+# Filling 4 global variables:
+# data - list of codes (integers from 0 to vocabulary_size - 1).
+#   This is the original text, but with words replaced by their codes.
+# count - map of words (strings) to their counts of occurrences.
+# dictionary - map of words (strings) to their codes (integers).
+# reverse_dictionary - map of codes (integers) to words (strings).
data, count, dictionary, reverse_dictionary = build_dataset(vocabulary,
                                                            vocabulary_size)
del vocabulary  # Hint to reduce memory.
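
To make the four structures concrete, here is a toy illustration (hypothetical corpus and values, assuming build_dataset assigns codes by descending frequency and reserves code 0 for 'UNK'):

words = ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'dog']
data, count, dictionary, reverse_dictionary = build_dataset(words, 5)
# count              -> [['UNK', 3], ('the', 2), ('quick', 1), ('brown', 1), ('fox', 1)]
# dictionary         -> {'UNK': 0, 'the': 1, 'quick': 2, 'brown': 3, 'fox': 4}
# reverse_dictionary -> {0: 'UNK', 1: 'the', 2: 'quick', 3: 'brown', 4: 'fox'}
# data               -> [1, 2, 3, 4, 0, 0, 1, 0]  (the corpus, word by word, as codes)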
@@ -136,14 +142,16 @@ batch_size = 128
embedding_size = 128 # Dimension of the embedding vector.
skip_window = 1 # How many words to consider left and right.
num_skips = 2 # How many times to reuse an input to generate a label.
+num_sampled = 64 # Number of negative examples to sample.
# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
-# construction are also the most frequent.
+# construction are also the most frequent. These 3 variables are used only for
+# displaying model accuracy; they do not affect the calculation.
valid_size = 16 # Random set of words to evaluate similarity on.
valid_window = 100 # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
-num_sampled = 64 # Number of negative examples to sample.
+
graph = tf.Graph()
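
To make skip_window and num_skips concrete, here is a minimal sketch of how skip-gram (input, label) pairs could be drawn for one center word. It is a simplified stand-in for the generate_batch function in this file, not its exact code:

import random

def skipgram_pairs(data, center, skip_window, num_skips):
  # Candidate context positions: up to skip_window words on either side.
  window = [i for i in range(center - skip_window, center + skip_window + 1)
            if i != center and 0 <= i < len(data)]
  # Reuse the center word num_skips times, each with a distinct context word.
  picks = random.sample(window, min(num_skips, len(window)))
  return [(data[center], data[i]) for i in picks]

# With skip_window=1 and num_skips=2, the center word of data=[5, 2, 7]
# (code 2) yields the pairs (2, 5) and (2, 7), in random order.
print(skipgram_pairs([5, 2, 7], center=1, skip_window=1, num_skips=2))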
@@ -170,6 +178,8 @@ with graph.as_default():
  # Compute the average NCE loss for the batch.
  # tf.nn.nce_loss automatically draws a new sample of the negative labels each
  # time we evaluate the loss.
+  # Explanation of the meaning of NCE loss:
+  # http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
  loss = tf.reduce_mean(
      tf.nn.nce_loss(weights=nce_weights,
                     biases=nce_biases,
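
For intuition only, a NumPy sketch of the quantity tf.nn.nce_loss approximates per example: a logistic loss that pushes the score of the true (center, context) pair up and the scores of num_sampled randomly drawn noise words down. This is a simplification; the real op also corrects each logit by the log-probability of the noise distribution:

import numpy as np

def sigmoid(x):
  return 1.0 / (1.0 + np.exp(-x))

def nce_loss_sketch(embed, nce_weights, nce_biases, true_word, noise_words):
  # Score of a candidate word's output vector against the center embedding.
  def logit(w):
    return np.dot(nce_weights[w], embed) + nce_biases[w]
  loss = -np.log(sigmoid(logit(true_word)))  # true context word: label 1
  for w in noise_words:
    loss -= np.log(sigmoid(-logit(w)))       # sampled noise word: label 0
  return loss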