author    | Vijay Vasudevan <vrv@google.com>      | 2016-02-01 16:39:16 -0800
committer | Manjunath Kudlur <keveman@gmail.com>  | 2016-02-02 08:35:00 -0800
commit    | a8d2f0983ecdea8ff2526c717d6a9b2f06f403d8 (patch)
tree      | 485daa70344be4240e68f89c016231bfdab242d1 /tensorflow/examples/udacity
parent    | 5ff6d34a05b2eb49f7a79a4d0b78ada9a6842b6c (diff)
Minor formatting fixes.
Change: 113582098
Diffstat (limited to 'tensorflow/examples/udacity')
-rw-r--r-- | tensorflow/examples/udacity/1_notmnist.ipynb     | 305
-rw-r--r-- | tensorflow/examples/udacity/4_convolutions.ipynb |   4
-rw-r--r-- | tensorflow/examples/udacity/5_word2vec.ipynb     |  54
-rw-r--r-- | tensorflow/examples/udacity/README.md            |  12
4 files changed, 243 insertions, 132 deletions
diff --git a/tensorflow/examples/udacity/1_notmnist.ipynb b/tensorflow/examples/udacity/1_notmnist.ipynb
index b2a06ba115..661ea4df92 100644
--- a/tensorflow/examples/udacity/1_notmnist.ipynb
+++ b/tensorflow/examples/udacity/1_notmnist.ipynb
@@ -113,7 +113,7 @@
  " filename, _ = urlretrieve(url + filename, filename)\n",
  " statinfo = os.stat(filename)\n",
  " if statinfo.st_size == expected_bytes:\n",
- " print 'Found and verified', filename\n",
+ " print('Found and verified', filename)\n",
  " else:\n",
  " raise Exception(\n",
  " 'Failed to verify' + filename + '. Can you get to it with a browser?')\n",
@@ -237,9 +237,9 @@ "colab_type": "text"
  },
  "source": [
- "Now let's load the data in a more manageable format.\n",
+ "Now let's load the data in a more manageable format. Since, depending on your computer setup you might not be able to fit it all in memory, we'll load each class into a separate dataset, store them on disk and curate them independently. Later we'll merge them into a single dataset of manageable size.\n",
  "\n",
- "We'll convert the entire dataset into a 3D array (image index, x, y) of floating point values, normalized to have approximately zero mean and standard deviation ~0.5 to make training easier down the road. The labels will be stored into a separate array of integers 0 through 9.\n",
+ "We'll convert the entire dataset into a 3D array (image index, x, y) of floating point values, normalized to have approximately zero mean and standard deviation ~0.5 to make training easier down the road. \n",
  "\n",
  "A few images might not be readable, we'll just skip them."
  ]
 },
@@ -283,83 +283,143 @@
  "image_size = 28 # Pixel width and height.\n",
  "pixel_depth = 255.0 # Number of levels per pixel.\n",
  "\n",
- "def load(data_folders, min_num_images, max_num_images):\n",
- " dataset = np.ndarray(\n",
- " shape=(max_num_images, image_size, image_size), dtype=np.float32)\n",
- " labels = np.ndarray(shape=(max_num_images), dtype=np.int32)\n",
- " label_index = 0\n",
- " image_index = 0\n",
- " for folder in data_folders:\n",
- " print(folder)\n",
+ "def load_letter(folder, min_num_images):\n",
+ " image_files = os.listdir(folder)\n",
+ " dataset = np.ndarray(shape=(len(image_files), image_size, image_size),\n",
+ " dtype=np.float32)\n",
+ " image_index = 0\n",
+ " print(folder)\n",
  " for image in os.listdir(folder):\n",
- " if image_index >= max_num_images:\n",
- " raise Exception('More images than expected: %d >= %d' % (\n",
- " image_index, max_num_images))\n",
  " image_file = os.path.join(folder, image)\n",
  " try:\n",
- " image_data = (ndimage.imread(image_file).astype(float) -\n",
+ " image_data = (ndimage.imread(image_file).astype(float) - \n",
  " pixel_depth / 2) / pixel_depth\n",
  " if image_data.shape != (image_size, image_size):\n",
  " raise Exception('Unexpected image shape: %s' % str(image_data.shape))\n",
  " dataset[image_index, :, :] = image_data\n",
- " labels[image_index] = label_index\n",
  " image_index += 1\n",
  " except IOError as e:\n",
  " print('Could not read:', image_file, ':', e, '- it\'s ok, skipping.')\n",
- " label_index += 1\n",
- " num_images = image_index\n",
- " dataset = dataset[0:num_images, :, :]\n",
- " labels = labels[0:num_images]\n",
- " if num_images < min_num_images:\n",
- " raise Exception('Many fewer images than expected: %d < %d' % (\n",
- " num_images, min_num_images))\n",
- " print('Full dataset tensor:', dataset.shape)\n",
- " print('Mean:', np.mean(dataset))\n",
- " print('Standard deviation:', np.std(dataset))\n",
- " print('Labels:', labels.shape)\n",
- " return dataset, labels\n",
- "train_dataset, train_labels = load(train_folders, 450000, 550000)\n",
- "test_dataset, test_labels = load(test_folders, 18000, 20000)"
+ " \n",
+ " num_images = image_index\n",
+ " dataset = dataset[0:num_images, :, :]\n",
+ " if num_images < min_num_images:\n",
+ " raise Exception('Many fewer images than expected: %d < %d' % \n",
+ " (num_images, min_num_images))\n",
+ " \n",
+ " print('Full dataset tensor:', dataset.shape)\n",
+ " print('Mean:', np.mean(dataset))\n",
+ " print('Standard deviation:', np.std(dataset))\n",
+ " return dataset\n",
+ " \n",
+ "def load(data_folders, min_num_images_per_class):\n",
+ " dataset_names = []\n",
+ " for folder in data_folders:\n",
+ " dataset = load_letter(folder, min_num_images_per_class)\n",
+ " set_filename = folder + '.pickle'\n",
+ " try:\n",
+ " with open(set_filename, 'wb') as f:\n",
+ " pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)\n",
+ " dataset_names.append(set_filename)\n",
+ " except Exception as e:\n",
+ " print('Unable to save data to', set_filename, ':', e)\n",
+ " \n",
+ " return dataset_names\n",
+ "\n",
+ "train_datasets = load(train_folders, 45000)\n",
+ "test_datasets = load(test_folders, 1800)"
  ],
  "outputs": [
  {
  "output_type": "stream",
  "text": [
  "notMNIST_large/A\n",
- "Could not read: notMNIST_large/A/SG90IE11c3RhcmQgQlROIFBvc3Rlci50dGY=.png : cannot identify image file - it's ok, skipping.\n",
- "Could not read: notMNIST_large/A/RnJlaWdodERpc3BCb29rSXRhbGljLnR0Zg==.png : cannot identify image file - it's ok, skipping.\n",
  "Could not read: notMNIST_large/A/Um9tYW5hIEJvbGQucGZi.png : cannot identify image file - it's ok, skipping.\n",
+ "Could not read: notMNIST_large/A/RnJlaWdodERpc3BCb29rSXRhbGljLnR0Zg==.png : cannot identify image file - it's ok, skipping.\n",
+ "Could not read: notMNIST_large/A/SG90IE11c3RhcmQgQlROIFBvc3Rlci50dGY=.png : cannot identify image file - it's ok, skipping.\n",
+ "Full dataset tensor: (52909, 28, 28)\n",
+ "Mean: -0.12848\n",
+ "Standard deviation: 0.425576\n",
  "notMNIST_large/B\n",
  "Could not read: notMNIST_large/B/TmlraXNFRi1TZW1pQm9sZEl0YWxpYy5vdGY=.png : cannot identify image file - it's ok, skipping.\n",
+ "Full dataset tensor: (52911, 28, 28)\n",
+ "Mean: -0.00755947\n",
+ "Standard deviation: 0.417272\n",
  "notMNIST_large/C\n",
+ "Full dataset tensor: (52912, 28, 28)\n",
+ "Mean: -0.142321\n",
+ "Standard deviation: 0.421305\n",
  "notMNIST_large/D\n",
  "Could not read: notMNIST_large/D/VHJhbnNpdCBCb2xkLnR0Zg==.png : cannot identify image file - it's ok, skipping.\n",
+ "Full dataset tensor: (52911, 28, 28)\n",
+ "Mean: -0.0574553\n",
+ "Standard deviation: 0.434072\n",
  "notMNIST_large/E\n",
+ "Full dataset tensor: (52912, 28, 28)\n",
+ "Mean: -0.0701406\n",
+ "Standard deviation: 0.42882\n",
  "notMNIST_large/F\n",
+ "Full dataset tensor: (52912, 28, 28)\n",
+ "Mean: -0.125914\n",
+ "Standard deviation: 0.429645\n",
  "notMNIST_large/G\n",
+ "Full dataset tensor: (52912, 28, 28)\n",
+ "Mean: -0.0947771\n",
+ "Standard deviation: 0.421674\n",
  "notMNIST_large/H\n",
+ "Full dataset tensor: (52912, 28, 28)\n",
+ "Mean: -0.0687667\n",
+ "Standard deviation: 0.430344\n",
  "notMNIST_large/I\n",
+ "Full dataset tensor: (52912, 28, 28)\n",
+ "Mean: 0.0307405\n",
+ "Standard deviation: 0.449686\n",
  "notMNIST_large/J\n",
- "Full dataset tensor: (529114, 28, 28)\n",
- "Mean: -0.0816593\n",
- "Standard deviation: 0.454232\n",
- "Labels: (529114,)\n",
+ "Full dataset tensor: (52911, 28, 28)\n",
+ "Mean: -0.153479\n",
+ "Standard deviation: 0.397169\n",
  "notMNIST_small/A\n",
"Could not read: notMNIST_small/A/RGVtb2NyYXRpY2FCb2xkT2xkc3R5bGUgQm9sZC50dGY=.png : cannot identify image file - it's ok, skipping.\n", + "Full dataset tensor: (1872, 28, 28)\n", + "Mean: -0.132588\n", + "Standard deviation: 0.445923\n", "notMNIST_small/B\n", + "Full dataset tensor: (1873, 28, 28)\n", + "Mean: 0.00535619\n", + "Standard deviation: 0.457054\n", "notMNIST_small/C\n", + "Full dataset tensor: (1873, 28, 28)\n", + "Mean: -0.141489\n", + "Standard deviation: 0.441056\n", "notMNIST_small/D\n", + "Full dataset tensor: (1873, 28, 28)\n", + "Mean: -0.0492094\n", + "Standard deviation: 0.460477\n", "notMNIST_small/E\n", + "Full dataset tensor: (1873, 28, 28)\n", + "Mean: -0.0598952\n", + "Standard deviation: 0.456146\n", "notMNIST_small/F\n", "Could not read: notMNIST_small/F/Q3Jvc3NvdmVyIEJvbGRPYmxpcXVlLnR0Zg==.png : cannot identify image file - it's ok, skipping.\n", + "Full dataset tensor: (1872, 28, 28)\n", + "Mean: -0.118148\n", + "Standard deviation: 0.451134\n", "notMNIST_small/G\n", + "Full dataset tensor: (1872, 28, 28)\n", + "Mean: -0.092519\n", + "Standard deviation: 0.448468\n", "notMNIST_small/H\n", + "Full dataset tensor: (1872, 28, 28)\n", + "Mean: -0.0586729\n", + "Standard deviation: 0.457387\n", "notMNIST_small/I\n", + "Full dataset tensor: (1872, 28, 28)\n", + "Mean: 0.0526481\n", + "Standard deviation: 0.472657\n", "notMNIST_small/J\n", - "Full dataset tensor: (18724, 28, 28)\n", - "Mean: -0.0746364\n", - "Standard deviation: 0.458622\n", - "Labels: (18724,)\n" + "Full dataset tensor: (1872, 28, 28)\n", + "Mean: -0.15167\n", + "Standard deviation: 0.449521\n" ], "name": "stdout" } @@ -385,63 +445,12 @@ { "cell_type": "markdown", "metadata": { - "id": "GPTCnjIcyuKN", - "colab_type": "text" - }, - "source": [ - "Next, we'll randomize the data. It's important to have the labels well shuffled for the training and test distributions to match." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "6WZ2l2tN2zOL", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, - "cellView": "both" - }, - "source": [ - "np.random.seed(133)\n", - "def randomize(dataset, labels):\n", - " permutation = np.random.permutation(labels.shape[0])\n", - " shuffled_dataset = dataset[permutation,:,:]\n", - " shuffled_labels = labels[permutation]\n", - " return shuffled_dataset, shuffled_labels\n", - "train_dataset, train_labels = randomize(train_dataset, train_labels)\n", - "test_dataset, test_labels = randomize(test_dataset, test_labels)" - ], - "outputs": [], - "execution_count": 0 - }, - { - "cell_type": "markdown", - "metadata": { - "id": "puDUTe6t6USl", - "colab_type": "text" - }, - "source": [ - "---\n", - "Problem 3\n", - "---------\n", - "Convince yourself that the data is still good after shuffling!\n", - "\n", - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": { "id": "cYznx5jUwzoO", "colab_type": "text" }, "source": [ "---\n", - "Problem 4\n", + "Problem 3\n", "---------\n", "Another check: we expect the data to be balanced across classes. Verify that.\n", "\n", @@ -455,7 +464,7 @@ "colab_type": "text" }, "source": [ - "Prune the training data as needed. Depending on your computer setup, you might not be able to fit it all in memory, and you can tune train_size as needed.\n", + "Merge and prune the training data as needed. Depending on your computer setup, you might not be able to fit it all in memory, and you can tune `train_size` as needed. 
  "\n",
  "Also create a validation dataset for hyperparameter tuning."
  ]
 },
@@ -496,22 +505,65 @@
  "outputId": "8af66da6-902d-4719-bedc-7c9fb7ae7948"
  },
  "source": [
+ "def make_arrays(nb_rows, img_size):\n",
+ " if nb_rows:\n",
+ " dataset = np.ndarray((nb_rows, img_size, img_size), dtype=np.float32)\n",
+ " labels = np.ndarray(nb_rows, dtype=np.int32)\n",
+ " else:\n",
+ " dataset, labels = None, None\n",
+ " return dataset, labels\n",
+ "\n",
+ "def merge_datasets(pickle_files, train_size, valid_size=0):\n",
+ " num_classes = len(pickle_files)\n",
+ " valid_dataset, valid_labels = make_arrays(valid_size, image_size)\n",
+ " train_dataset, train_labels = make_arrays(train_size, image_size)\n",
+ " vsize_per_class = valid_size // num_classes\n",
+ " tsize_per_class = train_size // num_classes\n",
+ " \n",
+ " start_v, start_t = 0, 0\n",
+ " end_v, end_t = vsize_per_class, tsize_per_class\n",
+ " end_l = vsize_per_class+tsize_per_class\n",
+ " for label, pickle_file in enumerate(pickle_files): \n",
+ " try:\n",
+ " with open(pickle_file, 'rb') as f:\n",
+ " letter_set = pickle.load(f)\n",
+ " if valid_dataset is not None:\n",
+ " valid_letter = letter_set[:vsize_per_class, :, :]\n",
+ " valid_dataset[start_v:end_v, :, :] = valid_letter\n",
+ " valid_labels[start_v:end_v] = label\n",
+ " start_v += vsize_per_class\n",
+ " end_v += vsize_per_class\n",
+ " \n",
+ " train_letter = letter_set[vsize_per_class:end_l, :, :]\n",
+ " train_dataset[start_t:end_t, :, :] = train_letter\n",
+ " train_labels[start_t:end_t] = label\n",
+ " start_t += tsize_per_class\n",
+ " end_t += tsize_per_class\n",
+ " except Exception as e:\n",
+ " print('Unable to process data from', pickle_file, ':', e)\n",
+ " raise\n",
+ " \n",
+ " return valid_dataset, valid_labels, train_dataset, train_labels\n",
+ " \n",
+ " \n",
  "train_size = 200000\n",
  "valid_size = 10000\n",
+ "test_size = 10000\n",
+ "\n",
+ "valid_dataset, valid_labels, train_dataset, train_labels = merge_datasets(train_datasets, train_size, valid_size)\n",
+ "__, __, test_dataset, test_labels = merge_datasets(test_datasets, test_size)\n",
  "\n",
- "valid_dataset = train_dataset[:valid_size,:,:]\n",
- "valid_labels = train_labels[:valid_size]\n",
- "train_dataset = train_dataset[valid_size:valid_size+train_size,:,:]\n",
- "train_labels = train_labels[valid_size:valid_size+train_size]\n",
- "print('Training', train_dataset.shape, train_labels.shape)\n",
- "print('Validation', valid_dataset.shape, valid_labels.shape)"
+ "print('Training:', train_dataset.shape, train_labels.shape)\n",
+ "print('Validation:', valid_dataset.shape, valid_labels.shape)\n",
+ "print('Testing:', test_dataset.shape, test_labels.shape)"
  ],
  "outputs": [
  {
  "output_type": "stream",
  "text": [
  "Training (200000, 28, 28) (200000,)\n",
- "Validation (10000, 28, 28) (10000,)\n"
+ "Validation (10000, 28, 28) (10000,)\n",
+ "Testing (10000, 28, 28) (10000,)\n"
  ],
  "name": "stdout"
  }
@@ -521,6 +573,57 @@
  {
  "cell_type": "markdown",
  "metadata": {
+ "id": "GPTCnjIcyuKN",
+ "colab_type": "text"
+ },
+ "source": [
+ "Next, we'll randomize the data. It's important to have the labels well shuffled for the training and test distributions to match."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "6WZ2l2tN2zOL",
+ "colab_type": "code",
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ }
+ },
+ "cellView": "both"
+ },
+ "source": [
+ "np.random.seed(133)\n",
+ "def randomize(dataset, labels):\n",
+ " permutation = np.random.permutation(labels.shape[0])\n",
+ " shuffled_dataset = dataset[permutation,:,:]\n",
+ " shuffled_labels = labels[permutation]\n",
+ " return shuffled_dataset, shuffled_labels\n",
+ "train_dataset, train_labels = randomize(train_dataset, train_labels)\n",
+ "test_dataset, test_labels = randomize(test_dataset, test_labels)"
+ ],
+ "outputs": [],
+ "execution_count": 0
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "puDUTe6t6USl",
+ "colab_type": "text"
+ },
+ "source": [
+ "---\n",
+ "Problem 4\n",
+ "---------\n",
+ "Convince yourself that the data is still good after shuffling!\n",
+ "\n",
+ "---"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
  "id": "tIQJaJuwg5Hw",
  "colab_type": "text"
  },
@@ -654,4 +757,4 @@
  ]
  }
  ]
-}
\ No newline at end of file
+}
diff --git a/tensorflow/examples/udacity/4_convolutions.ipynb b/tensorflow/examples/udacity/4_convolutions.ipynb
index 151aa25ce8..9ad41acb0c 100644
--- a/tensorflow/examples/udacity/4_convolutions.ipynb
+++ b/tensorflow/examples/udacity/4_convolutions.ipynb
@@ -265,7 +265,7 @@
  " [patch_size, patch_size, depth, depth], stddev=0.1))\n",
  " layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))\n",
  " layer3_weights = tf.Variable(tf.truncated_normal(\n",
- " [image_size / 4 * image_size / 4 * depth, num_hidden], stddev=0.1))\n",
+ " [image_size // 4 * image_size // 4 * depth, num_hidden], stddev=0.1))\n",
  " layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))\n",
  " layer4_weights = tf.Variable(tf.truncated_normal(\n",
  " [num_hidden, num_labels], stddev=0.1))\n",
@@ -461,4 +461,4 @@
  ]
  }
  ]
-}
\ No newline at end of file
+}
diff --git a/tensorflow/examples/udacity/5_word2vec.ipynb b/tensorflow/examples/udacity/5_word2vec.ipynb
index 1b7f5e2005..b3a7a71e2c 100644
--- a/tensorflow/examples/udacity/5_word2vec.ipynb
+++ b/tensorflow/examples/udacity/5_word2vec.ipynb
@@ -114,7 +114,7 @@
  " if statinfo.st_size == expected_bytes:\n",
  " print('Found and verified %s' % filename)\n",
  " else:\n",
- " print statinfo.st_size\n",
+ " print(statinfo.st_size)\n",
  " raise Exception(\n",
  " 'Failed to verify ' + filename + '. Can you get to it with a browser?')\n",
  " return filename\n",
@@ -354,35 +354,31 @@
  " data_index = (data_index + 1) % len(data)\n",
  " return batch, labels\n",
  "\n",
- "batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)\n",
- "for i in range(8):\n",
- " print('%d -> %d' % (batch[i], labels[i, 0]))\n",
- " print('%s -> %s' % (reverse_dictionary[batch[i]],\n",
- " reverse_dictionary[labels[i, 0]]))"
+ "print('data:', [reverse_dictionary[di] for di in data[:8]])\n",
+ "\n",
+ "for num_skips, skip_window in [(2, 1), (4, 2)]:\n",
+ " data_index = 0\n",
+ " batch, labels = generate_batch(batch_size=8, num_skips=num_skips, skip_window=skip_window)\n",
+ " print('\\nwith num_skips = %d and skip_window = %d:' % (num_skips, skip_window))\n",
+ " print(' batch:', [reverse_dictionary[bi] for bi in batch])\n",
+ " print(' labels:', [reverse_dictionary[li] for li in labels.reshape(8)])"
  ],
  "outputs": [
- {
- "output_type": "stream",
- "text": [
- " 3083 -> 5243\n",
- "originated -> anarchism\n",
- "3083 -> 12\n",
- "originated -> as\n",
- "12 -> 3083\n",
- "as -> originated\n",
- "12 -> 6\n",
- "as -> a\n",
- "6 -> 12\n",
- "a -> as\n",
- "6 -> 195\n",
- "a -> term\n",
- "195 -> 6\n",
- "term -> a\n",
- "195 -> 2\n",
- "term -> of\n"
- ],
- "name": "stdout"
- }
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "data: ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first']\n",
+ "\n",
+ "with num_skips = 2 and skip_window = 1:\n",
+ " batch: ['originated', 'originated', 'as', 'as', 'a', 'a', 'term', 'term']\n",
+ " labels: ['as', 'anarchism', 'a', 'originated', 'term', 'as', 'a', 'of']\n",
+ "\n",
+ "with num_skips = 4 and skip_window = 2:\n",
+ " batch: ['as', 'as', 'as', 'as', 'a', 'a', 'a', 'a']\n",
+ " labels: ['anarchism', 'originated', 'term', 'a', 'as', 'of', 'originated', 'term']\n"
+ ]
+ }
  ],
  "execution_count": 0
  },
@@ -890,4 +886,4 @@
  ]
  }
  ]
-}
\ No newline at end of file
+}
diff --git a/tensorflow/examples/udacity/README.md b/tensorflow/examples/udacity/README.md
index a3adc5f155..65b1664882 100644
--- a/tensorflow/examples/udacity/README.md
+++ b/tensorflow/examples/udacity/README.md
@@ -1,6 +1,8 @@
 Assignments for Udacity Deep Learning class with TensorFlow
 ===========================================================
 
+Course information can be found at https://www.udacity.com/course/deep-learning--ud730
+
 Running the Docker container from the Google Cloud repository
 -------------------------------------------------------------
 
@@ -44,8 +46,18 @@ Building a local Docker container
 Running the local container
 ---------------------------
 
+To run a disposable container:
+
     docker run -p 8888:8888 -it --rm $USER/assignments
 
+Note the above command will create an ephemeral container and all data stored in the container will be lost when the container stops.
+
+To avoid losing work between sessions in the container, it is recommended that you mount the `tensorflow/examples/udacity` directory into the container:
+
+    docker run -p 8888:8888 -v </path/to/tensorflow/examples/udacity>:/notebooks -it --rm $USER/assignments
+
+This will allow you to save work and have access to generated files on the host filesystem.
+
 Pushing a Google Cloud release
 ------------------------------
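A few notes for readers working through the notebooks touched by this change. Problem 3 in `1_notmnist.ipynb` asks you to verify that the merged data is balanced across classes. One minimal sketch, assuming the `train_labels` and `test_labels` arrays produced by `merge_datasets` above (the `class_counts` helper is ours, not part of the notebook):

```python
import numpy as np

def class_counts(labels, num_classes=10):
    # np.bincount tallies how many samples carry each label 0..num_classes-1.
    return np.bincount(labels, minlength=num_classes)

# With train_size = 200000, each of the ten letter classes should hold
# roughly 20000 samples; a large deviation would suggest a merge bug.
print('train:', class_counts(train_labels))
print('test: ', class_counts(test_labels))
```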
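For Problem 4 ("convince yourself that the data is still good after shuffling"), one approach is to eyeball a few samples after `randomize` and check that each image still matches its label. A sketch, assuming `matplotlib` is available as in the rest of the course notebooks:

```python
import matplotlib.pyplot as plt

# Plot five shuffled training images with their letter labels as titles.
# If shuffling had misaligned images and labels, the titles would not
# match the glyphs drawn beneath them.
for i in range(5):
    plt.subplot(1, 5, i + 1)
    plt.imshow(train_dataset[i], cmap='gray')
    plt.title(chr(ord('A') + train_labels[i]))
    plt.axis('off')
plt.show()
```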
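The one-character `4_convolutions.ipynb` change is a Python 3 fix: `/` is true division and yields a float, which is invalid in a TensorFlow shape argument, while `//` keeps the arithmetic integral. A quick illustration in plain Python:

```python
image_size, depth = 28, 16

# True division propagates floats through the whole expression...
print(image_size / 4 * image_size / 4 * depth)    # 784.0 (float)

# ...whereas floor division keeps every intermediate an int, which is
# what a shape dimension such as tf.truncated_normal's expects.
print(image_size // 4 * image_size // 4 * depth)  # 784 (int)
```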
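Finally, the reworked `5_word2vec.ipynb` demo prints skip-gram batches for two settings: `skip_window` is how many words of context to keep on each side of the center word, and `num_skips` is how many (center, context) pairs to sample from that window. A toy recomputation, independent of the notebook's `generate_batch` and its random sampling, that enumerates every pair for `skip_window = 1` over the same eight words:

```python
# Enumerate all (center, context) pairs for skip_window = 1; the notebook's
# generate_batch draws num_skips of these per center word, in random order.
words = ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first']
skip_window = 1
pairs = []
for i in range(skip_window, len(words) - skip_window):
    for offset in range(-skip_window, skip_window + 1):
        if offset != 0:
            pairs.append((words[i], words[i + offset]))
print(pairs[:8])  # centers 'originated', 'as', 'a', 'term', two pairs each
```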