diff options
139 files changed, 6584 insertions, 2536 deletions
diff --git a/eigen.BUILD b/eigen.BUILD index 1a1467a7e5..85b4f11865 100644 --- a/eigen.BUILD +++ b/eigen.BUILD @@ -1,6 +1,6 @@ package(default_visibility = ["//visibility:public"]) -archive_dir = "eigen-eigen-db7b61411772" +archive_dir = "eigen-eigen-0a13bf3e579d" cc_library( name = "eigen", diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD index 098dfe2752..708cfddefc 100644 --- a/tensorflow/contrib/BUILD +++ b/tensorflow/contrib/BUILD @@ -24,6 +24,14 @@ py_library( ], ) +cc_library( + name = "contrib_kernels", + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/contrib/linear_optimizer/kernels:sdca_ops", + ], +) + filegroup( name = "all_files", srcs = glob( diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py index 2c024b7bce..68200db076 100644 --- a/tensorflow/contrib/layers/python/layers/layers_test.py +++ b/tensorflow/contrib/layers/python/layers/layers_test.py @@ -211,6 +211,18 @@ class FullyConnectedTest(tf.test.TestCase): tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) self.assertEqual(1, cnt[0]) + def test_empty_x_results_in_empty_output(self): + # Empty x is common if someone masks their input with tf.boolean_mask in + # order to drop missing entries, and in a particular batch all entries are + # missing. 
+ with self.test_session(): + x = tf.constant([[]], shape=[0, 3]) + self.assertEqual(0, tf.size(x).eval()) + y = tf.contrib.layers.fully_connected(x, 2, activation_fn=tf.nn.softmax) + tf.initialize_all_variables().run() + expected_y = np.array([]).reshape(0,2) + np.testing.assert_array_equal(expected_y, y.eval()) + class Convolution2dTest(tf.test.TestCase): diff --git a/tensorflow/contrib/layers/python/ops/loss_ops.py b/tensorflow/contrib/layers/python/ops/loss_ops.py index 276d4cc541..c451fc81d4 100644 --- a/tensorflow/contrib/layers/python/ops/loss_ops.py +++ b/tensorflow/contrib/layers/python/ops/loss_ops.py @@ -22,16 +22,17 @@ These loss ops are, by design, minimal, enabling flexibility in how their output can be used. @@reduce_batch_sum -@@reduce_batch_mean @@absolute_loss @@squared_loss +@@logistic_loss +@@sum_absolute_loss @@sum_squared_loss -@@mean_absolute_loss -@@mean_squared_loss -@@root_mean_squared_loss +@@sum_logistic_loss +@@scalar_absolute_loss +@@scalar_squared_loss @@scalar_logistic_loss """ @@ -39,14 +40,15 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.contrib.layers.python.framework import tensor_util from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn -__all__ = ["reduce_batch_sum", "reduce_batch_mean", "absolute_loss", - "squared_loss", "sum_squared_loss", "mean_absolute_loss", - "mean_squared_loss", "root_mean_squared_loss", +__all__ = ["reduce_batch_sum", "absolute_loss", "squared_loss", "logistic_loss", + "sum_absolute_loss", "sum_squared_loss", "sum_logistic_loss", + "scalar_absolute_loss", "scalar_squared_loss", "scalar_logistic_loss"] @@ -120,31 +122,11 @@ def reduce_batch_sum(x, name=None): return _reduce_batch(x, math_ops.reduce_sum, name) -def reduce_batch_mean(x, name=None): - """Given a tensor `x`, returns the mean across all dimensions 
except dim 0. - - Given a tensor with the number of dimensions > 1, reduce_batch_mean - will calculate the mean across all dimensions except for dimension - 0. This function is useful for calculating the mean loss (error) - across all examples in a batch when training. As an example, given a - tensor of shape [batch_size, d1, d2], this function will calculate - the mean across dimensions d1 and d2, returning a tensor of shape - [batch_size]. - - Tensors of dimension 1 are returned as-is. - - Args: - x: A `Tensor` with dimension > 0. - name: A name for the operation (optional). - - Returns: - A `Tensor` with values averaged across all dimensions > 0. - - Raises: - ValueError: If `x` has dimension 0. - - """ - return _reduce_batch(x, math_ops.reduce_mean, name) +def _validate_predicted_and_target(predicted, target): + # TODO(ptucker): Optionally add assert op for shape check, for cases when + # shape is not fully defined at graph construction time? + predicted.get_shape().assert_is_compatible_with(target.get_shape()) + tensor_util.assert_same_float_dtype([predicted, target]) def absolute_loss(predicted, target, name=None): @@ -172,12 +154,12 @@ def absolute_loss(predicted, target, name=None): with ops.op_scope([predicted, target], name, "absolute_loss") as scope: predicted = ops.convert_to_tensor(predicted, name="predicted") target = ops.convert_to_tensor(target, name="target") - predicted.get_shape().assert_is_compatible_with(target.get_shape()) + _validate_predicted_and_target(predicted, target) return math_ops.abs(target - predicted, name=scope) def squared_loss(predicted, target, name=None): - """Computes and returns the per-example squared loss. + """Computes and returns the per-example squared loss, divided by 2. Computes the per-example squared difference between the target and predicted tensors. The tensors must have the same shape. 
@@ -200,27 +182,33 @@ def squared_loss(predicted, target, name=None): with ops.op_scope([predicted, target], name, "squared_loss") as scope: predicted = ops.convert_to_tensor(predicted, name="predicted") target = ops.convert_to_tensor(target, name="target") - predicted.get_shape().assert_is_compatible_with(target.get_shape()) - return math_ops.square(target - predicted, name=scope) + _validate_predicted_and_target(predicted, target) + return math_ops.div(math_ops.square(target - predicted), 2.0, name=scope) -def sum_squared_loss(predicted, target, name=None): - """Calculates 1/2 the sum of the squared loss across batches. +def logistic_loss(logit, target, name=None): + """Calculates the logistic cross-entropy loss. - Computes the squared difference between the target and predicted - tensors, sums across all dimensions except dimension 0, and divides - by 2: + **WARNING:** `logit` must be unscaled, while the `target` should be a + normalized probability prediction. See + `tf.nn.sigmoid_cross_entropy_with_logits` for more details. - losses = reduce_batch_sum(squared_loss(predicted, target)) / 2.0 + Args: + logit: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]` + of predicted logit values. + target: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]` of + target values. The shape of the target tensor should match the + `predicted` tensor. + name: A name for the operation (optional). - where `losses` is a tensor with dimensions [batch_size]. + Returns: + A `Tensor` of the logistic cross-entropy loss. + """ + return nn.sigmoid_cross_entropy_with_logits(logit, target, name=name) - The tensors must have the same shape. - This function is equivalent to typical formulations of L2 loss, and - similar to TensorFlow's l2_loss function. It differs from the - l2_loss function by allowing the caller to specify both the - predicted and target tensors. 
+def _sum_loss(predicted, target, loss_fn, name="sum_loss"): + """Apply loss function, then sum across all non-batch dimensions. Args: predicted: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]` @@ -228,30 +216,23 @@ def sum_squared_loss(predicted, target, name=None): target: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]` of target values. The shape of the target tensor should match the `predicted` tensor. + loss_fn: Loss to apply, takes 2 tensors as parameters and returns a tensor. name: A name for the operation (optional). Returns: - A `[batch_size]` tensor of squared losses summed across all dimensions - except dimension 0, divided by 2. - - Raises: - ValueError: If `predicted` and `target` shapes do not match. - + A `[batch_size]` tensor of losses, summed across all dimensions except + dimension 0. """ - with ops.op_scope([predicted, target], name, "sum_squared_loss") as scope: - return math_ops.div( - reduce_batch_sum(squared_loss(predicted, target)), - 2.0, - name=scope) + return reduce_batch_sum(loss_fn(predicted, target), name=name) -def mean_absolute_loss(predicted, target, name=None): - """Calculates the mean absolute loss across batches. +def sum_absolute_loss(predicted, target, name="sum_absolute_loss"): + """Calculates the sum of absolute losses across batches. Computes the absolute difference between the target and predicted tensors, averaged across all dimensions except dimension 0: - losses = reduce_batch_mean(absolute_loss(predicted, target)) + losses = reduce_batch_sum(absolute_loss(predicted, target)) where `losses` is a tensor with dimensions [batch_size]. @@ -275,22 +256,26 @@ def mean_absolute_loss(predicted, target, name=None): ValueError: If `predicted` and `target` shapes do not match. 
""" - with ops.op_scope([predicted, target], name, "mean_absolute_loss") as scope: - return reduce_batch_mean(absolute_loss(predicted, target), name=scope) + return _sum_loss(predicted, target, absolute_loss, name=name) -def mean_squared_loss(predicted, target, name=None): - """Calculates the mean squared loss across batches. +def sum_squared_loss(predicted, target, name="sum_squared_loss"): + """Calculates the sum of the squared loss across batches. Computes the squared difference between the target and predicted - tensors, and averages across all dimensions except dimension 0: + tensors, sums across all dimensions except dimension 0. - losses = reduce_batch_mean(squared_loss(predicted, target)) + losses = reduce_batch_sum(squared_loss(predicted, target)) where `losses` is a tensor with dimensions [batch_size]. The tensors must have the same shape. + This function is equivalent to typical formulations of L2 loss, and + similar to TensorFlow's l2_loss function. It differs from the + l2_loss function by allowing the caller to specify both the + predicted and target tensors. + Args: predicted: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]` of predicted values. @@ -300,29 +285,43 @@ def mean_squared_loss(predicted, target, name=None): name: A name for the operation (optional). Returns: - A `[batch_size]` tensor of squared differences, averaged across - all dimensions except dimension 0. + A `[batch_size]` tensor of squared losses summed across all dimensions + except dimension 0. Raises: ValueError: If `predicted` and `target` shapes do not match. """ - with ops.op_scope([predicted, target], name, "mean_squared_loss") as scope: - return reduce_batch_mean(squared_loss(predicted, target), name=scope) + return _sum_loss(predicted, target, squared_loss, name=name) -def root_mean_squared_loss(predicted, target, name=None): - """Calculates the root mean squared loss across batches. 
+def sum_logistic_loss(logit, target, name="sum_logistic_loss"): + """Calculates the sum of the logistic loss across batches. - Computes the root mean squared loss between the target and predicted - tensors, which is the square root of the mean squared differences - between the predicted and target tensors: + Computes the logistic loss between logit and target tensors, summed across all + dimensions except dimension 0. - losses = sqrt(mean_squared_loss(predicted, target)) + **WARNING:** `logit` must be unscaled, while the `target` should be a + normalized probability prediction. See + `tf.nn.sigmoid_cross_entropy_with_logits` for more details. - where `losses` is a tensor with dimensions [batch_size]. + Args: + logit: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]` + of predicted logit values. + target: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]` of + target values. The shape of the target tensor should match the + `predicted` tensor. + name: A name for the operation (optional). - The tensors must have the same shape. + Returns: + A `[batch_size]` tensor of logistic losses summed across all dimensions + except dimension 0. + """ + return _sum_loss(logit, target, logistic_loss, name=name) + + +def _scalar_loss(predicted, target, loss_fn, name=None): + """Reduces losses to a scalar. Args: predicted: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]` @@ -330,23 +329,52 @@ def root_mean_squared_loss(predicted, target, name=None): target: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]` of target values. The shape of the target tensor should match the `predicted` tensor. + loss_fn: Loss to apply, takes 2 tensors as parameters and returns a tensor. name: A name for the operation (optional). Returns: - A `[batch_size]` tensor of the root mean squared differences. + Calculate sum of losses per example, then average across batch. 
+ """ + with ops.op_scope([predicted, target], name, "scalar_loss") as scope: + return math_ops.reduce_mean( + _sum_loss(predicted, target, loss_fn), name=scope) - Raises: - ValueError: If `predicted` and `target` shapes do not match. +def scalar_absolute_loss(predicted, target, name="scalar_absolute_loss"): + """Reduces absolute losses to a scalar. + + Args: + predicted: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]` + of predicted values. + target: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]` of + target values. The shape of the target tensor should match the + `predicted` tensor. + name: A name for the operation (optional). + + Returns: + Calculate sum of absolute losses per example, then average across batch. """ - with ops.op_scope([predicted, target], - name, - "root_mean_squared_loss") as scope: - return math_ops.sqrt(mean_squared_loss(predicted, target), - name=scope) + return _scalar_loss(predicted, target, loss_fn=absolute_loss, name=name) + +def scalar_squared_loss(predicted, target, name="scalar_squared_loss"): + """Reduces squared losses to a scalar. -def scalar_logistic_loss(logit, target, name=None): + Args: + predicted: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]` + of predicted values. + target: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]` of + target values. The shape of the target tensor should match the + `predicted` tensor. + name: A name for the operation (optional). + + Returns: + Calculate sum of squared losses per example, then average across batch. + """ + return _scalar_loss(predicted, target, loss_fn=squared_loss, name=name) + + +def scalar_logistic_loss(logit, target, name="scalar_logistic_loss"): """Calculates the logistic cross-entropy loss, averaged across batches. **WARNING:** `logit` must be unscaled, while the `target` should be a @@ -368,8 +396,5 @@ def scalar_logistic_loss(logit, target, name=None): Raises: ValueError: If `logit` and `target` shapes do not match. 
""" - with ops.op_scope([logit, target], name, - "scalar_logistic_loss") as scope: - batch_loss = reduce_batch_sum(nn.sigmoid_cross_entropy_with_logits(logit, - target)) - return math_ops.reduce_mean(batch_loss, [0], name=scope) + return _scalar_loss(logit, target, loss_fn=logistic_loss, name=name) + diff --git a/tensorflow/contrib/layers/python/ops/loss_ops_test.py b/tensorflow/contrib/layers/python/ops/loss_ops_test.py index 48f49989cf..1453af5331 100644 --- a/tensorflow/contrib/layers/python/ops/loss_ops_test.py +++ b/tensorflow/contrib/layers/python/ops/loss_ops_test.py @@ -21,6 +21,10 @@ from __future__ import print_function import numpy as np import tensorflow as tf +from tensorflow.contrib.layers.python.framework import tensor_util + +pi = 3.14 +indiana_pi = 3.2 # https://en.wikipedia.org/wiki/Indiana_Pi_Bill class ReduceBatchSumTest(tf.test.TestCase): @@ -89,72 +93,6 @@ class ReduceBatchSumTest(tf.test.TestCase): self.assertAllClose(expected_result, actual_result.eval()) -class ReduceBatchMeanTest(tf.test.TestCase): - - def testDimensionNone(self): - with self.test_session(): - input_array = np.array([ - [1.0, 2.0], - [-1.0, -2.0] - ], dtype=np.float32) - placeholder_vec = tf.placeholder(tf.float32, name="placeholder_vec") - expected_result = np.array([1.5, -1.5]) - actual_result = tf.contrib.layers.reduce_batch_mean(placeholder_vec) - self.assertEqual(actual_result.get_shape().as_list(), [None]) - self.assertAllClose(expected_result, actual_result.eval(feed_dict={ - placeholder_vec: input_array - })) - - def testDimension0(self): - with self.test_session(): - input_vec = tf.constant(2.0) - with self.assertRaises(ValueError): - tf.contrib.layers.reduce_batch_mean(input_vec) - - def testDimension1(self): - with self.test_session(): - input_vec = tf.constant([1.0, 2.0]) - expected_result = np.array([1.0, 2.0]) - actual_result = tf.contrib.layers.reduce_batch_mean(input_vec) - self.assertAllClose(expected_result, actual_result.eval()) - - def 
testDimension2(self): - with self.test_session(): - input_vec = tf.constant([ - [1.0, 2.0], - [-1.0, -2.0] - ]) - expected_result = np.array([1.5, -1.5]) - actual_result = tf.contrib.layers.reduce_batch_mean(input_vec) - self.assertAllClose(expected_result, actual_result.eval()) - - def testReturnShape(self): - with self.test_session(): - input_vec = tf.constant([ - [1.0, 2.0], - [-1.0, -2.0] - ]) - expected_result = np.array([3.0, -3.0]) - actual_result = tf.contrib.layers.reduce_batch_mean(input_vec) - self.assertShapeEqual(expected_result, actual_result) - - def testDimensionN(self): - with self.test_session(): - input_vec = tf.constant([ - [ - [1.0, 2.0], - [3.0, 4.0] - ], - [ - [5.0, 6.0], - [7.0, 8.0] - ] - ]) - expected_result = np.array([2.5, 6.5]) - actual_result = tf.contrib.layers.reduce_batch_mean(input_vec) - self.assertAllClose(expected_result, actual_result.eval()) - - class AbsoluteLossTest(tf.test.TestCase): def _getTestVectors(self): @@ -191,7 +129,7 @@ class SquaredLossTest(tf.test.TestCase): target = tf.constant([1.0, 0.0, 3.0, 2.0], shape=[2, 2], name="target") predicted = tf.constant([1.1, -0.2, 3.3, 1.6], shape=[2, 2], name="predicted") - expected_loss = np.array([0.01, 0.04, 0.09, 0.16]).reshape(2, 2) + expected_loss = np.array([0.005, 0.02, 0.045, 0.08]).reshape(2, 2) return target, predicted, expected_loss def testSquaredLoss(self): @@ -250,114 +188,108 @@ class SumSquaredLossTest(tf.test.TestCase): tf.contrib.layers.sum_squared_loss(incompatible_shape, target) -class MeanAbsoluteLossTest(tf.test.TestCase): - - def _getTestVectors(self): - target = tf.constant([[0.0, 1.0, 2.0], - [3.0, 2.0, 4.0]], - shape=[2, 3], - name="target") - predicted = tf.constant([[3.0, -3.0, 0.0], - [1.0, 2.0, 0.0]], - shape=[2, 3], - name="predicted") - expected_loss = np.array([3.0, 2.0]) - return target, predicted, expected_loss - - def testMeanAbsoluteLoss(self): - with self.test_session(): - target, predicted, expected_loss = self._getTestVectors() - result 
= tf.contrib.layers.mean_absolute_loss(predicted, target) - self.assertAllClose(expected_loss, result.eval()) - - def testMeanAbsoluteLossReturnShape(self): - with self.test_session(): - target, predicted, expected_loss = self._getTestVectors() - result = tf.contrib.layers.mean_absolute_loss(predicted, target) - self.assertShapeEqual(expected_loss, result) +class ScalarAbsoluteLossTest(tf.test.TestCase): - def testInvalidShapesValueError(self): + def testScalarAbsoluteLoss(self): with self.test_session(): - target = tf.constant([1.0, 0.0, 3.0, 2.0], shape=[2, 2], name="target") - incompatible_shape = tf.constant([0.0, 1.1], shape=[2], - name="incompatible_shape") - with self.assertRaises(ValueError): - tf.contrib.layers.mean_absolute_loss(incompatible_shape, target) - - -class MeanSquaredLossTest(tf.test.TestCase): - - def _getTestVectors(self): - target = tf.constant([[0.0, 1.0, 2.0], - [3.0, 2.0, 4.0]], - shape=[2, 3], - name="target") - predicted = tf.constant([[3.0, -3.0, 0.0], - [1.0, 2.0, 0.0]], - shape=[2, 3], - name="predicted") - expected_loss = np.array([9.666667, 6.666667]) - return target, predicted, expected_loss - - def testMeanSquaredLoss(self): - with self.test_session(): - target, predicted, expected_loss = self._getTestVectors() - result = tf.contrib.layers.mean_squared_loss(predicted, target) - self.assertAllClose(expected_loss, result.eval()) - - def testMeanSquaredLossReturnShape(self): - with self.test_session(): - target, predicted, expected_loss = self._getTestVectors() - result = tf.contrib.layers.mean_squared_loss(predicted, target) - self.assertShapeEqual(expected_loss, result) - - def testInvalidShapesValueError(self): + actual = tf.constant([pi], name="pi") + actual_placeholder = tf.placeholder(tf.float32) + label = tf.constant([indiana_pi], name="lbl") + label_placeholder = tf.placeholder(tf.float32, name="lbl_ph") + expected_loss = abs(indiana_pi - pi) + + # Both shapes are set. 
+ both_shapes_loss = tf.contrib.layers.scalar_absolute_loss(actual, label) + tf.initialize_all_variables().run() + np.testing.assert_almost_equal( + both_shapes_loss.eval(), expected_loss, decimal=6) + + # No shape for 'actual' - check that the loss layer can be created. + no_actual_shape_loss = tf.contrib.layers.scalar_absolute_loss( + actual_placeholder, label) + tf.initialize_all_variables().run() + np.testing.assert_almost_equal( + no_actual_shape_loss.eval({actual_placeholder: [pi]}), + expected_loss, decimal=6) + + # No shape for 'label' - check that the loss layer can be created. + no_label_shape_loss = tf.contrib.layers.scalar_absolute_loss( + actual, label_placeholder) + tf.initialize_all_variables().run() + np.testing.assert_almost_equal( + no_label_shape_loss.eval({label_placeholder: [indiana_pi]}), + expected_loss, decimal=6) + + # No shapes. + no_shape_loss = tf.contrib.layers.scalar_absolute_loss( + actual_placeholder, label_placeholder) + tf.initialize_all_variables().run() + np.testing.assert_almost_equal( + no_shape_loss.eval({label_placeholder: [indiana_pi], + actual_placeholder: [pi]}), + expected_loss, decimal=6) + + # Evaluate the previous one again, but this time with different + # (matching) shapes. This should still work. 
+ np.testing.assert_almost_equal( + no_shape_loss.eval({label_placeholder: [indiana_pi, indiana_pi], + actual_placeholder: [pi, pi]}), + expected_loss, decimal=6) + + +class ScalarSquaredLossTest(tf.test.TestCase): + + def testScalarSquaredLoss(self): with self.test_session(): - target = tf.constant([1.0, 0.0, 3.0, 2.0], shape=[2, 2], name="target") - incompatible_shape = tf.constant([0.0, 1.1], shape=[2], - name="incompatible_shape") - with self.assertRaises(ValueError): - tf.contrib.layers.mean_squared_loss(incompatible_shape, target) - - -class RootMeanSquaredLossTest(tf.test.TestCase): - - def _getTestVectors(self): - target = tf.constant([[0.0, 1.0, 2.0], - [3.0, 2.0, 4.0]], - shape=[2, 3], - name="target") - predicted = tf.constant([[3.0, -3.0, 0.0], - [1.0, 2.0, 0.0]], - shape=[2, 3], - name="predicted") - expected_loss = np.array([3.109126, 2.5819889]) - return target, predicted, expected_loss - - def testRootMeanSquaredLoss(self): - with self.test_session(): - target, predicted, expected_loss = self._getTestVectors() - result = tf.contrib.layers.root_mean_squared_loss(predicted, target) - self.assertAllClose(expected_loss, result.eval()) - - def testRootMeanSquaredLossReturnShape(self): - with self.test_session(): - target, predicted, expected_loss = self._getTestVectors() - result = tf.contrib.layers.root_mean_squared_loss(predicted, target) - self.assertShapeEqual(expected_loss, result) - - def testInvalidShapesValueError(self): - with self.test_session(): - target = tf.constant([1.0, 0.0, 3.0, 2.0], shape=[2, 2], name="target") - incompatible_shape = tf.constant([0.0, 1.1], shape=[2], - name="incompatible_shape") - with self.assertRaises(ValueError): - tf.contrib.layers.root_mean_squared_loss(incompatible_shape, target) - - -class MeanScalarLogisticLossTest(tf.test.TestCase): - - def _get_mean_sigmoid_logistic_loss(self, logit, target): + actual = tf.constant([pi], name="pi") + actual_placeholder = tf.placeholder(tf.float32) + label = 
tf.constant([indiana_pi], name="lbl") + label_placeholder = tf.placeholder(tf.float32, name="lbl_ph") + expected_loss = (indiana_pi - pi) * (indiana_pi - pi) / 2 + + # Both shapes are set. + both_shapes_loss = tf.contrib.layers.scalar_squared_loss(actual, label) + tf.initialize_all_variables().run() + np.testing.assert_almost_equal( + both_shapes_loss.eval(), expected_loss, decimal=6) + + # No shape for 'actual' - check that the loss layer can be created. + no_actual_shape_loss = tf.contrib.layers.scalar_squared_loss( + actual_placeholder, label) + tf.initialize_all_variables().run() + np.testing.assert_almost_equal( + no_actual_shape_loss.eval({actual_placeholder: [pi]}), + expected_loss, decimal=6) + + # No shape for 'label' - check that the loss layer can be created. + no_label_shape_loss = tf.contrib.layers.scalar_squared_loss( + actual, label_placeholder) + tf.initialize_all_variables().run() + np.testing.assert_almost_equal( + no_label_shape_loss.eval({label_placeholder: [indiana_pi]}), + expected_loss, + decimal=6) + + # No shapes. + no_shape_loss = tf.contrib.layers.scalar_squared_loss( + actual_placeholder, label_placeholder) + tf.initialize_all_variables().run() + np.testing.assert_almost_equal( + no_shape_loss.eval({label_placeholder: [indiana_pi], + actual_placeholder: [pi]}), + expected_loss, decimal=6) + + # Evaluate the previous one again, but this time with different + # (matching) shapes. This should still work. 
+ np.testing.assert_almost_equal( + no_shape_loss.eval({label_placeholder: [indiana_pi, indiana_pi], + actual_placeholder: [pi, pi]}), + expected_loss, decimal=6) + + +class ScalarLogisticLossTest(tf.test.TestCase): + + def _expected_loss(self, logit, target): sigmoid = 1.0 / (1.0 + np.exp(-logit)) logistic_loss = (target * -np.log(sigmoid)) - ( (1.0 - target) * np.log(1.0 - sigmoid)) @@ -365,14 +297,13 @@ class MeanScalarLogisticLossTest(tf.test.TestCase): return np.sum(batch_losses) / len(batch_losses) - def test_mean__scalar_logistic_loss(self): + def test_scalar_logistic_loss(self): logit = np.array([[9.45, -42], [4.2, 1], [-0.6, 20]]) target = np.array([[0.8, 0.9], [0.45, 0.99999], [0.1, 0.0006]]) - expected_loss = self._get_mean_sigmoid_logistic_loss(logit, target) with self.test_session(): result = tf.contrib.layers.scalar_logistic_loss( tf.constant(logit), tf.constant(target)) - self.assertAllClose(expected_loss, result.eval()) + self.assertAllClose(self._expected_loss(logit, target), result.eval()) if __name__ == "__main__": diff --git a/tensorflow/contrib/linear_optimizer/BUILD b/tensorflow/contrib/linear_optimizer/BUILD index 234142757f..3d92123c28 100644 --- a/tensorflow/contrib/linear_optimizer/BUILD +++ b/tensorflow/contrib/linear_optimizer/BUILD @@ -36,6 +36,7 @@ py_test( name = "sdca_ops_test", srcs = ["python/kernel_tests/sdca_ops_test.py"], srcs_version = "PY2AND3", + tags = ["noasan"], # doesn't pass ASAN for some reason deps = [ ":sdca_ops_py", "//tensorflow:tensorflow_py", diff --git a/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py b/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py index e973a88bb7..a6da0ce5e9 100644 --- a/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py +++ b/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py @@ -112,12 +112,13 @@ def make_dense_variable_dict(num_dense_features, num_examples): def 
get_binary_predictions_for_logistic(predictions, cutoff=0.5): return tf.cast( tf.greater_equal(predictions, tf.ones_like(predictions) * cutoff), - tf.float32) + dtype=tf.float32) def get_binary_predictions_for_hinge(predictions): - all_ones = tf.ones_like(predictions) - return tf.add(tf.sign(predictions), all_ones) / 2 + return tf.cast( + tf.greater_equal(predictions, tf.zeros_like(predictions)), + dtype=tf.float32) # Setup the single container shared across all tests. This is testing proper diff --git a/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py b/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py index 8116ad00b0..5820794f35 100644 --- a/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py +++ b/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py @@ -28,9 +28,11 @@ from tensorflow.python.framework import ops from tensorflow.python.framework.load_library import load_op_library from tensorflow.python.framework.ops import convert_to_tensor from tensorflow.python.framework.ops import name_scope +from tensorflow.python.framework.ops import op_scope from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import state_ops from tensorflow.python.ops import variables as var_ops from tensorflow.python.ops.nn import sigmoid_cross_entropy_with_logits from tensorflow.python.platform import resource_loader @@ -55,6 +57,7 @@ def _maybe_load_sdca_ops(): assert _sdca_ops, 'Could not load _sdca_ops.so' +# TODO(rohananil): add op_scope to appropriate methods. class SdcaModel(object): """Stochastic dual coordinate ascent solver for linear models. @@ -255,13 +258,20 @@ class SdcaModel(object): predictions = math_ops.sigmoid(predictions) return predictions - def minimize(self): + def minimize(self, global_step=None, name=None): """Add operations to train a linear model by minimizing the loss function. 
+ Args: + global_step: Optional `Variable` to increment by one after the + variables have been updated. + name: Optional name for the returned operation. + Returns: An Operation that updates the variables passed in the constructor. """ - with name_scope('sdca/minimize'): + # Technically, the op depends on a lot more than the variables, + # but we'll keep the list short. + with op_scope([], name, 'sdca/minimize'): sparse_features_indices = [] sparse_features_values = [] for sf in self._examples['sparse_features']: @@ -301,7 +311,7 @@ class SdcaModel(object): assign_ops.append(var.assign(slot_var)) assign_group = control_flow_ops.group(*assign_ops) with ops.control_dependencies([assign_group]): - return _sdca_ops.sdca_shrink_l1( + shrink_l1 = _sdca_ops.sdca_shrink_l1( self._convert_n_to_tensor( self._variables['sparse_features_weights'], as_ref=True), @@ -310,6 +320,11 @@ class SdcaModel(object): as_ref=True), l1=self._options['symmetric_l1_regularization'], l2=self._symmetric_l2_regularization()) + if not global_step: + return shrink_l1 + with ops.control_dependencies([shrink_l1]): + with ops.colocate_with(global_step): + return state_ops.assign_add(global_step, 1, name=name).op def approximate_duality_gap(self): """Add operations to compute the approximate duality gap. 
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index cea3b723f8..ef76ebdd8d 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -968,7 +968,6 @@ tf_cuda_library( tf_cuda_library( name = "gpu_runtime", srcs = [ - "common_runtime/gpu/gpu_allocator_retry.cc", "common_runtime/gpu/gpu_bfc_allocator.cc", "common_runtime/gpu/gpu_debug_allocator.cc", "common_runtime/gpu/gpu_device.cc", @@ -982,7 +981,6 @@ tf_cuda_library( "common_runtime/gpu_device_context.h", ], hdrs = [ - "common_runtime/gpu/gpu_allocator_retry.h", "common_runtime/gpu/gpu_bfc_allocator.h", "common_runtime/gpu/gpu_debug_allocator.h", "common_runtime/gpu/gpu_device.h", @@ -991,7 +989,6 @@ tf_cuda_library( "common_runtime/gpu/gpu_util.h", "common_runtime/gpu/pool_allocator.h", "common_runtime/gpu/process_state.h", - "common_runtime/gpu/visitable_allocator.h", ], copts = tf_copts(), linkstatic = 1, diff --git a/tensorflow/core/client/tensor_c_api.cc b/tensorflow/core/client/tensor_c_api.cc index fa2f1417d2..8abebcd811 100644 --- a/tensorflow/core/client/tensor_c_api.cc +++ b/tensorflow/core/client/tensor_c_api.cc @@ -420,18 +420,26 @@ void TF_Run_Helper(TF_Session* s, const char* handle, run_options->length)) { status->status = tensorflow::errors::InvalidArgument("Unparseable RunOptions proto"); + return; + } + if (run_outputs != nullptr && run_outputs->data != nullptr) { + status->status = tensorflow::errors::InvalidArgument( + "Passing non-empty run_outputs is invalid."); + return; } - RunOutputs run_outputs_proto; + RunOutputs run_outputs_proto; result = s->session->Run(run_options_proto, inputs, output_tensor_names, target_node_names, &outputs, &run_outputs_proto); // Serialize back to upstream client, who now owns the new buffer - int proto_size = run_outputs_proto.ByteSize(); - void* str_buf = reinterpret_cast<void*>(operator new(proto_size)); - run_outputs_proto.SerializeToArray(str_buf, proto_size); - run_outputs->data = str_buf; - run_outputs->length = proto_size; + if 
(run_outputs != nullptr) { + int proto_size = run_outputs_proto.ByteSize(); + void* str_buf = reinterpret_cast<void*>(operator new(proto_size)); + run_outputs_proto.SerializeToArray(str_buf, proto_size); + run_outputs->data = str_buf; + run_outputs->length = proto_size; + } } } else { // NOTE(zongheng): PRun does not support RunOptions yet. diff --git a/tensorflow/core/common_runtime/gpu/gpu_allocator_retry.cc b/tensorflow/core/common_runtime/allocator_retry.cc index 4d97491f2e..8c3c45706f 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_allocator_retry.cc +++ b/tensorflow/core/common_runtime/allocator_retry.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h" +#include "tensorflow/core/common_runtime/allocator_retry.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/mutex.h" @@ -21,9 +21,9 @@ limitations under the License. namespace tensorflow { -GPUAllocatorRetry::GPUAllocatorRetry() : env_(Env::Default()) {} +AllocatorRetry::AllocatorRetry() : env_(Env::Default()) {} -void* GPUAllocatorRetry::AllocateRaw( +void* AllocatorRetry::AllocateRaw( std::function<void*(size_t alignment, size_t num_bytes, bool verbose_failure)> alloc_func, diff --git a/tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h b/tensorflow/core/common_runtime/allocator_retry.h index aa4ac81998..613f19d41b 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h +++ b/tensorflow/core/common_runtime/allocator_retry.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ALLOCATOR_RETRY_H_ -#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ALLOCATOR_RETRY_H_ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_ALLOCATOR_RETRY_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_ALLOCATOR_RETRY_H_ #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/mutex.h" @@ -23,9 +23,9 @@ limitations under the License. namespace tensorflow { // A retrying wrapper for a memory allocator. -class GPUAllocatorRetry { +class AllocatorRetry { public: - GPUAllocatorRetry(); + AllocatorRetry(); // Call 'alloc_func' to obtain memory. On first call, // 'verbose_failure' will be false. If return value is nullptr, @@ -50,11 +50,11 @@ class GPUAllocatorRetry { }; // Implementation details below -inline void GPUAllocatorRetry::NotifyDealloc() { +inline void AllocatorRetry::NotifyDealloc() { mutex_lock l(mu_); memory_returned_.notify_all(); } } // namespace tensorflow -#endif // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ALLOCATOR_RETRY_H_ +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_ALLOCATOR_RETRY_H_ diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc new file mode 100644 index 0000000000..7a2ea91c9b --- /dev/null +++ b/tensorflow/core/common_runtime/bfc_allocator.cc @@ -0,0 +1,702 @@ +/* Copyright 2015 Google Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/common_runtime/bfc_allocator.h" + +#include "tensorflow/core/common_runtime/allocator_retry.h" +#include "tensorflow/core/lib/core/bits.h" +#include "tensorflow/core/lib/gtl/stl_util.h" +#include "tensorflow/core/lib/strings/numbers.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +BFCAllocator::BFCAllocator(SubAllocator* sub_allocator, size_t total_memory, + bool allow_growth, const string& name) + : suballocator_(sub_allocator), + name_(name), + free_chunks_list_(kInvalidChunkHandle), + next_allocation_id_(1) { + if (allow_growth) { + // 1MiB smallest initial allocation, unless total memory available + // is less. + curr_region_allocation_bytes_ = + RoundedBytes(std::min(total_memory, size_t{1048576})); + } else { + curr_region_allocation_bytes_ = RoundedBytes(total_memory); + } + + // Allocate the requested amount of memory. + memory_limit_ = total_memory; + stats_.bytes_limit = static_cast<int64>(total_memory); + + // Create a bunch of bins of various good sizes. + + // We create bins to fit all possible ranges that cover the + // memory_limit_ starting from allocations up to 256 bytes to + // allocations up to (and including) the memory limit. 
+ for (BinNum b = 0; b < kNumBins; b++) { + size_t bin_size = BinNumToSize(b); + VLOG(1) << "Creating bin of max chunk size " + << strings::HumanReadableNumBytes(bin_size); + new (BinFromIndex(b)) Bin(this, bin_size); + CHECK_EQ(BinForSize(bin_size), BinFromIndex(b)); + CHECK_EQ(BinForSize(bin_size + 255), BinFromIndex(b)); + CHECK_EQ(BinForSize(bin_size * 2 - 1), BinFromIndex(b)); + if (b + 1 < kNumBins) { + CHECK_NE(BinForSize(bin_size * 2), BinFromIndex(b)); + } + } +} + +BFCAllocator::~BFCAllocator() { + // Return memory back. + VLOG(2) << "Number of regions allocated: " + << region_manager_.regions().size(); + for (const auto& region : region_manager_.regions()) { + suballocator_->Free(region.ptr(), region.memory_size()); + } + + for (BinNum b = 0; b < kNumBins; b++) { + BinFromIndex(b)->~Bin(); + } +} + +BFCAllocator::Chunk* BFCAllocator::ChunkFromHandle(ChunkHandle h) { + DCHECK_GE(h, 0); + DCHECK_LT(h, static_cast<int>(chunks_.size())); + return &(chunks_[h]); +} + +bool BFCAllocator::Extend(size_t rounded_bytes) { + // Do we have enough space to handle the client's request? + // If not, fail immediately. + if (total_region_allocated_bytes_ + rounded_bytes > memory_limit_) { + return false; + } + + // If curr_region_allocation_bytes_ is not enough to satisfy the + // allocation, keep multiplying by a power of two until that is + // sufficient. + bool increased_allocation = false; + while (rounded_bytes > curr_region_allocation_bytes_) { + curr_region_allocation_bytes_ *= 2; + increased_allocation = true; + } + + // Try allocating. + size_t bytes = curr_region_allocation_bytes_; + void* mem_addr = suballocator_->Alloc(32, bytes); + if (mem_addr == nullptr && !started_backpedal_) { + // Only backpedal once. + started_backpedal_ = true; + + static constexpr float kBackpedalFactor = 0.9; + + // Try allocating less memory. 
+ bytes = RoundedBytes(bytes * kBackpedalFactor); + while (mem_addr == nullptr && bytes > rounded_bytes) { + mem_addr = suballocator_->Alloc(32, bytes); + bytes = RoundedBytes(bytes * kBackpedalFactor); + } + } + + if (mem_addr == nullptr) { + return false; + } + + if (!increased_allocation) { + // Increase the region size of the next required allocation. + curr_region_allocation_bytes_ *= 2; + } + + VLOG(1) << "Extending allocation by " << strings::HumanReadableNumBytes(bytes) + << " bytes."; + + total_region_allocated_bytes_ += bytes; + VLOG(1) << "Total allocated bytes: " + << strings::HumanReadableNumBytes(total_region_allocated_bytes_); + + VLOG(1) << "Allocated memory at " << mem_addr << " to " + << static_cast<void*>(static_cast<char*>(mem_addr) + bytes); + region_manager_.AddAllocationRegion(mem_addr, bytes); + + // Create one large chunk for the whole memory space that will + // be chunked later. + ChunkHandle h = AllocateChunk(); + BFCAllocator::Chunk* c = ChunkFromHandle(h); + c->ptr = mem_addr; + c->size = bytes; + c->allocation_id = -1; + c->prev = kInvalidChunkHandle; + c->next = kInvalidChunkHandle; + + region_manager_.set_handle(c->ptr, h); + + // TODO(vrv): Try to merge this new region with an existing region, + // if the address space is contiguous, to avoid fragmentation + // across regions. + + // Insert the chunk into the right bin. + InsertFreeChunkIntoBin(h); + + // Invoke visitors on newly allocated region. 
+ for (auto visitor : region_visitors_) { + visitor(mem_addr, bytes); + } + return true; +} + +BFCAllocator::ChunkHandle BFCAllocator::AllocateChunk() { + if (free_chunks_list_ != kInvalidChunkHandle) { + ChunkHandle h = free_chunks_list_; + Chunk* c = ChunkFromHandle(h); + free_chunks_list_ = c->next; + return h; + } else { + ChunkHandle h = chunks_.size(); + chunks_.resize(h + 1); + return h; + } +} + +void BFCAllocator::DeallocateChunk(ChunkHandle h) { + Chunk* c = ChunkFromHandle(h); + c->next = free_chunks_list_; + free_chunks_list_ = h; +} + +void* BFCAllocator::AllocateRaw(size_t unused_alignment, size_t num_bytes) { + // Fast path: Try once to allocate without getting the retry_helper_ involved + void* r = AllocateRawInternal(unused_alignment, num_bytes, false); + if (r != nullptr) { + return r; + } else { + static const int64 kMaxMillisToWait = 10000; // 10 seconds + return retry_helper_.AllocateRaw( + [this](size_t a, size_t nb, bool v) { + return AllocateRawInternal(a, nb, v); + }, + kMaxMillisToWait, unused_alignment, num_bytes); + } +} + +void* BFCAllocator::AllocateRaw(size_t unused_alignment, size_t num_bytes, + const AllocationAttributes& allocation_attr) { + if (allocation_attr.no_retry_on_failure) { + // Return immediately upon the first failure if this is for allocating an + // optional scratch space. + void* result = AllocateRawInternal(unused_alignment, num_bytes, false); + if (result == nullptr) { + // The counter incrementing is not thread-safe. But we don't really care. + // TODO(zhengxq): we should implement a LOG_FIRST_N and LOG_EVERY_N for + // more general usage. + static int log_counter = 0; + if (log_counter < 10) { + log_counter++; + LOG(WARNING) + << "Ran out of memory trying to allocate " + << strings::HumanReadableNumBytes(num_bytes) + << ". 
The caller indicates that this is not a failure, but" + << " may mean that there could be performance gains if more" + << " memory is available."; + } + } + return result; + } else { + return AllocateRaw(unused_alignment, num_bytes); + } +} + +// static +size_t BFCAllocator::RoundedBytes(size_t bytes) { + size_t rounded_bytes = + (kMinAllocationSize * + ((bytes + kMinAllocationSize - 1) / kMinAllocationSize)); + DCHECK_EQ(size_t{0}, rounded_bytes % kMinAllocationSize); + return rounded_bytes; +} + +void* BFCAllocator::AllocateRawInternal(size_t unused_alignment, + size_t num_bytes, + bool dump_log_on_failure) { + if (num_bytes == 0) { + LOG(ERROR) << "tried to allocate 0 bytes"; + return nullptr; + } + // First, always allocate memory of at least kMinAllocationSize + // bytes, and always allocate multiples of kMinAllocationSize bytes + // so all memory addresses are nicely byte aligned. + size_t rounded_bytes = RoundedBytes(num_bytes); + + // The BFC allocator tries to find the best fit first. + BinNum bin_num = BinNumForSize(rounded_bytes); + + mutex_lock l(lock_); + void* ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes); + if (ptr != nullptr) { + return ptr; + } + + // Try to extend + if (Extend(rounded_bytes)) { + ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes); + if (ptr != nullptr) { + return ptr; + } + } + + // We searched all bins for an existing free chunk to use and + // couldn't find one. This means we must have run out of memory, + // Dump the memory log for analysis. + if (dump_log_on_failure) { + DumpMemoryLog(rounded_bytes); + LOG(WARNING) << RenderOccupancy(); + LOG(WARNING) << "Ran out of memory trying to allocate " + << strings::HumanReadableNumBytes(num_bytes) + << ". See logs for memory state."; + } + return nullptr; +} + +void* BFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes, + size_t num_bytes) { + // First identify the first bin that could satisfy rounded_bytes. 
+ for (; bin_num < kNumBins; bin_num++) { + // Start searching from the first bin for the smallest chunk that fits + // rounded_bytes. + Bin* b = BinFromIndex(bin_num); + for (auto citer = b->free_chunks.begin(); citer != b->free_chunks.end(); + ++citer) { + const BFCAllocator::ChunkHandle h = (*citer); + BFCAllocator::Chunk* chunk = ChunkFromHandle(h); + DCHECK(!chunk->in_use()); + if (chunk->size >= rounded_bytes) { + // We found an existing chunk that fits us that wasn't in use, so remove + // it from the free bin structure prior to using. + RemoveFreeChunkIterFromBin(&b->free_chunks, citer); + + // If we can break the size of the chunk into two reasonably + // large pieces, do so. + // + // TODO(vrv): What should be the criteria when deciding when + // to split? + if (chunk->size >= rounded_bytes * 2) { + SplitChunk(h, rounded_bytes); + chunk = ChunkFromHandle(h); // Update chunk pointer in case it moved + } + + // The requested size of the returned chunk is what the user + // has allocated. + chunk->requested_size = num_bytes; + // Assign a unique id and increment the id counter, marking the + // chunk as being in use. + chunk->allocation_id = next_allocation_id_++; + + // Update stats. 
+ ++stats_.num_allocs; + stats_.bytes_in_use += chunk->size; + stats_.max_bytes_in_use = + std::max(stats_.max_bytes_in_use, stats_.bytes_in_use); + stats_.max_alloc_size = + std::max<std::size_t>(stats_.max_alloc_size, chunk->size); + + VLOG(4) << "Returning: " << chunk->ptr; + if (VLOG_IS_ON(4)) { + LOG(INFO) << "A: " << RenderOccupancy(); + } + return chunk->ptr; + } + } + } + + return nullptr; +} + +void BFCAllocator::SplitChunk(BFCAllocator::ChunkHandle h, size_t num_bytes) { + // Allocate the new chunk before we do any ChunkFromHandle + ChunkHandle h_new_chunk = AllocateChunk(); + + Chunk* c = ChunkFromHandle(h); + CHECK(!c->in_use() && (c->bin_num == kInvalidBinNum)); + + // Create a new chunk starting num_bytes after c + BFCAllocator::Chunk* new_chunk = ChunkFromHandle(h_new_chunk); + new_chunk->ptr = static_cast<void*>(static_cast<char*>(c->ptr) + num_bytes); + region_manager_.set_handle(new_chunk->ptr, h_new_chunk); + + // Set the new sizes of the chunks. + new_chunk->size = c->size - num_bytes; + c->size = num_bytes; + + // The new chunk is not in use. + new_chunk->allocation_id = -1; + + // Maintain the pointers. + // c <-> c_neighbor becomes + // c <-> new_chunk <-> c_neighbor + BFCAllocator::ChunkHandle h_neighbor = c->next; + new_chunk->prev = h; + new_chunk->next = h_neighbor; + c->next = h_new_chunk; + if (h_neighbor != kInvalidChunkHandle) { + Chunk* c_neighbor = ChunkFromHandle(h_neighbor); + c_neighbor->prev = h_new_chunk; + } + + // Add the newly free chunk to the free bin. + InsertFreeChunkIntoBin(h_new_chunk); +} + +void BFCAllocator::DeallocateRaw(void* ptr) { + DeallocateRawInternal(ptr); + retry_helper_.NotifyDealloc(); +} + +void BFCAllocator::DeallocateRawInternal(void* ptr) { + if (ptr == nullptr) { + LOG(ERROR) << "tried to deallocate nullptr"; + return; + } + mutex_lock l(lock_); + + // Find the chunk from the ptr. 
+ BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr); + CHECK(h != kInvalidChunkHandle); + + // Consider coalescing it. + FreeAndMaybeCoalesce(h); + + if (VLOG_IS_ON(4)) { + LOG(INFO) << "F: " << RenderOccupancy(); + } +} + +// Merges h1 and h2 when Chunk(h1)->next is h2 and Chunk(h2)->prev is h1. +// We merge Chunk(h2) into Chunk(h1). +void BFCAllocator::Merge(BFCAllocator::ChunkHandle h1, + BFCAllocator::ChunkHandle h2) { + Chunk* c1 = ChunkFromHandle(h1); + Chunk* c2 = ChunkFromHandle(h2); + // We can only merge chunks that are not in use. + CHECK(!c1->in_use() && !c2->in_use()); + + // c1's prev doesn't change, still points to the same ptr, and is + // still not in use. + + // Fix up neighbor pointers + // + // c1 <-> c2 <-> c3 should become + // c1 <-> c3 + + BFCAllocator::ChunkHandle h3 = c2->next; + c1->next = h3; + CHECK(c2->prev == h1); + if (h3 != kInvalidChunkHandle) { + BFCAllocator::Chunk* c3 = ChunkFromHandle(h3); + c3->prev = h1; + } + + // Set the new size + c1->size += c2->size; + + DeleteChunk(h2); +} + +void BFCAllocator::DeleteChunk(ChunkHandle h) { + // Delete h and cleanup all state + Chunk* c = ChunkFromHandle(h); + // VLOG(4) << "Removing: " << c->ptr; + region_manager_.erase(c->ptr); + DeallocateChunk(h); +} + +void BFCAllocator::InsertFreeChunkIntoBin(BFCAllocator::ChunkHandle h) { + Chunk* c = ChunkFromHandle(h); + CHECK(!c->in_use() && (c->bin_num == kInvalidBinNum)); + BinNum bin_num = BinNumForSize(c->size); + Bin* new_bin = BinFromIndex(bin_num); + c->bin_num = bin_num; + new_bin->free_chunks.insert(h); +} + +void BFCAllocator::RemoveFreeChunkIterFromBin( + BFCAllocator::Bin::FreeChunkSet* free_chunks, + const BFCAllocator::Bin::FreeChunkSet::iterator& citer) { + ChunkHandle h = *citer; + Chunk* c = ChunkFromHandle(h); + CHECK(!c->in_use() && (c->bin_num != kInvalidBinNum)); + free_chunks->erase(citer); + c->bin_num = kInvalidBinNum; +} + +void BFCAllocator::RemoveFreeChunkFromBin(BFCAllocator::ChunkHandle h) { + Chunk* c =
ChunkFromHandle(h); + CHECK(!c->in_use() && (c->bin_num != kInvalidBinNum)); + int count = BinFromIndex(c->bin_num)->free_chunks.erase(h); + CHECK(count > 0) << "Could not find chunk in bin"; + c->bin_num = kInvalidBinNum; +} + +void BFCAllocator::FreeAndMaybeCoalesce(BFCAllocator::ChunkHandle h) { + Chunk* c = ChunkFromHandle(h); + CHECK(c->in_use() && (c->bin_num == kInvalidBinNum)); + + // Mark the chunk as no longer in use + c->allocation_id = -1; + + // Updates the stats. + stats_.bytes_in_use -= c->size; + + // This chunk is no longer in-use, consider coalescing the chunk + // with adjacent chunks. + ChunkHandle chunk_to_reassign = h; + + // If the next chunk is free, coalesce the two + if (c->next != kInvalidChunkHandle) { + Chunk* cnext = ChunkFromHandle(c->next); + if (!cnext->in_use()) { + // VLOG(8) << "Chunk at " << cnext->ptr << " merging with c " << + // c->ptr; + + chunk_to_reassign = h; + + // Deletes c->next + RemoveFreeChunkFromBin(c->next); + Merge(h, ChunkFromHandle(h)->next); + } + } + + // If the previous chunk is free, coalesce the two + c = ChunkFromHandle(h); + if (c->prev != kInvalidChunkHandle) { + Chunk* cprev = ChunkFromHandle(c->prev); + if (!cprev->in_use()) { + // VLOG(8) << "Chunk at " << c->ptr << " merging into c->prev " + // << cprev->ptr; + + chunk_to_reassign = c->prev; + + // Deletes c + RemoveFreeChunkFromBin(c->prev); + Merge(ChunkFromHandle(h)->prev, h); + c = ChunkFromHandle(h); + } + } + + InsertFreeChunkIntoBin(chunk_to_reassign); +} + +void BFCAllocator::AddAllocVisitor(Visitor visitor) { + VLOG(1) << "AddVisitor"; + mutex_lock l(lock_); + region_visitors_.push_back(visitor); + for (const auto& region : region_manager_.regions()) { + visitor(region.ptr(), region.memory_size()); + } +} + +bool BFCAllocator::TracksAllocationSizes() { return true; } + +size_t BFCAllocator::RequestedSize(void* ptr) { + mutex_lock l(lock_); + BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr); + CHECK(h != kInvalidChunkHandle) + 
<< "Asked for requested size of pointer we never allocated: " << ptr; + BFCAllocator::Chunk* c = ChunkFromHandle(h); + return c->requested_size; +} + +size_t BFCAllocator::AllocatedSize(void* ptr) { + mutex_lock l(lock_); + BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr); + CHECK(h != kInvalidChunkHandle) + << "Asked for allocated size of pointer we never allocated: " << ptr; + BFCAllocator::Chunk* c = ChunkFromHandle(h); + return c->size; +} + +int64 BFCAllocator::AllocationId(void* ptr) { + mutex_lock l(lock_); + BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr); + CHECK(h != kInvalidChunkHandle) + << "Asked for allocation id of pointer we never allocated: " << ptr; + BFCAllocator::Chunk* c = ChunkFromHandle(h); + return c->allocation_id; +} + +namespace { + +void RenderRegion(char* rendered, const size_t resolution, + const size_t total_render_size, const size_t offset, + const void* base_ptr, const void* ptr, const size_t size, + const char c) { + const char* base_ptr_c = static_cast<const char*>(base_ptr); + const char* ptr_c = static_cast<const char*>(ptr); + + size_t start_location = + ((ptr_c - base_ptr_c + offset) * resolution) / total_render_size; + CHECK_GE(start_location, 0); + CHECK_LT(start_location, resolution); + size_t end_location = + ((ptr_c + size - 1 - base_ptr_c + offset) * resolution) / + total_render_size; + CHECK_GE(end_location, 0); + CHECK_LT(end_location, resolution); + + for (size_t i = start_location; i <= end_location; ++i) { + rendered[i] = c; + } +} + +} // namespace + +string BFCAllocator::RenderOccupancy() { + // Make a buffer for the ASCII-art representation. 
+ const size_t resolution = 100; + char rendered[resolution]; + + // Compute the total region size to render over + size_t total_region_size = 0; + for (const auto& region : region_manager_.regions()) { + total_region_size += region.memory_size(); + } + + // Start out with everything empty + RenderRegion(rendered, resolution, total_region_size, 0, nullptr, nullptr, + total_region_size, '_'); + + size_t region_offset = 0; + for (const auto& region : region_manager_.regions()) { + ChunkHandle h = region_manager_.get_handle(region.ptr()); + // Then render each chunk left to right. + while (h != kInvalidChunkHandle) { + Chunk* c = ChunkFromHandle(h); + if (c->in_use()) { + // Render the wasted space + size_t wasted = c->size - c->requested_size; + if (wasted > 0) { + RenderRegion(rendered, resolution, total_region_size, + region_offset + c->requested_size, region.ptr(), c->ptr, + wasted, 'x'); + } + // Then the occupied space + RenderRegion(rendered, resolution, total_region_size, region_offset, + region.ptr(), c->ptr, c->requested_size, '*'); + } + h = c->next; + } + region_offset += region.memory_size(); + } + + return StringPiece(rendered, resolution).ToString(); +} + +void BFCAllocator::DumpMemoryLog(size_t num_bytes) { + // For each bin: tally up the total number of chunks and bytes. + // Note that bins hold only free chunks. 
+ for (BinNum bin_num = 0; bin_num < kNumBins; bin_num++) { + Bin* b = BinFromIndex(bin_num); + + size_t total_bytes_in_use = 0; + size_t total_bytes_in_bin = 0; + size_t total_requested_bytes_in_use = 0; + size_t total_requested_bytes_in_bin = 0; + size_t total_chunks_in_use = 0; + size_t total_chunks_in_bin = 0; + for (ChunkHandle h : b->free_chunks) { + Chunk* c = ChunkFromHandle(h); + total_bytes_in_bin += c->size; + total_requested_bytes_in_bin += c->requested_size; + ++total_chunks_in_bin; + if (c->in_use()) { + total_bytes_in_use += c->size; + total_requested_bytes_in_use += c->requested_size; + ++total_chunks_in_use; + } + } + + LOG(INFO) << "Bin (" << b->bin_size + << "): \tTotal Chunks: " << total_chunks_in_bin + << ", Chunks in use: " << total_chunks_in_use << " " + << strings::HumanReadableNumBytes(total_bytes_in_bin) + << " allocated for chunks. " + << strings::HumanReadableNumBytes(total_requested_bytes_in_bin) + << " client-requested for chunks. " + << strings::HumanReadableNumBytes(total_bytes_in_use) + << " in use in bin. " + << strings::HumanReadableNumBytes(total_requested_bytes_in_use) + << " client-requested in use in bin."; + } + + // Find the bin that we would have liked to allocate in, so we + // can get some further analysis about fragmentation. + Bin* b = BinForSize(num_bytes); + + LOG(INFO) << "Bin for " << strings::HumanReadableNumBytes(num_bytes) + << " was " << strings::HumanReadableNumBytes(b->bin_size) + << ", Chunk State: "; + + for (ChunkHandle h : b->free_chunks) { + Chunk* c = ChunkFromHandle(h); + LOG(INFO) << c->DebugString(this, true); + } + + // Next show the chunks that are in use, and also summarize their + // number by size. 
+ std::map<size_t, int> in_use_by_size; + for (const auto& region : region_manager_.regions()) { + ChunkHandle h = region_manager_.get_handle(region.ptr()); + while (h != kInvalidChunkHandle) { + const Chunk* c = ChunkFromHandle(h); + if (c->in_use()) { + in_use_by_size[c->size]++; + LOG(INFO) << "Chunk at " << c->ptr << " of size " << c->size; + } + h = c->next; + } + + h = region_manager_.get_handle(region.ptr()); + while (h != kInvalidChunkHandle) { + const Chunk* c = ChunkFromHandle(h); + if (!c->in_use()) { + LOG(INFO) << "Free at " << c->ptr << " of size " << c->size; + } + h = c->next; + } + } + + LOG(INFO) << " Summary of in-use Chunks by size: "; + size_t total_bytes = 0; + for (auto& it : in_use_by_size) { + LOG(INFO) << it.second << " Chunks of size " << it.first << " totalling " + << strings::HumanReadableNumBytes(it.first * it.second); + total_bytes += (it.first * it.second); + } + LOG(INFO) << "Sum Total of in-use chunks: " + << strings::HumanReadableNumBytes(total_bytes); + LOG(INFO) << "Stats: \n" << stats_.DebugString(); +} + +void BFCAllocator::GetStats(AllocatorStats* stats) { + mutex_lock l(lock_); + *stats = stats_; +} + +} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h new file mode 100644 index 0000000000..1be804090a --- /dev/null +++ b/tensorflow/core/common_runtime/bfc_allocator.h @@ -0,0 +1,413 @@ +/* Copyright 2015 Google Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMMON_RUNTIME_BFC_ALLOCATOR_H_ +#define TENSORFLOW_COMMON_RUNTIME_BFC_ALLOCATOR_H_ + +#include <memory> +#include <string> +#include <unordered_map> +#include <vector> + +#include "tensorflow/core/common_runtime/allocator_retry.h" +#include "tensorflow/core/common_runtime/visitable_allocator.h" +#include "tensorflow/core/lib/gtl/stl_util.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/config.pb.h" + +namespace tensorflow { + +// A memory allocator that implements a 'best-fit with coalescing' +// algorithm. This is essentially a very simple version of Doug Lea's +// malloc (dlmalloc). +// +// The goal of this allocator is to support defragmentation via +// coalescing. One assumption we make is that the process using this +// allocator owns pretty much all of the memory, and that nearly +// all requests to allocate memory go through this interface. +class BFCAllocator : public VisitableAllocator { + public: + // Takes ownership of sub_allocator. + BFCAllocator(SubAllocator* sub_allocator, size_t total_memory, + bool allow_growth, const string& name); + ~BFCAllocator() override; + + string Name() override { return name_; } + void* AllocateRaw(size_t alignment, size_t num_bytes) override; + void* AllocateRaw(size_t alignment, size_t num_bytes, + const AllocationAttributes& allocation_attr) override; + void DeallocateRaw(void* ptr) override; + + void AddAllocVisitor(Visitor visitor) override; + + // Does nothing, because memory is never freed. 
+ void AddFreeVisitor(Visitor visitor) override {} + + bool TracksAllocationSizes() override; + + size_t RequestedSize(void* ptr) override; + + size_t AllocatedSize(void* ptr) override; + + int64 AllocationId(void* ptr) override; + + void GetStats(AllocatorStats* stats) override; + + private: + struct Bin; + + void* AllocateRawInternal(size_t alignment, size_t num_bytes, + bool dump_log_on_failure); + void DeallocateRawInternal(void* ptr); + + // A ChunkHandle is an index into the chunks_ vector in BFCAllocator + // kInvalidChunkHandle means an invalid chunk + typedef int ChunkHandle; + static const int kInvalidChunkHandle = -1; + + typedef int BinNum; + static const int kInvalidBinNum = -1; + static const int kNumBins = 21; + + // Chunks point to memory. Their prev/next pointers form a + // doubly-linked list of addresses sorted by base address that + // must be contiguous. Chunks contain information about whether + // they are in use or whether they are free, and contain a pointer + // to the bin they are in. + struct Chunk { + size_t size = 0; // Full size of buffer. + + // We sometimes give chunks that are larger than needed to reduce + // fragmentation. requested_size keeps track of what the client + // actually wanted so we can understand whether our splitting + // strategy is efficient. + size_t requested_size = 0; + + // allocation_id is set to -1 when the chunk is not in use. It is assigned a + // value greater than zero before the chunk is returned from + // AllocateRaw, and this value is unique among values assigned by + // the parent allocator. + int64 allocation_id = -1; + void* ptr = nullptr; // pointer to granted subbuffer. + + // If not kInvalidChunkHandle, the memory referred to by 'prev' is directly + // preceding the memory used by this chunk. 
E.g., It should start + // at 'ptr - prev->size' + ChunkHandle prev = kInvalidChunkHandle; + + // If not kInvalidChunkHandle, the memory referred to by 'next' is directly + // following the memory used by this chunk. E.g., It should be at + // 'ptr + size' + ChunkHandle next = kInvalidChunkHandle; + + // What bin are we in? + BinNum bin_num = kInvalidBinNum; + + bool in_use() const { return allocation_id != -1; } + + string DebugString(BFCAllocator* a, bool recurse) { + string dbg; + strings::StrAppend(&dbg, " Size: ", strings::HumanReadableNumBytes(size), + " | Requested Size: ", + strings::HumanReadableNumBytes(requested_size), + " | in_use: ", in_use()); + if (recurse && prev != BFCAllocator::kInvalidChunkHandle) { + Chunk* p = a->ChunkFromHandle(prev); + strings::StrAppend(&dbg, ", prev: ", p->DebugString(a, false)); + } + if (recurse && next != BFCAllocator::kInvalidChunkHandle) { + Chunk* n = a->ChunkFromHandle(next); + strings::StrAppend(&dbg, ", next: ", n->DebugString(a, false)); + } + return dbg; + } + }; + + // A Bin is a collection of similar-sized free chunks. + struct Bin { + // All chunks in this bin have >= bin_size memory. + size_t bin_size = 0; + + struct ChunkComparator { + explicit ChunkComparator(BFCAllocator* allocator) + : allocator_(allocator) {} + // Sort first by size and then use pointer address as a tie breaker. + bool operator()(const ChunkHandle ha, const ChunkHandle hb) const { + const Chunk* a = allocator_->ChunkFromHandle(ha); + const Chunk* b = allocator_->ChunkFromHandle(hb); + if (a->size != b->size) { + return a->size < b->size; + } + return a->ptr < b->ptr; + } + + private: + BFCAllocator* allocator_; // The parent allocator + }; + + typedef std::set<ChunkHandle, ChunkComparator> FreeChunkSet; + // List of free chunks within the bin, sorted by chunk size. + // Chunk * not owned. 
+    FreeChunkSet free_chunks;
+    Bin(BFCAllocator* allocator, size_t bs)
+        : bin_size(bs), free_chunks(ChunkComparator(allocator)) {}
+  };
+
+  // Granularity used by AllocationRegion below: one ChunkHandle slot is
+  // kept per kMinAllocationSize (1 << kMinAllocationBits == 256) bytes.
+  static const size_t kMinAllocationBits = 8;
+  static const size_t kMinAllocationSize = 1 << kMinAllocationBits;
+
+  // AllocationRegion maps pointers to ChunkHandles for a single
+  // contiguous memory region.
+  //
+  // The mapping is a flat table with one slot per kMinAllocationSize
+  // bytes of the region; a slot holds kInvalidChunkHandle until
+  // set_handle() stores a handle for that address.
+  //
+  // This class is thread-compatible.
+  class AllocationRegion {
+   public:
+    // Describes the region [ptr, ptr + memory_size) and allocates its
+    // handle table with every slot set to kInvalidChunkHandle.
+    // 'memory_size' is expected to be a multiple of kMinAllocationSize
+    // (checked in debug builds only).
+    AllocationRegion(void* ptr, size_t memory_size)
+        : ptr_(ptr),
+          memory_size_(memory_size),
+          end_ptr_(
+              static_cast<void*>(static_cast<char*>(ptr_) + memory_size_)) {
+      DCHECK_EQ(0, memory_size % kMinAllocationSize);
+      const size_t n_handles =
+          (memory_size + kMinAllocationSize - 1) / kMinAllocationSize;
+      handles_ = new ChunkHandle[n_handles];
+      for (size_t i = 0; i < n_handles; i++) {
+        handles_[i] = kInvalidChunkHandle;
+      }
+    }
+
+    // Creates an empty region: null pointers, zero size and no handle
+    // table (see the default member initializers below).
+    AllocationRegion() {}
+
+    // Releases only the handle table; the memory at ptr_ is not freed here.
+    ~AllocationRegion() { delete[] handles_; }
+
+    // Moves are implemented by swapping.  A move-constructed object starts
+    // out empty, so 'other' is left empty; after move-assignment 'other'
+    // holds this object's previous state and releases it when destroyed.
+    AllocationRegion(AllocationRegion&& other) { Swap(other); }
+
+    AllocationRegion& operator=(AllocationRegion&& other) {
+      Swap(other);
+      return *this;
+    }
+
+    // [ptr(), end_ptr()) is the address range covered by this region.
+    void* ptr() const { return ptr_; }
+    void* end_ptr() const { return end_ptr_; }
+    size_t memory_size() const { return memory_size_; }
+    // Handle accessors.  'p' must lie inside the region; IndexFor() only
+    // verifies this in debug builds.
+    ChunkHandle get_handle(const void* p) const {
+      return handles_[IndexFor(p)];
+    }
+    void set_handle(const void* p, ChunkHandle h) { handles_[IndexFor(p)] = h; }
+    void erase(const void* p) { set_handle(p, kInvalidChunkHandle); }
+
+   private:
+    // Exchanges all state with 'other'.
+    void Swap(AllocationRegion& other) {
+      std::swap(ptr_, other.ptr_);
+      std::swap(memory_size_, other.memory_size_);
+      std::swap(end_ptr_, other.end_ptr_);
+      std::swap(handles_, other.handles_);
+    }
+
+    // Returns the handle-table slot for 'p':
+    // (p - ptr_) >> kMinAllocationBits.  The result is narrowed to int.
+    int IndexFor(const void* p) const {
+      std::uintptr_t p_int = reinterpret_cast<std::uintptr_t>(p);
+      std::uintptr_t base_int = reinterpret_cast<std::uintptr_t>(ptr_);
+      DCHECK_GE(p_int, base_int);
+      DCHECK_LT(p_int, base_int + memory_size_);
+      return static_cast<int>(((p_int - base_int) >> kMinAllocationBits));
+    }
+
+    // Metadata about the allocation region.
+    void* ptr_ = nullptr;
+    size_t memory_size_ = 0;
+    void* end_ptr_ = nullptr;
+
+    // Array of size "memory_size / kMinAllocationSize".  It is
+    // indexed by (p-base) / kMinAllocationSize, contains ChunkHandle
+    // for the memory allocation represented by "p"
+    ChunkHandle* handles_ = nullptr;
+
+    TF_DISALLOW_COPY_AND_ASSIGN(AllocationRegion);
+  };
+
+  // RegionManager aggregates one or more "AllocationRegions" and provides
+  // a layer of indirection from pointers to the underlying ChunkHandle,
+  // allowing allocation across multiple discontiguous memory regions.
+  //
+  // This class is thread-compatible.
+  class RegionManager {
+   public:
+    RegionManager() {}
+    ~RegionManager() {}
+
+    // Registers the region [ptr, ptr + memory_size).  The new region is
+    // inserted in front of the first existing region whose end_ptr() is
+    // greater than 'ptr', or at the back if there is none.
+    void AddAllocationRegion(void* ptr, size_t memory_size) {
+      // Insert sorted by end_ptr
+      auto entry =
+          std::upper_bound(regions_.begin(), regions_.end(), ptr, &Comparator);
+      regions_.insert(entry, AllocationRegion(ptr, memory_size));
+    }
+
+    // The three accessors below forward to the region selected by
+    // RegionFor(p).
+    ChunkHandle get_handle(const void* p) const {
+      return RegionFor(p)->get_handle(p);
+    }
+
+    void set_handle(const void* p, ChunkHandle h) {
+      return MutableRegionFor(p)->set_handle(p, h);
+    }
+    void erase(const void* p) { return MutableRegionFor(p)->erase(p); }
+
+    const std::vector<AllocationRegion>& regions() const { return regions_; }
+
+   private:
+    // Ordering predicate for std::upper_bound, which calls it as
+    // (value, element): true when 'ptr' is below the end of 'other'.
+    static bool Comparator(const void* ptr, const AllocationRegion& other) {
+      return ptr < other.end_ptr();
+    }
+
+    // Non-const wrapper around RegionFor().
+    AllocationRegion* MutableRegionFor(const void* p) {
+      return const_cast<AllocationRegion*>(RegionFor(p));
+    }
+
+    // Returns the first region whose end_ptr() is greater than 'p', and
+    // logs FATAL when there is none.  There is no check here that 'p' is
+    // at or above that region's ptr(); only the DCHECKs in
+    // AllocationRegion::IndexFor cover that case.
+    const AllocationRegion* RegionFor(const void* p) const {
+      auto entry =
+          std::upper_bound(regions_.begin(), regions_.end(), p, &Comparator);
+
+      if (entry != regions_.end()) {
+        return &(*entry);
+      }
+
+      LOG(FATAL) << "Could not find Region for " << p;
+      return nullptr;
+    }
+
+   private:
+    // Kept in the order established by AddAllocationRegion().
+    std::vector<AllocationRegion> regions_;
+  };
+
+  // Returns 'bytes' rounded up to the next highest kMinAllocationSize.
+  size_t RoundedBytes(size_t bytes);
+
+  // NOTE: every helper below that is annotated with
+  // EXCLUSIVE_LOCKS_REQUIRED(lock_) must be called with lock_ held.
+
+  // Try to add a new memory region that can satisfy an allocation of
+  // 'rounded_bytes' bytes.  Returns true on success and false on
+  // failure.
+  bool Extend(size_t rounded_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_);
+
+  // Returns a pointer to an underlying allocated chunk of size
+  // 'rounded_bytes'.
+  // NOTE(review): in the GPUBFCAllocator::FindChunkPtr removed by this
+  // patch, the search started at 'bin_num' and moved to larger bins,
+  // 'num_bytes' was recorded as the chunk's requested_size, and nullptr
+  // was returned when nothing fit.  Presumably unchanged here -- confirm
+  // against bfc_allocator.cc.
+  void* FindChunkPtr(BinNum bin_num, size_t rounded_bytes, size_t num_bytes)
+      EXCLUSIVE_LOCKS_REQUIRED(lock_);
+
+  // Splits the chunk specified by 'h' into two chunks, one at least
+  // of size 'num_bytes'.
+  void SplitChunk(ChunkHandle h, size_t num_bytes)
+      EXCLUSIVE_LOCKS_REQUIRED(lock_);
+
+  // Merges the two chunk handles.  Requires that the chunks are
+  // contiguous in their allocation.
+  void Merge(ChunkHandle h, ChunkHandle h2) EXCLUSIVE_LOCKS_REQUIRED(lock_);
+
+  // Frees the memory represented by 'h', coalescing the chunk if
+  // possible.
+  void FreeAndMaybeCoalesce(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
+
+  // Adds the chunk 'h' to the proper free bin.
+  void InsertFreeChunkIntoBin(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
+
+  // Removes the free chunk pointed to by 'c' from the set free_chunks.
+  void RemoveFreeChunkIterFromBin(Bin::FreeChunkSet* free_chunks,
+                                  const Bin::FreeChunkSet::iterator& c)
+      EXCLUSIVE_LOCKS_REQUIRED(lock_);
+
+  // Removes a free chunk from the bin.
+  void RemoveFreeChunkFromBin(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
+
+  // Removes the chunk metadata represented by 'h'.
+  void DeleteChunk(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
+
+  string RenderOccupancy() EXCLUSIVE_LOCKS_REQUIRED(lock_);
+  void DumpMemoryLog(size_t num_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_);
+
+  ChunkHandle AllocateChunk() EXCLUSIVE_LOCKS_REQUIRED(lock_);
+  void DeallocateChunk(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
+
+  Chunk* ChunkFromHandle(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
+
+  AllocatorRetry retry_helper_;
+
+  // Structures immutable after construction
+  size_t memory_limit_ = 0;
+
+  // Returns floor(log2(n)), i.e. the index of the highest set bit of 'n'.
+  // Requires n != 0: __builtin_clzll(0) is undefined, and the portable
+  // fallback would return -1.
+  inline int Log2FloorNonZero(uint64 n) {
+#if defined(__GNUC__)
+    return 63 ^ __builtin_clzll(n);
+#else
+    // Portable fallback.  The loop counts the significant bits of 'n'
+    // (its bit width), which is floor(log2(n)) + 1, so one is subtracted
+    // to agree with the builtin path above.  Without the adjustment
+    // BinNumForSize() would select a bin one too high on compilers that
+    // do not define __GNUC__.
+    int r = 0;
+    while (n > 0) {
+      r++;
+      n >>= 1;
+    }
+    return r - 1;
+#endif
+  }
+
+  // Map from bin size to Bin
+  // Returns the 'index'-th Bin slot inside bins_space_.
+  // NOTE(review): the Bin objects are presumably placement-new'd into
+  // bins_space_ by the constructor (the GPUBFCAllocator constructor removed
+  // by this patch did so); a plain char array does not by itself guarantee
+  // alignof(Bin) -- confirm.
+  Bin* BinFromIndex(BinNum index) {
+    return reinterpret_cast<Bin*>(&(bins_space_[index * sizeof(Bin)]));
+  }
+  // Smallest size that maps to bin 'index': 256 << index.
+  // NOTE(review): 256 equals kMinAllocationSize.  The literal is presumably
+  // used because std::max takes its arguments by reference, which would
+  // odr-use the in-class static constant -- confirm before replacing it.
+  size_t BinNumToSize(BinNum index) {
+    return static_cast<size_t>(256) << index;
+  }
+  // Returns the bin for a size of 'bytes': floor(log2(bytes / 256)), with
+  // sizes below 256 treated as 256 and the result capped at kNumBins - 1.
+  BinNum BinNumForSize(size_t bytes) {
+    uint64 v = std::max<size_t>(bytes, 256) >> kMinAllocationBits;
+    int b = std::min(kNumBins - 1, Log2FloorNonZero(v));
+    return b;
+  }
+  Bin* BinForSize(size_t bytes) { return BinFromIndex(BinNumForSize(bytes)); }
+
+  // Raw storage for kNumBins Bin objects, addressed through BinFromIndex().
+  char bins_space_[sizeof(Bin) * kNumBins];
+
+  // The size of the current region allocation.
+  size_t curr_region_allocation_bytes_;
+
+  // The total number of allocated bytes by the allocator.
+  size_t total_region_allocated_bytes_ = 0;
+
+  // An indicator that expansion of a region has hit the limits
+  // of the available memory.
+  bool started_backpedal_ = false;
+
+  std::unique_ptr<SubAllocator> suballocator_;
+  string name_;
+
+  // Structures mutable after construction
+  mutable mutex lock_;
+  RegionManager region_manager_ GUARDED_BY(lock_);
+
+  std::vector<Chunk> chunks_;
+  ChunkHandle free_chunks_list_;  // Ptr to head of linked list of free Chunks
+
+  // Called once on each region, ASAP.
+ std::vector<Visitor> region_visitors_; + + // Counter containing the next unique identifier to assign to a + // newly-created chunk. + int64 next_allocation_id_ GUARDED_BY(lock_); + + // Stats. + AllocatorStats stats_ GUARDED_BY(lock_); + + TF_DISALLOW_COPY_AND_ASSIGN(BFCAllocator); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMMON_RUNTIME_BFC_ALLOCATOR_H_ diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc index 403cece230..47bd6c56ec 100644 --- a/tensorflow/core/common_runtime/function.cc +++ b/tensorflow/core/common_runtime/function.cc @@ -1170,37 +1170,44 @@ FunctionBody* SymbolicGradientHelper::Compute() { Copy(); Graph* g = gbody_->graph; + + const int num_y = gbody_->ret_nodes.size(); + + // Populate 'y_node_outputs_' with node function body outputs. // Populate 'y_grad_nodes' with initial gradient nodes for each return node of // the original function body (these will be 'arg' nodes in the function // gradient body). - const int num_y = gbody_->ret_nodes.size(); - std::vector<Node*> y_grad_nodes; - y_grad_nodes.reserve(num_y); + std::vector<NodeOut> y_node_outputs; + y_node_outputs.reserve(num_y); + std::vector<NodeOut> y_grad_node_outputs; + y_grad_node_outputs.reserve(num_y); for (int i = 0; i < num_y; ++i) { Node* y = gbody_->ret_nodes[i]; + y_node_outputs.push_back({y, 0}); DCHECK_EQ(y->type_string(), kRetOp); const DataType dtype = y->input_type(0); const int index = gbody_->arg_nodes.size(); Node* dy = AddArg(g, dtype, index); gbody_->arg_types.push_back(dtype); gbody_->arg_nodes.push_back(dy); - y_grad_nodes.push_back(dy); + y_grad_node_outputs.push_back({dy, 0}); } - // Populate 'x_nodes' with function args (not including 'y_grad_nodes'). + // Populate 'x_nodes' with function args (excluding 'y_grad_node_outputs'). 
const int num_x = fbody_->arg_nodes.size(); - std::vector<Node*> x_nodes; - x_nodes.reserve(num_x); + std::vector<NodeOut> x_node_outputs; + x_node_outputs.reserve(num_x); for (size_t i = 0; i < fbody_->arg_nodes.size(); ++i) { - x_nodes.push_back(gbody_->arg_nodes[i]); + x_node_outputs.push_back({gbody_->arg_nodes[i], 0}); } // Call AddSymbolicGradients which will add nodes to graph 'g' that - // compute the function gradient (adding an entry in 'x_grad_nodes' for - // each node in 'x_nodes'). - std::vector<GradNodeOutput> x_grad_nodes(x_nodes.size()); - TF_CHECK_OK(AddSymbolicGradients(gbody_->ret_nodes, x_nodes, y_grad_nodes, - &x_grad_nodes, g)); + // compute the function gradient (adding an entry in 'x_grad_node_outputs' for + // each node in 'x_node_outputs'). + std::vector<NodeOut> x_grad_node_outputs; + TF_CHECK_OK(AddSymbolicGradients(y_node_outputs, x_node_outputs, + y_grad_node_outputs, &x_grad_node_outputs, + g)); // Remove the old return nodes from the function body. for (Node* n : gbody_->ret_nodes) { @@ -1211,7 +1218,7 @@ FunctionBody* SymbolicGradientHelper::Compute() { // Add new return nodes to the function gradient body for each node // in 'x_grad_nodes'. for (size_t i = 0; i < fbody_->arg_types.size(); ++i) { - Endpoint grad = {x_grad_nodes[i].node, x_grad_nodes[i].index}; + Endpoint grad = {x_grad_node_outputs[i].node, x_grad_node_outputs[i].index}; Node* ret = AddRet(g, grad, i); gbody_->ret_nodes.push_back(ret); } diff --git a/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc b/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc index c701f80cec..a3ac2e1d67 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h" +#include "tensorflow/core/common_runtime/allocator_retry.h" #include <vector> #include "tensorflow/core/lib/core/notification.h" @@ -55,7 +55,7 @@ class FakeAllocator { } private: - GPUAllocatorRetry retry_; + AllocatorRetry retry_; void* good_ptr_ = reinterpret_cast<void*>(0xdeadbeef); mutex mu_; size_t memory_capacity_ GUARDED_BY(mu_); diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc index 03507cd948..33496154ec 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc @@ -15,17 +15,7 @@ limitations under the License. #include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h" -#include "tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h" #include "tensorflow/core/common_runtime/gpu/gpu_init.h" -#include "tensorflow/core/lib/core/bits.h" -#include "tensorflow/core/lib/gtl/stl_util.h" -#include "tensorflow/core/lib/strings/numbers.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/mutex.h" -#include "tensorflow/core/platform/stream_executor.h" -#include "tensorflow/core/platform/types.h" namespace gpu = ::perftools::gputools; @@ -36,680 +26,9 @@ GPUBFCAllocator::GPUBFCAllocator(int device_id, size_t total_memory) GPUBFCAllocator::GPUBFCAllocator(int device_id, size_t total_memory, const GPUOptions& gpu_options) - : device_id_(device_id), - free_chunks_list_(kInvalidChunkHandle), - next_allocation_id_(1) { - // Get a pointer to the stream_executor for this device - stream_exec_ = GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie(); - - if (gpu_options.allow_growth()) { - // 1MiB smallest initial allocation, unless 
total memory available - // is less. - curr_region_allocation_bytes_ = - RoundedBytes(std::min(total_memory, size_t{1048576})); - } else { - curr_region_allocation_bytes_ = RoundedBytes(total_memory); - } - - // Allocate the requested amount of memory. - gpu_memory_size_ = total_memory; - stats_.bytes_limit = static_cast<int64>(total_memory); - - // Create a bunch of bins of various good sizes. - - // We create bins to fit all possible ranges that cover the - // gpu_memory_size_ starting from allocations up to 256 bytes to - // allocations up to (and including) the memory limit. - for (BinNum b = 0; b < kNumBins; b++) { - size_t bin_size = BinNumToSize(b); - VLOG(1) << "Creating bin of max chunk size " - << strings::HumanReadableNumBytes(bin_size); - new (BinFromIndex(b)) Bin(this, bin_size); - CHECK_EQ(BinForSize(bin_size), BinFromIndex(b)); - CHECK_EQ(BinForSize(bin_size + 255), BinFromIndex(b)); - CHECK_EQ(BinForSize(bin_size * 2 - 1), BinFromIndex(b)); - if (b + 1 < kNumBins) { - CHECK_NE(BinForSize(bin_size * 2), BinFromIndex(b)); - } - } -} - -GPUBFCAllocator::~GPUBFCAllocator() { - // Return memory back. - VLOG(2) << "Number of regions allocated: " - << region_manager_.regions().size(); - for (const auto& region : region_manager_.regions()) { - gpu::DeviceMemoryBase gpu_ptr{region.ptr()}; - stream_exec_->Deallocate(&gpu_ptr); - } - - for (BinNum b = 0; b < kNumBins; b++) { - BinFromIndex(b)->~Bin(); - } -} - -GPUBFCAllocator::Chunk* GPUBFCAllocator::ChunkFromHandle(ChunkHandle h) { - DCHECK_GE(h, 0); - DCHECK_LT(h, static_cast<int>(chunks_.size())); - return &(chunks_[h]); -} - -bool GPUBFCAllocator::Extend(size_t rounded_bytes) { - // Do we have enough space to handle the client's request? - // If not, fail immediately. 
- if (total_region_allocated_bytes_ + rounded_bytes > gpu_memory_size_) { - return false; - } - - // If curr_region_allocation_bytes_ is not enough to satisfy the - // allocation, keep multiplying by a power of two until that is - // sufficient. - bool increased_allocation = false; - while (rounded_bytes > curr_region_allocation_bytes_) { - curr_region_allocation_bytes_ *= 2; - increased_allocation = true; - } - - // Try allocating. - size_t bytes = curr_region_allocation_bytes_; - gpu::DeviceMemory<char> gpu_mem = stream_exec_->AllocateArray<char>(bytes); - if (gpu_mem == nullptr && !started_backpedal_) { - // Only backpedal once. - started_backpedal_ = true; - - static constexpr float kBackpedalFactor = 0.9; - - // Try allocating less memory. - bytes = RoundedBytes(bytes * kBackpedalFactor); - while (gpu_mem == nullptr && bytes > rounded_bytes) { - gpu_mem = stream_exec_->AllocateArray<char>(bytes); - bytes = RoundedBytes(bytes * kBackpedalFactor); - } - } - - if (gpu_mem == nullptr) { - return false; - } - - if (!increased_allocation) { - // Increase the region size of the next required allocation. - curr_region_allocation_bytes_ *= 2; - } - - VLOG(1) << "Extending allocation by " << strings::HumanReadableNumBytes(bytes) - << " bytes."; - - total_region_allocated_bytes_ += bytes; - VLOG(1) << "Total allocated bytes: " - << strings::HumanReadableNumBytes(total_region_allocated_bytes_); - - void* gpu_mem_base = gpu_mem.opaque(); - VLOG(1) << "Allocated memory at " << gpu_mem_base << " to " - << static_cast<void*>(static_cast<char*>(gpu_mem_base) + bytes); - region_manager_.AddAllocationRegion(gpu_mem_base, bytes); - - // Create one large chunk for the whole memory space that will - // be chunked later. 
- ChunkHandle h = AllocateChunk(); - GPUBFCAllocator::Chunk* c = ChunkFromHandle(h); - c->ptr = gpu_mem_base; - c->size = bytes; - c->allocation_id = -1; - c->prev = kInvalidChunkHandle; - c->next = kInvalidChunkHandle; - - region_manager_.set_handle(c->ptr, h); - - // TODO(vrv): Try to merge this new region with an existing region, - // if the address space is contiguous, to avoid fragmentation - // across regions. - - // Insert the chunk into the right bin. - InsertFreeChunkIntoBin(h); - - // Invoke visitors on newly allocated region. - for (auto visitor : region_visitors_) { - visitor(gpu_mem_base, bytes); - } - return true; -} - -GPUBFCAllocator::ChunkHandle GPUBFCAllocator::AllocateChunk() { - if (free_chunks_list_ != kInvalidChunkHandle) { - ChunkHandle h = free_chunks_list_; - Chunk* c = ChunkFromHandle(h); - free_chunks_list_ = c->next; - return h; - } else { - ChunkHandle h = chunks_.size(); - chunks_.resize(h + 1); - return h; - } -} - -void GPUBFCAllocator::DeallocateChunk(ChunkHandle h) { - Chunk* c = ChunkFromHandle(h); - c->next = free_chunks_list_; - free_chunks_list_ = h; -} - -void* GPUBFCAllocator::AllocateRaw(size_t unused_alignment, size_t num_bytes) { - // Fast path: Try once to allocate without getting the retry_helper_ involved - void* r = AllocateRawInternal(unused_alignment, num_bytes, false); - if (r != nullptr) { - return r; - } else { - static const int64 kMaxMillisToWait = 10000; // 10 seconds - return retry_helper_.AllocateRaw( - [this](size_t a, size_t nb, bool v) { - return AllocateRawInternal(a, nb, v); - }, - kMaxMillisToWait, unused_alignment, num_bytes); - } -} - -void* GPUBFCAllocator::AllocateRaw( - size_t unused_alignment, size_t num_bytes, - const AllocationAttributes& allocation_attr) { - if (allocation_attr.no_retry_on_failure) { - // Return immediately upon the first failure if this is for allocating an - // optional scratch space. 
- void* result = AllocateRawInternal(unused_alignment, num_bytes, false); - if (result == nullptr) { - // The counter incrementing is not thread-safe. But we don't really care. - // TODO(zhengxq): we should implement a LOG_FIRST_N and LOG_EVERY_N for - // more general usage. - static int log_counter = 0; - if (log_counter < 10) { - log_counter++; - LOG(WARNING) - << "Ran out of memory trying to allocate " - << strings::HumanReadableNumBytes(num_bytes) - << ". The caller indicates that this is not a failure, but" - << " may mean that there could be performance gains if more" - << " memory is available."; - } - } - return result; - } else { - return AllocateRaw(unused_alignment, num_bytes); - } -} - -// static -size_t GPUBFCAllocator::RoundedBytes(size_t bytes) { - size_t rounded_bytes = - (kMinAllocationSize * - ((bytes + kMinAllocationSize - 1) / kMinAllocationSize)); - DCHECK_EQ(size_t{0}, rounded_bytes % kMinAllocationSize); - return rounded_bytes; -} - -void* GPUBFCAllocator::AllocateRawInternal(size_t unused_alignment, - size_t num_bytes, - bool dump_log_on_failure) { - if (num_bytes == 0) { - LOG(ERROR) << "tried to allocate 0 bytes"; - return nullptr; - } - // First, always allocate memory of at least kMinAllocationSize - // bytes, and always allocate multiples of kMinAllocationSize bytes - // so all memory addresses are nicely byte aligned. - size_t rounded_bytes = RoundedBytes(num_bytes); - - // The BFC allocator tries to find the best fit first. - BinNum bin_num = BinNumForSize(rounded_bytes); - - mutex_lock l(lock_); - void* ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes); - if (ptr != nullptr) { - return ptr; - } - - // Try to extend - if (Extend(rounded_bytes)) { - ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes); - if (ptr != nullptr) { - return ptr; - } - } - - // We searched all bins for an existing free chunk to use and - // couldn't find one. This means we must have run out of memory, - // Dump the memory log for analysis. 
- if (dump_log_on_failure) { - DumpMemoryLog(rounded_bytes); - LOG(WARNING) << RenderOccupancy(); - LOG(WARNING) << "Ran out of memory trying to allocate " - << strings::HumanReadableNumBytes(num_bytes) - << ". See logs for memory state."; - } - return nullptr; -} - -void* GPUBFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes, - size_t num_bytes) { - // First identify the first bin that could satisfy rounded_bytes. - for (; bin_num < kNumBins; bin_num++) { - // Start searching from the first bin for the smallest chunk that fits - // rounded_bytes. - Bin* b = BinFromIndex(bin_num); - for (auto citer = b->free_chunks.begin(); citer != b->free_chunks.end(); - ++citer) { - const GPUBFCAllocator::ChunkHandle h = (*citer); - GPUBFCAllocator::Chunk* chunk = ChunkFromHandle(h); - DCHECK(!chunk->in_use()); - if (chunk->size >= rounded_bytes) { - // We found an existing chunk that fits us that wasn't in use, so remove - // it from the free bin structure prior to using. - RemoveFreeChunkIterFromBin(&b->free_chunks, citer); - - // If we can break the size of the chunk into two reasonably - // large pieces, do so. - // - // TODO(vrv): What should be the criteria when deciding when - // to split? - if (chunk->size >= rounded_bytes * 2) { - SplitChunk(h, rounded_bytes); - chunk = ChunkFromHandle(h); // Update chunk pointer in case it moved - } - - // The requested size of the returned chunk is what the user - // has allocated. - chunk->requested_size = num_bytes; - // Assign a unique id and increment the id counter, marking the - // chunk as being in use. - chunk->allocation_id = next_allocation_id_++; - - // Update stats. 
- ++stats_.num_allocs; - stats_.bytes_in_use += chunk->size; - stats_.max_bytes_in_use = - std::max(stats_.max_bytes_in_use, stats_.bytes_in_use); - stats_.max_alloc_size = - std::max<std::size_t>(stats_.max_alloc_size, chunk->size); - - VLOG(4) << "Returning: " << chunk->ptr; - if (VLOG_IS_ON(4)) { - LOG(INFO) << "A: " << RenderOccupancy(); - } - return chunk->ptr; - } - } - } - - return nullptr; -} - -void GPUBFCAllocator::SplitChunk(GPUBFCAllocator::ChunkHandle h, - size_t num_bytes) { - // Allocate the new chunk before we do any ChunkFromHandle - ChunkHandle h_new_chunk = AllocateChunk(); - - Chunk* c = ChunkFromHandle(h); - CHECK(!c->in_use() && (c->bin_num == kInvalidBinNum)); - - // Create a new chunk starting num_bytes after c - GPUBFCAllocator::Chunk* new_chunk = ChunkFromHandle(h_new_chunk); - new_chunk->ptr = static_cast<void*>(static_cast<char*>(c->ptr) + num_bytes); - region_manager_.set_handle(new_chunk->ptr, h_new_chunk); - - // Set the new sizes of the chunks. - new_chunk->size = c->size - num_bytes; - c->size = num_bytes; - - // The new chunk is not in use. - new_chunk->allocation_id = -1; - - // Maintain the pointers. - // c <-> c_neighbor becomes - // c <-> new_chunk <-> c_neighbor - GPUBFCAllocator::ChunkHandle h_neighbor = c->next; - new_chunk->prev = h; - new_chunk->next = h_neighbor; - c->next = h_new_chunk; - if (h_neighbor != kInvalidChunkHandle) { - Chunk* c_neighbor = ChunkFromHandle(h_neighbor); - c_neighbor->prev = h_new_chunk; - } - - // Add the newly free chunk to the free bin. - InsertFreeChunkIntoBin(h_new_chunk); -} - -void GPUBFCAllocator::DeallocateRaw(void* ptr) { - DeallocateRawInternal(ptr); - retry_helper_.NotifyDealloc(); -} - -void GPUBFCAllocator::DeallocateRawInternal(void* ptr) { - if (ptr == nullptr) { - LOG(ERROR) << "tried to deallocate nullptr"; - return; - } - mutex_lock l(lock_); - - // Find the chunk from the ptr. 
- GPUBFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr); - CHECK(h != kInvalidChunkHandle); - - // Consider coalescing it. - FreeAndMaybeCoalesce(h); - - if (VLOG_IS_ON(4)) { - LOG(INFO) << "F: " << RenderOccupancy(); - } -} - -// Merges h1 and h2 when Chunk(h1)->next is h2 and Chunk(h2)->prev is c1. -// We merge Chunk(h2) into Chunk(h1). -void GPUBFCAllocator::Merge(GPUBFCAllocator::ChunkHandle h1, - GPUBFCAllocator::ChunkHandle h2) { - Chunk* c1 = ChunkFromHandle(h1); - Chunk* c2 = ChunkFromHandle(h2); - // We can only merge chunks that are not in use. - CHECK(!c1->in_use() && !c2->in_use()); - - // c1's prev doesn't change, still points to the same ptr, and is - // still not in use. - - // Fix up neighbor pointers - // - // c1 <-> c2 <-> c3 should become - // c1 <-> c3 - - GPUBFCAllocator::ChunkHandle h3 = c2->next; - c1->next = h3; - CHECK(c2->prev == h1); - if (h3 != kInvalidChunkHandle) { - GPUBFCAllocator::Chunk* c3 = ChunkFromHandle(h3); - c3->prev = h1; - } - - // Set the new size - c1->size += c2->size; - - DeleteChunk(h2); -} - -void GPUBFCAllocator::DeleteChunk(ChunkHandle h) { - // Delete h and cleanup all state - Chunk* c = ChunkFromHandle(h); - // VLOG(4) << "Removing: " << c->ptr; - region_manager_.erase(c->ptr); - DeallocateChunk(h); -} - -void GPUBFCAllocator::InsertFreeChunkIntoBin(GPUBFCAllocator::ChunkHandle h) { - Chunk* c = ChunkFromHandle(h); - CHECK(!c->in_use() && (c->bin_num == kInvalidBinNum)); - BinNum bin_num = BinNumForSize(c->size); - Bin* new_bin = BinFromIndex(bin_num); - c->bin_num = bin_num; - new_bin->free_chunks.insert(h); -} - -void GPUBFCAllocator::RemoveFreeChunkIterFromBin( - GPUBFCAllocator::Bin::FreeChunkSet* free_chunks, - const GPUBFCAllocator::Bin::FreeChunkSet::iterator& citer) { - ChunkHandle h = *citer; - Chunk* c = ChunkFromHandle(h); - CHECK(!c->in_use() && (c->bin_num != kInvalidBinNum)); - free_chunks->erase(citer); - c->bin_num = kInvalidBinNum; -} - -void 
GPUBFCAllocator::RemoveFreeChunkFromBin(GPUBFCAllocator::ChunkHandle h) { - Chunk* c = ChunkFromHandle(h); - CHECK(!c->in_use() && (c->bin_num != kInvalidBinNum)); - int count = BinFromIndex(c->bin_num)->free_chunks.erase(h); - CHECK(count > 0) << "Could not find chunk in bin"; - c->bin_num = kInvalidBinNum; -} - -void GPUBFCAllocator::FreeAndMaybeCoalesce(GPUBFCAllocator::ChunkHandle h) { - Chunk* c = ChunkFromHandle(h); - CHECK(c->in_use() && (c->bin_num == kInvalidBinNum)); - - // Mark the chunk as no longer in use - c->allocation_id = -1; - - // Updates the stats. - stats_.bytes_in_use -= c->size; - - // This chunk is no longer in-use, consider coalescing the chunk - // with adjacent chunks. - ChunkHandle chunk_to_reassign = h; - - // If the next chunk is free, coalesce the two - if (c->next != kInvalidChunkHandle) { - Chunk* cnext = ChunkFromHandle(c->next); - if (!cnext->in_use()) { - // VLOG(8) << "Chunk at " << cnext->ptr << " merging with c " << - // c->ptr; - - chunk_to_reassign = h; - - // Deletes c->next - RemoveFreeChunkFromBin(c->next); - Merge(h, ChunkFromHandle(h)->next); - } - } - - // If the previous chunk is free, coalesce the two - c = ChunkFromHandle(h); - if (c->prev != kInvalidChunkHandle) { - Chunk* cprev = ChunkFromHandle(c->prev); - if (!cprev->in_use()) { - // VLOG(8) << "Chunk at " << c->ptr << " merging into c->prev " - // << cprev->ptr; - - chunk_to_reassign = c->prev; - - // Deletes c - RemoveFreeChunkFromBin(c->prev); - Merge(ChunkFromHandle(h)->prev, h); - c = ChunkFromHandle(h); - } - } - - InsertFreeChunkIntoBin(chunk_to_reassign); -} - -void GPUBFCAllocator::AddAllocVisitor(Visitor visitor) { - VLOG(1) << "AddVisitor"; - mutex_lock l(lock_); - region_visitors_.push_back(visitor); - for (const auto& region : region_manager_.regions()) { - visitor(region.ptr(), region.memory_size()); - } -} - -bool GPUBFCAllocator::TracksAllocationSizes() { return true; } - -size_t GPUBFCAllocator::RequestedSize(void* ptr) { - mutex_lock l(lock_); 
- GPUBFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr); - CHECK(h != kInvalidChunkHandle) - << "Asked for requested size of pointer we never allocated: " << ptr; - GPUBFCAllocator::Chunk* c = ChunkFromHandle(h); - return c->requested_size; -} - -size_t GPUBFCAllocator::AllocatedSize(void* ptr) { - mutex_lock l(lock_); - GPUBFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr); - CHECK(h != kInvalidChunkHandle) - << "Asked for allocated size of pointer we never allocated: " << ptr; - GPUBFCAllocator::Chunk* c = ChunkFromHandle(h); - return c->size; -} - -int64 GPUBFCAllocator::AllocationId(void* ptr) { - mutex_lock l(lock_); - GPUBFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr); - CHECK(h != kInvalidChunkHandle) - << "Asked for allocation id of pointer we never allocated: " << ptr; - GPUBFCAllocator::Chunk* c = ChunkFromHandle(h); - return c->allocation_id; -} - -namespace { - -void RenderRegion(char* rendered, const size_t resolution, - const size_t total_render_size, const size_t offset, - const void* base_ptr, const void* ptr, const size_t size, - const char c) { - const char* base_ptr_c = static_cast<const char*>(base_ptr); - const char* ptr_c = static_cast<const char*>(ptr); - - size_t start_location = - ((ptr_c - base_ptr_c + offset) * resolution) / total_render_size; - CHECK_GE(start_location, 0); - CHECK_LT(start_location, resolution); - size_t end_location = - ((ptr_c + size - 1 - base_ptr_c + offset) * resolution) / - total_render_size; - CHECK_GE(end_location, 0); - CHECK_LT(end_location, resolution); - - for (size_t i = start_location; i <= end_location; ++i) { - rendered[i] = c; - } -} - -} // namespace - -string GPUBFCAllocator::RenderOccupancy() { - // Make a buffer for the ASCII-art representation. 
- const size_t resolution = 100; - char rendered[resolution]; - - // Compute the total region size to render over - size_t total_region_size = 0; - for (const auto& region : region_manager_.regions()) { - total_region_size += region.memory_size(); - } - - // Start out with everything empty - RenderRegion(rendered, resolution, total_region_size, 0, nullptr, nullptr, - total_region_size, '_'); - - size_t region_offset = 0; - for (const auto& region : region_manager_.regions()) { - ChunkHandle h = region_manager_.get_handle(region.ptr()); - // Then render each chunk left to right. - while (h != kInvalidChunkHandle) { - Chunk* c = ChunkFromHandle(h); - if (c->in_use()) { - // Render the wasted space - size_t wasted = c->size - c->requested_size; - if (wasted > 0) { - RenderRegion(rendered, resolution, total_region_size, - region_offset + c->requested_size, region.ptr(), c->ptr, - wasted, 'x'); - } - // Then the occupied space - RenderRegion(rendered, resolution, total_region_size, region_offset, - region.ptr(), c->ptr, c->requested_size, '*'); - } - h = c->next; - } - region_offset += region.memory_size(); - } - - return StringPiece(rendered, resolution).ToString(); -} - -void GPUBFCAllocator::DumpMemoryLog(size_t num_bytes) { - // For each bin: tally up the total number of chunks and bytes. - // Note that bins hold only free chunks. 
- for (BinNum bin_num = 0; bin_num < kNumBins; bin_num++) { - Bin* b = BinFromIndex(bin_num); - - size_t total_bytes_in_use = 0; - size_t total_bytes_in_bin = 0; - size_t total_requested_bytes_in_use = 0; - size_t total_requested_bytes_in_bin = 0; - size_t total_chunks_in_use = 0; - size_t total_chunks_in_bin = 0; - for (ChunkHandle h : b->free_chunks) { - Chunk* c = ChunkFromHandle(h); - total_bytes_in_bin += c->size; - total_requested_bytes_in_bin += c->requested_size; - ++total_chunks_in_bin; - if (c->in_use()) { - total_bytes_in_use += c->size; - total_requested_bytes_in_use += c->requested_size; - ++total_chunks_in_use; - } - } - - LOG(INFO) << "Bin (" << b->bin_size - << "): \tTotal Chunks: " << total_chunks_in_bin - << ", Chunks in use: " << total_chunks_in_use << " " - << strings::HumanReadableNumBytes(total_bytes_in_bin) - << " allocated for chunks. " - << strings::HumanReadableNumBytes(total_requested_bytes_in_bin) - << " client-requested for chunks. " - << strings::HumanReadableNumBytes(total_bytes_in_use) - << " in use in bin. " - << strings::HumanReadableNumBytes(total_requested_bytes_in_use) - << " client-requested in use in bin."; - } - - // Find the bin that we would have liked to allocate in, so we - // can get some further analysis about fragmentation. - Bin* b = BinForSize(num_bytes); - - LOG(INFO) << "Bin for " << strings::HumanReadableNumBytes(num_bytes) - << " was " << strings::HumanReadableNumBytes(b->bin_size) - << ", Chunk State: "; - - for (ChunkHandle h : b->free_chunks) { - Chunk* c = ChunkFromHandle(h); - LOG(INFO) << c->DebugString(this, true); - } - - // Next show the chunks that are in use, and also summarize their - // number by size. 
- std::map<size_t, int> in_use_by_size; - for (const auto& region : region_manager_.regions()) { - ChunkHandle h = region_manager_.get_handle(region.ptr()); - while (h != kInvalidChunkHandle) { - const Chunk* c = ChunkFromHandle(h); - if (c->in_use()) { - in_use_by_size[c->size]++; - LOG(INFO) << "Chunk at " << c->ptr << " of size " << c->size; - } - h = c->next; - } - - h = region_manager_.get_handle(region.ptr()); - while (h != kInvalidChunkHandle) { - const Chunk* c = ChunkFromHandle(h); - if (!c->in_use()) { - LOG(INFO) << "Free at " << c->ptr << " of size " << c->size; - } - h = c->next; - } - } - - LOG(INFO) << " Summary of in-use Chunks by size: "; - size_t total_bytes = 0; - for (auto& it : in_use_by_size) { - LOG(INFO) << it.second << " Chunks of size " << it.first << " totalling " - << strings::HumanReadableNumBytes(it.first * it.second); - total_bytes += (it.first * it.second); - } - LOG(INFO) << "Sum Total of in-use chunks: " - << strings::HumanReadableNumBytes(total_bytes); - LOG(INFO) << "Stats: \n" << stats_.DebugString(); -} - -void GPUBFCAllocator::GetStats(AllocatorStats* stats) { - mutex_lock l(lock_); - *stats = stats_; -} + : BFCAllocator( + new GPUMemAllocator( + GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie()), + total_memory, gpu_options.allow_growth(), "gpu_bfc") {} } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h index 2714fd3487..f94367cc98 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h +++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h @@ -21,394 +21,60 @@ limitations under the License. 
#include <unordered_map> #include <vector> -#include "tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h" -#include "tensorflow/core/common_runtime/gpu/visitable_allocator.h" -#include "tensorflow/core/lib/gtl/stl_util.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/platform/macros.h" -#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/common_runtime/allocator_retry.h" +#include "tensorflow/core/common_runtime/bfc_allocator.h" #include "tensorflow/core/platform/stream_executor.h" #include "tensorflow/core/platform/thread_annotations.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/protobuf/config.pb.h" +namespace gpu = ::perftools::gputools; + namespace tensorflow { // A GPU memory allocator that implements a 'best-fit with coalescing' -// algorithm. This is essentially a very simple version of Doug Lea's -// malloc (dlmalloc). -// -// The goal of this allocator is to support defragmentation via -// coalescing. One assumption we make is that the process using this -// allocator owns pretty much all of the GPU memory, and that nearly -// all requests to allocate GPU memory go through this interface. -class GPUBFCAllocator : public VisitableAllocator { +// algorithm. +class GPUBFCAllocator : public BFCAllocator { public: // 'device_id' refers to the StreamExecutor ID of the device within // the process and must reference a valid ID in the process. GPUBFCAllocator(int device_id, size_t total_memory); GPUBFCAllocator(int device_id, size_t total_memory, const GPUOptions& gpu_options); - ~GPUBFCAllocator() override; - - string Name() override { return "gpu_bfc"; } - void* AllocateRaw(size_t alignment, size_t num_bytes) override; - void* AllocateRaw(size_t alignment, size_t num_bytes, - const AllocationAttributes& allocation_attr) override; - void DeallocateRaw(void* ptr) override; - - void AddAllocVisitor(Visitor visitor) override; - - // Does nothing, because gpu memory is never freed. 
- void AddFreeVisitor(Visitor visitor) override {} - - bool TracksAllocationSizes() override; - - size_t RequestedSize(void* ptr) override; - - size_t AllocatedSize(void* ptr) override; - - int64 AllocationId(void* ptr) override; - - void GetStats(AllocatorStats* stats) override; - - private: - struct Bin; - - void* AllocateRawInternal(size_t alignment, size_t num_bytes, - bool dump_log_on_failure); - void DeallocateRawInternal(void* ptr); - - // A ChunkHandle is an index into the chunks_ vector in GPUBFCAllocator - // kInvalidChunkHandle means an invalid chunk - typedef int ChunkHandle; - static const int kInvalidChunkHandle = -1; - - typedef int BinNum; - static const int kInvalidBinNum = -1; - static const int kNumBins = 21; - - // Chunks point to GPU memory. Their prev/next pointers form a - // doubly-linked list of addresses sorted by GPU base address that - // must be contiguous. Chunks contain information about whether - // they are in use or whether they are free, and contain a pointer - // to the bin they are in. - struct Chunk { - size_t size = 0; // Full size of GPU buffer. - - // We sometimes give chunks that are larger than needed to reduce - // fragmentation. requested_size keeps track of what the client - // actually wanted so we can understand whether our splitting - // strategy is efficient. - size_t requested_size = 0; - - // allocation_id is set to -1 when the chunk is not in use. It is assigned a - // value greater than zero before the chunk is returned from - // AllocateRaw, and this value is unique among values assigned by - // the parent allocator. - int64 allocation_id = -1; - void* ptr = nullptr; // pointer to granted GPU subbuffer. - - // If not kInvalidChunkHandle, the memory referred to by 'prev' is directly - // preceding the memory used by this chunk. 
E.g., It should start - // at 'ptr - prev->size' - ChunkHandle prev = kInvalidChunkHandle; - - // If not kInvalidChunkHandle, the memory referred to by 'next' is directly - // following the memory used by this chunk. E.g., It should be at - // 'ptr + size' - ChunkHandle next = kInvalidChunkHandle; - - // What bin are we in? - BinNum bin_num = kInvalidBinNum; - - bool in_use() const { return allocation_id != -1; } - - string DebugString(GPUBFCAllocator* a, bool recurse) { - string dbg; - strings::StrAppend(&dbg, " Size: ", strings::HumanReadableNumBytes(size), - " | Requested Size: ", - strings::HumanReadableNumBytes(requested_size), - " | in_use: ", in_use()); - if (recurse && prev != GPUBFCAllocator::kInvalidChunkHandle) { - Chunk* p = a->ChunkFromHandle(prev); - strings::StrAppend(&dbg, ", prev: ", p->DebugString(a, false)); - } - if (recurse && next != GPUBFCAllocator::kInvalidChunkHandle) { - Chunk* n = a->ChunkFromHandle(next); - strings::StrAppend(&dbg, ", next: ", n->DebugString(a, false)); - } - return dbg; - } - }; + virtual ~GPUBFCAllocator() {} - // A Bin is a collection of similar-sized free chunks. - struct Bin { - // All chunks in this bin have >= bin_size memory. - size_t bin_size = 0; - - struct ChunkComparator { - explicit ChunkComparator(GPUBFCAllocator* allocator) - : allocator_(allocator) {} - // Sort first by size and then use pointer address as a tie breaker. - bool operator()(const ChunkHandle ha, const ChunkHandle hb) const { - const Chunk* a = allocator_->ChunkFromHandle(ha); - const Chunk* b = allocator_->ChunkFromHandle(hb); - if (a->size != b->size) { - return a->size < b->size; - } - return a->ptr < b->ptr; - } - - private: - GPUBFCAllocator* allocator_; // The parent allocator - }; - - typedef std::set<ChunkHandle, ChunkComparator> FreeChunkSet; - // List of free chunks within the bin, sorted by chunk size. - // Chunk * not owned. 
- FreeChunkSet free_chunks; - Bin(GPUBFCAllocator* allocator, size_t bs) - : bin_size(bs), free_chunks(ChunkComparator(allocator)) {} - }; - - static const size_t kMinAllocationBits = 8; - static const size_t kMinAllocationSize = 1 << kMinAllocationBits; - - // AllocationRegion maps pointers to ChunkHandles for a single - // contiguous memory region. - // - // This class is thread-compatible. - class AllocationRegion { - public: - AllocationRegion(void* ptr, size_t memory_size) - : ptr_(ptr), - memory_size_(memory_size), - end_ptr_( - static_cast<void*>(static_cast<char*>(ptr_) + memory_size_)) { - DCHECK_EQ(0, memory_size % kMinAllocationSize); - const size_t n_handles = - (memory_size + kMinAllocationSize - 1) / kMinAllocationSize; - handles_ = new ChunkHandle[n_handles]; - for (size_t i = 0; i < n_handles; i++) { - handles_[i] = kInvalidChunkHandle; - } - } - - AllocationRegion() {} - - ~AllocationRegion() { delete[] handles_; } - - AllocationRegion(AllocationRegion&& other) { Swap(other); } - - AllocationRegion& operator=(AllocationRegion&& other) { - Swap(other); - return *this; - } - - void* ptr() const { return ptr_; } - void* end_ptr() const { return end_ptr_; } - size_t memory_size() const { return memory_size_; } - ChunkHandle get_handle(const void* p) const { - return handles_[IndexFor(p)]; - } - void set_handle(const void* p, ChunkHandle h) { handles_[IndexFor(p)] = h; } - void erase(const void* p) { set_handle(p, kInvalidChunkHandle); } - - private: - void Swap(AllocationRegion& other) { - std::swap(ptr_, other.ptr_); - std::swap(memory_size_, other.memory_size_); - std::swap(end_ptr_, other.end_ptr_); - std::swap(handles_, other.handles_); - } - - int IndexFor(const void* p) const { - std::uintptr_t p_int = reinterpret_cast<std::uintptr_t>(p); - std::uintptr_t base_int = reinterpret_cast<std::uintptr_t>(ptr_); - DCHECK_GE(p_int, base_int); - DCHECK_LT(p_int, base_int + memory_size_); - return static_cast<int>(((p_int - base_int) >> 
kMinAllocationBits)); - } - - // Metadata about the allocation region. - void* ptr_ = nullptr; - size_t memory_size_ = 0; - void* end_ptr_ = nullptr; - - // Array of size "memory_size / kMinAllocationSize". It is - // indexed by (p-base) / kMinAllocationSize, contains ChunkHandle - // for the memory allocation represented by "p" - ChunkHandle* handles_ = nullptr; - - TF_DISALLOW_COPY_AND_ASSIGN(AllocationRegion); - }; - - // RegionManager aggregates one or more "AllocationRegions" and provides - // a layer of indirection from pointers to the underlying ChunkHandle, - // allowing allocation across multiple discontiguous memory regions. - // - // This class is thread-compatible. - class RegionManager { - public: - RegionManager() {} - ~RegionManager() {} - - void AddAllocationRegion(void* ptr, size_t memory_size) { - // Insert sorted by end_ptr - auto entry = - std::upper_bound(regions_.begin(), regions_.end(), ptr, &Comparator); - regions_.insert(entry, AllocationRegion(ptr, memory_size)); - } - - ChunkHandle get_handle(const void* p) const { - return RegionFor(p)->get_handle(p); - } - - void set_handle(const void* p, ChunkHandle h) { - return MutableRegionFor(p)->set_handle(p, h); - } - void erase(const void* p) { return MutableRegionFor(p)->erase(p); } - - const std::vector<AllocationRegion>& regions() const { return regions_; } - - private: - static bool Comparator(const void* ptr, const AllocationRegion& other) { - return ptr < other.end_ptr(); - } - - AllocationRegion* MutableRegionFor(const void* p) { - return const_cast<AllocationRegion*>(RegionFor(p)); - } - - const AllocationRegion* RegionFor(const void* p) const { - auto entry = - std::upper_bound(regions_.begin(), regions_.end(), p, &Comparator); - - if (entry != regions_.end()) { - return &(*entry); - } - - LOG(FATAL) << "Could not find Region for " << p; - return nullptr; - } - - private: - std::vector<AllocationRegion> regions_; - }; - - // Returns 'bytes' rounded up to the next highest 
kMinAllocationSize. - size_t RoundedBytes(size_t bytes); - - // Try to add a new memory region that can satisfy an allocation of - // 'rounded_bytes' bytes. Returns true on success and false on - // failure. - bool Extend(size_t rounded_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_); - - // Returns a pointer to an underlying allocated chunk of size - // 'rounded_bytes'. - void* FindChunkPtr(BinNum bin_num, size_t rounded_bytes, size_t num_bytes) - EXCLUSIVE_LOCKS_REQUIRED(lock_); - - // Splits the chunk specified by 'h' into two chunks, one at least - // of size 'num_bytes'. - void SplitChunk(ChunkHandle h, size_t num_bytes) - EXCLUSIVE_LOCKS_REQUIRED(lock_); - - // Merges the two chunk handles. Requires that the chunks are - // contiguous in their allocation. - void Merge(ChunkHandle h, ChunkHandle h2) EXCLUSIVE_LOCKS_REQUIRED(lock_); - - // Frees the memory represented by 'h', coalescing the chunk if - // possible. - void FreeAndMaybeCoalesce(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_); - - // Adds the chunk 'h' to the proper free bin. - void InsertFreeChunkIntoBin(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_); - - // Removes the free chunk pointed to by 'c' from the set free_chunks. - void RemoveFreeChunkIterFromBin(Bin::FreeChunkSet* free_chunks, - const Bin::FreeChunkSet::iterator& c) - EXCLUSIVE_LOCKS_REQUIRED(lock_); - - // Removes a free chunk from the bin. - void RemoveFreeChunkFromBin(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_); - - // Removes the chunk metadata represented by 'h'. 
- void DeleteChunk(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_); - - string RenderOccupancy() EXCLUSIVE_LOCKS_REQUIRED(lock_); - void DumpMemoryLog(size_t num_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_); - - ChunkHandle AllocateChunk() EXCLUSIVE_LOCKS_REQUIRED(lock_); - void DeallocateChunk(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_); - - Chunk* ChunkFromHandle(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_); + TF_DISALLOW_COPY_AND_ASSIGN(GPUBFCAllocator); +}; - GPUAllocatorRetry retry_helper_; +// Suballocator for GPU memory. +class GPUMemAllocator : public SubAllocator { + public: + // Note: stream_exec cannot be null. + explicit GPUMemAllocator(perftools::gputools::StreamExecutor* stream_exec) + : stream_exec_(stream_exec) { + CHECK(stream_exec_ != nullptr); + } + ~GPUMemAllocator() override {} - // Structures immutable after construction - const int device_id_; - size_t gpu_memory_size_ = 0; - inline int Log2FloorNonZero(uint64 n) { -#if defined(__GNUC__) - return 63 ^ __builtin_clzll(n); -#else - int r = 0; - while (n > 0) { - r++; - n >>= 1; + void* Alloc(size_t alignment, size_t num_bytes) override { + void* ptr = nullptr; + if (num_bytes > 0) { + ptr = stream_exec_->AllocateArray<char>(num_bytes).opaque(); } - return r; -#endif + return ptr; } - // Map from bin size to Bin - Bin* BinFromIndex(BinNum index) { - return reinterpret_cast<Bin*>(&(bins_space_[index * sizeof(Bin)])); - } - size_t BinNumToSize(BinNum index) { - return static_cast<size_t>(256) << index; - } - BinNum BinNumForSize(size_t bytes) { - uint64 v = std::max<size_t>(bytes, 256) >> kMinAllocationBits; - int b = std::min(kNumBins - 1, Log2FloorNonZero(v)); - return b; + void Free(void* ptr, size_t num_bytes) override { + if (ptr != nullptr) { + gpu::DeviceMemoryBase gpu_ptr(ptr); + stream_exec_->Deallocate(&gpu_ptr); + } } - Bin* BinForSize(size_t bytes) { return BinFromIndex(BinNumForSize(bytes)); } - char bins_space_[sizeof(Bin) * kNumBins]; - - perftools::gputools::StreamExecutor* 
stream_exec_; // Not owned. - - // The size of the current region allocation. - size_t curr_region_allocation_bytes_; - - // The total number of allocated bytes by the allocator. - size_t total_region_allocated_bytes_ = 0; - - // An indicator that expansion of a region has hit the limits - // of the available GPU memory. - bool started_backpedal_ = false; - - // Structures mutable after construction - mutable mutex lock_; - RegionManager region_manager_ GUARDED_BY(lock_); - - std::vector<Chunk> chunks_; - ChunkHandle free_chunks_list_; // Ptr to head of linked list of free Chunks - - // Called once on each region, ASAP. - std::vector<Visitor> region_visitors_; - - // Counter containing the next unique identifier to assign to a - // newly-created chunk. - int64 next_allocation_id_ GUARDED_BY(lock_); - - // Stats. - AllocatorStats stats_ GUARDED_BY(lock_); + private: + perftools::gputools::StreamExecutor* stream_exec_; // not owned, non-null - TF_DISALLOW_COPY_AND_ASSIGN(GPUBFCAllocator); + TF_DISALLOW_COPY_AND_ASSIGN(GPUMemAllocator); }; } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h index 0dfa97bb63..58ea42ea1b 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h +++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h @@ -20,7 +20,7 @@ limitations under the License. 
#include <string> #include <unordered_map> -#include "tensorflow/core/common_runtime/gpu/visitable_allocator.h" +#include "tensorflow/core/common_runtime/visitable_allocator.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/stream_executor.h" #include "tensorflow/core/platform/types.h" diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc index f52f0078b0..4e102e823f 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc @@ -226,30 +226,6 @@ TEST(EventMgr, ManySmallTensorsSeparateCallsFlushed) { } } -// Running the polling loop should clear the queue, without an explict -// poll call here, given a moderate delay. -TEST(EventMgr, LongDelayedPolling) { - auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie(); - EventMgr em(stream_exec, GPUOptions()); - TEST_EventMgrHelper th(&em); - EXPECT_EQ(0, th.queue_size()); - EXPECT_EQ(0, th.free_size()); - std::unique_ptr<gpu::Stream> stream(new gpu::Stream(stream_exec)); - CHECK(stream.get()); - stream->Init(); - for (int i = 0; i < 5; ++i) { - TensorReferenceVector* v = new TensorReferenceVector; - AddTensorReference(v, 100 * 1048576); - th.QueueTensors(stream.get(), v); - EXPECT_EQ(1 + i, th.queue_size()); - EXPECT_EQ(0, th.free_size()); - } - th.StartPollingLoop(); - sleep(1); - EXPECT_EQ(0, th.queue_size()); - EXPECT_EQ(5, th.free_size()); -} - // Deleting the EventMgr when events are still pending should shut // down gracefully. TEST(EventMgr, NonEmptyShutdown) { diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator.h b/tensorflow/core/common_runtime/gpu/pool_allocator.h index c3fc53ea62..d8838ab7f4 100644 --- a/tensorflow/core/common_runtime/gpu/pool_allocator.h +++ b/tensorflow/core/common_runtime/gpu/pool_allocator.h @@ -24,7 +24,7 @@ limitations under the License. 
#include <map> #include <memory> #include <vector> -#include "tensorflow/core/common_runtime/gpu/visitable_allocator.h" +#include "tensorflow/core/common_runtime/visitable_allocator.h" #include "tensorflow/core/lib/core/bits.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" @@ -35,14 +35,6 @@ limitations under the License. namespace tensorflow { -// Interface of an object that does the underlying alloc/free of memory. -class SubAllocator { - public: - virtual ~SubAllocator() {} - virtual void* Alloc(size_t alignment, size_t num_bytes) = 0; - virtual void Free(void* ptr, size_t num_bytes) = 0; -}; - // Interface of an object that rounds up integers. class RoundUpInterface { public: diff --git a/tensorflow/core/common_runtime/gpu/process_state.cc b/tensorflow/core/common_runtime/gpu/process_state.cc index e4f66c3b4b..67e10f7c05 100644 --- a/tensorflow/core/common_runtime/gpu/process_state.cc +++ b/tensorflow/core/common_runtime/gpu/process_state.cc @@ -187,9 +187,17 @@ Allocator* ProcessState::GetCUDAHostAllocator(int numa_node) { gpu::Platform* gpu_platform = GPUMachineManager(); gpu::StreamExecutor* se = gpu_platform->ExecutorForDevice(0).ValueOrDie(); CHECK(se); - Allocator* allocator = new PoolAllocator( - 100 /*pool_size_limit*/, true /*auto_resize*/, - new CUDAHostAllocator(se), new Pow2Rounder, "cuda_host"); + Allocator* allocator = nullptr; + static constexpr bool kCudaHostMemoryUseBFC = true; + if (kCudaHostMemoryUseBFC) { + allocator = + new BFCAllocator(new CUDAHostAllocator(se), 1LL << 36 /*64GB max*/, + true /*allow_growth*/, "cuda_host_bfc" /*name*/); + } else { + allocator = new PoolAllocator( + 100 /*pool_size_limit*/, true /*auto_resize*/, + new CUDAHostAllocator(se), new Pow2Rounder, "cuda_host"); + } if (LogMemory::IsEnabled()) { // Wrap the allocator to track allocation ids for better logging // at the cost of performance. 
diff --git a/tensorflow/core/common_runtime/simple_placer.cc b/tensorflow/core/common_runtime/simple_placer.cc index 5414b75fff..1f4ccf7096 100644 --- a/tensorflow/core/common_runtime/simple_placer.cc +++ b/tensorflow/core/common_runtime/simple_placer.cc @@ -315,11 +315,20 @@ class ColocationGraph { device_set_->FindMatchingDevices(specified_device_name, &devices_matching_nodedef); if (devices_matching_nodedef.empty()) { + // Sometimes it is almost impossible to understand the problem + // without a list of available devices. + std::vector<string> device_names; + for (const Device* device : device_set_->devices()) { + device_names.push_back(device->name()); + } + std::sort(device_names.begin(), device_names.end()); + return errors::InvalidArgument( "Could not satisfy explicit device specification '", node->def().device(), "' because no devices matching that specification " - "are registered in this process"); + "are registered in this process; available devices: ", + str_util::Join(device_names, ", ")); } else if (specified_device_name.has_type) { return errors::InvalidArgument( "Could not satisfy explicit device specification '", diff --git a/tensorflow/core/common_runtime/gpu/visitable_allocator.h b/tensorflow/core/common_runtime/visitable_allocator.h index b0b5ec3bd9..17582a2915 100644 --- a/tensorflow/core/common_runtime/gpu/visitable_allocator.h +++ b/tensorflow/core/common_runtime/visitable_allocator.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_VISITABLE_ALLOCATOR_H_ -#define TENSORFLOW_COMMON_RUNTIME_GPU_VISITABLE_ALLOCATOR_H_ +#ifndef TENSORFLOW_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_ +#define TENSORFLOW_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_ #include <functional> #include "tensorflow/core/framework/allocator.h" @@ -42,4 +42,4 @@ class VisitableAllocator : public Allocator { virtual void AddFreeVisitor(Visitor visitor) = 0; }; } // namespace tensorflow -#endif // TENSORFLOW_COMMON_RUNTIME_GPU_VISITABLE_ALLOCATOR_H_ +#endif // TENSORFLOW_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_ diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h index 30c7c19102..9eddf53e3d 100644 --- a/tensorflow/core/framework/allocator.h +++ b/tensorflow/core/framework/allocator.h @@ -292,6 +292,15 @@ Allocator* cpu_allocator(); // AllocatorStats. By default, it's disabled. void EnableCPUAllocatorStats(bool enable); +// Abstract interface of an object that does the underlying suballoc/free of +// memory for a higher-level allocator. 
+class SubAllocator { + public: + virtual ~SubAllocator() {} + virtual void* Alloc(size_t alignment, size_t num_bytes) = 0; + virtual void Free(void* ptr, size_t num_bytes) = 0; +}; + } // namespace tensorflow #endif // TENSORFLOW_FRAMEWORK_ALLOCATOR_H_ diff --git a/tensorflow/core/framework/allocator_test.cc b/tensorflow/core/framework/allocator_test.cc index 2abe47cafd..f97a9b6dcf 100644 --- a/tensorflow/core/framework/allocator_test.cc +++ b/tensorflow/core/framework/allocator_test.cc @@ -38,6 +38,26 @@ static void CheckStats(Allocator* a, int64 num_allocs, int64 bytes_in_use, #endif } +TEST(AllocatorAttributesTest, AllCombos) { + for (bool on_host : {false, true}) { + for (bool nic_compatible : {false, true}) { + for (bool gpu_compatible : {false, true}) { + for (bool track_sizes : {false, true}) { + AllocatorAttributes aa; + aa.set_on_host(on_host); + aa.set_nic_compatible(nic_compatible); + aa.set_gpu_compatible(gpu_compatible); + aa.set_track_sizes(track_sizes); + EXPECT_EQ(on_host, aa.on_host()); + EXPECT_EQ(nic_compatible, aa.nic_compatible()); + EXPECT_EQ(gpu_compatible, aa.gpu_compatible()); + EXPECT_EQ(track_sizes, aa.track_sizes()); + } + } + } + } +} + TEST(CPUAllocatorTest, Simple) { EnableCPUAllocatorStats(true); Allocator* a = cpu_allocator(); diff --git a/tensorflow/core/graph/gradients.cc b/tensorflow/core/graph/gradients.cc index 2646370a76..1c902d29a0 100644 --- a/tensorflow/core/graph/gradients.cc +++ b/tensorflow/core/graph/gradients.cc @@ -40,37 +40,30 @@ static const char* const kRetOp = "_Retval"; static const char* const kGradientOp = "SymbolicGradient"; static const char* const kNodeLabel = "Func"; -// Represents the index-th output of a node. -struct Endpoint { - Node* node; - int index; - - // Returns the string name represents this endpoint. 
- string name() const { - if (index == 0) { - return node->name(); - } else { - return strings::StrCat(node->name(), ":", index); - } +string NodeOut::name() const { + if (index == 0) { + return node->name(); + } else { + return strings::StrCat(node->name(), ":", index); } +} - DataType dtype() const { return node->output_type(index); } -}; +DataType NodeOut::dtype() const { return node->output_type(index); } -struct EndpointHash { - uint64 operator()(const Endpoint& x) const { +struct NodeOutHash { + uint64 operator()(const NodeOut& x) const { return Hash64(reinterpret_cast<const char*>(&x.node), sizeof(Node*), x.index); } }; -struct EndpointEq { - bool operator()(const Endpoint& x, const Endpoint& y) const { +struct NodeOutEq { + bool operator()(const NodeOut& x, const NodeOut& y) const { return (x.node == y.node) && (x.index == y.index); } }; -static Node* AddZerosLike(Graph* g, Endpoint input) { +static Node* AddZerosLike(Graph* g, NodeOut input) { DCHECK_LT(0, input.dtype()); DCHECK_LT(input.dtype(), DT_FLOAT_REF); NodeDef ndef; @@ -85,7 +78,7 @@ static Node* AddZerosLike(Graph* g, Endpoint input) { return ret; } -static Node* AddSymGrad(Graph* g, Node* n, gtl::ArraySlice<Endpoint> grads) { +static Node* AddSymGrad(Graph* g, Node* n, gtl::ArraySlice<NodeOut> grads) { const int num_x = n->num_inputs(); const int num_y = n->num_outputs(); CHECK_EQ(num_y, grads.size()); @@ -95,19 +88,19 @@ static Node* AddSymGrad(Graph* g, Node* n, gtl::ArraySlice<Endpoint> grads) { ndef.set_op(kGradientOp); // The gradient node should have num_x + num_y inputs. 
- std::vector<Endpoint> n_inputs(num_x); + std::vector<NodeOut> n_inputs(num_x); for (const Edge* e : n->in_edges()) { if (e->IsControlEdge()) continue; n_inputs[e->dst_input()] = {e->src(), e->src_output()}; } DataTypeVector in_types; - for (const Endpoint& ep : n_inputs) { - ndef.add_input(ep.name()); - in_types.push_back(ep.dtype()); + for (const NodeOut& nout : n_inputs) { + ndef.add_input(nout.name()); + in_types.push_back(nout.dtype()); } - for (const Endpoint& ep : grads) { - ndef.add_input(ep.name()); - in_types.push_back(ep.dtype()); + for (const NodeOut& nout : grads) { + ndef.add_input(nout.name()); + in_types.push_back(nout.dtype()); } CHECK_EQ(ndef.input_size(), num_x + num_y); @@ -128,34 +121,34 @@ static Node* AddSymGrad(Graph* g, Node* n, gtl::ArraySlice<Endpoint> grads) { class SymbolicGradientBuilder { public: - SymbolicGradientBuilder(gtl::ArraySlice<Node*> y_nodes, - gtl::ArraySlice<Node*> x_nodes, - gtl::ArraySlice<Node*> y_grad_nodes, - std::vector<GradNodeOutput>* x_grad_nodes, + SymbolicGradientBuilder(gtl::ArraySlice<NodeOut> y_node_outputs, + gtl::ArraySlice<NodeOut> x_node_outputs, + gtl::ArraySlice<NodeOut> y_grad_node_outputs, + std::vector<NodeOut>* x_grad_node_outputs, Graph* graph); Status Compute(); private: - gtl::ArraySlice<Node*> y_nodes_; - gtl::ArraySlice<Node*> x_nodes_; - gtl::ArraySlice<Node*> y_grad_nodes_; - std::vector<GradNodeOutput>* x_grad_nodes_; + gtl::ArraySlice<NodeOut> y_node_outputs_; + gtl::ArraySlice<NodeOut> x_node_outputs_; + gtl::ArraySlice<NodeOut> y_grad_node_outputs_; + std::vector<NodeOut>* x_grad_node_outputs_; Graph* graph_; // Not owned. // A vector of output endpoints which represents backpropagated // gradients - typedef std::vector<Endpoint> BackpropedGradients; + typedef std::vector<NodeOut> BackpropedGradients; - // backprops_ is a map from an output endpoint to its accumulated - // gradients. 
When an output endpoint has accumulated all its + // backprops_ is a map from a node output to its accumulated + // gradients. When a node output has accumulated all its // gradients, we add a node which sums them up. - std::unordered_map<Endpoint, BackpropedGradients, EndpointHash, EndpointEq> + std::unordered_map<NodeOut, BackpropedGradients, NodeOutHash, NodeOutEq> backprops_; // pending[i] is count-down counter for i-th node's expected // backprops. When pending[i] becomes zero, we collected all - // backprop gradients for all output endpoint of the ith-node. + // backprop gradients for all outputs of the ith-node. std::vector<int> pending_; // 'ready' keeps track of nodes that have been completely @@ -163,7 +156,8 @@ class SymbolicGradientBuilder { // add dy as an input of the gradient function. std::deque<Node*> ready_; - // The set of nodes at which to stop backprop (and populate 'x_grad_nodes_'). + // The set of nodes at which to stop backprop. + // Maps from node.id -> index of 'x_node_outputs_' std::unordered_map<int, int> stop_nodes_; // Initialize pending_ and ready_. @@ -173,33 +167,35 @@ class SymbolicGradientBuilder { // to 'dst', when the backprop algorithm constructs the node // 'dst_grad' which computes the gradient, we need to propagate it // to 'src'. 
- void BackpropAlongEdge(const Endpoint& dst_grad, const Endpoint& src); - void BackpropZerosAlongEdge(const Endpoint& src); + void BackpropAlongEdge(const NodeOut& dst_grad, const NodeOut& src); + void BackpropZerosAlongEdge(const NodeOut& src); - Endpoint SumGradients(const Endpoint& src); + NodeOut SumGradients(const NodeOut& src); TF_DISALLOW_COPY_AND_ASSIGN(SymbolicGradientBuilder); }; SymbolicGradientBuilder::SymbolicGradientBuilder( - gtl::ArraySlice<Node*> y_nodes, - gtl::ArraySlice<Node*> x_nodes, - gtl::ArraySlice<Node*> y_grad_nodes, - std::vector<GradNodeOutput>* x_grad_nodes, - Graph* graph) : y_nodes_(y_nodes), x_nodes_(x_nodes), - y_grad_nodes_(y_grad_nodes), x_grad_nodes_(x_grad_nodes), - graph_(graph) { - CHECK_EQ(y_nodes_.size(), y_grad_nodes.size()); - x_grad_nodes_->clear(); - x_grad_nodes_->resize(x_nodes_.size()); - stop_nodes_.reserve(x_nodes_.size()); - for (int i = 0; i < x_nodes_.size(); ++i) { - stop_nodes_.insert(std::make_pair(x_nodes_[i]->id(), i)); + gtl::ArraySlice<NodeOut> y_node_outputs, + gtl::ArraySlice<NodeOut> x_node_outputs, + gtl::ArraySlice<NodeOut> y_grad_node_outputs, + std::vector<NodeOut>* x_grad_node_outputs, Graph* graph) + : y_node_outputs_(y_node_outputs), + x_node_outputs_(x_node_outputs), + y_grad_node_outputs_(y_grad_node_outputs), + x_grad_node_outputs_(x_grad_node_outputs), + graph_(graph) { + CHECK_EQ(y_node_outputs_.size(), y_grad_node_outputs.size()); + x_grad_node_outputs_->clear(); + x_grad_node_outputs_->resize(x_node_outputs_.size()); + stop_nodes_.reserve(x_node_outputs_.size()); + for (int i = 0; i < x_node_outputs_.size(); ++i) { + stop_nodes_.insert(std::make_pair(x_node_outputs_[i].node->id(), i)); } } -void SymbolicGradientBuilder::BackpropAlongEdge(const Endpoint& dst_grad, - const Endpoint& src) { +void SymbolicGradientBuilder::BackpropAlongEdge(const NodeOut& dst_grad, + const NodeOut& src) { CHECK_NOTNULL(src.node); auto iter = backprops_.find(src); if (iter != backprops_.end()) { @@ -211,7 
+207,7 @@ void SymbolicGradientBuilder::BackpropAlongEdge(const Endpoint& dst_grad, } } -void SymbolicGradientBuilder::BackpropZerosAlongEdge(const Endpoint& src) { +void SymbolicGradientBuilder::BackpropZerosAlongEdge(const NodeOut& src) { CHECK_NOTNULL(src.node); auto iter = backprops_.find(src); if (iter != backprops_.end()) { @@ -227,9 +223,9 @@ void SymbolicGradientBuilder::InitBackprop() { backprops_.clear(); std::unordered_set<Node*> visited; std::deque<Node*> queue; - for (Node* n : x_nodes_) { - queue.push_back(n); - visited.insert(n); + for (const NodeOut& nout : x_node_outputs_) { + queue.push_back(nout.node); + visited.insert(nout.node); } // Going forward to figure out which endpoints need backprop-ed. @@ -255,20 +251,19 @@ void SymbolicGradientBuilder::InitBackprop() { } { - const int num_y = y_grad_nodes_.size(); + const int num_y = y_grad_node_outputs_.size(); for (int i = 0; i < num_y; ++i) { - Node* y = y_nodes_[i]; - Node* dy = y_grad_nodes_[i]; + Node* y = y_node_outputs_[i].node; for (const Edge* e : y->in_edges()) { if (e->IsControlEdge()) continue; - BackpropAlongEdge({dy, e->dst_input()}, {e->src(), e->src_output()}); + BackpropAlongEdge(y_grad_node_outputs_[i], {e->src(), e->src_output()}); } } } CHECK(!ready_.empty()); } -Endpoint SymbolicGradientBuilder::SumGradients(const Endpoint& src) { +NodeOut SymbolicGradientBuilder::SumGradients(const NodeOut& src) { const DataType dtype = src.dtype(); auto iter = backprops_.find(src); CHECK(iter != backprops_.end()); @@ -286,8 +281,8 @@ Endpoint SymbolicGradientBuilder::SumGradients(const Endpoint& src) { NodeDef ndef; ndef.set_name(graph_->NewName(kNodeLabel)); ndef.set_op("AddN"); // N-way Add - for (const Endpoint& ep : grads) { - ndef.add_input(ep.name()); + for (const NodeOut& nout : grads) { + ndef.add_input(nout.name()); } AddNodeAttr("N", static_cast<int64>(grads.size()), &ndef); AddNodeAttr("T", dtype, &ndef); @@ -295,8 +290,8 @@ Endpoint SymbolicGradientBuilder::SumGradients(const 
Endpoint& src) { Node* add = graph_->AddNode(ndef, &s); TF_CHECK_OK(s); for (size_t i = 0; i < grads.size(); ++i) { - const Endpoint& ep = grads[i]; - graph_->AddEdge(ep.node, ep.index, add, i); + const NodeOut& nout = grads[i]; + graph_->AddEdge(nout.node, nout.index, add, i); } return {add, 0}; } @@ -312,7 +307,7 @@ Status SymbolicGradientBuilder::Compute() { InitBackprop(); // Backward propagation. - gtl::InlinedVector<Endpoint, 8> dy; + gtl::InlinedVector<NodeOut, 8> dy; while (!ready_.empty()) { // n has collected all gradients. Node* n = ready_.front(); @@ -324,11 +319,11 @@ Status SymbolicGradientBuilder::Compute() { auto iter = stop_nodes_.find(n->id()); if (iter != stop_nodes_.end()) { - // Stop backprop and add gradient sum to 'x_grad_nodes'. + // Stop backprop and add gradient sum to 'x_grad_node_outputs_'. // TODO(andydavis) Support stop nodes with more than one output. CHECK_EQ(1, num_y); - Endpoint grad = SumGradients({n, 0}); - (*x_grad_nodes_)[iter->second] = {grad.node, grad.index}; + const int index = iter->second; + (*x_grad_node_outputs_)[index] = SumGradients(x_node_outputs_[index]); continue; } @@ -350,6 +345,7 @@ Status SymbolicGradientBuilder::Compute() { // Adds a gradient node with num_x + num_y inputs and num_x // outputs. + // TODO(andydavis) Support primitive gradient ops. 
Node* grad = AddSymGrad(graph_, n, dy); for (const Edge* e : n->in_edges()) { if (e->IsControlEdge()) continue; @@ -369,12 +365,13 @@ Status SymbolicGradientBuilder::Compute() { return Status::OK(); } -Status AddSymbolicGradients(gtl::ArraySlice<Node*> y_nodes, - gtl::ArraySlice<Node*> x_nodes, - gtl::ArraySlice<Node*> y_grad_nodes, - std::vector<GradNodeOutput>* x_grad_nodes, +Status AddSymbolicGradients(gtl::ArraySlice<NodeOut> y_node_outputs, + gtl::ArraySlice<NodeOut> x_node_outputs, + gtl::ArraySlice<NodeOut> y_grad_node_outputs, + std::vector<NodeOut>* x_grad_node_outputs, Graph* graph) { - SymbolicGradientBuilder builder(y_nodes, x_nodes, y_grad_nodes, x_grad_nodes, + SymbolicGradientBuilder builder(y_node_outputs, x_node_outputs, + y_grad_node_outputs, x_grad_node_outputs, graph); return builder.Compute(); } diff --git a/tensorflow/core/graph/gradients.h b/tensorflow/core/graph/gradients.h index bc18fd7cf2..a7d9613d79 100644 --- a/tensorflow/core/graph/gradients.h +++ b/tensorflow/core/graph/gradients.h @@ -16,40 +16,41 @@ limitations under the License. #ifndef THIRD_PARTY_TENSORFLOW_CORE_GRAPH_GRADIENTS_H_ #define THIRD_PARTY_TENSORFLOW_CORE_GRAPH_GRADIENTS_H_ +#include "tensorflow/core/graph/graph.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/gtl/array_slice.h" namespace tensorflow { -// GradNodeOutput represents a single gradient node output. -struct GradNodeOutput { +// Represents the output of 'node' at 'index'. +struct NodeOut { Node* node; int index; + + // Returns the string name that represents the output of this node. + string name() const; + // Returns the data type of the output of this node. + DataType dtype() const; }; // NOTE: This API is a work in progress and will likely be changing frequently. 
// -// Given initial gradient nodes 'y_grad_nodes' (which compute the symbolic -// partial derivatives of some loss function 'L' w.r.t the inputs of each -// node in 'y_nodes'), adds gradient nodes to 'graph' that compute the sum -// of all gradients flowing into the single output of each node in 'x_nodes'. -// Note that gradient nodes will not be added to 'graph' which compute -// the symbolic partial derivative of 'L' w.r.t. each node in 'x_nodes' (i.e. -// backprop will stop at these nodes). This restriction will be lifted in -// a subsequent CL. +// Given initial gradient-node outputs 'y_grad_node_outputs' (which compute the +// symbolic partial derivatives of some loss function 'L' w.r.t the node outputs +// 'y_node_outputs'), adds gradient nodes to 'graph' that compute the symbolic +// partial derivatives of 'L' w.r.t the node outputs 'x_node_outputs'. // -// REQUIRES: Each node in 'x_nodes' must have a single output (this -// restriction will be removed in a subsequent change). +// REQUIRES: Each node in 'x_node_outputs' to be unique, and so to have a single +// output (this restriction will be removed in a subsequent change). -// TODO(andydavis) Add support for returning 'x_node' gradients by endpoint -// (i.e. {node, index}). // TODO(andydavis) Add symbolic gradient support for general graphs (the current // implementation only supports gradients for functions). In particular, // the nodes in 'x_nodes' are currently restricted to have one output. 
-Status AddSymbolicGradients(gtl::ArraySlice<Node*> y_nodes, - gtl::ArraySlice<Node*> x_nodes, - gtl::ArraySlice<Node*> y_grad_nodes, - std::vector<GradNodeOutput>* x_grad_nodes, + +Status AddSymbolicGradients(gtl::ArraySlice<NodeOut> y_node_outputs, + gtl::ArraySlice<NodeOut> x_node_outputs, + gtl::ArraySlice<NodeOut> y_grad_node_outputs, + std::vector<NodeOut>* x_grad_node_outputs, Graph* graph); } // namespace tensorflow diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index ebda2a2a6d..6b9e093baf 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -214,6 +214,21 @@ cc_header_only_library( deps = [":bounds_check"], ) +cc_library( + name = "image_resizer_state", + hdrs = ["image_resizer_state.h"], + visibility = ["//visibility:private"], + deps = [ + "//tensorflow/core:lib", + "//third_party/eigen3", + ], +) + +cc_header_only_library( + name = "image_resizer_state_lib", + deps = [":image_resizer_state"], +) + # OpKernel libraries ---------------------------------------------------------- tf_kernel_libraries( @@ -221,7 +236,6 @@ tf_kernel_libraries( prefixes = [ "bcast_ops", "bitcast_op", - "depthtospace_op", "concat_op", "constant_op", "diag_op", @@ -239,7 +253,6 @@ tf_kernel_libraries( "reverse_sequence_op", "shape_ops", "slice_op", - "spacetodepth_op", "split_op", "tile_ops", "transpose_op", @@ -250,6 +263,7 @@ tf_kernel_libraries( deps = [ ":bounds_check", ":concat_lib", + ":depth_space_ops", ":fill_functor", ":ops_util", ":split_lib", @@ -545,6 +559,7 @@ tf_kernel_libraries( "sample_distorted_bounding_box_op", ], deps = [ + ":image_resizer_state", "//tensorflow/core:framework", "//tensorflow/core:image_ops_op_lib", "//tensorflow/core:lib", @@ -830,6 +845,31 @@ tf_kernel_library( ], ) +tf_kernel_library( + name = "depth_space_ops", + srcs = [ + "depthtospace_op.cc", + "spacetodepth_op.cc", + ], + hdrs = [ + "depthtospace_op.h", + "spacetodepth_op.h", + ], + gpu_srcs = [ + "depthtospace_op.h", + 
"depthtospace_op_gpu.cu.cc", + "spacetodepth_op.h", + "spacetodepth_op_gpu.cu.cc", + ], + visibility = ["//visibility:private"], + deps = [ + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//third_party/eigen3", + ], + alwayslink = 0, +) + tf_kernel_libraries( name = "parsing", prefixes = [ @@ -1062,6 +1102,7 @@ filegroup( "slice_op.h", "softmax_op.cc", "softmax_op.h", + "softmax_op_functor.h", "split_lib.h", "split_lib_cpu.cc", "split_op.cc", @@ -1095,10 +1136,12 @@ filegroup( "batch_norm_op.h", "control_flow_ops.h", "conv_2d.h", + "image_resizer_state.h", "maxpooling_op.h", "reduction_ops.h", "reduction_ops_common.h", "relu_op.h", + "relu_op_functor.h", "save_restore_tensor.h", "softplus_op.h", "softsign_op.h", diff --git a/tensorflow/core/kernels/batch_matmul_op.cc b/tensorflow/core/kernels/batch_matmul_op.cc index 306ae6d38c..f5a64e1f46 100644 --- a/tensorflow/core/kernels/batch_matmul_op.cc +++ b/tensorflow/core/kernels/batch_matmul_op.cc @@ -113,6 +113,39 @@ perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) { perftools::gputools::DeviceMemory<T> typed(wrapped); return typed; } + +class CublasScratchAllocator : public perftools::gputools::ScratchAllocator { + public: + using Stream = ::perftools::gputools::Stream; + using DeviceMemoryBytes = ::perftools::gputools::DeviceMemory<uint8>; + + CublasScratchAllocator(OpKernelContext* context) : context_(context) {} + + int64 GetMemoryLimitInBytes(Stream* stream) override { return -1; } + + perftools::gputools::port::StatusOr<DeviceMemoryBytes> AllocateBytes( + Stream* stream, int64 byte_size) override { + Tensor temporary_memory; + + Status allocation_status(context_->allocate_temp( + DT_UINT8, TensorShape({byte_size}), &temporary_memory)); + if (!allocation_status.ok()) { + return perftools::gputools::port::StatusOr<DeviceMemoryBytes>( + DeviceMemoryBytes::MakeFromByteSize(nullptr, 0)); + } + // Hold the reference of the allocated tensors until the end of the + // allocator. 
+ allocated_tensors_.push_back(temporary_memory); + return perftools::gputools::port::StatusOr<DeviceMemoryBytes>( + DeviceMemoryBytes::MakeFromByteSize( + temporary_memory.flat<uint8>().data(), + temporary_memory.flat<uint8>().size())); + } + + private: + OpKernelContext* context_; + std::vector<Tensor> allocated_tensors_; +}; } // namespace template <typename Scalar> @@ -162,12 +195,14 @@ struct LaunchBatchMatMul<GPUDevice, Scalar> { // where A, B and C are assumed to be in column major. // We want the output to be in row-major, so we can compute // C' = B' x A' (' stands for transpose) + CublasScratchAllocator scratch_allocator(context); bool blas_launch_status = - stream->ThenBlasGemmBatched(blas_transpose_b, blas_transpose_a, n, m, k, - static_cast<Scalar>(1.0), b_ptrs, - adj_y ? k : n, a_ptrs, adj_x ? m : k, - static_cast<Scalar>(0.0), c_ptrs, n, - batch_size) + stream + ->ThenBlasGemmBatchedWithScratch( + blas_transpose_b, blas_transpose_a, n, m, k, + static_cast<Scalar>(1.0), b_ptrs, adj_y ? k : n, a_ptrs, + adj_x ? m : k, static_cast<Scalar>(0.0), c_ptrs, n, batch_size, + &scratch_allocator) .ok(); if (!blas_launch_status) { context->SetStatus(errors::Internal( @@ -265,9 +300,7 @@ REGISTER_CPU(int32); REGISTER_CPU(complex64); #ifdef GOOGLE_CUDA -// TODO(kalakris): The GPU implementation is currently disabled due to issues -// encountered in practice. See b/24534272. 
-// REGISTER_GPU(float); +REGISTER_GPU(float); #endif // GOOGLE_CUDA #undef REGISTER_CPU diff --git a/tensorflow/core/kernels/decode_csv_op.cc b/tensorflow/core/kernels/decode_csv_op.cc index 60f0474103..0e70bc31e8 100644 --- a/tensorflow/core/kernels/decode_csv_op.cc +++ b/tensorflow/core/kernels/decode_csv_op.cc @@ -45,7 +45,7 @@ class DecodeCSVOp : public OpKernel { OP_REQUIRES_OK(ctx, ctx->input("records", &records)); OP_REQUIRES_OK(ctx, ctx->input_list("record_defaults", &record_defaults)); - for (int i = 0; i < record_defaults.size(); ++i) { + for (int64 i = 0; i < record_defaults.size(); ++i) { OP_REQUIRES(ctx, record_defaults[i].NumElements() < 2, errors::InvalidArgument( "There should only be 1 default per field but field ", i, @@ -53,7 +53,7 @@ class DecodeCSVOp : public OpKernel { } auto records_t = records->flat<string>(); - int records_size = records_t.size(); + int64 records_size = records_t.size(); OpOutputList output; OP_REQUIRES_OK(ctx, ctx->output_list("output", &output)); @@ -63,7 +63,7 @@ class DecodeCSVOp : public OpKernel { output.allocate(i, records->shape(), &out); } - for (int i = 0; i < records_size; ++i) { + for (int64 i = 0; i < records_size; ++i) { const StringPiece record(records_t(i)); std::vector<string> fields; ExtractFields(ctx, record, &fields); @@ -165,7 +165,7 @@ class DecodeCSVOp : public OpKernel { void ExtractFields(OpKernelContext* ctx, StringPiece input, std::vector<string>* result) { - int current_idx = 0; + int64 current_idx = 0; if (!input.empty()) { while (static_cast<size_t>(current_idx) < input.size()) { if (input[current_idx] == '\n' || input[current_idx] == '\r') { diff --git a/tensorflow/core/kernels/depthtospace_op.cc b/tensorflow/core/kernels/depthtospace_op.cc index 01d5c479ae..4355bda960 100644 --- a/tensorflow/core/kernels/depthtospace_op.cc +++ b/tensorflow/core/kernels/depthtospace_op.cc @@ -21,6 +21,8 @@ limitations under the License. 
#include <string> #include <utility> +#include "tensorflow/core/kernels/depthtospace_op.h" + #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" @@ -60,8 +62,8 @@ class DepthToSpaceOp : public OpKernel { "instead of: ", dims)); const int batch_size = input.dim_size(0); - const int height = input.dim_size(1); - const int width = input.dim_size(2); + const int input_height = input.dim_size(1); + const int input_width = input.dim_size(2); const int input_depth = input.dim_size(3); const int block_size_sq = block_size_ * block_size_; @@ -73,40 +75,57 @@ class DepthToSpaceOp : public OpKernel { "should be divisible by: ", block_size_sq)); const int output_depth = input_depth / block_size_sq; - const int output_width = width * block_size_; - const int output_height = height * block_size_; + const int output_width = input_width * block_size_; + const int output_height = input_height * block_size_; // Allocate output tensor. - Tensor* outputs_tensor = nullptr; + Tensor* output = nullptr; OP_REQUIRES_OK(context, context->allocate_output( 0, TensorShape({batch_size, output_height, output_width, output_depth}), - &outputs_tensor)); + &output)); + + typename TTypes<T, 4>::ConstTensor Tinput = input.tensor<T, 4>(); + typename TTypes<T, 4>::Tensor Toutput = output->tensor<T, 4>(); + + functor::DepthToSpaceOpFunctor<Device, T> functor; + functor(context->eigen_device<Device>(), Tinput, block_size_, Toutput); + }; + + private: + int block_size_; +}; - auto Toutput = outputs_tensor->tensor<T, 4>(); - auto Tinput = input.tensor<T, 4>(); +// Partial specialization of DepthToSpaceOpFunctor for a CPUDevice. 
+namespace functor { +template <typename T> +struct DepthToSpaceOpFunctor<CPUDevice, T> { + void operator()(const CPUDevice& d, typename TTypes<T, 4>::ConstTensor input, + int block_size, typename TTypes<T, 4>::Tensor output) { + const int batch_size = output.dimension(0); + const int output_height = output.dimension(1); + const int output_width = output.dimension(2); + const int output_depth = output.dimension(3); for (int b = 0; b < batch_size; ++b) { for (int h = 0; h < output_height; ++h) { - const int in_h = h / block_size_; - const int offset_h = (h % block_size_); + const int in_h = h / block_size; + const int offset_h = (h % block_size); for (int w = 0; w < output_width; ++w) { - const int in_w = w / block_size_; - const int offset_w = (w % block_size_); + const int in_w = w / block_size; + const int offset_w = (w % block_size); const int offset_d = - (offset_h * block_size_ + offset_w) * output_depth; + (offset_h * block_size + offset_w) * output_depth; for (int d = 0; d < output_depth; ++d) { const int in_d = d + offset_d; - Toutput(b, h, w, d) = Tinput(b, in_h, in_w, in_d); + output(b, h, w, d) = input(b, in_h, in_w, in_d); } } } } - }; - - private: - int block_size_; + } }; +} // namespace functor #define REGISTER(type) \ REGISTER_KERNEL_BUILDER( \ @@ -116,4 +135,10 @@ class DepthToSpaceOp : public OpKernel { TF_CALL_ALL_TYPES(REGISTER); #undef REGISTER +#if GOOGLE_CUDA +REGISTER_KERNEL_BUILDER( + Name("DepthToSpace").Device(DEVICE_GPU).TypeConstraint<float>("T"), + DepthToSpaceOp<GPUDevice, float>); +#endif // GOOGLE_CUDA + } // end namespace tensorflow diff --git a/tensorflow/core/kernels/depthtospace_op.h b/tensorflow/core/kernels/depthtospace_op.h new file mode 100644 index 0000000000..60c347d985 --- /dev/null +++ b/tensorflow/core/kernels/depthtospace_op.h @@ -0,0 +1,44 @@ +/* Copyright 2015 Google Inc. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_DEPTHTOSPACE_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DEPTHTOSPACE_OP_H_ +// Functor definition for DepthToSpaceOp, must be compilable by nvcc. + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { +namespace functor { + +// Functor used by DepthToSpaceOp to do the computations. +template <typename Device, typename T> +struct DepthToSpaceOpFunctor { + // Implements the depth to space conversion. + // + // input: 4-D input tensor. + // block_size: block size for the conversion. + // output: 4-D output tensor. + // + // The dimensions of the tensors are guaranteed to be correct when the + // functor is called. + void operator()(const Device& d, typename TTypes<T, 4>::ConstTensor input, + int block_size, typename TTypes<T, 4>::Tensor output); +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DEPTHTOSPACE_OP_H_ diff --git a/tensorflow/core/kernels/depthtospace_op_gpu.cu.cc b/tensorflow/core/kernels/depthtospace_op_gpu.cu.cc new file mode 100644 index 0000000000..229222c305 --- /dev/null +++ b/tensorflow/core/kernels/depthtospace_op_gpu.cu.cc @@ -0,0 +1,88 @@ +/* Copyright 2015 Google Inc. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/kernels/depthtospace_op.h" + +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +template <typename dtype> +__global__ void D2S(const int32 nthreads, const dtype* input_ptr, + const int block_size, const int batch_size, + const int input_height, const int input_width, + const int input_depth, const int output_height, + const int output_width, const int output_depth, + dtype* output_ptr) { + CUDA_1D_KERNEL_LOOP(out_idx, nthreads) { + // out_idx = d + output_depth * (w + output_width * (h + output_height * b)) + const int d = out_idx % output_depth; + const int out_idx2 = out_idx / output_depth; + const int w = out_idx2 % output_width; + const int out_idx3 = out_idx2 / output_width; + const int h = out_idx3 % output_height; + const int b = out_idx3 / output_height; + + const int in_h = h / block_size; + const int offset_h = h % block_size; + const int in_w = w / block_size; + const int offset_w = w % block_size; + const int offset_d = (offset_h * block_size + offset_w) * output_depth; + const int in_d = d + offset_d; + const int inp_idx = + in_d + input_depth * (in_w + input_width * (in_h + input_height * b)); + *(output_ptr + 
out_idx) = ldg(input_ptr + inp_idx); + } +} + +// Specialization of DepthToSpaceOpFunctor for a GPUDevice. +namespace functor { +template <typename T> +struct DepthToSpaceOpFunctor<GPUDevice, T> { + void operator()(const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input, + int block_size, typename TTypes<T, 4>::Tensor output) { + const int batch_size = output.dimension(0); + const int input_height = input.dimension(1); + const int input_width = input.dimension(2); + const int input_depth = input.dimension(3); + const int output_height = output.dimension(1); + const int output_width = output.dimension(2); + const int output_depth = output.dimension(3); + + const int total_count = + batch_size * output_height * output_width * output_depth; + CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d); + D2S<<<config.block_count, config.thread_per_block, 0, d.stream()>>>( + config.virtual_thread_count, input.data(), block_size, batch_size, + input_height, input_width, input_depth, output_height, output_width, + output_depth, output.data()); + } +}; +} // end namespace functor + +// Instantiate the GPU implementation for float. +template struct functor::DepthToSpaceOpFunctor<GPUDevice, float>; + +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/image_resizer_state.h b/tensorflow/core/kernels/image_resizer_state.h new file mode 100644 index 0000000000..776d4d56e1 --- /dev/null +++ b/tensorflow/core/kernels/image_resizer_state.h @@ -0,0 +1,111 @@ +/* Copyright 2016 Google Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This is a helper struct to package up the input and output +// parameters of an image resizer (the height, widths, etc.). To +// reduce code duplication and ensure consistency across the different +// resizers, it performs the input validation. + +#ifndef TENSORFLOW_KERNELS_IMAGE_RESIZER_STATE_H_ +#define TENSORFLOW_KERNELS_IMAGE_RESIZER_STATE_H_ + +#define EIGEN_USE_THREADS + +#include <math.h> +#include <algorithm> +#include <array> + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/bounds_check.h" + +namespace tensorflow { + +struct ImageResizerState { + explicit ImageResizerState(bool align_corners) + : align_corners_(align_corners) {} + + // ValidateAndCreateOutput checks the bounds on the input tensors + // and requested size, sets up some of the resizing state such as the + // height_scale and width_scale, and allocates the output. + // If any of these operations fails, it sets an error status in + // the context, which the caller must check. 
+ void ValidateAndCreateOutput(OpKernelContext* context, const Tensor& input) { + OP_REQUIRES(context, input.dims() == 4, + errors::InvalidArgument("input must be 4-dimensional", + input.shape().DebugString())); + const Tensor& shape_t = context->input(1); + OP_REQUIRES(context, shape_t.dims() == 1, + errors::InvalidArgument("shape_t must be 1-dimensional", + shape_t.shape().DebugString())); + OP_REQUIRES(context, shape_t.NumElements() == 2, + errors::InvalidArgument("shape_t must have two elements", + shape_t.shape().DebugString())); + auto Svec = shape_t.vec<int32>(); + batch_size = input.dim_size(0); + out_height = internal::SubtleMustCopy(Svec(0)); + out_width = internal::SubtleMustCopy(Svec(1)); + OP_REQUIRES( + context, + FastBoundsCheck(input.dim_size(1), std::numeric_limits<int32>::max()) && + FastBoundsCheck(input.dim_size(2), + std::numeric_limits<int32>::max()), + errors::InvalidArgument("input sizes must be between 0 and max int32")); + + in_height = static_cast<int32>(input.dim_size(1)); + in_width = static_cast<int32>(input.dim_size(2)); + channels = input.dim_size(3); + OP_REQUIRES(context, out_height > 0 && out_width > 0, + errors::InvalidArgument("output dimensions must be positive")); + OP_REQUIRES( + context, channels > 0, + errors::InvalidArgument("image must have at least one channel")); + OP_REQUIRES( + context, input.dim_size(1) > 0 && input.dim_size(2) > 0, + errors::InvalidArgument("input image must be of non-zero size")); + OP_REQUIRES_OK(context, context->allocate_output( + 0, TensorShape({input.dim_size(0), out_height, + out_width, input.dim_size(3)}), + &output)); + + height_scale = (align_corners_ && out_height > 1) + ? (in_height - 1) / static_cast<float>(out_height - 1) + : in_height / static_cast<float>(out_height); + width_scale = (align_corners_ && out_width > 1) + ? 
(in_width - 1) / static_cast<float>(out_width - 1) + : in_width / static_cast<float>(out_width); + } + + int64 batch_size; + int64 out_height; + int64 out_width; + int64 in_height; + int64 in_width; + int64 channels; + float height_scale; + float width_scale; + Tensor* output; + + private: + bool align_corners_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_IMAGE_RESIZER_STATE_H_ diff --git a/tensorflow/core/kernels/nn_ops_test.cc b/tensorflow/core/kernels/nn_ops_test.cc index 05808840f9..dddb8bbb4b 100644 --- a/tensorflow/core/kernels/nn_ops_test.cc +++ b/tensorflow/core/kernels/nn_ops_test.cc @@ -492,6 +492,8 @@ static void BM_ConvFloatDepthwise(int iters, int batch, int rows, int cols, // OD: output_depth // KR: kernel_rows // KC: kernel_cols +// STR: stride +// PAD: padding #define BM_ConvFloatDepthwiseFwd(BS, R, C, ID, DM, OD, KR, KC, STR, PAD, \ LABEL) \ @@ -509,12 +511,25 @@ static void BM_ConvFloatDepthwise(int iters, int batch, int rows, int cols, strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", DM, "_", OD, "_", \ KR, "_", KC, "_", STR, "_", PAD, "_cpu4")); \ } \ + static void BM_ConvFloatDepthwiseFwdGPU_##LABEL(int iters) { \ + BM_ConvFloatDepthwise( \ + iters, BS, R, C, ID, DM, OD, KR, KC, DEPTHWISE_CONV_OP_FWD, 1, STR, \ + PAD, true, \ + strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", DM, "_", OD, "_", \ + KR, "_", KC, "_", STR, "_", PAD, "_gpu")); \ + } \ BENCHMARK(BM_ConvFloatDepthwiseFwdCPU1_##LABEL); \ - BENCHMARK(BM_ConvFloatDepthwiseFwdCPU4_##LABEL) + BENCHMARK(BM_ConvFloatDepthwiseFwdCPU4_##LABEL); \ + BENCHMARK(BM_ConvFloatDepthwiseFwdGPU_##LABEL); -// TODO(andydavis,jmchen) Add more benchmarks. +// The configurations below are mostly from mobilenet models. 
BM_ConvFloatDepthwiseFwd(32, 112, 112, 3, 8, 24, 3, 3, 1, SAME, conv0); BM_ConvFloatDepthwiseFwd(32, 112, 112, 64, 1, 64, 3, 3, 1, SAME, conv1); +BM_ConvFloatDepthwiseFwd(32, 56, 56, 128, 1, 128, 3, 3, 1, SAME, conv2); +BM_ConvFloatDepthwiseFwd(32, 56, 56, 128, 1, 128, 3, 3, 2, SAME, conv3); +BM_ConvFloatDepthwiseFwd(32, 28, 28, 128, 1, 128, 3, 3, 1, SAME, conv4); +BM_ConvFloatDepthwiseFwd(32, 14, 14, 512, 1, 512, 3, 3, 1, SAME, conv5); +BM_ConvFloatDepthwiseFwd(32, 7, 7, 1024, 1, 1024, 3, 3, 1, SAME, conv6); static void BM_LRNFloat(int iters, int depth, int cols, int rows, int batch_size, int range, int num_threads, diff --git a/tensorflow/core/kernels/relu_op.cc b/tensorflow/core/kernels/relu_op.cc index b70c9657b2..899011417f 100644 --- a/tensorflow/core/kernels/relu_op.cc +++ b/tensorflow/core/kernels/relu_op.cc @@ -30,147 +30,6 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; -template <typename Device, typename T> -class ReluOp : public UnaryElementWiseOp<T, ReluOp<Device, T>> { - public: - using UnaryElementWiseOp<T, ReluOp<Device, T>>::UnaryElementWiseOp; - - void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) { - functor::Relu<Device, T> functor; - functor(context->eigen_device<Device>(), input.flat<T>(), - output->flat<T>()); - } -}; - -// Out of line check to save code space (we have this code once, rather -// than once for every NDIMS * NumTypes * Num_different_relu_variants -// functions. 
-static void ValidateSameSizeHelper(OpKernelContext* context, const Tensor& g, - const Tensor& a) { - OP_REQUIRES(context, a.IsSameSize(g), - errors::InvalidArgument("g and a must be the same size")); -} -static bool ValidateSameSize(OpKernelContext* context, const Tensor& g, - const Tensor& a) { - ValidateSameSizeHelper(context, g, a); - return context->status().ok(); -} - -template <typename Device, typename T> -class ReluGradOp : public BinaryElementWiseOp<T, ReluGradOp<Device, T>> { - public: - using BinaryElementWiseOp<T, ReluGradOp<Device, T>>::BinaryElementWiseOp; - - void OperateNoTemplate(OpKernelContext* context, const Tensor& g, - const Tensor& a, Tensor* output); - - // INPUTS: - // g (gradients): backpropagated gradients - // a (inputs): either the inputs that were passed to ReluOp(), or its - // outputs (using either one yields the same result here). - // OUTPUT: - // gradients to backprop - template <int NDIMS> - void Operate(OpKernelContext* context, const Tensor& g, const Tensor& a, - Tensor* output) { - OperateNoTemplate(context, g, a, output); - } -}; - -template <typename Device, typename T> -void ReluGradOp<Device, T>::OperateNoTemplate(OpKernelContext* context, - const Tensor& g, const Tensor& a, - Tensor* output) { - if (!ValidateSameSize(context, g, a)) return; - functor::ReluGrad<Device, T> functor; - functor(context->eigen_device<Device>(), g.flat<T>(), a.flat<T>(), - output->flat<T>()); -} - -template <typename Device, typename T> -class Relu6Op : public UnaryElementWiseOp<T, Relu6Op<Device, T>> { - public: - using UnaryElementWiseOp<T, Relu6Op<Device, T>>::UnaryElementWiseOp; - - void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) { - functor::Relu6<Device, T> functor; - functor(context->eigen_device<Device>(), input.flat<T>(), - output->flat<T>()); - } -}; - -template <typename Device, typename T> -class Relu6GradOp : public BinaryElementWiseOp<T, Relu6GradOp<Device, T>> { - public: - using BinaryElementWiseOp<T, 
Relu6GradOp<Device, T>>::BinaryElementWiseOp; - - void OperateNoTemplate(OpKernelContext* context, const Tensor& g, - const Tensor& a, Tensor* output); - - // INPUTS: - // g (gradients): backpropagated gradients - // a (inputs): inputs that were passed to Relu6Op() - // OUTPUT: - // gradients to backprop - template <int NDIMS> - void Operate(OpKernelContext* context, const Tensor& g, const Tensor& a, - Tensor* output) { - OperateNoTemplate(context, g, a, output); - } -}; - -template <typename Device, typename T> -void Relu6GradOp<Device, T>::OperateNoTemplate(OpKernelContext* context, - const Tensor& g, const Tensor& a, - Tensor* output) { - if (!ValidateSameSize(context, g, a)) return; - functor::Relu6Grad<Device, T> functor; - functor(context->eigen_device<Device>(), g.flat<T>(), a.flat<T>(), - output->flat<T>()); -} - -template <typename Device, typename T> -class EluOp : public UnaryElementWiseOp<T, EluOp<Device, T>> { - public: - using UnaryElementWiseOp<T, EluOp<Device, T>>::UnaryElementWiseOp; - - void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) { - functor::Elu<Device, T> functor; - functor(context->eigen_device<Device>(), input.flat<T>(), - output->flat<T>()); - } -}; - -template <typename Device, typename T> -class EluGradOp : public BinaryElementWiseOp<T, EluGradOp<Device, T>> { - public: - using BinaryElementWiseOp<T, EluGradOp<Device, T>>::BinaryElementWiseOp; - - void OperateNoTemplate(OpKernelContext* context, const Tensor& g, - const Tensor& a, Tensor* output); - - // INPUTS: - // g (gradients): backpropagated gradients - // a (outputs): outputs of the EluOp() - // OUTPUT: - // gradients to backprop - template <int NDIMS> - void Operate(OpKernelContext* context, const Tensor& g, const Tensor& a, - Tensor* output) { - OperateNoTemplate(context, g, a, output); - } -}; - -template <typename Device, typename T> -void EluGradOp<Device, T>::OperateNoTemplate(OpKernelContext* context, - const Tensor& g, const Tensor& a, - Tensor* 
output) { - if (!ValidateSameSize(context, g, a)) return; - functor::EluGrad<Device, T> functor; - functor(context->eigen_device<Device>(), g.flat<T>(), a.flat<T>(), - output->flat<T>()); -} - #define REGISTER_RELU_KERNELS(type) \ REGISTER_KERNEL_BUILDER( \ Name("Relu").Device(DEVICE_CPU).TypeConstraint<type>("T"), \ diff --git a/tensorflow/core/kernels/relu_op.h b/tensorflow/core/kernels/relu_op.h index b8431aeded..b41be2dfe3 100644 --- a/tensorflow/core/kernels/relu_op.h +++ b/tensorflow/core/kernels/relu_op.h @@ -13,118 +13,168 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +// See docs in ../ops/nn_ops.cc. + #ifndef TENSORFLOW_KERNELS_RELU_OP_H_ #define TENSORFLOW_KERNELS_RELU_OP_H_ -// Functor definition for ReluOp and ReluGradOp, must be compilable by nvcc. + +#define EIGEN_USE_THREADS #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/kernels/relu_op_functor.h" +#include "tensorflow/core/lib/core/errors.h" namespace tensorflow { -namespace functor { -// Functor used by ReluOp to do the computations. template <typename Device, typename T> -struct Relu { - // Computes Relu activation. - // - // features: any shape. - // activations: same shape as "features". 
- void operator()(const Device& d, typename TTypes<T>::ConstTensor features, - typename TTypes<T>::Tensor activations) { - activations.device(d) = features.cwiseMax(static_cast<T>(0)); +class ReluOp : public UnaryElementWiseOp<T, ReluOp<Device, T>> { + public: + using UnaryElementWiseOp<T, ReluOp<Device, T>>::UnaryElementWiseOp; + + void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) { + functor::Relu<Device, T> functor; + functor(context->eigen_device<Device>(), input.flat<T>(), + output->flat<T>()); + } +}; + +// Out of line check to save code space (we have this code once, rather +// than once for every NDIMS * NumTypes * Num_different_relu_variants +// functions. +struct ReluHelpers { + static void ValidateSameSizeHelper(OpKernelContext* context, const Tensor& g, + const Tensor& a) { + OP_REQUIRES(context, a.IsSameSize(g), + errors::InvalidArgument("g and a must be the same size")); + } + static bool ValidateSameSize(OpKernelContext* context, const Tensor& g, + const Tensor& a) { + ValidateSameSizeHelper(context, g, a); + return context->status().ok(); } }; -// Functor used by ReluGradOp to do the computations. template <typename Device, typename T> -struct ReluGrad { - // Computes ReluGrad backprops. - // - // gradients: gradients backpropagated to the Relu op. - // features: either the inputs that were passed to the Relu or, or its - // outputs (using either one yields the same result here). - // backprops: gradients to backpropagate to the Relu inputs. - void operator()(const Device& d, typename TTypes<T>::ConstTensor gradients, - typename TTypes<T>::ConstTensor features, - typename TTypes<T>::Tensor backprops) { - // NOTE: When the activation is exactly zero, we do not propagate the - // associated gradient value. This allows the output of the Relu to be used, - // as well as its input. 
- backprops.device(d) = - gradients * (features > features.constant(static_cast<T>(0))); +class ReluGradOp : public BinaryElementWiseOp<T, ReluGradOp<Device, T>> { + public: + using BinaryElementWiseOp<T, ReluGradOp<Device, T>>::BinaryElementWiseOp; + + void OperateNoTemplate(OpKernelContext* context, const Tensor& g, + const Tensor& a, Tensor* output); + + // INPUTS: + // g (gradients): backpropagated gradients + // a (inputs): either the inputs that were passed to ReluOp(), or its + // outputs (using either one yields the same result here). + // OUTPUT: + // gradients to backprop + template <int NDIMS> + void Operate(OpKernelContext* context, const Tensor& g, const Tensor& a, + Tensor* output) { + OperateNoTemplate(context, g, a, output); } }; -// Functor used by Relu6Op to do the computations. template <typename Device, typename T> -struct Relu6 { - // Computes Relu6 activation. - // - // features: any shape. - // activations: same shape as "features". - void operator()(const Device& d, typename TTypes<T>::ConstTensor features, - typename TTypes<T>::Tensor activations) { - activations.device(d) = - features.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(6)); +void ReluGradOp<Device, T>::OperateNoTemplate(OpKernelContext* context, + const Tensor& g, const Tensor& a, + Tensor* output) { + if (!ReluHelpers::ValidateSameSize(context, g, a)) return; + functor::ReluGrad<Device, T> functor; + functor(context->eigen_device<Device>(), g.flat<T>(), a.flat<T>(), + output->flat<T>()); +} + +template <typename Device, typename T> +class Relu6Op : public UnaryElementWiseOp<T, Relu6Op<Device, T>> { + public: + using UnaryElementWiseOp<T, Relu6Op<Device, T>>::UnaryElementWiseOp; + + void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) { + functor::Relu6<Device, T> functor; + functor(context->eigen_device<Device>(), input.flat<T>(), + output->flat<T>()); } }; -// Functor used by ReluGradOp to do the computations. 
template <typename Device, typename T> -struct Relu6Grad { - // Computes Relu6Grad backprops. - // - // gradients: gradients backpropagated to the Relu6 op. - // features: inputs that where passed to the Relu6 op. - // backprops: gradients to backpropagate to the Relu6 inputs. - void operator()(const Device& d, typename TTypes<T>::ConstTensor gradients, - typename TTypes<T>::ConstTensor features, - typename TTypes<T>::Tensor backprops) { - // NOTE: When the activation is exactly zero or six, we - // arbitrarily choose to not propagate the associated gradient - // value. - backprops.device(d) = gradients * - (features > features.constant(static_cast<T>(0))) * - (features < features.constant(static_cast<T>(6))); +class Relu6GradOp : public BinaryElementWiseOp<T, Relu6GradOp<Device, T>> { + public: + using BinaryElementWiseOp<T, Relu6GradOp<Device, T>>::BinaryElementWiseOp; + + void OperateNoTemplate(OpKernelContext* context, const Tensor& g, + const Tensor& a, Tensor* output); + + // INPUTS: + // g (gradients): backpropagated gradients + // a (inputs): inputs that were passed to Relu6Op() + // OUTPUT: + // gradients to backprop + template <int NDIMS> + void Operate(OpKernelContext* context, const Tensor& g, const Tensor& a, + Tensor* output) { + OperateNoTemplate(context, g, a, output); } }; -// Functor used by EluOp to do the computations. template <typename Device, typename T> -struct Elu { - // Computes Elu activation. - // - // features: any shape. - // activations: same shape as "features". - void operator()(const Device& d, typename TTypes<T>::ConstTensor features, - typename TTypes<T>::Tensor activations) { - // features.constant(?) 
- activations.device(d) = - (features < static_cast<T>(0)) - .select(features.exp() - features.constant(static_cast<T>(1)), - features); +void Relu6GradOp<Device, T>::OperateNoTemplate(OpKernelContext* context, + const Tensor& g, const Tensor& a, + Tensor* output) { + if (!ReluHelpers::ValidateSameSize(context, g, a)) return; + functor::Relu6Grad<Device, T> functor; + functor(context->eigen_device<Device>(), g.flat<T>(), a.flat<T>(), + output->flat<T>()); +} + +template <typename Device, typename T> +class EluOp : public UnaryElementWiseOp<T, EluOp<Device, T>> { + public: + using UnaryElementWiseOp<T, EluOp<Device, T>>::UnaryElementWiseOp; + + void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) { + functor::Elu<Device, T> functor; + functor(context->eigen_device<Device>(), input.flat<T>(), + output->flat<T>()); } }; -// Functor used by EluGradOp to do the computations. template <typename Device, typename T> -struct EluGrad { - // Computes EluGrad backprops. - // - // gradients: gradients backpropagated to the Elu op. - // activations: outputs of the Elu op. - // backprops: gradients to backpropagate to the Elu inputs. 
- void operator()(const Device& d, typename TTypes<T>::ConstTensor gradients, - typename TTypes<T>::ConstTensor activations, - typename TTypes<T>::Tensor backprops) { - backprops.device(d) = - (activations < static_cast<T>(0)) - .select((activations + static_cast<T>(1)) * gradients, gradients); +class EluGradOp : public BinaryElementWiseOp<T, EluGradOp<Device, T>> { + public: + using BinaryElementWiseOp<T, EluGradOp<Device, T>>::BinaryElementWiseOp; + + void OperateNoTemplate(OpKernelContext* context, const Tensor& g, + const Tensor& a, Tensor* output); + + // INPUTS: + // g (gradients): backpropagated gradients + // a (outputs): outputs of the EluOp() + // OUTPUT: + // gradients to backprop + template <int NDIMS> + void Operate(OpKernelContext* context, const Tensor& g, const Tensor& a, + Tensor* output) { + OperateNoTemplate(context, g, a, output); } }; -} // namespace functor +template <typename Device, typename T> +void EluGradOp<Device, T>::OperateNoTemplate(OpKernelContext* context, + const Tensor& g, const Tensor& a, + Tensor* output) { + if (!ReluHelpers::ValidateSameSize(context, g, a)) return; + functor::EluGrad<Device, T> functor; + functor(context->eigen_device<Device>(), g.flat<T>(), a.flat<T>(), + output->flat<T>()); +} + } // namespace tensorflow +#undef EIGEN_USE_THREADS + #endif // TENSORFLOW_KERNELS_RELU_OP_H_ diff --git a/tensorflow/core/kernels/relu_op_functor.h b/tensorflow/core/kernels/relu_op_functor.h new file mode 100644 index 0000000000..5d732a6141 --- /dev/null +++ b/tensorflow/core/kernels/relu_op_functor.h @@ -0,0 +1,130 @@ +/* Copyright 2015 Google Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_KERNELS_RELU_OP_FUNCTOR_H_ +#define TENSORFLOW_KERNELS_RELU_OP_FUNCTOR_H_ +// Functor definition for ReluOp and ReluGradOp, must be compilable by nvcc. + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { +namespace functor { + +// Functor used by ReluOp to do the computations. +template <typename Device, typename T> +struct Relu { + // Computes Relu activation. + // + // features: any shape. + // activations: same shape as "features". + void operator()(const Device& d, typename TTypes<T>::ConstTensor features, + typename TTypes<T>::Tensor activations) { + activations.device(d) = features.cwiseMax(static_cast<T>(0)); + } +}; + +// Functor used by ReluGradOp to do the computations. +template <typename Device, typename T> +struct ReluGrad { + // Computes ReluGrad backprops. + // + // gradients: gradients backpropagated to the Relu op. + // features: either the inputs that were passed to the Relu or, or its + // outputs (using either one yields the same result here). + // backprops: gradients to backpropagate to the Relu inputs. + void operator()(const Device& d, typename TTypes<T>::ConstTensor gradients, + typename TTypes<T>::ConstTensor features, + typename TTypes<T>::Tensor backprops) { + // NOTE: When the activation is exactly zero, we do not propagate the + // associated gradient value. This allows the output of the Relu to be used, + // as well as its input. 
+ backprops.device(d) = + gradients * (features > features.constant(static_cast<T>(0))); + } +}; + +// Functor used by Relu6Op to do the computations. +template <typename Device, typename T> +struct Relu6 { + // Computes Relu6 activation. + // + // features: any shape. + // activations: same shape as "features". + void operator()(const Device& d, typename TTypes<T>::ConstTensor features, + typename TTypes<T>::Tensor activations) { + activations.device(d) = + features.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(6)); + } +}; + +// Functor used by ReluGradOp to do the computations. +template <typename Device, typename T> +struct Relu6Grad { + // Computes Relu6Grad backprops. + // + // gradients: gradients backpropagated to the Relu6 op. + // features: inputs that where passed to the Relu6 op. + // backprops: gradients to backpropagate to the Relu6 inputs. + void operator()(const Device& d, typename TTypes<T>::ConstTensor gradients, + typename TTypes<T>::ConstTensor features, + typename TTypes<T>::Tensor backprops) { + // NOTE: When the activation is exactly zero or six, we + // arbitrarily choose to not propagate the associated gradient + // value. + backprops.device(d) = gradients * + (features > features.constant(static_cast<T>(0))) * + (features < features.constant(static_cast<T>(6))); + } +}; + +// Functor used by EluOp to do the computations. +template <typename Device, typename T> +struct Elu { + // Computes Elu activation. + // + // features: any shape. + // activations: same shape as "features". + void operator()(const Device& d, typename TTypes<T>::ConstTensor features, + typename TTypes<T>::Tensor activations) { + // features.constant(?) + activations.device(d) = + (features < static_cast<T>(0)) + .select(features.exp() - features.constant(static_cast<T>(1)), + features); + } +}; + +// Functor used by EluGradOp to do the computations. +template <typename Device, typename T> +struct EluGrad { + // Computes EluGrad backprops. 
+ // + // gradients: gradients backpropagated to the Elu op. + // activations: outputs of the Elu op. + // backprops: gradients to backpropagate to the Elu inputs. + void operator()(const Device& d, typename TTypes<T>::ConstTensor gradients, + typename TTypes<T>::ConstTensor activations, + typename TTypes<T>::Tensor backprops) { + backprops.device(d) = + (activations < static_cast<T>(0)) + .select((activations + static_cast<T>(1)) * gradients, gradients); + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_RELU_OP_FUNCTOR_H_ diff --git a/tensorflow/core/kernels/relu_op_gpu.cu.cc b/tensorflow/core/kernels/relu_op_gpu.cu.cc index 6451619768..0a12c854b8 100644 --- a/tensorflow/core/kernels/relu_op_gpu.cu.cc +++ b/tensorflow/core/kernels/relu_op_gpu.cu.cc @@ -19,7 +19,7 @@ limitations under the License. #include <stdio.h> -#include "tensorflow/core/kernels/relu_op.h" +#include "tensorflow/core/kernels/relu_op_functor.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor_types.h" diff --git a/tensorflow/core/kernels/resize_area_op.cc b/tensorflow/core/kernels/resize_area_op.cc index 5c4b1cdb12..5bf064f159 100644 --- a/tensorflow/core/kernels/resize_area_op.cc +++ b/tensorflow/core/kernels/resize_area_op.cc @@ -24,6 +24,7 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/image_resizer_state.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/logging.h" @@ -40,49 +41,22 @@ class ResizeAreaOp : public OpKernel { void Compute(OpKernelContext* context) override { const Tensor& input = context->input(0); - OP_REQUIRES(context, input.dims() == 4, - errors::InvalidArgument("input must be 4-dimensional", - input.shape().DebugString())); - const Tensor& shape_t = context->input(1); - OP_REQUIRES(context, shape_t.dims() == 1, - errors::InvalidArgument("shape_t must be 1-dimensional", - shape_t.shape().DebugString())); - OP_REQUIRES(context, shape_t.NumElements() == 2, - errors::InvalidArgument("shape_t must have two elements", - shape_t.shape().DebugString())); - - auto Svec = shape_t.vec<int32>(); - Tensor* output = nullptr; - OP_REQUIRES_OK(context, context->allocate_output( - 0, TensorShape({input.dim_size(0), Svec(0), - Svec(1), input.dim_size(3)}), - &output)); - const int64 batch_size = input.dim_size(0); - const int64 in_height = input.dim_size(1); - const int64 in_width = input.dim_size(2); - const int64 channels = input.dim_size(3); - const int64 out_height = output->dim_size(1); - const int64 out_width = output->dim_size(2); + ImageResizerState st(align_corners_); + st.ValidateAndCreateOutput(context, input); + + if (!context->status().ok()) return; typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>(); - typename TTypes<float, 4>::Tensor output_data = output->tensor<float, 4>(); + typename TTypes<float, 4>::Tensor output_data = + st.output->tensor<float, 4>(); // A temporary tensor for computing the sum. 
Tensor sum_tensor; - OP_REQUIRES_OK( - context, context->allocate_temp(DataTypeToEnum<float>::value, - TensorShape({channels}), &sum_tensor)); + OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<float>::value, + TensorShape({st.channels}), + &sum_tensor)); typename TTypes<float, 1>::Tensor sum_data = sum_tensor.vec<float>(); - const float height_scale = - (align_corners_ && out_height > 1) - ? (in_height - 1) / static_cast<float>(out_height - 1) - : in_height / static_cast<float>(out_height); - const float width_scale = - (align_corners_ && out_width > 1) - ? (in_width - 1) / static_cast<float>(out_width - 1) - : in_width / static_cast<float>(out_width); - // When using this algorithm for downsizing, the target pixel value is the // weighted average of all the source pixels. The weight is determined by // the contribution percentage of the source pixel. @@ -102,19 +76,19 @@ class ResizeAreaOp : public OpKernel { // out[0] = (in[0] * 1.0 + in[1] * 1/3) * scale // out[1] = (in[1] * 2/3 + in[2] * 2/3 * scale // out[2] = (in[3] * 1/3 + in[3] * 1.0) * scale - float scale = 1.0 / (height_scale * width_scale); - for (int64 b = 0; b < batch_size; ++b) { - for (int64 y = 0; y < out_height; ++y) { - const float in_y = y * height_scale; - const float in_y1 = (y + 1) * height_scale; + float scale = 1.0 / (st.height_scale * st.width_scale); + for (int64 b = 0; b < st.batch_size; ++b) { + for (int64 y = 0; y < st.out_height; ++y) { + const float in_y = y * st.height_scale; + const float in_y1 = (y + 1) * st.height_scale; // The start and end height indices of all the cells that could // contribute to the target cell. 
int64 y_start = floor(in_y); int64 y_end = ceil(in_y1); - for (int64 x = 0; x < out_width; ++x) { - const float in_x = x * width_scale; - const float in_x1 = (x + 1) * width_scale; + for (int64 x = 0; x < st.out_width; ++x) { + const float in_x = x * st.width_scale; + const float in_x1 = (x + 1) * st.width_scale; // The start and end width indices of all the cells that could // contribute to the target cell. int64 x_start = floor(in_x); @@ -127,16 +101,16 @@ class ResizeAreaOp : public OpKernel { for (int64 j = x_start; j < x_end; ++j) { float scale_x = j < in_x ? j + 1 - in_x : (j + 1 > in_x1 ? in_x1 - j : 1.0); - for (int64 c = 0; c < channels; ++c) { + for (int64 c = 0; c < st.channels; ++c) { #define BOUND(val, limit) std::min(((limit)-1ll), (std::max(0ll, (val)))) - sum_data(c) += - input_data(b, BOUND(i, in_height), BOUND(j, in_width), c) * - scale_y * scale_x * scale; + sum_data(c) += input_data(b, BOUND(i, st.in_height), + BOUND(j, st.in_width), c) * + scale_y * scale_x * scale; #undef BOUND } } } - for (int64 c = 0; c < channels; ++c) { + for (int64 c = 0; c < st.channels; ++c) { output_data(b, y, x, c) = sum_data(c); } } diff --git a/tensorflow/core/kernels/resize_bicubic_op.cc b/tensorflow/core/kernels/resize_bicubic_op.cc index f81383984b..ce6c920bd1 100644 --- a/tensorflow/core/kernels/resize_bicubic_op.cc +++ b/tensorflow/core/kernels/resize_bicubic_op.cc @@ -26,6 +26,7 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/image_resizer_state.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/logging.h" @@ -92,62 +93,28 @@ class ResizeBicubicOp : public OpKernel { void Compute(OpKernelContext* context) override { const Tensor& input = context->input(0); - OP_REQUIRES(context, input.dims() == 4, - errors::InvalidArgument("input must be 4-dimensional", - input.shape().DebugString())); - const Tensor& shape_t = context->input(1); - OP_REQUIRES(context, shape_t.dims() == 1, - errors::InvalidArgument("shape_t must be 1-dimensional", - shape_t.shape().DebugString())); - OP_REQUIRES(context, shape_t.NumElements() == 2, - errors::InvalidArgument("shape_t must have two elements", - shape_t.shape().DebugString())); - - auto Svec = shape_t.vec<int32>(); - // Initialize shape to the batch size of the input, then add - // the rest of the dimensions - Tensor* output = nullptr; - OP_REQUIRES_OK(context, context->allocate_output( - 0, TensorShape({input.dim_size(0), Svec(0), - Svec(1), input.dim_size(3)}), - &output)); - const int64 batch_size = input.dim_size(0); - const int64 in_height = input.dim_size(1); - const int64 in_width = input.dim_size(2); - const int64 channels = input.dim_size(3); - const int64 out_height = output->dim_size(1); - const int64 out_width = output->dim_size(2); - CHECK_GT(in_height, 0); - CHECK_GT(in_width, 0); - CHECK_GT(channels, 0); - CHECK_GT(out_height, 0); - CHECK_GT(out_width, 0); + ImageResizerState st(align_corners_); + st.ValidateAndCreateOutput(context, input); - typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>(); - typename TTypes<float, 4>::Tensor output_data = output->tensor<float, 4>(); + if (!context->status().ok()) return; - const float height_scale = - (align_corners_ && out_height > 1) - ? 
(in_height - 1) / static_cast<float>(out_height - 1) - : in_height / static_cast<float>(out_height); - const float width_scale = - (align_corners_ && out_width > 1) - ? (in_width - 1) / static_cast<float>(out_width - 1) - : in_width / static_cast<float>(out_width); + typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>(); + typename TTypes<float, 4>::Tensor output_data = + st.output->tensor<float, 4>(); std::array<float, 4> coeff = {{0.0, 0.0, 0.0, 0.0}}; - for (int64 b = 0; b < batch_size; ++b) { - for (int64 y = 0; y < out_height; ++y) { + for (int64 b = 0; b < st.batch_size; ++b) { + for (int64 y = 0; y < st.out_height; ++y) { std::array<float, 4> y_weights; std::array<int64, 4> y_indices; - GetWeightsAndIndices(height_scale, y, in_height, &y_weights, + GetWeightsAndIndices(st.height_scale, y, st.in_height, &y_weights, &y_indices); - for (int64 x = 0; x < out_width; ++x) { + for (int64 x = 0; x < st.out_width; ++x) { std::array<float, 4> x_weights; std::array<int64, 4> x_indices; - GetWeightsAndIndices(width_scale, x, in_width, &x_weights, + GetWeightsAndIndices(st.width_scale, x, st.in_width, &x_weights, &x_indices); - for (int64 c = 0; c < channels; ++c) { + for (int64 c = 0; c < st.channels; ++c) { // Use a 4x4 patch to compute the interpolated output value at // (b, y, x, c). for (int64 i = 0; i < 4; ++i) { diff --git a/tensorflow/core/kernels/resize_bilinear_op.cc b/tensorflow/core/kernels/resize_bilinear_op.cc index ebf9532d7b..bdf60f4c4f 100644 --- a/tensorflow/core/kernels/resize_bilinear_op.cc +++ b/tensorflow/core/kernels/resize_bilinear_op.cc @@ -23,6 +23,7 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/image_resizer_state.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/logging.h" @@ -39,64 +40,29 @@ class ResizeBilinearOp : public OpKernel { void Compute(OpKernelContext* context) override { const Tensor& input = context->input(0); - OP_REQUIRES(context, input.dims() == 4, - errors::InvalidArgument("input must be 4-dimensional", - input.shape().DebugString())); - const Tensor& shape_t = context->input(1); - OP_REQUIRES(context, shape_t.dims() == 1, - errors::InvalidArgument("shape_t must be 1-dimensional", - shape_t.shape().DebugString())); - OP_REQUIRES(context, shape_t.NumElements() == 2, - errors::InvalidArgument("shape_t must have two elements", - shape_t.shape().DebugString())); - - auto Svec = shape_t.vec<int32>(); - // Initialize shape to the batch size of the input, then add - // the rest of the dimensions - Tensor* output = nullptr; - OP_REQUIRES_OK(context, context->allocate_output( - 0, TensorShape({input.dim_size(0), Svec(0), - Svec(1), input.dim_size(3)}), - &output)); + ImageResizerState st(align_corners_); + st.ValidateAndCreateOutput(context, input); - const int64 batch_size = input.dim_size(0); - const int64 in_height = input.dim_size(1); - const int64 in_width = input.dim_size(2); - const int64 channels = input.dim_size(3); - const int64 out_height = output->dim_size(1); - const int64 out_width = output->dim_size(2); - CHECK_GT(in_height, 0); - CHECK_GT(in_width, 0); - CHECK_GT(channels, 0); - CHECK_GT(out_height, 0); - CHECK_GT(out_width, 0); + if (!context->status().ok()) return; typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>(); - typename TTypes<float, 4>::Tensor output_data = output->tensor<float, 4>(); + typename TTypes<float, 4>::Tensor output_data = + st.output->tensor<float, 4>(); - const float height_scale = 
- (align_corners_ && out_height > 1) - ? (in_height - 1) / static_cast<float>(out_height - 1) - : in_height / static_cast<float>(out_height); - const float width_scale = - (align_corners_ && out_width > 1) - ? (in_width - 1) / static_cast<float>(out_width - 1) - : in_width / static_cast<float>(out_width); - - for (int b = 0; b < batch_size; ++b) { - for (int y = 0; y < out_height; ++y) { - const float in_y = y * height_scale; + for (int b = 0; b < st.batch_size; ++b) { + for (int y = 0; y < st.out_height; ++y) { + const float in_y = y * st.height_scale; const int top_y_index = static_cast<int>(floorf(in_y)); const int bottom_y_index = - std::min(static_cast<int64>(ceilf(in_y)), (in_height - 1)); + std::min(static_cast<int64>(ceilf(in_y)), (st.in_height - 1)); const float y_lerp = in_y - top_y_index; - for (int x = 0; x < out_width; ++x) { - const float in_x = x * width_scale; + for (int x = 0; x < st.out_width; ++x) { + const float in_x = x * st.width_scale; const int left_x_index = static_cast<int>(floorf(in_x)); const int right_x_index = - std::min(static_cast<int64>(ceilf(in_x)), (in_width - 1)); + std::min(static_cast<int64>(ceilf(in_x)), (st.in_width - 1)); const float x_lerp = in_x - left_x_index; - for (int c = 0; c < channels; ++c) { + for (int c = 0; c < st.channels; ++c) { const float top_left = input_data(b, top_y_index, left_x_index, c); const float top_right = input_data(b, top_y_index, right_x_index, c); diff --git a/tensorflow/core/kernels/resize_nearest_neighbor_op.cc b/tensorflow/core/kernels/resize_nearest_neighbor_op.cc index 281e0feb39..61b89fb9a5 100644 --- a/tensorflow/core/kernels/resize_nearest_neighbor_op.cc +++ b/tensorflow/core/kernels/resize_nearest_neighbor_op.cc @@ -23,6 +23,7 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/image_resizer_state.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/logging.h" @@ -44,56 +45,28 @@ class ResizeNearestNeighborOp : public OpKernel { void Compute(OpKernelContext* context) override { const Tensor& input = context->input(0); - OP_REQUIRES(context, input.dims() == 4, - errors::InvalidArgument("input must be 4-dimensional", - input.shape().DebugString())); - const Tensor& shape_t = context->input(1); - OP_REQUIRES(context, shape_t.dims() == 1, - errors::InvalidArgument("shape_t must be 1-dimensional", - shape_t.shape().DebugString())); - OP_REQUIRES(context, shape_t.NumElements() == 2, - errors::InvalidArgument("shape_t must have two elements", - shape_t.shape().DebugString())); + ImageResizerState st(align_corners_); + st.ValidateAndCreateOutput(context, input); - auto sizes = shape_t.vec<int32>(); - OP_REQUIRES(context, sizes(0) > 0 && sizes(1) > 0, - errors::InvalidArgument("shape_t's elements must be positive")); - - // Initialize shape to the batch size of the input, then add - // the rest of the dimensions - Tensor* output = nullptr; - OP_REQUIRES_OK( - context, context->allocate_output(0, TensorShape({input.dim_size(0), sizes(0), - sizes(1), input.dim_size(3)}), - &output)); + if (!context->status().ok()) return; - const int64 batch_size = input.dim_size(0); - const int64 in_height = input.dim_size(1); - const int64 in_width = input.dim_size(2); - const int64 channels = input.dim_size(3); - const int64 out_height = output->dim_size(1); - const int64 out_width = output->dim_size(2); + OP_REQUIRES(context, st.in_height < (1 << 24) && st.in_width < (1 << 24), + errors::InvalidArgument("nearest neighbor requires max height " + "& width of 2^24")); typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>(); - typename TTypes<T, 
4>::Tensor output_data = output->tensor<T, 4>(); - - const float height_scale = - (align_corners_ && out_height > 1) - ? (in_height - 1) / static_cast<float>(out_height - 1) - : in_height / static_cast<float>(out_height); - const float width_scale = - (align_corners_ && out_width > 1) - ? (in_width - 1) / static_cast<float>(out_width - 1) - : in_width / static_cast<float>(out_width); - - for (int b = 0; b < batch_size; ++b) { - for (int y = 0; y < out_height; ++y) { - const int in_y = std::min(static_cast<int64>(floorf(y * height_scale)), - (in_height - 1)); - for (int x = 0; x < out_width; ++x) { - const int in_x = std::min(static_cast<int64>(floorf(x * width_scale)), - (in_width - 1)); - for (int c = 0; c < channels; ++c) { + typename TTypes<T, 4>::Tensor output_data = st.output->tensor<T, 4>(); + + for (int b = 0; b < st.batch_size; ++b) { + for (int y = 0; y < st.out_height; ++y) { + const int in_y = + std::min(static_cast<int64>(floorf(y * st.height_scale)), + (st.in_height - 1)); + for (int x = 0; x < st.out_width; ++x) { + const int in_x = + std::min(static_cast<int64>(floorf(x * st.width_scale)), + (st.in_width - 1)); + for (int c = 0; c < st.channels; ++c) { output_data(b, y, x, c) = input_data(b, in_y, in_x, c); } } diff --git a/tensorflow/core/kernels/softmax_op.cc b/tensorflow/core/kernels/softmax_op.cc index 038efbe31a..305a91fecf 100644 --- a/tensorflow/core/kernels/softmax_op.cc +++ b/tensorflow/core/kernels/softmax_op.cc @@ -28,29 +28,6 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; -template <typename Device, typename T> -class SoftmaxOp : public OpKernel { - public: - explicit SoftmaxOp(OpKernelConstruction* context) : OpKernel(context) { - log_ = StringPiece(name()).starts_with("Log"); - } - - void Compute(OpKernelContext* context) override { - const Tensor& logits_in = context->input(0); - OP_REQUIRES(context, TensorShapeUtils::IsMatrix(logits_in.shape()), - 
errors::InvalidArgument("logits must be 2-dimensional")); - Tensor* softmax_out = nullptr; - OP_REQUIRES_OK( - context, context->allocate_output(0, logits_in.shape(), &softmax_out)); - functor::SoftmaxFunctor<Device, T> functor; - functor(context->eigen_device<Device>(), logits_in.matrix<T>(), - softmax_out->matrix<T>(), log_); - } - - private: - bool log_; -}; - // Partial specialization for a CPUDevice, that uses the Eigen implementation // from SoftmaxEigenImpl. namespace functor { diff --git a/tensorflow/core/kernels/softmax_op.h b/tensorflow/core/kernels/softmax_op.h index 6e0064bd5b..df78f85cc2 100644 --- a/tensorflow/core/kernels/softmax_op.h +++ b/tensorflow/core/kernels/softmax_op.h @@ -13,89 +13,48 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +// See docs in ../ops/nn_ops.cc. + #ifndef TENSORFLOW_KERNELS_SOFTMAX_OP_H_ #define TENSORFLOW_KERNELS_SOFTMAX_OP_H_ -// Functor definition for SoftmaxOp, must be compilable by nvcc. + +#define EIGEN_USE_THREADS #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/kernels/softmax_op_functor.h" namespace tensorflow { -namespace functor { - -// Functor used by SoftmaxOp to do the computations. -template <typename Device, typename T> -struct SoftmaxFunctor { - // Computes Softmax or LogSoftmax activation. - // - // logits: dim: batch_size, num_classes. - // softmax: dims: batch_size, num_classes. - // log: boolean - void operator()(const Device& d, typename TTypes<T>::ConstMatrix logits, - typename TTypes<T>::Matrix softmax, const bool log); -}; -// Eigen code implementing SoftmaxFunctor::operator() or -// LogSoftmaxFunctor::operator(). 
-// This code works for both CPU and GPU and is used by the functor -// specializations for both device types. template <typename Device, typename T> -struct SoftmaxEigenImpl { - static void Compute(const Device& d, typename TTypes<T>::ConstMatrix logits, - typename TTypes<T>::Matrix softmax, const bool log) { - const int kBatchDim = 0; - const int kClassDim = 1; - - const int batch_size = logits.dimension(kBatchDim); - const int num_classes = logits.dimension(kClassDim); +class SoftmaxOp : public OpKernel { + public: + explicit SoftmaxOp(OpKernelConstruction* context) : OpKernel(context) { + log_ = StringPiece(name()).starts_with("Log"); + } -// These arrays are used to reduce along the class dimension, and broadcast -// the resulting value to all classes. -#if !defined(EIGEN_HAS_INDEX_LIST) - Eigen::DSizes<int, 1> along_class(kClassDim); - Eigen::DSizes<int, 2> batch_by_one(batch_size, 1); - Eigen::DSizes<int, 2> one_by_class(1, num_classes); -#else - Eigen::IndexList<Eigen::type2index<kClassDim> > along_class; - Eigen::IndexList<Eigen::type2index<1> > depth_dim; - Eigen::IndexList<int, Eigen::type2index<1> > batch_by_one; - batch_by_one.set(0, batch_size); - Eigen::IndexList<Eigen::type2index<1>, int> one_by_class; - one_by_class.set(1, num_classes); -#endif - //shifted_logits = logits - max(logits along classes); - auto shifted_logits = (logits - logits.maximum(along_class) - .eval() - .reshape(batch_by_one) - .broadcast(one_by_class)); - if (log) { - // Calculate the log of the softmax - // softmax = logits - max(logits along classes); - softmax.device(d) = shifted_logits; - // softmax = softmax - log(sum(exp(softmax along classes))); - softmax.device(d) = (softmax - - softmax.exp().sum(along_class) - .eval() - .reshape(batch_by_one) - .broadcast(one_by_class) - .log()); - } else { - // NOTE(touts): If you modify this implementation please run - // the BM_ImageNetSoftmaxFwd benchmark in nn_ops_test.cc. 
- // - // softmax = exp(logits - max(logits along classes)); - softmax.device(d) = shifted_logits.exp(); - // softmax = softmax / sum(softmax along classes); - softmax.device(d) = (softmax / - softmax.sum(along_class) - .eval() - .reshape(batch_by_one) - .broadcast(one_by_class)); + void Compute(OpKernelContext* context) override { + const Tensor& logits_in = context->input(0); + OP_REQUIRES(context, TensorShapeUtils::IsMatrix(logits_in.shape()), + errors::InvalidArgument("logits must be 2-dimensional")); + Tensor* softmax_out = nullptr; + OP_REQUIRES_OK( + context, context->allocate_output(0, logits_in.shape(), &softmax_out)); + if (logits_in.NumElements()) { + functor::SoftmaxFunctor<Device, T> functor; + functor(context->eigen_device<Device>(), logits_in.matrix<T>(), + softmax_out->matrix<T>(), log_); } } + + private: + bool log_; }; -} // namespace functor } // namespace tensorflow +#undef EIGEN_USE_THREADS + #endif // TENSORFLOW_KERNELS_SOFTMAX_OP_H_ diff --git a/tensorflow/core/kernels/softmax_op_functor.h b/tensorflow/core/kernels/softmax_op_functor.h new file mode 100644 index 0000000000..47bb9de411 --- /dev/null +++ b/tensorflow/core/kernels/softmax_op_functor.h @@ -0,0 +1,101 @@ +/* Copyright 2015 Google Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_KERNELS_SOFTMAX_OP_FUNCTOR_H_ +#define TENSORFLOW_KERNELS_SOFTMAX_OP_FUNCTOR_H_ +// Functor definition for SoftmaxOp, must be compilable by nvcc. + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { +namespace functor { + +// Functor used by SoftmaxOp to do the computations. +template <typename Device, typename T> +struct SoftmaxFunctor { + // Computes Softmax or LogSoftmax activation. + // + // logits: dim: batch_size, num_classes. + // softmax: dims: batch_size, num_classes. + // log: boolean + void operator()(const Device& d, typename TTypes<T>::ConstMatrix logits, + typename TTypes<T>::Matrix softmax, const bool log); +}; + +// Eigen code implementing SoftmaxFunctor::operator() or +// LogSoftmaxFunctor::operator(). +// This code works for both CPU and GPU and is used by the functor +// specializations for both device types. +template <typename Device, typename T> +struct SoftmaxEigenImpl { + static void Compute(const Device& d, typename TTypes<T>::ConstMatrix logits, + typename TTypes<T>::Matrix softmax, const bool log) { + const int kBatchDim = 0; + const int kClassDim = 1; + + const int batch_size = logits.dimension(kBatchDim); + const int num_classes = logits.dimension(kClassDim); + +// These arrays are used to reduce along the class dimension, and broadcast +// the resulting value to all classes. 
+#if !defined(EIGEN_HAS_INDEX_LIST) + Eigen::DSizes<int, 1> along_class(kClassDim); + Eigen::DSizes<int, 2> batch_by_one(batch_size, 1); + Eigen::DSizes<int, 2> one_by_class(1, num_classes); +#else + Eigen::IndexList<Eigen::type2index<kClassDim> > along_class; + Eigen::IndexList<Eigen::type2index<1> > depth_dim; + Eigen::IndexList<int, Eigen::type2index<1> > batch_by_one; + batch_by_one.set(0, batch_size); + Eigen::IndexList<Eigen::type2index<1>, int> one_by_class; + one_by_class.set(1, num_classes); +#endif + //shifted_logits = logits - max(logits along classes); + auto shifted_logits = (logits - logits.maximum(along_class) + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)); + if (log) { + // Calculate the log of the softmax + // softmax = logits - max(logits along classes); + softmax.device(d) = shifted_logits; + // softmax = softmax - log(sum(exp(softmax along classes))); + softmax.device(d) = (softmax - + softmax.exp().sum(along_class) + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class) + .log()); + } else { + // NOTE(touts): If you modify this implementation please run + // the BM_ImageNetSoftmaxFwd benchmark in nn_ops_test.cc. + // + // softmax = exp(logits - max(logits along classes)); + softmax.device(d) = shifted_logits.exp(); + // softmax = softmax / sum(softmax along classes); + softmax.device(d) = (softmax / + softmax.sum(along_class) + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)); + } + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_SOFTMAX_OP_FUNCTOR_H_ diff --git a/tensorflow/core/kernels/softmax_op_gpu.cu.cc b/tensorflow/core/kernels/softmax_op_gpu.cu.cc index 0bfc27d32b..e27fff9b92 100644 --- a/tensorflow/core/kernels/softmax_op_gpu.cu.cc +++ b/tensorflow/core/kernels/softmax_op_gpu.cu.cc @@ -17,7 +17,7 @@ limitations under the License. 
#define EIGEN_USE_GPU -#include "tensorflow/core/kernels/softmax_op.h" +#include "tensorflow/core/kernels/softmax_op_functor.h" #include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/platform/types.h" diff --git a/tensorflow/core/kernels/spacetodepth_op.cc b/tensorflow/core/kernels/spacetodepth_op.cc index 4f9a71ce90..9b6bb19ee8 100644 --- a/tensorflow/core/kernels/spacetodepth_op.cc +++ b/tensorflow/core/kernels/spacetodepth_op.cc @@ -21,6 +21,8 @@ limitations under the License. #include <string> #include <utility> +#include "tensorflow/core/kernels/spacetodepth_op.h" + #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" @@ -89,27 +91,43 @@ class SpaceToDepthOp : public OpKernel { auto Toutput = outputs_tensor->tensor<T, 4>(); auto Tinput = input.tensor<T, 4>(); + functor::SpaceToDepthOpFunctor<Device, T> functor; + functor(context->eigen_device<Device>(), Tinput, block_size_, Toutput); + }; + + private: + int block_size_; +}; + +// Partial specialization of SpaceToDepthOpFunctor for a CPUDevice. 
+namespace functor { +template <typename T> +struct SpaceToDepthOpFunctor<CPUDevice, T> { + void operator()(const CPUDevice& d, typename TTypes<T, 4>::ConstTensor input, + int block_size, typename TTypes<T, 4>::Tensor output) { + const int batch_size = output.dimension(0); + const int input_height = input.dimension(1); + const int input_width = input.dimension(2); + const int input_depth = input.dimension(3); + for (int b = 0; b < batch_size; ++b) { - for (int h = 0; h < height; ++h) { - const int out_h = h / block_size_; - const int offset_h = (h % block_size_); - for (int w = 0; w < width; ++w) { - const int out_w = w / block_size_; - const int offset_w = (w % block_size_); - const int offset_d = - (offset_h * block_size_ + offset_w) * input_depth; + for (int h = 0; h < input_height; ++h) { + const int out_h = h / block_size; + const int offset_h = (h % block_size); + for (int w = 0; w < input_width; ++w) { + const int out_w = w / block_size; + const int offset_w = (w % block_size); + const int offset_d = (offset_h * block_size + offset_w) * input_depth; for (int d = 0; d < input_depth; ++d) { const int out_d = d + offset_d; - Toutput(b, out_h, out_w, out_d) = Tinput(b, h, w, d); + output(b, out_h, out_w, out_d) = input(b, h, w, d); } } } } - }; - - private: - int block_size_; + } }; +} // namespace functor #define REGISTER(type) \ REGISTER_KERNEL_BUILDER( \ @@ -119,4 +137,10 @@ class SpaceToDepthOp : public OpKernel { TF_CALL_ALL_TYPES(REGISTER); #undef REGISTER +#if GOOGLE_CUDA +REGISTER_KERNEL_BUILDER( + Name("SpaceToDepth").Device(DEVICE_GPU).TypeConstraint<float>("T"), + SpaceToDepthOp<GPUDevice, float>); +#endif // GOOGLE_CUDA + } // end namespace tensorflow diff --git a/tensorflow/core/kernels/spacetodepth_op.h b/tensorflow/core/kernels/spacetodepth_op.h new file mode 100644 index 0000000000..8d225c6cdb --- /dev/null +++ b/tensorflow/core/kernels/spacetodepth_op.h @@ -0,0 +1,44 @@ +/* Copyright 2015 Google Inc. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SPACETODEPTH_OP_H_ +#define TENSORFLOW_CORE_KERNELS_SPACETODEPTH_OP_H_ +// Functor definition for XentOp, must be compilable by nvcc. + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { +namespace functor { + +// Functor used by SpaceToDepthOp to do the computations. +template <typename Device, typename T> +struct SpaceToDepthOpFunctor { + // Implements the space to depth conversion. + // + // input: 4-D input tensor. + // block_size: block size for the conversion. + // output: 4-D output tensor. + // + // The dimensions of the tensors are guaranteed to be right when the + // functor is called. + void operator()(const Device& d, typename TTypes<T, 4>::ConstTensor input, + int block_size, typename TTypes<T, 4>::Tensor output); +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SPACETODEPTH_OP_H_ diff --git a/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc b/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc new file mode 100644 index 0000000000..d6678a22ed --- /dev/null +++ b/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc @@ -0,0 +1,89 @@ +/* Copyright 2015 Google Inc. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/kernels/spacetodepth_op.h" + +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +template <typename dtype> +__global__ void S2D(const int32 nthreads, const dtype* input_ptr, + const int block_size, const int batch_size, + const int input_height, const int input_width, + const int input_depth, const int output_height, + const int output_width, const int output_depth, + dtype* output_ptr) { + CUDA_1D_KERNEL_LOOP(inp_idx, nthreads) { + // inp_idx = d + input_depth * (w + input_width * (h + input_height * b)) + const int d = inp_idx % input_depth; + const int inp_idx2 = inp_idx / input_depth; + const int w = inp_idx2 % input_width; + const int inp_idx3 = inp_idx2 / input_width; + const int h = inp_idx3 % input_height; + const int b = inp_idx3 / input_height; + + const int out_h = h / block_size; + const int offset_h = h % block_size; + const int out_w = w / block_size; + const int offset_w = w % block_size; + const int offset_d = (offset_h * block_size + offset_w) * input_depth; + const int out_d = d + offset_d; + const int out_idx = + out_d + + output_depth * (out_w + output_width * (out_h + output_height * b)); + *(output_ptr 
+ out_idx) = ldg(input_ptr + inp_idx); + } +} + +// Specialization of SpaceToDepthOpFunctor for a CPUDevice. +namespace functor { +template <typename T> +struct SpaceToDepthOpFunctor<GPUDevice, T> { + void operator()(const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input, + int block_size, typename TTypes<T, 4>::Tensor output) { + const int batch_size = output.dimension(0); + const int input_height = input.dimension(1); + const int input_width = input.dimension(2); + const int input_depth = input.dimension(3); + const int output_height = output.dimension(1); + const int output_width = output.dimension(2); + const int output_depth = output.dimension(3); + + const int total_count = + batch_size * input_height * input_width * input_depth; + CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d); + S2D<<<config.block_count, config.thread_per_block, 0, d.stream()>>>( + config.virtual_thread_count, input.data(), block_size, batch_size, + input_height, input_width, input_depth, output_height, output_width, + output_depth, output.data()); + } +}; +} // end namespace functor + +// Instantiate the GPU implementation for float. +template struct functor::SpaceToDepthOpFunctor<GPUDevice, float>; + +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/transpose_op.cc b/tensorflow/core/kernels/transpose_op.cc index 5ecef9c6f9..52e792a399 100644 --- a/tensorflow/core/kernels/transpose_op.cc +++ b/tensorflow/core/kernels/transpose_op.cc @@ -23,6 +23,7 @@ limitations under the License. 
#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/kernels/bounds_check.h" #include "tensorflow/core/kernels/transpose_functor.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/strings/str_util.h" @@ -55,8 +56,8 @@ class InvertPermutationOp : public OpKernel { auto Tout = output->vec<int32>(); std::fill_n(Tout.data(), N, -1); for (int i = 0; i < N; ++i) { - const int32 d = Tin(i); - OP_REQUIRES(context, 0 <= d && d < N, + const int32 d = internal::SubtleMustCopy(Tin(i)); + OP_REQUIRES(context, FastBoundsCheck(d, N), errors::InvalidArgument(d, " is not between 0 and ", N)); OP_REQUIRES(context, Tout(d) == -1, errors::InvalidArgument(d, " is duplicated in the input.")); @@ -107,18 +108,26 @@ void TransposeOp::Compute(OpKernelContext* ctx) { errors::InvalidArgument( "transpose expects a vector of size ", input.dims(), ". But input(1) is a vector of size ", Vperm.size())); - gtl::ArraySlice<int32> permutation( - reinterpret_cast<const int32*>(Vperm.data()), dims); + // using volatile instead of SubtleMustCopy here so that the + // asynchrony boundary is permutation. + const volatile int32* perm_begin = + reinterpret_cast<const volatile int32*>(Vperm.data()); + const std::vector<int32> permutation(perm_begin, perm_begin + dims); TensorShape shape; // Check whether permutation is a permutation of integers of [0 .. dims). gtl::InlinedVector<bool, 8> bits(dims); - for (const int32 d : permutation) { + bool is_identity = true; + for (int i = 0; i < dims; ++i) { + const int32 d = permutation[i]; OP_REQUIRES( ctx, 0 <= d && d < dims, errors::InvalidArgument(d, " is out of range [0 .. 
", dims, ")")); bits[d] = true; shape.AddDim(input.dim_size(d)); + if (d != i) { + is_identity = false; + } } for (int i = 0; i < dims; ++i) { OP_REQUIRES(ctx, bits[i], errors::InvalidArgument( @@ -126,8 +135,8 @@ void TransposeOp::Compute(OpKernelContext* ctx) { str_util::Join(permutation, ","), "}.")); } - // 0-D and 1-D transposes do nothing - if (dims <= 1) { + // 0-D, 1-D, and identity transposes do nothing. + if (dims <= 1 || is_identity) { ctx->set_output(0, input); return; } diff --git a/tensorflow/core/public/session.h b/tensorflow/core/public/session.h index cf71dc7ecb..6fa02fd729 100644 --- a/tensorflow/core/public/session.h +++ b/tensorflow/core/public/session.h @@ -139,7 +139,8 @@ class Session { /// \brief Like `Run`, but allows users to pass in a `RunOptions` proto and /// to retrieve non-Tensor metadata output via a `RunOutputs` proto for this - /// step. + /// step. `run_outputs` may be nullptr, in which case any metadata output is + /// discarded. /// NOTE: This API is still experimental and may change. virtual Status Run(const RunOptions& run_options, const std::vector<std::pair<string, Tensor> >& inputs, @@ -148,8 +149,8 @@ class Session { std::vector<Tensor>* outputs, RunOutputs* run_outputs); /// \brief Sets up a graph for partial execution. All future feeds and - /// fetches are specified by 'input_names' and 'output_names'. Returns - /// 'handle' that can be used to perform a sequence of partial feeds and + /// fetches are specified by `input_names` and `output_names`. Returns + /// `handle` that can be used to perform a sequence of partial feeds and /// fetches. /// NOTE: This API is still experimental and may change. 
virtual Status PRunSetup(const std::vector<string>& input_names, @@ -157,7 +158,7 @@ class Session { const std::vector<string>& target_nodes, string* handle); - /// \brief Continues the pending execution specified by 'handle' with the + /// \brief Continues the pending execution specified by `handle` with the /// provided input tensors and fills `outputs` for the endpoints specified /// in `output_names`. /// NOTE: This API is still experimental and may change. diff --git a/tensorflow/core/public/tensor_c_api.h b/tensorflow/core/public/tensor_c_api.h index b7ac96b6b9..e846f5d0b6 100644 --- a/tensorflow/core/public/tensor_c_api.h +++ b/tensorflow/core/public/tensor_c_api.h @@ -268,15 +268,26 @@ extern void TF_ExtendGraph(TF_Session*, const void* proto, size_t proto_len, // failure, inputs[] become the property of the implementation (the // implementation will eventually call TF_DeleteTensor on each input). // -// The caller retains the ownership of both `run_options` and `run_outputs`, and -// should manually call TF_DeleteBuffer on them. +// Any NULL and non-NULL value combinations for (`run_options`, +// `run_outputs`) are valid. +// +// - `run_options` may be NULL, in which case it will be ignored; or +// non-NULL, in which case it must point to a `TF_Buffer` containing the +// serialized representation of a `RunOptions` protocol buffer. +// - `run_output` may be NULL, in which case it will be ignored; or non-NULL, +// in which case it must point to an empty, freshly allocated `TF_Buffer` +// that may be updated to contain the serialized representation of a +// `RunOutput` protocol buffer. +// +// The caller retains the ownership of `run_options` and/or `run_outputs` (when +// not NULL) and should manually call TF_DeleteBuffer on them. // // On success, the tensors corresponding to output_names[0,noutputs-1] // are placed in outputs[], and these outputs[] become the property // of the caller (the caller must eventually call TF_DeleteTensor on // them). 
// -// On failure, outputs[] contains nulls. +// On failure, outputs[] contains NULLs. extern void TF_Run(TF_Session*, // RunOptions const TF_Buffer* run_options, @@ -341,7 +352,7 @@ extern void TF_PRun(TF_Session*, const char* handle, // On success, place OK in status and return the newly created library handle. // The caller owns the library handle. // -// On failure, place an error status in status and return nullptr. +// On failure, place an error status in status and return NULL. extern TF_Library* TF_LoadLibrary(const char* library_filename, TF_Status* status); diff --git a/tensorflow/core/util/work_sharder.cc b/tensorflow/core/util/work_sharder.cc index dfc2c04baa..046d69a939 100644 --- a/tensorflow/core/util/work_sharder.cc +++ b/tensorflow/core/util/work_sharder.cc @@ -39,8 +39,10 @@ void Shard(int num_workers, thread::ThreadPool* workers, int64 total, // much. Let us assume each cost unit is 1ns, kMinCostPerShard=10000 // is 10us. static const int64 kMinCostPerShard = 10000; - const int num_shards = std::max( - 1, std::min<int>(num_workers, total * cost_per_unit / kMinCostPerShard)); + const int num_shards = + std::max<int>(1, std::min(static_cast<int64>(num_workers), + total * cost_per_unit / kMinCostPerShard)); + // Each shard contains up to "block_size" units. [0, total) is sharded // into: // [0, block_size), [block_size, 2*block_size), ... 
diff --git a/tensorflow/core/util/work_sharder_test.cc b/tensorflow/core/util/work_sharder_test.cc index 3772bf9bca..c0d7267da9 100644 --- a/tensorflow/core/util/work_sharder_test.cc +++ b/tensorflow/core/util/work_sharder_test.cc @@ -59,6 +59,25 @@ TEST(Shard, Basic) { } } +TEST(Shard, OverflowTest) { + thread::ThreadPool threads(Env::Default(), "test", 3); + mutex mu; + for (auto workers : {1, 2, 3}) { + const int64 total_elements = 1LL << 32; + const int64 cost_per_unit = 10000; + int num_shards = 0; + int64 num_elements = 0; + Shard(workers, &threads, total_elements, cost_per_unit, + [&mu, &num_shards, &num_elements](int64 start, int64 limit) { + mutex_lock l(mu); + ++num_shards; + num_elements += limit - start; + }); + EXPECT_EQ(num_shards, workers); + EXPECT_EQ(num_elements, total_elements); + } +} + void BM_Sharding(int iters, int arg) { thread::ThreadPool threads(Env::Default(), "test", 16); const int64 total = 1LL << 30; diff --git a/tensorflow/examples/android/jni/jni_utils.cc b/tensorflow/examples/android/jni/jni_utils.cc index 0a1f8adbd0..db0eedeb16 100644 --- a/tensorflow/examples/android/jni/jni_utils.cc +++ b/tensorflow/examples/android/jni/jni_utils.cc @@ -157,3 +157,17 @@ void ReadFileToVector(AAssetManager* const asset_manager, VLOG(0) << "Read " << str_vector->size() << " values from " << filename; } +void WriteProtoToFile(const char* const filename, + const google::protobuf::MessageLite& message) { + std::fstream outfile; + outfile.open(filename, std::fstream::binary | std::fstream::out); + if (outfile.fail()) { + LOG(WARNING) << "Failed to write proto to " << filename; + return; + } else { + google::protobuf::io::OstreamOutputStream raw_out(&outfile); + google::protobuf::io::CodedOutputStream coded_out(&raw_out); + message.SerializeToCodedStream(&coded_out); + } + VLOG(0) << "Wrote proto to " << filename; +} diff --git a/tensorflow/examples/android/jni/jni_utils.h b/tensorflow/examples/android/jni/jni_utils.h index 4c1b140abf..c296744061 100644 
--- a/tensorflow/examples/android/jni/jni_utils.h +++ b/tensorflow/examples/android/jni/jni_utils.h @@ -42,4 +42,7 @@ void ReadFileToString(AAssetManager* const asset_manager, void ReadFileToVector(AAssetManager* const asset_manager, const char* const filename, std::vector<std::string>* str_vector); +void WriteProtoToFile(const char* const filename, + const google::protobuf::MessageLite& message); + #endif // ORG_TENSORFLOW_JNI_JNI_UTILS_H_ diff --git a/tensorflow/examples/android/jni/tensorflow_jni.cc b/tensorflow/examples/android/jni/tensorflow_jni.cc index 2b0aa82777..e1060ab666 100644 --- a/tensorflow/examples/android/jni/tensorflow_jni.cc +++ b/tensorflow/examples/android/jni/tensorflow_jni.cc @@ -21,13 +21,16 @@ limitations under the License. #include <jni.h> #include <pthread.h> +#include <sys/stat.h> #include <unistd.h> #include <queue> #include <sstream> #include <string> +#include "tensorflow/core/framework/step_stats.pb.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/mutex.h" @@ -51,6 +54,12 @@ static int g_image_mean; // The image mean. 
static int g_num_runs = 0; static int64 g_timing_total_us = 0; +#ifdef SAVE_STEP_STATS +static const bool kSaveStepStats = true; +#else +static const bool kSaveStepStats = false; +#endif + inline static int64 CurrentThreadTimeUs() { struct timeval tv; gettimeofday(&tv, NULL); @@ -199,11 +208,30 @@ static std::string ClassifyImage(const RGBA* const bitmap_src, std::vector<tensorflow::Tensor> output_tensors; std::vector<std::string> output_names({"output:0"}); - const int64 start_time = CurrentThreadTimeUs(); - tensorflow::Status s = - session->Run(input_tensors, output_names, {}, &output_tensors); - const int64 end_time = CurrentThreadTimeUs(); - + tensorflow::Status s; + int64 start_time, end_time; + + if (kSaveStepStats) { + RunOptions run_options; + run_options.set_trace_level(RunOptions::FULL_TRACE); + RunOutputs run_outputs; + start_time = CurrentThreadTimeUs(); + s = session->Run(run_options, input_tensors, output_names, {}, + &output_tensors, &run_outputs); + end_time = CurrentThreadTimeUs(); + assert(run_outputs.has_step_stats()); + + const StepStats& stats = run_outputs.step_stats(); + + mkdir("/sdcard/tf/", 0755); + const string filename = + strings::Printf("/sdcard/tf/stepstats%05d.pb", g_num_runs); + WriteProtoToFile(filename.c_str(), stats); + } else { + start_time = CurrentThreadTimeUs(); + s = session->Run(input_tensors, output_names, {}, &output_tensors); + end_time = CurrentThreadTimeUs(); + } const int64 elapsed_time_inf = end_time - start_time; g_timing_total_us += elapsed_time_inf; VLOG(0) << "End computing. 
Ran in " << elapsed_time_inf / 1000 << "ms (" diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 3d8ddf639f..ddd50985db 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -40,6 +40,7 @@ py_library( name = "platform", srcs = glob(["platform/**/*.py"]), srcs_version = "PY2AND3", + deps = ["//tensorflow/core:protos_all_py"], ) py_library( @@ -1006,6 +1007,7 @@ py_test( name = "session_test", srcs = ["client/session_test.py"], srcs_version = "PY2AND3", + tags = ["noasan"], deps = [ ":framework", ":framework_test_lib", @@ -1034,12 +1036,12 @@ cpu_only_kernel_test_list = glob([ "kernel_tests/attention_ops_test.py", "kernel_tests/barrier_ops_test.py", "kernel_tests/bcast_ops_test.py", + "kernel_tests/benchmark_test.py", "kernel_tests/candidate_sampler_ops_test.py", "kernel_tests/cholesky_op_test.py", "kernel_tests/clip_ops_test.py", "kernel_tests/decode_csv_op_test.py", "kernel_tests/decode_raw_op_test.py", - "kernel_tests/depthtospace_op_test.py", "kernel_tests/determinant_op_test.py", "kernel_tests/diag_op_test.py", "kernel_tests/edit_distance_op_test.py", @@ -1069,7 +1071,6 @@ cpu_only_kernel_test_list = glob([ "kernel_tests/sparse_reorder_op_test.py", "kernel_tests/sparse_to_dense_op_test.py", "kernel_tests/sparsemask_op_test.py", - "kernel_tests/spacetodepth_op_test.py", "kernel_tests/summary_ops_test.py", "kernel_tests/template_test.py", "kernel_tests/topk_op_test.py", diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py index c36cdfe30f..84adaca8da 100644 --- a/tensorflow/python/__init__.py +++ b/tensorflow/python/__init__.py @@ -59,7 +59,7 @@ from tensorflow.core.framework.attr_value_pb2 import * from tensorflow.core.protobuf.config_pb2 import * from tensorflow.core.util.event_pb2 import * # Import things out of contrib -from tensorflow import contrib +import tensorflow.contrib as contrib # Framework from tensorflow.python.framework.framework_lib import * @@ -101,6 +101,7 @@ from 
tensorflow.python.framework import framework_lib from tensorflow.python.ops import array_ops from tensorflow.python.ops import constant_op from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import histogram_ops from tensorflow.python.ops import io_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import script_ops @@ -117,8 +118,8 @@ _whitelist = set([app, compat, contrib, errors, flags, gfile, image, # strings of other modules. __all__ = make_all(__name__, [framework_lib, array_ops, client_lib, constant_op, - control_flow_ops, io_ops, math_ops, nn, script_ops, - sparse_ops, state_ops, train]) + control_flow_ops, histogram_ops, io_ops, math_ops, nn, + script_ops, sparse_ops, state_ops, train]) # Symbols whitelisted for export without documentation. # TODO(cwhipkey): review these and move to contrib, expose through diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py index a77cdffda5..d739eb9ee5 100644 --- a/tensorflow/python/client/session.py +++ b/tensorflow/python/client/session.py @@ -294,7 +294,7 @@ class BaseSession(SessionInterface): [`SparseTensorValue`](../../api_docs/python/sparse_ops.md#SparseTensorValue). The optional `options` argument expects a [`RunOptions`] proto. The options - allow controling the behavior of this particular step (e.g. turning tracing + allow controlling the behavior of this particular step (e.g. turning tracing on). The optional `run_outputs` argument expects a [`RunOutputs`] proto. 
When diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py index 491b293125..c82e9a96d0 100644 --- a/tensorflow/python/client/session_test.py +++ b/tensorflow/python/client/session_test.py @@ -25,7 +25,6 @@ import numpy as np import six from six.moves import xrange # pylint: disable=redefined-builtin -from tensorflow.core.framework import step_stats_pb2 from tensorflow.core.lib.core import error_codes_pb2 from tensorflow.core.protobuf import config_pb2 from tensorflow.python.client import session @@ -927,13 +926,32 @@ class SessionTest(test_util.TensorFlowTestCase): sess.run(constant_op.constant(1.0), options=run_options, run_outputs=run_outputs) + self.assertTrue(run_outputs.HasField('step_stats')) + self.assertEquals(len(run_outputs.step_stats.dev_stats), 1) + + def testRunOptionsRunOutputs(self): + run_options = config_pb2.RunOptions( + trace_level=config_pb2.RunOptions.FULL_TRACE) + run_outputs = config_pb2.RunOutputs() + + with ops.device('/cpu:0'): + with session.Session() as sess: + # all combinations are valid + sess.run(constant_op.constant(1.0), options=None, run_outputs=None) + sess.run(constant_op.constant(1.0), options=None, + run_outputs=run_outputs) + self.assertTrue(not run_outputs.HasField('step_stats')) - step_stats = step_stats_pb2.StepStats() - self.assertEquals(len(step_stats.dev_stats), 0) + sess.run(constant_op.constant(1.0), options=run_options, + run_outputs=None) + self.assertTrue(not run_outputs.HasField('step_stats')) - step_stats.CopyFrom(run_outputs.step_stats) - self.assertEquals(len(step_stats.dev_stats), 1) + sess.run(constant_op.constant(1.0), options=run_options, + run_outputs=run_outputs) + + self.assertTrue(run_outputs.HasField('step_stats')) + self.assertEquals(len(run_outputs.step_stats.dev_stats), 1) def testFeedShapeCompatibility(self): with session.Session() as sess: diff --git a/tensorflow/python/framework/gen_docs_combined.py b/tensorflow/python/framework/gen_docs_combined.py 
index 7180f7d77c..d19482a154 100644 --- a/tensorflow/python/framework/gen_docs_combined.py +++ b/tensorflow/python/framework/gen_docs_combined.py @@ -81,6 +81,7 @@ def all_libraries(module_to_name, members, documented): exclude_symbols=["sparse_matmul", "arg_min", "arg_max", "lin_space", "sparse_segment_mean_grad"], prefix=PREFIX_TEXT), + library("histogram_ops", "Histograms"), library("control_flow_ops", "Control Flow", prefix=PREFIX_TEXT), library("image", "Images", tf.image, exclude_symbols=["ResizeMethod"], prefix=PREFIX_TEXT), diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py index 284db94d45..17f21d56af 100644 --- a/tensorflow/python/framework/test_util.py +++ b/tensorflow/python/framework/test_util.py @@ -165,9 +165,8 @@ class TensorFlowTestCase(googletest.TestCase): text_format.Merge(expected_message_maybe_ascii, expected_message) self._AssertProtoEquals(expected_message, message) else: - assert False, ("Can't compare protos of type " + - type(expected_message_maybe_ascii) + " and " + - type(message)) + assert False, ("Can't compare protos of type %s and %s" % + (type(expected_message_maybe_ascii), type(message))) def assertProtoEqualsVersion( self, expected, actual, producer=versions.GRAPH_DEF_VERSION, diff --git a/tensorflow/python/kernel_tests/benchmark_test.py b/tensorflow/python/kernel_tests/benchmark_test.py new file mode 100644 index 0000000000..4a5d55fbff --- /dev/null +++ b/tensorflow/python/kernel_tests/benchmark_test.py @@ -0,0 +1,158 @@ +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for tensorflow.python.framework.importer.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import random + +import tensorflow as tf + +from google.protobuf import text_format +from tensorflow.core.util import test_log_pb2 +from tensorflow.python.platform import benchmark + + +# Used by SomeRandomBenchmark class below. +_ran_somebenchmark_1 = [False] +_ran_somebenchmark_2 = [False] +_ran_somebenchmark_but_shouldnt = [False] + + +class SomeRandomBenchmark(tf.test.Benchmark): + """This Benchmark should automatically be registered in the registry.""" + + def _dontRunThisBenchmark(self): + _ran_somebenchmark_but_shouldnt[0] = True + + def notBenchmarkMethod(self): + _ran_somebenchmark_but_shouldnt[0] = True + + def benchmark1(self): + _ran_somebenchmark_1[0] = True + + def benchmark2(self): + _ran_somebenchmark_2[0] = True + + +class TestReportingBenchmark(tf.test.Benchmark): + """This benchmark (maybe) reports some stuff.""" + + def benchmarkReport1(self): + self.report_benchmark(iters=1) + + def benchmarkReport2(self): + self.report_benchmark( + iters=2, name="custom_benchmark_name", + extras={"number_key": 3, "other_key": "string"}) + + +class BenchmarkTest(tf.test.TestCase): + + def testGlobalBenchmarkRegistry(self): + registry = list(benchmark.GLOBAL_BENCHMARK_REGISTRY) + self.assertEqual(len(registry), 2) + self.assertTrue(SomeRandomBenchmark in registry) + 
self.assertTrue(TestReportingBenchmark in registry) + + def testRunSomeRandomBenchmark(self): + # Validate that SomeBenchmark has not run yet + self.assertFalse(_ran_somebenchmark_1[0]) + self.assertFalse(_ran_somebenchmark_2[0]) + self.assertFalse(_ran_somebenchmark_but_shouldnt[0]) + + # Run other benchmarks, but this wont run the one we care about + benchmark._run_benchmarks("unrelated") + + # Validate that SomeBenchmark has not run yet + self.assertFalse(_ran_somebenchmark_1[0]) + self.assertFalse(_ran_somebenchmark_2[0]) + self.assertFalse(_ran_somebenchmark_but_shouldnt[0]) + + # Run all the benchmarks, avoid generating any reports + if benchmark.TEST_REPORTER_TEST_ENV in os.environ: + del os.environ[benchmark.TEST_REPORTER_TEST_ENV] + benchmark._run_benchmarks("SomeRandom") + + # Validate that SomeRandomBenchmark ran correctly + self.assertTrue(_ran_somebenchmark_1[0]) + self.assertTrue(_ran_somebenchmark_2[0]) + self.assertFalse(_ran_somebenchmark_but_shouldnt[0]) + + def testReportingBenchmark(self): + tempdir = tf.test.get_temp_dir() + try: + tf.gfile.MakeDirs(tempdir) + except OSError as e: + # It's OK if the directory already exists. 
+ if " exists:" not in str(e): + raise e + + prefix = os.path.join( + tempdir, "reporting_bench_%016x_" % random.getrandbits(64)) + expected_output_file = "%s%s" % ( + prefix, "TestReportingBenchmark.benchmarkReport1") + expected_output_file_2 = "%s%s" % ( + prefix, "TestReportingBenchmark.custom_benchmark_name") + try: + self.assertFalse(tf.gfile.Exists(expected_output_file)) + # Run benchmark but without env, shouldn't write anything + if benchmark.TEST_REPORTER_TEST_ENV in os.environ: + del os.environ[benchmark.TEST_REPORTER_TEST_ENV] + reporting = TestReportingBenchmark() + reporting.benchmarkReport1() # This should run without writing anything + self.assertFalse(tf.gfile.Exists(expected_output_file)) + + # Runbenchmark with env, should write + os.environ[benchmark.TEST_REPORTER_TEST_ENV] = prefix + + reporting = TestReportingBenchmark() + reporting.benchmarkReport1() # This should write + reporting.benchmarkReport2() # This should write + + # Check the files were written + self.assertTrue(tf.gfile.Exists(expected_output_file)) + self.assertTrue(tf.gfile.Exists(expected_output_file_2)) + + # Check the contents are correct + expected_1 = test_log_pb2.BenchmarkEntry() + expected_1.name = "TestReportingBenchmark.benchmarkReport1" + expected_1.iters = 1 + + expected_2 = test_log_pb2.BenchmarkEntry() + expected_2.name = "TestReportingBenchmark.custom_benchmark_name" + expected_2.iters = 2 + expected_2.extras["number_key"].double_value = 3 + expected_2.extras["other_key"].string_value = "string" + + read_benchmark_1 = tf.gfile.GFile(expected_output_file, "r").read() + read_benchmark_1 = text_format.Merge( + read_benchmark_1, test_log_pb2.BenchmarkEntry()) + self.assertProtoEquals(expected_1, read_benchmark_1) + + read_benchmark_2 = tf.gfile.GFile(expected_output_file_2, "r").read() + read_benchmark_2 = text_format.Merge( + read_benchmark_2, test_log_pb2.BenchmarkEntry()) + self.assertProtoEquals(expected_2, read_benchmark_2) + + finally: + 
tf.gfile.DeleteRecursively(tempdir) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensorflow/python/kernel_tests/depthtospace_op_test.py b/tensorflow/python/kernel_tests/depthtospace_op_test.py index 8dda8832b3..bace61b40f 100644 --- a/tensorflow/python/kernel_tests/depthtospace_op_test.py +++ b/tensorflow/python/kernel_tests/depthtospace_op_test.py @@ -25,12 +25,17 @@ import tensorflow as tf class DepthToSpaceTest(tf.test.TestCase): + def _testOne(self, inputs, block_size, outputs): + for use_gpu in [False, True]: + with self.test_session(use_gpu=use_gpu): + x_tf = tf.depth_to_space(tf.to_float(inputs), block_size) + self.assertAllEqual(x_tf.eval(), outputs) + def testBasic(self): x_np = [[[[1, 2, 3, 4]]]] - with self.test_session(use_gpu=False): - block_size = 2 - x_tf = tf.depth_to_space(x_np, block_size) - self.assertAllEqual(x_tf.eval(), [[[[1], [2]], [[3], [4]]]]) + block_size = 2 + x_out = [[[[1], [2]], [[3], [4]]]] + self._testOne(x_np, block_size, x_out) # Tests for larger input dimensions. To make sure elements are # correctly ordered spatially. 
@@ -40,12 +45,28 @@ class DepthToSpaceTest(tf.test.TestCase): [[9, 10, 11, 12], [13, 14, 15, 16]]]] block_size = 2 - with self.test_session(use_gpu=False): - x_tf = tf.depth_to_space(x_np, block_size) - self.assertAllEqual(x_tf.eval(), [[[[1], [2], [5], [6]], - [[3], [4], [7], [8]], - [[9], [10], [13], [14]], - [[11], [12], [15], [16]]]]) + x_out = [[[[1], [2], [5], [6]], + [[3], [4], [7], [8]], + [[9], [10], [13], [14]], + [[11], [12], [15], [16]]]] + self._testOne(x_np, block_size, x_out) + + def testBlockSize2Batch10(self): + block_size = 2 + def batch_input_elt(i): + return [[[1 * i, 2 * i, 3 * i, 4 * i], + [5 * i, 6 * i, 7 * i, 8 * i]], + [[9 * i, 10 * i, 11 * i, 12 * i], + [13 * i, 14 * i, 15 * i, 16 * i]]] + def batch_output_elt(i): + return [[[1 * i], [2 * i], [5 * i], [6 * i]], + [[3 * i], [4 * i], [7 * i], [8 * i]], + [[9 * i], [10 * i], [13 * i], [14 * i]], + [[11 * i], [12 * i], [15 * i], [16 * i]]] + batch_size = 10 + x_np = [batch_input_elt(i) for i in xrange(batch_size)] + x_out = [batch_output_elt(i) for i in xrange(batch_size)] + self._testOne(x_np, block_size, x_out) # Tests for different width and height. def testNonSquare(self): @@ -53,46 +74,42 @@ class DepthToSpaceTest(tf.test.TestCase): [[5, 50, 6, 60, 7, 70, 8, 80]], [[9, 90, 10, 100, 11, 110, 12, 120]]]] block_size = 2 - with self.test_session(use_gpu=False): - x_tf = tf.depth_to_space(x_np, block_size) - self.assertAllEqual(x_tf.eval(), [[[[1, 10], [2, 20]], - [[3, 30], [4, 40]], - [[5, 50], [6, 60]], - [[7, 70], [8, 80]], - [[9, 90], [10, 100]], - [[11, 110], [12, 120]]]]) + x_out = [[[[1, 10], [2, 20]], + [[3, 30], [4, 40]], + [[5, 50], [6, 60]], + [[7, 70], [8, 80]], + [[9, 90], [10, 100]], + [[11, 110], [12, 120]]]] + self._testOne(x_np, block_size, x_out) # Tests for larger input dimensions. To make sure elements are # correctly ordered spatially. 
def testBlockSize4FlatInput(self): x_np = [[[[1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16]]]] block_size = 4 - with self.test_session(use_gpu=False): - x_tf = tf.depth_to_space(x_np, block_size) - self.assertAllEqual(x_tf.eval(), [[[[1], [2], [5], [6]], - [[3], [4], [7], [8]], - [[9], [10], [13], [14]], - [[11], [12], [15], [16]]]]) + x_out = [[[[1], [2], [5], [6]], + [[3], [4], [7], [8]], + [[9], [10], [13], [14]], + [[11], [12], [15], [16]]]] + self._testOne(x_np, block_size, x_out) # Tests for larger input depths. # To make sure elements are properly interleaved in depth. def testDepthInterleaved(self): x_np = [[[[1, 10, 2, 20, 3, 30, 4, 40]]]] block_size = 2 - with self.test_session(use_gpu=False): - x_tf = tf.depth_to_space(x_np, block_size) - self.assertAllEqual(x_tf.eval(), [[[[1, 10], [2, 20]], - [[3, 30], [4, 40]]]]) + x_out = [[[[1, 10], [2, 20]], + [[3, 30], [4, 40]]]] + self._testOne(x_np, block_size, x_out) # Tests for larger input depths. Here an odd depth. # To make sure elements are properly interleaved in depth. def testDepthInterleavedDepth3(self): x_np = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]] block_size = 2 - with self.test_session(use_gpu=False): - x_tf = tf.depth_to_space(x_np, block_size) - self.assertAllEqual(x_tf.eval(), [[[[1, 2, 3], [4, 5, 6]], - [[7, 8, 9], [10, 11, 12]]]]) + x_out = [[[[1, 2, 3], [4, 5, 6]], + [[7, 8, 9], [10, 11, 12]]]] + self._testOne(x_np, block_size, x_out) # Tests for larger input depths. # To make sure elements are properly interleaved in depth. 
@@ -102,13 +119,11 @@ class DepthToSpaceTest(tf.test.TestCase): [[9, 90, 10, 100, 11, 110, 12, 120], [13, 130, 14, 140, 15, 150, 16, 160]]]] block_size = 2 - with self.test_session(use_gpu=False): - x_tf = tf.depth_to_space(x_np, block_size) - self.assertAllEqual(x_tf.eval(), - [[[[1, 10], [2, 20], [5, 50], [6, 60]], - [[3, 30], [4, 40], [7, 70], [8, 80]], - [[9, 90], [10, 100], [13, 130], [14, 140]], - [[11, 110], [12, 120], [15, 150], [16, 160]]]]) + x_out = [[[[1, 10], [2, 20], [5, 50], [6, 60]], + [[3, 30], [4, 40], [7, 70], [8, 80]], + [[9, 90], [10, 100], [13, 130], [14, 140]], + [[11, 110], [12, 120], [15, 150], [16, 160]]]] + self._testOne(x_np, block_size, x_out) # Error handling: @@ -205,5 +220,6 @@ class DepthToSpaceGradientTest(tf.test.TestCase): block_size = 3 self._compare(1, 2, 3, 2, block_size) + if __name__ == "__main__": tf.test.main() diff --git a/tensorflow/python/kernel_tests/rnn_cell_test.py b/tensorflow/python/kernel_tests/rnn_cell_test.py index 88048cfa7a..5261af4aab 100644 --- a/tensorflow/python/kernel_tests/rnn_cell_test.py +++ b/tensorflow/python/kernel_tests/rnn_cell_test.py @@ -184,7 +184,8 @@ class RNNCellTest(tf.test.TestCase): x = tf.zeros([1, 1], dtype=tf.int32) m = tf.zeros([1, 2]) g, new_m = tf.nn.rnn_cell.EmbeddingWrapper( - tf.nn.rnn_cell.GRUCell(2), 3)(x, m) + tf.nn.rnn_cell.GRUCell(2), + embedding_classes=3, embedding_size=2)(x, m) sess.run([tf.initialize_all_variables()]) res = sess.run([g, new_m], {x.name: np.array([[1]]), m.name: np.array([[0.1, 0.1]])}) diff --git a/tensorflow/python/kernel_tests/rnn_test.py b/tensorflow/python/kernel_tests/rnn_test.py index 82c432922c..41dae10210 100644 --- a/tensorflow/python/kernel_tests/rnn_test.py +++ b/tensorflow/python/kernel_tests/rnn_test.py @@ -19,7 +19,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import sys import time import timeit @@ -953,6 +952,7 @@ def 
graph_creation_static_vs_dynamic_rnn_benchmark(max_time): print("%d \t %f \t %f \t %f" % (max_time, delta_static, delta_dynamic, delta_dynamic/delta_static)) + return delta_static, delta_dynamic def _timer(sess, ops): @@ -1013,6 +1013,8 @@ def static_vs_dynamic_rnn_benchmark(batch_size, max_time, num_units, use_gpu): (batch_size, max_time, num_units, use_gpu, delta_static, delta_dynamic, delta_dynamic/delta_static)) + return delta_static, delta_dynamic + def _dynamic_rnn_swap_memory_benchmark(inputs_t, sequence_length, swap_memory): @@ -1061,6 +1063,7 @@ def dynamic_rnn_swap_memory_benchmark(batch_size, max_time, num_units): print("%d \t %d \t %d \t %f \t %f \t %f" % (batch_size, max_time, num_units, no_swap, swap, swap/no_swap)) + return no_swap, swap def rnn_long_sequence_benchmark(batch_size, seqlen, num_units, @@ -1097,34 +1100,55 @@ def rnn_long_sequence_benchmark(batch_size, seqlen, num_units, elapsed/seqlen)) -def main(_): - print("Graph Creation: Static Unroll vs. Dynamic Unroll LSTM") - print("max_t \t dt(static) \t dt(dynamic) \t dt(dynamic)/dt(static)") - for max_time in (1, 25, 50): - graph_creation_static_vs_dynamic_rnn_benchmark(max_time) - - print("Calculation: Static Unroll with Dynamic Flow LSTM " - "vs. Dynamic Unroll LSTM") - print("batch \t max_t \t units \t gpu \t dt(static) \t dt(dynamic) " - "\t dt(dynamic)/dt(static)") - for batch_size in (256,): - for max_time in (50,): - for num_units in (512, 256, 128): - for use_gpu in (False, True): - static_vs_dynamic_rnn_benchmark( - batch_size, max_time, num_units, use_gpu) - - print("Calculation: Dynamic LSTM No Memory Swap vs. 
Memory Swap") - print("batch \t max_t \t units \t no_swap \t swap \t swap/no_swap") - for batch_size in (256, 512): - for max_time in (100,): - for num_units in (512, 256, 128): - dynamic_rnn_swap_memory_benchmark(batch_size, max_time, num_units) +class BenchmarkRNN(tf.test.Benchmark): + + def benchmarkGraphCreationStaticVsDynamicLSTM(self): + print("Graph Creation: Static Unroll vs. Dynamic Unroll LSTM") + print("max_t \t dt(static) \t dt(dynamic) \t dt(dynamic)/dt(static)") + for max_time in (1, 25, 50): + s_dt, d_dt = graph_creation_static_vs_dynamic_rnn_benchmark(max_time) + self.report_benchmark(name="graph_creation_time_static_T%02d" % max_time, + iters=5, wall_time=s_dt) + self.report_benchmark(name="graph_creation_time_dynamic_T%02d" % max_time, + iters=5, wall_time=d_dt) + + def benchmarkStaticUnrollVsDynamicFlowLSTM(self): + print("Calculation: Static Unroll with Dynamic Flow LSTM " + "vs. Dynamic Unroll LSTM") + print("batch \t max_t \t units \t gpu \t dt(static) \t dt(dynamic) " + "\t dt(dynamic)/dt(static)") + for batch_size in (256,): + for max_time in (50,): + for num_units in (512, 256, 128): + for use_gpu in (False, True): + s_dt, d_dt = static_vs_dynamic_rnn_benchmark( + batch_size, max_time, num_units, use_gpu) + self.report_benchmark( + name="static_unroll_time_T%02d_B%03d_N%03d_gpu_%s" + % (max_time, batch_size, num_units, use_gpu), + iters=10, wall_time=s_dt) + self.report_benchmark( + name="dynamic_unroll_time_T%02d_B%03d_N%03d_gpu_%s" + % (max_time, batch_size, num_units, use_gpu), + iters=10, wall_time=d_dt) + + def benchmarkDynamicLSTMNoMemorySwapVsMemorySwap(self): + print("Calculation: Dynamic LSTM No Memory Swap vs. 
Memory Swap") + print("batch \t max_t \t units \t no_swap \t swap \t swap/no_swap") + for batch_size in (256, 512): + for max_time in (100,): + for num_units in (512, 256, 128): + no_swap, swap = dynamic_rnn_swap_memory_benchmark( + batch_size, max_time, num_units) + self.report_benchmark( + name="dynamic_lstm_no_memory_swap_T%02d_B%03d_N%03d" + % (max_time, batch_size, num_units), + iters=10, wall_time=no_swap) + self.report_benchmark( + name="dynamic_lstm_with_memory_swap_T%02d_B%03d_N%03d" + % (max_time, batch_size, num_units), + iters=10, wall_time=swap) if __name__ == "__main__": - if "--benchmarks" in sys.argv: - sys.argv.remove("--benchmarks") - tf.app.run() - else: - tf.test.main() + tf.test.main() diff --git a/tensorflow/python/kernel_tests/softmax_op_test.py b/tensorflow/python/kernel_tests/softmax_op_test.py index 44d5d30fb3..91c389a2a2 100644 --- a/tensorflow/python/kernel_tests/softmax_op_test.py +++ b/tensorflow/python/kernel_tests/softmax_op_test.py @@ -121,6 +121,13 @@ class SoftmaxTest(tf.test.TestCase): self._testOverflow(use_gpu=False) + def testEmpty(self): + with self.test_session(): + x = tf.constant([[]], shape=[0, 3]) + self.assertEqual(0, tf.size(x).eval()) + expected_y = np.array([]).reshape(0, 3) + np.testing.assert_array_equal(expected_y, tf.nn.softmax(x).eval()) + if __name__ == "__main__": tf.test.main() diff --git a/tensorflow/python/kernel_tests/spacetodepth_op_test.py b/tensorflow/python/kernel_tests/spacetodepth_op_test.py index 8b8ef6158a..02ebdce768 100644 --- a/tensorflow/python/kernel_tests/spacetodepth_op_test.py +++ b/tensorflow/python/kernel_tests/spacetodepth_op_test.py @@ -25,13 +25,18 @@ import tensorflow as tf class SpaceToDepthTest(tf.test.TestCase): + def _testOne(self, inputs, block_size, outputs): + for use_gpu in [False, True]: + with self.test_session(use_gpu=use_gpu): + x_tf = tf.space_to_depth(tf.to_float(inputs), block_size) + self.assertAllEqual(x_tf.eval(), outputs) + def testBasic(self): x_np = [[[[1], [2]], 
[[3], [4]]]] - with self.test_session(use_gpu=False): - block_size = 2 - out_tf = tf.space_to_depth(x_np, block_size) - self.assertAllEqual(out_tf.eval(), [[[[1, 2, 3, 4]]]]) + block_size = 2 + x_out = [[[[1, 2, 3, 4]]]] + self._testOne(x_np, block_size, x_out) # Tests for larger input dimensions. To make sure elements are # correctly ordered spatially. @@ -40,14 +45,12 @@ class SpaceToDepthTest(tf.test.TestCase): [[3], [4], [7], [8]], [[9], [10], [13], [14]], [[11], [12], [15], [16]]]] - - with self.test_session(use_gpu=False): - block_size = 2 - out_tf = tf.space_to_depth(x_np, block_size) - self.assertAllEqual(out_tf.eval(), [[[[1, 2, 3, 4], - [5, 6, 7, 8]], - [[9, 10, 11, 12], - [13, 14, 15, 16]]]]) + block_size = 2 + x_out = [[[[1, 2, 3, 4], + [5, 6, 7, 8]], + [[9, 10, 11, 12], + [13, 14, 15, 16]]]] + self._testOne(x_np, block_size, x_out) # Tests for larger input dimensions. To make sure elements are # correctly ordered in depth. Here, larger block size. @@ -56,34 +59,27 @@ class SpaceToDepthTest(tf.test.TestCase): [[3], [4], [7], [8]], [[9], [10], [13], [14]], [[11], [12], [15], [16]]]] - - with self.test_session(use_gpu=False): - block_size = 4 - out_tf = tf.space_to_depth(x_np, block_size) - self.assertAllEqual( - out_tf.eval(), - [[[[1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16]]]]) + block_size = 4 + x_out = [[[[1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16]]]] + self._testOne(x_np, block_size, x_out) # Tests for larger input depths. # To make sure elements are properly interleaved in depth. def testDepthInterleaved(self): x_np = [[[[1, 10], [2, 20]], [[3, 30], [4, 40]]]] - with self.test_session(use_gpu=False): - block_size = 2 - out_tf = tf.space_to_depth(x_np, block_size) - self.assertAllEqual(out_tf.eval(), [[[[1, 10, 2, 20, 3, 30, 4, 40]]]]) + block_size = 2 + x_out = [[[[1, 10, 2, 20, 3, 30, 4, 40]]]] + self._testOne(x_np, block_size, x_out) # Tests for larger input depths. Here an odd depth. 
# To make sure elements are properly interleaved in depth. def testDepthInterleavedDepth3(self): x_np = [[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]] - with self.test_session(use_gpu=False): - block_size = 2 - out_tf = tf.space_to_depth(x_np, block_size) - self.assertAllEqual(out_tf.eval(), - [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]) + block_size = 2 + x_out = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]] + self._testOne(x_np, block_size, x_out) # Tests for larger input dimensions AND for larger input depths. # To make sure elements are properly interleaved in depth and ordered @@ -93,14 +89,29 @@ class SpaceToDepthTest(tf.test.TestCase): [[3, 30], [4, 40], [7, 70], [8, 80]], [[9, 90], [10, 100], [13, 130], [14, 140]], [[11, 110], [12, 120], [15, 150], [16, 160]]]] - with self.test_session(use_gpu=False): - block_size = 2 - out_tf = tf.space_to_depth(x_np, block_size) - self.assertAllEqual(out_tf.eval(), - [[[[1, 10, 2, 20, 3, 30, 4, 40], - [5, 50, 6, 60, 7, 70, 8, 80]], - [[9, 90, 10, 100, 11, 110, 12, 120], - [13, 130, 14, 140, 15, 150, 16, 160]]]]) + block_size = 2 + x_out = [[[[1, 10, 2, 20, 3, 30, 4, 40], + [5, 50, 6, 60, 7, 70, 8, 80]], + [[9, 90, 10, 100, 11, 110, 12, 120], + [13, 130, 14, 140, 15, 150, 16, 160]]]] + self._testOne(x_np, block_size, x_out) + + def testBlockSize2Batch10(self): + block_size = 2 + def batch_input_elt(i): + return [[[1 * i], [2 * i], [5 * i], [6 * i]], + [[3 * i], [4 * i], [7 * i], [8 * i]], + [[9 * i], [10 * i], [13 * i], [14 * i]], + [[11 * i], [12 * i], [15 * i], [16 * i]]] + def batch_output_elt(i): + return [[[1 * i, 2 * i, 3 * i, 4 * i], + [5 * i, 6 * i, 7 * i, 8 * i]], + [[9 * i, 10 * i, 11 * i, 12 * i], + [13 * i, 14 * i, 15 * i, 16 * i]]] + batch_size = 10 + x_np = [batch_input_elt(i) for i in xrange(batch_size)] + x_out = [batch_output_elt(i) for i in xrange(batch_size)] + self._testOne(x_np, block_size, x_out) # Tests for different width and height. 
def testNonSquare(self): @@ -110,13 +121,11 @@ class SpaceToDepthTest(tf.test.TestCase): [[7, 70], [8, 80]], [[9, 90], [10, 100]], [[11, 110], [12, 120]]]] - with self.test_session(use_gpu=False): - block_size = 2 - out_tf = tf.space_to_depth(x_np, block_size) - self.assertAllEqual(out_tf.eval(), - [[[[1, 10, 2, 20, 3, 30, 4, 40]], - [[5, 50, 6, 60, 7, 70, 8, 80]], - [[9, 90, 10, 100, 11, 110, 12, 120]]]]) + block_size = 2 + x_out = [[[[1, 10, 2, 20, 3, 30, 4, 40]], + [[5, 50, 6, 60, 7, 70, 8, 80]], + [[9, 90, 10, 100, 11, 110, 12, 120]]]] + self._testOne(x_np, block_size, x_out) # Error handling: diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index ccf38d5be1..f759a0a1a0 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -405,6 +405,7 @@ def boolean_mask(tensor, mask, name="boolean_mask"): ValueError: If shapes do not conform. Examples: + ```python # 2-D example a = [[1, 2], [3, 4], [5, 6]] diff --git a/tensorflow/python/ops/data_flow_ops.py b/tensorflow/python/ops/data_flow_ops.py index cba8daa368..0b6125dfd6 100644 --- a/tensorflow/python/ops/data_flow_ops.py +++ b/tensorflow/python/ops/data_flow_ops.py @@ -218,7 +218,7 @@ class QueueBase(object): return gen_data_flow_ops._queue_enqueue(self._queue_ref, vals, name=scope) def enqueue_many(self, vals, name=None): - """Enqueues zero or elements to this queue. + """Enqueues zero or more elements to this queue. This operation slices each component tensor along the 0th dimension to make multiple queue elements. All of the tensors in `vals` must have the diff --git a/tensorflow/python/ops/histogram_ops.py b/tensorflow/python/ops/histogram_ops.py index a24fb39eb5..12e2e4eb8b 100644 --- a/tensorflow/python/ops/histogram_ops.py +++ b/tensorflow/python/ops/histogram_ops.py @@ -12,7 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Operations for histograms.""" +# pylint: disable=g-short-docstring-punctuation +"""## Histograms + +@@histogram_fixed_width +""" from __future__ import absolute_import from __future__ import division @@ -24,30 +28,34 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import clip_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import state_ops +from tensorflow.python.ops import variable_scope -def histogram_fixed_width(hist, - new_values, +def histogram_fixed_width(values, value_range, - use_locking=False, - name='histogram_fixed_width'): - """Update histogram Variable with new values. + nbins=100, + use_locking=True, + dtype=dtypes.int32, + name=None): + """Return histogram of values. - This Op fills histogram with counts of values falling within fixed-width, - half-open bins. + Given the tensor `values`, this operation returns a rank 1 histogram counting + the number of entries in `values` that fell into every bin. The bins are + equal width and determined by the arguments `value_range` and `nbins`. Args: - hist: 1-D mutable `Tensor`, e.g. a `Variable`. - new_values: Numeric `Tensor`. + values: Numeric `Tensor`. value_range: Shape [2] `Tensor`. new_values <= value_range[0] will be mapped to hist[0], values >= value_range[1] will be mapped to hist[-1]. Must be same dtype as new_values. + nbins: Integer number of bins in this histogram. use_locking: Boolean. If `True`, use locking during the operation (optional). - name: A name for this operation (optional). + dtype: dtype for returned histogram. + name: A name for this operation (defaults to 'histogram_fixed_width'). Returns: - An op that updates `hist` with `new_values` when evaluated. + A `Variable` holding histogram of values. 
Examples: ```python @@ -57,24 +65,21 @@ def histogram_fixed_width(hist, new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15] with tf.default_session() as sess: - hist = variables.Variable(array_ops.zeros(nbins, dtype=tf.int32)) - hist_update = histogram_ops.histogram_fixed_width(hist, new_values, - value_range) + hist = tf.histogram_fixed_width(new_values, value_range, nbins=5) variables.initialize_all_variables().run() - sess.run(hist_update) => [2, 1, 1, 0, 2] + sess.run(hist) => [2, 1, 1, 0, 2] ``` """ - with ops.op_scope([hist, new_values, value_range], name) as scope: - new_values = ops.convert_to_tensor(new_values, name='new_values') - new_values = array_ops.reshape(new_values, [-1]) + with variable_scope.variable_op_scope( + [values, value_range], name, 'histogram_fixed_width') as scope: + values = ops.convert_to_tensor(values, name='values') + values = array_ops.reshape(values, [-1]) value_range = ops.convert_to_tensor(value_range, name='value_range') - dtype = hist.dtype # Map tensor values that fall within value_range to [0, 1]. - scaled_values = math_ops.truediv(new_values - value_range[0], + scaled_values = math_ops.truediv(values - value_range[0], value_range[1] - value_range[0], name='scaled_values') - nbins = math_ops.cast(hist.get_shape()[0], scaled_values.dtype) # map tensor values within the open interval value_range to {0,.., nbins-1}, # values outside the open interval will be zero or less, or nbins or more. @@ -87,9 +92,18 @@ def histogram_fixed_width(hist, # Dummy vector to scatter. # TODO(langmore) Replace non-ideal creation of large dummy vector once an # alternative to scatter is available. 
- updates = array_ops.ones([indices.get_shape()[0]], dtype=dtype) - return state_ops.scatter_add(hist, - indices, - updates, - use_locking=use_locking, - name=scope) + updates = array_ops.ones_like(indices, dtype=dtype) + + hist = variable_scope.get_variable('hist', + initializer=array_ops.zeros_initializer( + [nbins], + dtype=dtype), + trainable=False) + hist_assign_zero = hist.assign(array_ops.zeros_like(hist)) + + with ops.control_dependencies([hist_assign_zero]): + return state_ops.scatter_add(hist, + indices, + updates, + use_locking=use_locking, + name=scope.name) diff --git a/tensorflow/python/ops/histogram_ops_test.py b/tensorflow/python/ops/histogram_ops_test.py index 8358c2f1ea..514ba14e16 100644 --- a/tensorflow/python/ops/histogram_ops_test.py +++ b/tensorflow/python/ops/histogram_ops_test.py @@ -17,149 +17,132 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import test_util -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import histogram_ops -from tensorflow.python.ops import variables -from tensorflow.python.platform import googletest - import numpy as np +import tensorflow as tf -class HistogramFixedWidthTest(test_util.TensorFlowTestCase): +class HistogramFixedWidthTest(tf.test.TestCase): def setUp(self): self.rng = np.random.RandomState(0) + def test_empty_input_gives_all_zero_counts(self): + # Bins will be: + # (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf) + value_range = [0.0, 5.0] + values = [] + expected_bin_counts = [0, 0, 0, 0, 0] + with self.test_session(): + hist = tf.histogram_fixed_width(values, value_range, nbins=5) + tf.initialize_all_variables().run() + + # Hist should start "fresh" with every eval. 
+ self.assertAllClose(expected_bin_counts, hist.eval()) + self.assertAllClose(expected_bin_counts, hist.eval()) + def test_one_update_on_constant_input(self): # Bins will be: # (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf) - nbins = [5] value_range = [0.0, 5.0] - new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15] + values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15] expected_bin_counts = [2, 1, 1, 0, 2] - with self.test_session() as sess: - hist = variables.Variable(array_ops.zeros(nbins, dtype=dtypes.int32)) - hist_update = histogram_ops.histogram_fixed_width(hist, new_values, - value_range) - variables.initialize_all_variables().run() - self.assertTrue(hist.dtype.is_compatible_with(hist_update.dtype)) - updated_hist_array = sess.run(hist_update) - - # The new updated_hist_array is returned by the updating op. - self.assertAllClose(expected_bin_counts, updated_hist_array) + with self.test_session(): + hist = tf.histogram_fixed_width(values, value_range, nbins=5) + tf.initialize_all_variables().run() - # hist should contain updated values, but eval() should not change it. + # Hist should start "fresh" with every eval. self.assertAllClose(expected_bin_counts, hist.eval()) self.assertAllClose(expected_bin_counts, hist.eval()) def test_one_update_on_constant_2d_input(self): # Bins will be: # (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf) - nbins = [5] value_range = [0.0, 5.0] - new_values = [[-1.0, 0.0, 1.5], [2.0, 5.0, 15]] + values = [[-1.0, 0.0, 1.5], [2.0, 5.0, 15]] expected_bin_counts = [2, 1, 1, 0, 2] - with self.test_session() as sess: - hist = variables.Variable(array_ops.zeros(nbins, dtype=dtypes.int32)) - hist_update = histogram_ops.histogram_fixed_width(hist, new_values, - value_range) - variables.initialize_all_variables().run() - self.assertTrue(hist.dtype.is_compatible_with(hist_update.dtype)) - updated_hist_array = sess.run(hist_update) - - # The new updated_hist_array is returned by the updating op. 
- self.assertAllClose(expected_bin_counts, updated_hist_array) + with self.test_session(): + hist = tf.histogram_fixed_width(values, value_range, nbins=5) + tf.initialize_all_variables().run() - # hist should contain updated values, but eval() should not change it. + # Hist should start "fresh" with every eval. self.assertAllClose(expected_bin_counts, hist.eval()) self.assertAllClose(expected_bin_counts, hist.eval()) def test_two_updates_on_constant_input(self): # Bins will be: # (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf) - nbins = [5] value_range = [0.0, 5.0] - new_values_1 = [-1.0, 0.0, 1.5, 2.0, 5.0, 15] - new_values_2 = [1.5, 4.5, 4.5, 4.5, 0.0, 0.0] + values_1 = [-1.0, 0.0, 1.5, 2.0, 5.0, 15] + values_2 = [1.5, 4.5, 4.5, 4.5, 0.0, 0.0] expected_bin_counts_1 = [2, 1, 1, 0, 2] - expected_bin_counts_2 = [4, 2, 1, 0, 5] - with self.test_session() as sess: - hist = variables.Variable(array_ops.zeros(nbins, dtype=dtypes.int32)) - new_values = array_ops.placeholder(dtypes.float32, shape=[6]) - hist_update = histogram_ops.histogram_fixed_width(hist, new_values, - value_range) - variables.initialize_all_variables().run() - updated_hist_array = sess.run(hist_update, - feed_dict={new_values: new_values_1}) - - # The new updated_hist_array is returned by the updating op. - # hist should contain the updated values. - self.assertAllClose(expected_bin_counts_1, updated_hist_array) - self.assertAllClose(expected_bin_counts_1, hist.eval()) - - updated_hist_array = sess.run(hist_update, - feed_dict={new_values: new_values_2}) - self.assertAllClose(expected_bin_counts_2, updated_hist_array) - self.assertAllClose(expected_bin_counts_2, hist.eval()) + expected_bin_counts_2 = [2, 1, 0, 0, 3] + with self.test_session(): + values = tf.placeholder(tf.float32, shape=[6]) + hist = tf.histogram_fixed_width(values, value_range, nbins=5) + tf.initialize_all_variables().run() + + # The values in hist should depend on the current feed and nothing else. 
+ self.assertAllClose(expected_bin_counts_1, + hist.eval(feed_dict={values: values_1})) + self.assertAllClose(expected_bin_counts_2, + hist.eval(feed_dict={values: values_2})) + self.assertAllClose(expected_bin_counts_1, + hist.eval(feed_dict={values: values_1})) + self.assertAllClose(expected_bin_counts_1, + hist.eval(feed_dict={values: values_1})) def test_two_updates_on_scalar_input(self): # Bins will be: # (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf) - nbins = [5] value_range = [0.0, 5.0] - new_values_1 = 1.5 - new_values_2 = 2.5 + values_1 = 1.5 + values_2 = 2.5 expected_bin_counts_1 = [0, 1, 0, 0, 0] - expected_bin_counts_2 = [0, 1, 1, 0, 0] - with self.test_session() as sess: - hist = variables.Variable(array_ops.zeros(nbins, dtype=dtypes.int32)) - new_values = array_ops.placeholder(dtypes.float32, shape=[]) - hist_update = histogram_ops.histogram_fixed_width(hist, new_values, - value_range) - variables.initialize_all_variables().run() - - # The new updated_hist_array is returned by the updating op. - # hist should contain the updated values. - updated_hist_array = sess.run(hist_update, - feed_dict={new_values: new_values_1}) - self.assertAllClose(expected_bin_counts_1, updated_hist_array) - self.assertAllClose(expected_bin_counts_1, hist.eval()) - - updated_hist_array = sess.run(hist_update, - feed_dict={new_values: new_values_2}) - self.assertAllClose(expected_bin_counts_2, updated_hist_array) - self.assertAllClose(expected_bin_counts_2, hist.eval()) - - def test_multiple_random_3d_updates_results_in_right_dist(self): - # Update with uniform 3-D rvs. Resultant + expected_bin_counts_2 = [0, 0, 1, 0, 0] + with self.test_session(): + values = tf.placeholder(tf.float32, shape=[]) + hist = tf.histogram_fixed_width(values, value_range, nbins=5) + tf.initialize_all_variables().run() + + # The values in hist should depend on the current feed and nothing else. 
+ self.assertAllClose(expected_bin_counts_2, + hist.eval(feed_dict={values: values_2})) + self.assertAllClose(expected_bin_counts_1, + hist.eval(feed_dict={values: values_1})) + self.assertAllClose(expected_bin_counts_1, + hist.eval(feed_dict={values: values_1})) + self.assertAllClose(expected_bin_counts_2, + hist.eval(feed_dict={values: values_2})) + + def test_multiple_random_accumulating_updates_results_in_right_dist(self): + # Accumulate the updates in a new variable. Resultant # histogram should be uniform. Use only 3 bins because with many bins it # would be unlikely that all would be close to 1/n. If someone ever wants # to test that, it would be better to check that the cdf was linear. - nbins = [3] value_range = [1.0, 4.14159] with self.test_session() as sess: - hist = variables.Variable(array_ops.zeros(nbins, dtype=dtypes.int32)) - new_values = array_ops.placeholder(dtypes.float32, shape=[4, 4, 4]) - hist_update = histogram_ops.histogram_fixed_width(hist, new_values, - value_range) - variables.initialize_all_variables().run() + values = tf.placeholder(tf.float32, shape=[4, 4, 4]) + hist = tf.histogram_fixed_width(values, + value_range, + nbins=3, + dtype=tf.int64) + + hist_accum = tf.Variable(tf.zeros_initializer([3], dtype=tf.int64)) + hist_accum = hist_accum.assign_add(hist) + + tf.initialize_all_variables().run() for _ in range(100): # Map the rv: U[0, 1] --> U[value_range[0], value_range[1]]. - new_values_arr = ( + values_arr = ( value_range[0] + (value_range[1] - value_range[0]) * self.rng.rand(4, 4, 4)) - # The new updated_hist_array is returned by the updating op. - # hist should contain the updated values. 
- updated_hist_array = sess.run(hist_update, - feed_dict={new_values: new_values_arr}) + hist_accum_arr = sess.run(hist_accum, feed_dict={values: values_arr}) - pmf = updated_hist_array / float(updated_hist_array.sum()) + pmf = hist_accum_arr / float(hist_accum_arr.sum()) np.testing.assert_allclose(1 / 3, pmf, atol=0.02) if __name__ == '__main__': - googletest.main() + tf.test.main() diff --git a/tensorflow/python/ops/io_ops.py b/tensorflow/python/ops/io_ops.py index 15cf4736a3..d622d5dc21 100644 --- a/tensorflow/python/ops/io_ops.py +++ b/tensorflow/python/ops/io_ops.py @@ -92,6 +92,7 @@ The "producer" functions add a queue to the graph and a corresponding @@match_filenames_once @@limit_epochs +@@input_producer @@range_input_producer @@slice_input_producer @@string_input_producer diff --git a/tensorflow/python/ops/rnn_cell.py b/tensorflow/python/ops/rnn_cell.py index 7d922b3ed7..ebdfdc113b 100644 --- a/tensorflow/python/ops/rnn_cell.py +++ b/tensorflow/python/ops/rnn_cell.py @@ -556,15 +556,13 @@ class EmbeddingWrapper(RNNCell): feed into your RNN. """ - def __init__(self, cell, embedding_classes=0, embedding=None, - initializer=None): + def __init__(self, cell, embedding_classes, embedding_size, initializer=None): """Create a cell with an added input embedding. Args: cell: an RNNCell, an embedding will be put before its inputs. embedding_classes: integer, how many symbols will be embedded. - embedding: Variable, the embedding to use; if None, a new embedding - will be created; if set, then embedding_classes is not required. + embedding_size: integer, the size of the vectors we embed into. initializer: an initializer to use when creating the embedding; if None, the initializer from variable scope or a default one is used. 
@@ -574,21 +572,12 @@ class EmbeddingWrapper(RNNCell): """ if not isinstance(cell, RNNCell): raise TypeError("The parameter cell is not RNNCell.") - if embedding_classes < 1 and embedding is None: - raise ValueError("Pass embedding or embedding_classes must be > 0: %d." - % embedding_classes) - if embedding_classes > 0 and embedding is not None: - if embedding.size[0] != embedding_classes: - raise ValueError("You declared embedding_classes=%d but passed an " - "embedding for %d classes." % (embedding.size[0], - embedding_classes)) - if embedding.size[1] != cell.input_size: - raise ValueError("You passed embedding with output size %d and a cell" - " that accepts size %d." % (embedding.size[1], - cell.input_size)) + if embedding_classes <= 0 or embedding_size <= 0: + raise ValueError("Both embedding_classes and embedding_size must be > 0: " + "%d, %d." % (embedding_classes, embedding_size)) self._cell = cell self._embedding_classes = embedding_classes - self._embedding = embedding + self._embedding_size = embedding_size self._initializer = initializer @property @@ -607,20 +596,17 @@ class EmbeddingWrapper(RNNCell): """Run the cell on embedded inputs.""" with vs.variable_scope(scope or type(self).__name__): # "EmbeddingWrapper" with ops.device("/cpu:0"): - if self._embedding: - embedding = self._embedding + if self._initializer: + initializer = self._initializer + elif vs.get_variable_scope().initializer: + initializer = vs.get_variable_scope().initializer else: - if self._initializer: - initializer = self._initializer - elif vs.get_variable_scope().initializer: - initializer = vs.get_variable_scope().initializer - else: - # Default initializer for embeddings should have variance=1. - sqrt3 = math.sqrt(3) # Uniform(-sqrt(3), sqrt(3)) has variance=1. 
- initializer = init_ops.random_uniform_initializer(-sqrt3, sqrt3) - embedding = vs.get_variable("embedding", [self._embedding_classes, - self._cell.input_size], - initializer=initializer) + # Default initializer for embeddings should have variance=1. + sqrt3 = math.sqrt(3) # Uniform(-sqrt(3), sqrt(3)) has variance=1. + initializer = init_ops.random_uniform_initializer(-sqrt3, sqrt3) + embedding = vs.get_variable("embedding", [self._embedding_classes, + self._embedding_size], + initializer=initializer) embedded = embedding_ops.embedding_lookup( embedding, array_ops.reshape(inputs, [-1])) return self._cell(embedded, state) diff --git a/tensorflow/python/ops/seq2seq.py b/tensorflow/python/ops/seq2seq.py index 7df123ef70..6cbf70437c 100644 --- a/tensorflow/python/ops/seq2seq.py +++ b/tensorflow/python/ops/seq2seq.py @@ -311,7 +311,9 @@ def embedding_rnn_seq2seq(encoder_inputs, decoder_inputs, cell, """ with variable_scope.variable_scope(scope or "embedding_rnn_seq2seq"): # Encoder. - encoder_cell = rnn_cell.EmbeddingWrapper(cell, num_encoder_symbols) + encoder_cell = rnn_cell.EmbeddingWrapper( + cell, embedding_classes=num_encoder_symbols, + embedding_size=cell.input_size) _, encoder_state = rnn.rnn(encoder_cell, encoder_inputs, dtype=dtype) # Decoder. @@ -686,7 +688,9 @@ def embedding_attention_seq2seq(encoder_inputs, decoder_inputs, cell, """ with variable_scope.variable_scope(scope or "embedding_attention_seq2seq"): # Encoder. - encoder_cell = rnn_cell.EmbeddingWrapper(cell, num_encoder_symbols) + encoder_cell = rnn_cell.EmbeddingWrapper( + cell, embedding_classes=num_encoder_symbols, + embedding_size=cell.input_size) encoder_outputs, encoder_state = rnn.rnn( encoder_cell, encoder_inputs, dtype=dtype) @@ -772,7 +776,9 @@ def one2many_rnn_seq2seq(encoder_inputs, decoder_inputs_dict, cell, with variable_scope.variable_scope(scope or "one2many_rnn_seq2seq"): # Encoder. 
- encoder_cell = rnn_cell.EmbeddingWrapper(cell, num_encoder_symbols) + encoder_cell = rnn_cell.EmbeddingWrapper( + cell, embedding_classes=num_encoder_symbols, + embedding_size=cell.input_size) _, encoder_state = rnn.rnn(encoder_cell, encoder_inputs, dtype=dtype) # Decoder. diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py index c7c4ceb083..e1fd5d0143 100644 --- a/tensorflow/python/ops/sparse_ops.py +++ b/tensorflow/python/ops/sparse_ops.py @@ -774,7 +774,7 @@ def _SerializeManySparseShape(op): # pylint: disable=invalid-name return [tensor_shape.matrix(None, 3)] -def deserialize_many_sparse(serialized_sparse, dtype, name=None): +def deserialize_many_sparse(serialized_sparse, dtype, rank=None, name=None): """Deserialize and concatenate `SparseTensors` from a serialized minibatch. The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where @@ -823,6 +823,7 @@ def deserialize_many_sparse(serialized_sparse, dtype, name=None): serialized_sparse: 2-D `Tensor` of type `string` of shape `[N, 3]`. The serialized and packed `SparseTensor' objects. dtype: The `dtype` of the serialized `SparseTensor` objects. + rank: (optional) Python int, the rank of the `SparseTensor` objects. 
name: A name prefix for the returned tensors (optional) Returns: @@ -835,6 +836,10 @@ def deserialize_many_sparse(serialized_sparse, dtype, name=None): gen_sparse_ops._deserialize_many_sparse( serialized_sparse, dtype, name=name)) + # Feed rank data back in, if available + output_indices.set_shape([None, rank]) + output_shape.set_shape([rank]) + return ops.SparseTensor(output_indices, output_values, output_shape) diff --git a/tensorflow/python/ops/standard_ops.py b/tensorflow/python/ops/standard_ops.py index 13cb40cf31..90ac0a057b 100644 --- a/tensorflow/python/ops/standard_ops.py +++ b/tensorflow/python/ops/standard_ops.py @@ -42,6 +42,7 @@ from tensorflow.python.ops.control_flow_ops import foldr from tensorflow.python.ops.control_flow_ops import map_fn from tensorflow.python.ops.data_flow_ops import * from tensorflow.python.ops.gradients import * +from tensorflow.python.ops.histogram_ops import * from tensorflow.python.ops.init_ops import * from tensorflow.python.ops.io_ops import * from tensorflow.python.ops.linalg_ops import * diff --git a/tensorflow/python/platform/benchmark.py b/tensorflow/python/platform/benchmark.py new file mode 100644 index 0000000000..87f95bb2c7 --- /dev/null +++ b/tensorflow/python/platform/benchmark.py @@ -0,0 +1,213 @@ +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Utilities to run benchmarks.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import inspect +import numbers +import os +import re +import sys + +import six # pylint: disable=unused-import + +from google.protobuf import text_format +from tensorflow.core.util import test_log_pb2 +from tensorflow.python.platform import app +from tensorflow.python.platform import gfile + +# When a subclass of the Benchmark class is created, it is added to +# the registry automatically +GLOBAL_BENCHMARK_REGISTRY = set() + +# Environment variable that determines whether benchmarks are written. +# See also tensorflow/core/util/reporter.h TestReporter::kTestReporterEnv. +TEST_REPORTER_TEST_ENV = "TEST_REPORT_FILE_PREFIX" + + +def _global_report_benchmark( + name, iters=None, cpu_time=None, wall_time=None, + throughput=None, extras=None): + """Method for recording a benchmark directly. + + Args: + name: The BenchmarkEntry name. + iters: (optional) How many iterations were run + cpu_time: (optional) Total cpu time in seconds + wall_time: (optional) Total wall time in seconds + throughput: (optional) Throughput (in MB/s) + extras: (optional) Dict mapping string keys to additional benchmark info. + + Raises: + TypeError: if extras is not a dict. + IOError: if the benchmark output file already exists. 
+ """ + if extras is not None: + if not isinstance(extras, dict): + raise TypeError("extras must be a dict") + + test_env = os.environ.get(TEST_REPORTER_TEST_ENV, None) + if test_env is None: + # Reporting was not requested + return + + entry = test_log_pb2.BenchmarkEntry() + entry.name = name + if iters is not None: + entry.iters = iters + if cpu_time is not None: + entry.cpu_time = cpu_time + if wall_time is not None: + entry.wall_time = wall_time + if throughput is not None: + entry.throughput = throughput + if extras is not None: + for (k, v) in extras.items(): + if isinstance(v, numbers.Number): + entry.extras[k].double_value = v + else: + entry.extras[k].string_value = str(v) + + serialized_entry = text_format.MessageToString(entry) + + mangled_name = name.replace("/", "__") + output_path = "%s%s" % (test_env, mangled_name) + if gfile.Exists(output_path): + raise IOError("File already exists: %s" % output_path) + with gfile.GFile(output_path, "w") as out: + out.write(serialized_entry) + + +class _BenchmarkRegistrar(type): + """The Benchmark class registrar. Used by abstract Benchmark class.""" + + def __new__(mcs, clsname, base, attrs): + newclass = super(mcs, _BenchmarkRegistrar).__new__( + mcs, clsname, base, attrs) + if len(newclass.mro()) > 2: + # Only the base Benchmark abstract class has mro length 2. + # The rest subclass from it and are therefore registered. + GLOBAL_BENCHMARK_REGISTRY.add(newclass) + return newclass + + +class Benchmark(object): + """Abstract class that provides helper functions for running benchmarks. + + Any class subclassing this one is immediately registered in the global + benchmark registry. + + Only methods whose names start with the word "benchmark" will be run during + benchmarking. + """ + __metaclass__ = _BenchmarkRegistrar + + def _get_name(self, overwrite_name): + """Returns full name of class and method calling report_benchmark.""" + + # Expect that the caller called report_benchmark, which called _get_name. 
+ caller = inspect.stack()[2] + calling_class = caller[0].f_locals.get("self", None) + # Use the method name, or overwrite_name is provided. + name = overwrite_name if overwrite_name is not None else caller[3] + if calling_class is not None: + # Prefix the name with the class name. + class_name = type(calling_class).__name__ + name = "%s.%s" % (class_name, name) + return name + + def report_benchmark( + self, + iters=None, + cpu_time=None, + wall_time=None, + throughput=None, + extras=None, + name=None): + """Report a benchmark. + + Args: + iters: (optional) How many iterations were run + cpu_time: (optional) Total cpu time in seconds + wall_time: (optional) Total wall time in seconds + throughput: (optional) Throughput (in MB/s) + extras: (optional) Dict mapping string keys to additional benchmark info. + name: (optional) Override the BenchmarkEntry name with `name`. + Otherwise it is inferred from the calling class and top-level + method name. + """ + name = self._get_name(overwrite_name=name) + _global_report_benchmark( + name=name, iters=iters, cpu_time=cpu_time, wall_time=wall_time, + throughput=throughput, extras=extras) + + +def _run_specific_benchmark(benchmark_class): + benchmark = benchmark_class() + attrs = dir(benchmark) + # Only run methods of this class whose names start with "benchmark" + for attr in attrs: + if not attr.startswith("benchmark"): + continue + benchmark_fn = getattr(benchmark, attr) + if not callable(benchmark_fn): + continue + # Call this benchmark method + benchmark_fn() + + +def _run_benchmarks(regex): + """Run benchmarks that match regex `regex`. + + This function goes through the global benchmark registry, and matches + benchmark **classe names** of the form "module.name.BenchmarkClass" to + the given regex. If a class matches, all of its benchmark methods + are run. + + Args: + regex: The string regular expression to match Benchmark classes against. 
+ """ + registry = list(GLOBAL_BENCHMARK_REGISTRY) + + # Match benchmarks in registry against regex + for benchmark in registry: + benchmark_name = "%s.%s" % (benchmark.__module__, benchmark.__name__) + if re.search(regex, benchmark_name): + # Found a match + + _run_specific_benchmark(benchmark) + + +def benchmarks_main(true_main=None): + """Run benchmarks as declared in args. + + Args: + true_main: True main function to run if benchmarks are not requested. + """ + argv = sys.argv + found_arg = [arg for arg in argv + if arg.startswith("--benchmarks=") + or arg.startswith("-benchmarks=")] + if found_arg: + # Remove --benchmarks arg from sys.argv + argv.remove(found_arg[0]) + + regex = found_arg[0].split("=")[1] + app.run(lambda _: _run_benchmarks(regex)) + else: + true_main() diff --git a/tensorflow/python/platform/default/_app.py b/tensorflow/python/platform/default/_app.py index e700956f17..74fecfe7ef 100644 --- a/tensorflow/python/platform/default/_app.py +++ b/tensorflow/python/platform/default/_app.py @@ -23,8 +23,8 @@ import sys from tensorflow.python.platform import flags -def run(): +def run(main=None): f = flags.FLAGS f._parse_flags() - main = sys.modules['__main__'].main + main = main or sys.modules['__main__'].main sys.exit(main(sys.argv)) diff --git a/tensorflow/python/platform/googletest.py b/tensorflow/python/platform/googletest.py index 2049bd2b1d..76e15d7872 100644 --- a/tensorflow/python/platform/googletest.py +++ b/tensorflow/python/platform/googletest.py @@ -21,7 +21,20 @@ from __future__ import print_function # pylint: disable=g-import-not-at-top # pylint: disable=wildcard-import from . 
import control_imports +from tensorflow.python.platform import benchmark + +# Import the Benchmark class +Benchmark = benchmark.Benchmark # pylint: disable=invalid-name + if control_imports.USE_OSS and control_imports.OSS_GOOGLETEST: from tensorflow.python.platform.default._googletest import * + from tensorflow.python.platform.default._googletest import main as g_main else: from tensorflow.python.platform.google._googletest import * + from tensorflow.python.platform.google._googletest import main as g_main + + +# Redefine main to allow running benchmarks +def main(): + # Benchmarks determine whether to run tests or not, by calling g_main + benchmark.benchmarks_main(true_main=g_main) diff --git a/tensorflow/python/platform/test.py b/tensorflow/python/platform/test.py index d2b9d1f974..6d78193233 100644 --- a/tensorflow/python/platform/test.py +++ b/tensorflow/python/platform/test.py @@ -72,6 +72,10 @@ from tensorflow.python.kernel_tests.gradient_checker import compute_gradient # pylint: enable=unused-import +# Import Benchmark class +Benchmark = googletest.Benchmark # pylint: disable=invalid-name + + def main(): """Runs all unit tests.""" return googletest.main() diff --git a/tensorflow/python/training/coordinator.py b/tensorflow/python/training/coordinator.py index 661bae7bc1..f018126bc8 100644 --- a/tensorflow/python/training/coordinator.py +++ b/tensorflow/python/training/coordinator.py @@ -131,6 +131,8 @@ class Coordinator(object): # Event set when threads must stop. self._stop_event = threading.Event() # Python exc_info to report. + # If not None, it should hold the returned value of sys.exc_info(), which is + # a tuple containing exception (type, value, traceback). self._exc_info_to_raise = None def request_stop(self, ex=None): @@ -138,6 +140,10 @@ class Coordinator(object): After this is called, calls to `should_stop()` will return `True`. + Note: If an exception is being passed in, in must be in the context of + handling the exception (i.e. `try: ... 
except Exception as ex: ...`) and not + a newly created one. + Args: ex: Optional `Exception`, or Python `exc_info` tuple as returned by `sys.exc_info()`. If this is the first call to `request_stop()` the @@ -154,6 +160,22 @@ class Coordinator(object): logging.info("Error reported to Coordinator: %s", compat.as_str_any(ex)) self._exc_info_to_raise = sys.exc_info() + # self._exc_info_to_raise should contain a tuple containing exception + # (type, value, traceback) + if (len(self._exc_info_to_raise) != 3 or + not self._exc_info_to_raise[0] or + not self._exc_info_to_raise[1]): + # Raise, catch and record the exception here so that error happens + # where expected. + try: + raise ValueError( + "ex must be a tuple or sys.exc_info must return the current " + "exception: %s" + % self._exc_info_to_raise) + except ValueError: + # Record this error so it kills the coordinator properly. + self._exc_info_to_raise = sys.exc_info() + self._stop_event.set() def clear_stop(self): diff --git a/tensorflow/python/training/input.py b/tensorflow/python/training/input.py index ace7b49d97..ae2df782a4 100644 --- a/tensorflow/python/training/input.py +++ b/tensorflow/python/training/input.py @@ -84,20 +84,63 @@ def limit_epochs(tensor, num_epochs=None, name=None): return array_ops.identity(tensor, name=name) -def _input_producer(input_tensor, dtype, num_epochs, shuffle, seed, capacity, - shared_name, name, summary_name): - if shuffle: - input_tensor = random_ops.random_shuffle(input_tensor, seed=seed) - input_tensor = limit_epochs(input_tensor, num_epochs) - - q = data_flow_ops.FIFOQueue(capacity=capacity, dtypes=[dtype], shapes=[[]], - shared_name=shared_name, name=name) - enq = q.enqueue_many([input_tensor]) - queue_runner.add_queue_runner(queue_runner.QueueRunner(q, [enq])) - logging_ops.scalar_summary("queue/%s/%s" % (q.name, summary_name), - math_ops.cast(q.size(), dtypes.float32) * - (1. 
/ capacity)) - return q +def input_producer(input_tensor, element_shape=None, num_epochs=None, + shuffle=True, seed=None, capacity=32, shared_name=None, + summary_name=None, name=None): + """Output the rows of `input_tensor` to a queue for an input pipeline. + + Args: + input_tensor: A tensor with the rows to produce. Must be at + one-dimensional. Must either have a fully-defined shape, or + `element_shape` must be defined. + element_shape: (Optional.) A `TensorShape` representing the shape of a + row of `input_tensor`, if it cannot be inferred. + num_epochs: (Optional.) An integer. If specified `input_producer` produces + each row of `input_tensor` `num_epochs` times before generating an + `OutOfRange` error. If not specified, `input_producer` can cycle through + the rows of `input_tensor` an unlimited number of times. + shuffle: (Optional.) A boolean. If true, the rows are randomly shuffled + within each eopch. + seed: (Optional.) An integer. The seed to use if `shuffle` is true. + capacity: (Optional.) The capacity of the queue to be used for buffering + the input. + shared_name: (Optional.) If set, this queue will be shared under the given + name across multiple sessions. + summary_name: (Optional.) If set, a scalar summary for the current queue + size will be generated, using this name as part of the tag. + name: (Optional.) A name for queue. + + Returns: + A queue with the output rows. A `QueueRunner` for the queue is + added to the current `QUEUE_RUNNER` collection of the current + graph. + + Raises: + ValueError: If the shape of the input cannot be inferred from the arguments. 
+ """ + with ops.op_scope([input_tensor], name, "input_producer"): + input_tensor = ops.convert_to_tensor(input_tensor, name="input_tensor") + element_shape = input_tensor.get_shape()[1:].merge_with(element_shape) + if not element_shape.is_fully_defined(): + raise ValueError("Either `input_tensor` must have a fully defined shape " + "or `element_shape` must be specified") + + if shuffle: + input_tensor = random_ops.random_shuffle(input_tensor, seed=seed) + + input_tensor = limit_epochs(input_tensor, num_epochs) + + q = data_flow_ops.FIFOQueue(capacity=capacity, + dtypes=[input_tensor.dtype.base_dtype], + shapes=[element_shape], + shared_name=shared_name, name=name) + enq = q.enqueue_many([input_tensor]) + queue_runner.add_queue_runner(queue_runner.QueueRunner(q, [enq])) + if summary_name is not None: + logging_ops.scalar_summary("queue/%s/%s" % (q.name, summary_name), + math_ops.cast(q.size(), dtypes.float32) * + (1. / capacity)) + return q def string_input_producer(string_tensor, num_epochs=None, shuffle=True, @@ -108,9 +151,9 @@ def string_input_producer(string_tensor, num_epochs=None, shuffle=True, string_tensor: A 1-D string tensor with the strings to produce. num_epochs: An integer (optional). If specified, `string_input_producer` produces each string from `string_tensor` `num_epochs` times before - generating an OutOfRange error. If not specified, `string_input_producer` - can cycle through the strings in `string_tensor` an unlimited number of - times. + generating an `OutOfRange` error. If not specified, + `string_input_producer` can cycle through the strings in `string_tensor` + an unlimited number of times. shuffle: Boolean. If true, the strings are randomly shuffled within each epoch. seed: An integer (optional). Seed used if shuffle == True. 
@@ -137,9 +180,9 @@ def string_input_producer(string_tensor, num_epochs=None, shuffle=True, logging_ops.Assert(math_ops.greater(array_ops.size(string_tensor), 0), [not_null_err])]): string_tensor = array_ops.identity(string_tensor) - return _input_producer( + return input_producer( input_tensor=string_tensor, - dtype=dtypes.string, + element_shape=[], num_epochs=num_epochs, shuffle=shuffle, seed=seed, @@ -173,8 +216,8 @@ def range_input_producer(limit, num_epochs=None, shuffle=True, seed=None, """ with ops.op_scope([limit], name, "input_producer") as name: range_tensor = math_ops.range(limit) - return _input_producer( - range_tensor, dtypes.int32, num_epochs, shuffle, seed, capacity, + return input_producer( + range_tensor, [], num_epochs, shuffle, seed, capacity, shared_name, name, "fraction_of_%d_full" % capacity) @@ -231,51 +274,104 @@ def _flatten(tensor_list_list): return [tensor for tensor_list in tensor_list_list for tensor in tensor_list] +class _SparseMetaData(object): + """Store information about the Tensor: Is it sparse?, dtype, and rank.""" + + def __init__(self, sparse, dtype, rank): + self._sparse = sparse + self._dtype = dtype + self._rank = rank + + def __eq__(self, other): + if self.sparse != other.sparse: + return False + if not self.sparse: + return True + if self.dtype != other.dtype: + return False + if not self.rank.is_compatible_with(other.rank): + return False + return True + + def __ne__(self, other): + return not self.__eq__(other) + + def __str__(self): + return "[SparseMetaData(%s, %s, %s)]" % (self.sparse, self.dtype, self.rank) + + def merge_with(self, other): + if self != other: + raise ValueError("SparseMetaData objects are incompatible: %s vs. 
%s" + % (self, other)) + if self.sparse: + self.rank.merge_with(other.rank) + return self + + @property + def dtype(self): + return self._dtype + + @property + def sparse(self): + return self._sparse + + @property + def rank(self): + return self._rank + + def _serialize_sparse_tensors(tensor_list, enqueue_many): """Serialize SparseTensors for feeding into batch, etc.""" - is_sparse_list = [isinstance(t, ops.SparseTensor) for t in tensor_list] - sparse_dtypes_list = [ - t.dtype if isinstance(t, ops.SparseTensor) else None + sparse_info_list = [ + _SparseMetaData(sparse=True, + dtype=t.dtype, + rank=t.shape.get_shape().with_rank(1)[0]) + if isinstance(t, ops.SparseTensor) + else _SparseMetaData(False, None, None) for t in tensor_list] - def _maybe_serialize(t, is_sparse): - if not is_sparse: + def _maybe_serialize(t, sparse): + if not sparse: return t return (sparse_ops.serialize_many_sparse(t) if enqueue_many else sparse_ops.serialize_sparse(t)) + serialized_list = [ - _maybe_serialize(t, is_sparse) - for (t, is_sparse) in zip(tensor_list, is_sparse_list)] - return serialized_list, is_sparse_list, sparse_dtypes_list + _maybe_serialize(t, info.sparse) for (t, info) + in zip(tensor_list, sparse_info_list)] + + return serialized_list, sparse_info_list def _serialize_sparse_tensors_join(tensor_list_list, enqueue_many): """Serialize SparseTensors for feeding into batch_join, etc.""" - (s0, is_sparse_list, sparse_dtypes_list) = _serialize_sparse_tensors( + (s0, sparse_info_list) = _serialize_sparse_tensors( tensor_list_list[0], enqueue_many) serialized_list_list = [s0] for tensor_list in tensor_list_list[1:]: - (s, is_sparse_candidate, sparse_dtypes_candidate) = ( - _serialize_sparse_tensors(tensor_list, enqueue_many)) - if is_sparse_candidate != is_sparse_list: + s, sparse_info_candidate = _serialize_sparse_tensors( + tensor_list, enqueue_many) + if sparse_info_list != sparse_info_candidate: raise ValueError("Inconsistent SparseTensors list: %s vs. 
%s" % (tensor_list_list[0], tensor_list)) - if sparse_dtypes_candidate != sparse_dtypes_list: - raise ValueError("Inconsistent SparseTensor dtypes in list: %s vs. %s" - % (tensor_list_list[0], tensor_list)) + sparse_info_list = [ + info.merge_with(candidate) + for (info, candidate) in zip(sparse_info_list, sparse_info_candidate)] serialized_list_list.append(s) - return (serialized_list_list, is_sparse_list, sparse_dtypes_list) + + return (serialized_list_list, sparse_info_list) -def _deserialize_sparse_tensors(serialized_list, is_sparse_list, sparse_dtypes): +def _deserialize_sparse_tensors(serialized_list, sparse_info_list): """Deserialize SparseTensors after dequeue in batch, batch_join, etc.""" received_sequence = isinstance(serialized_list, collections.Sequence) if not received_sequence: serialized_list = (serialized_list,) - tensors = [sparse_ops.deserialize_many_sparse(s, sparse_dtype) if is_sparse - else s - for (s, is_sparse, sparse_dtype) - in zip(serialized_list, is_sparse_list, sparse_dtypes)] + tensors = [ + sparse_ops.deserialize_many_sparse(s, info.dtype, info.rank.value) + if info.sparse else s + for (s, info) + in zip(serialized_list, sparse_info_list)] return tensors if received_sequence else tensors[0] @@ -345,7 +441,8 @@ def _enqueue(queue, tensor_list, threads, enqueue_many): def batch(tensor_list, batch_size, num_threads=1, capacity=32, - enqueue_many=False, shapes=None, shared_name=None, name=None): + enqueue_many=False, shapes=None, + shared_name=None, name=None): """Creates batches of tensors in `tensor_list`. This function is implemented using a queue. 
A `QueueRunner` for the @@ -394,7 +491,7 @@ def batch(tensor_list, batch_size, num_threads=1, capacity=32, """ with ops.op_scope(tensor_list, name, "batch") as name: tensor_list = _validate(tensor_list) - tensor_list, is_sparse, sparse_dtypes = _serialize_sparse_tensors( + (tensor_list, sparse_info) = _serialize_sparse_tensors( tensor_list, enqueue_many) types = _dtypes([tensor_list]) shapes = _shapes([tensor_list], shapes, enqueue_many) @@ -407,7 +504,7 @@ def batch(tensor_list, batch_size, num_threads=1, capacity=32, math_ops.cast(queue.size(), dtypes.float32) * (1. / capacity)) dequeued = queue.dequeue_many(batch_size, name=name) - dequeued = _deserialize_sparse_tensors(dequeued, is_sparse, sparse_dtypes) + dequeued = _deserialize_sparse_tensors(dequeued, sparse_info) return dequeued @@ -478,8 +575,8 @@ def batch_join(tensor_list_list, batch_size, capacity=32, enqueue_many=False, """ with ops.op_scope(_flatten(tensor_list_list), name, "batch_join") as name: tensor_list_list = _validate_join(tensor_list_list) - tensor_list_list, is_sparse, sparse_dtypes = ( - _serialize_sparse_tensors_join(tensor_list_list, enqueue_many)) + tensor_list_list, sparse_info = _serialize_sparse_tensors_join( + tensor_list_list, enqueue_many) types = _dtypes(tensor_list_list) shapes = _shapes(tensor_list_list, shapes, enqueue_many) # TODO(josh11b,mrry): Switch to BatchQueue once it is written. @@ -491,7 +588,7 @@ def batch_join(tensor_list_list, batch_size, capacity=32, enqueue_many=False, math_ops.cast(queue.size(), dtypes.float32) * (1. 
/ capacity)) dequeued = queue.dequeue_many(batch_size, name=name) - dequeued = _deserialize_sparse_tensors(dequeued, is_sparse, sparse_dtypes) + dequeued = _deserialize_sparse_tensors(dequeued, sparse_info) return dequeued @@ -567,7 +664,7 @@ def shuffle_batch(tensor_list, batch_size, capacity, min_after_dequeue, """ with ops.op_scope(tensor_list, name, "shuffle_batch") as name: tensor_list = _validate(tensor_list) - tensor_list, is_sparse, sparse_dtypes = _serialize_sparse_tensors( + tensor_list, sparse_info = _serialize_sparse_tensors( tensor_list, enqueue_many) types = _dtypes([tensor_list]) shapes = _shapes([tensor_list], shapes, enqueue_many) @@ -586,7 +683,7 @@ def shuffle_batch(tensor_list, batch_size, capacity, min_after_dequeue, logging_ops.scalar_summary(summary_name, full) dequeued = queue.dequeue_many(batch_size, name=name) - dequeued = _deserialize_sparse_tensors(dequeued, is_sparse, sparse_dtypes) + dequeued = _deserialize_sparse_tensors(dequeued, sparse_info) return dequeued @@ -652,8 +749,8 @@ def shuffle_batch_join(tensor_list_list, batch_size, capacity, with ops.op_scope( _flatten(tensor_list_list), name, "shuffle_batch_join") as name: tensor_list_list = _validate_join(tensor_list_list) - tensor_list_list, is_sparse, sparse_dtypes = ( - _serialize_sparse_tensors_join(tensor_list_list, enqueue_many)) + tensor_list_list, sparse_info = _serialize_sparse_tensors_join( + tensor_list_list, enqueue_many) types = _dtypes(tensor_list_list) shapes = _shapes(tensor_list_list, shapes, enqueue_many) queue = data_flow_ops.RandomShuffleQueue( @@ -671,5 +768,5 @@ def shuffle_batch_join(tensor_list_list, batch_size, capacity, logging_ops.scalar_summary(summary_name, full) dequeued = queue.dequeue_many(batch_size, name=name) - dequeued = _deserialize_sparse_tensors(dequeued, is_sparse, sparse_dtypes) + dequeued = _deserialize_sparse_tensors(dequeued, sparse_info) return dequeued diff --git a/tensorflow/python/training/input_test.py 
b/tensorflow/python/training/input_test.py index 14c31442dd..b265c6e3c4 100644 --- a/tensorflow/python/training/input_test.py +++ b/tensorflow/python/training/input_test.py @@ -69,6 +69,60 @@ class LimitEpochsTest(tf.test.TestCase): love_me_two_times.eval() +class InputProducerTest(tf.test.TestCase): + + def testNoShuffle(self): + with self.test_session(): + input_tensor = [[1, 2, 3, 4], + [5, 6, 7, 8], + [9, 10, 11, 12]] + num_epochs = 2 + queue = tf.train.input_producer( + input_tensor, num_epochs=num_epochs, shuffle=False) + dequeue_many = queue.dequeue_many(len(input_tensor) * num_epochs) + dequeue = queue.dequeue() + tf.initialize_all_variables().run() + threads = tf.train.start_queue_runners() + + # No randomness, so just see repeated copies of the input. + self.assertAllEqual(input_tensor * num_epochs, dequeue_many.eval()) + + # Reached the limit. + with self.assertRaises(tf.errors.OutOfRangeError): + dequeue.eval() + for thread in threads: + thread.join() + + def testNoShapeInference(self): + with self.test_session(): + # Disable shape inference for the input. + input_value = [[1, 2, 3, 4], + [5, 6, 7, 8], + [9, 10, 11, 12]] + input_tensor = tf.placeholder_with_default(input_value, shape=None) + num_epochs = 2 + queue = tf.train.input_producer( + input_tensor, element_shape=[4], num_epochs=num_epochs, shuffle=False) + dequeue_many = queue.dequeue_many(len(input_value) * num_epochs) + dequeue = queue.dequeue() + tf.initialize_all_variables().run() + threads = tf.train.start_queue_runners() + + # No randomness, so just see repeated copies of the input. + self.assertAllEqual(input_value * num_epochs, dequeue_many.eval()) + + # Reached the limit. 
+ with self.assertRaises(tf.errors.OutOfRangeError): + dequeue.eval() + for thread in threads: + thread.join() + + def testShapeError(self): + input_tensor = tf.placeholder(tf.float32, None) + with self.assertRaisesRegexp(ValueError, "fully defined shape"): + _ = tf.train.input_producer(input_tensor) + + class StringInputProducerTest(tf.test.TestCase): def testNoShuffle(self): diff --git a/tensorflow/python/training/summary_io.py b/tensorflow/python/training/summary_io.py index 1257230df9..ff92008872 100644 --- a/tensorflow/python/training/summary_io.py +++ b/tensorflow/python/training/summary_io.py @@ -25,11 +25,14 @@ import time import six +from tensorflow.core.framework import graph_pb2 from tensorflow.core.framework import summary_pb2 from tensorflow.core.util import event_pb2 from tensorflow.python import pywrap_tensorflow +from tensorflow.python.framework import ops from tensorflow.python.lib.io import tf_record from tensorflow.python.platform import gfile +from tensorflow.python.platform import logging from tensorflow.python.util import compat @@ -53,7 +56,8 @@ class SummaryWriter(object): @@close """ - def __init__(self, logdir, graph_def=None, max_queue=10, flush_secs=120): + def __init__(self, logdir, graph=None, max_queue=10, flush_secs=120, + graph_def=None): """Creates a `SummaryWriter` and an event file. On construction the summary writer creates a new event file in `logdir`. @@ -61,7 +65,7 @@ class SummaryWriter(object): call one of the following functions: `add_summary()`, `add_session_log()`, `add_event()`, or `add_graph()`. - If you pass a `graph_def` protocol buffer to the constructor it is added to + If you pass a `Graph` to the constructor it is added to the event file. (This is equivalent to calling `add_graph()` later). TensorBoard will pick the graph from the file and display it graphically so @@ -72,8 +76,8 @@ class SummaryWriter(object): ...create a graph... # Launch the graph in a session. 
sess = tf.Session() - # Create a summary writer, add the 'graph_def' to the event file. - writer = tf.train.SummaryWriter(<some-directory>, sess.graph_def) + # Create a summary writer, add the 'graph' to the event file. + writer = tf.train.SummaryWriter(<some-directory>, sess.graph) ``` The other arguments to the constructor control the asynchronous writes to @@ -86,10 +90,11 @@ class SummaryWriter(object): Args: logdir: A string. Directory where event file will be written. - graph_def: A `GraphDef` protocol buffer. + graph: A `Graph` object, such as `sess.graph`. max_queue: Integer. Size of the queue for pending events and summaries. flush_secs: Number. How often, in seconds, to flush the pending events and summaries to disk. + graph_def: DEPRECATED: Use the `graph` argument instead. """ self._logdir = logdir if not gfile.IsDirectory(self._logdir): @@ -100,8 +105,9 @@ class SummaryWriter(object): self._worker = _EventLoggerThread(self._event_queue, self._ev_writer, flush_secs) self._worker.start() - if graph_def is not None: - self.add_graph(graph_def) + if graph is not None or graph_def is not None: + # Calling it with both graph and graph_def for backward compatibility. + self.add_graph(graph=graph, graph_def=graph_def) def add_summary(self, summary, global_step=None): """Adds a `Summary` protocol buffer to the event file. @@ -154,22 +160,64 @@ class SummaryWriter(object): """ self._event_queue.put(event) - def add_graph(self, graph_def, global_step=None): - """Adds a `GraphDef` protocol buffer to the event file. + def _add_graph_def(self, graph_def, global_step=None): + graph_bytes = graph_def.SerializeToString() + event = event_pb2.Event(wall_time=time.time(), graph_def=graph_bytes) + if global_step is not None: + event.step = int(global_step) + self._event_queue.put(event) + + def add_graph(self, graph, global_step=None, graph_def=None): + """Adds a `Graph` to the event file. The graph described by the protocol buffer will be displayed by TensorBoard. 
Most users pass a graph in the constructor instead. Args: - graph_def: A `GraphDef` protocol buffer. + graph: A `Graph` object, such as `sess.graph`. global_step: Number. Optional global step counter to record with the graph. + graph_def: DEPRECATED. Use the `graph` parameter instead. + + Raises: + ValueError: If both graph and graph_def are passed to the method. """ - graph_bytes = graph_def.SerializeToString() - event = event_pb2.Event(wall_time=time.time(), graph_def=graph_bytes) - if global_step is not None: - event.step = int(global_step) - self._event_queue.put(event) + + if graph is not None and graph_def is not None: + raise ValueError("Please pass only graph, or graph_def (deprecated), " + "but not both.") + + if isinstance(graph, ops.Graph) or isinstance(graph_def, ops.Graph): + # The user passed a `Graph`. + + # Check if the user passed it via the graph or the graph_def argument and + # correct for that. + if not isinstance(graph, ops.Graph): + logging.warning("When passing a `Graph` object, please use the `graph`" + " named argument instead of `graph_def`.") + graph = graph_def + + # Serialize the graph with additional info. + true_graph_def = graph.as_graph_def(add_shapes=True) + elif (isinstance(graph, graph_pb2.GraphDef) + or isinstance(graph_def, graph_pb2.GraphDef)): + # The user passed a `GraphDef`. + logging.warning("Passing a `GraphDef` to the SummaryWriter is deprecated." + " Pass a `Graph` object instead, such as `sess.graph`.") + + # Check if the user passed it via the graph or the graph_def argument and + # correct for that. + if isinstance(graph, graph_pb2.GraphDef): + true_graph_def = graph + else: + true_graph_def = graph_def + + else: + # The user passed neither `Graph`, nor `GraphDef`. + raise TypeError("The passed graph must be an instance of `Graph` " + "or the deprecated `GraphDef`") + # Finally, add the graph_def to the summary writer. 
+ self._add_graph_def(true_graph_def, global_step) def flush(self): """Flushes the event file to disk. diff --git a/tensorflow/python/training/summary_writer_test.py b/tensorflow/python/training/summary_writer_test.py index 3307c2da12..d1ff95f902 100644 --- a/tensorflow/python/training/summary_writer_test.py +++ b/tensorflow/python/training/summary_writer_test.py @@ -49,6 +49,25 @@ class SummaryWriterTestCase(tf.test.TestCase): def _assertRecent(self, t): self.assertTrue(abs(t - time.time()) < 5) + def _assertEventsWithGraph(self, test_dir, g, has_shapes): + rr = self._EventsReader(test_dir) + + # The first event should list the file_version. + ev = next(rr) + self._assertRecent(ev.wall_time) + self.assertEquals("brain.Event:2", ev.file_version) + + # The next event should have the graph. + ev = next(rr) + self._assertRecent(ev.wall_time) + self.assertEquals(0, ev.step) + ev_graph = tf.GraphDef() + ev_graph.ParseFromString(ev.graph_def) + self.assertProtoEquals(g.as_graph_def(add_shapes=has_shapes), ev_graph) + + # We should be done. + self.assertRaises(StopIteration, lambda: next(rr)) + def testAddingSummaryAndGraph(self): test_dir = self._CleanTestDir("basics") sw = tf.train.SummaryWriter(test_dir) @@ -105,30 +124,54 @@ class SummaryWriterTestCase(tf.test.TestCase): # We should be done. 
self.assertRaises(StopIteration, lambda: next(rr)) - def testInitializingWithGraphDef(self): - test_dir = self._CleanTestDir("basics_with_graph") + def testGraphAsNamed(self): + test_dir = self._CleanTestDir("basics_named_graph") + with tf.Graph().as_default() as g: + tf.constant([12], name="douze") + sw = tf.train.SummaryWriter(test_dir, graph=g) + sw.close() + self._assertEventsWithGraph(test_dir, g, True) + + def testGraphAsPositional(self): + test_dir = self._CleanTestDir("basics_positional_graph") + with tf.Graph().as_default() as g: + tf.constant([12], name="douze") + sw = tf.train.SummaryWriter(test_dir, g) + sw.close() + self._assertEventsWithGraph(test_dir, g, True) + + def testGraphDefAsNamed(self): + test_dir = self._CleanTestDir("basics_named_graph_def") with tf.Graph().as_default() as g: tf.constant([12], name="douze") gd = g.as_graph_def() sw = tf.train.SummaryWriter(test_dir, graph_def=gd) sw.close() - rr = self._EventsReader(test_dir) - - # The first event should list the file_version. - ev = next(rr) - self._assertRecent(ev.wall_time) - self.assertEquals("brain.Event:2", ev.file_version) + self._assertEventsWithGraph(test_dir, g, False) - # The next event should have the graph. 
- ev = next(rr) - self._assertRecent(ev.wall_time) - self.assertEquals(0, ev.step) - ev_graph = tf.GraphDef() - ev_graph.ParseFromString(ev.graph_def) - self.assertProtoEquals(gd, ev_graph) + def testGraphDefAsPositional(self): + test_dir = self._CleanTestDir("basics_positional_graph_def") + with tf.Graph().as_default() as g: + tf.constant([12], name="douze") + gd = g.as_graph_def() + sw = tf.train.SummaryWriter(test_dir, gd) + sw.close() + self._assertEventsWithGraph(test_dir, g, False) + + def testGraphAndGraphDef(self): + with self.assertRaises(ValueError): + test_dir = self._CleanTestDir("basics_graph_and_graph_def") + with tf.Graph().as_default() as g: + tf.constant([12], name="douze") + gd = g.as_graph_def() + sw = tf.train.SummaryWriter(test_dir, graph=g, graph_def=gd) + sw.close() - # We should be done. - self.assertRaises(StopIteration, lambda: next(rr)) + def testNeitherGraphNorGraphDef(self): + with self.assertRaises(TypeError): + test_dir = self._CleanTestDir("basics_string_instead_of_graph") + sw = tf.train.SummaryWriter(test_dir, "string instead of graph object") + sw.close() # Checks that values returned from session Run() calls are added correctly to # summaries. 
These are numpy types so we need to check they fit in the diff --git a/tensorflow/python/training/supervisor.py b/tensorflow/python/training/supervisor.py index 7df435fc65..7396627f7b 100644 --- a/tensorflow/python/training/supervisor.py +++ b/tensorflow/python/training/supervisor.py @@ -844,7 +844,7 @@ class SVSummaryThread(coordinator.LooperThread): self._sess = sess def run_loop(self): - if self._sv.global_step: + if self._sv.global_step is not None: summary_strs, global_step = self._sess.run([self._sv.summary_op, self._sv.global_step]) else: @@ -912,7 +912,7 @@ class SVTimerCheckpointThread(coordinator.LooperThread): def run_loop(self): self._sv.saver.save(self._sess, self._sv.save_path, global_step=self._sv.global_step) - if self._sv.summary_writer and self._sv.global_step: + if self._sv.summary_writer and self._sv.global_step is not None: current_step = training_util.global_step(self._sess, self._sv.global_step) self._sv.summary_writer.add_session_log( SessionLog(status=SessionLog.CHECKPOINT, diff --git a/tensorflow/stream_executor/blas.h b/tensorflow/stream_executor/blas.h index 94475817e0..1f1d427c45 100644 --- a/tensorflow/stream_executor/blas.h +++ b/tensorflow/stream_executor/blas.h @@ -50,6 +50,7 @@ namespace perftools { namespace gputools { class Stream; +class ScratchAllocator; template <typename ElemT> class DeviceMemory; @@ -880,14 +881,14 @@ class BlasSupport { const port::ArraySlice<DeviceMemory<float> *> &a, int lda, const port::ArraySlice<DeviceMemory<float> *> &b, int ldb, float beta, const port::ArraySlice<DeviceMemory<float> *> &c, int ldc, - int batch_count) = 0; + int batch_count, ScratchAllocator *scratch_allocator) = 0; virtual bool DoBlasGemmBatched( Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, uint64 k, double alpha, const port::ArraySlice<DeviceMemory<double> *> &a, int lda, const port::ArraySlice<DeviceMemory<double> *> &b, int ldb, double beta, const port::ArraySlice<DeviceMemory<double> *> &c, 
int ldc, - int batch_count) = 0; + int batch_count, ScratchAllocator *scratch_allocator) = 0; virtual bool DoBlasGemmBatched( Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, uint64 k, std::complex<float> alpha, @@ -895,7 +896,7 @@ class BlasSupport { const port::ArraySlice<DeviceMemory<std::complex<float>> *> &b, int ldb, std::complex<float> beta, const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c, int ldc, - int batch_count) = 0; + int batch_count, ScratchAllocator *scratch_allocator) = 0; virtual bool DoBlasGemmBatched( Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, uint64 k, std::complex<double> alpha, @@ -903,7 +904,7 @@ class BlasSupport { const port::ArraySlice<DeviceMemory<std::complex<double>> *> &b, int ldb, std::complex<double> beta, const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c, int ldc, - int batch_count) = 0; + int batch_count, ScratchAllocator *scratch_allocator) = 0; // Computes a matrix-matrix product where one input matrix is Hermitian: // @@ -1140,7 +1141,7 @@ class BlasSupport { // Macro used to quickly declare overrides for abstract virtuals in the // BlasSupport base class. 
-#define TENSORFLOW_STREAM_EXECUTOR_GPU_BLAS_SUPPORT_OVERRIDES \ +#define TENSORFLOW_STREAM_EXECUTOR_GPU_BLAS_SUPPORT_OVERRIDES \ bool DoBlasAsum(Stream *stream, uint64 elem_count, \ const DeviceMemory<float> &x, int incx, \ DeviceMemory<float> *result) override; \ @@ -1626,14 +1627,14 @@ class BlasSupport { const port::ArraySlice<DeviceMemory<float> *> &a, int lda, \ const port::ArraySlice<DeviceMemory<float> *> &b, int ldb, float beta, \ const port::ArraySlice<DeviceMemory<float> *> &c, int ldc, \ - int batch_count) override; \ + int batch_count, ScratchAllocator *scratch_allocator) override; \ bool DoBlasGemmBatched( \ Stream *stream, blas::Transpose transa, blas::Transpose transb, \ uint64 m, uint64 n, uint64 k, double alpha, \ const port::ArraySlice<DeviceMemory<double> *> &a, int lda, \ const port::ArraySlice<DeviceMemory<double> *> &b, int ldb, double beta, \ const port::ArraySlice<DeviceMemory<double> *> &c, int ldc, \ - int batch_count) override; \ + int batch_count, ScratchAllocator *scratch_allocator) override; \ bool DoBlasGemmBatched( \ Stream *stream, blas::Transpose transa, blas::Transpose transb, \ uint64 m, uint64 n, uint64 k, std::complex<float> alpha, \ @@ -1641,7 +1642,7 @@ class BlasSupport { const port::ArraySlice<DeviceMemory<std::complex<float>> *> &b, int ldb, \ std::complex<float> beta, \ const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c, int ldc, \ - int batch_count) override; \ + int batch_count, ScratchAllocator *scratch_allocator) override; \ bool DoBlasGemmBatched( \ Stream *stream, blas::Transpose transa, blas::Transpose transb, \ uint64 m, uint64 n, uint64 k, std::complex<double> alpha, \ @@ -1650,7 +1651,7 @@ class BlasSupport { const port::ArraySlice<DeviceMemory<std::complex<double>> *> &b, \ int ldb, std::complex<double> beta, \ const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c, \ - int ldc, int batch_count) override; \ + int ldc, int batch_count, ScratchAllocator *scratch_allocator) override; \ bool 
DoBlasHemm(Stream *stream, blas::Side side, blas::UpperLower uplo, \ uint64 m, uint64 n, std::complex<float> alpha, \ const DeviceMemory<std::complex<float>> &a, int lda, \ diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc index 19ad12d28b..fb21baf9bf 100644 --- a/tensorflow/stream_executor/cuda/cuda_blas.cc +++ b/tensorflow/stream_executor/cuda/cuda_blas.cc @@ -19,6 +19,7 @@ limitations under the License. #include <complex> +#include "third_party/gpus/cuda/include/cublas_v2.h" #include "tensorflow/stream_executor/cuda/cuda_activation.h" #include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h" #include "tensorflow/stream_executor/cuda/cuda_helpers.h" @@ -34,8 +35,8 @@ limitations under the License. #include "tensorflow/stream_executor/platform/logging.h" #include "tensorflow/stream_executor/platform/port.h" #include "tensorflow/stream_executor/plugin_registry.h" +#include "tensorflow/stream_executor/scratch_allocator.h" #include "tensorflow/stream_executor/stream_executor.h" -#include "third_party/gpus/cuda/include/cublas_v2.h" namespace perftools { namespace gputools { @@ -1707,37 +1708,64 @@ template <typename T, typename FuncT> port::Status CUDABlas::DoBlasGemmBatchedInternal( FuncT cublas_func, Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, uint64 k, T alpha, - const port::ArraySlice<DeviceMemory<T> *> &a_array, int lda, - const port::ArraySlice<DeviceMemory<T> *> &b_array, int ldb, T beta, - const port::ArraySlice<DeviceMemory<T> *> &c_array, int ldc, - int batch_count) { - std::vector<T *> a_ptr_vec, b_ptr_vec, c_ptr_vec; + const port::ArraySlice<DeviceMemory<T> *> &a_ptrs_to_wrappers, int lda, + const port::ArraySlice<DeviceMemory<T> *> &b_ptrs_to_wrappers, int ldb, + T beta, const port::ArraySlice<DeviceMemory<T> *> &c_ptrs_to_wrappers, + int ldc, int batch_count, ScratchAllocator *scratch_allocator) { + std::vector<T *> a_raw_ptrs, b_raw_ptrs, c_raw_ptrs; for 
(int i = 0; i < batch_count; ++i) { - a_ptr_vec.push_back(static_cast<T *>(a_array[i]->opaque())); - b_ptr_vec.push_back(static_cast<T *>(b_array[i]->opaque())); - c_ptr_vec.push_back(static_cast<T *>(c_array[i]->opaque())); + a_raw_ptrs.push_back(static_cast<T *>(a_ptrs_to_wrappers[i]->opaque())); + b_raw_ptrs.push_back(static_cast<T *>(b_ptrs_to_wrappers[i]->opaque())); + c_raw_ptrs.push_back(static_cast<T *>(c_ptrs_to_wrappers[i]->opaque())); } typedef typename CUDAComplexT<T>::type CUDA_T; - SE_ASSIGN_OR_RETURN( - std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> a_ptr_array, - stream->AllocateTemporaryArray<CUDA_T *>(batch_count)); - SE_ASSIGN_OR_RETURN( - std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> b_ptr_array, - stream->AllocateTemporaryArray<CUDA_T *>(batch_count)); - SE_ASSIGN_OR_RETURN( - std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> c_ptr_array, - stream->AllocateTemporaryArray<CUDA_T *>(batch_count)); - - if (!stream->ThenMemcpy(a_ptr_array->mutable_device_memory(), - a_ptr_vec.data(), batch_count * sizeof(T *)) - .ok() || - !stream->ThenMemcpy(b_ptr_array->mutable_device_memory(), - b_ptr_vec.data(), batch_count * sizeof(T *)) - .ok() || - !stream->ThenMemcpy(c_ptr_array->mutable_device_memory(), - c_ptr_vec.data(), batch_count * sizeof(T *)) - .ok()) { + + const size_t size = batch_count * sizeof(CUDA_T *); + + // Device-side copy of pointers to matrices. + DeviceMemory<CUDA_T *> a; + DeviceMemory<CUDA_T *> b; + DeviceMemory<CUDA_T *> c; + + // If temporary space is allocated for device-side copies of pointers to + // matrices, that temporary space should not be freed until this function + // returns. Although the values for these unique_ptrs are not set here, they + // are declared at this scope so they will be destroyed when the function + // returns. + // + // If a scratch allocator is provided, these pointers will not be used at all. 
+ std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> a_temporary; + std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> b_temporary; + std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> c_temporary; + + // Decide how to allocate device-side copy of pointers to matrices based on + // whether a scratch allocator was passed. + if (scratch_allocator != nullptr) { + SE_ASSIGN_OR_RETURN(DeviceMemory<uint8> a_bytes, + scratch_allocator->AllocateBytes(stream, size)); + SE_ASSIGN_OR_RETURN(DeviceMemory<uint8> b_bytes, + scratch_allocator->AllocateBytes(stream, size)); + SE_ASSIGN_OR_RETURN(DeviceMemory<uint8> c_bytes, + scratch_allocator->AllocateBytes(stream, size)); + a = DeviceMemory<CUDA_T *>(a_bytes); + b = DeviceMemory<CUDA_T *>(b_bytes); + c = DeviceMemory<CUDA_T *>(c_bytes); + } else { + SE_ASSIGN_OR_RETURN(a_temporary, + stream->AllocateTemporaryArray<CUDA_T *>(batch_count)); + SE_ASSIGN_OR_RETURN(b_temporary, + stream->AllocateTemporaryArray<CUDA_T *>(batch_count)); + SE_ASSIGN_OR_RETURN(c_temporary, + stream->AllocateTemporaryArray<CUDA_T *>(batch_count)); + a = DeviceMemory<CUDA_T *>(*a_temporary->mutable_device_memory()); + b = DeviceMemory<CUDA_T *>(*b_temporary->mutable_device_memory()); + c = DeviceMemory<CUDA_T *>(*c_temporary->mutable_device_memory()); + } + + if (!stream->ThenMemcpy(&a, a_raw_ptrs.data(), size).ok() || + !stream->ThenMemcpy(&b, b_raw_ptrs.data(), size).ok() || + !stream->ThenMemcpy(&c, c_raw_ptrs.data(), size).ok()) { return port::Status(port::error::INTERNAL, "failed to copy memory from host to device in " "CUDABlas::DoBlasGemmBatched"); @@ -1746,13 +1774,9 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal( bool ok = DoBlasInternal( cublas_func, stream, true /* = pointer_mode_host */, CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, - CUDAComplex(&alpha), - const_cast<const CUDA_T **>(CUDAMemory(a_ptr_array->device_memory())), - lda, - const_cast<const CUDA_T **>(CUDAMemory(b_ptr_array->device_memory())), - ldb, 
CUDAComplex(&beta), - const_cast<CUDA_T **>(CUDAMemory(c_ptr_array->device_memory())), ldc, - batch_count); + CUDAComplex(&alpha), const_cast<const CUDA_T **>(CUDAMemory(a)), lda, + const_cast<const CUDA_T **>(CUDAMemory(b)), ldb, CUDAComplex(&beta), + const_cast<CUDA_T **>(CUDAMemory(c)), ldc, batch_count); if (ok) { return port::Status::OK(); @@ -1767,10 +1791,11 @@ bool CUDABlas::DoBlasGemmBatched( const port::ArraySlice<DeviceMemory<float> *> &a_array, int lda, const port::ArraySlice<DeviceMemory<float> *> &b_array, int ldb, float beta, const port::ArraySlice<DeviceMemory<float> *> &c_array, int ldc, - int batch_count) { + int batch_count, ScratchAllocator *scratch_allocator) { SE_RETURN_STATUS_AS_BOOL(DoBlasGemmBatchedInternal( dynload::cublasSgemmBatched, stream, transa, transb, m, n, k, alpha, - a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count)); + a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count, + scratch_allocator)); } bool CUDABlas::DoBlasGemmBatched( @@ -1779,10 +1804,11 @@ bool CUDABlas::DoBlasGemmBatched( const port::ArraySlice<DeviceMemory<double> *> &a_array, int lda, const port::ArraySlice<DeviceMemory<double> *> &b_array, int ldb, double beta, const port::ArraySlice<DeviceMemory<double> *> &c_array, - int ldc, int batch_count) { + int ldc, int batch_count, ScratchAllocator *scratch_allocator) { SE_RETURN_STATUS_AS_BOOL(DoBlasGemmBatchedInternal( dynload::cublasDgemmBatched, stream, transa, transb, m, n, k, alpha, - a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count)); + a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count, + scratch_allocator)); } bool CUDABlas::DoBlasGemmBatched( @@ -1793,10 +1819,11 @@ bool CUDABlas::DoBlasGemmBatched( const port::ArraySlice<DeviceMemory<std::complex<float>> *> &b_array, int ldb, std::complex<float> beta, const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c_array, - int ldc, int batch_count) { + int ldc, int batch_count, ScratchAllocator *scratch_allocator) { 
SE_RETURN_STATUS_AS_BOOL(DoBlasGemmBatchedInternal( dynload::cublasCgemmBatched, stream, transa, transb, m, n, k, alpha, - a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count)); + a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count, + scratch_allocator)); } bool CUDABlas::DoBlasGemmBatched( @@ -1807,10 +1834,11 @@ bool CUDABlas::DoBlasGemmBatched( const port::ArraySlice<DeviceMemory<std::complex<double>> *> &b_array, int ldb, std::complex<double> beta, const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c_array, - int ldc, int batch_count) { + int ldc, int batch_count, ScratchAllocator *scratch_allocator) { SE_RETURN_STATUS_AS_BOOL(DoBlasGemmBatchedInternal( dynload::cublasZgemmBatched, stream, transa, transb, m, n, k, alpha, - a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count)); + a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count, + scratch_allocator)); } bool CUDABlas::DoBlasHemm(Stream *stream, blas::Side side, diff --git a/tensorflow/stream_executor/cuda/cuda_blas.h b/tensorflow/stream_executor/cuda/cuda_blas.h index 046b7253e4..d5b949f7d1 100644 --- a/tensorflow/stream_executor/cuda/cuda_blas.h +++ b/tensorflow/stream_executor/cuda/cuda_blas.h @@ -93,7 +93,7 @@ class CUDABlas : public blas::BlasSupport { const port::ArraySlice<DeviceMemory<T> *> &a_array, int lda, const port::ArraySlice<DeviceMemory<T> *> &b_array, int ldb, T beta, const port::ArraySlice<DeviceMemory<T> *> &c_array, int ldc, - int batch_count); + int batch_count, ScratchAllocator *scratch_allocator); // mutex that guards the cuBLAS handle for this device. 
mutex mu_; diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc index 587896a2ab..cee781f77b 100644 --- a/tensorflow/stream_executor/stream.cc +++ b/tensorflow/stream_executor/stream.cc @@ -2986,6 +2986,17 @@ Stream &Stream::ThenBlasGemmBatched( int lda, const port::ArraySlice<DeviceMemory<float> *> &b, int ldb, float beta, const port::ArraySlice<DeviceMemory<float> *> &c, int ldc, int batch_count) { + return ThenBlasGemmBatchedWithScratch(transa, transb, m, n, k, alpha, a, lda, + b, ldb, beta, c, ldc, batch_count, + nullptr); +} + +Stream &Stream::ThenBlasGemmBatchedWithScratch( + blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, + uint64 k, float alpha, const port::ArraySlice<DeviceMemory<float> *> &a, + int lda, const port::ArraySlice<DeviceMemory<float> *> &b, int ldb, + float beta, const port::ArraySlice<DeviceMemory<float> *> &c, int ldc, + int batch_count, ScratchAllocator *scratch_allocator) { VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), PARAM(ldc), PARAM(batch_count)); @@ -2993,9 +3004,12 @@ Stream &Stream::ThenBlasGemmBatched( ThenBlasImpl<blas::Transpose, blas::Transpose, uint64, uint64, uint64, float, const port::ArraySlice<DeviceMemory<float> *> &, int, const port::ArraySlice<DeviceMemory<float> *> &, int, float, - const port::ArraySlice<DeviceMemory<float> *> &, int, int> impl; + const port::ArraySlice<DeviceMemory<float> *> &, int, int, + ScratchAllocator *> + impl; return impl(this, &blas::BlasSupport::DoBlasGemmBatched, transa, transb, m, n, - k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count); + k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count, + scratch_allocator); } Stream &Stream::ThenBlasGemmBatched( @@ -3004,6 +3018,17 @@ Stream &Stream::ThenBlasGemmBatched( int lda, const port::ArraySlice<DeviceMemory<double> *> &b, int ldb, double beta, const 
port::ArraySlice<DeviceMemory<double> *> &c, int ldc, int batch_count) { + return ThenBlasGemmBatchedWithScratch(transa, transb, m, n, k, alpha, a, lda, + b, ldb, beta, c, ldc, batch_count, + nullptr); +} + +Stream &Stream::ThenBlasGemmBatchedWithScratch( + blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, + uint64 k, double alpha, const port::ArraySlice<DeviceMemory<double> *> &a, + int lda, const port::ArraySlice<DeviceMemory<double> *> &b, int ldb, + double beta, const port::ArraySlice<DeviceMemory<double> *> &c, int ldc, + int batch_count, ScratchAllocator *scratch_allocator) { VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), PARAM(ldc), PARAM(batch_count)); @@ -3011,9 +3036,12 @@ Stream &Stream::ThenBlasGemmBatched( ThenBlasImpl<blas::Transpose, blas::Transpose, uint64, uint64, uint64, double, const port::ArraySlice<DeviceMemory<double> *> &, int, const port::ArraySlice<DeviceMemory<double> *> &, int, double, - const port::ArraySlice<DeviceMemory<double> *> &, int, int> impl; + const port::ArraySlice<DeviceMemory<double> *> &, int, int, + ScratchAllocator *> + impl; return impl(this, &blas::BlasSupport::DoBlasGemmBatched, transa, transb, m, n, - k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count); + k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count, + scratch_allocator); } Stream &Stream::ThenBlasGemmBatched( @@ -3024,6 +3052,19 @@ Stream &Stream::ThenBlasGemmBatched( std::complex<float> beta, const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c, int ldc, int batch_count) { + return ThenBlasGemmBatchedWithScratch(transa, transb, m, n, k, alpha, a, lda, + b, ldb, beta, c, ldc, batch_count, + nullptr); +} + +Stream &Stream::ThenBlasGemmBatchedWithScratch( + blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, + uint64 k, std::complex<float> alpha, + const port::ArraySlice<DeviceMemory<std::complex<float>> *> &a, int 
lda, + const port::ArraySlice<DeviceMemory<std::complex<float>> *> &b, int ldb, + std::complex<float> beta, + const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c, int ldc, + int batch_count, ScratchAllocator *scratch_allocator) { VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), PARAM(ldc), PARAM(batch_count)); @@ -3035,9 +3076,11 @@ Stream &Stream::ThenBlasGemmBatched( const port::ArraySlice<DeviceMemory<std::complex<float>> *> &, int, std::complex<float>, const port::ArraySlice<DeviceMemory<std::complex<float>> *> &, - int, int> impl; + int, int, ScratchAllocator *> + impl; return impl(this, &blas::BlasSupport::DoBlasGemmBatched, transa, transb, m, n, - k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count); + k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count, + scratch_allocator); } Stream &Stream::ThenBlasGemmBatched( @@ -3048,6 +3091,19 @@ Stream &Stream::ThenBlasGemmBatched( std::complex<double> beta, const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c, int ldc, int batch_count) { + return ThenBlasGemmBatchedWithScratch(transa, transb, m, n, k, alpha, a, lda, + b, ldb, beta, c, ldc, batch_count, + nullptr); +} + +Stream &Stream::ThenBlasGemmBatchedWithScratch( + blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, + uint64 k, std::complex<double> alpha, + const port::ArraySlice<DeviceMemory<std::complex<double>> *> &a, int lda, + const port::ArraySlice<DeviceMemory<std::complex<double>> *> &b, int ldb, + std::complex<double> beta, + const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c, int ldc, + int batch_count, ScratchAllocator *scratch_allocator) { VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), PARAM(ldc), PARAM(batch_count)); @@ -3059,9 +3115,11 @@ Stream &Stream::ThenBlasGemmBatched( const 
port::ArraySlice<DeviceMemory<std::complex<double>> *> &, int, std::complex<double>, const port::ArraySlice<DeviceMemory<std::complex<double>> *> &, - int, int> impl; + int, int, ScratchAllocator *> + impl; return impl(this, &blas::BlasSupport::DoBlasGemmBatched, transa, transb, m, n, - k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count); + k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count, + scratch_allocator); } Stream &Stream::ThenSetRngSeed(const uint8 *seed, uint64 seed_bytes) { diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h index d91c62ca26..599146f49b 100644 --- a/tensorflow/stream_executor/stream.h +++ b/tensorflow/stream_executor/stream.h @@ -944,6 +944,34 @@ class Stream { std::complex<double> beta, const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c, int ldc, int batch_count); + Stream &ThenBlasGemmBatchedWithScratch( + blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, + uint64 k, float alpha, const port::ArraySlice<DeviceMemory<float> *> &a, + int lda, const port::ArraySlice<DeviceMemory<float> *> &b, int ldb, + float beta, const port::ArraySlice<DeviceMemory<float> *> &c, int ldc, + int batch_count, ScratchAllocator *scratch_allocator); + Stream &ThenBlasGemmBatchedWithScratch( + blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, + uint64 k, double alpha, const port::ArraySlice<DeviceMemory<double> *> &a, + int lda, const port::ArraySlice<DeviceMemory<double> *> &b, int ldb, + double beta, const port::ArraySlice<DeviceMemory<double> *> &c, int ldc, + int batch_count, ScratchAllocator *scratch_allocator); + Stream &ThenBlasGemmBatchedWithScratch( + blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, + uint64 k, std::complex<float> alpha, + const port::ArraySlice<DeviceMemory<std::complex<float>> *> &a, int lda, + const port::ArraySlice<DeviceMemory<std::complex<float>> *> &b, int ldb, + std::complex<float> beta, + const 
port::ArraySlice<DeviceMemory<std::complex<float>> *> &c, int ldc, + int batch_count, ScratchAllocator *scratch_allocator); + Stream &ThenBlasGemmBatchedWithScratch( + blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, + uint64 k, std::complex<double> alpha, + const port::ArraySlice<DeviceMemory<std::complex<double>> *> &a, int lda, + const port::ArraySlice<DeviceMemory<std::complex<double>> *> &b, int ldb, + std::complex<double> beta, + const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c, int ldc, + int batch_count, ScratchAllocator *scratch_allocator); // See BlasSupport::DoBlasHemm. Stream &ThenBlasHemm(blas::Side side, blas::UpperLower uplo, uint64 m, diff --git a/tensorflow/tensorboard/components/tf-event-dashboard/tf-event-dashboard.html b/tensorflow/tensorboard/components/tf-event-dashboard/tf-event-dashboard.html index d77af121d7..6bf32a4ccd 100644 --- a/tensorflow/tensorboard/components/tf-event-dashboard/tf-event-dashboard.html +++ b/tensorflow/tensorboard/components/tf-event-dashboard/tf-event-dashboard.html @@ -115,7 +115,7 @@ The #center div contains tf-charts embedded inside tf-collapsable-panes. <p> Maybe data hasn't loaded yet, or maybe you need to add some <code>tf.scalar_summary</code> ops to your graph, and - serialize them using the <code>tf.training.summary_io.SummaryWriter</code>. + serialize them using the <code>tf.train.SummaryWriter</code>. 
</p> </div> </template> diff --git a/tensorflow/tensorboard/components/tf-event-dashboard/tf-run-selector.html b/tensorflow/tensorboard/components/tf-event-dashboard/tf-run-selector.html index 563b2dd194..c69446edc4 100644 --- a/tensorflow/tensorboard/components/tf-event-dashboard/tf-run-selector.html +++ b/tensorflow/tensorboard/components/tf-event-dashboard/tf-run-selector.html @@ -75,7 +75,6 @@ Properties out: display: flex; flex-grow: 1; flex-shrink: 1; - height: 0px; /* hackhack So the flex-grow takes over and gives it space */ } .x-button { font-size: 13px; diff --git a/tensorflow/tensorboard/components/tf-graph-common/lib/hierarchy.ts b/tensorflow/tensorboard/components/tf-graph-common/lib/hierarchy.ts index 1dba760aae..8dca63c9ab 100644 --- a/tensorflow/tensorboard/components/tf-graph-common/lib/hierarchy.ts +++ b/tensorflow/tensorboard/components/tf-graph-common/lib/hierarchy.ts @@ -515,6 +515,13 @@ function addEdges(h: Hierarchy, graph: SlimGraph, let sourceAncestorIndex = getPath(graph.nodes[baseEdge.v], sourcePath); let destAncestorIndex = getPath(graph.nodes[baseEdge.w], destPath); + // If the hierarchical path cannot be found for either endpoint, then we + // cannot create the edge. This happens for example when a node has a + // control dependency on a summary node, which are embedded. + if (sourceAncestorIndex === -1 || destAncestorIndex === -1) { + return; + } + // Find the lowest shared ancestor between source and dest by looking for // the highest nodes that differ between their ancestor paths. 
while (sourcePath[sourceAncestorIndex] === destPath[destAncestorIndex]) { diff --git a/tensorflow/tensorboard/components/tf-graph-common/lib/layout.ts b/tensorflow/tensorboard/components/tf-graph-common/lib/layout.ts index 0d9e5b53bf..b2f4fd1d7f 100644 --- a/tensorflow/tensorboard/components/tf-graph-common/lib/layout.ts +++ b/tensorflow/tensorboard/components/tf-graph-common/lib/layout.ts @@ -87,7 +87,7 @@ export const PARAMS = { */ labelHeight: 20, /** X-space between each extracted node and the core graph. */ - extractXOffset: 50, + extractXOffset: 15, /** Y-space between each extracted node. */ extractYOffset: 20 }, @@ -486,9 +486,24 @@ function layoutMetanode(renderNodeInfo: render.RenderGroupNodeInfo): void { return height + yOffset + child.height; }, 0); + // Compute the total padding between the core graph, in-extract and + // out-extract boxes. + let numParts = 0; + if (renderNodeInfo.isolatedInExtract.length > 0) { + numParts++; + } + if (renderNodeInfo.isolatedOutExtract.length > 0) { + numParts++; + } + if (renderNodeInfo.coreGraph.nodeCount() > 0) { + numParts++; + } + let offset = PARAMS.subscene.meta.extractXOffset; + let padding = numParts <= 1 ? 0 : (numParts <= 2 ? offset : 2 * offset); + // Add the in-extract and out-extract width to the core box width. 
renderNodeInfo.coreBox.width += renderNodeInfo.inExtractBox.width + - renderNodeInfo.outExtractBox.width; + renderNodeInfo.outExtractBox.width + padding; renderNodeInfo.coreBox.height = params.labelHeight + Math.max( diff --git a/tensorflow/tensorboard/components/tf-graph-common/lib/render.ts b/tensorflow/tensorboard/components/tf-graph-common/lib/render.ts index fa0ee99d19..dd43e650d3 100644 --- a/tensorflow/tensorboard/components/tf-graph-common/lib/render.ts +++ b/tensorflow/tensorboard/components/tf-graph-common/lib/render.ts @@ -964,8 +964,6 @@ export class RenderNodeInfo { /** Label vertical offset from the center of node shape */ labelOffset: number; - /** X-space between each extracted node and the core graph. */ - extractXOffset: number; /** Rectangle radius (for making rounded rectangle) */ radius: number; @@ -1027,7 +1025,6 @@ export class RenderNodeInfo { // Params for node box. this.labelOffset = 0; - this.extractXOffset = 0; this.radius = 0; // Params for expanded node diff --git a/tensorflow/tensorboard/components/tf-graph-common/lib/scene/scene.ts b/tensorflow/tensorboard/components/tf-graph-common/lib/scene/scene.ts index b6eb3f7d81..1b6cb3a58c 100644 --- a/tensorflow/tensorboard/components/tf-graph-common/lib/scene/scene.ts +++ b/tensorflow/tensorboard/components/tf-graph-common/lib/scene/scene.ts @@ -321,15 +321,18 @@ function position(sceneGroup, renderNode: render.RenderGroupNodeInfo) { // in-extract let hasInExtract = renderNode.isolatedInExtract.length > 0; + let hasOutExtract = renderNode.isolatedOutExtract.length > 0; + if (hasInExtract) { + let offset = layout.PARAMS.subscene.meta.extractXOffset; let inExtractX = renderNode.coreBox.width - - renderNode.inExtractBox.width / 2 - renderNode.outExtractBox.width; + renderNode.inExtractBox.width / 2 - renderNode.outExtractBox.width - + (hasOutExtract ? 
offset : 0); translate(selectChild(sceneGroup, "g", Class.Scene.INEXTRACT), inExtractX, yTranslate); } // out-extract - let hasOutExtract = renderNode.isolatedOutExtract.length > 0; if (hasOutExtract) { let outExtractX = renderNode.coreBox.width - renderNode.outExtractBox.width / 2; diff --git a/tensorflow/tensorboard/components/tf-graph-dashboard/tf-graph-dashboard.html b/tensorflow/tensorboard/components/tf-graph-dashboard/tf-graph-dashboard.html index d26bf2e8f4..e0c0184864 100644 --- a/tensorflow/tensorboard/components/tf-graph-dashboard/tf-graph-dashboard.html +++ b/tensorflow/tensorboard/components/tf-graph-dashboard/tf-graph-dashboard.html @@ -37,7 +37,7 @@ by default. The user can select a different run from a dropdown menu. </p> <p> To store a graph, create a - <code>tf.python.training.summary_io.SummaryWriter</code> + <code>tf.train.SummaryWriter</code> and pass the graph either via the constructor, or by calling its <code>add_graph()</code> method. </p> diff --git a/tensorflow/tensorboard/components/tf-graph-info/tf-node-info.html b/tensorflow/tensorboard/components/tf-graph-info/tf-node-info.html index d715925d2c..c23e358bf2 100644 --- a/tensorflow/tensorboard/components/tf-graph-info/tf-node-info.html +++ b/tensorflow/tensorboard/components/tf-graph-info/tf-node-info.html @@ -195,7 +195,7 @@ </paper-icon-button> Control dependencies </div> - <iron-collapse opened="{{_openedControlPred}}"> + <iron-collapse opened="{{_openedControlPred}}" no-animation> <template is="dom-if" if="{{_openedControlPred}}" restamp="true"> <iron-list class="sub-list" items="[[_predecessors.control]]"> <template> @@ -246,7 +246,7 @@ </paper-icon-button> Control dependencies </div> - <iron-collapse opened="{{_openedControlSucc}}"> + <iron-collapse opened="{{_openedControlSucc}}" no-animation> <template is="dom-if" if="{{_openedControlSucc}}" restamp="true"> <iron-list class="sub-list" items="[[_successors.control]]"> <template> diff --git 
a/tensorflow/tensorboard/components/tf-histogram-dashboard/tf-histogram-dashboard.html b/tensorflow/tensorboard/components/tf-histogram-dashboard/tf-histogram-dashboard.html index 8f68791c51..d7c631ebdb 100644 --- a/tensorflow/tensorboard/components/tf-histogram-dashboard/tf-histogram-dashboard.html +++ b/tensorflow/tensorboard/components/tf-histogram-dashboard/tf-histogram-dashboard.html @@ -109,7 +109,7 @@ The #center div contains tf-charts embedded inside tf-collapsable-panes. <p> Maybe data hasn't loaded yet, or maybe you need to add some <code>tf.histogram_summary</code> ops to your graph, and - serialize them using the <code>tf.training.summary_io.SummaryWriter</code>. + serialize them using the <code>tf.train.SummaryWriter</code>. </p> </div> </template> diff --git a/tensorflow/tensorboard/components/tf-image-dashboard/tf-image-dashboard.html b/tensorflow/tensorboard/components/tf-image-dashboard/tf-image-dashboard.html index 57c41abe95..18a7b9f708 100644 --- a/tensorflow/tensorboard/components/tf-image-dashboard/tf-image-dashboard.html +++ b/tensorflow/tensorboard/components/tf-image-dashboard/tf-image-dashboard.html @@ -43,7 +43,7 @@ mechanism for loading older images rather than always getting the most recent on <p> Maybe data hasn't loaded yet, or maybe you need to add some <code>tf.image_summary</code> ops to your graph, and - serialize them using the <code>tf.training.summary_io.SummaryWriter</code>. + serialize them using the <code>tf.train.SummaryWriter</code>. 
</p> </div> </template> diff --git a/tensorflow/tensorboard/dist/tf-tensorboard.html b/tensorflow/tensorboard/dist/tf-tensorboard.html index 492a5d45ce..31c62345ac 100644 --- a/tensorflow/tensorboard/dist/tf-tensorboard.html +++ b/tensorflow/tensorboard/dist/tf-tensorboard.html @@ -2086,7 +2086,7 @@ var TF; <p> Maybe data hasn't loaded yet, or maybe you need to add some <code>tf.scalar_summary</code> ops to your graph, and - serialize them using the <code>tf.training.summary_io.SummaryWriter</code>. + serialize them using the <code>tf.train.SummaryWriter</code>. </p> </div> </template> @@ -2201,7 +2201,7 @@ var TF; <p> Maybe data hasn't loaded yet, or maybe you need to add some <code>tf.histogram_summary</code> ops to your graph, and - serialize them using the <code>tf.training.summary_io.SummaryWriter</code>. + serialize them using the <code>tf.train.SummaryWriter</code>. </p> </div> </template> @@ -2476,7 +2476,7 @@ var TF; <p> Maybe data hasn't loaded yet, or maybe you need to add some <code>tf.image_summary</code> ops to your graph, and - serialize them using the <code>tf.training.summary_io.SummaryWriter</code>. + serialize them using the <code>tf.train.SummaryWriter</code>. </p> </div> </template> @@ -11197,7 +11197,7 @@ function convertToHumanReadable(value, units, unitIndex) { </p> <p> To store a graph, create a - <code>tf.python.training.summary_io.SummaryWriter</code> + <code>tf.train.SummaryWriter</code> and pass the graph either via the constructor, or by calling its <code>add_graph()</code> method. </p> @@ -11401,4 +11401,4 @@ Polymer({ }); </script> </dom-module> -</body></html>
\ No newline at end of file +</body></html> diff --git a/tensorflow/tensorboard/lib/js/colorScale/colorScale.ts b/tensorflow/tensorboard/lib/js/colorScale/colorScale.ts new file mode 100644 index 0000000000..f57551dbe0 --- /dev/null +++ b/tensorflow/tensorboard/lib/js/colorScale/colorScale.ts @@ -0,0 +1,148 @@ +/* Copyright 2015 Google Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Each color scale is initialized with a configurable number of base hues. +// There are also several palettes available. +// TF.palettes.googleStandard, TF.palettes.googleColorBlind, +// TF.palettes.googleCool, TF.palettes.googleWarm, TF.palettes.constantValue +// Each string is hashed to an integer, +// then mapped to one of the base hues above. +// If there is a collision, the color that is later in an alphabetical sort +// gets nudged a little darker or lighter to disambiguate. +// I would call it mostly stable, in that the same array of strings will +// always return the same colors, but the same individual string may +// shift a little depending on its peers. 
+// +// runs = ["train", "test", "test1", "test2"] +// ccs = new TF.ColorScale(12, TF.palettes.googleStandard); +// ccs.domain(runs); +// ccs.getColor("train"); +// ccs.getColor("test1"); + +module TF { + export class ColorScale { + public numColors: number; + public internalColorScale: d3.scale.Linear<string, string>; + private buckets: string[][]; + + /** + * The palette you provide defines your spectrum. The colorscale will + * always use the full spectrum you provide. When you define "numColors" + * it resamples at regular intervals along the full extent of the spectrum. + * Thus you get the maximum distance between hues for the "numColors" + * given. This allows the programmer to tweak the algorithm depending on + * how big your expected domain is. If you generally think you're going to + * have a small number of elements in the domain, then a small numColors + * will be serviceable. With large domains, a small numColors would produce + * too many hash collisions, so you'd want to bump it up to the threshold + * of human perception (probably around 14 or 18). + * + * @param {number} [numColors=12] - The number of base colors you want + * in the palette. The smaller the number, + * the more hash collisions you will have, but the more + * differentiable the base colors will be. + * + * @param {string[]} [palette=TF.palettes.googleColorBlind] - The color + * palette you want as an Array of hex strings. Note, the + * length of the array in this palette is independent of the + * param numColors above. The scale will interpolate to + * create the proper "numColors" given in the first param. + * + */ + constructor(numColors = 12, palette: string[] = TF.palettes.googleColorBlind) { + this.numColors = numColors; + this.domain([]); + + if (palette.length < 2) { + throw new Error("Not enough colors in palette. 
Must be more than one.") + } + + var k = (this.numColors - 1) / (palette.length - 1); + this.internalColorScale = d3.scale.linear<string>() + .domain(d3.range(palette.length).map((i) => i * k)) + .range(palette); + } + + private hash(s: string): number { + function h(hash, str) { + hash = (hash << 5) - hash + str.charCodeAt(0); + return hash & hash; + } + return Math.abs(Array.prototype.reduce.call(s, h, 0)) % this.numColors; + } + + + /** + * Set the domain of strings so we can calculate collisions preemptively. + * Can be reset at any point. + * + * @param {string[]} strings - An array of strings to use as the domain + * for your scale. + */ + public domain(strings: string[]) { + this.buckets = d3.range(this.numColors).map(() => []); + var sortedUniqueKeys = d3.set(strings).values().sort(function(a, b) { return a.localeCompare(b); }); + sortedUniqueKeys.forEach((s) => this.addToDomain(s)); + } + + private getBucketForString(s: string) { + var bucketIdx = this.hash(s); + return this.buckets[bucketIdx]; + } + + private addToDomain(s: string) { + var bucketIdx = this.hash(s); + var bucket = this.buckets[bucketIdx]; + if (bucket.indexOf(s) === -1) { + bucket.push(s); + } + } + + private nudge(color: string, amount: number): any { + // If amount is zero, just give back same color + if (amount === 0) { + return color; + + // For first tick, nudge lighter... + } else if (amount === 1) { + return d3.hcl(color).brighter(0.6); + + // ..otherwise nudge darker. Darker will approach black, which is visible. + } else { + return d3.hcl(color).darker((amount - 1) / 2); + } + } + + /** + * Use the color scale to transform an element in the domain into a color. + * If there was a hash conflict, the color will be "nudged" darker or lighter so that it is + * unique. + * @param {string} The input string to map to a color. + * @return {string} The color corresponding to that input string. + * @throws Will error if input string is not in the scale's domain. 
+ */ + + public getColor(s: string): string { + var bucket = this.getBucketForString(s); + var idx = bucket.indexOf(s); + if (idx === -1) { + throw new Error("String was not in the domain."); + } + var color = this.internalColorScale(this.hash(s)); + return this.nudge(color, idx).toString(); + } + + } +} diff --git a/tensorflow/tensorboard/lib/js/colorScale/demo/index.html b/tensorflow/tensorboard/lib/js/colorScale/demo/index.html new file mode 100644 index 0000000000..c3d94da539 --- /dev/null +++ b/tensorflow/tensorboard/lib/js/colorScale/demo/index.html @@ -0,0 +1,176 @@ +<!doctype html> +<meta charset="utf-8"> +<script src="../../../../components/d3/d3.min.js"></script> +<script src="../palettes.js"></script> +<script src="../colorScale.js"></script> + +<link rel="stylesheet" href="style.css"> + +<style> + +.color-swatch { + display: inline-block; + height: 20px; +} +.stage { + margin-top: 40px; + margin-bottom: 200px; + position: relative; +} +.color { + position: absolute; + margin: 0 0 4px 0; +} +.swatch { + border-radius: 2px; + float: left; + width: 15px; + height: 15px; + margin-right: 10px; + margin-top: 8px; +} + +.label { + display: inline; +} +</style> + +<header> + <h1 class="trunk">Stable and Unique Colors for Category Labels</h1> + <p class="trunk">A method for defining a stable categorical color scale for real-time, changing data.</p> + +</header> +<h3 class="trunk">Base colors</h3> +<p class="trunk">Each color scale is initialized with a configurable number of base hues. There are 18 shown below. There are also several palettes available.</p> +<p class="palettes trunk"></p> +<h3 class="trunk">A sample list of categories</h3> +<p class="trunk"> + Each string is hashed to an integer, then mapped to one of the base hues above. If there is a collision, the color that is later in an alphabetical sort gets nudged a little darker or lighter to disambiguate. 
I would call it <i>mostly</i> stable, in that the same array of strings will always return the same colors, but the same individual string may shift a little depending on its peers. +</p> +<p class="stage trunk"></p> + +<script type> +"use strict"; + +var runs = [ + "A Midsummer Night's Dream", + "All's Well That Ends Well", + "Antony and Cleopatra", + "As You Like It", + "Coriolanus", + "Cymbeline", + "Hamlet", + "Henry IV", + "Henry VIII", + "Julius Caesar", + "King John", + "King Lear", + "Love's Labour's Lost", + "Macbeth", + "Measure for Measure", + "Much Ado About Nothing", + "Othello", + "Pericles, Prince of Tyre", + "Richard II", + "Richard III", + "Romeo and Juliet", + "The Comedy of Errors", + "The Merchant of Venice", + "The Merry Wives of Windsor", + "The Taming of the Shrew", + "The Tempest", + "The Two Noble Kinsmen", + "The Winter's Tale", + "Timon of Athens ", + "Titus Andronicus", + "Troilus and Cressida ", + "Twelfth Night", + "Two Gentlemen of Verona" +]; + +var palettes = [ + "googleColorBlind", + "googleStandard", + "constantValue", + "googleWarm", + "googleCool" +]; + +var stage = d3.select(".stage"); + +var palettesStage = d3.select(".palettes"); + +var palette = palettesStage.selectAll(".palette") + .data(palettes) + .enter().append("div") + .attr("class", "palette"); + +palette.each(function(d) { + d3.select(this).append("div").text(d); + var ccs = new TF.ColorScale(17, TF.palettes[d]); + var colorSwatches = d3.select(this).selectAll(".color-swatch") + .data(d3.range(ccs.numColors)) + .enter().append("div") + .attr("class", "color-swatch") + .style("width", 100 / ccs.numColors + "%") + .style("background-color", (d) => ccs.internalColorScale(d)); +}); + +var previousRuns = runs.slice(0, 10).concat(["train", "test", "eval"]); +function ping() { + d3.shuffle(previousRuns); + previousRuns = previousRuns.slice(0, -Math.ceil(Math.random() * 3)); + previousRuns = previousRuns.concat(d3.shuffle(runs).slice(0, Math.floor(Math.random() * 
6))).sort(); + previousRuns = d3.set(previousRuns).values().sort(); + var ccs = new TF.ColorScale(); + ccs.domain(previousRuns); + + var color = stage.selectAll(".color") + .data(previousRuns, (d) => d); + + color + .style("opacity", 1) + .style("left", 0) + .transition() + .delay(200) + .duration(300) + .style("top", (d, i) => i * 25 + "px"); + + var colorEnter = color.enter().append("div") + .attr("class", "color") + .style("left", "-100px") + .style("opacity", 0) + .style("top", (d, i) => i * 25 + "px"); + + colorEnter + .transition() + .delay(400) + .duration(300) + .style("left", "0px") + .style("opacity", 1); + + color.exit() + .transition() + .duration(300) + .style("left", "100px") + .style("opacity", 0) + .remove(); + + colorEnter.append("div") + .attr("class", "swatch"); + + color.select(".swatch").style("background-color", (d) => ccs.getColor(d)); + + colorEnter.append("div") + .attr("class", "label") + .text((d) => d); + + stage.transition().duration(300) + .style("height", previousRuns.length * 25 + "px") +} + +ping(); +setInterval(ping, 2000); + + +</script> diff --git a/tensorflow/tensorboard/lib/js/colorScale/demo/style.css b/tensorflow/tensorboard/lib/js/colorScale/demo/style.css new file mode 100644 index 0000000000..d2fe1dc294 --- /dev/null +++ b/tensorflow/tensorboard/lib/js/colorScale/demo/style.css @@ -0,0 +1,74 @@ +body { + font-family: roboto, sans-serif; +} +header { + /*background-color: hsl(0, 0%, 95%);*/ + border-bottom: solid 1px rgba(0, 0, 0, 0.1); + padding: 60px 0; + margin: 0 0 40px 0; + z-index: 10; + position: relative; + color: hsla(0, 0%, 0%, 0.7); +} +header h1 { + font-size: 36px; + font-weight: 700; + margin: 0 0 12px; + line-height: 1.2em; +} +header p { + font-size: 22px; + line-height: 1.6em; + font-weight: 300; + margin-bottom: 20px; + margin-top: 0; +} +.byline { + font-weight: 400; + font-size: 13px; + color: rgba(0, 0, 0, 0.5); + display: none; +} +.byline .date { + margin-left: 12px; + padding-left: 12px; + 
border-left: solid 1px #ddd; +} +/* Text Styles */ +h3 { + color: rgba(0, 0, 0, 0.7); + margin-top: 40px; +} +a { + color: black; + text-decoration: none; + border-bottom: solid 1px black; +} +p { + font-weight: 400; + font-size: 17px; + line-height: 1.8; + color: rgba(0, 0, 0, 0.7); +} +.trunk { + margin-left: auto; + margin-right: auto; + max-width: 600px; +} +.page { + margin-left: auto; + margin-right: auto; + max-width: 900px; +} +.screen { + margin-left: auto; + margin-right: auto; +} + +.data-picker { + background: white; + padding: 5px 0; +} + +.sticky-fixed .data-picker { +} diff --git a/tensorflow/tensorboard/lib/js/colorScale/palettes.ts b/tensorflow/tensorboard/lib/js/colorScale/palettes.ts new file mode 100644 index 0000000000..76d69779a3 --- /dev/null +++ b/tensorflow/tensorboard/lib/js/colorScale/palettes.ts @@ -0,0 +1,54 @@ +module TF { + export const palettes = { + googleStandard: [ + "#db4437", //google red 500 + "#ff7043", //deep orange 400 + "#f4b400", //google yellow 500 + "#0f9d58", //google green 500 + "#00796b", //teal 700 + "#00acc1", //cyan 600 + "#4285f4", //google blue 500 + "#5c6bc0", //indigo 400 + "#ab47bc" //purple 400 + ], + googleCool: [ + "#9e9d24", //lime 800 + "#0f9d58", //google green 500 + "#00796b", //teal 700 + "#00acc1", //cyan 600 + "#4285f4", //google blue 500 + "#5c6bc0", //indigo 400 + "#607d8b" //blue gray 500 + ], + googleWarm: [ + "#795548", //brown 500 + "#ab47bc", //purple 400 + "#f06292", //pink 300 + "#c2185b", //pink 700 + "#db4437", //google red 500 + "#ff7043", //deep orange 400 + "#f4b400" //google yellow 500 + ], + googleColorBlind: [ + "#c53929", //google red 700 + "#ff7043", //deep orange 400 + "#f7cb4d", //google yellow 300 + "#0b8043", //google green 700 + "#80deea", //cyan 200 + "#4285f4", //google blue 500 + "#5e35b1" //deep purple 600 + ], + //This rainbow palette attempts to keep a constant brightness across hues. 
+ constantValue: [ + "#f44336", + "#ffa216", + "#c2d22d", + "#51b455", + "#1ca091", + "#505ec4", + "#a633ba" + ] + } +} + + diff --git a/tensorflow/tensorboard/lib/js/colorScale/test/colorScaleTests.ts b/tensorflow/tensorboard/lib/js/colorScale/test/colorScaleTests.ts new file mode 100644 index 0000000000..b3709fd520 --- /dev/null +++ b/tensorflow/tensorboard/lib/js/colorScale/test/colorScaleTests.ts @@ -0,0 +1,99 @@ +/* Copyright 2015 Google Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +module TF { + let assert = chai.assert; + + describe("ColorScale", function() { + let ccs: ColorScale; + + beforeEach(function() { + ccs = new ColorScale(); + }); + + it("No collisions with train, eval and test", function() { + ccs.domain(["train"]); + var trainColor = ccs.getColor("train"); + ccs.domain(["eval"]); + var evalColor = ccs.getColor("eval"); + ccs.domain(["test"]); + var testColor = ccs.getColor("test"); + assert.notEqual(trainColor, evalColor, testColor); + }); + + it("Returns consistent colors, given no hash collisions", function() { + //These three colors don't have hash collisions + ccs.domain(["red", "yellow"]); + var firstRedColor = ccs.getColor("red"); + ccs.domain(["red", "yellow", "blue"]); + var secondRedColor = ccs.getColor("red"); + assert.deepEqual(firstRedColor, secondRedColor); + }); + + it("A 2-color scale returns the first and last colors of the palette", function() { + var twoColorScale = new ColorScale(2, TF.palettes.googleStandard); + // No hash collisions with these. + twoColorScale.domain(["red", "blue"]); + assert.deepEqual(twoColorScale.getColor("blue"), TF.palettes.googleStandard[0]); + assert.deepEqual(twoColorScale.getColor("red"), TF.palettes.googleStandard[TF.palettes.googleStandard.length - 1]); + }) + + //This is testing that when we reset the domain with new colors, the old + //domain doesn't influence the new color choices. Basically testing that we + //get a fresh slate if we have a new domain. Basically testing that all the + //internal bins are reset etc. and we aren't finding collisions with + //previous colors. + it("Colors don't nudge away from colors from an old domain.", function() { + // at 12 breaks, "orange" and "blue" collide. 
+ ccs.domain(["red", "blue"]); + var firstBlue = ccs.getColor("blue"); + ccs.domain(["red", "orange"]); + var firstOrange = ccs.getColor("orange"); + assert.deepEqual(firstBlue, firstOrange); + }); + + it("Nudges all colors, given only one base color", function() { + var ccsWithOneColor = new ColorScale(1); + ccsWithOneColor.domain(["one", "two", "three"]); + assert.notEqual(ccsWithOneColor.getColor("one"), ccsWithOneColor.getColor("two")); + assert.notEqual(ccsWithOneColor.getColor("two"), ccsWithOneColor.getColor("three")); + assert.notEqual(ccsWithOneColor.getColor("one"), ccsWithOneColor.getColor("three")); + }); + + it("Nudges a color if it has a hash collision", function() { + // at 12 breaks, "orange" and "blue" collide. + ccs.domain(["red", "blue"]); + var firstBlue = ccs.getColor("blue"); + ccs.domain(["red", "orange"]); + var firstOrange = ccs.getColor("orange"); + ccs.domain(["red", "blue", "orange"]); + var secondBlue = ccs.getColor("blue"); + var secondOrange = ccs.getColor("orange"); + assert.deepEqual(firstBlue, secondBlue); + assert.deepEqual(firstBlue, firstOrange); + assert.notEqual(secondBlue, secondOrange); + }); + + it("Throws an error if string is not in the domain", function() { + ccs.domain(["red", "yellow", "green"]); + assert.throws(function() { + ccs.getColor("not in domain"); + }, "String was not in the domain."); + }); + + + }); + +} diff --git a/tensorflow/tensorboard/lib/js/colorScale/test/index.html b/tensorflow/tensorboard/lib/js/colorScale/test/index.html new file mode 100644 index 0000000000..a6c3c04aa6 --- /dev/null +++ b/tensorflow/tensorboard/lib/js/colorScale/test/index.html @@ -0,0 +1,28 @@ +<!-- Copyright 2015 Google Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +=============================================================================--> +<!doctype html> +<html> +<head> + <meta charset="utf-8"> + <script src="../../web-component-tester/browser.js"></script> + <script src="../../webcomponentsjs/webcomponents-lite.min.js"></script> + <link rel="import" href="../../tf-imports/d3.html"> +</head> +<body> + <script src="../colorScale.js"></script> + <script src="../palettes.js"></script> + <script src="colorScaleTests.js"></script> +</body> +</html> diff --git a/tensorflow/tools/dist_test/Dockerfile b/tensorflow/tools/dist_test/Dockerfile new file mode 100644 index 0000000000..fba23af55d --- /dev/null +++ b/tensorflow/tools/dist_test/Dockerfile @@ -0,0 +1,28 @@ +FROM ubuntu:14.04 + +MAINTAINER Shanqing Cai <cais@google.com> + +RUN apt-get update +RUN apt-get install -y \ + bc \ + curl \ + python \ + python-numpy \ + python-pip + +# Install Google Cloud SDK +RUN curl -O https://dl.google.com/dl/cloudsdk/channels/rapid/install_google_cloud_sdk.bash +RUN chmod +x install_google_cloud_sdk.bash +RUN ./install_google_cloud_sdk.bash --disable-prompts --install-dir=/var/gcloud + +# Install kubectl +RUN /var/gcloud/google-cloud-sdk/bin/gcloud components install kubectl + +# Install nightly TensorFlow pip +# TODO(cais): Should we build it locally instead? 
+RUN pip install \ + http://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.7.1-cp27-none-linux_x86_64.whl + +# Copy test files +COPY scripts /var/tf-dist-test/scripts +COPY python /var/tf-dist-test/python diff --git a/tensorflow/tools/dist_test/Dockerfile.local b/tensorflow/tools/dist_test/Dockerfile.local new file mode 100644 index 0000000000..4d82904707 --- /dev/null +++ b/tensorflow/tools/dist_test/Dockerfile.local @@ -0,0 +1,20 @@ +FROM jpetazzo/dind + +MAINTAINER Shanqing Cai <cais@google.com> + +RUN apt-get update + +RUN apt-get install -y \ + bc \ + build-essential \ + dbus \ + git \ + software-properties-common + +# Install the latest golang +RUN wget https://storage.googleapis.com/golang/go1.4.2.linux-amd64.tar.gz +RUN tar -C /usr/local -xzf go1.4.2.linux-amd64.tar.gz +RUN rm -f go1.4.2.linux-amd64.tar.gz +RUN echo 'PATH=/usr/local/go/bin:${PATH}' >> /root/.bashrc + +ADD . /var/tf-k8s diff --git a/tensorflow/tools/dist_test/README.md b/tensorflow/tools/dist_test/README.md new file mode 100644 index 0000000000..d986900bd6 --- /dev/null +++ b/tensorflow/tools/dist_test/README.md @@ -0,0 +1,76 @@ +# Testing Distributed Runtime in TensorFlow +This folder contains tools and test suites for the GRPC-based distributed +runtime in TensorFlow. + +There are three general modes of testing: + +**1) Launch a local Kubernetes (k8s) cluster and run the test suites on it** + +For example: + + ./local_test.sh + +This option makes use of the docker-in-docker (dind) containers. It requires +the docker0 network interface to be set to the promiscuous mode on the host: + + sudo ip link set docker0 promisc on + +The environment variable "TF_DIST_SERVER_DOCKER_IMAGE" can be used to override +the Docker image used to generate the TensorFlow GRPC server pods +("tensorflow/tf_grpc_test_server"). 
For example: + + export TF_DIST_SERVER_DOCKER_IMAGE=<docker_image_name> + ./local_test.sh + +**2) Launch a remote k8s cluster on Google Container Engine (GKE) and run the +test suite on it** + +For example: + + export TF_DIST_GCLOUD_PROJECT="tensorflow-testing" + export TF_DIST_GCLOUD_COMPUTE_ZONE="us-central1-f" + export CONTAINER_CLUSTER="test-cluster-1" + export TF_DIST_GCLOUD_KEY_FILE_DIR="/tmp/gcloud-secrets" + ./remote_test.sh + +Here you specify the Google Compute Engine (GCE) project, compute zone and +container cluster with the first three environment variables, in that order. +The environment variable "TF_DIST_GCLOUD_KEY_FILE_DIR" is a directory in which +the JSON service account key file named "tensorflow-testing.json" is located. +You can use the flag "--setup-cluster-only" to perform only the cluster setup +step and skip the testing step: + + ./remote_test.sh --setup-cluster-only + +**3) Run the test suite on an existing k8s TensorFlow cluster** + +For example: + + export TF_DIST_GRPC_SERVER_URL="grpc://11.22.33.44:2222" + ./remote_test.sh + +The IP address above is a dummy example. Such a cluster may have been set up +using the command described at the end of the previous section. + + +**Building the test server Docker image** + +To build the Docker image for a test server of TensorFlow distributed runtime, +run: + + ./build_server.sh <docker_image_name> + + +**Generating configuration file for TensorFlow k8s clusters** + +The script at "scripts/k8s_tensorflow.py" can be used to generate yaml +configuration files for a TensorFlow k8s cluster consisting of a number of +workers and parameter servers. 
For example: + + scripts/k8s_tensorflow.py \ + --num_workers 2 \ + --num_parameter_servers 2 \ + --grpc_port 2222 \ + --request_load_balancer \ + --docker_image "tensorflow/tf_grpc_test_server" \ + > tf-k8s-with-lb.yaml diff --git a/tensorflow/tools/dist_test/build_server.sh b/tensorflow/tools/dist_test/build_server.sh new file mode 100755 index 0000000000..8679bde2dc --- /dev/null +++ b/tensorflow/tools/dist_test/build_server.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# Builds the test server for distributed (GRPC) TensorFlow +# +# Usage: build_server.sh <docker_image_name> +# +# Note that the Dockerfile is located in ./server/ but the docker build should +# use the current directory as the context. 
+ + +# Helper functions +die() { + echo $@ + exit 1 +} + +# Check arguments +if [[ $# != 1 ]]; then + die "Usage: $0 <docker_image_name>" +fi + +DOCKER_IMG_NAME=$1 + +# Current script directory +DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Call docker build +docker build --no-cache -t "${DOCKER_IMG_NAME}" \ + -f "${DIR}/server/Dockerfile" \ + "${DIR}" diff --git a/tensorflow/tools/dist_test/local/Dockerfile b/tensorflow/tools/dist_test/local/Dockerfile new file mode 100644 index 0000000000..dece508c0d --- /dev/null +++ b/tensorflow/tools/dist_test/local/Dockerfile @@ -0,0 +1,20 @@ +FROM jpetazzo/dind + +MAINTAINER Shanqing Cai <cais@google.com> + +RUN apt-get update + +RUN apt-get install -y \ + build-essential \ + git \ + software-properties-common + +# Install the latest golang +RUN wget https://storage.googleapis.com/golang/go1.4.2.linux-amd64.tar.gz +RUN tar -C /usr/local -xzf go1.4.2.linux-amd64.tar.gz +RUN rm -f go1.4.2.linux-amd64.tar.gz +RUN echo 'PATH=/usr/local/go/bin:${PATH}' >> /root/.bashrc + +ADD start_local_k8s_cluster.sh /var/k8s/start_local_k8s_cluster.sh +ADD ../scripts /var/k8s/dist_test/scripts +ADD ../python /var/k8s/dist_test/python diff --git a/tensorflow/tools/dist_test/local/start_local_k8s_service.sh b/tensorflow/tools/dist_test/local/start_local_k8s_service.sh new file mode 100755 index 0000000000..51f4805ee8 --- /dev/null +++ b/tensorflow/tools/dist_test/local/start_local_k8s_service.sh @@ -0,0 +1,118 @@ +#!/usr/bin/env bash +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# Start a Kubernetes (k8s) cluster on the local machine. +# +# This script assumes that git, docker, and golang are installed and on +# the path. It will attempt to install the version of etcd recommended by the +# kubernetes source. +# +# Usage: start_local_k8s_service.sh +# +# This script obeys the following environment variables: +# TF_DIST_K8S_SRC_DIR: Overrides the default directory for k8s source code. +# TF_DIST_K8S_SRC_BRANCH: Overrides the default branch to run the local k8s +# cluster with. + + +# Configurations +K8S_SRC_REPO=https://github.com/kubernetes/kubernetes.git +K8S_SRC_DIR=${TF_DIST_K8S_SRC_DIR:-/local/kubernetes} +K8S_SRC_BRANCH=${TF_DIST_K8S_SRC_BRANCH:-release-1.2} + +# Helper functions +die() { + echo $@ + exit 1 +} + +# Start docker service. Try multiple times if necessary. +COUNTER=0 +while true; do + ((COUNTER++)) + service docker start + sleep 1 + + service docker status + if [[ $? == "0" ]]; then + echo "Docker service started successfully." + break; + else + echo "Docker service failed to start" + + # 23 is the exit code to signal failure to start docker service in the dind + # container. + exit 23 + + fi +done + +# Wait for docker0 net interface to appear +echo "Waiting for docker0 network interface to appear..." +while true; do + if [[ -z $(netstat -i | grep "^docker0") ]]; then + sleep 1 + else + break + fi +done +echo "docker0 interface has appeared." + +# Set docker0 to promiscuous mode +ip link set docker0 promisc on || \ + die "FAILED to set docker0 to promiscuous" +echo "Turned promisc on for docker0" + +# Check promiscuous mode of docker0 +netstat -i + +umask 000 +if [[ ! 
-d "${K8S_SRC_DIR}/.git" ]]; then + mkdir -p ${K8S_SRC_DIR} + git clone ${K8S_SRC_REPO} ${K8S_SRC_DIR} || \ + die "FAILED to clone k8s source from GitHub from: ${K8S_SRC_REPO}" +fi + +pushd ${K8S_SRC_DIR} +git checkout ${K8S_SRC_BRANCH} || \ + die "FAILED to checkout k8s source branch: ${K8S_SRC_BRANCH}" +git pull origin ${K8S_SRC_BRANCH} || \ + die "FAILED to pull from k8s source branch: ${K8S_SRC_BRANCH}" + +# Create kubectl binary + +# Install etcd +hack/install-etcd.sh + +export PATH=$(pwd)/third_party/etcd:${PATH} + +# Setup golang +export PATH=/usr/local/go/bin:${PATH} + +echo "etcd path: $(which etcd)" +echo "go path: $(which go)" + +# Create shortcut to kubectl +echo '#!/bin/bash' > /usr/local/bin/kubectl +echo "$(pwd)/cluster/kubectl.sh \\" >> /usr/local/bin/kubectl +echo ' $@' >> /usr/local/bin/kubectl +chmod +x /usr/local/bin/kubectl + +# Bring up local cluster +export KUBE_ENABLE_CLUSTER_DNS=true +hack/local-up-cluster.sh + +popd diff --git a/tensorflow/tools/dist_test/local/start_tf_cluster_container.sh b/tensorflow/tools/dist_test/local/start_tf_cluster_container.sh new file mode 100755 index 0000000000..b8448624ef --- /dev/null +++ b/tensorflow/tools/dist_test/local/start_tf_cluster_container.sh @@ -0,0 +1,91 @@ +#!/usr/bin/env bash +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +# +# Starts a docker-in-docker (dind) container that is capable of running docker +# service and Kubernetes (k8s) cluster inside. +# +# Usage: start_tf_cluster_container.sh <local_k8s_dir> <docker_img_name> +# +# local_k8s_dir: Kubernetes (k8s) source directory on the host +# docker_img_name: Name of the docker image to start +# +# In addition, this script obeys the following environment variables: +# TF_DIST_SERVER_DOCKER_IMAGE: overrides the default docker image to launch +# TensorFlow (GRPC) servers with + +# Parse input arguments +if [[ $# != "2" ]]; then + echo "Usage: $0 <host_k8s_dir> <docker_img_name>" + exit 1 +fi + +HOST_K8S_DIR=$1 +DOCKER_IMG_NAME=$2 + +# Helper functions +die() { + echo $@ + exit 1 +} + +# Maximum number of tries to start the docker container with docker running +# inside +MAX_ATTEMPTS=100 + +# Map environment variables into the docker-in-docker (dind) container +DOCKER_ENV="" +if [[ ! -z "${TF_DIST_SERVER_DOCKER_IMAGE}" ]]; then + DOCKER_ENV="-e TF_DIST_SERVER_DOCKER_IMAGE=${TF_DIST_SERVER_DOCKER_IMAGE}" +fi + +# Verify that the promisc (promiscuous mode) flag is set on docker0 network +# interface +if [[ -z $(netstat -i | grep "^docker0" | awk '{print $NF}' | grep -o P) ]]; +then + die "FAILED: Cannot proceed with dind k8s container creation because "\ +"network interface 'docker0' is not set to promisc on the host." +fi + +# Create cache for k8s source +if [[ ! -d ${HOST_K8S_DIR} ]]; then + umask 000 + mkdir -p ${HOST_K8S_DIR} || die "FAILED to create directory for k8s source" +fi + +# Attempt to start docker service in docker container. +# Try multiple times if necessary. +COUNTER=1 +while true; do + ((COUNTER++)) + docker run --net=host --privileged ${DOCKER_ENV} \ + -v ${HOST_K8S_DIR}:/local/kubernetes \ + ${DOCKER_IMG_NAME} \ + /var/tf-k8s/local/start_local_k8s_service.sh + + if [[ $? 
== "23" ]]; then + if [[ $(echo "${COUNTER}>=${MAX_ATTEMPTS}" | bc -l) == "1" ]]; then + echo "Reached maximum number of attempts (${MAX_ATTEMPTS}) "\ +"while attempting to start docker-in-docker for local k8s TensorFlow cluster" + exit 1 + fi + + echo "Docker service failed to start." + echo "Will make another attempt (#${COUNTER}) to start it..." + sleep 1 + else + break + fi +done diff --git a/tensorflow/tools/dist_test/local/test_local_tf_cluster.sh b/tensorflow/tools/dist_test/local/test_local_tf_cluster.sh new file mode 100755 index 0000000000..895a2fe24c --- /dev/null +++ b/tensorflow/tools/dist_test/local/test_local_tf_cluster.sh @@ -0,0 +1,88 @@ +#!/usr/bin/env bash +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# Launch a Kubernetes (k8s) TensorFlow cluster on the local machine and run +# the distributed test suite. +# +# This script assumes that a TensorFlow cluster is already running on the +# local machine and can be controlled by the "kubectl" binary. 
+# +# Usage: test_local_tf_cluster.sh +# + +export GCLOUD_BIN=/usr/local/bin/gcloud +export TF_DIST_LOCAL_CLUSTER=1 + +# TODO(cais): Do not hard-code the numbers of workers and ps +NUM_WORKERS=2 +NUM_PARAMETER_SERVERS=2 + +# Get current script directory +DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Get utility functions +source "${DIR}/../scripts/utils.sh" + +# Wait for the kube-system pods to be running +KUBECTL_BIN=$(which kubectl) +if [[ -z ${KUBECTL_BIN} ]]; then + die "FAILED to find path to kubectl" +fi + +echo "Waiting for kube-system pods to be all running..." +echo "" + +MAX_ATTEMPTS=360 +COUNTER=0 +while true; do + sleep 1 + ((COUNTER++)) + if [[ $(echo "${COUNTER}>${MAX_ATTEMPTS}" | bc -l) == "1" ]]; then + die "Reached maximum polling attempts while waiting for all pods in "\ +"kube-system to be running in local k8s TensorFlow cluster" + fi + + if [[ $(are_all_pods_running "${KUBECTL_BIN}" "kube-system") == "1" ]]; then + break + fi +done + +# Create the local k8s tf cluster +${DIR}/../scripts/create_tf_cluster.sh \ + ${NUM_WORKERS} ${NUM_PARAMETER_SERVERS} | \ + tee /tmp/tf_cluster.log || \ + die "FAILED to create local tf cluster" + +DOCKER_CONTAINER_ID=$(cat /tmp/tf_cluster.log | \ + grep "Docker container ID" | + awk '{print $NF}') +if [[ -z "${DOCKER_CONTAINER_ID}" ]]; then + die "FAILED to determine worker0 Docker container ID" +fi + +export TF_DIST_GRPC_SERVER_URL="grpc://tf-worker0:2222" +GRPC_ENV="TF_DIST_GRPC_SERVER_URL=${TF_DIST_GRPC_SERVER_URL}" + +docker exec \ + ${DOCKER_CONTAINER_ID} \ + /bin/bash -c \ + "${GRPC_ENV} /var/tf-k8s/scripts/dist_test.sh" + +if [[ $? 
!= "0" ]]; then + die "Test of local k8s TensorFlow cluster FAILED" +else + echo "Test of local k8s TensorFlow cluster PASSED" +fi diff --git a/tensorflow/tools/dist_test/local_test.sh b/tensorflow/tools/dist_test/local_test.sh new file mode 100755 index 0000000000..d47324cbc3 --- /dev/null +++ b/tensorflow/tools/dist_test/local_test.sh @@ -0,0 +1,152 @@ +#!/usr/bin/env bash +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# Tests distributed TensorFlow on a locally running TF GRPC cluster. +# +# This script performs the following steps: +# 1) Build the docker-in-docker (dind) image capable of running docker and +# Kubernetes (k8s) cluster inside. +# 2) Run a container from the aforementioned image and start docker service +# in it +# 3) Call a script to launch a k8s TensorFlow GRPC cluster inside the container +# and run the distributed test suite.
+# +# Usage: local_test.sh [--leave-container-running] +# +# Arguments: +# --leave-container-running: Do not stop the docker-in-docker container after +# the termination of the tests, e.g., for debugging +# +# In addition, this script obeys the following environment variables: +# TF_DIST_SERVER_DOCKER_IMAGE: overrides the default docker image to launch +# TensorFlow (GRPC) servers with +# TF_DIST_DOCKER_NO_CACHE: do not use cache when building docker images + + +# Configurations +DOCKER_IMG_NAME="tensorflow/tf-dist-test-local-cluster" +LOCAL_K8S_CACHE=${HOME}/kubernetes + +# Helper function +get_container_id_by_image_name() { + # Get the id of a container by image name + # Usage: get_docker_container_id_by_image_name <img_name> + + echo $(docker ps | grep $1 | awk '{print $1}') +} + +# Current script directory +DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Get utility functions +source ${DIR}/scripts/utils.sh + + +# First, make sure that no docker-in-docker container of the same image +# is already running +if [[ ! -z $(get_container_id_by_image_name ${DOCKER_IMG_NAME}) ]]; then + die "It appears that there is already at least one Docker container "\ +"of image name ${DOCKER_IMG_NAME} running. Please stop it before trying again" +fi + +# Build docker-in-docker image for local k8s cluster +NO_CACHE_FLAG="" +if [[ ! -z "${TF_DIST_DOCKER_NO_CACHE}" ]] && + [[ "${TF_DIST_DOCKER_NO_CACHE}" != "0" ]]; then + NO_CACHE_FLAG="--no-cache" +fi + +docker build ${NO_CACHE_FLAG} -t ${DOCKER_IMG_NAME} \ + -f ${DIR}/Dockerfile.local ${DIR} + + +# Attempt to start the docker container with docker, which will run the k8s +# cluster inside. 
+ +# Get current script directory +CONTAINER_START_LOG=$(mktemp --suffix=.log) +echo "Log file for starting cluster container: ${CONTAINER_START_LOG}" +echo "" + +${DIR}/local/start_tf_cluster_container.sh \ + ${LOCAL_K8S_CACHE} \ + ${DOCKER_IMG_NAME} | \ + tee ${CONTAINER_START_LOG} & + +# Poll start log until the k8s service is started properly or when maximum +# attempt count is reached. +MAX_SERVER_POLLING_ATTEMPTS=600 + +echo "Waiting for docker-in-docker container for local k8s TensorFlow "\ +"cluster to start and launch Kubernetes..." + +COUNTER=0 +while true; do + sleep 1 + + ((COUNTER++)) + if [[ $(echo "${COUNTER}>=${MAX_SERVER_POLLING_ATTEMPTS}" | bc -l) == "1" ]]; then + die "Reached maximum number of attempts (${MAX_SERVER_POLLING_ATTEMPTS}) "\ +"while waiting for docker-in-docker for local k8s TensorFlow cluster to start" + fi + + # Check for hitting max attempt while trying to start docker-in-docker + if [[ $(grep -i "Reached maximum number of attempts" \ + "${CONTAINER_START_LOG}" | wc -l) == "1" ]]; then + die "Docker-in-docker container for local k8s TensorFlow cluster "\ +"FAILED to start" + fi + + if [[ $(grep -i "Local Kubernetes cluster is running" \ + "${CONTAINER_START_LOG}" | wc -l) == "1" ]]; then + break + fi +done + +# Determine the id of the docker-in-docker container +DIND_ID=$(get_container_id_by_image_name ${DOCKER_IMG_NAME}) + +echo "Docker-in-docker container for local k8s TensorFlow cluster has been "\ +"started successfully." +echo "Docker-in-docker container ID: ${DIND_ID}" +echo "Launching k8s tf cluster and tests in container ${DIND_ID} ..." +echo "" + +# Launch k8s tf cluster in the docker-in-docker container and perform tests +docker exec ${DIND_ID} \ + /var/tf-k8s/local/test_local_tf_cluster.sh +TEST_RES=$? 
+ +# Tear down: stop docker-in-docker container +if [[ $1 != "--leave-container-running" ]]; then + echo "" + echo "Stopping docker-in-docker container ${DIND_ID}" + + docker stop --time=1 ${DIND_ID} || \ + echo "WARNING: Failed to stop container ${DIND_ID} !!" + + echo "" +else + echo "Will not terminate DIND container ${DIND_ID}" +fi + +if [[ "${TEST_RES}" != "0" ]]; then + die "Test of distributed TensorFlow runtime on docker-in-docker local "\ +"k8s cluster FAILED" +else + echo "Test of distributed TensorFlow runtime on docker-in-docker local "\ +"k8s cluster PASSED" +fi diff --git a/tensorflow/tools/dist_test/python/mnist_replica.py b/tensorflow/tools/dist_test/python/mnist_replica.py new file mode 100755 index 0000000000..e40aae38c2 --- /dev/null +++ b/tensorflow/tools/dist_test/python/mnist_replica.py @@ -0,0 +1,144 @@ +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Distributed MNIST training and validation, with model replicas. + +A simple softmax model with one hidden layer is defined. The parameters +(weights and biases) are located on two parameter servers (ps), while the +ops are defined on a worker node. The TF sessions also run on the worker +node. +Multiple invocations of this script can be done in parallel, with different +values for --worker_index. 
There should be exactly one invocation with +--worker_index=0, which will create a master session that carries out variable +initialization. The other, non-master, sessions will wait for the master +session to finish the initialization before proceeding to the training stage. + +The coordination between the multiple worker invocations occurs due to +the definition of the parameters on the same ps devices. The parameter updates +from one worker are visible to all other workers. As such, the workers can +perform forward computation and gradient calculation in parallel, which +should lead to increased training speed for the simple model. +""" + + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import sys +import tempfile +import time + +from six.moves import xrange # pylint: disable=redefined-builtin + +import tensorflow as tf +from tensorflow.examples.tutorials.mnist import input_data + + +flags = tf.app.flags +flags.DEFINE_string("data_dir", "/tmp/mnist-data", + "Directory for storing mnist data") +flags.DEFINE_boolean("download_only", False, + """Only perform downloading of data; Do not proceed to + model definition or training""") +flags.DEFINE_integer("worker_index", 0, + """Worker task index, should be >= 0.
worker_index=0 is + the master worker task the performs the variable + initialization""") +flags.DEFINE_integer("hidden_units", 100, + "Number of units in the hidden layer of the NN") +flags.DEFINE_integer("train_steps", 50, "Number of training steps") +flags.DEFINE_integer("batch_size", 100, "Training batch size") +flags.DEFINE_float("learning_rate", 0.01, "Learning rate") +flags.DEFINE_string("worker_grpc_url", None, + "Worker GRPC URL (e.g., grpc://1.2.3.4:2222, or " + "grpc://tf-worker0:2222)") +FLAGS = flags.FLAGS + +IMAGE_PIXELS = 28 + +if __name__ == "__main__": + mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True) + if FLAGS.download_only: + sys.exit(0) + + print("Worker GRPC URL: %s" % FLAGS.worker_grpc_url) + print("Worker index = %d" % FLAGS.worker_index) + + with tf.Graph().as_default(): + # Variables of the hidden layer + with tf.device("/job:ps/task:0"): + hid_w = tf.Variable( + tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, FLAGS.hidden_units], + stddev=1.0 / IMAGE_PIXELS), name="hid_w") + hid_b = tf.Variable(tf.zeros([FLAGS.hidden_units]), name="hid_b") + + # Variables of the softmax layer + with tf.device("/job:ps/task:1"): + sm_w = tf.Variable( + tf.truncated_normal([FLAGS.hidden_units, 10], + stddev=1.0 / math.sqrt(FLAGS.hidden_units)), + name="sm_w") + sm_b = tf.Variable(tf.zeros([10]), name="sm_b") + + # Ops: located on the worker specified with FLAGS.worker_index + with tf.device("/job:worker/task:%d" % FLAGS.worker_index): + x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS]) + y_ = tf.placeholder(tf.float32, [None, 10]) + + hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b) + hid = tf.nn.relu(hid_lin) + + y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b)) + cross_entropy = -tf.reduce_sum(y_ * + tf.log(tf.clip_by_value(y, 1e-10, 1.0))) + train_step = tf.train.AdamOptimizer( + FLAGS.learning_rate).minimize(cross_entropy) + + train_dir = tempfile.mkdtemp() + print(FLAGS.worker_index) + sv = 
tf.train.Supervisor(logdir=train_dir, + is_chief=(FLAGS.worker_index == 0)) + + # The chief worker (worker_index==0) session will prepare the session, + # while the remaining workers will wait for the preparation to complete. + sess = sv.prepare_or_wait_for_session(FLAGS.worker_grpc_url) + + # Perform training + time_begin = time.time() + print("Training begins @ %f" % time_begin) + + # TODO(cais): terminate when a global step counter reaches FLAGS.train_steps + for i in xrange(FLAGS.train_steps): + # Training feed + batch_xs, batch_ys = mnist.train.next_batch(FLAGS.batch_size) + train_feed = {x: batch_xs, + y_: batch_ys} + + sess.run(train_step, feed_dict=train_feed) + + time_end = time.time() + print("Training ends @ %f" % time_end) + training_time = time_end - time_begin + print("Training elapsed time: %f s" % training_time) + + # Validation feed + val_feed = {x: mnist.validation.images, + y_: mnist.validation.labels} + val_xent = sess.run(cross_entropy, feed_dict=val_feed) + print("After %d training step(s), validation cross entropy = %g" % + (FLAGS.train_steps, val_xent)) + diff --git a/tensorflow/tools/dist_test/remote_test.sh b/tensorflow/tools/dist_test/remote_test.sh new file mode 100755 index 0000000000..5f331c4cac --- /dev/null +++ b/tensorflow/tools/dist_test/remote_test.sh @@ -0,0 +1,92 @@ +#!/usr/bin/env bash +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +# +# This is the entry-point script for testing TensorFlow's distributed runtime. +# It builds a docker image with the necessary gcloud and Kubernetes (k8s) tools +# installed, and then executes k8s cluster preparation and distributed TensorFlow +# runs from within a container based on the image. +# +# Usage: +# remote_test.sh [--setup-cluster-only] +# Arguments: +# --setup-cluster-only: +# Setup the TensorFlow k8s cluster only, and do not perform testing of +# the distributed runtime. +# +# +# If any of the following environment variables has a non-empty value, it will +# be mapped into the docker container to override the default values (see +# dist_test.sh) +# TF_DIST_GRPC_SERVER_URL: URL to an existing Tensorflow GRPC server. +# If set to any non-empty and valid value (e.g., +# grpc://1.2.3.4:2222), it will cause the test +# to bypass the k8s cluster setup and +# teardown process, and just use this URL +# as the master session. +# TF_DIST_GCLOUD_PROJECT: gcloud project in which the GKE cluster +# will be created (takes effect only if +# TF_DIST_GRPC_SERVER_URL is empty, same below) +# TF_DIST_GCLOUD_COMPUTE_ZONE: gcloud compute zone. +# TF_DIST_CONTAINER_CLUSTER: name of the GKE cluster +# TF_DIST_GCLOUD_KEY_FILE_DIR: path to the host directory that contains +# the gcloud service key file +# "tensorflow-testing.json" +# TF_DIST_GRPC_PORT: port on which to create the TensorFlow GRPC +# servers +# TF_DIST_DOCKER_NO_CACHE: do not use cache when building docker images + +DOCKER_IMG_NAME="tensorflow/tf-dist-test-client" + +# Get current script directory +DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Prepare environment variables for the docker container +DOCKER_ENV_FLAGS="" +if [[ ! -z "$TF_DIST_GRPC_SERVER_URL" ]]; then + DOCKER_ENV_FLAGS="${DOCKER_ENV_FLAGS} "\ +"-e TF_DIST_GRPC_SERVER_URL=${TF_DIST_GRPC_SERVER_URL}" +fi +if [[ !
-z "$TF_DIST_GCLOUD_PROJECT" ]]; then + DOCKER_ENV_FLAGS="${DOCKER_ENV_FLAGS} "\ +"-e TF_DIST_GCLOUD_PROJECT=${TF_DIST_GCLOUD_PROJECT}" +fi +if [[ ! -z "$TF_DIST_GCLOUD_COMPUTE_ZONE" ]]; then + DOCKER_ENV_FLAGS="${DOCKER_ENV_FLAGS} "\ +"-e TF_DIST_GCLOUD_COMPUTE_ZONE=${TF_DIST_GCLOUD_COMPUTE_ZONE}" +fi +if [[ ! -z "$TF_DIST_CONTAINER_CLUSTER" ]]; then + DOCKER_ENV_FLAGS="${DOCKER_ENV_FLAGS} "\ +"-e TF_DIST_CONTAINER_CLUSTER=${TF_DIST_CONTAINER_CLUSTER}" +fi +if [[ ! -z "$TF_DIST_GRPC_PORT" ]]; then + DOCKER_ENV_FLAGS="${DOCKER_ENV_FLAGS} "\ +"-e TF_DIST_GRPC_PORT=${TF_DIST_GRPC_PORT}" +fi + +NO_CACHE_FLAG="" +if [[ ! -z "${TF_DIST_DOCKER_NO_CACHE}" ]] && + [[ "${TF_DIST_DOCKER_NO_CACHE}" != "0" ]]; then + NO_CACHE_FLAG="--no-cache" +fi + +docker build ${NO_CACHE_FLAG} \ + -t ${DOCKER_IMG_NAME} -f "${DIR}/Dockerfile" "${DIR}" +KEY_FILE_DIR=${TF_DIST_GCLOUD_KEY_FILE_DIR:-"${HOME}/gcloud-secrets"} + +docker run -v ${KEY_FILE_DIR}:/var/gcloud/secrets \ + ${DOCKER_ENV_FLAGS} \ + ${DOCKER_IMG_NAME} \ + /var/tf-dist-test/scripts/dist_test.sh $@ diff --git a/tensorflow/tools/dist_test/scripts/create_tf_cluster.sh b/tensorflow/tools/dist_test/scripts/create_tf_cluster.sh new file mode 100755 index 0000000000..22c0c43037 --- /dev/null +++ b/tensorflow/tools/dist_test/scripts/create_tf_cluster.sh @@ -0,0 +1,231 @@ +#!/usr/bin/env bash +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +# +# Create a Kubernetes (k8s) cluster of TensorFlow workers +# +# Usage: +# create_tf_cluster.sh <num_workers> <num_parameter_servers> +# +# In addition, this script obeys values in the following environment variables: +# TF_DIST_LOCAL_CLUSTER: create TensorFlow cluster on local machine +# TF_DIST_SERVER_DOCKER_IMAGE: overrides the default docker image to launch +# TensorFlow (GRPC) servers with +# TF_DIST_GCLOUD_PROJECT: gcloud project in which the GKE cluster +# will be created (valid only if aforementioned +# TF_DIST_GRPC_SERVER_URL is empty). +# TF_DIST_GCLOUD_COMPUTE_ZONE: gcloud compute zone. +# TF_DIST_CONTAINER_CLUSTER: name of the GKE cluster +# TF_DIST_GCLOUD_KEY_FILE: if non-empty, will override GCLOUD_KEY_FILE +# TF_DIST_GRPC_PORT: overrides the default port (2222) +# to run the GRPC servers on + +# Configurations +# gcloud operation timeout (steps) +GCLOUD_OP_MAX_STEPS=360 + +GRPC_PORT=${TF_DIST_GRPC_PORT:-2222} + +DEFAULT_GCLOUD_BIN=/var/gcloud/google-cloud-sdk/bin/gcloud +GCLOUD_KEY_FILE=${TF_DIST_GCLOUD_KEY_FILE:-\ +"/var/gcloud/secrets/tensorflow-testing.json"} +GCLOUD_PROJECT=${TF_DIST_GCLOUD_PROJECT:-"tensorflow-testing"} + +GCLOUD_COMPUTE_ZONE=${TF_DIST_GCLOUD_COMPUTE_ZONE:-"us-central1-f"} +CONTAINER_CLUSTER=${TF_DIST_CONTAINER_CLUSTER:-"test-cluster"} + +SERVER_DOCKER_IMAGE=${TF_DIST_SERVER_DOCKER_IMAGE:-\ +"tensorflow/tf_grpc_test_server"} + +# Get current script directory +DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Get utility functions +source "${DIR}/utils.sh" + +# Check input arguments +if [[ $# != 2 ]]; then + die "Usage: $0 <num_workers> <num_parameter_servers>" +fi + +NUM_WORKERS=$1 +NUM_PARAMETER_SERVERS=$2 + +# Verify port string +if [[ -z $(echo "${GRPC_PORT}" | grep -E "^[0-9]{1,5}") ]]; then + die "Invalid GRPC port: \"${GRPC_PORT}\"" +fi +echo "GRPC port to be used when creating the k8s TensorFlow cluster: "\ +"${GRPC_PORT}" + +if
[[ -z "${TF_DIST_LOCAL_CLUSTER}" ]] || + [[ "${TF_DIST_LOCAL_CLUSTER}" == "0" ]]; then + IS_LOCAL_CLUSTER="0" +else + IS_LOCAL_CLUSTER="1" +fi + +if [[ ${IS_LOCAL_CLUSTER} == "0" ]]; then + # Locate gcloud binary path + GCLOUD_BIN=$(which gcloud) + if [[ -z "${GCLOUD_BIN}" ]]; then + GCLOUD_BIN="${DEFAULT_GCLOUD_BIN}" + fi + + if [[ ! -f "${GCLOUD_BIN}" ]]; then + die "gcloud binary cannot be found at: ${GCLOUD_BIN}" + fi + echo "Path to gcloud binary: ${GCLOUD_BIN}" + + # Path to gcloud service key file + if [[ ! -f "${GCLOUD_KEY_FILE}" ]]; then + die "gcloud service account key file cannot be found at: ${GCLOUD_KEY_FILE}" + fi + echo "Path to gcloud key file: ${GCLOUD_KEY_FILE}" + + echo "GCLOUD_PROJECT: ${GCLOUD_PROJECT}" + echo "GCLOUD_COMPUTER_ZONE: ${GCLOUD_COMPUTE_ZONE}" + echo "CONTAINER_CLUSTER: ${CONTAINER_CLUSTER}" + + # Activate gcloud service account + "${GCLOUD_BIN}" auth activate-service-account --key-file "${GCLOUD_KEY_FILE}" + + # Set gcloud project + "${GCLOUD_BIN}" config set project "${GCLOUD_PROJECT}" + + # Set compute zone + "${GCLOUD_BIN}" config set compute/zone "${GCLOUD_COMPUTE_ZONE}" + + # Set container cluster + "${GCLOUD_BIN}" config set container/cluster "${CONTAINER_CLUSTER}" + + # Get container cluster credentials + "${GCLOUD_BIN}" container clusters get-credentials "${CONTAINER_CLUSTER}" + if [[ $? != "0" ]]; then + die "FAILED to get credentials for container cluster: ${CONTAINER_CLUSTER}" + fi + + # If there is any existing tf k8s cluster, delete it first + "${DIR}/delete_tf_cluster.sh" "${GCLOUD_OP_MAX_STEPS}" +fi + +# Path to kubectl binary +KUBECTL_BIN=$(dirname "${GCLOUD_BIN}")/kubectl +if [[ ! -f "${KUBECTL_BIN}" ]]; then + die "kubectl binary cannot be found at: ${KUBECTL_BIN}" +fi +echo "Path to kubectl binary: ${KUBECTL_BIN}" + +# Create yaml file for k8s TensorFlow cluster creation +# Path to the (Python) script for generating k8s yaml file +K8S_GEN_TF_YAML="${DIR}/k8s_tensorflow.py" +if [[ ! 
-f ${K8S_GEN_TF_YAML} ]]; then + die "FAILED to find yaml-generating script at: ${K8S_GEN_TF_YAML}" +fi + +K8S_YAML="/tmp/k8s_tf_lb.yaml" +rm -f "${K8S_YAML}" + +echo "" +echo "Generating k8s cluster yaml config file with the following settings" +echo " Server docker image: ${SERVER_DOCKER_IMAGE}" +echo " Number of workers: ${NUM_WORKERS}" +echo " Number of parameter servers: ${NUM_PARAMETER_SERVERS}" +echo " GRPC port: ${GRPC_PORT}" +echo "" + +${K8S_GEN_TF_YAML} \ + --docker_image "${SERVER_DOCKER_IMAGE}" \ + --num_workers "${NUM_WORKERS}" \ + --num_parameter_servers "${NUM_PARAMETER_SERVERS}" \ + --grpc_port "${GRPC_PORT}" \ + --request_load_balancer=True \ + > "${K8S_YAML}" || \ + die "Generation of the yaml configuration file for k8s cluster FAILED" + +if [[ ! -f "${K8S_YAML}" ]]; then + die "FAILED to generate yaml file for TensorFlow k8s container cluster" +else + echo "Generated yaml configuration file for k8s TensorFlow cluster: "\ +"${K8S_YAML}" +fi + +# Create tf k8s container cluster +"${KUBECTL_BIN}" create -f "${K8S_YAML}" + +# Wait for external IP of worker services to become available +get_tf_worker_external_ip() { + echo $("${KUBECTL_BIN}" get svc | grep "^tf-worker0" | \ + awk '{print $3}' | grep -E "[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+") +} + +if [[ ${IS_LOCAL_CLUSTER} == "0" ]]; then + echo "Waiting for external IP of tf-worker0 service to emerge..." + echo "" + + COUNTER=0 + while true; do + sleep 1 + ((COUNTER++)) + if [[ $(echo "${COUNTER}>${GCLOUD_OP_MAX_STEPS}" | bc -l) == "1" ]]; then + die "Reached maximum polling steps while waiting for external IP "\ +"of tf-worker0 service to emerge" + fi + + SVC_EXTERN_IP=$(get_tf_worker_external_ip) + + if [[ ! -z "${SVC_EXTERN_IP}" ]]; then + break + fi + done + + GRPC_SERVER_URL="grpc://${SVC_EXTERN_IP}:${GRPC_PORT}" + echo "GRPC URL of tf-worker0: ${GRPC_SERVER_URL}" + +else + echo "Waiting for tf pods to be all running..." 
+ echo "" + + COUNTER=0 + while true; do + sleep 1 + ((COUNTER++)) + if [[ $(echo "${COUNTER}>${GCLOUD_OP_MAX_STEPS}" | bc -l) == "1" ]]; then + die "Reached maximum polling steps while waiting for all tf pods to "\ +"be running in local k8s TensorFlow cluster" + fi + + PODS_STAT=$(are_all_pods_running "${KUBECTL_BIN}") + + if [[ ${PODS_STAT} == "2" ]]; then + # Error has occurred + die "Error(s) occurred while tring to launch tf k8s cluster. "\ +"One possible cause is that the Docker image used to launch the cluster is "\ +"invalid: \"${SERVER_DOCKER_IMAGE}\"" + fi + + if [[ ${PODS_STAT} == "1" ]]; then + break + fi + done + + # Determine the tf-worker0 docker container id + WORKER0_ID=$(docker ps | grep "k8s_tf-worker0" | awk '{print $1}') + echo "WORKER0 Docker container ID: ${WORKER0_ID}" + +fi + + +echo "Cluster setup complete." diff --git a/tensorflow/tools/dist_test/scripts/delete_tf_cluster.sh b/tensorflow/tools/dist_test/scripts/delete_tf_cluster.sh new file mode 100755 index 0000000000..0f96b4b57a --- /dev/null +++ b/tensorflow/tools/dist_test/scripts/delete_tf_cluster.sh @@ -0,0 +1,87 @@ +#!/usr/bin/env bash +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +# +# This script checks for any existing TensorFlow worker services, replication +# controllers and pods in the Kubernetes (k8s) container cluster and deletes +# them if there are any. +# +# Usage: delete_tf_cluster [max_steps] +# +# max_steps: Maximum number of polling steps for kubectl operations + +# Helper functions +die() { + echo $@ + exit 1 +} + +# Path to kubectl binary +DEFAULT_KUBECTL_BIN=/var/gcloud/google-cloud-sdk/bin/kubectl +KUBECTL_BIN=$(which kubectl) +if [[ -z "${KUBECTL_BIN}" ]]; then + KUBECTL_BIN="${DEFAULT_KUBECTL_BIN}" +fi +if [[ ! -f "${KUBECTL_BIN}" ]]; then + die "kubectl binary cannot be found at: \"${KUBECTL_BIN}\"" +else + echo "Path to kubectl binary: ${KUBECTL_BIN}" +fi + +MAX_STEPS=${1:-240} + + +# Helper functions for kubectl workflow +get_tf_svc_count() { + echo $("${KUBECTL_BIN}" get svc | grep "tf-" | wc -l) +} + +get_tf_rc_count() { + echo $("${KUBECTL_BIN}" get rc | grep "tf-" | wc -l) +} + +get_tf_pods_count() { + echo $("${KUBECTL_BIN}" get pods | grep "tf-" | wc -l) +} + + +# Delete all running services, replication-controllers and pods, in that order +ITEMS_TO_DELETE="svc rc pods" +for ITEM in ${ITEMS_TO_DELETE}; do + K8S_ITEM_COUNT=$(get_tf_${ITEM}_count) + if [[ ${K8S_ITEM_COUNT} != "0" ]]; then + echo "There are currently ${K8S_ITEM_COUNT} tf ${ITEM}(s) running. " + echo "Attempting to delete those..."
+ + "${KUBECTL_BIN}" delete --all ${ITEM} + + # Wait until all are deleted + # TODO(cais): Add time out + COUNTER=0 + while true; do + sleep 1 + + ((COUNTER++)) + if [[ $(echo "${COUNTER}>${MAX_STEPS}" | bc -l) == "1" ]]; then + die "Reached maximum polling steps while trying to delete all tf ${ITEM}" + fi + + if [[ $(get_tf_${ITEM}_count) == "0" ]]; then + break + fi + done + fi + +done diff --git a/tensorflow/tools/dist_test/scripts/dist_mnist_test.sh b/tensorflow/tools/dist_test/scripts/dist_mnist_test.sh new file mode 100755 index 0000000000..e0aad2b5c2 --- /dev/null +++ b/tensorflow/tools/dist_test/scripts/dist_mnist_test.sh @@ -0,0 +1,137 @@ +#!/usr/bin/env bash +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# This script invokes dist_mnist.py multiple times concurrently to test the +# TensorFlow's distributed runtime over a Kubernetes (k8s) cluster with the +# grpc pods and service set up. 
+# +# Usage: +# dist_mnist_test.sh <worker_grpc_url> +# +# worker_grpc_url is the IP address or the GRPC URL of the worker of the main +# worker session, e.g., grpc://1.2.3.4:2222 + + +# Configurations +TIMEOUT=120 # Timeout for MNIST replica sessions + +# Helper functions +die() { + echo $@ + exit 1 +} + +if [[ $# != 1 ]]; then + die "Usage: $0 <WORKER_GRPC_URL>" +fi +WORKER_GRPC_URL=$1 + +# Verify the validity of the GRPC URL +if [[ -z $(echo "${WORKER_GRPC_URL}" | \ + grep -E "^grpc://.+:[0-9]+") ]]; then + die "Invalid worker GRPC URL: \"${WORKER_GRPC_URL}\"" +fi + +# Current working directory +DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PY_DIR=$(dirname "${DIR}")/python + +MNIST_REPLICA="${PY_DIR}/mnist_replica.py" + +WKR_LOG_PREFIX="/tmp/worker" + +# First, download the data from a single process, to avoid race-condition +# during data downloading +timeout ${TIMEOUT} python "${MNIST_REPLICA}" \ + --download_only=True || \ + die "Download-only step of MNIST replica FAILED" + +# Run a number of workers in parallel +N_WORKERS=2 +INDICES="" +IDX=0 +while true; do + timeout ${TIMEOUT} \ + python "${MNIST_REPLICA}" \ + --worker_grpc_url="${WORKER_GRPC_URL}" \ + --worker_index=${IDX} 2>&1 > \ + "${WKR_LOG_PREFIX}${IDX}.log" & + # TODO(cais): have each trainer process contact a different worker once + # supervisor and sync_replicas etc. are all working in OSS TensorFlow.
+ + INDICES="${INDICES} ${IDX}" + + ((IDX++)) + if [[ $(echo "${IDX}==${N_WORKERS}" | bc -l) == "1" ]]; then + break + fi +done + +# Function for getting final validation cross entropy from worker log files +get_final_val_xent() { + echo $(cat $1 | grep "^After.*validation cross entropy = " | \ + awk '{print $NF}') +} + +# Poll until all final validation cross entropy values become available or +# operation times out +COUNTER=0 +while true; do + ((COUNTER++)) + if [[ $(echo "${COUNTER}>${TIMEOUT}" | bc -l) == "1" ]]; then + die "Reached maximum polling steps while polling for final validation "\ +"cross entropies from all workers" + fi + + N_AVAIL=0 + VAL_XENTS="" + for N in ${INDICES}; do + VAL_XENT=$(get_final_val_xent "${WKR_LOG_PREFIX}${N}.log") + if [[ ! -z ${VAL_XENT} ]]; then + ((N_AVAIL++)) + VAL_XENTS="${VAL_XENTS} ${VAL_XENT}" + fi + done + + if [[ "${N_AVAIL}" == "2" ]]; then + # Print out the content of the log files + for M in ${INDICES}; do + echo "===================================================" + echo "=== Log file from worker ${M} ===" + cat "${WKR_LOG_PREFIX}${M}.log" + echo "===================================================" + echo "" + done + + break + else + sleep 1 + fi +done + +# Sanity check on the validation entropies +# TODO(cais): In addition to this basic sanity check, we could run the training +# with 1 and 2 workers, each for a few times and use scipy.stats to do a t-test +# to verify that the 2-worker training gives significantly lower final cross +# entropy +VAL_XENTS=(${VAL_XENTS}) +for N in ${INDICES}; do + echo "Final validation cross entropy from worker${N}: ${VAL_XENTS[N]}" + if [[ $(echo "${VAL_XENTS[N]}>0" | bc -l) != "1" ]]; then + die "Sanity checks on the final validation cross entropy values FAILED" + fi + +done diff --git a/tensorflow/tools/dist_test/scripts/dist_test.sh b/tensorflow/tools/dist_test/scripts/dist_test.sh new file mode 100755 index 0000000000..f8ade7eff8 --- /dev/null +++
b/tensorflow/tools/dist_test/scripts/dist_test.sh @@ -0,0 +1,118 @@ +#!/usr/bin/env bash +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# Performs tests of TensorFlow's distributed runtime over a Kubernetes (k8s) +# container cluster. +# +# This script tears down any existing TensorFlow cluster, consisting of +# services, replication controllers and pods, before creating a new cluster. +# The cluster contains a number of parameter server services and a number of +# worker services. The parameter servers will hold parameters of the ML model, +# e.g., weights and biases of the NN layers, while the workers will hold the +# TensorFlow ops. +# +# Usage: +# dist_test.sh [--setup-cluster-only] +# +# --setup-cluster-only lets the script only set up the k8s container network +# +# This script obeys values in the following environment variables: +# TF_DIST_GRPC_SERVER_URL: If it is set to a valid grpc server url (e.g., +# grpc://1.2.3.4:2222), the script will bypass +# the cluster setup and teardown processes and +# just use this URL.
+ + +# Configurations +NUM_WORKERS=2 # Number of worker container +NUM_PARAMETER_SERVERS=2 # Number of parameter servers + +# Helper functions +die() { + echo $@ + exit 1 +} + +# gcloud operation timeout (steps) +GCLOUD_OP_MAX_STEPS=240 + +GRPC_SERVER_URL=${TF_DIST_GRPC_SERVER_URL} + +# Report gcloud / GKE parameters +echo "GRPC_SERVER_URL: ${GRPC_SERVER_URL}" + +# Get current script directory +DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Locate path to kubectl binary +TEARDOWN_WHEN_DONE=1 +if [[ ! -z "${GRPC_SERVER_URL}" ]]; then + TEARDOWN_WHEN_DONE=0 + # Verify the validity of the GRPC URL + if [[ -z $(echo "${GRPC_SERVER_URL}" | \ + grep -E "^grpc://.+:[0-9]+") ]]; then + die "Invalid GRPC_SERVER_URL: \"${GRPC_SERVER_URL}\"" + else + echo "The preset GRPC_SERVER_URL appears to be valid: ${GRPC_SERVER_URL}" + echo "Will bypass the TensorFlow k8s cluster setup and teardown process" + echo "" + fi +else + TMP=$(mktemp) + "${DIR}/create_tf_cluster.sh" ${NUM_WORKERS} ${NUM_PARAMETER_SERVERS} 2>&1 | \ + tee "${TMP}" || \ + die "Creation of TensorFlow k8s cluster FAILED" + + GRPC_SERVER_URL=$(cat ${TMP} | grep "GRPC URL of tf-worker0: .*" | \ + awk '{print $NF}') + if [[ -z "${GRPC_SERVER_URL}" ]]; then + die "FAILED to determine GRPC server URL" + fi + rm -f ${TMP} + + if [[ $1 == "--setup-cluster-only" ]]; then + echo "Skipping testing of distributed runtime due to "\ +"option flag --setup-cluster-only" + exit 0 + fi +fi + +# Invoke script to perform distributed MNIST training +MNIST_DIST_TEST_BIN="${DIR}/dist_mnist_test.sh" +if [[ ! -f "${MNIST_DIST_TEST_BIN}" ]]; then + die "FAILED to find distributed mnist client test script at "\ +"${MNIST_DIST_TEST_BIN}" +fi + +echo "Performing distributed MNIST training through grpc session @ "\ +"${GRPC_SERVER_URL}..." + +"${MNIST_DIST_TEST_BIN}" "${GRPC_SERVER_URL}" + +if [[ $? 
== "0" ]]; then + echo "MNIST-replica test PASSED" +else + die "MNIST-replica test FAILED" +fi + +# Tear down current k8s TensorFlow cluster +if [[ "${TEARDOWN_WHEN_DONE}" == "1" ]]; then + echo "Tearing down k8s TensorFlow cluster..." + "${DIR}/delete_tf_cluster.sh" "${GCLOUD_OP_MAX_STEPS}" && \ + echo "Cluster tear-down SUCCEEDED" || \ + die "Cluster tear-down FAILED" +fi +echo "SUCCESS: Test of distributed TensorFlow runtime PASSED" diff --git a/tensorflow/tools/dist_test/scripts/k8s_tensorflow.py b/tensorflow/tools/dist_test/scripts/k8s_tensorflow.py new file mode 100755 index 0000000000..e3fde2180a --- /dev/null +++ b/tensorflow/tools/dist_test/scripts/k8s_tensorflow.py @@ -0,0 +1,245 @@ +#!/usr/bin/python +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Generates YAML configuration files for distributed Tensorflow workers. + +The workers will be run in a Kubernetes (k8s) container cluster. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import sys + +# Note: It is intentional that we do not import tensorflow in this script. The +# machine that launches a TensorFlow k8s cluster does not have to have the +# Python package of TensorFlow installed on it. 
+ + +DEFAULT_DOCKER_IMAGE = 'tensorflow/tf_grpc_test_server' +DEFAULT_PORT = 2222 + +# TODO(cais): Consider adding resource requests/limits to the pods. +WORKER_RC = ( + """apiVersion: v1 +kind: ReplicationController +metadata: + name: tf-worker{worker_id} +spec: + replicas: 1 + template: + metadata: + labels: + tf-worker: "{worker_id}" + spec: + containers: + - name: tf-worker{worker_id} + image: {docker_image} + args: + - --cluster_spec={cluster_spec} + - --job_name=worker + - --task_id={worker_id} + ports: + - containerPort: {port} +""") +WORKER_SVC = ( + """apiVersion: v1 +kind: Service +metadata: + name: tf-worker{worker_id} + labels: + tf-worker: "{worker_id}" +spec: + ports: + - port: {port} + targetPort: {port} + selector: + tf-worker: "{worker_id}" +""") +WORKER_LB_SVC = ( + """apiVersion: v1 +kind: Service +metadata: + name: tf-worker{worker_id} + labels: + tf-worker: "{worker_id}" +spec: + type: LoadBalancer + ports: + - port: {port} + selector: + tf-worker: "{worker_id}" +""") +PARAM_SERVER_RC = ( + """apiVersion: v1 +kind: ReplicationController +metadata: + name: tf-ps{param_server_id} +spec: + replicas: 1 + template: + metadata: + labels: + tf-ps: "{param_server_id}" + spec: + containers: + - name: tf-ps{param_server_id} + image: {docker_image} + args: + - --cluster_spec={cluster_spec} + - --job_name=ps + - --task_id={param_server_id} + ports: + - containerPort: {port} +""") +PARAM_SERVER_SVC = ( + """apiVersion: v1 +kind: Service +metadata: + name: tf-ps{param_server_id} + labels: + tf-ps: "{param_server_id}" +spec: + ports: + - port: {port} + selector: + tf-ps: "{param_server_id}" +""") + + +def main(): + """Do arg parsing.""" + parser = argparse.ArgumentParser() + parser.add_argument('--num_workers', + type=int, + default=2, + help='How many worker pods to run') + parser.add_argument('--num_parameter_servers', + type=int, + default=1, + help='How many paramater server pods to run') + parser.add_argument('--grpc_port', + type=int, + 
default=DEFAULT_PORT, + help='GRPC server port (Default: %d)' % DEFAULT_PORT) + parser.add_argument('--request_load_balancer', + type=bool, + default=False, + help='To request worker0 to be exposed on a public IP ' + 'address via an external load balancer, enabling you to ' + 'run client processes from outside the cluster') + parser.add_argument('--docker_image', + type=str, + default=DEFAULT_DOCKER_IMAGE, + help='Override default docker image for the TensorFlow ' + 'GRPC server') + args = parser.parse_args() + + if args.num_workers <= 0: + sys.stderr.write('--num_workers must be greater than 0; received %d\n' + % args.num_workers) + sys.exit(1) + if args.num_parameter_servers <= 0: + sys.stderr.write( + '--num_parameter_servers must be greater than 0; received %d\n' + % args.num_parameter_servers) + sys.exit(1) + + # Generate contents of yaml config + yaml_config = GenerateConfig(args.num_workers, + args.num_parameter_servers, + args.grpc_port, + args.request_load_balancer, + args.docker_image) + print(yaml_config) # pylint: disable=superfluous-parens + + +def GenerateConfig(num_workers, + num_param_servers, + port, + request_load_balancer, + docker_image): + """Generate configuration strings.""" + config = '' + for worker in range(num_workers): + config += WORKER_RC.format( + port=port, + worker_id=worker, + docker_image=docker_image, + cluster_spec=WorkerClusterSpec(num_workers, + num_param_servers, + port)) + config += '---\n' + if worker == 0 and request_load_balancer: + config += WORKER_LB_SVC.format(port=port, + worker_id=worker) + else: + config += WORKER_SVC.format(port=port, + worker_id=worker) + config += '---\n' + + for param_server in range(num_param_servers): + config += PARAM_SERVER_RC.format( + port=port, + param_server_id=param_server, + docker_image=docker_image, + cluster_spec=ParamServerClusterSpec(num_workers, + num_param_servers, + port)) + config += '---\n' + config += PARAM_SERVER_SVC.format(port=port, + param_server_id=param_server) + 
config += '---\n' + + return config + + +def WorkerClusterSpec(num_workers, + num_param_servers, + port): + """Generates worker cluster spec.""" + return ClusterSpec(num_workers, num_param_servers, port) + + +def ParamServerClusterSpec(num_workers, + num_param_servers, + port): + """Generates parameter server spec.""" + return ClusterSpec(num_workers, num_param_servers, port) + + +def ClusterSpec(num_workers, + num_param_servers, + port): + """Generates general cluster spec.""" + spec = 'worker|' + for worker in range(num_workers): + spec += 'tf-worker%d:%d' % (worker, port) + if worker != num_workers-1: + spec += ';' + + spec += ',ps|' + for param_server in range(num_param_servers): + spec += 'tf-ps%d:%d' % (param_server, port) + if param_server != num_param_servers-1: + spec += ';' + + return spec + + +if __name__ == '__main__': + main() diff --git a/tensorflow/tools/dist_test/scripts/utils.sh b/tensorflow/tools/dist_test/scripts/utils.sh new file mode 100644 index 0000000000..bc4485baf0 --- /dev/null +++ b/tensorflow/tools/dist_test/scripts/utils.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +# +# Utility functions for dist_test scripts + + +# Print info and exit with code 1 +die() { + echo $@ + exit 1 +} + + +# Determine if all k8s pods in a namespace are all in the "Running" state +are_all_pods_running() { + # Usage: are_all_pods_running <KUBECTL_BIN> [namespace] + KUBECTL_BIN=$1 + + if [[ -z "$2" ]]; then + NS_FLAG="" + else + NS_FLAG="--namespace=$2" + fi + + sleep 1 # Wait for the status to settle + NPODS=$("${KUBECTL_BIN}" "${NS_FLAG}" get pods | tail -n +2 | wc -l) + NRUNNING=$("${KUBECTL_BIN}" "${NS_FLAG}" get pods | tail -n +2 | \ + grep "Running" | wc -l) + NERR=$("${KUBECTL_BIN}" "${NS_FLAG}" get pods | tail -n +2 | \ + grep "Err" | wc -l) + + if [[ ${NERR} != "0" ]]; then + # "2" signifies that error has occurred + echo "2" + elif [[ ${NPODS} == ${NRUNNING} ]]; then + # "1" signifies that all pods are in Running state + echo "1" + else + # "0" signifies that some pods have not entered Running state, but + # no error has occurred + echo "0" + fi +} diff --git a/tensorflow/tools/dist_test/server/Dockerfile b/tensorflow/tools/dist_test/server/Dockerfile new file mode 100644 index 0000000000..bf384413f1 --- /dev/null +++ b/tensorflow/tools/dist_test/server/Dockerfile @@ -0,0 +1,59 @@ +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +# +# Test server for TensorFlow GRPC server +# +# To build the image, use ../build_server.sh + +FROM ubuntu:14.04 + +MAINTAINER Shanqing Cai <cais@google.com> + +# Pick up some TF dependencies +RUN apt-get update && apt-get install -y \ + bc \ + curl \ + dnsutils \ + python-numpy \ + python-pip \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN curl -O https://bootstrap.pypa.io/get-pip.py && \ + python get-pip.py && \ + rm get-pip.py + +# Install TensorFlow CPU version. +RUN pip --no-cache-dir install \ + http://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.7.1-cp27-none-linux_x86_64.whl + +# Copy files, including the GRPC server binary at +# server/grpc_tensorflow_server.py +ADD . /var/tf-k8s + +# Download MNIST data for tests +RUN mkdir -p /tmp/mnist-data +RUN curl -o /tmp/mnist-data/train-labels-idx1-ubyte.gz \ + http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz +RUN curl -o /tmp/mnist-data/train-images-idx3-ubyte.gz \ + http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz +RUN curl -o /tmp/mnist-data/t10k-labels-idx1-ubyte.gz \ + http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz +RUN curl -o /tmp/mnist-data/t10k-images-idx3-ubyte.gz \ + http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz + +# Container entry point +ENTRYPOINT ["/var/tf-k8s/server/grpc_tensorflow_server.py"] diff --git a/tensorflow/tools/dist_test/server/grpc_tensorflow_server.py b/tensorflow/tools/dist_test/server/grpc_tensorflow_server.py new file mode 100755 index 0000000000..b9742112de --- /dev/null +++ b/tensorflow/tools/dist_test/server/grpc_tensorflow_server.py @@ -0,0 +1,122 @@ +#!/usr/bin/python +# Copyright 2016 Google Inc. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Python-based TensorFlow GRPC server. + +Takes input arguments cluster_spec, job_name and task_id, and start a blocking +TensorFlow GRPC server. + +Usage: + grpc_tensorflow_server.py --cluster_spec=SPEC --job_name=NAME --task_id=ID + +Where: + SPEC is <JOB>(,<JOB>)* + JOB is <NAME>|<HOST:PORT>(;<HOST:PORT>)* + NAME is a valid job name ([a-z][0-9a-z]*) + HOST is a hostname or IP address + PORT is a port number +""" + + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + + +FLAGS = tf.app.flags.FLAGS + +tf.app.flags.DEFINE_string("cluster_spec", "", + """Cluster spec: SPEC. + SPEC is <JOB>(,<JOB>)*," + JOB is <NAME>|<HOST:PORT>(;<HOST:PORT>)*," + NAME is a valid job name ([a-z][0-9a-z]*)," + HOST is a hostname or IP address," + PORT is a port number." +E.g., local|localhost:2222;localhost:2223, ps|ps0:2222;ps1:2222""") +tf.app.flags.DEFINE_string("job_name", "", "Job name: e.g., local") +tf.app.flags.DEFINE_integer("task_id", 0, "Task index, e.g., 0") +tf.app.flags.DEFINE_boolean("verbose", False, "Verbose mode") + + +def parse_cluster_spec(cluster_spec, cluster): + """Parse content of cluster_spec string and inject info into cluster protobuf. 
+ + Args: + cluster_spec: cluster specification string, e.g., + "local|localhost:2222;localhost:2223" + cluster: cluster protobuf. + + Raises: + ValueError: if the cluster_spec string is invalid. + """ + + job_strings = cluster_spec.split(",") + + for job_string in job_strings: + job_def = cluster.job.add() + + if job_string.count("|") != 1: + raise ValueError("Not exactly one instance of '|' in cluster_spec") + + job_name = job_string.split("|")[0] + + if not job_name: + raise ValueError("Empty job_name in cluster_spec") + + job_def.name = job_name + + if FLAGS.verbose: + print("Added job named \"%s\"" % job_name) + + job_tasks = job_string.split("|")[1].split(";") + for i in range(len(job_tasks)): + if not job_tasks[i]: + raise ValueError("Empty job_task string at position %d" % i) + + job_def.tasks[i] = job_tasks[i] + + if FLAGS.verbose: + print(" Added task \"%s\" to job \"%s\"" % (job_tasks[i], job_name)) + + +def main(unused_args): + # Create Protobuf ServerDef + server_def = tf.ServerDef(protocol="grpc") + + # Cluster info + parse_cluster_spec(FLAGS.cluster_spec, server_def.cluster) + + # Job name + if not FLAGS.job_name: + raise ValueError("Empty job_name") + server_def.job_name = FLAGS.job_name + + # Task index + if FLAGS.task_id < 0: + raise ValueError("Invalid task_id: %d" % FLAGS.task_id) + server_def.task_index = FLAGS.task_id + + # Create GrpcServer instance + server = tf.GrpcServer(server_def) + + # join() is blocking, unlike start() + server.join() + + +if __name__ == "__main__": + tf.app.run() diff --git a/tensorflow/user_ops/BUILD b/tensorflow/user_ops/BUILD index 5bd935000d..cb279e0fc4 100644 --- a/tensorflow/user_ops/BUILD +++ b/tensorflow/user_ops/BUILD @@ -30,7 +30,10 @@ py_tests( name = "ackermann_test", srcs = ["ackermann_test.py"], data = [":ackermann_op.so"], - tags = ["notsan"], + tags = [ + "noasan", + "notsan", + ], ) filegroup( diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index f0d0702a48..1e3ea6e0a6 100644 --- 
a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -13,8 +13,8 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): native.new_http_archive( name = "eigen_archive", - url = "https://bitbucket.org/eigen/eigen/get/db7b61411772.tar.gz", - sha256 = "832e1e082b91d40ad909a079b98630ce52bd904d1ec0c3cb4cdcd2e24bcf95e6", + url = "https://bitbucket.org/eigen/eigen/get/0a13bf3e579d.tar.gz", + sha256 = "85c9075a51b56e4e20f3814020c726301b84c5df80fc6072d0056d512eb4bf30", build_file = path_prefix + "eigen.BUILD", ) diff --git a/third_party/eigen3/Eigen/Cholesky b/third_party/eigen3/Eigen/Cholesky index e9d61cc882..236fc00cd4 100644 --- a/third_party/eigen3/Eigen/Cholesky +++ b/third_party/eigen3/Eigen/Cholesky @@ -1 +1 @@ -#include "eigen-eigen-db7b61411772/Eigen/Cholesky" +#include "eigen-eigen-0a13bf3e579d/Eigen/Cholesky" diff --git a/third_party/eigen3/Eigen/Core b/third_party/eigen3/Eigen/Core index b4320a07f0..b106690ce8 100644 --- a/third_party/eigen3/Eigen/Core +++ b/third_party/eigen3/Eigen/Core @@ -1 +1 @@ -#include "eigen-eigen-db7b61411772/Eigen/Core" +#include "eigen-eigen-0a13bf3e579d/Eigen/Core" diff --git a/third_party/eigen3/Eigen/Eigenvalues b/third_party/eigen3/Eigen/Eigenvalues index f32af39fa6..be72e68a6e 100644 --- a/third_party/eigen3/Eigen/Eigenvalues +++ b/third_party/eigen3/Eigen/Eigenvalues @@ -1 +1 @@ -#include "eigen-eigen-db7b61411772/Eigen/Eigenvalues" +#include "eigen-eigen-0a13bf3e579d/Eigen/Eigenvalues" diff --git a/third_party/eigen3/Eigen/LU b/third_party/eigen3/Eigen/LU index 960cec6ad1..d925a388fb 100644 --- a/third_party/eigen3/Eigen/LU +++ b/third_party/eigen3/Eigen/LU @@ -1 +1 @@ -#include "eigen-eigen-db7b61411772/Eigen/LU" +#include "eigen-eigen-0a13bf3e579d/Eigen/LU" diff --git a/third_party/eigen3/Eigen/QR b/third_party/eigen3/Eigen/QR index fd5aa1c519..8198ac216c 100644 --- a/third_party/eigen3/Eigen/QR +++ b/third_party/eigen3/Eigen/QR @@ -1 +1 @@ -#include "eigen-eigen-db7b61411772/Eigen/QR" +#include 
"eigen-eigen-0a13bf3e579d/Eigen/QR" diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/Tensor b/third_party/eigen3/unsupported/Eigen/CXX11/Tensor index f374207f41..51f7e7bddd 100644 --- a/third_party/eigen3/unsupported/Eigen/CXX11/Tensor +++ b/third_party/eigen3/unsupported/Eigen/CXX11/Tensor @@ -1 +1 @@ -#include "eigen-eigen-db7b61411772/unsupported/Eigen/CXX11/Tensor" +#include "eigen-eigen-0a13bf3e579d/unsupported/Eigen/CXX11/Tensor" |