Diffstat (limited to 'tensorflow/python/ops/gradients.py')
-rw-r--r-- | tensorflow/python/ops/gradients.py | 661 |
1 file changed, 661 insertions, 0 deletions
diff --git a/tensorflow/python/ops/gradients.py b/tensorflow/python/ops/gradients.py new file mode 100644 index 0000000000..ffa7828c04 --- /dev/null +++ b/tensorflow/python/ops/gradients.py @@ -0,0 +1,661 @@ +"""Implements the graph generation for computation of gradients.""" + +import collections +import warnings + +import tensorflow.python.platform + +import numpy as np + +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape +from tensorflow.python.framework import tensor_util +from tensorflow.python.framework import types +# pylint: disable=unused-import +from tensorflow.python.ops import array_grad +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import constant_op +from tensorflow.python.ops import control_flow_grad +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import logging_ops +from tensorflow.python.ops import linalg_grad +from tensorflow.python.ops import math_grad +# pylint: enable=unused-import +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import linalg_ops +from tensorflow.python.platform import logging + + +# Warn the user if we convert a sparse representation to dense with at +# least this number of elements. +_LARGE_SPARSE_NUM_ELEMENTS = 100000000 + + +def _IndexedSlicesToTensor(value, dtype=None, name=None): + """Converts an IndexedSlices object `value` to a Tensor. + + NOTE(mrry): This function is potentially expensive. + + Args: + value: An ops.IndexedSlices object. + dtype: The dtype of the Tensor to be returned. + name: Optional name to use for the returned Tensor. + + Returns: + A dense Tensor representing the values in the given IndexedSlices. + + Raises: + ValueError: If the IndexedSlices does not have the same dtype. + """ + if dtype and not dtype.is_compatible_with(value.dtype): + raise ValueError( + "Tensor conversion requested dtype %s for IndexedSlices with dtype %s" + % (dtype.name, value.dtype.name)) + if value.dense_shape is None: + raise ValueError( + "Tensor conversion requested for IndexedSlices without dense_shape: %s" + % str(value)) + # TODO(mrry): Consider adding static shape information to + # IndexedSlices, to avoid using numpy here. + dense_shape_value = tensor_util.ConstantValue(value.dense_shape) + if dense_shape_value is not None: + num_elements = np.prod(dense_shape_value) + if num_elements >= _LARGE_SPARSE_NUM_ELEMENTS: + warnings.warn( + "Converting sparse IndexedSlices to a dense Tensor with %d elements. " + "This may consume a large amount of memory." % num_elements) + else: + warnings.warn( + "Converting sparse IndexedSlices to a dense Tensor of unknown shape. " + "This may consume a large amount of memory.") + return math_ops.unsorted_segment_sum( + value.values, value.indices, value.dense_shape[0], name=name) + + +ops.register_tensor_conversion_function(ops.IndexedSlices, _IndexedSlicesToTensor) + + +def _MarkReachedOps(from_ops, reached_ops): + """Mark all ops reached from "from_ops". + + Args: + from_ops: list of Operations. + reached_ops: list of booleans, indexed by operation id. + """ + queue = collections.deque() + queue.extend(from_ops) + while queue: + op = queue.popleft() + if not reached_ops[op._id]: + reached_ops[op._id] = True + for output in op.outputs: + queue.extend(output.consumers()) + + +def _GatherInputs(to_ops, reached_ops): + """List all inputs of to_ops that are in reached_ops. + + Args: + to_ops: list of Operations. + reached_ops: list of booleans, indexed by operation id. 
+ + Returns: + The list of all inputs of to_ops that are in reached_ops. + That list includes all elements of to_ops. + """ + inputs = [] + queue = collections.deque() + queue.extend(to_ops) + while queue: + op = queue.popleft() + # We are interested in this op. + if reached_ops[op._id]: + inputs.append(op) + # Clear the boolean so we won't add the inputs again. + reached_ops[op._id] = False + for inp in op.inputs: + queue.append(inp.op) + return inputs + + +def _GetGradsDevice(op, colocate_gradients_with_ops): + """Gets the device to which to assign gradients of "op". + + Args: + op: an Operation. + colocate_gradients_with_ops: If True, try colocating gradients with the + corresponding op. + + Returns: + A device string. + """ + if colocate_gradients_with_ops and op.device: + return op.device + else: + return op.graph.get_default_device() + + +def _PendingCount(graph, to_ops, from_ops): + """Initialize the pending count for ops between two lists of Operations. + + 'pending_count[op._id]' indicates the number of backprop inputs + to this operation. + + Args: + graph: a Graph. + to_ops: list of Operations. + from_ops: list of Operations. + + Returns: + A tuple containing: (1) a list of integers indexed by operation id, + indicating the number of backprop inputs to this operation, and (2) + a boolean which is True if any of the ops in between from_ops and to_ops + contain control flow loops. + """ + # Mark reachable ops from from_ops. + reached_ops = [False] * (graph._last_id + 1) + for op in to_ops: + reached_ops[op._id] = True + _MarkReachedOps(from_ops, reached_ops) + + # Mark between ops. + between_ops = [False] * (graph._last_id + 1) + between_op_list = [] + queue = collections.deque() + queue.extend(to_ops) + while queue: + op = queue.popleft() + # We are interested in this op. + if reached_ops[op._id]: + between_ops[op._id] = True + between_op_list.append(op) + # Clear the boolean so we won't add the inputs again. + reached_ops[op._id] = False + for inp in op.inputs: + queue.append(inp.op) + + # Initialize pending count for between ops. + pending_count = [0] * (graph._last_id + 1) + has_control_flow = False + for op in between_op_list: + for x in op.inputs: + if between_ops[x.op._id]: + pending_count[x.op._id] += 1 + for x in op.control_inputs: + if between_ops[x._id]: + pending_count[x._id] += 1 + if op.type == "Exit": + has_control_flow = True + return pending_count, has_control_flow + + +def _AsList(x): + return x if isinstance(x, (list, tuple)) else [x] + + +def _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops): + """Fill in default values for grad_ys. + + Args: + grad_ys: List of gradients, can contain None. + ys: List of tensors. + colocate_gradients_with_ops: If True, try colocating gradients with + the corresponding op. + + Returns: + A list of gradients to use, without None. + + Raises: + ValueError: If one of the grad_ys is invalid. 
+ """ + if len(grad_ys) != len(ys): + raise ValueError("Passed %d grad_ys for %d ys" % (len(grad_ys), len(ys))) + grad_ys = ops.convert_n_to_tensor_or_indexed_slices(grad_ys, name="grad_y") + for i in xrange(len(grad_ys)): + grad_y = grad_ys[i] + y = ys[i] + if grad_y is None: + with ops.device(_GetGradsDevice(y.op, colocate_gradients_with_ops)): + grad_ys[i] = array_ops.fill(array_ops.shape(y), + constant_op.constant(1, dtype=y.dtype)) + else: + if grad_y.dtype != y.dtype: + raise ValueError("Y and ys_grad must be of the same type, " + "not y: %s, ys_grad: %s " % + (types.as_dtype(y.dtype).name, + types.as_dtype(grad_y.dtype).name)) + return grad_ys + + +def _VerifyGeneratedGradients(grads, op): + """Verify that gradients are valid in number and type. + + Args: + grads: List of generated gradients. + op: Operation for which the gradients where generated. + + Raises: + ValueError: if the gradients are invalid. + """ + if len(grads) != len(op.inputs): + raise ValueError("Num gradients %d generated for op %s do not match num " + "inputs %d" % (len(grads), op.node_def, len(op.inputs))) + for i in xrange(len(grads)): + grad = grads[i] + inp = op.inputs[i] + if grad is not None: + if not grad.dtype.is_compatible_with(inp.dtype): + raise ValueError( + "Gradient type %s generated for op %s does " + "not match input type %s" % + (types.as_dtype(grad.dtype).name, op.node_def, + types.as_dtype(inp.dtype).name)) + + +def _StopOps(from_ops, pending_count): + """The set of ops that terminate the gradient computation. + + This computes the frontier of the forward graph *before* which backprop + should stop. Operations in the returned set will not be differentiated. + This set is defined as the subset of `from_ops` containing ops that have + no predecessor in `from_ops`. `pending_count` is the result of + `_PendingCount(g, xs, from_ops)`. An 'op' has predecessors in `from_ops` + iff pending_count[op._id] > 0. + + Args: + from_ops: list of Operations. + pending_count: List of integers, indexed by operation id. + + Returns: + The set of operations. + """ + stop_ops = set() + for op in from_ops: + is_stop_op = True + for inp in op.inputs: + if pending_count[inp.op._id] > 0: + is_stop_op = False + break + if is_stop_op: + stop_ops.add(op._id) + return stop_ops + + +def gradients(ys, xs, grad_ys=None, name="gradients", + colocate_gradients_with_ops=False, + gate_gradients=False, + aggregation_method=None): + """Constructs symbolic partial derivatives of `ys` w.r.t. x in `xs`. + + `ys` and `xs` are each a `Tensor` or a list of tensors. `grad_ys` + is a list of `Tensor`, holding the gradients received by the + `ys`. The list must be the same length as `ys`. + + `gradients()` adds ops to the graph to output the partial + derivatives of `ys` with respect to `xs`. It returns a list of + `Tensor` of length `len(xs)` where each tensor is the `sum(dy/dx)` + for y in `ys`. + + `grad_ys` is a list of tensors of the same length as `ys` that holds + the initial gradients for each y in `ys`. When `grad_ys` is None, + we fill in a tensor of '1's of the shape of y for each y in `ys`. A + user can provide their own initial 'grad_ys` to compute the + derivatives using a different initial gradient for each y (e.g., if + one wanted to weight the gradient differently for each value in + each y). + + Args: + ys: A `Tensor` or list of tensors to be differentiated. + xs: A `Tensor` or list of tensors to be used for differentiation. + grad_ys: Optional. 
A `Tensor` or list of tensors the same size as + `ys` and holding the gradients computed for each y in `ys`. + name: Optional name to use for grouping all the gradient ops together. + defaults to 'gradients'. + colocate_gradients_with_ops: If True, try colocating gradients with + the corresponding op. + gate_gradients: If True, add a tuple around the gradients returned + for an operations. This avoids some race conditions. + aggregation_method: Specifies the method used to combine gradient terms. + Accepted values are constants defined in the class `AggregationMethod`. + + Returns: + A list of `sum(dy/dx)` for each x in `xs`. + + Raises: + LookupError: if one of the operations between `x` and `y` does not + have a registered gradient function. + ValueError: if the arguments are invalid. + + """ + ys = _AsList(ys) + xs = _AsList(xs) + if grad_ys is None: + grad_ys = [None] * len(ys) + else: + grad_ys = _AsList(grad_ys) + with ops.op_scope(ys + xs + grad_ys, name, "gradients"): + ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y") + xs = ops.convert_n_to_tensor_or_indexed_slices(xs, name="x") + grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops) + + # The approach we take here is as follows: Create a list of all ops in the + # subgraph between the ys and xs. Visit these ops in reverse order of ids + # to ensure that when we visit an op the gradients w.r.t its outputs have + # been collected. Then aggregate these gradients if needed, call the op's + # gradient function, and add the generated gradients to the gradients for + # its input. + + # Initialize the pending count for ops in the connected subgraph from ys + # to the xs. + to_ops = [t.op for t in ys] + from_ops = [t.op for t in xs] + pending_count, has_control_flow = _PendingCount( + ops.get_default_graph(), to_ops, from_ops) + + # Iterate over the collected ops. + # + # grads: op => list of gradients received on each output endpoint of the + # op. The gradients for each endpoint are initially collected as a list. + # When it is time to call the op's gradient function, for each endpoint we + # aggregate the list of received gradients into a Add() Operation if there + # is more than one. + grads = {} + + # Add the initial gradients for the ys. + for y, grad_y in zip(ys, grad_ys): + _SetGrad(grads, y, grad_y) + + # Initialize queue with to_ops. + queue = collections.deque() + # Add the ops in 'to_ops' into the queue. + to_ops_set = set() + for op in to_ops: + if op._id not in to_ops_set: + to_ops_set.add(op._id) + queue.append(op) + # The set of 'from_ops'. + stop_ops = _StopOps(from_ops, pending_count) + while queue: + # generate gradient subgraph for op. + op = queue.popleft() + with ops.device(_GetGradsDevice(op, colocate_gradients_with_ops)): + if has_control_flow: + control_flow_ops.EnterGradWhileContext(op) + out_grads = _AggregatedGrads(grads, op, has_control_flow, + aggregation_method) + grad_fn = None + if any(out_grads) and op._id not in stop_ops: + # A grad_fn must be defined, either as a function or as None + # for ops that do not have gradients. + try: + grad_fn = ops.get_gradient_function(op) + except LookupError: + raise LookupError( + "No gradient defined for operation '%s' (op type: %s)" % + (op.name, op.type)) + if grad_fn and any(out_grads): + # NOTE: If _AggregatedGrads didn't compute a value for the i'th + # output, it means that the cost does not depend on output[i], + # therefore dC/doutput[i] is 0. 
+ for i, out_grad in enumerate(out_grads): + if (not out_grad + and types.as_dtype(op.outputs[i].dtype).base_dtype in ( + types.float32, types.float64)): + # Only floating-point outputs get a zero gradient. Gradient + # functions should ignore the gradient for other outputs. + out_grads[i] = array_ops.zeros_like(op.outputs[i]) + with ops.name_scope(op.name + "_grad"): + # pylint: disable=protected-access + with ops.get_default_graph()._original_op(op): + # pylint: enable=protected-access + op_wrapper = op + if has_control_flow: + op_wrapper = control_flow_ops.MakeWrapper(op) + in_grads = _AsList(grad_fn(op_wrapper, *out_grads)) + _VerifyGeneratedGradients(in_grads, op) + if gate_gradients and len(in_grads) > 1: + in_grads = control_flow_ops.tuple(in_grads) + logging.vlog(1, "Gradient for '" + op.name + "'") + logging.vlog(1, " in --> %s", + ", ".join([x.name for x in out_grads if x])) + logging.vlog(1, " out --> %s", + ", ".join([x.name for x in in_grads if x])) + else: + # If no grad_fn is defined or none of out_grads is available, + # just propagates a list of None backwards. + in_grads = [None] * len(op.inputs) + for t_in, in_grad in zip(op.inputs, in_grads): + if in_grad: + _SetGrad(grads, t_in, in_grad) + if has_control_flow: + control_flow_ops.ExitGradWhileContext(op) + + # update pending count for the inputs of op. + for x in op.inputs: + pending_count[x.op._id] -= 1 + ready = (pending_count[x.op._id] == 0) + if has_control_flow and not ready: + ready = (pending_count[x.op._id] > 0 and + control_flow_ops.IsLoopSwitch(x.op)) + if ready: + queue.append(x.op) + for x in op.control_inputs: + pending_count[x._id] -= 1 + if pending_count[x._id] is 0: + queue.append(x) + return [_GetGrad(grads, x) for x in xs] + + +def _SetGrad(grads, t, grad): + """Sets gradient "grad" in "grads" for tensor "t".""" + op = t.op + op_grads = grads.get(op) + if not op_grads: + op_grads = [[] for _ in xrange(len(op.outputs))] + grads[op] = op_grads + t_grads = op_grads[t.value_index] + if isinstance(t_grads, list): + t_grads.append(grad) + else: + assert op.type == "Switch" + op_grads[t.value_index] = grad + + +def _GetGrad(grads, t): + """Gets gradient for tensor "t".""" + op = t.op + op_grads = grads.get(op) + if not op_grads: return None + t_grad = op_grads[t.value_index] + assert not isinstance(t_grad, list), ( + "gradients list should have been aggregated by now.") + return t_grad + + +def _GetGrads(grads, op): + """Gets all gradients for op.""" + if op in grads: + return grads[op] + else: + return [[] for _ in xrange(len(op.outputs))] + + +def _HandleNestedIndexedSlices(grad): + assert isinstance(grad, ops.IndexedSlices) + if isinstance(grad.values, ops.Tensor): + return grad + else: + assert isinstance(grad.values, ops.IndexedSlices) + g = _HandleNestedIndexedSlices(grad.values) + return ops.IndexedSlices( + g.values, array_ops.gather(grad.indices, g.indices), g.dense_shape) + + +def _AccumulatorShape(inputs): + shape = tensor_shape.unknown_shape() + for i in inputs: + if isinstance(i, ops.Tensor): + shape = shape.merge_with(i.get_shape()) + return shape + + +class AggregationMethod(object): + """A class listing aggregation methods used to combine gradients. + + Computing partial derivatives can require aggregating gradient + contributions. This class lists the various methods that can + be used to combine gradients in the graph: + + * `ADD_N`: All of the gradient terms are summed as part of one + operation using the "AddN" op. 
It has the property that all + gradients must be ready before any aggregation is performed. + * `DEFAULT`: The system-chosen default aggregation method. + """ + ADD_N = 0 + DEFAULT = ADD_N + # The following are experimental and may not be supported in future releases. + EXPERIMENTAL_TREE = 1 + EXPERIMENTAL_ACCUMULATE_N = 2 + + +def _AggregatedGrads(grads, op, has_control_flow, aggregation_method=None): + """Get the aggregated gradients for op. + + Args: + grads: The map of memoized gradients. + op: The op to get gradients for. + has_control_flow: True iff the graph contains control flow ops. + aggregation_method: Specifies the method used to combine gradient terms. + Accepted values are constants defined in the class `AggregationMethod`. + + Returns: + A list of gradients, one per each output of `op`. If the gradients + for a particular output is a list, this function aggregates it + before returning. + + Raises: + TypeError: if the incoming grads are not Tensors or IndexedSlices. + ValueError: if the arguments are invalid. + + """ + if aggregation_method is None: + aggregation_method = AggregationMethod.DEFAULT + if aggregation_method not in [AggregationMethod.ADD_N, + AggregationMethod.EXPERIMENTAL_TREE, + AggregationMethod.EXPERIMENTAL_ACCUMULATE_N]: + raise ValueError("Invalid aggregation_method specified.") + out_grads = _GetGrads(grads, op) + for i, out_grad in enumerate(out_grads): + if has_control_flow: + if isinstance(out_grad, (ops.Tensor, ops.IndexedSlices)): + assert op.type == "Switch" + continue + # Grads have to be Tensors or IndexedSlices + if not all([isinstance(g, (ops.Tensor, ops.IndexedSlices)) + for g in out_grad if g]): + raise TypeError("gradients have to be either all Tensors " + "or all IndexedSlices") + # Aggregate multiple gradients, and convert [] to None. + if out_grad: + if all([isinstance(g, ops.Tensor) for g in out_grad if g]): + tensor_shape = _AccumulatorShape(out_grad) + if len(out_grad) < 2: + used = "nop" + out_grads[i] = out_grad[0] + elif (aggregation_method == AggregationMethod.EXPERIMENTAL_ACCUMULATE_N + and len(out_grad) > 2 and tensor_shape.is_fully_defined()): + # The benefit of using AccumulateN is that its inputs can be combined + # in any order and this can allow the expression to be evaluated with + # a smaller memory footprint. When used with gpu_allocator_retry, + # it is possible to compute a sum of terms which are much larger than + # total GPU memory. + # AccumulateN can currently only be used if we know the shape for + # an accumulator variable. If this is not known, or if we only have + # 2 grads then we fall through to the "tree" case below. + used = "accumulate_n" + out_grads[i] = math_ops.accumulate_n(out_grad) + elif aggregation_method in [AggregationMethod.EXPERIMENTAL_TREE, + AggregationMethod.EXPERIMENTAL_ACCUMULATE_N + ]: + # Aggregate all gradients by doing pairwise sums: this may + # reduce performance, but it can improve memory because the + # gradients can be released earlier. + # + # TODO(vrv): Consider replacing this with a version of + # tf.AddN() that eagerly frees its inputs as soon as they are + # ready, so the order of this tree does not become a problem. 
+ used = "tree" + with ops.name_scope(op.name + "_gradient_sum"): + running_sum = out_grad[0] + for grad in out_grad[1:]: + running_sum = math_ops.add_n([running_sum, grad]) + out_grads[i] = running_sum + else: + used = "add_n" + out_grads[i] = math_ops.add_n(out_grad) + logging.vlog(2, " _AggregatedGrads %d x %s using %s", len(out_grad), + tensor_shape, used) + else: + out_grad = math_ops._as_indexed_slices_list([g for g in out_grad if g]) + out_grad = [_HandleNestedIndexedSlices(x) for x in out_grad] + # Form IndexedSlices out of the concatenated values and + # indices. + out_grads[i] = ops.IndexedSlices( + array_ops.concat(0, [x.values for x in out_grad]), + array_ops.concat(0, [x.indices for x in out_grad]), + out_grad[0].dense_shape) + else: + out_grads[i] = [] + return out_grads + + +# TODO(vrv): Make this available when we want to make it public. +def _hessian_vector_product(ys, xs, v): + """Multiply the Hessian of `ys` wrt `xs` by `v`. + + This is an efficient construction that uses a backprop-like approach + to compute the product between the Hessian and another vector. The + Hessian is usually too large to be explicitly computed or even + represented, but this method allows us to at least multiply by it + for the same big-O cost as backprop. + + Implicit Hessian-vector products are the main practical, scalable way + of using second derivatives with neural networks. They allow us to + do things like construct Krylov subspaces and approximate conjugate + gradient descent. + + Example: if `y` = 1/2 `x`^T A `x`, then `hessian_vector_product(y, + x, v)` will return an expression that evaluates to the same values + as (A + A.T) `v`. + + Args: + ys: A scalar value, or a tensor or list of tensors to be summed to + yield a scalar. + xs: A list of tensors that we should construct the Hessian over. + v: A list of tensors, with the same shapes as xs, that we want to + multiply by the Hessian. + + Returns: + A list of tensors (or if the list would be length 1, a single tensor) + containing the product between the Hessian and `v`. + + Raises: + ValueError: `xs` and `v` have different length. + + """ + + # Validate the input + length = len(xs) + if len(v) != length: + raise ValueError("xs and v must have the same length.") + + # First backprop + grads = gradients(ys, xs) + + assert len(grads) == length + elemwise_products = [math_ops.mul(grad_elem, array_ops.stop_gradient(v_elem)) + for grad_elem, v_elem in zip(grads, v) + if grad_elem is not None] + + # Second backprop + return gradients(elemwise_products, xs) |