Diffstat (limited to 'tensorflow/python/training/optimizer.py')
-rw-r--r-- | tensorflow/python/training/optimizer.py | 426
1 file changed, 426 insertions, 0 deletions
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
new file mode 100644
index 0000000000..1186117169
--- /dev/null
+++ b/tensorflow/python/training/optimizer.py
@@ -0,0 +1,426 @@
+"""Base class for optimizers."""
+# pylint: disable=g-bad-name
+import types
+
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import types as tf_types
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gradients
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variables
+
+
+class Optimizer(object):
+  """Base class for optimizers.
+
+  This class defines the API to add Ops to train a model.  You never use this
+  class directly, but instead instantiate one of its subclasses such as
+  `GradientDescentOptimizer`, `AdagradOptimizer`, or `MomentumOptimizer`.
+
+  ### Usage
+
+  ```
+  # Create an optimizer with the desired parameters.
+  opt = GradientDescentOptimizer(learning_rate=0.1)
+  # Add Ops to the graph to minimize a cost by updating a list of variables.
+  # "cost" is a Tensor, and the list of variables contains variables.Variable
+  # objects.
+  opt_op = opt.minimize(cost, var_list=<list of variables>)
+  ```
+
+  In the training program you will just have to run the returned Op.
+
+  ```
+  # Execute opt_op to do one step of training:
+  opt_op.run()
+  ```
+
+  ### Processing gradients before applying them.
+
+  Calling `minimize()` takes care of both computing the gradients and
+  applying them to the variables.  If you want to process the gradients
+  before applying them you can instead use the optimizer in three steps:
+
+  1.  Compute the gradients with `compute_gradients()`.
+  2.  Process the gradients as you wish.
+  3.  Apply the processed gradients with `apply_gradients()`.
+
+  Example:
+
+  ```
+  # Create an optimizer.
+  opt = GradientDescentOptimizer(learning_rate=0.1)
+
+  # Compute the gradients for a list of variables.
+  grads_and_vars = opt.compute_gradients(loss, <list of variables>)
+
+  # grads_and_vars is a list of tuples (gradient, variable).  Do whatever you
+  # need to the 'gradient' part, for example cap them, etc.
+  capped_grads_and_vars = [(MyCapper(gv[0]), gv[1]) for gv in grads_and_vars]
+
+  # Ask the optimizer to apply the capped gradients.
+  opt.apply_gradients(capped_grads_and_vars)
+  ```
+
+  @@__init__
+
+  @@minimize
+  @@compute_gradients
+  @@apply_gradients
+
+  ### Gating Gradients
+
+  Both `minimize()` and `compute_gradients()` accept a `gate_gradients`
+  argument that controls the degree of parallelism during the application of
+  the gradients.
+
+  The possible values are: `GATE_NONE`, `GATE_OP`, and `GATE_GRAPH`.
+
+  <b>GATE_NONE</b>: Compute and apply gradients in parallel.  This provides
+  the maximum parallelism in execution, at the cost of some
+  non-reproducibility in the results.  For example the two gradients of
+  MatMul depend on the input values: With `GATE_NONE` one of the gradients
+  could be applied to one of the inputs _before_ the other gradient is
+  computed, resulting in non-reproducible results.
+
+  <b>GATE_OP</b>: For each Op, make sure all gradients are computed before
+  they are used.  This prevents race conditions for Ops that generate
+  gradients for multiple inputs where the gradients depend on the inputs.
+
+  <b>GATE_GRAPH</b>: Make sure all gradients for all variables are computed
+  before any one of them is used.  This provides the least parallelism but
+  can be useful if you want to process all gradients before applying any of
+  them.
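+
+  For example (an illustrative sketch, reusing the `opt` and `cost` names
+  from the usage example above):
+
+  ```
+  # Gate all gradients of the graph before applying any of them:
+  opt_op = opt.minimize(cost, var_list=<list of variables>,
+                        gate_gradients=GradientDescentOptimizer.GATE_GRAPH)
+  ```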
+
+  ### Slots
+
+  Some optimizer subclasses, such as `MomentumOptimizer` and
+  `AdagradOptimizer`, allocate and manage additional variables associated
+  with the variables to train.  These are called <i>Slots</i>.  Slots have
+  names and you can ask the optimizer for the names of the slots that it
+  uses.  Once you have a slot name you can ask the optimizer for the
+  variable it created to hold the slot value.
+
+  This can be useful if you want to log or debug a training algorithm,
+  report stats about the slots, etc.
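+
+  For example (an illustrative sketch; `opt` is assumed to be a
+  `MomentumOptimizer` that was already used to minimize a model, and
+  `my_var` one of the trained variables):
+
+  ```
+  # The list of slot names the optimizer maintains, e.g. ["momentum"].
+  slot_names = opt.get_slot_names()
+  # The accumulator variable the optimizer created for `my_var`.
+  momentum_slot = opt.get_slot(my_var, "momentum")
+  ```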
+
+  @@get_slot_names
+  @@get_slot
+  """
+
+  # Values for gate_gradients.
+  GATE_NONE = 0
+  GATE_OP = 1
+  GATE_GRAPH = 2
+
+  def __init__(self, use_locking, name):
+    """Create a new Optimizer.
+
+    This must be called by the constructors of subclasses.
+
+    Args:
+      use_locking: Bool.  If True, use locks to prevent concurrent updates
+        to variables.
+      name: A non-empty string.  The name to use for accumulators created
+        for the optimizer.
+
+    Raises:
+      ValueError: if name is malformed.
+    """
+    if not name:
+      raise ValueError("Must specify the optimizer name")
+    self._use_locking = use_locking
+    self._name = name
+    # Dictionary of slots.
+    #  {slot_name : { variable_to_train: slot_for_the_variable, ...}, ... }
+    self._slots = {}
+
+  def minimize(self, loss, global_step=None, var_list=None,
+               gate_gradients=GATE_OP, name=None):
+    """Add operations to minimize 'loss' by updating 'var_list'.
+
+    This method simply combines calls to compute_gradients() and
+    apply_gradients().  If you want to process the gradients before applying
+    them, call compute_gradients() and apply_gradients() explicitly instead
+    of using this function.
+
+    Args:
+      loss: A Tensor containing the value to minimize.
+      global_step: Optional Variable to increment by one after the
+        variables have been updated.
+      var_list: Optional list of variables.Variable to update to minimize
+        'loss'.  Defaults to the list of variables collected in the graph
+        under the key GraphKeys.TRAINABLE_VARIABLES.
+      gate_gradients: How to gate the computation of gradients.  Can be
+        GATE_NONE, GATE_OP, or GATE_GRAPH.
+      name: Optional name for the returned operation.
+
+    Returns:
+      An Operation that updates the variables in 'var_list'.  If
+      'global_step' was not None, that operation also increments
+      global_step.
+
+    Raises:
+      ValueError: if some of the variables are not variables.Variable
+        objects.
+    """
+    grads_and_vars = self.compute_gradients(loss, var_list=var_list,
+                                            gate_gradients=gate_gradients)
+    return self.apply_gradients(grads_and_vars, global_step=global_step,
+                                name=name)
+
+  def compute_gradients(self, loss, var_list=None, gate_gradients=GATE_OP):
+    """Compute gradients of "loss" for the variables in "var_list".
+
+    This is the first part of minimize().  It returns a list
+    of (gradient, variable) pairs where "gradient" is the gradient
+    for "variable".  Note that "gradient" can be a Tensor, an
+    IndexedSlices, or None if there is no gradient for the
+    given variable.
+
+    Args:
+      loss: A Tensor containing the value to minimize.
+      var_list: Optional list of variables.Variable to update to minimize
+        "loss".  Defaults to the list of variables collected in the graph
+        under the key GraphKeys.TRAINABLE_VARIABLES.
+      gate_gradients: How to gate the computation of gradients.  Can be
+        GATE_NONE, GATE_OP, or GATE_GRAPH.
+
+    Returns:
+      A list of (gradient, variable) pairs.
+
+    Raises:
+      TypeError: If var_list contains anything other than
+        variables.Variable objects.
+      ValueError: If some arguments are invalid.
+    """
+    if gate_gradients not in [Optimizer.GATE_NONE, Optimizer.GATE_OP,
+                              Optimizer.GATE_GRAPH]:
+      raise ValueError("gate_gradients must be one of: Optimizer.GATE_NONE, "
+                       "Optimizer.GATE_OP, Optimizer.GATE_GRAPH.  Not %s" %
+                       gate_gradients)
+    self._assert_valid_dtypes([loss])
+    if var_list is None:
+      var_list = variables.trainable_variables()
+    for var in var_list:
+      if not isinstance(var, variables.Variable):
+        raise TypeError("Argument is not a variables.Variable: %s" % var)
+    grads = gradients.gradients(
+        loss, var_list, gate_gradients=(gate_gradients == Optimizer.GATE_OP))
+    if gate_gradients == Optimizer.GATE_GRAPH:
+      grads = control_flow_ops.tuple(grads)
+    grads_and_vars = zip(grads, var_list)
+    self._assert_valid_dtypes([v for g, v in grads_and_vars if g is not None])
+    return grads_and_vars
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
+    """Apply gradients to variables.
+
+    This is the second part of minimize().  It returns an Operation that
+    applies gradients.
+
+    Args:
+      grads_and_vars: List of (gradient, variable) pairs as returned by
+        compute_gradients().
+      global_step: Optional Variable to increment by one after the
+        variables have been updated.
+      name: Optional name for the returned operation.  Defaults to the
+        name passed to the Optimizer constructor.
+
+    Returns:
+      An Operation that applies the specified gradients.  If 'global_step'
+      was not None, that operation also increments global_step.
+
+    Raises:
+      TypeError: if grads_and_vars is malformed.
+    """
+    # This is a default implementation of apply_gradients() that can be
+    # shared by most optimizers.  It relies on the subclass implementing the
+    # following methods: _create_slots(), _prepare(), _apply_dense(), and
+    # _apply_sparse().
+    for g, v in grads_and_vars:
+      if not isinstance(g, (ops.Tensor, ops.IndexedSlices, types.NoneType)):
+        raise TypeError(
+            "Gradient must be a Tensor, IndexedSlices, or None: %s" % g)
+      if not isinstance(v, variables.Variable):
+        raise TypeError(
+            "Variable must be a variables.Variable: %s" % v)
+      if g is not None:
+        self._assert_valid_dtypes([g, v])
+    self._create_slots([v for g, v in grads_and_vars if g is not None])
+    update_ops = []
+    with ops.op_scope([], name, self._name) as name:
+      self._prepare()
+      for grad, var in grads_and_vars:
+        if grad is None:
+          continue
+        with ops.name_scope("update_" + var.op.name), ops.device(var.device):
+          if isinstance(grad, ops.Tensor):
+            update_ops.append(self._apply_dense(grad, var))
+          else:
+            update_ops.append(self._apply_sparse(grad, var))
+      if global_step is None:
+        return self._finish(update_ops, name)
+      else:
+        with ops.control_dependencies([self._finish(update_ops, "update")]):
+          with ops.device(global_step.device):
+            return state_ops.assign_add(global_step, 1, name=name).op
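+
+  # For example, the two-step form with a step counter might look like this
+  # (an illustrative sketch; `opt`, `loss`, and `step` are assumed to exist):
+  #
+  #   grads_and_vars = opt.compute_gradients(loss)
+  #   train_op = opt.apply_gradients(grads_and_vars, global_step=step)
+  #
+  # Running `train_op` applies the gradients and increments `step` by one.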
+ """ + named_slots = self._slots.get(name, None) + if not named_slots: + return None + return named_slots.get(var, None) + + def get_slot_names(self): + """Return a list of the names of slots created by the Optimizer. + + See get_slot(). + + Returns: + A list of strings. + """ + return sorted(self._slots.keys()) + + def _assert_valid_dtypes(self, tensors): + """Asserts tensors are all valid types (see _valid_dtypes). + + Args: + tensors: tensors to check. + Raises: + ValueError: if any tensor is not a valid type. + """ + valid_dtypes = self._valid_dtypes() + for t in tensors: + dtype = t.dtype.base_dtype + if dtype not in valid_dtypes: + raise ValueError( + "Invalid type %s for %s, expected: %s." % ( + dtype, t.name, [v for v in valid_dtypes])) + + # -------------- + # Methods to be implemented by subclasses if they want to use the + # inherited implementation of apply_gradients() or compute_gradients(). + # -------------- + def _valid_dtypes(self): + """Valid types for loss, variables and gradients. + + Defaults to float32. Subclasses should override to allow other types. + + Returns: + Valid types for loss, variables and gradients. + """ + return set([tf_types.float32]) + + def _create_slots(self, var_list): + """Create all slots needed by the variables. + + Args: + var_list: A list of variables.Variable. + """ + # No slots needed by default + pass + + def _prepare(self): + """Create all needed tensors before applying gradients. + + This is called with the name_scope using the "name" that + users have chosen for the application of gradients. + """ + pass + + def _apply_dense(self, grad, var): + """Add ops to apply dense gradients to "var". + + Args: + grad: A Tensor. + var: A variables.Variable. + + Return: + An Operation. + """ + raise NotImplementedError() + + def _apply_sparse(self, grad, var): + """Add ops to apply sparse gradients to "var". + + Args: + grad: IndexedSlices. + var: A variables.Variable. + + Return: + An Operation. + """ + raise NotImplementedError() + + def _finish(self, update_ops, name_scope): + """Do what is needed to finish the update. + + This is called with the name_scope using the "name" that + users have chosen for the application of gradients. + + Args: + update_ops: List of Operations to update variables. This list contains + the values returned by the _apply_dense() and _apply_sparse() calls. + name_scope: string. Name to use for the returned operation. + + Returns: + The operation to apply updates. + """ + return control_flow_ops.group(*update_ops, name=name_scope) + + # -------------- + # Utility methods for subclasses. + # -------------- + + def _get_or_make_slot(self, var, val, slot_name, op_name): + """Find or create a slot for a variable. + + Args: + var: A variables.Variable. + val: A Tensor. The initial value of the slot. + slot_name: Name for the slot. + op_name: Name to use when scoping the Variable that + needs to be created for the slot. + + Returns: + A variables.Variable. + """ + named_slots = self._slots.get(slot_name, None) + if named_slots is None: + named_slots = {} + self._slots[slot_name] = named_slots + slot = named_slots.get(var, None) + if slot is None: + # Scope the slot name in the namespace of the Variable and + # create the slot on the same device as the variable. 
+
+  # --------------
+  # Utility methods for subclasses.
+  # --------------
+
+  def _get_or_make_slot(self, var, val, slot_name, op_name):
+    """Find or create a slot for a variable.
+
+    Args:
+      var: A variables.Variable.
+      val: A Tensor.  The initial value of the slot.
+      slot_name: Name for the slot.
+      op_name: Name to use when scoping the Variable that
+        needs to be created for the slot.
+
+    Returns:
+      A variables.Variable.
+    """
+    named_slots = self._slots.get(slot_name, None)
+    if named_slots is None:
+      named_slots = {}
+      self._slots[slot_name] = named_slots
+    slot = named_slots.get(var, None)
+    if slot is None:
+      # Scope the slot name in the namespace of the Variable and
+      # create the slot on the same device as the variable.
+      with ops.name_scope(var.op.name + "/" + op_name) as scope:
+        with ops.device(var.device):
+          slot = variables.Variable(val, name=scope, trainable=False)
+      named_slots[var] = slot
+    return slot
+
+  def _zeros_slot(self, var, slot_name, op_name):
+    """Find or create a slot initialized with 0.0.
+
+    Args:
+      var: A variables.Variable.
+      slot_name: Name for the slot.
+      op_name: Name to use when scoping the Variable that
+        needs to be created for the slot.
+
+    Returns:
+      A variables.Variable.
+    """
+    val = array_ops.zeros(var.get_shape().as_list(), dtype=var.dtype)
+    return self._get_or_make_slot(var, val, slot_name, op_name)
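As a usage note on the slot utilities above, a subclass would typically pair `_zeros_slot()` in `_create_slots()` with a `get_slot()` lookup in `_apply_dense()`.  A minimal sketch (the class name, hyperparameters, and update rule below are hypothetical, not taken from this diff):

```
# Hypothetical subclass sketch: one zero-initialized "momentum" slot per
# trained variable, reused across training steps.
class MyMomentumOptimizer(Optimizer):
  def __init__(self, learning_rate, momentum, name="MyMomentum"):
    super(MyMomentumOptimizer, self).__init__(False, name)
    self._learning_rate = learning_rate
    self._momentum = momentum

  def _create_slots(self, var_list):
    # Called by apply_gradients() before any update op is built.
    for v in var_list:
      self._zeros_slot(v, "momentum", self._name)

  def _apply_dense(self, grad, var):
    accum = self.get_slot(var, "momentum")
    # accum <- momentum * accum + grad, then var <- var - lr * accum.
    accum_t = state_ops.assign(accum, accum * self._momentum + grad,
                               use_locking=self._use_locking)
    return state_ops.assign_sub(var, self._learning_rate * accum_t,
                                use_locking=self._use_locking)
```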