"""Base class for optimizers."""
# pylint: disable=g-bad-name
import types

from tensorflow.python.framework import ops
from tensorflow.python.framework import types as tf_types
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import gradients
from tensorflow.python.ops import state_ops
from tensorflow.python.ops import variables


class Optimizer(object):
  """Base class for optimizers.

  This class defines the API to add Ops to train a model.  You never use this
  class directly, but instead instantiate one of its subclasses such as
  `GradientDescentOptimizer`, `AdagradOptimizer`, or `MomentumOptimizer`.

  ### Usage

  ```
  # Create an optimizer with the desired parameters.
  opt = GradientDescentOptimizer(learning_rate=0.1)
  # Add Ops to the graph to minimize a cost by updating a list of variables.
  # "cost" is a Tensor, and the list of variables contains variables.Variable
  # objects.
  opt_op = opt.minimize(cost, <list of variables>)
  ```

  In the training program you will just have to run the returned Op.

  ```
  # Execute opt_op to do one step of training:
  opt_op.run()
  ```

  ### Processing gradients before applying them.

  Calling `minimize()` takes care of both computing the gradients and
  applying them to the variables.  If you want to process the gradients
  before applying them you can instead use the optimizer in three steps:

  1.  Compute the gradients with `compute_gradients()`.
  2.  Process the gradients as you wish.
  3.  Apply the processed gradients with `apply_gradients()`.

  Example:

  ```
  # Create an optimizer.
  opt = GradientDescentOptimizer(learning_rate=0.1)

  # Compute the gradients for a list of variables.
  grads_and_vars = opt.compute_gradients(loss, <list of variables>)

  # grads_and_vars is a list of tuples (gradient, variable).  Do whatever you
  # need to the 'gradient' part, for example cap them, etc.
  capped_grads_and_vars = [(MyCapper(gv[0]), gv[1]) for gv in grads_and_vars]

  # Ask the optimizer to apply the capped gradients.
  opt.apply_gradients(capped_grads_and_vars)
  ```

  @@__init__

  @@minimize
  @@compute_gradients
  @@apply_gradients

  ### Gating Gradients

  Both `minimize()` and `compute_gradients()` accept a `gate_gradients`
  argument that controls the degree of parallelism during the application of
  the gradients.

  The possible values are: `GATE_NONE`, `GATE_OP`, and `GATE_GRAPH`.

  <b>GATE_NONE</b>: Compute and apply gradients in parallel.  This provides the
  maximum parallelism in execution, at the cost of some non-reproducibility in
  the results.  For example, the two gradients of MatMul depend on the input
  values: with `GATE_NONE` one of the gradients could be applied to one of the
  inputs _before_ the other gradient is computed, resulting in non-reproducible
  results.

  <b>GATE_OP</b>: For each Op, make sure all gradients are computed before they
  are used.  This prevents race conditions for Ops that generate gradients for
  multiple inputs where the gradients depend on the inputs.

  <b>GATE_GRAPH</b>: Make sure all gradients for all variables are computed
  before any one of them is used.  This provides the least parallelism but can
  be useful if you want to process all gradients before applying any of them.
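  For example, to make sure all gradients are computed before any of them is
  applied (a sketch; `cost` is a loss Tensor as in the examples above):

  ```
  opt = GradientDescentOptimizer(learning_rate=0.1)
  opt_op = opt.minimize(cost,
                        gate_gradients=GradientDescentOptimizer.GATE_GRAPH)
  ```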

  ### Slots

  Some optimizer subclasses, such as `MomentumOptimizer` and `AdagradOptimizer`,
  allocate and manage additional variables associated with the variables to
  train.  These are called <i>Slots</i>.  Slots have names and you can ask the
  optimizer for the names of the slots that it uses.  Once you have a slot name
  you can ask the optimizer for the variable it created to hold the slot value.

  This can be useful if you want to debug a training algorithm, report stats
  about the slots, etc.
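
  For example (a sketch; `opt` is assumed to be a `MomentumOptimizer` instance
  and `var` one of the variables being trained):

  ```
  print(opt.get_slot_names())           # e.g. ["momentum"]
  momentum_var = opt.get_slot(var, "momentum")
  ```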

  @@get_slot_names
  @@get_slot
  """

  # Values for gate_gradients.
  GATE_NONE = 0
  GATE_OP = 1
  GATE_GRAPH = 2

  def __init__(self, use_locking, name):
    """Create a new Optimizer.

    This must be called by the constructors of subclasses.

    Args:
      use_locking: Bool.  If True, use locks to prevent concurrent updates
        to variables.
      name: A non-empty string.  The name to use for accumulators created
        for the optimizer.

    Raises:
      ValueError: if name is malformed.
    """
    if not name:
      raise ValueError("Must specify the optimizer name")
    self._use_locking = use_locking
    self._name = name
    # Dictionary of slots.
    #  {slot_name : { variable_to_train: slot_for_the_variable, ...}, ... }
    self._slots = {}

  def minimize(self, loss, global_step=None, var_list=None,
               gate_gradients=GATE_OP, name=None):
    """Add operations to minimize 'loss' by updating 'var_list'.

    This method simply combines calls to compute_gradients() and
    apply_gradients().  If you want to process the gradients before applying
    them, call compute_gradients() and apply_gradients() explicitly instead
    of using this function.
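
    For example, to also keep a step counter (a sketch; "cost" is a loss
    Tensor and "opt" an optimizer instance as in the class docstring):

      global_step = variables.Variable(0, trainable=False, name="global_step")
      train_op = opt.minimize(cost, global_step=global_step)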

    Args:
      loss: A Tensor containing the value to minimize.
      global_step: Optional Variable to increment by one after the
        variables have been updated.
      var_list: Optional list of variables.Variable to update to minimize
        'loss'.  Defaults to the list of variables collected in the graph
        under the key GraphKeys.TRAINABLE_VARIABLES.
      gate_gradients: How to gate the computation of gradients.  Can be
        GATE_NONE, GATE_OP, or GATE_GRAPH.
      name: Optional name for the returned operation.

    Returns:
      An Operation that updates the variables in 'var_list'.  If 'global_step'
      was not None, that operation also increments global_step.

    Raises:
      ValueError: if some of the variables are not variables.Variable objects.
    """
    grads_and_vars = self.compute_gradients(loss, var_list=var_list,
                                            gate_gradients=gate_gradients)
    return self.apply_gradients(grads_and_vars, global_step=global_step,
                                name=name)

  def compute_gradients(self, loss, var_list=None, gate_gradients=GATE_OP):
    """Compute gradients of "loss" for the variables in "var_list".

    This is the first part of minimize().  It returns a list
    of (gradient, variable) pairs where "gradient" is the gradient
    for "variable".  Note that "gradient" can be a Tensor, a
    IndexedSlices, or None if there is no gradient for the
    given variable.

    Args:
      loss: A Tensor containing the value to minimize.
      var_list: Optional list of variables.Variable to update to minimize
        "loss".  Defaults to the list of variables collected in the graph
        under the key GraphKeys.TRAINABLE_VARIABLES.
      gate_gradients: How to gate the computation of gradients.  Can be
        GATE_NONE, GATE_OP, or GATE_GRAPH.

    Returns:
      A list of (gradient, variable) pairs.

    Raises:
      TypeError: If var_list contains anything other than variables.Variable.
      ValueError: If some arguments are invalid.
    """
    if gate_gradients not in [Optimizer.GATE_NONE, Optimizer.GATE_OP,
                              Optimizer.GATE_GRAPH]:
      raise ValueError("gate_gradients must be one of: Optimizer.GATE_NONE, "
                       "Optimizer.GATE_OP, Optimizer.GATE_GRAPH.  Not %s" %
                       gate_gradients)
    self._assert_valid_dtypes([loss])
    if var_list is None:
      var_list = variables.trainable_variables()
    for var in var_list:
      if not isinstance(var, variables.Variable):
        raise TypeError("Argument is not a variables.Variable: %s" % var)
    grads = gradients.gradients(
        loss, var_list, gate_gradients=(gate_gradients == Optimizer.GATE_OP))
    if gate_gradients == Optimizer.GATE_GRAPH:
      grads = control_flow_ops.tuple(grads)
    grads_and_vars = zip(grads, var_list)
    self._assert_valid_dtypes([v for g, v in grads_and_vars if g is not None])
    return grads_and_vars

  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """Apply gradients to variables.

    This is the second part of minimize(). It returns an Operation that
    applies gradients.

    Args:
      grads_and_vars: List of (gradient, variable) pairs as returned by
        compute_gradients().
      global_step: Optional Variable to increment by one after the
        variables have been updated.
      name: Optional name for the returned operation.  Defaults to the
        name passed to the Optimizer constructor.

    Returns:
      An Operation that applies the specified gradients. If 'global_step'
      was not None, that operation also increments global_step.

    Raises:
      TypeError: if grads_and_vars is malformed.
    """
    # This is a default implementation of apply_gradients() that can be shared
    # by most optimizers.  It relies on the subclass implementing the following
    # methods: _create_slots(), _prepare(), _apply_dense(), and _apply_sparse().
    for g, v in grads_and_vars:
      if not isinstance(g, (ops.Tensor, ops.IndexedSlices, types.NoneType)):
        raise TypeError(
            "Gradient must be a Tensor, IndexedSlices, or None: %s" % g)
      if not isinstance(v, variables.Variable):
        raise TypeError(
            "Variable must be a variables.Variable: %s" % v)
      if g is not None:
        self._assert_valid_dtypes([g, v])
    self._create_slots([v for g, v in grads_and_vars if g is not None])
    update_ops = []
    with ops.op_scope([], name, self._name) as name:
      self._prepare()
      for grad, var in grads_and_vars:
        if grad is None:
          continue
        with ops.name_scope("update_" + var.op.name), ops.device(var.device):
          if isinstance(grad, ops.Tensor):
            update_ops.append(self._apply_dense(grad, var))
          else:
            update_ops.append(self._apply_sparse(grad, var))
      if global_step is None:
        return self._finish(update_ops, name)
      else:
        with ops.control_dependencies([self._finish(update_ops, "update")]):
          with ops.device(global_step.device):
            return state_ops.assign_add(global_step, 1, name=name).op

  def get_slot(self, var, name):
    """Return a slot named "name" created for "var" by the Optimizer.

    Some Optimizer subclasses use additional variables.  For example
    Momentum and Adagrad use variables to accumulate updates.  This method
    gives access to these Variables if for some reason you need them.

    Use get_slot_names() to get the list of slot names created by the Optimizer.

    Args:
      var: A variable passed to minimize() or apply_gradients().
      name: A string.

    Returns:
      The Variable for the slot if it was created, None otherwise.
    """
    named_slots = self._slots.get(name, None)
    if not named_slots:
      return None
    return named_slots.get(var, None)

  def get_slot_names(self):
    """Return a list of the names of slots created by the Optimizer.

    See get_slot().

    Returns:
      A list of strings.
    """
    return sorted(self._slots.keys())

  def _assert_valid_dtypes(self, tensors):
    """Asserts tensors are all valid types (see _valid_dtypes).

    Args:
      tensors: tensors to check.
    Raises:
      ValueError: if any tensor is not a valid type.
    """
    valid_dtypes = self._valid_dtypes()
    for t in tensors:
      dtype = t.dtype.base_dtype
      if dtype not in valid_dtypes:
        raise ValueError(
            "Invalid type %s for %s, expected: %s." % (
                dtype, t.name, [v for v in valid_dtypes]))

  # --------------
  # Methods to be implemented by subclasses if they want to use the
  # inherited implementation of apply_gradients() or compute_gradients().
  # --------------
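  #
  # A minimal dense-only subclass might look like the following sketch (the
  # class name and update rule are illustrative only, not part of this
  # module):
  #
  #   class MySGDOptimizer(Optimizer):
  #     def __init__(self, learning_rate, use_locking=False, name="MySGD"):
  #       super(MySGDOptimizer, self).__init__(use_locking, name)
  #       self._learning_rate = learning_rate
  #
  #     def _apply_dense(self, grad, var):
  #       # assign_sub applies var -= learning_rate * grad.
  #       return state_ops.assign_sub(var, self._learning_rate * grad,
  #                                   use_locking=self._use_locking).op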
  def _valid_dtypes(self):
    """Valid types for loss, variables and gradients.

    Defaults to float32. Subclasses should override to allow other types.
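    For example, a subclass that also supports double precision could return
    set([tf_types.float32, tf_types.float64]).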

    Returns:
      Valid types for loss, variables and gradients.
    """
    return set([tf_types.float32])

  def _create_slots(self, var_list):
    """Create all slots needed by the variables.

    Args:
      var_list: A list of variables.Variable.
    """
    # No slots needed by default
    pass

  def _prepare(self):
    """Create all needed tensors before applying gradients.

    This is called within the name_scope created from the "name" that
    users have chosen for the application of gradients.
    """
    pass

  def _apply_dense(self, grad, var):
    """Add ops to apply dense gradients to "var".

    Args:
      grad: A Tensor.
      var: A variables.Variable.

    Returns:
      An Operation.
    """
    raise NotImplementedError()

  def _apply_sparse(self, grad, var):
    """Add ops to apply sparse gradients to "var".

    Args:
      grad: IndexedSlices.
      var: A variables.Variable.

    Returns:
      An Operation.
    """
    raise NotImplementedError()

  def _finish(self, update_ops, name_scope):
    """Do what is needed to finish the update.

    This is called within the name_scope created from the "name" that
    users have chosen for the application of gradients.

    Args:
      update_ops: List of Operations to update variables.  This list contains
        the values returned by the _apply_dense() and _apply_sparse() calls.
      name_scope: string.  Name to use for the returned operation.

    Returns:
      The operation to apply updates.
    """
    return control_flow_ops.group(*update_ops, name=name_scope)

  # --------------
  # Utility methods for subclasses.
  # --------------
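  #
  # For example, a momentum-style subclass could create its accumulator slots
  # in _create_slots() using _zeros_slot() (a sketch; the slot name is
  # illustrative):
  #
  #   def _create_slots(self, var_list):
  #     for v in var_list:
  #       self._zeros_slot(v, "momentum", self._name)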

  def _get_or_make_slot(self, var, val, slot_name, op_name):
    """Find or create a slot for a variable.

    Args:
      var: A variables.Variable.
      val: A Tensor.  The initial value of the slot.
      slot_name: Name for the slot.
      op_name: Name to use when scoping the Variable that
        needs to be created for the slot.

    Returns:
      A variables.Variable.
    """
    named_slots = self._slots.get(slot_name, None)
    if named_slots is None:
      named_slots = {}
      self._slots[slot_name] = named_slots
    slot = named_slots.get(var, None)
    if slot is None:
      # Scope the slot name in the namespace of the Variable and
      # create the slot on the same device as the variable.
      with ops.name_scope(var.op.name + "/" + op_name) as scope:
        with ops.device(var.device):
          slot = variables.Variable(val, name=scope, trainable=False)
      named_slots[var] = slot
    return slot

  def _zeros_slot(self, var, slot_name, op_name):
    """Find or create a slot initialized with 0.0.

    Args:
      var: A variables.Variable.
      slot_name: Name for the slot.
      op_name: Name to use when scoping the Variable that
        needs to be created for the slot.

    Returns:
      A variables.Variable.
    """
    val = array_ops.zeros(var.get_shape().as_list(), dtype=var.dtype)
    return self._get_or_make_slot(var, val, slot_name, op_name)