1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
|
"""Base class for optimizers."""
# pylint: disable=g-bad-name
import types
from tensorflow.python.framework import ops
from tensorflow.python.framework import types as tf_types
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import gradients
from tensorflow.python.ops import state_ops
from tensorflow.python.ops import variables
class Optimizer(object):
  """Base class for optimizers.

  This class defines the API to add Ops to train a model.  You never use this
  class directly, but instead instantiate one of its subclasses such as
  `GradientDescentOptimizer`, `AdagradOptimizer`, or `MomentumOptimizer`.

  ### Usage

  ```
  # Create an optimizer with the desired parameters.
  opt = GradientDescentOptimizer(learning_rate=0.1)
  # Add Ops to the graph to minimize a cost by updating a list of variables.
  # "cost" is a Tensor, and the list of variables contains variables.Variable
  # objects.
  opt_op = opt.minimize(cost, <list of variables>)
  ```

  In the training program you will just have to run the returned Op.

  ```
  # Execute opt_op to do one step of training:
  opt_op.run()
  ```

  ### Processing gradients before applying them.

  Calling `minimize()` takes care of both computing the gradients and
  applying them to the variables.  If you want to process the gradients
  before applying them you can instead use the optimizer in three steps:

  1.  Compute the gradients with `compute_gradients()`.
  2.  Process the gradients as you wish.
  3.  Apply the processed gradients with `apply_gradients()`.

  Example:

  ```
  # Create an optimizer.
  opt = GradientDescentOptimizer(learning_rate=0.1)

  # Compute the gradients for a list of variables.
  grads_and_vars = opt.compute_gradients(loss, <list of variables>)

  # grads_and_vars is a list of tuples (gradient, variable).  Do whatever you
  # need to the 'gradient' part, for example cap them, etc.
  capped_grads_and_vars = [(MyCapper(gv[0]), gv[1]) for gv in grads_and_vars]

  # Ask the optimizer to apply the capped gradients.
  opt.apply_gradients(capped_grads_and_vars)
  ```

  @@__init__

  @@minimize
  @@compute_gradients
  @@apply_gradients

  ### Gating Gradients

  Both `minimize()` and `compute_gradients()` accept a `gate_gradients`
  argument that controls the degree of parallelism during the application of
  the gradients.

  The possible values are: `GATE_NONE`, `GATE_OP`, and `GATE_GRAPH`.

  <b>GATE_NONE</b>: Compute and apply gradients in parallel.  This provides the
  maximum parallelism in execution, at the cost of some non-reproducibility in
  the results.  For example the two gradients of MatMul depend on the input
  values: With `GATE_NONE` one of the gradients could be applied to one of the
  inputs _before_ the other gradient is computed resulting in non-reproducible
  results.

  <b>GATE_OP</b>: For each Op, make sure all gradients are computed before they
  are used.  This prevents race conditions for Ops that generate gradients for
  multiple inputs where the gradients depend on the inputs.

  <b>GATE_GRAPH</b>: Make sure all gradients for all variables are computed
  before any one of them is used.  This provides the least parallelism but can
  be useful if you want to process all gradients before applying any of them.

  ### Slots

  Some optimizer subclasses, such as `MomentumOptimizer` and `AdagradOptimizer`
  allocate and manage additional variables associated with the variables to
  train.  These are called <i>Slots</i>.  Slots have names and you can ask the
  optimizer for the names of the slots that it uses.  Once you have a slot name
  you can ask the optimizer for the variable it created to hold the slot value.

  This can be useful if you want to debug a training algorithm, report stats
  about the slots, etc.

  @@get_slot_names
  @@get_slot
  """

  # Values for gate_gradients.
  GATE_NONE = 0
  GATE_OP = 1
  GATE_GRAPH = 2

  def __init__(self, use_locking, name):
    """Create a new Optimizer.

    This must be called by the constructors of subclasses.

    Args:
      use_locking: Bool. If True use locks to prevent concurrent updates
        to variables.
      name: A non-empty string.  The name to use for accumulators created
        for the optimizer.

    Raises:
      ValueError: if name is empty.
    """
    if not name:
      raise ValueError("Must specify the optimizer name")
    self._use_locking = use_locking
    self._name = name
    # Dictionary of slots.
    #  {slot_name : { variable_to_train: slot_for_the_variable, ...}, ... }
    self._slots = {}

  def minimize(self, loss, global_step=None, var_list=None,
               gate_gradients=GATE_OP, name=None):
    """Add operations to minimize 'loss' by updating 'var_list'.

    This method simply combines calls to compute_gradients() and
    apply_gradients().  If you want to process the gradients before applying
    them call compute_gradients() and apply_gradients() explicitly instead of
    using this function.

    Args:
      loss: A Tensor containing the value to minimize.
      global_step: Optional Variable to increment by one after the
        variables have been updated.
      var_list: Optional list of variables.Variable to update to minimize
        'loss'.  Defaults to the list of variables collected in the graph
        under the key GraphKeys.TRAINABLE_VARIABLES.
      gate_gradients: How to gate the computation of gradients.  Can be
        GATE_NONE, GATE_OP, or GATE_GRAPH.
      name: Optional name for the returned operation.

    Returns:
      An Operation that updates the variables in 'var_list'.  If 'global_step'
      was not None, that operation also increments global_step.

    Raises:
      TypeError: if some of the variables are not variables.Variable objects.
      ValueError: if gate_gradients is not one of the GATE_* values, or if a
        tensor has a dtype that is not in _valid_dtypes().
    """
    grads_and_vars = self.compute_gradients(loss, var_list=var_list,
                                            gate_gradients=gate_gradients)
    return self.apply_gradients(grads_and_vars, global_step=global_step,
                                name=name)

  def compute_gradients(self, loss, var_list=None, gate_gradients=GATE_OP):
    """Compute gradients of "loss" for the variables in "var_list".

    This is the first part of minimize().  It returns a list
    of (gradient, variable) pairs where "gradient" is the gradient
    for "variable".  Note that "gradient" can be a Tensor, a
    IndexedSlices, or None if there is no gradient for the
    given variable.

    Args:
      loss: A Tensor containing the value to minimize.
      var_list: Optional list of variables.Variable to update to minimize
        "loss".  Defaults to the list of variables collected in the graph
        under the key GraphKeys.TRAINABLE_VARIABLES.
      gate_gradients: How to gate the computation of gradients.  Can be
        GATE_NONE, GATE_OP, or GATE_GRAPH.

    Returns:
      A list of (gradient, variable) pairs.

    Raises:
      TypeError: If var_list contains anything else than variables.Variable.
      ValueError: If some arguments are invalid.
    """
    if gate_gradients not in (Optimizer.GATE_NONE, Optimizer.GATE_OP,
                              Optimizer.GATE_GRAPH):
      raise ValueError("gate_gradients must be one of: Optimizer.GATE_NONE, "
                       "Optimizer.GATE_OP, Optimizer.GATE_GRAPH.  Not %s" %
                       gate_gradients)
    self._assert_valid_dtypes([loss])
    if var_list is None:
      var_list = variables.trainable_variables()
    for var in var_list:
      if not isinstance(var, variables.Variable):
        raise TypeError("Argument is not a variables.Variable: %s" % var)
    grads = gradients.gradients(
        loss, var_list, gate_gradients=(gate_gradients == Optimizer.GATE_OP))
    if gate_gradients == Optimizer.GATE_GRAPH:
      grads = control_flow_ops.tuple(grads)
    # Materialize the pairs: they are iterated below for validation and then
    # handed back to the caller.  A bare zip() is a single-pass iterator on
    # Python 3 and would be returned already exhausted.
    grads_and_vars = list(zip(grads, var_list))
    self._assert_valid_dtypes([v for g, v in grads_and_vars if g is not None])
    return grads_and_vars

  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """Apply gradients to variables.

    This is the second part of minimize().  It returns an Operation that
    applies gradients.

    Args:
      grads_and_vars: Iterable of (gradient, variable) pairs as returned by
        compute_gradients().  Pairs whose gradient is None are skipped.
      global_step: Optional Variable to increment by one after the
        variables have been updated.
      name: Optional name for the returned operation.  Default to the
        name passed to the Optimizer constructor.

    Returns:
      An Operation that applies the specified gradients.  If 'global_step'
      was not None, that operation also increments global_step.

    Raises:
      TypeError: if grads_and_vars is malformed.
      ValueError: if a gradient or its variable has a dtype that is not in
        _valid_dtypes().
    """
    # This is a default implementation of apply_gradients() that can be shared
    # by most optimizers.  It relies on the subclass implementing the following
    # methods: _create_slots(), _prepare(), _apply_dense(), and _apply_sparse().

    # The pairs are traversed several times below; snapshot them so that
    # one-shot iterables (generators, Python 3 zip objects) work as well.
    grads_and_vars = tuple(grads_and_vars)
    for g, v in grads_and_vars:
      # None is an accepted "no gradient" marker; test for it explicitly
      # rather than through types.NoneType, which is missing on Python 3.0-3.9.
      if g is not None and not isinstance(g, (ops.Tensor, ops.IndexedSlices)):
        raise TypeError(
            "Gradient must be a Tensor, IndexedSlices, or None: %s" % g)
      if not isinstance(v, variables.Variable):
        raise TypeError(
            "Variable must be a variables.Variable: %s" % v)
      if g is not None:
        self._assert_valid_dtypes([g, v])
    self._create_slots([v for g, v in grads_and_vars if g is not None])
    update_ops = []
    with ops.op_scope([], name, self._name) as name:
      self._prepare()
      for grad, var in grads_and_vars:
        # Identity test, not truthiness: only a missing gradient is skipped.
        if grad is None:
          continue
        with ops.name_scope("update_" + var.op.name), ops.device(var.device):
          if isinstance(grad, ops.Tensor):
            update_ops.append(self._apply_dense(grad, var))
          else:
            update_ops.append(self._apply_sparse(grad, var))
      if global_step is None:
        return self._finish(update_ops, name)
      else:
        # Increment global_step only after every variable update has run.
        with ops.control_dependencies([self._finish(update_ops, "update")]):
          with ops.device(global_step.device):
            return state_ops.assign_add(global_step, 1, name=name).op

  def get_slot(self, var, name):
    """Return a slot named "name" created for "var" by the Optimizer.

    Some Optimizer subclasses use additional variables.  For example
    Momentum and Adagrad use variables to accumulate updates.  This method
    gives access to these Variables if for some reason you need them.

    Use get_slot_names() to get the list of slot names created by the Optimizer.

    Args:
      var: A variable passed to minimize() or apply_gradients().
      name: A string.

    Returns:
      The Variable for the slot if it was created, None otherwise.
    """
    named_slots = self._slots.get(name)
    if not named_slots:
      return None
    return named_slots.get(var)

  def get_slot_names(self):
    """Return a list of the names of slots created by the Optimizer.

    See get_slot().

    Returns:
      A sorted list of strings.
    """
    return sorted(self._slots)

  def _assert_valid_dtypes(self, tensors):
    """Asserts tensors are all valid types (see _valid_dtypes).

    Args:
      tensors: tensors to check.

    Raises:
      ValueError: if any tensor is not a valid type.
    """
    valid_dtypes = self._valid_dtypes()
    for t in tensors:
      # Compare on the base dtype of each tensor.
      dtype = t.dtype.base_dtype
      if dtype not in valid_dtypes:
        raise ValueError(
            "Invalid type %s for %s, expected: %s." % (
                dtype, t.name, list(valid_dtypes)))

  # --------------
  # Methods to be implemented by subclasses if they want to use the
  # inherited implementation of apply_gradients() or compute_gradients().
  # --------------
  def _valid_dtypes(self):
    """Valid types for loss, variables and gradients.

    Defaults to float32.  Subclasses should override to allow other types.

    Returns:
      Valid types for loss, variables and gradients.
    """
    return set([tf_types.float32])

  def _create_slots(self, var_list):
    """Create all slots needed by the variables.

    Args:
      var_list: A list of variables.Variable.
    """
    # No slots needed by default
    pass

  def _prepare(self):
    """Create all needed tensors before applying gradients.

    This is called with the name_scope using the "name" that
    users have chosen for the application of gradients.
    """
    pass

  def _apply_dense(self, grad, var):
    """Add ops to apply dense gradients to "var".

    Args:
      grad: A Tensor.
      var: A variables.Variable.

    Returns:
      An Operation.

    Raises:
      NotImplementedError: always; subclasses must override this method.
    """
    raise NotImplementedError()

  def _apply_sparse(self, grad, var):
    """Add ops to apply sparse gradients to "var".

    Args:
      grad: IndexedSlices.
      var: A variables.Variable.

    Returns:
      An Operation.

    Raises:
      NotImplementedError: always; subclasses must override this method.
    """
    raise NotImplementedError()

  def _finish(self, update_ops, name_scope):
    """Do what is needed to finish the update.

    This is called with the name_scope using the "name" that
    users have chosen for the application of gradients.

    Args:
      update_ops: List of Operations to update variables.  This list contains
        the values returned by the _apply_dense() and _apply_sparse() calls.
      name_scope: string.  Name to use for the returned operation.

    Returns:
      The operation to apply updates.
    """
    return control_flow_ops.group(*update_ops, name=name_scope)

  # --------------
  # Utility methods for subclasses.
  # --------------

  def _get_or_make_slot(self, var, val, slot_name, op_name):
    """Find or create a slot for a variable.

    Args:
      var: A variables.Variable.
      val: A Tensor.  The initial value of the slot.
      slot_name: Name for the slot.
      op_name: Name to use when scoping the Variable that
        needs to be created for the slot.

    Returns:
      A variables.Variable.
    """
    # Per-slot-name mapping {variable: slot}, created on first use.
    named_slots = self._slots.setdefault(slot_name, {})
    slot = named_slots.get(var)
    if slot is None:
      # Scope the slot name in the namespace of the Variable and
      # create the slot on the same device as the variable.
      with ops.name_scope(var.op.name + "/" + op_name) as scope:
        with ops.device(var.device):
          slot = variables.Variable(val, name=scope, trainable=False)
      named_slots[var] = slot
    return slot

  def _zeros_slot(self, var, slot_name, op_name):
    """Find or create a slot initialized with 0.0.

    Args:
      var: A variables.Variable.
      slot_name: Name for the slot.
      op_name: Name to use when scoping the Variable that
        needs to be created for the slot.

    Returns:
      A variables.Variable.
    """
    val = array_ops.zeros(var.get_shape().as_list(), dtype=var.dtype)
    return self._get_or_make_slot(var, val, slot_name, op_name)
|