op {
  graph_op_name: "SparseApplyCenteredRMSProp"
  in_arg {
    name: "var"
    description: <<END
Should be from a Variable().
END
  }
  in_arg {
    name: "mg"
    description: <<END
Should be from a Variable().
END
  }
  in_arg {
    name: "ms"
    description: <<END
Should be from a Variable().
END
  }
  in_arg {
    name: "mom"
    description: <<END
Should be from a Variable().
END
  }
  in_arg {
    name: "lr"
    description: <<END
Scaling factor. Must be a scalar.
END
  }
  in_arg {
    name: "rho"
    description: <<END
Decay rate. Must be a scalar.
END
  }
  in_arg {
    name: "momentum"
    description: <<END
Momentum scale. Must be a scalar.
END
  }
  in_arg {
    name: "epsilon"
    description: <<END
Ridge term. Must be a scalar.
END
  }
  in_arg {
    name: "grad"
    description: <<END
The gradient.
END
  }
  in_arg {
    name: "indices"
    description: <<END
A vector of indices into the first dimension of var, mg, ms, and mom.
END
  }
  out_arg {
    name: "out"
    description: <<END
Same as "var".
END
  }
  attr {
    name: "use_locking"
    description: <<END
If `True`, updating of the var, mg, ms, and mom tensors is
protected by a lock; otherwise the behavior is undefined, but may exhibit less
contention.
END
  }
  summary: "Update \'*var\' according to the centered RMSProp algorithm."
  description: <<END
The centered RMSProp algorithm uses an estimate of the centered second moment
(i.e., the variance) for normalization, as opposed to regular RMSProp, which
uses the (uncentered) second moment. This often helps with training, but is
slightly more expensive in terms of computation and memory.

Note that in the dense implementation of this algorithm, mg, ms, and mom will
update even if grad is zero, but in this sparse implementation, mg, ms,
and mom will not update in iterations during which grad is zero.

mean_square = decay * mean_square + (1-decay) * gradient ** 2
mean_grad = decay * mean_grad + (1-decay) * gradient
Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)

$$mg <- rho * mg_{t-1} + (1-rho) * grad$$
$$ms <- rho * ms_{t-1} + (1-rho) * grad * grad$$
$$mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)$$
$$var <- var - mom$$
END
}
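
A minimal NumPy sketch of the update documented above, for reference only: it mirrors the centered update restricted to the rows named by indices, but it is not the TensorFlow kernel, and the function name and in-place semantics are assumptions made for this example.

import numpy as np

def sparse_apply_centered_rms_prop(var, mg, ms, mom, lr, rho, momentum,
                                   epsilon, grad, indices):
    # Hypothetical reference implementation; only the rows listed in
    # `indices` are touched, matching the sparse behavior noted above.
    for g, i in zip(grad, indices):
        mg[i] = rho * mg[i] + (1 - rho) * g
        ms[i] = rho * ms[i] + (1 - rho) * g * g
        mom[i] = momentum * mom[i] + lr * g / np.sqrt(ms[i] - mg[i] ** 2 + epsilon)
        var[i] -= mom[i]
    return var

# Example: update rows 0 and 2 of a 4x2 variable; all other rows are untouched.
var = np.ones((4, 2))
mg, ms, mom = np.zeros_like(var), np.zeros_like(var), np.zeros_like(var)
grad = np.array([[0.1, 0.2], [0.3, 0.4]])
sparse_apply_centered_rms_prop(var, mg, ms, mom, lr=0.01, rho=0.9,
                               momentum=0.9, epsilon=1e-10,
                               grad=grad, indices=[0, 2])

In TensorFlow the op itself is exposed as tf.raw_ops.SparseApplyCenteredRMSProp and is typically reached indirectly, for example when a centered RMSProp optimizer applies sparse (IndexedSlices) gradients.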