author     A. Unique TensorFlower <gardener@tensorflow.org>   2016-08-18 19:40:58 -0800
committer  TensorFlower Gardener <gardener@tensorflow.org>    2016-08-18 20:47:26 -0700
commit     5da979895d63ed6c0933e96c9dacdc30bf05a09c (patch)
tree       2e7e45c23ede25a407a1d192a9ebcc7c13b54c09 /tensorflow/core/kernels/training_ops.cc
parent     0eae3401b731295342af3387bb8586a6b1f70274 (diff)
Adagrad Dual Averaging optimizer for sparse linear models, which handles lazy updates correctly.
Change: 130714247
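
For reference, the update rule this change implements is quoted as an ASCII comment in both new kernels below. Rendered in LaTeX with the same notation (g the gradient accumulator, gg the gradient-squared accumulator, T the global step, lr the learning rate, k the initial gradient-squared accumulator value, and (x)_+ meaning max(x, 0), as the code computes it):

    w = \dfrac{\operatorname{sign}(-g)\, lr\, \big(|g| - l_1 T\big)_{+}}{l_2\, T\, lr + \sqrt{k + gg}}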
Diffstat (limited to 'tensorflow/core/kernels/training_ops.cc')
-rw-r--r--  tensorflow/core/kernels/training_ops.cc  363
1 file changed, 339 insertions(+), 24 deletions(-)
diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
index ca7231e6cc..bac8d8c80d 100644
--- a/tensorflow/core/kernels/training_ops.cc
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -1,4 +1,4 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -18,15 +18,13 @@ limitations under the License.
 #include "tensorflow/core/kernels/training_ops.h"
 #include <algorithm>
 #include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/kernels/bounds_check.h"
-
-#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 
-typedef Eigen::ThreadPoolDevice CPUDevice;
-typedef Eigen::GpuDevice GPUDevice;
+using CPUDevice = Eigen::ThreadPoolDevice;
+using GPUDevice = Eigen::GpuDevice;
 
 namespace {
 template <class T>
@@ -95,6 +93,43 @@ struct ApplyProximalGradientDescent<CPUDevice, T> {
 };
 
 template <typename T>
+struct ApplyAdagradDA<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat gradient_accum,
+                  typename TTypes<T>::Flat gradient_squared_accum,
+                  typename TTypes<T>::ConstScalar lr, int64 global_step,
+                  typename TTypes<T>::ConstScalar l1,
+                  typename TTypes<T>::ConstScalar l2,
+                  typename TTypes<T>::ConstFlat grad) {
+    // Accumulate gradient and gradient squared.
+    gradient_accum.device(d) += grad;
+    gradient_squared_accum.device(d) += grad.square();
+
+    // AdagradDA update:
+    // Let g be the gradient accumulator, gg the gradient squared accumulator,
+    // T the global step, lr the learning rate, and k the initial
+    // gradient squared accumulator value.
+    // w = \dfrac{sign(-g)*lr*(|g| - l1*T)_{+}}{l2*T*lr + \sqrt{k+gg}}
+    if (l1() > 0) {
+      var.device(d) =
+          lr() * var.constant(-1.0) * gradient_accum.sign() *
+          (gradient_accum.abs() -
+           var.constant(static_cast<float>(global_step)) * var.constant(l1()))
+              .cwiseMax(T(0.0)) /
+          (var.constant(l2()) *
+               var.constant(static_cast<float>(global_step) * lr()) +
+           gradient_squared_accum.sqrt());
+    } else {
+      var.device(d) =
+          lr() * gradient_accum * var.constant(-1.0) /
+          (var.constant(l2()) *
+               var.constant(static_cast<float>(global_step) * lr()) +
+           gradient_squared_accum.sqrt());
+    }
+  }
+};
+
+template <typename T>
 struct ApplyAdagrad<CPUDevice, T> {
   void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
                   typename TTypes<T>::Flat accum,
@@ -413,8 +448,8 @@ class ApplyAdadeltaOp : public OpKernel {
   }
 };
 
-typedef Eigen::ThreadPoolDevice CPUDevice;
-typedef Eigen::GpuDevice GPUDevice;
+using CPUDevice = Eigen::ThreadPoolDevice;
+using GPUDevice = Eigen::GpuDevice;
 
 #define REGISTER_KERNELS(D, T) \
   REGISTER_KERNEL_BUILDER( \
@@ -698,6 +733,9 @@ class SparseApplyProximalGradientDescentOp : public OpKernel {
         ctx, grad.dim_size(0) == N,
         errors::InvalidArgument(
             "grad must be the same size as indices in the first dimension."));
+    OP_REQUIRES(ctx, inner_dim > 0,
+                errors::InvalidArgument(
+                    "Inner dimension should be greater than zero."));
 
     if (N > 0) {
       if (inner_dim > 1) {
@@ -735,7 +773,6 @@ }
         }
       }
     } else {
-      CHECK_EQ(1, inner_dim);
       auto indices_vec = indices.vec<Tindex>();
       auto var_flat = var.flat<T>();
       auto grad_flat = grad.flat<T>();
@@ -834,8 +871,8 @@ class ApplyAdagradOp : public OpKernel {
   bool use_exclusive_lock_;
 };
 
-typedef Eigen::ThreadPoolDevice CPUDevice;
-typedef Eigen::GpuDevice GPUDevice;
+using CPUDevice = Eigen::ThreadPoolDevice;
+using GPUDevice = Eigen::GpuDevice;
 
 #define REGISTER_KERNELS(D, T) \
   REGISTER_KERNEL_BUILDER( \
@@ -928,8 +965,8 @@ class ApplyProximalAdagradOp : public OpKernel {
   bool use_exclusive_lock_;
 };
 
-typedef Eigen::ThreadPoolDevice CPUDevice;
-typedef Eigen::GpuDevice GPUDevice;
+using CPUDevice = Eigen::ThreadPoolDevice;
+using GPUDevice = Eigen::GpuDevice;
 
 #define REGISTER_KERNELS(D, T) \
   REGISTER_KERNEL_BUILDER( \
@@ -1010,6 +1047,10 @@ class SparseApplyAdagradOp : public OpKernel {
         errors::InvalidArgument(
             "grad must be the same size as indices in the first dimension."));
 
+    OP_REQUIRES(ctx, inner_dim > 0,
+                errors::InvalidArgument(
+                    "Inner dimension should be greater than zero."));
+
     if (N > 0) {
       if (inner_dim > 1) {
         const Tindex first_dim_size = var.dim_size(0);
@@ -1034,7 +1075,6 @@
           v -= g.constant(lr_scalar) * g * a.rsqrt();
         }
       } else {
-        CHECK_EQ(1, inner_dim);
         auto indices_vec = indices.vec<Tindex>();
         auto var_flat = var.flat<T>();
         auto accum_flat = accum.flat<T>();
@@ -1142,6 +1182,10 @@ class SparseApplyProximalAdagradOp : public OpKernel {
         errors::InvalidArgument(
            "grad must be the same size as indices in the first dimension."));
 
+    OP_REQUIRES(ctx, inner_dim > 0,
+                errors::InvalidArgument(
+                    "Inner dimension should be greater than zero."));
+
     if (N > 0) {
       if (inner_dim > 1) {
         const Tindex first_dim_size = var.dim_size(0);
@@ -1180,7 +1224,6 @@ }
         }
       }
     } else {
-      CHECK_EQ(1, inner_dim);
       auto indices_vec = indices.vec<Tindex>();
       auto var_flat = var.flat<T>();
       auto accum_flat = accum.flat<T>();
@@ -1237,6 +1280,275 @@ REGISTER_KERNELS(double, int64);
 #undef REGISTER_KERNELS
 
 template <typename Device, typename T>
+class ApplyAdagradDAOp : public OpKernel {
+ public:
+  explicit ApplyAdagradDAOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
+    Tensor gradient_accum = ctx->mutable_input(1, use_exclusive_lock_);
+    Tensor gradient_squared_accum = ctx->mutable_input(2, use_exclusive_lock_);
+    OP_REQUIRES(
+        ctx, var.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", def().input(0)));
+    OP_REQUIRES(
+        ctx, gradient_accum.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", def().input(1)));
+    OP_REQUIRES(
+        ctx, gradient_squared_accum.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", def().input(2)));
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(gradient_accum.shape()),
+        errors::InvalidArgument("var and accum do not have the same shape",
+                                var.shape().DebugString(), " ",
+                                gradient_accum.shape().DebugString()));
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(gradient_squared_accum.shape()),
+        errors::InvalidArgument("var and accum do not have the same shape",
+                                var.shape().DebugString(), " ",
+                                gradient_squared_accum.shape().DebugString()));
+
+    const Tensor& grad = ctx->input(3);
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(grad.shape()),
+        errors::InvalidArgument("var and grad do not have the same shape",
+                                var.shape().DebugString(), " ",
+                                grad.shape().DebugString()));
+
+    const Tensor& lr = ctx->input(4);
+    OP_REQUIRES(ctx, IsLegacyScalar(lr.shape()),
+                errors::InvalidArgument("lr is not a scalar: ",
+                                        lr.shape().DebugString()));
+    const Tensor& l1 = ctx->input(5);
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsScalar(l1.shape()),
+        errors::InvalidArgument("l1 regularization strength is not a scalar: ",
+                                l1.shape().DebugString()));
+    const Tensor& l2 = ctx->input(6);
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsScalar(l2.shape()),
+        errors::InvalidArgument("l2 regularization strength is not a scalar: ",
+                                l2.shape().DebugString()));
+    const Tensor& global_step = ctx->input(7);
+    OP_REQUIRES(ctx, IsLegacyScalar(global_step.shape()),
+                errors::InvalidArgument("global_step is not a scalar: ",
+                                        global_step.shape().DebugString()));
+
+    const Device& device = ctx->template eigen_device<Device>();
+    functor::ApplyAdagradDA<Device, T>()(
+        device, var.flat<T>(), gradient_accum.flat<T>(),
+        gradient_squared_accum.flat<T>(), lr.scalar<T>(),
+        global_step.scalar<int64>()(), l1.scalar<T>(), l2.scalar<T>(),
+        grad.flat<T>());
+
+    ctx->forward_ref_input_to_ref_output(0, 0);
+  }
+
+ private:
+  bool use_exclusive_lock_;
+};
+
+using CPUDevice = Eigen::ThreadPoolDevice;
+using GPUDevice = Eigen::GpuDevice;
+
+#define REGISTER_KERNELS(D, T) \
+  REGISTER_KERNEL_BUILDER( \
+      Name("ApplyAdagradDA").Device(DEVICE_##D).TypeConstraint<T>("T"), \
+      ApplyAdagradDAOp<D##Device, T>);
+
+REGISTER_KERNELS(CPU, float);
+REGISTER_KERNELS(CPU, double);
+#undef REGISTER_KERNELS
+
+// Note: this op works on CPU only.
+template <typename T, typename Tindex>
+class SparseApplyAdagradDAOp : public OpKernel {
+ public:
+  explicit SparseApplyAdagradDAOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+  }
+
+  void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
+    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
+    Tensor gradient_accum = ctx->mutable_input(1, use_exclusive_lock_);
+    Tensor gradient_squared_accum = ctx->mutable_input(2, use_exclusive_lock_);
+    OP_REQUIRES(
+        ctx, var.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", def().input(0)));
+    OP_REQUIRES(
+        ctx, gradient_accum.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", def().input(1)));
+    OP_REQUIRES(
+        ctx, gradient_squared_accum.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", def().input(2)));
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(gradient_accum.shape()),
+        errors::InvalidArgument("var and accum do not have the same shape",
+                                var.shape().DebugString(), " ",
+                                gradient_accum.shape().DebugString()));
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(gradient_squared_accum.shape()),
+        errors::InvalidArgument("var and accum do not have the same shape",
+                                var.shape().DebugString(), " ",
+                                gradient_squared_accum.shape().DebugString()));
+
+    OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(var.shape()),
+                errors::InvalidArgument("var must be at least 1 dimensional"));
+
+    const Tensor& grad = ctx->input(3);
+    const Tensor& indices = ctx->input(4);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()),
+                errors::InvalidArgument("indices must be one-dimensional"));
+
+    const Tensor& lr = ctx->input(5);
+    OP_REQUIRES(ctx, IsLegacyScalar(lr.shape()),
+                errors::InvalidArgument("lr is not a scalar: ",
+                                        lr.shape().DebugString()));
+
+    const Tensor& l1 = ctx->input(6);
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsScalar(l1.shape()),
+        errors::InvalidArgument("l1 regularization strength is not a scalar: ",
+                                l1.shape().DebugString()));
+
+    const Tensor& l2 = ctx->input(7);
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsScalar(l2.shape()),
+        errors::InvalidArgument("l2 regularization strength is not a scalar: ",
+                                l2.shape().DebugString()));
+
+    const Tensor& global_step = ctx->input(8);
+    OP_REQUIRES(ctx, IsLegacyScalar(global_step.shape()),
+                errors::InvalidArgument("global_step is not a scalar: ",
+                                        global_step.shape().DebugString()));
+
+    int64 inner_dim = 1;
+    for (int d = 1; d < var.dims(); d++) {
+      OP_REQUIRES(ctx, var.dim_size(d) == grad.dim_size(d),
+                  errors::InvalidArgument(strings::StrCat(
+                      "var and grad must match in dimension ", d)));
+      inner_dim *= grad.dim_size(d);
+    }
+    const Tindex N = indices.dim_size(0);
+    OP_REQUIRES(
+        ctx, grad.dim_size(0) == N,
+        errors::InvalidArgument(
+            "grad must be the same size as indices in the first dimension."));
+
+    OP_REQUIRES(ctx, inner_dim > 0,
+                errors::InvalidArgument(
+                    "Inner dimension should be greater than zero."));
+
+    // AdagradDA update:
+    // Let g be the gradient accumulator, gg the gradient squared accumulator,
+    // T the global step, lr the learning rate, and k the initial
+    // gradient squared accumulator value.
+    // w = \dfrac{sign(-g)*lr*(|g| - l1*T)_{+}}{l2*T*lr + \sqrt{k+gg}}
+    if (N > 0) {
+      if (inner_dim > 1) {
+        const Tindex first_dim_size = var.dim_size(0);
+        auto indices_vec = indices.vec<Tindex>();
+        auto var_flat = var.flat_outer_dims<T>();
+        auto gradient_accum_flat = gradient_accum.flat_outer_dims<T>();
+        auto gradient_squared_accum_flat =
+            gradient_squared_accum.flat_outer_dims<T>();
+        auto grad_flat = grad.flat_outer_dims<T>();
+        T lr_scalar = lr.scalar<T>()();
+        T global_step_scalar = global_step.scalar<int64>()();
+        T l1_scalar = l1.scalar<T>()();
+        T l2_scalar = l2.scalar<T>()();
+        const double gs_lr = global_step_scalar * lr_scalar;
+
+        for (Tindex i = 0; i < N; i++) {
+          const Tindex index = internal::SubtleMustCopy(indices_vec(i));
+          OP_REQUIRES(ctx, FastBoundsCheck(index, first_dim_size),
+                      errors::InvalidArgument(
+                          strings::StrCat("Index ", index, " at offset ", i,
+                                          " in indices is out of range")));
+          auto ga = gradient_accum_flat.template chip<0>(index);
+          auto da = gradient_squared_accum_flat.template chip<0>(index);
+          auto g = grad_flat.template chip<0>(i);
+          auto v = var_flat.template chip<0>(index);
+          ga += g;
+          da += g.square();
+          if (l1_scalar > 0) {
+            v = (ga.abs() / ga.constant(global_step_scalar)) -
+                ga.constant(l1_scalar);
+            v = ga.constant(-1.0) * ga.sign() *
+                v.cwiseMax(static_cast<T>(0.0)) /
+                (v.constant(l2_scalar) + da.sqrt() / v.constant(gs_lr));
+          } else {
+            v = ga.constant(-1.0) * (ga / ga.constant(global_step_scalar)) /
+                (v.constant(l2_scalar) + da.sqrt() / v.constant(gs_lr));
+          }
+        }
+      } else {
+        auto indices_vec = indices.vec<Tindex>();
+        auto var_flat = var.flat<T>();
+        auto gradient_accum_flat = gradient_accum.flat<T>();
+        auto gradient_squared_accum_flat = gradient_squared_accum.flat<T>();
+        auto grad_flat = grad.flat<T>();
+        const double lr_scalar = lr.scalar<T>()();
+        const int64 global_step_scalar = global_step.scalar<int64>()();
+        const double l1_scalar = l1.scalar<T>()();
+        const double l2_scalar = l2.scalar<T>()();
+        const Tindex first_dim_size = var_flat.size();
+        const double gs_l1 = global_step_scalar * l1_scalar;
+        const double gs_l2_lr = global_step_scalar * l2_scalar * lr_scalar;
+
+        for (Tindex i = 0; i < N; i++) {
+          const Tindex index = internal::SubtleMustCopy(indices_vec(i));
+          OP_REQUIRES(ctx, FastBoundsCheck(index, first_dim_size),
+                      errors::InvalidArgument(
+                          strings::StrCat("Index ", index, " at offset ", i,
+                                          " in indices is out of range")));
+          T& ga = gradient_accum_flat(index);
+          T& da = gradient_squared_accum_flat(index);
+          const double g = grad_flat(i);
+          ga += g;
+          da += g * g;
+          if (l1_scalar > 0) {
+            var_flat(index) = sgn(-ga) * lr_scalar *
+                              std::max((std::abs(ga) - gs_l1), 0.0) /
+                              (gs_l2_lr + std::sqrt(da));
+          } else {
+            var_flat(index) = (-ga * lr_scalar) / (gs_l2_lr + std::sqrt(da));
+          }
+        }
+      }
+    }
+
+    ctx->forward_ref_input_to_ref_output(0, 0);
+  }
+
+ private:
+  bool use_exclusive_lock_;
+};
+
+#define REGISTER_KERNELS(T, Tindices) \
+  REGISTER_KERNEL_BUILDER(Name("SparseApplyAdagradDA") \
+                              .Device(DEVICE_CPU) \
+                              .TypeConstraint<T>("T") \
+                              .TypeConstraint<Tindices>("Tindices"), \
+                          SparseApplyAdagradDAOp<T, Tindices>);
+
+REGISTER_KERNELS(float, int32);
+REGISTER_KERNELS(float, int64);
+REGISTER_KERNELS(double, int32);
+REGISTER_KERNELS(double, int64);
+#undef REGISTER_KERNELS
+
+template <typename Device, typename T>
 class ApplyFtrlOp : public OpKernel {
  public:
   explicit ApplyFtrlOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
@@ -1312,8 +1624,8 @@ class ApplyFtrlOp : public OpKernel {
   bool use_exclusive_lock_;
 };
 
-typedef Eigen::ThreadPoolDevice CPUDevice;
-typedef Eigen::GpuDevice GPUDevice;
+using CPUDevice = Eigen::ThreadPoolDevice;
+using GPUDevice = Eigen::GpuDevice;
 
 #define REGISTER_KERNELS(D, T) \
   REGISTER_KERNEL_BUILDER( \
@@ -1403,6 +1715,10 @@ class SparseApplyFtrlOp : public OpKernel {
         errors::InvalidArgument(
             "grad must be the same size as indices in the first dimension."));
 
+    OP_REQUIRES(ctx, inner_dim > 0,
+                errors::InvalidArgument(
+                    "Inner dimension should be greater than zero."));
+
     if (N > 0) {
       if (inner_dim > 1) {
         const Tindex first_dim_size = var.dim_size(0);
@@ -1453,7 +1769,6 @@
           accum += grad.square();
         }
       } else {
-        CHECK_EQ(1, inner_dim);
         auto indices_vec = indices.vec<Tindex>();
         auto var_flat = var.flat<T>();
         auto accum_flat = accum.flat<T>();
@@ -1567,8 +1882,8 @@ class ApplyMomentumOp : public OpKernel {
   bool use_nesterov_;
 };
 
-typedef Eigen::ThreadPoolDevice CPUDevice;
-typedef Eigen::GpuDevice GPUDevice;
+using CPUDevice = Eigen::ThreadPoolDevice;
+using GPUDevice = Eigen::GpuDevice;
 
 #define REGISTER_KERNELS(D, T) \
   REGISTER_KERNEL_BUILDER( \
@@ -1793,8 +2108,8 @@ class ApplyAdamOp : public OpKernel {
   bool use_exclusive_lock_;
 };
 
-typedef Eigen::ThreadPoolDevice CPUDevice;
-typedef Eigen::GpuDevice GPUDevice;
+using CPUDevice = Eigen::ThreadPoolDevice;
+using GPUDevice = Eigen::GpuDevice;
 
 #define REGISTER_KERNELS(D, T) \
   REGISTER_KERNEL_BUILDER( \
@@ -1910,8 +2225,8 @@ class ApplyRMSPropOp : public OpKernel {
   bool use_exclusive_lock_;
 };
 
-typedef Eigen::ThreadPoolDevice CPUDevice;
-typedef Eigen::GpuDevice GPUDevice;
+using CPUDevice = Eigen::ThreadPoolDevice;
+using GPUDevice = Eigen::GpuDevice;
 
 #define REGISTER_KERNELS(D, T) \
   REGISTER_KERNEL_BUILDER( \
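
As a reading aid, here is a minimal standalone C++ sketch of the same closed-form update for a single weight, mirroring the math of the sparse inner_dim == 1 branch above. It is illustrative only, not TensorFlow code: AdagradDAState, AdagradDAUpdate, and the sample gradients are all invented for this example.

// Standalone sketch of the per-weight AdagradDA update from the patch above.
// Illustrative only; all names here are invented for this example.
#include <algorithm>
#include <cmath>
#include <cstdio>

// Dual-averaging state: running sums of gradients and squared gradients
// since step 0 (plus any initial accumulator value k folded into gg_accum).
struct AdagradDAState {
  double g_accum = 0.0;   // sum of gradients g_1 + ... + g_T
  double gg_accum = 0.0;  // sum of squared gradients (k + gg)
};

// One AdagradDA step: accumulate the new gradient, then recompute the weight
// in closed form from the accumulators:
//   w = sign(-g) * lr * max(|g| - l1*T, 0) / (l2*T*lr + sqrt(k + gg))
double AdagradDAUpdate(AdagradDAState* s, double grad, double lr, double l1,
                       double l2, long long global_step) {
  s->g_accum += grad;
  s->gg_accum += grad * grad;
  const double t = static_cast<double>(global_step);
  const double denom = l2 * t * lr + std::sqrt(s->gg_accum);
  if (l1 > 0.0) {
    const double clipped = std::max(std::abs(s->g_accum) - l1 * t, 0.0);
    const double sign = s->g_accum > 0.0 ? -1.0 : 1.0;  // sign(-g)
    return sign * lr * clipped / denom;
  }
  return -lr * s->g_accum / denom;
}

int main() {
  AdagradDAState state;
  const double lr = 0.1, l1 = 0.01, l2 = 0.001;
  double w = 0.0;
  // Feed a few made-up gradients. While |g_accum| stays below l1*T, the
  // clipped numerator is zero and the weight is held at exactly zero.
  const double grads[] = {0.02, -0.01, 0.5, 0.3};
  for (long long t = 1; t <= 4; ++t) {
    w = AdagradDAUpdate(&state, grads[t - 1], lr, l1, l2, t);
    std::printf("step %lld: w = %f\n", t, w);
  }
  return 0;
}

The design point worth noting is that the weight is recomputed in closed form from the two accumulators at every step rather than nudged incrementally, so a sparse row that goes untouched for many steps still gets a correct value the next time its index appears; this is presumably what the commit message means by handling lazy updates correctly, and with l1 > 0 rows whose accumulated gradient stays below l1*T are held at exactly zero.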