/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#define EIGEN_USE_THREADS

#include "tensorflow/core/lib/bfloat16/bfloat16.h"

#include <algorithm>

#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/bounds_check.h"
#include "tensorflow/core/kernels/training_op_helpers.h"
#include "tensorflow/core/kernels/training_ops.h"
#include "tensorflow/core/kernels/variable_ops.h"
#ifdef TENSORFLOW_USE_SYCL
#include "tensorflow/core/common_runtime/sycl/sycl_util.h"
#endif  // TENSORFLOW_USE_SYCL

namespace tensorflow {

using CPUDevice = Eigen::ThreadPoolDevice;
using GPUDevice = Eigen::GpuDevice;
using SYCLDevice = Eigen::SyclDevice;

namespace {

template <class T>
inline T sgn(const T x) {
  T zero(0);
  T one(1);
  return (x == zero ? zero : (x < zero ? -one : one));
}

}  // namespace

namespace functor {

template <typename T>
struct ApplyGradientDescent<CPUDevice, T> {
  void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
                  typename TTypes<T>::ConstScalar lr,
                  typename TTypes<T>::ConstFlat grad) {
    var.device(d) -= grad * lr();
  }
};

#ifdef TENSORFLOW_USE_SYCL
template <typename T>
struct ApplyGradientDescentSYCL {
  void operator()(const SYCLDevice& d, typename TTypes<T>::Flat var, T lr,
                  typename TTypes<T>::ConstFlat grad) {
    var.device(d) -= grad * lr;
  }
};
#endif

template <typename T>
struct ApplyAdadelta<CPUDevice, T> {
  void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
                  typename TTypes<T>::Flat accum,
                  typename TTypes<T>::Flat accum_update,
                  typename TTypes<T>::ConstScalar lr,
                  typename TTypes<T>::ConstScalar rho,
                  typename TTypes<T>::ConstScalar epsilon,
                  typename TTypes<T>::ConstFlat grad) {
    accum.device(d) =
        accum * rho() + grad.square() * (static_cast<T>(1) - rho());
    const auto update =
        (accum_update + epsilon()).sqrt() * (accum + epsilon()).rsqrt() * grad;
    var.device(d) -= update * lr();
    accum_update.device(d) =
        accum_update * rho() + update.square() * (static_cast<T>(1) - rho());
  }
};
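// Informally (a sketch derived from the code below, not an authoritative
// specification): the proximal (FOBOS) step first takes a plain gradient step,
//   prox_var = var - lr * grad,
// and then applies the closed-form proximal operator for the l1/l2 penalties,
//   var = sign(prox_var) * max(|prox_var| - lr * l1, 0) / (1 + lr * l2),
// falling back to the pure l2 scaling when l1 == 0.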
template <typename T>
struct ApplyProximalGradientDescent<CPUDevice, T> {
  void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
                  typename TTypes<T>::ConstScalar lr,
                  typename TTypes<T>::ConstScalar l1,
                  typename TTypes<T>::ConstScalar l2,
                  typename TTypes<T>::ConstFlat grad) {
    // Note: this is the FOBOS update; for details, see:
    // http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf
    // TODO(xbing): merge the logic for ProximalGradientDescent and
    // ProximalAdagrad.
    auto prox_var = var;
    // compute v = w - lr * grad.
    prox_var.device(d) -= grad * lr();
    if (l1() > 0) {
      // compute sign(v) * max(|v| - lr * l1, 0)
      var.device(d) =
          prox_var.sign() *
          (prox_var.abs() - var.constant(lr() * l1())).cwiseMax(T(0.0)) /
          (var.constant(1.0) + var.constant(l2() * lr()));
    } else {
      var.device(d) =
          prox_var / (var.constant(1.0) + var.constant(l2() * lr()));
    }
  }
};

template <typename T>
struct ApplyAdagradDA<CPUDevice, T> {
  void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
                  typename TTypes<T>::Flat gradient_accum,
                  typename TTypes<T>::Flat gradient_squared_accum,
                  typename TTypes<T>::ConstScalar lr, int64 global_step,
                  typename TTypes<T>::ConstScalar l1,
                  typename TTypes<T>::ConstScalar l2,
                  typename TTypes<T>::ConstFlat grad) {
    // Accumulate gradient and gradient_squared.
    gradient_accum.device(d) += grad;
    gradient_squared_accum.device(d) += grad.square();

    // AdagradDA update:
    // Let g be the gradient accumulator, gg the gradient squared accumulator,
    // T the global step, lr the learning rate, and k the initial gradient
    // squared accumulator value.
    // w = \dfrac{sign(-g)*lr*|g - l1*T|_{+}}{l2*T*lr + \sqrt{k+gg}}
    if (l1() > 0) {
      var.device(d) =
          lr() * var.constant(-1.0) * gradient_accum.sign() *
          (gradient_accum.abs() -
           var.constant(static_cast<float>(global_step)) * var.constant(l1()))
              .cwiseMax(T(0.0)) /
          (var.constant(l2()) *
               var.constant(static_cast<float>(global_step) * lr()) +
           gradient_squared_accum.sqrt());
    } else {
      var.device(d) =
          lr() * gradient_accum * var.constant(-1.0) /
          (var.constant(l2()) *
               var.constant(static_cast<float>(global_step) * lr()) +
           gradient_squared_accum.sqrt());
    }
  }
};

template <typename T>
struct ApplyAdagrad<CPUDevice, T> {
  void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
                  typename TTypes<T>::Flat accum,
                  typename TTypes<T>::ConstScalar lr,
                  typename TTypes<T>::ConstFlat grad, bool update_slots) {
    if (update_slots) {
      accum.device(d) += grad.square();
    }
    var.device(d) -= grad * lr() * accum.rsqrt();
  }
};

template <typename T>
struct ApplyProximalAdagrad<CPUDevice, T> {
  void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
                  typename TTypes<T>::Flat accum,
                  typename TTypes<T>::ConstScalar lr,
                  typename TTypes<T>::ConstScalar l1,
                  typename TTypes<T>::ConstScalar l2,
                  typename TTypes<T>::ConstFlat grad) {
    // FOBOS update per the paper above, with an Adagrad learning rate.
    accum.device(d) += grad.square();
    // Adagrad learning rate.
    auto learning_rate = accum.constant(lr()) * accum.rsqrt();
    auto prox_var = var;
    // compute v = w - lr * grad.
    prox_var.device(d) -= grad * learning_rate;
    if (l1() > 0) {
      // compute sign(v) * max(|v| - lr * l1, 0)
      var.device(d) = prox_var.sign() *
                      (prox_var.abs() - learning_rate * prox_var.constant(l1()))
                          .cwiseMax(T(0.0)) /
                      (var.constant(1.0) + var.constant(l2()) * learning_rate);
    } else {
      var.device(d) =
          prox_var / (var.constant(1.0) + var.constant(l2()) * learning_rate);
    }
  }
};
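// Informally (a sketch derived from the code below, not an authoritative
// specification): both FTRL functors keep a per-coordinate linear term
// `linear` and an accumulator `accum`, and recover the weight in closed form:
//   sigma   = (new_accum^{-lr_power} - accum^{-lr_power}) / lr
//   linear += grad - sigma * var
//   var     = (|linear| > l1)
//               ? (l1 * sign(linear) - linear) /
//                 (new_accum^{-lr_power} / lr + 2 * l2)
//               : 0
// with lr_power == -0.5 special-cased so the power reduces to a sqrt.
// ApplyFtrlV2 additionally adds a 2 * l2_shrinkage * var term to the gradient
// used for the linear update.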
template <typename T>
struct ApplyFtrlV2<CPUDevice, T> {
  void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
                  typename TTypes<T>::Flat accum,
                  typename TTypes<T>::Flat linear,
                  typename TTypes<T>::ConstFlat grad,
                  typename TTypes<T>::ConstScalar lr,
                  typename TTypes<T>::ConstScalar l1,
                  typename TTypes<T>::ConstScalar l2,
                  typename TTypes<T>::ConstScalar l2_shrinkage,
                  typename TTypes<T>::ConstScalar lr_power) {
    auto grad_with_shrinkage = grad + static_cast<T>(2) * l2_shrinkage() * var;
    auto new_accum = accum + grad * grad;
    // special case for which lr_power=-0.5.
    if (lr_power() == static_cast<T>(-0.5)) {
      linear.device(d) +=
          grad_with_shrinkage - (new_accum.sqrt() - accum.sqrt()) / lr() * var;
    } else {
      linear.device(d) +=
          grad_with_shrinkage -
          (new_accum.pow(-lr_power()) - accum.pow(-lr_power())) / lr() * var;
    }
    auto x = (linear.constant(l1()) * linear.sign() - linear);
    if (lr_power() == static_cast<T>(-0.5)) {
      auto y = new_accum.sqrt() / new_accum.constant(lr()) +
               linear.constant(static_cast<T>(2) * l2());
      auto pre_shrink = x / y;
      var.device(d) = (linear.abs() > linear.constant(l1()))
                          .select(pre_shrink, var.constant(static_cast<T>(0)));
    } else {
      auto y = new_accum.pow(-lr_power()) / new_accum.constant(lr()) +
               linear.constant(static_cast<T>(2) * l2());
      auto pre_shrink = x / y;
      var.device(d) = (linear.abs() > linear.constant(l1()))
                          .select(pre_shrink, var.constant(static_cast<T>(0)));
    }
    accum.device(d) += grad * grad;
  }
};

template <typename T>
struct ApplyFtrl<CPUDevice, T> {
  void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
                  typename TTypes<T>::Flat accum,
                  typename TTypes<T>::Flat linear,
                  typename TTypes<T>::ConstFlat grad,
                  typename TTypes<T>::ConstScalar lr,
                  typename TTypes<T>::ConstScalar l1,
                  typename TTypes<T>::ConstScalar l2,
                  typename TTypes<T>::ConstScalar lr_power) {
    auto new_accum = accum + grad.square();
    // special case for which lr_power=-0.5.
    if (lr_power() == static_cast<T>(-0.5)) {
      linear.device(d) += grad - (new_accum.sqrt() - accum.sqrt()) / lr() * var;
    } else {
      linear.device(d) +=
          grad -
          (new_accum.pow(-lr_power()) - accum.pow(-lr_power())) / lr() * var;
    }
    auto x = (linear.constant(l1()) * linear.sign() - linear);
    if (lr_power() == static_cast<T>(-0.5)) {
      auto y = new_accum.sqrt() / new_accum.constant(lr()) +
               linear.constant(static_cast<T>(2) * l2());
      auto pre_shrink = x / y;
      var.device(d) = (linear.abs() > linear.constant(l1()))
                          .select(pre_shrink, var.constant(static_cast<T>(0)));
    } else {
      auto y = new_accum.pow(-lr_power()) / new_accum.constant(lr()) +
               linear.constant(static_cast<T>(2) * l2());
      auto pre_shrink = x / y;
      var.device(d) = (linear.abs() > linear.constant(l1()))
                          .select(pre_shrink, var.constant(static_cast<T>(0)));
    }
    accum.device(d) += grad.square();
  }
};

template <typename T>
struct ApplyMomentum<CPUDevice, T> {
  void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
                  typename TTypes<T>::Flat accum,
                  typename TTypes<T>::ConstScalar lr,
                  typename TTypes<T>::ConstFlat grad,
                  typename TTypes<T>::ConstScalar momentum, bool use_nesterov) {
    accum.device(d) = accum * momentum() + grad;
    if (use_nesterov) {
      var.device(d) -= grad * lr() + accum * momentum() * lr();
    } else {
      var.device(d) -= accum * lr();
    }
  }
};

template <typename Device, typename T>
struct ApplyAdamNonCuda {
  void operator()(const Device& d, typename TTypes<T>::Flat var,
                  typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
                  typename TTypes<T>::ConstScalar beta1_power,
                  typename TTypes<T>::ConstScalar beta2_power,
                  typename TTypes<T>::ConstScalar lr,
                  typename TTypes<T>::ConstScalar beta1,
                  typename TTypes<T>::ConstScalar beta2,
                  typename TTypes<T>::ConstScalar epsilon,
                  typename TTypes<T>::ConstFlat grad, bool use_nesterov) {
    const T alpha = lr() * Eigen::numext::sqrt(T(1) - beta2_power()) /
                    (T(1) - beta1_power());
    // beta1 == μ
    // beta2 == ν
    // v     == n
    // var   == θ
    m.device(d) += (grad - m) * (T(1) - beta1());
    v.device(d) += (grad.square() - v) * (T(1) - beta2());
    if (use_nesterov) {
      var.device(d) -= ((grad * (T(1) - beta1()) + beta1() * m) * alpha) /
                       (v.sqrt() + epsilon());
    } else {
      var.device(d) -= (m * alpha) / (v.sqrt() + epsilon());
    }
  }
};
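// Informally, the Adam recurrences implemented above (and by the SYCL variant
// below) are:
//   alpha = lr * sqrt(1 - beta2_power) / (1 - beta1_power)
//   m    += (grad   - m) * (1 - beta1)
//   v    += (grad^2 - v) * (1 - beta2)
//   var  -= alpha * m / (sqrt(v) + epsilon)
// When use_nesterov is set, the last line instead uses
//   var  -= alpha * (grad * (1 - beta1) + beta1 * m) / (sqrt(v) + epsilon).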
#ifdef TENSORFLOW_USE_SYCL
template <typename T>
struct ApplyAdamSYCL {
  void operator()(const SYCLDevice& d, typename TTypes<T>::Flat var,
                  typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
                  T beta1_power, T beta2_power, T lr, T beta1, T beta2,
                  T epsilon, typename TTypes<T>::ConstFlat grad) {
    const T alpha =
        lr * Eigen::numext::sqrt(T(1) - beta2_power) / (T(1) - beta1_power);
    m.device(d) += (grad - m) * (T(1) - beta1);
    v.device(d) += (grad.square() - v) * (T(1) - beta2);
    var.device(d) -= (m * alpha) / (v.sqrt() + epsilon);
  }
};
#endif  // TENSORFLOW_USE_SYCL

template <typename T>
struct ApplyAdam<CPUDevice, T> : ApplyAdamNonCuda<CPUDevice, T> {};

template <typename Device, typename T>
struct ApplyAdaMaxNonCuda {
  void operator()(const Device& d, typename TTypes<T>::Flat var,
                  typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
                  typename TTypes<T>::ConstScalar beta1_power,
                  typename TTypes<T>::ConstScalar lr,
                  typename TTypes<T>::ConstScalar beta1,
                  typename TTypes<T>::ConstScalar beta2,
                  typename TTypes<T>::ConstScalar epsilon,
                  typename TTypes<T>::ConstFlat grad) {
    m.device(d) += (grad - m) * (T(1) - beta1());
    // Here v is u in section 7.1
    v.device(d) = (beta2() * v).cwiseMax(grad.abs());
    // var is θ in section 7.1
    var.device(d) -= lr() / (T(1) - beta1_power()) * (m / (v + epsilon()));
  }
};

template <typename T>
struct ApplyAdaMax<CPUDevice, T> : ApplyAdaMaxNonCuda<CPUDevice, T> {};

template <typename T>
struct ApplyRMSProp<CPUDevice, T> {
  void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
                  typename TTypes<T>::Flat ms, typename TTypes<T>::Flat mom,
                  typename TTypes<T>::ConstScalar lr,
                  typename TTypes<T>::ConstScalar rho,
                  typename TTypes<T>::ConstScalar momentum,
                  typename TTypes<T>::ConstScalar epsilon,
                  typename TTypes<T>::ConstFlat grad) {
    ms.device(d) += (grad.square() - ms) * (static_cast<T>(1) - rho());
    mom.device(d) =
        mom * momentum() + (grad * lr()) / ((ms + epsilon()).sqrt());
    var.device(d) -= mom;
  }
};

template <typename T>
struct ApplyCenteredRMSProp<CPUDevice, T> {
  void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
                  typename TTypes<T>::Flat mg, typename TTypes<T>::Flat ms,
                  typename TTypes<T>::Flat mom,
                  typename TTypes<T>::ConstScalar lr,
                  typename TTypes<T>::ConstScalar rho,
                  typename TTypes<T>::ConstScalar momentum,
                  typename TTypes<T>::ConstScalar epsilon,
                  typename TTypes<T>::ConstFlat grad) {
    ms.device(d) += (grad.square() - ms) * (static_cast<T>(1) - rho());
    mg.device(d) += (grad - mg) * (static_cast<T>(1) - rho());
    auto denom = (ms - mg.square()) + epsilon();
    mom.device(d) = mom * momentum() + (grad * lr()) / denom.sqrt();
    var.device(d) -= mom;
  }
};

template <typename T>
struct ApplyAddSign<CPUDevice, T> {
  void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
                  typename TTypes<T>::Flat m,
                  typename TTypes<T>::ConstScalar lr,
                  typename TTypes<T>::ConstScalar alpha,
                  typename TTypes<T>::ConstScalar sign_decay,
                  typename TTypes<T>::ConstScalar beta,
                  typename TTypes<T>::ConstFlat grad) {
    m.device(d) = m * beta() + grad * (static_cast<T>(1) - beta());
    auto sign_gm = grad.sign() * m.sign();
    var.device(d) -= lr() * (alpha() + sign_decay() * sign_gm) * grad;
  }
};

template <typename T>
struct ApplyPowerSign<CPUDevice, T> {
  void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
                  typename TTypes<T>::Flat m,
                  typename TTypes<T>::ConstScalar lr,
                  typename TTypes<T>::ConstScalar logbase,
                  typename TTypes<T>::ConstScalar sign_decay,
                  typename TTypes<T>::ConstScalar beta,
                  typename TTypes<T>::ConstFlat grad) {
    m.device(d) = m * beta() + grad * (static_cast<T>(1) - beta());
    auto sign_gm = grad.sign() * m.sign();
    auto grad_scale = (logbase() * sign_decay() * sign_gm).exp();
    var.device(d) -= lr() * grad_scale * grad;
  }
};

}  // namespace functor

template <typename Device, typename T>
class ApplyGradientDescentOp : public OpKernel {
 public:
  explicit ApplyGradientDescentOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
  }

  void Compute(OpKernelContext* ctx) override {
    auto locks =
        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0});
    Tensor var;
    OP_REQUIRES_OK(ctx,
GetInputTensorFromVariable( ctx, 0, use_exclusive_lock_, false, &var)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(0))); const Tensor& alpha = ctx->input(1); OP_REQUIRES(ctx, IsLegacyScalar(alpha.shape()), errors::InvalidArgument("alpha is not a scalar: ", alpha.shape().DebugString())); const Tensor& delta = ctx->input(2); OP_REQUIRES( ctx, var.shape().IsSameSize(delta.shape()), errors::InvalidArgument("var and delta do not have the same shape", var.shape().DebugString(), " ", delta.shape().DebugString())); const Device& device = ctx->template eigen_device(); functor::ApplyGradientDescent()( device, var.flat(), alpha.scalar(), delta.flat()); MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: bool use_exclusive_lock_; }; #ifdef TENSORFLOW_USE_SYCL template class ApplyGradientDescentOp : public OpKernel { public: explicit ApplyGradientDescentOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); } void Compute(OpKernelContext* ctx) override { auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0}); Tensor var; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 0, use_exclusive_lock_, false, &var)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(0))); const Tensor& alpha_dev = ctx->input(1); OP_REQUIRES(ctx, IsLegacyScalar(alpha_dev.shape()), errors::InvalidArgument("alpha is not a scalar: ", alpha_dev.shape().DebugString())); const Tensor& delta = ctx->input(2); OP_REQUIRES( ctx, var.shape().IsSameSize(delta.shape()), errors::InvalidArgument("var and delta do not have the same shape", var.shape().DebugString(), " ", delta.shape().DebugString())); auto device = ctx->eigen_sycl_device(); auto size = sizeof(T); T alpha = T(0); auto src_ptr = GetBase(&alpha_dev); device.memcpyDeviceToHost(&alpha, static_cast(src_ptr), size); functor::ApplyGradientDescentSYCL()(device, var.flat(), alpha, delta.flat()); MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: bool use_exclusive_lock_; }; #endif // TENSORFLOW_USE_SYCL #define REGISTER_KERNELS(D, T) \ REGISTER_KERNEL_BUILDER( \ Name("ApplyGradientDescent").Device(DEVICE_##D).TypeConstraint("T"), \ ApplyGradientDescentOp); \ REGISTER_KERNEL_BUILDER(Name("ResourceApplyGradientDescent") \ .Device(DEVICE_##D) \ .HostMemory("var") \ .TypeConstraint("T"), \ ApplyGradientDescentOp); #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); TF_CALL_half(REGISTER_CPU_KERNELS); TF_CALL_bfloat16(REGISTER_CPU_KERNELS); TF_CALL_float(REGISTER_CPU_KERNELS); TF_CALL_double(REGISTER_CPU_KERNELS); #if GOOGLE_CUDA // Forward declarations of the functor specializations for GPU. 
namespace functor { #define DECLARE_GPU_SPEC(T) \ template <> \ void ApplyGradientDescent::operator()( \ const GPUDevice& d, typename TTypes::Flat var, \ typename TTypes::ConstScalar alpha, \ typename TTypes::ConstFlat delta); \ extern template struct ApplyGradientDescent; DECLARE_GPU_SPEC(Eigen::half); DECLARE_GPU_SPEC(float); DECLARE_GPU_SPEC(double); #undef DECLARE_GPU_SPEC } // namespace functor REGISTER_KERNELS(GPU, Eigen::half); REGISTER_KERNELS(GPU, float); REGISTER_KERNELS(GPU, double); #endif #ifdef TENSORFLOW_USE_SYCL #define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T); TF_CALL_float(REGISTER_SYCL_KERNELS); TF_CALL_double(REGISTER_SYCL_KERNELS); #undef REGISTER_SYCL_KERNELS #endif // TENSORFLOW_USE_SYCL #undef REGISTER_CPU_KERNELS #undef REGISTER_KERNELS template class ApplyAdadeltaOp : public OpKernel { public: explicit ApplyAdadeltaOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); } void Compute(OpKernelContext* ctx) override { Var* resource; mutex* mu = GetTrainingVariableMutex(ctx, 0, &resource); core::ScopedUnref scoped_unref(resource); if (use_exclusive_lock_ && mu != nullptr) { mutex_lock l1(*mu); // Don't try to acquire a lock on the second ref as they share the same // mutex. // // mutex_lock l2(*ctx->input_ref_mutex(1)); DoValidate(ctx); if (!ctx->status().ok()) return; DoCompute(ctx); } else { DoValidate(ctx); if (!ctx->status().ok()) return; DoCompute(ctx); } MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: bool use_exclusive_lock_; void DoValidate(OpKernelContext* ctx) { Tensor var; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 0, use_exclusive_lock_, false, &var)); Tensor accum; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 1, use_exclusive_lock_, false, &accum)); Tensor accum_update; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 2, use_exclusive_lock_, false, &accum_update)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(0))); OP_REQUIRES( ctx, accum.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(1))); OP_REQUIRES( ctx, accum_update.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(2))); const Tensor& lr = ctx->input(3); const Tensor& rho = ctx->input(4); const Tensor& epsilon = ctx->input(5); const Tensor& grad = ctx->input(6); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()), errors::InvalidArgument("lr is not a scalar: ", lr.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(rho.shape()), errors::InvalidArgument("rho is not a scalar: ", rho.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon.shape()), errors::InvalidArgument("epsilon is not a scalar: ", epsilon.shape().DebugString())); OP_REQUIRES( ctx, var.shape().IsSameSize(accum.shape()), errors::InvalidArgument("var and accum do not have the same shape", var.shape().DebugString(), " ", accum.shape().DebugString())); OP_REQUIRES( ctx, var.shape().IsSameSize(grad.shape()), errors::InvalidArgument("var and grad do not have the same shape", var.shape().DebugString(), " ", grad.shape().DebugString())); } void DoCompute(OpKernelContext* ctx) { const Device& device = ctx->template eigen_device(); Tensor var; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 0, use_exclusive_lock_, false, &var)); Tensor accum; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 
1, use_exclusive_lock_, false, &accum)); Tensor accum_update; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 2, use_exclusive_lock_, false, &accum_update)); const Tensor& lr = ctx->input(3); const Tensor& rho = ctx->input(4); const Tensor& epsilon = ctx->input(5); const Tensor& grad = ctx->input(6); functor::ApplyAdadelta()( device, var.flat(), accum.flat(), accum_update.flat(), lr.scalar(), rho.scalar(), epsilon.scalar(), grad.flat()); } }; #define REGISTER_KERNELS(D, T) \ REGISTER_KERNEL_BUILDER( \ Name("ApplyAdadelta").Device(DEVICE_##D).TypeConstraint("T"), \ ApplyAdadeltaOp); \ REGISTER_KERNEL_BUILDER(Name("ResourceApplyAdadelta") \ .Device(DEVICE_##D) \ .HostMemory("var") \ .HostMemory("accum") \ .HostMemory("accum_update") \ .TypeConstraint("T"), \ ApplyAdadeltaOp); #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); TF_CALL_half(REGISTER_CPU_KERNELS); TF_CALL_bfloat16(REGISTER_CPU_KERNELS); TF_CALL_float(REGISTER_CPU_KERNELS); TF_CALL_double(REGISTER_CPU_KERNELS); #if GOOGLE_CUDA // Forward declarations of the functor specializations for GPU. namespace functor { #define DECLARE_GPU_SPEC(T) \ template <> \ void ApplyAdadelta::operator()( \ const GPUDevice& d, typename TTypes::Flat var, \ typename TTypes::Flat accum, typename TTypes::Flat accum_update, \ typename TTypes::ConstScalar lr, typename TTypes::ConstScalar rho, \ typename TTypes::ConstScalar epsilon, \ typename TTypes::ConstFlat grad); \ extern template struct ApplyAdadelta; DECLARE_GPU_SPEC(Eigen::half); DECLARE_GPU_SPEC(float); DECLARE_GPU_SPEC(double); #undef DECLARE_GPU_SPEC } // namespace functor REGISTER_KERNELS(GPU, Eigen::half); REGISTER_KERNELS(GPU, float); REGISTER_KERNELS(GPU, double); #endif #undef REGISTER_CPU_KERNELS #undef REGISTER_KERNELS // Note, this op works on cpu only. template class SparseApplyAdadeltaOp : public OpKernel { public: explicit SparseApplyAdadeltaOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); } void Compute(OpKernelContext* ctx) override { Var* var; mutex* mu = GetTrainingVariableMutex(ctx, 0, &var); core::ScopedUnref scoped_unref(var); // mu_accum is actually the same mutex as mu_var since currently we use a // global mutex. 
// // mutex* mu_accum = ctx->input_ref_mutex(1); if (use_exclusive_lock_ && mu != nullptr) { mutex_lock ml(*mu); DoCompute(ctx); } else { DoCompute(ctx); } } void DoCompute(OpKernelContext* ctx) { Tensor var; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 0, use_exclusive_lock_, true, &var)); Tensor accum_grad; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 1, use_exclusive_lock_, true, &accum_grad)); Tensor accum_update; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 2, use_exclusive_lock_, true, &accum_update)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(0))); OP_REQUIRES( ctx, accum_grad.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(1))); OP_REQUIRES( ctx, accum_update.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(2))); OP_REQUIRES( ctx, var.shape().IsSameSize(accum_grad.shape()), errors::InvalidArgument("var and accum_grad do not have the same shape", var.shape().DebugString(), " ", accum_grad.shape().DebugString())); OP_REQUIRES(ctx, var.shape().IsSameSize(accum_update.shape()), errors::InvalidArgument( "var and accum_update do not have the same shape", var.shape().DebugString(), " ", accum_update.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(var.shape()), errors::InvalidArgument("var must be at least 1 dimensional")); const Tensor& lr = ctx->input(3); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()), errors::InvalidArgument("lr is not a scalar: ", lr.shape().DebugString())); const Tensor& rho = ctx->input(4); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(rho.shape()), errors::InvalidArgument("rho is not a scalar: ", rho.shape().DebugString())); const Tensor& epsilon = ctx->input(5); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon.shape()), errors::InvalidArgument("epsilon is not a scalar: ", epsilon.shape().DebugString())); const Tensor& grad = ctx->input(6); const Tensor& indices = ctx->input(7); OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()), errors::InvalidArgument("indices must be one-dimensional")); for (int d = 1; d < var.dims(); d++) { OP_REQUIRES(ctx, var.dim_size(d) == grad.dim_size(d), errors::InvalidArgument(strings::StrCat( "var and grad must match in dimension ", d))); } const Tindex N = indices.dim_size(0); OP_REQUIRES( ctx, grad.dim_size(0) == N, errors::InvalidArgument( "grad must be the same size as indices in the first dimension.")); if (N > 0) { const Tindex first_dim_size = var.dim_size(0); // Validate all the indices are in range auto indices_vec = indices.vec(); for (Tindex i = 0; i < N; i++) { const Tindex index = indices_vec(i); OP_REQUIRES(ctx, index >= 0 && index < first_dim_size, errors::InvalidArgument( strings::StrCat("Index ", index, " at offset ", i, " in indices is out of range"))); } auto var_flat = var.flat_outer_dims(); auto accum_grad_flat = accum_grad.flat_outer_dims(); auto accum_update_flat = accum_update.flat_outer_dims(); auto grad_flat = grad.flat_outer_dims(); const T lr_scalar = lr.scalar()(); const T rho_scalar = rho.scalar()(); const T epsilon_scalar = epsilon.scalar()(); for (Tindex i = 0; i < N; i++) { const Tindex index = indices_vec(i); auto accum_ = accum_grad_flat.template chip<0>(index); auto accum_update_ = accum_update_flat.template chip<0>(index); auto grad_ = grad_flat.template chip<0>(i); accum_ = accum_ * accum_.constant(rho_scalar) + 
grad_.square() * grad_.constant(T(1) - rho_scalar); const auto update = (accum_update_ + accum_update_.constant(epsilon_scalar)).sqrt() * (accum_ + accum_.constant(epsilon_scalar)).rsqrt() * grad_; auto v = var_flat.template chip<0>(index); v -= update * update.constant(lr_scalar); accum_update_ = accum_update_ * accum_update_.constant(rho_scalar) + update.square() * update.constant(static_cast(1) - rho_scalar); } } MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: bool use_exclusive_lock_; }; #define REGISTER_KERNELS(T, Tindices) \ REGISTER_KERNEL_BUILDER(Name("SparseApplyAdadelta") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ SparseApplyAdadeltaOp); \ REGISTER_KERNEL_BUILDER(Name("ResourceSparseApplyAdadelta") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ SparseApplyAdadeltaOp); #define REGISTER_CPU_KERNELS(T) \ REGISTER_KERNELS(T, int32); \ REGISTER_KERNELS(T, int64); TF_CALL_half(REGISTER_CPU_KERNELS); TF_CALL_bfloat16(REGISTER_CPU_KERNELS); TF_CALL_float(REGISTER_CPU_KERNELS); TF_CALL_double(REGISTER_CPU_KERNELS); #undef REGISTER_CPU_KERNELS #undef REGISTER_KERNELS // Note, this op works on cpu only. template class ApplyProximalGradientDescentOp : public OpKernel { public: explicit ApplyProximalGradientDescentOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); } void Compute(OpKernelContext* ctx) override { auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0}); Tensor var; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 0, use_exclusive_lock_, false, &var)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(0))); const Tensor& alpha = ctx->input(1); OP_REQUIRES(ctx, IsLegacyScalar(alpha.shape()), errors::InvalidArgument("alpha is not a scalar: ", alpha.shape().DebugString())); const Tensor& l1 = ctx->input(2); OP_REQUIRES( ctx, TensorShapeUtils::IsScalar(l1.shape()), errors::InvalidArgument("l1 regularization strength is not a scalar: ", l1.shape().DebugString())); const Tensor& l2 = ctx->input(3); OP_REQUIRES( ctx, TensorShapeUtils::IsScalar(l2.shape()), errors::InvalidArgument("l2 regularization strength is not a scalar: ", l2.shape().DebugString())); const Tensor& delta = ctx->input(4); OP_REQUIRES( ctx, var.shape().IsSameSize(delta.shape()), errors::InvalidArgument("var and delta do not have the same shape", var.shape().DebugString(), " ", delta.shape().DebugString())); const Device& device = ctx->template eigen_device(); functor::ApplyProximalGradientDescent()( device, var.flat(), alpha.scalar(), l1.scalar(), l2.scalar(), delta.flat()); MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: bool use_exclusive_lock_; }; #define REGISTER_KERNELS(D, T) \ REGISTER_KERNEL_BUILDER(Name("ApplyProximalGradientDescent") \ .Device(DEVICE_##D) \ .TypeConstraint("T"), \ ApplyProximalGradientDescentOp); \ REGISTER_KERNEL_BUILDER(Name("ResourceApplyProximalGradientDescent") \ .HostMemory("var") \ .Device(DEVICE_##D) \ .TypeConstraint("T"), \ ApplyProximalGradientDescentOp); REGISTER_KERNELS(CPU, float); REGISTER_KERNELS(CPU, double); #undef REGISTER_KERNELS // Note, this op works on cpu only. 
template class SparseApplyProximalGradientDescentOp : public OpKernel { public: explicit SparseApplyProximalGradientDescentOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); } void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS { auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0}); Tensor var; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 0, use_exclusive_lock_, true, &var)); OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(var.shape()), errors::InvalidArgument("var must be at least 1 dimensional")); const Tensor& lr = ctx->input(1); OP_REQUIRES(ctx, IsLegacyScalar(lr.shape()), errors::InvalidArgument("lr is not a scalar: ", lr.shape().DebugString())); const Tensor& l1 = ctx->input(2); OP_REQUIRES( ctx, TensorShapeUtils::IsScalar(l1.shape()), errors::InvalidArgument("l1 regularization strength is not a scalar: ", l1.shape().DebugString())); const Tensor& l2 = ctx->input(3); OP_REQUIRES( ctx, TensorShapeUtils::IsScalar(l2.shape()), errors::InvalidArgument("l2 regularization strength is not a scalar: ", l2.shape().DebugString())); const Tensor& grad = ctx->input(4); const Tensor& indices = ctx->input(5); OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()), errors::InvalidArgument("indices must be one-dimensional")); int64 inner_dim = 1; for (int d = 1; d < var.dims(); d++) { OP_REQUIRES(ctx, var.dim_size(d) == grad.dim_size(d), errors::InvalidArgument(strings::StrCat( "var and grad must match in dimension ", d))); inner_dim *= grad.dim_size(d); } const Tindex N = indices.dim_size(0); OP_REQUIRES( ctx, grad.dim_size(0) == N, errors::InvalidArgument( "grad must be the same size as indices in the first dimension.")); OP_REQUIRES(ctx, inner_dim > 0, errors::InvalidArgument( "Inner dimension should be greater than zero.")); if (N > 0) { if (inner_dim > 1) { const Tindex first_dim_size = var.dim_size(0); auto indices_vec = indices.vec(); auto var_flat = var.flat_outer_dims(); auto grad_flat = grad.flat_outer_dims(); T lr_scalar = lr.scalar()(); T l1_scalar = l1.scalar()(); T l2_scalar = l2.scalar()(); // TODO(xbing): extract the common logic for the Fobos update. for (Tindex i = 0; i < N; i++) { const Tindex index = internal::SubtleMustCopy(indices_vec(i)); OP_REQUIRES(ctx, FastBoundsCheck(index, first_dim_size), errors::InvalidArgument( strings::StrCat("Index ", index, " at offset ", i, " in indices is out of range"))); auto g = grad_flat.template chip<0>(i); auto v = var_flat.template chip<0>(index); // compute learning_rate for current step. auto learning_rate = v.constant(lr_scalar); auto prox_v = v; // v = w - g * learning_rate. 
prox_v -= g * learning_rate; if (l1_scalar > 0) { // compute sign(v) * max(|v|, 0) v = prox_v.sign() * (prox_v.abs() - learning_rate * prox_v.constant(l1_scalar)) .cwiseMax(static_cast(0.0)) / (v.constant(1.0) + v.constant(l2_scalar) * learning_rate); } else { v = prox_v / (v.constant(1.0) + v.constant(l2_scalar) * learning_rate); } } } else { auto indices_vec = indices.vec(); auto var_flat = var.flat(); auto grad_flat = grad.flat(); T lr_scalar = lr.scalar()(); T l1_scalar = l1.scalar()(); T l2_scalar = l2.scalar()(); const Tindex first_dim_size = var_flat.size(); for (Tindex i = 0; i < N; i++) { const Tindex index = internal::SubtleMustCopy(indices_vec(i)); OP_REQUIRES(ctx, FastBoundsCheck(index, first_dim_size), errors::InvalidArgument( strings::StrCat("Index ", index, " at offset ", i, " in indices is out of range"))); const T& g = grad_flat(i); auto learning_rate = lr_scalar; auto prox_v = var_flat(index); prox_v -= learning_rate * g; if (l1_scalar > 0) { var_flat(index) = sgn(prox_v) * std::max(std::abs(prox_v) - learning_rate * l1_scalar, static_cast(0.0)) / (1.0 + l2_scalar * learning_rate); } else { var_flat(index) = prox_v / (1.0 + l2_scalar * learning_rate); } } } } MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: bool use_exclusive_lock_; }; #define REGISTER_KERNELS(T, Tindices) \ REGISTER_KERNEL_BUILDER(Name("SparseApplyProximalGradientDescent") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ SparseApplyProximalGradientDescentOp); \ REGISTER_KERNEL_BUILDER(Name("ResourceSparseApplyProximalGradientDescent") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ SparseApplyProximalGradientDescentOp); REGISTER_KERNELS(float, int32); REGISTER_KERNELS(float, int64); REGISTER_KERNELS(double, int32); REGISTER_KERNELS(double, int64); #undef REGISTER_KERNELS template class ApplyAdagradOp : public OpKernel { public: explicit ApplyAdagradOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("update_slots", &update_slots_)); } void Compute(OpKernelContext* ctx) override { auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1}); Tensor var; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 0, use_exclusive_lock_, false, &var)); Tensor accum; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 1, use_exclusive_lock_, false, &accum)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(0))); OP_REQUIRES( ctx, accum.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(1))); const Tensor& lr = ctx->input(2); OP_REQUIRES(ctx, IsLegacyScalar(lr.shape()), errors::InvalidArgument("lr is not a scalar: ", lr.shape().DebugString())); const Tensor& grad = ctx->input(3); OP_REQUIRES( ctx, var.shape().IsSameSize(accum.shape()), errors::InvalidArgument("var and accum do not have the same shape", var.shape().DebugString(), " ", accum.shape().DebugString())); OP_REQUIRES( ctx, var.shape().IsSameSize(grad.shape()), errors::InvalidArgument("var and grad do not have the same shape", var.shape().DebugString(), " ", grad.shape().DebugString())); const Device& device = ctx->template eigen_device(); functor::ApplyAdagrad()(device, var.flat(), accum.flat(), lr.scalar(), grad.flat(), update_slots_); MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: bool use_exclusive_lock_; bool update_slots_; 
}; #define REGISTER_KERNELS(D, T) \ REGISTER_KERNEL_BUILDER( \ Name("ApplyAdagrad").Device(DEVICE_##D).TypeConstraint("T"), \ ApplyAdagradOp); \ REGISTER_KERNEL_BUILDER(Name("ResourceApplyAdagrad") \ .HostMemory("var") \ .HostMemory("accum") \ .Device(DEVICE_##D) \ .TypeConstraint("T"), \ ApplyAdagradOp); #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); TF_CALL_half(REGISTER_CPU_KERNELS); TF_CALL_bfloat16(REGISTER_CPU_KERNELS); TF_CALL_float(REGISTER_CPU_KERNELS); TF_CALL_double(REGISTER_CPU_KERNELS); #if GOOGLE_CUDA // Forward declarations of the functor specializations for GPU. namespace functor { #define DECLARE_GPU_SPEC(T) \ template <> \ void ApplyAdagrad::operator()( \ const GPUDevice& d, typename TTypes::Flat var, \ typename TTypes::Flat accum, typename TTypes::ConstScalar lr, \ typename TTypes::ConstFlat grad, bool update_slots); \ extern template struct ApplyAdagrad; DECLARE_GPU_SPEC(Eigen::half); DECLARE_GPU_SPEC(float); DECLARE_GPU_SPEC(double); #undef DECLARE_GPU_SPEC } // namespace functor REGISTER_KERNELS(GPU, Eigen::half); REGISTER_KERNELS(GPU, float); REGISTER_KERNELS(GPU, double); #endif #undef REGISTER_CPU_KERNELS #undef REGISTER_KERNELS template class ApplyProximalAdagradOp : public OpKernel { public: explicit ApplyProximalAdagradOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); } void Compute(OpKernelContext* ctx) override { auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1}); Tensor var; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 0, use_exclusive_lock_, false, &var)); Tensor accum; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 1, use_exclusive_lock_, false, &accum)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(0))); OP_REQUIRES( ctx, accum.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(1))); OP_REQUIRES( ctx, var.shape().IsSameSize(accum.shape()), errors::InvalidArgument("var and accum do not have the same shape", var.shape().DebugString(), " ", accum.shape().DebugString())); const Tensor& lr = ctx->input(2); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()) && lr.scalar()() > static_cast(0), errors::InvalidArgument("lr is not a positive scalar: ", lr.shape().DebugString())); const Tensor& l1 = ctx->input(3); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(l1.shape()) && l1.scalar()() >= static_cast(0), errors::InvalidArgument("l1 regularization strength is not a " "non-negative scalar: ", l1.shape().DebugString())); const Tensor& l2 = ctx->input(4); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(l2.shape()) && l2.scalar()() >= static_cast(0), errors::InvalidArgument("l2 regularization strength is not a " "non-negative scalar: ", l2.shape().DebugString())); const Tensor& grad = ctx->input(5); OP_REQUIRES( ctx, var.shape().IsSameSize(grad.shape()), errors::InvalidArgument("var and grad do not have the same shape", var.shape().DebugString(), " ", grad.shape().DebugString())); const Device& device = ctx->template eigen_device(); functor::ApplyProximalAdagrad()( device, var.flat(), accum.flat(), lr.scalar(), l1.scalar(), l2.scalar(), grad.flat()); MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: bool use_exclusive_lock_; }; #define REGISTER_KERNELS(D, T) \ REGISTER_KERNEL_BUILDER( \ Name("ApplyProximalAdagrad").Device(DEVICE_##D).TypeConstraint("T"), \ ApplyProximalAdagradOp); \ 
REGISTER_KERNEL_BUILDER(Name("ResourceApplyProximalAdagrad") \ .Device(DEVICE_##D) \ .HostMemory("var") \ .HostMemory("accum") \ .TypeConstraint("T"), \ ApplyProximalAdagradOp); REGISTER_KERNELS(CPU, float); REGISTER_KERNELS(CPU, double); #undef REGISTER_KERNELS namespace { template inline T FtrlCompute(const T& accum, const T& linear, const T& lr, const T& l1, const T& l2, const T& lr_power) { T quadratic; if (lr_power == static_cast(-0.5)) { quadratic = Eigen::numext::sqrt(accum) / lr + static_cast(2) * l2; } else { quadratic = Eigen::numext::pow(accum, -lr_power) / lr + static_cast(2) * l2; } auto l1_reg_adjust = std::max(std::min(linear, l1), -l1); return (l1_reg_adjust - linear) / quadratic; } } // namespace // Note, this op works on cpu only. template class SparseApplyAdagradOp : public OpKernel { public: explicit SparseApplyAdagradOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("update_slots", &update_slots_)); } void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS { auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1}); Tensor var; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 0, use_exclusive_lock_, true, &var)); Tensor accum; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 1, use_exclusive_lock_, true, &accum)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(0))); OP_REQUIRES( ctx, accum.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(1))); OP_REQUIRES( ctx, var.shape().IsSameSize(accum.shape()), errors::InvalidArgument("var and accum do not have the same shape", var.shape().DebugString(), " ", accum.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(var.shape()), errors::InvalidArgument("var must be at least 1 dimensional")); const Tensor& lr = ctx->input(2); OP_REQUIRES(ctx, IsLegacyScalar(lr.shape()), errors::InvalidArgument("lr is not a scalar: ", lr.shape().DebugString())); const Tensor& grad = ctx->input(3); const Tensor& indices = ctx->input(4); OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()), errors::InvalidArgument("indices must be one-dimensional")); int64 inner_dim = 1; for (int d = 1; d < var.dims(); d++) { OP_REQUIRES(ctx, var.dim_size(d) == grad.dim_size(d), errors::InvalidArgument(strings::StrCat( "var and grad must match in dimension ", d))); inner_dim *= grad.dim_size(d); } const Tindex N = indices.dim_size(0); OP_REQUIRES( ctx, grad.dim_size(0) == N, errors::InvalidArgument( "grad must be the same size as indices in the first dimension.")); OP_REQUIRES(ctx, inner_dim > 0, errors::InvalidArgument( "Inner dimension should be greater than zero.")); if (N > 0) { if (inner_dim > 1) { const Tindex first_dim_size = var.dim_size(0); auto indices_vec = indices.vec(); auto var_flat = var.flat_outer_dims(); auto accum_flat = accum.flat_outer_dims(); auto grad_flat = grad.flat_outer_dims(); T lr_scalar = lr.scalar()(); // Note(yonghui): It might be worth multi-threading square() and // rsqrt(). 
for (Tindex i = 0; i < N; i++) { const Tindex index = internal::SubtleMustCopy(indices_vec(i)); OP_REQUIRES(ctx, FastBoundsCheck(index, first_dim_size), errors::InvalidArgument( strings::StrCat("Index ", index, " at offset ", i, " in indices is out of range"))); auto a = accum_flat.template chip<0>(index); auto g = grad_flat.template chip<0>(i); auto v = var_flat.template chip<0>(index); if (update_slots_) { a += g.square(); } v -= g.constant(lr_scalar) * g * a.rsqrt(); } } else { auto indices_vec = indices.vec(); auto var_flat = var.flat(); auto accum_flat = accum.flat(); auto grad_flat = grad.flat(); T lr_scalar = lr.scalar()(); const Tindex first_dim_size = accum_flat.size(); for (Tindex i = 0; i < N; i++) { const Tindex index = internal::SubtleMustCopy(indices_vec(i)); OP_REQUIRES(ctx, FastBoundsCheck(index, first_dim_size), errors::InvalidArgument( strings::StrCat("Index ", index, " at offset ", i, " in indices is out of range"))); T& a = accum_flat(index); const T& g = grad_flat(i); if (update_slots_) { a += g * g; } var_flat(index) -= lr_scalar * g / Eigen::numext::sqrt(a); } } } MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: bool use_exclusive_lock_; bool update_slots_; }; #define REGISTER_KERNELS(T, Tindices) \ REGISTER_KERNEL_BUILDER(Name("SparseApplyAdagrad") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ SparseApplyAdagradOp); \ REGISTER_KERNEL_BUILDER(Name("ResourceSparseApplyAdagrad") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ SparseApplyAdagradOp); #define REGISTER_CPU_KERNELS(T) \ REGISTER_KERNELS(T, int32); \ REGISTER_KERNELS(T, int64); TF_CALL_half(REGISTER_CPU_KERNELS); TF_CALL_bfloat16(REGISTER_CPU_KERNELS); TF_CALL_float(REGISTER_CPU_KERNELS); TF_CALL_double(REGISTER_CPU_KERNELS); #undef REGISTER_CPU_KERNELS #undef REGISTER_KERNELS // Note, this op works on cpu only. 
template class SparseApplyProximalAdagradOp : public OpKernel { public: explicit SparseApplyProximalAdagradOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); } void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS { auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1}); Tensor var; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 0, use_exclusive_lock_, true, &var)); Tensor accum; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 1, use_exclusive_lock_, true, &accum)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(0))); OP_REQUIRES( ctx, accum.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(1))); OP_REQUIRES( ctx, var.shape().IsSameSize(accum.shape()), errors::InvalidArgument("var and accum do not have the same shape", var.shape().DebugString(), " ", accum.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(var.shape()), errors::InvalidArgument("var must be at least 1 dimensional")); const Tensor& lr = ctx->input(2); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()) && lr.scalar()() > static_cast(0), errors::InvalidArgument("lr is not a positive scalar: ", lr.shape().DebugString())); const Tensor& l1 = ctx->input(3); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(l1.shape()) && l1.scalar()() >= static_cast(0), errors::InvalidArgument("l1 regularization strength is not a " "non-negative scalar: ", l1.shape().DebugString())); const Tensor& l2 = ctx->input(4); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(l2.shape()) && l2.scalar()() >= static_cast(0), errors::InvalidArgument("l2 regularization strength is not a " "non-negative scalar: ", l2.shape().DebugString())); const Tensor& grad = ctx->input(5); const Tensor& indices = ctx->input(6); OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()), errors::InvalidArgument("indices must be one-dimensional")); int64 inner_dim = 1; for (int d = 1; d < var.dims(); d++) { OP_REQUIRES(ctx, var.dim_size(d) == grad.dim_size(d), errors::InvalidArgument(strings::StrCat( "var and grad must match in dimension ", d))); inner_dim *= grad.dim_size(d); } const Tindex N = indices.dim_size(0); OP_REQUIRES( ctx, grad.dim_size(0) == N, errors::InvalidArgument( "grad must be the same size as indices in the first dimension.")); OP_REQUIRES(ctx, inner_dim > 0, errors::InvalidArgument( "Inner dimension should be greater than zero.")); if (N > 0) { if (inner_dim > 1) { const Tindex first_dim_size = var.dim_size(0); auto indices_vec = indices.vec(); auto var_flat = var.flat_outer_dims(); auto accum_flat = accum.flat_outer_dims(); auto grad_flat = grad.flat_outer_dims(); T lr_scalar = lr.scalar()(); T l1_scalar = l1.scalar()(); T l2_scalar = l2.scalar()(); for (Tindex i = 0; i < N; i++) { const Tindex index = internal::SubtleMustCopy(indices_vec(i)); OP_REQUIRES(ctx, FastBoundsCheck(index, first_dim_size), errors::InvalidArgument( strings::StrCat("Index ", index, " at offset ", i, " in indices is out of range"))); auto a = accum_flat.template chip<0>(index); auto g = grad_flat.template chip<0>(i); auto v = var_flat.template chip<0>(index); a += g.square(); // compute learning_rate for current step. auto learning_rate = a.constant(lr_scalar) * a.rsqrt(); auto prox_v = v; // v = w - g * learning_rate. 
prox_v -= g * learning_rate; if (l1_scalar > 0) { // compute sign(v) * max(|v|, 0) v = prox_v.sign() * (prox_v.abs() - learning_rate * prox_v.constant(l1_scalar)) .cwiseMax(static_cast(0.0)) / (v.constant(1.0) + v.constant(l2_scalar) * learning_rate); } else { v = prox_v / (v.constant(1.0) + v.constant(l2_scalar) * learning_rate); } } } else { auto indices_vec = indices.vec(); auto var_flat = var.flat(); auto accum_flat = accum.flat(); auto grad_flat = grad.flat(); T lr_scalar = lr.scalar()(); T l1_scalar = l1.scalar()(); T l2_scalar = l2.scalar()(); const Tindex first_dim_size = accum_flat.size(); for (Tindex i = 0; i < N; i++) { const Tindex index = internal::SubtleMustCopy(indices_vec(i)); OP_REQUIRES(ctx, FastBoundsCheck(index, first_dim_size), errors::InvalidArgument( strings::StrCat("Index ", index, " at offset ", i, " in indices is out of range"))); T& a = accum_flat(index); const T& g = grad_flat(i); a += g * g; auto learning_rate = lr_scalar / std::sqrt(a); auto prox_v = var_flat(index); prox_v -= learning_rate * g; if (l1_scalar > 0) { var_flat(index) = sgn(prox_v) * std::max(std::abs(prox_v) - learning_rate * l1_scalar, static_cast(0.0)) / (1.0 + l2_scalar * learning_rate); } else { var_flat(index) = prox_v / (1.0 + l2_scalar * learning_rate); } } } } MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: bool use_exclusive_lock_; }; #define REGISTER_KERNELS(T, Tindices) \ REGISTER_KERNEL_BUILDER(Name("SparseApplyProximalAdagrad") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ SparseApplyProximalAdagradOp); \ REGISTER_KERNEL_BUILDER(Name("ResourceSparseApplyProximalAdagrad") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ SparseApplyProximalAdagradOp); REGISTER_KERNELS(float, int32); REGISTER_KERNELS(float, int64); REGISTER_KERNELS(double, int32); REGISTER_KERNELS(double, int64); #undef REGISTER_KERNELS template class ApplyAdagradDAOp : public OpKernel { public: explicit ApplyAdagradDAOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); } void Compute(OpKernelContext* ctx) override { auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2}); Tensor var; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 0, use_exclusive_lock_, false, &var)); Tensor gradient_accum; OP_REQUIRES_OK( ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_, false, &gradient_accum)); Tensor gradient_squared_accum; OP_REQUIRES_OK( ctx, GetInputTensorFromVariable( ctx, 2, use_exclusive_lock_, false, &gradient_squared_accum)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(0))); OP_REQUIRES( ctx, gradient_accum.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(1))); OP_REQUIRES( ctx, gradient_squared_accum.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(2))); OP_REQUIRES( ctx, var.shape().IsSameSize(gradient_accum.shape()), errors::InvalidArgument("var and accum do not have the same shape", var.shape().DebugString(), " ", gradient_accum.shape().DebugString())); OP_REQUIRES( ctx, var.shape().IsSameSize(gradient_squared_accum.shape()), errors::InvalidArgument("var and accum do not have the same shape", var.shape().DebugString(), " ", gradient_squared_accum.shape().DebugString())); const Tensor& grad = ctx->input(3); OP_REQUIRES( ctx, 
var.shape().IsSameSize(grad.shape()), errors::InvalidArgument("var and grad do not have the same shape", var.shape().DebugString(), " ", grad.shape().DebugString())); const Tensor& lr = ctx->input(4); OP_REQUIRES(ctx, IsLegacyScalar(lr.shape()), errors::InvalidArgument("lr is not a scalar: ", lr.shape().DebugString())); const Tensor& l1 = ctx->input(5); OP_REQUIRES( ctx, TensorShapeUtils::IsScalar(l1.shape()), errors::InvalidArgument("l1 regularization strength is not a scalar: ", l1.shape().DebugString())); const Tensor& l2 = ctx->input(6); OP_REQUIRES( ctx, TensorShapeUtils::IsScalar(l2.shape()), errors::InvalidArgument("l2 regularization strength is not a scalar: ", l2.shape().DebugString())); const Tensor& global_step = ctx->input(7); OP_REQUIRES(ctx, IsLegacyScalar(global_step.shape()), errors::InvalidArgument("global_step is not a scalar: ", global_step.shape().DebugString())); const Device& device = ctx->template eigen_device(); functor::ApplyAdagradDA()( device, var.flat(), gradient_accum.flat(), gradient_squared_accum.flat(), lr.scalar(), global_step.scalar()(), l1.scalar(), l2.scalar(), grad.flat()); MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: bool use_exclusive_lock_; }; #define REGISTER_KERNELS(D, T) \ REGISTER_KERNEL_BUILDER( \ Name("ApplyAdagradDA").Device(DEVICE_##D).TypeConstraint("T"), \ ApplyAdagradDAOp); \ REGISTER_KERNEL_BUILDER(Name("ResourceApplyAdagradDA") \ .Device(DEVICE_##D) \ .HostMemory("var") \ .HostMemory("gradient_accumulator") \ .HostMemory("gradient_squared_accumulator") \ .TypeConstraint("T"), \ ApplyAdagradDAOp); REGISTER_KERNELS(CPU, float); REGISTER_KERNELS(CPU, double); #undef REGISTER_KERNELS // Note, this op works on cpu only. template class SparseApplyAdagradDAOp : public OpKernel { public: explicit SparseApplyAdagradDAOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); } void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS { auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2}); Tensor var; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 0, use_exclusive_lock_, true, &var)); Tensor gradient_accum; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 1, use_exclusive_lock_, true, &gradient_accum)); Tensor gradient_squared_accum; OP_REQUIRES_OK( ctx, GetInputTensorFromVariable( ctx, 2, use_exclusive_lock_, true, &gradient_squared_accum)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(0))); OP_REQUIRES( ctx, gradient_accum.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(1))); OP_REQUIRES( ctx, gradient_squared_accum.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(2))); OP_REQUIRES( ctx, var.shape().IsSameSize(gradient_accum.shape()), errors::InvalidArgument("var and accum do not have the same shape", var.shape().DebugString(), " ", gradient_accum.shape().DebugString())); OP_REQUIRES( ctx, var.shape().IsSameSize(gradient_squared_accum.shape()), errors::InvalidArgument("var and accum do not have the same shape", var.shape().DebugString(), " ", gradient_squared_accum.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(var.shape()), errors::InvalidArgument("var must be at least 1 dimensional")); const Tensor& grad = ctx->input(3); const Tensor& indices = ctx->input(4); OP_REQUIRES(ctx, 
TensorShapeUtils::IsVector(indices.shape()), errors::InvalidArgument("indices must be one-dimensional")); const Tensor& lr = ctx->input(5); OP_REQUIRES(ctx, IsLegacyScalar(lr.shape()), errors::InvalidArgument("lr is not a scalar: ", lr.shape().DebugString())); const Tensor& l1 = ctx->input(6); OP_REQUIRES( ctx, TensorShapeUtils::IsScalar(l1.shape()), errors::InvalidArgument("l1 regularization strength is not a scalar: ", l1.shape().DebugString())); const Tensor& l2 = ctx->input(7); OP_REQUIRES( ctx, TensorShapeUtils::IsScalar(l2.shape()), errors::InvalidArgument("l2 regularization strength is not a scalar: ", l2.shape().DebugString())); const Tensor& global_step = ctx->input(8); OP_REQUIRES(ctx, IsLegacyScalar(global_step.shape()), errors::InvalidArgument("global_step is not a scalar: ", global_step.shape().DebugString())); int64 inner_dim = 1; for (int d = 1; d < var.dims(); d++) { OP_REQUIRES(ctx, var.dim_size(d) == grad.dim_size(d), errors::InvalidArgument(strings::StrCat( "var and grad must match in dimension ", d))); inner_dim *= grad.dim_size(d); } const Tindex N = indices.dim_size(0); OP_REQUIRES( ctx, grad.dim_size(0) == N, errors::InvalidArgument( "grad must be the same size as indices in the first dimension.")); OP_REQUIRES(ctx, inner_dim > 0, errors::InvalidArgument( "Inner dimension should be greater than zero.")); // AdagradDA update: // Let g to be gradient accumulator, gg to be gradient squared accumulator, // T be the global step, lr is the learning rate, and k the initial // gradient squared accumulator value. // w = \dfrac{sign(-g)*lr*|g - l1*T|_{+}}{l2*T*lr + \sqrt{k+gg})} if (N > 0) { if (inner_dim > 1) { const Tindex first_dim_size = var.dim_size(0); auto indices_vec = indices.vec(); auto var_flat = var.flat_outer_dims(); auto gradient_accum_flat = gradient_accum.flat_outer_dims(); auto gradient_squared_accum_flat = gradient_squared_accum.flat_outer_dims(); auto grad_flat = grad.flat_outer_dims(); T lr_scalar = lr.scalar()(); T global_step_scalar = global_step.scalar()(); T l1_scalar = l1.scalar()(); T l2_scalar = l2.scalar()(); const double gs_lr = global_step_scalar * lr_scalar; for (Tindex i = 0; i < N; i++) { const Tindex index = internal::SubtleMustCopy(indices_vec(i)); OP_REQUIRES(ctx, FastBoundsCheck(index, first_dim_size), errors::InvalidArgument( strings::StrCat("Index ", index, " at offset ", i, " in indices is out of range"))); auto ga = gradient_accum_flat.template chip<0>(index); auto da = gradient_squared_accum_flat.template chip<0>(index); auto g = grad_flat.template chip<0>(i); auto v = var_flat.template chip<0>(index); ga += g; da += g.square(); if (l1_scalar > 0) { v = ga.constant(-1.0) * ga.sign() * ((ga.abs() / ga.constant(global_step_scalar)) - ga.constant(l1_scalar)) .cwiseMax(static_cast(0.0)) / (v.constant(l2_scalar) + da.sqrt() / v.constant(gs_lr)); } else { v = ga.constant(-1.0) * (ga / ga.constant(global_step_scalar)) / (v.constant(l2_scalar) + da.sqrt() / v.constant(gs_lr)); } } } else { auto indices_vec = indices.vec(); auto var_flat = var.flat(); auto gradient_accum_flat = gradient_accum.flat(); auto gradient_squared_accum_flat = gradient_squared_accum.flat(); auto grad_flat = grad.flat(); const double lr_scalar = lr.scalar()(); const int64 global_step_scalar = global_step.scalar()(); const double l1_scalar = l1.scalar()(); const double l2_scalar = l2.scalar()(); const Tindex first_dim_size = var_flat.size(); const double gs_l1 = global_step_scalar * l1_scalar; const double gs_l2_lr = global_step_scalar * l2_scalar * lr_scalar; for (Tindex 
i = 0; i < N; i++) { const Tindex index = internal::SubtleMustCopy(indices_vec(i)); OP_REQUIRES(ctx, FastBoundsCheck(index, first_dim_size), errors::InvalidArgument( strings::StrCat("Index ", index, " at offset ", i, " in indices is out of range"))); T& ga = gradient_accum_flat(index); T& da = gradient_squared_accum_flat(index); const double g = grad_flat(i); ga += g; da += g * g; if (l1_scalar > 0) { var_flat(index) = sgn(-ga) * lr_scalar * std::max((std::abs(ga) - gs_l1), 0.0) / (gs_l2_lr + std::sqrt(da)); } else { var_flat(index) = (-ga * lr_scalar) / (gs_l2_lr + std::sqrt(da)); } } } } MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: bool use_exclusive_lock_; }; #define REGISTER_KERNELS(T, Tindices) \ REGISTER_KERNEL_BUILDER(Name("SparseApplyAdagradDA") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ SparseApplyAdagradDAOp); \ REGISTER_KERNEL_BUILDER(Name("ResourceSparseApplyAdagradDA") \ .Device(DEVICE_CPU) \ .HostMemory("var") \ .HostMemory("gradient_accumulator") \ .HostMemory("gradient_squared_accumulator") \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ SparseApplyAdagradDAOp); REGISTER_KERNELS(float, int32); REGISTER_KERNELS(float, int64); REGISTER_KERNELS(double, int32); REGISTER_KERNELS(double, int64); #undef REGISTER_KERNELS template class ApplyFtrlOp : public OpKernel { public: explicit ApplyFtrlOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); } void Compute(OpKernelContext* ctx) override { auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2}); Tensor var; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 0, use_exclusive_lock_, false, &var)); Tensor accum; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 1, use_exclusive_lock_, false, &accum)); Tensor linear; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 2, use_exclusive_lock_, false, &linear)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(0))); OP_REQUIRES( ctx, accum.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(1))); OP_REQUIRES( ctx, linear.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(2))); const Tensor& grad = ctx->input(3); OP_REQUIRES( ctx, var.shape().IsSameSize(accum.shape()), errors::InvalidArgument("var and accum do not have the same shape", var.shape().DebugString(), " ", accum.shape().DebugString())); OP_REQUIRES( ctx, var.shape().IsSameSize(linear.shape()), errors::InvalidArgument("var and linear do not have the same shape", var.shape().DebugString(), " ", linear.shape().DebugString())); OP_REQUIRES( ctx, var.shape().IsSameSize(grad.shape()), errors::InvalidArgument("var and grad do not have the same shape", var.shape().DebugString(), " ", grad.shape().DebugString())); const Tensor& lr = ctx->input(4); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()) && lr.scalar()() > static_cast(0), errors::InvalidArgument("lr is not a positive scalar: ", lr.shape().DebugString())); const Tensor& l1 = ctx->input(5); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(l1.shape()) && l1.scalar()() >= static_cast(0), errors::InvalidArgument("l1 regularization strength is not a " "non-negative scalar: ", l1.shape().DebugString())); const Tensor& l2 = ctx->input(6); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(l2.shape()) && l2.scalar()() >= static_cast(0), 
errors::InvalidArgument("l2 regularization strength is not a " "non-negative scalar: ", l2.shape().DebugString())); const int lr_power_index = has_l2_shrinkage ? 8 : 7; const Tensor& lr_power = ctx->input(lr_power_index); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr_power.shape()) && lr_power.scalar()() <= static_cast(0), errors::InvalidArgument("lr_power is not a" " non-positive scalar: ", lr_power.shape().DebugString())); const Device& device = ctx->template eigen_device(); if (has_l2_shrinkage) { const Tensor& l2_shrinkage = ctx->input(7); OP_REQUIRES( ctx, TensorShapeUtils::IsScalar(l2_shrinkage.shape()) && l2_shrinkage.scalar()() >= static_cast(0), errors::InvalidArgument("l2 shrinkage regularization strength " "is not a non-negative scalar: ", l2_shrinkage.shape().DebugString())); functor::ApplyFtrlV2()( device, var.flat(), accum.flat(), linear.flat(), grad.flat(), lr.scalar(), l1.scalar(), l2.scalar(), l2_shrinkage.scalar(), lr_power.scalar()); } else { functor::ApplyFtrl()(device, var.flat(), accum.flat(), linear.flat(), grad.flat(), lr.scalar(), l1.scalar(), l2.scalar(), lr_power.scalar()); } MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: bool use_exclusive_lock_; }; #define REGISTER_KERNELS(D, T) \ REGISTER_KERNEL_BUILDER( \ Name("ApplyFtrl").Device(DEVICE_##D).TypeConstraint("T"), \ ApplyFtrlOp); \ REGISTER_KERNEL_BUILDER( \ Name("ResourceApplyFtrl") \ .HostMemory("var") \ .HostMemory("accum") \ .HostMemory("linear") \ .Device(DEVICE_##D) \ .TypeConstraint("T"), \ ApplyFtrlOp); #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); TF_CALL_half(REGISTER_CPU_KERNELS); TF_CALL_bfloat16(REGISTER_CPU_KERNELS); TF_CALL_float(REGISTER_CPU_KERNELS); TF_CALL_double(REGISTER_CPU_KERNELS); #undef REGISTER_CPU_KERNELS #undef REGISTER_KERNELS #define REGISTER_KERNELS(D, T) \ REGISTER_KERNEL_BUILDER( \ Name("ApplyFtrlV2").Device(DEVICE_##D).TypeConstraint("T"), \ ApplyFtrlOp); \ REGISTER_KERNEL_BUILDER( \ Name("ResourceApplyFtrlV2") \ .HostMemory("var") \ .HostMemory("accum") \ .HostMemory("linear") \ .Device(DEVICE_##D) \ .TypeConstraint("T"), \ ApplyFtrlOp); #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); TF_CALL_half(REGISTER_CPU_KERNELS); TF_CALL_bfloat16(REGISTER_CPU_KERNELS); TF_CALL_float(REGISTER_CPU_KERNELS); TF_CALL_double(REGISTER_CPU_KERNELS); #undef REGISTER_CPU_KERNELS #undef REGISTER_KERNELS // Note, this op works on cpu only. 
template class SparseApplyFtrlOp : public OpKernel { public: explicit SparseApplyFtrlOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); } void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS { auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2}); Tensor var; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 0, use_exclusive_lock_, true, &var)); Tensor accum; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 1, use_exclusive_lock_, true, &accum)); Tensor linear; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 2, use_exclusive_lock_, true, &linear)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(0))); OP_REQUIRES( ctx, accum.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(1))); OP_REQUIRES( ctx, linear.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(2))); OP_REQUIRES( ctx, var.shape().IsSameSize(accum.shape()), errors::InvalidArgument("var and accum do not have the same shape", var.shape().DebugString(), " ", accum.shape().DebugString())); OP_REQUIRES( ctx, var.shape().IsSameSize(linear.shape()), errors::InvalidArgument("var and linear do not have the same shape", var.shape().DebugString(), " ", linear.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(var.shape()), errors::InvalidArgument("var must be at least 1 dimensional")); const Tensor& grad = ctx->input(3); const Tensor& indices = ctx->input(4); OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()), errors::InvalidArgument("indices must be one-dimensional")); const Tensor& lr = ctx->input(5); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()) && lr.scalar()() > static_cast(0), errors::InvalidArgument("lr is not a positive scalar: ", lr.shape().DebugString())); const Tensor& l1 = ctx->input(6); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(l1.shape()) && l1.scalar()() >= static_cast(0), errors::InvalidArgument("l1 regularization strength is not a " "non-negative scalar: ", l1.shape().DebugString())); const Tensor& l2 = ctx->input(7); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(l2.shape()) && l2.scalar()() >= static_cast(0), errors::InvalidArgument("l2 regularization strength is not a " "non-negative scalar: ", l2.shape().DebugString())); const int lr_power_index = has_l2_shrinkage ? 
9 : 8; const Tensor& lr_power = ctx->input(lr_power_index); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr_power.shape()) && lr_power.scalar()() <= static_cast(0), errors::InvalidArgument("lr_power is not a " "non-positive scalar: ", lr_power.shape().DebugString())); int64 inner_dim = 1; for (int d = 1; d < var.dims(); d++) { OP_REQUIRES(ctx, var.dim_size(d) == grad.dim_size(d), errors::InvalidArgument(strings::StrCat( "var and grad must match in dimension ", d))); inner_dim *= grad.dim_size(d); } const Tindex N = indices.dim_size(0); OP_REQUIRES( ctx, grad.dim_size(0) == N, errors::InvalidArgument( "grad must be the same size as indices in the first dimension.")); OP_REQUIRES(ctx, inner_dim > 0, errors::InvalidArgument( "Inner dimension should be greater than zero.")); const Tensor* l2_shrinkage; if (has_l2_shrinkage) { l2_shrinkage = &ctx->input(8); OP_REQUIRES( ctx, TensorShapeUtils::IsScalar(l2_shrinkage->shape()) && l2_shrinkage->scalar()() >= static_cast(0), errors::InvalidArgument("l2 shrinkage regularization strength " "is not a non-negative scalar: ", l2_shrinkage->shape().DebugString())); } if (N > 0) { if (inner_dim > 1) { const Tindex first_dim_size = var.dim_size(0); auto indices_vec = indices.vec(); auto var_flat = var.flat_outer_dims(); auto accum_flat = accum.flat_outer_dims(); auto linear_flat = linear.flat_outer_dims(); auto grad_flat = grad.flat_outer_dims(); T lr_scalar = lr.scalar()(); T l1_scalar = l1.scalar()(); T l2_scalar = l2.scalar()(); T l2_shrinkage_scalar; if (has_l2_shrinkage) { l2_shrinkage_scalar = l2_shrinkage->scalar()(); } T lr_power_scalar = lr_power.scalar()(); for (Tindex i = 0; i < N; i++) { const Tindex index = internal::SubtleMustCopy(indices_vec(i)); OP_REQUIRES(ctx, FastBoundsCheck(index, first_dim_size), errors::InvalidArgument( strings::StrCat("Index ", index, " at offset ", i, " in indices is out of range"))); auto accum = accum_flat.template chip<0>(index); auto linear = linear_flat.template chip<0>(index); auto grad = grad_flat.template chip<0>(i); auto var = var_flat.template chip<0>(index); // Use a macro to implement the computation here due to the templating of the // eigen tensor library. 
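          // Spelled out in scalar form (g is the raw gradient, g_shrink the
          // shrinkage-adjusted gradient when l2_shrinkage is present), the
          // macro below performs the FTRL-proximal step
          //   new_accum = accum + g^2
          //   linear   += g_shrink
          //               - (new_accum^{-lr_power} - accum^{-lr_power}) / lr * var
          //   quadratic = new_accum^{-lr_power} / lr + 2 * l2
          //   var       = (clip(linear, -l1, l1) - linear) / quadratic
          //   accum    += g^2
          // with the lr_power == -0.5 branches replacing x^{-lr_power} by
          // sqrt(x).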
#define COMPUTE_FTRL(grad, grad_maybe_with_shrinkage) \ auto new_accum = accum + grad.square(); \ if (lr_power_scalar == static_cast(-0.5)) { \ linear += grad_maybe_with_shrinkage - \ (new_accum.sqrt() - accum.sqrt()) / lr_scalar * var; \ } else { \ linear += grad_maybe_with_shrinkage - (new_accum.pow(-lr_power_scalar) - \ accum.pow(-lr_power_scalar)) / \ lr_scalar * var; \ } \ auto l1_reg_adjust = linear.cwiseMin(l1_scalar).cwiseMax(-l1_scalar); \ auto x = l1_reg_adjust - linear; \ if (lr_power_scalar == static_cast(-0.5)) { \ auto y = new_accum.sqrt() / new_accum.constant(lr_scalar) + \ linear.constant(static_cast(2) * l2_scalar); \ var = x / y; \ } else { \ auto y = new_accum.pow(-lr_power_scalar) / new_accum.constant(lr_scalar) + \ linear.constant(static_cast(2) * l2_scalar); \ var = x / y; \ } \ accum += grad.square(); if (has_l2_shrinkage) { auto grad_with_shrinkage = grad + static_cast(2) * l2_shrinkage_scalar * var; COMPUTE_FTRL(grad, grad_with_shrinkage); } else { COMPUTE_FTRL(grad, grad); } } #undef COMPUTE_FTRL } else { T lr_scalar = lr.scalar()(); T l1_scalar = l1.scalar()(); T l2_scalar = l2.scalar()(); T lr_power_scalar = lr_power.scalar()(); T l2_shrinkage_scalar; if (has_l2_shrinkage) { l2_shrinkage_scalar = l2_shrinkage->scalar()(); } auto indices_vec = indices.vec(); auto var_flat = var.flat(); auto accum_flat = accum.flat(); auto linear_flat = linear.flat(); auto grad_flat = grad.flat(); const Tindex first_dim_size = accum_flat.size(); for (Tindex i = 0; i < N; i++) { const Tindex index = internal::SubtleMustCopy(indices_vec(i)); OP_REQUIRES(ctx, FastBoundsCheck(index, first_dim_size), errors::InvalidArgument( strings::StrCat("Index ", index, " at offset ", i, " in indices is out of range"))); T& a = accum_flat(index); T& l = linear_flat(index); T& v = var_flat(index); T g; if (has_l2_shrinkage) { g = grad_flat(i) + (static_cast(2) * l2_shrinkage_scalar * var_flat(index)); } else { g = grad_flat(i); } T updated_a = a + grad_flat(i) * grad_flat(i); using Eigen::numext::pow; T sigma = pow(updated_a, -lr_power_scalar) - pow(a, -lr_power_scalar); sigma /= lr_scalar; T updated_l = l + g - sigma * v; v = FtrlCompute(updated_a, updated_l, lr_scalar, l1_scalar, l2_scalar, lr_power_scalar); a = updated_a; l = updated_l; } } } MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: bool use_exclusive_lock_; }; #define REGISTER_KERNELS(T, Tindices) \ REGISTER_KERNEL_BUILDER( \ Name("SparseApplyFtrl") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ SparseApplyFtrlOp); \ REGISTER_KERNEL_BUILDER( \ Name("ResourceSparseApplyFtrl") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ SparseApplyFtrlOp); #define REGISTER_CPU_KERNELS(T) \ REGISTER_KERNELS(T, int32); \ REGISTER_KERNELS(T, int64); TF_CALL_half(REGISTER_CPU_KERNELS); TF_CALL_bfloat16(REGISTER_CPU_KERNELS); TF_CALL_float(REGISTER_CPU_KERNELS); TF_CALL_double(REGISTER_CPU_KERNELS); #undef REGISTER_CPU_KERNELS #undef REGISTER_KERNELS #define REGISTER_KERNELS(T, Tindices) \ REGISTER_KERNEL_BUILDER( \ Name("SparseApplyFtrlV2") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ SparseApplyFtrlOp); \ REGISTER_KERNEL_BUILDER( \ Name("ResourceSparseApplyFtrlV2") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ SparseApplyFtrlOp); #define REGISTER_CPU_KERNELS(T) \ REGISTER_KERNELS(T, int32); \ REGISTER_KERNELS(T, int64); TF_CALL_half(REGISTER_CPU_KERNELS); TF_CALL_bfloat16(REGISTER_CPU_KERNELS); 
TF_CALL_float(REGISTER_CPU_KERNELS); TF_CALL_double(REGISTER_CPU_KERNELS); #undef REGISTER_CPU_KERNELS #undef REGISTER_KERNELS template class ApplyMomentumOp : public OpKernel { public: explicit ApplyMomentumOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("use_nesterov", &use_nesterov_)); } void Compute(OpKernelContext* ctx) override { auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1}); Tensor var; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 0, use_exclusive_lock_, false, &var)); Tensor accum; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 1, use_exclusive_lock_, false, &accum)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(0))); OP_REQUIRES( ctx, accum.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(1))); const Tensor& lr = ctx->input(2); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()), errors::InvalidArgument("lr is not a scalar: ", lr.shape().DebugString())); const Tensor& grad = ctx->input(3); OP_REQUIRES( ctx, var.shape().IsSameSize(accum.shape()), errors::InvalidArgument("var and accum do not have the same shape", var.shape().DebugString(), " ", accum.shape().DebugString())); OP_REQUIRES( ctx, var.shape().IsSameSize(grad.shape()), errors::InvalidArgument("var and grad do not have the same shape", var.shape().DebugString(), " ", grad.shape().DebugString())); const Tensor& momentum = ctx->input(4); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(momentum.shape()), errors::InvalidArgument("momentum is not a scalar: ", momentum.shape().DebugString())); const Device& device = ctx->template eigen_device(); functor::ApplyMomentum()(device, var.flat(), accum.flat(), lr.scalar(), grad.flat(), momentum.scalar(), use_nesterov_); MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: bool use_exclusive_lock_; bool use_nesterov_; }; #define REGISTER_KERNELS(D, T) \ REGISTER_KERNEL_BUILDER( \ Name("ApplyMomentum").Device(DEVICE_##D).TypeConstraint("T"), \ ApplyMomentumOp); \ REGISTER_KERNEL_BUILDER(Name("ResourceApplyMomentum") \ .Device(DEVICE_##D) \ .HostMemory("var") \ .HostMemory("accum") \ .TypeConstraint("T"), \ ApplyMomentumOp); #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); TF_CALL_half(REGISTER_CPU_KERNELS); TF_CALL_bfloat16(REGISTER_CPU_KERNELS); TF_CALL_float(REGISTER_CPU_KERNELS); TF_CALL_double(REGISTER_CPU_KERNELS); #if GOOGLE_CUDA // Forward declarations of the functor specializations for GPU. namespace functor { #define DECLARE_GPU_SPEC(T) \ template <> \ void ApplyMomentum::operator()( \ const GPUDevice& d, typename TTypes::Flat var, \ typename TTypes::Flat accum, typename TTypes::ConstScalar lr, \ typename TTypes::ConstFlat grad, \ typename TTypes::ConstScalar momentum, bool use_nesterov); \ extern template struct ApplyMomentum; DECLARE_GPU_SPEC(Eigen::half); DECLARE_GPU_SPEC(float); DECLARE_GPU_SPEC(double); #undef DECLARE_GPU_SPEC } // namespace functor REGISTER_KERNELS(GPU, Eigen::half); REGISTER_KERNELS(GPU, float); REGISTER_KERNELS(GPU, double); #endif #undef REGISTER_CPU_KERNELS #undef REGISTER_KERNELS // Note, this op works on cpu only. 
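// A minimal scalar sketch of the momentum update that the ApplyMomentum
// functor above and the sparse kernel below apply; the helper is illustrative
// only (a hypothetical name, not used by any kernel in this file).
namespace {
template <typename T>
void MomentumUpdateSketch(T& var, T& accum, const T lr, const T grad,
                          const T momentum, const bool use_nesterov) {
  // accum <- momentum * accum + g
  accum = accum * momentum + grad;
  if (use_nesterov) {
    // Nesterov: step along the fresh gradient plus the look-ahead momentum.
    var -= grad * lr + accum * momentum * lr;
  } else {
    var -= accum * lr;
  }
}
}  // namespace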
template class SparseApplyMomentumOp : public OpKernel { public: explicit SparseApplyMomentumOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("use_nesterov", &use_nesterov_)); } void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS { auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1}); Tensor var; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 0, use_exclusive_lock_, true, &var)); Tensor accum; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 1, use_exclusive_lock_, true, &accum)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(0))); OP_REQUIRES( ctx, accum.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(1))); OP_REQUIRES( ctx, var.shape().IsSameSize(accum.shape()), errors::InvalidArgument("var and accum do not have the same shape", var.shape().DebugString(), " ", accum.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(var.shape()), errors::InvalidArgument("var must be at least 1 dimensional")); const Tensor& lr = ctx->input(2); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()), errors::InvalidArgument("lr is not a scalar : ", lr.shape().DebugString())); const Tensor& grad = ctx->input(3); const Tensor& indices = ctx->input(4); OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()), errors::InvalidArgument("indices must be one-dimensional")); for (int d = 1; d < var.dims(); d++) { OP_REQUIRES(ctx, var.dim_size(d) == grad.dim_size(d), errors::InvalidArgument(strings::StrCat( "var and grad must match in dimension ", d))); } const Tindex N = indices.dim_size(0); OP_REQUIRES( ctx, grad.dim_size(0) == N, errors::InvalidArgument( "grad must be the same size as indices in the first dimension.")); const Tensor& momentum = ctx->input(5); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(momentum.shape()), errors::InvalidArgument("momentum is not a scalar: ", momentum.shape().DebugString())); if (N > 0) { const Tindex first_dim_size = var.dim_size(0); auto indices_vec = indices.vec(); auto var_flat = var.flat_outer_dims(); auto accum_flat = accum.flat_outer_dims(); auto grad_flat = grad.flat_outer_dims(); T lr_scalar = lr.scalar()(); T momentum_scalar = momentum.scalar()(); for (Tindex i = 0; i < N; i++) { const Tindex index = internal::SubtleMustCopy(indices_vec(i)); OP_REQUIRES(ctx, FastBoundsCheck(index, first_dim_size), errors::InvalidArgument( strings::StrCat("Index ", index, " at offset ", i, " in indices is out of range"))); auto a = accum_flat.template chip<0>(index); auto g = grad_flat.template chip<0>(i); auto v = var_flat.template chip<0>(index); a = a * a.constant(momentum_scalar) + g; if (use_nesterov_) { v -= g.constant(lr_scalar) * g + a.constant(lr_scalar) * a.constant(momentum_scalar) * a; } else { v -= a.constant(lr_scalar) * a; } } } MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: bool use_exclusive_lock_; bool use_nesterov_; }; #define REGISTER_KERNELS(T, Tindices) \ REGISTER_KERNEL_BUILDER(Name("SparseApplyMomentum") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ SparseApplyMomentumOp); \ REGISTER_KERNEL_BUILDER(Name("ResourceSparseApplyMomentum") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ SparseApplyMomentumOp); #define REGISTER_CPU_KERNELS(T) \ REGISTER_KERNELS(T, int32); \ 
REGISTER_KERNELS(T, int64); TF_CALL_half(REGISTER_CPU_KERNELS); TF_CALL_bfloat16(REGISTER_CPU_KERNELS); TF_CALL_float(REGISTER_CPU_KERNELS); TF_CALL_double(REGISTER_CPU_KERNELS); #undef REGISTER_CPU_KERNELS #undef REGISTER_KERNELS template class ApplyAdamOp : public OpKernel { public: explicit ApplyAdamOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("use_nesterov", &use_nesterov_)); } void Compute(OpKernelContext* ctx) override { auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2}); Tensor var; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 0, use_exclusive_lock_, false, &var)); Tensor m; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 1, use_exclusive_lock_, false, &m)); Tensor v; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 2, use_exclusive_lock_, false, &v)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(0))); OP_REQUIRES( ctx, m.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(1))); OP_REQUIRES( ctx, v.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(2))); const Tensor& beta1_power = ctx->input(3); const Tensor& beta2_power = ctx->input(4); const Tensor& lr = ctx->input(5); const Tensor& beta1 = ctx->input(6); const Tensor& beta2 = ctx->input(7); const Tensor& epsilon = ctx->input(8); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1_power.shape()), errors::InvalidArgument("beta1_power is not a scalar: ", beta1_power.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2_power.shape()), errors::InvalidArgument("beta2_power is not a scalar: ", beta2_power.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()), errors::InvalidArgument("lr is not a scalar : ", lr.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1.shape()), errors::InvalidArgument("beta1 is not a scalar: ", beta1.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2.shape()), errors::InvalidArgument("beta2 is not a scalar: ", beta2.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon.shape()), errors::InvalidArgument("epsilon is not a scalar: ", epsilon.shape().DebugString())); const Tensor& grad = ctx->input(9); OP_REQUIRES(ctx, var.shape().IsSameSize(m.shape()), errors::InvalidArgument("var and m do not have the same shape", var.shape().DebugString(), " ", m.shape().DebugString())); OP_REQUIRES(ctx, var.shape().IsSameSize(v.shape()), errors::InvalidArgument("var and v do not have the same shape", var.shape().DebugString(), " ", v.shape().DebugString())); OP_REQUIRES( ctx, var.shape().IsSameSize(grad.shape()), errors::InvalidArgument("var and grad do not have the same shape", var.shape().DebugString(), " ", grad.shape().DebugString())); const Device& device = ctx->template eigen_device(); functor::ApplyAdam()( device, var.flat(), m.flat(), v.flat(), beta1_power.scalar(), beta2_power.scalar(), lr.scalar(), beta1.scalar(), beta2.scalar(), epsilon.scalar(), grad.flat(), use_nesterov_); MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: bool use_exclusive_lock_; bool use_nesterov_; }; #ifdef TENSORFLOW_USE_SYCL template class ApplyAdamOp : public OpKernel { public: explicit ApplyAdamOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, 
ctx->GetAttr("use_locking", &use_exclusive_lock_)); } void Compute(OpKernelContext* ctx) override { auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2}); Tensor var; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 0, use_exclusive_lock_, false, &var)); Tensor m; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 1, use_exclusive_lock_, false, &m)); Tensor v; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 2, use_exclusive_lock_, false, &v)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(0))); OP_REQUIRES( ctx, m.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(1))); OP_REQUIRES( ctx, v.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(2))); const Tensor& beta1_power_dev = ctx->input(3); const Tensor& beta2_power_dev = ctx->input(4); const Tensor& lr_dev = ctx->input(5); const Tensor& beta1_dev = ctx->input(6); const Tensor& beta2_dev = ctx->input(7); const Tensor& epsilon_dev = ctx->input(8); T beta1_power = 0; T beta2_power = 0; T lr = 0; T beta1 = 0; T beta2 = 0; T epsilon = 0; auto device = ctx->eigen_sycl_device(); auto size = sizeof(T); auto src_ptr = GetBase(&beta1_power_dev); device.memcpyDeviceToHost(&beta1_power, static_cast(src_ptr), size); src_ptr = GetBase(&beta2_power_dev); device.memcpyDeviceToHost(&beta2_power, static_cast(src_ptr), size); src_ptr = GetBase(&lr_dev); device.memcpyDeviceToHost(&lr, static_cast(src_ptr), size); src_ptr = GetBase(&beta1_dev); device.memcpyDeviceToHost(&beta1, static_cast(src_ptr), size); src_ptr = GetBase(&beta2_dev); device.memcpyDeviceToHost(&beta2, static_cast(src_ptr), size); src_ptr = GetBase(&epsilon_dev); device.memcpyDeviceToHost(&epsilon, static_cast(src_ptr), size); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1_power_dev.shape()), errors::InvalidArgument("beta1_power is not a scalar: ", beta1_power_dev.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2_power_dev.shape()), errors::InvalidArgument("beta2_power is not a scalar: ", beta2_power_dev.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr_dev.shape()), errors::InvalidArgument("lr is not a scalar : ", lr_dev.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1_dev.shape()), errors::InvalidArgument("beta1 is not a scalar: ", beta1_dev.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2_dev.shape()), errors::InvalidArgument("beta2 is not a scalar: ", beta2_dev.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon_dev.shape()), errors::InvalidArgument("epsilon is not a scalar: ", epsilon_dev.shape().DebugString())); const Tensor& grad = ctx->input(9); OP_REQUIRES(ctx, var.shape().IsSameSize(m.shape()), errors::InvalidArgument("var and m do not have the same shape", var.shape().DebugString(), " ", m.shape().DebugString())); OP_REQUIRES(ctx, var.shape().IsSameSize(v.shape()), errors::InvalidArgument("var and v do not have the same shape", var.shape().DebugString(), " ", v.shape().DebugString())); OP_REQUIRES( ctx, var.shape().IsSameSize(grad.shape()), errors::InvalidArgument("var and grad do not have the same shape", var.shape().DebugString(), " ", grad.shape().DebugString())); functor::ApplyAdamSYCL()(device, var.flat(), m.flat(), v.flat(), beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad.flat()); 
MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: bool use_exclusive_lock_; }; #endif // TENSORFLOW_USE_SYCL #define REGISTER_KERNELS(D, T) \ REGISTER_KERNEL_BUILDER( \ Name("ApplyAdam").Device(DEVICE_##D).TypeConstraint("T"), \ ApplyAdamOp); \ REGISTER_KERNEL_BUILDER(Name("ResourceApplyAdam") \ .HostMemory("var") \ .HostMemory("m") \ .HostMemory("v") \ .Device(DEVICE_##D) \ .TypeConstraint("T"), \ ApplyAdamOp); #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); TF_CALL_half(REGISTER_CPU_KERNELS); TF_CALL_bfloat16(REGISTER_CPU_KERNELS); TF_CALL_float(REGISTER_CPU_KERNELS); TF_CALL_double(REGISTER_CPU_KERNELS); #ifdef TENSORFLOW_USE_SYCL #define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T); TF_CALL_float(REGISTER_SYCL_KERNELS); TF_CALL_double(REGISTER_SYCL_KERNELS); #endif #if GOOGLE_CUDA // Forward declarations of the functor specializations for GPU. namespace functor { #define DECLARE_GPU_SPEC(T) \ template <> \ void ApplyAdam::operator()( \ const GPUDevice& d, typename TTypes::Flat var, \ typename TTypes::Flat m, typename TTypes::Flat v, \ typename TTypes::ConstScalar beta1_power, \ typename TTypes::ConstScalar beta2_power, \ typename TTypes::ConstScalar lr, \ typename TTypes::ConstScalar beta1, \ typename TTypes::ConstScalar beta2, \ typename TTypes::ConstScalar epsilon, \ typename TTypes::ConstFlat grad, bool use_nesterov); \ extern template struct ApplyAdam; DECLARE_GPU_SPEC(Eigen::half); DECLARE_GPU_SPEC(float); DECLARE_GPU_SPEC(double); #undef DECLARE_GPU_SPEC } // namespace functor REGISTER_KERNELS(GPU, Eigen::half); REGISTER_KERNELS(GPU, float); REGISTER_KERNELS(GPU, double); #endif #undef REGISTER_CPU_KERNELS #undef REGISTER_KERNELS template class ApplyAdaMaxOp : public OpKernel { public: explicit ApplyAdaMaxOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); } void Compute(OpKernelContext* ctx) override { auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2}); Tensor var; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 0, use_exclusive_lock_, false, &var)); Tensor m; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 1, use_exclusive_lock_, false, &m)); Tensor v; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 2, use_exclusive_lock_, false, &v)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(0))); OP_REQUIRES( ctx, m.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(1))); OP_REQUIRES( ctx, v.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(2))); const Tensor& beta1_power = ctx->input(3); const Tensor& lr = ctx->input(4); const Tensor& beta1 = ctx->input(5); const Tensor& beta2 = ctx->input(6); const Tensor& epsilon = ctx->input(7); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1_power.shape()), errors::InvalidArgument("beta1_power is not a scalar: ", beta1_power.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()), errors::InvalidArgument("lr is not a scalar : ", lr.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1.shape()), errors::InvalidArgument("beta1 is not a scalar: ", beta1.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2.shape()), errors::InvalidArgument("beta2 is not a scalar: ", beta2.shape().DebugString())); OP_REQUIRES(ctx, 
TensorShapeUtils::IsScalar(epsilon.shape()), errors::InvalidArgument("epsilon is not a scalar: ", epsilon.shape().DebugString())); const Tensor& grad = ctx->input(8); OP_REQUIRES(ctx, var.shape().IsSameSize(m.shape()), errors::InvalidArgument("var and m do not have the same shape", var.shape().DebugString(), " ", m.shape().DebugString())); OP_REQUIRES(ctx, var.shape().IsSameSize(v.shape()), errors::InvalidArgument("var and v do not have the same shape", var.shape().DebugString(), " ", v.shape().DebugString())); OP_REQUIRES( ctx, var.shape().IsSameSize(grad.shape()), errors::InvalidArgument("var and grad do not have the same shape", var.shape().DebugString(), " ", grad.shape().DebugString())); const Device& device = ctx->template eigen_device(); functor::ApplyAdaMax()( device, var.flat(), m.flat(), v.flat(), beta1_power.scalar(), lr.scalar(), beta1.scalar(), beta2.scalar(), epsilon.scalar(), grad.flat()); MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: bool use_exclusive_lock_; }; #define REGISTER_KERNELS(D, T) \ REGISTER_KERNEL_BUILDER( \ Name("ApplyAdaMax").Device(DEVICE_##D).TypeConstraint("T"), \ ApplyAdaMaxOp); \ REGISTER_KERNEL_BUILDER(Name("ResourceApplyAdaMax") \ .HostMemory("var") \ .HostMemory("m") \ .HostMemory("v") \ .Device(DEVICE_##D) \ .TypeConstraint("T"), \ ApplyAdaMaxOp); #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); TF_CALL_half(REGISTER_CPU_KERNELS); TF_CALL_float(REGISTER_CPU_KERNELS); TF_CALL_double(REGISTER_CPU_KERNELS); #if GOOGLE_CUDA // Forward declarations of the functor specializations for GPU. namespace functor { #define DECLARE_GPU_SPEC(T) \ template <> \ void ApplyAdaMax::operator()( \ const GPUDevice& d, typename TTypes::Flat var, \ typename TTypes::Flat m, typename TTypes::Flat v, \ typename TTypes::ConstScalar beta1_power, \ typename TTypes::ConstScalar lr, \ typename TTypes::ConstScalar beta1, \ typename TTypes::ConstScalar beta2, \ typename TTypes::ConstScalar epsilon, \ typename TTypes::ConstFlat grad); \ extern template struct ApplyAdaMax; DECLARE_GPU_SPEC(Eigen::half); DECLARE_GPU_SPEC(float); DECLARE_GPU_SPEC(double); #undef DECLARE_GPU_SPEC } // namespace functor REGISTER_KERNELS(GPU, Eigen::half); REGISTER_KERNELS(GPU, float); REGISTER_KERNELS(GPU, double); #endif #undef REGISTER_CPU_KERNELS #undef REGISTER_KERNELS template class ApplyRMSPropOp : public OpKernel { public: explicit ApplyRMSPropOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); } void Compute(OpKernelContext* ctx) override { auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2}); Tensor var; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 0, use_exclusive_lock_, false, &var)); Tensor ms; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 1, use_exclusive_lock_, false, &ms)); Tensor mom; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 2, use_exclusive_lock_, false, &mom)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(0))); OP_REQUIRES( ctx, ms.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(1))); OP_REQUIRES( ctx, mom.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(2))); const Tensor& lr = ctx->input(3); const Tensor& rho = ctx->input(4); const Tensor& momentum = ctx->input(5); const Tensor& epsilon = ctx->input(6); const Tensor& grad = ctx->input(7); 
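    // Written per element (and consistent with the sparse kernel further
    // down), the RMSProp update applied by the functor below is
    //   ms  = rho * ms + (1 - rho) * g^2
    //   mom = momentum * mom + lr * g / sqrt(ms + epsilon)
    //   var -= mom
    // The centered variant additionally tracks mg = rho * mg + (1 - rho) * g
    // and uses sqrt(ms - mg^2 + epsilon) in the denominator.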
OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()), errors::InvalidArgument("lr is not a scalar : ", lr.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(rho.shape()), errors::InvalidArgument("rho is not a scalar: ", rho.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(momentum.shape()), errors::InvalidArgument("momentum is not a scalar: ", momentum.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon.shape()), errors::InvalidArgument("epsilon is not a scalar: ", epsilon.shape().DebugString())); OP_REQUIRES(ctx, var.shape().IsSameSize(ms.shape()), errors::InvalidArgument("var and ms do not have the same shape", var.shape().DebugString(), " ", ms.shape().DebugString())); OP_REQUIRES(ctx, var.shape().IsSameSize(mom.shape()), errors::InvalidArgument( "var and mom do not have the same shape", var.shape().DebugString(), " ", mom.shape().DebugString())); OP_REQUIRES( ctx, var.shape().IsSameSize(grad.shape()), errors::InvalidArgument("var and grad do not have the same shape", var.shape().DebugString(), " ", grad.shape().DebugString())); const Device& device = ctx->template eigen_device(); functor::ApplyRMSProp()(device, var.flat(), ms.flat(), mom.flat(), lr.scalar(), rho.scalar(), momentum.scalar(), epsilon.scalar(), grad.flat()); MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: bool use_exclusive_lock_; }; template class ApplyCenteredRMSPropOp : public OpKernel { public: explicit ApplyCenteredRMSPropOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); } void Compute(OpKernelContext* ctx) override { auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2, 3}); Tensor var; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 0, use_exclusive_lock_, false, &var)); Tensor mg; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 1, use_exclusive_lock_, false, &mg)); Tensor ms; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 2, use_exclusive_lock_, false, &ms)); Tensor mom; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 3, use_exclusive_lock_, false, &mom)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(0))); OP_REQUIRES( ctx, mg.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(1))); OP_REQUIRES( ctx, ms.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(2))); OP_REQUIRES( ctx, mom.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(3))); const Tensor& lr = ctx->input(4); const Tensor& rho = ctx->input(5); const Tensor& momentum = ctx->input(6); const Tensor& epsilon = ctx->input(7); const Tensor& grad = ctx->input(8); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()), errors::InvalidArgument("lr is not a scalar : ", lr.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(rho.shape()), errors::InvalidArgument("rho is not a scalar: ", rho.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(momentum.shape()), errors::InvalidArgument("momentum is not a scalar: ", momentum.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon.shape()), errors::InvalidArgument("epsilon is not a scalar: ", epsilon.shape().DebugString())); OP_REQUIRES(ctx, var.shape().IsSameSize(mg.shape()), errors::InvalidArgument("var and mg do not 
have the same shape", var.shape().DebugString(), " ", ms.shape().DebugString())); OP_REQUIRES(ctx, var.shape().IsSameSize(ms.shape()), errors::InvalidArgument("var and ms do not have the same shape", var.shape().DebugString(), " ", ms.shape().DebugString())); OP_REQUIRES(ctx, var.shape().IsSameSize(mom.shape()), errors::InvalidArgument( "var and mom do not have the same shape", var.shape().DebugString(), " ", mom.shape().DebugString())); OP_REQUIRES( ctx, var.shape().IsSameSize(grad.shape()), errors::InvalidArgument("var and grad do not have the same shape", var.shape().DebugString(), " ", grad.shape().DebugString())); const Device& device = ctx->template eigen_device(); functor::ApplyCenteredRMSProp()( device, var.flat(), mg.flat(), ms.flat(), mom.flat(), lr.scalar(), rho.scalar(), momentum.scalar(), epsilon.scalar(), grad.flat()); MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: bool use_exclusive_lock_; }; #define REGISTER_KERNELS(D, T) \ REGISTER_KERNEL_BUILDER( \ Name("ApplyRMSProp").Device(DEVICE_##D).TypeConstraint("T"), \ ApplyRMSPropOp); \ REGISTER_KERNEL_BUILDER( \ Name("ApplyCenteredRMSProp").Device(DEVICE_##D).TypeConstraint("T"), \ ApplyCenteredRMSPropOp); \ REGISTER_KERNEL_BUILDER(Name("ResourceApplyRMSProp") \ .Device(DEVICE_##D) \ .HostMemory("var") \ .HostMemory("ms") \ .HostMemory("mom") \ .TypeConstraint("T"), \ ApplyRMSPropOp); \ REGISTER_KERNEL_BUILDER(Name("ResourceApplyCenteredRMSProp") \ .Device(DEVICE_##D) \ .HostMemory("var") \ .HostMemory("mg") \ .HostMemory("ms") \ .HostMemory("mom") \ .TypeConstraint("T"), \ ApplyCenteredRMSPropOp); #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); TF_CALL_half(REGISTER_CPU_KERNELS); TF_CALL_bfloat16(REGISTER_CPU_KERNELS); TF_CALL_float(REGISTER_CPU_KERNELS); TF_CALL_double(REGISTER_CPU_KERNELS); #if GOOGLE_CUDA // Forward declarations of the functor specializations for GPU. namespace functor { #define DECLARE_GPU_SPEC(T) \ template <> \ void ApplyRMSProp::operator()( \ const GPUDevice& d, typename TTypes::Flat var, \ typename TTypes::Flat ms, typename TTypes::Flat mom, \ typename TTypes::ConstScalar lr, typename TTypes::ConstScalar rho, \ typename TTypes::ConstScalar momentum, \ typename TTypes::ConstScalar epsilon, \ typename TTypes::ConstFlat grad); \ extern template struct ApplyRMSProp; \ template <> \ void ApplyCenteredRMSProp::operator()( \ const GPUDevice& d, typename TTypes::Flat var, \ typename TTypes::Flat mg, typename TTypes::Flat ms, \ typename TTypes::Flat mom, typename TTypes::ConstScalar lr, \ typename TTypes::ConstScalar rho, \ typename TTypes::ConstScalar momentum, \ typename TTypes::ConstScalar epsilon, \ typename TTypes::ConstFlat grad); \ extern template struct ApplyCenteredRMSProp; DECLARE_GPU_SPEC(Eigen::half); DECLARE_GPU_SPEC(float); DECLARE_GPU_SPEC(double); #undef DECLARE_GPU_SPEC } // namespace functor REGISTER_KERNELS(GPU, Eigen::half); REGISTER_KERNELS(GPU, float); REGISTER_KERNELS(GPU, double); #endif #undef REGISTER_CPU_KERNELS #undef REGISTER_KERNELS // Note, this op works on cpu only. 
template class SparseApplyRMSPropOp : public OpKernel { public: explicit SparseApplyRMSPropOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); } void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS { auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2}); Tensor var; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 0, use_exclusive_lock_, true, &var)); Tensor ms; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 1, use_exclusive_lock_, true, &ms)); Tensor mom; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 2, use_exclusive_lock_, true, &mom)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(0))); OP_REQUIRES( ctx, ms.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(1))); OP_REQUIRES( ctx, mom.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(2))); const Tensor& lr = ctx->input(3); const Tensor& rho = ctx->input(4); const Tensor& momentum = ctx->input(5); const Tensor& epsilon = ctx->input(6); const Tensor& grad = ctx->input(7); const Tensor& indices = ctx->input(8); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()), errors::InvalidArgument("lr is not a scalar: ", lr.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(rho.shape()), errors::InvalidArgument("rho is not a scalar: ", rho.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(momentum.shape()), errors::InvalidArgument("momentum is not a scalar: ", momentum.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon.shape()), errors::InvalidArgument("epsilon is not a scalar: ", epsilon.shape().DebugString())); OP_REQUIRES(ctx, var.shape().IsSameSize(ms.shape()), errors::InvalidArgument("var and ms do not have the same shape", var.shape().DebugString(), " ", ms.shape().DebugString())); OP_REQUIRES(ctx, var.shape().IsSameSize(mom.shape()), errors::InvalidArgument( "var and mom do not have the same shape", var.shape().DebugString(), " ", mom.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(var.shape()), errors::InvalidArgument("var must be at least 1 dimensional")); OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()), errors::InvalidArgument("indices must be one-dimensional")); for (int d = 1; d < var.dims(); d++) { OP_REQUIRES( ctx, var.dim_size(d) == grad.dim_size(d), errors::InvalidArgument("var and grad must match in dimension ", d)); } const Tindex N = indices.dim_size(0); OP_REQUIRES( ctx, grad.dim_size(0) == N, errors::InvalidArgument( "grad must be the same size as indices in the first dimension.")); if (N > 0) { const Tindex first_dim_size = var.dim_size(0); // Validate all the indices are in range auto indices_vec = indices.vec(); for (Tindex i = 0; i < N; i++) { const Tindex index = indices_vec(i); OP_REQUIRES(ctx, index >= 0 && index < first_dim_size, errors::InvalidArgument( strings::StrCat("Index ", index, " at offset ", i, " in indices is out of range"))); } auto var_flat = var.flat_outer_dims(); auto ms_flat = ms.flat_outer_dims(); auto mom_flat = mom.flat_outer_dims(); auto grad_flat = grad.flat_outer_dims(); const T lr_scalar = lr.scalar()(); const T rho_scalar = rho.scalar()(); const T epsilon_scalar = epsilon.scalar()(); const T momentum_scalar = momentum.scalar()(); for (Tindex i = 0; i < N; i++) { 
const Tindex index = indices_vec(i); auto ms_ = ms_flat.template chip<0>(index); auto mom_ = mom_flat.template chip<0>(index); auto grad_ = grad_flat.template chip<0>(i); ms_ = ms_ * ms_.constant(rho_scalar) + grad_.square() * grad_.constant(T(1) - rho_scalar); mom_ = mom_ * mom_.constant(momentum_scalar) + (ms_ + ms_.constant(epsilon_scalar)).rsqrt() * ms_.constant(lr_scalar) * grad_; auto v = var_flat.template chip<0>(index); v -= mom_; } } MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: bool use_exclusive_lock_; }; // Note, this op works on cpu only. template class SparseApplyCenteredRMSPropOp : public OpKernel { public: explicit SparseApplyCenteredRMSPropOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); } void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS { auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2, 3}); Tensor var; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 0, use_exclusive_lock_, true, &var)); Tensor mg; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 1, use_exclusive_lock_, true, &mg)); Tensor ms; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 2, use_exclusive_lock_, true, &ms)); Tensor mom; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 3, use_exclusive_lock_, true, &mom)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(0))); OP_REQUIRES( ctx, ms.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(2))); OP_REQUIRES( ctx, mom.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(3))); const Tensor& lr = ctx->input(4); const Tensor& rho = ctx->input(5); const Tensor& momentum = ctx->input(6); const Tensor& epsilon = ctx->input(7); const Tensor& grad = ctx->input(8); const Tensor& indices = ctx->input(9); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()), errors::InvalidArgument("lr is not a scalar: ", lr.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(rho.shape()), errors::InvalidArgument("rho is not a scalar: ", rho.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(momentum.shape()), errors::InvalidArgument("momentum is not a scalar: ", momentum.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon.shape()), errors::InvalidArgument("epsilon is not a scalar: ", epsilon.shape().DebugString())); OP_REQUIRES(ctx, var.shape().IsSameSize(mg.shape()), errors::InvalidArgument("var and mg do not have the same shape", var.shape().DebugString(), " ", mg.shape().DebugString())); OP_REQUIRES(ctx, var.shape().IsSameSize(ms.shape()), errors::InvalidArgument("var and ms do not have the same shape", var.shape().DebugString(), " ", ms.shape().DebugString())); OP_REQUIRES(ctx, var.shape().IsSameSize(mom.shape()), errors::InvalidArgument( "var and mom do not have the same shape", var.shape().DebugString(), " ", mom.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(var.shape()), errors::InvalidArgument("var must be at least 1 dimensional")); OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()), errors::InvalidArgument("indices must be one-dimensional")); for (int d = 1; d < var.dims(); d++) { OP_REQUIRES( ctx, var.dim_size(d) == grad.dim_size(d), errors::InvalidArgument("var and grad must match in dimension ", d)); } const Tindex N = 
indices.dim_size(0); OP_REQUIRES( ctx, grad.dim_size(0) == N, errors::InvalidArgument( "grad must be the same size as indices in the first dimension.")); if (N > 0) { const Tindex first_dim_size = var.dim_size(0); // Validate all the indices are in range auto indices_vec = indices.vec(); for (Tindex i = 0; i < N; i++) { const Tindex index = indices_vec(i); OP_REQUIRES(ctx, index >= 0 && index < first_dim_size, errors::InvalidArgument( strings::StrCat("Index ", index, " at offset ", i, " in indices is out of range"))); } auto var_flat = var.flat_outer_dims(); auto ms_flat = ms.flat_outer_dims(); auto mg_flat = mg.flat_outer_dims(); auto mom_flat = mom.flat_outer_dims(); auto grad_flat = grad.flat_outer_dims(); const T lr_scalar = lr.scalar()(); const T rho_scalar = rho.scalar()(); const T epsilon_scalar = epsilon.scalar()(); const T momentum_scalar = momentum.scalar()(); for (Tindex i = 0; i < N; i++) { const Tindex index = indices_vec(i); auto ms_ = ms_flat.template chip<0>(index); auto mom_ = mom_flat.template chip<0>(index); auto grad_ = grad_flat.template chip<0>(i); ms_ = ms_ * ms_.constant(rho_scalar) + grad_.square() * grad_.constant(T(1) - rho_scalar); auto mg_ = mg_flat.template chip<0>(index); mg_ = mg_ * mg_.constant(rho_scalar) + grad_ * grad_.constant(T(1) - rho_scalar); auto denom_ = ms_ + ms_.constant(epsilon_scalar) - mg_.square(); mom_ = mom_ * mom_.constant(momentum_scalar) + denom_.rsqrt() * ms_.constant(lr_scalar) * grad_; auto v = var_flat.template chip<0>(index); v -= mom_; } } MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: bool use_exclusive_lock_; }; #define REGISTER_KERNELS(T, Tindices) \ REGISTER_KERNEL_BUILDER(Name("SparseApplyRMSProp") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ SparseApplyRMSPropOp); \ REGISTER_KERNEL_BUILDER(Name("SparseApplyCenteredRMSProp") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ SparseApplyCenteredRMSPropOp); \ REGISTER_KERNEL_BUILDER(Name("ResourceSparseApplyRMSProp") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ SparseApplyRMSPropOp); \ REGISTER_KERNEL_BUILDER(Name("ResourceSparseApplyCenteredRMSProp") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ SparseApplyCenteredRMSPropOp); REGISTER_KERNELS(Eigen::half, int32); REGISTER_KERNELS(Eigen::half, int64); REGISTER_KERNELS(float, int32); REGISTER_KERNELS(float, int64); REGISTER_KERNELS(double, int32); REGISTER_KERNELS(double, int64); #undef REGISTER_KERNELS template class ApplyAddSignOp : public OpKernel { public: explicit ApplyAddSignOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); } void Compute(OpKernelContext* ctx) override { auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1}); Tensor var; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 0, use_exclusive_lock_, false, &var)); Tensor m; OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( ctx, 1, use_exclusive_lock_, false, &m)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(0))); OP_REQUIRES( ctx, m.IsInitialized(), errors::FailedPrecondition( "Attempting to use uninitialized variables: ", requested_input(1))); const Tensor& lr = ctx->input(2); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()), errors::InvalidArgument("lr is not a scalar: ", lr.shape().DebugString())); const Tensor& alpha = ctx->input(3); 
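    // The AddSign update applied by the functor below follows the sign-based
    // rule from the "Neural Optimizer Search" optimizers; in scalar form,
    // roughly
    //   m   = beta * m + (1 - beta) * g
    //   var -= lr * (alpha + sign_decay * sign(g) * sign(m)) * g
    // ApplyPowerSignOp further down is the multiplicative analogue, scaling
    // the step by exp(logbase * sign_decay * sign(g) * sign(m)) instead of
    // adding alpha.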
    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(alpha.shape()),
                errors::InvalidArgument("alpha is not a scalar: ",
                                        alpha.shape().DebugString()));
    const Tensor& sign_decay = ctx->input(4);
    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(sign_decay.shape()),
                errors::InvalidArgument("sign_decay is not a scalar: ",
                                        sign_decay.shape().DebugString()));
    const Tensor& beta = ctx->input(5);
    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta.shape()),
                errors::InvalidArgument("beta is not a scalar: ",
                                        beta.shape().DebugString()));
    const Tensor& grad = ctx->input(6);
    OP_REQUIRES(ctx, var.shape().IsSameSize(m.shape()),
                errors::InvalidArgument("var and m do not have the same shape",
                                        var.shape().DebugString(), " ",
                                        m.shape().DebugString()));
    OP_REQUIRES(
        ctx, var.shape().IsSameSize(grad.shape()),
        errors::InvalidArgument("var and grad do not have the same shape",
                                var.shape().DebugString(), " ",
                                grad.shape().DebugString()));

    const Device& device = ctx->template eigen_device<Device>();
    functor::ApplyAddSign<Device, T>()(
        device, var.flat<T>(), m.flat<T>(), lr.scalar<T>(), alpha.scalar<T>(),
        sign_decay.scalar<T>(), beta.scalar<T>(), grad.flat<T>());
    MaybeForwardRefInputToRefOutput(ctx, 0, 0);
  }

 private:
  bool use_exclusive_lock_;
};

#define REGISTER_KERNELS(D, T)                                        \
  REGISTER_KERNEL_BUILDER(                                            \
      Name("ApplyAddSign").Device(DEVICE_##D).TypeConstraint<T>("T"), \
      ApplyAddSignOp<D##Device, T>);                                  \
  REGISTER_KERNEL_BUILDER(Name("ResourceApplyAddSign")                \
                              .Device(DEVICE_##D)                     \
                              .HostMemory("var")                      \
                              .HostMemory("m")                        \
                              .TypeConstraint<T>("T"),                \
                          ApplyAddSignOp<D##Device, T>);
#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);

TF_CALL_half(REGISTER_CPU_KERNELS);
TF_CALL_bfloat16(REGISTER_CPU_KERNELS);
TF_CALL_float(REGISTER_CPU_KERNELS);
TF_CALL_double(REGISTER_CPU_KERNELS);

#if GOOGLE_CUDA
// Forward declarations of the functor specializations for GPU.
namespace functor {
#define DECLARE_GPU_SPEC(T)                                            \
  template <>                                                          \
  void ApplyAddSign<GPUDevice, T>::operator()(                         \
      const GPUDevice& d, typename TTypes<T>::Flat var,                \
      typename TTypes<T>::Flat m, typename TTypes<T>::ConstScalar lr,  \
      typename TTypes<T>::ConstScalar alpha,                           \
      typename TTypes<T>::ConstScalar sign_decay,                      \
      typename TTypes<T>::ConstScalar beta,                            \
      typename TTypes<T>::ConstFlat grad);                             \
  extern template struct ApplyAddSign<GPUDevice, T>;
DECLARE_GPU_SPEC(Eigen::half);
DECLARE_GPU_SPEC(float);
DECLARE_GPU_SPEC(double);
#undef DECLARE_GPU_SPEC
}  // namespace functor

REGISTER_KERNELS(GPU, Eigen::half);
REGISTER_KERNELS(GPU, float);
REGISTER_KERNELS(GPU, double);
#endif
#undef REGISTER_CPU_KERNELS
#undef REGISTER_KERNELS

template <typename Device, typename T>
class ApplyPowerSignOp : public OpKernel {
 public:
  explicit ApplyPowerSignOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
  }

  void Compute(OpKernelContext* ctx) override {
    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
                                                      {0, 1});
    Tensor var;
    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
                            ctx, 0, use_exclusive_lock_, false, &var));
    Tensor m;
    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
                            ctx, 1, use_exclusive_lock_, false, &m));
    OP_REQUIRES(
        ctx, var.IsInitialized(),
        errors::FailedPrecondition(
            "Attempting to use uninitialized variables: ", requested_input(0)));
    OP_REQUIRES(
        ctx, m.IsInitialized(),
        errors::FailedPrecondition(
            "Attempting to use uninitialized variables: ", requested_input(1)));
    const Tensor& lr = ctx->input(2);
    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()),
                errors::InvalidArgument("lr is not a scalar: ",
                                        lr.shape().DebugString()));
    const Tensor& logbase = ctx->input(3);
    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(logbase.shape()),
                errors::InvalidArgument("logbase is not a scalar: ",
                                        logbase.shape().DebugString()));
    const Tensor& sign_decay = ctx->input(4);
    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(sign_decay.shape()),
                errors::InvalidArgument("sign_decay is not a scalar: ",
                                        sign_decay.shape().DebugString()));
    const Tensor& beta = ctx->input(5);
    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta.shape()),
                errors::InvalidArgument("beta is not a scalar: ",
                                        beta.shape().DebugString()));
    const Tensor& grad = ctx->input(6);
    OP_REQUIRES(ctx, var.shape().IsSameSize(m.shape()),
                errors::InvalidArgument("var and m do not have the same shape",
                                        var.shape().DebugString(), " ",
                                        m.shape().DebugString()));
    OP_REQUIRES(
        ctx, var.shape().IsSameSize(grad.shape()),
        errors::InvalidArgument("var and grad do not have the same shape",
                                var.shape().DebugString(), " ",
                                grad.shape().DebugString()));

    const Device& device = ctx->template eigen_device<Device>();
    functor::ApplyPowerSign<Device, T>()(
        device, var.flat<T>(), m.flat<T>(), lr.scalar<T>(),
        logbase.scalar<T>(), sign_decay.scalar<T>(), beta.scalar<T>(),
        grad.flat<T>());
    MaybeForwardRefInputToRefOutput(ctx, 0, 0);
  }

 private:
  bool use_exclusive_lock_;
};

#define REGISTER_KERNELS(D, T)                                          \
  REGISTER_KERNEL_BUILDER(                                              \
      Name("ApplyPowerSign").Device(DEVICE_##D).TypeConstraint<T>("T"), \
      ApplyPowerSignOp<D##Device, T>);                                  \
  REGISTER_KERNEL_BUILDER(Name("ResourceApplyPowerSign")                \
                              .Device(DEVICE_##D)                       \
                              .HostMemory("var")                        \
                              .HostMemory("m")                          \
                              .TypeConstraint<T>("T"),                  \
                          ApplyPowerSignOp<D##Device, T>);
#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);

TF_CALL_half(REGISTER_CPU_KERNELS);
TF_CALL_bfloat16(REGISTER_CPU_KERNELS);
TF_CALL_float(REGISTER_CPU_KERNELS);
TF_CALL_double(REGISTER_CPU_KERNELS);

#if GOOGLE_CUDA
// Forward declarations of the functor specializations for GPU.
namespace functor {
#define DECLARE_GPU_SPEC(T)                                            \
  template <>                                                          \
  void ApplyPowerSign<GPUDevice, T>::operator()(                       \
      const GPUDevice& d, typename TTypes<T>::Flat var,                \
      typename TTypes<T>::Flat m, typename TTypes<T>::ConstScalar lr,  \
      typename TTypes<T>::ConstScalar logbase,                         \
      typename TTypes<T>::ConstScalar sign_decay,                      \
      typename TTypes<T>::ConstScalar beta,                            \
      typename TTypes<T>::ConstFlat grad);                             \
  extern template struct ApplyPowerSign<GPUDevice, T>;
DECLARE_GPU_SPEC(Eigen::half);
DECLARE_GPU_SPEC(float);
DECLARE_GPU_SPEC(double);
#undef DECLARE_GPU_SPEC
}  // namespace functor

REGISTER_KERNELS(GPU, Eigen::half);
REGISTER_KERNELS(GPU, float);
REGISTER_KERNELS(GPU, double);
#endif
#undef REGISTER_CPU_KERNELS
#undef REGISTER_KERNELS

}  // namespace tensorflow