Diffstat (limited to 'tensorflow/core/kernels/training_ops.cc')
-rw-r--r-- | tensorflow/core/kernels/training_ops.cc | 282
1 file changed, 269 insertions, 13 deletions
diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
index d331a8debf..f6b6194f0a 100644
--- a/tensorflow/core/kernels/training_ops.cc
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -23,6 +23,10 @@ limitations under the License.
 #include "tensorflow/core/kernels/training_op_helpers.h"
 #include "tensorflow/core/kernels/variable_ops.h"
 
+#ifdef TENSORFLOW_USE_SYCL
+#include "tensorflow/core/common_runtime/sycl/sycl_util.h"
+#endif  // TENSORFLOW_USE_SYCL
+
 namespace tensorflow {
 
 using CPUDevice = Eigen::ThreadPoolDevice;
@@ -50,16 +54,27 @@ struct ApplyGradientDescent<CPUDevice, T> {
 
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
-struct ApplyGradientDescent<SYCLDevice, T> {
+struct ApplyGradientDescentSYCL {
   void operator()(const SYCLDevice& d, typename TTypes<T>::Flat var,
-                  typename TTypes<T>::ConstScalar lr,
-                  typename TTypes<T>::ConstFlat grad) {
-    var.device(d) -= grad * lr();
+                  T lr, typename TTypes<T>::ConstFlat grad) {
+    var.device(d) -= grad * lr;
   }
 };
 #endif
 
 template <typename T>
+struct ApplyDelayCompensatedGradientDescent<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstFlat grad,
+                  typename TTypes<T>::ConstScalar variance,
+                  typename TTypes<T>::Flat shadow) {
+    var.device(d) -= lr() * (grad + variance() * grad * (var - shadow));
+    shadow.device(d) = var;
+  }
+};
+
+template <typename T>
 struct ApplyAdadelta<CPUDevice, T> {
   void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
                   typename TTypes<T>::Flat accum,
@@ -264,10 +279,24 @@ struct ApplyAdamNonCuda {
   }
 };
 
+#ifdef TENSORFLOW_USE_SYCL
 template <typename T>
-struct ApplyAdam<CPUDevice, T> : ApplyAdamNonCuda<CPUDevice, T> {};
+struct ApplyAdamSYCL {
+  void operator()(const SYCLDevice& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
+                  T beta1_power, T beta2_power, T lr, T beta1, T beta2,
+                  T epsilon, typename TTypes<T>::ConstFlat grad) {
+    const T alpha = lr * Eigen::numext::sqrt(T(1) - beta2_power) /
+                    (T(1) - beta1_power);
+    m.device(d) += (grad - m) * (T(1) - beta1);
+    v.device(d) += (grad.square() - v) * (T(1) - beta2);
+    var.device(d) -= (m * alpha) / (v.sqrt() + epsilon);
+  }
+};
+#endif  // TENSORFLOW_USE_SYCL
+
 template <typename T>
-struct ApplyAdam<SYCLDevice, T> : ApplyAdamNonCuda<SYCLDevice, T> {};
+struct ApplyAdam<CPUDevice, T> : ApplyAdamNonCuda<CPUDevice, T> {};
 
 template <typename T>
 struct ApplyRMSProp<CPUDevice, T> {
@@ -346,6 +375,51 @@ class ApplyGradientDescentOp : public OpKernel {
   bool use_exclusive_lock_;
 };
 
+#ifdef TENSORFLOW_USE_SYCL
+template <typename T>
+class ApplyGradientDescentOp<SYCLDevice, T> : public OpKernel {
+ public:
+  explicit ApplyGradientDescentOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0});
+    Tensor var;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
+
+    OP_REQUIRES(
+        ctx, var.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", def().input(0)));
+    const Tensor& alpha_dev = ctx->input(1);
+    OP_REQUIRES(ctx, IsLegacyScalar(alpha_dev.shape()),
+                errors::InvalidArgument("alpha is not a scalar: ",
+                                        alpha_dev.shape().DebugString()));
+    const Tensor& delta = ctx->input(2);
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(delta.shape()),
+        errors::InvalidArgument("var and delta do not have the same shape",
+                                var.shape().DebugString(), " ",
+                                delta.shape().DebugString()));
+
+    auto device = ctx->eigen_sycl_device();
+    auto size = sizeof(T);
+    T alpha = T(0);
+    auto src_ptr = GetBase(&alpha_dev);
+    device.memcpyDeviceToHost(&alpha, static_cast<const T *>(src_ptr), size);
+
+    functor::ApplyGradientDescentSYCL<T>()(device, var.flat<T>(),
+                                           alpha, delta.flat<T>());
+
+    MaybeForwardRefInputToRefOutput(ctx, 0, 0);
+  }
+
+ private:
+  bool use_exclusive_lock_;
+};
+#endif  // TENSORFLOW_USE_SYCL
+
 #define REGISTER_KERNELS(D, T)                                                \
   REGISTER_KERNEL_BUILDER(                                                    \
       Name("ApplyGradientDescent").Device(DEVICE_##D).TypeConstraint<T>("T"), \
@@ -361,13 +435,6 @@ TF_CALL_half(REGISTER_CPU_KERNELS);
 TF_CALL_float(REGISTER_CPU_KERNELS);
 TF_CALL_double(REGISTER_CPU_KERNELS);
 
-#ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T);
-TF_CALL_float(REGISTER_SYCL_KERNELS);
-TF_CALL_double(REGISTER_SYCL_KERNELS);
-#undef REGISTER_SYCL_KERNELS
-#endif
-
 #if GOOGLE_CUDA
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
@@ -388,6 +455,81 @@ REGISTER_KERNELS(GPU, Eigen::half);
 REGISTER_KERNELS(GPU, float);
 REGISTER_KERNELS(GPU, double);
 #endif
+
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T);
+TF_CALL_float(REGISTER_SYCL_KERNELS);
+TF_CALL_double(REGISTER_SYCL_KERNELS);
+#undef REGISTER_SYCL_KERNELS
+#endif  // TENSORFLOW_USE_SYCL
+
+#undef REGISTER_CPU_KERNELS
+#undef REGISTER_KERNELS
+
+template <typename Device, typename T>
+class ApplyDelayCompensatedGradientDescentOp : public OpKernel {
+ public:
+  explicit ApplyDelayCompensatedGradientDescentOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 4});
+    Tensor var;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES(
+        ctx, var.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", def().input(0)));
+    const Tensor& alpha = ctx->input(1);
+    OP_REQUIRES(ctx, IsLegacyScalar(alpha.shape()),
+                errors::InvalidArgument("alpha is not a scalar: ",
+                                        alpha.shape().DebugString()));
+    const Tensor& delta = ctx->input(2);
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(delta.shape()),
+        errors::InvalidArgument("var and delta do not have the same shape",
+                                var.shape().DebugString(), " ",
+                                delta.shape().DebugString()));
+    const Tensor& lambda = ctx->input(3);
+    OP_REQUIRES(ctx, IsLegacyScalar(lambda.shape()),
+                errors::InvalidArgument("lambda is not a scalar: ",
+                                        lambda.shape().DebugString()));
+    Tensor shadow;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable(ctx, 4, use_exclusive_lock_, &shadow));
+    OP_REQUIRES(
+        ctx, shadow.shape().IsSameSize(var.shape()),
+        errors::InvalidArgument("shadow and var do not have the same shape",
+                                shadow.shape().DebugString(), " ",
+                                var.shape().DebugString()));
+
+    const Device& device = ctx->template eigen_device<Device>();
+    functor::ApplyDelayCompensatedGradientDescent<Device, T>()(
+        device, var.flat<T>(), alpha.scalar<T>(), delta.flat<T>(),
+        lambda.scalar<T>(), shadow.flat<T>()
+    );
+
+    MaybeForwardRefInputToRefOutput(ctx, 0, 0);
+  }
+
+ private:
+  bool use_exclusive_lock_;
+};
+
+#define REGISTER_KERNELS(D, T)                              \
+  REGISTER_KERNEL_BUILDER(                                  \
+      Name("ApplyDelayCompensatedGradientDescent")          \
+          .Device(DEVICE_##D)                               \
+          .HostMemory("var")                                \
+          .HostMemory("shadow")                             \
+          .TypeConstraint<T>("T"),                          \
+      ApplyDelayCompensatedGradientDescentOp<D##Device, T>);
+#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
+
+TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
+
 #undef REGISTER_CPU_KERNELS
 #undef REGISTER_KERNELS
@@ -2343,6 +2485,120 @@ class ApplyAdamOp : public OpKernel {
   bool use_nesterov_;
 };
 
+#ifdef TENSORFLOW_USE_SYCL
+template <typename T>
+class ApplyAdamOp<SYCLDevice, T> : public OpKernel {
+ public:
+  explicit ApplyAdamOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2});
+
+    Tensor var;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
+    Tensor m;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_, &m));
+    Tensor v;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable(ctx, 2, use_exclusive_lock_, &v));
+    OP_REQUIRES(
+        ctx, var.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", def().input(0)));
+    OP_REQUIRES(
+        ctx, m.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", def().input(1)));
+    OP_REQUIRES(
+        ctx, v.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", def().input(2)));
+
+    const Tensor& beta1_power_dev = ctx->input(3);
+    const Tensor& beta2_power_dev = ctx->input(4);
+    const Tensor& lr_dev = ctx->input(5);
+    const Tensor& beta1_dev = ctx->input(6);
+    const Tensor& beta2_dev = ctx->input(7);
+    const Tensor& epsilon_dev = ctx->input(8);
+
+    T beta1_power = 0;
+    T beta2_power = 0;
+    T lr = 0;
+    T beta1 = 0;
+    T beta2 = 0;
+    T epsilon = 0;
+
+    auto device = ctx->eigen_sycl_device();
+    auto size = sizeof(T);
+    auto src_ptr = GetBase(&beta1_power_dev);
+    device.memcpyDeviceToHost(&beta1_power, static_cast<const T *>(src_ptr), size);
+
+    src_ptr = GetBase(&beta2_power_dev);
+    device.memcpyDeviceToHost(&beta2_power, static_cast<const T *>(src_ptr), size);
+
+    src_ptr = GetBase(&lr_dev);
+    device.memcpyDeviceToHost(&lr, static_cast<const T *>(src_ptr), size);
+
+    src_ptr = GetBase(&beta1_dev);
+    device.memcpyDeviceToHost(&beta1, static_cast<const T *>(src_ptr), size);
+
+    src_ptr = GetBase(&beta2_dev);
+    device.memcpyDeviceToHost(&beta2, static_cast<const T *>(src_ptr), size);
+
+    src_ptr = GetBase(&epsilon_dev);
+    device.memcpyDeviceToHost(&epsilon, static_cast<const T *>(src_ptr), size);
+
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1_power_dev.shape()),
+                errors::InvalidArgument("beta1_power is not a scalar: ",
+                                        beta1_power_dev.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2_power_dev.shape()),
+                errors::InvalidArgument("beta2_power is not a scalar: ",
+                                        beta2_power_dev.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr_dev.shape()),
+                errors::InvalidArgument("lr is not a scalar : ",
+                                        lr_dev.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1_dev.shape()),
+                errors::InvalidArgument("beta1 is not a scalar: ",
+                                        beta1_dev.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2_dev.shape()),
+                errors::InvalidArgument("beta2 is not a scalar: ",
+                                        beta2_dev.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon_dev.shape()),
+                errors::InvalidArgument("epsilon is not a scalar: ",
+                                        epsilon_dev.shape().DebugString()));
+
+    const Tensor& grad = ctx->input(9);
+
+    OP_REQUIRES(ctx, var.shape().IsSameSize(m.shape()),
+                errors::InvalidArgument("var and m do not have the same shape",
+                                        var.shape().DebugString(), " ",
+                                        m.shape().DebugString()));
+    OP_REQUIRES(ctx, var.shape().IsSameSize(v.shape()),
+                errors::InvalidArgument("var and v do not have the same shape",
+                                        var.shape().DebugString(), " ",
+                                        v.shape().DebugString()));
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(grad.shape()),
+        errors::InvalidArgument("var and grad do not have the same shape",
+                                var.shape().DebugString(), " ",
+                                grad.shape().DebugString()));
+
+    functor::ApplyAdamSYCL<T>()(device, var.flat<T>(), m.flat<T>(),
+                                v.flat<T>(), beta1_power,
+                                beta2_power, lr,
+                                beta1, beta2,
+                                epsilon, grad.flat<T>());
+
+    MaybeForwardRefInputToRefOutput(ctx, 0, 0);
+  }
+
+ private:
+  bool use_exclusive_lock_;
+};
+#endif  // TENSORFLOW_USE_SYCL
+
 using CPUDevice = Eigen::ThreadPoolDevice;
 using GPUDevice = Eigen::GpuDevice;
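
A note on the SYCL kernels above: they cannot dereference device-resident scalar inputs (lr, beta1_power, epsilon, and so on) on the host, so each value is staged back with memcpyDeviceToHost before the functor runs. The pattern, factored into a hypothetical FetchScalar helper for illustration (not part of the patch; it assumes the Tensor, SYCLDevice, and GetBase declarations already in scope in training_ops.cc):

// Hypothetical helper (illustrative only): stage a scalar tensor from
// device memory back to the host so it can be passed by value.
template <typename T>
T FetchScalar(const SYCLDevice& device, const Tensor& t) {
  T value = T(0);
  // GetBase (from sycl_util.h) returns the raw pointer backing the tensor;
  // copy a single element of type T into host memory.
  device.memcpyDeviceToHost(&value, static_cast<const T *>(GetBase(&t)),
                            sizeof(T));
  return value;
}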
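The update applied by the new ApplyDelayCompensatedGradientDescent functor may be easier to read as a per-element loop. A self-contained sketch in plain C++ (illustrative names, not the TensorFlow API): the gradient is corrected by a term proportional to grad * (var - shadow), where shadow holds the stale copy of the weights the gradient was computed against, and shadow is then refreshed.

#include <cstddef>
#include <vector>

// Sketch of: var -= lr * (grad + lambda * grad * (var - shadow));
//            shadow = var;
// mirroring the functor above, not the kernel itself.
void DelayCompensatedStep(std::vector<float>& var,
                          std::vector<float>& shadow,
                          const std::vector<float>& grad,
                          float lr, float lambda) {
  for (std::size_t i = 0; i < var.size(); ++i) {
    var[i] -= lr * (grad[i] + lambda * grad[i] * (var[i] - shadow[i]));
    shadow[i] = var[i];  // refresh the stale copy after applying the update
  }
}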
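Similarly, ApplyAdamSYCL performs the standard non-Nesterov Adam step. The same arithmetic as a self-contained loop, for reference only (a sketch of what the Eigen expressions compute, not the kernel itself):

#include <cmath>
#include <cstddef>
#include <vector>

// alpha = lr * sqrt(1 - beta2^t) / (1 - beta1^t), then per element:
//   m   += (g - m) * (1 - beta1)
//   v   += (g*g - v) * (1 - beta2)
//   var -= m * alpha / (sqrt(v) + epsilon)
void AdamStep(std::vector<float>& var, std::vector<float>& m,
              std::vector<float>& v, const std::vector<float>& grad,
              float beta1_power, float beta2_power, float lr, float beta1,
              float beta2, float epsilon) {
  const float alpha =
      lr * std::sqrt(1.0f - beta2_power) / (1.0f - beta1_power);
  for (std::size_t i = 0; i < var.size(); ++i) {
    m[i] += (grad[i] - m[i]) * (1.0f - beta1);
    v[i] += (grad[i] * grad[i] - v[i]) * (1.0f - beta2);
    var[i] -= m[i] * alpha / (std::sqrt(v[i]) + epsilon);
  }
}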