// See docs in ../ops/nn_ops.cc.

#define EIGEN_USE_THREADS

#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/batch_norm_op.h"
#include "tensorflow/core/public/tensor.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

template <typename Device, typename T>
class BatchNormOp : public OpKernel {
 public:
  explicit BatchNormOp(OpKernelConstruction* context) : OpKernel(context) {
    OP_REQUIRES_OK(context,
                   context->GetAttr("variance_epsilon", &variance_epsilon_));
    OP_REQUIRES_OK(context, context->GetAttr("scale_after_normalization",
                                             &scale_after_normalization_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& mean = context->input(1);
    const Tensor& var = context->input(2);
    const Tensor& beta = context->input(3);
    const Tensor& gamma = context->input(4);

    OP_REQUIRES(context, input.dims() == 4,
                errors::InvalidArgument("input must be 4-dimensional",
                                        input.shape().ShortDebugString()));
    OP_REQUIRES(context, mean.dims() == 1,
                errors::InvalidArgument("mean must be 1-dimensional",
                                        mean.shape().ShortDebugString()));
    OP_REQUIRES(context, var.dims() == 1,
                errors::InvalidArgument("var must be 1-dimensional",
                                        var.shape().ShortDebugString()));
    OP_REQUIRES(context, beta.dims() == 1,
                errors::InvalidArgument("beta must be 1-dimensional",
                                        beta.shape().ShortDebugString()));
    OP_REQUIRES(context, gamma.dims() == 1,
                errors::InvalidArgument("gamma must be 1-dimensional",
                                        gamma.shape().ShortDebugString()));

    Tensor* output = nullptr;
    OP_REQUIRES_OK(context,
                   context->allocate_output(0, input.shape(), &output));

    functor::BatchNorm<Device, T>()(
        context->eigen_device<Device>(), input.tensor<T, 4>(), mean.vec<T>(),
        var.vec<T>(), beta.vec<T>(), gamma.vec<T>(), variance_epsilon_,
        scale_after_normalization_, output->tensor<T, 4>());
  }

 private:
  float variance_epsilon_;
  bool scale_after_normalization_;
};

template <typename Device, typename T>
class BatchNormGradOp : public OpKernel {
 public:
  explicit BatchNormGradOp(OpKernelConstruction* context) : OpKernel(context) {
    OP_REQUIRES_OK(context,
                   context->GetAttr("variance_epsilon", &variance_epsilon_));
    OP_REQUIRES_OK(context, context->GetAttr("scale_after_normalization",
                                             &scale_after_normalization_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& mean = context->input(1);
    const Tensor& var = context->input(2);
    const Tensor& gamma = context->input(3);
    const Tensor& out_backprop = context->input(4);

    OP_REQUIRES(context, input.dims() == 4,
                errors::InvalidArgument("input must be 4-dimensional",
                                        input.shape().ShortDebugString()));
    OP_REQUIRES(context, mean.dims() == 1,
                errors::InvalidArgument("mean must be 1-dimensional",
                                        mean.shape().ShortDebugString()));
    OP_REQUIRES(context, var.dims() == 1,
                errors::InvalidArgument("var must be 1-dimensional",
                                        var.shape().ShortDebugString()));
    OP_REQUIRES(context, gamma.dims() == 1,
                errors::InvalidArgument("gamma must be 1-dimensional",
                                        gamma.shape().ShortDebugString()));
    OP_REQUIRES(
        context, out_backprop.dims() == 4,
        errors::InvalidArgument("out_backprop must be 4-dimensional",
                                out_backprop.shape().ShortDebugString()));

    Tensor* dx = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, input.shape(), &dx));
    Tensor* dm = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(1, mean.shape(), &dm));
    Tensor* dv = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(2, var.shape(), &dv));
    Tensor* db = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(3, mean.shape(), &db));
    Tensor* dg = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(4, gamma.shape(), &dg));

    // Scratch buffer of [depth] dimension, aka the 4th dimension of input,
    // which is dim_size(3), for calculating various combinations of
    // (var + epsilon).
    Tensor scratch1;
    OP_REQUIRES_OK(context, context->allocate_temp(
                                DataTypeToEnum<T>::value,
                                TensorShape({input.dim_size(3)}), &scratch1));

    // Scratch buffer of [depth] dimension for saving intermediate calculation
    // values.
    Tensor scratch2;
    OP_REQUIRES_OK(context, context->allocate_temp(
                                DataTypeToEnum<T>::value,
                                TensorShape({input.dim_size(3)}), &scratch2));

    functor::BatchNormGrad<Device, T>()(
        context->eigen_device<Device>(), input.tensor<T, 4>(), mean.vec<T>(),
        var.vec<T>(), gamma.vec<T>(), out_backprop.tensor<T, 4>(),
        variance_epsilon_, scale_after_normalization_, dx->tensor<T, 4>(),
        dm->vec<T>(), dv->vec<T>(), db->vec<T>(), dg->vec<T>(),
        scratch1.vec<T>(), scratch2.vec<T>());
  }

 private:
  float variance_epsilon_;
  bool scale_after_normalization_;
};

#define REGISTER_KERNEL(T)                                         \
  REGISTER_KERNEL_BUILDER(Name("BatchNormWithGlobalNormalization") \
                              .Device(DEVICE_CPU)                  \
                              .TypeConstraint<T>("T"),             \
                          BatchNormOp<CPUDevice, T>);

REGISTER_KERNEL(float);
REGISTER_KERNEL(double);
#undef REGISTER_KERNEL

#if GOOGLE_CUDA
// Forward declarations of the functor specializations for GPU.
namespace functor {
#define DECLARE_GPU_SPEC(T)                                                  \
  template <>                                                                \
  void BatchNorm<GPUDevice, T>::operator()(                                  \
      const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input,          \
      typename TTypes<T>::ConstVec mean, typename TTypes<T>::ConstVec var,   \
      typename TTypes<T>::ConstVec beta, typename TTypes<T>::ConstVec gamma, \
      float variance_epsilon, bool scale_after_normalization,                \
      typename TTypes<T, 4>::Tensor output);                                 \
  extern template struct BatchNorm<GPUDevice, T>;

#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPEC(T);

DECLARE_GPU_SPECS(float);
#undef DECLARE_GPU_SPEC
}  // namespace functor

// Registration of the GPU implementations.
#define REGISTER_GPU_KERNEL(T)                                     \
  REGISTER_KERNEL_BUILDER(Name("BatchNormWithGlobalNormalization") \
                              .Device(DEVICE_GPU)                  \
                              .TypeConstraint<T>("T"),             \
                          BatchNormOp<GPUDevice, T>);

REGISTER_GPU_KERNEL(float);
#undef REGISTER_GPU_KERNEL

#endif  // GOOGLE_CUDA

#define REGISTER_KERNEL(T)                                             \
  REGISTER_KERNEL_BUILDER(Name("BatchNormWithGlobalNormalizationGrad") \
                              .Device(DEVICE_CPU)                      \
                              .TypeConstraint<T>("T"),                 \
                          BatchNormGradOp<CPUDevice, T>);

REGISTER_KERNEL(float);
REGISTER_KERNEL(double);
#undef REGISTER_KERNEL

#if GOOGLE_CUDA
// Forward declarations of the functor specializations for GPU.
namespace functor {
#define DECLARE_GPU_SPEC(T)                                                    \
  template <>                                                                  \
  void BatchNormGrad<GPUDevice, T>::operator()(                                \
      const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input,            \
      typename TTypes<T>::ConstVec mean, typename TTypes<T>::ConstVec var,     \
      typename TTypes<T>::ConstVec gamma,                                      \
      typename TTypes<T, 4>::ConstTensor out_backprop, float variance_epsilon, \
      bool scale_after_normalization, typename TTypes<T, 4>::Tensor dx,        \
      typename TTypes<T>::Vec dm, typename TTypes<T>::Vec dv,                  \
      typename TTypes<T>::Vec db, typename TTypes<T>::Vec dg,                  \
      typename TTypes<T>::Vec scratch1, typename TTypes<T>::Vec scratch2);     \
  extern template struct BatchNormGrad<GPUDevice, T>;

#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPEC(T);

DECLARE_GPU_SPECS(float);
#undef DECLARE_GPU_SPEC
}  // namespace functor

// Registration of the GPU implementations.
#define REGISTER_GPU_KERNEL(T)                                         \
  REGISTER_KERNEL_BUILDER(Name("BatchNormWithGlobalNormalizationGrad") \
                              .Device(DEVICE_GPU)                      \
                              .TypeConstraint<T>("T"),                 \
                          BatchNormGradOp<GPUDevice, T>);

REGISTER_GPU_KERNEL(float);
#undef REGISTER_GPU_KERNEL

#endif  // GOOGLE_CUDA

}  // namespace tensorflow