From f41959ccb2d9d4c722fe8fc3351401d53bcf4900 Mon Sep 17 00:00:00 2001
From: Manjunath Kudlur <keveman@gmail.com>
Date: Fri, 6 Nov 2015 16:27:58 -0800
Subject: TensorFlow: Initial commit of TensorFlow library.

TensorFlow is an open source software library for numerical computation
using data flow graphs.

Base CL: 107276108
---
 tensorflow/core/kernels/lrn_op.cc | 228 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 228 insertions(+)
 create mode 100644 tensorflow/core/kernels/lrn_op.cc

(limited to 'tensorflow/core/kernels/lrn_op.cc')

diff --git a/tensorflow/core/kernels/lrn_op.cc b/tensorflow/core/kernels/lrn_op.cc
new file mode 100644
index 0000000000..e5abf5906f
--- /dev/null
+++ b/tensorflow/core/kernels/lrn_op.cc
@@ -0,0 +1,228 @@
+// LRN = Local Response Normalization
+// See docs in ../ops/nn_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+#ifndef __ANDROID__
+#include "tensorflow/core/util/work_sharder.h"
+#endif
+
+namespace tensorflow {
+
+// Create a depth-by-depth band matrix with 1s along a swath of size (2 *
+// depth_radius + 1) around the diagonal.
+static void GetBandMatrix(int depth, int64 depth_radius,
+                          Eigen::Tensor<float, 2>* result) {
+  result->setZero();
+  for (int row = 0; row < depth; ++row) {
+    const int begin = std::max<int>(0, row - depth_radius);
+    const int end = std::min<int>(depth, row + depth_radius + 1);
+    Eigen::DSizes<ptrdiff_t, 2> start(row, begin);
+    Eigen::DSizes<ptrdiff_t, 2> sizes(1, end - begin);
+    result->slice(start, sizes).setConstant(1.0f);
+  }
+}
+
+class LRNOp : public OpKernel {
+ public:
+  explicit LRNOp(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius_));
+    OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
+    OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_));
+    OP_REQUIRES_OK(context, context->GetAttr("beta", &beta_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& in = context->input(0);
+    OP_REQUIRES(context, in.dims() == 4,
+                errors::InvalidArgument("in must be 4-dimensional"));
+    const int64 batch = in.dim_size(0);
+    const int64 rows = in.dim_size(1);
+    const int64 cols = in.dim_size(2);
+    const int64 depth = in.dim_size(3);
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(
+                       0, TensorShape({batch, rows, cols, depth}), &output));
+
+#ifdef __ANDROID__
+    MognetLRN(in, batch, rows, cols, depth, output);
+#else
+    const int nodes = cols * rows;
+    auto in_shaped = in.shaped<float, 2>({nodes * batch, depth});
+
+    // Multiplying the input with the band matrix has the effect of reducing
+    // the correct patch along the depth.
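+    // For example, with depth = 4 and depth_radius = 1 the band matrix is
+    //   1 1 0 0
+    //   1 1 1 0
+    //   0 1 1 1
+    //   0 0 1 1
+    // so contracting the squared input against it yields, for each channel
+    // d, the sum of x_k^2 over k in [d - depth_radius, d + depth_radius].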
+    Eigen::Tensor<float, 2> multiplier(depth, depth);
+    GetBandMatrix(depth, depth_radius_, &multiplier);
+
+    auto out_shaped = output->shaped<float, 2>({nodes * batch, depth});
+    Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
+    /// TODO(keveman): Optimize for beta in {0, 1, 0.5}
+    out_shaped.device(context->eigen_cpu_device()) =
+        in_shaped /
+        in_shaped.square()
+            .contract(multiplier, dims)
+            .unaryExpr([this](float x) { return bias_ + alpha_ * x; })
+            .pow(beta_);
+#endif
+  }
+
+ private:
+  typedef Eigen::Tensor<float, 2>::DimensionPair DimPair;
+
+  void MognetLRN(const Tensor& in, const int batch, const int rows,
+                 const int cols, const int depth, Tensor* out) {
+    Eigen::Map<const Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>>
+        data_in(in.flat<float>().data(), depth, batch * rows * cols);
+
+    Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>> data_out(
+        out->flat<float>().data(), depth, batch * rows * cols);
+
+    const int double_depth_radius = depth_radius_ * 2;
+    Eigen::VectorXf padded_square(data_in.rows() + double_depth_radius);
+    padded_square.setZero();
+    for (int r = 0; r < data_in.cols(); ++r) {
+      // Do local response normalization for data_in(:, r).
+      // First, compute the squares and store them in a buffer for repeated
+      // use.
+      padded_square.block(depth_radius_, 0, data_out.rows(), 1) =
+          data_in.col(r).cwiseProduct(data_in.col(r)) * alpha_;
+      // Then, compute the scale and write it to data_out.
+      float accumulated_scale = 0;
+      for (int i = 0; i < double_depth_radius; ++i) {
+        accumulated_scale += padded_square(i);
+      }
+      for (int i = 0; i < data_in.rows(); ++i) {
+        accumulated_scale += padded_square(i + double_depth_radius);
+        data_out(i, r) = bias_ + accumulated_scale;
+        accumulated_scale -= padded_square(i);
+      }
+    }
+
+    // In a few cases, the pow computation could benefit from speedups.
+    if (beta_ == 1) {
+      data_out.array() = data_in.array() * data_out.array().inverse();
+    } else if (beta_ == 0.5) {
+      data_out.array() = data_in.array() * data_out.array().sqrt().inverse();
+    } else {
+      data_out.array() = data_in.array() * data_out.array().pow(-beta_);
+    }
+  }
+
+  int64 depth_radius_;
+  float bias_;
+  float alpha_;
+  float beta_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("LRN").Device(DEVICE_CPU), LRNOp);
+
+#ifndef __ANDROID__
+
+class LRNGradOp : public OpKernel {
+ public:
+  explicit LRNGradOp(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius_));
+    OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
+    OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_));
+    OP_REQUIRES_OK(context, context->GetAttr("beta", &beta_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& in_grads = context->input(0);
+    const Tensor& in_image = context->input(1);
+    const Tensor& out_image = context->input(2);
+
+    OP_REQUIRES(context, in_grads.dims() == 4 && in_image.dims() == 4,
+                errors::InvalidArgument("inputs must be 4-dimensional"));
+    const int64 batch = in_grads.dim_size(0);
+    const int64 rows = in_grads.dim_size(1);
+    const int64 cols = in_grads.dim_size(2);
+    const int64 depth = in_grads.dim_size(3);
+    OP_REQUIRES(
+        context,
+        in_image.dim_size(0) == batch && in_image.dim_size(1) == rows &&
+            in_image.dim_size(2) == cols && in_image.dim_size(3) == depth &&
+            out_image.dim_size(0) == batch && out_image.dim_size(1) == rows &&
+            out_image.dim_size(2) == cols && out_image.dim_size(3) == depth,
+        errors::InvalidArgument(
+            "input_grads, input_image, and out_image should have the same "
+            "shape"));
+    const auto nodes = cols * rows;
+    auto grads_shaped = in_grads.shaped<float, 2>({nodes * batch, depth});
+    auto in_shaped = in_image.shaped<float, 2>({nodes * batch, depth});
+    auto activations = out_image.shaped<float, 2>({nodes * batch, depth});
+
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(
+                       0, TensorShape({batch, rows, cols, depth}), &output));
+    auto out_shaped = output->shaped<float, 2>({nodes * batch, depth});
+    out_shaped.setZero();
+
+    auto shard = [this, activations, in_shaped, grads_shaped, out_shaped,
+                  depth](int64 begin, int64 end) {
+      for (int64 i = begin; i < end; ++i) {
+        for (int64 j = 0; j < depth; ++j) {
+          // Let y be the LRN activations and x be the inputs along the depth
+          // dimension. (LRN operates independently along rows, cols, and
+          // batch.)
+          // We have
+          //   y_i = x_i / (bias + alpha * (sum_{j = i - depth_radius}^{i +
+          //         depth_radius} x_j^2))^beta
+          //
+          // Let N = bias + alpha * (sum_{j = i - depth_radius}^{i +
+          //         depth_radius} x_j^2)
+          //   dy_i/dx_i = (N^beta - x_i * beta * N^(beta-1) * 2 * alpha * x_i)
+          //               / N^(2*beta)
+          //   dy_i/dx_j = (      - x_i * beta * N^(beta-1) * 2 * alpha * x_j)
+          //               / N^(2*beta)
+          //
+          // NOTE(keveman): We can compute N by doing (y_i / x_i)^(1 / beta).
+          // However, this is numerically unstable for small values of x_i. We
+          // compute N explicitly here to avoid that.
+
+          int64 depth_begin = std::max<int64>(0, j - depth_radius_);
+          int64 depth_end = std::min<int64>(depth, j + depth_radius_ + 1);
+
+          float norm = 0.0f;
+          for (int64 k = depth_begin; k < depth_end; ++k) {
+            norm += in_shaped(i, k) * in_shaped(i, k);
+          }
+          norm = alpha_ * norm + bias_;
+          DCHECK_GT(norm, 1e-6);
+          for (int64 k = depth_begin; k < depth_end; ++k) {
+            float dyi = -2.0f * alpha_ * beta_ * in_shaped(i, k) *
+                        activations(i, j) / norm;
+            if (k == j) {
+              dyi += std::pow(norm, -beta_);
+            }
+            dyi *= grads_shaped(i, j);
+            const_cast<TTypes<float>::Tensor&>(out_shaped)(i, k) += dyi;
+          }
+        }
+      }
+    };
+    auto worker_threads =
+        *(context->device()->tensorflow_cpu_worker_threads());
+    Shard(worker_threads.num_threads, worker_threads.workers, nodes * batch,
+          depth * depth, shard);
+  }
+
+ private:
+  typedef Eigen::Tensor<float, 2>::DimensionPair DimPair;
+
+  int64 depth_radius_;
+  float bias_;
+  float alpha_;
+  float beta_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("LRNGrad").Device(DEVICE_CPU), LRNGradOp);
+
+#endif  // __ANDROID__
+
+}  // namespace tensorflow
--
cgit v1.2.3
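For readers following the math rather than the Eigen expression, here is a minimal standalone sketch of the forward computation both code paths above implement, i.e. out = in / (bias + alpha * windowed sum of squares)^beta. It is an illustration, not part of the patch; the name LRNReference and the flat NHWC buffer layout are assumptions made for the example.

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // Hypothetical reference implementation of LRN over a flat NHWC buffer:
    //   out(n, d) = in(n, d) /
    //       (bias + alpha * sum_{k in [d-R, d+R]} in(n, k)^2)^beta
    // where n indexes the (batch, row, col) positions, matching the kernel.
    std::vector<float> LRNReference(const std::vector<float>& in, int batch,
                                    int rows, int cols, int depth,
                                    int depth_radius, float bias, float alpha,
                                    float beta) {
      std::vector<float> out(in.size());
      const int nodes = batch * rows * cols;
      for (int n = 0; n < nodes; ++n) {
        const float* x = &in[n * depth];
        for (int d = 0; d < depth; ++d) {
          // Clamp the depth window [d - R, d + R] to the valid channel range.
          const int begin = std::max(0, d - depth_radius);
          const int end = std::min(depth, d + depth_radius + 1);
          float sum_sq = 0.0f;
          for (int k = begin; k < end; ++k) sum_sq += x[k] * x[k];
          out[n * depth + d] = x[d] / std::pow(bias + alpha * sum_sq, beta);
        }
      }
      return out;
    }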