Diffstat (limited to 'tensorflow/core/kernels/mkl_lrn_op.cc')
-rw-r--r-- tensorflow/core/kernels/mkl_lrn_op.cc | 722
1 file changed, 722 insertions(+), 0 deletions(-)
diff --git a/tensorflow/core/kernels/mkl_lrn_op.cc b/tensorflow/core/kernels/mkl_lrn_op.cc
new file mode 100644
index 0000000000..edca8e2553
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_lrn_op.cc
@@ -0,0 +1,722 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// LRN = Local Response Normalization
+// See docs in ../ops/nn_ops.cc. This OpKernel uses the MKL library: it
+// creates MKL layouts and primitives and uses MKL DNN primitives to compute
+// local response normalization.
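+//
+// For reference (see ../ops/nn_ops.cc), with r = depth_radius:
+//   sqr_sum[a, b, c, d] = sum(input[a, b, c, d - r : d + r + 1] ** 2)
+//   output = input / (bias + alpha * sqr_sum) ** beta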
+
+#ifdef INTEL_MKL
+
+#define EIGEN_USE_THREADS
+#include <vector>
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "third_party/mkl/include/mkl_dnn.h"
+#include "third_party/mkl/include/mkl_dnn_types.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/util/mkl_util.h"
+#include "tensorflow/core/util/tensor_format.h"
+
+#if !defined(IS_MOBILE_PLATFORM)
+#include "tensorflow/core/util/work_sharder.h"
+#endif
+
+namespace tensorflow {
+
+namespace {
+// Create a depth-by-depth band matrix with 1s along a swath of size (2 *
+// depth_radius + 1) around the diagonal.
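+// For example, depth = 5 and depth_radius = 1 give
+//   1 1 0 0 0
+//   1 1 1 0 0
+//   0 1 1 1 0
+//   0 0 1 1 1
+//   0 0 0 1 1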
+template <typename T>
+void GetBandMatrix(int depth, int depth_radius,
+ Eigen::Tensor<T, 2, Eigen::RowMajor>* result) {
+ result->setZero();
+ for (int row = 0; row < depth; ++row) {
+ const int begin = std::max<int>(0, row - depth_radius);
+ const int end = std::min<int>(depth, row + depth_radius + 1);
+ Eigen::DSizes<Eigen::DenseIndex, 2> start(row, begin);
+ Eigen::DSizes<Eigen::DenseIndex, 2> sizes(1, end - begin);
+ result->slice(start, sizes).setConstant(T(1));
+ }
+}
+
+} // namespace
+
+template <typename T>
+class MklLRNOp : public OpKernel {
+ public:
+ ~MklLRNOp() {}
+
+ explicit MklLRNOp(OpKernelConstruction* context) : OpKernel(context) {
+ int64 depth_radius64;
+ OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
+ OP_REQUIRES(
+ context,
+ FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
+ errors::InvalidArgument("depth_radius = ", depth_radius64,
+ " larger than int max"));
+ depth_radius_ = static_cast<int>(depth_radius64);
+
+ OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
+ OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_));
+ OP_REQUIRES_OK(context, context->GetAttr("beta", &beta_));
+ workspace_enabled_ = false;
+ context->GetAttr("workspace_enabled", &workspace_enabled_);
+ }
+
+ void Compute(OpKernelContext* context) override {
+ MklLRNOpContext mkl_context;
+
+ const Tensor& input = MklGetInput(context, 0);
+ GetMklShape(context, 0, &mkl_context.input_shape);
+ bool input_in_mkl_format = mkl_context.input_shape.IsMklTensor();
+
+ // Sanity checks
+ mkl_context.in_dims = input_in_mkl_format
+ ? mkl_context.input_shape.GetDimension()
+ : input.dims();
+ OP_REQUIRES(context, mkl_context.in_dims == 4,
+ errors::InvalidArgument("input must be 4-dimensional"));
+ OP_REQUIRES(
+ context,
+ FastBoundsCheck(input.NumElements(), std::numeric_limits<int>::max()),
+ errors::InvalidArgument("argument to LRN too large"));
+
+ if (!input_in_mkl_format) {
+ mkl_context.MklDefaultToEigen(context, depth_radius_, bias_, alpha_,
+ beta_, input);
+ return;
+ }
+
+ if (input_in_mkl_format) {
+ // MKL supports normalization over channel dimension only
+ if (mkl_context.input_shape.tf_dim_idx(mkl_context.in_dims - 1) ==
+ MklDims::C) {
+ mkl_context.lt_input =
+ static_cast<dnnLayout_t>(mkl_context.input_shape.GetCurLayout());
+ workspace_enabled_ = true;
+ } else {
+ mkl_context.MklDefaultToEigen(context, depth_radius_, bias_, alpha_,
+ beta_, input);
+ return;
+ }
+ }
+
+ int kernel_size = 2 * depth_radius_ + 1;
+
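+ // Note: MKL's LRN primitive divides alpha by the window size (the Caffe
+ // convention), while TensorFlow's alpha is applied as-is; scaling alpha_ by
+ // kernel_size below reconciles the two conventions.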
+ CHECK_EQ(dnnLRNCreateForward_F32(
+ &mkl_context.lrn_fwd, NULL, mkl_context.lt_input, kernel_size,
+ static_cast<float>(alpha_ * kernel_size), beta_, bias_),
+ E_SUCCESS);
+
+ // Allocate output tensor and shape
+ Tensor* output = nullptr;
+ Tensor* workspace = nullptr;
+
+ // Convert Inputs if needed
+ Tensor mkl_tmp_input_buf_tensor;
+ mkl_context.MklPrepareLRNInputs(context, &mkl_tmp_input_buf_tensor);
+
+ // Allocate Layer Outputs
+ mkl_context.MklAllocateOutputs(context, &output, &workspace,
+ workspace_enabled_);
+
+ Tensor mkl_tmp_workspace_buf_tensor;
+ mkl_context.MklPrepareLRNOutputs(context, output, workspace,
+ &mkl_tmp_workspace_buf_tensor,
+ workspace_enabled_);
+
+ // Execute LRN.
+ CHECK_EQ(dnnExecute_F32(mkl_context.lrn_fwd, mkl_context.lrn_res),
+ E_SUCCESS);
+
+ // Release MKL resources.
+ mkl_context.MklCleanup();
+ }
+
+ private:
+ typedef struct {
+ size_t in_dims;
+ size_t in_sizes[4];
+ size_t in_strides[4];
+ size_t out_sizes[4];
+ size_t out_strides[4];
+ MklShape input_shape;
+ dnnPrimitive_t lrn_fwd = nullptr;
+ dnnPrimitive_t convert_input = nullptr;
+ /* dnnPrimitive_t convert_output; */
+ dnnLayout_t lt_input = nullptr;
+ /* dnnLayout_t lt_output; */
+ dnnLayout_t lt_internal_input = nullptr;
+ dnnLayout_t lt_internal_workspace = nullptr;
+ dnnLayout_t lt_internal_output = nullptr;
+ void* lrn_res[dnnResourceNumber];
+
+ // Convert Inputs if needed
+ void MklPrepareLRNInputs(OpKernelContext* context,
+ Tensor* mkl_tmp_input_buf_tensor) {
+ const Tensor& input = MklGetInput(context, 0);
+ void* mkl_buf_input =
+ const_cast<void*>(static_cast<const void*>(input.flat<T>().data()));
+
+ CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_internal_input, lrn_fwd,
+ dnnResourceSrc),
+ E_SUCCESS);
+
+ void* mkl_buf_convert_input = nullptr;
+ bool mkl_convert_input = false;
+ mkl_convert_input = !dnnLayoutCompare_F32(lt_internal_input, lt_input);
+
+ if (mkl_convert_input) {
+ CHECK_EQ(dnnConversionCreate_F32(&convert_input, lt_input,
+ lt_internal_input),
+ E_SUCCESS);
+ AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, lt_internal_input,
+ &mkl_buf_convert_input);
+ CHECK_EQ(dnnConversionExecute_F32(convert_input, mkl_buf_input,
+ mkl_buf_convert_input),
+ E_SUCCESS);
+ dnnDelete_F32(convert_input);
+ }
+
+ lrn_res[dnnResourceSrc] =
+ (mkl_convert_input) ? mkl_buf_convert_input : mkl_buf_input;
+ }
+
+ // Allocate Layer Outputs
+ void MklAllocateOutputs(OpKernelContext* context, Tensor** output,
+ Tensor** workspace, bool workspace_enabled_) {
+ TensorShape mkl_output_tf_shape; /* First tensor */
+ MklShape mkl_output_mkl_shape; /* Second tensor */
+
+ mkl_output_mkl_shape.SetMklTensor(true);
+ mkl_output_mkl_shape.SetMklLayout(lrn_fwd, dnnResourceDst);
+ mkl_output_mkl_shape.SetTfLayout(in_dims, input_shape.GetSizes(),
+ input_shape.GetStrides());
+ mkl_output_mkl_shape.SetTfDimOrder(in_dims,
+ input_shape.GetTfToMklDimMap());
+ mkl_output_tf_shape.AddDim(
+ dnnLayoutGetMemorySize_F32(
+ static_cast<dnnLayout_t>(mkl_output_mkl_shape.GetMklLayout())) /
+ sizeof(T));
+ AllocateOutputSetMklShape(context, 0, output,
+ mkl_output_tf_shape /* First tensor */,
+ mkl_output_mkl_shape /* Second Tensor */);
+
+ if (workspace_enabled_) {
+ TensorShape mkl_workspace_tf_shape; /* First tensor */
+ MklShape mkl_workspace_mkl_shape; /* Second tensor */
+ mkl_workspace_mkl_shape.SetMklTensor(false);
+ mkl_workspace_mkl_shape.SetMklLayout(lrn_fwd, dnnResourceWorkspace);
+ // Assumes workspace has same TF layout and TF dim order as input
+ mkl_workspace_mkl_shape.SetTfLayout(in_dims, input_shape.GetSizes(),
+ input_shape.GetStrides());
+ mkl_workspace_mkl_shape.SetTfDimOrder(in_dims,
+ input_shape.GetTfToMklDimMap());
+ mkl_workspace_tf_shape.AddDim(
+ dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
+ mkl_workspace_mkl_shape.GetMklLayout())) /
+ sizeof(T));
+ AllocateOutputSetMklShape(context, 1, workspace,
+ mkl_workspace_tf_shape /* First tensor */,
+ mkl_workspace_mkl_shape /* Second Tensor */);
+ }
+ }
+
+ void MklPrepareLRNOutputs(OpKernelContext* context, Tensor* output,
+ Tensor* workspace,
+ Tensor* mkl_tmp_workspace_buf_tensor,
+ bool workspace_enabled_) {
+ CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_internal_workspace, lrn_fwd,
+ dnnResourceWorkspace),
+ E_SUCCESS);
+
+ CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_internal_output, lrn_fwd,
+ dnnResourceDst),
+ E_SUCCESS);
+
+ void* mkl_buf_output =
+ const_cast<void*>(static_cast<const void*>(output->flat<T>().data()));
+ lrn_res[dnnResourceDst] = mkl_buf_output;
+
+ void* mkl_buf_workspace = nullptr;
+ if (workspace_enabled_) {
+ mkl_buf_workspace = const_cast<void*>(
+ static_cast<const void*>(workspace->flat<T>().data()));
+ } else {
+ AllocTmpBuffer(context, mkl_tmp_workspace_buf_tensor,
+ lt_internal_workspace, &mkl_buf_workspace);
+ }
+ lrn_res[dnnResourceWorkspace] = mkl_buf_workspace;
+ }
+
+ // Fallback implementation - Taken from lrn_op.cc
+ // TODO(intelft) Check if we can use EigenLRNOp directly instead of making a
+ // copy.
+ void MklDefaultToEigen(OpKernelContext* context, int depth_radius_,
+ float bias_, float alpha_, float beta_,
+ const Tensor& input) {
+ const int batch = static_cast<int>(input.dim_size(0));
+ const int rows = static_cast<int>(input.dim_size(1));
+ const int cols = static_cast<int>(input.dim_size(2));
+ const int depth = static_cast<int>(input.dim_size(3));
+ const int nodes = cols * rows;
+
+ auto in_shaped = input.shaped<T, 2>({nodes * batch, depth});
+ // Multiplying the input with the band matrix has the effect of reducing
+ // the correct patch along the depth.
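+ // That is, (in^2 * multiplier)(i, d) =
+ //     sum_{k = d - depth_radius_}^{d + depth_radius_} in(i, k)^2.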
+ Eigen::Tensor<T, 2, Eigen::RowMajor> multiplier(depth, depth);
+ GetBandMatrix<T>(depth, depth_radius_, &multiplier);
+
+ Tensor *output, *workspace;
+ MklShape mkl_output_mkl_shape, mkl_workspace_mkl_shape;
+ mkl_output_mkl_shape.SetMklTensor(false);
+ mkl_output_mkl_shape.SetDimensions(4);
+ AllocateOutputSetMklShape(context, 0, &output, input.shape(),
+ mkl_output_mkl_shape);
+
+ mkl_workspace_mkl_shape.SetMklTensor(false);
+ mkl_workspace_mkl_shape.SetDimensions(4);
+ AllocateOutputSetMklShape(context, 1, &workspace, input.shape(),
+ mkl_workspace_mkl_shape);
+
+ auto out_shaped = output->shaped<T, 2>({nodes * batch, depth});
+ Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
+ auto tmp = in_shaped.square().contract(multiplier, dims) * alpha_ + bias_;
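+ // beta == 1 and beta == 0.5 get the cheaper reciprocal / rsqrt paths; any
+ // other beta falls back to the generic exp(log(tmp) * -beta) form.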
+ if (beta_ == T(1)) {
+ out_shaped.device(context->eigen_cpu_device()) =
+ in_shaped * tmp.inverse();
+ } else if (beta_ == T(0.5)) {
+ out_shaped.device(context->eigen_cpu_device()) =
+ in_shaped * tmp.rsqrt();
+ } else {
+ out_shaped.device(context->eigen_cpu_device()) =
+ in_shaped * (tmp.log() * -beta_).exp();
+ }
+ }
+
+ // Release MKL resources.
+ void MklCleanup() {
+ dnnDelete_F32(lrn_fwd);
+ dnnLayoutDelete_F32(lt_internal_input);
+ dnnLayoutDelete_F32(lt_internal_workspace);
+ dnnLayoutDelete_F32(lt_internal_output);
+ }
+ } MklLRNOpContext;
+
+ typedef typename Eigen::Tensor<T, 1, Eigen::RowMajor>::DimensionPair DimPair;
+
+ bool workspace_enabled_;
+ int depth_radius_;
+ float bias_;
+ float alpha_;
+ float beta_;
+};
+
+template <typename T>
+class MklLRNGradOp : public OpKernel {
+ public:
+ explicit MklLRNGradOp(OpKernelConstruction* context) : OpKernel(context) {
+ int64 depth_radius64;
+ OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
+ OP_REQUIRES(
+ context,
+ FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
+ errors::InvalidArgument("depth_radius = ", depth_radius64,
+ " larger than int max"));
+ depth_radius_ = static_cast<int>(depth_radius64);
+ OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
+ OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_));
+ OP_REQUIRES_OK(context, context->GetAttr("beta", &beta_));
+ workspace_enabled_ = false;
+ context->GetAttr("workspace_enabled", &workspace_enabled_);
+ }
+
+ void Compute(OpKernelContext* context) override {
+ MklLRNGradOpContext mkl_context;
+ mkl_context.depth_radius_ = depth_radius_;
+ mkl_context.bias_ = bias_;
+ mkl_context.alpha_ = alpha_;
+ mkl_context.beta_ = beta_;
+
+ const Tensor& in_grads = MklGetInput(context, 0);
+ const Tensor& in_image = MklGetInput(context, 1);
+ const Tensor& out_image = MklGetInput(context, 2);
+
+ GetMklShape(context, 0, &mkl_context.ingrad_shape);
+ GetMklShape(context, 1, &mkl_context.inimage_shape);
+ GetMklShape(context, 2, &mkl_context.outimage_shape);
+
+ bool ingrad_in_mkl_format = mkl_context.ingrad_shape.IsMklTensor();
+ bool inimage_in_mkl_format = mkl_context.inimage_shape.IsMklTensor();
+ bool outimage_in_mkl_format = mkl_context.outimage_shape.IsMklTensor();
+
+ mkl_context.in_dims = inimage_in_mkl_format
+ ? mkl_context.inimage_shape.GetDimension()
+ : in_image.dims();
+ OP_REQUIRES(context, mkl_context.in_dims == 4,
+ errors::InvalidArgument("input images must be 4-dimensional"));
+
+ if (!workspace_enabled_) {
+ mkl_context.MklDefaultToEigen(context);
+ return;
+ }
+ if (ingrad_in_mkl_format || inimage_in_mkl_format) {
+ const MklShape* tmp_mkl_shape = (ingrad_in_mkl_format)
+ ? &mkl_context.ingrad_shape
+ : &mkl_context.inimage_shape;
+ if (tmp_mkl_shape->tf_dim_idx(mkl_context.in_dims - 1) != MklDims::C) {
+ // Fallback to eigen
+ mkl_context.MklDefaultToEigen(context);
+ return;
+ } else { // MKL supports normalization over channel dimension only
+ for (int i = 0; i < mkl_context.in_dims; i++) {
+ mkl_context.in_sizes[i] = mkl_context.out_sizes[i] =
+ tmp_mkl_shape->GetSizes()[i];
+ mkl_context.in_strides[i] = mkl_context.out_strides[i] =
+ tmp_mkl_shape->GetStrides()[i];
+ }
+ }
+ } else {
+ // Fallback to eigen
+ mkl_context.MklDefaultToEigen(context);
+ return;
+ }
+
+ // Sanity-check the input dimensions.
+ if (ingrad_in_mkl_format) {
+ OP_REQUIRES(
+ context, mkl_context.ingrad_shape.GetDimension() == 4,
+ errors::InvalidArgument("input gradient must be 4-dimensional"));
+ } else {
+ OP_REQUIRES(
+ context, in_grads.dims() == 4,
+ errors::InvalidArgument("input gradient must be 4-dimensional"));
+ }
+
+ if (outimage_in_mkl_format) {
+ OP_REQUIRES(
+ context, mkl_context.outimage_shape.GetDimension() == 4,
+ errors::InvalidArgument("Output image must be 4-dimensional"));
+ } else {
+ OP_REQUIRES(
+ context, out_image.dims() == 4,
+ errors::InvalidArgument("Output image must be 4-dimensional"));
+ }
+
+ // Prepare mkl input layout
+ mkl_context.MklPrepareLRNInputsLayouts(context);
+ int ksize = 2 * depth_radius_ + 1;
+
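+ // The same alpha-scaling convention as in the forward op applies here.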
+ CHECK_EQ(dnnLRNCreateBackward_F32(
+ &mkl_context.lrn_bwd, NULL, mkl_context.lt_input,
+ mkl_context.lt_output, ksize,
+ static_cast<float>(alpha_ * ksize), beta_, bias_),
+ E_SUCCESS);
+
+ // Allocate output tensor and shape.
+ TensorShape mkl_output_tf_shape; /* First tensor */
+ MklShape mkl_output_mkl_shape; /* Second tensor */
+ mkl_output_mkl_shape.SetMklTensor(true);
+ CHECK_NE(mkl_context.lrn_bwd, nullptr);
+ mkl_output_mkl_shape.SetMklLayout(mkl_context.lrn_bwd, dnnResourceDiffSrc);
+ mkl_output_mkl_shape.SetTfLayout(mkl_context.in_dims, mkl_context.out_sizes,
+ mkl_context.out_strides);
+ if (ingrad_in_mkl_format) {
+ mkl_output_mkl_shape.SetTfDimOrder(
+ mkl_context.in_dims, mkl_context.ingrad_shape.GetTfToMklDimMap());
+ } else {
+ mkl_output_mkl_shape.SetTfDimOrder(
+ mkl_context.in_dims, mkl_context.inimage_shape.GetTfToMklDimMap());
+ }
+ mkl_output_tf_shape.AddDim(
+ dnnLayoutGetMemorySize_F32(
+ static_cast<dnnLayout_t>(mkl_output_mkl_shape.GetMklLayout())) /
+ sizeof(T));
+ Tensor* output = nullptr;
+ AllocateOutputSetMklShape(context, 0, &output, mkl_output_tf_shape,
+ mkl_output_mkl_shape);
+
+ // Get pointers to output data.
+ void* user_output =
+ const_cast<void*>(static_cast<const void*>(output->flat<T>().data()));
+
+ Tensor mkl_tmp_input_buf_tensor, mkl_tmp_image_buf_tensor,
+ mkl_tmp_outimage_buf_tensor, mkl_tmp_workspace_buf_tensor;
+ // Convert Inputs if needed
+ mkl_context.MklPrepareLRNGradInput(
+ context, &mkl_tmp_input_buf_tensor, &mkl_tmp_image_buf_tensor,
+ &mkl_tmp_outimage_buf_tensor, &mkl_tmp_workspace_buf_tensor);
+
+ // We do not convert the output; we simply emit it in MKL format.
+ mkl_context.res_lrn_bwd[dnnResourceDiffSrc] = user_output;
+ // Execute LRN backward using dnnExecute
+ CHECK_EQ(dnnExecute_F32(mkl_context.lrn_bwd, mkl_context.res_lrn_bwd),
+ E_SUCCESS);
+ // Release MKL resources.
+ mkl_context.Mklcleanup();
+ }
+
+ private:
+ typedef struct {
+ int depth_radius_;
+ float bias_;
+ float alpha_;
+ float beta_;
+ size_t in_dims;
+ size_t in_sizes[4];
+ size_t in_strides[4];
+ size_t out_sizes[4];
+ size_t out_strides[4];
+ MklShape ingrad_shape, inimage_shape, outimage_shape;
+ dnnPrimitive_t lrn_bwd = nullptr;
+ dnnPrimitive_t convert_input = nullptr;
+ /* dnnPrimitive_t convert_output; */
+ dnnLayout_t lt_input = nullptr;
+ dnnLayout_t lt_output = nullptr;
+ dnnLayout_t lt_bdw_input = nullptr;
+ dnnLayout_t lt_workspace = nullptr;
+ dnnLayout_t lt_internal_input = nullptr;
+ /* dnnLayout_t lt_internal_workspace;
+ dnnLayout_t lt_internal_output; */
+ void* res_lrn_bwd[dnnResourceNumber];
+
+ // prepare mkl input
+ void MklPrepareLRNInputsLayouts(OpKernelContext* context) {
+ bool ingrad_in_mkl_format = ingrad_shape.IsMklTensor();
+ bool inimage_in_mkl_format = inimage_shape.IsMklTensor();
+ if (!ingrad_in_mkl_format) {
+ CHECK_EQ(dnnLayoutCreate_F32(&lt_input, in_dims, in_sizes, in_strides),
+ E_SUCCESS);
+ } else {
+ lt_input = static_cast<dnnLayout_t>(ingrad_shape.GetCurLayout());
+ }
+
+ if (!inimage_in_mkl_format) {
+ CHECK_EQ(
+ dnnLayoutCreate_F32(&lt_output, in_dims, out_sizes, out_strides),
+ E_SUCCESS);
+ } else {
+ lt_output = static_cast<dnnLayout_t>(inimage_shape.GetCurLayout());
+ }
+ }
+
+ // convert input if needed
+ void MklPrepareLRNGradInput(OpKernelContext* context,
+ Tensor* mkl_tmp_input_buf_tensor,
+ Tensor* mkl_tmp_image_buf_tensor,
+ Tensor* mkl_tmp_outimage_buf_tensor,
+ Tensor* mkl_tmp_workspace_buf_tensor) {
+ const Tensor& in_grads = MklGetInput(context, 0);
+ const Tensor& in_image = MklGetInput(context, 1);
+ const Tensor& out_image = MklGetInput(context, 2);
+
+ void* user_input = const_cast<void*>(
+ static_cast<const void*>(in_grads.flat<T>().data()));
+ void* user_fwd_input = const_cast<void*>(
+ static_cast<const void*>(in_image.flat<T>().data()));
+ void* user_fwd_output = const_cast<void*>(
+ static_cast<const void*>(out_image.flat<T>().data()));
+ CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_workspace, lrn_bwd,
+ dnnResourceWorkspace),
+ E_SUCCESS);
+ CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_bdw_input, lrn_bwd,
+ dnnResourceDiffDst),
+ E_SUCCESS);
+
+ bool ingrad_in_mkl_format = ingrad_shape.IsMklTensor();
+ if (ingrad_in_mkl_format) {
+ if (!dnnLayoutCompare_F32(lt_bdw_input, lt_input)) {
+ AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, lt_bdw_input,
+ &res_lrn_bwd[dnnResourceDiffDst]);
+ ingrad_shape.GetConvertedFlatData(lt_bdw_input, user_input,
+ res_lrn_bwd[dnnResourceDiffDst]);
+ } else {
+ res_lrn_bwd[dnnResourceDiffDst] = user_input;
+ }
+ } else {
+ if (!dnnLayoutCompare_F32(lt_bdw_input, lt_input)) {
+ CHECK_EQ(
+ dnnConversionCreate_F32(&convert_input, lt_input, lt_bdw_input),
+ E_SUCCESS);
+
+ AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, lt_bdw_input,
+ &res_lrn_bwd[dnnResourceDiffDst]);
+ CHECK_EQ(dnnConversionExecute_F32(convert_input, user_input,
+ res_lrn_bwd[dnnResourceDiffDst]),
+ E_SUCCESS);
+ dnnDelete_F32(convert_input);
+ } else {
+ res_lrn_bwd[dnnResourceDiffDst] = user_input;
+ }
+ }
+
+// Although MKL documentation for LRN does not specify setting/getting
+// of dnnResourceSrc and dnnResourceDst, Caffe code sets dnnResourceSrc.
+// So we set dnnResourceSrc here. But we do not know why we are setting
+// dnnResourceDst.
+#if 0
+ // NOTE: The code below is kept just so that we know how we should handle
+ // dnnResourceSrc if the primitive layout for dnnResourceSrc was supported.
+
+ if (!dnnLayoutCompare_F32(lt_internal_input,
+ static_cast<dnnLayout_t>(inimage_shape.GetCurLayout()))) {
+ AllocTmpBuffer(context, mkl_tmp_image_buf_tensor, lt_internal_input,
+ &res_lrn_bwd[dnnResourceSrc]);
+ inimage_shape.GetConvertedFlatData(lt_internal_input,
+ user_fwd_input,
+ res_lrn_bwd[dnnResourceSrc]);
+ } else {
+ res_lrn_bwd[dnnResourceSrc] = user_fwd_input;
+ }
+#endif
+
+ // Since we cannot get the expected layout for dnnResourceSrc, we construct
+ // the buffer using the MKL format when the input is in MKL format.
+ if (inimage_shape.IsMklTensor()) {
+ AllocTmpBuffer(context, mkl_tmp_image_buf_tensor,
+ (dnnLayout_t)inimage_shape.GetCurLayout(),
+ &res_lrn_bwd[dnnResourceSrc]);
+ } else {
+ res_lrn_bwd[dnnResourceSrc] = user_fwd_input;
+ }
+
+ // Same comment as above.
+ if (outimage_shape.IsMklTensor()) {
+ AllocTmpBuffer(context, mkl_tmp_outimage_buf_tensor,
+ (dnnLayout_t)outimage_shape.GetCurLayout(),
+ &res_lrn_bwd[dnnResourceDst]);
+ } else {
+ res_lrn_bwd[dnnResourceDst] = user_fwd_output;
+ }
+
+ // Allocate buffer for workspace.
+ AllocTmpBuffer(context, mkl_tmp_workspace_buf_tensor, lt_workspace,
+ &res_lrn_bwd[dnnResourceWorkspace]);
+ }
+
+ // Fallback implementation - Taken from lrn_op.cc
+ // TODO(intelft) Check if we can use EigenLRNOp directly instead of making a
+ // copy.
+ void MklDefaultToEigen(OpKernelContext* context) {
+ const Tensor& in_grads = MklGetInput(context, 0);
+ const Tensor& in_image = MklGetInput(context, 1);
+ const Tensor& out_image = MklGetInput(context, 2);
+
+ GetMklShape(context, 0, &ingrad_shape);
+ GetMklShape(context, 1, &inimage_shape);
+ GetMklShape(context, 2, &outimage_shape);
+
+ const int64 batch = static_cast<int64>(in_grads.dim_size(0));
+ const int64 rows = static_cast<int64>(in_grads.dim_size(1));
+ const int64 cols = static_cast<int64>(in_grads.dim_size(2));
+ const int64 depth = static_cast<int64>(in_grads.dim_size(3));
+ const auto nodes = cols * rows;
+
+ auto grads_shaped = in_grads.shaped<T, 2>({nodes * batch, depth});
+ auto in_shaped = in_image.shaped<T, 2>({nodes * batch, depth});
+ auto activations = out_image.shaped<T, 2>({nodes * batch, depth});
+
+ Tensor* output;
+ MklShape mkl_output_mkl_shape;
+ mkl_output_mkl_shape.SetMklTensor(false);
+ mkl_output_mkl_shape.SetDimensions(4);
+ AllocateOutputSetMklShape(context, 0, &output, in_grads.shape(),
+ mkl_output_mkl_shape);
+
+ auto out_shaped = output->shaped<T, 2>({nodes * batch, depth});
+ out_shaped.setZero();
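+ // With norm(i, j) = bias + alpha * sum_{k in window(j)} in(i, k)^2 and
+ // out(i, j) = in(i, j) * norm(i, j)^-beta, the derivative is
+ //   d out(i, j) / d in(i, k) =
+ //       -2 * alpha * beta * in(i, k) * out(i, j) / norm(i, j)
+ //       + (k == j ? norm(i, j)^-beta : 0),
+ // which the shard below accumulates, weighted by grads(i, j).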
+ auto shard = [this, activations, in_shaped, grads_shaped, out_shaped,
+ depth](int64 begin, int64 end) {
+ for (int64 i = begin; i < end; ++i) {
+ for (int64 j = 0; j < depth; ++j) {
+ int64 depth_begin = std::max<int64>(0, j - depth_radius_);
+ int64 depth_end = std::min<int64>(depth, j + depth_radius_ + 1);
+
+ T norm(0);
+ for (int64 k = depth_begin; k < depth_end; ++k) {
+ norm += in_shaped(i, k) * in_shaped(i, k);
+ }
+ norm = alpha_ * norm + bias_;
+ DCHECK_GT(norm, T(1e-6));
+ for (int64 k = depth_begin; k < depth_end; ++k) {
+ T dyi = T(-2) * alpha_ * beta_ * in_shaped(i, k) *
+ activations(i, j) / norm;
+ if (k == j) {
+ dyi += Eigen::numext::pow(norm, -beta_);
+ }
+ dyi *= grads_shaped(i, j);
+ const_cast<typename TTypes<T, 2>::Tensor&>(out_shaped)(i, k) +=
+ dyi;
+ }
+ }
+ }
+ };
+ auto worker_threads =
+ *(context->device()->tensorflow_cpu_worker_threads());
+ Shard(worker_threads.num_threads, worker_threads.workers, nodes * batch,
+ depth * depth, shard);
+ }
+
+ // release mkl resources
+ void Mklcleanup() {
+ bool ingrad_in_mkl_format = ingrad_shape.IsMklTensor();
+ bool inimage_in_mkl_format = inimage_shape.IsMklTensor();
+ if (!ingrad_in_mkl_format) {
+ CHECK_EQ(dnnLayoutDelete_F32(lt_input), E_SUCCESS);
+ }
+
+ if (!inimage_in_mkl_format) {
+ CHECK_EQ(dnnLayoutDelete_F32(lt_output), E_SUCCESS);
+ }
+ dnnDelete_F32(lrn_bwd);
+ dnnLayoutDelete_F32(lt_bdw_input);
+ dnnLayoutDelete_F32(lt_workspace);
+ }
+ } MklLRNGradOpContext;
+
+ typedef typename Eigen::Tensor<T, 1, Eigen::RowMajor>::DimensionPair DimPair;
+ bool workspace_enabled_;
+ int depth_radius_;
+ float bias_;
+ float alpha_;
+ float beta_;
+};
+
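+// Register the MKL LRN kernels. They back the "_MklLRN" and "_MklLRNGrad"
+// nodes created by the MKL graph-rewrite pass (note the kMklOpLabel label);
+// the standard LRN/LRNGrad ops keep the default Eigen kernels.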
+#define REGISTER_MKL_LRN_CPU(T) \
+ REGISTER_KERNEL_BUILDER(Name("_MklLRN") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<T>("T") \
+ .Label(mkl_op_registry::kMklOpLabel), \
+ MklLRNOp<T>); \
+ REGISTER_KERNEL_BUILDER(Name("_MklLRNGrad") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<T>("T") \
+ .Label(mkl_op_registry::kMklOpLabel), \
+ MklLRNGradOp<T>);
+
+TF_CALL_float(REGISTER_MKL_LRN_CPU);
+
+} // namespace tensorflow
+
+#endif // INTEL_MKL