Diffstat (limited to 'tensorflow/core/kernels/mkl_lrn_op.cc')
-rw-r--r--  tensorflow/core/kernels/mkl_lrn_op.cc  722
1 file changed, 722 insertions(+), 0 deletions(-)
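Note for reviewers: both the MKL path and the Eigen fallback added by this patch compute the standard TensorFlow LRN, where each element is divided by (bias + alpha * sum of squares over a window of 2 * depth_radius + 1 channels)^beta. The sketch below is a minimal scalar reference of that formula and is not part of the patch; the helper name lrn_reference, its parameters, and the flat [rows, depth] indexing are illustrative assumptions only. The patch passes alpha_ * kernel_size to dnnLRNCreateForward_F32/dnnLRNCreateBackward_F32, presumably because the MKL primitive normalizes alpha by the window size while TensorFlow's definition does not.

  // Scalar reference for TensorFlow-style LRN over the channel (depth) axis.
  // Hypothetical helper for illustration only; not part of the patch.
  #include <algorithm>
  #include <cmath>
  #include <vector>

  std::vector<float> lrn_reference(const std::vector<float>& in, int depth,
                                   int depth_radius, float bias, float alpha,
                                   float beta) {
    // 'in' is a flattened [rows, depth] buffer; every row is normalized
    // independently across its 'depth' channels.
    std::vector<float> out(in.size());
    const int rows = static_cast<int>(in.size()) / depth;
    for (int r = 0; r < rows; ++r) {
      for (int c = 0; c < depth; ++c) {
        const int begin = std::max(0, c - depth_radius);
        const int end = std::min(depth, c + depth_radius + 1);
        float sqr_sum = 0.0f;
        for (int k = begin; k < end; ++k) {
          const float v = in[r * depth + k];
          sqr_sum += v * v;
        }
        out[r * depth + c] =
            in[r * depth + c] / std::pow(bias + alpha * sqr_sum, beta);
      }
    }
    return out;
  }
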
diff --git a/tensorflow/core/kernels/mkl_lrn_op.cc b/tensorflow/core/kernels/mkl_lrn_op.cc
new file mode 100644
index 0000000000..edca8e2553
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_lrn_op.cc
@@ -0,0 +1,722 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// LRN = Local Response Normalization
+// See docs in ../ops/nn_ops.cc. This OpKernel uses the MKL library: it
+// creates MKL layouts and primitives and uses MKL DNN primitives to compute
+// local response normalization.
+
+#ifdef INTEL_MKL
+
+#define EIGEN_USE_THREADS
+#include <vector>
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "third_party/mkl/include/mkl_dnn.h"
+#include "third_party/mkl/include/mkl_dnn_types.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/util/mkl_util.h"
+#include "tensorflow/core/util/tensor_format.h"
+
+#if !defined(IS_MOBILE_PLATFORM)
+#include "tensorflow/core/util/work_sharder.h"
+#endif
+
+namespace tensorflow {
+
+namespace {
+// Create a depth-by-depth band matrix with 1s along a swath of size (2 *
+// depth_radius + 1) around the diagonal.
+template <typename T>
+void GetBandMatrix(int depth, int depth_radius,
+                   Eigen::Tensor<T, 2, Eigen::RowMajor>* result) {
+  result->setZero();
+  for (int row = 0; row < depth; ++row) {
+    const int begin = std::max<int>(0, row - depth_radius);
+    const int end = std::min<int>(depth, row + depth_radius + 1);
+    Eigen::DSizes<Eigen::DenseIndex, 2> start(row, begin);
+    Eigen::DSizes<Eigen::DenseIndex, 2> sizes(1, end - begin);
+    result->slice(start, sizes).setConstant(T(1));
+  }
+}
+
+}  // namespace
+
+template <typename T>
+class MklLRNOp : public OpKernel {
+ public:
+  ~MklLRNOp() {}
+
+  explicit MklLRNOp(OpKernelConstruction* context) : OpKernel(context) {
+    int64 depth_radius64;
+    OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("depth_radius = ", depth_radius64,
+                                " larger than int max"));
+    depth_radius_ = static_cast<size_t>(depth_radius64);
+
+    OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
+    OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_));
+    OP_REQUIRES_OK(context, context->GetAttr("beta", &beta_));
+    workspace_enabled_ = false;
+    context->GetAttr("workspace_enabled", &workspace_enabled_);
+  }
+
+  void Compute(OpKernelContext* context) override {
+    MklLRNOpContext mkl_context;
+
+    const Tensor& input = MklGetInput(context, 0);
+    GetMklShape(context, 0, &mkl_context.input_shape);
+    bool input_in_mkl_format = mkl_context.input_shape.IsMklTensor();
+
+    // Sanity checks
+    mkl_context.in_dims = input_in_mkl_format
+                              ? mkl_context.input_shape.GetDimension()
+                              : input.dims();
+    OP_REQUIRES(context, mkl_context.in_dims == 4,
+                errors::InvalidArgument("input must be 4-dimensional"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(input.NumElements(), std::numeric_limits<int>::max()),
+        errors::InvalidArgument("argument to LRN too large"));
+
+    if (!input_in_mkl_format) {
+      mkl_context.MklDefaultToEigen(context, depth_radius_, bias_, alpha_,
+                                    beta_, input);
+      return;
+    }
+
+    if (input_in_mkl_format) {
+      // MKL supports normalization over channel dimension only
+      if (mkl_context.input_shape.tf_dim_idx(mkl_context.in_dims - 1) ==
+          MklDims::C) {
+        mkl_context.lt_input =
+            static_cast<dnnLayout_t>(mkl_context.input_shape.GetCurLayout());
+        workspace_enabled_ = true;
+      } else {
+        mkl_context.MklDefaultToEigen(context, depth_radius_, bias_, alpha_,
+                                      beta_, input);
+        return;
+      }
+    }
+
+    int kernel_size = 2 * depth_radius_ + 1;
+
+    CHECK_EQ(dnnLRNCreateForward_F32(
+                 &mkl_context.lrn_fwd, NULL, mkl_context.lt_input, kernel_size,
+                 static_cast<float>(alpha_ * kernel_size), beta_, bias_),
+             E_SUCCESS);
+
+    // Allocate output tensor and shape
+    Tensor* output = nullptr;
+    Tensor* workspace = nullptr;
+
+    // Convert Inputs if needed
+    Tensor mkl_tmp_input_buf_tensor;
+    mkl_context.MklPrepareLRNInputs(context, &mkl_tmp_input_buf_tensor);
+
+    // Allocate Layer Outputs
+    mkl_context.MklAllocateOutputs(context, &output, &workspace,
+                                   workspace_enabled_);
+
+    Tensor mkl_tmp_workspace_buf_tensor;
+    mkl_context.MklPrepareLRNOutputs(context, output, workspace,
+                                     &mkl_tmp_workspace_buf_tensor,
+                                     workspace_enabled_);
+
+    // Execute LRN.
+    CHECK_EQ(dnnExecute_F32(mkl_context.lrn_fwd, mkl_context.lrn_res),
+             E_SUCCESS);
+
+    // Release MKL resources.
+    mkl_context.MklCleanup();
+  }
+
+ private:
+  typedef struct {
+    size_t in_dims;
+    size_t in_sizes[4];
+    size_t in_strides[4];
+    size_t out_sizes[4];
+    size_t out_strides[4];
+    MklShape input_shape;
+    dnnPrimitive_t lrn_fwd = nullptr;
+    dnnPrimitive_t convert_input = nullptr;
+    /* dnnPrimitive_t convert_output; */
+    dnnLayout_t lt_input = nullptr;
+    /* dnnLayout_t lt_output; */
+    dnnLayout_t lt_internal_input = nullptr;
+    dnnLayout_t lt_internal_workspace = nullptr;
+    dnnLayout_t lt_internal_output = nullptr;
+    void* lrn_res[dnnResourceNumber];
+
+    // Convert Inputs if needed
+    void MklPrepareLRNInputs(OpKernelContext* context,
+                             Tensor* mkl_tmp_input_buf_tensor) {
+      const Tensor& input = MklGetInput(context, 0);
+      void* mkl_buf_input =
+          const_cast<void*>(static_cast<const void*>(input.flat<T>().data()));
+
+      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_internal_input, lrn_fwd,
+                                                dnnResourceSrc),
+               E_SUCCESS);
+
+      void* mkl_buf_convert_input = nullptr;
+      bool mkl_convert_input = false;
+      mkl_convert_input = !dnnLayoutCompare_F32(lt_internal_input, lt_input);
+
+      if (mkl_convert_input) {
+        CHECK_EQ(dnnConversionCreate_F32(&convert_input, lt_input,
+                                         lt_internal_input),
+                 E_SUCCESS);
+        AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, lt_internal_input,
+                       &mkl_buf_convert_input);
+        CHECK_EQ(dnnConversionExecute_F32(convert_input, mkl_buf_input,
+                                          mkl_buf_convert_input),
+                 E_SUCCESS);
+        dnnDelete_F32(convert_input);
+      }
+
+      lrn_res[dnnResourceSrc] =
+          (mkl_convert_input) ? mkl_buf_convert_input : mkl_buf_input;
+    }
+
+    // Allocate Layer Outputs
+    void MklAllocateOutputs(OpKernelContext* context, Tensor** output,
+                            Tensor** workspace, bool workspace_enabled_) {
+      TensorShape mkl_output_tf_shape; /* First tensor */
+      MklShape mkl_output_mkl_shape;   /* Second tensor */
+
+      mkl_output_mkl_shape.SetMklTensor(true);
+      mkl_output_mkl_shape.SetMklLayout(lrn_fwd, dnnResourceDst);
+      mkl_output_mkl_shape.SetTfLayout(in_dims, input_shape.GetSizes(),
+                                       input_shape.GetStrides());
+      mkl_output_mkl_shape.SetTfDimOrder(in_dims,
+                                         input_shape.GetTfToMklDimMap());
+      mkl_output_tf_shape.AddDim(
+          dnnLayoutGetMemorySize_F32(
+              static_cast<dnnLayout_t>(mkl_output_mkl_shape.GetMklLayout())) /
+          sizeof(T));
+      AllocateOutputSetMklShape(context, 0, output,
+                                mkl_output_tf_shape /* First tensor */,
+                                mkl_output_mkl_shape /* Second Tensor */);
+
+      if (workspace_enabled_) {
+        TensorShape mkl_workspace_tf_shape; /* First tensor */
+        MklShape mkl_workspace_mkl_shape;   /* Second tensor */
+        mkl_workspace_mkl_shape.SetMklTensor(false);
+        mkl_workspace_mkl_shape.SetMklLayout(lrn_fwd, dnnResourceWorkspace);
+        // Assumes workspace has same TF layout and TF dim order as input
+        mkl_workspace_mkl_shape.SetTfLayout(in_dims, input_shape.GetSizes(),
+                                            input_shape.GetStrides());
+        mkl_workspace_mkl_shape.SetTfDimOrder(in_dims,
+                                              input_shape.GetTfToMklDimMap());
+        mkl_workspace_tf_shape.AddDim(
+            dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
+                mkl_workspace_mkl_shape.GetMklLayout())) /
+            sizeof(T));
+        AllocateOutputSetMklShape(context, 1, workspace,
+                                  mkl_workspace_tf_shape /* First tensor */,
+                                  mkl_workspace_mkl_shape /* Second Tensor */);
+      }
+    }
+
+    void MklPrepareLRNOutputs(OpKernelContext* context, Tensor* output,
+                              Tensor* workspace,
+                              Tensor* mkl_tmp_workspace_buf_tensor,
+                              bool workspace_enabled_) {
+      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_internal_workspace,
+                                                lrn_fwd, dnnResourceWorkspace),
+               E_SUCCESS);
+
+      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_internal_output, lrn_fwd,
+                                                dnnResourceDst),
+               E_SUCCESS);
+
+      void* mkl_buf_output = const_cast<void*>(
+          static_cast<const void*>(output->flat<T>().data()));
+      lrn_res[dnnResourceDst] = mkl_buf_output;
+
+      void* mkl_buf_workspace = nullptr;
+      if (workspace_enabled_) {
+        mkl_buf_workspace = const_cast<void*>(
+            static_cast<const void*>(workspace->flat<T>().data()));
+      } else {
+        AllocTmpBuffer(context, mkl_tmp_workspace_buf_tensor,
+                       lt_internal_workspace, &mkl_buf_workspace);
+      }
+      lrn_res[dnnResourceWorkspace] = mkl_buf_workspace;
+    }
+
+    // Fallback implementation - Taken from lrn_op.cc
+    // TODO(intelft) Check if we can use EigenLRNOp directly instead of making
+    // a copy.
+    void MklDefaultToEigen(OpKernelContext* context, int depth_radius_,
+                           float bias_, float alpha_, float beta_,
+                           const Tensor& input) {
+      const int batch = static_cast<int>(input.dim_size(0));
+      const int rows = static_cast<int>(input.dim_size(1));
+      const int cols = static_cast<int>(input.dim_size(2));
+      const int depth = static_cast<int>(input.dim_size(3));
+      const int nodes = cols * rows;
+
+      auto in_shaped = input.shaped<T, 2>({nodes * batch, depth});
+      // Multiplying the input with the band matrix has the effect of reducing
+      // the correct patch along the depth.
+      Eigen::Tensor<T, 2, Eigen::RowMajor> multiplier(depth, depth);
+      GetBandMatrix<T>(depth, depth_radius_, &multiplier);
+
+      Tensor *output, *workspace;
+      MklShape mkl_output_mkl_shape, mkl_workspace_mkl_shape;
+      mkl_output_mkl_shape.SetMklTensor(false);
+      mkl_output_mkl_shape.SetDimensions(4);
+      AllocateOutputSetMklShape(context, 0, &output, input.shape(),
+                                mkl_output_mkl_shape);
+
+      mkl_workspace_mkl_shape.SetMklTensor(false);
+      mkl_workspace_mkl_shape.SetDimensions(4);
+      AllocateOutputSetMklShape(context, 1, &workspace, input.shape(),
+                                mkl_workspace_mkl_shape);
+
+      auto out_shaped = output->shaped<T, 2>({nodes * batch, depth});
+      Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
+      auto tmp =
+          in_shaped.square().contract(multiplier, dims) * alpha_ + bias_;
+      if (beta_ == T(1)) {
+        out_shaped.device(context->eigen_cpu_device()) =
+            in_shaped * tmp.inverse();
+      } else if (beta_ == T(0.5)) {
+        out_shaped.device(context->eigen_cpu_device()) =
+            in_shaped * tmp.rsqrt();
+      } else {
+        out_shaped.device(context->eigen_cpu_device()) =
+            in_shaped * (tmp.log() * -beta_).exp();
+      }
+    }
+
+    // Release MKL resources.
+    void MklCleanup() {
+      dnnDelete_F32(lrn_fwd);
+      dnnLayoutDelete_F32(lt_internal_input);
+      dnnLayoutDelete_F32(lt_internal_workspace);
+      dnnLayoutDelete_F32(lt_internal_output);
+    }
+  } MklLRNOpContext;
+
+  typedef typename Eigen::Tensor<T, 1, Eigen::RowMajor>::DimensionPair DimPair;
+
+  bool workspace_enabled_;
+  int depth_radius_;
+  float bias_;
+  float alpha_;
+  float beta_;
+};
+
+template <typename T>
+class MklLRNGradOp : public OpKernel {
+ public:
+  explicit MklLRNGradOp(OpKernelConstruction* context) : OpKernel(context) {
+    int64 depth_radius64;
+    OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("depth_radius = ", depth_radius64,
+                                " larger than int max"));
+    depth_radius_ = static_cast<int>(depth_radius64);
+    OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
+    OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_));
+    OP_REQUIRES_OK(context, context->GetAttr("beta", &beta_));
+    workspace_enabled_ = false;
+    context->GetAttr("workspace_enabled", &workspace_enabled_);
+  }
+
+  void Compute(OpKernelContext* context) override {
+    MklLRNGradOpContext mkl_context;
+    mkl_context.depth_radius_ = depth_radius_;
+    mkl_context.bias_ = bias_;
+    mkl_context.alpha_ = alpha_;
+    mkl_context.beta_ = beta_;
+
+    const Tensor& in_grads = MklGetInput(context, 0);
+    const Tensor& in_image = MklGetInput(context, 1);
+    const Tensor& out_image = MklGetInput(context, 2);
+
+    GetMklShape(context, 0, &mkl_context.ingrad_shape);
+    GetMklShape(context, 1, &mkl_context.inimage_shape);
+    GetMklShape(context, 2, &mkl_context.outimage_shape);
+
+    bool ingrad_in_mkl_format = mkl_context.ingrad_shape.IsMklTensor();
+    bool inimage_in_mkl_format = mkl_context.inimage_shape.IsMklTensor();
+    bool outimage_in_mkl_format = mkl_context.outimage_shape.IsMklTensor();
+
+    mkl_context.in_dims = inimage_in_mkl_format
+                              ? mkl_context.inimage_shape.GetDimension()
+                              : in_image.dims();
+    OP_REQUIRES(context, mkl_context.in_dims == 4,
+                errors::InvalidArgument("input images must be 4-dimensional"));
+
+    if (!workspace_enabled_) {
+      mkl_context.MklDefaultToEigen(context);
+      return;
+    }
+    if (ingrad_in_mkl_format || inimage_in_mkl_format) {
+      const MklShape* tmp_mkl_shape = (ingrad_in_mkl_format)
+                                          ? &mkl_context.ingrad_shape
+                                          : &mkl_context.inimage_shape;
+      if (tmp_mkl_shape->tf_dim_idx(mkl_context.in_dims - 1) != MklDims::C) {
+        // Fallback to eigen
+        mkl_context.MklDefaultToEigen(context);
+        return;
+      } else {  // MKL supports normalization over channel dimension only
+        for (int i = 0; i < mkl_context.in_dims; i++) {
+          mkl_context.in_sizes[i] = mkl_context.out_sizes[i] =
+              tmp_mkl_shape->GetSizes()[i];
+          mkl_context.in_strides[i] = mkl_context.out_strides[i] =
+              tmp_mkl_shape->GetStrides()[i];
+        }
+      }
+    } else {
+      // Fallback to eigen
+      mkl_context.MklDefaultToEigen(context);
+      return;
+    }
+
+    // Dimension checks for sanity purposes
+    if (ingrad_in_mkl_format) {
+      OP_REQUIRES(
+          context, mkl_context.ingrad_shape.GetDimension() == 4,
+          errors::InvalidArgument("input gradient must be 4-dimensional"));
+    } else {
+      OP_REQUIRES(
+          context, in_grads.dims() == 4,
+          errors::InvalidArgument("input gradient must be 4-dimensional"));
+    }
+
+    if (outimage_in_mkl_format) {
+      OP_REQUIRES(
+          context, mkl_context.outimage_shape.GetDimension() == 4,
+          errors::InvalidArgument("Output image must be 4-dimensional"));
+    } else {
+      OP_REQUIRES(
+          context, out_image.dims() == 4,
+          errors::InvalidArgument("Output image must be 4-dimensional"));
+    }
+
+    // Prepare MKL input layouts
+    mkl_context.MklPrepareLRNInputsLayouts(context);
+    int ksize = 2 * depth_radius_ + 1;
+
+    CHECK_EQ(dnnLRNCreateBackward_F32(
+                 &mkl_context.lrn_bwd, NULL, mkl_context.lt_input,
+                 mkl_context.lt_output, ksize,
+                 static_cast<float>(alpha_ * ksize), beta_, bias_),
+             E_SUCCESS);
+
+    // Allocate output tensor and shape.
+    TensorShape mkl_output_tf_shape; /* First tensor */
+    MklShape mkl_output_mkl_shape;   /* Second tensor */
+    mkl_output_mkl_shape.SetMklTensor(true);
+    CHECK_NE(mkl_context.lrn_bwd, nullptr);
+    mkl_output_mkl_shape.SetMklLayout(mkl_context.lrn_bwd, dnnResourceDiffSrc);
+    mkl_output_mkl_shape.SetTfLayout(mkl_context.in_dims,
+                                     mkl_context.out_sizes,
+                                     mkl_context.out_strides);
+    if (ingrad_in_mkl_format) {
+      mkl_output_mkl_shape.SetTfDimOrder(
+          mkl_context.in_dims, mkl_context.ingrad_shape.GetTfToMklDimMap());
+    } else {
+      mkl_output_mkl_shape.SetTfDimOrder(
+          mkl_context.in_dims, mkl_context.inimage_shape.GetTfToMklDimMap());
+    }
+    mkl_output_tf_shape.AddDim(
+        dnnLayoutGetMemorySize_F32(
+            static_cast<dnnLayout_t>(mkl_output_mkl_shape.GetMklLayout())) /
+        sizeof(T));
+    Tensor* output = nullptr;
+    AllocateOutputSetMklShape(context, 0, &output, mkl_output_tf_shape,
+                              mkl_output_mkl_shape);
+
+    // Get pointers to output data.
+    void* user_output =
+        const_cast<void*>(static_cast<const void*>(output->flat<T>().data()));
+
+    Tensor mkl_tmp_input_buf_tensor, mkl_tmp_image_buf_tensor,
+        mkl_tmp_outimage_buf_tensor, mkl_tmp_workspace_buf_tensor;
+    // Convert inputs if needed.
+    mkl_context.MklPrepareLRNGradInput(
+        context, &mkl_tmp_input_buf_tensor, &mkl_tmp_image_buf_tensor,
+        &mkl_tmp_outimage_buf_tensor, &mkl_tmp_workspace_buf_tensor);
+
+    // We do not do any conversion for the output; we simply emit it
+    // in MKL format.
+    mkl_context.res_lrn_bwd[dnnResourceDiffSrc] = user_output;
+    // Execute LRN backward using dnnExecute
+    CHECK_EQ(dnnExecute_F32(mkl_context.lrn_bwd, mkl_context.res_lrn_bwd),
+             E_SUCCESS);
+    // Release MKL resources.
+    mkl_context.Mklcleanup();
+  }
+
+ private:
+  typedef struct {
+    int depth_radius_;
+    float bias_;
+    float alpha_;
+    float beta_;
+    size_t in_dims;
+    size_t in_sizes[4];
+    size_t in_strides[4];
+    size_t out_sizes[4];
+    size_t out_strides[4];
+    MklShape ingrad_shape, inimage_shape, outimage_shape;
+    dnnPrimitive_t lrn_bwd = nullptr;
+    dnnPrimitive_t convert_input = nullptr;
+    /* dnnPrimitive_t convert_output; */
+    dnnLayout_t lt_input = nullptr;
+    dnnLayout_t lt_output = nullptr;
+    dnnLayout_t lt_bdw_input = nullptr;
+    dnnLayout_t lt_workspace = nullptr;
+    dnnLayout_t lt_internal_input = nullptr;
+    /* dnnLayout_t lt_internal_workspace;
+       dnnLayout_t lt_internal_output; */
+    void* res_lrn_bwd[dnnResourceNumber];
+
+    // Prepare MKL input layouts.
+    void MklPrepareLRNInputsLayouts(OpKernelContext* context) {
+      bool ingrad_in_mkl_format = ingrad_shape.IsMklTensor();
+      bool inimage_in_mkl_format = inimage_shape.IsMklTensor();
+      if (!ingrad_in_mkl_format) {
+        CHECK_EQ(dnnLayoutCreate_F32(&lt_input, in_dims, in_sizes, in_strides),
+                 E_SUCCESS);
+      } else {
+        lt_input = static_cast<dnnLayout_t>(ingrad_shape.GetCurLayout());
+      }
+
+      if (!inimage_in_mkl_format) {
+        CHECK_EQ(
+            dnnLayoutCreate_F32(&lt_output, in_dims, out_sizes, out_strides),
+            E_SUCCESS);
+      } else {
+        lt_output = static_cast<dnnLayout_t>(inimage_shape.GetCurLayout());
+      }
+    }
+
+    // Convert input if needed.
+    void MklPrepareLRNGradInput(OpKernelContext* context,
+                                Tensor* mkl_tmp_input_buf_tensor,
+                                Tensor* mkl_tmp_image_buf_tensor,
+                                Tensor* mkl_tmp_outimage_buf_tensor,
+                                Tensor* mkl_tmp_workspace_buf_tensor) {
+      const Tensor& in_grads = MklGetInput(context, 0);
+      const Tensor& in_image = MklGetInput(context, 1);
+      const Tensor& out_image = MklGetInput(context, 2);
+
+      void* user_input = const_cast<void*>(
+          static_cast<const void*>(in_grads.flat<T>().data()));
+      void* user_fwd_input = const_cast<void*>(
+          static_cast<const void*>(in_image.flat<T>().data()));
+      void* user_fwd_output = const_cast<void*>(
+          static_cast<const void*>(out_image.flat<T>().data()));
+      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_workspace, lrn_bwd,
+                                                dnnResourceWorkspace),
+               E_SUCCESS);
+      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_bdw_input, lrn_bwd,
+                                                dnnResourceDiffDst),
+               E_SUCCESS);
+
+      bool ingrad_in_mkl_format = ingrad_shape.IsMklTensor();
+      if (ingrad_in_mkl_format) {
+        if (!dnnLayoutCompare_F32(lt_bdw_input, lt_input)) {
+          AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, lt_bdw_input,
+                         &res_lrn_bwd[dnnResourceDiffDst]);
+          ingrad_shape.GetConvertedFlatData(lt_bdw_input, user_input,
+                                            res_lrn_bwd[dnnResourceDiffDst]);
+        } else {
+          res_lrn_bwd[dnnResourceDiffDst] = user_input;
+        }
+      } else {
+        if (!dnnLayoutCompare_F32(lt_bdw_input, lt_input)) {
+          CHECK_EQ(
+              dnnConversionCreate_F32(&convert_input, lt_input, lt_bdw_input),
+              E_SUCCESS);
+
+          AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, lt_bdw_input,
+                         &res_lrn_bwd[dnnResourceDiffDst]);
+          CHECK_EQ(dnnConversionExecute_F32(convert_input, user_input,
+                                            res_lrn_bwd[dnnResourceDiffDst]),
+                   E_SUCCESS);
+          dnnDelete_F32(convert_input);
+        } else {
+          res_lrn_bwd[dnnResourceDiffDst] = user_input;
+        }
+      }
+
+// Although the MKL documentation for LRN does not specify setting/getting
+// of dnnResourceSrc and dnnResourceDst, the Caffe code sets dnnResourceSrc.
+// So we set dnnResourceSrc here. But we do not know why we are setting
+// dnnResourceDst.
+#if 0
+      // NOTE: The code below is kept just so that we know how we should
+      // handle dnnResourceSrc if the primitive layout for dnnResourceSrc
+      // was supported.
+
+      if (!dnnLayoutCompare_F32(
+              lt_internal_input,
+              static_cast<dnnLayout_t>(inimage_shape.GetCurLayout()))) {
+        AllocTmpBuffer(context, mkl_tmp_image_buf_tensor, lt_internal_input,
+                       &res_lrn_bwd[dnnResourceSrc]);
+        inimage_shape.GetConvertedFlatData(lt_internal_input, user_fwd_input,
+                                           res_lrn_bwd[dnnResourceSrc]);
+      } else {
+        res_lrn_bwd[dnnResourceSrc] = user_fwd_input;
+      }
+#endif
+
+      // Since we cannot get the expected layout for dnnResourceSrc, we
+      // construct the buffer using the MKL format if the input is in MKL
+      // format.
+      if (inimage_shape.IsMklTensor()) {
+        AllocTmpBuffer(context, mkl_tmp_image_buf_tensor,
+                       (dnnLayout_t)inimage_shape.GetCurLayout(),
+                       &res_lrn_bwd[dnnResourceSrc]);
+      } else {
+        res_lrn_bwd[dnnResourceSrc] = user_fwd_input;
+      }
+
+      // Same comment as above.
+      if (outimage_shape.IsMklTensor()) {
+        AllocTmpBuffer(context, mkl_tmp_outimage_buf_tensor,
+                       (dnnLayout_t)outimage_shape.GetCurLayout(),
+                       &res_lrn_bwd[dnnResourceDst]);
+      } else {
+        res_lrn_bwd[dnnResourceDst] = user_fwd_output;
+      }
+
+      // Allocate buffer for workspace.
+      AllocTmpBuffer(context, mkl_tmp_workspace_buf_tensor, lt_workspace,
+                     &res_lrn_bwd[dnnResourceWorkspace]);
+    }
+
+    // Fallback implementation - Taken from lrn_op.cc
+    // TODO(intelft) Check if we can use EigenLRNOp directly instead of making
+    // a copy.
+    void MklDefaultToEigen(OpKernelContext* context) {
+      Tensor in_grads = MklGetInput(context, 0);
+      Tensor in_image = MklGetInput(context, 1);
+      Tensor out_image = MklGetInput(context, 2);
+
+      GetMklShape(context, 0, &ingrad_shape);
+      GetMklShape(context, 1, &inimage_shape);
+      GetMklShape(context, 2, &outimage_shape);
+
+      const int64 batch = static_cast<int64>(in_grads.dim_size(0));
+      const int64 rows = static_cast<int64>(in_grads.dim_size(1));
+      const int64 cols = static_cast<int64>(in_grads.dim_size(2));
+      const int64 depth = static_cast<int64>(in_grads.dim_size(3));
+      const auto nodes = cols * rows;
+
+      auto grads_shaped = in_grads.shaped<T, 2>({nodes * batch, depth});
+      auto in_shaped = in_image.shaped<T, 2>({nodes * batch, depth});
+      auto activations = out_image.shaped<T, 2>({nodes * batch, depth});
+
+      Tensor* output;
+      MklShape mkl_output_mkl_shape;
+      mkl_output_mkl_shape.SetMklTensor(false);
+      mkl_output_mkl_shape.SetDimensions(4);
+      AllocateOutputSetMklShape(context, 0, &output, in_grads.shape(),
+                                mkl_output_mkl_shape);
+
+      auto out_shaped = output->shaped<T, 2>({nodes * batch, depth});
+      out_shaped.setZero();
+      auto shard = [this, activations, in_shaped, grads_shaped, out_shaped,
+                    depth](int64 begin, int64 end) {
+        for (int64 i = begin; i < end; ++i) {
+          for (int64 j = 0; j < depth; ++j) {
+            int64 depth_begin = std::max<int64>(0, j - depth_radius_);
+            int64 depth_end = std::min<int64>(depth, j + depth_radius_ + 1);
+
+            T norm(0);
+            for (int64 k = depth_begin; k < depth_end; ++k) {
+              norm += in_shaped(i, k) * in_shaped(i, k);
+            }
+            norm = alpha_ * norm + bias_;
+            DCHECK_GT(norm, T(1e-6));
+            for (int64 k = depth_begin; k < depth_end; ++k) {
+              T dyi = T(-2) * alpha_ * beta_ * in_shaped(i, k) *
+                      activations(i, j) / norm;
+              if (k == j) {
+                dyi += Eigen::numext::pow(norm, -beta_);
+              }
+              dyi *= grads_shaped(i, j);
+              const_cast<typename TTypes<T, 2>::Tensor&>(out_shaped)(i, k) +=
+                  dyi;
+            }
+          }
+        }
+      };
+      auto worker_threads =
+          *(context->device()->tensorflow_cpu_worker_threads());
+      Shard(worker_threads.num_threads, worker_threads.workers, nodes * batch,
+            depth * depth, shard);
+    }
+
+    // Release MKL resources.
+    void Mklcleanup() {
+      bool ingrad_in_mkl_format = ingrad_shape.IsMklTensor();
+      bool inimage_in_mkl_format = inimage_shape.IsMklTensor();
+      if (!ingrad_in_mkl_format) {
+        CHECK_EQ(dnnLayoutDelete_F32(lt_input), E_SUCCESS);
+      }
+
+      if (!inimage_in_mkl_format) {
+        CHECK_EQ(dnnLayoutDelete_F32(lt_output), E_SUCCESS);
+      }
+      dnnDelete_F32(lrn_bwd);
+      dnnLayoutDelete_F32(lt_bdw_input);
+      dnnLayoutDelete_F32(lt_workspace);
+    }
+  } MklLRNGradOpContext;
+
+  typedef typename Eigen::Tensor<T, 1, Eigen::RowMajor>::DimensionPair DimPair;
+  bool workspace_enabled_;
+  int depth_radius_;
+  float bias_;
+  float alpha_;
+  float beta_;
+};
+
+#define REGISTER_MKL_LRN_CPU(T)                                     \
+  REGISTER_KERNEL_BUILDER(Name("_MklLRN")                           \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<T>("T")               \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklLRNOp<T>);                             \
+  REGISTER_KERNEL_BUILDER(Name("_MklLRNGrad")                       \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<T>("T")               \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklLRNGradOp<T>);
+
+TF_CALL_float(REGISTER_MKL_LRN_CPU);
+
+}  // namespace tensorflow
+
+#endif  // INTEL_MKL
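Note for reviewers: the shard lambda in MklLRNGradOp::MklDefaultToEigen implements the per-element LRN gradient directly. A sketch of the math it computes, in notation that is not part of the patch: writing N_j = bias + alpha * sum over the window W_j of x_k^2 for the windowed squared sum, and y_j = x_j * N_j^{-beta} for the forward output, each output gradient g_j contributes to the input gradient dx_k (for k in W_j) as

  \frac{\partial y_j}{\partial x_k} = \delta_{jk}\, N_j^{-\beta} - 2\alpha\beta\, x_k\, \frac{y_j}{N_j},
  \qquad
  dx_k \mathrel{+}= g_j \left( \delta_{jk}\, N_j^{-\beta} - 2\alpha\beta\, x_k\, \frac{y_j}{N_j} \right),

which matches the code: dyi starts as -2 * alpha_ * beta_ * in_shaped(i, k) * activations(i, j) / norm, gains pow(norm, -beta_) when k == j, and is scaled by grads_shaped(i, j) before being accumulated into out_shaped(i, k).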