Diffstat (limited to 'tensorflow/core/kernels/mkl_maxpooling_op.cc')
-rw-r--r-- | tensorflow/core/kernels/mkl_maxpooling_op.cc | 506
1 files changed, 506 insertions, 0 deletions
diff --git a/tensorflow/core/kernels/mkl_maxpooling_op.cc b/tensorflow/core/kernels/mkl_maxpooling_op.cc
new file mode 100644
index 0000000000..9d6cfb0c97
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_maxpooling_op.cc
@@ -0,0 +1,506 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/nn_ops.cc.
+#ifdef INTEL_MKL
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/kernels/mkl_pooling_ops_common.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/util/mkl_util.h"
+#include "tensorflow/core/util/padding.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+// An implementation of MaxPooling (forward).
+template <typename Device, typename T>
+class MklMaxPoolingOp : public OpKernel {
+ public:
+  explicit MklMaxPoolingOp(OpKernelConstruction* context) : OpKernel(context) {
+    string data_format;
+
+    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
+    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
+                errors::InvalidArgument("Invalid data format"));
+    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+    OP_REQUIRES(context, ksize_.size() == 4,
+                errors::InvalidArgument("Sliding window ksize field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+    OP_REQUIRES(context, stride_.size() == 4,
+                errors::InvalidArgument("Sliding window stride field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+    OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
+                errors::Unimplemented("Pooling is not yet supported on the "
+                                      "batch dimension."));
+
+    workspace_enabled_ = false;
+    // We may not get this attribute for this node if it does not go through
+    // graph rewrite pass. So we do not check for error while retrieving this
+    // attribute value.
+    context->GetAttr("workspace_enabled", &workspace_enabled_);
+  }
+
+  void Compute(OpKernelContext* context) override {
+    MklMaxPoolingOpContext mkl_context;
+    // Get the input tensor
+    const Tensor& tensor_in = MklGetInput(context, 0);
+    GetMklShape(context, 0, &mkl_context.input_shape);
+    bool input_in_mkl_format = mkl_context.input_shape.IsMklTensor();
+
+    mkl_context.params.in_dim = 4;
+    MklPoolParameters pool_params;
+    if (input_in_mkl_format == false) {
+      pool_params.Init(context, ksize_, stride_, padding_, data_format_,
+                       tensor_in.shape());
+      OP_REQUIRES(
+          context, (pool_params.depth_window == 1),
+          errors::Unimplemented("Depthwise max pooling not supported by MKL"));
+
+    } else {
+      pool_params.Init(context, ksize_, stride_, padding_, data_format_,
+                       &mkl_context.input_shape);
+    }
+
+    // Extract the parameters for the op from the pooling specs
+
+    ExtractMklOpParams(context, data_format_, pool_params, &mkl_context.params);
+
+    mkl_context.MklCreateLayoutsAndPrimitives(context);
+
+    // Declare output tensor
+    TensorShape tensor_out_shape;
+    MklShape mkl_out_shape;
+    mkl_out_shape.SetMklTensor(true);
+    mkl_out_shape.SetMklLayout(mkl_context.prim_pooling_fwd, dnnResourceDst);
+    mkl_out_shape.SetTfLayout(mkl_context.params.in_dim,
+                              mkl_context.params.out_sizes,
+                              mkl_context.params.out_strides);
+    mkl_out_shape.SetTfDimOrder(mkl_context.params.in_dim, data_format_);
+
+    Tensor* output_tensor = nullptr;
+    tensor_out_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
+                                mkl_out_shape.GetMklLayout())) /
+                            sizeof(T));
+    AllocateOutputSetMklshape(context, 0, &output_tensor, tensor_out_shape,
+                              mkl_out_shape);
+
+    if (!workspace_enabled_) {
+      mkl_out_shape.SetMklTensor(false);
+    }
+
+    Tensor* workspace_tensor;
+    void* workspace_buf = nullptr;
+    if (workspace_enabled_) {
+      TensorShape workspace_shape;
+      workspace_shape.AddDim(
+          dnnLayoutGetMemorySize_F32(
+              static_cast<dnnLayout_t>(mkl_context.lt_workspace)) /
+          sizeof(T));
+      AllocateOutputSetMklshape(context, 1, &workspace_tensor, workspace_shape,
+                                mkl_out_shape);
+      mkl_context.pooling_res[dnnResourceWorkspace] = const_cast<void*>(
+          static_cast<const void*>(workspace_tensor->flat<T>().data()));
+    } else {
+      AllocTmpBuffer(context, workspace_tensor, mkl_context.lt_workspace,
+                     &workspace_buf);
+      mkl_context.pooling_res[dnnResourceWorkspace] = workspace_buf;
+    }
+
+    mkl_context.pooling_res[dnnResourceSrc] =
+        const_cast<void*>(static_cast<const void*>(tensor_in.flat<T>().data()));
+    mkl_context.pooling_res[dnnResourceDst] = const_cast<void*>(
+        static_cast<const void*>(output_tensor->flat<T>().data()));
+
+    CHECK_EQ(
+        dnnExecute_F32(mkl_context.prim_pooling_fwd, mkl_context.pooling_res),
+        E_SUCCESS);
+
+    mkl_context.MklCleanup();
+  }
+
+ private:
+  typedef struct {
+    MklPoolingOpParams params;
+    MklShape input_shape;
+    void* pooling_res[dnnResourceNumber];
+    dnnPrimitive_t prim_pooling_fwd;
+    dnnLayout_t lt_user_input, lt_workspace;
+
+    void MklCreateLayoutsAndPrimitives(OpKernelContext* context) {
+      bool input_in_mkl_format = input_shape.IsMklTensor();
+      // Create or use existing DNN user layout
+      if (input_in_mkl_format == false) {
+        CHECK_EQ(dnnLayoutCreate_F32(&lt_user_input, params.in_dim,
+                                     params.in_sizes, params.in_strides),
+                 E_SUCCESS);
+      } else {
+        lt_user_input = (dnnLayout_t)input_shape.GetCurLayout();
+      }
+
+      dnnAlgorithm_t algorithm = dnnAlgorithmPoolingMax;
+      dnnPrimitiveAttributes_t primAttr = nullptr;
+
+      // Create DNN primitives
+      CHECK_EQ(dnnPoolingCreateForward_F32(
+                   &prim_pooling_fwd, primAttr, algorithm, lt_user_input,
+                   params.kernel_size, params.kernel_stride, params.in_offset,
+                   dnnBorderZerosAsymm),
+               E_SUCCESS);
+
+      // Creates layout for the workspace
+      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_workspace, prim_pooling_fwd,
+                                                dnnResourceWorkspace),
+               E_SUCCESS);
+    }
+
+    void MklCleanup() {
+      bool input_in_mkl_format = input_shape.IsMklTensor();
+      CHECK_EQ(dnnDelete_F32(prim_pooling_fwd), E_SUCCESS);
+      if (!input_in_mkl_format) {
+        CHECK_EQ(dnnLayoutDelete_F32(lt_user_input), E_SUCCESS);
+      }
+      CHECK_EQ(dnnLayoutDelete_F32(lt_workspace), E_SUCCESS);
+    }
+  } MklMaxPoolingOpContext;
+
+  std::vector<int32> ksize_;
+  std::vector<int32> stride_;
+  Padding padding_;
+  TensorFormat data_format_;
+  bool workspace_enabled_;
+};
+
+// The operation to compute MaxPool gradients.
+// It takes three inputs:
+//   - The original input tensor
+//   - The original output tensor
+//   - Backprop tensor for output
+// It produces one output: backprop tensor for input.
+template <class Device, class T>
+class MklMaxPoolingGradOp : public OpKernel {
+ public:
+  explicit MklMaxPoolingGradOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    string data_format;
+
+    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
+    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
+                errors::InvalidArgument("Invalid data format"));
+    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+    OP_REQUIRES(context, ksize_.size() == 4,
+                errors::InvalidArgument("Sliding window ksize field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+    OP_REQUIRES(context, stride_.size() == 4,
+                errors::InvalidArgument("Sliding window strides field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+    OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
+                errors::Unimplemented(
+                    "Pooling is not yet supported on the batch dimension."));
+    workspace_enabled_ = false;
+    // We may not get this attribute for this node if it does not go through
+    // graph rewrite pass. So we do not check for error while retrieving this
+    // attribute value.
+    context->GetAttr("workspace_enabled", &workspace_enabled_);
+  }
+
+  void Compute(OpKernelContext* context) override {
+    MklMaxPoolingGradOpContext mkl_context;
+    // Input - The original input tensor
+    const Tensor& tensor_in = MklGetInput(context, 0);
+
+    // Output - Backprop tensor for input.
+    Tensor* output_tensor = nullptr;
+
+    GetMklShape(context, 0, &mkl_context.input_shape);
+    GetMklShape(context, 2, &mkl_context.output_backprop_shape);
+    bool input_in_mkl_format = mkl_context.input_shape.IsMklTensor();
+
+    if (input_in_mkl_format == false)
+      mkl_context.params.in_dim = tensor_in.dims();
+    else
+      mkl_context.params.in_dim = mkl_context.input_shape.GetDimension();
+
+    MklPoolParameters pool_params;
+    if (input_in_mkl_format == false) {
+      pool_params.Init(context, ksize_, stride_, padding_, data_format_,
+                       tensor_in.shape());
+      OP_REQUIRES(
+          context, (pool_params.depth_window == 1),
+          errors::Unimplemented("Depthwise max pooling not supported by MKL"));
+
+    } else {
+      pool_params.Init(context, ksize_, stride_, padding_, data_format_,
+                       &mkl_context.input_shape);
+    }
+
+    // Extract the parameters for the op from the pooling specs
+    ExtractMklOpParams(context, data_format_, pool_params, &mkl_context.params);
+
+    mkl_context.MklCreateLayouts(context);
+    mkl_context.MklCreatePrimitives(context, workspace_enabled_);
+    mkl_context.MklPrepareInputs(context, workspace_enabled_);
+
+    // Create shape for the input back prop output
+    TensorShape mkl_input_backprop;
+    MklShape mkl_output_shape;
+    mkl_output_shape.SetMklTensor(true);
+    mkl_output_shape.SetMklLayout(mkl_context.prim_pooling_bwd,
+                                  dnnResourceDiffSrc);
+    mkl_output_shape.SetTfLayout(mkl_context.params.in_dim,
+                                 mkl_context.params.in_sizes,
+                                 mkl_context.params.in_strides);
+    mkl_output_shape.SetTfDimOrder(mkl_context.params.in_dim, data_format_);
+
+    mkl_input_backprop.AddDim(
+        dnnLayoutGetMemorySize_F32(
+            static_cast<dnnLayout_t>(mkl_output_shape.GetMklLayout())) /
+        sizeof(T));
+    AllocateOutputSetMklshape(context, 0, &output_tensor, mkl_input_backprop,
+                              mkl_output_shape);
+    mkl_context.pooling_res[dnnResourceDiffSrc] = const_cast<void*>(
+        static_cast<const void*>(output_tensor->flat<T>().data()));
+
+    int64 output_size = output_tensor->NumElements();
+    for (int64 i = 0; i < output_size; ++i) {
+      (static_cast<float*>(mkl_context.pooling_res[dnnResourceDiffSrc]))[i] = 0;
+    }
+
+    CHECK_EQ(
+        dnnExecute_F32(mkl_context.prim_pooling_bwd, mkl_context.pooling_res),
+        E_SUCCESS);
+
+    mkl_context.MklCleanup(workspace_enabled_);
+  }
+
+ private:
+  typedef struct {
+    MklPoolingOpParams params;
+    MklShape input_shape, output_backprop_shape;
+    void* pooling_resfwd[dnnResourceNumber];
+    void* pooling_res[dnnResourceNumber];
+    dnnPrimitive_t prim_pooling_fwd, prim_pooling_bwd, convert_input,
+        convert_outbackprop;
+    dnnLayout_t lt_outbackprop_user, lt_outbackprop_prim, lt_input_user,
+        lt_input_prim;
+    void* input_buf;
+    void* outbackprop_buf;
+
+    void MklCreateLayouts(OpKernelContext* context) {
+      bool input_in_mkl_format = input_shape.IsMklTensor();
+      bool outbackprop_in_mkl_format = output_backprop_shape.IsMklTensor();
+      // Create DNN user layout for input and outbackprop or get existing layout
+      if (input_in_mkl_format == false) {
+        CHECK_EQ(dnnLayoutCreate_F32(&lt_input_user, params.in_dim,
+                                     params.in_sizes, params.in_strides),
+                 E_SUCCESS);
+      } else {
+        lt_input_user = (dnnLayout_t)input_shape.GetCurLayout();
+      }
+
+      // We dont care about the output layout for now as we can create it from
+      // primitives for the max pooling fwd prop
+      if (outbackprop_in_mkl_format == false) {
+        CHECK_EQ(dnnLayoutCreate_F32(&lt_outbackprop_user, params.in_dim,
+                                     params.out_sizes, params.out_strides),
+                 E_SUCCESS);
+      } else {
+        lt_outbackprop_user = (dnnLayout_t)output_backprop_shape.GetCurLayout();
+      }
+    }
+
+    // Create DNN primitives
+    void MklCreatePrimitives(OpKernelContext* context, bool workspace_enabled) {
+      dnnAlgorithm_t algorithm = dnnAlgorithmPoolingMax;
+      dnnPrimitiveAttributes_t primAttr = nullptr;
+
+      if (workspace_enabled == false) {
+        CHECK_EQ(dnnPoolingCreateForward_F32(
+                     &prim_pooling_fwd, primAttr, algorithm, lt_input_user,
+                     params.kernel_size, params.kernel_stride, params.in_offset,
+                     dnnBorderZerosAsymm),
+                 E_SUCCESS);
+      }
+
+      CHECK_EQ(dnnPoolingCreateBackward_F32(
+                   &prim_pooling_bwd, primAttr, algorithm, lt_input_user,
+                   params.kernel_size, params.kernel_stride, params.in_offset,
+                   dnnBorderZerosAsymm),
+               E_SUCCESS);
+
+      // Creates conversions
+      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
+                   &lt_outbackprop_prim, prim_pooling_bwd, dnnResourceDiffDst),
+               E_SUCCESS);
+
+      // Tensors needed to create temporary buffers
+      Tensor input_buf_tensor, outbackprop_buf_tensor;
+
+      if (workspace_enabled == false) {
+        CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
+                     &lt_input_prim, prim_pooling_fwd, dnnResourceSrc),
+                 E_SUCCESS);
+        if (!dnnLayoutCompare_F32(lt_input_user, lt_input_prim)) {
+          CHECK_EQ(dnnConversionCreate_F32(&convert_input, lt_input_user,
+                                           lt_input_prim),
+                   E_SUCCESS);
+          AllocTmpBuffer(context, &input_buf_tensor, lt_input_prim, &input_buf);
+        }
+      }
+
+      if (!dnnLayoutCompare_F32(lt_outbackprop_user, lt_outbackprop_prim)) {
+        CHECK_EQ(
+            dnnConversionCreate_F32(&convert_outbackprop, lt_outbackprop_user,
+                                    lt_outbackprop_prim),
+            E_SUCCESS);
+        AllocTmpBuffer(context, &outbackprop_buf_tensor, lt_outbackprop_prim,
+                       &outbackprop_buf);
+      }
+    }
+
+    // Compare incoming tensor layouts with MKL preferred layouts and convert
+    // data to the preferred layout if necessary
+    void MklPrepareInputs(OpKernelContext* context, bool workspace_enabled) {
+      const Tensor& tensor_in = MklGetInput(context, 0);
+      const Tensor& out_backprop = MklGetInput(context, 2);
+      bool input_in_mkl_format = input_shape.IsMklTensor();
+      bool outbackprop_in_mkl_format = output_backprop_shape.IsMklTensor();
+
+      void* tmp_output_buf;
+      Tensor tmp_output_buf_tensor;
+
+      void* workspace_buf;
+      Tensor workspace_buf_tensor;
+
+      if (workspace_enabled == false) {
+        if (convert_input != nullptr) {
+          if (input_in_mkl_format == false) {
+            CHECK_EQ(dnnConversionExecute_F32(
+                         convert_input,
+                         const_cast<void*>(static_cast<const void*>(
+                             tensor_in.flat<T>().data())),
+                         input_buf),
+                     E_SUCCESS);
+            CHECK_EQ(dnnDelete_F32(convert_input), E_SUCCESS);
+            convert_input = nullptr;
+          } else {
+            input_shape.GetConvertedFlatData(
+                lt_input_prim,
+                const_cast<void*>(
+                    static_cast<const void*>(tensor_in.flat<T>().data())),
+                input_buf);
+          }
+          pooling_resfwd[dnnResourceSrc] = input_buf;
+        } else {
+          pooling_resfwd[dnnResourceSrc] = const_cast<void*>(
+              static_cast<const void*>(tensor_in.flat<T>().data()));
+        }
+
+        dnnLayout_t lt_workspace;
+        CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
+                     &lt_workspace, prim_pooling_fwd, dnnResourceWorkspace),
+                 E_SUCCESS);
+        AllocTmpBuffer(context, &workspace_buf_tensor, lt_workspace,
+                       &workspace_buf);
+        pooling_resfwd[dnnResourceWorkspace] = workspace_buf;
+
+        dnnLayoutDelete_F32(lt_workspace);
+
+        // We create the layout for max pooling fwd prop tmp output here
+        AllocTmpBuffer(context, &tmp_output_buf_tensor, lt_outbackprop_prim,
+                       &tmp_output_buf);
+        pooling_resfwd[dnnResourceDst] = tmp_output_buf;
+
+        CHECK_EQ(dnnExecute_F32(prim_pooling_fwd, pooling_resfwd), E_SUCCESS);
+        pooling_res[dnnResourceWorkspace] =
+            pooling_resfwd[dnnResourceWorkspace];
+      } else {
+        const Tensor& workspace = MklGetInput(context, 3);
+        pooling_res[dnnResourceWorkspace] = const_cast<void*>(
+            static_cast<const void*>(workspace.flat<T>().data()));
+      }
+
+      // Out backprop conversions if needed
+      if (convert_outbackprop != nullptr) {
+        if (outbackprop_in_mkl_format == false) {
+          CHECK_EQ(dnnConversionExecute_F32(
+                       convert_outbackprop,
+                       const_cast<void*>(static_cast<const void*>(
+                           out_backprop.flat<T>().data())),
+                       outbackprop_buf),
+                   E_SUCCESS);
+          CHECK_EQ(dnnDelete_F32(convert_outbackprop), E_SUCCESS);
+        } else {
+          output_backprop_shape.GetConvertedFlatData(
+              lt_outbackprop_prim,
+              const_cast<void*>(
+                  static_cast<const void*>(out_backprop.flat<T>().data())),
+              outbackprop_buf);
+        }
+        pooling_res[dnnResourceDiffDst] = outbackprop_buf;
+      } else {
+        pooling_res[dnnResourceDiffDst] = const_cast<void*>(
+            static_cast<const void*>(out_backprop.flat<T>().data()));
+      }
+    }
+
+    void MklCleanup(bool workspace_enabled) {
+      bool input_in_mkl_format = input_shape.IsMklTensor();
+      bool outbackprop_in_mkl_format = output_backprop_shape.IsMklTensor();
+      if (workspace_enabled == false) {
+        CHECK_EQ(dnnDelete_F32(prim_pooling_fwd), E_SUCCESS);
+      }
+      CHECK_EQ(dnnDelete_F32(prim_pooling_bwd), E_SUCCESS);
+      if (outbackprop_in_mkl_format == false) {
+        CHECK_EQ(dnnLayoutDelete_F32(lt_outbackprop_user), E_SUCCESS);
+      }
+      CHECK_EQ(dnnLayoutDelete_F32(lt_outbackprop_prim), E_SUCCESS);
+      if (input_in_mkl_format == false) {
+        CHECK_EQ(dnnLayoutDelete_F32(lt_input_user), E_SUCCESS);
+      }
+      if (workspace_enabled == false) {
+        CHECK_EQ(dnnLayoutDelete_F32(lt_input_prim), E_SUCCESS);
+      }
+    }
+  } MklMaxPoolingGradOpContext;
+
+  std::vector<int32> ksize_;
+  std::vector<int32> stride_;
+  Padding padding_;
+  TensorFormat data_format_;
+
+  bool workspace_enabled_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("MklMaxPool")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<float>("T")
+                            .Label(mkl_layer_registry::kMklLayerLabel),
+                        MklMaxPoolingOp<CPUDevice, float>);
+
+REGISTER_KERNEL_BUILDER(Name("MklMaxPoolGrad")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<float>("T")
+                            .Label(mkl_layer_registry::kMklLayerLabel),
+                        MklMaxPoolingGradOp<CPUDevice, float>);
+
+}  // namespace tensorflow
+#endif  // INTEL_MKL
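
For context (not part of the commit): the .Label(mkl_layer_registry::kMklLayerLabel) in the two registrations above restricts these kernels to MklMaxPool/MklMaxPoolGrad nodes produced by the MKL graph-rewrite pass. The output spatial sizes that MklPoolParameters::Init derives from the ksize, strides, and padding attributes follow TensorFlow's usual VALID/SAME pooling arithmetic; the snippet below is a minimal, self-contained sketch of that arithmetic, where PoolOutputSize and its arguments are illustrative names rather than identifiers from this file.

    // Sketch of the windowed-output-size arithmetic used for pooling.
    // VALID: ceil((in - window + 1) / stride); SAME: ceil(in / stride).
    #include <cstdint>
    #include <iostream>

    static int64_t PoolOutputSize(int64_t in, int64_t window, int64_t stride,
                                  bool same_padding) {
      if (same_padding) {
        return (in + stride - 1) / stride;     // ceil(in / stride)
      }
      return (in - window + stride) / stride;  // ceil((in - window + 1) / stride)
    }

    int main() {
      // Example: a 112x112 input pooled with a 3x3 window and stride 2.
      std::cout << PoolOutputSize(112, 3, 2, /*same_padding=*/true) << "\n";   // 56
      std::cout << PoolOutputSize(112, 3, 2, /*same_padding=*/false) << "\n";  // 55
      return 0;
    }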