Adding missing reorders in ReLU and AddN (#15406)

author: Niranjan Hasabnis <niranjan.hasabnis@intel.com> 2017-12-26 11:21:06 -0800
committer: drpngx <drpngx@users.noreply.github.com> 2017-12-26 11:21:06 -0800
commit: 64cb8494a1628e799ecf869e8c1beb302c907720 (patch)
tree: 631da5b32de285d6a928c3db9509489f35b4c452
parent: ced991d65a95ebbdd57c2e0670d88563042a3180 (diff)
2 files changed, 179 insertions, 70 deletions
diff --git a/tensorflow/core/kernels/mkl_aggregate_ops.cc b/tensorflow/core/kernels/mkl_aggregate_ops.cc
index 9aabbbdb6b..44b94be3a0 100644
--- a/tensorflow/core/kernels/mkl_aggregate_ops.cc
+++ b/tensorflow/core/kernels/mkl_aggregate_ops.cc
@@ -294,7 +294,7 @@ class MklAddNOp : public OpKernel {
 
     try {
       auto cpu_engine = engine(engine::cpu, 0);
-      size_t src1_idx = 0, src2_idx = 1;
+      size_t src1_idx = 0, src2_idx = 1, output_idx = 0;
       const Tensor& src1_tensor = MklGetInput(ctx, src1_idx);
       const Tensor& src2_tensor = MklGetInput(ctx, src2_idx);
 
@@ -312,7 +312,7 @@ class MklAddNOp : public OpKernel {
          Tensor* dst_tensor = nullptr;
          MklShape mkl_shape_dst;
          mkl_shape_dst.SetMklTensor(false);
-         AllocateOutputSetMklShape(ctx, src1_idx, &dst_tensor,
+         AllocateOutputSetMklShape(ctx, output_idx, &dst_tensor,
          src1_tensor.shape(), mkl_shape_dst);
          float user_i1 = (src1_tensor.scalar<T>()());
          float user_i2 = (src2_tensor.scalar<T>()());
@@ -327,13 +327,12 @@ class MklAddNOp : public OpKernel {
            Tensor* dst_tensor = nullptr;
            MklShape mkl_shape_dst;
            mkl_shape_dst.SetMklTensor(false);
-           AllocateOutputSetMklShape(ctx, src1_idx, &dst_tensor,
+           AllocateOutputSetMklShape(ctx, output_idx, &dst_tensor,
            src1_tensor.shape(), mkl_shape_dst);
            return;
         }
       }
 
-      // element-wise add operator for tensor input1 and tensor input2
       std::vector<double> coeff(2, 1.0);
       MklDnnData<T> src1(&cpu_engine);
       MklDnnData<T> src2(&cpu_engine);
@@ -345,70 +344,124 @@ class MklAddNOp : public OpKernel {
       memory::desc md1({}, memory::data_undef, memory::format_undef);
       memory::desc md2({}, memory::data_undef, memory::format_undef);
 
-      if ( input1_in_mkl_format || input2_in_mkl_format ) {
-        if ( input1_in_mkl_format ) {
-          md1 = src1_mkl_shape.GetMklLayout();
-          md2 = md1;
-          dst.SetUsrMem(md1);
-        } else {
-          md2 = src2_mkl_shape.GetMklLayout();
-          md1 = md2;
-          dst.SetUsrMem(md2);
-        }
+      // For creating the Sum primitive, we need to ensure that all inputs are
+      // in the same format. What that means is if we have a mixed input case -
+      // where one input is in Tensorflow format and one input is in MKL
+      // format - then we need to ensure that all inputs are in the same format
+      // for primitive construction. For performance reasons, we say that all
+      // inputs are in MKL format in such a case, and insert a reorder for the
+      // input that is in Tensorflow format into MKL format. On the other hand,
+      // if both the inputs are in MKL format or both are in Tensorflow format,
+      // then we don't need a reorder.
+      if (!input1_in_mkl_format && !input2_in_mkl_format) {
+        // If both the inputs are in Tensorflow format, we create blocked memory
+        // descriptor.
+        dims = TFShapeToMklDnnDims(src1_tensor.shape());
+        strides = CalculateTFStrides(dims);
+        md1 = MklDnnData<T>::CreateBlockedMemDesc(dims, strides);
+        md2 = md1;
+      } else if (input1_in_mkl_format && !input2_in_mkl_format) {
+        // If one input is in MKL format and other is in Tensorflow, then
+        // create respective descriptors describing the actual case. For input
+        // in Mkl format, we just get Mkl layout from MklDnnShape. For input in
+        // Tensorflow format, we create memory descriptor using data format.
+        md1 = src1_mkl_shape.GetMklLayout();
+
+        memory::format src1_mkl_data_format = src1_mkl_shape.GetTfDataFormat();
+        auto src1_tf_data_format = MklDnnDataFormatToTFDataFormat(
+                                    src1_mkl_data_format);
+        auto src2_dims = TFShapeToMklDnnDimsInNCHW(src2_tensor.shape(),
+                                    src1_tf_data_format);
+        md2 = memory::desc(src2_dims, MklDnnType<T>(),
+                           src1_mkl_data_format);
+      } else if (input2_in_mkl_format && !input1_in_mkl_format) {
+        // Same comment as above.
+        memory::format src2_mkl_data_format = src2_mkl_shape.GetTfDataFormat();
+        auto src2_tf_data_format = MklDnnDataFormatToTFDataFormat(
+                                     src2_mkl_data_format);
+        auto src1_dims = TFShapeToMklDnnDimsInNCHW(src1_tensor.shape(),
+                                    src2_tf_data_format);
+        md1 = memory::desc(src1_dims, MklDnnType<T>(),
+                           src2_mkl_data_format);
+
+        md2 = src2_mkl_shape.GetMklLayout();
       } else {
-         dims = TFShapeToMklDnnDims(src1_tensor.shape());
-         strides = CalculateTFStrides(dims);
-         md1 = MklDnnData<T>::CreateBlockedMemDesc(dims, strides);
-         md2 = md1;
-         dst.SetUsrMem(dims, strides);
+        // If both the inputs are in MKL format, we use Mkl layout of the input
+        // tensors.
+        md1 = src1_mkl_shape.GetMklLayout();
+        md2 = src2_mkl_shape.GetMklLayout();
       }
-
-      std::vector<memory::primitive_desc> srcs_pd;
-
       src1.SetUsrMem(md1, &src1_tensor);
-      auto mpd1 = src1.GetUsrMemPrimDesc();
-      srcs_pd.push_back(mpd1);
-
       src2.SetUsrMem(md2, &src2_tensor);
-      auto mpd2 = src2.GetUsrMemPrimDesc();
-      srcs_pd.push_back(mpd2);
 
+      // As per comment above, we tell MKLDNN that both the inputs are in same
+      // format. So we set common memory descriptor in MKL format, if any of the
+      // inputs are in MKL format. Let's get memory descriptor that we will use
+      // for both the inputs.
+      // We set output memory descriptor in MKL format, if any of the
+      // inputs are in MKL format.
+      memory::desc common_md({}, memory::data_undef, memory::format_undef);
+      if (input1_in_mkl_format || input2_in_mkl_format) {
+        common_md = input1_in_mkl_format ? md1 : md2;
+        dst.SetUsrMem(common_md);
+      } else {
+        // Since both the inputs are in Tensorflow format, and have
+        // same shape, we can get memory descriptor from any input.
+        common_md = md1;
+        dst.SetUsrMem(common_md);
+      }
+
+      std::vector<memory::primitive_desc> srcs_pd;
+      // Memory descriptor for 1st input
+      srcs_pd.push_back(memory::primitive_desc(common_md, cpu_engine));
+      // Memory descriptor for 2nd input
+      srcs_pd.push_back(memory::primitive_desc(common_md, cpu_engine));
+      auto sum_pd = sum::primitive_desc(dst.GetUsrMemDesc(), coeff, srcs_pd);
+
+      // Now we set up resources for primitive execution.
+      // First, we need to check if any of the inputs need to be reordered as
+      // per the logic described above. Since output will be in MKL format if
+      // at least one input is in MKL format, we choose output descriptor for
+      // reorder.
       std::vector<primitive::at> inputs;
+      std::vector<primitive> net;
+      // Check if actual input format of the tensor is different than common_md
+      // we told MKLDNN. In that case, we will need reorder.
+      src1.CheckReorderToOpMem(srcs_pd[0], &net);
+      src2.CheckReorderToOpMem(srcs_pd[1], &net);
       inputs.push_back(src1.GetOpMem());
       inputs.push_back(src2.GetOpMem());
-      auto output_pd = dst.GetUsrMemPrimDesc();
+
+      // Allocate output tensor now.
       Tensor* dst_tensor = nullptr;
-      auto sum_pd = sum::primitive_desc(dst.GetUsrMemDesc(), coeff, srcs_pd);
-      auto sum_op = sum(sum_pd, inputs, dst.GetOpMem());
-      if ( input2_in_mkl_format || input1_in_mkl_format ) {
-         MklDnnShape output_mkl_shape;
-         output_mkl_shape.SetMklTensor(true);
-         output_mkl_shape.SetMklLayout(&output_pd);
-         output_mkl_shape.SetElemType(MklDnnType<T>());
-         if ( input1_in_mkl_format ) {
+      MklDnnShape output_mkl_shape;
+      TensorShape output_tf_shape;
+
+      if (input2_in_mkl_format || input1_in_mkl_format) {
+        output_mkl_shape.SetMklTensor(true);
+        auto output_pd = dst.GetUsrMemPrimDesc();
+        output_mkl_shape.SetMklLayout(&output_pd);
+        output_mkl_shape.SetElemType(MklDnnType<T>());
+        if (input1_in_mkl_format) {
           output_mkl_shape.SetTfLayout(src1_dims_size,
-          src1_mkl_shape.GetSizesAsMklDnnDims(),
-          src1_mkl_shape.GetTfDataFormat());
-         } else {
+                                       src1_mkl_shape.GetSizesAsMklDnnDims(),
+                                       src1_mkl_shape.GetTfDataFormat());
+        } else {
           output_mkl_shape.SetTfLayout(src2_dims_size,
-          src2_mkl_shape.GetSizesAsMklDnnDims(),
-          src2_mkl_shape.GetTfDataFormat());
-         }
-         TensorShape output_tf_shape;
-         output_tf_shape.AddDim((output_pd.get_size() / sizeof(T))
-         + (output_pd.get_size()%sizeof(T) == 0 ? 0 : 1));
-         AllocateOutputSetMklShape(ctx, src1_idx, &dst_tensor, output_tf_shape,
-                                output_mkl_shape);
+                                       src2_mkl_shape.GetSizesAsMklDnnDims(),
+                                       src2_mkl_shape.GetTfDataFormat());
+        }
+        output_tf_shape.AddDim((output_pd.get_size() / sizeof(T)));
       } else {
-         MklShape mkl_shape_dst;
-         mkl_shape_dst.SetMklTensor(false);
-         AllocateOutputSetMklShape(ctx, src1_idx,
-         &dst_tensor, src1_tensor.shape(), mkl_shape_dst);
+        output_mkl_shape.SetMklTensor(false);
+        output_tf_shape = src1_tensor.shape();
       }
-
+      AllocateOutputSetMklShape(ctx, output_idx, &dst_tensor,
+                                output_tf_shape, output_mkl_shape);
       dst.SetUsrMemDataHandle(dst_tensor);
-      std::vector<primitive> net;
-      net.push_back(sum_op);
+
+      // Create Sum op, and submit net for execution.
+      net.push_back(sum(sum_pd, inputs, dst.GetOpMem()));
       stream(stream::kind::eager).submit(net).wait();
     } catch (mkldnn::error &e) {
       string error_msg = "Status: " + std::to_string(e.status) +
diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc
index 45bdd0ad5c..dc899d8c7e 100644
--- a/tensorflow/core/kernels/mkl_relu_op.cc
+++ b/tensorflow/core/kernels/mkl_relu_op.cc
@@ -500,30 +500,81 @@ class MklReluGradOpBase : public OpKernel {
       // Set DNN primitives for src & diff_dst
       memory::desc src_md({}, memory::data_undef, memory::format_undef);
       memory::desc diff_dst_md({}, memory::data_undef, memory::format_undef);
-      if (dnn_shape_src.IsMklTensor() || dnn_shape_diff_dst.IsMklTensor()) {
-        if (dnn_shape_diff_dst.IsMklTensor()) {
-          diff_dst_md = dnn_shape_diff_dst.GetMklLayout();
-          src_md = diff_dst_md;
-        } else {
-          src_md = dnn_shape_src.GetMklLayout();
-          diff_dst_md = src_md;
-        }
-      } else {
+
+      // For creating the ReLU backward primitive, we need to ensure that all
+      // inputs are in the same format. What that means is if we have a mixed
+      // input case - where one input is in Tensorflow format and one input is
+      // in MKL format - then we need to ensure that all inputs are in the same
+      // format for primitive construction. For performance reasons, we say
+      // that all inputs are in MKL format in such a case, and insert a reorder
+      // for the input that is in Tensorflow format into MKL format. On the
+      // other hand, if both the inputs are in MKL format or both are in
+      // Tensorflow format, then we don't need a reorder.
+      if (!dnn_shape_src.IsMklTensor() && !dnn_shape_diff_dst.IsMklTensor()) {
+        // If both the inputs are in Tensorflow format, we create blocked memory
+        // descriptor.
         auto src_dims = TFShapeToMklDnnDims(src_tensor.shape());
         auto src_strides = CalculateTFStrides(src_dims);
         src_md = MklDnnData<T>::CreateBlockedMemDesc(src_dims, src_strides);
         diff_dst_md = src_md;
+      } else if (dnn_shape_src.IsMklTensor() &&
+                 !dnn_shape_diff_dst.IsMklTensor()) {
+        // If one input is in MKL format and other is in Tensorflow, then
+        // create respective descriptors describing the actual case. For input
+        // in Mkl format, we just get Mkl layout from MklDnnShape. For input in
+        // Tensorflow format, we create memory descriptor using data format.
+        src_md = dnn_shape_src.GetMklLayout();
+
+        memory::format src_mkl_data_format = dnn_shape_src.GetTfDataFormat();
+        auto src_tf_data_format = MklDnnDataFormatToTFDataFormat(
+                                    src_mkl_data_format);
+        auto diff_dst_dims = TFShapeToMklDnnDimsInNCHW(diff_dst_tensor.shape(),
+                                                       src_tf_data_format);
+        diff_dst_md = memory::desc(diff_dst_dims, MklDnnType<T>(),
+                                   src_mkl_data_format);
+      } else if (!dnn_shape_src.IsMklTensor() &&
+                  dnn_shape_diff_dst.IsMklTensor()) {
+        // Same comment as above.
+        diff_dst_md = dnn_shape_diff_dst.GetMklLayout();
+
+        memory::format diff_dst_mkl_data_format =
+          dnn_shape_diff_dst.GetTfDataFormat();
+        auto diff_dst_tf_data_format = MklDnnDataFormatToTFDataFormat(
+                                          diff_dst_mkl_data_format);
+        auto src_dims = TFShapeToMklDnnDimsInNCHW(src_tensor.shape(),
+                                                  diff_dst_tf_data_format);
+        src_md = memory::desc(src_dims, MklDnnType<T>(),
+                              diff_dst_mkl_data_format);
+      } else {
+        // If both the inputs are in MKL format, we use Mkl layout of the input
+        // tensors.
+        src_md = dnn_shape_src.GetMklLayout();
+        diff_dst_md = dnn_shape_diff_dst.GetMklLayout();
       }
+
       src.SetUsrMem(src_md, &src_tensor);
       diff_dst.SetUsrMem(diff_dst_md, &diff_dst_tensor);
 
+      // As per comment above, we tell MKLDNN that both the inputs are in same
+      // format. So we set common memory descriptor in MKL format, if any of the
+      // inputs are in MKL format. Let's get memory descriptor that we will use
+      // for both the inputs.
+      memory::desc common_md({}, memory::data_undef, memory::format_undef);
+      if (dnn_shape_src.IsMklTensor() || dnn_shape_diff_dst.IsMklTensor()) {
+        common_md = dnn_shape_src.IsMklTensor() ? src_md : diff_dst_md;
+      } else {
+        // Since both the inputs are in Tensorflow format, and have
+        // same shape, we can get memory descriptor from any input.
+        common_md = src_md;
+      }
+
       T alpha = 0, beta = 0;
       std::shared_ptr<relu_forward::primitive_desc> relu_fwd_pd;
       auto relu_fwd_desc = relu_forward::desc(prop_kind::forward_training,
                                               alg_kind, src_md, alpha, beta);
       relu_fwd_pd.reset(new relu_forward::primitive_desc(relu_fwd_desc,
                                                          cpu_engine));
-      auto relu_bwd_desc = relu_backward::desc(alg_kind, diff_dst_md, src_md,
+      auto relu_bwd_desc = relu_backward::desc(alg_kind, common_md, common_md,
                                                 alpha, beta);
       auto relu_bwd_pd  = relu_backward::primitive_desc(relu_bwd_desc,
                                                 cpu_engine, *relu_fwd_pd);
@@ -547,9 +598,9 @@ class MklReluGradOpBase : public OpKernel {
       AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor,
                                  tf_shape_diff_src, dnn_shape_diff_src);
 
-      // diff_src memory descriptor is same as diff_dst memory descriptor.
-      auto diff_src_md = diff_dst_md;
-      diff_src.SetUsrMem(diff_src_md, diff_src_tensor);
+      // diff_src memory descriptor is same as memory descriptor for both
+      // inputs.
+      diff_src.SetUsrMem(common_md, diff_src_tensor);
 
       PrepareAndExecuteNet(relu_bwd_pd, &src, &diff_src, &diff_dst);
      } catch (mkldnn::error &e) {
@@ -567,6 +618,14 @@ class MklReluGradOpBase : public OpKernel {
                   MklDnnData<T>* src, MklDnnData<T>* diff_src, MklDnnData<T>*
                   diff_dst) {
     std::vector<primitive> net;
+
+    // Check if we need to reorder original input tensors into common_md layout
+    // that we set for primitive creation. diff_src_primitive_desc is same as
+    // common_md.
+    src->CheckReorderToOpMem(relu_prim_desc.diff_src_primitive_desc(), &net);
+    diff_dst->CheckReorderToOpMem(relu_prim_desc.diff_src_primitive_desc(),
+                                  &net);
+
     net.push_back(relu_backward(relu_prim_desc, src->GetOpMem(),
                                 diff_dst->GetOpMem(), diff_src->GetOpMem()));
     stream(stream::kind::eager).submit(net).wait();
@@ -622,7 +681,6 @@ class MklReluGradOp : public MklReluGradOpBase<Device, T, eltwise_relu> {
     MklDnnShape dnn_shape_diff_dst;
     GetMklShape(context, diff_dst_index, &dnn_shape_diff_dst);
 
-    int src_dims_size = src_tensor.dims();
     MklDnnShape dnn_shape_diff_src;
     dnn_shape_diff_src.SetMklTensor(false);
     AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor,
@@ -690,7 +748,6 @@ class MklEluGradOp : public MklReluGradOpBase<Device, T, eltwise_elu> {
     MklDnnShape dnn_shape_diff_dst;
     GetMklShape(context, diff_dst_index, &dnn_shape_diff_dst);
 
-    int src_dims_size = src_tensor.dims();
     MklDnnShape dnn_shape_diff_src;
     dnn_shape_diff_src.SetMklTensor(false);
     AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor,
@@ -762,7 +819,6 @@ class MklTanhGradOp : public MklReluGradOpBase<Device, T, eltwise_tanh> {
     MklDnnShape dnn_shape_diff_dst;
     GetMklShape(context, diff_dst_index, &dnn_shape_diff_dst);
 
-    int src_dims_size = src_tensor.dims();
     MklDnnShape dnn_shape_diff_src;
     dnn_shape_diff_src.SetMklTensor(false);
     AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor,
author	Niranjan Hasabnis <niranjan.hasabnis@intel.com>	2017-12-26 11:21:06 -0800
committer	drpngx <drpngx@users.noreply.github.com>	2017-12-26 11:21:06 -0800
commit	64cb8494a1628e799ecf869e8c1beb302c907720 (patch)
tree	631da5b32de285d6a928c3db9509489f35b4c452
parent	ced991d65a95ebbdd57c2e0670d88563042a3180 (diff)