diff options
author | Niranjan Hasabnis <niranjan.hasabnis@intel.com> | 2017-12-26 11:21:06 -0800 |
---|---|---|
committer | drpngx <drpngx@users.noreply.github.com> | 2017-12-26 11:21:06 -0800 |
commit | 64cb8494a1628e799ecf869e8c1beb302c907720 (patch) | |
tree | 631da5b32de285d6a928c3db9509489f35b4c452 | |
parent | ced991d65a95ebbdd57c2e0670d88563042a3180 (diff) |
Adding missing reorders in ReLU and AddN (#15406)
-rw-r--r-- | tensorflow/core/kernels/mkl_aggregate_ops.cc | 161 | ||||
-rw-r--r-- | tensorflow/core/kernels/mkl_relu_op.cc | 88 |
2 files changed, 179 insertions, 70 deletions
diff --git a/tensorflow/core/kernels/mkl_aggregate_ops.cc b/tensorflow/core/kernels/mkl_aggregate_ops.cc index 9aabbbdb6b..44b94be3a0 100644 --- a/tensorflow/core/kernels/mkl_aggregate_ops.cc +++ b/tensorflow/core/kernels/mkl_aggregate_ops.cc @@ -294,7 +294,7 @@ class MklAddNOp : public OpKernel { try { auto cpu_engine = engine(engine::cpu, 0); - size_t src1_idx = 0, src2_idx = 1; + size_t src1_idx = 0, src2_idx = 1, output_idx = 0; const Tensor& src1_tensor = MklGetInput(ctx, src1_idx); const Tensor& src2_tensor = MklGetInput(ctx, src2_idx); @@ -312,7 +312,7 @@ class MklAddNOp : public OpKernel { Tensor* dst_tensor = nullptr; MklShape mkl_shape_dst; mkl_shape_dst.SetMklTensor(false); - AllocateOutputSetMklShape(ctx, src1_idx, &dst_tensor, + AllocateOutputSetMklShape(ctx, output_idx, &dst_tensor, src1_tensor.shape(), mkl_shape_dst); float user_i1 = (src1_tensor.scalar<T>()()); float user_i2 = (src2_tensor.scalar<T>()()); @@ -327,13 +327,12 @@ class MklAddNOp : public OpKernel { Tensor* dst_tensor = nullptr; MklShape mkl_shape_dst; mkl_shape_dst.SetMklTensor(false); - AllocateOutputSetMklShape(ctx, src1_idx, &dst_tensor, + AllocateOutputSetMklShape(ctx, output_idx, &dst_tensor, src1_tensor.shape(), mkl_shape_dst); return; } } - // element-wise add operator for tensor input1 and tensor input2 std::vector<double> coeff(2, 1.0); MklDnnData<T> src1(&cpu_engine); MklDnnData<T> src2(&cpu_engine); @@ -345,70 +344,124 @@ class MklAddNOp : public OpKernel { memory::desc md1({}, memory::data_undef, memory::format_undef); memory::desc md2({}, memory::data_undef, memory::format_undef); - if ( input1_in_mkl_format || input2_in_mkl_format ) { - if ( input1_in_mkl_format ) { - md1 = src1_mkl_shape.GetMklLayout(); - md2 = md1; - dst.SetUsrMem(md1); - } else { - md2 = src2_mkl_shape.GetMklLayout(); - md1 = md2; - dst.SetUsrMem(md2); - } + // For creating Sum primitive, we need to ensure that all inputs are in + // same format. What that means is if we have a mixed input case - where + // one input is in Tensorflow format and one input is in MKL format -, + // then we need to ensure that all inputs are in same format for + // primitive construction. For performance reason, we say that all inputs + // are in MKL format in such case, and insert reorder for input that is + // in Tensorflow format into MKL format. On the other hand, if both the + // inputs are in MKL format or both are in Tensorflow format, then we + // dont need reorder. + if (!input1_in_mkl_format && !input2_in_mkl_format) { + // If both the inputs are in Tensorflow format, we create blocked memory + // descriptor. + dims = TFShapeToMklDnnDims(src1_tensor.shape()); + strides = CalculateTFStrides(dims); + md1 = MklDnnData<T>::CreateBlockedMemDesc(dims, strides); + md2 = md1; + } else if (input1_in_mkl_format && !input2_in_mkl_format) { + // If one input is in MKL format and other is in Tensorflow, then + // create respective descriptors describing the actual case. For input + // in Mkl format, we just get Mkl layout from MklDnnShape. For input in + // Tensorflow format, we create memory descriptor using data format. + md1 = src1_mkl_shape.GetMklLayout(); + + memory::format src1_mkl_data_format = src1_mkl_shape.GetTfDataFormat(); + auto src1_tf_data_format = MklDnnDataFormatToTFDataFormat( + src1_mkl_data_format); + auto src2_dims = TFShapeToMklDnnDimsInNCHW(src2_tensor.shape(), + src1_tf_data_format); + md2 = memory::desc(src2_dims, MklDnnType<T>(), + src1_mkl_data_format); + } else if (input2_in_mkl_format && !input1_in_mkl_format) { + // Same comment as above. + memory::format src2_mkl_data_format = src2_mkl_shape.GetTfDataFormat(); + auto src2_tf_data_format = MklDnnDataFormatToTFDataFormat( + src2_mkl_data_format); + auto src1_dims = TFShapeToMklDnnDimsInNCHW(src1_tensor.shape(), + src2_tf_data_format); + md1 = memory::desc(src1_dims, MklDnnType<T>(), + src2_mkl_data_format); + + md2 = src2_mkl_shape.GetMklLayout(); } else { - dims = TFShapeToMklDnnDims(src1_tensor.shape()); - strides = CalculateTFStrides(dims); - md1 = MklDnnData<T>::CreateBlockedMemDesc(dims, strides); - md2 = md1; - dst.SetUsrMem(dims, strides); + // If both the inputs are in MKL format, we use Mkl layout of the input + // tensors. + md1 = src1_mkl_shape.GetMklLayout(); + md2 = src2_mkl_shape.GetMklLayout(); } - - std::vector<memory::primitive_desc> srcs_pd; - src1.SetUsrMem(md1, &src1_tensor); - auto mpd1 = src1.GetUsrMemPrimDesc(); - srcs_pd.push_back(mpd1); - src2.SetUsrMem(md2, &src2_tensor); - auto mpd2 = src2.GetUsrMemPrimDesc(); - srcs_pd.push_back(mpd2); + // As per comment above, we tell MKLDNN that both the inputs are in same + // format. So we set common memory descriptor in MKL format, if any of the + // inputs are in MKL format. Let's get memory descriptor that we will use + // for both the inputs. + // We set output memory descriptor in MKL format, if any of the + // inputs are in MKL format. + memory::desc common_md({}, memory::data_undef, memory::format_undef); + if (input1_in_mkl_format || input2_in_mkl_format) { + common_md = input1_in_mkl_format ? md1 : md2; + dst.SetUsrMem(common_md); + } else { + // Since both the inputs are in Tensorflow format, and have + // same shape, we can get memory descriptor from any input. + common_md = md1; + dst.SetUsrMem(common_md); + } + + std::vector<memory::primitive_desc> srcs_pd; + // Memory descriptor for 1st input + srcs_pd.push_back(memory::primitive_desc(common_md, cpu_engine)); + // Memory descriptor for 2nd input + srcs_pd.push_back(memory::primitive_desc(common_md, cpu_engine)); + auto sum_pd = sum::primitive_desc(dst.GetUsrMemDesc(), coeff, srcs_pd); + + // Now we setup resources for primitive execution. + // First, we need to check if any of the inputs need to be reordered as + // per the logic described above. Since output will be in MKL format if + // atleast one input is in MKL format, we choose output descriptor for + // reorder. std::vector<primitive::at> inputs; + std::vector<primitive> net; + // Check if actual input format of the tensor is different than common_pd + // we told MKLDNN. In that case, we will need reorder. + src1.CheckReorderToOpMem(srcs_pd[0], &net); + src2.CheckReorderToOpMem(srcs_pd[1], &net); inputs.push_back(src1.GetOpMem()); inputs.push_back(src2.GetOpMem()); - auto output_pd = dst.GetUsrMemPrimDesc(); + + // Allocate output tensor now. Tensor* dst_tensor = nullptr; - auto sum_pd = sum::primitive_desc(dst.GetUsrMemDesc(), coeff, srcs_pd); - auto sum_op = sum(sum_pd, inputs, dst.GetOpMem()); - if ( input2_in_mkl_format || input1_in_mkl_format ) { - MklDnnShape output_mkl_shape; - output_mkl_shape.SetMklTensor(true); - output_mkl_shape.SetMklLayout(&output_pd); - output_mkl_shape.SetElemType(MklDnnType<T>()); - if ( input1_in_mkl_format ) { + MklDnnShape output_mkl_shape; + TensorShape output_tf_shape; + + if (input2_in_mkl_format || input1_in_mkl_format) { + output_mkl_shape.SetMklTensor(true); + auto output_pd = dst.GetUsrMemPrimDesc(); + output_mkl_shape.SetMklLayout(&output_pd); + output_mkl_shape.SetElemType(MklDnnType<T>()); + if (input1_in_mkl_format) { output_mkl_shape.SetTfLayout(src1_dims_size, - src1_mkl_shape.GetSizesAsMklDnnDims(), - src1_mkl_shape.GetTfDataFormat()); - } else { + src1_mkl_shape.GetSizesAsMklDnnDims(), + src1_mkl_shape.GetTfDataFormat()); + } else { output_mkl_shape.SetTfLayout(src2_dims_size, - src2_mkl_shape.GetSizesAsMklDnnDims(), - src2_mkl_shape.GetTfDataFormat()); - } - TensorShape output_tf_shape; - output_tf_shape.AddDim((output_pd.get_size() / sizeof(T)) - + (output_pd.get_size()%sizeof(T) == 0 ? 0 : 1)); - AllocateOutputSetMklShape(ctx, src1_idx, &dst_tensor, output_tf_shape, - output_mkl_shape); + src2_mkl_shape.GetSizesAsMklDnnDims(), + src2_mkl_shape.GetTfDataFormat()); + } + output_tf_shape.AddDim((output_pd.get_size() / sizeof(T))); } else { - MklShape mkl_shape_dst; - mkl_shape_dst.SetMklTensor(false); - AllocateOutputSetMklShape(ctx, src1_idx, - &dst_tensor, src1_tensor.shape(), mkl_shape_dst); + output_mkl_shape.SetMklTensor(false); + output_tf_shape = src1_tensor.shape(); } - + AllocateOutputSetMklShape(ctx, output_idx, &dst_tensor, + output_tf_shape, output_mkl_shape); dst.SetUsrMemDataHandle(dst_tensor); - std::vector<primitive> net; - net.push_back(sum_op); + + // Create Sum op, and submit net for execution. + net.push_back(sum(sum_pd, inputs, dst.GetOpMem())); stream(stream::kind::eager).submit(net).wait(); } catch (mkldnn::error &e) { string error_msg = "Status: " + std::to_string(e.status) + diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc index 45bdd0ad5c..dc899d8c7e 100644 --- a/tensorflow/core/kernels/mkl_relu_op.cc +++ b/tensorflow/core/kernels/mkl_relu_op.cc @@ -500,30 +500,81 @@ class MklReluGradOpBase : public OpKernel { // Set DNN primitives for src & diff_dst memory::desc src_md({}, memory::data_undef, memory::format_undef); memory::desc diff_dst_md({}, memory::data_undef, memory::format_undef); - if (dnn_shape_src.IsMklTensor() || dnn_shape_diff_dst.IsMklTensor()) { - if (dnn_shape_diff_dst.IsMklTensor()) { - diff_dst_md = dnn_shape_diff_dst.GetMklLayout(); - src_md = diff_dst_md; - } else { - src_md = dnn_shape_src.GetMklLayout(); - diff_dst_md = src_md; - } - } else { + + // For creating Sum primitive, we need to ensure that all inputs are in + // same format. What that means is if we have a mixed input case - where + // one input is in Tensorflow format and one input is in MKL format -, + // then we need to ensure that all inputs are in same format for + // primitive construction. For performance reason, we say that all inputs + // are in MKL format in such case, and insert reorder for input that is + // in Tensorflow format into MKL format. On the other hand, if both the + // inputs are in MKL format or both are in Tensorflow format, then we + // dont need reorder. + if (!dnn_shape_src.IsMklTensor() && !dnn_shape_diff_dst.IsMklTensor()) { + // If both the inputs are in Tensorflow format, we create blocked memory + // descriptor. auto src_dims = TFShapeToMklDnnDims(src_tensor.shape()); auto src_strides = CalculateTFStrides(src_dims); src_md = MklDnnData<T>::CreateBlockedMemDesc(src_dims, src_strides); diff_dst_md = src_md; + } else if (dnn_shape_src.IsMklTensor() && + !dnn_shape_diff_dst.IsMklTensor()) { + // If one input is in MKL format and other is in Tensorflow, then + // create respective descriptors describing the actual case. For input + // in Mkl format, we just get Mkl layout from MklDnnShape. For input in + // Tensorflow format, we create memory descriptor using data format. + src_md = dnn_shape_src.GetMklLayout(); + + memory::format src_mkl_data_format = dnn_shape_src.GetTfDataFormat(); + auto src_tf_data_format = MklDnnDataFormatToTFDataFormat( + src_mkl_data_format); + auto diff_dst_dims = TFShapeToMklDnnDimsInNCHW(diff_dst_tensor.shape(), + src_tf_data_format); + diff_dst_md = memory::desc(diff_dst_dims, MklDnnType<T>(), + src_mkl_data_format); + } else if (!dnn_shape_src.IsMklTensor() && + dnn_shape_diff_dst.IsMklTensor()) { + // Same comment as above. + diff_dst_md = dnn_shape_diff_dst.GetMklLayout(); + + memory::format diff_dst_mkl_data_format = + dnn_shape_diff_dst.GetTfDataFormat(); + auto diff_dst_tf_data_format = MklDnnDataFormatToTFDataFormat( + diff_dst_mkl_data_format); + auto src_dims = TFShapeToMklDnnDimsInNCHW(src_tensor.shape(), + diff_dst_tf_data_format); + src_md = memory::desc(src_dims, MklDnnType<T>(), + diff_dst_mkl_data_format); + } else { + // If both the inputs are in MKL format, we use Mkl layout of the input + // tensors. + src_md = dnn_shape_src.GetMklLayout(); + diff_dst_md = dnn_shape_diff_dst.GetMklLayout(); } + src.SetUsrMem(src_md, &src_tensor); diff_dst.SetUsrMem(diff_dst_md, &diff_dst_tensor); + // As per comment above, we tell MKLDNN that both the inputs are in same + // format. So we set common memory descriptor in MKL format, if any of the + // inputs are in MKL format. Let's get memory descriptor that we will use + // for both the inputs. + memory::desc common_md({}, memory::data_undef, memory::format_undef); + if (dnn_shape_src.IsMklTensor() || dnn_shape_diff_dst.IsMklTensor()) { + common_md = dnn_shape_src.IsMklTensor() ? src_md : diff_dst_md; + } else { + // Since both the inputs are in Tensorflow format, and have + // same shape, we can get memory descriptor from any input. + common_md = src_md; + } + T alpha = 0, beta = 0; std::shared_ptr<relu_forward::primitive_desc> relu_fwd_pd; auto relu_fwd_desc = relu_forward::desc(prop_kind::forward_training, alg_kind, src_md, alpha, beta); relu_fwd_pd.reset(new relu_forward::primitive_desc(relu_fwd_desc, cpu_engine)); - auto relu_bwd_desc = relu_backward::desc(alg_kind, diff_dst_md, src_md, + auto relu_bwd_desc = relu_backward::desc(alg_kind, common_md, common_md, alpha, beta); auto relu_bwd_pd = relu_backward::primitive_desc(relu_bwd_desc, cpu_engine, *relu_fwd_pd); @@ -547,9 +598,9 @@ class MklReluGradOpBase : public OpKernel { AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor, tf_shape_diff_src, dnn_shape_diff_src); - // diff_src memory descriptor is same as diff_dst memory descriptor. - auto diff_src_md = diff_dst_md; - diff_src.SetUsrMem(diff_src_md, diff_src_tensor); + // diff_src memory descriptor is same as memory descriptor for both + // inputs. + diff_src.SetUsrMem(common_md, diff_src_tensor); PrepareAndExecuteNet(relu_bwd_pd, &src, &diff_src, &diff_dst); } catch (mkldnn::error &e) { @@ -567,6 +618,14 @@ class MklReluGradOpBase : public OpKernel { MklDnnData<T>* src, MklDnnData<T>* diff_src, MklDnnData<T>* diff_dst) { std::vector<primitive> net; + + // Check if we need to reorder original input tensors into common_md layout + // that we set for primitive creation. diff_src_primitive_desc is same as + // common_md. + src->CheckReorderToOpMem(relu_prim_desc.diff_src_primitive_desc(), &net); + diff_dst->CheckReorderToOpMem(relu_prim_desc.diff_src_primitive_desc(), + &net); + net.push_back(relu_backward(relu_prim_desc, src->GetOpMem(), diff_dst->GetOpMem(), diff_src->GetOpMem())); stream(stream::kind::eager).submit(net).wait(); @@ -622,7 +681,6 @@ class MklReluGradOp : public MklReluGradOpBase<Device, T, eltwise_relu> { MklDnnShape dnn_shape_diff_dst; GetMklShape(context, diff_dst_index, &dnn_shape_diff_dst); - int src_dims_size = src_tensor.dims(); MklDnnShape dnn_shape_diff_src; dnn_shape_diff_src.SetMklTensor(false); AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor, @@ -690,7 +748,6 @@ class MklEluGradOp : public MklReluGradOpBase<Device, T, eltwise_elu> { MklDnnShape dnn_shape_diff_dst; GetMklShape(context, diff_dst_index, &dnn_shape_diff_dst); - int src_dims_size = src_tensor.dims(); MklDnnShape dnn_shape_diff_src; dnn_shape_diff_src.SetMklTensor(false); AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor, @@ -762,7 +819,6 @@ class MklTanhGradOp : public MklReluGradOpBase<Device, T, eltwise_tanh> { MklDnnShape dnn_shape_diff_dst; GetMklShape(context, diff_dst_index, &dnn_shape_diff_dst); - int src_dims_size = src_tensor.dims(); MklDnnShape dnn_shape_diff_src; dnn_shape_diff_src.SetMklTensor(false); AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor, |