aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar Niranjan Hasabnis <niranjan.hasabnis@intel.com>2017-12-26 11:21:06 -0800
committerGravatar drpngx <drpngx@users.noreply.github.com>2017-12-26 11:21:06 -0800
commit64cb8494a1628e799ecf869e8c1beb302c907720 (patch)
tree631da5b32de285d6a928c3db9509489f35b4c452
parentced991d65a95ebbdd57c2e0670d88563042a3180 (diff)
Adding missing reorders in ReLU and AddN (#15406)
-rw-r--r--tensorflow/core/kernels/mkl_aggregate_ops.cc161
-rw-r--r--tensorflow/core/kernels/mkl_relu_op.cc88
2 files changed, 179 insertions, 70 deletions
diff --git a/tensorflow/core/kernels/mkl_aggregate_ops.cc b/tensorflow/core/kernels/mkl_aggregate_ops.cc
index 9aabbbdb6b..44b94be3a0 100644
--- a/tensorflow/core/kernels/mkl_aggregate_ops.cc
+++ b/tensorflow/core/kernels/mkl_aggregate_ops.cc
@@ -294,7 +294,7 @@ class MklAddNOp : public OpKernel {
try {
auto cpu_engine = engine(engine::cpu, 0);
- size_t src1_idx = 0, src2_idx = 1;
+ size_t src1_idx = 0, src2_idx = 1, output_idx = 0;
const Tensor& src1_tensor = MklGetInput(ctx, src1_idx);
const Tensor& src2_tensor = MklGetInput(ctx, src2_idx);
@@ -312,7 +312,7 @@ class MklAddNOp : public OpKernel {
Tensor* dst_tensor = nullptr;
MklShape mkl_shape_dst;
mkl_shape_dst.SetMklTensor(false);
- AllocateOutputSetMklShape(ctx, src1_idx, &dst_tensor,
+ AllocateOutputSetMklShape(ctx, output_idx, &dst_tensor,
src1_tensor.shape(), mkl_shape_dst);
float user_i1 = (src1_tensor.scalar<T>()());
float user_i2 = (src2_tensor.scalar<T>()());
@@ -327,13 +327,12 @@ class MklAddNOp : public OpKernel {
Tensor* dst_tensor = nullptr;
MklShape mkl_shape_dst;
mkl_shape_dst.SetMklTensor(false);
- AllocateOutputSetMklShape(ctx, src1_idx, &dst_tensor,
+ AllocateOutputSetMklShape(ctx, output_idx, &dst_tensor,
src1_tensor.shape(), mkl_shape_dst);
return;
}
}
- // element-wise add operator for tensor input1 and tensor input2
std::vector<double> coeff(2, 1.0);
MklDnnData<T> src1(&cpu_engine);
MklDnnData<T> src2(&cpu_engine);
@@ -345,70 +344,124 @@ class MklAddNOp : public OpKernel {
memory::desc md1({}, memory::data_undef, memory::format_undef);
memory::desc md2({}, memory::data_undef, memory::format_undef);
- if ( input1_in_mkl_format || input2_in_mkl_format ) {
- if ( input1_in_mkl_format ) {
- md1 = src1_mkl_shape.GetMklLayout();
- md2 = md1;
- dst.SetUsrMem(md1);
- } else {
- md2 = src2_mkl_shape.GetMklLayout();
- md1 = md2;
- dst.SetUsrMem(md2);
- }
+ // For creating the Sum primitive, we need to ensure that all inputs are
+ // in the same format. That means that in a mixed-input case -- where one
+ // input is in Tensorflow format and the other is in MKL format -- we
+ // need to bring all inputs into the same format before constructing the
+ // primitive. For performance reasons, we choose MKL format in such a
+ // case, and insert a reorder for the input that is in Tensorflow format
+ // to convert it into MKL format. On the other hand, if both inputs are
+ // in MKL format, or both are in Tensorflow format, then we
+ // don't need a reorder.
+ if (!input1_in_mkl_format && !input2_in_mkl_format) {
+ // If both the inputs are in Tensorflow format, we create blocked memory
+ // descriptor.
+ dims = TFShapeToMklDnnDims(src1_tensor.shape());
+ strides = CalculateTFStrides(dims);
+ md1 = MklDnnData<T>::CreateBlockedMemDesc(dims, strides);
+ md2 = md1;
+ } else if (input1_in_mkl_format && !input2_in_mkl_format) {
+ // If one input is in MKL format and other is in Tensorflow, then
+ // create respective descriptors describing the actual case. For input
+ // in Mkl format, we just get Mkl layout from MklDnnShape. For input in
+ // Tensorflow format, we create memory descriptor using data format.
+ md1 = src1_mkl_shape.GetMklLayout();
+
+ memory::format src1_mkl_data_format = src1_mkl_shape.GetTfDataFormat();
+ auto src1_tf_data_format = MklDnnDataFormatToTFDataFormat(
+ src1_mkl_data_format);
+ auto src2_dims = TFShapeToMklDnnDimsInNCHW(src2_tensor.shape(),
+ src1_tf_data_format);
+ md2 = memory::desc(src2_dims, MklDnnType<T>(),
+ src1_mkl_data_format);
+ } else if (input2_in_mkl_format && !input1_in_mkl_format) {
+ // Same comment as above.
+ memory::format src2_mkl_data_format = src2_mkl_shape.GetTfDataFormat();
+ auto src2_tf_data_format = MklDnnDataFormatToTFDataFormat(
+ src2_mkl_data_format);
+ auto src1_dims = TFShapeToMklDnnDimsInNCHW(src1_tensor.shape(),
+ src2_tf_data_format);
+ md1 = memory::desc(src1_dims, MklDnnType<T>(),
+ src2_mkl_data_format);
+
+ md2 = src2_mkl_shape.GetMklLayout();
} else {
- dims = TFShapeToMklDnnDims(src1_tensor.shape());
- strides = CalculateTFStrides(dims);
- md1 = MklDnnData<T>::CreateBlockedMemDesc(dims, strides);
- md2 = md1;
- dst.SetUsrMem(dims, strides);
+ // If both the inputs are in MKL format, we use Mkl layout of the input
+ // tensors.
+ md1 = src1_mkl_shape.GetMklLayout();
+ md2 = src2_mkl_shape.GetMklLayout();
}
-
- std::vector<memory::primitive_desc> srcs_pd;
-
src1.SetUsrMem(md1, &src1_tensor);
- auto mpd1 = src1.GetUsrMemPrimDesc();
- srcs_pd.push_back(mpd1);
-
src2.SetUsrMem(md2, &src2_tensor);
- auto mpd2 = src2.GetUsrMemPrimDesc();
- srcs_pd.push_back(mpd2);
+ // As per the comment above, we tell MKLDNN that both inputs are in the
+ // same format. So we set a common memory descriptor in MKL format if
+ // either of the inputs is in MKL format; that descriptor is then used
+ // for both inputs.
+ // Likewise, the output memory descriptor is set in MKL format if either
+ // of the inputs is in MKL format.
+ memory::desc common_md({}, memory::data_undef, memory::format_undef);
+ if (input1_in_mkl_format || input2_in_mkl_format) {
+ common_md = input1_in_mkl_format ? md1 : md2;
+ dst.SetUsrMem(common_md);
+ } else {
+ // Since both the inputs are in Tensorflow format, and have
+ // same shape, we can get memory descriptor from any input.
+ common_md = md1;
+ dst.SetUsrMem(common_md);
+ }
+
+ std::vector<memory::primitive_desc> srcs_pd;
+ // Memory descriptor for 1st input
+ srcs_pd.push_back(memory::primitive_desc(common_md, cpu_engine));
+ // Memory descriptor for 2nd input
+ srcs_pd.push_back(memory::primitive_desc(common_md, cpu_engine));
+ auto sum_pd = sum::primitive_desc(dst.GetUsrMemDesc(), coeff, srcs_pd);
+
+ // Now we setup resources for primitive execution.
+ // First, we need to check if any of the inputs need to be reordered as
+ // per the logic described above. Since the output will be in MKL format
+ // if at least one input is in MKL format, we choose the output
+ // descriptor for the reorder.
std::vector<primitive::at> inputs;
+ std::vector<primitive> net;
+ // Check if actual input format of the tensor is different than common_pd
+ // we told MKLDNN. In that case, we will need reorder.
+ src1.CheckReorderToOpMem(srcs_pd[0], &net);
+ src2.CheckReorderToOpMem(srcs_pd[1], &net);
inputs.push_back(src1.GetOpMem());
inputs.push_back(src2.GetOpMem());
- auto output_pd = dst.GetUsrMemPrimDesc();
+
+ // Allocate output tensor now.
Tensor* dst_tensor = nullptr;
- auto sum_pd = sum::primitive_desc(dst.GetUsrMemDesc(), coeff, srcs_pd);
- auto sum_op = sum(sum_pd, inputs, dst.GetOpMem());
- if ( input2_in_mkl_format || input1_in_mkl_format ) {
- MklDnnShape output_mkl_shape;
- output_mkl_shape.SetMklTensor(true);
- output_mkl_shape.SetMklLayout(&output_pd);
- output_mkl_shape.SetElemType(MklDnnType<T>());
- if ( input1_in_mkl_format ) {
+ MklDnnShape output_mkl_shape;
+ TensorShape output_tf_shape;
+
+ if (input2_in_mkl_format || input1_in_mkl_format) {
+ output_mkl_shape.SetMklTensor(true);
+ auto output_pd = dst.GetUsrMemPrimDesc();
+ output_mkl_shape.SetMklLayout(&output_pd);
+ output_mkl_shape.SetElemType(MklDnnType<T>());
+ if (input1_in_mkl_format) {
output_mkl_shape.SetTfLayout(src1_dims_size,
- src1_mkl_shape.GetSizesAsMklDnnDims(),
- src1_mkl_shape.GetTfDataFormat());
- } else {
+ src1_mkl_shape.GetSizesAsMklDnnDims(),
+ src1_mkl_shape.GetTfDataFormat());
+ } else {
output_mkl_shape.SetTfLayout(src2_dims_size,
- src2_mkl_shape.GetSizesAsMklDnnDims(),
- src2_mkl_shape.GetTfDataFormat());
- }
- TensorShape output_tf_shape;
- output_tf_shape.AddDim((output_pd.get_size() / sizeof(T))
- + (output_pd.get_size()%sizeof(T) == 0 ? 0 : 1));
- AllocateOutputSetMklShape(ctx, src1_idx, &dst_tensor, output_tf_shape,
- output_mkl_shape);
+ src2_mkl_shape.GetSizesAsMklDnnDims(),
+ src2_mkl_shape.GetTfDataFormat());
+ }
+ output_tf_shape.AddDim((output_pd.get_size() / sizeof(T)));
} else {
- MklShape mkl_shape_dst;
- mkl_shape_dst.SetMklTensor(false);
- AllocateOutputSetMklShape(ctx, src1_idx,
- &dst_tensor, src1_tensor.shape(), mkl_shape_dst);
+ output_mkl_shape.SetMklTensor(false);
+ output_tf_shape = src1_tensor.shape();
}
-
+ AllocateOutputSetMklShape(ctx, output_idx, &dst_tensor,
+ output_tf_shape, output_mkl_shape);
dst.SetUsrMemDataHandle(dst_tensor);
- std::vector<primitive> net;
- net.push_back(sum_op);
+
+ // Create Sum op, and submit net for execution.
+ net.push_back(sum(sum_pd, inputs, dst.GetOpMem()));
stream(stream::kind::eager).submit(net).wait();
} catch (mkldnn::error &e) {
string error_msg = "Status: " + std::to_string(e.status) +
diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc
index 45bdd0ad5c..dc899d8c7e 100644
--- a/tensorflow/core/kernels/mkl_relu_op.cc
+++ b/tensorflow/core/kernels/mkl_relu_op.cc
@@ -500,30 +500,81 @@ class MklReluGradOpBase : public OpKernel {
// Set DNN primitives for src & diff_dst
memory::desc src_md({}, memory::data_undef, memory::format_undef);
memory::desc diff_dst_md({}, memory::data_undef, memory::format_undef);
- if (dnn_shape_src.IsMklTensor() || dnn_shape_diff_dst.IsMklTensor()) {
- if (dnn_shape_diff_dst.IsMklTensor()) {
- diff_dst_md = dnn_shape_diff_dst.GetMklLayout();
- src_md = diff_dst_md;
- } else {
- src_md = dnn_shape_src.GetMklLayout();
- diff_dst_md = src_md;
- }
- } else {
+
+ // For creating the relu_backward primitive, we need to ensure that all
+ // inputs are in the same format. That means that in a mixed-input case
+ // -- where one input is in Tensorflow format and the other is in MKL
+ // format -- we need to bring all inputs into the same format before
+ // constructing the primitive. For performance reasons, we choose MKL
+ // format in such a case, and insert a reorder for the input that is in
+ // Tensorflow format to convert it into MKL format. On the other hand,
+ // if both inputs are in MKL format, or both are in Tensorflow format,
+ // then we don't need a reorder.
+ if (!dnn_shape_src.IsMklTensor() && !dnn_shape_diff_dst.IsMklTensor()) {
+ // If both the inputs are in Tensorflow format, we create blocked memory
+ // descriptor.
auto src_dims = TFShapeToMklDnnDims(src_tensor.shape());
auto src_strides = CalculateTFStrides(src_dims);
src_md = MklDnnData<T>::CreateBlockedMemDesc(src_dims, src_strides);
diff_dst_md = src_md;
+ } else if (dnn_shape_src.IsMklTensor() &&
+ !dnn_shape_diff_dst.IsMklTensor()) {
+ // If one input is in MKL format and other is in Tensorflow, then
+ // create respective descriptors describing the actual case. For input
+ // in Mkl format, we just get Mkl layout from MklDnnShape. For input in
+ // Tensorflow format, we create memory descriptor using data format.
+ src_md = dnn_shape_src.GetMklLayout();
+
+ memory::format src_mkl_data_format = dnn_shape_src.GetTfDataFormat();
+ auto src_tf_data_format = MklDnnDataFormatToTFDataFormat(
+ src_mkl_data_format);
+ auto diff_dst_dims = TFShapeToMklDnnDimsInNCHW(diff_dst_tensor.shape(),
+ src_tf_data_format);
+ diff_dst_md = memory::desc(diff_dst_dims, MklDnnType<T>(),
+ src_mkl_data_format);
+ } else if (!dnn_shape_src.IsMklTensor() &&
+ dnn_shape_diff_dst.IsMklTensor()) {
+ // Same comment as above.
+ diff_dst_md = dnn_shape_diff_dst.GetMklLayout();
+
+ memory::format diff_dst_mkl_data_format =
+ dnn_shape_diff_dst.GetTfDataFormat();
+ auto diff_dst_tf_data_format = MklDnnDataFormatToTFDataFormat(
+ diff_dst_mkl_data_format);
+ auto src_dims = TFShapeToMklDnnDimsInNCHW(src_tensor.shape(),
+ diff_dst_tf_data_format);
+ src_md = memory::desc(src_dims, MklDnnType<T>(),
+ diff_dst_mkl_data_format);
+ } else {
+ // If both the inputs are in MKL format, we use Mkl layout of the input
+ // tensors.
+ src_md = dnn_shape_src.GetMklLayout();
+ diff_dst_md = dnn_shape_diff_dst.GetMklLayout();
}
+
src.SetUsrMem(src_md, &src_tensor);
diff_dst.SetUsrMem(diff_dst_md, &diff_dst_tensor);
+ // As per the comment above, we tell MKLDNN that both inputs are in the
+ // same format. So we set a common memory descriptor in MKL format if
+ // either of the inputs is in MKL format; that descriptor is then used
+ // for both inputs.
+ memory::desc common_md({}, memory::data_undef, memory::format_undef);
+ if (dnn_shape_src.IsMklTensor() || dnn_shape_diff_dst.IsMklTensor()) {
+ common_md = dnn_shape_src.IsMklTensor() ? src_md : diff_dst_md;
+ } else {
+ // Since both the inputs are in Tensorflow format, and have
+ // same shape, we can get memory descriptor from any input.
+ common_md = src_md;
+ }
+
T alpha = 0, beta = 0;
std::shared_ptr<relu_forward::primitive_desc> relu_fwd_pd;
auto relu_fwd_desc = relu_forward::desc(prop_kind::forward_training,
alg_kind, src_md, alpha, beta);
relu_fwd_pd.reset(new relu_forward::primitive_desc(relu_fwd_desc,
cpu_engine));
- auto relu_bwd_desc = relu_backward::desc(alg_kind, diff_dst_md, src_md,
+ auto relu_bwd_desc = relu_backward::desc(alg_kind, common_md, common_md,
alpha, beta);
auto relu_bwd_pd = relu_backward::primitive_desc(relu_bwd_desc,
cpu_engine, *relu_fwd_pd);
@@ -547,9 +598,9 @@ class MklReluGradOpBase : public OpKernel {
AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor,
tf_shape_diff_src, dnn_shape_diff_src);
- // diff_src memory descriptor is same as diff_dst memory descriptor.
- auto diff_src_md = diff_dst_md;
- diff_src.SetUsrMem(diff_src_md, diff_src_tensor);
+ // diff_src memory descriptor is same as memory descriptor for both
+ // inputs.
+ diff_src.SetUsrMem(common_md, diff_src_tensor);
PrepareAndExecuteNet(relu_bwd_pd, &src, &diff_src, &diff_dst);
} catch (mkldnn::error &e) {
@@ -567,6 +618,14 @@ class MklReluGradOpBase : public OpKernel {
MklDnnData<T>* src, MklDnnData<T>* diff_src, MklDnnData<T>*
diff_dst) {
std::vector<primitive> net;
+
+ // Check if we need to reorder original input tensors into common_md layout
+ // that we set for primitive creation. diff_src_primitive_desc is same as
+ // common_md.
+ src->CheckReorderToOpMem(relu_prim_desc.diff_src_primitive_desc(), &net);
+ diff_dst->CheckReorderToOpMem(relu_prim_desc.diff_src_primitive_desc(),
+ &net);
+
net.push_back(relu_backward(relu_prim_desc, src->GetOpMem(),
diff_dst->GetOpMem(), diff_src->GetOpMem()));
stream(stream::kind::eager).submit(net).wait();
@@ -622,7 +681,6 @@ class MklReluGradOp : public MklReluGradOpBase<Device, T, eltwise_relu> {
MklDnnShape dnn_shape_diff_dst;
GetMklShape(context, diff_dst_index, &dnn_shape_diff_dst);
- int src_dims_size = src_tensor.dims();
MklDnnShape dnn_shape_diff_src;
dnn_shape_diff_src.SetMklTensor(false);
AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor,
@@ -690,7 +748,6 @@ class MklEluGradOp : public MklReluGradOpBase<Device, T, eltwise_elu> {
MklDnnShape dnn_shape_diff_dst;
GetMklShape(context, diff_dst_index, &dnn_shape_diff_dst);
- int src_dims_size = src_tensor.dims();
MklDnnShape dnn_shape_diff_src;
dnn_shape_diff_src.SetMklTensor(false);
AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor,
@@ -762,7 +819,6 @@ class MklTanhGradOp : public MklReluGradOpBase<Device, T, eltwise_tanh> {
MklDnnShape dnn_shape_diff_dst;
GetMklShape(context, diff_dst_index, &dnn_shape_diff_dst);
- int src_dims_size = src_tensor.dims();
MklDnnShape dnn_shape_diff_src;
dnn_shape_diff_src.SetMklTensor(false);
AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor,