Optimize CuboidConvolutionBackwardKernel (Conv3D kernel backprop).

* simplify contraction by collapsing inner dims into single dimension
* get rid of expensive reverse op

~5X improvement when compiled with AVX.

PiperOrigin-RevId: 211518363
author: Eugene Zhulenev <ezhulenev@google.com> 2018-09-04 13:52:01 -0700
committer: TensorFlower Gardener <gardener@tensorflow.org> 2018-09-04 13:56:35 -0700
commit: ffd9519c3fffe43473f06a1c8fdd12519490db3b (patch)
tree: 065aac9a95e45023dff9de2cf53bae9110c19370
parent: 0cd9b3e41d993f505feb54ff0b086ffbb21b595d (diff)
1 files changed, 96 insertions, 208 deletions
diff --git a/tensorflow/core/kernels/eigen_backward_cuboid_convolutions.h b/tensorflow/core/kernels/eigen_backward_cuboid_convolutions.h
index e13e548f86..3ebeb7be2b 100644
--- a/tensorflow/core/kernels/eigen_backward_cuboid_convolutions.h
+++ b/tensorflow/core/kernels/eigen_backward_cuboid_convolutions.h
@@ -323,47 +323,34 @@ CuboidConvolutionBackwardInput(
 template <typename OutputBackward, typename Input>
 EIGEN_ALWAYS_INLINE static const typename internal::conditional<
     internal::traits<OutputBackward>::Layout == ColMajor,
-    const TensorShufflingOp<
-        const array<typename internal::traits<OutputBackward>::Index, 5>,
-        const TensorReverseOp<
-            const array<bool, 5>,
+    TensorReshapingOp<
+        const DSizes<typename internal::traits<Input>::Index, 5>,
+        const TensorContractionOp<
+            const array<IndexPair<typename internal::traits<Input>::Index>, 1>,
             const TensorReshapingOp<
-                const DSizes<typename internal::traits<OutputBackward>::Index,
-                             5>,
-                const TensorContractionOp<
-                    const array<
-                        IndexPair<typename internal::traits<Input>::Index>, 2>,
-                    const TensorReshapingOp<
-                        const DSizes<typename internal::traits<Input>::Index,
-                                     3>,
-                        const Input>,
-                    const TensorReshapingOp<
-                        const DSizes<
-                            typename internal::traits<OutputBackward>::Index,
-                            4>,
-                        const TensorVolumePatchOp<
-                            Dynamic, Dynamic, Dynamic,
-                            const OutputBackward> > > > > >,
-    const TensorShufflingOp<
-        const array<typename internal::traits<OutputBackward>::Index, 5>,
-        const TensorReverseOp<
-            const array<bool, 5>,
+                const DSizes<typename internal::traits<Input>::Index, 2>,
+                const OutputBackward>,
+            const TensorShufflingOp<
+                const array<typename internal::traits<OutputBackward>::Index,
+                            2>,
+                const TensorReshapingOp<
+                    const DSizes<typename internal::traits<Input>::Index, 2>,
+                    const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic,
+                                              const Input> > > > >,
+    TensorReshapingOp<
+        const DSizes<typename internal::traits<Input>::Index, 5>,
+        const TensorContractionOp<
+            const array<IndexPair<typename internal::traits<Input>::Index>, 1>,
+            const TensorShufflingOp<
+                const array<typename internal::traits<OutputBackward>::Index,
+                            2>,
+                const TensorReshapingOp<
+                    const DSizes<typename internal::traits<Input>::Index, 2>,
+                    const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic,
+                                              const Input> > >,
             const TensorReshapingOp<
-                const DSizes<typename internal::traits<OutputBackward>::Index,
-                             5>,
-                const TensorContractionOp<
-                    const array<
-                        IndexPair<typename internal::traits<Input>::Index>, 2>,
-                    const TensorReshapingOp<
-                        const DSizes<
-                            typename internal::traits<OutputBackward>::Index,
-                            4>,
-                        const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic,
-                                                  const OutputBackward> >,
-                    const TensorReshapingOp<
-                        const DSizes<typename internal::traits<Input>::Index,
-                                     3>,
-                        const Input> > > > > >::type
+                const DSizes<typename internal::traits<Input>::Index, 2>,
+                const OutputBackward> > > >::type
 CuboidConvolutionBackwardKernel(
     const Input& input, const OutputBackward& output_backward,
     typename internal::traits<Input>::Index kernelPlanes,
@@ -406,213 +393,114 @@ CuboidConvolutionBackwardKernel(
   const TensorIndex outputCols =
       isColMajor ? out.dimension(3) : out.dimension(NumDims - 4);
 
+  // Number of filters. This is the same as the output depth.
   const TensorIndex kernelFilters =
       isColMajor ? out.dimension(0) : out.dimension(NumDims - 1);
+  // Number of channels. This is the same as the input depth.
   const TensorIndex kernelChannels =
       isColMajor ? in.dimension(0) : in.dimension(NumDims - 1);
 
-  TensorIndex forward_pad_z, forward_pad_y, forward_pad_x;
-  const TensorIndex size_z =
-      Eigen::divup(inputPlanes, static_cast<TensorIndex>(stridePlanes));
-  const TensorIndex size_y =
-      Eigen::divup(inputRows, static_cast<TensorIndex>(strideRows));
-  const TensorIndex size_x =
-      Eigen::divup(inputCols, static_cast<TensorIndex>(strideCols));
-
-  // Infer padding type.
-  if (size_z == outputPlanes && size_y == outputRows && size_x == outputCols) {
-    // SAME padding.
-    const TensorIndex dz = numext::maxi<TensorIndex>(
-        0, (size_z - 1) * stridePlanes + kernelPlanes - inputPlanes);
-    const TensorIndex dy = numext::maxi<TensorIndex>(
-        0, (size_y - 1) * strideRows + kernelRows - inputRows);
-    const TensorIndex dx = numext::maxi<TensorIndex>(
-        0, (size_x - 1) * strideCols + kernelCols - inputCols);
-
-    forward_pad_z = dz / 2;
-    forward_pad_y = dy / 2;
-    forward_pad_x = dx / 2;
-  } else {
-    // VALID padding.
-    forward_pad_z = 0;
-    forward_pad_y = 0;
-    forward_pad_x = 0;
-  }
-
-  const TensorIndex padding_ztop = kernelPlanes - 1 - forward_pad_z;
-  const TensorIndex padding_top = kernelRows - 1 - forward_pad_y;
-  const TensorIndex padding_left = kernelCols - 1 - forward_pad_x;
-
-  const TensorIndex padding_zbottom = inputPlanes + kernelPlanes - 1 -
-                                      (outputPlanes - 1) * stridePlanes - 1 -
-                                      padding_ztop;
-  const TensorIndex padding_bottom = inputRows + kernelRows - 1 -
-                                     (outputRows - 1) * strideRows - 1 -
-                                     padding_top;
-  const TensorIndex padding_right = inputCols + kernelCols - 1 -
-                                    (outputCols - 1) * strideCols - 1 -
-                                    padding_left;
-
-  eigen_assert(padding_ztop >= 0);
-  eigen_assert(padding_zbottom >= 0);
-  eigen_assert(padding_top >= 0);
-  eigen_assert(padding_left >= 0);
-  eigen_assert(padding_bottom >= 0);
-  eigen_assert(padding_right >= 0);
-
-  // The output_backward has dimensions out_depth X out_plaens X out_rows X
-  // out_cols X OTHERS
-  // When we extract the image patches from output_backward (with input as the
-  // kernel), it will have dimensions
-  //  (out_depth) X (input_planes * input_rows * input_cols) X (kernel_planes *
-  //  kernel_rows * kernel_cols) X OTHERS
-  DSizes<TensorIndex, 4> pre_contract_dims;
+  // TODO(ezhulenev): Add support for inflated strides. Without inflated strides
+  // effective kernel planes/rows/cols are always the same as the kernel itself
+  // (see eigen_spatial_convolutions for details).
+  const TensorIndex kernelPlanesEff = kernelPlanes;
+  const TensorIndex kernelRowsEff = kernelRows;
+  const TensorIndex kernelColsEff = kernelCols;
+
+  const TensorIndex padPlanes = numext::maxi<Index>(
+      0, (outputPlanes - 1) * stridePlanes + kernelPlanesEff - inputPlanes);
+  const TensorIndex padRows = numext::maxi<Index>(
+      0, (outputRows - 1) * strideRows + kernelRowsEff - inputRows);
+  const TensorIndex padCols = numext::maxi<Index>(
+      0, (outputCols - 1) * strideCols + kernelColsEff - inputCols);
+
+  const TensorIndex padding_top_z = padPlanes / 2;
+  const TensorIndex padding_bottom_z = padPlanes - padding_top_z;
+  const TensorIndex padding_top = padRows / 2;
+  const TensorIndex padding_bottom = padRows - padding_top;
+  const TensorIndex padding_left = padCols / 2;
+  const TensorIndex padding_right = padCols - padding_left;
+
+  // Reshaped output_backward before contraction.
+  DSizes<TensorIndex, 2> output_dims;
   if (isColMajor) {
-    pre_contract_dims[0] = kernelFilters;
-    pre_contract_dims[1] = inputRows * inputCols * inputPlanes;
-    pre_contract_dims[2] = kernelRows * kernelCols * kernelPlanes;
-    pre_contract_dims[3] = 1;
+    output_dims[0] = kernelFilters;
+    output_dims[1] = outputPlanes * outputRows * outputCols;
     for (int i = 4; i < NumDims; ++i) {
-      pre_contract_dims[3] *= out.dimension(i);
+      output_dims[1] *= out.dimension(i);
     }
   } else {
-    pre_contract_dims[3] = kernelFilters;
-    pre_contract_dims[2] = inputRows * inputCols * inputPlanes;
-    pre_contract_dims[1] = kernelRows * kernelCols * kernelPlanes;
-    pre_contract_dims[0] = 1;
+    output_dims[1] = kernelFilters;
+    output_dims[0] = outputCols * outputRows * outputPlanes;
     for (int i = 0; i < NumDims - 4; ++i) {
-      pre_contract_dims[0] *= out.dimension(i);
+      output_dims[0] *= out.dimension(i);
     }
   }
 
-  // The input has dimensions in_depth X (input_planes * input_rows *
-  // input_cols) X OTHERS
-  DSizes<TensorIndex, 3> input_dims;
+  // Reshaped extract_volume_patches(in)
+  DSizes<TensorIndex, 2> pre_contract_dims;
   if (isColMajor) {
-    input_dims[0] = kernelChannels;
-    input_dims[1] = inputRows * inputCols * inputPlanes;
-    input_dims[2] = 1;
+    pre_contract_dims[0] =
+        kernelChannels * kernelPlanes * kernelRows * kernelCols;
+    pre_contract_dims[1] = outputPlanes * outputRows * outputCols;
     for (int i = 4; i < NumDims; ++i) {
-      input_dims[2] *= in.dimension(i);
+      pre_contract_dims[1] *= in.dimension(i);
     }
-    eigen_assert(input_dims[2] == pre_contract_dims[3]);
+    eigen_assert(output_dims[1] == pre_contract_dims[1]);
   } else {
-    input_dims[2] = kernelChannels;
-    input_dims[1] = inputRows * inputCols * inputPlanes;
-    input_dims[0] = 1;
+    pre_contract_dims[1] =
+        kernelCols * kernelRows * kernelPlanes * kernelChannels;
+    pre_contract_dims[0] = outputCols * outputRows * outputPlanes;
     for (int i = 0; i < NumDims - 4; ++i) {
-      input_dims[0] *= in.dimension(i);
+      pre_contract_dims[0] *= in.dimension(i);
     }
-    eigen_assert(input_dims[0] == pre_contract_dims[0]);
+    eigen_assert(output_dims[0] == pre_contract_dims[0]);
   }
 
-  // We will contract along dimensions (1, 2) in and (1, 3) in out, if
-  // this is col-major.
-  // For row-major, it's dimensions (0, 1) in and (0, 2) in out.
-  array<IndexPair<TensorIndex>, 2> contract_dims;
-  if (isColMajor) {
-    // col-major: in.contract(output.patches)
-    contract_dims[0] = IndexPair<TensorIndex>(1, 1);
-    contract_dims[1] = IndexPair<TensorIndex>(2, 3);
-  } else {
-    // row-major: output.patches.contract(in)
-    contract_dims[0] = IndexPair<TensorIndex>(0, 0);
-    contract_dims[1] = IndexPair<TensorIndex>(2, 1);
-  }
+  array<TensorIndex, 2> shuffle_dims;
+  shuffle_dims[0] = 1;
+  shuffle_dims[1] = 0;
 
-  // After the contraction, the kernel will have dimension
-  //   in_depth X out_depth X kernel_patches X kernel_rows X kernel_cols
-  // We will need to shuffle the first two dimensions and reverse the spatial
-  // dimensions.
-  // The end shape is:
-  //   out_depth X in_shape X kernel_planes X kernel_rows X kernel_cols
+  array<IndexPair<TensorIndex>, 1> contract_dims;
+  contract_dims[0] = IndexPair<TensorIndex>(1, 0);
 
-  // This is the shape of the kernel *before* the shuffling.
   DSizes<TensorIndex, 5> kernel_dims;
   if (isColMajor) {
-    kernel_dims[0] = kernelChannels;
-    kernel_dims[1] = kernelFilters;
+    kernel_dims[0] = kernelFilters;
+    kernel_dims[1] = kernelChannels;
     kernel_dims[2] = kernelPlanes;
     kernel_dims[3] = kernelRows;
     kernel_dims[4] = kernelCols;
   } else {
-    kernel_dims[0] = kernelCols;
-    kernel_dims[1] = kernelRows;
+    kernel_dims[4] = kernelFilters;
+    kernel_dims[3] = kernelChannels;
     kernel_dims[2] = kernelPlanes;
-    kernel_dims[3] = kernelFilters;
-    kernel_dims[4] = kernelChannels;
-  }
-
-  // Flip filters and channels.
-  array<TensorIndex, 5> kernel_shuffle;
-  if (isColMajor) {
-    kernel_shuffle[0] = 1;
-    kernel_shuffle[1] = 0;
-    kernel_shuffle[2] = 2;
-    kernel_shuffle[3] = 3;
-    kernel_shuffle[4] = 4;
-  } else {
-    kernel_shuffle[0] = 0;
-    kernel_shuffle[1] = 1;
-    kernel_shuffle[2] = 2;
-    kernel_shuffle[3] = 4;
-    kernel_shuffle[4] = 3;
-  }
-
-  // Reverse the spatial dimensions.
-  array<bool, 5> kernel_reverse;
-  if (isColMajor) {
-    kernel_reverse[0] = false;
-    kernel_reverse[1] = false;
-    kernel_reverse[2] = true;
-    kernel_reverse[3] = true;
-    kernel_reverse[4] = true;
-  } else {
-    kernel_reverse[0] = true;
-    kernel_reverse[1] = true;
-    kernel_reverse[2] = true;
-    kernel_reverse[3] = false;
-    kernel_reverse[4] = false;
+    kernel_dims[1] = kernelRows;
+    kernel_dims[0] = kernelCols;
   }
 
-  DSizes<TensorIndex, NumDims> strides;
-  for (int i = 0; i < NumDims; i++) {
-    strides[i] = 1;
-  }
-  if (isColMajor) {
-    strides[1] = stridePlanes;
-    strides[2] = strideRows;
-    strides[3] = strideCols;
-  } else {
-    strides[NumDims - 2] = stridePlanes;
-    strides[NumDims - 3] = strideRows;
-    strides[NumDims - 4] = strideCols;
-  }
   return choose(
       Cond<internal::traits<Input>::Layout == ColMajor>(),
-      input.reshape(input_dims)
-          .contract(output_backward
+      output_backward.reshape(output_dims)
+          .contract(input
                         .extract_volume_patches(
-                            inputPlanes, inputRows, inputCols, 1, 1, 1,
-                            stridePlanes, strideRows, strideCols,
-
-                            padding_ztop, padding_zbottom, padding_top,
-                            padding_bottom, padding_left, padding_right)
-                        .reshape(pre_contract_dims),
+                            kernelPlanes, kernelRows, kernelCols, stridePlanes,
+                            strideRows, strideCols, 1, 1, 1, padding_top_z,
+                            padding_bottom_z, padding_top, padding_bottom,
+                            padding_left, padding_right)
+                        .reshape(pre_contract_dims)
+                        .shuffle(shuffle_dims),
                     contract_dims)
-          .reshape(kernel_dims)
-          .reverse(kernel_reverse)
-          .shuffle(kernel_shuffle),
-      output_backward
-          .extract_volume_patches(inputPlanes, inputRows, inputCols, 1, 1, 1,
-                                  stridePlanes, strideRows, strideCols,
-                                  padding_ztop, padding_zbottom, padding_top,
+          .reshape(kernel_dims),
+      input
+          .extract_volume_patches(kernelPlanes, kernelRows, kernelCols,
+                                  stridePlanes, strideRows, strideCols, 1, 1, 1,
+                                  padding_top_z, padding_bottom_z, padding_top,
                                   padding_bottom, padding_left, padding_right)
           .reshape(pre_contract_dims)
-          .contract(input.reshape(input_dims), contract_dims)
-          .reshape(kernel_dims)
-          .reverse(kernel_reverse)
-          .shuffle(kernel_shuffle));
+          .shuffle(shuffle_dims)
+          .contract(output_backward.reshape(output_dims), contract_dims)
+          .reshape(kernel_dims));
 }
 
 }  // end namespace Eigen
author	Eugene Zhulenev <ezhulenev@google.com>	2018-09-04 13:52:01 -0700
committer	TensorFlower Gardener <gardener@tensorflow.org>	2018-09-04 13:56:35 -0700
commit	ffd9519c3fffe43473f06a1c8fdd12519490db3b (patch)
tree	065aac9a95e45023dff9de2cf53bae9110c19370
parent	0cd9b3e41d993f505feb54ff0b086ffbb21b595d (diff)