[XLA:GPU] Pad convolution features of size 3 up to 4.

PiperOrigin-RevId: 214532043
author: Justin Lebar <jlebar@google.com> 2018-09-25 17:20:42 -0700
committer: TensorFlower Gardener <gardener@tensorflow.org> 2018-09-25 17:25:18 -0700
commit: 05d103bf25110157c34b9ea6420061a23aa6d4ec (patch)
tree: 2b73a1666f7cfdab5cd34bedc9cdda8cc7316852 /tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores.cc
parent: 4177bc92c3b9301877521ba9b26377b80fa27601 (diff)
1 files changed, 20 insertions, 3 deletions
diff --git a/tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores.cc b/tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores.cc
index 2d270f630b..e3869b5c36 100644
--- a/tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores.cc
+++ b/tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores.cc
@@ -37,15 +37,32 @@ static constexpr int64 kDesiredNumFeaturesFactor = 8;
 // there's additional room for speedups.  Achieving those speedups without also
 // slowing other things down will likely require a more sophisticated heuristic,
 // possibly some form of auto-tuning.
-static constexpr double kMaxBytesTouchedIncrease = 1.2;
+//
+// This value should be >= 4/3, otherwise the "dims of size 3 padded up to 4"
+// special case inside PadShape won't fire.
+static constexpr double kMaxBytesTouchedIncrease = 1.35;
 
 // Pads the given dimensions in the given shape up to a multiple of
 // kDesiredNumFeaturesFactor.
 static Shape PadShape(Shape s, absl::Span<const int64> dims) {
   for (int64 dim : dims) {
     int64 dim_to_pad_size = s.dimensions(dim);
-    int64 new_dim_to_pad_size =
-        RoundUpToNearest(dim_to_pad_size, kDesiredNumFeaturesFactor);
+
+    // Round dim_to_pad_size up to the next multiple of
+    // kDesiredNumFeaturesFactor.
+    //
+    // Special case: dims of size 3 are rounded up to 4, not
+    // kDesiredNumFeaturesFactor.  Empirically (and on the advice of NVIDIA),
+    // this helps, but as of this writing, it's not backed by anything in the
+    // cuDNN docs.
+    int64 new_dim_to_pad_size;
+    if (dim_to_pad_size == 3) {
+      new_dim_to_pad_size = 4;
+    } else {
+      new_dim_to_pad_size =
+          RoundUpToNearest(dim_to_pad_size, kDesiredNumFeaturesFactor);
+    }
+
     s.set_dimensions(dim, new_dim_to_pad_size);
   }
   return s;
author	Justin Lebar <jlebar@google.com>	2018-09-25 17:20:42 -0700
committer	TensorFlower Gardener <gardener@tensorflow.org>	2018-09-25 17:25:18 -0700
commit	05d103bf25110157c34b9ea6420061a23aa6d4ec (patch)
tree	2b73a1666f7cfdab5cd34bedc9cdda8cc7316852 /tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores.cc
parent	4177bc92c3b9301877521ba9b26377b80fa27601 (diff)