[TF:XLA] Split XLA Concat Ops that fail on large sets of inputs.

Make the test large to prevent occasional timeouts on CPU. This should normally complete in well under a minute. PiperOrigin-RevId: 212953337
author: A. Unique TensorFlower <gardener@tensorflow.org> 2018-09-14 02:30:05 -0700
committer: TensorFlower Gardener <gardener@tensorflow.org> 2018-09-14 02:36:29 -0700
commit: 54cbee5d034af8693aa39cc5877c3dfcd62d3740 (patch)
tree: 716ed89052251374c030978107a71da81f7693c8 /tensorflow/compiler/tf2xla
parent: c335f3ae6872715c4873eb8af3ff2e42833bc6c0 (diff)
1 files changed, 32 insertions, 1 deletions
diff --git a/tensorflow/compiler/tf2xla/kernels/concat_op.cc b/tensorflow/compiler/tf2xla/kernels/concat_op.cc
index f410605104..0ae23aa6df 100644
--- a/tensorflow/compiler/tf2xla/kernels/concat_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/concat_op.cc
@@ -37,6 +37,16 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
+// Used to determine the number of Tensors allowed in a Concat op to prevent
+// going over the max gpu parameter memory size. This is an issue because concat
+// is variadic and can have an unlimited number of arguments when called.
+// Concat ops with more Tensors than this will be split into multiple concat
+// ops.
+//
+// TODO(b/112613927): Remove the logic here and put it properly in an HLO pass
+// along with boxing large numbers of parameters.
+constexpr int64 kMaxConcatArgsPerOp = 500;
+
 // --------------------------------------------------------------------------
 class ConcatBaseOp : public XlaOpKernel {
  public:
@@ -74,6 +84,7 @@ class ConcatBaseOp : public XlaOpKernel {
     // Make a vector holding the XlaOp for each of the inputs that has non-zero
     // elements.
     std::vector<xla::XlaOp> input_data;
+    std::vector<xla::XlaOp> partial_concats;
     int output_concat_dim = 0;
     const bool input_is_scalar = IsLegacyScalar(input_shape);
     for (int i = 0; i < N; ++i) {
@@ -94,10 +105,30 @@ class ConcatBaseOp : public XlaOpKernel {
         input_data.push_back(handle);
       }
       output_concat_dim += in_shape.dims() > 0 ? in_shape.dim_size(axis) : 1;
+
+      // Concat is associative, so it can be split into many operations when too
+      // many arguments are in a single op. This is a temporary workaround for
+      // b/112613927 where too many parameters in an XlaLaunchOp later result in
+      // too many parameters to a single GPU kernel.
+      if (i && i % kMaxConcatArgsPerOp == 0) {
+        partial_concats.push_back(
+            xla::ConcatInDim(ctx->builder(), input_data, axis));
+        input_data.clear();
+      }
     }
+    // Add any inputs that have not been put into another concat yet.
+    partial_concats.insert(partial_concats.end(), input_data.begin(),
+                           input_data.end());
 
     VLOG(1) << "Concat dim " << concat_dim << " equivalent to " << axis;
-    ctx->SetOutput(0, xla::ConcatInDim(ctx->builder(), input_data, axis));
+    // Don't add an additional "identity" concatenate for better readibility of
+    // IR.
+    if (partial_concats.size() == 1) {
+      ctx->SetOutput(0, partial_concats.front());
+    } else {
+      ctx->SetOutput(0,
+                     xla::ConcatInDim(ctx->builder(), partial_concats, axis));
+    }
   }
 
  private:
author	A. Unique TensorFlower <gardener@tensorflow.org>	2018-09-14 02:30:05 -0700
committer	TensorFlower Gardener <gardener@tensorflow.org>	2018-09-14 02:36:29 -0700
commit	54cbee5d034af8693aa39cc5877c3dfcd62d3740 (patch)
tree	716ed89052251374c030978107a71da81f7693c8 /tensorflow/compiler/tf2xla
parent	c335f3ae6872715c4873eb8af3ff2e42833bc6c0 (diff)