author    Justin Lebar <jlebar@google.com>  2017-12-05 18:13:48 -0800
committer TensorFlower Gardener <gardener@tensorflow.org>  2017-12-05 18:17:54 -0800
commit    1a786ab335aabe9020cff4f0ab69a5844de70fbc (patch)
tree      40372760fb79097b43657620cca85b69e5651c54
parent    cefd2c73cd785c201e9c0cb9890b2bff9310021c (diff)
[XLA:GPU] Don't autotune while other kernels are running.
XLA:GPU autotunes gemm and conv thunks, trying multiple algorithms in sequence and picking the fastest one. If other work is running concurrently with our autotuning, this can mess up the results. In particular, even if the GPU is totally deterministic, the concurrent work may finish before we finish autotuning, giving an unfair advantage to the later algorithms.

To address this, we modify GpuExecutable to wait until the GPU is quiescent before executing a thunk which performs autotuning. We then cross our fingers and hope that whatever is fastest while the GPU is quiescent will also be fastest in the "real world", with (potentially) concurrent work going on.

PiperOrigin-RevId: 178041481
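As a rough illustration of why quiescence matters, consider a sequential tuning sweep like the sketch below (illustrative code only, not the thunks' actual tuning loop; all names are made up): each candidate algorithm is timed back to back, so unrelated work that drains off the GPU partway through the sweep systematically flatters the later candidates.

#include <chrono>
#include <functional>
#include <limits>
#include <vector>

struct Candidate {
  int algorithm_id;
  std::function<void()> run;  // Launches the kernel and blocks until it finishes.
};

// Times each candidate in sequence and returns the id of the fastest one.
// The result is only trustworthy if nothing else runs on the device meanwhile.
int PickFastest(const std::vector<Candidate>& candidates) {
  double best_ms = std::numeric_limits<double>::infinity();
  int best_id = -1;
  for (const Candidate& c : candidates) {
    auto start = std::chrono::steady_clock::now();
    c.run();
    auto end = std::chrono::steady_clock::now();
    double ms = std::chrono::duration<double, std::milli>(end - start).count();
    if (ms < best_ms) {
      best_ms = ms;
      best_id = c.algorithm_id;
    }
  }
  return best_id;
}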
-rw-r--r--  tensorflow/compiler/xla/service/gpu/convolution_thunk.cc   18
-rw-r--r--  tensorflow/compiler/xla/service/gpu/convolution_thunk.h    16
-rw-r--r--  tensorflow/compiler/xla/service/gpu/gemm_thunk.h             9
-rw-r--r--  tensorflow/compiler/xla/service/gpu/gpu_executable.cc        9
-rw-r--r--  tensorflow/compiler/xla/service/gpu/thunk.h                 13
5 files changed, 53 insertions(+), 12 deletions(-)
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
index 037eec8ef5..899cc5c83b 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
@@ -314,7 +314,9 @@ tensorflow::Status ConvolutionThunk::ConvolveWithTune(
const ConvolutionDescriptor& convolution_descriptor,
const BufferAllocations& buffer_allocations, se::Stream* stream) {
// TODO(b/29126320): Try cudnn v5's new auto-tuner when it's rolled out.
- if (best_algorithm_.algorithm().is_default()) {
+ if (!best_algorithm_.has_value()) {
+ best_algorithm_.emplace();
+
// Auto-tuning either is disabled or only happens in the first run of this
// function.
VLOG(2) << "Profiling for best convolution algorithm used for "
@@ -363,35 +365,35 @@ tensorflow::Status ConvolutionThunk::ConvolveWithTune(
}
if (best_result.is_valid()) {
- best_algorithm_.set_algorithm(best_result.algorithm());
+ best_algorithm_->set_algorithm(best_result.algorithm());
} else {
LOG(ERROR) << "No convolution algorithm works with profiling. Fall back "
"to the default algorithm.";
- best_algorithm_.set_algorithm(AlgorithmDesc());
+ best_algorithm_->set_algorithm(AlgorithmDesc());
}
if (best_result_without_scratch.is_valid()) {
- best_algorithm_.set_algorithm_no_scratch(
+ best_algorithm_->set_algorithm_no_scratch(
best_result_without_scratch.algorithm());
} else {
LOG(ERROR) << "No convolution algorithm without scratch works with "
"profiling. Fall back "
"to the default algorithm.";
- best_algorithm_.set_algorithm_no_scratch(AlgorithmDesc());
+ best_algorithm_->set_algorithm_no_scratch(AlgorithmDesc());
}
}
{
VLOG(2) << "Using convolution algorithm ("
- << AlgorithmToString(best_algorithm_.algorithm()) << ", "
- << AlgorithmToString(best_algorithm_.algorithm_no_scratch())
+ << AlgorithmToString(best_algorithm_->algorithm()) << ", "
+ << AlgorithmToString(best_algorithm_->algorithm_no_scratch())
<< ") for ConvolutionThunk: " << this;
ConvolveScratchAllocator scratch_allocator(
buffer_allocations.device_ordinal(),
buffer_allocations.memory_allocator());
return Convolve(input_descriptor, input_data, filter_descriptor,
filter_data, output_descriptor, output_data,
- convolution_descriptor, best_algorithm_, stream,
+ convolution_descriptor, *best_algorithm_, stream,
&scratch_allocator, nullptr);
}
}
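The change above replaces the old sentinel check ("is the algorithm still the default?") with an optional, so "have we autotuned yet?" becomes explicit state that ShouldHaltAllActivityBeforeRunning can query. A minimal sketch of that tune-once pattern, using std::optional in place of tensorflow::gtl::optional (Config, Tune, and Execute are placeholders, not XLA's types):

#include <optional>

struct Config {
  int algorithm = 0;
  int algorithm_no_scratch = 0;
};

class TunedOp {
 public:
  void Run() {
    if (!best_.has_value()) {
      best_.emplace();   // First call: default-construct, then fill in the tuned result.
      *best_ = Tune();
    }
    Execute(*best_);     // Subsequent calls reuse the cached configuration.
  }

 private:
  Config Tune() { return Config{1, 2}; }
  void Execute(const Config&) {}

  std::optional<Config> best_;  // Empty until the first (autotuning) run.
};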
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h
index 5ac5db2f04..7c25a2e645 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h
@@ -24,6 +24,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/optional.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
namespace xla {
@@ -87,6 +88,14 @@ class ConvolutionThunk : public Thunk {
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
+ // Returns true if the next run of ExecuteOnStream will do autotuning. If so,
+ // we want the GPU to be quiescent during autotuning, so as not to introduce
+ // noise in our results.
+ bool ShouldHaltAllActivityBeforeRunning(
+ perftools::gputools::Stream*) override {
+ return !best_algorithm_.has_value();
+ }
+
private:
tensorflow::Status ConvolveWithTune(
const perftools::gputools::dnn::BatchDescriptor& input_descriptor,
@@ -121,9 +130,10 @@ class ConvolutionThunk : public Thunk {
// Fastest cuDNN convolution algorithm for this thunk learned from
// auto-tuning. If auto-tuning is disabled or failed, best_algorithm_ is set
- // to the default value indicating cuDNN's convolution will choose
- // the best algorithm from some heuristics based on its parameters.
- perftools::gputools::dnn::AlgorithmConfig best_algorithm_;
+ // to the default value, indicating cuDNN's convolution will choose the best
+ // algorithm from some heuristics based on its parameters.
+ tensorflow::gtl::optional<perftools::gputools::dnn::AlgorithmConfig>
+ best_algorithm_;
const ConvolutionKind convolution_kind_;
diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h
index 983cb87292..8c6a1f51a8 100644
--- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h
@@ -52,6 +52,15 @@ class GemmThunk : public Thunk {
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
+ // Returns true if we'll perform autotuning if run on the given stream. If
+ // so, we want the GPU to be quiescent during autotuning, so as not to
+ // introduce noise in our results.
+ bool ShouldHaltAllActivityBeforeRunning(
+ perftools::gputools::Stream* stream) override {
+ return autotune_results_.count(
stream->parent()->GetDeviceDescription().name()) == 0;
+ }
+
private:
const BufferAllocation::Slice lhs_buffer_;
const BufferAllocation::Slice rhs_buffer_;
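Unlike ConvolutionThunk, GemmThunk caches its autotune results per device (keyed by device name), so the halt-before-running decision is per device as well. A standalone sketch of that shape, with the check phrased as "no cached result yet means the next run will autotune" (all names here are illustrative, not GemmThunk's members):

#include <string>
#include <unordered_map>

struct Algorithm {
  int id = -1;
};

class PerDeviceTuner {
 public:
  // True when this device has no cached result yet, i.e. the next run will
  // autotune and therefore wants a quiescent GPU.
  bool WillAutotune(const std::string& device_name) const {
    return results_.count(device_name) == 0;
  }

  const Algorithm& GetOrTune(const std::string& device_name) {
    auto it = results_.find(device_name);
    if (it == results_.end()) {
      it = results_.emplace(device_name, Tune()).first;
    }
    return it->second;
  }

 private:
  static Algorithm Tune() { return Algorithm{0}; }
  std::unordered_map<std::string, Algorithm> results_;
};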
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index 0fd85e4fb0..21e9fc96f6 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -167,9 +167,16 @@ Status GpuExecutable::ExecuteThunks(
stream->ThenWaitFor(FindOrDie(thunk_to_finish_event, dependency).get());
}
+ // If this thunk requests it, wait for all currently-executing thunks to
+ // finish. This is useful e.g. if the thunk is about to perform autotuning.
+ if (thunk->ShouldHaltAllActivityBeforeRunning(stream)) {
+ main_stream->BlockHostUntilDone();
+ }
+
profiler.StartOperation();
VLOG(2) << "Executing the thunk for "
- << thunk->hlo_instruction()->ToString();
+ << thunk->hlo_instruction()->ToString() << " on stream "
+ << stream_no;
TF_RETURN_IF_ERROR(thunk->ExecuteOnStream(buffer_allocations, stream));
if (thunk_schedule_->Depended(thunk)) {
auto finish_event = MakeUnique<se::Event>(main_stream->parent());
diff --git a/tensorflow/compiler/xla/service/gpu/thunk.h b/tensorflow/compiler/xla/service/gpu/thunk.h
index 0ff27888ad..486ea7d7e1 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/thunk.h
@@ -70,6 +70,19 @@ class Thunk {
return tensorflow::Status::OK();
}
+ // Users of Thunk should call ShouldHaltAllActivityBeforeRunning(stream)
+ // before calling ExecuteOnStream(stream). If it returns true, it's the
+ // user's responsibility to wait for all activity on the GPU to finish before
+ // calling ExecuteOnStream.
+ //
+ // This value is not required to be constant for a given Thunk. For example,
+ // a Thunk that performs autotuning may return true for its first run and
+ // false thereafter.
+ virtual bool ShouldHaltAllActivityBeforeRunning(
+ perftools::gputools::Stream* /*stream*/) {
+ return false;
+ }
+
// Execute the kernel for the thunk on the given stream. This method must be
// called after Initialize and can be called multiple times over Thunk's
// lifetime. Stream argument must be non-null.
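Putting the pieces together, the new hook is an ordinary virtual with a conservative default of false; only thunks that autotune override it, and they may stop returning true once they have tuned. A self-contained sketch of that contract (Stream is a stand-in for perftools::gputools::Stream, and both classes are illustrative, not XLA's):

struct Stream {};  // Stand-in for perftools::gputools::Stream.

class ThunkBase {
 public:
  virtual ~ThunkBase() = default;
  // Callers check this before ExecuteOnStream(); if it returns true, they must
  // first wait for all GPU activity to finish.
  virtual bool ShouldHaltAllActivityBeforeRunning(Stream* /*stream*/) {
    return false;
  }
  virtual void ExecuteOnStream(Stream* stream) = 0;
};

class AutotuningThunk : public ThunkBase {
 public:
  bool ShouldHaltAllActivityBeforeRunning(Stream* /*stream*/) override {
    return !tuned_;  // Only the first (autotuning) run needs a quiet GPU.
  }
  void ExecuteOnStream(Stream* /*stream*/) override { tuned_ = true; }

 private:
  bool tuned_ = false;
};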