Adding autotune to the Cudnn conv algorithm selection.

For now, use TF_CUDNN_USE_AUTOTUNE=1 to enable this feature. Once it is mature enough, it will be turned on by default. Support for the backward steps will be added later. Change: 121769364
author: Xiaoqiang Zheng <zhengxq@google.com> 2016-05-07 22:22:44 -0800
committer: TensorFlower Gardener <gardener@tensorflow.org> 2016-05-09 09:21:14 -0700
commit: 6ff265ebae14586db5db623b7502ddbdbc8bbd12 (patch)
tree: d4df06ea887dfb603b22c9df1147b41b38994b54 /tensorflow/stream_executor
parent: a846576a67da4bdd0b610a17b5c8d0d92e41f094 (diff)
9 files changed, 248 insertions, 41 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index fbaa42effb..f35c59a82a 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <dlfcn.h>
 #include <functional>
+#include <memory>
 
 #include "tensorflow/stream_executor/cuda/cuda_activation.h"
 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
@@ -24,6 +25,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
 #include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
+#include "tensorflow/stream_executor/cuda/cuda_timer.h"
 #include "tensorflow/stream_executor/dnn.h"
 #include "tensorflow/stream_executor/dso_loader.h"
 #include "tensorflow/stream_executor/lib/env.h"
@@ -36,7 +38,9 @@ limitations under the License.
 #include "tensorflow/stream_executor/scratch_allocator.h"
 #include "tensorflow/stream_executor/stream.h"
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
+// clang-format off
 #include "third_party/gpus/cuda/include/cudnn.h"
+// clang-format on
 
 namespace {
 
@@ -255,6 +259,22 @@ cudnnHandle_t ToHandle(void* opaque_handle) {
   return static_cast<cudnnHandle_t>(opaque_handle);
 }
 
+cudnnConvolutionFwdAlgo_t ToConvForwardAlgo(dnn::AlgorithmType algorithm) {
+  cudnnConvolutionFwdAlgo_t algo = cudnnConvolutionFwdAlgo_t(algorithm);
+  switch (algo) {
+    case CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM:
+    case CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM:
+    case CUDNN_CONVOLUTION_FWD_ALGO_GEMM:
+    case CUDNN_CONVOLUTION_FWD_ALGO_DIRECT:
+    case CUDNN_CONVOLUTION_FWD_ALGO_FFT:
+    case CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING:
+      return algo;
+    default:
+      LOG(FATAL) << "Unsupported Cudnn convolution forward algorithm: "
+                 << algorithm;
+  }
+}
+
 }  // namespace
 
 CudnnSupport::CudnnSupport(CUDAExecutor* parent)
@@ -647,7 +667,8 @@ bool CudnnSupport::DoConvolve(
     const DeviceMemory<float>& filter_data,
     const ConvolutionDescriptor& convolution_descriptor,
     const BatchDescriptor& output_descriptor, DeviceMemory<float>* output_data,
-    ScratchAllocator* scratch_allocator) {
+    ScratchAllocator* scratch_allocator, dnn::AlgorithmType algorithm,
+    dnn::ProfileResult* output_profile_result) {
   ScopedTensorDescriptor input_nd{parent_, batch_descriptor, CUDNN_DATA_FLOAT};
   ScopedTensorDescriptor output_nd{parent_, output_descriptor,
                                    CUDNN_DATA_FLOAT};
@@ -667,52 +688,101 @@ bool CudnnSupport::DoConvolve(
   // Beta is the scaling factor for output.
   float beta = 0.0;
 
-  auto get_algorithm = [&](bool specify_limit)
-      SHARED_LOCKS_REQUIRED(dnn_handle_mutex_) {
-        cudnnConvolutionFwdPreference_t preference =
-            specify_limit ? CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
-                          : CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
-
-        auto memory_limit_bytes =
-            scratch_allocator == nullptr
-                ? 0
-                : scratch_allocator->GetMemoryLimitInBytes(stream);
-        if (memory_limit_bytes < 0) {
-          memory_limit_bytes = 0;
-        }
+  const bool is_profiling = output_profile_result != nullptr;
+  cudnnConvolutionFwdAlgo_t algo;
+  DeviceMemory<uint8> scratch;
 
-        cudnnConvolutionFwdAlgo_t algo;
-        status = dynload::cudnnGetConvolutionForwardAlgorithm(
-            parent_, ToHandle(dnn_handle_), input_nd.handle(), filter.handle(),
-            conv.handle(), output_nd.handle(),
-            /*preference=*/preference,
-            /*memoryLimitInBytes=*/memory_limit_bytes, /*algo=*/&algo);
-        CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << "Unable to find a suitable "
-                                                  "algorithm for doing forward "
-                                                  "convolution";
-        return algo;
-      };
+  if (algorithm == dnn::kDefaultAlgorithm) {
+    // With the default algorithm, use Cudnn's heuristics.
+    auto get_algorithm = [&](bool specify_limit)
+        SHARED_LOCKS_REQUIRED(dnn_handle_mutex_) {
+          cudnnConvolutionFwdPreference_t preference =
+              specify_limit ? CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
+                            : CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
+
+          auto memory_limit_bytes =
+              scratch_allocator == nullptr
+                  ? 0
+                  : scratch_allocator->GetMemoryLimitInBytes(stream);
+          if (memory_limit_bytes < 0) {
+            memory_limit_bytes = 0;
+          }
+
+          cudnnConvolutionFwdAlgo_t algo_to_use;
+          status = dynload::cudnnGetConvolutionForwardAlgorithm(
+              parent_, ToHandle(dnn_handle_), input_nd.handle(),
+              filter.handle(), conv.handle(), output_nd.handle(),
+              /*preference=*/preference,
+              /*memoryLimitInBytes=*/memory_limit_bytes,
+              /*algo=*/&algo_to_use);
+          CHECK_EQ(status, CUDNN_STATUS_SUCCESS)
+              << "Unable to find a suitable "
+                 "algorithm for doing forward "
+                 "convolution";
+          return algo_to_use;
+        };
+
+    algo = get_algorithm(/*specify_limit=*/scratch_allocator != nullptr);
+
+    if (scratch_allocator != nullptr) {
+      size_t size_in_bytes;
+      status = dynload::cudnnGetConvolutionForwardWorkspaceSize(
+          parent_, ToHandle(dnn_handle_), /*srcDesc=*/input_nd.handle(),
+          /*filterDesc=*/filter.handle(), /*convDesc=*/conv.handle(),
+          /*destDesc=*/output_nd.handle(), /*algo=*/algo,
+          /*sizeInBytes=*/&size_in_bytes);
+      if (status == CUDNN_STATUS_SUCCESS && size_in_bytes != 0) {
+        scratch = scratch_allocator->AllocateBytes(stream, size_in_bytes)
+                      .ValueOrDie();
+      }
+    }
 
-  auto algo = get_algorithm(/*specify_limit=*/scratch_allocator != nullptr);
+    // If we didn't allocate any scratch space (perhaps because of failed
+    // allocation), we force a switch back to the "no workspace" algorithm.
+    if (scratch == nullptr) {
+      algo = get_algorithm(/*specify_limit=*/false);
+    }
+  } else {
+    // An algorithm has been specified.
+    algo = ToConvForwardAlgo(algorithm);
 
-  DeviceMemory<uint8> scratch;
-  if (scratch_allocator != nullptr) {
     size_t size_in_bytes;
     status = dynload::cudnnGetConvolutionForwardWorkspaceSize(
         parent_, ToHandle(dnn_handle_), /*srcDesc=*/input_nd.handle(),
         /*filterDesc=*/filter.handle(), /*convDesc=*/conv.handle(),
         /*destDesc=*/output_nd.handle(), /*algo=*/algo,
         /*sizeInBytes=*/&size_in_bytes);
-    if (status == CUDNN_STATUS_SUCCESS && size_in_bytes != 0) {
-      scratch =
-          scratch_allocator->AllocateBytes(stream, size_in_bytes).ValueOrDie();
+    if (status != CUDNN_STATUS_SUCCESS) {
+      if (is_profiling) {
+        // Silently return when we are profiling.
+        return false;
+      }
+      LOG(FATAL) << "Cannot query the size of workspace needed for the given "
+                    "algorithm: "
+                 << algorithm;
+    }
+    if (size_in_bytes != 0) {
+      if (scratch_allocator == nullptr) {
+        LOG(FATAL) << "An allocator must be specified when scratch memory is "
+                      "needed";
+      }
+      auto allocated = scratch_allocator->AllocateBytes(stream, size_in_bytes);
+      if (is_profiling && !allocated.ok()) {
+        // Silently return when we are profiling.
+        return false;
+      }
+      scratch = allocated.ValueOrDie();
     }
   }
 
-  // If we didn't allocate any scratch space (perhaps because of failed
-  // allocation), we force a switch back to the "no workspace" algorithm.
-  if (scratch == nullptr) {
-    algo = get_algorithm(/*specify_limit=*/false);
+  std::unique_ptr<CUDATimer> timer;
+  if (is_profiling) {
+    timer.reset(new CUDATimer(parent_));
+    timer->Init();
+    // The start and stop of the timer should be as close to the Cudnn call as
+    // possible. It is still possible for other threads to issue workload on
+    // to this stream. So it could take multiple profiling measurements.
+    timer->Start(AsCUDAStream(stream));
   }
 
   status = dynload::cudnnConvolutionForward(
@@ -725,11 +795,38 @@ bool CudnnSupport::DoConvolve(
       /*destDesc=*/output_nd.handle(), /*destData=*/output_data->opaque());
 
   if (status != CUDNN_STATUS_SUCCESS) {
+    if (is_profiling) {
+      // Silently return when we are profiling.
+      return false;
+    }
     LOG(FATAL) << "failed to enqueue convolution on stream: "
                << ToString(status);
-    return false;
   }
 
+  if (is_profiling) {
+    timer->Stop(AsCUDAStream(stream));
+    output_profile_result->set_is_valid(true);
+    output_profile_result->set_algorithm(algo);
+    output_profile_result->set_elapsed_time_in_ms(
+        timer->GetElapsedMilliseconds());
+    timer->Destroy();
+  }
+
+  return true;
+}
+
+bool CudnnSupport::GetConvolveAlgorithms(
+    std::vector<dnn::AlgorithmType>* out_algorithms) {
+  out_algorithms->assign({
+      // clang-format off
+      CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
+      CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM,
+      CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
+      CUDNN_CONVOLUTION_FWD_ALGO_DIRECT,
+      CUDNN_CONVOLUTION_FWD_ALGO_FFT,
+      CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING,
+      // clang-format on
+  });
   return true;
 }
 
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index 37be60ec63..76af118962 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -44,6 +44,9 @@ class CudnnSupport : public dnn::DnnSupport {
 
   port::Status Init() override;
 
+  bool GetConvolveAlgorithms(
+      std::vector<dnn::AlgorithmType>* out_algorithms) override;
+
   bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& input_descriptor,
                   const DeviceMemory<float>& input_data,
                   const dnn::FilterDescriptor& filter_descriptor,
@@ -51,7 +54,9 @@ class CudnnSupport : public dnn::DnnSupport {
                   const dnn::ConvolutionDescriptor& convolution_descriptor,
                   const dnn::BatchDescriptor& output_descriptor,
                   DeviceMemory<float>* output_data,
-                  ScratchAllocator* scratch_allocator) override;
+                  ScratchAllocator* scratch_allocator,
+                  dnn::AlgorithmType algorithm,
+                  dnn::ProfileResult* output_profile_result) override;
 
   bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
                   const DeviceMemory<double>& input_data,
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index c88dc88d29..66cfccdd90 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -94,6 +94,7 @@ PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventDestroy_v2);
 PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventElapsedTime);
 PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventQuery);
 PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventRecord);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventSynchronize);
 PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuFuncGetAttribute);
 PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuFuncSetCacheConfig);
 PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuGetErrorName);
@@ -1069,7 +1070,14 @@ CUDADriver::ContextGetSharedMemConfig(CUcontext context) {
                                                   float *elapsed_milliseconds,
                                                   CUevent start, CUevent stop) {
   ScopedActivateContext activated{context};
-  CUresult res = dynload::cuEventElapsedTime(elapsed_milliseconds, start, stop);
+  // The stop event must have completed in order for cuEventElapsedTime to
+  // work.
+  CUresult res = dynload::cuEventSynchronize(stop);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to synchronize the stop event: " << ToString(res);
+    return false;
+  }
+  res = dynload::cuEventElapsedTime(elapsed_milliseconds, start, stop);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to get elapsed time between events: "
                << ToString(res);
diff --git a/tensorflow/stream_executor/dnn.cc b/tensorflow/stream_executor/dnn.cc
index 41f3bf83d3..7e98db723f 100644
--- a/tensorflow/stream_executor/dnn.cc
+++ b/tensorflow/stream_executor/dnn.cc
@@ -22,6 +22,11 @@ namespace perftools {
 namespace gputools {
 namespace dnn {
 
+bool DnnSupport::GetConvolveAlgorithms(
+    std::vector<AlgorithmType>* out_algorithms) {
+  return false;
+}
+
 string QuantizedActivationModeString(QuantizedActivationMode mode) {
   switch (mode) {
     case dnn::QuantizedActivationMode::k8Bit:
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index 3d19f481c9..8cba8295db 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -527,6 +527,30 @@ class PoolingDescriptor {
   std::vector<int64> strides_;
 };
 
+typedef int64 AlgorithmType;
+constexpr AlgorithmType kDefaultAlgorithm = -1;
+
+// Describes the result from a perf experiment.
+//
+// Arguments:
+//  is_valid: indicates whether a valid measurement was obtained.
+//  algorithm: returns the exact algorithm that was used.
+//  elapsed_time_in_ms: returns the measured elapsed time in milliseconds.
+class ProfileResult {
+ public:
+  bool is_valid() const { return is_valid_; }
+  void set_is_valid(bool val) { is_valid_ = val; }
+  AlgorithmType algorithm() const { return algorithm_; }
+  void set_algorithm(AlgorithmType val) { algorithm_ = val; }
+  float elapsed_time_in_ms() const { return elapsed_time_in_ms_; }
+  void set_elapsed_time_in_ms(float val) { elapsed_time_in_ms_ = val; }
+
+ private:
+  bool is_valid_ = false;
+  AlgorithmType algorithm_ = kDefaultAlgorithm;
+  float elapsed_time_in_ms_ = -1.0f;
+};
+
 // Describes a local response normalization (LRN). LRN is used e.g. in
 // dist_belief.
 //
@@ -655,6 +679,12 @@ class DnnSupport {
   //    convolution result.
   //  scratch_allocator: un-owned, may-be-null object that may allocate scratch
   //    space in order to speed up the convolution operation.
+  //  algorithm: an integer to specify which algorithm should be used for the
+  //    operation. kDefaultAlgorithm means the system will pick an algorithm
+  //    by default. The coding of the algorithm is be interpretted by the
+  //    underlying implementation.
+  //  output_profile_result: the output profile result for this call. The
+  //    profiling is only enabled when this is not nullptr.
   //
   // input_descriptor, filter_descriptor, convolution_descriptor and
   // output_descriptor together specify exactly how the convolution is aligned
@@ -677,8 +707,12 @@ class DnnSupport {
       const DeviceMemory<float>& filter_data,
       const dnn::ConvolutionDescriptor& convolution_descriptor,
       const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<float>* output_data,
-      ScratchAllocator* scratch_allocator) = 0;
+      DeviceMemory<float>* output_data, ScratchAllocator* scratch_allocator,
+      AlgorithmType algorithm, ProfileResult* output_profile_result) = 0;
+
+  // Return a list of algorithms supported by the forward convolution pass.
+  virtual bool GetConvolveAlgorithms(
+      std::vector<AlgorithmType>* out_algorithms);
 
   // Enqueues a double-precision convolution operation onto the stream.
   // See DoConvolve above for argument details.
diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc
index cee781f77b..b02df02c90 100644
--- a/tensorflow/stream_executor/stream.cc
+++ b/tensorflow/stream_executor/stream.cc
@@ -295,7 +295,41 @@ Stream &Stream::ThenConvolveWithScratch(
       CheckError(dnn->DoConvolve(
           this, input_descriptor, input_data, filter_descriptor, filter_data,
           convolution_descriptor, output_descriptor, output,
-          /*scratch_allocator=*/scratch_allocator));
+          /*scratch_allocator=*/scratch_allocator, dnn::kDefaultAlgorithm,
+          nullptr));
+    } else {
+      SetError();
+      LOG(WARNING)
+          << "attempting to perform DNN operation using StreamExecutor "
+             "without DNN support";
+    }
+  }
+  return *this;
+}
+
+Stream &Stream::ThenConvolveWithAlgorithm(
+    const dnn::BatchDescriptor &input_descriptor,
+    const DeviceMemory<float> &input_data,
+    const dnn::FilterDescriptor &filter_descriptor,
+    const DeviceMemory<float> &filter_data,
+    const dnn::ConvolutionDescriptor &convolution_descriptor,
+    const dnn::BatchDescriptor &output_descriptor, DeviceMemory<float> *output,
+    ScratchAllocator *scratch_allocator, dnn::AlgorithmType algorithm,
+    dnn::ProfileResult *output_profile_result) {
+  VLOG_CALL(PARAM(input_descriptor), PARAM(input_data),
+            PARAM(filter_descriptor), PARAM(filter_data),
+            PARAM(convolution_descriptor), PARAM(output_descriptor),
+            PARAM(output));
+
+  if (ok()) {
+    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
+      auto status = dnn->DoConvolve(
+          this, input_descriptor, input_data, filter_descriptor, filter_data,
+          convolution_descriptor, output_descriptor, output, scratch_allocator,
+          algorithm, output_profile_result);
+      if (!status && !output_profile_result) {
+        SetError();
+      }
     } else {
       SetError();
       LOG(WARNING)
diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h
index 599146f49b..ca52949657 100644
--- a/tensorflow/stream_executor/stream.h
+++ b/tensorflow/stream_executor/stream.h
@@ -66,6 +66,8 @@ namespace dnn {
 struct BatchDescriptor;
 struct FilterDescriptor;
 struct ConvolutionDescriptor;
+struct ProfileResult;
+typedef int64 AlgorithmType;
 }  // namespace dnn
 
 class StreamExecutor;
@@ -228,6 +230,16 @@ class Stream {
       const dnn::BatchDescriptor &output_descriptor,
       DeviceMemory<float> *output, ScratchAllocator *scratch_allocator);
 
+  Stream &ThenConvolveWithAlgorithm(
+      const dnn::BatchDescriptor &input_descriptor,
+      const DeviceMemory<float> &input_data,
+      const dnn::FilterDescriptor &filter_descriptor,
+      const DeviceMemory<float> &filter_data,
+      const dnn::ConvolutionDescriptor &convolution_descriptor,
+      const dnn::BatchDescriptor &output_descriptor,
+      DeviceMemory<float> *output, ScratchAllocator *scratch_allocator,
+      dnn::AlgorithmType algorithm, dnn::ProfileResult *output_profile_result);
+
   Stream &ThenSeparableConvolve(
       const dnn::BatchDescriptor &input_descriptor,
       const DeviceMemory<float> &input_data,
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc
index acaa0efcb2..fe32039d71 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.cc
+++ b/tensorflow/stream_executor/stream_executor_pimpl.cc
@@ -286,6 +286,15 @@ bool StreamExecutor::SupportsDnn() const {
   return implementation_->SupportsDnn();
 }
 
+bool StreamExecutor::GetConvolveAlgorithms(
+    std::vector<dnn::AlgorithmType> *out_algorithms) {
+  dnn::DnnSupport *dnn_support = AsDnn();
+  if (!dnn_support) {
+    return false;
+  }
+  return dnn_support->GetConvolveAlgorithms(out_algorithms);
+}
+
 dnn::DnnSupport *StreamExecutor::AsDnn() {
   mutex_lock lock{mu_};
   if (dnn_ != nullptr) {
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h
index f624e0fcdb..31b110a8e0 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.h
+++ b/tensorflow/stream_executor/stream_executor_pimpl.h
@@ -338,6 +338,9 @@ class StreamExecutor {
   // platform that underlies this interface.
   bool SupportsDnn() const;
 
+  // Get the list of supported algorithms for the forward convolution opeartion.
+  bool GetConvolveAlgorithms(std::vector<dnn::AlgorithmType> *out_algorithms);
+
   // Returns the device ordinal that this StreamExecutor was initialized with.
   // Meaningless before initialization.
   int device_ordinal() const { return device_ordinal_; }
author	Xiaoqiang Zheng <zhengxq@google.com>	2016-05-07 22:22:44 -0800
committer	TensorFlower Gardener <gardener@tensorflow.org>	2016-05-09 09:21:14 -0700
commit	6ff265ebae14586db5db623b7502ddbdbc8bbd12 (patch)
tree	d4df06ea887dfb603b22c9df1147b41b38994b54 /tensorflow/stream_executor
parent	a846576a67da4bdd0b610a17b5c8d0d92e41f094 (diff)