| author | Peter Hawkins <phawkins@google.com> | 2017-05-14 19:31:26 -0700 |
|---|---|---|
| committer | TensorFlower Gardener <gardener@tensorflow.org> | 2017-05-14 19:35:08 -0700 |
| commit | 45f4d963f218340cdc3620a9fc0350655023f87d (patch) | |
| tree | e79be3decdc7e521f1c4c121d6cd5b38ad46a960 | |
| parent | e4633442278246a175ea51db01eb16fcbc01342e (diff) | |
Automated g4 rollback of changelist 155305963
PiperOrigin-RevId: 156011931
5 files changed, 37 insertions, 67 deletions
```diff
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index af5cf8ca4b..dc3a289a71 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -188,52 +188,41 @@ tensorflow::Status PrepareHloModuleForIrEmitting(
   return pipeline.Run(hlo_module).status();
 }
 
-// Invokes the ptxas tool on the given PTX string, and stores the resulting
-// SASS in *cubin. If -v 2 or greater, runs ptxas with -v and dumps the
-// resulting stderr (which contains register allocation info, etc.)
-// to VLOG(2). If ptxas binary is not found *sass is set to "".
-Status CompilePTX(const string& ptx, int cc_major, int cc_minor,
-                  string* cubin) {
-  *cubin = "";
-
+// Invokes the ptxas tool on the given PTX string, and dumps its output.
+void DumpPtxasInfo(const string& ptx) {
   const string ptxas_path =
       tensorflow::io::JoinPath(tensorflow::CudaRoot(), "bin/ptxas");
-  // Do not log PTX stats if ptxas is not found at the given path.
-  LOG(INFO) << "Invoking ptxas at path \"" << ptxas_path << "\".";
-  TF_RETURN_IF_ERROR(tensorflow::Env::Default()->FileExists(ptxas_path));
+  if (!tensorflow::Env::Default()->FileExists(ptxas_path).ok()) {
+    LOG(WARNING)
+        << "Failed to dump PTX stats because ptxas is not found at path \""
+        << ptxas_path << "\".";
+    return;
+  }
 
   // Write `ptx` into a temporary file.
   char tempdir_template[] = "/tmp/ptxXXXXXX";
   char* tempdir_name = mkdtemp(tempdir_template);
   CHECK_NOTNULL(tempdir_name);
   string ptx_path = tensorflow::io::JoinPath(tempdir_name, "ptx");
-
   TF_CHECK_OK(
       tensorflow::WriteStringToFile(tensorflow::Env::Default(), ptx_path, ptx));
   LOG(INFO) << "ptx file written to: " << ptx_path;
 
   // Invoke ptxas and collect its output.
-  tensorflow::SubProcess ptxas_info;
-  string arch = tensorflow::strings::StrCat("sm_", cc_major, cc_minor);
-  string cubin_path = tensorflow::io::JoinPath(tempdir_name, "cubin");
-
-  if (VLOG_IS_ON(2)) {
-    ptxas_info.SetProgram(ptxas_path, {ptxas_path, "-v", "-o", cubin_path,
-                                       "-arch", arch, ptx_path});
-  } else {
-    ptxas_info.SetProgram(
-        ptxas_path, {ptxas_path, "-o", cubin_path, "-arch", arch, ptx_path});
-  }
-  ptxas_info.SetChannelAction(tensorflow::CHAN_STDERR, tensorflow::ACTION_PIPE);
-  CHECK(ptxas_info.Start());
+  tensorflow::SubProcess ptxas_info_dumper;
+  ptxas_info_dumper.SetProgram(ptxas_path, {ptxas_path, ptx_path, "-o",
+                                            "/dev/null", "-v", "-arch=sm_35"});
+  ptxas_info_dumper.SetChannelAction(tensorflow::CHAN_STDERR,
+                                     tensorflow::ACTION_PIPE);
+  CHECK(ptxas_info_dumper.Start());
   string stderr_output;
-  int ptxas_exit_status = ptxas_info.Communicate(
+  int exit_status = ptxas_info_dumper.Communicate(
       /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
-
-  TF_RET_CHECK(ptxas_exit_status == 0);
-  return tensorflow::ReadFileToString(tensorflow::Env::Default(), cubin_path,
-                                      cubin);
+  XLA_LOG_LINES(tensorflow::INFO, stderr_output);
+  if (exit_status != 0) {
+    LOG(FATAL) << "Invalid PTX. See the error message above for reasons.";
+  }
 }
 
 }  // namespace
@@ -309,14 +298,10 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
 
   // Reserve space for the PTX to be generated for this module.
   string* ptx;
-  string* cubin;
   {
     tensorflow::mutex_lock lock(mutex_);
     generated_ptxes_.emplace_back(MakeUnique<string>());
     ptx = generated_ptxes_.back().get();
-
-    generated_cubins_.emplace_back(MakeUnique<string>());
-    cubin = generated_cubins_.back().get();
   }
   int cc_major, cc_minor;
   if (!stream_exec->GetDeviceDescription().cuda_compute_capability(&cc_major,
@@ -333,6 +318,9 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
   XLA_VLOG_LINES(2, llvm_ir::DumpModuleToString(llvm_module));
   VLOG(2) << "PTX:";
   XLA_VLOG_LINES(2, *ptx);
+  if (VLOG_IS_ON(2)) {
+    DumpPtxasInfo(*ptx);
+  }
 
   auto thunk_schedule = MakeUnique<ThunkSchedule>(
       ir_emitter.ConsumeThunkSequence(), std::move(stream_assignment),
@@ -340,13 +328,9 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
   VLOG(2) << "Printing the thunk schedule...";
   XLA_VLOG_LINES(2, thunk_schedule->ToString());
 
-  TF_RET_CHECK(CompilePTX(*ptx, cc_major, cc_minor, cubin).ok());
-
   auto* gpu_executable =
-      new GpuExecutable(*cubin, *ptx, {cc_major, cc_minor},
-                        std::move(thunk_schedule), std::move(hlo_module),
+      new GpuExecutable(*ptx, std::move(thunk_schedule), std::move(hlo_module),
                         std::move(module_config), std::move(buffer_assignment));
-
   if (flags->xla_gpu_embed_ir) {
     DCHECK_NE("", ir_module_string_before_opt);
     gpu_executable->set_ir_module_string(ir_module_string_before_opt);
```
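The restored `DumpPtxasInfo` boils down to three steps: write the PTX under a fresh `mkdtemp` directory, run `ptxas -v` with the cubin discarded to `/dev/null`, and log the stderr statistics (register counts, spills, shared-memory usage). Below is a minimal standalone sketch of that flow, using `popen` in place of `tensorflow::SubProcess`; the `-arch=sm_35` default mirrors the diff, while the function name and the sample PTX input are purely illustrative.

```cpp
// Minimal standalone sketch of what DumpPtxasInfo does, outside TensorFlow:
// write PTX to a temp file, run `ptxas -v`, and collect its stderr (which
// carries the register/shared-memory usage stats). Assumes a POSIX system
// with ptxas on PATH; nothing here is a TensorFlow API.
#include <array>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>

std::string DumpPtxasInfoSketch(const std::string& ptx) {
  // Write `ptx` into a unique temporary directory, as the real code does.
  char tempdir_template[] = "/tmp/ptxXXXXXX";
  char* tempdir = mkdtemp(tempdir_template);
  if (tempdir == nullptr) return "mkdtemp failed";
  const std::string ptx_path = std::string(tempdir) + "/ptx";
  std::ofstream(ptx_path) << ptx;

  // `2>&1` folds ptxas's stderr into the pipe; `-o /dev/null` discards the
  // cubin because only the -v statistics are of interest.
  const std::string cmd =
      "ptxas " + ptx_path + " -o /dev/null -v -arch=sm_35 2>&1";
  FILE* pipe = popen(cmd.c_str(), "r");
  if (pipe == nullptr) return "popen failed";

  std::string output;
  std::array<char, 256> buf;
  while (fgets(buf.data(), buf.size(), pipe) != nullptr) output += buf.data();
  if (pclose(pipe) != 0) output += "\n(ptxas reported invalid PTX)";
  return output;
}

int main() {
  // A trivial, empty PTX module; real input would come from the compiler.
  std::cout << DumpPtxasInfoSketch(
      ".version 4.2\n.target sm_35\n.address_size 64\n");
}
```

Unlike the removed `CompilePTX`, which built `sm_<major><minor>` from the device's compute capability, the restored helper pins `-arch=sm_35`, so the dumped statistics are only indicative for other GPU architectures.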
```diff
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
index 99c7ba5199..22f492b422 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
@@ -71,7 +71,6 @@ class GpuCompiler : public Compiler {
   // StreamExecutor (b/24776264).
   tensorflow::mutex mutex_;
   std::vector<std::unique_ptr<string>> generated_ptxes_ GUARDED_BY(mutex_);
-  std::vector<std::unique_ptr<string>> generated_cubins_ GUARDED_BY(mutex_);
 
   // The size in bytes of a pointer. Used for computing ShapeSizeBytes.
   int64 pointer_size_;
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index b4b788162f..32f0368b4b 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -107,17 +107,13 @@ class HloExecutionProfiler {
 
 // Implementation note: HLO profiling is always enabled for GPU executables,
 // since we can use timers around thunks.
-GpuExecutable::GpuExecutable(tensorflow::StringPiece cubin,
-                             tensorflow::StringPiece ptx,
-                             std::pair<int, int> compute_capability,
+GpuExecutable::GpuExecutable(tensorflow::StringPiece ptx,
                              std::unique_ptr<ThunkSchedule> thunk_schedule,
                              std::unique_ptr<HloModule> hlo_module,
                              std::unique_ptr<HloModuleConfig> module_config,
                              std::unique_ptr<BufferAssignment> assignment)
     : Executable(std::move(hlo_module), std::move(module_config)),
-      cubin_(cubin),
       ptx_(ptx),
-      compute_capability_(compute_capability),
       thunk_schedule_(std::move(thunk_schedule)),
       assignment_(std::move(assignment)) {}
 
@@ -190,13 +186,6 @@ StatusOr<se::DeviceMemoryBase> GpuExecutable::ExecuteOnStream(
   // false.
   TF_RET_CHECK(!module_config().has_hybrid_result());
 
-  // Ensure the compute capability of the cubin and the stream match.
-  std::pair<int, int> stream_compute_compatibility;
-  stream->parent()->GetDeviceDescription().cuda_compute_capability(
-      &stream_compute_compatibility.first,
-      &stream_compute_compatibility.second);
-  TF_RET_CHECK(stream_compute_compatibility == compute_capability_);
-
   BufferAllocations::Builder buffer_allocations_builder;
   for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
        ++i) {
```
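One pattern worth noting in the `gpu_compiler.h` hunk above (and its use in `Compile` earlier): `generated_ptxes_` is a mutex-guarded `std::vector<std::unique_ptr<string>>`, so the raw `string*` captured under the lock stays valid even when later compilations grow the vector; the vector's buffer may reallocate, but the heap-allocated strings it owns never move. A small self-contained sketch of that pointer-stability pattern, with hypothetical names:

```cpp
// Hypothetical illustration of GpuCompiler's PTX storage pattern: appending
// to the vector may reallocate its buffer, but each string lives in its own
// heap allocation, so pointers returned from AddSlot() remain valid.
#include <memory>
#include <mutex>
#include <string>
#include <vector>

class PtxStore {
 public:
  // Reserves a stable slot for one module's PTX under the lock and hands
  // back a pointer that outlives any future reallocation of the vector.
  std::string* AddSlot() {
    std::lock_guard<std::mutex> lock(mutex_);
    generated_ptxes_.emplace_back(std::make_unique<std::string>());
    return generated_ptxes_.back().get();
  }

 private:
  std::mutex mutex_;
  std::vector<std::unique_ptr<std::string>> generated_ptxes_;
};

int main() {
  PtxStore store;
  std::string* ptx = store.AddSlot();
  *ptx = ".version 4.2 ...";  // filled in later by code generation
  // Further AddSlot() calls, even from other threads, cannot invalidate ptx.
  std::string* other = store.AddSlot();
  (void)other;
}
```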
```diff
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
index 09a92c4e4c..e308de79ba 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
@@ -40,17 +40,15 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
+// GPU-targeting implementation of the XLA Executable interface.
 //
 // Launches the given CUDA kernel via the StreamExecutor.
-
-// GPUExecutable should eventually be updated to associate a compute
-// capability with the PTX and store multiple cubins, each with their own
-// associated CC's, rather than including CC as a property of GpuExecutable.
+//
+// This is an immutable data type after initialization, and thus thread safe.
 class GpuExecutable : public Executable {
  public:
-  GpuExecutable(tensorflow::StringPiece cubin, tensorflow::StringPiece ptx,
-                std::pair<int, int> compute_capability,
+  GpuExecutable(tensorflow::StringPiece ptx,
                 std::unique_ptr<ThunkSchedule> thunk_schedule,
                 std::unique_ptr<HloModule> hlo_module,
                 std::unique_ptr<HloModuleConfig> module_config,
@@ -64,8 +62,7 @@ class GpuExecutable : public Executable {
     ir_module_string_ = ir_module_string;
   }
 
-  // Returns the compiled CUDA binary for the computation.
-  tensorflow::StringPiece cubin() const { return cubin_; }
+  // Returns the compiled PTX for the computation.
   tensorflow::StringPiece ptx() const { return ptx_; }
 
   StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteOnStream(
@@ -107,10 +104,8 @@ class GpuExecutable : public Executable {
   // This string should be modified only before ExecuteOnStream.
   string ir_module_string_;
 
-  // The reference to the compiled PTX & CUDA binary for the computation.
-  tensorflow::StringPiece cubin_;
-  tensorflow::StringPiece ptx_;
-  std::pair<int, int> compute_capability_;
+  // The reference to the compiled PTX for the computation.
+  const tensorflow::StringPiece ptx_;
 
   // The thunks to be invoked by this GpuExecutable. They are generated by the
   // IrEmitter.
diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
index 48ccc63f3d..69399e36c4 100644
--- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
@@ -41,10 +41,13 @@ tensorflow::Status KernelThunk::Initialize(const GpuExecutable& executable) {
     // Already initialized by another thread.
     return tensorflow::Status::OK();
   }
 
-  loader_spec_.reset(new se::MultiKernelLoaderSpec(io_buffers_.size() + 1));
-  tensorflow::StringPiece cubin = executable.cubin();
-  loader_spec_->AddCudaCubinInMemory(cubin.data(), kernel_name_);
+  loader_spec_.reset(new se::MultiKernelLoaderSpec(io_buffers_.size() + 1));
+  tensorflow::StringPiece ptx = executable.ptx();
+  // Convert tensorflow::StringPiece to se::port::StringPiece because
+  // StreamExecutor uses the latter.
+  loader_spec_->AddCudaPtxInMemory(
+      se::port::StringPiece(ptx.data(), ptx.size()), kernel_name_);
 
   return tensorflow::Status::OK();
 }
```
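The new comment in `kernel_thunk.cc` records a small but recurring chore: `tensorflow::StringPiece` and `se::port::StringPiece` are distinct, unrelated C++ types, so the PTX view must be rebuilt explicitly from its data/size pair. A compact sketch of that kind of conversion, with hypothetical stand-ins for the two view types:

```cpp
// Sketch of the conversion called out in kernel_thunk.cc. The two structs
// below are hypothetical stand-ins for tensorflow::StringPiece and
// se::port::StringPiece, not the real classes.
#include <cstddef>
#include <iostream>
#include <string>

namespace tf { struct StringPiece { const char* data; size_t size; }; }
namespace se { struct StringPiece { const char* data; size_t size; }; }

// Unrelated C++ types never convert implicitly, even with identical layout;
// rebuilding from the (pointer, length) pair is the portable bridge.
se::StringPiece ToStreamExecutor(const tf::StringPiece& s) {
  return se::StringPiece{s.data, s.size};
}

int main() {
  std::string ptx = ".version 4.2\n.target sm_35\n";
  tf::StringPiece tf_view{ptx.data(), ptx.size()};
  se::StringPiece se_view = ToStreamExecutor(tf_view);
  std::cout << std::string(se_view.data, se_view.size);
}
```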