author    Justin Lebar <jlebar@google.com>    2017-10-16 17:22:26 -0700
committer TensorFlower Gardener <gardener@tensorflow.org>    2017-10-16 17:26:46 -0700
commit    684f88fa7e61721c3264dc70abeed2b3e6fa7717 (patch)
tree      7531a10eb6980e4b7840afc43aec250bf144b60b
parent    99dffc958a1cfa4e5a2f81e8f4085277a0c34bd9 (diff)
[XLA:GPU] Don't crash with --vmodule=gpu_compiler=2 if we can't run ptxas.
At --vmodule=gpu_compiler=2, we run ptxas over our generated PTX to validate it and to dump stats such as the number of registers used. Previously, this would fail if your GPU was anything other than sm_35 (i.e. K20/40/80), because we didn't pass cc_major/cc_minor down to ptxas. Moreover, if ptxas failed to compile your program, we'd LOG(FATAL), which is probably not what you want.

This change fixes both of those issues. Tested on my local GTX 1080.

PiperOrigin-RevId: 172403304
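For reference, the fix builds the -arch value from the device's compute capability instead of hard-coding sm_35. A minimal standalone sketch of the same construction (using std::to_string in place of TensorFlow's StrCat; the function and variable names are illustrative, not from the patch):

    #include <string>

    // Maps a compute capability such as (6, 1) -- a GTX 1080 -- to the
    // ptxas flag "-arch=sm_61". The old code always passed "-arch=sm_35".
    std::string PtxasArchFlag(int cc_major, int cc_minor) {
      return "-arch=sm_" + std::to_string(cc_major) + std::to_string(cc_minor);
    }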
-rw-r--r-- tensorflow/compiler/xla/service/gpu/gpu_compiler.cc | 20
1 file changed, 14 insertions(+), 6 deletions(-)
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index 57f11db11f..3e16e4e3c4 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -67,6 +67,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/cuda_libdevice_path.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"
@@ -84,6 +85,8 @@ namespace gpu {
namespace {
+using tensorflow::strings::StrCat;
+
// Any address of a variable residing in global memory or returned by one of the
// memory allocation routines from the driver or runtime API is always aligned
// to at least 256 bytes.
@@ -223,7 +226,7 @@ tensorflow::Status PrepareHloModuleForIrEmitting(
}
// Invokes the ptxas tool on the given PTX string, and dumps its output.
-void DumpPtxasInfo(const string& ptx) {
+void DumpPtxasInfo(const string& ptx, int cc_major, int cc_minor) {
const string ptxas_path =
tensorflow::io::JoinPath(tensorflow::CudaRoot(), "bin/ptxas");
// Do not log PTX stats if ptxas is not found at the given path.
@@ -245,17 +248,22 @@ void DumpPtxasInfo(const string& ptx) {
// Invoke ptxas and collect its output.
tensorflow::SubProcess ptxas_info_dumper;
- ptxas_info_dumper.SetProgram(ptxas_path, {ptxas_path, ptx_path, "-o",
- "/dev/null", "-v", "-arch=sm_35"});
+ ptxas_info_dumper.SetProgram(ptxas_path,
+ {ptxas_path, ptx_path, "-o", "/dev/null", "-v",
+ StrCat("-arch=sm_", cc_major, cc_minor)});
ptxas_info_dumper.SetChannelAction(tensorflow::CHAN_STDERR,
tensorflow::ACTION_PIPE);
- CHECK(ptxas_info_dumper.Start());
+ if (!ptxas_info_dumper.Start()) {
+ LOG(ERROR) << "Failed to launch ptxas.";
+ return;
+ }
string stderr_output;
int exit_status = ptxas_info_dumper.Communicate(
/*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
XLA_LOG_LINES(tensorflow::INFO, stderr_output);
if (exit_status != 0) {
- LOG(FATAL) << "Invalid PTX. See the error message above for reasons.";
+ LOG(ERROR) << "ptxas exited with non-zero error code " << exit_status
+ << ".";
}
}
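The patched invocation is equivalent to running ptxas by hand. Assuming a device with compute capability 6.1 and a dumped PTX file kernel.ptx (both illustrative), the command would look like:

    ptxas kernel.ptx -o /dev/null -v -arch=sm_61

With -v, ptxas prints per-kernel resource statistics (e.g. register usage) to stderr, which is what the code above captures via CHAN_STDERR and logs at INFO.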
@@ -387,7 +395,7 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
VLOG(2) << "PTX:";
XLA_VLOG_LINES(2, *ptx);
if (VLOG_IS_ON(2)) {
- DumpPtxasInfo(*ptx);
+ DumpPtxasInfo(*ptx, cc_major, cc_minor);
}
auto thunk_schedule = MakeUnique<ThunkSchedule>(
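As the call site shows, the ptxas dump only runs when verbose logging is enabled at level 2 for this file, e.g. by starting the binary with the flag from the commit message:

    --vmodule=gpu_compiler=2

With this change, that mode degrades to LOG(ERROR) instead of crashing when ptxas fails to launch or rejects the generated PTX.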