author    Justin Lebar <jlebar@google.com>  2017-02-19 12:33:44 -0800
committer TensorFlower Gardener <gardener@tensorflow.org>  2017-02-19 12:53:26 -0800
commit    7b02fa6a27022275517ed5b851b06ba19a11bdf0 (patch)
tree      bcb4be30b332a11f57dc6677196307fe5f6d981e
parent    2c3469018589ffece9938797f618e5b3228074fa (diff)
[XLA] Read GPU architecture from StreamExecutor, rather than a flag.
Previously, we read the GPU architecture from the --gpu_architecture flag, which defaulted to compute_35. We'd then use this value when choosing a libdevice file and when telling LLVM which GPU architecture we're compiling for.

Now we read this value from the StreamExecutor for the device on which we're going to run our computation.

This change also adds more supported GPU architectures to the GPU backend, so we choose the right libdevice for your GPU.

This change is necessary before we can begin emitting fp16 arithmetic and other sm_60+ ops.

Change: 147971326
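In outline, the new flow asks the StreamExecutor's device description for the CUDA compute capability and threads that (major, minor) pair through to PTX compilation, falling back to sm_20 if the query fails. The following condensed sketch is drawn from the gpu_compiler.cc hunk below; it is illustrative only, not the complete compile path, and the surrounding variables (stream_exec, llvm_module, module_config, libdevice_dir_, ptx) are assumed from that context:

    // Ask StreamExecutor for the device's compute capability; fall back to
    // sm_20 (cc 2.0) if the query fails.
    int cc_major, cc_minor;
    if (!stream_exec->GetDeviceDescription().cuda_compute_capability(&cc_major,
                                                                     &cc_minor)) {
      LOG(WARNING) << "Couldn't get compute capability; assuming sm_20.";
      cc_major = 2;
      cc_minor = 0;
    }
    // Pass the (major, minor) pair to the backend instead of reading a
    // global --gpu_architecture flag.
    TF_ASSIGN_OR_RETURN(*ptx, CompileToPtx(&llvm_module, {cc_major, cc_minor},
                                           *module_config, libdevice_dir_));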
-rw-r--r--  tensorflow/compiler/xla/legacy_flags/gpu_backend_lib_flags.cc              3
-rw-r--r--  tensorflow/compiler/xla/legacy_flags/gpu_backend_lib_flags.h               1
-rw-r--r--  tensorflow/compiler/xla/service/gpu/gpu_compiler.cc                       12
-rw-r--r--  tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc  118
-rw-r--r--  tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h    10
5 files changed, 91 insertions(+), 53 deletions(-)
diff --git a/tensorflow/compiler/xla/legacy_flags/gpu_backend_lib_flags.cc b/tensorflow/compiler/xla/legacy_flags/gpu_backend_lib_flags.cc
index c355b1ed9b..f8f6ea26b1 100644
--- a/tensorflow/compiler/xla/legacy_flags/gpu_backend_lib_flags.cc
+++ b/tensorflow/compiler/xla/legacy_flags/gpu_backend_lib_flags.cc
@@ -38,7 +38,6 @@ static void AllocateFlags() {
flags->dump_temp_products_to = "";
flags->ftz = false;
flags->fma = true;
- flags->gpu_architecture = "compute_35";
flags->verbose_ptx_asm = false;
flags->kernel = "";
flags->llvm_dump_passes = false;
@@ -51,8 +50,6 @@ static void AllocateFlags() {
"If empty, no dump is produced"),
tensorflow::Flag("ftz", &flags->ftz, "flush to zero semantics"),
tensorflow::Flag("fma", &flags->fma, "use FMA synthesis"),
- tensorflow::Flag("gpu_architecture", &flags->gpu_architecture,
- "GPU architecture"),
tensorflow::Flag("verbose_ptx_asm", &flags->verbose_ptx_asm,
"emit PTX assembly with extra comments"),
tensorflow::Flag("kernel", &flags->kernel,
diff --git a/tensorflow/compiler/xla/legacy_flags/gpu_backend_lib_flags.h b/tensorflow/compiler/xla/legacy_flags/gpu_backend_lib_flags.h
index fbb8863454..31cb50e9da 100644
--- a/tensorflow/compiler/xla/legacy_flags/gpu_backend_lib_flags.h
+++ b/tensorflow/compiler/xla/legacy_flags/gpu_backend_lib_flags.h
@@ -36,7 +36,6 @@ typedef struct {
string dump_temp_products_to; // temporary compilation products dir
bool ftz; // flush to zero semantics
bool fma; // use FMA synthesis
- string gpu_architecture; // GPU architecture
bool verbose_ptx_asm; // emit PTX assembly with extra comments
string kernel; // only emit the IR and PTX for this kernel
bool llvm_dump_passes; // dump the passes LLVM runs to stderr
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index ad283d7e66..24ee5771d5 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -287,8 +287,16 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
generated_ptxes_.emplace_back(MakeUnique<string>());
ptx = generated_ptxes_.back().get();
}
- TF_ASSIGN_OR_RETURN(
- *ptx, CompileToPtx(&llvm_module, *module_config, libdevice_dir_));
+ int cc_major, cc_minor;
+ if (!stream_exec->GetDeviceDescription().cuda_compute_capability(&cc_major,
+ &cc_minor)) {
+ LOG(WARNING)
+ << "Couldn't get compute capability for device; assuming sm_20.";
+ cc_major = 2;
+ cc_minor = 0;
+ }
+ TF_ASSIGN_OR_RETURN(*ptx, CompileToPtx(&llvm_module, {cc_major, cc_minor},
+ *module_config, libdevice_dir_));
VLOG(2) << "LLVM module after optimizations:";
XLA_VLOG_LINES(2, llvm_ir::DumpModuleToString(llvm_module));
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
index 12ea573a9c..7f0808f5ab 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
@@ -68,29 +68,64 @@ namespace {
// Default inline threshold value to use in llvm.
const int kDefaultInlineThreshold = 1100;
-// Information about a GPU architecture for the backend.
-struct GpuBackendInfo {
- string libdevice_name;
- string sm_name;
-};
-
-// Maps supported CUDA compute capability to a libdevice file to link for this
-// capability.
-std::map<string, GpuBackendInfo> gpu_info_map = {
- {"compute_20", {"libdevice.compute_20.10.bc", "sm_20"}},
- {"compute_30", {"libdevice.compute_30.10.bc", "sm_30"}},
- {"compute_35", {"libdevice.compute_35.10.bc", "sm_35"}},
-
- // NVIDIA does not provide a separate libdevice for CC 3.7, but we can use
- // the one for 3.5.
- {"compute_37", {"libdevice.compute_35.10.bc", "sm_37"}},
-};
-
-// Validate the --gpu_architecture command-line flag.
-static void ValidateGPUArchitecture(const string& value) {
- if (!gpu_info_map.count(value)) {
- LOG(FATAL) << "value for --gpu_architecture must be compute_{20,30,35,37}";
+// Gets the libdevice filename for a particular compute capability. When
+// presented with a GPU we don't recognize, we just return the libdevice from
+// compute_20.
+static string GetLibdeviceFilename(std::pair<int, int> compute_capability) {
+ // There are only four libdevice files: compute_{20,30,35,50}. Each GPU
+ // version gets mapped to one of these. Note in particular that sm_60 and
+ // sm_61 map to libdevice.compute_30.
+ static auto* m = new std::map<std::pair<int, int>, int>({{{2, 0}, 20},
+ {{2, 1}, 20},
+ {{3, 0}, 30},
+ {{3, 2}, 30},
+ {{3, 5}, 35},
+ {{3, 7}, 35},
+ {{5, 0}, 50},
+ {{5, 2}, 50},
+ {{5, 3}, 50},
+ {{6, 0}, 30},
+ {{6, 1}, 30},
+ {{6, 2}, 30}});
+ int libdevice_version = 20;
+ auto it = m->find(compute_capability);
+ if (it != m->end()) {
+ libdevice_version = it->second;
+ } else {
+ LOG(WARNING) << "Unknown compute capability (" << compute_capability.first
+ << ", " << compute_capability.second << ") ."
+ << "Defaulting to libdevice for compute_" << libdevice_version;
+ }
+ return tensorflow::strings::StrCat("libdevice.compute_", libdevice_version,
+ ".10.bc");
+}
+
+// Gets the GPU name as it's known to LLVM for a given compute capability. If
+// we see an unrecognized compute capability, we return "sm_20".
+static string GetSmName(std::pair<int, int> compute_capability) {
+ static auto* m = new std::map<std::pair<int, int>, int>({{{2, 0}, 20},
+ {{2, 1}, 21},
+ {{3, 0}, 30},
+ {{3, 2}, 32},
+ {{3, 5}, 35},
+ {{3, 7}, 37},
+ {{5, 0}, 50},
+ {{5, 2}, 52},
+ {{5, 3}, 53},
+ {{6, 0}, 60},
+ {{6, 1}, 61},
+ {{6, 2}, 62}});
+ int sm_version = 20;
+ auto it = m->find(compute_capability);
+ if (it != m->end()) {
+ sm_version = it->second;
+ } else {
+ LOG(WARNING) << "Unknown compute capability (" << compute_capability.first
+ << ", " << compute_capability.second << ") ."
+ << "Defaulting to telling LLVM that we're compiling for sm_"
+ << sm_version;
}
+ return tensorflow::strings::StrCat("sm_", sm_version);
}
// Convenience function for producing a name of a temporary compilation product
@@ -270,39 +305,37 @@ bool CouldNeedLibdevice(const llvm::Module& module) {
}
// Links libdevice into the given module if the module needs libdevice.
-tensorflow::Status LinkLibdeviceIfNecessary(const string& libdevice_dir_path,
- llvm::Module* module) {
+tensorflow::Status LinkLibdeviceIfNecessary(
+ llvm::Module* module, std::pair<int, int> compute_capability,
+ const string& libdevice_dir_path) {
if (!CouldNeedLibdevice(*module)) {
return tensorflow::Status::OK();
}
llvm::Linker linker(*module);
- legacy_flags::GpuBackendLibFlags* flags =
- legacy_flags::GetGpuBackendLibFlags();
- ValidateGPUArchitecture(flags->gpu_architecture);
- string libdevice_bc_filename =
- gpu_info_map[flags->gpu_architecture].libdevice_name;
- string libdevice_bc_fullpath =
- tensorflow::io::JoinPath(libdevice_dir_path, libdevice_bc_filename);
- TF_RETURN_IF_ERROR(
- tensorflow::Env::Default()->FileExists(libdevice_bc_fullpath));
+ string libdevice_path = tensorflow::io::JoinPath(
+ libdevice_dir_path, GetLibdeviceFilename(compute_capability));
+ TF_RETURN_IF_ERROR(tensorflow::Env::Default()->FileExists(libdevice_path));
+ VLOG(1) << "Linking with libdevice from: " << libdevice_path;
std::unique_ptr<llvm::Module> libdevice_module =
- LoadIRModule(libdevice_bc_fullpath, &module->getContext());
- VLOG(1) << "Linking with libdevice from: " << libdevice_bc_fullpath;
+ LoadIRModule(libdevice_path, &module->getContext());
if (linker.linkInModule(std::move(libdevice_module),
llvm::Linker::Flags::InternalizeLinkedSymbols |
llvm::Linker::Flags::LinkOnlyNeeded)) {
- LOG(FATAL) << "Error linking libdevice from " << libdevice_bc_fullpath;
+ return tensorflow::errors::Internal(tensorflow::strings::StrCat(
+ "Error linking libdevice from ", libdevice_path));
}
return tensorflow::Status::OK();
}
StatusOr<string> CompileModuleToPtx(llvm::Module* module,
+ std::pair<int, int> compute_capability,
const HloModuleConfig& hlo_module_config,
const string& libdevice_dir_path) {
// Link the input module with libdevice, to pull in implementations of some
// builtins.
- TF_RETURN_IF_ERROR(LinkLibdeviceIfNecessary(libdevice_dir_path, module));
+ TF_RETURN_IF_ERROR(
+ LinkLibdeviceIfNecessary(module, compute_capability, libdevice_dir_path));
legacy_flags::GpuBackendLibFlags* flags =
legacy_flags::GetGpuBackendLibFlags();
@@ -351,11 +384,8 @@ StatusOr<string> CompileModuleToPtx(llvm::Module* module,
// Figure out the exact name of the processor as known to the NVPTX backend
// from the gpu_architecture flag.
- ValidateGPUArchitecture(flags->gpu_architecture);
- string cpu_name = gpu_info_map[flags->gpu_architecture].sm_name;
-
- std::unique_ptr<llvm::TargetMachine> target_machine =
- GetTargetMachine(target_triple, cpu_name, hlo_module_config);
+ std::unique_ptr<llvm::TargetMachine> target_machine = GetTargetMachine(
+ target_triple, GetSmName(compute_capability), hlo_module_config);
module_passes.add(llvm::createTargetTransformInfoWrapperPass(
target_machine->getTargetIRAnalysis()));
@@ -466,6 +496,7 @@ void GPUBackendInit() {
} // namespace
StatusOr<string> CompileToPtx(llvm::Module* module,
+ std::pair<int, int> compute_capability,
const HloModuleConfig& hlo_module_config,
const string& libdevice_dir_path) {
static std::once_flag backend_init_flag;
@@ -477,7 +508,8 @@ StatusOr<string> CompileToPtx(llvm::Module* module,
"Compile module " + llvm_ir::AsString(module->getName()),
/*vlog_level=*/2);
TF_ASSIGN_OR_RETURN(
- ptx, CompileModuleToPtx(module, hlo_module_config, libdevice_dir_path));
+ ptx, CompileModuleToPtx(module, compute_capability, hlo_module_config,
+ libdevice_dir_path));
}
return ptx;
}
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h
index cf6f3197bb..fd89407217 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h
@@ -18,6 +18,7 @@ limitations under the License.
#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_LLVM_GPU_BACKEND_GPU_BACKEND_LIB_H_
#include <string>
+#include <utility>
#include "external/llvm/include/llvm/IR/Module.h"
#include "tensorflow/compiler/xla/service/hlo_module_config.h"
@@ -28,14 +29,15 @@ limitations under the License.
namespace xla {
namespace gpu {
-// The Compile.* interfaces each create their own llvm::LLVMContext objects for
-// thread safety, but note that LLVM's multithreaded support is very
-// preliminary; multithreaded use is not recommended at this time.
-//
// Compiles the argument module and returns it. libdevice_dir_path is the parent
// directory of the libdevice bitcode libraries. The contents of the module may
// be changed.
+//
+// The Compile.* interfaces each create their own llvm::LLVMContext objects for
+// thread safety, but note that LLVM's multithreaded support is very
+// preliminary; multithreaded use is not recommended at this time.
StatusOr<string> CompileToPtx(llvm::Module* module,
+ std::pair<int, int> compute_capability,
const HloModuleConfig& hlo_module_config,
const string& libdevice_dir_path);