| author | Peter Hawkins <phawkins@google.com> | 2017-05-14 19:31:26 -0700 |
|---|---|---|
| committer | TensorFlower Gardener <gardener@tensorflow.org> | 2017-05-14 19:35:08 -0700 |
| commit | 45f4d963f218340cdc3620a9fc0350655023f87d (patch) | |
| tree | e79be3decdc7e521f1c4c121d6cd5b38ad46a960 | |
| parent | e4633442278246a175ea51db01eb16fcbc01342e (diff) | |
Automated g4 rollback of changelist 155305963
PiperOrigin-RevId: 156011931
5 files changed, 37 insertions, 67 deletions
```diff
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index af5cf8ca4b..dc3a289a71 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -188,52 +188,41 @@ tensorflow::Status PrepareHloModuleForIrEmitting(
   return pipeline.Run(hlo_module).status();
 }
 
-// Invokes the ptxas tool on the given PTX string, and stores the resulting
-// SASS in *cubin. If -v 2 or greater, runs ptxas with -v and dumps the
-// resulting stderr (which contains register allocation info, etc.)
-// to VLOG(2). If ptxas binary is not found *sass is set to "".
-Status CompilePTX(const string& ptx, int cc_major, int cc_minor,
-                  string* cubin) {
-  *cubin = "";
-
+// Invokes the ptxas tool on the given PTX string, and dumps its output.
+void DumpPtxasInfo(const string& ptx) {
   const string ptxas_path =
       tensorflow::io::JoinPath(tensorflow::CudaRoot(), "bin/ptxas");
-  // Do not log PTX stats if ptxas is not found at the given path.
-  LOG(INFO) << "Invoking ptxas at path \"" << ptxas_path << "\".";
-  TF_RETURN_IF_ERROR(tensorflow::Env::Default()->FileExists(ptxas_path));
+  if (!tensorflow::Env::Default()->FileExists(ptxas_path).ok()) {
+    LOG(WARNING)
+        << "Failed to dump PTX stats because ptxas is not found at path \""
+        << ptxas_path << "\".";
+    return;
+  }
 
   // Write `ptx` into a temporary file.
   char tempdir_template[] = "/tmp/ptxXXXXXX";
   char* tempdir_name = mkdtemp(tempdir_template);
   CHECK_NOTNULL(tempdir_name);
   string ptx_path = tensorflow::io::JoinPath(tempdir_name, "ptx");
-
   TF_CHECK_OK(
       tensorflow::WriteStringToFile(tensorflow::Env::Default(), ptx_path, ptx));
   LOG(INFO) << "ptx file written to: " << ptx_path;
 
   // Invoke ptxas and collect its output.
-  tensorflow::SubProcess ptxas_info;
-  string arch = tensorflow::strings::StrCat("sm_", cc_major, cc_minor);
-  string cubin_path = tensorflow::io::JoinPath(tempdir_name, "cubin");
-
-  if (VLOG_IS_ON(2)) {
-    ptxas_info.SetProgram(ptxas_path, {ptxas_path, "-v", "-o", cubin_path,
-                                       "-arch", arch, ptx_path});
-  } else {
-    ptxas_info.SetProgram(
-        ptxas_path, {ptxas_path, "-o", cubin_path, "-arch", arch, ptx_path});
-  }
-  ptxas_info.SetChannelAction(tensorflow::CHAN_STDERR, tensorflow::ACTION_PIPE);
-  CHECK(ptxas_info.Start());
+  tensorflow::SubProcess ptxas_info_dumper;
+  ptxas_info_dumper.SetProgram(ptxas_path, {ptxas_path, ptx_path, "-o",
+                                            "/dev/null", "-v", "-arch=sm_35"});
+  ptxas_info_dumper.SetChannelAction(tensorflow::CHAN_STDERR,
+                                     tensorflow::ACTION_PIPE);
+  CHECK(ptxas_info_dumper.Start());
   string stderr_output;
-  int ptxas_exit_status = ptxas_info.Communicate(
+  int exit_status = ptxas_info_dumper.Communicate(
       /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
-
-  TF_RET_CHECK(ptxas_exit_status == 0);
-  return tensorflow::ReadFileToString(tensorflow::Env::Default(), cubin_path,
-                                      cubin);
+  XLA_LOG_LINES(tensorflow::INFO, stderr_output);
+  if (exit_status != 0) {
+    LOG(FATAL) << "Invalid PTX. See the error message above for reasons.";
+  }
 }
 
 }  // namespace
@@ -309,14 +298,10 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
 
   // Reserve space for the PTX to be generated for this module.
   string* ptx;
-  string* cubin;
   {
     tensorflow::mutex_lock lock(mutex_);
     generated_ptxes_.emplace_back(MakeUnique<string>());
     ptx = generated_ptxes_.back().get();
-
-    generated_cubins_.emplace_back(MakeUnique<string>());
-    cubin = generated_cubins_.back().get();
   }
   int cc_major, cc_minor;
   if (!stream_exec->GetDeviceDescription().cuda_compute_capability(&cc_major,
@@ -333,6 +318,9 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
   XLA_VLOG_LINES(2, llvm_ir::DumpModuleToString(llvm_module));
   VLOG(2) << "PTX:";
   XLA_VLOG_LINES(2, *ptx);
+  if (VLOG_IS_ON(2)) {
+    DumpPtxasInfo(*ptx);
+  }
 
   auto thunk_schedule = MakeUnique<ThunkSchedule>(
       ir_emitter.ConsumeThunkSequence(), std::move(stream_assignment),
@@ -340,13 +328,9 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
   VLOG(2) << "Printing the thunk schedule...";
   XLA_VLOG_LINES(2, thunk_schedule->ToString());
 
-  TF_RET_CHECK(CompilePTX(*ptx, cc_major, cc_minor, cubin).ok());
-
   auto* gpu_executable =
-      new GpuExecutable(*cubin, *ptx, {cc_major, cc_minor},
-                        std::move(thunk_schedule), std::move(hlo_module),
+      new GpuExecutable(*ptx, std::move(thunk_schedule), std::move(hlo_module),
                         std::move(module_config), std::move(buffer_assignment));
-
   if (flags->xla_gpu_embed_ir) {
     DCHECK_NE("", ir_module_string_before_opt);
     gpu_executable->set_ir_module_string(ir_module_string_before_opt);
```
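The restored `DumpPtxasInfo` boils down to three steps: write the PTX under a fresh `mkdtemp` directory, run `ptxas -v` with the cubin discarded to `/dev/null`, and log the stderr statistics (register counts, spills, shared-memory usage). Below is a minimal standalone sketch of that flow, using `popen` in place of `tensorflow::SubProcess`; the `-arch=sm_35` default mirrors the diff, while the function name and the sample PTX input are purely illustrative.

```cpp
// Minimal standalone sketch of what DumpPtxasInfo does, outside TensorFlow:
// write PTX to a temp file, run `ptxas -v`, and collect its stderr (which
// carries the register/shared-memory usage stats). Assumes a POSIX system
// with ptxas on PATH; nothing here is a TensorFlow API.
#include <array>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>

std::string DumpPtxasInfoSketch(const std::string& ptx) {
  // Write `ptx` into a unique temporary directory, as the real code does.
  char tempdir_template[] = "/tmp/ptxXXXXXX";
  char* tempdir = mkdtemp(tempdir_template);
  if (tempdir == nullptr) return "mkdtemp failed";
  const std::string ptx_path = std::string(tempdir) + "/ptx";
  std::ofstream(ptx_path) << ptx;

  // `2>&1` folds ptxas's stderr into the pipe; `-o /dev/null` discards the
  // cubin because only the -v statistics are of interest.
  const std::string cmd =
      "ptxas " + ptx_path + " -o /dev/null -v -arch=sm_35 2>&1";
  FILE* pipe = popen(cmd.c_str(), "r");
  if (pipe == nullptr) return "popen failed";

  std::string output;
  std::array<char, 256> buf;
  while (fgets(buf.data(), buf.size(), pipe) != nullptr) output += buf.data();
  if (pclose(pipe) != 0) output += "\n(ptxas reported invalid PTX)";
  return output;
}

int main() {
  // A trivial, empty PTX module; real input would come from the compiler.
  std::cout << DumpPtxasInfoSketch(
      ".version 4.2\n.target sm_35\n.address_size 64\n");
}
```

Unlike the removed `CompilePTX`, which built `sm_<major><minor>` from the device's compute capability, the restored helper pins `-arch=sm_35`, so the dumped statistics are only indicative for other GPU architectures.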
```diff
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
index 99c7ba5199..22f492b422 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
@@ -71,7 +71,6 @@ class GpuCompiler : public Compiler {
   // StreamExecutor (b/24776264).
   tensorflow::mutex mutex_;
   std::vector<std::unique_ptr<string>> generated_ptxes_ GUARDED_BY(mutex_);
-  std::vector<std::unique_ptr<string>> generated_cubins_ GUARDED_BY(mutex_);
 
   // The size in bytes of a pointer. Used for computing ShapeSizeBytes.
   int64 pointer_size_;
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index b4b788162f..32f0368b4b 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -107,17 +107,13 @@ class HloExecutionProfiler {
 
 // Implementation note: HLO profiling is always enabled for GPU executables,
 // since we can use timers around thunks.
-GpuExecutable::GpuExecutable(tensorflow::StringPiece cubin,
-                             tensorflow::StringPiece ptx,
-                             std::pair<int, int> compute_capability,
+GpuExecutable::GpuExecutable(tensorflow::StringPiece ptx,
                              std::unique_ptr<ThunkSchedule> thunk_schedule,
                              std::unique_ptr<HloModule> hlo_module,
                              std::unique_ptr<HloModuleConfig> module_config,
                              std::unique_ptr<BufferAssignment> assignment)
     : Executable(std::move(hlo_module), std::move(module_config)),
-      cubin_(cubin),
       ptx_(ptx),
-      compute_capability_(compute_capability),
       thunk_schedule_(std::move(thunk_schedule)),
       assignment_(std::move(assignment)) {}
 
@@ -190,13 +186,6 @@ StatusOr<se::DeviceMemoryBase> GpuExecutable::ExecuteOnStream(
   // false.
   TF_RET_CHECK(!module_config().has_hybrid_result());
 
-  // Ensure the compute capability of the cubin and the stream match.
-  std::pair<int, int> stream_compute_compatibility;
-  stream->parent()->GetDeviceDescription().cuda_compute_capability(
-      &stream_compute_compatibility.first,
-      &stream_compute_compatibility.second);
-  TF_RET_CHECK(stream_compute_compatibility == compute_capability_);
-
   BufferAllocations::Builder buffer_allocations_builder;
   for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
        ++i) {
```
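One pattern worth noting in the `gpu_compiler.h` hunk above (and its use in `Compile` earlier): `generated_ptxes_` is a mutex-guarded `std::vector<std::unique_ptr<string>>`, so the raw `string*` captured under the lock stays valid even when later compilations grow the vector; the vector's buffer may reallocate, but the heap-allocated strings it owns never move. A small self-contained sketch of that pointer-stability pattern, with hypothetical names:

```cpp
// Hypothetical illustration of GpuCompiler's PTX storage pattern: appending
// to the vector may reallocate its buffer, but each string lives in its own
// heap allocation, so pointers returned from AddSlot() remain valid.
#include <memory>
#include <mutex>
#include <string>
#include <vector>

class PtxStore {
 public:
  // Reserves a stable slot for one module's PTX under the lock and hands
  // back a pointer that outlives any future reallocation of the vector.
  std::string* AddSlot() {
    std::lock_guard<std::mutex> lock(mutex_);
    generated_ptxes_.emplace_back(std::make_unique<std::string>());
    return generated_ptxes_.back().get();
  }

 private:
  std::mutex mutex_;
  std::vector<std::unique_ptr<std::string>> generated_ptxes_;
};

int main() {
  PtxStore store;
  std::string* ptx = store.AddSlot();
  *ptx = ".version 4.2 ...";  // filled in later by code generation
  // Further AddSlot() calls, even from other threads, cannot invalidate ptx.
  std::string* other = store.AddSlot();
  (void)other;
}
```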
```diff
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
index 09a92c4e4c..e308de79ba 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
@@ -40,17 +40,15 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
+// GPU-targeting implementation of the XLA Executable interface.
 //
 // Launches the given CUDA kernel via the StreamExecutor.
-
-// GPUExecutable should eventually be updated to associate a compute
-// capability with the PTX and store multiple cubins, each with their own
-// associated CC's, rather than including CC as a property of GpuExecutable.
+//
+// This is an immutable data type after initialization, and thus thread safe.
 class GpuExecutable : public Executable {
  public:
-  GpuExecutable(tensorflow::StringPiece cubin, tensorflow::StringPiece ptx,
-                std::pair<int, int> compute_capability,
+  GpuExecutable(tensorflow::StringPiece ptx,
                 std::unique_ptr<ThunkSchedule> thunk_schedule,
                 std::unique_ptr<HloModule> hlo_module,
                 std::unique_ptr<HloModuleConfig> module_config,
@@ -64,8 +62,7 @@ class GpuExecutable : public Executable {
     ir_module_string_ = ir_module_string;
   }
 
-  // Returns the compiled CUDA binary for the computation.
-  tensorflow::StringPiece cubin() const { return cubin_; }
+  // Returns the compiled PTX for the computation.
   tensorflow::StringPiece ptx() const { return ptx_; }
 
   StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteOnStream(
@@ -107,10 +104,8 @@ class GpuExecutable : public Executable {
   // This string should be modified only before ExecuteOnStream.
   string ir_module_string_;
 
-  // The reference to the compiled PTX & CUDA binary for the computation.
-  tensorflow::StringPiece cubin_;
-  tensorflow::StringPiece ptx_;
-  std::pair<int, int> compute_capability_;
+  // The reference to the compiled PTX for the computation.
+  const tensorflow::StringPiece ptx_;
 
   // The thunks to be invoked by this GpuExecutable. They are generated by the
   // IrEmitter.
diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
index 48ccc63f3d..69399e36c4 100644
--- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
@@ -41,10 +41,13 @@ tensorflow::Status KernelThunk::Initialize(const GpuExecutable& executable) {
     // Already initialized by another thread.
     return tensorflow::Status::OK();
   }
 
-  loader_spec_.reset(new se::MultiKernelLoaderSpec(io_buffers_.size() + 1));
-  tensorflow::StringPiece cubin = executable.cubin();
-  loader_spec_->AddCudaCubinInMemory(cubin.data(), kernel_name_);
+  loader_spec_.reset(new se::MultiKernelLoaderSpec(io_buffers_.size() + 1));
+  tensorflow::StringPiece ptx = executable.ptx();
+  // Convert tensorflow::StringPiece to se::port::StringPiece because
+  // StreamExecutor uses the latter.
+  loader_spec_->AddCudaPtxInMemory(
+      se::port::StringPiece(ptx.data(), ptx.size()), kernel_name_);
 
   return tensorflow::Status::OK();
 }
```
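The new comment in `kernel_thunk.cc` records a small but recurring chore: `tensorflow::StringPiece` and `se::port::StringPiece` are distinct, unrelated C++ types, so the PTX view must be rebuilt explicitly from its data/size pair. A compact sketch of that kind of conversion, with hypothetical stand-ins for the two view types:

```cpp
// Sketch of the conversion called out in kernel_thunk.cc. The two structs
// below are hypothetical stand-ins for tensorflow::StringPiece and
// se::port::StringPiece, not the real classes.
#include <cstddef>
#include <iostream>
#include <string>

namespace tf { struct StringPiece { const char* data; size_t size; }; }
namespace se { struct StringPiece { const char* data; size_t size; }; }

// Unrelated C++ types never convert implicitly, even with identical layout;
// rebuilding from the (pointer, length) pair is the portable bridge.
se::StringPiece ToStreamExecutor(const tf::StringPiece& s) {
  return se::StringPiece{s.data, s.size};
}

int main() {
  std::string ptx = ".version 4.2\n.target sm_35\n";
  tf::StringPiece tf_view{ptx.data(), ptx.size()};
  se::StringPiece se_view = ToStreamExecutor(tf_view);
  std::cout << std::string(se_view.data, se_view.size);
}
```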