path: root/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
author Sanjoy Das <sanjoy@google.com> 2018-07-26 16:43:58 -0700
committer TensorFlower Gardener <gardener@tensorflow.org> 2018-07-26 16:47:23 -0700
commit 4009f82f71f0421e4ed1f50d38e9105074062d1e (patch)
tree 6d732c7bea7a1e776d159989c50a69eb7f1697be /tensorflow/compiler/xla/service/gpu/gpu_executable.cc
parent e336ee65a5c887e9a2f0b4c82c333bca405707a5 (diff)
Implement constant buffer allocation for XLA:GPU
This CL teaches XLA:GPU to use "normal" buffer assignment for constant instructions. Constant instructions are mapped to a BufferAllocation, like all other instructions, except the storage for this buffer is allocated statically as a global in the generated PTX.

This CL does not change how we access the constants -- in IrEmitterUnnested::BuildKernelThunk (used for top level computations) and in HloToIrBindings::EmitBasePointersForHlos (used for nested computations) we bind the kConstant instructions to the llvm::GlobalVariable backing them. So users of constant instructions still access the globals corresponding to the constants directly.

However, we no longer emit the constant literals inline. Instead we emit a constant with a zero initializer and then memcpy in the contents of the literal when we load the CUBIN/PTX. This works around compile time issues in LLVM and ptxas caused by large constants.

We also populate `BufferAllocations` with the device pointers for the constant globals. This is at least needed for TupleThunk today because TupleThunk wants the addresses for the sub-buffers on the host. I'm not sure if there are other places in XLA:GPU that rely on there being an entry in BufferAllocations for every BufferAllocation.

PiperOrigin-RevId: 206243319
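As a rough illustration of the zero-initializer idea described above (a hand-written sketch, not code from this CL; EmitZeroInitializedConstantGlobal and its parameters are invented for this example), the emitter side could look roughly like this in plain LLVM C++ API terms. The load-time half, which copies the literal's bytes into the global, is the ResolveConstantGlobals function added in the diff below.

// Sketch only: declare the constant as a zero-initialized global so its bytes
// never appear in the IR or PTX; the literal is filled in later by a
// host-to-device memcpy when the CUBIN/PTX module is loaded.
#include <string>

#include "llvm/IR/Constants.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"

llvm::GlobalVariable* EmitZeroInitializedConstantGlobal(
    llvm::Module* module, llvm::Type* literal_type, const std::string& name) {
  // zeroinitializer keeps the generated PTX small no matter how large the
  // constant literal is, avoiding the LLVM/ptxas compile-time issues that
  // large inline constants cause.
  llvm::Constant* zero = llvm::Constant::getNullValue(literal_type);
  return new llvm::GlobalVariable(
      *module, literal_type, /*isConstant=*/true,
      llvm::GlobalValue::ExternalLinkage, zero, name);
}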
Diffstat (limited to 'tensorflow/compiler/xla/service/gpu/gpu_executable.cc')
-rw-r--r-- tensorflow/compiler/xla/service/gpu/gpu_executable.cc | 55
1 file changed, 54 insertions(+), 1 deletion(-)
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index 9767836cd6..52c8595ffb 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -181,6 +181,51 @@ Status GpuExecutable::ExecuteThunks(
return Status::OK();
}
+StatusOr<const GpuExecutable::BufferAllocToDeviceMemoryMap*>
+GpuExecutable::ResolveConstantGlobals(se::StreamExecutor* executor) {
+ tensorflow::mutex_lock lock(module_handle_mutex_);
+ auto it = module_globals_.find(executor);
+ if (it != module_globals_.end()) {
+ return &it->second;
+ }
+
+ se::MultiModuleLoaderSpec module_spec;
+ module_spec.AddCudaCubinInMemory(cubin());
+ module_spec.AddCudaPtxInMemory(ptx().c_str());
+
+ tensorflow::gtl::FlatMap<int64, se::DeviceMemoryBase> globals;
+ se::ModuleHandle module_handle;
+ executor->LoadModule(module_spec, &module_handle);
+
+ for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
+ ++i) {
+ const BufferAllocation& allocation = assignment_->GetAllocation(i);
+ if (allocation.is_constant()) {
+ TF_ASSIGN_OR_RETURN(
+ se::DeviceMemoryBase global,
+ executor->GetUntypedSymbol(
+ ConstantBufferAllocationToGlobalName(allocation), module_handle));
+ VLOG(3) << "Resolved global "
+ << ConstantBufferAllocationToGlobalName(allocation) << " to "
+ << global.opaque();
+ InsertOrDie(&globals, i, global);
+
+ const Literal& literal = LiteralForConstantAllocation(allocation);
+ CHECK(ShapeUtil::IsArray(literal.shape()));
+ if (!ShouldEmitLiteralInLlvmIr(literal)) {
+ VLOG(3) << "H2D memcpy for constant with shape "
+ << ShapeUtil::HumanString(literal.shape());
+ TF_RETURN_IF_ERROR(executor->SynchronousMemcpyH2D(
+ literal.untyped_data(), allocation.size(), &global));
+ }
+ }
+ }
+
+ module_handles_.emplace(executor,
+ se::ScopedModuleHandle(executor, module_handle));
+ return &module_globals_.emplace(executor, std::move(globals)).first->second;
+}
+
StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteOnStream(
const ServiceExecutableRunOptions* run_options,
tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
@@ -192,6 +237,10 @@ StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteOnStream(
}
BufferAllocations::Builder buffer_allocations_builder;
+ se::StreamExecutor* executor = run_options->stream()->parent();
+
+ TF_ASSIGN_OR_RETURN(auto* const globals, ResolveConstantGlobals(executor));
+
for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
++i) {
const BufferAllocation& allocation = assignment_->GetAllocation(i);
@@ -213,8 +262,12 @@ StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteOnStream(
buffer_allocations_builder.RegisterBuffer(i, buffer);
}
+
+ if (allocation.is_constant()) {
+ buffer_allocations_builder.RegisterBuffer(i, FindOrDie(*globals, i));
+ }
}
- se::StreamExecutor* executor = run_options->stream()->parent();
+
TF_ASSIGN_OR_RETURN(
auto buffer_allocations,
buffer_allocations_builder.Build(