diff options
author | 2018-07-26 16:43:58 -0700 | |
---|---|---|
committer | 2018-07-26 16:47:23 -0700 | |
commit | 4009f82f71f0421e4ed1f50d38e9105074062d1e (patch) | |
tree | 6d732c7bea7a1e776d159989c50a69eb7f1697be /tensorflow/compiler/xla/service/gpu/gpu_executable.cc | |
parent | e336ee65a5c887e9a2f0b4c82c333bca405707a5 (diff) |
Implement constant buffer allocation for XLA:GPU
This CL teaches XLA:GPU to use "normal" buffer assignment for constant
instructions. Constant instructions are mapped to a BufferAllocation, like all
other instructions, except the storage for this buffer is allocated statically
as a global in the generated PTX.
This CL does not change how we access the constants -- in
IrEmitterUnnested::BuildKernelThunk (used for top level computations) and in
HloToIrBindings::EmitBasePointersForHlos (used for nested computations) we bind
the kConstant instructions to the llvm::GlobalVariable backing them. So users
of constant instructions still access the globals corresponding to the constants
directly.
However, we no longer emit the constant literals inline. Instead we emit a
constant with a zero initializer and then memcpy in the contents of the literal
when we load the CUBIN/PTX. This works around compile time issues in LLVM and
ptxas caused by large constants.
We also populate `BufferAllocations` with the device pointers for the constant
globals. This is at least needed for TupleThunk today because TupleThunk wants
the addresses for the sub-buffers on the host. I'm not sure if there are other
places in XLA:GPU that rely on there being an entry in BufferAllocations for
every BufferAllocation.
PiperOrigin-RevId: 206243319
Diffstat (limited to 'tensorflow/compiler/xla/service/gpu/gpu_executable.cc')
-rw-r--r-- | tensorflow/compiler/xla/service/gpu/gpu_executable.cc | 55 |
1 file changed, 54 insertions(+), 1 deletion(-)
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index 9767836cd6..52c8595ffb 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -181,6 +181,51 @@ Status GpuExecutable::ExecuteThunks( return Status::OK(); } +StatusOr<const GpuExecutable::BufferAllocToDeviceMemoryMap*> +GpuExecutable::ResolveConstantGlobals(se::StreamExecutor* executor) { + tensorflow::mutex_lock lock(module_handle_mutex_); + auto it = module_globals_.find(executor); + if (it != module_globals_.end()) { + return &it->second; + } + + se::MultiModuleLoaderSpec module_spec; + module_spec.AddCudaCubinInMemory(cubin()); + module_spec.AddCudaPtxInMemory(ptx().c_str()); + + tensorflow::gtl::FlatMap<int64, se::DeviceMemoryBase> globals; + se::ModuleHandle module_handle; + executor->LoadModule(module_spec, &module_handle); + + for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size(); + ++i) { + const BufferAllocation& allocation = assignment_->GetAllocation(i); + if (allocation.is_constant()) { + TF_ASSIGN_OR_RETURN( + se::DeviceMemoryBase global, + executor->GetUntypedSymbol( + ConstantBufferAllocationToGlobalName(allocation), module_handle)); + VLOG(3) << "Resolved global " + << ConstantBufferAllocationToGlobalName(allocation) << " to " + << global.opaque(); + InsertOrDie(&globals, i, global); + + const Literal& literal = LiteralForConstantAllocation(allocation); + CHECK(ShapeUtil::IsArray(literal.shape())); + if (!ShouldEmitLiteralInLlvmIr(literal)) { + VLOG(3) << "H2D memcpy for constant with shape " + << ShapeUtil::HumanString(literal.shape()); + TF_RETURN_IF_ERROR(executor->SynchronousMemcpyH2D( + literal.untyped_data(), allocation.size(), &global)); + } + } + } + + module_handles_.emplace(executor, + se::ScopedModuleHandle(executor, module_handle)); + return &module_globals_.emplace(executor, std::move(globals)).first->second; +} + 
StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments, @@ -192,6 +237,10 @@ StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteOnStream( } BufferAllocations::Builder buffer_allocations_builder; + se::StreamExecutor* executor = run_options->stream()->parent(); + + TF_ASSIGN_OR_RETURN(auto* const globals, ResolveConstantGlobals(executor)); + for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size(); ++i) { const BufferAllocation& allocation = assignment_->GetAllocation(i); @@ -213,8 +262,12 @@ StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteOnStream( buffer_allocations_builder.RegisterBuffer(i, buffer); } + + if (allocation.is_constant()) { + buffer_allocations_builder.RegisterBuffer(i, FindOrDie(*globals, i)); + } } - se::StreamExecutor* executor = run_options->stream()->parent(); + TF_ASSIGN_OR_RETURN( auto buffer_allocations, buffer_allocations_builder.Build( |