diff options
author | 2018-07-26 16:43:58 -0700 | |
---|---|---|
committer | 2018-07-26 16:47:23 -0700 | |
commit | 4009f82f71f0421e4ed1f50d38e9105074062d1e (patch) | |
tree | 6d732c7bea7a1e776d159989c50a69eb7f1697be /tensorflow/compiler/xla/service/gpu/gpu_executable.cc | |
parent | e336ee65a5c887e9a2f0b4c82c333bca405707a5 (diff) |
Implement constant buffer allocation for XLA:GPU
This CL teaches XLA:GPU to use "normal" buffer assignment for constant
instructions. Constant instructions are mapped to a BufferAllocation, like all
other instructions, except the storage for this buffer is allocated statically
as a global in the generated PTX.
This CL does not change how we access the constants -- in
IrEmitterUnnested::BuildKernelThunk (used for top level computations) and in
HloToIrBindings::EmitBasePointersForHlos (used for nested computations) we bind
the kConstant instructions to the llvm::GlobalVariable backing them. So users
of constant instructions still access the globals corresponding to the constants
directly.
However, we no longer emit the constant literals inline. Instead we emit a
constant with a zero initializer and then memcpy in the contents of the literal
when we load the CUBIN/PTX. This works around compile time issues in LLVM and
ptxas caused by large constants.
We also populate `BufferAllocations` with the device pointers for the constant
globals. This is at least needed for TupleThunk today because TupleThunk wants
the addresses for the sub-buffers on the host. I'm not sure if there are other
places in XLA:GPU that rely on there being an entry in BufferAllocations for
every BufferAllocation.
PiperOrigin-RevId: 206243319
Diffstat (limited to 'tensorflow/compiler/xla/service/gpu/gpu_executable.cc')
-rw-r--r-- | tensorflow/compiler/xla/service/gpu/gpu_executable.cc | 55 |
1 file changed, 54 insertions(+), 1 deletion(-)
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index 9767836cd6..52c8595ffb 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -181,6 +181,51 @@ Status GpuExecutable::ExecuteThunks( return Status::OK(); } +StatusOr<const GpuExecutable::BufferAllocToDeviceMemoryMap*> +GpuExecutable::ResolveConstantGlobals(se::StreamExecutor* executor) { + tensorflow::mutex_lock lock(module_handle_mutex_); + auto it = module_globals_.find(executor); + if (it != module_globals_.end()) { + return &it->second; + } + + se::MultiModuleLoaderSpec module_spec; + module_spec.AddCudaCubinInMemory(cubin()); + module_spec.AddCudaPtxInMemory(ptx().c_str()); + + tensorflow::gtl::FlatMap<int64, se::DeviceMemoryBase> globals; + se::ModuleHandle module_handle; + executor->LoadModule(module_spec, &module_handle); + + for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size(); + ++i) { + const BufferAllocation& allocation = assignment_->GetAllocation(i); + if (allocation.is_constant()) { + TF_ASSIGN_OR_RETURN( + se::DeviceMemoryBase global, + executor->GetUntypedSymbol( + ConstantBufferAllocationToGlobalName(allocation), module_handle)); + VLOG(3) << "Resolved global " + << ConstantBufferAllocationToGlobalName(allocation) << " to " + << global.opaque(); + InsertOrDie(&globals, i, global); + + const Literal& literal = LiteralForConstantAllocation(allocation); + CHECK(ShapeUtil::IsArray(literal.shape())); + if (!ShouldEmitLiteralInLlvmIr(literal)) { + VLOG(3) << "H2D memcpy for constant with shape " + << ShapeUtil::HumanString(literal.shape()); + TF_RETURN_IF_ERROR(executor->SynchronousMemcpyH2D( + literal.untyped_data(), allocation.size(), &global)); + } + } + } + + module_handles_.emplace(executor, + se::ScopedModuleHandle(executor, module_handle)); + return &module_globals_.emplace(executor, std::move(globals)).first->second; +} + 
StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments, @@ -192,6 +237,10 @@ StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteOnStream( } BufferAllocations::Builder buffer_allocations_builder; + se::StreamExecutor* executor = run_options->stream()->parent(); + + TF_ASSIGN_OR_RETURN(auto* const globals, ResolveConstantGlobals(executor)); + for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size(); ++i) { const BufferAllocation& allocation = assignment_->GetAllocation(i); @@ -213,8 +262,12 @@ StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteOnStream( buffer_allocations_builder.RegisterBuffer(i, buffer); } + + if (allocation.is_constant()) { + buffer_allocations_builder.RegisterBuffer(i, FindOrDie(*globals, i)); + } } - se::StreamExecutor* executor = run_options->stream()->parent(); + TF_ASSIGN_OR_RETURN( auto buffer_allocations, buffer_allocations_builder.Build( |