Diffstat (limited to 'tensorflow/compiler/xla/service/gpu/gpu_executable.cc')
-rw-r--r--  tensorflow/compiler/xla/service/gpu/gpu_executable.cc  454
1 file changed, 454 insertions, 0 deletions
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
new file mode 100644
index 0000000000..f654ffd22d
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -0,0 +1,454 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
+
+#include <set>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/compiler/xla/map_util.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/logical_buffer.h"
+#include "tensorflow/compiler/xla/service/shaped_buffer.h"
+#include "tensorflow/compiler/xla/service/transfer_manager.h"
+#include "tensorflow/compiler/xla/shape_tree.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace se = ::perftools::gputools;
+
+namespace xla {
+namespace gpu {
+namespace {
+
+// A helper class for profiling HLO in the course of GPU program execution.
+// All profiling is guarded internally, so the caller does not need
+// conditionals sprinkled around.
+class HloExecutionProfiler {
+ public:
+ // If profiling is enabled, start an execution timer running.
+ explicit HloExecutionProfiler(bool do_profile, HloExecutionProfile* profile,
+ se::Stream* stream)
+ : do_profile_(do_profile), profile_(profile), stream_(stream) {
+ if (do_profile_) {
+ clock_rate_ghz_ =
+ stream->parent()->GetDeviceDescription().clock_rate_ghz();
+ execution_timer_.reset(new se::Timer(stream->parent()));
+ per_op_timer_.reset(new se::Timer(stream->parent()));
+ stream->InitTimer(execution_timer_.get())
+ .ThenStartTimer(execution_timer_.get());
+ stream->InitTimer(per_op_timer_.get());
+ }
+ }
+
+ // If profiling is enabled, sets the total cycle count on the profile from the
+ // execution timer.
+ ~HloExecutionProfiler() {
+ if (do_profile_) {
+ stream_->ThenStopTimer(execution_timer_.get());
+ stream_->BlockHostUntilDone();
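+      // Nanoseconds() * clock_rate_ghz_ converts elapsed nanoseconds to
+      // device cycles: a clock rate in GHz is exactly cycles per nanosecond.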
+ profile_->set_total_cycles_executed(execution_timer_->Nanoseconds() *
+ clock_rate_ghz_);
+ }
+ }
+
+ // If profiling is enabled, starts the per-operation timer.
+ void StartOperation() {
+ if (do_profile_) {
+ stream_->ThenStartTimer(per_op_timer_.get());
+ }
+ }
+
+  // If profiling is enabled, stops the per-operation timer and records in the
+  // profile the time that hlo_instruction took to execute.
+ void FinishOperation(const HloInstruction* hlo_instruction) {
+ if (do_profile_) {
+ stream_->ThenStopTimer(per_op_timer_.get());
+ stream_->BlockHostUntilDone();
+ profile_->AddProfileResult(
+ hlo_instruction, per_op_timer_->Nanoseconds() * clock_rate_ghz_);
+ }
+ }
+
+ private:
+ const bool do_profile_;
+ double clock_rate_ghz_;
+ HloExecutionProfile* profile_;
+ se::Stream* stream_;
+ std::unique_ptr<se::Timer> execution_timer_;
+ std::unique_ptr<se::Timer> per_op_timer_;
+};
+
+} // namespace
+
+// Implementation note: HLO profiling is always enabled for GPU executables,
+// since we can use timers around thunks.
+GpuExecutable::GpuExecutable(
+ tensorflow::StringPiece ptx, std::unique_ptr<ThunkSchedule> thunk_schedule,
+ std::unique_ptr<HloModule> hlo_module,
+ std::unique_ptr<HloModuleConfig> module_config,
+ std::unique_ptr<BufferAssignment> assignment,
+ std::unique_ptr<TempBufferOffsets> temp_buffer_offsets)
+ : Executable(std::move(hlo_module), std::move(module_config)),
+ ptx_(ptx),
+ thunk_schedule_(std::move(thunk_schedule)),
+ assignment_(std::move(assignment)),
+ temp_buffer_offsets_(std::move(temp_buffer_offsets)) {}
+
+Status GpuExecutable::ExecuteThunks(
+ se::Stream* main_stream, const BufferAllocations& buffer_allocations,
+ HloExecutionProfile* hlo_execution_profile) {
+ bool do_profile = hlo_execution_profile != nullptr;
+ if (do_profile) {
+ LOG(WARNING) << "PROFILING: profiling is enabled";
+ }
+ HloExecutionProfiler profiler(do_profile, hlo_execution_profile, main_stream);
+
+ std::vector<std::unique_ptr<se::Stream>> sub_streams;
+  // Stream 0 is `main_stream`; substreams are numbered starting from 1.
+ for (int32 i = 1; i < thunk_schedule_->StreamCount(); ++i) {
+ auto sub_stream = MakeUnique<se::Stream>(main_stream->parent());
+ sub_stream->Init();
+ sub_streams.emplace_back(std::move(sub_stream));
+ }
+
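+  // Maps each thunk that other thunks depend on to the event recorded on its
+  // stream when it finishes, so that dependent thunks running on other
+  // streams can wait for it.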
+ std::map<const Thunk*, std::unique_ptr<se::Event>> thunk_to_finish_event;
+ for (Thunk* thunk : thunk_schedule_->TotalOrder()) {
+ TF_RETURN_IF_ERROR(thunk->Initialize(*this));
+ int32 stream_no =
+ thunk_schedule_->StreamNumberForHlo(*thunk->hlo_instruction());
+ se::Stream* stream =
+ (stream_no == 0 ? main_stream : sub_streams[stream_no - 1].get());
+
+ for (const Thunk* dependency : thunk_schedule_->DependsOn(thunk)) {
+ stream->ThenWaitFor(FindOrDie(thunk_to_finish_event, dependency).get());
+ }
+
+ profiler.StartOperation();
+ VLOG(2) << "Executing the thunk for "
+ << thunk->hlo_instruction()->ToString();
+ TF_RETURN_IF_ERROR(thunk->ExecuteOnStream(buffer_allocations, stream));
+ if (thunk_schedule_->Depended(thunk)) {
+ auto finish_event = MakeUnique<se::Event>(main_stream->parent());
+ finish_event->Init();
+ stream->ThenRecordEvent(finish_event.get());
+ thunk_to_finish_event[thunk] = std::move(finish_event);
+ }
+ profiler.FinishOperation(thunk->hlo_instruction());
+ }
+
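+  // Join all substreams back into the main stream before blocking the host.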
+ main_stream->ThenWaitFor(&sub_streams);
+ // Make sure kernels are completed before deallocating temporary buffers.
+ // TODO(b/30100571): we could potentially postpone deallocating the temp
+ // buffers until a different computation is executed.
+ if (!main_stream->BlockHostUntilDone()) {
+ return InternalError("Failed to complete all kernels launched on stream %p",
+ main_stream);
+ }
+
+ return Status::OK();
+}
+
+StatusOr<se::DeviceMemoryBase> GpuExecutable::ExecuteOnStream(
+ const ExecutableRunOptions* run_options,
+ tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments,
+ HloExecutionProfile* hlo_execution_profile) {
+ se::Stream* stream = run_options->stream();
+ DeviceMemoryAllocator* memory_allocator = run_options->allocator();
+ // This ExecuteOnStream overload should only be called if has_hybrid_result is
+ // false.
+ TF_RET_CHECK(!module_config().has_hybrid_result());
+
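+  // Register entry-computation parameters with the argument buffers supplied
+  // by the caller; buffers not registered here are allocated by the Build
+  // call below.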
+ BufferAllocations::Builder buffer_allocations_builder;
+ for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
+ ++i) {
+ const BufferAllocation& allocation = assignment_->GetAllocation(i);
+ if (allocation.is_entry_computation_parameter()) {
+ buffer_allocations_builder.RegisterBuffer(
+ i, arguments[allocation.parameter_number()]);
+ }
+ }
+ se::StreamExecutor* executor = stream->parent();
+ TF_ASSIGN_OR_RETURN(auto buffer_allocations,
+ buffer_allocations_builder.Build(
+ *assignment_, *temp_buffer_offsets_,
+ executor->device_ordinal(), memory_allocator));
+
+ TF_RETURN_IF_ERROR(
+ ExecuteThunks(stream, *buffer_allocations, hlo_execution_profile));
+
+ HloInstruction* root = hlo_module_->entry_computation()->root_instruction();
+ TF_ASSIGN_OR_RETURN(const BufferAllocation* output_allocation,
+ assignment_->GetUniqueTopLevelOutputAllocation());
+ se::DeviceMemoryBase output_buffer_address =
+ buffer_allocations->GetDeviceAddress(output_allocation->index());
+
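+  // Determine which buffers the caller keeps: for a tuple result, every
+  // buffer the tuple refers to; otherwise just the top-level output buffer.
+  // TearDown then deallocates all remaining temporary buffers.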
+ if (ShapeUtil::IsTuple(root->shape())) {
+ std::set<se::DeviceMemoryBase> referred_by_output;
+ if (GetRootPointsToSet().IsAmbiguous()) {
+ // The points-to set of the root is ambiguous so we need to examine the
+ // result data to determine which buffers are contained in the result.
+ TF_ASSIGN_OR_RETURN(
+ TransferManager * transfer_manager,
+ TransferManager::GetForPlatform(executor->platform()));
+ TF_ASSIGN_OR_RETURN(referred_by_output,
+ transfer_manager->GatherBufferPointersFromTuple(
+ executor, output_buffer_address, root->shape()));
+ } else {
+ // The points-to set of the root is unambiguous so it's known statically
+ // which buffers are in the result. Gather these buffers using the root's
+ // points-to set.
+ TF_RETURN_IF_ERROR(GetRootPointsToSet().ForEachElement(
+ [&referred_by_output, &buffer_allocations, this](
+ const ShapeIndex& /*index*/, bool /*is_leaf*/,
+ const std::vector<const LogicalBuffer*>& buffers) {
+          // The points-to set is unambiguous so the set should be a
+ // singleton. That is, we know exactly which instruction produced
+ // the array at this element.
+ CHECK_EQ(1, buffers.size());
+ HloInstruction* hlo = buffers[0]->instruction();
+ TF_ASSIGN_OR_RETURN(const BufferAllocation* allocation,
+ this->assignment_->GetUniqueAllocation(
+ hlo, buffers[0]->index()));
+ CHECK(!allocation->is_entry_computation_parameter());
+ referred_by_output.insert(
+ buffer_allocations->GetDeviceAddress(allocation->index()));
+ return Status::OK();
+ }));
+ }
+ TF_RETURN_IF_ERROR(
+ buffer_allocations->TearDown(referred_by_output, *assignment_));
+ } else {
+ // If the computation result is not a tuple, we can delete all temporary
+ // buffers that are not the output.
+ TF_RETURN_IF_ERROR(
+ buffer_allocations->TearDown({output_buffer_address}, *assignment_));
+ }
+ return output_buffer_address;
+}
+
+StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream(
+ const ExecutableRunOptions* run_options,
+ tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
+ HloExecutionProfile* hlo_execution_profile) {
+ se::Stream* stream = run_options->stream();
+ DeviceMemoryAllocator* memory_allocator = run_options->allocator();
+ // This ExecuteOnStream overload should only be called by the LocalService
+ // which sets has_hybrid_result to true.
+ TF_RET_CHECK(module_config().has_hybrid_result());
+
+ if (GetRootPointsToSet().IsAmbiguous()) {
+ return Unimplemented("Points-to set of root instruction is ambiguous");
+ }
+
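+  // As in the DeviceMemoryBase overload above, entry-computation parameters
+  // are backed by the caller's buffers; only untupled arguments are
+  // supported.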
+ BufferAllocations::Builder buffer_allocations_builder;
+ for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
+ ++i) {
+ const BufferAllocation& allocation = assignment_->GetAllocation(i);
+ if (allocation.is_entry_computation_parameter()) {
+ auto param_no = allocation.parameter_number();
+ if (ShapeUtil::IsTuple(arguments[param_no]->shape())) {
+ return Unimplemented("Tuple ShapedBuffer arguments not supported");
+ }
+ buffer_allocations_builder.RegisterBuffer(
+ i, arguments[param_no]->buffer(/*index=*/{}));
+ }
+ }
+ se::StreamExecutor* executor = stream->parent();
+ TF_ASSIGN_OR_RETURN(auto buffer_allocations,
+ buffer_allocations_builder.Build(
+ *assignment_, *temp_buffer_offsets_,
+ executor->device_ordinal(), memory_allocator));
+
+ TF_RETURN_IF_ERROR(
+ ExecuteThunks(stream, *buffer_allocations, hlo_execution_profile));
+
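+  // Wrap the result in a ShapedBuffer that mirrors the shape of the root
+  // instruction; each leaf entry is pointed at the corresponding device
+  // allocation below.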
+ HloInstruction* root = hlo_module_->entry_computation()->root_instruction();
+ auto device_ordinal = executor->device_ordinal();
+ TF_ASSIGN_OR_RETURN(auto shaped_buffer,
+ ShapedBuffer::MakeShapedBuffer(
+ root->shape(), executor->platform(), device_ordinal));
+
+  // Copy the DeviceMemoryBase values that contain the result array(s) into
+  // their respective locations in the ShapedBuffer.
+ std::set<se::DeviceMemoryBase> buffers_in_result;
+ TF_RETURN_IF_ERROR(
+ shaped_buffer->mutable_shape_index_to_buffer_entry()
+ ->ForEachMutableElement(
+ [&buffer_allocations, &buffers_in_result, &shaped_buffer, this](
+ const ShapeIndex& index, bool is_leaf, size_t* buffer_entry) {
+ if (is_leaf) {
+ const std::vector<const LogicalBuffer*>& sources =
+ this->GetRootPointsToSet().element(index);
+                // The points-to set is unambiguous so the set should be a
+ // singleton. That is, we know exactly which instruction
+ // produced the array at this element.
+ CHECK_EQ(1, sources.size());
+ auto src_hlo = sources[0]->instruction();
+
+ VLOG(4) << "Looking at: " << sources[0];
+
+ // The source instruction should have a non-parameter buffer
+ // assigned.
+ TF_ASSIGN_OR_RETURN(const BufferAllocation* allocation,
+ this->assignment_->GetUniqueAllocation(
+ src_hlo, sources[0]->index()));
+ CHECK(!allocation->is_entry_computation_parameter());
+
+ perftools::gputools::DeviceMemoryBase src_base =
+ buffer_allocations->GetDeviceAddress(allocation->index());
+ CHECK(!src_base.is_null() || src_base.size() == 0);
+ shaped_buffer->mutable_buffers()->push_back(src_base);
+ *buffer_entry = shaped_buffer->mutable_buffers()->size() - 1;
+
+ buffers_in_result.insert(src_base);
+ }
+ return Status::OK();
+ }));
+ TF_RETURN_IF_ERROR(
+ buffer_allocations->TearDown(buffers_in_result, *assignment_));
+
+ return std::move(shaped_buffer);
+}
+
+Status GpuExecutable::ExecuteOnStream(
+ const ExecutableRunOptions* run_options,
+ tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
+ ShapedBuffer* result_buffer, HloExecutionProfile* hlo_execution_profile) {
+ se::Stream* stream = run_options->stream();
+ DeviceMemoryAllocator* memory_allocator = run_options->allocator();
+ // This ExecuteOnStream overload should only be called by the LocalService
+ // which sets has_hybrid_result to true.
+ TF_RET_CHECK(module_config().has_hybrid_result());
+
+ // Every array element in the result of the computation must be unambiguously
+ // produced by a single instruction.
+ // This ensures that the buffers inside result_buffer can be assigned without
+ // conflict to the respective instructions because there is a one-to-one
+ // correspondence between hlo instructions and array buffers in the result.
+ if (GetRootPointsToSet().IsAmbiguous()) {
+ return Unimplemented(
+ "Points-to set of root instruction is ambiguous or not distinct");
+ }
+
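+  // The caller-provided result buffer must already have the shape of the
+  // computation's result.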
+ DCHECK(ShapeUtil::Compatible(result_buffer->shape(), result_shape()));
+
+ BufferAllocations::Builder buffer_allocations_builder;
+ for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
+ ++i) {
+ const BufferAllocation& allocation = assignment_->GetAllocation(i);
+ if (allocation.is_entry_computation_parameter()) {
+ auto param_no = allocation.parameter_number();
+ if (ShapeUtil::IsTuple(arguments[param_no]->shape())) {
+ return Unimplemented("Tuple ShapedBuffer arguments not supported");
+ }
+ buffer_allocations_builder.RegisterBuffer(
+ i, arguments[param_no]->buffer(/*index=*/{}));
+ }
+ }
+
+ // If two tuple elements point to the same buffer, one of the results in the
+ // result buffer is considered the canonical location while the other result
+ // points to it (instead of, say, making a copy of the result).
+ // buffer_index_to_shape_index maps a buffer index to its canonical location
+ // in the result buffer.
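+  // For example, if tuple elements {0} and {1} alias the same allocation, the
+  // element visited first becomes the canonical entry and the other element's
+  // buffer_entry is redirected to it rather than copied.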
+ std::unordered_map<BufferAllocation::Index, size_t>
+ buffer_index_to_shape_index;
+
+  // Register the DeviceMemoryBase values in result_buffer under their
+  // corresponding buffer indices. These buffers will not be allocated in the
+  // call to BufferAllocationsBuilder::Build.
+ std::set<se::DeviceMemoryBase> buffers_in_result;
+ TF_RETURN_IF_ERROR(
+ result_buffer->mutable_shape_index_to_buffer_entry()
+ ->ForEachMutableElement(
+ [&buffer_allocations_builder, &buffers_in_result,
+ &buffer_index_to_shape_index, result_buffer, this](
+ const ShapeIndex& index, bool is_leaf, size_t* buffer_entry) {
+ if (is_leaf) {
+ const std::vector<const LogicalBuffer*>& sources =
+ this->GetRootPointsToSet().element(index);
+                // The points-to set is unambiguous so the set should be a
+ // singleton. That is, we know exactly which instruction
+ // produced the array at this element.
+ CHECK_EQ(1, sources.size());
+ auto src_hlo = sources[0]->instruction();
+
+ VLOG(4) << "Looking at: " << sources[0];
+
+ // The source instruction should have a non-parameter buffer
+ // assigned.
+ TF_ASSIGN_OR_RETURN(const BufferAllocation* allocation,
+ this->assignment_->GetUniqueAllocation(
+ src_hlo, sources[0]->index()));
+ CHECK(!allocation->is_entry_computation_parameter());
+
+ auto insert_result = buffer_index_to_shape_index.emplace(
+ allocation->index(), *buffer_entry);
+ if (insert_result.second) {
+ // The points-to set is distinct so this buffer should not
+ // have been assigned in a previous invocation of this
+ // lambda.
+ perftools::gputools::DeviceMemoryBase memory_base =
+ result_buffer->buffer(index);
+ CHECK(!memory_base.is_null());
+ buffer_allocations_builder.RegisterBuffer(
+ allocation->index(), memory_base);
+ buffers_in_result.insert(memory_base);
+ } else {
+                    // Record the fact that this tuple element is identical
+                    // to some prior result.
+ *buffer_entry = insert_result.first->second;
+ }
+ }
+ return Status::OK();
+ }));
+
+ se::StreamExecutor* executor = stream->parent();
+ auto device_ordinal = executor->device_ordinal();
+ TF_ASSIGN_OR_RETURN(
+ auto buffer_allocations,
+ buffer_allocations_builder.Build(*assignment_, *temp_buffer_offsets_,
+ device_ordinal, memory_allocator));
+
+ TF_RETURN_IF_ERROR(
+ ExecuteThunks(stream, *buffer_allocations, hlo_execution_profile));
+
+ return buffer_allocations->TearDown(buffers_in_result, *assignment_);
+}
+
+StatusOr<se::DeviceMemoryBase> GpuExecutable::ExecuteAsyncOnStream(
+ const ExecutableRunOptions* run_options,
+ tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments) {
+ // TODO(b/30671675): Implement asynchronous execution mode.
+ return Unimplemented(
+ "Asynchronous execution on stream is not yet supported on GPU.");
+}
+
+const PointsToSet& GpuExecutable::GetRootPointsToSet() const {
+ return assignment_->points_to_analysis().GetPointsToSet(
+ module().entry_computation()->root_instruction());
+}
+
+} // namespace gpu
+} // namespace xla