Diffstat (limited to 'tensorflow/compiler/xla/service/gpu/gpu_executable.cc')
-rw-r--r--  tensorflow/compiler/xla/service/gpu/gpu_executable.cc | 454
1 file changed, 454 insertions, 0 deletions
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
new file mode 100644
index 0000000000..f654ffd22d
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -0,0 +1,454 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
+
+#include <set>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/compiler/xla/map_util.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/logical_buffer.h"
+#include "tensorflow/compiler/xla/service/shaped_buffer.h"
+#include "tensorflow/compiler/xla/service/transfer_manager.h"
+#include "tensorflow/compiler/xla/shape_tree.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace se = ::perftools::gputools;
+
+namespace xla {
+namespace gpu {
+namespace {
+
+// A helper class for profiling HLO in the course of GPU program execution.
+// All of the profiling is guarded internally, to avoid the caller needing to
+// have lots of conditionals sprinkled around.
+class HloExecutionProfiler {
+ public:
+  // If profiling is enabled, start an execution timer running.
+  explicit HloExecutionProfiler(bool do_profile, HloExecutionProfile* profile,
+                                se::Stream* stream)
+      : do_profile_(do_profile), profile_(profile), stream_(stream) {
+    if (do_profile_) {
+      clock_rate_ghz_ =
+          stream->parent()->GetDeviceDescription().clock_rate_ghz();
+      execution_timer_.reset(new se::Timer(stream->parent()));
+      per_op_timer_.reset(new se::Timer(stream->parent()));
+      stream->InitTimer(execution_timer_.get())
+          .ThenStartTimer(execution_timer_.get());
+      stream->InitTimer(per_op_timer_.get());
+    }
+  }
+
+  // If profiling is enabled, sets the total cycle count on the profile from
+  // the execution timer.
+  ~HloExecutionProfiler() {
+    if (do_profile_) {
+      stream_->ThenStopTimer(execution_timer_.get());
+      stream_->BlockHostUntilDone();
+      profile_->set_total_cycles_executed(execution_timer_->Nanoseconds() *
+                                          clock_rate_ghz_);
+    }
+  }
+
+  // If profiling is enabled, starts the per-operation timer.
+  void StartOperation() {
+    if (do_profile_) {
+      stream_->ThenStartTimer(per_op_timer_.get());
+    }
+  }
+
+  // If profiling is enabled, stops the per-operation timer and records the
+  // time that the hlo_instruction took to execute in the profile.
+  void FinishOperation(const HloInstruction* hlo_instruction) {
+    if (do_profile_) {
+      stream_->ThenStopTimer(per_op_timer_.get());
+      stream_->BlockHostUntilDone();
+      profile_->AddProfileResult(
+          hlo_instruction, per_op_timer_->Nanoseconds() * clock_rate_ghz_);
+    }
+  }
+
+ private:
+  const bool do_profile_;
+  double clock_rate_ghz_;
+  HloExecutionProfile* profile_;
+  se::Stream* stream_;
+  std::unique_ptr<se::Timer> execution_timer_;
+  std::unique_ptr<se::Timer> per_op_timer_;
+};
+
+}  // namespace
+
+// Implementation note: HLO profiling is always enabled for GPU executables,
+// since we can use timers around thunks.
+GpuExecutable::GpuExecutable(
+    tensorflow::StringPiece ptx, std::unique_ptr<ThunkSchedule> thunk_schedule,
+    std::unique_ptr<HloModule> hlo_module,
+    std::unique_ptr<HloModuleConfig> module_config,
+    std::unique_ptr<BufferAssignment> assignment,
+    std::unique_ptr<TempBufferOffsets> temp_buffer_offsets)
+    : Executable(std::move(hlo_module), std::move(module_config)),
+      ptx_(ptx),
+      thunk_schedule_(std::move(thunk_schedule)),
+      assignment_(std::move(assignment)),
+      temp_buffer_offsets_(std::move(temp_buffer_offsets)) {}
+
+Status GpuExecutable::ExecuteThunks(
+    se::Stream* main_stream, const BufferAllocations& buffer_allocations,
+    HloExecutionProfile* hlo_execution_profile) {
+  bool do_profile = hlo_execution_profile != nullptr;
+  if (do_profile) {
+    LOG(WARNING) << "PROFILING: profiling is enabled";
+  }
+  HloExecutionProfiler profiler(do_profile, hlo_execution_profile,
+                                main_stream);
+
+  std::vector<std::unique_ptr<se::Stream>> sub_streams;
+  // Stream 0 indicates `main_stream` and substreams start from stream 1.
+  for (int32 i = 1; i < thunk_schedule_->StreamCount(); ++i) {
+    auto sub_stream = MakeUnique<se::Stream>(main_stream->parent());
+    sub_stream->Init();
+    sub_streams.emplace_back(std::move(sub_stream));
+  }
+
+  std::map<const Thunk*, std::unique_ptr<se::Event>> thunk_to_finish_event;
+  for (Thunk* thunk : thunk_schedule_->TotalOrder()) {
+    TF_RETURN_IF_ERROR(thunk->Initialize(*this));
+    int32 stream_no =
+        thunk_schedule_->StreamNumberForHlo(*thunk->hlo_instruction());
+    se::Stream* stream =
+        (stream_no == 0 ? main_stream : sub_streams[stream_no - 1].get());
+
+    for (const Thunk* dependency : thunk_schedule_->DependsOn(thunk)) {
+      stream->ThenWaitFor(FindOrDie(thunk_to_finish_event, dependency).get());
+    }
+
+    profiler.StartOperation();
+    VLOG(2) << "Executing the thunk for "
+            << thunk->hlo_instruction()->ToString();
+    TF_RETURN_IF_ERROR(thunk->ExecuteOnStream(buffer_allocations, stream));
+    if (thunk_schedule_->Depended(thunk)) {
+      auto finish_event = MakeUnique<se::Event>(main_stream->parent());
+      finish_event->Init();
+      stream->ThenRecordEvent(finish_event.get());
+      thunk_to_finish_event[thunk] = std::move(finish_event);
+    }
+    profiler.FinishOperation(thunk->hlo_instruction());
+  }
+
+  main_stream->ThenWaitFor(&sub_streams);
+  // Make sure kernels are completed before deallocating temporary buffers.
+  // TODO(b/30100571): we could potentially postpone deallocating the temp
+  // buffers until a different computation is executed.
+  if (!main_stream->BlockHostUntilDone()) {
+    return InternalError("Failed to complete all kernels launched on stream %p",
+                         main_stream);
+  }
+
+  return Status::OK();
+}
+
+StatusOr<se::DeviceMemoryBase> GpuExecutable::ExecuteOnStream(
+    const ExecutableRunOptions* run_options,
+    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments,
+    HloExecutionProfile* hlo_execution_profile) {
+  se::Stream* stream = run_options->stream();
+  DeviceMemoryAllocator* memory_allocator = run_options->allocator();
+  // This ExecuteOnStream overload should only be called if has_hybrid_result
+  // is false.
+  TF_RET_CHECK(!module_config().has_hybrid_result());
+
+  BufferAllocations::Builder buffer_allocations_builder;
+  for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
+       ++i) {
+    const BufferAllocation& allocation = assignment_->GetAllocation(i);
+    if (allocation.is_entry_computation_parameter()) {
+      buffer_allocations_builder.RegisterBuffer(
+          i, arguments[allocation.parameter_number()]);
+    }
+  }
+  se::StreamExecutor* executor = stream->parent();
+  TF_ASSIGN_OR_RETURN(auto buffer_allocations,
+                      buffer_allocations_builder.Build(
+                          *assignment_, *temp_buffer_offsets_,
+                          executor->device_ordinal(), memory_allocator));
+
+  TF_RETURN_IF_ERROR(
+      ExecuteThunks(stream, *buffer_allocations, hlo_execution_profile));
+
+  HloInstruction* root = hlo_module_->entry_computation()->root_instruction();
+  TF_ASSIGN_OR_RETURN(const BufferAllocation* output_allocation,
+                      assignment_->GetUniqueTopLevelOutputAllocation());
+  se::DeviceMemoryBase output_buffer_address =
+      buffer_allocations->GetDeviceAddress(output_allocation->index());
+
+  if (ShapeUtil::IsTuple(root->shape())) {
+    std::set<se::DeviceMemoryBase> referred_by_output;
+    if (GetRootPointsToSet().IsAmbiguous()) {
+      // The points-to set of the root is ambiguous so we need to examine the
+      // result data to determine which buffers are contained in the result.
+      TF_ASSIGN_OR_RETURN(
+          TransferManager * transfer_manager,
+          TransferManager::GetForPlatform(executor->platform()));
+      TF_ASSIGN_OR_RETURN(referred_by_output,
+                          transfer_manager->GatherBufferPointersFromTuple(
+                              executor, output_buffer_address, root->shape()));
+    } else {
+      // The points-to set of the root is unambiguous so it's known statically
+      // which buffers are in the result. Gather these buffers using the
+      // root's points-to set.
+      TF_RETURN_IF_ERROR(GetRootPointsToSet().ForEachElement(
+          [&referred_by_output, &buffer_allocations, this](
+              const ShapeIndex& /*index*/, bool /*is_leaf*/,
+              const std::vector<const LogicalBuffer*>& buffers) {
+            // The points-to set is unambiguous so the set should be a
+            // singleton. That is, we know exactly which instruction produced
+            // the array at this element.
+            CHECK_EQ(1, buffers.size());
+            HloInstruction* hlo = buffers[0]->instruction();
+            TF_ASSIGN_OR_RETURN(const BufferAllocation* allocation,
+                                this->assignment_->GetUniqueAllocation(
+                                    hlo, buffers[0]->index()));
+            CHECK(!allocation->is_entry_computation_parameter());
+            referred_by_output.insert(
+                buffer_allocations->GetDeviceAddress(allocation->index()));
+            return Status::OK();
+          }));
+    }
+    TF_RETURN_IF_ERROR(
+        buffer_allocations->TearDown(referred_by_output, *assignment_));
+  } else {
+    // If the computation result is not a tuple, we can delete all temporary
+    // buffers that are not the output.
+    TF_RETURN_IF_ERROR(
+        buffer_allocations->TearDown({output_buffer_address}, *assignment_));
+  }
+  return output_buffer_address;
+}
+
+StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream(
+    const ExecutableRunOptions* run_options,
+    tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
+    HloExecutionProfile* hlo_execution_profile) {
+  se::Stream* stream = run_options->stream();
+  DeviceMemoryAllocator* memory_allocator = run_options->allocator();
+  // This ExecuteOnStream overload should only be called by the LocalService
+  // which sets has_hybrid_result to true.
+  TF_RET_CHECK(module_config().has_hybrid_result());
+
+  if (GetRootPointsToSet().IsAmbiguous()) {
+    return Unimplemented("Points-to set of root instruction is ambiguous");
+  }
+
+  BufferAllocations::Builder buffer_allocations_builder;
+  for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
+       ++i) {
+    const BufferAllocation& allocation = assignment_->GetAllocation(i);
+    if (allocation.is_entry_computation_parameter()) {
+      auto param_no = allocation.parameter_number();
+      if (ShapeUtil::IsTuple(arguments[param_no]->shape())) {
+        return Unimplemented("Tuple ShapedBuffer arguments not supported");
+      }
+      buffer_allocations_builder.RegisterBuffer(
+          i, arguments[param_no]->buffer(/*index=*/{}));
+    }
+  }
+  se::StreamExecutor* executor = stream->parent();
+  TF_ASSIGN_OR_RETURN(auto buffer_allocations,
+                      buffer_allocations_builder.Build(
+                          *assignment_, *temp_buffer_offsets_,
+                          executor->device_ordinal(), memory_allocator));
+
+  TF_RETURN_IF_ERROR(
+      ExecuteThunks(stream, *buffer_allocations, hlo_execution_profile));
+
+  HloInstruction* root = hlo_module_->entry_computation()->root_instruction();
+  auto device_ordinal = executor->device_ordinal();
+  TF_ASSIGN_OR_RETURN(auto shaped_buffer,
+                      ShapedBuffer::MakeShapedBuffer(
+                          root->shape(), executor->platform(), device_ordinal));
+
+  // Copy DeviceMemoryBase values which contain the array(s) of the result
+  // into the respective location in ShapedBuffer.
+  std::set<se::DeviceMemoryBase> buffers_in_result;
+  TF_RETURN_IF_ERROR(
+      shaped_buffer->mutable_shape_index_to_buffer_entry()
+          ->ForEachMutableElement(
+              [&buffer_allocations, &buffers_in_result, &shaped_buffer, this](
+                  const ShapeIndex& index, bool is_leaf,
+                  size_t* buffer_entry) {
+                if (is_leaf) {
+                  const std::vector<const LogicalBuffer*>& sources =
+                      this->GetRootPointsToSet().element(index);
+                  // The points-to set is unambiguous so the set should be a
+                  // singleton. That is, we know exactly which instruction
+                  // produced the array at this element.
+                  CHECK_EQ(1, sources.size());
+                  auto src_hlo = sources[0]->instruction();
+
+                  VLOG(4) << "Looking at: " << sources[0];
+
+                  // The source instruction should have a non-parameter buffer
+                  // assigned.
+                  TF_ASSIGN_OR_RETURN(const BufferAllocation* allocation,
+                                      this->assignment_->GetUniqueAllocation(
+                                          src_hlo, sources[0]->index()));
+                  CHECK(!allocation->is_entry_computation_parameter());
+
+                  perftools::gputools::DeviceMemoryBase src_base =
+                      buffer_allocations->GetDeviceAddress(
+                          allocation->index());
+                  CHECK(!src_base.is_null() || src_base.size() == 0);
+                  shaped_buffer->mutable_buffers()->push_back(src_base);
+                  *buffer_entry =
+                      shaped_buffer->mutable_buffers()->size() - 1;
+
+                  buffers_in_result.insert(src_base);
+                }
+                return Status::OK();
+              }));
+  TF_RETURN_IF_ERROR(
+      buffer_allocations->TearDown(buffers_in_result, *assignment_));
+
+  return std::move(shaped_buffer);
+}
+
+Status GpuExecutable::ExecuteOnStream(
+    const ExecutableRunOptions* run_options,
+    tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
+    ShapedBuffer* result_buffer, HloExecutionProfile* hlo_execution_profile) {
+  se::Stream* stream = run_options->stream();
+  DeviceMemoryAllocator* memory_allocator = run_options->allocator();
+  // This ExecuteOnStream overload should only be called by the LocalService
+  // which sets has_hybrid_result to true.
+  TF_RET_CHECK(module_config().has_hybrid_result());
+
+  // Every array element in the result of the computation must be
+  // unambiguously produced by a single instruction. This ensures that the
+  // buffers inside result_buffer can be assigned without conflict to the
+  // respective instructions because there is a one-to-one correspondence
+  // between hlo instructions and array buffers in the result.
+  if (GetRootPointsToSet().IsAmbiguous()) {
+    return Unimplemented(
+        "Points-to set of root instruction is ambiguous or not distinct");
+  }
+
+  DCHECK(ShapeUtil::Compatible(result_buffer->shape(), result_shape()));
+
+  BufferAllocations::Builder buffer_allocations_builder;
+  for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
+       ++i) {
+    const BufferAllocation& allocation = assignment_->GetAllocation(i);
+    if (allocation.is_entry_computation_parameter()) {
+      auto param_no = allocation.parameter_number();
+      if (ShapeUtil::IsTuple(arguments[param_no]->shape())) {
+        return Unimplemented("Tuple ShapedBuffer arguments not supported");
+      }
+      buffer_allocations_builder.RegisterBuffer(
+          i, arguments[param_no]->buffer(/*index=*/{}));
+    }
+  }
+
+  // If two tuple elements point to the same buffer, one of the results in the
+  // result buffer is considered the canonical location while the other result
+  // points to it (instead of, say, making a copy of the result).
+  // buffer_index_to_shape_index maps a buffer index to its canonical location
+  // in the result buffer.
+  std::unordered_map<BufferAllocation::Index, size_t>
+      buffer_index_to_shape_index;
+
+  // Register DeviceMemoryBase values in result_buffer to their corresponding
+  // buffer indices. These buffers will not be allocated in the call to
+  // BufferAllocationsBuilder::Build.
+  std::set<se::DeviceMemoryBase> buffers_in_result;
+  TF_RETURN_IF_ERROR(
+      result_buffer->mutable_shape_index_to_buffer_entry()
+          ->ForEachMutableElement(
+              [&buffer_allocations_builder, &buffers_in_result,
+               &buffer_index_to_shape_index, result_buffer, this](
+                  const ShapeIndex& index, bool is_leaf,
+                  size_t* buffer_entry) {
+                if (is_leaf) {
+                  const std::vector<const LogicalBuffer*>& sources =
+                      this->GetRootPointsToSet().element(index);
+                  // The points-to set is unambiguous so the set should be a
+                  // singleton. That is, we know exactly which instruction
+                  // produced the array at this element.
+                  CHECK_EQ(1, sources.size());
+                  auto src_hlo = sources[0]->instruction();
+
+                  VLOG(4) << "Looking at: " << sources[0];
+
+                  // The source instruction should have a non-parameter buffer
+                  // assigned.
+                  TF_ASSIGN_OR_RETURN(const BufferAllocation* allocation,
+                                      this->assignment_->GetUniqueAllocation(
+                                          src_hlo, sources[0]->index()));
+                  CHECK(!allocation->is_entry_computation_parameter());
+
+                  auto insert_result = buffer_index_to_shape_index.emplace(
+                      allocation->index(), *buffer_entry);
+                  if (insert_result.second) {
+                    // The points-to set is distinct so this buffer should not
+                    // have been assigned in a previous invocation of this
+                    // lambda.
+                    perftools::gputools::DeviceMemoryBase memory_base =
+                        result_buffer->buffer(index);
+                    CHECK(!memory_base.is_null());
+                    buffer_allocations_builder.RegisterBuffer(
+                        allocation->index(), memory_base);
+                    buffers_in_result.insert(memory_base);
+                  } else {
+                    // Record the fact that this tuple element is identical to
+                    // some prior result.
+                    *buffer_entry = insert_result.first->second;
+                  }
+                }
+                return Status::OK();
+              }));
+
+  se::StreamExecutor* executor = stream->parent();
+  auto device_ordinal = executor->device_ordinal();
+  TF_ASSIGN_OR_RETURN(
+      auto buffer_allocations,
+      buffer_allocations_builder.Build(*assignment_, *temp_buffer_offsets_,
+                                       device_ordinal, memory_allocator));
+
+  TF_RETURN_IF_ERROR(
+      ExecuteThunks(stream, *buffer_allocations, hlo_execution_profile));
+
+  return buffer_allocations->TearDown(buffers_in_result, *assignment_);
+}
+
+StatusOr<se::DeviceMemoryBase> GpuExecutable::ExecuteAsyncOnStream(
+    const ExecutableRunOptions* run_options,
+    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments) {
+  // TODO(b/30671675): Implement asynchronous execution mode.
+  return Unimplemented(
+      "Asynchronous execution on stream is not yet supported on GPU.");
+}
+
+const PointsToSet& GpuExecutable::GetRootPointsToSet() const {
+  return assignment_->points_to_analysis().GetPointsToSet(
+      module().entry_computation()->root_instruction());
+}
+
+}  // namespace gpu
+}  // namespace xla
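
A note on the profiling pattern above: HloExecutionProfiler is an RAII guard that keeps every do_profile_ conditional inside the helper, so the thunk loop can call StartOperation/FinishOperation unconditionally. Below is a minimal standalone sketch of the same guard pattern written against the CUDA runtime event API rather than StreamExecutor's se::Timer; ScopedGpuTimer and all other names are hypothetical illustrations, not XLA or StreamExecutor API.

// scoped_gpu_timer.cc -- illustrative sketch only, assuming the CUDA runtime.
#include <cuda_runtime.h>

class ScopedGpuTimer {
 public:
  // When `enabled` is false every member is a no-op, so the caller needs no
  // conditionals -- the same guarding HloExecutionProfiler provides.
  ScopedGpuTimer(bool enabled, cudaStream_t stream, float* out_ms)
      : enabled_(enabled), stream_(stream), out_ms_(out_ms) {
    if (enabled_) {
      cudaEventCreate(&start_);
      cudaEventCreate(&stop_);
      cudaEventRecord(start_, stream_);  // analogous to ThenStartTimer
    }
  }

  ~ScopedGpuTimer() {
    if (enabled_) {
      cudaEventRecord(stop_, stream_);  // analogous to ThenStopTimer
      cudaEventSynchronize(stop_);      // analogous to BlockHostUntilDone
      cudaEventElapsedTime(out_ms_, start_, stop_);
      cudaEventDestroy(start_);
      cudaEventDestroy(stop_);
    }
  }

 private:
  const bool enabled_;
  cudaStream_t stream_;
  float* out_ms_;
  cudaEvent_t start_{}, stop_{};
};

// Usage: scope the timer around the work enqueued on `stream`:
//   { ScopedGpuTimer timer(profiling_on, stream, &elapsed_ms); Launch(stream); }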
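
ExecuteThunks orders work across streams with events: a thunk that others depend on records a finish event on its stream (ThenRecordEvent), and each dependent stream waits on that event (ThenWaitFor) before executing. A minimal sketch of the raw CUDA runtime calls this pattern corresponds to on NVIDIA GPUs; the stream and event names are made up for illustration.

// stream_ordering.cc -- illustrative sketch only, assuming the CUDA runtime.
#include <cuda_runtime.h>

void OrderAcrossStreams() {
  cudaStream_t producer, consumer;
  cudaStreamCreate(&producer);
  cudaStreamCreate(&consumer);

  cudaEvent_t finish_event;
  cudaEventCreateWithFlags(&finish_event, cudaEventDisableTiming);

  // ... enqueue the producing kernel on `producer` here ...
  cudaEventRecord(finish_event, producer);         // cf. ThenRecordEvent
  cudaStreamWaitEvent(consumer, finish_event, 0);  // cf. ThenWaitFor
  // ... kernels enqueued on `consumer` now run only after the event fires ...

  cudaEventDestroy(finish_event);
  cudaStreamDestroy(producer);
  cudaStreamDestroy(consumer);
}

The wait is enqueued on the device, not the host, which is why ExecuteThunks only blocks the host once, at the final BlockHostUntilDone.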
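
The result_buffer overload of ExecuteOnStream deduplicates tuple leaves that alias the same allocation via unordered_map::emplace: the first leaf to claim an allocation index becomes the canonical entry, and every later leaf is redirected to it instead of being copied. A self-contained sketch of just that bookkeeping, with simplified, hypothetical types:

// canonical_entries.cc -- illustrative sketch only; types are simplified.
#include <cstddef>
#include <unordered_map>
#include <vector>

using BufferIndex = int;  // stands in for BufferAllocation::Index

// leaf_allocations[i] is the allocation backing result leaf i; on return,
// entries[i] is the canonical leaf whose buffer holds that allocation.
std::vector<std::size_t> AssignCanonicalEntries(
    const std::vector<BufferIndex>& leaf_allocations) {
  std::unordered_map<BufferIndex, std::size_t> canonical;
  std::vector<std::size_t> entries(leaf_allocations.size());
  for (std::size_t leaf = 0; leaf < leaf_allocations.size(); ++leaf) {
    // emplace is a no-op if the key already exists; `first` always points at
    // the surviving (canonical) entry, `second` says whether we inserted.
    auto insert_result = canonical.emplace(leaf_allocations[leaf], leaf);
    entries[leaf] = insert_result.first->second;
  }
  return entries;
}

// E.g. leaf_allocations = {7, 3, 7} yields entries = {0, 1, 0}: the third
// leaf aliases the first rather than getting its own buffer.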