diff options
14 files changed, 333 insertions, 75 deletions
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index bd10f1dc5c..cbf0a0ddd5 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -1151,6 +1151,7 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", ], ) diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc index e1107d9782..ea1dce7124 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.cc +++ b/tensorflow/compiler/xla/service/buffer_assignment.cc @@ -28,7 +28,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/heap_simulator.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" -#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" @@ -74,6 +73,9 @@ void BufferAllocation::AddAssignment(const LogicalBuffer& buffer, int64 offset, << " offset out of range"; CHECK_LE(offset + size, size_) << "LogicalBuffer " << buffer << " size out of range"; + CHECK_EQ(buffer.color(), color()) + << "Buffer color " << buffer.color() + << " does not match allocation color " << color() << "."; OffsetSize offset_size; offset_size.offset = offset; offset_size.size = size; @@ -86,6 +88,7 @@ BufferAllocationProto BufferAllocation::ToProto() const { proto.set_size(size_); proto.set_is_thread_local(is_thread_local_); proto.set_is_reusable(is_reusable_); + proto.set_color(color_.value()); if (is_entry_computation_parameter_) { proto.set_is_entry_computation_parameter(true); proto.set_parameter_number(parameter_number_); @@ -105,6 +108,9 @@ string BufferAllocation::ToString() const { tensorflow::strings::StrAppend( &output, 
tensorflow::strings::Printf("allocation %lld: %p, size %lld", index_, this, size())); + if (color().value() != 0) { + tensorflow::strings::StrAppend(&output, ", color ", color().value()); + } if (is_entry_computation_parameter()) { tensorflow::strings::StrAppend(&output, ", parameter ", parameter_number()); } @@ -248,11 +254,11 @@ BufferAssignment::GetUniqueTopLevelOutputSlice() const { module_->entry_computation()->root_instruction()); } -BufferAllocation* BufferAssignment::NewEmptyAllocation(int64 size, - bool is_thread_local, - bool is_reusable) { +BufferAllocation* BufferAssignment::NewEmptyAllocation( + int64 size, bool is_thread_local, bool is_reusable, + LogicalBuffer::Color color) { BufferAllocation::Index index = allocations_.size(); - allocations_.emplace_back(index, size, is_thread_local, is_reusable); + allocations_.emplace_back(index, size, is_thread_local, is_reusable, color); BufferAllocation* allocation = &allocations_.back(); return allocation; } @@ -262,7 +268,7 @@ BufferAllocation* BufferAssignment::NewAllocation(const LogicalBuffer& buffer, bool is_thread_local, bool is_reusable) { BufferAllocation* allocation = - NewEmptyAllocation(size, is_thread_local, is_reusable); + NewEmptyAllocation(size, is_thread_local, is_reusable, buffer.color()); AddAssignment(allocation, buffer, /*offset=*/0, size); return allocation; } @@ -282,33 +288,55 @@ void BufferAssignment::AddAssignment(BufferAllocation* allocation, allocation_index_for_buffer_[&buffer] = allocation->index(); } -// Combines allocations of temporary buffers into one big BufferAllocation. +// Combines allocations of temporary buffers of the same color into one big +// BufferAllocation. void BufferAssignment::CombineTempAllocations() { + FlatMap<LogicalBuffer::Color, BufferAllocation, LogicalBuffer::Color::Hasher> + combined_allocation_map; + // Move all temp allocations into a single run at the end of the allocations - // vector, and combine them into the first allocation of the run. 
+ // vector. const auto first_temp_it = std::partition(allocations_.begin(), allocations_.end(), [](const BufferAllocation& allocation) { return !allocation.IsPreallocatedTempBuffer(); }); + + // Walk over the run of temp allocations, collecting the allocations belonging + // to the same color. if (first_temp_it != allocations_.end()) { - BufferAllocation* combined = &*first_temp_it; - const auto second_temp_it = std::next(first_temp_it); - for (auto it = second_temp_it; it != allocations_.end(); ++it) { + for (auto it = first_temp_it; it != allocations_.end(); ++it) { + const BufferAllocation& temp_allocation = *it; + LogicalBuffer::Color color = temp_allocation.color(); + auto combined_it = combined_allocation_map.find(color); + if (combined_it == combined_allocation_map.end()) { + // We have found the first temp allocation of this color. Collect + // the other temp allocations of the same color into it. + combined_allocation_map.emplace(color, temp_allocation); + continue; + } + + auto* combined_allocation = &combined_it->second; // Each temp allocation is placed end-to-end, accounting for alignment. // The offset of each buffer in the combined allocation is computed from // the base offset of the allocation. 
- const int64 base = RoundUpToNearest(combined->size(), alignment_); - combined->set_size(base + it->size()); - for (const auto& buffer_offset_size : it->assigned_buffers_) { + const int64 base = + RoundUpToNearest(combined_allocation->size(), alignment_); + combined_allocation->set_size(base + temp_allocation.size()); + for (const auto& buffer_offset_size : temp_allocation.assigned_buffers_) { const LogicalBuffer* buffer = buffer_offset_size.first; const int64 offset = buffer_offset_size.second.offset; const int64 size = buffer_offset_size.second.size; - combined->AddAssignment(*buffer, base + offset, size); + combined_allocation->AddAssignment(*buffer, base + offset, size); } } - allocations_.erase(second_temp_it, allocations_.end()); - temp_allocation_ = combined; + // Replace all existing temporary allocations with the new combined + // allocations. + allocations_.erase(first_temp_it, allocations_.end()); + for (auto& combined : combined_allocation_map) { + allocations_.push_back(combined.second); + temp_allocation_total_size_ += combined.second.size(); + } } // Update allocation indices to their new positions. 
@@ -548,8 +576,9 @@ Status GatherComputationsByAllocationType( StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::Run( const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering, LogicalBuffer::SizeFunction buffer_size, int64 alignment, - bool allow_input_output_aliasing) { - BufferAssigner assigner(alignment, allow_input_output_aliasing); + bool allow_input_output_aliasing, TuplePointsToAnalysis::Colorer colorer) { + BufferAssigner assigner(alignment, allow_input_output_aliasing, + std::move(colorer)); return assigner.CreateAssignment(module, std::move(hlo_ordering), std::move(buffer_size)); } @@ -564,6 +593,12 @@ bool BufferAssigner::MaybeAssignBuffer(BufferAllocation* allocation, VLOG(4) << "Trying to assign " << buffer << " to allocation: " << *allocation; + if (buffer.color() != allocation->color()) { + VLOG(4) << "Can't assign: buffer has color " << buffer.color() + << " and allocation has color " << allocation->color() << "."; + return false; + } + if (buffer_size(buffer) > allocation->size()) { VLOG(4) << "Can't assign: buffer is larger than allocation (" << buffer_size(buffer) << " > " << allocation->size() << ")"; @@ -863,6 +898,19 @@ Status BufferAssigner::AssignBuffersForComputation( return Status::OK(); } +FlatMap<LogicalBuffer::Color, FlatSet<const LogicalBuffer*>, + LogicalBuffer::Color::Hasher> +BufferAssigner::SplitBuffersByColor( + const FlatSet<const LogicalBuffer*>& buffers) { + FlatMap<LogicalBuffer::Color, FlatSet<const LogicalBuffer*>, + LogicalBuffer::Color::Hasher> + color_map; + for (auto buffer : buffers) { + color_map[buffer->color()].insert(buffer); + } + return color_map; +} + Status BufferAssigner::AssignBuffersWithSequentialOrdering( const FlatMap<const HloComputation*, FlatSet<const LogicalBuffer*>>& buffers_to_assign_sequentially, @@ -888,14 +936,20 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering( all_buffers_to_assign.insert(buffers_to_assign.begin(), buffers_to_assign.end()); } - TF_ASSIGN_OR_RETURN( 
- const HeapSimulator::Result result, - HeapSimulator::Run(MakeUnique<DecreasingSizeRunsHeap>( - MakeUnique<LazyBestFitHeap>(alignment_)), - assignment->module(), module_sequence, - assignment->points_to_analysis(), - assignment->buffer_size_, &all_buffers_to_assign)); - AssignBuffersFromHeapSimulator(result, assignment); + auto color_map = SplitBuffersByColor(all_buffers_to_assign); + for (auto& single_colored_set : color_map) { + VLOG(2) << "Simulating heap for color " << single_colored_set.first; + TF_ASSIGN_OR_RETURN( + const HeapSimulator::Result result, + HeapSimulator::Run(MakeUnique<DecreasingSizeRunsHeap>( + MakeUnique<LazyBestFitHeap>(alignment_)), + assignment->module(), module_sequence, + assignment->points_to_analysis(), + assignment->buffer_size_, + &single_colored_set.second)); + AssignBuffersFromHeapSimulator(result, assignment, + single_colored_set.first); + } } else { // Run the heap-simulation on a per-computation basis. Buffers for // sub-computations are assigned disjoint BufferAllocations, assuming the @@ -907,21 +961,28 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering( const std::vector<const HloInstruction*>* instruction_sequence = hlo_ordering.SequentialOrder(*computation); CHECK(instruction_sequence != nullptr) << computation->name(); - TF_ASSIGN_OR_RETURN( - const HeapSimulator::Result result, - HeapSimulator::Run(MakeUnique<DecreasingSizeRunsHeap>( - MakeUnique<LazyBestFitHeap>(alignment_)), - *computation, *instruction_sequence, - assignment->points_to_analysis(), - assignment->buffer_size_, &buffers_to_assign)); - AssignBuffersFromHeapSimulator(result, assignment); + auto color_map = SplitBuffersByColor(buffers_to_assign); + for (auto& single_colored_set : color_map) { + VLOG(2) << "Simulating heap for color " << single_colored_set.first; + TF_ASSIGN_OR_RETURN( + const HeapSimulator::Result result, + HeapSimulator::Run(MakeUnique<DecreasingSizeRunsHeap>( + MakeUnique<LazyBestFitHeap>(alignment_)), + *computation, 
*instruction_sequence, + assignment->points_to_analysis(), + assignment->buffer_size_, + &single_colored_set.second)); + AssignBuffersFromHeapSimulator(result, assignment, + single_colored_set.first); + } } } return Status::OK(); } void BufferAssigner::AssignBuffersFromHeapSimulator( - const HeapSimulator::Result& result, BufferAssignment* assignment) { + const HeapSimulator::Result& result, BufferAssignment* assignment, + LogicalBuffer::Color color) { if (assignment->stats_.preallocated_temp_fragmentation_bytes == -1) { assignment->stats_.preallocated_temp_fragmentation_bytes = result.fragmentation_size; @@ -931,7 +992,7 @@ void BufferAssigner::AssignBuffersFromHeapSimulator( } BufferAllocation* allocation = assignment->NewEmptyAllocation( - result.heap_size, /*is_thread_local=*/false, /*is_reusable=*/true); + result.heap_size, /*is_thread_local=*/false, /*is_reusable=*/true, color); for (const auto& buffer_chunk : result.chunk_map) { const LogicalBuffer& buffer = *buffer_chunk.first; const HeapSimulator::Chunk& chunk = buffer_chunk.second; @@ -1234,7 +1295,8 @@ StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::CreateAssignment( const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering, LogicalBuffer::SizeFunction buffer_size) { TF_ASSIGN_OR_RETURN(std::unique_ptr<BufferLiveness> liveness, - BufferLiveness::Run(module, std::move(hlo_ordering))); + BufferLiveness::Run(module, std::move(hlo_ordering), + std::move(colorer_))); VLOG(1) << "Assigning buffers to module " << module->name(); XLA_VLOG_LINES(2, module->ToString()); diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h index 327f44bd2e..b3933f11c1 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.h +++ b/tensorflow/compiler/xla/service/buffer_assignment.h @@ -29,6 +29,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/logical_buffer.h" +#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -58,11 +59,12 @@ class BufferAllocation { using Index = int64; BufferAllocation(Index index, int64 size, bool is_thread_local, - bool is_reusable) + bool is_reusable, LogicalBuffer::Color color) : index_(index), size_(size), is_thread_local_(is_thread_local), - is_reusable_(is_reusable) {} + is_reusable_(is_reusable), + color_(color) {} ~BufferAllocation() {} // Returns the index of this allocation. @@ -97,6 +99,10 @@ class BufferAllocation { // large as any LogicalBuffer assigned to this allocation. int64 size() const { return size_; } + // Returns the color of the allocation. Only logical buffers with a matching + // color can reside in this allocation. + LogicalBuffer::Color color() const { return color_; } + struct OffsetSize { int64 offset = 0; int64 size = 0; @@ -217,6 +223,9 @@ class BufferAllocation { // Whether this buffer is usable by more than one logical buffer. bool is_reusable_; + // Color of the allocation. + LogicalBuffer::Color color_; + // Whether this allocation holds an entry computation parameter. Entry // computation parameters are special be cause they have lifetimes which may // outlast the computation. @@ -250,10 +259,10 @@ class BufferAssignment { return allocations_; } - // Returns the single allocation holding all temporary buffers. Returns - // nullptr if there are no temporary buffers, or if the assignment uses more - // than one allocation to hold temporary buffers. - const BufferAllocation* GetTempAllocation() const { return temp_allocation_; } + // Returns the total size allocation holding all temporary buffers. 
+ int64 temp_allocation_total_size() const { + return temp_allocation_total_size_; + } // Returns whether the given buffer has been assigned an allocation. bool HasAllocation(const LogicalBuffer& buffer) const; @@ -352,7 +361,8 @@ class BufferAssignment { // Creates and returns a new BufferAllocation, with no assigned // LogicalBuffers. Ownership is maintained internally. BufferAllocation* NewEmptyAllocation(int64 size, bool is_thread_local, - bool is_reusable); + bool is_reusable, + LogicalBuffer::Color color); // Helper that calls NewEmptyAllocation and AddAssignment in one call, // creating an allocation containing a single LogicalBuffer. @@ -383,8 +393,8 @@ class BufferAssignment { // The vector of buffer allocations. Indexed by BufferAllocation::Index. std::vector<BufferAllocation> allocations_; - // The single allocation holding all temporary buffers. - BufferAllocation* temp_allocation_ = nullptr; + // The total size of all temporary buffers. + int64 temp_allocation_total_size_ = 0; // Maps Buffers to the index of the BufferAllocation which holds the buffer. tensorflow::gtl::FlatMap<const LogicalBuffer*, BufferAllocation::Index> @@ -408,18 +418,22 @@ class BufferAssigner { public: // Build and return a BufferAssignment for the given module. The given // HloOrdering is used to determine buffer liveness. buffer_size is a function - // which returns the size of a LogicalBuffer. Alignment is the the minimum + // which returns the size of a LogicalBuffer. Alignment is the minimum // alignment of any buffer. allow_input_output_aliasing specifies whether // input buffer are allowed to be reused as outbut buffers by the client code. 
static StatusOr<std::unique_ptr<BufferAssignment>> Run( const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering, LogicalBuffer::SizeFunction buffer_size, int64 alignment, - bool allow_input_output_aliasing = false); + bool allow_input_output_aliasing = false, + TuplePointsToAnalysis::Colorer colorer = + TuplePointsToAnalysis::DefaultColorer()); private: - BufferAssigner(int64 alignment, bool allow_input_output_aliasing) + BufferAssigner(int64 alignment, bool allow_input_output_aliasing, + TuplePointsToAnalysis::Colorer colorer) : alignment_(alignment), - allow_input_output_aliasing_(allow_input_output_aliasing) {} + allow_input_output_aliasing_(allow_input_output_aliasing), + colorer_(colorer) {} virtual ~BufferAssigner() = default; // Create a buffer assignment. @@ -456,7 +470,8 @@ class BufferAssigner { // Uses the results of the heap simulator to create a single allocation, with // LogicalBuffers packed to specific offsets. void AssignBuffersFromHeapSimulator(const HeapSimulator::Result& result, - BufferAssignment* assignment); + BufferAssignment* assignment, + LogicalBuffer::Color color); // Tries to assign the given instruction to the given buffer. Returns if the // assignment was successful. @@ -501,6 +516,14 @@ class BufferAssigner { const LogicalBuffer::SizeFunction& buffer_size, std::vector<ColocatedBufferSet>* colocated_buffer_sets); + // Split a set of buffers into several sets, each of which contains buffers + // colored with the same color. + tensorflow::gtl::FlatMap<LogicalBuffer::Color, + tensorflow::gtl::FlatSet<const LogicalBuffer*>, + LogicalBuffer::Color::Hasher> + SplitBuffersByColor( + const tensorflow::gtl::FlatSet<const LogicalBuffer*>& buffers); + // Minimum alignment of any buffer. int64 alignment_; @@ -508,6 +531,9 @@ class BufferAssigner { // buffers can be shared if their sizes match. bool allow_input_output_aliasing_; + // Functor used to assign colors to newly allocated logical buffers. 
+ TuplePointsToAnalysis::Colorer colorer_; + TF_DISALLOW_COPY_AND_ASSIGN(BufferAssigner); }; diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc index a3b057a257..2258d20956 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc +++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc @@ -18,6 +18,7 @@ limitations under the License. #include <memory> #include <set> #include <string> +#include <utility> #include <vector> #include "tensorflow/compiler/xla/literal_util.h" @@ -88,6 +89,16 @@ class BufferAssignmentTest : public HloTestBase { .ConsumeValueOrDie(); } + std::unique_ptr<BufferAssignment> RunColoredBufferAssignment( + HloModule* module, TuplePointsToAnalysis::Colorer colorer, + int64 alignment = 1) { + return BufferAssigner::Run(module, + MakeUnique<DependencyHloOrdering>(module), + backend_->compiler()->BufferSizeBytesFunction(), + alignment, false, std::move(colorer)) + .ConsumeValueOrDie(); + } + // Builds an x+1.0 computation to use in a Map. std::unique_ptr<HloComputation> BuildMapComputationPlus1(const string& name) { auto builder = HloComputation::Builder(name); @@ -337,7 +348,113 @@ TEST_F(BufferAssignmentTest, Basic) { // The add node can reuse the mul node's buffer. const BufferAllocation& add_buffer = GetTopLevelAllocation(*buffers, add); - EXPECT_EQ(add_buffer.index(), add_buffer.index()); + EXPECT_EQ(add_buffer.index(), mul_buffer.index()); + + // The sub node has a valid output buffer assigned. + GetAssignedOutputAllocation(*buffers, sub); +} + +TEST_F(BufferAssignmentTest, BasicUniquelyColored) { + // paramscalar ------- (mul) -- (add) -- (sub) + // / / / + // param0[100] -------/ / / + // / / + // param1[100] --------------/--------/ + // The output of each op is colored with a different color, so we can not + // share anything. 
+ auto builder = HloComputation::Builder(TestName()); + auto paramscalar = + builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "")); + auto param0 = builder.AddInstruction( + HloInstruction::CreateParameter(1, f32vec100_, "")); + auto param1 = builder.AddInstruction( + HloInstruction::CreateParameter(2, f32vec100_, "")); + auto mul = builder.AddInstruction(HloInstruction::CreateBinary( + f32vec100_, HloOpcode::kMultiply, paramscalar, param0)); + auto add = builder.AddInstruction( + HloInstruction::CreateBinary(f32vec100_, HloOpcode::kAdd, mul, param1)); + auto sub = builder.AddInstruction(HloInstruction::CreateBinary( + f32vec100_, HloOpcode::kSubtract, add, param1)); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); + + auto buffers = RunColoredBufferAssignment( + module.get(), + [](const HloInstruction* instruction, const ShapeIndex& index) { + static int64 serial = 0; + return LogicalBuffer::Color(serial++); + }); + + // Distinct input buffers were assigned for parameters. + BufferAllocation paramscalar_buffer = + GetAssignedInputAllocation(*buffers, paramscalar); + BufferAllocation param0_buffer = GetAssignedInputAllocation(*buffers, param0); + BufferAllocation param1_buffer = GetAssignedInputAllocation(*buffers, param1); + EXPECT_NE(paramscalar_buffer.index(), param0_buffer.index()); + EXPECT_NE(paramscalar_buffer.index(), param1_buffer.index()); + EXPECT_NE(param0_buffer.index(), param1_buffer.index()); + + // The mul node has a valid buffer assigned, doesn't share with input. + const BufferAllocation& mul_buffer = GetTopLevelAllocation(*buffers, mul); + EXPECT_NE(mul_buffer.index(), param0_buffer.index()); + + // The add node can not reuse the mul node's buffer due to coloring. + const BufferAllocation& add_buffer = GetTopLevelAllocation(*buffers, add); + EXPECT_NE(add_buffer.index(), mul_buffer.index()); + + // The sub node has a valid output buffer assigned. 
+ GetAssignedOutputAllocation(*buffers, sub); +} + +TEST_F(BufferAssignmentTest, BasicPartiallyColored) { + // paramscalar ------- (mul) -- (add) -- (sub) + // / / / + // param0[100] -------/ / / + // / / + // param1[100] --------------/--------/ + // The output of the mul and the add have the color 1, and the other buffers + // have the color 0, which allows the mul and add to share buffers. + auto builder = HloComputation::Builder(TestName()); + auto paramscalar = + builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "")); + auto param0 = builder.AddInstruction( + HloInstruction::CreateParameter(1, f32vec100_, "")); + auto param1 = builder.AddInstruction( + HloInstruction::CreateParameter(2, f32vec100_, "")); + auto mul = builder.AddInstruction(HloInstruction::CreateBinary( + f32vec100_, HloOpcode::kMultiply, paramscalar, param0)); + auto add = builder.AddInstruction( + HloInstruction::CreateBinary(f32vec100_, HloOpcode::kAdd, mul, param1)); + auto sub = builder.AddInstruction(HloInstruction::CreateBinary( + f32vec100_, HloOpcode::kSubtract, add, param1)); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); + + auto buffers = RunColoredBufferAssignment( + module.get(), + [](const HloInstruction* instruction, const ShapeIndex& index) { + return (instruction->opcode() == HloOpcode::kAdd || + instruction->opcode() == HloOpcode::kMultiply) + ? LogicalBuffer::Color(1) + : LogicalBuffer::Color(0); + }); + + // Distinct input buffers were assigned for parameters. 
+ BufferAllocation paramscalar_buffer = + GetAssignedInputAllocation(*buffers, paramscalar); + BufferAllocation param0_buffer = GetAssignedInputAllocation(*buffers, param0); + BufferAllocation param1_buffer = GetAssignedInputAllocation(*buffers, param1); + EXPECT_NE(paramscalar_buffer.index(), param0_buffer.index()); + EXPECT_NE(paramscalar_buffer.index(), param1_buffer.index()); + EXPECT_NE(param0_buffer.index(), param1_buffer.index()); + + // The mul node has a valid buffer assigned, doesn't share with input. + const BufferAllocation& mul_buffer = GetTopLevelAllocation(*buffers, mul); + EXPECT_NE(mul_buffer.index(), param0_buffer.index()); + + // The add node can reuse the mul node's buffer. + const BufferAllocation& add_buffer = GetTopLevelAllocation(*buffers, add); + EXPECT_EQ(add_buffer.index(), mul_buffer.index()); // The sub node has a valid output buffer assigned. GetAssignedOutputAllocation(*buffers, sub); diff --git a/tensorflow/compiler/xla/service/buffer_liveness.cc b/tensorflow/compiler/xla/service/buffer_liveness.cc index d69a84cd0e..1b14c26340 100644 --- a/tensorflow/compiler/xla/service/buffer_liveness.cc +++ b/tensorflow/compiler/xla/service/buffer_liveness.cc @@ -37,15 +37,17 @@ namespace xla { /* static */ StatusOr<std::unique_ptr<BufferLiveness>> BufferLiveness::Run( - const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering) { + const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering, + TuplePointsToAnalysis::Colorer colorer) { std::unique_ptr<BufferLiveness> liveness( - new BufferLiveness(module, std::move(hlo_ordering))); + new BufferLiveness(module, std::move(hlo_ordering), std::move(colorer))); TF_RETURN_IF_ERROR(liveness->Analyze()); return std::move(liveness); } tensorflow::Status BufferLiveness::Analyze() { - TF_ASSIGN_OR_RETURN(points_to_analysis_, TuplePointsToAnalysis::Run(module_)); + TF_ASSIGN_OR_RETURN(points_to_analysis_, + TuplePointsToAnalysis::Run(module_, colorer_)); for (auto& computation : 
module_->computations()) { // Gather all instructions whose buffers might alias other instructions into // the set aliased_buffers_. This includes those contained as a tuple diff --git a/tensorflow/compiler/xla/service/buffer_liveness.h b/tensorflow/compiler/xla/service/buffer_liveness.h index 4c94d1a27d..9bb2564a83 100644 --- a/tensorflow/compiler/xla/service/buffer_liveness.h +++ b/tensorflow/compiler/xla/service/buffer_liveness.h @@ -39,7 +39,9 @@ class BufferLiveness { // Constructs a buffer liveness object for the given module assuming the given // HLO instruction ordering. static StatusOr<std::unique_ptr<BufferLiveness>> Run( - const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering); + const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering, + TuplePointsToAnalysis::Colorer colorer = + TuplePointsToAnalysis::DefaultColorer()); // Returns true if the live range of the buffer containing the output of 'a' // may overlap with the live range of the buffer of 'b'. If instruction 'a' @@ -69,8 +71,11 @@ class BufferLiveness { private: explicit BufferLiveness(const HloModule* module, - std::unique_ptr<HloOrdering> hlo_ordering) - : module_(module), hlo_ordering_(std::move(hlo_ordering)) {} + std::unique_ptr<HloOrdering> hlo_ordering, + TuplePointsToAnalysis::Colorer colorer) + : module_(module), + hlo_ordering_(std::move(hlo_ordering)), + colorer_(colorer) {} // Perform buffer liveness analysis. This method must be called prior to // MayInterfere or MaybeLiveOut. 
@@ -93,6 +98,8 @@ class BufferLiveness { tensorflow::gtl::FlatSet<const LogicalBuffer*> maybe_live_out_buffers_; std::unique_ptr<TuplePointsToAnalysis> points_to_analysis_; + + TuplePointsToAnalysis::Colorer colorer_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 08756c9eb0..5fa2bfdd7e 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -193,9 +193,11 @@ llvm::Function* IrEmitterUnnested::BuildKernelPrototype( // the result). We also know how many bytes can be dereferenced in it. const llvm::Argument& temp_buffer = *std::prev(kernel->arg_end()); int64 temp_buffer_arg_no = temp_buffer.getArgNo(); - if (const BufferAllocation* allocation = - ir_emitter_context_->buffer_assignment().GetTempAllocation()) { - kernel->addDereferenceableAttr(temp_buffer_arg_no + 1, allocation->size()); + int64 temp_allocation_total_size = + ir_emitter_context_->buffer_assignment().temp_allocation_total_size(); + if (temp_allocation_total_size != 0) { + kernel->addDereferenceableAttr(temp_buffer_arg_no + 1, + temp_allocation_total_size); } kernel->addAttribute(temp_buffer_arg_no + 1, llvm::Attribute::NoAlias); diff --git a/tensorflow/compiler/xla/service/heap_simulator_test.cc b/tensorflow/compiler/xla/service/heap_simulator_test.cc index 0a6900f733..60a0768a86 100644 --- a/tensorflow/compiler/xla/service/heap_simulator_test.cc +++ b/tensorflow/compiler/xla/service/heap_simulator_test.cc @@ -507,10 +507,11 @@ class HeapAlgorithmTestBase : public ::testing::Test { private: // Create a dummy LogicalBuffer to pass to the heap algorithm. Since the // algorithms only use the buffer as a handle, we don't need to fill in much - // other than the id. + // other than the id and color. 
const LogicalBuffer* DummyLogicalBuffer() { const LogicalBuffer::Id id = buffers_.size(); - buffers_.emplace_back(MakeUnique<LogicalBuffer>(nullptr, ShapeIndex{}, id)); + buffers_.emplace_back(MakeUnique<LogicalBuffer>(nullptr, ShapeIndex{}, id, + LogicalBuffer::Color(0))); return buffers_.back().get(); } diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto index cfd1f0f53b..af853385d6 100644 --- a/tensorflow/compiler/xla/service/hlo.proto +++ b/tensorflow/compiler/xla/service/hlo.proto @@ -106,6 +106,8 @@ message LogicalBufferProto { // The location where the buffer is defined. Location defined_at = 3; + + int64 color = 4; } // Serialization of BufferAllocation. @@ -125,7 +127,8 @@ message BufferAllocationProto { bool is_entry_computation_parameter = 5; int64 parameter_number = 6; bool maybe_live_out = 7; - repeated Assigned assigned = 8; + int64 color = 8; + repeated Assigned assigned = 9; } // A trace of a HeapSimulator run. diff --git a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc index fc337a8924..02710ff57f 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc @@ -29,7 +29,8 @@ namespace llvm_ir { // Sentry allocation used to represent parameters of the entry computation in // alias_scope_metadata_ and noalias_metadata_. 
static const BufferAllocation* kParameterAllocation = new BufferAllocation( - /*index=*/-1, /*size=*/0, /*is_thread_local=*/false, /*is_reusable=*/false); + /*index=*/-1, /*size=*/0, /*is_thread_local=*/false, /*is_reusable=*/false, + LogicalBuffer::Color(0)); void AliasAnalysis::AddAliasingInformationToIrArray(const HloInstruction& hlo, llvm_ir::IrArray* array) { diff --git a/tensorflow/compiler/xla/service/logical_buffer.cc b/tensorflow/compiler/xla/service/logical_buffer.cc index 971d35727f..d24a592f46 100644 --- a/tensorflow/compiler/xla/service/logical_buffer.cc +++ b/tensorflow/compiler/xla/service/logical_buffer.cc @@ -29,7 +29,7 @@ namespace xla { string LogicalBuffer::ToString() const { return tensorflow::strings::StrCat(instruction_->FullyQualifiedName(), "[", tensorflow::str_util::Join(index_, ","), - "](#", id_, ")"); + "](#", id_, " @", color_.value(), ")"); } std::ostream& operator<<(std::ostream& out, const LogicalBuffer& buffer) { @@ -55,6 +55,7 @@ LogicalBufferProto LogicalBuffer::ToProto(const SizeFunction& size_fn) const { LogicalBufferProto::Location proto_location = ToLocationProto(*instruction_, index_); proto.mutable_defined_at()->Swap(&proto_location); + proto.set_color(color_.value()); return proto; } diff --git a/tensorflow/compiler/xla/service/logical_buffer.h b/tensorflow/compiler/xla/service/logical_buffer.h index 61d79f7007..566cd01ea4 100644 --- a/tensorflow/compiler/xla/service/logical_buffer.h +++ b/tensorflow/compiler/xla/service/logical_buffer.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LOGICAL_BUFFER_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_LOGICAL_BUFFER_H_ +#include <functional> #include <iosfwd> #include <string> #include <vector> @@ -26,6 +27,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/gtl/int_type.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" @@ -82,6 +84,8 @@ namespace xla { // LogicalBuffer(%tuple_constant, {1, 1}) // Holds value "43" class LogicalBuffer { public: + TF_LIB_GTL_DEFINE_INT_TYPE(Color, int64); + // Id is a unique identifier for the LogicalBuffer to facilitate efficient // collections of LogicalBuffers with stable iteration order. // LogicalBuffers are typically created and accessed through @@ -93,8 +97,9 @@ class LogicalBuffer { using SizeFunction = std::function<int64(const LogicalBuffer&)>; using AlignmentFunction = std::function<int64(const LogicalBuffer&)>; - LogicalBuffer(HloInstruction* instruction, const ShapeIndex& index, Id id) - : instruction_(instruction), index_(index), id_(id) {} + LogicalBuffer(HloInstruction* instruction, const ShapeIndex& index, Id id, + Color color) + : instruction_(instruction), index_(index), id_(id), color_(color) {} Id id() const { return id_; } @@ -105,6 +110,11 @@ class LogicalBuffer { // defined. Index used defined as in ShapeUtil::GetSubshape() const ShapeIndex& index() const { return index_; } + // Return the color of the logical buffer. Differently colored buffers can + // not be parts of the same allocation. + Color color() const { return color_; } + void set_color(Color color) { color_ = color; } + // Return the shape of the buffer. This reference points into the shape field // of the instruction defining the buffer. Therefore, the returned shape will // contain the layout of instruction, if any. @@ -137,6 +147,7 @@ class LogicalBuffer { HloInstruction* instruction_; ShapeIndex index_; Id id_; + Color color_; // Similar to HLO constructs (HloInstruction, etc), pointers are used for // comparison to equality, so disable all copying. 
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc index 1f91868dea..ad6f015c70 100644 --- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc +++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc @@ -125,13 +125,18 @@ void PointsToSet::add_tuple_source(const ShapeIndex& index, } /* static */ StatusOr<std::unique_ptr<TuplePointsToAnalysis>> -TuplePointsToAnalysis::Run(const HloModule* module) { +TuplePointsToAnalysis::Run(const HloModule* module, Colorer colorer) { std::unique_ptr<TuplePointsToAnalysis> analysis( - new TuplePointsToAnalysis(module)); + new TuplePointsToAnalysis(module, std::move(colorer))); TF_RETURN_IF_ERROR(analysis->Analyze()); return std::move(analysis); } +/* static */ StatusOr<std::unique_ptr<TuplePointsToAnalysis>> +TuplePointsToAnalysis::Run(const HloModule* module) { + return Run(module, DefaultColorer()); +} + Status TuplePointsToAnalysis::Analyze() { points_to_.clear(); for (auto& computation : module_->computations()) { @@ -179,8 +184,8 @@ Status TuplePointsToAnalysis::PopulateDefinedBuffersAndAliases( const LogicalBuffer& TuplePointsToAnalysis::NewLogicalBuffer( HloInstruction* instruction, const ShapeIndex& index) { CHECK_EQ(logical_buffers_.size(), next_buffer_id_); - logical_buffers_.push_back( - MakeUnique<LogicalBuffer>(instruction, index, next_buffer_id_)); + logical_buffers_.push_back(MakeUnique<LogicalBuffer>( + instruction, index, next_buffer_id_, colorer_(instruction, index))); ++next_buffer_id_; return *logical_buffers_.back(); } diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h index ffe15ac856..4d7fc7cbc9 100644 --- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h +++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h @@ -142,7 +142,15 @@ std::ostream& operator<<(std::ostream& out, const BufferAlias& buffer_alias); 
// the potential sources of each buffer in each instruction's output. class TuplePointsToAnalysis : public DfsHloVisitorWithDefault { public: - // Runs points-to analysis on 'module'. + using Colorer = std::function<LogicalBuffer::Color( + const HloInstruction* instruction, const ShapeIndex& index)>; + + // Runs points-to analysis on 'module' with the provided buffer color + // assigner. + static StatusOr<std::unique_ptr<TuplePointsToAnalysis>> Run( + const HloModule* module, Colorer colorer); + + // Runs points-to analysis on 'module' with the default color assigner. static StatusOr<std::unique_ptr<TuplePointsToAnalysis>> Run( const HloModule* module); @@ -207,8 +215,16 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault { string ToString() const; + static Colorer DefaultColorer() { + return [](const HloInstruction* instruction, const ShapeIndex& index) { + return LogicalBuffer::Color(0); + }; + } + private: - explicit TuplePointsToAnalysis(const HloModule* module) : module_(module) {} + explicit TuplePointsToAnalysis(const HloModule* module, + Colorer colorer = DefaultColorer()) + : module_(module), colorer_(colorer) {} // Perform the analysis. Should be called immediately after constructing the // object and before calling GetPointsToSet. @@ -267,6 +283,9 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault { // The ID of the next logical buffer created. LogicalBuffer::Id next_buffer_id_ = 0; + // Used to color the created logical buffers. + Colorer colorer_; + TF_DISALLOW_COPY_AND_ASSIGN(TuplePointsToAnalysis); }; |