aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--tensorflow/compiler/xla/service/BUILD1
-rw-r--r--tensorflow/compiler/xla/service/buffer_assignment.cc138
-rw-r--r--tensorflow/compiler/xla/service/buffer_assignment.h54
-rw-r--r--tensorflow/compiler/xla/service/buffer_assignment_test.cc119
-rw-r--r--tensorflow/compiler/xla/service/buffer_liveness.cc8
-rw-r--r--tensorflow/compiler/xla/service/buffer_liveness.h13
-rw-r--r--tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc8
-rw-r--r--tensorflow/compiler/xla/service/heap_simulator_test.cc5
-rw-r--r--tensorflow/compiler/xla/service/hlo.proto5
-rw-r--r--tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc3
-rw-r--r--tensorflow/compiler/xla/service/logical_buffer.cc3
-rw-r--r--tensorflow/compiler/xla/service/logical_buffer.h15
-rw-r--r--tensorflow/compiler/xla/service/tuple_points_to_analysis.cc13
-rw-r--r--tensorflow/compiler/xla/service/tuple_points_to_analysis.h23
14 files changed, 333 insertions, 75 deletions
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index bd10f1dc5c..cbf0a0ddd5 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -1151,6 +1151,7 @@ cc_library(
"//tensorflow/compiler/xla:types",
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/core:lib",
+ "//tensorflow/core:lib_internal",
],
)
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc
index e1107d9782..ea1dce7124 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment.cc
@@ -28,7 +28,6 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/heap_simulator.h"
#include "tensorflow/compiler/xla/service/hlo.pb.h"
#include "tensorflow/compiler/xla/service/hlo_opcode.h"
-#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/types.h"
@@ -74,6 +73,9 @@ void BufferAllocation::AddAssignment(const LogicalBuffer& buffer, int64 offset,
<< " offset out of range";
CHECK_LE(offset + size, size_)
<< "LogicalBuffer " << buffer << " size out of range";
+ CHECK_EQ(buffer.color(), color())
+ << "Buffer color " << buffer.color()
+ << " does not match allocation color " << color() << ".";
OffsetSize offset_size;
offset_size.offset = offset;
offset_size.size = size;
@@ -86,6 +88,7 @@ BufferAllocationProto BufferAllocation::ToProto() const {
proto.set_size(size_);
proto.set_is_thread_local(is_thread_local_);
proto.set_is_reusable(is_reusable_);
+ proto.set_color(color_.value());
if (is_entry_computation_parameter_) {
proto.set_is_entry_computation_parameter(true);
proto.set_parameter_number(parameter_number_);
@@ -105,6 +108,9 @@ string BufferAllocation::ToString() const {
tensorflow::strings::StrAppend(
&output, tensorflow::strings::Printf("allocation %lld: %p, size %lld",
index_, this, size()));
+ if (color().value() != 0) {
+ tensorflow::strings::StrAppend(&output, ", color ", color().value());
+ }
if (is_entry_computation_parameter()) {
tensorflow::strings::StrAppend(&output, ", parameter ", parameter_number());
}
@@ -248,11 +254,11 @@ BufferAssignment::GetUniqueTopLevelOutputSlice() const {
module_->entry_computation()->root_instruction());
}
-BufferAllocation* BufferAssignment::NewEmptyAllocation(int64 size,
- bool is_thread_local,
- bool is_reusable) {
+BufferAllocation* BufferAssignment::NewEmptyAllocation(
+ int64 size, bool is_thread_local, bool is_reusable,
+ LogicalBuffer::Color color) {
BufferAllocation::Index index = allocations_.size();
- allocations_.emplace_back(index, size, is_thread_local, is_reusable);
+ allocations_.emplace_back(index, size, is_thread_local, is_reusable, color);
BufferAllocation* allocation = &allocations_.back();
return allocation;
}
@@ -262,7 +268,7 @@ BufferAllocation* BufferAssignment::NewAllocation(const LogicalBuffer& buffer,
bool is_thread_local,
bool is_reusable) {
BufferAllocation* allocation =
- NewEmptyAllocation(size, is_thread_local, is_reusable);
+ NewEmptyAllocation(size, is_thread_local, is_reusable, buffer.color());
AddAssignment(allocation, buffer, /*offset=*/0, size);
return allocation;
}
@@ -282,33 +288,55 @@ void BufferAssignment::AddAssignment(BufferAllocation* allocation,
allocation_index_for_buffer_[&buffer] = allocation->index();
}
-// Combines allocations of temporary buffers into one big BufferAllocation.
+// Combines allocations of temporary buffers of the same color into one big
+// BufferAllocation.
void BufferAssignment::CombineTempAllocations() {
+ FlatMap<LogicalBuffer::Color, BufferAllocation, LogicalBuffer::Color::Hasher>
+ combined_allocation_map;
+
// Move all temp allocations into a single run at the end of the allocations
- // vector, and combine them into the first allocation of the run.
+ // vector.
const auto first_temp_it =
std::partition(allocations_.begin(), allocations_.end(),
[](const BufferAllocation& allocation) {
return !allocation.IsPreallocatedTempBuffer();
});
+
+ // Walk over the run of temp allocations, collecting the allocations belonging
+ // to the same color.
if (first_temp_it != allocations_.end()) {
- BufferAllocation* combined = &*first_temp_it;
- const auto second_temp_it = std::next(first_temp_it);
- for (auto it = second_temp_it; it != allocations_.end(); ++it) {
+ for (auto it = first_temp_it; it != allocations_.end(); ++it) {
+ const BufferAllocation& temp_allocation = *it;
+ LogicalBuffer::Color color = temp_allocation.color();
+ auto combined_it = combined_allocation_map.find(color);
+ if (combined_it == combined_allocation_map.end()) {
+ // We have found the first temp allocation of this color. Collect
+ // the other temp allocations of the same color into it.
+ combined_allocation_map.emplace(color, temp_allocation);
+ continue;
+ }
+
+ auto* combined_allocation = &combined_it->second;
// Each temp allocation is placed end-to-end, accounting for alignment.
// The offset of each buffer in the combined allocation is computed from
// the base offset of the allocation.
- const int64 base = RoundUpToNearest(combined->size(), alignment_);
- combined->set_size(base + it->size());
- for (const auto& buffer_offset_size : it->assigned_buffers_) {
+ const int64 base =
+ RoundUpToNearest(combined_allocation->size(), alignment_);
+ combined_allocation->set_size(base + temp_allocation.size());
+ for (const auto& buffer_offset_size : temp_allocation.assigned_buffers_) {
const LogicalBuffer* buffer = buffer_offset_size.first;
const int64 offset = buffer_offset_size.second.offset;
const int64 size = buffer_offset_size.second.size;
- combined->AddAssignment(*buffer, base + offset, size);
+ combined_allocation->AddAssignment(*buffer, base + offset, size);
}
}
- allocations_.erase(second_temp_it, allocations_.end());
- temp_allocation_ = combined;
+ // Replace all existing temporary allocations with the new combined
+ // allocations.
+ allocations_.erase(first_temp_it, allocations_.end());
+ for (auto& combined : combined_allocation_map) {
+ allocations_.push_back(combined.second);
+ temp_allocation_total_size_ += combined.second.size();
+ }
}
// Update allocation indices to their new positions.
@@ -548,8 +576,9 @@ Status GatherComputationsByAllocationType(
StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::Run(
const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering,
LogicalBuffer::SizeFunction buffer_size, int64 alignment,
- bool allow_input_output_aliasing) {
- BufferAssigner assigner(alignment, allow_input_output_aliasing);
+ bool allow_input_output_aliasing, TuplePointsToAnalysis::Colorer colorer) {
+ BufferAssigner assigner(alignment, allow_input_output_aliasing,
+ std::move(colorer));
return assigner.CreateAssignment(module, std::move(hlo_ordering),
std::move(buffer_size));
}
@@ -564,6 +593,12 @@ bool BufferAssigner::MaybeAssignBuffer(BufferAllocation* allocation,
VLOG(4) << "Trying to assign " << buffer << " to allocation: " << *allocation;
+ if (buffer.color() != allocation->color()) {
+ VLOG(4) << "Can't assign: buffer has color" << buffer.color()
+ << " and allocation has color " << allocation->color() << ".";
+ return false;
+ }
+
if (buffer_size(buffer) > allocation->size()) {
VLOG(4) << "Can't assign: buffer is larger than allocation ("
<< buffer_size(buffer) << " > " << allocation->size() << ")";
@@ -863,6 +898,19 @@ Status BufferAssigner::AssignBuffersForComputation(
return Status::OK();
}
+FlatMap<LogicalBuffer::Color, FlatSet<const LogicalBuffer*>,
+ LogicalBuffer::Color::Hasher>
+BufferAssigner::SplitBuffersByColor(
+ const FlatSet<const LogicalBuffer*>& buffers) {
+ FlatMap<LogicalBuffer::Color, FlatSet<const LogicalBuffer*>,
+ LogicalBuffer::Color::Hasher>
+ color_map;
+ for (auto buffer : buffers) {
+ color_map[buffer->color()].insert(buffer);
+ }
+ return color_map;
+}
+
Status BufferAssigner::AssignBuffersWithSequentialOrdering(
const FlatMap<const HloComputation*, FlatSet<const LogicalBuffer*>>&
buffers_to_assign_sequentially,
@@ -888,14 +936,20 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
all_buffers_to_assign.insert(buffers_to_assign.begin(),
buffers_to_assign.end());
}
- TF_ASSIGN_OR_RETURN(
- const HeapSimulator::Result result,
- HeapSimulator::Run(MakeUnique<DecreasingSizeRunsHeap>(
- MakeUnique<LazyBestFitHeap>(alignment_)),
- assignment->module(), module_sequence,
- assignment->points_to_analysis(),
- assignment->buffer_size_, &all_buffers_to_assign));
- AssignBuffersFromHeapSimulator(result, assignment);
+ auto color_map = SplitBuffersByColor(all_buffers_to_assign);
+ for (auto& single_colored_set : color_map) {
+ VLOG(2) << "Simulating heap for color " << single_colored_set.first;
+ TF_ASSIGN_OR_RETURN(
+ const HeapSimulator::Result result,
+ HeapSimulator::Run(MakeUnique<DecreasingSizeRunsHeap>(
+ MakeUnique<LazyBestFitHeap>(alignment_)),
+ assignment->module(), module_sequence,
+ assignment->points_to_analysis(),
+ assignment->buffer_size_,
+ &single_colored_set.second));
+ AssignBuffersFromHeapSimulator(result, assignment,
+ single_colored_set.first);
+ }
} else {
// Run the heap-simulation on a per-computation basis. Buffers for
// sub-computations are assigned disjoint BufferAllocations, assuming the
@@ -907,21 +961,28 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
const std::vector<const HloInstruction*>* instruction_sequence =
hlo_ordering.SequentialOrder(*computation);
CHECK(instruction_sequence != nullptr) << computation->name();
- TF_ASSIGN_OR_RETURN(
- const HeapSimulator::Result result,
- HeapSimulator::Run(MakeUnique<DecreasingSizeRunsHeap>(
- MakeUnique<LazyBestFitHeap>(alignment_)),
- *computation, *instruction_sequence,
- assignment->points_to_analysis(),
- assignment->buffer_size_, &buffers_to_assign));
- AssignBuffersFromHeapSimulator(result, assignment);
+ auto color_map = SplitBuffersByColor(buffers_to_assign);
+ for (auto& single_colored_set : color_map) {
+ VLOG(2) << "Simulating heap for color " << single_colored_set.first;
+ TF_ASSIGN_OR_RETURN(
+ const HeapSimulator::Result result,
+ HeapSimulator::Run(MakeUnique<DecreasingSizeRunsHeap>(
+ MakeUnique<LazyBestFitHeap>(alignment_)),
+ *computation, *instruction_sequence,
+ assignment->points_to_analysis(),
+ assignment->buffer_size_,
+ &single_colored_set.second));
+ AssignBuffersFromHeapSimulator(result, assignment,
+ single_colored_set.first);
+ }
}
}
return Status::OK();
}
void BufferAssigner::AssignBuffersFromHeapSimulator(
- const HeapSimulator::Result& result, BufferAssignment* assignment) {
+ const HeapSimulator::Result& result, BufferAssignment* assignment,
+ LogicalBuffer::Color color) {
if (assignment->stats_.preallocated_temp_fragmentation_bytes == -1) {
assignment->stats_.preallocated_temp_fragmentation_bytes =
result.fragmentation_size;
@@ -931,7 +992,7 @@ void BufferAssigner::AssignBuffersFromHeapSimulator(
}
BufferAllocation* allocation = assignment->NewEmptyAllocation(
- result.heap_size, /*is_thread_local=*/false, /*is_reusable=*/true);
+ result.heap_size, /*is_thread_local=*/false, /*is_reusable=*/true, color);
for (const auto& buffer_chunk : result.chunk_map) {
const LogicalBuffer& buffer = *buffer_chunk.first;
const HeapSimulator::Chunk& chunk = buffer_chunk.second;
@@ -1234,7 +1295,8 @@ StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::CreateAssignment(
const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering,
LogicalBuffer::SizeFunction buffer_size) {
TF_ASSIGN_OR_RETURN(std::unique_ptr<BufferLiveness> liveness,
- BufferLiveness::Run(module, std::move(hlo_ordering)));
+ BufferLiveness::Run(module, std::move(hlo_ordering),
+ std::move(colorer_)));
VLOG(1) << "Assigning buffers to module " << module->name();
XLA_VLOG_LINES(2, module->ToString());
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h
index 327f44bd2e..b3933f11c1 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.h
+++ b/tensorflow/compiler/xla/service/buffer_assignment.h
@@ -29,6 +29,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/logical_buffer.h"
+#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -58,11 +59,12 @@ class BufferAllocation {
using Index = int64;
BufferAllocation(Index index, int64 size, bool is_thread_local,
- bool is_reusable)
+ bool is_reusable, LogicalBuffer::Color color)
: index_(index),
size_(size),
is_thread_local_(is_thread_local),
- is_reusable_(is_reusable) {}
+ is_reusable_(is_reusable),
+ color_(color) {}
~BufferAllocation() {}
// Returns the index of this allocation.
@@ -97,6 +99,10 @@ class BufferAllocation {
// large as any LogicalBuffer assigned to this allocation.
int64 size() const { return size_; }
+ // Returns the color of the allocation. Only logical buffers with a matching
+ // color can reside in this allocation.
+ LogicalBuffer::Color color() const { return color_; }
+
struct OffsetSize {
int64 offset = 0;
int64 size = 0;
@@ -217,6 +223,9 @@ class BufferAllocation {
// Whether this buffer is usable by more than one logical buffer.
bool is_reusable_;
+ // Color of the allocation.
+ LogicalBuffer::Color color_;
+
// Whether this allocation holds an entry computation parameter. Entry
// computation parameters are special be cause they have lifetimes which may
// outlast the computation.
@@ -250,10 +259,10 @@ class BufferAssignment {
return allocations_;
}
- // Returns the single allocation holding all temporary buffers. Returns
- // nullptr if there are no temporary buffers, or if the assignment uses more
- // than one allocation to hold temporary buffers.
- const BufferAllocation* GetTempAllocation() const { return temp_allocation_; }
+  // Returns the total size of all allocations holding temporary buffers.
+ int64 temp_allocation_total_size() const {
+ return temp_allocation_total_size_;
+ }
// Returns whether the given buffer has been assigned an allocation.
bool HasAllocation(const LogicalBuffer& buffer) const;
@@ -352,7 +361,8 @@ class BufferAssignment {
// Creates and returns a new BufferAllocation, with no assigned
// LogicalBuffers. Ownership is maintained internally.
BufferAllocation* NewEmptyAllocation(int64 size, bool is_thread_local,
- bool is_reusable);
+ bool is_reusable,
+ LogicalBuffer::Color color);
// Helper that calls NewEmptyAllocation and AddAssignment in one call,
// creating an allocation containing a single LogicalBuffer.
@@ -383,8 +393,8 @@ class BufferAssignment {
// The vector of buffer allocations. Indexed by BufferAllocation::Index.
std::vector<BufferAllocation> allocations_;
- // The single allocation holding all temporary buffers.
- BufferAllocation* temp_allocation_ = nullptr;
+ // The total size of all temporary buffers.
+ int64 temp_allocation_total_size_ = 0;
// Maps Buffers to the index of the BufferAllocation which holds the buffer.
tensorflow::gtl::FlatMap<const LogicalBuffer*, BufferAllocation::Index>
@@ -408,18 +418,22 @@ class BufferAssigner {
public:
// Build and return a BufferAssignment for the given module. The given
// HloOrdering is used to determine buffer liveness. buffer_size is a function
- // which returns the size of a LogicalBuffer. Alignment is the the minimum
+ // which returns the size of a LogicalBuffer. Alignment is the minimum
// alignment of any buffer. allow_input_output_aliasing specifies whether
// input buffer are allowed to be reused as outbut buffers by the client code.
static StatusOr<std::unique_ptr<BufferAssignment>> Run(
const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering,
LogicalBuffer::SizeFunction buffer_size, int64 alignment,
- bool allow_input_output_aliasing = false);
+ bool allow_input_output_aliasing = false,
+ TuplePointsToAnalysis::Colorer colorer =
+ TuplePointsToAnalysis::DefaultColorer());
private:
- BufferAssigner(int64 alignment, bool allow_input_output_aliasing)
+ BufferAssigner(int64 alignment, bool allow_input_output_aliasing,
+ TuplePointsToAnalysis::Colorer colorer)
: alignment_(alignment),
- allow_input_output_aliasing_(allow_input_output_aliasing) {}
+ allow_input_output_aliasing_(allow_input_output_aliasing),
+ colorer_(colorer) {}
virtual ~BufferAssigner() = default;
// Create a buffer assignment.
@@ -456,7 +470,8 @@ class BufferAssigner {
// Uses the results of the heap simulator to create a single allocation, with
// LogicalBuffers packed to specific offsets.
void AssignBuffersFromHeapSimulator(const HeapSimulator::Result& result,
- BufferAssignment* assignment);
+ BufferAssignment* assignment,
+ LogicalBuffer::Color color);
// Tries to assign the given instruction to the given buffer. Returns if the
// assignment was successful.
@@ -501,6 +516,14 @@ class BufferAssigner {
const LogicalBuffer::SizeFunction& buffer_size,
std::vector<ColocatedBufferSet>* colocated_buffer_sets);
+  // Splits a set of buffers into several sets, each of which contains buffers
+  // of the same color.
+ tensorflow::gtl::FlatMap<LogicalBuffer::Color,
+ tensorflow::gtl::FlatSet<const LogicalBuffer*>,
+ LogicalBuffer::Color::Hasher>
+ SplitBuffersByColor(
+ const tensorflow::gtl::FlatSet<const LogicalBuffer*>& buffers);
+
// Minimum alignment of any buffer.
int64 alignment_;
@@ -508,6 +531,9 @@ class BufferAssigner {
// buffers can be shared if their sizes match.
bool allow_input_output_aliasing_;
+ // Functor used to assign colors to newly allocated logical buffers.
+ TuplePointsToAnalysis::Colorer colorer_;
+
TF_DISALLOW_COPY_AND_ASSIGN(BufferAssigner);
};
diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
index a3b057a257..2258d20956 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
#include <memory>
#include <set>
#include <string>
+#include <utility>
#include <vector>
#include "tensorflow/compiler/xla/literal_util.h"
@@ -88,6 +89,16 @@ class BufferAssignmentTest : public HloTestBase {
.ConsumeValueOrDie();
}
+ std::unique_ptr<BufferAssignment> RunColoredBufferAssignment(
+ HloModule* module, TuplePointsToAnalysis::Colorer colorer,
+ int64 alignment = 1) {
+ return BufferAssigner::Run(module,
+ MakeUnique<DependencyHloOrdering>(module),
+ backend_->compiler()->BufferSizeBytesFunction(),
+ alignment, false, std::move(colorer))
+ .ConsumeValueOrDie();
+ }
+
// Builds an x+1.0 computation to use in a Map.
std::unique_ptr<HloComputation> BuildMapComputationPlus1(const string& name) {
auto builder = HloComputation::Builder(name);
@@ -337,7 +348,113 @@ TEST_F(BufferAssignmentTest, Basic) {
// The add node can reuse the mul node's buffer.
const BufferAllocation& add_buffer = GetTopLevelAllocation(*buffers, add);
- EXPECT_EQ(add_buffer.index(), add_buffer.index());
+ EXPECT_EQ(add_buffer.index(), mul_buffer.index());
+
+ // The sub node has a valid output buffer assigned.
+ GetAssignedOutputAllocation(*buffers, sub);
+}
+
+TEST_F(BufferAssignmentTest, BasicUniquelyColored) {
+ // paramscalar ------- (mul) -- (add) -- (sub)
+ // / / /
+ // param0[100] -------/ / /
+ // / /
+ // param1[100] --------------/--------/
+ // The output of each op is colored with a different color, so we can not
+ // share anything.
+ auto builder = HloComputation::Builder(TestName());
+ auto paramscalar =
+ builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, ""));
+ auto param0 = builder.AddInstruction(
+ HloInstruction::CreateParameter(1, f32vec100_, ""));
+ auto param1 = builder.AddInstruction(
+ HloInstruction::CreateParameter(2, f32vec100_, ""));
+ auto mul = builder.AddInstruction(HloInstruction::CreateBinary(
+ f32vec100_, HloOpcode::kMultiply, paramscalar, param0));
+ auto add = builder.AddInstruction(
+ HloInstruction::CreateBinary(f32vec100_, HloOpcode::kAdd, mul, param1));
+ auto sub = builder.AddInstruction(HloInstruction::CreateBinary(
+ f32vec100_, HloOpcode::kSubtract, add, param1));
+ auto module = CreateNewModule();
+ module->AddEntryComputation(builder.Build());
+
+ auto buffers = RunColoredBufferAssignment(
+ module.get(),
+ [](const HloInstruction* instruction, const ShapeIndex& index) {
+ static int64 serial = 0;
+ return LogicalBuffer::Color(serial++);
+ });
+
+ // Distinct input buffers were assigned for parameters.
+ BufferAllocation paramscalar_buffer =
+ GetAssignedInputAllocation(*buffers, paramscalar);
+ BufferAllocation param0_buffer = GetAssignedInputAllocation(*buffers, param0);
+ BufferAllocation param1_buffer = GetAssignedInputAllocation(*buffers, param1);
+ EXPECT_NE(paramscalar_buffer.index(), param0_buffer.index());
+ EXPECT_NE(paramscalar_buffer.index(), param1_buffer.index());
+ EXPECT_NE(param0_buffer.index(), param1_buffer.index());
+
+ // The mul node has a valid buffer assigned, doesn't share with input.
+ const BufferAllocation& mul_buffer = GetTopLevelAllocation(*buffers, mul);
+ EXPECT_NE(mul_buffer.index(), param0_buffer.index());
+
+ // The add node can not reuse the mul node's buffer due to coloring.
+ const BufferAllocation& add_buffer = GetTopLevelAllocation(*buffers, add);
+ EXPECT_NE(add_buffer.index(), mul_buffer.index());
+
+ // The sub node has a valid output buffer assigned.
+ GetAssignedOutputAllocation(*buffers, sub);
+}
+
+TEST_F(BufferAssignmentTest, BasicPartiallyColored) {
+ // paramscalar ------- (mul) -- (add) -- (sub)
+ // / / /
+ // param0[100] -------/ / /
+ // / /
+ // param1[100] --------------/--------/
+ // The output of the mul and the add have the color 1, and the other buffers
+ // have the color 0, which allows the mul and add to share buffers.
+ auto builder = HloComputation::Builder(TestName());
+ auto paramscalar =
+ builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, ""));
+ auto param0 = builder.AddInstruction(
+ HloInstruction::CreateParameter(1, f32vec100_, ""));
+ auto param1 = builder.AddInstruction(
+ HloInstruction::CreateParameter(2, f32vec100_, ""));
+ auto mul = builder.AddInstruction(HloInstruction::CreateBinary(
+ f32vec100_, HloOpcode::kMultiply, paramscalar, param0));
+ auto add = builder.AddInstruction(
+ HloInstruction::CreateBinary(f32vec100_, HloOpcode::kAdd, mul, param1));
+ auto sub = builder.AddInstruction(HloInstruction::CreateBinary(
+ f32vec100_, HloOpcode::kSubtract, add, param1));
+ auto module = CreateNewModule();
+ module->AddEntryComputation(builder.Build());
+
+ auto buffers = RunColoredBufferAssignment(
+ module.get(),
+ [](const HloInstruction* instruction, const ShapeIndex& index) {
+ return (instruction->opcode() == HloOpcode::kAdd ||
+ instruction->opcode() == HloOpcode::kMultiply)
+ ? LogicalBuffer::Color(1)
+ : LogicalBuffer::Color(0);
+ });
+
+ // Distinct input buffers were assigned for parameters.
+ BufferAllocation paramscalar_buffer =
+ GetAssignedInputAllocation(*buffers, paramscalar);
+ BufferAllocation param0_buffer = GetAssignedInputAllocation(*buffers, param0);
+ BufferAllocation param1_buffer = GetAssignedInputAllocation(*buffers, param1);
+ EXPECT_NE(paramscalar_buffer.index(), param0_buffer.index());
+ EXPECT_NE(paramscalar_buffer.index(), param1_buffer.index());
+ EXPECT_NE(param0_buffer.index(), param1_buffer.index());
+
+ // The mul node has a valid buffer assigned, doesn't share with input.
+ const BufferAllocation& mul_buffer = GetTopLevelAllocation(*buffers, mul);
+ EXPECT_NE(mul_buffer.index(), param0_buffer.index());
+
+ // The add node can reuse the mul node's buffer.
+ const BufferAllocation& add_buffer = GetTopLevelAllocation(*buffers, add);
+ EXPECT_EQ(add_buffer.index(), mul_buffer.index());
// The sub node has a valid output buffer assigned.
GetAssignedOutputAllocation(*buffers, sub);
diff --git a/tensorflow/compiler/xla/service/buffer_liveness.cc b/tensorflow/compiler/xla/service/buffer_liveness.cc
index d69a84cd0e..1b14c26340 100644
--- a/tensorflow/compiler/xla/service/buffer_liveness.cc
+++ b/tensorflow/compiler/xla/service/buffer_liveness.cc
@@ -37,15 +37,17 @@ namespace xla {
/* static */
StatusOr<std::unique_ptr<BufferLiveness>> BufferLiveness::Run(
- const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering) {
+ const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering,
+ TuplePointsToAnalysis::Colorer colorer) {
std::unique_ptr<BufferLiveness> liveness(
- new BufferLiveness(module, std::move(hlo_ordering)));
+ new BufferLiveness(module, std::move(hlo_ordering), std::move(colorer)));
TF_RETURN_IF_ERROR(liveness->Analyze());
return std::move(liveness);
}
tensorflow::Status BufferLiveness::Analyze() {
- TF_ASSIGN_OR_RETURN(points_to_analysis_, TuplePointsToAnalysis::Run(module_));
+ TF_ASSIGN_OR_RETURN(points_to_analysis_,
+ TuplePointsToAnalysis::Run(module_, colorer_));
for (auto& computation : module_->computations()) {
// Gather all instructions whose buffers might alias other instructions into
// the set aliased_buffers_. This includes those contained as a tuple
diff --git a/tensorflow/compiler/xla/service/buffer_liveness.h b/tensorflow/compiler/xla/service/buffer_liveness.h
index 4c94d1a27d..9bb2564a83 100644
--- a/tensorflow/compiler/xla/service/buffer_liveness.h
+++ b/tensorflow/compiler/xla/service/buffer_liveness.h
@@ -39,7 +39,9 @@ class BufferLiveness {
// Constructs a buffer liveness object for the given module assuming the given
// HLO instruction ordering.
static StatusOr<std::unique_ptr<BufferLiveness>> Run(
- const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering);
+ const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering,
+ TuplePointsToAnalysis::Colorer colorer =
+ TuplePointsToAnalysis::DefaultColorer());
// Returns true if the live range of the buffer containing the output of 'a'
// may overlap with the live range of the buffer of 'b'. If instruction 'a'
@@ -69,8 +71,11 @@ class BufferLiveness {
private:
explicit BufferLiveness(const HloModule* module,
- std::unique_ptr<HloOrdering> hlo_ordering)
- : module_(module), hlo_ordering_(std::move(hlo_ordering)) {}
+ std::unique_ptr<HloOrdering> hlo_ordering,
+ TuplePointsToAnalysis::Colorer colorer)
+ : module_(module),
+ hlo_ordering_(std::move(hlo_ordering)),
+ colorer_(colorer) {}
// Perform buffer liveness analysis. This method must be called prior to
// MayInterfere or MaybeLiveOut.
@@ -93,6 +98,8 @@ class BufferLiveness {
tensorflow::gtl::FlatSet<const LogicalBuffer*> maybe_live_out_buffers_;
std::unique_ptr<TuplePointsToAnalysis> points_to_analysis_;
+
+ TuplePointsToAnalysis::Colorer colorer_;
};
} // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index 08756c9eb0..5fa2bfdd7e 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -193,9 +193,11 @@ llvm::Function* IrEmitterUnnested::BuildKernelPrototype(
// the result). We also know how many bytes can be dereferenced in it.
const llvm::Argument& temp_buffer = *std::prev(kernel->arg_end());
int64 temp_buffer_arg_no = temp_buffer.getArgNo();
- if (const BufferAllocation* allocation =
- ir_emitter_context_->buffer_assignment().GetTempAllocation()) {
- kernel->addDereferenceableAttr(temp_buffer_arg_no + 1, allocation->size());
+ int64 temp_allocation_total_size =
+ ir_emitter_context_->buffer_assignment().temp_allocation_total_size();
+ if (temp_allocation_total_size != 0) {
+ kernel->addDereferenceableAttr(temp_buffer_arg_no + 1,
+ temp_allocation_total_size);
}
kernel->addAttribute(temp_buffer_arg_no + 1, llvm::Attribute::NoAlias);
diff --git a/tensorflow/compiler/xla/service/heap_simulator_test.cc b/tensorflow/compiler/xla/service/heap_simulator_test.cc
index 0a6900f733..60a0768a86 100644
--- a/tensorflow/compiler/xla/service/heap_simulator_test.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator_test.cc
@@ -507,10 +507,11 @@ class HeapAlgorithmTestBase : public ::testing::Test {
private:
// Create a dummy LogicalBuffer to pass to the heap algorithm. Since the
// algorithms only use the buffer as a handle, we don't need to fill in much
- // other than the id.
+ // other than the id and color.
const LogicalBuffer* DummyLogicalBuffer() {
const LogicalBuffer::Id id = buffers_.size();
- buffers_.emplace_back(MakeUnique<LogicalBuffer>(nullptr, ShapeIndex{}, id));
+ buffers_.emplace_back(MakeUnique<LogicalBuffer>(nullptr, ShapeIndex{}, id,
+ LogicalBuffer::Color(0)));
return buffers_.back().get();
}
diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto
index cfd1f0f53b..af853385d6 100644
--- a/tensorflow/compiler/xla/service/hlo.proto
+++ b/tensorflow/compiler/xla/service/hlo.proto
@@ -106,6 +106,8 @@ message LogicalBufferProto {
// The location where the buffer is defined.
Location defined_at = 3;
+
+ int64 color = 4;
}
// Serialization of BufferAllocation.
@@ -125,7 +127,8 @@ message BufferAllocationProto {
bool is_entry_computation_parameter = 5;
int64 parameter_number = 6;
bool maybe_live_out = 7;
- repeated Assigned assigned = 8;
+ int64 color = 8;
+ repeated Assigned assigned = 9;
}
// A trace of a HeapSimulator run.
diff --git a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc
index fc337a8924..02710ff57f 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc
@@ -29,7 +29,8 @@ namespace llvm_ir {
// Sentry allocation used to represent parameters of the entry computation in
// alias_scope_metadata_ and noalias_metadata_.
static const BufferAllocation* kParameterAllocation = new BufferAllocation(
- /*index=*/-1, /*size=*/0, /*is_thread_local=*/false, /*is_reusable=*/false);
+ /*index=*/-1, /*size=*/0, /*is_thread_local=*/false, /*is_reusable=*/false,
+ LogicalBuffer::Color(0));
void AliasAnalysis::AddAliasingInformationToIrArray(const HloInstruction& hlo,
llvm_ir::IrArray* array) {
diff --git a/tensorflow/compiler/xla/service/logical_buffer.cc b/tensorflow/compiler/xla/service/logical_buffer.cc
index 971d35727f..d24a592f46 100644
--- a/tensorflow/compiler/xla/service/logical_buffer.cc
+++ b/tensorflow/compiler/xla/service/logical_buffer.cc
@@ -29,7 +29,7 @@ namespace xla {
string LogicalBuffer::ToString() const {
return tensorflow::strings::StrCat(instruction_->FullyQualifiedName(), "[",
tensorflow::str_util::Join(index_, ","),
- "](#", id_, ")");
+ "](#", id_, " @", color_.value(), ")");
}
std::ostream& operator<<(std::ostream& out, const LogicalBuffer& buffer) {
@@ -55,6 +55,7 @@ LogicalBufferProto LogicalBuffer::ToProto(const SizeFunction& size_fn) const {
LogicalBufferProto::Location proto_location =
ToLocationProto(*instruction_, index_);
proto.mutable_defined_at()->Swap(&proto_location);
+ proto.set_color(color_.value());
return proto;
}
diff --git a/tensorflow/compiler/xla/service/logical_buffer.h b/tensorflow/compiler/xla/service/logical_buffer.h
index 61d79f7007..566cd01ea4 100644
--- a/tensorflow/compiler/xla/service/logical_buffer.h
+++ b/tensorflow/compiler/xla/service/logical_buffer.h
@@ -16,6 +16,7 @@ limitations under the License.
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LOGICAL_BUFFER_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_LOGICAL_BUFFER_H_
+#include <functional>
#include <iosfwd>
#include <string>
#include <vector>
@@ -26,6 +27,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/int_type.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/types.h"
@@ -82,6 +84,8 @@ namespace xla {
// LogicalBuffer(%tuple_constant, {1, 1}) // Holds value "43"
class LogicalBuffer {
public:
+ TF_LIB_GTL_DEFINE_INT_TYPE(Color, int64);
+
// Id is a unique identifier for the LogicalBuffer to facilitate efficient
// collections of LogicalBuffers with stable iteration order.
// LogicalBuffers are typically created and accessed through
@@ -93,8 +97,9 @@ class LogicalBuffer {
using SizeFunction = std::function<int64(const LogicalBuffer&)>;
using AlignmentFunction = std::function<int64(const LogicalBuffer&)>;
- LogicalBuffer(HloInstruction* instruction, const ShapeIndex& index, Id id)
- : instruction_(instruction), index_(index), id_(id) {}
+ LogicalBuffer(HloInstruction* instruction, const ShapeIndex& index, Id id,
+ Color color)
+ : instruction_(instruction), index_(index), id_(id), color_(color) {}
Id id() const { return id_; }
@@ -105,6 +110,11 @@ class LogicalBuffer {
// defined. Index used defined as in ShapeUtil::GetSubshape()
const ShapeIndex& index() const { return index_; }
+  // Return the color of the logical buffer. Differently colored buffers
+  // cannot be part of the same allocation.
+ Color color() const { return color_; }
+ void set_color(Color color) { color_ = color; }
+
// Return the shape of the buffer. This reference points into the shape field
// of the instruction defining the buffer. Therefore, the returned shape will
// contain the layout of instruction, if any.
@@ -137,6 +147,7 @@ class LogicalBuffer {
HloInstruction* instruction_;
ShapeIndex index_;
Id id_;
+ Color color_;
// Similar to HLO constructs (HloInstruction, etc), pointers are used for
// comparison to equality, so disable all copying.
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
index 1f91868dea..ad6f015c70 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
@@ -125,13 +125,18 @@ void PointsToSet::add_tuple_source(const ShapeIndex& index,
}
/* static */ StatusOr<std::unique_ptr<TuplePointsToAnalysis>>
-TuplePointsToAnalysis::Run(const HloModule* module) {
+TuplePointsToAnalysis::Run(const HloModule* module, Colorer colorer) {
std::unique_ptr<TuplePointsToAnalysis> analysis(
- new TuplePointsToAnalysis(module));
+ new TuplePointsToAnalysis(module, std::move(colorer)));
TF_RETURN_IF_ERROR(analysis->Analyze());
return std::move(analysis);
}
+/* static */ StatusOr<std::unique_ptr<TuplePointsToAnalysis>>
+TuplePointsToAnalysis::Run(const HloModule* module) {
+ return Run(module, DefaultColorer());
+}
+
Status TuplePointsToAnalysis::Analyze() {
points_to_.clear();
for (auto& computation : module_->computations()) {
@@ -179,8 +184,8 @@ Status TuplePointsToAnalysis::PopulateDefinedBuffersAndAliases(
const LogicalBuffer& TuplePointsToAnalysis::NewLogicalBuffer(
HloInstruction* instruction, const ShapeIndex& index) {
CHECK_EQ(logical_buffers_.size(), next_buffer_id_);
- logical_buffers_.push_back(
- MakeUnique<LogicalBuffer>(instruction, index, next_buffer_id_));
+ logical_buffers_.push_back(MakeUnique<LogicalBuffer>(
+ instruction, index, next_buffer_id_, colorer_(instruction, index)));
++next_buffer_id_;
return *logical_buffers_.back();
}
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
index ffe15ac856..4d7fc7cbc9 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
@@ -142,7 +142,15 @@ std::ostream& operator<<(std::ostream& out, const BufferAlias& buffer_alias);
// the potential sources of each buffer in each instruction's output.
class TuplePointsToAnalysis : public DfsHloVisitorWithDefault {
public:
- // Runs points-to analysis on 'module'.
+ using Colorer = std::function<LogicalBuffer::Color(
+ const HloInstruction* instruction, const ShapeIndex& index)>;
+
+ // Runs points-to analysis on 'module' with the provided buffer color
+ // assigner.
+ static StatusOr<std::unique_ptr<TuplePointsToAnalysis>> Run(
+ const HloModule* module, Colorer colorer);
+
+ // Runs points-to analysis on 'module' with the default color assigner.
static StatusOr<std::unique_ptr<TuplePointsToAnalysis>> Run(
const HloModule* module);
@@ -207,8 +215,16 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault {
string ToString() const;
+ static Colorer DefaultColorer() {
+ return [](const HloInstruction* instruction, const ShapeIndex& index) {
+ return LogicalBuffer::Color(0);
+ };
+ }
+
private:
- explicit TuplePointsToAnalysis(const HloModule* module) : module_(module) {}
+ explicit TuplePointsToAnalysis(const HloModule* module,
+ Colorer colorer = DefaultColorer())
+ : module_(module), colorer_(colorer) {}
// Perform the analysis. Should be called immediately after constructing the
// object and before calling GetPointsToSet.
@@ -267,6 +283,9 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault {
// The ID of the next logical buffer created.
LogicalBuffer::Id next_buffer_id_ = 0;
+ // Used to color the created logical buffers.
+ Colorer colorer_;
+
TF_DISALLOW_COPY_AND_ASSIGN(TuplePointsToAnalysis);
};