diff options
14 files changed, 333 insertions, 75 deletions
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index bd10f1dc5c..cbf0a0ddd5 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -1151,6 +1151,7 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", ], ) diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc index e1107d9782..ea1dce7124 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.cc +++ b/tensorflow/compiler/xla/service/buffer_assignment.cc @@ -28,7 +28,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/heap_simulator.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" -#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" @@ -74,6 +73,9 @@ void BufferAllocation::AddAssignment(const LogicalBuffer& buffer, int64 offset, << " offset out of range"; CHECK_LE(offset + size, size_) << "LogicalBuffer " << buffer << " size out of range"; + CHECK_EQ(buffer.color(), color()) + << "Buffer color " << buffer.color() + << " does not match allocation color " << color() << "."; OffsetSize offset_size; offset_size.offset = offset; offset_size.size = size; @@ -86,6 +88,7 @@ BufferAllocationProto BufferAllocation::ToProto() const { proto.set_size(size_); proto.set_is_thread_local(is_thread_local_); proto.set_is_reusable(is_reusable_); + proto.set_color(color_.value()); if (is_entry_computation_parameter_) { proto.set_is_entry_computation_parameter(true); proto.set_parameter_number(parameter_number_); @@ -105,6 +108,9 @@ string BufferAllocation::ToString() const { tensorflow::strings::StrAppend( &output, 
tensorflow::strings::Printf("allocation %lld: %p, size %lld", index_, this, size())); + if (color().value() != 0) { + tensorflow::strings::StrAppend(&output, ", color ", color().value()); + } if (is_entry_computation_parameter()) { tensorflow::strings::StrAppend(&output, ", parameter ", parameter_number()); } @@ -248,11 +254,11 @@ BufferAssignment::GetUniqueTopLevelOutputSlice() const { module_->entry_computation()->root_instruction()); } -BufferAllocation* BufferAssignment::NewEmptyAllocation(int64 size, - bool is_thread_local, - bool is_reusable) { +BufferAllocation* BufferAssignment::NewEmptyAllocation( + int64 size, bool is_thread_local, bool is_reusable, + LogicalBuffer::Color color) { BufferAllocation::Index index = allocations_.size(); - allocations_.emplace_back(index, size, is_thread_local, is_reusable); + allocations_.emplace_back(index, size, is_thread_local, is_reusable, color); BufferAllocation* allocation = &allocations_.back(); return allocation; } @@ -262,7 +268,7 @@ BufferAllocation* BufferAssignment::NewAllocation(const LogicalBuffer& buffer, bool is_thread_local, bool is_reusable) { BufferAllocation* allocation = - NewEmptyAllocation(size, is_thread_local, is_reusable); + NewEmptyAllocation(size, is_thread_local, is_reusable, buffer.color()); AddAssignment(allocation, buffer, /*offset=*/0, size); return allocation; } @@ -282,33 +288,55 @@ void BufferAssignment::AddAssignment(BufferAllocation* allocation, allocation_index_for_buffer_[&buffer] = allocation->index(); } -// Combines allocations of temporary buffers into one big BufferAllocation. +// Combines allocations of temporary buffers of the same color into one big +// BufferAllocation. void BufferAssignment::CombineTempAllocations() { + FlatMap<LogicalBuffer::Color, BufferAllocation, LogicalBuffer::Color::Hasher> + combined_allocation_map; + // Move all temp allocations into a single run at the end of the allocations - // vector, and combine them into the first allocation of the run. 
+ // vector. const auto first_temp_it = std::partition(allocations_.begin(), allocations_.end(), [](const BufferAllocation& allocation) { return !allocation.IsPreallocatedTempBuffer(); }); + + // Walk over the run of temp allocations, collecting the allocations belonging + // to the same color. if (first_temp_it != allocations_.end()) { - BufferAllocation* combined = &*first_temp_it; - const auto second_temp_it = std::next(first_temp_it); - for (auto it = second_temp_it; it != allocations_.end(); ++it) { + for (auto it = first_temp_it; it != allocations_.end(); ++it) { + const BufferAllocation& temp_allocation = *it; + LogicalBuffer::Color color = temp_allocation.color(); + auto combined_it = combined_allocation_map.find(color); + if (combined_it == combined_allocation_map.end()) { + // We have found the first temp allocation of this color. Collect + // the other temp allocations of the same color into it. + combined_allocation_map.emplace(color, temp_allocation); + continue; + } + + auto* combined_allocation = &combined_it->second; // Each temp allocation is placed end-to-end, accounting for alignment. // The offset of each buffer in the combined allocation is computed from // the base offset of the allocation. 
- const int64 base = RoundUpToNearest(combined->size(), alignment_); - combined->set_size(base + it->size()); - for (const auto& buffer_offset_size : it->assigned_buffers_) { + const int64 base = + RoundUpToNearest(combined_allocation->size(), alignment_); + combined_allocation->set_size(base + temp_allocation.size()); + for (const auto& buffer_offset_size : temp_allocation.assigned_buffers_) { const LogicalBuffer* buffer = buffer_offset_size.first; const int64 offset = buffer_offset_size.second.offset; const int64 size = buffer_offset_size.second.size; - combined->AddAssignment(*buffer, base + offset, size); + combined_allocation->AddAssignment(*buffer, base + offset, size); } } - allocations_.erase(second_temp_it, allocations_.end()); - temp_allocation_ = combined; + // Replace all existing temporary allocations with the new combined + // allocations. + allocations_.erase(first_temp_it, allocations_.end()); + for (auto& combined : combined_allocation_map) { + allocations_.push_back(combined.second); + temp_allocation_total_size_ += combined.second.size(); + } } // Update allocation indices to their new positions. 
@@ -548,8 +576,9 @@ Status GatherComputationsByAllocationType( StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::Run( const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering, LogicalBuffer::SizeFunction buffer_size, int64 alignment, - bool allow_input_output_aliasing) { - BufferAssigner assigner(alignment, allow_input_output_aliasing); + bool allow_input_output_aliasing, TuplePointsToAnalysis::Colorer colorer) { + BufferAssigner assigner(alignment, allow_input_output_aliasing, + std::move(colorer)); return assigner.CreateAssignment(module, std::move(hlo_ordering), std::move(buffer_size)); } @@ -564,6 +593,12 @@ bool BufferAssigner::MaybeAssignBuffer(BufferAllocation* allocation, VLOG(4) << "Trying to assign " << buffer << " to allocation: " << *allocation; + if (buffer.color() != allocation->color()) { + VLOG(4) << "Can't assign: buffer has color " << buffer.color() + << " and allocation has color " << allocation->color() << "."; + return false; + } + if (buffer_size(buffer) > allocation->size()) { VLOG(4) << "Can't assign: buffer is larger than allocation (" << buffer_size(buffer) << " > " << allocation->size() << ")"; @@ -863,6 +898,19 @@ Status BufferAssigner::AssignBuffersForComputation( return Status::OK(); } +FlatMap<LogicalBuffer::Color, FlatSet<const LogicalBuffer*>, + LogicalBuffer::Color::Hasher> +BufferAssigner::SplitBuffersByColor( + const FlatSet<const LogicalBuffer*>& buffers) { + FlatMap<LogicalBuffer::Color, FlatSet<const LogicalBuffer*>, + LogicalBuffer::Color::Hasher> + color_map; + for (auto buffer : buffers) { + color_map[buffer->color()].insert(buffer); + } + return color_map; +} + Status BufferAssigner::AssignBuffersWithSequentialOrdering( const FlatMap<const HloComputation*, FlatSet<const LogicalBuffer*>>& buffers_to_assign_sequentially, @@ -888,14 +936,20 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering( all_buffers_to_assign.insert(buffers_to_assign.begin(), buffers_to_assign.end()); } - TF_ASSIGN_OR_RETURN( 
- const HeapSimulator::Result result, - HeapSimulator::Run(MakeUnique<DecreasingSizeRunsHeap>( - MakeUnique<LazyBestFitHeap>(alignment_)), - assignment->module(), module_sequence, - assignment->points_to_analysis(), - assignment->buffer_size_, &all_buffers_to_assign)); - AssignBuffersFromHeapSimulator(result, assignment); + auto color_map = SplitBuffersByColor(all_buffers_to_assign); + for (auto& single_colored_set : color_map) { + VLOG(2) << "Simulating heap for color " << single_colored_set.first; + TF_ASSIGN_OR_RETURN( + const HeapSimulator::Result result, + HeapSimulator::Run(MakeUnique<DecreasingSizeRunsHeap>( + MakeUnique<LazyBestFitHeap>(alignment_)), + assignment->module(), module_sequence, + assignment->points_to_analysis(), + assignment->buffer_size_, + &single_colored_set.second)); + AssignBuffersFromHeapSimulator(result, assignment, + single_colored_set.first); + } } else { // Run the heap-simulation on a per-computation basis. Buffers for // sub-computations are assigned disjoint BufferAllocations, assuming the @@ -907,21 +961,28 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering( const std::vector<const HloInstruction*>* instruction_sequence = hlo_ordering.SequentialOrder(*computation); CHECK(instruction_sequence != nullptr) << computation->name(); - TF_ASSIGN_OR_RETURN( - const HeapSimulator::Result result, - HeapSimulator::Run(MakeUnique<DecreasingSizeRunsHeap>( - MakeUnique<LazyBestFitHeap>(alignment_)), - *computation, *instruction_sequence, - assignment->points_to_analysis(), - assignment->buffer_size_, &buffers_to_assign)); - AssignBuffersFromHeapSimulator(result, assignment); + auto color_map = SplitBuffersByColor(buffers_to_assign); + for (auto& single_colored_set : color_map) { + VLOG(2) << "Simulating heap for color " << single_colored_set.first; + TF_ASSIGN_OR_RETURN( + const HeapSimulator::Result result, + HeapSimulator::Run(MakeUnique<DecreasingSizeRunsHeap>( + MakeUnique<LazyBestFitHeap>(alignment_)), + *computation, 
*instruction_sequence, + assignment->points_to_analysis(), + assignment->buffer_size_, + &single_colored_set.second)); + AssignBuffersFromHeapSimulator(result, assignment, + single_colored_set.first); + } } } return Status::OK(); } void BufferAssigner::AssignBuffersFromHeapSimulator( - const HeapSimulator::Result& result, BufferAssignment* assignment) { + const HeapSimulator::Result& result, BufferAssignment* assignment, + LogicalBuffer::Color color) { if (assignment->stats_.preallocated_temp_fragmentation_bytes == -1) { assignment->stats_.preallocated_temp_fragmentation_bytes = result.fragmentation_size; @@ -931,7 +992,7 @@ void BufferAssigner::AssignBuffersFromHeapSimulator( } BufferAllocation* allocation = assignment->NewEmptyAllocation( - result.heap_size, /*is_thread_local=*/false, /*is_reusable=*/true); + result.heap_size, /*is_thread_local=*/false, /*is_reusable=*/true, color); for (const auto& buffer_chunk : result.chunk_map) { const LogicalBuffer& buffer = *buffer_chunk.first; const HeapSimulator::Chunk& chunk = buffer_chunk.second; @@ -1234,7 +1295,8 @@ StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::CreateAssignment( const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering, LogicalBuffer::SizeFunction buffer_size) { TF_ASSIGN_OR_RETURN(std::unique_ptr<BufferLiveness> liveness, - BufferLiveness::Run(module, std::move(hlo_ordering))); + BufferLiveness::Run(module, std::move(hlo_ordering), + std::move(colorer_))); VLOG(1) << "Assigning buffers to module " << module->name(); XLA_VLOG_LINES(2, module->ToString()); diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h index 327f44bd2e..b3933f11c1 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.h +++ b/tensorflow/compiler/xla/service/buffer_assignment.h @@ -29,6 +29,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/logical_buffer.h" +#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -58,11 +59,12 @@ class BufferAllocation { using Index = int64; BufferAllocation(Index index, int64 size, bool is_thread_local, - bool is_reusable) + bool is_reusable, LogicalBuffer::Color color) : index_(index), size_(size), is_thread_local_(is_thread_local), - is_reusable_(is_reusable) {} + is_reusable_(is_reusable), + color_(color) {} ~BufferAllocation() {} // Returns the index of this allocation. @@ -97,6 +99,10 @@ class BufferAllocation { // large as any LogicalBuffer assigned to this allocation. int64 size() const { return size_; } + // Returns the color of the allocation. Only logical buffers with a matching + // color can reside in this allocation. + LogicalBuffer::Color color() const { return color_; } + struct OffsetSize { int64 offset = 0; int64 size = 0; @@ -217,6 +223,9 @@ class BufferAllocation { // Whether this buffer is usable by more than one logical buffer. bool is_reusable_; + // Color of the allocation. + LogicalBuffer::Color color_; + // Whether this allocation holds an entry computation parameter. Entry // computation parameters are special be cause they have lifetimes which may // outlast the computation. @@ -250,10 +259,10 @@ class BufferAssignment { return allocations_; } - // Returns the single allocation holding all temporary buffers. Returns - // nullptr if there are no temporary buffers, or if the assignment uses more - // than one allocation to hold temporary buffers. - const BufferAllocation* GetTempAllocation() const { return temp_allocation_; } + // Returns the total size allocation holding all temporary buffers. 
+ int64 temp_allocation_total_size() const { + return temp_allocation_total_size_; + } // Returns whether the given buffer has been assigned an allocation. bool HasAllocation(const LogicalBuffer& buffer) const; @@ -352,7 +361,8 @@ class BufferAssignment { // Creates and returns a new BufferAllocation, with no assigned // LogicalBuffers. Ownership is maintained internally. BufferAllocation* NewEmptyAllocation(int64 size, bool is_thread_local, - bool is_reusable); + bool is_reusable, + LogicalBuffer::Color color); // Helper that calls NewEmptyAllocation and AddAssignment in one call, // creating an allocation containing a single LogicalBuffer. @@ -383,8 +393,8 @@ class BufferAssignment { // The vector of buffer allocations. Indexed by BufferAllocation::Index. std::vector<BufferAllocation> allocations_; - // The single allocation holding all temporary buffers. - BufferAllocation* temp_allocation_ = nullptr; + // The total size of all temporary buffers. + int64 temp_allocation_total_size_ = 0; // Maps Buffers to the index of the BufferAllocation which holds the buffer. tensorflow::gtl::FlatMap<const LogicalBuffer*, BufferAllocation::Index> @@ -408,18 +418,22 @@ class BufferAssigner { public: // Build and return a BufferAssignment for the given module. The given // HloOrdering is used to determine buffer liveness. buffer_size is a function - // which returns the size of a LogicalBuffer. Alignment is the the minimum + // which returns the size of a LogicalBuffer. Alignment is the minimum // alignment of any buffer. allow_input_output_aliasing specifies whether // input buffer are allowed to be reused as outbut buffers by the client code. 
static StatusOr<std::unique_ptr<BufferAssignment>> Run( const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering, LogicalBuffer::SizeFunction buffer_size, int64 alignment, - bool allow_input_output_aliasing = false); + bool allow_input_output_aliasing = false, + TuplePointsToAnalysis::Colorer colorer = + TuplePointsToAnalysis::DefaultColorer()); private: - BufferAssigner(int64 alignment, bool allow_input_output_aliasing) + BufferAssigner(int64 alignment, bool allow_input_output_aliasing, + TuplePointsToAnalysis::Colorer colorer) : alignment_(alignment), - allow_input_output_aliasing_(allow_input_output_aliasing) {} + allow_input_output_aliasing_(allow_input_output_aliasing), + colorer_(colorer) {} virtual ~BufferAssigner() = default; // Create a buffer assignment. @@ -456,7 +470,8 @@ class BufferAssigner { // Uses the results of the heap simulator to create a single allocation, with // LogicalBuffers packed to specific offsets. void AssignBuffersFromHeapSimulator(const HeapSimulator::Result& result, - BufferAssignment* assignment); + BufferAssignment* assignment, + LogicalBuffer::Color color); // Tries to assign the given instruction to the given buffer. Returns if the // assignment was successful. @@ -501,6 +516,14 @@ class BufferAssigner { const LogicalBuffer::SizeFunction& buffer_size, std::vector<ColocatedBufferSet>* colocated_buffer_sets); + // Split a set of buffers into several sets, each of which contains buffers + // colored with the same color. + tensorflow::gtl::FlatMap<LogicalBuffer::Color, + tensorflow::gtl::FlatSet<const LogicalBuffer*>, + LogicalBuffer::Color::Hasher> + SplitBuffersByColor( + const tensorflow::gtl::FlatSet<const LogicalBuffer*>& buffers); + // Minimum alignment of any buffer. int64 alignment_; @@ -508,6 +531,9 @@ class BufferAssigner { // buffers can be shared if their sizes match. bool allow_input_output_aliasing_; + // Functor used to assign colors to newly allocated logical buffers. 
+ TuplePointsToAnalysis::Colorer colorer_; + TF_DISALLOW_COPY_AND_ASSIGN(BufferAssigner); }; diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc index a3b057a257..2258d20956 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc +++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc @@ -18,6 +18,7 @@ limitations under the License. #include <memory> #include <set> #include <string> +#include <utility> #include <vector> #include "tensorflow/compiler/xla/literal_util.h" @@ -88,6 +89,16 @@ class BufferAssignmentTest : public HloTestBase { .ConsumeValueOrDie(); } + std::unique_ptr<BufferAssignment> RunColoredBufferAssignment( + HloModule* module, TuplePointsToAnalysis::Colorer colorer, + int64 alignment = 1) { + return BufferAssigner::Run(module, + MakeUnique<DependencyHloOrdering>(module), + backend_->compiler()->BufferSizeBytesFunction(), + alignment, false, std::move(colorer)) + .ConsumeValueOrDie(); + } + // Builds an x+1.0 computation to use in a Map. std::unique_ptr<HloComputation> BuildMapComputationPlus1(const string& name) { auto builder = HloComputation::Builder(name); @@ -337,7 +348,113 @@ TEST_F(BufferAssignmentTest, Basic) { // The add node can reuse the mul node's buffer. const BufferAllocation& add_buffer = GetTopLevelAllocation(*buffers, add); - EXPECT_EQ(add_buffer.index(), add_buffer.index()); + EXPECT_EQ(add_buffer.index(), mul_buffer.index()); + + // The sub node has a valid output buffer assigned. + GetAssignedOutputAllocation(*buffers, sub); +} + +TEST_F(BufferAssignmentTest, BasicUniquelyColored) { + // paramscalar ------- (mul) -- (add) -- (sub) + // / / / + // param0[100] -------/ / / + // / / + // param1[100] --------------/--------/ + // The output of each op is colored with a different color, so we can not + // share anything. 
+ auto builder = HloComputation::Builder(TestName()); + auto paramscalar = + builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "")); + auto param0 = builder.AddInstruction( + HloInstruction::CreateParameter(1, f32vec100_, "")); + auto param1 = builder.AddInstruction( + HloInstruction::CreateParameter(2, f32vec100_, "")); + auto mul = builder.AddInstruction(HloInstruction::CreateBinary( + f32vec100_, HloOpcode::kMultiply, paramscalar, param0)); + auto add = builder.AddInstruction( + HloInstruction::CreateBinary(f32vec100_, HloOpcode::kAdd, mul, param1)); + auto sub = builder.AddInstruction(HloInstruction::CreateBinary( + f32vec100_, HloOpcode::kSubtract, add, param1)); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); + + auto buffers = RunColoredBufferAssignment( + module.get(), + [](const HloInstruction* instruction, const ShapeIndex& index) { + static int64 serial = 0; + return LogicalBuffer::Color(serial++); + }); + + // Distinct input buffers were assigned for parameters. + BufferAllocation paramscalar_buffer = + GetAssignedInputAllocation(*buffers, paramscalar); + BufferAllocation param0_buffer = GetAssignedInputAllocation(*buffers, param0); + BufferAllocation param1_buffer = GetAssignedInputAllocation(*buffers, param1); + EXPECT_NE(paramscalar_buffer.index(), param0_buffer.index()); + EXPECT_NE(paramscalar_buffer.index(), param1_buffer.index()); + EXPECT_NE(param0_buffer.index(), param1_buffer.index()); + + // The mul node has a valid buffer assigned, doesn't share with input. + const BufferAllocation& mul_buffer = GetTopLevelAllocation(*buffers, mul); + EXPECT_NE(mul_buffer.index(), param0_buffer.index()); + + // The add node can not reuse the mul node's buffer due to coloring. + const BufferAllocation& add_buffer = GetTopLevelAllocation(*buffers, add); + EXPECT_NE(add_buffer.index(), mul_buffer.index()); + + // The sub node has a valid output buffer assigned. 
+ GetAssignedOutputAllocation(*buffers, sub); +} + +TEST_F(BufferAssignmentTest, BasicPartiallyColored) { + // paramscalar ------- (mul) -- (add) -- (sub) + // / / / + // param0[100] -------/ / / + // / / + // param1[100] --------------/--------/ + // The output of the mul and the add have the color 1, and the other buffers + // have the color 0, which allows the mul and add to share buffers. + auto builder = HloComputation::Builder(TestName()); + auto paramscalar = + builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "")); + auto param0 = builder.AddInstruction( + HloInstruction::CreateParameter(1, f32vec100_, "")); + auto param1 = builder.AddInstruction( + HloInstruction::CreateParameter(2, f32vec100_, "")); + auto mul = builder.AddInstruction(HloInstruction::CreateBinary( + f32vec100_, HloOpcode::kMultiply, paramscalar, param0)); + auto add = builder.AddInstruction( + HloInstruction::CreateBinary(f32vec100_, HloOpcode::kAdd, mul, param1)); + auto sub = builder.AddInstruction(HloInstruction::CreateBinary( + f32vec100_, HloOpcode::kSubtract, add, param1)); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); + + auto buffers = RunColoredBufferAssignment( + module.get(), + [](const HloInstruction* instruction, const ShapeIndex& index) { + return (instruction->opcode() == HloOpcode::kAdd || + instruction->opcode() == HloOpcode::kMultiply) + ? LogicalBuffer::Color(1) + : LogicalBuffer::Color(0); + }); + + // Distinct input buffers were assigned for parameters. 
+ BufferAllocation paramscalar_buffer = + GetAssignedInputAllocation(*buffers, paramscalar); + BufferAllocation param0_buffer = GetAssignedInputAllocation(*buffers, param0); + BufferAllocation param1_buffer = GetAssignedInputAllocation(*buffers, param1); + EXPECT_NE(paramscalar_buffer.index(), param0_buffer.index()); + EXPECT_NE(paramscalar_buffer.index(), param1_buffer.index()); + EXPECT_NE(param0_buffer.index(), param1_buffer.index()); + + // The mul node has a valid buffer assigned, doesn't share with input. + const BufferAllocation& mul_buffer = GetTopLevelAllocation(*buffers, mul); + EXPECT_NE(mul_buffer.index(), param0_buffer.index()); + + // The add node can reuse the mul node's buffer. + const BufferAllocation& add_buffer = GetTopLevelAllocation(*buffers, add); + EXPECT_EQ(add_buffer.index(), mul_buffer.index()); // The sub node has a valid output buffer assigned. GetAssignedOutputAllocation(*buffers, sub); diff --git a/tensorflow/compiler/xla/service/buffer_liveness.cc b/tensorflow/compiler/xla/service/buffer_liveness.cc index d69a84cd0e..1b14c26340 100644 --- a/tensorflow/compiler/xla/service/buffer_liveness.cc +++ b/tensorflow/compiler/xla/service/buffer_liveness.cc @@ -37,15 +37,17 @@ namespace xla { /* static */ StatusOr<std::unique_ptr<BufferLiveness>> BufferLiveness::Run( - const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering) { + const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering, + TuplePointsToAnalysis::Colorer colorer) { std::unique_ptr<BufferLiveness> liveness( - new BufferLiveness(module, std::move(hlo_ordering))); + new BufferLiveness(module, std::move(hlo_ordering), std::move(colorer))); TF_RETURN_IF_ERROR(liveness->Analyze()); return std::move(liveness); } tensorflow::Status BufferLiveness::Analyze() { - TF_ASSIGN_OR_RETURN(points_to_analysis_, TuplePointsToAnalysis::Run(module_)); + TF_ASSIGN_OR_RETURN(points_to_analysis_, + TuplePointsToAnalysis::Run(module_, colorer_)); for (auto& computation : 
module_->computations()) { // Gather all instructions whose buffers might alias other instructions into // the set aliased_buffers_. This includes those contained as a tuple diff --git a/tensorflow/compiler/xla/service/buffer_liveness.h b/tensorflow/compiler/xla/service/buffer_liveness.h index 4c94d1a27d..9bb2564a83 100644 --- a/tensorflow/compiler/xla/service/buffer_liveness.h +++ b/tensorflow/compiler/xla/service/buffer_liveness.h @@ -39,7 +39,9 @@ class BufferLiveness { // Constructs a buffer liveness object for the given module assuming the given // HLO instruction ordering. static StatusOr<std::unique_ptr<BufferLiveness>> Run( - const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering); + const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering, + TuplePointsToAnalysis::Colorer colorer = + TuplePointsToAnalysis::DefaultColorer()); // Returns true if the live range of the buffer containing the output of 'a' // may overlap with the live range of the buffer of 'b'. If instruction 'a' @@ -69,8 +71,11 @@ class BufferLiveness { private: explicit BufferLiveness(const HloModule* module, - std::unique_ptr<HloOrdering> hlo_ordering) - : module_(module), hlo_ordering_(std::move(hlo_ordering)) {} + std::unique_ptr<HloOrdering> hlo_ordering, + TuplePointsToAnalysis::Colorer colorer) + : module_(module), + hlo_ordering_(std::move(hlo_ordering)), + colorer_(colorer) {} // Perform buffer liveness analysis. This method must be called prior to // MayInterfere or MaybeLiveOut. 
@@ -93,6 +98,8 @@ class BufferLiveness { tensorflow::gtl::FlatSet<const LogicalBuffer*> maybe_live_out_buffers_; std::unique_ptr<TuplePointsToAnalysis> points_to_analysis_; + + TuplePointsToAnalysis::Colorer colorer_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 08756c9eb0..5fa2bfdd7e 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -193,9 +193,11 @@ llvm::Function* IrEmitterUnnested::BuildKernelPrototype( // the result). We also know how many bytes can be dereferenced in it. const llvm::Argument& temp_buffer = *std::prev(kernel->arg_end()); int64 temp_buffer_arg_no = temp_buffer.getArgNo(); - if (const BufferAllocation* allocation = - ir_emitter_context_->buffer_assignment().GetTempAllocation()) { - kernel->addDereferenceableAttr(temp_buffer_arg_no + 1, allocation->size()); + int64 temp_allocation_total_size = + ir_emitter_context_->buffer_assignment().temp_allocation_total_size(); + if (temp_allocation_total_size != 0) { + kernel->addDereferenceableAttr(temp_buffer_arg_no + 1, + temp_allocation_total_size); } kernel->addAttribute(temp_buffer_arg_no + 1, llvm::Attribute::NoAlias); diff --git a/tensorflow/compiler/xla/service/heap_simulator_test.cc b/tensorflow/compiler/xla/service/heap_simulator_test.cc index 0a6900f733..60a0768a86 100644 --- a/tensorflow/compiler/xla/service/heap_simulator_test.cc +++ b/tensorflow/compiler/xla/service/heap_simulator_test.cc @@ -507,10 +507,11 @@ class HeapAlgorithmTestBase : public ::testing::Test { private: // Create a dummy LogicalBuffer to pass to the heap algorithm. Since the // algorithms only use the buffer as a handle, we don't need to fill in much - // other than the id. + // other than the id and color. 
const LogicalBuffer* DummyLogicalBuffer() { const LogicalBuffer::Id id = buffers_.size(); - buffers_.emplace_back(MakeUnique<LogicalBuffer>(nullptr, ShapeIndex{}, id)); + buffers_.emplace_back(MakeUnique<LogicalBuffer>(nullptr, ShapeIndex{}, id, + LogicalBuffer::Color(0))); return buffers_.back().get(); } diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto index cfd1f0f53b..af853385d6 100644 --- a/tensorflow/compiler/xla/service/hlo.proto +++ b/tensorflow/compiler/xla/service/hlo.proto @@ -106,6 +106,8 @@ message LogicalBufferProto { // The location where the buffer is defined. Location defined_at = 3; + + int64 color = 4; } // Serialization of BufferAllocation. @@ -125,7 +127,8 @@ message BufferAllocationProto { bool is_entry_computation_parameter = 5; int64 parameter_number = 6; bool maybe_live_out = 7; - repeated Assigned assigned = 8; + int64 color = 8; + repeated Assigned assigned = 9; } // A trace of a HeapSimulator run. diff --git a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc index fc337a8924..02710ff57f 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc @@ -29,7 +29,8 @@ namespace llvm_ir { // Sentry allocation used to represent parameters of the entry computation in // alias_scope_metadata_ and noalias_metadata_. 
static const BufferAllocation* kParameterAllocation = new BufferAllocation( - /*index=*/-1, /*size=*/0, /*is_thread_local=*/false, /*is_reusable=*/false); + /*index=*/-1, /*size=*/0, /*is_thread_local=*/false, /*is_reusable=*/false, + LogicalBuffer::Color(0)); void AliasAnalysis::AddAliasingInformationToIrArray(const HloInstruction& hlo, llvm_ir::IrArray* array) { diff --git a/tensorflow/compiler/xla/service/logical_buffer.cc b/tensorflow/compiler/xla/service/logical_buffer.cc index 971d35727f..d24a592f46 100644 --- a/tensorflow/compiler/xla/service/logical_buffer.cc +++ b/tensorflow/compiler/xla/service/logical_buffer.cc @@ -29,7 +29,7 @@ namespace xla { string LogicalBuffer::ToString() const { return tensorflow::strings::StrCat(instruction_->FullyQualifiedName(), "[", tensorflow::str_util::Join(index_, ","), - "](#", id_, ")"); + "](#", id_, " @", color_.value(), ")"); } std::ostream& operator<<(std::ostream& out, const LogicalBuffer& buffer) { @@ -55,6 +55,7 @@ LogicalBufferProto LogicalBuffer::ToProto(const SizeFunction& size_fn) const { LogicalBufferProto::Location proto_location = ToLocationProto(*instruction_, index_); proto.mutable_defined_at()->Swap(&proto_location); + proto.set_color(color_.value()); return proto; } diff --git a/tensorflow/compiler/xla/service/logical_buffer.h b/tensorflow/compiler/xla/service/logical_buffer.h index 61d79f7007..566cd01ea4 100644 --- a/tensorflow/compiler/xla/service/logical_buffer.h +++ b/tensorflow/compiler/xla/service/logical_buffer.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LOGICAL_BUFFER_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_LOGICAL_BUFFER_H_ +#include <functional> #include <iosfwd> #include <string> #include <vector> @@ -26,6 +27,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/gtl/int_type.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" @@ -82,6 +84,8 @@ namespace xla { // LogicalBuffer(%tuple_constant, {1, 1}) // Holds value "43" class LogicalBuffer { public: + TF_LIB_GTL_DEFINE_INT_TYPE(Color, int64); + // Id is a unique identifier for the LogicalBuffer to facilitate efficient // collections of LogicalBuffers with stable iteration order. // LogicalBuffers are typically created and accessed through @@ -93,8 +97,9 @@ class LogicalBuffer { using SizeFunction = std::function<int64(const LogicalBuffer&)>; using AlignmentFunction = std::function<int64(const LogicalBuffer&)>; - LogicalBuffer(HloInstruction* instruction, const ShapeIndex& index, Id id) - : instruction_(instruction), index_(index), id_(id) {} + LogicalBuffer(HloInstruction* instruction, const ShapeIndex& index, Id id, + Color color) + : instruction_(instruction), index_(index), id_(id), color_(color) {} Id id() const { return id_; } @@ -105,6 +110,11 @@ class LogicalBuffer { // defined. Index used defined as in ShapeUtil::GetSubshape() const ShapeIndex& index() const { return index_; } + // Return the color of the logical buffer. Differently colored buffers can + // not be parts of the same allocation. + Color color() const { return color_; } + void set_color(Color color) { color_ = color; } + // Return the shape of the buffer. This reference points into the shape field // of the instruction defining the buffer. Therefore, the returned shape will // contain the layout of instruction, if any. @@ -137,6 +147,7 @@ class LogicalBuffer { HloInstruction* instruction_; ShapeIndex index_; Id id_; + Color color_; // Similar to HLO constructs (HloInstruction, etc), pointers are used for // comparison to equality, so disable all copying. 
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc index 1f91868dea..ad6f015c70 100644 --- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc +++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc @@ -125,13 +125,18 @@ void PointsToSet::add_tuple_source(const ShapeIndex& index, } /* static */ StatusOr<std::unique_ptr<TuplePointsToAnalysis>> -TuplePointsToAnalysis::Run(const HloModule* module) { +TuplePointsToAnalysis::Run(const HloModule* module, Colorer colorer) { std::unique_ptr<TuplePointsToAnalysis> analysis( - new TuplePointsToAnalysis(module)); + new TuplePointsToAnalysis(module, std::move(colorer))); TF_RETURN_IF_ERROR(analysis->Analyze()); return std::move(analysis); } +/* static */ StatusOr<std::unique_ptr<TuplePointsToAnalysis>> +TuplePointsToAnalysis::Run(const HloModule* module) { + return Run(module, DefaultColorer()); +} + Status TuplePointsToAnalysis::Analyze() { points_to_.clear(); for (auto& computation : module_->computations()) { @@ -179,8 +184,8 @@ Status TuplePointsToAnalysis::PopulateDefinedBuffersAndAliases( const LogicalBuffer& TuplePointsToAnalysis::NewLogicalBuffer( HloInstruction* instruction, const ShapeIndex& index) { CHECK_EQ(logical_buffers_.size(), next_buffer_id_); - logical_buffers_.push_back( - MakeUnique<LogicalBuffer>(instruction, index, next_buffer_id_)); + logical_buffers_.push_back(MakeUnique<LogicalBuffer>( + instruction, index, next_buffer_id_, colorer_(instruction, index))); ++next_buffer_id_; return *logical_buffers_.back(); } diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h index ffe15ac856..4d7fc7cbc9 100644 --- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h +++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h @@ -142,7 +142,15 @@ std::ostream& operator<<(std::ostream& out, const BufferAlias& buffer_alias); 
// the potential sources of each buffer in each instruction's output. class TuplePointsToAnalysis : public DfsHloVisitorWithDefault { public: - // Runs points-to analysis on 'module'. + using Colorer = std::function<LogicalBuffer::Color( + const HloInstruction* instruction, const ShapeIndex& index)>; + + // Runs points-to analysis on 'module' with the provided buffer color + // assigner. + static StatusOr<std::unique_ptr<TuplePointsToAnalysis>> Run( + const HloModule* module, Colorer colorer); + + // Runs points-to analysis on 'module' with the default color assigner. static StatusOr<std::unique_ptr<TuplePointsToAnalysis>> Run( const HloModule* module); @@ -207,8 +215,16 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault { string ToString() const; + static Colorer DefaultColorer() { + return [](const HloInstruction* instruction, const ShapeIndex& index) { + return LogicalBuffer::Color(0); + }; + } + private: - explicit TuplePointsToAnalysis(const HloModule* module) : module_(module) {} + explicit TuplePointsToAnalysis(const HloModule* module, + Colorer colorer = DefaultColorer()) + : module_(module), colorer_(colorer) {} // Perform the analysis. Should be called immediately after constructing the // object and before calling GetPointsToSet. @@ -267,6 +283,9 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault { // The ID of the next logical buffer created. LogicalBuffer::Id next_buffer_id_ = 0; + // Used to color the created logical buffers. + Colorer colorer_; + TF_DISALLOW_COPY_AND_ASSIGN(TuplePointsToAnalysis); }; |