diff options
author | A. Unique TensorFlower <gardener@tensorflow.org> | 2017-05-08 14:10:10 -0700 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2017-05-10 15:26:43 -0700 |
commit | 09f3fb939c9b395a9bc747cf81d15b2dc2804c3e (patch) | |
tree | c69f74947e38a7be1313e22f66397454f67fc849 | |
parent | 70c303386909fe1a0d34cada6fe5a42565279849 (diff) |
Merged commit includes the following changes:
155425029 by A. Unique TensorFlower <gardener@tensorflow.org>:
Internal change.
--
155424167 by A. Unique TensorFlower <gardener@tensorflow.org>:
Internal change.
--
PiperOrigin-RevId: 155425029
-rw-r--r-- | tensorflow/compiler/xla/service/BUILD | 1 | ||||
-rw-r--r-- | tensorflow/compiler/xla/service/buffer_liveness.cc | 10 | ||||
-rw-r--r-- | tensorflow/compiler/xla/service/copy_insertion.cc | 7 | ||||
-rw-r--r-- | tensorflow/compiler/xla/service/hlo_instruction.h | 2 | ||||
-rw-r--r-- | tensorflow/compiler/xla/service/hlo_rematerialization.cc | 4 | ||||
-rw-r--r-- | tensorflow/compiler/xla/service/liveness_util.cc | 98 | ||||
-rw-r--r-- | tensorflow/compiler/xla/service/liveness_util_test.cc | 70 | ||||
-rw-r--r-- | tensorflow/compiler/xla/service/tuple_points_to_analysis.cc | 26 | ||||
-rw-r--r-- | tensorflow/compiler/xla/service/tuple_points_to_analysis.h | 15 | ||||
-rw-r--r-- | tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc | 9 | ||||
-rw-r--r-- | third_party/eigen3/unsupported/Eigen/CXX11/eigen.threadpool (renamed from tensorflow/opensource_only/eigen.threadpool) | 0 |
11 files changed, 173 insertions, 69 deletions
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index fd47ffe806..7bdc2afe5e 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -1203,6 +1203,7 @@ cc_library( ":buffer_liveness", ":hlo", ":hlo_pass", + ":liveness_util", ":logical_buffer", ":tuple_points_to_analysis", "//tensorflow/compiler/xla:status_macros", diff --git a/tensorflow/compiler/xla/service/buffer_liveness.cc b/tensorflow/compiler/xla/service/buffer_liveness.cc index 38c2c81551..3be4810490 100644 --- a/tensorflow/compiler/xla/service/buffer_liveness.cc +++ b/tensorflow/compiler/xla/service/buffer_liveness.cc @@ -45,9 +45,7 @@ StatusOr<std::unique_ptr<BufferLiveness>> BufferLiveness::Run( } tensorflow::Status BufferLiveness::Analyze() { - TF_ASSIGN_OR_RETURN(points_to_analysis_, - TuplePointsToAnalysis::Run( - module_, /*include_loop_fusion_instructions=*/true)); + TF_ASSIGN_OR_RETURN(points_to_analysis_, TuplePointsToAnalysis::Run(module_)); for (auto& computation : module_->computations()) { // Gather all instructions whose buffers might alias other instructions into // the set aliased_buffers_. This includes those contained as a tuple @@ -117,11 +115,7 @@ bool BufferLiveness::live_range_strictly_before(const LogicalBuffer& a, // If 'b' is a user of 'a' then the buffers interfere unless 'a.instruction' // and 'b.instruction' emit the same shape/layout, and 'b.instruction' meets - // one of following qualifications: - // *) Is element-wise. - // *) Is a loop fusion instruction (with DynamicUpdateSlice fused root) where - // the singleton use of 'a' at 'a.index' is the fused root at operand 0. - // *) Use of 'operand' is DynamicUpdateSlice at operand index 0. + // the qualifications specified in CanShareOperandBufferWithUser. 
for (const BufferAlias& alias : points_to_analysis_->GetBufferAliases(a)) { if (b.instruction()->IsUserOf(alias.instruction()) && !CanShareOperandBufferWithUser(alias.instruction(), alias.index(), diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc index 7db28aed3c..907b0307d4 100644 --- a/tensorflow/compiler/xla/service/copy_insertion.cc +++ b/tensorflow/compiler/xla/service/copy_insertion.cc @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/liveness_util.h" #include "tensorflow/compiler/xla/service/logical_buffer.h" #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -319,6 +320,7 @@ Status InstructionCopier::RecordIndicesWhichInterfereWithOtherInstruction( if (liveness.MayInterfere(*instruction_buffer, *other_buffer)) { VLOG(2) << "Adding copy of buffer for instruction: " << instruction_->name() + << " instruction_buffer: " << instruction_buffer->ToString() << " at index: " << tensorflow::str_util::Join(index, ",") << " because of interference with buffer: " << other_buffer->ToString(); @@ -351,6 +353,11 @@ Status InstructionCopier::RecordControlPredecessors( for (const BufferAlias& alias : points_to_analysis.GetBufferAliases(*buffer)) { for (HloInstruction* user : alias.instruction()->users()) { + if (DoesNotUseOperandBuffer(alias.instruction(), alias.index(), + user, points_to_analysis)) { + continue; + } + if (user != instruction_) { control_predecessors_.mutable_element(index)->push_back(user); } diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index 43935690df..d300d99ade 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ 
b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -56,6 +56,8 @@ class HloInstruction { kLoop, // Fused into a loop. kInput, // Op's input is fused into the op itself. kOutput, // Op's output is fused into the op itself. + // REQUIRES: At least one operand buffer must be able + // to alias the output buffer. kTransposeDot, // Fused into a dot with transposed operands. kConvBackwardFilter, // Fused into a backward filter convolution. kConvBackwardInput, // Fused into a backward input convolution. diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc index b1ee2e46b0..5d4fd7c2de 100644 --- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc +++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc @@ -1156,9 +1156,7 @@ StatusOr<bool> HloRematerialization::Run( VLOG(1) << "HloRematerialization() with memory limit of " << HumanReadableNumBytes(memory_limit_bytes); - TF_ASSIGN_OR_RETURN(points_to_analysis_, - TuplePointsToAnalysis::Run( - module, /*include_loop_fusion_instructions=*/true)); + TF_ASSIGN_OR_RETURN(points_to_analysis_, TuplePointsToAnalysis::Run(module)); // Adjust memory limit to account for the output of the entry // computation. This is necessary because the per-computation accounting in diff --git a/tensorflow/compiler/xla/service/liveness_util.cc b/tensorflow/compiler/xla/service/liveness_util.cc index e0991fcb76..16e11ca6c6 100644 --- a/tensorflow/compiler/xla/service/liveness_util.cc +++ b/tensorflow/compiler/xla/service/liveness_util.cc @@ -99,6 +99,41 @@ std::vector<std::pair<HloInstruction*, int64>> GetAllUsesOfInstructionAtIndex( return uses; } +// Returns true if there is exactly one use of 'operand' at 'operand_index' +// in 'fusion.fused_instructions', where the singleton use is the fused +// root at operand index 'use_operand_index'. Returns false otherwise. +// +// REQUIRES: 'fusion' opcode is a kFusion instruction. 
+bool HasUniqueFusedUseOfOperandAt( + HloInstruction* operand, const ShapeIndex& operand_index, + HloInstruction* fusion, const int64 use_operand_index, + const TuplePointsToAnalysis& points_to_analysis) { + CHECK_EQ(HloOpcode::kFusion, fusion->opcode()); + // Check that 'operand' is unique in the operand list of 'fusion'. + if (fusion->OperandIndices(operand).size() > 1) { + return false; + } + // Find fusion parameter associated with 'operand'. + const auto& fused_params = fusion->fused_parameters(); + auto fused_param_it = std::find_if( + fused_params.begin(), fused_params.end(), + [&](HloInstruction* fused_param) { + return fusion->operand(fused_param->parameter_number()) == operand; + }); + if (fused_param_it == fused_params.end()) { + return false; + } + auto* fused_param = *fused_param_it; + // Get all uses of 'operand' at 'index' from 'fusion.fused_instructions'. + auto fused_param_uses = GetAllUsesOfInstructionAtIndex( + fused_param, operand_index, points_to_analysis); + // Return true iff there is exactly one use of 'operand' at 'index', and + // this singleton use is the fused root (at index in 'use_operand_indices'). + return fused_param_uses.size() == 1 && + fused_param_uses[0].first == fusion->fused_expression_root() && + fused_param_uses[0].second == use_operand_index; +} + } // namespace // User and operand can share buffers iff both instructions emit the same shape @@ -107,6 +142,9 @@ std::vector<std::pair<HloInstruction*, int64>> GetAllUsesOfInstructionAtIndex( // *) Is a loop fusion instruction where the only use of 'operand' at 'index' // in the set 'user.fused_instructions' is a DynamicUpdateSlice fused root // at operand 0. Or... +// *) Is a kDot -> kAdd (or fused kTransposeDot -> kAdd) output fusion +// instruction where the only use of 'operand' at 'index' in the set +// 'user.fused_instructions' is a kAdd fused root at operand 0 or 1. Or... // *) The 'user' of 'operand' is DynamicUpdateSlice or While at operand index 0. 
bool CanShareOperandBufferWithUser( HloInstruction* operand, const ShapeIndex& operand_index, @@ -126,30 +164,46 @@ bool CanShareOperandBufferWithUser( if (user->opcode() == HloOpcode::kCopy) { return false; } - // Check if 'user' is a loop fusion instruction with a kDynamicUpdateSlice - // fused root instruction. - if (user->opcode() == HloOpcode::kFusion && - user->fusion_kind() == HloInstruction::FusionKind::kLoop && - user->fused_expression_root()->opcode() == - HloOpcode::kDynamicUpdateSlice) { - for (auto& fused_param : user->fused_parameters()) { - // Find fusion parameter associated with 'operand'. - if (user->operand(fused_param->parameter_number()) != operand) { - continue; - } - // Get all uses of 'operand' at 'index' from 'user.fused_instructions'. - auto fused_param_uses = GetAllUsesOfInstructionAtIndex( - fused_param, operand_index, points_to_analysis); - // Return true iff there is exactly one use of 'operand' at 'index', and - // this singleton use is the fused root at operand index 0. - if (fused_param_uses.size() == 1 && - fused_param_uses[0].first == user->fused_expression_root() && - fused_param_uses[0].second == 0) { - return true; + if (user->opcode() == HloOpcode::kFusion) { + if (user->fusion_kind() == HloInstruction::FusionKind::kLoop && + user->fused_expression_root()->opcode() == + HloOpcode::kDynamicUpdateSlice) { + // Loop fusion with kDynamicUpdateSlice fused root. + // + // Returns true iff there is exactly one use of 'operand' at shape index + // 'operand_index', and this singleton use is the fused root at operand + // index 0. + return HasUniqueFusedUseOfOperandAt(operand, operand_index, user, 0, + points_to_analysis); + } else if (user->fusion_kind() == HloInstruction::FusionKind::kOutput && + user->fused_expression_root()->opcode() == HloOpcode::kAdd) { + // Output fusion with kAdd fused root. + + // Check if one operand of kAdd fused root is either kDot, or nested + // kFusion of kind kTransposeDot. 
+ auto* add = user->fused_expression_root(); + auto add_operand_it = + std::find_if(add->operands().begin(), add->operands().end(), + [&](HloInstruction* operand) { + return operand->opcode() == HloOpcode::kDot || + (operand->opcode() == HloOpcode::kFusion && + operand->fusion_kind() == + HloInstruction::FusionKind::kTransposeDot); + }); + if (add_operand_it == add->operands().end()) { + return false; } - break; + auto* matched_add_operand = *add_operand_it; + // Calculate operand index of 'add' operand which was not matched above. + const int64 other_add_operand_index = + matched_add_operand == add->operand(0) ? 1 : 0; + // Returns true iff there is exactly one use of 'operand' at shape index + // 'operand_index', and this singleton use is the fused root (at operand + // index 'other_add_operand_index'). + return HasUniqueFusedUseOfOperandAt(operand, operand_index, user, + other_add_operand_index, + points_to_analysis); } - return false; } if (user->opcode() == HloOpcode::kDynamicUpdateSlice || user->opcode() == HloOpcode::kWhile) { diff --git a/tensorflow/compiler/xla/service/liveness_util_test.cc b/tensorflow/compiler/xla/service/liveness_util_test.cc index 1ee0292511..0ddd0caa35 100644 --- a/tensorflow/compiler/xla/service/liveness_util_test.cc +++ b/tensorflow/compiler/xla/service/liveness_util_test.cc @@ -34,9 +34,7 @@ class PointsToAnalysisTestBase : public HloTestBase { void RunAnalysis() { CHECK_NOTNULL(module_.get()); points_to_analysis_ = - TuplePointsToAnalysis::Run(module_.get(), - /*include_loop_fusion_instructions=*/true) - .ConsumeValueOrDie(); + TuplePointsToAnalysis::Run(module_.get()).ConsumeValueOrDie(); } void BuildModuleAndRunAnalysis(std::unique_ptr<HloComputation> computation) { @@ -231,6 +229,72 @@ TEST_F(CanShareOperandBufferWithUserTest, DynamicUpdateSliceCanShare) { CanShareOperandBufferWithUser(starts, {}, dus, {}, *points_to_analysis_)); } +TEST_F(CanShareOperandBufferWithUserTest, FusedDotAdd) { + auto builder = 
HloComputation::Builder(TestName()); + Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2}); + + auto a = builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR2<float>({{1.0, 0.0}, {0.0, 1.0}}))); + auto b = builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR2<float>({{2.0, 2.0}, {2.0, 2.0}}))); + + auto dot = builder.AddInstruction( + HloInstruction::CreateBinary(data_shape, HloOpcode::kDot, a, b)); + + auto one = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0))); + auto add_operand = builder.AddInstruction( + HloInstruction::CreateBroadcast(data_shape, one, {1})); + + auto add = builder.AddInstruction(HloInstruction::CreateBinary( + data_shape, HloOpcode::kAdd, dot, add_operand)); + + BuildModule(builder.Build()); + auto fusion = computation_->CreateFusionInstruction( + {add, dot}, HloInstruction::FusionKind::kOutput); + RunAnalysis(); + + // Output fused dot add should be able to share buffer with 'add_operand'. 
+ EXPECT_TRUE(CanShareOperandBufferWithUser(add_operand, {}, fusion, {}, + *points_to_analysis_)); +} + +TEST_F(CanShareOperandBufferWithUserTest, FusedTransposeDotAdd) { + auto builder = HloComputation::Builder(TestName()); + Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2}); + + auto a = builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR2<float>({{1.0, 0.0}, {0.0, 1.0}}))); + auto b = builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR2<float>({{2.0, 2.0}, {2.0, 2.0}}))); + auto b_t = builder.AddInstruction( + HloInstruction::CreateTranspose(data_shape, b, {1, 0})); + + auto dot = builder.AddInstruction( + HloInstruction::CreateBinary(data_shape, HloOpcode::kDot, a, b_t)); + + auto one = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0))); + auto add_operand = builder.AddInstruction( + HloInstruction::CreateBroadcast(data_shape, one, {1})); + + auto add = builder.AddInstruction(HloInstruction::CreateBinary( + data_shape, HloOpcode::kAdd, dot, add_operand)); + + BuildModule(builder.Build()); + + auto nested_fusion = computation_->CreateFusionInstruction( + {dot, b_t}, HloInstruction::FusionKind::kTransposeDot); + + auto fusion = computation_->CreateFusionInstruction( + {add, nested_fusion}, HloInstruction::FusionKind::kOutput); + RunAnalysis(); + + // Output fused transpose-dot-add should be able to share buffer with 'add_operand'. 
+ EXPECT_TRUE(CanShareOperandBufferWithUser(add_operand, {}, fusion, {}, + *points_to_analysis_)); +} + TEST_F(CanShareOperandBufferWithUserTest, WhileCanShare) { Shape data_shape = ShapeUtil::MakeShape(F32, {8}); diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc index 98c51b48f9..554adaf0e3 100644 --- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc +++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc @@ -131,10 +131,9 @@ void PointsToSet::add_tuple_source(const ShapeIndex& index, } /* static */ StatusOr<std::unique_ptr<TuplePointsToAnalysis>> -TuplePointsToAnalysis::Run(const HloModule* module, - const bool include_loop_fusion_instructions) { +TuplePointsToAnalysis::Run(const HloModule* module) { std::unique_ptr<TuplePointsToAnalysis> analysis( - new TuplePointsToAnalysis(module, include_loop_fusion_instructions)); + new TuplePointsToAnalysis(module)); TF_RETURN_IF_ERROR(analysis->Analyze()); return std::move(analysis); } @@ -145,17 +144,14 @@ Status TuplePointsToAnalysis::Analyze() { TF_RETURN_IF_ERROR(computation->Accept(this)); TF_RETURN_IF_ERROR( PopulateDefinedBuffersAndAliases(computation->instructions())); - if (include_loop_fusion_instructions_) { - // Run points-to analysis on loop fusion instructions in 'computation'. - for (auto& instruction : computation->instructions()) { - if (instruction->opcode() != HloOpcode::kFusion || - instruction->fusion_kind() != HloInstruction::FusionKind::kLoop) { - continue; - } - TF_RETURN_IF_ERROR(instruction->fused_expression_root()->Accept(this)); - TF_RETURN_IF_ERROR(PopulateDefinedBuffersAndAliases( - instruction->fused_instructions())); + // Run points-to analysis on fusion instructions in 'computation'. 
+ for (auto& instruction : computation->instructions()) { + if (instruction->opcode() != HloOpcode::kFusion) { + continue; } + TF_RETURN_IF_ERROR(instruction->fused_expression_root()->Accept(this)); + TF_RETURN_IF_ERROR( + PopulateDefinedBuffersAndAliases(instruction->fused_instructions())); } } @@ -482,9 +478,7 @@ string TuplePointsToAnalysis::ToString() const { for (const HloInstruction* instruction : computation->MakeInstructionPostOrder()) { InstructionToString(instruction, &output); - if (include_loop_fusion_instructions_ && - instruction->opcode() == HloOpcode::kFusion && - instruction->fusion_kind() == HloInstruction::FusionKind::kLoop) { + if (instruction->opcode() == HloOpcode::kFusion) { for (auto& fused : instruction->fused_instructions()) { InstructionToString(fused.get(), &output); } diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h index a384529171..85a71b56ce 100644 --- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h +++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h @@ -148,12 +148,9 @@ std::ostream& operator<<(std::ostream& out, const BufferAlias& buffer_alias); // the potential sources of each buffer in each instruction's output. class TuplePointsToAnalysis : public DfsHloVisitorWithDefault { public: - // Runs points-to analysis on 'module'. If 'include_loop_fusion_instructions' - // is true, includes fused instructions from each loop fusion instruction - // in 'module' in the points-to analysis. + // Runs points-to analysis on 'module'. static StatusOr<std::unique_ptr<TuplePointsToAnalysis>> Run( - const HloModule* module, - const bool include_loop_fusion_instructions = false); + const HloModule* module); // Return the points-to set of an instruction. This describes the potential // sources of each buffer in the instruction's output. 
@@ -218,10 +215,7 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault { string ToString() const; private: - explicit TuplePointsToAnalysis(const HloModule* module, - const bool include_loop_fusion_instructions) - : module_(module), - include_loop_fusion_instructions_(include_loop_fusion_instructions) {} + explicit TuplePointsToAnalysis(const HloModule* module) : module_(module) {} // Perform the analysis. Should be called immediately after constructing the // object and before calling GetPointsToSet. @@ -261,9 +255,6 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault { // The module this analysis is performed on. const HloModule* module_; - // Whether to run points-to analysis on loop fusion instructions in 'module_'. - const bool include_loop_fusion_instructions_; - // A map containing a PointsToSet for every HLO instruction. tensorflow::gtl::FlatMap<const HloInstruction*, std::unique_ptr<PointsToSet>> points_to_; diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc index 808050bdab..87e1b058b7 100644 --- a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc +++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc @@ -52,11 +52,10 @@ class TuplePointsToAnalysisTest : public HloTestBase { module_->AddEntryComputation(std::move(computation)); } - void RunAnalysis(const bool include_loop_fusion_instructions = false) { + void RunAnalysis() { CHECK_NOTNULL(module_.get()); - points_to_analysis_ = TuplePointsToAnalysis::Run( - module_.get(), include_loop_fusion_instructions) - .ConsumeValueOrDie(); + points_to_analysis_ = + TuplePointsToAnalysis::Run(module_.get()).ConsumeValueOrDie(); } // Returns the LogicalBuffer defined at the given instruction and @@ -609,7 +608,7 @@ class FusionPointsToAnalysisTest : public TuplePointsToAnalysisTest { auto* fusion = module_->entry_computation()->root_instruction(); EXPECT_THAT(fusion, 
op::Fusion(tuple_param0)); // Run points-to analysis (should include fused instructions from 'fusion'). - RunAnalysis(/*include_loop_fusion_instructions=*/true); + RunAnalysis(); // Check points-to set of fusion parameter associated with 'tuple_param0'. auto* fusion_param = GetFusionParameterForOperand(fusion, tuple_param0); diff --git a/tensorflow/opensource_only/eigen.threadpool b/third_party/eigen3/unsupported/Eigen/CXX11/eigen.threadpool index d2639af4d9..d2639af4d9 100644 --- a/tensorflow/opensource_only/eigen.threadpool +++ b/third_party/eigen3/unsupported/Eigen/CXX11/eigen.threadpool |