4 files changed, 115 insertions, 101 deletions
diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation.cc b/tensorflow/compiler/xla/service/bfloat16_propagation.cc
index b6f3c84c7e..b21c83a07f 100644
--- a/tensorflow/compiler/xla/service/bfloat16_propagation.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_propagation.cc
@@ -615,7 +615,6 @@ Status BFloat16Propagation::ResolveInconsistentFusions(HloModule* module) {
   // (1) a is F32 but tuple is BF16
   // (2) after adding conversion
   // (3) after tuple simplifier and DCE.
-  bool needs_tuple_simplifier = false;
   for (auto computation : module->MakeComputationPostOrder()) {
     auto insts = computation->MakeInstructionPostOrder();
     for (auto inst_it = insts.rbegin(); inst_it != insts.rend(); ++inst_it) {
@@ -629,67 +628,25 @@ Status BFloat16Propagation::ResolveInconsistentFusions(HloModule* module) {
         continue;
       }
       ShapeTree<HloInstruction*> converted_outputs(hlo->shape());
-      // Iterate through nodes in the shape tree in pre-order and initialize
-      // each non-root node with a corresponding get-tuple-element. For a leaf
-      // node, if its shape does not match the fusion output, create a
-      // conversion node to overwrite the node value.
-      for (auto it = converted_outputs.begin(); it != converted_outputs.end();
-           ++it) {
-        ShapeIndex output_index = it->first;
-        HloInstruction*& output = it->second;
-        const Shape subshape =
-            ShapeUtil::GetSubshape(hlo->shape(), output_index);
-        if (output_index.empty()) {
-          output = fusion_root;
-        } else {
-          ShapeIndex parent_index = output_index;
-          parent_index.pop_back();
-          output = fusion_computation->AddInstruction(
-              HloInstruction::CreateGetTupleElement(
-                  subshape, converted_outputs.element(parent_index),
-                  output_index.back()));
-        }
-        if (!ShapeUtil::IsArray(subshape)) {
-          continue;
-        }
-        if (!ShapeUtil::Compatible(
-                subshape,
-                ShapeUtil::GetSubshape(fusion_root->shape(), output_index))) {
-          output = fusion_computation->AddInstruction(
-              HloInstruction::CreateConvert(subshape, output));
-        }
-      }
-      // Iterate through nodes in the shape tree in reverse pre-order and create
-      // a tuple instruction for each non-leaf node where the elements are the
-      // values of its child nodes.
-      for (auto it = converted_outputs.rbegin(); it != converted_outputs.rend();
-           ++it) {
-        ShapeIndex output_index = it->first;
-        HloInstruction*& output = it->second;
-        const Shape& subshape =
-            ShapeUtil::GetSubshape(hlo->shape(), output_index);
-        if (!ShapeUtil::IsTuple(subshape)) {
-          continue;
-        }
-        std::vector<HloInstruction*> elements(
-            ShapeUtil::TupleElementCount(subshape));
-        ShapeIndex child_index = output_index;
-        for (int64 i = 0; i < elements.size(); ++i) {
-          child_index.push_back(i);
-          elements[i] = converted_outputs.element(child_index);
-          child_index.pop_back();
-        }
-        output = fusion_computation->AddInstruction(
-            HloInstruction::CreateTuple(elements));
-      }
-      fusion_computation->set_root_instruction(converted_outputs.element({}));
-      needs_tuple_simplifier |= ShapeUtil::IsTuple(hlo->shape());
+      // Deep copy the fusion root, and convert a leaf node only if its shape
+      // does not match the fusion output.
+      TF_ASSIGN_OR_RETURN(
+          HloInstruction * copy,
+          fusion_computation->DeepCopyInstructionWithCustomCopier(
+              fusion_root,
+              [hlo](HloInstruction* leaf, const ShapeIndex& leaf_index,
+                    HloComputation* comp) {
+                const Shape& hlo_subshape =
+                    ShapeUtil::GetSubshape(hlo->shape(), leaf_index);
+                if (ShapeUtil::Compatible(leaf->shape(), hlo_subshape)) {
+                  return leaf;
+                }
+                return comp->AddInstruction(
+                    HloInstruction::CreateConvert(hlo_subshape, leaf));
+              }));
+      fusion_computation->set_root_instruction(copy);
     }
   }
-  if (needs_tuple_simplifier) {
-    TupleSimplifier tuple_simplifier;
-    TF_RETURN_IF_ERROR(tuple_simplifier.Run(module).status());
-  }
   return Status::OK();
 }
@@ -758,10 +715,38 @@ StatusOr<bool> BFloat16Propagation::Run(HloModule* module) {
   changes_to_bf16_.clear();
   changed_ = false;
 
+  auto computations_topological_order = module->MakeComputationPostOrder();
+
+  // Before running the propagation pass, we insert copies (kConvert to the same
+  // type) of F32 inputs to while loops. This prevents other uses of the same
+  // input from aliasing the while loop input/output, so that there's greater
+  // chance to use BF16 inside the loop. If some of these added copies do not
+  // help, they will remain F32 after BF16 propagation and will be removed since
+  // they are no-ops.
+  for (auto computation : computations_topological_order) {
+    for (auto inst : computation->MakeInstructionPostOrder()) {
+      if (inst->opcode() != HloOpcode::kWhile) {
+        continue;
+      }
+
+      auto operand = inst->mutable_operand(0);
+      TF_ASSIGN_OR_RETURN(
+          HloInstruction * copy,
+          computation->DeepCopyInstructionWithCustomCopier(
+              operand, [](HloInstruction* leaf, const ShapeIndex& leaf_index,
+                          HloComputation* comp) {
+                if (leaf->shape().element_type() != F32) {
+                  return leaf;
+                }
+                return comp->AddInstruction(
+                    HloInstruction::CreateConvert(leaf->shape(), leaf));
+              }));
+      TF_RETURN_IF_ERROR(operand->ReplaceUseWith(inst, copy));
+    }
+  }
+
   TF_ASSIGN_OR_RETURN(dataflow_, HloDataflowAnalysis::Run(*module));
 
-  const auto& computations_topological_order =
-      module->MakeComputationPostOrder();
   // The first step is a forward pass (parameters to root), where we determine
   // the potential candidate instructions to use bfloat16 in the outputs that
   // are not likely to cause overhead from extra explicit conversions. This is
@@ -810,23 +795,27 @@ StatusOr<bool> BFloat16Propagation::Run(HloModule* module) {
     }
   }
 
+  // Removes redundant HLOs added by this pass, either when inserting
+  // de-aliasing copies to while loop inputs, or later when converting output
+  // types.
+  auto clean_up = [this, module]() {
+    TF_RETURN_IF_ERROR(SkipNoopConversions(module));
+    TupleSimplifier tuple_simplifier;
+    TF_RETURN_IF_ERROR(tuple_simplifier.Run(module).status());
+    HloDCE dce;
+    TF_RETURN_IF_ERROR(dce.Run(module).status());
+    return Status::OK();
+  };
+
   if (!changed_) {
+    TF_RETURN_IF_ERROR(clean_up());
     return false;
   }
 
   TF_RETURN_IF_ERROR(ResolveInconsistentFusions(module));
   TF_RETURN_IF_ERROR(ResolveConvertedConstants(module));
 
-  // This pass could have turned an F32 -> BF16 conversion to a no-op (BF16 ->
-  // BF16), so we skip them now.
-  TF_RETURN_IF_ERROR(SkipNoopConversions(module));
-
-  {
-    // We may have dead HLOs after ResolveInconsistentFusions,
-    // ResolveConvertedConstants and SkipNoopConversions.
-    HloDCE dce;
-    TF_RETURN_IF_ERROR(dce.Run(module).status());
-  }
+  TF_RETURN_IF_ERROR(clean_up());
 
   return true;
 }
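Note: clean_up calls SkipNoopConversions, whose body is not part of this diff. The comment above explains why it is needed: a de-aliasing copy that stays F32 ends up as an F32->F32 kConvert, i.e. a no-op. A rough sketch of the idea — a hypothetical free-standing helper, not the pass's actual member function, which presumably also restricts itself to conversions this pass inserted:

Status SkipNoopConversionsSketch(HloModule* module) {
  for (auto computation : module->MakeComputationPostOrder()) {
    for (auto inst : computation->MakeInstructionPostOrder()) {
      if (inst->opcode() != HloOpcode::kConvert) {
        continue;
      }
      auto operand = inst->mutable_operand(0);
      // The conversion is a no-op if the source already has the target shape,
      // including the element type.
      if (!ShapeUtil::Compatible(inst->shape(), operand->shape())) {
        continue;
      }
      // Forward users to the operand; the dead convert is left for HloDCE.
      TF_RETURN_IF_ERROR(inst->ReplaceAllUsesWith(operand));
    }
  }
  return Status::OK();
}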
diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
index 23aa83ea88..aeafb25ad7 100644
--- a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
@@ -240,12 +240,10 @@ TEST_F(BFloat16PropagationTest, SameValueReferencedTwice) {
   EXPECT_TRUE(PropagatePrecision(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), dot);
-  EXPECT_TRUE(OutputsBF16(add0));
   EXPECT_TRUE(OutputsBF16(add1));
   EXPECT_TRUE(OutputsBF16(lhs));
-  // rhs is a get-tuple-element, which does not define a buffer, but its shape
-  // should also be adjusted accordingly.
-  EXPECT_TRUE(OutputsBF16(rhs));
+
+  // add0 and rhs have been eliminated by simplification and DCE.
 }
 
 // Tests that a non-fusion computation's root should not be changed.
@@ -734,10 +732,8 @@ TEST_F(BFloat16PropagationTest, NoopConversionRemoved) {
   EXPECT_TRUE(PropagatePrecision(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), add2);
-  EXPECT_EQ(add2->operand(0), gte0);
-  EXPECT_EQ(add2->operand(1), gte1);
-  EXPECT_EQ(gte0->shape().element_type(), BF16);
-  EXPECT_EQ(gte1->shape().element_type(), BF16);
+  EXPECT_EQ(add2->operand(0), add0);
+  EXPECT_EQ(add2->operand(1), add1);
   EXPECT_EQ(add0->shape().element_type(), BF16);
   EXPECT_EQ(add1->shape().element_type(), BF16);
 }
diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc
index d4b13e0599..166a83fade 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation.cc
@@ -528,8 +528,10 @@ HloInstruction* HloComputation::CreateFusionInstruction(
 }
 
 StatusOr<HloInstruction*> HloComputation::DeepCopyHelper(
-    HloInstruction* instruction, const ShapeTree<bool>* indices_to_copy,
-    ShapeTree<HloInstruction*>* copies_added, ShapeIndex* index) {
+    HloInstruction* instruction, ShapeIndex* index,
+    const std::function<
+        HloInstruction*(HloInstruction* leaf, const ShapeIndex& leaf_index,
+                        HloComputation* computation)>& copy_leaf) {
   if (ShapeUtil::IsTuple(instruction->shape())) {
     std::vector<HloInstruction*> elements;
     for (int64 i = 0; i < ShapeUtil::TupleElementCount(instruction->shape());
@@ -540,9 +542,8 @@ StatusOr<HloInstruction*> HloComputation::DeepCopyHelper(
                              instruction, i));
 
       index->push_back(i);
-      TF_ASSIGN_OR_RETURN(
-          HloInstruction * element,
-          DeepCopyHelper(gte, indices_to_copy, copies_added, index));
+      TF_ASSIGN_OR_RETURN(HloInstruction * element,
+                          DeepCopyHelper(gte, index, copy_leaf));
       elements.push_back(element);
       index->pop_back();
     }
@@ -556,19 +557,7 @@ StatusOr<HloInstruction*> HloComputation::DeepCopyHelper(
 
   // Array shape.
   TF_RET_CHECK(ShapeUtil::IsArray(instruction->shape()));
-  if (indices_to_copy == nullptr || indices_to_copy->element(*index)) {
-    // Use kCopy to copy array elements
-    HloInstruction* copy = AddInstruction(HloInstruction::CreateUnary(
-        instruction->shape(), HloOpcode::kCopy, instruction));
-    if (copies_added != nullptr) {
-      *copies_added->mutable_element(*index) = copy;
-    }
-    return copy;
-  } else {
-    // Elements which are not to be copied are passed through
-    // transparently.
-    return instruction;
-  }
+  return copy_leaf(instruction, *index, this);
 }
 
 StatusOr<HloInstruction*> HloComputation::DeepCopyInstruction(
@@ -590,7 +579,36 @@ StatusOr<HloInstruction*> HloComputation::DeepCopyInstruction(
   }
 
   ShapeIndex index;
-  return DeepCopyHelper(instruction, indices_to_copy, copies_added, &index);
+  auto copy_leaf = [indices_to_copy, copies_added](
+                       HloInstruction* leaf, const ShapeIndex& leaf_index,
+                       HloComputation* computation) {
+    if (indices_to_copy == nullptr || indices_to_copy->element(leaf_index)) {
+      HloInstruction* copy = computation->AddInstruction(
+          HloInstruction::CreateUnary(leaf->shape(), HloOpcode::kCopy, leaf));
+      if (copies_added != nullptr) {
+        *copies_added->mutable_element(leaf_index) = copy;
+      }
+      return copy;
+    }
+    // Elements which are not to be copied are passed through
+    // transparently.
+    return leaf;
+  };
+  return DeepCopyHelper(instruction, &index, copy_leaf);
+}
+
+StatusOr<HloInstruction*> HloComputation::DeepCopyInstructionWithCustomCopier(
+    HloInstruction* instruction,
+    const std::function<
+        HloInstruction*(HloInstruction* leaf, const ShapeIndex& leaf_index,
+                        HloComputation* computation)>& copy_leaf) {
+  if (instruction->parent() != this) {
+    return FailedPrecondition(
+        "Can't deep copy instruction %s: instruction is not in computation %s",
+        instruction->name().c_str(), name().c_str());
+  }
+  ShapeIndex index;
+  return DeepCopyHelper(instruction, &index, copy_leaf);
 }
 
 ProgramShape HloComputation::ComputeProgramShape() const {
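Note: with the leaf hook factored out, the original DeepCopyInstruction behavior above is just one particular copy_leaf. A minimal usage sketch — comp and inst are hypothetical placeholders for an HloComputation* and an instruction inside it — contrasting the default kCopy copier with a pure pass-through that only rebuilds the get-tuple-element/tuple structure:

// Default deep copy: a kCopy at every array leaf.
TF_ASSIGN_OR_RETURN(HloInstruction * full_copy,
                    comp->DeepCopyInstruction(inst));

// Custom copier: rebuild the tuple structure but pass every leaf through
// unchanged; no kCopy instructions are created.
TF_ASSIGN_OR_RETURN(
    HloInstruction * structural_copy,
    comp->DeepCopyInstructionWithCustomCopier(
        inst, [](HloInstruction* leaf, const ShapeIndex& leaf_index,
                 HloComputation* computation) { return leaf; }));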
diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h
index c1c3e79ebc..abc1da4da3 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.h
+++ b/tensorflow/compiler/xla/service/hlo_computation.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_COMPUTATION_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_COMPUTATION_H_
 
+#include <functional>
 #include <list>
 #include <memory>
 #include <string>
@@ -254,6 +255,14 @@ class HloComputation {
       const ShapeTree<bool>* indices_to_copy = nullptr,
       ShapeTree<HloInstruction*>* copies_added = nullptr);
 
+  // As above, but uses a custom function to copy the leaf nodes, which could
+  // create alternative HLOs other than kCopy, or even pass-throughs.
+  StatusOr<HloInstruction*> DeepCopyInstructionWithCustomCopier(
+      HloInstruction* instruction,
+      const std::function<
+          HloInstruction*(HloInstruction* leaf, const ShapeIndex& leaf_index,
+                          HloComputation* computation)>& copy_leaf);
+
   // Computes and returns the ProgramShape of this computation (shape of
   // parameters and result with layout).
   ProgramShape ComputeProgramShape() const;
@@ -378,8 +387,10 @@ class HloComputation {
   // Internal helper for recursive copying of an instruction. Creates and
   // returns a deep copy of the given instruction.
   StatusOr<HloInstruction*> DeepCopyHelper(
-      HloInstruction* instruction, const ShapeTree<bool>* indices_to_copy,
-      ShapeTree<HloInstruction*>* copies_added, ShapeIndex* index);
+      HloInstruction* instruction, ShapeIndex* index,
+      const std::function<
+          HloInstruction*(HloInstruction* leaf, const ShapeIndex& leaf_index,
+                          HloComputation* computation)>& copy_leaf);
 
   // Internal helper to collect unreachable roots.
   std::vector<HloInstruction*> CollectUnreachableRoots() const;
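Note: the header change above is the entire public surface of the new hook. For readers new to the shape-tree recursion, here is a self-contained toy model of the same design — a single recursive traversal that rebuilds tuple nodes structurally and delegates every leaf to a caller-supplied callback. All names (Node, NodePtr, DeepCopy, LeafFn) are illustrative stand-ins, not XLA types:

#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

// Toy stand-in for an HLO value: a node is either an array leaf or a tuple
// of children. (Toy simplification: a node with no children is a leaf.)
struct Node {
  std::string name;
  std::vector<std::shared_ptr<Node>> children;
};

using NodePtr = std::shared_ptr<Node>;
using LeafFn =
    std::function<NodePtr(const NodePtr& leaf, const std::vector<int>& index)>;

// Mirrors DeepCopyHelper: recurse through tuple nodes, rebuild each one, and
// delegate every leaf to the caller-supplied copier.
NodePtr DeepCopy(const NodePtr& node, std::vector<int>* index,
                 const LeafFn& copy_leaf) {
  if (node->children.empty()) {
    return copy_leaf(node, *index);
  }
  auto copy = std::make_shared<Node>();
  copy->name = node->name + ".copy";
  for (int i = 0; i < static_cast<int>(node->children.size()); ++i) {
    index->push_back(i);
    copy->children.push_back(DeepCopy(node->children[i], index, copy_leaf));
    index->pop_back();
  }
  return copy;
}

int main() {
  auto a = std::make_shared<Node>(Node{"a", {}});
  auto b = std::make_shared<Node>(Node{"b", {}});
  auto root = std::make_shared<Node>(Node{"t", {a, b}});

  // "Convert" only the leaf at index {1} and pass other leaves through,
  // like the custom copiers in the pass above.
  std::vector<int> index;
  NodePtr copy = DeepCopy(
      root, &index,
      [](const NodePtr& leaf, const std::vector<int>& idx) -> NodePtr {
        if (idx == std::vector<int>{1}) {
          return std::make_shared<Node>(Node{leaf->name + ".converted", {}});
        }
        return leaf;
      });

  std::cout << copy->name << ": " << copy->children[0]->name << ", "
            << copy->children[1]->name << "\n";
  // Prints: t.copy: a, b.converted
}

Keeping one traversal and moving all policy into the callback is what lets DeepCopyInstruction be re-expressed as a lambda over the shared helper, while passes such as BFloat16Propagation substitute converts or pass-throughs for kCopy.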