about summary refs log tree commit diff homepage
diff options
context:
space:
mode:
-rw-r--r-- tensorflow/compiler/xla/service/bfloat16_propagation.cc 133
-rw-r--r-- tensorflow/compiler/xla/service/bfloat16_propagation_test.cc 12
-rw-r--r-- tensorflow/compiler/xla/service/hlo_computation.cc 56
-rw-r--r-- tensorflow/compiler/xla/service/hlo_computation.h 15
4 files changed, 115 insertions, 101 deletions
diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation.cc b/tensorflow/compiler/xla/service/bfloat16_propagation.cc
index b6f3c84c7e..b21c83a07f 100644
--- a/tensorflow/compiler/xla/service/bfloat16_propagation.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_propagation.cc
@@ -615,7 +615,6 @@ Status BFloat16Propagation::ResolveInconsistentFusions(HloModule* module) {
// (1) a is F32 but tuple is BF16
// (2) after adding conversion
// (3) after tuple simplifier and DCE.
- bool needs_tuple_simplifier = false;
for (auto computation : module->MakeComputationPostOrder()) {
auto insts = computation->MakeInstructionPostOrder();
for (auto inst_it = insts.rbegin(); inst_it != insts.rend(); ++inst_it) {
@@ -629,67 +628,25 @@ Status BFloat16Propagation::ResolveInconsistentFusions(HloModule* module) {
continue;
}
ShapeTree<HloInstruction*> converted_outputs(hlo->shape());
- // Iterate through nodes in the shape tree in pre-order and initialize
- // each non-root node with a corresponding get-tuple-element. For a leaf
- // node, if its shape does not match the fusion output, create a
- // conversion node to overwrite the node value.
- for (auto it = converted_outputs.begin(); it != converted_outputs.end();
- ++it) {
- ShapeIndex output_index = it->first;
- HloInstruction*& output = it->second;
- const Shape subshape =
- ShapeUtil::GetSubshape(hlo->shape(), output_index);
- if (output_index.empty()) {
- output = fusion_root;
- } else {
- ShapeIndex parent_index = output_index;
- parent_index.pop_back();
- output = fusion_computation->AddInstruction(
- HloInstruction::CreateGetTupleElement(
- subshape, converted_outputs.element(parent_index),
- output_index.back()));
- }
- if (!ShapeUtil::IsArray(subshape)) {
- continue;
- }
- if (!ShapeUtil::Compatible(
- subshape,
- ShapeUtil::GetSubshape(fusion_root->shape(), output_index))) {
- output = fusion_computation->AddInstruction(
- HloInstruction::CreateConvert(subshape, output));
- }
- }
- // Iterate through nodes in the shape tree in reverse pre-order and create
- // a tuple instruction for each non-leaf node where the elements are the
- // values of its child nodes.
- for (auto it = converted_outputs.rbegin(); it != converted_outputs.rend();
- ++it) {
- ShapeIndex output_index = it->first;
- HloInstruction*& output = it->second;
- const Shape& subshape =
- ShapeUtil::GetSubshape(hlo->shape(), output_index);
- if (!ShapeUtil::IsTuple(subshape)) {
- continue;
- }
- std::vector<HloInstruction*> elements(
- ShapeUtil::TupleElementCount(subshape));
- ShapeIndex child_index = output_index;
- for (int64 i = 0; i < elements.size(); ++i) {
- child_index.push_back(i);
- elements[i] = converted_outputs.element(child_index);
- child_index.pop_back();
- }
- output = fusion_computation->AddInstruction(
- HloInstruction::CreateTuple(elements));
- }
- fusion_computation->set_root_instruction(converted_outputs.element({}));
- needs_tuple_simplifier |= ShapeUtil::IsTuple(hlo->shape());
+ // Deep copy the fusion root, and convert a leaf node only if its shape
+ // does not match the fusion output.
+ TF_ASSIGN_OR_RETURN(
+ HloInstruction * copy,
+ fusion_computation->DeepCopyInstructionWithCustomCopier(
+ fusion_root,
+ [hlo](HloInstruction* leaf, const ShapeIndex& leaf_index,
+ HloComputation* comp) {
+ const Shape& hlo_subshape =
+ ShapeUtil::GetSubshape(hlo->shape(), leaf_index);
+ if (ShapeUtil::Compatible(leaf->shape(), hlo_subshape)) {
+ return leaf;
+ }
+ return comp->AddInstruction(
+ HloInstruction::CreateConvert(hlo_subshape, leaf));
+ }));
+ fusion_computation->set_root_instruction(copy);
}
}
- if (needs_tuple_simplifier) {
- TupleSimplifier tuple_simplifier;
- TF_RETURN_IF_ERROR(tuple_simplifier.Run(module).status());
- }
return Status::OK();
}
@@ -758,10 +715,38 @@ StatusOr<bool> BFloat16Propagation::Run(HloModule* module) {
changes_to_bf16_.clear();
changed_ = false;
+ auto computations_topological_order = module->MakeComputationPostOrder();
+
+ // Before running the propagation pass, we insert copies (kConvert to the same
+ // type) of F32 inputs to while loops. This prevents other uses of the same
+ // input from aliasing the while loop input/output, so that there's greater
+ // chance to use BF16 inside the loop. If some of these added copies do not
+ // help, they will remain F32 after BF16 propagation and will be removed since
+ // they are no-ops.
+ for (auto computation : computations_topological_order) {
+ for (auto inst : computation->MakeInstructionPostOrder()) {
+ if (inst->opcode() != HloOpcode::kWhile) {
+ continue;
+ }
+
+ auto operand = inst->mutable_operand(0);
+ TF_ASSIGN_OR_RETURN(
+ HloInstruction * copy,
+ computation->DeepCopyInstructionWithCustomCopier(
+ operand, [](HloInstruction* leaf, const ShapeIndex& leaf_index,
+ HloComputation* comp) {
+ if (leaf->shape().element_type() != F32) {
+ return leaf;
+ }
+ return comp->AddInstruction(
+ HloInstruction::CreateConvert(leaf->shape(), leaf));
+ }));
+ TF_RETURN_IF_ERROR(operand->ReplaceUseWith(inst, copy));
+ }
+ }
+
TF_ASSIGN_OR_RETURN(dataflow_, HloDataflowAnalysis::Run(*module));
- const auto& computations_topological_order =
- module->MakeComputationPostOrder();
// The first step is a forward pass (parameters to root), where we determine
// the potential candidate instructions to use bfloat16 in the outputs that
// are not likely to cause overhead from extra explicit conversions. This is
@@ -810,23 +795,27 @@ StatusOr<bool> BFloat16Propagation::Run(HloModule* module) {
}
}
+ // Removes redundant HLOs added by this pass, either when inserting
+ // de-aliasing copies to while loop inputs, or later when converting output
+ // types.
+ auto clean_up = [this, module]() {
+ TF_RETURN_IF_ERROR(SkipNoopConversions(module));
+ TupleSimplifier tuple_simplifier;
+ TF_RETURN_IF_ERROR(tuple_simplifier.Run(module).status());
+ HloDCE dce;
+ TF_RETURN_IF_ERROR(dce.Run(module).status());
+ return Status::OK();
+ };
+
if (!changed_) {
+ TF_RETURN_IF_ERROR(clean_up());
return false;
}
TF_RETURN_IF_ERROR(ResolveInconsistentFusions(module));
TF_RETURN_IF_ERROR(ResolveConvertedConstants(module));
- // This pass could have turned an F32 -> BF16 conversion to a no-op (BF16 ->
- // BF16), so we skip them now.
- TF_RETURN_IF_ERROR(SkipNoopConversions(module));
-
- {
- // We may have dead HLOs after ResolveInconsistentFusions,
- // ResolveConvertedConstants and SkipNoopConversions.
- HloDCE dce;
- TF_RETURN_IF_ERROR(dce.Run(module).status());
- }
+ TF_RETURN_IF_ERROR(clean_up());
return true;
}
diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
index 23aa83ea88..aeafb25ad7 100644
--- a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
@@ -240,12 +240,10 @@ TEST_F(BFloat16PropagationTest, SameValueReferencedTwice) {
EXPECT_TRUE(PropagatePrecision(module.get()));
EXPECT_EQ(computation->root_instruction(), dot);
- EXPECT_TRUE(OutputsBF16(add0));
EXPECT_TRUE(OutputsBF16(add1));
EXPECT_TRUE(OutputsBF16(lhs));
- // rhs is a get-tuple-element, which does not define a buffer, but its shape
- // should also be adjusted accordingly.
- EXPECT_TRUE(OutputsBF16(rhs));
+
+ // add0 and rhs have been eliminated by simplification and DCE.
}
// Tests that a non-fusion computation's root should not be changed.
@@ -734,10 +732,8 @@ TEST_F(BFloat16PropagationTest, NoopConversionRemoved) {
EXPECT_TRUE(PropagatePrecision(module.get()));
EXPECT_EQ(computation->root_instruction(), add2);
- EXPECT_EQ(add2->operand(0), gte0);
- EXPECT_EQ(add2->operand(1), gte1);
- EXPECT_EQ(gte0->shape().element_type(), BF16);
- EXPECT_EQ(gte1->shape().element_type(), BF16);
+ EXPECT_EQ(add2->operand(0), add0);
+ EXPECT_EQ(add2->operand(1), add1);
EXPECT_EQ(add0->shape().element_type(), BF16);
EXPECT_EQ(add1->shape().element_type(), BF16);
}
diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc
index d4b13e0599..166a83fade 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation.cc
@@ -528,8 +528,10 @@ HloInstruction* HloComputation::CreateFusionInstruction(
}
StatusOr<HloInstruction*> HloComputation::DeepCopyHelper(
- HloInstruction* instruction, const ShapeTree<bool>* indices_to_copy,
- ShapeTree<HloInstruction*>* copies_added, ShapeIndex* index) {
+ HloInstruction* instruction, ShapeIndex* index,
+ const std::function<
+ HloInstruction*(HloInstruction* leaf, const ShapeIndex& leaf_index,
+ HloComputation* computation)>& copy_leaf) {
if (ShapeUtil::IsTuple(instruction->shape())) {
std::vector<HloInstruction*> elements;
for (int64 i = 0; i < ShapeUtil::TupleElementCount(instruction->shape());
@@ -540,9 +542,8 @@ StatusOr<HloInstruction*> HloComputation::DeepCopyHelper(
instruction, i));
index->push_back(i);
- TF_ASSIGN_OR_RETURN(
- HloInstruction * element,
- DeepCopyHelper(gte, indices_to_copy, copies_added, index));
+ TF_ASSIGN_OR_RETURN(HloInstruction * element,
+ DeepCopyHelper(gte, index, copy_leaf));
elements.push_back(element);
index->pop_back();
}
@@ -556,19 +557,7 @@ StatusOr<HloInstruction*> HloComputation::DeepCopyHelper(
// Array shape.
TF_RET_CHECK(ShapeUtil::IsArray(instruction->shape()));
- if (indices_to_copy == nullptr || indices_to_copy->element(*index)) {
- // Use kCopy to copy array elements
- HloInstruction* copy = AddInstruction(HloInstruction::CreateUnary(
- instruction->shape(), HloOpcode::kCopy, instruction));
- if (copies_added != nullptr) {
- *copies_added->mutable_element(*index) = copy;
- }
- return copy;
- } else {
- // Elements which are not to be copied are passed through
- // transparently.
- return instruction;
- }
+ return copy_leaf(instruction, *index, this);
}
StatusOr<HloInstruction*> HloComputation::DeepCopyInstruction(
@@ -590,7 +579,36 @@ StatusOr<HloInstruction*> HloComputation::DeepCopyInstruction(
}
ShapeIndex index;
- return DeepCopyHelper(instruction, indices_to_copy, copies_added, &index);
+ auto copy_leaf = [indices_to_copy, copies_added](
+ HloInstruction* leaf, const ShapeIndex& leaf_index,
+ HloComputation* computation) {
+ if (indices_to_copy == nullptr || indices_to_copy->element(leaf_index)) {
+ HloInstruction* copy = computation->AddInstruction(
+ HloInstruction::CreateUnary(leaf->shape(), HloOpcode::kCopy, leaf));
+ if (copies_added != nullptr) {
+ *copies_added->mutable_element(leaf_index) = copy;
+ }
+ return copy;
+ }
+ // Elements which are not to be copied are passed through
+ // transparently.
+ return leaf;
+ };
+ return DeepCopyHelper(instruction, &index, copy_leaf);
+}
+
+StatusOr<HloInstruction*> HloComputation::DeepCopyInstructionWithCustomCopier(
+ HloInstruction* instruction,
+ const std::function<
+ HloInstruction*(HloInstruction* leaf, const ShapeIndex& leaf_index,
+ HloComputation* computation)>& copy_leaf) {
+ if (instruction->parent() != this) {
+ return FailedPrecondition(
+ "Can't deep copy instruction %s: instruction is not in computation %s",
+ instruction->name().c_str(), name().c_str());
+ }
+ ShapeIndex index;
+ return DeepCopyHelper(instruction, &index, copy_leaf);
}
ProgramShape HloComputation::ComputeProgramShape() const {
diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h
index c1c3e79ebc..abc1da4da3 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.h
+++ b/tensorflow/compiler/xla/service/hlo_computation.h
@@ -16,6 +16,7 @@ limitations under the License.
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_COMPUTATION_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_COMPUTATION_H_
+#include <functional>
#include <list>
#include <memory>
#include <string>
@@ -254,6 +255,14 @@ class HloComputation {
const ShapeTree<bool>* indices_to_copy = nullptr,
ShapeTree<HloInstruction*>* copies_added = nullptr);
+ // As above, but uses a custom function to copy the leaf nodes, which could
+ // create alternative HLOs other than kCopy, or even pass-throughs.
+ StatusOr<HloInstruction*> DeepCopyInstructionWithCustomCopier(
+ HloInstruction* instruction,
+ const std::function<
+ HloInstruction*(HloInstruction* leaf, const ShapeIndex& leaf_index,
+ HloComputation* computation)>& copy_leaf);
+
// Computes and returns the ProgramShape of this computation (shape of
// parameters and result with layout).
ProgramShape ComputeProgramShape() const;
@@ -378,8 +387,10 @@ class HloComputation {
// Internal helper for recursive copying of an instruction. Creates and
// returns a deep copy of the given instruction.
StatusOr<HloInstruction*> DeepCopyHelper(
- HloInstruction* instruction, const ShapeTree<bool>* indices_to_copy,
- ShapeTree<HloInstruction*>* copies_added, ShapeIndex* index);
+ HloInstruction* instruction, ShapeIndex* index,
+ const std::function<
+ HloInstruction*(HloInstruction* leaf, const ShapeIndex& leaf_index,
+ HloComputation* computation)>& copy_leaf);
// Internal helper to collect unreachable roots.
std::vector<HloInstruction*> CollectUnreachableRoots() const;