author    Thomas Joerg <tjoerg@google.com>    2018-06-08 09:52:21 -0700
committer TensorFlower Gardener <gardener@tensorflow.org>    2018-06-08 09:54:23 -0700
commit    8566ebe58ff5b08864ddef6fe743fdd80962465b (patch)
tree      1dca46a24894e372f507f195e75f21d497fc3333
parent    1faacc23e3341645ce11a9720775cb27c0694f4d (diff)
[XLA:GPU] Add a multi-output fusion pass to fuse sibling reduce instructions.
Stop creating pre-fused nodes in BatchNormExpander.

PiperOrigin-RevId: 199807585
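In sketch form, the new pass merges two sibling reduce instructions that read the same operand into one multi-output fusion with a tuple root, so the shared operand is loaded from memory only once. The before/after below is illustrative only, written as C++ raw strings in the style of the HLO used by the tests in this change; the instruction names are made up.

// Illustrative before/after of sibling reduce fusion. Hypothetical names;
// shapes follow the tests in this change.
const char kBeforeSketch[] = R"(
  reduce.1 = f32[512]{0} reduce(mul, c), dimensions={0,2,3}, to_apply=scalar_add_computation
  reduce.2 = f32[512]{0} reduce(p0, c), dimensions={0,2,3}, to_apply=scalar_add_computation
)";
// After the pass: one kInput fusion computes both reduces over a single read
// of p0 and exposes the results through a tuple root.
const char kAfterSketch[] = R"(
  fusion = (f32[512]{0}, f32[512]{0}) fusion(p0, c), kind=kInput, calls=fused_computation
  gte.0 = f32[512]{0} get-tuple-element(fusion), index=0
  gte.1 = f32[512]{0} get-tuple-element(fusion), index=1
)";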
-rw-r--r-- tensorflow/compiler/xla/service/gpu/BUILD                        29
-rw-r--r-- tensorflow/compiler/xla/service/gpu/gpu_compiler.cc               6
-rw-r--r-- tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc      118
-rw-r--r-- tensorflow/compiler/xla/service/gpu/multi_output_fusion.h        55
-rw-r--r-- tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc 138
5 files changed, 343 insertions, 3 deletions
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 5e5ca7c72c..5e02631a58 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -424,6 +424,34 @@ tf_cc_test(
)
cc_library(
+ name = "multi_output_fusion",
+ srcs = ["multi_output_fusion.cc"],
+ hdrs = ["multi_output_fusion.h"],
+ deps = [
+ "//tensorflow/compiler/xla:shape_util",
+ "//tensorflow/compiler/xla/service:hlo",
+ "//tensorflow/compiler/xla/service:multi_output_fusion",
+ "//tensorflow/core:lib",
+ ],
+)
+
+tf_cc_test(
+ name = "multi_output_fusion_test",
+ srcs = ["multi_output_fusion_test.cc"],
+ deps = [
+ ":multi_output_fusion",
+ "//tensorflow/compiler/xla:status_macros",
+ "//tensorflow/compiler/xla:util",
+ "//tensorflow/compiler/xla/service:hlo",
+ "//tensorflow/compiler/xla/service:hlo_matchers",
+ "//tensorflow/compiler/xla/service:hlo_parser",
+ "//tensorflow/compiler/xla/tests:hlo_test_base",
+ "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+ "//tensorflow/core:lib",
+ ],
+)
+
+cc_library(
name = "gpu_copy_insertion",
srcs = ["gpu_copy_insertion.cc"],
hdrs = ["gpu_copy_insertion.h"],
@@ -523,6 +551,7 @@ cc_library(
":instruction_fusion",
":ir_emission_utils",
":ir_emitter",
+ ":multi_output_fusion",
":pad_insertion",
":partition_assignment",
":stream_assignment",
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index b857219807..c995736af9 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -52,6 +52,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/gpu/ir_emitter_context.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h"
#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h"
+#include "tensorflow/compiler/xla/service/gpu/multi_output_fusion.h"
#include "tensorflow/compiler/xla/service/gpu/pad_insertion.h"
#include "tensorflow/compiler/xla/service/gpu/partition_assignment.h"
#include "tensorflow/compiler/xla/service/gpu/stream_assignment.h"
@@ -159,13 +160,11 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
if (hlo_module->config().debug_options().xla_gpu_use_cudnn_batchnorm()) {
pass.AddPass<CudnnBatchNormRewriter>();
}
- // TODO(kramerb): Remove use_fusion once instruction fusion can create
- // multi-output fusions from the unfused expander output.
pass.AddPass<BatchNormExpander>(
/*rewrite_training_op=*/true,
/*rewrite_inference_op=*/true,
/*rewrite_grad_op=*/true,
- /*use_fusion=*/true);
+ /*use_fusion=*/false);
// Rewrite gather ops into smaller ones.
pass.AddPass<GatherExpander>();
@@ -261,6 +260,7 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
fusion.AddPass<GpuInstructionFusion>(/*may_duplicate=*/false);
fusion.AddPass<GpuInstructionFusion>(/*may_duplicate=*/true);
fusion.AddPass<FusionMerger>();
+ fusion.AddPass<GpuMultiOutputFusion>();
TF_RETURN_IF_ERROR(fusion.Run(hlo_module).status());
HloPassPipeline reduce_pipeline("reduce-precision");
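The pass is appended to the end of the GPU fusion pipeline, after both GpuInstructionFusion runs and FusionMerger, so it only sees siblings that are already maximally fused. With it in place, BatchNormExpander no longer needs to emit pre-fused nodes (use_fusion is now false above); the fusion pipeline forms multi-output fusions from the unfused expander output instead. The pass can also be exercised in isolation, as the unit tests added below do; a minimal sketch, with a hypothetical helper name:

// Minimal sketch of running the pass standalone, mirroring the test usage
// below. RunSiblingReduceFusion is a hypothetical helper, not part of this
// change.
#include "tensorflow/compiler/xla/service/gpu/multi_output_fusion.h"

namespace xla {
namespace gpu {

// Returns true iff the pass changed the module, i.e. at least one pair of
// sibling reduces was merged into a multi-output fusion.
bool RunSiblingReduceFusion(HloModule* module) {
  return GpuMultiOutputFusion().Run(module).ValueOrDie();
}

}  // namespace gpu
}  // namespace xla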
diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
new file mode 100644
index 0000000000..86c5c4fb6f
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
@@ -0,0 +1,118 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/multi_output_fusion.h"
+
+#include <stdint.h>
+#include <algorithm>
+#include <iterator>
+#include <list>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace gpu {
+
+GpuMultiOutputFusion::GpuMultiOutputFusion() : MultiOutputFusion(INT64_MAX) {}
+
+bool GpuMultiOutputFusion::ShapesCompatibleForFusion(HloInstruction* instr1,
+ HloInstruction* instr2) {
+ auto get_element_shape = [&](HloInstruction* instr) {
+ const HloInstruction* element_instr = instr;
+ if (instr->opcode() == HloOpcode::kFusion) {
+ auto fused_expression_root = instr->fused_expression_root();
+ if (instr->IsMultiOutputFusion()) {
+ // The shapes in all tuple operands should agree. Just pick the first
+ // one.
+ element_instr = fused_expression_root->operands()[0];
+ } else {
+ element_instr = fused_expression_root;
+ }
+ }
+ return element_instr->shape();
+ };
+
+ // The elementwise output shapes must be the same (including layout).
+ return ShapeUtil::Equal(get_element_shape(instr1),
+ get_element_shape(instr2));
+}
+
+bool GpuMultiOutputFusion::IsProfitableOperand(HloInstruction* instr) {
+ // Effective-scalar kConstant instructions involve no memory reads, so they
+ // are not a source of profit. Skip them.
+ if (instr->opcode() == HloOpcode::kConstant &&
+ ShapeUtil::IsEffectiveScalar(instr->shape())) {
+ return false;
+ }
+ // We do not aim to fuse producer/consumer instructions -- that should be
+ // taken care of by the instruction_fusion pass. If instr has only one
+ // user, it has no sibling instructions, so we do not consider it.
+ if (instr->user_count() < 2) {
+ return false;
+ }
+ return true;
+}
+
+namespace {
+bool IsReduction(HloInstruction* instr) {
+ if (instr->IsMultiOutputFusion()) {
+ for (const HloInstruction* operand :
+ instr->fused_expression_root()->operands()) {
+ if (operand->opcode() == HloOpcode::kReduce) {
+ return true;
+ }
+ }
+ return false;
+ } else if (instr->opcode() == HloOpcode::kFusion) {
+ return instr->fused_expression_root()->opcode() == HloOpcode::kReduce;
+ } else {
+ return instr->opcode() == HloOpcode::kReduce;
+ }
+}
+} // namespace
+
+bool GpuMultiOutputFusion::IsFusible(HloInstruction* instr) {
+ return IsReduction(instr);
+}
+
+int64 GpuMultiOutputFusion::GetProfit(HloInstruction* instr1,
+ HloInstruction* instr2) {
+ tensorflow::gtl::FlatSet<HloInstruction*> in_list;
+ for (auto instr : instr1->operands()) {
+ if (!IsProfitableOperand(instr)) {
+ continue;
+ }
+ in_list.insert(instr);
+ }
+ int64 profit = 0;
+ for (auto instr : instr2->operands()) {
+ if (!IsProfitableOperand(instr) || in_list.count(instr) == 0) {
+ continue;
+ }
+ profit += ShapeUtil::ByteSizeOf(instr->shape());
+ }
+ VLOG(2) << "Fusing instr1=" << instr1->name() << " instr2=" << instr2->name()
+ << ", profit = " << profit;
+ return profit;
+}
+
+} // namespace gpu
+} // namespace xla
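GetProfit above sums the byte sizes of the operands that the two candidates have in common. For the shapes used in the tests below, a single shared f32[128,512,28,28] operand gives the following saving (standalone arithmetic, not part of the pass):

// Worked example for GetProfit: one shared f32[128,512,28,28] operand.
// For an f32 array, ShapeUtil::ByteSizeOf is element count times sizeof(float).
#include <cstdint>
#include <iostream>

int main() {
  const int64_t elements = int64_t{128} * 512 * 28 * 28;  // 51,380,224
  const int64_t profit_bytes = elements * sizeof(float);  // 205,520,896
  // Merging the siblings saves one full read of the shared operand: 196 MiB.
  std::cout << "profit = " << profit_bytes << " bytes\n";
  return 0;
}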
diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.h b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.h
new file mode 100644
index 0000000000..5451a93cec
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.h
@@ -0,0 +1,55 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_MULTI_OUTPUT_FUSION_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_MULTI_OUTPUT_FUSION_H_
+
+#include "tensorflow/compiler/xla/service/multi_output_fusion.h"
+
+namespace xla {
+namespace gpu {
+
+// Multi-output fusion of sibling and producer-consumer instructions for the
+// Jellyfish backend.
+class GpuMultiOutputFusion : public MultiOutputFusion {
+ public:
+ GpuMultiOutputFusion();
+
+ protected:
+ // Tests whether instr1 and instr2 have compatible shapes that can be
+ // legally fused.
+ bool ShapesCompatibleForFusion(HloInstruction* instr1,
+ HloInstruction* instr2) override;
+
+ // We currently only consider reduce instructions and fusions with a reduce
+ // root as candidates.
+ bool IsFusible(HloInstruction* instr) override;
+
+ // Estimates the amount of memory reads saved by merging instr1 and instr2
+ // into one multi-output fusion instruction. For a fusion instruction, all
+ // operands need to be loaded from memory. If we merge instr1 and instr2,
+ // common operands are not loaded twice. The profit is estimated as the
+ // total size of the common operands of instr1 and instr2.
+ int64 GetProfit(HloInstruction* instr1, HloInstruction* instr2) override;
+
+ // Whether fusing the instruction can reduce memory reads.
+ //
+ // TODO(tjoerg): Move this method up into the MultiOutputFusion base class.
+ bool IsProfitableOperand(HloInstruction* instr) override;
+};
+
+} // namespace gpu
+} // namespace xla
+
+#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_MULTI_OUTPUT_FUSION_H_
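ShapesCompatibleForFusion requires the candidates' element shapes to be equal, layout included. A small standalone sketch of that predicate using ShapeUtil directly; the shapes are hypothetical, with f32[512]{0} matching what the test reduces below produce:

// Sketch of the compatibility check: shape equality including layout.
#include "tensorflow/compiler/xla/shape_util.h"

namespace xla {

bool CompatibilityDemo() {
  // Two f32[512] shapes with identical {0} layouts are compatible.
  Shape a = ShapeUtil::MakeShapeWithLayout(F32, {512}, {0});
  Shape b = ShapeUtil::MakeShapeWithLayout(F32, {512}, {0});
  // Same dimensions but a different element type is not.
  Shape c = ShapeUtil::MakeShapeWithLayout(S32, {512}, {0});
  return ShapeUtil::Equal(a, b) && !ShapeUtil::Equal(a, c);
}

}  // namespace xla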
diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
new file mode 100644
index 0000000000..d0b4c88487
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
@@ -0,0 +1,138 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/multi_output_fusion.h"
+
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+
+namespace op = xla::testing::opcode_matchers;
+
+namespace xla {
+namespace gpu {
+
+using InstructionFusionTest = HloTestBase;
+
+const char kModulePrefix[] = R"(
+ HloModule test_module
+
+ scalar_add_computation {
+ scalar_lhs = f32[] parameter(0)
+ scalar_rhs = f32[] parameter(1)
+ ROOT add = f32[] add(scalar_lhs, scalar_rhs)
+ })";
+
+TEST_F(InstructionFusionTest, MultiOutputFusionSiblingReduceAndReduceFusion) {
+ // A fusion with a reduce root and a sibling reduce instruction, both
+ // sharing the same input parameter.
+ auto module = ParseHloString(tensorflow::strings::StrCat(kModulePrefix, R"(
+ fused_computation {
+ p1.1 = f32[128,512,28,28]{3,2,1,0} parameter(1)
+ mul = f32[128,512,28,28]{3,2,1,0} multiply(p1.1, p1.1)
+ const.1 = f32[] parameter(0)
+ ROOT reduce.1 = f32[512]{0} reduce(mul, const.1), dimensions={0,2,3}, to_apply=scalar_add_computation
+ }
+
+ ENTRY entry {
+ p0 = f32[] parameter(0)
+ p1 = f32[128,512,28,28]{3,2,1,0} parameter(1)
+ const.2 = f32[] constant(1)
+ fusion = f32[512] fusion(p0, p1), kind=kInput, calls=fused_computation
+ reduce.2 = f32[512]{0} reduce(p1, const.2), dimensions={0,2,3}, to_apply=scalar_add_computation
+ ROOT root = (f32[512]{0}, f32[512]{0}) tuple(fusion, reduce.2)
+ })"))
+ .ValueOrDie();
+ ASSERT_TRUE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie());
+ SCOPED_TRACE(module->ToString());
+ const HloInstruction* fusion =
+ module->entry_computation()->root_instruction()->operand(0)->operand(0);
+ ASSERT_TRUE(fusion->IsMultiOutputFusion());
+ EXPECT_THAT(fusion->fused_expression_root(),
+ op::Tuple(op::Reduce(), op::Reduce()));
+}
+
+TEST_F(InstructionFusionTest, MultiOutputFusionSiblingReduceFusions) {
+ // Two sibling fusions with reduce roots, sharing the same input
+ // parameter.
+ auto module = ParseHloString(tensorflow::strings::StrCat(kModulePrefix, R"(
+ fused_computation_1 {
+ p1.1 = f32[128,512,28,28]{3,2,1,0} parameter(1)
+ mul = f32[128,512,28,28]{3,2,1,0} multiply(p1.1, p1.1)
+ const.1 = f32[] parameter(0)
+ ROOT reduce.1 = f32[512]{0} reduce(mul, const.1), dimensions={0,2,3}, to_apply=scalar_add_computation
+ }
+
+ fused_computation_2 {
+ p1.2 = f32[128,512,28,28]{3,2,1,0} parameter(1)
+ const.2 = f32[] parameter(0)
+ ROOT reduce.2 = f32[512]{0} reduce(p1.2, const.2), dimensions={0,2,3}, to_apply=scalar_add_computation
+ }
+
+ ENTRY entry {
+ p0 = f32[] parameter(0)
+ p1 = f32[128,512,28,28]{3,2,1,0} parameter(1)
+ fusion.1 = f32[512] fusion(p0, p1), kind=kInput, calls=fused_computation_1
+ fusion.2 = f32[512] fusion(p0, p1), kind=kInput, calls=fused_computation_2
+ ROOT root = (f32[512]{0}, f32[512]{0}) tuple(fusion.1, fusion.2)
+ })"))
+ .ValueOrDie();
+ ASSERT_TRUE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie());
+ SCOPED_TRACE(module->ToString());
+ const HloInstruction* fusion =
+ module->entry_computation()->root_instruction()->operand(0)->operand(0);
+ ASSERT_TRUE(fusion->IsMultiOutputFusion());
+ EXPECT_THAT(fusion->fused_expression_root(),
+ op::Tuple(op::Reduce(), op::Reduce()));
+}
+
+TEST_F(InstructionFusionTest,
+ MultiOutputFusionSiblingReduceAndReduceMultiOutputFusion) {
+ // A multi-output fusion with a root of two reduce instructions and a
+ // sibling reduce instruction, all sharing the same input parameter.
+ auto module = ParseHloString(tensorflow::strings::StrCat(kModulePrefix, R"(
+ fused_computation (p0: f32[128,512,28,28]) -> (f32[512], f32[512]) {
+ const.1 = f32[] constant(1)
+ p0.1 = f32[128,512,28,28]{3,2,1,0} parameter(0)
+ mul = f32[128,512,28,28]{3,2,1,0} multiply(f32[128,512,28,28]{3,2,1,0} p0.1, f32[128,512,28,28]{3,2,1,0} p0.1)
+ reduce.1 = f32[512]{0} reduce(f32[128,512,28,28]{3,2,1,0} mul, f32[] const.1), dimensions={0,2,3}, to_apply=scalar_add_computation
+ reduce.2 = f32[512]{0} reduce(f32[128,512,28,28]{3,2,1,0} p0.1, f32[] const.1), dimensions={0,2,3}, to_apply=scalar_add_computation
+ ROOT tuple = (f32[512]{0}, f32[512]{0}) tuple(f32[512]{0} reduce.1, f32[512]{0} reduce.2)
+ }
+
+ ENTRY entry (p0: f32[128,512,28,28]) -> (f32[512], f32[512], f32[512]) {
+ p0 = f32[128,512,28,28]{3,2,1,0} parameter(0)
+ const = f32[] constant(1)
+ fusion = (f32[512]{0}, f32[512]{0}) fusion(f32[128,512,28,28]{3,2,1,0} p0), kind=kInput, calls=fused_computation
+ get-tuple-element = f32[512]{0} get-tuple-element((f32[512]{0}, f32[512]{0}) fusion), index=0
+ get-tuple-element.1 = f32[512]{0} get-tuple-element((f32[512]{0}, f32[512]{0}) fusion), index=1
+ reduce.3 = f32[512]{0} reduce(p0, const), dimensions={0,2,3}, to_apply=scalar_add_computation
+ ROOT root = (f32[512]{0}, f32[512]{0}, f32[512]{0}) tuple(f32[512]{0} get-tuple-element, f32[512]{0} get-tuple-element.1, f32[512]{0} reduce.3)
+ })"))
+ .ValueOrDie();
+ ASSERT_TRUE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie());
+ SCOPED_TRACE(module->ToString());
+ const HloInstruction* fusion =
+ module->entry_computation()->root_instruction()->operand(0)->operand(0);
+ ASSERT_TRUE(fusion->IsMultiOutputFusion());
+ EXPECT_THAT(fusion->fused_expression_root(),
+ op::Tuple(op::Reduce(), op::Reduce(), op::Reduce()));
+}
+
+} // namespace gpu
+} // namespace xla