diff options
author | 2017-07-25 00:05:41 -0700 | |
---|---|---|
committer | 2017-07-25 00:09:39 -0700 | |
commit | d1a9ea61ef8271b3d2fe273a68ff5940fcba7ccd (patch) | |
tree | fb4527dc7de04f4a2a1a0bc101334c31d2cbef38 /tensorflow | |
parent | 73b120ea3b517b6af2267ca078bf571f966fd606 (diff) |
[XLA] Teach CPU and GPU compilers to optionally invoke the HLO insert-reduce-precision-operations pass.
This also required a few additions and fixups. We add pieces to ReducePrecisionInsertion to translate between the protocol-buffer representation of the pass options and the predicate-function actually used in the pass. To facilitate this translation, we also add a function to HloOpcode to return the number of opcodes so that we can iterate over the whole set easily.
PiperOrigin-RevId: 163037250
Diffstat (limited to 'tensorflow')
-rw-r--r-- | tensorflow/compiler/xla/service/BUILD | 1 | ||||
-rw-r--r-- | tensorflow/compiler/xla/service/cpu/BUILD | 1 | ||||
-rw-r--r-- | tensorflow/compiler/xla/service/cpu/cpu_compiler.cc | 18 | ||||
-rw-r--r-- | tensorflow/compiler/xla/service/gpu/BUILD | 1 | ||||
-rw-r--r-- | tensorflow/compiler/xla/service/gpu/gpu_compiler.cc | 31 | ||||
-rw-r--r-- | tensorflow/compiler/xla/service/hlo_opcode.h | 5 | ||||
-rw-r--r-- | tensorflow/compiler/xla/service/reduce_precision_insertion.cc | 39 | ||||
-rw-r--r-- | tensorflow/compiler/xla/service/reduce_precision_insertion.h | 22 | ||||
-rw-r--r-- | tensorflow/compiler/xla/tests/reduce_precision_test.cc | 90 | ||||
-rw-r--r-- | tensorflow/compiler/xla/xla.proto | 23 |
10 files changed, 221 insertions, 10 deletions
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 696dc28564..a4612bb6c1 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -1945,6 +1945,7 @@ cc_library( ":buffer_liveness", ":hlo", ":hlo_pass", + "//tensorflow/compiler/xla:shape_util", "//tensorflow/core:lib", ], ) diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD index 7248cb5f4c..2ca4af67cd 100644 --- a/tensorflow/compiler/xla/service/cpu/BUILD +++ b/tensorflow/compiler/xla/service/cpu/BUILD @@ -72,6 +72,7 @@ cc_library( "//tensorflow/compiler/xla/service:hlo_subcomputation_unification", "//tensorflow/compiler/xla/service:hlo_verifier", "//tensorflow/compiler/xla/service:inliner", + "//tensorflow/compiler/xla/service:reduce_precision_insertion", "//tensorflow/compiler/xla/service:reshape_mover", "//tensorflow/compiler/xla/service:transpose_folding", "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", # fixdeps: keep diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index 6d819355c4..b86342d0b3 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -74,6 +74,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_verifier.h" #include "tensorflow/compiler/xla/service/inliner.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" +#include "tensorflow/compiler/xla/service/reduce_precision_insertion.h" #include "tensorflow/compiler/xla/service/reshape_mover.h" #include "tensorflow/compiler/xla/service/transpose_folding.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -253,6 +254,14 @@ Status CpuCompiler::RunHloPasses(HloModule* module) { HloPassPipeline pipeline("CPU"); pipeline.AddInvariantChecker<HloVerifier>(); + for (const auto& reduce_precision_options : + module->config().debug_options().hlo_reduce_precision_options()) { + if (reduce_precision_options.pass_timing() == + HloReducePrecisionOptions::BEFORE_OP_FUSION) { + pipeline.AddPass<ReducePrecisionInsertion>(reduce_precision_options); + } + } + // TODO(b/35786417): Re-enable inliner pass after fixing the bug and deciding // where we will take this pass in future. // pipeline.AddPass<Inliner>(); @@ -278,6 +287,15 @@ Status CpuCompiler::RunHloPasses(HloModule* module) { TransposeFolding::NeverFoldTranspose); pipeline.AddPass<HloCSE>(/*is_layout_sensitive=*/false); pipeline.AddPass<CpuInstructionFusion>(); + + for (const auto& reduce_precision_options : + module->config().debug_options().hlo_reduce_precision_options()) { + if (reduce_precision_options.pass_timing() == + HloReducePrecisionOptions::AFTER_OP_FUSION) { + pipeline.AddPass<ReducePrecisionInsertion>(reduce_precision_options); + } + } + pipeline.AddPass<CpuLayoutAssignment>( module->mutable_entry_computation_layout()); // The LayoutAssignment pass may leave behind kCopy instructions which are diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index fa95e23499..cdd7c8187c 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -432,6 +432,7 @@ cc_library( 
"//tensorflow/compiler/xla/service:hlo_proto_util", "//tensorflow/compiler/xla/service:hlo_subcomputation_unification", "//tensorflow/compiler/xla/service:hlo_verifier", + "//tensorflow/compiler/xla/service:reduce_precision_insertion", "//tensorflow/compiler/xla/service:reshape_mover", "//tensorflow/compiler/xla/service:transpose_folding", "//tensorflow/compiler/xla/service/gpu/llvm_gpu_backend", diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc index d60c45a5c3..2acf95084a 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc @@ -56,6 +56,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_subcomputation_unification.h" #include "tensorflow/compiler/xla/service/hlo_verifier.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" +#include "tensorflow/compiler/xla/service/reduce_precision_insertion.h" #include "tensorflow/compiler/xla/service/reshape_mover.h" #include "tensorflow/compiler/xla/service/transpose_folding.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -123,6 +124,15 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module, { HloPassPipeline pipeline("optimization"); pipeline.AddInvariantChecker<HloVerifier>(); + + for (const auto& reduce_precision_options : + hlo_module->config().debug_options().hlo_reduce_precision_options()) { + if (reduce_precision_options.pass_timing() == + HloReducePrecisionOptions::BEFORE_OP_FUSION) { + pipeline.AddPass<ReducePrecisionInsertion>(reduce_precision_options); + } + } + { auto& pass = pipeline.AddPass<HloPassFix<HloPassPipeline>>("simplification"); @@ -149,8 +159,27 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module, fusion.AddPass<GpuInstructionFusion>(/*may_duplicate=*/false); fusion.AddPass<GpuInstructionFusion>(/*may_duplicate=*/true); fusion.AddPass<FusionMerger>(); - return fusion.Run(hlo_module).status(); + 
TF_RETURN_IF_ERROR(fusion.Run(hlo_module).status()); + + HloPassPipeline reduce_pipeline("reduce-precision"); + for (const auto& reduce_precision_options : + hlo_module->config().debug_options().hlo_reduce_precision_options()) { + if (reduce_precision_options.pass_timing() == + HloReducePrecisionOptions::AFTER_OP_FUSION) { + reduce_pipeline.AddPass<ReducePrecisionInsertion>( + reduce_precision_options); + } + } + StatusOr<bool> reduce_result = reduce_pipeline.Run(hlo_module); + TF_RETURN_IF_ERROR(reduce_result.status()); + + if (reduce_result.ValueOrDie()) { + // Do another fusion pass, with the expectation that we may be able to + // fuse the new ReducePrecision operations. + TF_RETURN_IF_ERROR(fusion.Run(hlo_module).status()); + } } + return tensorflow::Status::OK(); } // Modifies the given HLO module so that it will be accepted by IrEmitter. diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h index 358e611d57..8a6376b2d1 100644 --- a/tensorflow/compiler/xla/service/hlo_opcode.h +++ b/tensorflow/compiler/xla/service/hlo_opcode.h @@ -112,6 +112,11 @@ bool HloOpcodeIsComparison(HloOpcode opcode); // Returns true iff the given opcode has variadic operands. bool HloOpcodeIsVariadic(HloOpcode opcode); +// Returns the number of HloOpcode values. +inline const uint32_t HloOpcodeCount() { + return static_cast<uint32_t>(HloOpcode::kWhile) + 1; +} + } // namespace xla #endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_OPCODE_H_ diff --git a/tensorflow/compiler/xla/service/reduce_precision_insertion.cc b/tensorflow/compiler/xla/service/reduce_precision_insertion.cc index dafefdc491..e083226b14 100644 --- a/tensorflow/compiler/xla/service/reduce_precision_insertion.cc +++ b/tensorflow/compiler/xla/service/reduce_precision_insertion.cc @@ -16,6 +16,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/reduce_precision_insertion.h" #include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/core/platform/logging.h" namespace xla { @@ -30,14 +31,15 @@ StatusOr<bool> ReducePrecisionInsertion::Run(HloModule* module) { for (auto& instruction : computation->instructions()) { VLOG(3) << "Visited instruction: " << instruction->ToString(); - // For now, ReducePrecision is only implemented for F32 data, so this + // For now, ReducePrecision is only implemented for F32 arrays, so this // ignore instructions that produce other data. In particular, this // currently ignores instructions producing tuples, even if those tuples - // contain F32 data inside them. The assumption is that in most cases + // contain F32 arrays inside them. The assumption is that in most cases // equivalent behavior can be obtained by adding ReducePrecision - // instructions after the instructions that pull the F32 data out of the - // tuples. + // instructions after the instructions that pull the F32 arrays out of + // the tuples. if (instruction->shape().element_type() == PrimitiveType::F32 && + !ShapeUtil::IsScalar(instruction->shape()) && should_reduce_output_precision_(instruction->opcode())) { instructions_to_suffix.push_back(instruction.get()); } @@ -58,4 +60,33 @@ StatusOr<bool> ReducePrecisionInsertion::Run(HloModule* module) { return changed; } +ReducePrecisionInsertion::OpcodeFilterFunction +ReducePrecisionInsertion::make_filter_function( + const HloReducePrecisionOptions& reduce_precision_options) { + // Implement the filter function with a lookup table. 
+ std::vector<bool> filter(HloOpcodeCount(), false); + for (const auto& opcode : reduce_precision_options.opcodes_to_suffix()) { + filter[opcode] = true; + } + return [filter](const HloOpcode opcode) { + return filter[static_cast<unsigned int>(opcode)]; + }; +} + +HloReducePrecisionOptions ReducePrecisionInsertion::make_options_proto( + const HloReducePrecisionOptions::PassTiming pass_timing, + const int exponent_bits, const int mantissa_bits, + const OpcodeFilterFunction& should_reduce_output_precision) { + HloReducePrecisionOptions options; + options.set_pass_timing(pass_timing); + options.set_exponent_bits(exponent_bits); + options.set_mantissa_bits(mantissa_bits); + for (uint32_t opcode = 0; opcode < HloOpcodeCount(); opcode++) { + if (should_reduce_output_precision(static_cast<HloOpcode>(opcode))) { + options.add_opcodes_to_suffix(opcode); + } + } + return options; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/reduce_precision_insertion.h b/tensorflow/compiler/xla/service/reduce_precision_insertion.h index e9c8bba031..34b865b9ce 100644 --- a/tensorflow/compiler/xla/service/reduce_precision_insertion.h +++ b/tensorflow/compiler/xla/service/reduce_precision_insertion.h @@ -42,6 +42,17 @@ class ReducePrecisionInsertion : public HloPassInterface { : exponent_bits_(exponent_bits), mantissa_bits_(mantissa_bits), should_reduce_output_precision_(should_reduce_output_precision) {} + + // Version of the constructor that takes an HloReducePrecisionOptions proto + // rather than explicitly-enumerated parameters, for convenience when + // creating passes based on DebugOptions. 
+ explicit ReducePrecisionInsertion( + const HloReducePrecisionOptions& reduce_precision_options) + : exponent_bits_(reduce_precision_options.exponent_bits()), + mantissa_bits_(reduce_precision_options.mantissa_bits()), + should_reduce_output_precision_( + make_filter_function(reduce_precision_options)) {} + ~ReducePrecisionInsertion() override{}; tensorflow::StringPiece name() const override { @@ -52,6 +63,15 @@ class ReducePrecisionInsertion : public HloPassInterface { // (reduce-precision instructions were inserted). StatusOr<bool> Run(HloModule* module) override; + // Convert between the (inconvenient) xla.proto HloReducePrecisionOptions + // representation and OpcodeFilterFunction functions. + static OpcodeFilterFunction make_filter_function( + const HloReducePrecisionOptions& reduce_precision_options); + static HloReducePrecisionOptions make_options_proto( + const HloReducePrecisionOptions::PassTiming pass_timing, + const int exponent_bits, const int mantissa_bits, + const OpcodeFilterFunction& should_reduce_output_precision); + private: // Parameters for the precision reduction to be added. const int exponent_bits_; @@ -59,7 +79,7 @@ class ReducePrecisionInsertion : public HloPassInterface { // Function to determine (from the opcode) whether a given instruction should // have a reduce-precision instruction inserted in its output stream. - const OpcodeFilterFunction& should_reduce_output_precision_; + const OpcodeFilterFunction should_reduce_output_precision_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/tests/reduce_precision_test.cc b/tensorflow/compiler/xla/tests/reduce_precision_test.cc index 48212dc7d1..527205bbb0 100644 --- a/tensorflow/compiler/xla/tests/reduce_precision_test.cc +++ b/tensorflow/compiler/xla/tests/reduce_precision_test.cc @@ -26,6 +26,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/reduce_precision_insertion.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" @@ -39,8 +40,11 @@ limitations under the License. namespace xla { namespace { -class ReducePrecisionTest : public ClientLibraryTestBase, - public ::testing::WithParamInterface<int> {}; +// Tests to confirm that the ReducePrecision operation produces the expected +// numerical values. +class ReducePrecisionAccuracyTest : public ClientLibraryTestBase, + public ::testing::WithParamInterface<int> { +}; // For reduction to IEEE-f16, we want to test the following cases, in both // positive and negative variants. (Note: IEEE-f16 is 5 exponent bits and 10 @@ -201,7 +205,7 @@ static const uint32_t test_values[][4] = { FPVAL(11111111, 1111111111, 1111111111111) // NaN }}; -XLA_TEST_P(ReducePrecisionTest, ReducePrecisionF32) { +XLA_TEST_P(ReducePrecisionAccuracyTest, ReducePrecisionF32) { int index = GetParam(); int exponent_bits = exponent_sizes[index]; int mantissa_bits = mantissa_sizes[index]; @@ -238,9 +242,87 @@ XLA_TEST_P(ReducePrecisionTest, ReducePrecisionF32) { ComputeAndCompareR1<float>(&builder, expected_values, {a_data.get()}); } -INSTANTIATE_TEST_CASE_P(ReducePrecisionTest, ReducePrecisionTest, +INSTANTIATE_TEST_CASE_P(ReducePrecisionAccuracyTest, + ReducePrecisionAccuracyTest, ::testing::Values(0, 1, 2, 3), TestDataToString); +// Tests to confirm that the compiler optimization functions add the expected +// ReducePrecisionInsertion passes. 
+class ReducePrecisionInsertionTest : public ClientLibraryTestBase {}; + +XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionBeforeFusion) { + ComputationBuilder builder(client_, TestName()); + + std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001}); + std::unique_ptr<GlobalData> a_data = + client_->TransferToServer(*a_literal).ConsumeValueOrDie(); + auto a = builder.Parameter(0, a_literal->shape(), "a"); + + // Abs doesn't affect resolution. + auto abs = builder.Abs(a); + + // Near 1.0, Log(x) approximates x - 1; this lets us confirm that the + // reduce-precision operation showed up in the correct place in the + // graph. + auto log = builder.Log(abs); + + // Insert precision-reduction after the Abs(x) operation, rounding that + // result to exactly 1.0f. + auto reduce_precision_pass = execution_options_.mutable_debug_options() + ->add_hlo_reduce_precision_options(); + *reduce_precision_pass = ReducePrecisionInsertion::make_options_proto( + HloReducePrecisionOptions::BEFORE_OP_FUSION, 5, 10, + [](const HloOpcode opcode) { return opcode == HloOpcode::kAbs; }); + + ComputeAndCompareR1<float>(&builder, {0.0f}, {a_data.get()}); +} + +XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionSkippedAfterFusion) { + ComputationBuilder builder(client_, TestName()); + + std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001}); + std::unique_ptr<GlobalData> a_data = + client_->TransferToServer(*a_literal).ConsumeValueOrDie(); + auto a = builder.Parameter(0, a_literal->shape(), "a"); + + // These two operations should be fused by any reasonable backend. + auto abs = builder.Abs(a); + auto neg = builder.Neg(abs); + + // Add a pass after operation fusion, suffixing kAbs operations. This + // should not see into the fusion nodes and thus should not affect the + // result. 
+ auto reduce_precision_pass = execution_options_.mutable_debug_options() + ->add_hlo_reduce_precision_options(); + *reduce_precision_pass = ReducePrecisionInsertion::make_options_proto( + HloReducePrecisionOptions::AFTER_OP_FUSION, 5, 10, + [](const HloOpcode opcode) { return opcode == HloOpcode::kAbs; }); + + ComputeAndCompareR1<float>(&builder, {-1.00001f}, {a_data.get()}); +} + +XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionAddedAfterFusion) { + ComputationBuilder builder(client_, TestName()); + + std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001}); + std::unique_ptr<GlobalData> a_data = + client_->TransferToServer(*a_literal).ConsumeValueOrDie(); + auto a = builder.Parameter(0, a_literal->shape(), "a"); + + // These two operations should be fused by any reasonable backend. + auto abs = builder.Abs(a); + auto neg = builder.Neg(abs); + + // Add a pass after operation fusion, suffixing kFusion operations. + auto reduce_precision_pass = execution_options_.mutable_debug_options() + ->add_hlo_reduce_precision_options(); + *reduce_precision_pass = ReducePrecisionInsertion::make_options_proto( + HloReducePrecisionOptions::AFTER_OP_FUSION, 5, 10, + [](const HloOpcode opcode) { return opcode == HloOpcode::kFusion; }); + + ComputeAndCompareR1<float>(&builder, {-1.0f}, {a_data.get()}); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto index 00fb7f12b8..be4e00f63c 100644 --- a/tensorflow/compiler/xla/xla.proto +++ b/tensorflow/compiler/xla/xla.proto @@ -20,6 +20,24 @@ import "tensorflow/compiler/xla/service/session.proto"; package xla; +// Options for the HLO insert-reduce-precision-operations pass. +message HloReducePrecisionOptions { + // When to run the pass. + enum PassTiming { + BEFORE_OP_FUSION = 0; + AFTER_OP_FUSION = 1; + } + PassTiming pass_timing = 1; + + // Exponent and mantissa bit counts for the reduced precision. 
+ uint32 exponent_bits = 2; + uint32 mantissa_bits = 3; + + // Opcodes for operations that should be suffixed with reduced-precision + // operations. + repeated uint32 opcodes_to_suffix = 4; +} + // Debugging options for XLA. These options may change at any time - there are // no guarantees about backward or forward compatibility for these fields. message DebugOptions { @@ -112,6 +130,11 @@ message DebugOptions { // the generated IR. bool xla_llvm_enable_invariant_load_metadata = 72; + // Options for inserting reduce-precision operations for numerical + // experimentation. This is a repeated field, as we may want to have + // multiple passes with different parameters. + repeated HloReducePrecisionOptions hlo_reduce_precision_options = 80; + // This is used by ClientLibraryTestBase::ComputeAndCompare*. If true, the // computation will run n! times with all permutations of layouts for the // output shape in rank n. For example, with a 3D shape, all permutations of |