author    A. Unique TensorFlower <gardener@tensorflow.org>  2017-07-25 00:05:41 -0700
committer TensorFlower Gardener <gardener@tensorflow.org>   2017-07-25 00:09:39 -0700
commit    d1a9ea61ef8271b3d2fe273a68ff5940fcba7ccd (patch)
tree      fb4527dc7de04f4a2a1a0bc101334c31d2cbef38 /tensorflow
parent    73b120ea3b517b6af2267ca078bf571f966fd606 (diff)
[XLA] Teach CPU and GPU compilers to optionally invoke the HLO insert-reduce-precision-operations pass.
This also required a few additions and fixups. We add pieces to ReducePrecisionInsertion to translate between the protocol-buffer representation of the pass options and the predicate function actually used in the pass. To facilitate this translation, we also add a function to HloOpcode that returns the number of opcodes, so that we can iterate over the whole set easily.

PiperOrigin-RevId: 163037250
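For orientation, here is a minimal sketch of how a client can request one of these passes through DebugOptions, mirroring the pattern used in the new reduce_precision_test.cc below (the execution_options variable is illustrative):

    // Build an options proto that rounds F32 outputs of kAbs operations to an
    // IEEE-f16-like format (5 exponent bits, 10 mantissa bits) before fusion.
    HloReducePrecisionOptions options =
        ReducePrecisionInsertion::make_options_proto(
            HloReducePrecisionOptions::BEFORE_OP_FUSION, /*exponent_bits=*/5,
            /*mantissa_bits=*/10,
            [](const HloOpcode opcode) { return opcode == HloOpcode::kAbs; });

    // Attach it to the debug options; the CPU and GPU compilers read it back
    // out of module->config().debug_options() when building their pipelines.
    *execution_options.mutable_debug_options()
         ->add_hlo_reduce_precision_options() = options;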
Diffstat (limited to 'tensorflow')
-rw-r--r--  tensorflow/compiler/xla/service/BUILD                         |  1
-rw-r--r--  tensorflow/compiler/xla/service/cpu/BUILD                     |  1
-rw-r--r--  tensorflow/compiler/xla/service/cpu/cpu_compiler.cc           | 18
-rw-r--r--  tensorflow/compiler/xla/service/gpu/BUILD                     |  1
-rw-r--r--  tensorflow/compiler/xla/service/gpu/gpu_compiler.cc           | 31
-rw-r--r--  tensorflow/compiler/xla/service/hlo_opcode.h                  |  5
-rw-r--r--  tensorflow/compiler/xla/service/reduce_precision_insertion.cc | 39
-rw-r--r--  tensorflow/compiler/xla/service/reduce_precision_insertion.h  | 22
-rw-r--r--  tensorflow/compiler/xla/tests/reduce_precision_test.cc        | 90
-rw-r--r--  tensorflow/compiler/xla/xla.proto                             | 23
10 files changed, 221 insertions, 10 deletions
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 696dc28564..a4612bb6c1 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -1945,6 +1945,7 @@ cc_library(
":buffer_liveness",
":hlo",
":hlo_pass",
+ "//tensorflow/compiler/xla:shape_util",
"//tensorflow/core:lib",
],
)
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index 7248cb5f4c..2ca4af67cd 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -72,6 +72,7 @@ cc_library(
"//tensorflow/compiler/xla/service:hlo_subcomputation_unification",
"//tensorflow/compiler/xla/service:hlo_verifier",
"//tensorflow/compiler/xla/service:inliner",
+ "//tensorflow/compiler/xla/service:reduce_precision_insertion",
"//tensorflow/compiler/xla/service:reshape_mover",
"//tensorflow/compiler/xla/service:transpose_folding",
"//tensorflow/compiler/xla/service/llvm_ir:llvm_util", # fixdeps: keep
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index 6d819355c4..b86342d0b3 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -74,6 +74,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/hlo_verifier.h"
#include "tensorflow/compiler/xla/service/inliner.h"
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+#include "tensorflow/compiler/xla/service/reduce_precision_insertion.h"
#include "tensorflow/compiler/xla/service/reshape_mover.h"
#include "tensorflow/compiler/xla/service/transpose_folding.h"
#include "tensorflow/compiler/xla/status_macros.h"
@@ -253,6 +254,14 @@ Status CpuCompiler::RunHloPasses(HloModule* module) {
HloPassPipeline pipeline("CPU");
pipeline.AddInvariantChecker<HloVerifier>();
+ for (const auto& reduce_precision_options :
+ module->config().debug_options().hlo_reduce_precision_options()) {
+ if (reduce_precision_options.pass_timing() ==
+ HloReducePrecisionOptions::BEFORE_OP_FUSION) {
+ pipeline.AddPass<ReducePrecisionInsertion>(reduce_precision_options);
+ }
+ }
+
// TODO(b/35786417): Re-enable inliner pass after fixing the bug and deciding
// where we will take this pass in future.
// pipeline.AddPass<Inliner>();
@@ -278,6 +287,15 @@ Status CpuCompiler::RunHloPasses(HloModule* module) {
TransposeFolding::NeverFoldTranspose);
pipeline.AddPass<HloCSE>(/*is_layout_sensitive=*/false);
pipeline.AddPass<CpuInstructionFusion>();
+
+ for (const auto& reduce_precision_options :
+ module->config().debug_options().hlo_reduce_precision_options()) {
+ if (reduce_precision_options.pass_timing() ==
+ HloReducePrecisionOptions::AFTER_OP_FUSION) {
+ pipeline.AddPass<ReducePrecisionInsertion>(reduce_precision_options);
+ }
+ }
+
pipeline.AddPass<CpuLayoutAssignment>(
module->mutable_entry_computation_layout());
// The LayoutAssignment pass may leave behind kCopy instructions which are
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index fa95e23499..cdd7c8187c 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -432,6 +432,7 @@ cc_library(
"//tensorflow/compiler/xla/service:hlo_proto_util",
"//tensorflow/compiler/xla/service:hlo_subcomputation_unification",
"//tensorflow/compiler/xla/service:hlo_verifier",
+ "//tensorflow/compiler/xla/service:reduce_precision_insertion",
"//tensorflow/compiler/xla/service:reshape_mover",
"//tensorflow/compiler/xla/service:transpose_folding",
"//tensorflow/compiler/xla/service/gpu/llvm_gpu_backend",
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index d60c45a5c3..2acf95084a 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -56,6 +56,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/hlo_subcomputation_unification.h"
#include "tensorflow/compiler/xla/service/hlo_verifier.h"
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+#include "tensorflow/compiler/xla/service/reduce_precision_insertion.h"
#include "tensorflow/compiler/xla/service/reshape_mover.h"
#include "tensorflow/compiler/xla/service/transpose_folding.h"
#include "tensorflow/compiler/xla/status_macros.h"
@@ -123,6 +124,15 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module,
{
HloPassPipeline pipeline("optimization");
pipeline.AddInvariantChecker<HloVerifier>();
+
+ for (const auto& reduce_precision_options :
+ hlo_module->config().debug_options().hlo_reduce_precision_options()) {
+ if (reduce_precision_options.pass_timing() ==
+ HloReducePrecisionOptions::BEFORE_OP_FUSION) {
+ pipeline.AddPass<ReducePrecisionInsertion>(reduce_precision_options);
+ }
+ }
+
{
auto& pass =
pipeline.AddPass<HloPassFix<HloPassPipeline>>("simplification");
@@ -149,8 +159,27 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module,
fusion.AddPass<GpuInstructionFusion>(/*may_duplicate=*/false);
fusion.AddPass<GpuInstructionFusion>(/*may_duplicate=*/true);
fusion.AddPass<FusionMerger>();
- return fusion.Run(hlo_module).status();
+ TF_RETURN_IF_ERROR(fusion.Run(hlo_module).status());
+
+ HloPassPipeline reduce_pipeline("reduce-precision");
+ for (const auto& reduce_precision_options :
+ hlo_module->config().debug_options().hlo_reduce_precision_options()) {
+ if (reduce_precision_options.pass_timing() ==
+ HloReducePrecisionOptions::AFTER_OP_FUSION) {
+ reduce_pipeline.AddPass<ReducePrecisionInsertion>(
+ reduce_precision_options);
+ }
+ }
+ StatusOr<bool> reduce_result = reduce_pipeline.Run(hlo_module);
+ TF_RETURN_IF_ERROR(reduce_result.status());
+
+ if (reduce_result.ValueOrDie()) {
+ // Do another fusion pass, with the expectation that we may be able to
+ // fuse the new ReducePrecision operations.
+ TF_RETURN_IF_ERROR(fusion.Run(hlo_module).status());
+ }
}
+ return tensorflow::Status::OK();
}
// Modifies the given HLO module so that it will be accepted by IrEmitter.
diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h
index 358e611d57..8a6376b2d1 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode.h
+++ b/tensorflow/compiler/xla/service/hlo_opcode.h
@@ -112,6 +112,11 @@ bool HloOpcodeIsComparison(HloOpcode opcode);
// Returns true iff the given opcode has variadic operands.
bool HloOpcodeIsVariadic(HloOpcode opcode);
+// Returns the number of HloOpcode values.
+inline const uint32_t HloOpcodeCount() {
+ return static_cast<uint32_t>(HloOpcode::kWhile) + 1;
+}
+
} // namespace xla
#endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_OPCODE_H_
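As the commit message notes, HloOpcodeCount() exists so that the whole opcode set can be iterated; a sketch of that pattern (assuming, as the implementation does, that HloOpcode::kWhile stays the last enumerator):

    // Enumerate every opcode value; HloOpcodeString is the existing
    // debug-name helper declared in this same header.
    for (uint32_t i = 0; i < HloOpcodeCount(); ++i) {
      const HloOpcode opcode = static_cast<HloOpcode>(i);
      VLOG(4) << "opcode " << i << ": " << HloOpcodeString(opcode);
    }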
diff --git a/tensorflow/compiler/xla/service/reduce_precision_insertion.cc b/tensorflow/compiler/xla/service/reduce_precision_insertion.cc
index dafefdc491..e083226b14 100644
--- a/tensorflow/compiler/xla/service/reduce_precision_insertion.cc
+++ b/tensorflow/compiler/xla/service/reduce_precision_insertion.cc
@@ -16,6 +16,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/reduce_precision_insertion.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/core/platform/logging.h"
namespace xla {
@@ -30,14 +31,15 @@ StatusOr<bool> ReducePrecisionInsertion::Run(HloModule* module) {
for (auto& instruction : computation->instructions()) {
VLOG(3) << "Visited instruction: " << instruction->ToString();
- // For now, ReducePrecision is only implemented for F32 data, so this
+ // For now, ReducePrecision is only implemented for F32 arrays, so this
// ignores instructions that produce other data. In particular, this
// currently ignores instructions producing tuples, even if those tuples
- // contain F32 data inside them. The assumption is that in most cases
+ // contain F32 arrays inside them. The assumption is that in most cases
// equivalent behavior can be obtained by adding ReducePrecision
- // instructions after the instructions that pull the F32 data out of the
- // tuples.
+ // instructions after the instructions that pull the F32 arrays out of
+ // the tuples.
if (instruction->shape().element_type() == PrimitiveType::F32 &&
+ !ShapeUtil::IsScalar(instruction->shape()) &&
should_reduce_output_precision_(instruction->opcode())) {
instructions_to_suffix.push_back(instruction.get());
}
@@ -58,4 +60,33 @@ StatusOr<bool> ReducePrecisionInsertion::Run(HloModule* module) {
return changed;
}
+ReducePrecisionInsertion::OpcodeFilterFunction
+ReducePrecisionInsertion::make_filter_function(
+ const HloReducePrecisionOptions& reduce_precision_options) {
+ // Implement the filter function with a lookup table.
+ std::vector<bool> filter(HloOpcodeCount(), false);
+ for (const auto& opcode : reduce_precision_options.opcodes_to_suffix()) {
+ filter[opcode] = true;
+ }
+ return [filter](const HloOpcode opcode) {
+ return filter[static_cast<unsigned int>(opcode)];
+ };
+}
+
+HloReducePrecisionOptions ReducePrecisionInsertion::make_options_proto(
+ const HloReducePrecisionOptions::PassTiming pass_timing,
+ const int exponent_bits, const int mantissa_bits,
+ const OpcodeFilterFunction& should_reduce_output_precision) {
+ HloReducePrecisionOptions options;
+ options.set_pass_timing(pass_timing);
+ options.set_exponent_bits(exponent_bits);
+ options.set_mantissa_bits(mantissa_bits);
+ for (uint32_t opcode = 0; opcode < HloOpcodeCount(); opcode++) {
+ if (should_reduce_output_precision(static_cast<HloOpcode>(opcode))) {
+ options.add_opcodes_to_suffix(opcode);
+ }
+ }
+ return options;
+}
+
} // namespace xla
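make_filter_function and make_options_proto are intended to be inverses over the opcode set; an illustrative round-trip check (not part of the commit) of the property they should satisfy:

    // Encode an arbitrary predicate into the proto, decode it back, and
    // confirm the two predicates agree on every opcode.
    const auto pred = [](const HloOpcode opcode) {
      return opcode == HloOpcode::kAdd || opcode == HloOpcode::kMultiply;
    };
    const HloReducePrecisionOptions proto =
        ReducePrecisionInsertion::make_options_proto(
            HloReducePrecisionOptions::AFTER_OP_FUSION, 5, 10, pred);
    const auto decoded = ReducePrecisionInsertion::make_filter_function(proto);
    for (uint32_t i = 0; i < HloOpcodeCount(); ++i) {
      const HloOpcode opcode = static_cast<HloOpcode>(i);
      CHECK_EQ(pred(opcode), decoded(opcode));
    }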
diff --git a/tensorflow/compiler/xla/service/reduce_precision_insertion.h b/tensorflow/compiler/xla/service/reduce_precision_insertion.h
index e9c8bba031..34b865b9ce 100644
--- a/tensorflow/compiler/xla/service/reduce_precision_insertion.h
+++ b/tensorflow/compiler/xla/service/reduce_precision_insertion.h
@@ -42,6 +42,17 @@ class ReducePrecisionInsertion : public HloPassInterface {
: exponent_bits_(exponent_bits),
mantissa_bits_(mantissa_bits),
should_reduce_output_precision_(should_reduce_output_precision) {}
+
+ // Version of the constructor that takes an HloReducePrecisionOptions proto
+ // rather than explicitly-enumerated parameters, for convenience when
+ // creating passes based on DebugOptions.
+ explicit ReducePrecisionInsertion(
+ const HloReducePrecisionOptions& reduce_precision_options)
+ : exponent_bits_(reduce_precision_options.exponent_bits()),
+ mantissa_bits_(reduce_precision_options.mantissa_bits()),
+ should_reduce_output_precision_(
+ make_filter_function(reduce_precision_options)) {}
+
~ReducePrecisionInsertion() override{};
tensorflow::StringPiece name() const override {
@@ -52,6 +63,15 @@ class ReducePrecisionInsertion : public HloPassInterface {
// (reduce-precision instructions were inserted).
StatusOr<bool> Run(HloModule* module) override;
+ // Convert between the (inconvenient) xla.proto HloReducePrecisionOptions
+ // representation and OpcodeFilterFunction functions.
+ static OpcodeFilterFunction make_filter_function(
+ const HloReducePrecisionOptions& reduce_precision_options);
+ static HloReducePrecisionOptions make_options_proto(
+ const HloReducePrecisionOptions::PassTiming pass_timing,
+ const int exponent_bits, const int mantissa_bits,
+ const OpcodeFilterFunction& should_reduce_output_precision);
+
private:
// Parameters for the precision reduction to be added.
const int exponent_bits_;
@@ -59,7 +79,7 @@ class ReducePrecisionInsertion : public HloPassInterface {
// Function to determine (from the opcode) whether a given instruction should
// have a reduce-precision instruction inserted in its output stream.
- const OpcodeFilterFunction& should_reduce_output_precision_;
+ const OpcodeFilterFunction should_reduce_output_precision_;
};
} // namespace xla
diff --git a/tensorflow/compiler/xla/tests/reduce_precision_test.cc b/tensorflow/compiler/xla/tests/reduce_precision_test.cc
index 48212dc7d1..527205bbb0 100644
--- a/tensorflow/compiler/xla/tests/reduce_precision_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_precision_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/layout_util.h"
#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/reduce_precision_insertion.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/test.h"
#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
@@ -39,8 +40,11 @@ limitations under the License.
namespace xla {
namespace {
-class ReducePrecisionTest : public ClientLibraryTestBase,
- public ::testing::WithParamInterface<int> {};
+// Tests to confirm that the ReducePrecision operation produces the expected
+// numerical values.
+class ReducePrecisionAccuracyTest : public ClientLibraryTestBase,
+ public ::testing::WithParamInterface<int> {
+};
// For reduction to IEEE-f16, we want to test the following cases, in both
// positive and negative variants. (Note: IEEE-f16 is 5 exponent bits and 10
@@ -201,7 +205,7 @@ static const uint32_t test_values[][4] = {
FPVAL(11111111, 1111111111, 1111111111111) // NaN
}};
-XLA_TEST_P(ReducePrecisionTest, ReducePrecisionF32) {
+XLA_TEST_P(ReducePrecisionAccuracyTest, ReducePrecisionF32) {
int index = GetParam();
int exponent_bits = exponent_sizes[index];
int mantissa_bits = mantissa_sizes[index];
@@ -238,9 +242,87 @@ XLA_TEST_P(ReducePrecisionTest, ReducePrecisionF32) {
ComputeAndCompareR1<float>(&builder, expected_values, {a_data.get()});
}
-INSTANTIATE_TEST_CASE_P(ReducePrecisionTest, ReducePrecisionTest,
+INSTANTIATE_TEST_CASE_P(ReducePrecisionAccuracyTest,
+ ReducePrecisionAccuracyTest,
::testing::Values(0, 1, 2, 3), TestDataToString);
+// Tests to confirm that the compiler optimization functions add the expected
+// ReducePrecisionInsertion passes.
+class ReducePrecisionInsertionTest : public ClientLibraryTestBase {};
+
+XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionBeforeFusion) {
+ ComputationBuilder builder(client_, TestName());
+
+ std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
+ std::unique_ptr<GlobalData> a_data =
+ client_->TransferToServer(*a_literal).ConsumeValueOrDie();
+ auto a = builder.Parameter(0, a_literal->shape(), "a");
+
+ // Abs doesn't affect resolution.
+ auto abs = builder.Abs(a);
+
+ // Near 1.0, Log(x) approximates x - 1; this lets us confirm that the
+ // reduce-precision operation showed up in the correct place in the
+ // graph.
+ auto log = builder.Log(abs);
+
+ // Insert precision-reduction after the Abs(x) operation, rounding that
+ // result to exactly 1.0f.
+ auto reduce_precision_pass = execution_options_.mutable_debug_options()
+ ->add_hlo_reduce_precision_options();
+ *reduce_precision_pass = ReducePrecisionInsertion::make_options_proto(
+ HloReducePrecisionOptions::BEFORE_OP_FUSION, 5, 10,
+ [](const HloOpcode opcode) { return opcode == HloOpcode::kAbs; });
+
+ ComputeAndCompareR1<float>(&builder, {0.0f}, {a_data.get()});
+}
+
+XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionSkippedAfterFusion) {
+ ComputationBuilder builder(client_, TestName());
+
+ std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
+ std::unique_ptr<GlobalData> a_data =
+ client_->TransferToServer(*a_literal).ConsumeValueOrDie();
+ auto a = builder.Parameter(0, a_literal->shape(), "a");
+
+ // These two operations should be fused by any reasonable backend.
+ auto abs = builder.Abs(a);
+ auto neg = builder.Neg(abs);
+
+ // Add a pass after operation fusion, suffixing kAbs operations. This
+ // should not see into the fusion nodes and thus should not affect the
+ // result.
+ auto reduce_precision_pass = execution_options_.mutable_debug_options()
+ ->add_hlo_reduce_precision_options();
+ *reduce_precision_pass = ReducePrecisionInsertion::make_options_proto(
+ HloReducePrecisionOptions::AFTER_OP_FUSION, 5, 10,
+ [](const HloOpcode opcode) { return opcode == HloOpcode::kAbs; });
+
+ ComputeAndCompareR1<float>(&builder, {-1.00001f}, {a_data.get()});
+}
+
+XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionAddedAfterFusion) {
+ ComputationBuilder builder(client_, TestName());
+
+ std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
+ std::unique_ptr<GlobalData> a_data =
+ client_->TransferToServer(*a_literal).ConsumeValueOrDie();
+ auto a = builder.Parameter(0, a_literal->shape(), "a");
+
+ // These two operations should be fused by any reasonable backend.
+ auto abs = builder.Abs(a);
+ auto neg = builder.Neg(abs);
+
+ // Add a pass after operation fusion, suffixing kFusion operations.
+ auto reduce_precision_pass = execution_options_.mutable_debug_options()
+ ->add_hlo_reduce_precision_options();
+ *reduce_precision_pass = ReducePrecisionInsertion::make_options_proto(
+ HloReducePrecisionOptions::AFTER_OP_FUSION, 5, 10,
+ [](const HloOpcode opcode) { return opcode == HloOpcode::kFusion; });
+
+ ComputeAndCompareR1<float>(&builder, {-1.0f}, {a_data.get()});
+}
+
} // namespace
} // namespace xla
diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto
index 00fb7f12b8..be4e00f63c 100644
--- a/tensorflow/compiler/xla/xla.proto
+++ b/tensorflow/compiler/xla/xla.proto
@@ -20,6 +20,24 @@ import "tensorflow/compiler/xla/service/session.proto";
package xla;
+// Options for the HLO insert-reduce-precision-operations pass.
+message HloReducePrecisionOptions {
+ // When to run the pass.
+ enum PassTiming {
+ BEFORE_OP_FUSION = 0;
+ AFTER_OP_FUSION = 1;
+ }
+ PassTiming pass_timing = 1;
+
+ // Exponent and mantissa bit counts for the reduced precision.
+ uint32 exponent_bits = 2;
+ uint32 mantissa_bits = 3;
+
+ // Opcodes for operations that should be suffixed with reduced-precision
+ // operations.
+ repeated uint32 opcodes_to_suffix = 4;
+}
+
// Debugging options for XLA. These options may change at any time - there are
// no guarantees about backward or forward compatibility for these fields.
message DebugOptions {
@@ -112,6 +130,11 @@ message DebugOptions {
// the generated IR.
bool xla_llvm_enable_invariant_load_metadata = 72;
+ // Options for inserting reduce-precision operations for numerical
+ // experimentation. This is a repeated field, as we may want to have
+ // multiple passes with different parameters.
+ repeated HloReducePrecisionOptions hlo_reduce_precision_options = 80;
+
// This is used by ClientLibraryTestBase::ComputeAndCompare*. If true, the
// computation will run n! times with all permutations of layouts for the
// output shape in rank n. For example, with a 3D shape, all permutations of