path: root/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
Diffstat (limited to 'tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc')
-rw-r--r--  tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc  20
1 file changed, 16 insertions(+), 4 deletions(-)
diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
index 76c9b6ab33..d937123357 100644
--- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
@@ -72,6 +72,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
#include "tensorflow/compiler/xla/service/reduce_precision_insertion.h"
#include "tensorflow/compiler/xla/service/reshape_mover.h"
+#include "tensorflow/compiler/xla/service/scatter_expander.h"
#include "tensorflow/compiler/xla/service/transpose_folding.h"
#include "tensorflow/compiler/xla/service/tuple_simplifier.h"
#include "tensorflow/compiler/xla/service/while_loop_constant_sinking.h"
@@ -130,8 +131,12 @@ string GetLibdeviceDir(const string& config_cuda_data_dir) {
}
// Runs optimization passes on the given HLO module.
+//
+// It takes a compiler pointer, as passes may compile and execute HLOs on the
+// fly for cuDNN verification or other purposes.
Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
- DeviceMemoryAllocator* device_allocator) {
+ DeviceMemoryAllocator* device_allocator,
+ Compiler* compiler) {
{
HloPassPipeline pipeline("optimization");
pipeline.AddInvariantChecker<HloVerifier>();
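As the new comment notes, the Compiler* is threaded into OptimizeHloModule so that individual passes can compile and execute small helper HLO modules on the fly (for example, to verify cuDNN results). A minimal sketch of that usage, assuming the Compiler::RunBackend signature used elsewhere in this file; RunHelperModule is a hypothetical name, not part of this change:

// Illustrative only: compile a helper HloModule with the Compiler* that
// OptimizeHloModule received. Execution of `executable` is elided.
Status RunHelperModule(Compiler* compiler, se::StreamExecutor* stream_exec,
                       DeviceMemoryAllocator* allocator,
                       std::unique_ptr<HloModule> helper_module) {
  TF_ASSIGN_OR_RETURN(
      std::unique_ptr<Executable> executable,
      compiler->RunBackend(std::move(helper_module), stream_exec, allocator));
  // ... launch `executable` on `stream_exec` and inspect its output ...
  return Status::OK();
}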
@@ -167,6 +172,8 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
// elimination has to come after that pass.
pipeline.AddPass<ZeroSizedHloElimination>();
+ pipeline.AddPass<ScatterExpander>();
+
pipeline.AddPass<AlgebraicSimplifier>(
/*is_layout_sensitive=*/false,
[](const Shape&, const Shape&) { return false; });
@@ -245,8 +252,8 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
// the gte(customcall, 0) would probably already be into a fusion node. We
// can't simplify across HloComputation boundaries, so in this case we
// wouldn't be able to simplify away the new_tuple bits.
- pipeline.AddPass<CudnnConvolutionAlgorithmPicker>(stream_exec,
- device_allocator);
+ pipeline.AddPass<CudnnConvolutionAlgorithmPicker>(
+ stream_exec, device_allocator, compiler);
// Clean up new_tuple described above.
pipeline.AddPass<TupleSimplifier>();
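The call-site change above implies a matching constructor change in cudnn_convolution_algorithm_picker.h, which is outside this file's diff. A hedged sketch of what that declaration likely looks like (member names are assumptions):

// Assumed shape of the updated pass constructor (not shown in this diff): the
// picker also keeps the Compiler* so it can compile and run candidate
// convolutions while choosing an algorithm.
class CudnnConvolutionAlgorithmPicker : public HloPassInterface {
 public:
  CudnnConvolutionAlgorithmPicker(se::StreamExecutor* stream_exec,
                                  DeviceMemoryAllocator* allocator,
                                  Compiler* compiler)
      : stream_exec_(stream_exec), allocator_(allocator), compiler_(compiler) {}

 private:
  se::StreamExecutor* stream_exec_;
  DeviceMemoryAllocator* allocator_;
  Compiler* compiler_;
};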
@@ -492,11 +499,15 @@ NVPTXCompiler::NVPTXCompiler()
StatusOr<std::unique_ptr<HloModule>> NVPTXCompiler::RunHloPasses(
std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
DeviceMemoryAllocator* device_allocator) {
+ // We dump the post-optimization HLO in RunBackend so no need to dump it here.
+ VLOG(2) << "*** HLO Before Optimization";
+ XLA_VLOG_LINES(2, module->ToString());
+
XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::RunHloPasses");
tracing::ScopedActivity activity("HLO Transforms", module->name(),
/*is_expensive=*/true);
TF_RETURN_IF_ERROR(
- OptimizeHloModule(module.get(), stream_exec, device_allocator));
+ OptimizeHloModule(module.get(), stream_exec, device_allocator, this));
return std::move(module);
}
@@ -548,6 +559,7 @@ StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
// include headers, so no need for us to print them ourselves.
XLA_VLOG_LINES(1, buffer_assignment->GetStats().ToString());
XLA_VLOG_LINES(2, buffer_assignment->ToString());
+ VLOG(2) << "*** HLO After Optimization";
XLA_VLOG_LINES(2, module->ToString());
const string xla_dump_optimized_hlo_proto_to =
module->config().debug_options().xla_dump_optimized_hlo_proto_to();
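Right after the new "HLO After Optimization" log line, RunBackend reads the xla_dump_optimized_hlo_proto_to debug option shown above. Its continuation is not part of this diff; a sketch of how such an option is typically consumed, assuming XLA's MakeHloProto and protobuf_util::DumpProtoToDirectory helpers:

// Assumed continuation (illustrative): if the flag is set, serialize the
// optimized module plus its buffer assignment and write it to that directory.
if (!xla_dump_optimized_hlo_proto_to.empty()) {
  HloProto hlo_proto = MakeHloProto(*module, *buffer_assignment);
  TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory(
      hlo_proto, xla_dump_optimized_hlo_proto_to, module->name()));
}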