Diffstat (limited to 'tensorflow/compiler/xla/service/gpu/tests')
-rw-r--r--  tensorflow/compiler/xla/service/gpu/tests/BUILD                      223
-rw-r--r--  tensorflow/compiler/xla/service/gpu/tests/gpu_alignment_test.cc       54
-rw-r--r--  tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc         50
-rw-r--r--  tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h          42
-rw-r--r--  tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc            59
-rw-r--r--  tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc            119
-rw-r--r--  tensorflow/compiler/xla/service/gpu/tests/gpu_fusion_test.cc          59
-rw-r--r--  tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc          147
-rw-r--r--  tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc  177
-rw-r--r--  tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc            141
-rw-r--r--  tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc         68
-rw-r--r--  tensorflow/compiler/xla/service/gpu/tests/gpu_unrolling_test.cc      185
-rw-r--r--  tensorflow/compiler/xla/service/gpu/tests/infeed_test.cc             121
13 files changed, 1445 insertions, 0 deletions
diff --git a/tensorflow/compiler/xla/service/gpu/tests/BUILD b/tensorflow/compiler/xla/service/gpu/tests/BUILD
new file mode 100644
index 0000000000..686c3c16c9
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/BUILD
@@ -0,0 +1,223 @@
+# Description: GPU-specific XLA tests. For example, codegen tests that
+# verify the IR emitted.
+#
+# TODO(jlebar): None of these tests actually use the GPU, so they should not
+# need to run on machines with GPUs present.
+
+licenses(["notice"]) # Apache 2.0
+
+package(default_visibility = [":friends"])
+
+package_group(
+ name = "friends",
+ includes = [
+ "//tensorflow/compiler/xla:friends",
+ ],
+)
+
+# Filegroup used to collect source files for dependency checking.
+filegroup(
+ name = "c_srcs",
+ data = glob([
+ "**/*.cc",
+ "**/*.h",
+ ]),
+)
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+
+cc_library(
+ name = "gpu_codegen_test",
+ testonly = True,
+ srcs = ["gpu_codegen_test.cc"],
+ hdrs = ["gpu_codegen_test.h"],
+ tags = [
+ "requires-gpu-sm35",
+ ],
+ deps = [
+ "//tensorflow/compiler/xla:util",
+ "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
+ "//tensorflow/compiler/xla/service:gpu_plugin",
+ "//tensorflow/compiler/xla/service/gpu:gpu_executable",
+ "//tensorflow/compiler/xla/tests:filecheck",
+ "//tensorflow/compiler/xla/tests:llvm_irgen_test_base",
+ "//tensorflow/core:lib",
+ ],
+)
+
+tf_cc_test(
+ name = "gpu_copy_test",
+ srcs = ["gpu_copy_test.cc"],
+ tags = [
+ "requires-gpu-sm35",
+ ],
+ deps = [
+ ":gpu_codegen_test",
+ "//tensorflow/compiler/xla:literal",
+ "//tensorflow/compiler/xla:literal_util",
+ "//tensorflow/compiler/xla:util",
+ "//tensorflow/compiler/xla:xla_data_proto",
+ "//tensorflow/compiler/xla/service:hlo",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ ],
+)
+
+tf_cc_test(
+ name = "gpu_ftz_test",
+ srcs = ["gpu_ftz_test.cc"],
+ tags = [
+ "requires-gpu-sm35",
+ ],
+ deps = [
+ ":gpu_codegen_test",
+ "//tensorflow/core:test_main",
+ ],
+)
+
+tf_cc_test(
+ name = "gpu_index_test",
+ srcs = ["gpu_index_test.cc"],
+ tags = [
+ "requires-gpu-sm35",
+ ],
+ deps = [
+ ":gpu_codegen_test",
+ "//tensorflow/compiler/xla:literal",
+ "//tensorflow/compiler/xla:shape_util",
+ "//tensorflow/compiler/xla:util",
+ "//tensorflow/compiler/xla:xla_data_proto",
+ "//tensorflow/compiler/xla:xla_proto",
+ "//tensorflow/compiler/xla/service:hlo",
+ "//tensorflow/compiler/xla/service:hlo_module_config",
+ "//tensorflow/compiler/xla/service:hlo_parser",
+ "//tensorflow/compiler/xla/tests:hlo_test_base",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ ],
+)
+
+tf_cc_test(
+ name = "gpu_infeed_test",
+ srcs = ["infeed_test.cc"],
+ tags = [
+ "requires-gpu-sm35",
+ ],
+ deps = [
+ ":gpu_codegen_test",
+ "//tensorflow/compiler/xla:literal",
+ "//tensorflow/compiler/xla:shape_util",
+ "//tensorflow/compiler/xla:test_helpers",
+ "//tensorflow/compiler/xla:xla_data_proto",
+ "//tensorflow/compiler/xla/client:global_data",
+ "//tensorflow/compiler/xla/client:local_client",
+ "//tensorflow/compiler/xla/client/lib:arithmetic",
+ "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+ "//tensorflow/compiler/xla/tests:client_library_test_base",
+ "//tensorflow/compiler/xla/tests:literal_test_util",
+ "//tensorflow/core:lib",
+ "//tensorflow/core:test_main",
+ ],
+)
+
+tf_cc_test(
+ name = "gpu_kernel_tiling_test",
+ srcs = ["gpu_kernel_tiling_test.cc"],
+ tags = [
+ "requires-gpu-sm35",
+ ],
+ deps = [
+ ":gpu_codegen_test",
+ "//tensorflow/compiler/xla/service:hlo",
+ "//tensorflow/compiler/xla/service:hlo_module_config",
+ "//tensorflow/compiler/xla/service:hlo_parser",
+ "//tensorflow/compiler/xla/tests:hlo_test_base",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ ],
+)
+
+tf_cc_test(
+ name = "gpu_ldg_test",
+ srcs = ["gpu_ldg_test.cc"],
+ tags = ["requires-gpu-sm35"],
+ deps = [
+ ":gpu_codegen_test",
+ "//tensorflow/compiler/xla:literal",
+ "//tensorflow/compiler/xla:shape_util",
+ "//tensorflow/compiler/xla:util",
+ "//tensorflow/compiler/xla:xla_data_proto",
+ "//tensorflow/compiler/xla/service:hlo",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ ],
+)
+
+tf_cc_test(
+ name = "gpu_noalias_test",
+ srcs = ["gpu_noalias_test.cc"],
+ tags = [
+ "requires-gpu-sm35",
+ ],
+ deps = [
+ ":gpu_codegen_test",
+ "//tensorflow/compiler/xla:literal",
+ "//tensorflow/compiler/xla:shape_util",
+ "//tensorflow/compiler/xla:util",
+ "//tensorflow/compiler/xla:xla_data_proto",
+ "//tensorflow/compiler/xla/service:hlo",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ ],
+)
+
+tf_cc_test(
+ name = "gpu_fusion_test",
+ srcs = ["gpu_fusion_test.cc"],
+ tags = [
+ "requires-gpu-sm35",
+ ],
+ deps = [
+ ":gpu_codegen_test",
+ "//tensorflow/compiler/xla/service:hlo_module_config",
+ "//tensorflow/compiler/xla/service:hlo_parser",
+ "//tensorflow/compiler/xla/tests:hlo_test_base",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ ],
+)
+
+tf_cc_test(
+ name = "gpu_unrolling_test",
+ srcs = ["gpu_unrolling_test.cc"],
+ tags = [
+ "requires-gpu-sm35",
+ ],
+ deps = [
+ ":gpu_codegen_test",
+ "//tensorflow/compiler/xla/service:hlo_module_config",
+ "//tensorflow/compiler/xla/service:hlo_parser",
+ "//tensorflow/compiler/xla/tests:hlo_test_base",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ ],
+)
+
+tf_cc_test(
+ name = "gpu_alignment_test",
+ testonly = True,
+ srcs = ["gpu_alignment_test.cc"],
+ tags = [
+ "requires-gpu-sm35",
+ ],
+ deps = [
+ ":gpu_codegen_test",
+ "//tensorflow/compiler/xla/service:gpu_plugin",
+ "//tensorflow/compiler/xla/service/cpu:custom_call_target_registry",
+ "//tensorflow/compiler/xla/service/llvm_ir:alias_analysis",
+ "//tensorflow/compiler/xla/tests:filecheck",
+ "//tensorflow/compiler/xla/tests:llvm_irgen_test_base",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ ],
+)
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_alignment_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_alignment_test.cc
new file mode 100644
index 0000000000..672c68e59b
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_alignment_test.cc
@@ -0,0 +1,54 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <utility>
+
+#include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h"
+#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h"
+#include "tensorflow/compiler/xla/tests/filecheck.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+class GpuAlignmentTest : public GpuCodegenTest {};
+
+TEST_F(GpuAlignmentTest, Test) {
+ const char* hlo_string = R"(
+HloModule GpuAlignmentTest
+
+ENTRY main {
+ zero = f32[] constant(0)
+ tok = token[] after-all()
+ a = f32[100] parameter(0)
+ b_tup = (f32[200], token[]) infeed(tok)
+ b = f32[200] get-tuple-element(b_tup), index=0
+ a_padded = f32[150] pad(a, zero), padding=0_50
+ b_sliced = f32[150] slice(b), slice={[0:150]}
+ ROOT c = f32[150] add(a_padded, b_sliced)
+}
+)";
+
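+  // The kernel arguments should carry alignment and dereferenceable
+  // attributes derived from buffer assignment, e.g. 600 bytes for the
+  // f32[150] output and 400 bytes for the f32[100] parameter (4 bytes per
+  // f32 element).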
+ CompileAndVerifyIr(hlo_string, R"(
+CHECK: @fusion(i8* align 64 dereferenceable(600) %alloc0, i8* align 16 dereferenceable(400) %alloc1, i8* align 64 dereferenceable(864) %temp_buf)
+)");
+}
+
+} // namespace
+} // namespace gpu
+} // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc
new file mode 100644
index 0000000000..4b8415fe91
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc
@@ -0,0 +1,50 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
+#include "tensorflow/compiler/xla/tests/filecheck.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace xla {
+namespace gpu {
+
+std::unique_ptr<HloModule> GpuCodegenTest::CreateNewModuleWithFTZ(bool ftz) {
+ HloModuleConfig config;
+ auto debug_options = legacy_flags::GetDebugOptionsFromFlags();
+ debug_options.set_xla_gpu_ftz(ftz);
+ debug_options.set_xla_gpu_max_kernel_unroll_factor(1);
+ // TODO(b/38354253): Change tests to use Parameters instead of Constants.
+ debug_options.add_xla_disable_hlo_passes("constant_folding");
+ config.set_debug_options(debug_options);
+
+ return MakeUnique<HloModule>(TestName(), config);
+}
+
+void GpuCodegenTest::CompileAndVerifyPtx(std::unique_ptr<HloModule> hlo_module,
+ const string& pattern) {
+ std::unique_ptr<Executable> executable =
+ std::move(CompileToExecutable(std::move(hlo_module)).ValueOrDie());
+ string ptx_str =
+ std::string(static_cast<GpuExecutable*>(executable.get())->ptx());
+ StatusOr<bool> filecheck_result = RunFileCheck(ptx_str, pattern);
+ ASSERT_TRUE(filecheck_result.ok());
+ EXPECT_TRUE(filecheck_result.ValueOrDie());
+}
+
+} // namespace gpu
+} // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h
new file mode 100644
index 0000000000..e4a3573bab
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_TESTS_GPU_CODEGEN_TEST_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_TESTS_GPU_CODEGEN_TEST_H_
+
+#include <string>
+
+#include "tensorflow/compiler/xla/tests/llvm_irgen_test_base.h"
+
+namespace xla {
+namespace gpu {
+
+// Tests that verify IR or PTX emitted by the GPU backend is as expected.
+class GpuCodegenTest : public LlvmIrGenTestBase {
+ protected:
+ // Like HloTestBase::CreateNewModule(), with a flag for configuring the ftz
+ // option.
+ std::unique_ptr<HloModule> CreateNewModuleWithFTZ(bool ftz);
+
+ // Compiles the given HLO module to PTX and verifies the PTX matches the given
+ // FileCheck pattern. (See http://llvm.org/docs/CommandGuide/FileCheck.html).
+ void CompileAndVerifyPtx(std::unique_ptr<HloModule> hlo_module,
+ const string& pattern);
+};
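+
+// Example usage (a sketch; FooTest and the FileCheck pattern are
+// hypothetical):
+//
+//   class FooTest : public GpuCodegenTest {};
+//   TEST_F(FooTest, EmitsFtzMul) {
+//     auto module = CreateNewModuleWithFTZ(/*ftz=*/true);
+//     // ... add an entry computation to the module ...
+//     CompileAndVerifyPtx(std::move(module), "CHECK: mul.ftz.f32");
+//   }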
+
+} // namespace gpu
+} // namespace xla
+
+#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_TESTS_GPU_CODEGEN_TEST_H_
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc
new file mode 100644
index 0000000000..ce69e058e6
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc
@@ -0,0 +1,59 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <utility>
+
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace gpu {
+
+class GpuCopyTest : public GpuCodegenTest {};
+
+// The GPU backend should not emit a copy kernel for the kCopy instruction in
+// this test. Instead, it should generate a CopyThunk which invokes cuMemcpy at
+// runtime.
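+//
+// Conceptually (a sketch of the runtime behavior, not the actual Thunk API),
+// the compiled executable performs something like
+//   cuMemcpyDtoD(dest_buffer, constant_buffer, /*ByteCount=*/4 * sizeof(float));
+// rather than launching a copy kernel, so the IR check below only needs to
+// assert that no copy kernel is defined.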
+TEST_F(GpuCopyTest, UseMemcpy) {
+ HloComputation::Builder builder(TestName());
+
+ std::unique_ptr<Literal> literal =
+ LiteralUtil::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}});
+ HloInstruction* constant = builder.AddInstruction(
+ HloInstruction::CreateConstant(std::move(literal)));
+ builder.AddInstruction(HloInstruction::CreateUnary(
+ constant->shape(), HloOpcode::kCopy, constant));
+
+ std::unique_ptr<HloComputation> computation = builder.Build();
+
+ auto hlo_module = CreateNewModule();
+ hlo_module->AddEntryComputation(std::move(computation));
+
+  // There should be no kernel whose name starts with "_copy".
+ CompileAndVerifyIr(std::move(hlo_module), "; CHECK-NOT: define void @_copy",
+ /*match_optimized_ir=*/false);
+}
+
+} // namespace gpu
+} // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc
new file mode 100644
index 0000000000..177b94934c
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc
@@ -0,0 +1,119 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+
+// Check that the ftz (flush denormals to zero) flag is reflected in PTX as
+// expected.
+
+namespace xla {
+namespace gpu {
+namespace {
+
+class GpuFtzTest : public GpuCodegenTest {
+ public:
+ explicit GpuFtzTest(bool ftz) : ftz_(ftz) {}
+
+ // Creates an HLO module that performs the given binary operation on some
+ // data.
+ std::unique_ptr<HloModule> CreateBinaryOpModule(HloOpcode op) {
+ HloComputation::Builder builder(TestName());
+
+ Shape param_shape = ShapeUtil::MakeShapeWithLayout(
+ F32, /*dimensions=*/{100, 100}, /*minor_to_major=*/{1, 0});
+ HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter(
+ /* parameter_number=*/0, param_shape, "x"));
+ HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter(
+ /* parameter_number=*/1, param_shape, "y"));
+ builder.AddInstruction(HloInstruction::CreateBinary(param_shape, op, x, y));
+
+ auto hlo_module = CreateNewModuleWithFTZ(ftz_);
+ hlo_module->AddEntryComputation(builder.Build());
+ return hlo_module;
+ }
+
+ // Creates an HLO module that performs the given unary operation on some data.
+ std::unique_ptr<HloModule> CreateUnaryOpModule(HloOpcode op) {
+ HloComputation::Builder builder(TestName());
+
+ Shape param_shape = ShapeUtil::MakeShapeWithLayout(
+ F32, /*dimensions=*/{100, 100}, /*minor_to_major=*/{1, 0});
+ HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter(
+ /* parameter_number=*/0, param_shape, "x"));
+ builder.AddInstruction(HloInstruction::CreateUnary(param_shape, op, x));
+
+ auto hlo_module = CreateNewModuleWithFTZ(ftz_);
+ hlo_module->AddEntryComputation(builder.Build());
+ return hlo_module;
+ }
+
+ bool ftz_;
+};
+
+class GpuFtzEnabledTest : public GpuFtzTest {
+ public:
+ GpuFtzEnabledTest() : GpuFtzTest(/*ftz=*/true) {}
+};
+
+class GpuFtzDisabledTest : public GpuFtzTest {
+ public:
+ GpuFtzDisabledTest() : GpuFtzTest(/*ftz=*/false) {}
+};
+
+// Check that we emit mul.ftz.f32 when in ftz mode, and plain mul.f32 otherwise.
+TEST_F(GpuFtzEnabledTest, MultiplyFtz) {
+ CompileAndVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"(
+ CHECK-NOT: mul.f32
+ CHECK: mul.ftz.f32
+ CHECK-NOT: mul.f32
+ )");
+}
+TEST_F(GpuFtzDisabledTest, MultiplyFtz) {
+ CompileAndVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"(
+ CHECK-NOT: mul.ftz.f32
+ CHECK: mul.f32
+ CHECK-NOT: mul.ftz.f32
+ )");
+}
+
+// In NVPTX, exp(float) is implemented in libdevice, and consults __nvvm_reflect
+// to determine whether or not ftz is enabled. The implementation uses two
+// calls to ex2.approx. When ftz is on, we get two calls to the ftz version;
+// when ftz is off, we get one call to the ftz version and one call to the
+// regular version.
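+//
+// For reference, an illustrative sketch (register names invented, not actual
+// compiler output) of the pattern matched in the ftz-enabled case:
+//   ex2.approx.ftz.f32 %f2, %f1;
+//   ...
+//   ex2.approx.ftz.f32 %f4, %f3;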
+TEST_F(GpuFtzEnabledTest, ExpFtz) {
+ CompileAndVerifyPtx(CreateUnaryOpModule(HloOpcode::kExp), R"(
+ CHECK-NOT: ex2.approx.f32
+ CHECK: ex2.approx.ftz.f32
+ CHECK-NOT: ex2.approx.f32
+ CHECK: ex2.approx.ftz.f32
+ CHECK-NOT: ex2.approx.f32
+ CHECK-NOT: ex2.approx.ftz.f32
+ )");
+}
+
+TEST_F(GpuFtzDisabledTest, ExpFtz) {
+ CompileAndVerifyPtx(CreateUnaryOpModule(HloOpcode::kExp), R"(
+ CHECK-NOT: ex2.approx.f32
+ CHECK-DAG: ex2.approx.ftz.f32
+ CHECK-DAG: ex2.approx.f32
+ CHECK-NOT: ex2.approx.f32
+ CHECK-NOT: ex2.approx.ftz.f32
+ )");
+}
+
+} // namespace
+} // namespace gpu
+} // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_fusion_test.cc
new file mode 100644
index 0000000000..674b436a8e
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_fusion_test.cc
@@ -0,0 +1,59 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <utility>
+
+#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+class GpuFusionTest : public GpuCodegenTest {};
+
+TEST_F(GpuFusionTest, FusedReshape) {
+ const char* hlo_text = R"(
+ HloModule test_module
+
+ fused_computation {
+ p0.param_0 = f32[4,1,1]{2,1,0} parameter(0)
+ p1.param_1 = f32[4,1]{1,0} parameter(1)
+ reshape = f32[4,1]{1,0} reshape(p0.param_0)
+ ROOT add = f32[4,1] add(reshape, p1.param_1)
+ }
+
+ ENTRY BroadcastIntoAdd {
+ p0 = f32[4,1,1]{2,1,0} parameter(0)
+ p1 = f32[4,1]{1,0} parameter(1)
+ ROOT fusion = f32[4,1]{1,0} fusion(p0, p1), kind=kLoop,
+ calls=fused_computation
+ }
+)";
+
+ CompileAndVerifyIr(hlo_text,
+ R"(
+; CHECK-LABEL: @fusion
+; CHECK: fadd
+; CHECK: }
+ )");
+}
+
+} // namespace
+} // namespace gpu
+} // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc
new file mode 100644
index 0000000000..e5958165ef
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc
@@ -0,0 +1,147 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <utility>
+
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/xla.pb.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace gpu {
+
+// This file tests the index expressions used to reference source tensors.
+// When the destination and source tensors have compatible shapes, the source
+// tensor is accessed through the linear index directly; otherwise it is
+// accessed through dimensional indices computed from the linear index.
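+//
+// As a worked example, for the f32[5,7,2] shapes used below, a linear index
+// i in [0, 70) corresponds to the dimensional indices
+//   (i / 14, (i % 14) / 2, i % 2),
+// and recovering them is what produces the udiv/urem instructions these
+// tests look for.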
+
+class GpuIndexTest : public GpuCodegenTest {};
+
+TEST_F(GpuIndexTest, CompatibleUseLinearIndex) {
+ HloComputation::Builder builder(TestName());
+
+ auto param_shape = ShapeUtil::MakeShape(F32, {5, 7, 2});
+ HloInstruction* param_x = builder.AddInstruction(
+ HloInstruction::CreateParameter(0, param_shape, "x"));
+ HloInstruction* param_y = builder.AddInstruction(
+ HloInstruction::CreateParameter(1, param_shape, "y"));
+ builder.AddInstruction(HloInstruction::CreateBinary(
+ ShapeUtil::MakeShape(PRED, {5, 7, 2}), HloOpcode::kGe, param_x, param_y));
+
+ auto hlo_module = CreateNewModule();
+ hlo_module->AddEntryComputation(builder.Build());
+
+  // Check the optimized IR, as the unoptimized IR contains dead udiv and urem.
+ CompileAndVerifyIr(std::move(hlo_module),
+ R"(
+; CHECK-NOT: udiv
+; CHECK-NOT: urem
+ )",
+ /*match_optimized_ir=*/true);
+}
+
+TEST_F(GpuIndexTest, CompatibleUseLinearIndexWithReshape) {
+ HloModuleConfig config;
+ config.set_debug_options(HloTestBase::GetDebugOptionsForTest());
+ auto module = ParseHloString(R"(
+ HloModule test_module
+
+ ENTRY CompatibleUseLinearIndexWithReshape {
+ x = f32[5,7,2]{2,1,0} parameter(0)
+ y = f32[5,14]{1,0} parameter(1)
+ reshape = f32[5,7,2]{2,1,0} reshape(y)
+ ROOT gte = pred[5,7,2]{2,1,0} greater-than-or-equal-to(x, reshape)
+ })",
+ config)
+ .ValueOrDie();
+
+  // Check the optimized IR, as the unoptimized IR contains dead udiv and urem.
+ CompileAndVerifyIr(std::move(module),
+ R"(
+; CHECK-NOT: udiv
+; CHECK-NOT: urem
+ )",
+ /*match_optimized_ir=*/true);
+}
+
+TEST_F(GpuIndexTest, CompatibleUseLinearIndexWithReshapeAndBroadcast) {
+ HloModuleConfig config;
+ config.set_debug_options(HloTestBase::GetDebugOptionsForTest());
+ auto module = ParseHloString(R"(
+ HloModule test_module
+
+ ENTRY CompatibleUseLinearIndexWithReshape {
+ x = f32[5,7,2]{2,1,0} parameter(0)
+ y = f32[14]{0} parameter(1)
+ reshape = f32[7,2]{1,0} reshape(y)
+ broadcast = f32[5,7,2]{2,1,0} broadcast(reshape), dimensions={1,2}
+ ROOT gte = pred[5,7,2]{2,1,0} greater-than-or-equal-to(x, broadcast)
+ })",
+ config)
+ .ValueOrDie();
+
+  // Check that the optimized IR reuses the linear index by computing it
+  // modulo 14.
+ CompileAndVerifyIr(std::move(module),
+ R"(
+; CHECK: %[[urem1:.*]] = urem i{{[0-9]*}} %[[linear_index:.*]], 14
+; CHECK: %[[bitcast:.*]] = bitcast i8 addrspace(1)* %[[alloc:.*]] to float addrspace(1)*
+; CHECK: %[[idx1:.*]] = zext i{{[0-9]*}} %[[urem1]] to i64
+; CHECK: getelementptr inbounds float, float addrspace(1)* %[[bitcast]], i64 %[[idx1]]
+ )",
+ /*match_optimized_ir=*/true);
+}
+
+TEST_F(GpuIndexTest, CompatibleUseLinearIndexWithSizeOneDimensions) {
+ HloModuleConfig config;
+ auto debug_options = HloTestBase::GetDebugOptionsForTest();
+ debug_options.set_xla_gpu_max_kernel_unroll_factor(1);
+ config.set_debug_options(debug_options);
+
+ auto module = ParseHloString(R"(
+ HloModule test_module
+
+ ENTRY CompatibleUseLinearIndexWithSizeOneDimensions {
+ x = f32[1,1024,1,256]{3,2,1,0} parameter(0)
+ ROOT y = f16[1,1024,1,256]{2,3,1,0} convert(x)
+ })",
+ config)
+ .ValueOrDie();
+
+ // Check that the unoptimized IR reuses the linear index.
+ CompileAndVerifyIr(std::move(module),
+ R"(
+; CHECK-LABEL: @fusion
+; CHECK: udiv i32 %[[linear_index:.*]], 262144
+; CHECK: %[[ld_addr:.*]] = getelementptr inbounds float, float* {{.*}}, i32 %[[linear_index]]
+; CHECK: load float, float* %[[ld_addr]]
+; CHECK: %[[st_addr:.*]] = getelementptr inbounds half, half* {{.*}}, i32 %[[linear_index]]
+; CHECK: store half {{.*}}, half* %[[st_addr]]
+ )",
+ /*match_optimized_ir=*/false);
+}
+
+} // namespace gpu
+} // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
new file mode 100644
index 0000000000..cca35316f0
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
@@ -0,0 +1,177 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <utility>
+
+#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+class GpuKernelTilingTest : public GpuCodegenTest {
+ protected:
+ GpuKernelTilingTest() {
+    auto debug_options = HloTestBase::GetDebugOptionsForTest();
+    // Disable layout_assignment so that the tests use the preassigned
+    // layouts.
+    debug_options.add_xla_disable_hlo_passes("layout_assignment");
+    // Set the options only after modifying them, since set_debug_options
+    // copies debug_options into config_.
+    config_.set_debug_options(debug_options);
+ }
+ HloModuleConfig config_;
+};
+
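+// Note on the check pattern: a tiled transpose stages elements through a
+// shared-memory tile and must synchronize the thread block between the
+// cooperative write and read, which shows up in the IR as a call to
+// llvm.nvvm.barrier0 (__syncthreads). Its presence or absence is used below
+// as a proxy for whether the copy was tiled.
+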
+TEST_F(GpuKernelTilingTest, UnnestedTransposeWithProperDimensionsTiled) {
+ const char *const kHloString = R"(
+ HloModule unnested_transpose_1
+
+ ENTRY unnested_transpose_1 {
+ para0 = f16[32,3,64]{2,1,0} parameter(0)
+ ROOT copy1 = f16[32,3,64]{1,0,2} copy(para0)
+ })";
+
+ // Check that a call to llvm.nvvm.barrier0 is generated.
+ auto hlo_module = ParseHloString(kHloString, config_).ValueOrDie();
+ CompileAndVerifyIr(std::move(hlo_module),
+ R"(
+; CHECK-LABEL: define void @copy
+; CHECK: tail call void @llvm.nvvm.barrier0()
+; CHECK: }
+)",
+ /*match_optimized_ir=*/true);
+
+ // Check that the kernel runs correctly.
+ EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0.0}));
+}
+
+TEST_F(GpuKernelTilingTest, UnnestedTransposeWithSmallDimensionsNotTiled) {
+ const char *const kHloString = R"(
+ HloModule unnested_transpose_2
+
+ ENTRY unnested_transpose_2 {
+ para0 = f16[2,3,64]{2,1,0} parameter(0)
+ ROOT copy1 = f16[2,3,64]{1,0,2} copy(para0)
+ })";
+
+ // Check that a call to llvm.nvvm.barrier0 is not generated.
+ auto hlo_module = ParseHloString(kHloString, config_).ValueOrDie();
+ CompileAndVerifyIr(std::move(hlo_module),
+ R"(
+; CHECK-LABEL: define void @copy
+; CHECK-NOT: tail call void @llvm.nvvm.barrier0()
+; CHECK: }
+)",
+ /*match_optimized_ir=*/true);
+}
+
+TEST_F(GpuKernelTilingTest, SimpleFusionWithTransposeTiled) {
+ const char *const kHloString = R"(
+ HloModule multiple_output_fusion_1
+ fused_computation.1 {
+ param0 = f32[4,5,6,7,8]{4,3,2,1,0} parameter(0)
+ copy = f32[4,5,6,7,8]{2,1,4,3,0} copy(param0)
+ ROOT convert = f16[4,5,6,7,8]{2,1,4,3,0} convert(copy)
+ }
+
+ ENTRY copy_in_fusion_run_without_hlo_passes {
+ para0 = f32[4,5,6,7,8]{4,3,2,1,0} parameter(0)
+ ROOT fusion.1 = f16[4,5,6,7,8]{2,1,4,3,0} fusion(para0), kind=kLoop,
+ calls=fused_computation.1
+ })";
+
+ // Check that a call to llvm.nvvm.barrier0 is generated.
+ auto hlo_module = ParseHloString(kHloString, config_).ValueOrDie();
+ CompileAndVerifyIr(std::move(hlo_module),
+ R"(
+; CHECK-LABEL: define void @fusion
+; CHECK: tail call void @llvm.nvvm.barrier0()
+; CHECK: }
+)",
+ /*match_optimized_ir=*/true);
+
+ // Check that the kernel runs correctly.
+ EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0.0}));
+}
+
+TEST_F(GpuKernelTilingTest, MultipleOutputFusionWithOnePossibleTransposeTiled) {
+ const char *const kHloString = R"(
+ HloModule multiple_output_fusion_1
+ fused_computation.1 {
+ param0 = f16[8,31,31,65]{3,2,1,0} parameter(0)
+ param1 = f16[8,31,31,65]{3,2,1,0} parameter(1)
+ copy0 = f16[8,31,31,65]{2,1,3,0} copy(param0)
+ copy1 = f16[8,31,31,65]{2,1,3,0} copy(param1)
+ ROOT tuple1 = (f16[8,31,31,65]{2,1,3,0}, f16[8,31,31,65]{2,1,3,0})
+ tuple(copy0, copy1)
+ }
+
+ ENTRY multiple_output_fusion_1 {
+ para0 = f16[8,31,31,65]{3,2,1,0} parameter(0)
+ para1 = f16[8,31,31,65]{3,2,1,0} parameter(1)
+ ROOT fusion.1 = (f16[8,31,31,65]{2,1,3,0}, f16[8,31,31,65]{2,1,3,0})
+ fusion(para0,para1), kind=kLoop, calls=fused_computation.1
+ })";
+
+ // Check that a call to llvm.nvvm.barrier0 is generated.
+ auto hlo_module = ParseHloString(kHloString, config_).ValueOrDie();
+ CompileAndVerifyIr(std::move(hlo_module),
+ R"(
+; CHECK-LABEL: define void @fusion
+; CHECK: tail call void @llvm.nvvm.barrier0()
+; CHECK: }
+)",
+ /*match_optimized_ir=*/true);
+
+ // Check that the kernel runs correctly.
+ EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0.0}));
+}
+
+TEST_F(GpuKernelTilingTest,
+ MultipleOutputFusionWithTwoPossibleTransposesNotTiled) {
+ const char *const kHloString = R"(
+ HloModule multiple_output_fusion_2
+ fused_computation.1 {
+ param0 = f16[8,31,31,65]{3,2,1,0} parameter(0)
+ param1 = f16[8,31,31,65]{1,3,2,0} parameter(1)
+ copy2 = f16[8,31,31,65]{2,1,3,0} copy(param0)
+ copy3 = f16[8,31,31,65]{2,1,3,0} copy(param1)
+ ROOT tuple1 = (f16[8,31,31,65]{2,1,3,0}, f16[8,31,31,65]{2,1,3,0})
+ tuple(copy2, copy3)
+ }
+
+ ENTRY multiple_output_fusion_2 {
+ para0 = f16[8,31,31,65]{3,2,1,0} parameter(0)
+ para1 = f16[8,31,31,65]{1,3,2,0} parameter(1)
+ ROOT fusion1 = (f16[8,31,31,65]{2,1,3,0}, f16[8,31,31,65]{2,1,3,0})
+ fusion(para0,para1), kind=kLoop, calls=fused_computation.1
+ })";
+
+ // Check that a call to llvm.nvvm.barrier0 is not generated.
+ auto hlo_module = ParseHloString(kHloString, config_).ValueOrDie();
+ CompileAndVerifyIr(std::move(hlo_module),
+ R"(
+; CHECK-LABEL: define void @fusion
+; CHECK-NOT: tail call void @llvm.nvvm.barrier0()
+; CHECK: }
+)",
+ /*match_optimized_ir=*/true);
+}
+
+} // namespace
+} // namespace gpu
+} // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc
new file mode 100644
index 0000000000..6c9ae7bada
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc
@@ -0,0 +1,141 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Tests that we emit ld.global.nc (the PTX instruction corresponding to CUDA's
+// __ldg builtin) for reads of buffers that don't change during a kernel's
+// execution.
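+//
+// As a rough CUDA-level analogy (illustrative only, not part of the tests),
+// a kernel that reads through __ldg, such as
+//   __global__ void f(const float* in, float* out) {
+//     out[threadIdx.x] = __ldg(&in[threadIdx.x]);
+//   }
+// compiles to an ld.global.nc load, which is the pattern these tests match
+// in the XLA-generated PTX.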
+
+#include <memory>
+#include <utility>
+
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace gpu {
+
+class GpuLdgTest : public GpuCodegenTest {};
+
+// Parameters are never overwritten, so reads of parameters should be
+// emitted as ld.global.nc loads.
+TEST_F(GpuLdgTest, LdgForParamRead) {
+ HloComputation::Builder builder(TestName());
+
+ auto shape = ShapeUtil::MakeShape(F32, {2, 2});
+ HloInstruction* param =
+ builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "x"));
+ builder.AddInstruction(
+ HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param, param));
+ std::unique_ptr<HloComputation> computation = builder.Build();
+
+ auto hlo_module = CreateNewModule();
+ hlo_module->AddEntryComputation(std::move(computation));
+
+ CompileAndVerifyPtx(std::move(hlo_module), R"(
+ CHECK-NOT: ld.global.f32
+ CHECK: ld.global.nc.f32
+ )");
+}
+
+// Check that reading a buffer produced by a non-parameter HLO also results in
+// ld.global.nc, if that buffer isn't modified within the instruction that reads
+// it.
+TEST_F(GpuLdgTest, LdgForNonParamRead) {
+ HloComputation::Builder builder(TestName());
+
+ auto shape = ShapeUtil::MakeShape(F32, {2, 2});
+ HloInstruction* param =
+ builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "x"));
+ HloInstruction* add = builder.AddInstruction(
+ HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param, param));
+ HloInstruction* square = builder.AddInstruction(
+ HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, add, add));
+ builder.AddInstruction(HloInstruction::CreateTuple({add, square}));
+ std::unique_ptr<HloComputation> computation = builder.Build();
+
+ auto hlo_module = CreateNewModule();
+ hlo_module->AddEntryComputation(std::move(computation));
+
+ CompileAndVerifyPtx(std::move(hlo_module), R"(
+ CHECK: {
+ CHECK-NOT: ld.global.f32
+ CHECK: ld.global.nc.f32
+ CHECK: }
+ )");
+}
+
+// Check that reading a buffer that's modified in-place does not produce
+// ld.global.nc.
+//
+// We do this by creating a reduce that feeds into a sin. We don't currently
+// fuse sin into reduce, and the sin is elementwise, so it reuses its input
+// buffer as its output.
+//
+// It seems like a fair bet that we won't start fusing sin into the output of
+// reduce in the foreseeable future. But if that turns out to be wrong, I give
+// you, future reader, permission to delete this test.
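+//
+// Schematically, the computation built below is
+//   x -> add(x, x) -> reduce(add, dim 0) -> sin
+// and buffer assignment may alias the reduce output with the sin output, so
+// the sin kernel's input load cannot safely go through ld.global.nc.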
+TEST_F(GpuLdgTest, NoLdgWhenSharingBuffer) {
+ auto hlo_module = CreateNewModule();
+ HloComputation::Builder builder(TestName());
+
+ HloComputation* reduce_computation;
+ {
+ auto embedded_builder = HloComputation::Builder("add");
+ auto lhs = embedded_builder.AddInstruction(HloInstruction::CreateParameter(
+ 0, ShapeUtil::MakeShape(F32, {}), "lhs"));
+ auto rhs = embedded_builder.AddInstruction(HloInstruction::CreateParameter(
+ 1, ShapeUtil::MakeShape(F32, {}), "rhs"));
+ embedded_builder.AddInstruction(
+ HloInstruction::CreateBinary(lhs->shape(), HloOpcode::kAdd, lhs, rhs));
+ reduce_computation =
+ hlo_module->AddEmbeddedComputation(embedded_builder.Build());
+ }
+
+ auto param_shape = ShapeUtil::MakeShape(F32, {2, 2});
+ auto reduce_shape = ShapeUtil::MakeShape(F32, {2});
+ HloInstruction* param = builder.AddInstruction(
+ HloInstruction::CreateParameter(0, param_shape, "x"));
+ HloInstruction* reduce = builder.AddInstruction(HloInstruction::CreateReduce(
+ reduce_shape,
+ builder.AddInstruction(HloInstruction::CreateBinary(
+ param_shape, HloOpcode::kAdd, param, param)),
+ builder.AddInstruction(
+ HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0))),
+ {0}, reduce_computation));
+ builder.AddInstruction(
+ HloInstruction::CreateUnary(reduce_shape, HloOpcode::kSin, reduce));
+
+ std::unique_ptr<HloComputation> computation = builder.Build();
+ hlo_module->AddEntryComputation(std::move(computation));
+
+ CompileAndVerifyPtx(std::move(hlo_module), R"(
+ CHECK-LABEL: .entry sin
+ CHECK: {
+ CHECK-NOT: ld.global.nc.f32
+ CHECK: ld.global.f32
+ CHECK: }
+ )");
+}
+
+} // namespace gpu
+} // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc
new file mode 100644
index 0000000000..c42e5704a4
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc
@@ -0,0 +1,68 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <utility>
+
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace gpu {
+
+class GpuNoAliasTest : public GpuCodegenTest {};
+
+TEST_F(GpuNoAliasTest, Concat) {
+ HloComputation::Builder builder(TestName());
+
+ auto param_shape = ShapeUtil::MakeShape(F32, {2, 2});
+ HloInstruction* param_x = builder.AddInstruction(
+ HloInstruction::CreateParameter(0, param_shape, "x"));
+ HloInstruction* param_y = builder.AddInstruction(
+ HloInstruction::CreateParameter(1, param_shape, "y"));
+ HloInstruction* concat =
+ builder.AddInstruction(HloInstruction::CreateConcatenate(
+ ShapeUtil::MakeShape(F32, {2, 4}), {param_x, param_y}, 1));
+ builder.AddInstruction(HloInstruction::CreateConcatenate(
+ ShapeUtil::MakeShape(F32, {2, 6}), {concat, param_x}, 1));
+
+ std::unique_ptr<HloComputation> computation = builder.Build();
+
+ auto hlo_module = CreateNewModule();
+ hlo_module->AddEntryComputation(std::move(computation));
+
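+  // The loads from the x and y parameters below should carry !noalias
+  // metadata, and the store to the fusion output should carry a matching
+  // !alias.scope entry, i.e. alias analysis proves that the parameter
+  // buffers and the result buffer are distinct.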
+ CompileAndVerifyIr(std::move(hlo_module),
+ R"(
+; CHECK: %[[x_gep:.*]] = getelementptr inbounds [2 x [2 x float]], [2 x [2 x float]]* %x{{.*}}, i32 0
+; CHECK: load float, float* %[[x_gep]], {{.*}}, !noalias ![[param_noalias:.*]]
+; CHECK: %[[y_gep:.*]] = getelementptr inbounds [2 x [2 x float]], [2 x [2 x float]]* %y{{.*}}, i32 0
+; CHECK: load float, float* %[[y_gep]], {{.*}}, !noalias ![[param_noalias]]
+; CHECK: %[[result_ptr:.*]] = bitcast [2 x [6 x float]]* %fusion{{.*}} to float*
+; CHECK: %[[result_gep:.*]] = getelementptr inbounds float, float* %[[result_ptr]]
+; CHECK: store float {{.*}}, float* %[[result_gep]], !alias.scope ![[param_noalias]]
+; CHECK: ![[param_noalias]] = !{![[retval_buffer:.*]]}
+ )",
+ /*match_optimized_ir=*/false);
+}
+
+} // namespace gpu
+} // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_unrolling_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_unrolling_test.cc
new file mode 100644
index 0000000000..9622936306
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_unrolling_test.cc
@@ -0,0 +1,185 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <utility>
+
+#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+class GpuUnrollingTest : public GpuCodegenTest {};
+
+const char *const kAddModule = R"(
+ HloModule test_module
+
+ fused_computation {
+ p0.param_0 = f32[2,2]{1,0} parameter(0)
+ p1.param_1 = f32[2,2]{1,0} parameter(1)
+ ROOT add = f32[2,2] add(p0.param_0, p1.param_1)
+ }
+
+ ENTRY BroadcastIntoAdd {
+ p0 = f32[2,2]{1,0} parameter(0)
+ p1 = f32[2,2]{1,0} parameter(1)
+ ROOT fusion = f32[2,2]{1,0} fusion(p0, p1), kind=kLoop,
+ calls=fused_computation
+ })";
+
+TEST_F(GpuUnrollingTest, DoNotUnroll) {
+ HloModuleConfig config;
+ auto debug_options = HloTestBase::GetDebugOptionsForTest();
+ debug_options.set_xla_gpu_max_kernel_unroll_factor(1);
+ config.set_debug_options(debug_options);
+ auto hlo_module = ParseHloString(kAddModule, config).ValueOrDie();
+
+ CompileAndVerifyIr(std::move(hlo_module),
+ R"(
+; CHECK-LABEL: @fusion
+; CHECK: fadd
+; CHECK-NOT: fadd
+; CHECK: }
+ )",
+ /*match_optimized_ir=*/true);
+}
+
+TEST_F(GpuUnrollingTest, UnrollFourTimes) {
+ HloModuleConfig config;
+ auto debug_options = HloTestBase::GetDebugOptionsForTest();
+ // We request a factor of 8, but the computation works on 4 elements, limiting
+ // the maximum unroll factor.
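+  // (The 2x2 output has 4 elements; the unroll factor in use divides the
+  // element count, so the effective factor is 4 and we expect exactly 4
+  // fadds.)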
+ debug_options.set_xla_gpu_max_kernel_unroll_factor(8);
+ config.set_debug_options(debug_options);
+ auto hlo_module = ParseHloString(kAddModule, config).ValueOrDie();
+
+ CompileAndVerifyIr(std::move(hlo_module),
+ R"(
+; CHECK-LABEL: @fusion
+; CHECK: fadd
+; CHECK: fadd
+; CHECK: fadd
+; CHECK: fadd
+; CHECK-NOT: fadd
+; CHECK: }
+ )",
+ /*match_optimized_ir=*/true);
+}
+
+TEST_F(GpuUnrollingTest, UnrollDefaultTimes) {
+ // The default unrolling factor is 4.
+ HloModuleConfig config;
+ config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
+ auto hlo_module = ParseHloString(kAddModule, config).ValueOrDie();
+
+ CompileAndVerifyIr(std::move(hlo_module),
+ R"(
+; CHECK-LABEL: @fusion
+; CHECK: load <4 x float>
+; CHECK: fadd
+; CHECK: fadd
+; CHECK: fadd
+; CHECK: fadd
+; CHECK-NOT: fadd
+; CHECK: store <4 x float>
+; CHECK: }
+ )",
+ /*match_optimized_ir=*/true);
+}
+
+TEST_F(GpuUnrollingTest, UnrollUnfusedAdd) {
+ HloModuleConfig config;
+ auto debug_options = HloTestBase::GetDebugOptionsForTest();
+ debug_options.set_xla_gpu_max_kernel_unroll_factor(4);
+ config.set_debug_options(debug_options);
+
+ const char *const kUnfusedAddModule = R"(
+ HloModule test_module
+
+ ENTRY AddFunc {
+ p0 = f32[2,2]{1,0} parameter(0)
+ p1 = f32[2,2]{1,0} parameter(1)
+ ROOT add = f32[2,2]{1,0} add(p0, p1)
+ })";
+ auto hlo_module = ParseHloString(kUnfusedAddModule, config).ValueOrDie();
+
+ CompileAndVerifyIr(std::move(hlo_module),
+ R"(
+; CHECK-LABEL: @add
+; CHECK: load <4 x float>
+; CHECK: fadd
+; CHECK: fadd
+; CHECK: fadd
+; CHECK: fadd
+; CHECK-NOT: fadd
+; CHECK: store <4 x float>
+; CHECK: }
+ )",
+ /*match_optimized_ir=*/true);
+}
+
+TEST_F(GpuUnrollingTest, UnrollMultiOutputFusion) {
+ HloModuleConfig config;
+ auto debug_options = HloTestBase::GetDebugOptionsForTest();
+ debug_options.set_xla_gpu_max_kernel_unroll_factor(2);
+ config.set_debug_options(debug_options);
+
+ const char *const kMultiOutputFusionModule = R"(
+ HloModule test_module
+
+ fused_computation {
+ p0.param_0 = f32[2,2]{1,0} parameter(0)
+ p1.param_1 = f32[2,2]{1,0} parameter(1)
+ add = f32[2,2]{1,0} add(p0.param_0, p1.param_1)
+ mul = f32[2,2]{1,0} multiply(p0.param_0, p1.param_1)
+ ROOT tuple = (f32[2,2]{1,0}, f32[2,2]{1,0}) tuple(add, mul)
+ }
+
+ ENTRY BroadcastIntoAdd {
+ p0 = f32[2,2]{1,0} parameter(0)
+ p1 = f32[2,2]{1,0} parameter(1)
+ ROOT fusion = (f32[2,2]{1,0}, f32[2,2]{1,0}) fusion(p0, p1), kind=kLoop,
+ calls=fused_computation
+ })";
+ auto hlo_module =
+ ParseHloString(kMultiOutputFusionModule, config).ValueOrDie();
+
+ CompileAndVerifyIr(std::move(hlo_module),
+ R"(
+; CHECK-LABEL: @fusion
+; CHECK: load <2 x float>
+; CHECK: load <2 x float>
+; CHECK-NOT: load <2 x float>
+; CHECK: fadd
+; CHECK: fmul
+; CHECK: fadd
+; CHECK: fmul
+; CHECK: store <2 x float>
+; CHECK: store <2 x float>
+; CHECK-NOT: store <2 x float>
+; CHECK-NOT: fadd
+; CHECK-NOT: fmul
+; CHECK: }
+ )",
+ /*match_optimized_ir=*/true);
+}
+
+} // namespace
+} // namespace gpu
+} // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/tests/infeed_test.cc b/tensorflow/compiler/xla/service/gpu/tests/infeed_test.cc
new file mode 100644
index 0000000000..ba5cd2d84d
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/infeed_test.cc
@@ -0,0 +1,121 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <unistd.h>
+#include <memory>
+
+#include "tensorflow/compiler/xla/client/global_data.h"
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/math/math_util.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace {
+
+class InfeedTest : public ClientLibraryTestBase {
+ protected:
+  // Transfers the given literal to the infeed interface of the device, and
+  // checks that the data produced by the Infeed HLO matches the literal.
+ void TestInfeedRoundTrip(const Literal& literal) {
+ // TODO(b/30481585) Explicitly reset the Infeed state so that the
+ // test is not affected by the state from the previous tests.
+ ASSERT_IS_OK(client_->TransferToInfeed(literal));
+ XlaBuilder builder(TestName());
+ Infeed(&builder, literal.shape());
+ if (ShapeUtil::IsTuple(literal.shape())) {
+ // TODO(b/30609564): Use ComputeAndCompareLiteral instead.
+ ComputeAndCompareTuple(&builder, literal, {});
+ } else {
+ ComputeAndCompareLiteral(&builder, literal, {});
+ }
+ }
+};
+
+TEST_F(InfeedTest, SingleInfeedR0Bool) {
+ TestInfeedRoundTrip(*LiteralUtil::CreateR0<bool>(true));
+}
+
+TEST_F(InfeedTest, SingleInfeedR1U32) {
+ TestInfeedRoundTrip(*LiteralUtil::CreateR1<uint32>({1, 2, 3}));
+}
+
+TEST_F(InfeedTest, SingleInfeedR2F32) {
+ TestInfeedRoundTrip(*LiteralUtil::CreateR2F32Linspace(0.0, 1.0, 128, 64));
+}
+
+TEST_F(InfeedTest, SingleInfeedR3F32) {
+ TestInfeedRoundTrip(
+ *LiteralUtil::CreateR3({{{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}},
+ {{1.1f, 2.1f, 3.1f}, {6.1f, 3.5f, 2.8f}}}));
+}
+
+TEST_F(InfeedTest, SingleInfeedR3F32DifferentLayout) {
+ const Layout r3_dim0minor = LayoutUtil::MakeLayout({0, 1, 2});
+ const Layout r3_dim0major = LayoutUtil::MakeLayout({2, 1, 0});
+
+ TestInfeedRoundTrip(*LiteralUtil::CreateR3WithLayout(
+ {{{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}},
+ {{1.1f, 2.1f, 3.1f}, {6.1f, 3.5f, 2.8f}}},
+ r3_dim0minor));
+
+ TestInfeedRoundTrip(*LiteralUtil::CreateR3WithLayout(
+ {{{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}},
+ {{1.1f, 2.1f, 3.1f}, {6.1f, 3.5f, 2.8f}}},
+ r3_dim0major));
+}
+
+TEST_F(InfeedTest, SingleInfeedR4S32) {
+ TestInfeedRoundTrip(*LiteralUtil::CreateR4(
+ {{{{1, -2}, {-4, 5}, {6, 7}}, {{8, 9}, {10, 11}, {12, 13}}},
+ {{{10, 3}, {7, -2}, {3, 6}}, {{2, 5}, {-11, 5}, {-2, -5}}}}));
+}
+
+// Tests that a large infeed can be handled.
+TEST_F(InfeedTest, LargeInfeed) {
+ Array4D<float> array(80, 100, 8, 128);
+ array.FillIota(1.0f);
+ TestInfeedRoundTrip(*LiteralUtil::CreateR4FromArray4D<float>(array));
+}
+
+TEST_F(InfeedTest, SingleInfeedTuple) {
+ TestInfeedRoundTrip(
+ *LiteralUtil::MakeTuple({LiteralUtil::CreateR1<uint32>({1, 2, 3}).get(),
+ LiteralUtil::CreateR0<bool>(false).get()}));
+}
+
+TEST_F(InfeedTest, SingleInfeedEmptyTuple) {
+ TestInfeedRoundTrip(*LiteralUtil::MakeTuple({}));
+}
+
+// Tests that a large tuple infeed can be handled.
+TEST_F(InfeedTest, SingleInfeedLargeTuple) {
+ Array4D<float> array(40, 100, 8, 128);
+ array.FillIota(1.0f);
+ TestInfeedRoundTrip(*LiteralUtil::MakeTuple(
+ {LiteralUtil::CreateR4FromArray4D<float>(array).get(),
+ LiteralUtil::CreateR0<int32>(5).get()}));
+}
+
+} // namespace
+} // namespace xla