aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc
diff options
context:
space:
mode:
Diffstat (limited to 'tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc')
-rw-r--r--tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc141
1 files changed, 141 insertions, 0 deletions
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc
new file mode 100644
index 0000000000..6c9ae7bada
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc
@@ -0,0 +1,141 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Tests that we emit ld.global.nc (the PTX instruction corresponding to CUDA's
+// __ldg builtin) for reads of buffers that don't change during a kernel's
+// execution.
+
+#include <memory>
+#include <utility>
+
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace gpu {
+
+class GpuLdgTest : public GpuCodegenTest {};
+
+// Parameters are never overwritten, so parameter reads should get ld.global.nc
+// reads.
+TEST_F(GpuLdgTest, LdgForParamRead) {
+ HloComputation::Builder builder(TestName());
+
+ auto shape = ShapeUtil::MakeShape(F32, {2, 2});
+ HloInstruction* param =
+ builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "x"));
+ builder.AddInstruction(
+ HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param, param));
+ std::unique_ptr<HloComputation> computation = builder.Build();
+
+ auto hlo_module = CreateNewModule();
+ hlo_module->AddEntryComputation(std::move(computation));
+
+ CompileAndVerifyPtx(std::move(hlo_module), R"(
+ CHECK-NOT: ld.global.f32
+ CHECK: ld.global.nc.f32
+ )");
+}
+
+// Check that reading a buffer produced by a non-parameter HLO also results in
+// ld.global.nc, if that buffer isn't modified within the instruction that reads
+// it.
+TEST_F(GpuLdgTest, LdgForNonParamRead) {
+ HloComputation::Builder builder(TestName());
+
+ auto shape = ShapeUtil::MakeShape(F32, {2, 2});
+ HloInstruction* param =
+ builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "x"));
+ HloInstruction* add = builder.AddInstruction(
+ HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param, param));
+ HloInstruction* square = builder.AddInstruction(
+ HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, add, add));
+ builder.AddInstruction(HloInstruction::CreateTuple({add, square}));
+ std::unique_ptr<HloComputation> computation = builder.Build();
+
+ auto hlo_module = CreateNewModule();
+ hlo_module->AddEntryComputation(std::move(computation));
+
+ CompileAndVerifyPtx(std::move(hlo_module), R"(
+ CHECK: {
+ CHECK-NOT: ld.global.f32
+ CHECK: ld.global.nc.f32
+ CHECK: }
+ )");
+}
+
+// Check that reading a buffer that's modified in-place does not produce
+// ld.global.nc.
+//
+// We do this by creating a reduce that feeds into a sin. We don't currently
+// fuse sin into reduce, and the sin is elementwise, so it reuses its input
+// buffer as its output.
+//
+// It seems like a fair bet that we won't start fusing sin into the output of
+// reduce in the foreseeable future. But if that turns out to be wrong, I give
+// you, future reader, permission to delete this test.
+TEST_F(GpuLdgTest, NoLdgWhenSharingBuffer) {
+ auto hlo_module = CreateNewModule();
+ HloComputation::Builder builder(TestName());
+
+ HloComputation* reduce_computation;
+ {
+ auto embedded_builder = HloComputation::Builder("add");
+ auto lhs = embedded_builder.AddInstruction(HloInstruction::CreateParameter(
+ 0, ShapeUtil::MakeShape(F32, {}), "lhs"));
+ auto rhs = embedded_builder.AddInstruction(HloInstruction::CreateParameter(
+ 1, ShapeUtil::MakeShape(F32, {}), "rhs"));
+ embedded_builder.AddInstruction(
+ HloInstruction::CreateBinary(lhs->shape(), HloOpcode::kAdd, lhs, rhs));
+ reduce_computation =
+ hlo_module->AddEmbeddedComputation(embedded_builder.Build());
+ }
+
+ auto param_shape = ShapeUtil::MakeShape(F32, {2, 2});
+ auto reduce_shape = ShapeUtil::MakeShape(F32, {2});
+ HloInstruction* param = builder.AddInstruction(
+ HloInstruction::CreateParameter(0, param_shape, "x"));
+ HloInstruction* reduce = builder.AddInstruction(HloInstruction::CreateReduce(
+ reduce_shape,
+ builder.AddInstruction(HloInstruction::CreateBinary(
+ param_shape, HloOpcode::kAdd, param, param)),
+ builder.AddInstruction(
+ HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0))),
+ {0}, reduce_computation));
+ builder.AddInstruction(
+ HloInstruction::CreateUnary(reduce_shape, HloOpcode::kSin, reduce));
+
+ std::unique_ptr<HloComputation> computation = builder.Build();
+ hlo_module->AddEntryComputation(std::move(computation));
+
+ CompileAndVerifyPtx(std::move(hlo_module), R"(
+ CHECK-LABEL: .entry sin
+ CHECK: {
+ CHECK-NOT: ld.global.nc.f32
+ CHECK: ld.global.f32
+ CHECK: }
+ )");
+}
+
+} // namespace gpu
+} // namespace xla