about summary refs log tree commit diff homepage
diff options
context:
space:
mode:
authorGravatar Benjamin Kramer <kramerb@google.com>2018-10-09 14:19:07 -0700
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2018-10-09 14:28:11 -0700
commitfa1542234857acf56af6e7f0dbe8d2084a18fa00 (patch)
tree1254448bf59e0fc3330d421059f53e0258dc56b6
parentb145f46b735fe1e383be6629cafaa5269b07b7fb (diff)
[XLA:GPU] Pattern match atomic "apply" into an atomic store
Otherwise we'd emit a CAS loop. PiperOrigin-RevId: 216421161
-rw-r--r--tensorflow/compiler/xla/service/gpu/ir_emitter.cc15
-rw-r--r--tensorflow/compiler/xla/service/gpu/tests/BUILD12
-rw-r--r--tensorflow/compiler/xla/service/gpu/tests/gpu_atomic_test.cc58
3 files changed, 85 insertions, 0 deletions
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
index b7c37bcf3c..47102347cb 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
@@ -179,6 +179,21 @@ bool IrEmitter::MaybeEmitDirectAtomicOperation(
bool is_atomic_integral = element_type == S32 || element_type == U32 ||
element_type == S64 || element_type == U64;
llvm::Value* source = Load(source_address, "source");
+
+ // kCopy of RHS -> atomic store.
+ if (root_opcode == HloOpcode::kCopy &&
+ (element_type == F32 || is_atomic_integral) &&
+ computation.root_instruction()->operand(0)->opcode() ==
+ HloOpcode::kParameter &&
+ computation.root_instruction()->operand(0)->parameter_number() == 1) {
+ llvm::StoreInst* store = Store(source, output_address);
+ store->setAtomic(llvm::AtomicOrdering::Unordered);
+ // Derive a minimum alignment from the type. The optimizer can increase it
+ // later.
+ store->setAlignment(ShapeUtil::ByteSizeOfPrimitiveType(element_type));
+ return true;
+ }
+
if (root_opcode == HloOpcode::kAdd) {
// NVPTX supports atomicAdd on F32 and integer types.
if (element_type == F32) {
diff --git a/tensorflow/compiler/xla/service/gpu/tests/BUILD b/tensorflow/compiler/xla/service/gpu/tests/BUILD
index a725533567..1f0436278c 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/tests/BUILD
@@ -223,3 +223,15 @@ tf_cc_test(
"@com_google_absl//absl/strings",
],
)
+
+tf_cc_test(
+ name = "gpu_atomic_test",
+ srcs = ["gpu_atomic_test.cc"],
+ tags = tf_cuda_tests_tags(),
+ deps = [
+ ":gpu_codegen_test",
+ "//tensorflow/compiler/xla/tests:filecheck",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ ],
+)
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_atomic_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_atomic_test.cc
new file mode 100644
index 0000000000..6b18c4c637
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_atomic_test.cc
@@ -0,0 +1,58 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <utility>
+
+#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+#include "tensorflow/compiler/xla/tests/filecheck.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+class GpuAtomicTest : public GpuCodegenTest {};
+
+TEST_F(GpuAtomicTest, TestStore) {
+ const char* hlo_string = R"(
+ HloModule TensorFlowScatterV1
+
+ update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+ lhs = s32[] parameter(0)
+ ROOT rhs = s32[] parameter(1)
+ }
+
+ ENTRY main {
+ operand = s32[3,3] parameter(0)
+ indices = s32[2] parameter(1)
+ updates = s32[2,3] parameter(2)
+ ROOT scatter = s32[3,3] scatter(operand, indices, updates),
+ to_apply=update_s32,
+ update_window_dims={1},
+ inserted_window_dims={0},
+ scatter_dims_to_operand_dims={0},
+ index_vector_dim=1
+ }
+)";
+
+ CompileAndVerifyIr(hlo_string, R"(
+CHECK: store atomic{{.*}}unordered, align 4
+)");
+}
+
+} // namespace
+} // namespace gpu
+} // namespace xla