From fa1542234857acf56af6e7f0dbe8d2084a18fa00 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Tue, 9 Oct 2018 14:19:07 -0700 Subject: [XLA:GPU] Pattern match atomic "apply" into an atomic store Otherwise we'd emit a CAS loop. PiperOrigin-RevId: 216421161 --- tensorflow/compiler/xla/service/gpu/ir_emitter.cc | 15 ++++++ tensorflow/compiler/xla/service/gpu/tests/BUILD | 12 +++++ .../xla/service/gpu/tests/gpu_atomic_test.cc | 58 ++++++++++++++++++++++ 3 files changed, 85 insertions(+) create mode 100644 tensorflow/compiler/xla/service/gpu/tests/gpu_atomic_test.cc diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc index b7c37bcf3c..47102347cb 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc @@ -179,6 +179,21 @@ bool IrEmitter::MaybeEmitDirectAtomicOperation( bool is_atomic_integral = element_type == S32 || element_type == U32 || element_type == S64 || element_type == U64; llvm::Value* source = Load(source_address, "source"); + + // kCopy of RHS -> atomic store. + if (root_opcode == HloOpcode::kCopy && + (element_type == F32 || is_atomic_integral) && + computation.root_instruction()->operand(0)->opcode() == + HloOpcode::kParameter && + computation.root_instruction()->operand(0)->parameter_number() == 1) { + llvm::StoreInst* store = Store(source, output_address); + store->setAtomic(llvm::AtomicOrdering::Unordered); + // Derive a minimum alignment from the type. The optimizer can increase it + // later. + store->setAlignment(ShapeUtil::ByteSizeOfPrimitiveType(element_type)); + return true; + } + if (root_opcode == HloOpcode::kAdd) { // NVPTX supports atomicAdd on F32 and integer types. 
if (element_type == F32) { diff --git a/tensorflow/compiler/xla/service/gpu/tests/BUILD b/tensorflow/compiler/xla/service/gpu/tests/BUILD index a725533567..1f0436278c 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/BUILD +++ b/tensorflow/compiler/xla/service/gpu/tests/BUILD @@ -223,3 +223,15 @@ tf_cc_test( "@com_google_absl//absl/strings", ], ) + +tf_cc_test( + name = "gpu_atomic_test", + srcs = ["gpu_atomic_test.cc"], + tags = tf_cuda_tests_tags(), + deps = [ + ":gpu_codegen_test", + "//tensorflow/compiler/xla/tests:filecheck", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_atomic_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_atomic_test.cc new file mode 100644 index 0000000000..6b18c4c637 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_atomic_test.cc @@ -0,0 +1,58 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include <memory> +#include <utility> + +#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h" +#include "tensorflow/compiler/xla/tests/filecheck.h" +#include "tensorflow/core/platform/test.h" + +namespace xla { +namespace gpu { +namespace { + +class GpuAtomicTest : public GpuCodegenTest {}; + +TEST_F(GpuAtomicTest, TestStore) { + const char* hlo_string = R"( + HloModule TensorFlowScatterV1 + + update_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + ROOT rhs = s32[] parameter(1) + } + + ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2] parameter(1) + updates = s32[2,3] parameter(2) + ROOT scatter = s32[3,3] scatter(operand, indices, updates), + to_apply=update_s32, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1 + } +)"; + + CompileAndVerifyIr(hlo_string, R"( +CHECK: store atomic{{.*}}unordered, align 4 +)"); +} + +} // namespace +} // namespace gpu +} // namespace xla -- cgit v1.2.3