diff options
author | Benjamin Kramer <kramerb@google.com> | 2018-10-09 14:19:07 -0700 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2018-10-09 14:28:11 -0700 |
commit | fa1542234857acf56af6e7f0dbe8d2084a18fa00 (patch) | |
tree | 1254448bf59e0fc3330d421059f53e0258dc56b6 /tensorflow/compiler/xla/service/gpu/ir_emitter.cc | |
parent | b145f46b735fe1e383be6629cafaa5269b07b7fb (diff) |
[XLA:GPU] Pattern match atomic "apply" into an atomic store
Otherwise we'd emit a CAS loop.
PiperOrigin-RevId: 216421161
Diffstat (limited to 'tensorflow/compiler/xla/service/gpu/ir_emitter.cc')
-rw-r--r-- | tensorflow/compiler/xla/service/gpu/ir_emitter.cc | 15 |
1 file changed, 15 insertions(+), 0 deletions(-)
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
index b7c37bcf3c..47102347cb 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
@@ -179,6 +179,21 @@ bool IrEmitter::MaybeEmitDirectAtomicOperation(
   bool is_atomic_integral = element_type == S32 || element_type == U32 ||
                             element_type == S64 || element_type == U64;
   llvm::Value* source = Load(source_address, "source");
+
+  // kCopy of RHS -> atomic store.
+  if (root_opcode == HloOpcode::kCopy &&
+      (element_type == F32 || is_atomic_integral) &&
+      computation.root_instruction()->operand(0)->opcode() ==
+          HloOpcode::kParameter &&
+      computation.root_instruction()->operand(0)->parameter_number() == 1) {
+    llvm::StoreInst* store = Store(source, output_address);
+    store->setAtomic(llvm::AtomicOrdering::Unordered);
+    // Derive a minimum alignment from the type. The optimizer can increase it
+    // later.
+    store->setAlignment(ShapeUtil::ByteSizeOfPrimitiveType(element_type));
+    return true;
+  }
+
   if (root_opcode == HloOpcode::kAdd) {
     // NVPTX supports atomicAdd on F32 and integer types.
     if (element_type == F32) {