aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h
diff options
context:
space:
mode:
Diffstat (limited to 'tensorflow/compiler/xla/service/gpu/ir_emission_utils.h')
-rw-r--r--tensorflow/compiler/xla/service/gpu/ir_emission_utils.h16
1 files changed, 10 insertions, 6 deletions
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h
index 59455f389e..9bb4c42b15 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h
@@ -125,13 +125,17 @@ llvm::Value* EmitPrintf(tensorflow::StringPiece fmt,
llvm::IRBuilder<>* builder);
// Emits code to shuffle data between threads of a warp. This has the same
-// semantics as the PTX "shfl.down" instruction [0] but works for values of any
-// size. The last operand of the emitted "shfl" is `kWarpSize - 1`.
+// semantics as the PTX "shfl.sync.down" instruction but works for values that
+// aren't 32 bits in size. The last operand of the emitted "shfl" is
+// `kWarpSize - 1`.
//
-// [0]
-// http://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-shfl
-llvm::Value* EmitShuffleDown(llvm::Value* value, llvm::Value* offset,
- llvm::IRBuilder<>* builder);
+// This function emits a "full-warp" shuffle, which all threads of a warp
+// participate in. *Do not use this function from a divergent context:* You
+// can't correctly do so on both Volta and earlier GPUs.
+//
+// https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-shfl-sync
+llvm::Value* EmitFullWarpShuffleDown(llvm::Value* value, llvm::Value* offset,
+ llvm::IRBuilder<>* builder);
} // namespace gpu
} // namespace xla