about | summary | refs | log | tree | commit | diff | homepage
path: root/tensorflow/core/util/cuda_kernel_helper.h
diff options
context:
space:
mode:
Diffstat (limited to 'tensorflow/core/util/cuda_kernel_helper.h')
-rw-r--r-- tensorflow/core/util/cuda_kernel_helper.h | 12
1 files changed, 12 insertions, 0 deletions
diff --git a/tensorflow/core/util/cuda_kernel_helper.h b/tensorflow/core/util/cuda_kernel_helper.h
index 8fa0dfbed9..cf11f419a4 100644
--- a/tensorflow/core/util/cuda_kernel_helper.h
+++ b/tensorflow/core/util/cuda_kernel_helper.h
@@ -752,6 +752,12 @@ __device__ EIGEN_ALWAYS_INLINE T CudaShuffleDown(unsigned mask, T value,
return __shfl_down_sync(mask, value, delta, width);
}
+// Eigen::half overload of CudaShuffleDown.
+//
+// The 16-bit storage of `value` (value.x) is widened to unsigned, exchanged
+// with __shfl_down_sync using the given mask/delta/width, and the resulting
+// bits are reinterpreted as an Eigen::half.
+//
+// The raw bits are used on purpose: casting an Eigen::half to an integer type
+// (and constructing an Eigen::half from an integer) is a numeric conversion,
+// which would drop the fractional part and mishandle negative, infinite and
+// NaN inputs instead of moving the value unchanged between lanes.
+__device__ EIGEN_ALWAYS_INLINE Eigen::half CudaShuffleDown(
+    unsigned mask, Eigen::half value, int delta, int width = warpSize) {
+  const unsigned bits = static_cast<unsigned>(value.x);
+  const unsigned shuffled = __shfl_down_sync(mask, bits, delta, width);
+  // Only the low 16 bits carry data; rebuild the half from them bit-for-bit.
+  return Eigen::half(Eigen::half_impl::raw_uint16_to_half(
+      static_cast<unsigned short>(shuffled)));
+}
+
// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
// instead of float for lo and hi (which is incorrect with ftz, for example).
// A bug has been filed with NVIDIA and will be fixed in the next CUDA release.
@@ -774,6 +780,12 @@ __device__ EIGEN_ALWAYS_INLINE T CudaShuffleXor(unsigned mask, T value,
return __shfl_xor_sync(mask, value, laneMask, width);
}
+// Eigen::half overload of CudaShuffleXor.
+//
+// The 16-bit storage of `value` (value.x) is widened to unsigned, exchanged
+// with __shfl_xor_sync using the given mask/laneMask/width, and the resulting
+// bits are reinterpreted as an Eigen::half.
+//
+// The raw bits are used on purpose: casting an Eigen::half to an integer type
+// (and constructing an Eigen::half from an integer) is a numeric conversion,
+// which would drop the fractional part and mishandle negative, infinite and
+// NaN inputs instead of moving the value unchanged between lanes.
+__device__ EIGEN_ALWAYS_INLINE Eigen::half CudaShuffleXor(
+    unsigned mask, Eigen::half value, int laneMask, int width = warpSize) {
+  const unsigned bits = static_cast<unsigned>(value.x);
+  const unsigned shuffled = __shfl_xor_sync(mask, bits, laneMask, width);
+  // Only the low 16 bits carry data; rebuild the half from them bit-for-bit.
+  return Eigen::half(Eigen::half_impl::raw_uint16_to_half(
+      static_cast<unsigned short>(shuffled)));
+}
+
// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
// instead of float for lo and hi (which is incorrect with ftz, for example).
// A bug has been filed with NVIDIA and will be fixed in the next CUDA release.