about | summary | refs | log | tree | commit | diff | homepage
path: root/tensorflow/contrib/reduce_slice_ops
diff options
context:
space:
mode:
author: A. Unique TensorFlower <gardener@tensorflow.org>, 2018-01-26 05:15:18 -0800
committer: TensorFlower Gardener <gardener@tensorflow.org>, 2018-01-26 05:22:51 -0800
commit: abdc62aee1eeba32be56d761a2f9988306356084 (patch)
tree: d49fa0848904b212443245702923255ba18cca58 /tensorflow/contrib/reduce_slice_ops
parent: c8c2e4932afccb594bfe05e22facea1aba9dd454 (diff)
Roll CL 179861781 forward with fix: Wrappers for CUDA 9 warp-synchronous intrinsics.
PiperOrigin-RevId: 183374082
Diffstat (limited to 'tensorflow/contrib/reduce_slice_ops')
-rw-r--r-- tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc | 11
1 files changed, 6 insertions, 5 deletions
diff --git a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc
index 8e6870fadd..501cddb8c8 100644
--- a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc
+++ b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc
@@ -34,9 +34,9 @@ namespace functor {
__global__ void ReduceSliceDeviceKernel##reduceop( \
Cuda3DLaunchConfig config, Index indices_width, Index bound, \
const T begin, const Index *indices, const T *input, T *out) { \
- CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count, x) { \
- CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count, y) { \
- CUDA_AXIS_KERNEL_LOOP(z, config.virtual_thread_count, z) { \
+ CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count.x, X) { \
+ CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count.y, Y) { \
+ CUDA_AXIS_KERNEL_LOOP(z, config.virtual_thread_count.z, Z) { \
Index outidx = x * config.virtual_thread_count.y * \
config.virtual_thread_count.z + \
y * config.virtual_thread_count.z + z; \
@@ -68,8 +68,9 @@ namespace functor {
if (sizex * sizey * sizez == 0) { \
return; \
} \
- Cuda3DLaunchConfig config = GetCuda3DLaunchConfig(sizex, sizey, sizez, d,\
- ReduceSliceDeviceKernel##reduceop<T, Index>, 0, 0); \
+ Cuda3DLaunchConfig config = GetCuda3DLaunchConfig( \
+ sizex, sizey, sizez, d, ReduceSliceDeviceKernel##reduceop<T, Index>, \
+ 0, 0); \
\
ReduceSliceDeviceKernel##reduceop<T, Index> \
<<<config.block_count, config.thread_per_block, 0, d.stream()>>>( \