aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/core/kernels/l2loss_op_gpu.cu.cc
diff options
context:
space:
mode:
author: A. Unique TensorFlower <gardener@tensorflow.org> 2017-09-13 14:32:25 -0700
committer: TensorFlower Gardener <gardener@tensorflow.org> 2017-09-13 14:36:05 -0700
commit: d6f9d6109474a9162ef4d99520a2d4ef0becfb14 (patch)
tree: 697b9ef4b78b4ab570d7f1a074219844813cc9ed /tensorflow/core/kernels/l2loss_op_gpu.cu.cc
parent: f445958edbca3ad292c9ed8c9de0c7e047b1d2bd (diff)
Switch the softmax to use the new deterministic reductions on the GPU,
which results in a speedup of 10-40x on the existing ImageNet benchmarks and 2-3x on the newly added transformer benchmarks. Update the benchmark to also run on the GPU. Remove duplicate CPU tests. PiperOrigin-RevId: 168596693
Diffstat (limited to 'tensorflow/core/kernels/l2loss_op_gpu.cu.cc')
-rw-r--r-- tensorflow/core/kernels/l2loss_op_gpu.cu.cc 4
1 files changed, 2 insertions, 2 deletions
diff --git a/tensorflow/core/kernels/l2loss_op_gpu.cu.cc b/tensorflow/core/kernels/l2loss_op_gpu.cu.cc
index 73b6472254..5826997de6 100644
--- a/tensorflow/core/kernels/l2loss_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/l2loss_op_gpu.cu.cc
@@ -21,8 +21,8 @@ limitations under the License.
#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/reduction_gpu_kernels.cu.h"
#include "tensorflow/core/kernels/reduction_ops_common.h"
-#include "tensorflow/core/kernels/reduction_ops_gpu_kernels.h"
namespace tensorflow {
@@ -56,7 +56,7 @@ class L2LossOp<GPUDevice, T> : public OpKernel {
Constants<GPUDevice> constants;
functor::ReduceImpl<T, cub::Sum, T*, inputIterType, ReductionAxes>(
context, (T*)output->flat<T>().data(), input_itr, 1,
- input.flat<T>().size(), 1, 1, 0, constants.kZero, cub::Sum(), T(0));
+ input.flat<T>().size(), 1, 1, 0, constants.kZero, cub::Sum());
}
};