author    Eugene Brevdo <ebrevdo@google.com>        2017-06-29 15:33:13 -0700
committer TensorFlower Gardener <gardener@tensorflow.org>  2017-06-29 15:37:15 -0700
commit    8280e0ae9083a65b23608b34723f07e028a56dc8 (patch)
tree      0f2df282cfd5cd712920e440cea88a093668cbf2 /tensorflow/core/kernels/where_op.h
parent    4aa7c4d2330ce110b5be348144ee67143841272c (diff)
GPU-enabled WhereOp using CUB.
* Import CUB.
* Add GPU-enabled async WhereOp.
* Added benchmarks.
* Added support for bool ResourceVariables on GPU.

Benchmark results on a machine with a single Tesla K40 GPU, running Where on a
bool matrix of shape [m x n] with fraction p of values true (table below).
For small to medium sizes, running WhereOp on GPU is roughly 2-4x slower; for
realistic large problem sizes it is 2-5x faster. This timing ignores the time
spent copying the tensor from GPU -> CPU and back from CPU -> GPU when the
WhereOp sits between GPU computations, so the real performance impact should
be even better.

Benchmark                              wall_time    Throughput
m_10_n_10_p_0.01_use_gpu_False         9.01e-05 s   0.00129 GB/s
m_10_n_10_p_0.01_use_gpu_True          0.000187 s   0.000621 GB/s
m_10_n_10_p_0.5_use_gpu_False          9.3e-05 s    0.00968 GB/s
m_10_n_10_p_0.5_use_gpu_True           0.000252 s   0.00357 GB/s
m_10_n_10_p_0.99_use_gpu_False         0.000152 s   0.0111 GB/s
m_10_n_10_p_0.99_use_gpu_True          0.000245 s   0.00687 GB/s
m_10_n_100_p_0.01_use_gpu_False        9.3e-05 s    0.0125 GB/s
m_10_n_100_p_0.01_use_gpu_True         0.000253 s   0.00458 GB/s
m_10_n_100_p_0.5_use_gpu_False         9.8e-05 s    0.0918 GB/s
m_10_n_100_p_0.5_use_gpu_True          0.00026 s    0.0346 GB/s
m_10_n_100_p_0.99_use_gpu_False        0.000104 s   0.162 GB/s
m_10_n_100_p_0.99_use_gpu_True         0.000288 s   0.0586 GB/s
m_10_n_1000_p_0.01_use_gpu_False       0.000105 s   0.111 GB/s
m_10_n_1000_p_0.01_use_gpu_True        0.000283 s   0.041 GB/s
m_10_n_1000_p_0.5_use_gpu_False        0.000185 s   0.486 GB/s
m_10_n_1000_p_0.5_use_gpu_True         0.000335 s   0.269 GB/s
m_10_n_1000_p_0.99_use_gpu_False       0.000203 s   0.83 GB/s
m_10_n_1000_p_0.99_use_gpu_True        0.000346 s   0.486 GB/s
m_10_n_10000_p_0.01_use_gpu_False      0.00019 s    0.609 GB/s
m_10_n_10000_p_0.01_use_gpu_True       0.00028 s    0.414 GB/s
m_10_n_10000_p_0.5_use_gpu_False       0.00117 s    0.771 GB/s
m_10_n_10000_p_0.5_use_gpu_True        0.000426 s   2.11 GB/s
m_10_n_10000_p_0.99_use_gpu_False      0.0014 s     1.2 GB/s
m_10_n_10000_p_0.99_use_gpu_True       0.000482 s   3.5 GB/s
m_10_n_100000_p_0.01_use_gpu_False     0.00129 s    0.899 GB/s
m_10_n_100000_p_0.01_use_gpu_True      0.000336 s   3.45 GB/s
m_10_n_100000_p_0.5_use_gpu_False      0.0102 s     0.885 GB/s
m_10_n_100000_p_0.5_use_gpu_True       0.00136 s    6.6 GB/s
m_10_n_100000_p_0.99_use_gpu_False     0.0116 s     1.45 GB/s
m_10_n_100000_p_0.99_use_gpu_True      0.00233 s    7.23 GB/s
m_10_n_1000000_p_0.01_use_gpu_False    0.0111 s     1.04 GB/s
m_10_n_1000000_p_0.01_use_gpu_True     0.00109 s    10.6 GB/s
m_10_n_1000000_p_0.5_use_gpu_False     0.0895 s     1.01 GB/s
m_10_n_1000000_p_0.5_use_gpu_True      0.0103 s     8.7 GB/s
m_10_n_1000000_p_0.99_use_gpu_False    0.107 s      1.58 GB/s
m_10_n_1000000_p_0.99_use_gpu_True     0.0201 s     8.39 GB/s

PiperOrigin-RevId: 160582709
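As a rough illustration of how CUB can drive the counting step on the GPU, the
sketch below sums a device-resident bool buffer into an integer count with
cub::DeviceReduce::Sum, which is the same idea the GPU NumTrue path relies on.
The helper name CountTrueOnGpu and the raw CUDA-runtime setup are assumptions
for illustration only and are not code from this patch:

#include <cuda_runtime.h>
#include <cub/device/device_reduce.cuh>

// Hypothetical helper (not part of this patch): counts true entries of a
// device-resident bool buffer by summing it into an int64 result on device.
cudaError_t CountTrueOnGpu(const bool* d_input, int num_items,
                           long long* d_num_true, cudaStream_t stream) {
  void* d_temp_storage = nullptr;
  size_t temp_storage_bytes = 0;
  // First call with a null scratch pointer only reports the scratch size.
  cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_input,
                         d_num_true, num_items, stream);
  cudaError_t err = cudaMalloc(&d_temp_storage, temp_storage_bytes);
  if (err != cudaSuccess) return err;
  // Second call performs the reduction asynchronously on `stream`.
  err = cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_input,
                               d_num_true, num_items, stream);
  cudaFree(d_temp_storage);
  return err;
}

The count is what makes the op async on GPU: the kernel has to bring the count
back to the host before it can allocate the [num_true, ndims] output tensor.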
Diffstat (limited to 'tensorflow/core/kernels/where_op.h')
-rw-r--r--  tensorflow/core/kernels/where_op.h  |  61
1 file changed, 16 insertions(+), 45 deletions(-)
diff --git a/tensorflow/core/kernels/where_op.h b/tensorflow/core/kernels/where_op.h
index aa27123714..e040325e3d 100644
--- a/tensorflow/core/kernels/where_op.h
+++ b/tensorflow/core/kernels/where_op.h
@@ -17,6 +17,7 @@ limitations under the License.
#define TENSORFLOW_KERNELS_WHERE_OP_H_
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/types.h"
@@ -25,55 +26,25 @@ namespace tensorflow {
namespace functor {
-template <typename Device>
+template <typename Device, typename TIndex>
struct NumTrue {
- EIGEN_ALWAYS_INLINE static void Compute(
- const Device& d, typename TTypes<bool>::ConstFlat input,
- TTypes<int64>::Scalar num_true) {
- num_true.device(d) = input.template cast<int64>().sum();
- }
+ EIGEN_ALWAYS_INLINE static Status Compute(
+ OpKernelContext* ctx, const Device& d, TTypes<bool>::ConstFlat input,
+ typename TTypes<TIndex>::Scalar num_true);
};
-template <typename Device, int NDIM>
+template <typename Device, int NDIM, typename TIndex>
struct Where {
- EIGEN_ALWAYS_INLINE static int64 Compute(
- const Device& d, typename TTypes<bool, NDIM>::ConstTensor input,
- typename TTypes<int64>::Matrix output) {
- Eigen::DenseIndex true_n = 0;
- Eigen::DSizes<Eigen::DenseIndex, NDIM> dims = input.dimensions();
- Eigen::DSizes<Eigen::DenseIndex, NDIM> strides;
-
- // Calculate strides for RowMajor order.
- EIGEN_STATIC_ASSERT((static_cast<int>(decltype(input)::Layout) ==
- static_cast<int>(Eigen::RowMajor)),
- INTERNAL_ERROR_INPUT_SHOULD_BE_ROWMAJOR);
-
- strides[NDIM - 1] = 1;
- for (int i = NDIM - 2; i >= 0; --i) {
- strides[i] = strides[i + 1] * dims[i + 1];
- }
-
- Eigen::DenseIndex output_size = output.dimension(0);
- for (Eigen::DenseIndex n = 0; n < input.size(); ++n) {
- if (input.data()[n]) {
- if (TF_PREDICT_TRUE(true_n < output_size)) {
- WriteIndexRowMajor(output, strides, true_n, n);
- }
- ++true_n;
- }
- }
- return true_n;
- }
-
- EIGEN_ALWAYS_INLINE static void WriteIndexRowMajor(
- typename TTypes<int64>::Matrix output,
- const Eigen::DSizes<Eigen::DenseIndex, NDIM>& strides,
- Eigen::DenseIndex true_n, Eigen::DenseIndex index) {
- for (int i = 0; i < NDIM; ++i) {
- output(true_n, i) = index / strides[i];
- index %= strides[i];
- }
- }
+ // Copies indices of true values in input into output. The pointer
+ // found_true should sit on the host. Compute should copy the
+ // number of true elements found into it. At the end, if
+ // *found_true != output.dimension(0),
+ // then the input may have changed between the initial counting of
+ // the true values and the call to Where.
+ EIGEN_ALWAYS_INLINE static Status Compute(
+ OpKernelContext* ctx, const Device& d,
+ typename TTypes<bool, NDIM>::ConstTensor input,
+ typename TTypes<int64>::Matrix output, TIndex* found_true);
};
} // namespace functor
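For context, here is a minimal host-side sketch of how a kernel might drive
the two functors declared above on the CPU device. The function name
ComputeWhereExample is hypothetical, error handling is abbreviated, and the
real implementation (including the async GPU variant) lives in where_op.cc and
the GPU kernel sources, not in this header:

#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/kernels/where_op.h"

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;

template <int NDIM>
Status ComputeWhereExample(OpKernelContext* ctx, const Tensor& input) {
  const CPUDevice& d = ctx->eigen_device<CPUDevice>();

  // 1. Count the true elements to learn the output's first dimension.
  Tensor num_true_t;
  TF_RETURN_IF_ERROR(
      ctx->allocate_temp(DT_INT64, TensorShape({}), &num_true_t));
  TF_RETURN_IF_ERROR((functor::NumTrue<CPUDevice, int64>::Compute(
      ctx, d, input.flat<bool>(), num_true_t.scalar<int64>())));
  const int64 num_true = num_true_t.scalar<int64>()();

  // 2. Allocate the [num_true, NDIM] matrix of coordinates.
  Tensor* output = nullptr;
  TF_RETURN_IF_ERROR(
      ctx->allocate_output(0, TensorShape({num_true, NDIM}), &output));

  // 3. Gather the coordinates of true values; found_true lives on the host,
  //    as the comment on Where::Compute requires.
  int64 found_true = 0;
  TF_RETURN_IF_ERROR((functor::Where<CPUDevice, NDIM, int64>::Compute(
      ctx, d, input.tensor<bool, NDIM>(), output->matrix<int64>(),
      &found_true)));
  if (found_true != num_true) {
    return errors::FailedPrecondition(
        "Input changed between counting and gathering the true values.");
  }
  return Status::OK();
}

}  // namespace tensorflow

The mismatch check in step 3 is the point of the found_true out-parameter: if
the boolean input were mutated between the NumTrue pass and the Where pass,
the pre-allocated output would no longer match the actual number of hits, and
the caller can surface that as an error instead of returning garbage indices.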