1 files changed, 157 insertions, 0 deletions
diff --git a/tensorflow/core/kernels/segment_reduction_ops_test.cc b/tensorflow/core/kernels/segment_reduction_ops_test.cc
new file mode 100644
index 0000000000..87647a21a8
--- /dev/null
+++ b/tensorflow/core/kernels/segment_reduction_ops_test.cc
@@ -0,0 +1,157 @@
+#include <functional>
+
+#include "tensorflow/core/public/session_options.h"
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/public/tensor.h"
+#include <gtest/gtest.h>
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/graph/testlib.h"
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+
+namespace tensorflow {
+// Benchmarks the `reduction` CPU kernel on a float [num_rows, num_cols] input
+// whose rows form consecutive segments of `segment_size` rows (ids of Index).
+template <typename Index>
+static void BM_SegmentReduction(int iters, string reduction, Index num_rows,
+                                Index num_cols, Index segment_size) {
+  testing::StopTiming();
+  CHECK_GT(segment_size, 0);  // Divisor when generating segment ids below.
+  std::unique_ptr<Device> device(
+      DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
+  // Inputs: randomized data (not left uninitialized) and sorted segment ids.
+  gtl::InlinedVector<TensorValue, 4> reduction_inputs;
+  Tensor input1(DT_FLOAT, TensorShape({num_rows, num_cols}));
+  input1.flat<float>().setRandom();
+  reduction_inputs.push_back({nullptr, &input1});
+  Tensor input2(DataTypeToEnum<Index>::v(), TensorShape({num_rows}));
+  test::FillFn<Index>(&input2, [&num_rows, &segment_size](Index i) -> Index {
+    return std::min(i / segment_size, num_rows - 1);
+  });
+  reduction_inputs.push_back({nullptr, &input2});
+
+  NodeDef reduction_node_def;
+  TF_CHECK_OK(NodeDefBuilder(reduction, reduction)
+                  .Input(FakeInput(DT_FLOAT))
+                  .Input(FakeInput(DataTypeToEnum<Index>::v()))
+                  .Finalize(&reduction_node_def));
+  Status status;
+  std::unique_ptr<OpKernel> reduction_op(CreateOpKernel(
+      DEVICE_CPU, device.get(), cpu_allocator(), reduction_node_def, &status));
+  TF_CHECK_OK(status);  // Surface kernel-creation errors before first use.
+  OpKernelContext::Params params;
+  params.device = device.get();
+  params.frame_iter = FrameAndIter(0, 0);
+  params.inputs = &reduction_inputs;
+  params.op_kernel = reduction_op.get();
+  params.output_alloc_attr = [&reduction_op](int index) {
+    AllocatorAttributes attr;
+    attr.set_on_host(reduction_op->output_memory_types()[index] ==
+                     HOST_MEMORY);
+    return attr;
+  };
+  std::unique_ptr<OpKernelContext> reduction_context(
+      new OpKernelContext(params));
+
+  // Untimed first run; its status is checked before the timed loop starts.
+  reduction_op->Compute(reduction_context.get());
+  TF_CHECK_OK(reduction_context->status());
+  testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    delete reduction_context->release_output(0).tensor;  // Previous output.
+    reduction_op->Compute(reduction_context.get());
+  }
+  // Widen before multiplying so the product is not computed in Index.
+  const int64 bytes_per_iter =
+      static_cast<int64>(num_rows) * num_cols * sizeof(float);
+  testing::BytesProcessed(bytes_per_iter * iters);
+}
+// BM_Reduce(O, R, C, S) defines and registers benchmarks of op O for int32 and
+// int64 segment ids on R rows and C columns with segment size S.
+#define BM_Reduce_Index(O, R, C, S, T)                           \
+  static void BM_Reduce_##O##_##R##_##C##_##S##_##T(int iters) { \
+    BM_SegmentReduction<T>(iters, #O, R, C, S);                  \
+  }                                                              \
+  BENCHMARK(BM_Reduce_##O##_##R##_##C##_##S##_##T)
+#define BM_Reduce(O, R, C, S)         \
+  BM_Reduce_Index(O, R, C, S, int32); \
+  BM_Reduce_Index(O, R, C, S, int64);
+// Instantiates BM_Reduce for both SegmentSum and SegmentMean.
+#define BM_Reduce_Arg(R, C, S)    \
+  BM_Reduce(SegmentSum, R, C, S); \
+  BM_Reduce(SegmentMean, R, C, S);
+// Arguments are (rows, cols, segment_size). Segment size 1: one row per id.
+BM_Reduce_Arg(64, 32, 1);
+BM_Reduce_Arg(4096, 128, 1);
+// Segment size 2: each pair of consecutive rows shares a segment id.
+BM_Reduce_Arg(16, 8, 2);
+BM_Reduce_Arg(64, 32, 2);
+BM_Reduce_Arg(4096, 32, 2);
+BM_Reduce_Arg(4096, 128, 2);
+// Benchmarks SparseSegmentMeanGrad on `size` indices that take at most
+// uniqueness * size distinct values; `uniqueness` must lie in (0, 1].
+static void SparseSegmentMeanGradHelper(int iters, float uniqueness, int size) {
+  testing::StopTiming();
+  RequireDefaultOps();
+  CHECK_LE(uniqueness, 1.0);
+  CHECK_GT(uniqueness, 0.0);
+  CHECK_GT(size, 0);  // segments_flat(size - 1) is read below.
+  Graph* g = new Graph(OpRegistry::Global());
+
+  const int kNumIndices = size;
+  Tensor indices(DT_INT32, TensorShape({kNumIndices}));
+  auto indices_flat = indices.flat<int32>();
+  Tensor segments(DT_INT32, TensorShape({kNumIndices}));
+  auto segments_flat = segments.flat<int32>();
+  const int kUniqueIndices = uniqueness * kNumIndices;
+  CHECK_GT(kUniqueIndices, 0);  // Used as a modulus below; must not be zero.
+  Tensor output_dim0(DT_INT32, TensorShape({}));
+  output_dim0.scalar<int32>()() = kUniqueIndices;
+  for (int i = 0; i < kNumIndices; ++i) {
+    indices_flat(i) = (i * 31) % kUniqueIndices;
+    segments_flat(i) = i * .8;
+  }
+  // Segment ids are nondecreasing, so the last id + 1 is the row count.
+  const int kDim1 = segments_flat(kNumIndices - 1) + 1;
+  const int kDim2 = 128;
+  Tensor input(DT_FLOAT, TensorShape({kDim1, kDim2}));
+  input.flat<float>().setRandom();
+
+  Node* node;
+  TF_CHECK_OK(NodeBuilder(g->NewName("n"), "SparseSegmentMeanGrad")
+                  .Input(test::graph::Constant(g, input))
+                  .Input(test::graph::Constant(g, indices))
+                  .Input(test::graph::Constant(g, segments))
+                  .Input(test::graph::Constant(g, output_dim0))
+                  .Attr("T", DT_FLOAT)
+                  .Finalize(g, &node));
+  testing::UseRealTime();
+  testing::BytesProcessed(static_cast<int64>(iters) * (kDim1 * kDim2) *
+                          sizeof(float));
+  testing::StartTiming();
+  test::Benchmark("cpu", g).Run(iters);
+}
+// Runs the helper with uniqueness 1.0: indices are taken modulo `size`.
+static void BM_SparseSegmentMeanGrad_Low(int iters, int size) {
+  SparseSegmentMeanGradHelper(iters, /*uniqueness=*/1.0, size);
+}
+// Runs the helper with uniqueness 0.01: indices are taken modulo ~size / 100.
+static void BM_SparseSegmentMeanGrad_High(int iters, int size) {
+  SparseSegmentMeanGradHelper(iters, /*uniqueness=*/0.01, size);
+}
+// Each benchmark is registered with the arguments 1000 and 100000 (`size`).
+BENCHMARK(BM_SparseSegmentMeanGrad_Low)->Arg(1000)->Arg(100000);
+BENCHMARK(BM_SparseSegmentMeanGrad_High)->Arg(1000)->Arg(100000);
+
+}  // namespace tensorflow