#include #include "tensorflow/core/public/session_options.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/fake_input.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor_testutil.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/platform/test_benchmark.h" #include "tensorflow/core/public/tensor.h" #include #include "tensorflow/core/graph/node_builder.h" #include "tensorflow/core/graph/testlib.h" #include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" namespace tensorflow { template static void BM_SegmentReduction(int iters, string reduction, Index num_rows, Index num_cols, Index segment_size) { testing::StopTiming(); std::unique_ptr device( DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0")); // Create inputs gtl::InlinedVector reduction_inputs; TensorShape shape1({num_rows, num_cols}); Tensor input1(DT_FLOAT, shape1); reduction_inputs.push_back({nullptr, &input1}); TensorShape shape2({num_rows}); Tensor input2(DataTypeToEnum::v(), shape2); test::FillFn(&input2, [&num_rows, &segment_size](Index i) -> Index { return std::min(i / segment_size, num_rows - 1); }); reduction_inputs.push_back({nullptr, &input2}); NodeDef reduction_node_def; TF_CHECK_OK(NodeDefBuilder(reduction, reduction) .Input(FakeInput(DT_FLOAT)) .Input(FakeInput(DataTypeToEnum::v())) .Finalize(&reduction_node_def)); Status status; std::unique_ptr reduction_op(CreateOpKernel( DEVICE_CPU, device.get(), cpu_allocator(), reduction_node_def, &status)); OpKernelContext::Params params; params.device = device.get(); params.frame_iter = FrameAndIter(0, 0); params.inputs = &reduction_inputs; params.op_kernel = reduction_op.get(); params.output_alloc_attr = [&device, &reduction_op, ¶ms](int index) { AllocatorAttributes attr; const bool on_host = (reduction_op->output_memory_types()[index] == HOST_MEMORY); attr.set_on_host(on_host); return attr; }; std::unique_ptr reduction_context( new OpKernelContext(params)); reduction_op->Compute(reduction_context.get()); TF_CHECK_OK(reduction_context->status()); testing::StartTiming(); for (int i = 0; i < iters; ++i) { delete reduction_context->release_output(0).tensor; reduction_op->Compute(reduction_context.get()); } int64 bytes_per_iter = static_cast(num_rows * num_cols * sizeof(float)); testing::BytesProcessed(bytes_per_iter * iters); } #define BM_Reduce(O, R, C, S) \ static void BM_Reduce_##O##_##R##_##C##_##S##_int32(int iters) { \ BM_SegmentReduction(iters, #O, R, C, S); \ } \ static void BM_Reduce_##O##_##R##_##C##_##S##_int64(int iters) { \ BM_SegmentReduction(iters, #O, R, C, S); \ } \ BENCHMARK(BM_Reduce_##O##_##R##_##C##_##S##_int32); \ BENCHMARK(BM_Reduce_##O##_##R##_##C##_##S##_int64); #define BM_Reduce_Arg(R, C, S) \ BM_Reduce(SegmentSum, R, C, S); \ BM_Reduce(SegmentMean, R, C, S); BM_Reduce_Arg(64, 32, 1); BM_Reduce_Arg(4096, 128, 1); BM_Reduce_Arg(16, 8, 2); BM_Reduce_Arg(64, 32, 2); BM_Reduce_Arg(4096, 32, 2); BM_Reduce_Arg(4096, 128, 2); static void SparseSegmentMeanGradHelper(int iters, float uniqueness, int size) { testing::StopTiming(); RequireDefaultOps(); Graph* g = new Graph(OpRegistry::Global()); CHECK_LE(uniqueness, 1.0); CHECK_GT(uniqueness, 0.0); const int kNumIndices = size; Tensor indices(DT_INT32, TensorShape({kNumIndices})); auto indices_flat = indices.flat(); Tensor segments(DT_INT32, TensorShape({kNumIndices})); auto segments_flat = segments.flat(); int kUniqueIndices = uniqueness * kNumIndices; Tensor output_dim0(DT_INT32, TensorShape({})); output_dim0.scalar()() = kUniqueIndices; for (int i = 0; i < kNumIndices; ++i) { indices_flat(i) = (i * 31) % kUniqueIndices; segments_flat(i) = i * .8; } const int kDim1 = segments_flat(kNumIndices - 1) + 1; const int kDim2 = 128; Tensor input(DT_FLOAT, TensorShape({kDim1, kDim2})); input.flat().setRandom(); Node* node; TF_CHECK_OK(NodeBuilder(g->NewName("n"), "SparseSegmentMeanGrad") .Input(test::graph::Constant(g, input)) .Input(test::graph::Constant(g, indices)) .Input(test::graph::Constant(g, segments)) .Input(test::graph::Constant(g, output_dim0)) .Attr("T", DT_FLOAT) .Finalize(g, &node)); testing::UseRealTime(); testing::BytesProcessed(static_cast(iters) * (kDim1 * kDim2) * sizeof(float)); testing::StartTiming(); test::Benchmark("cpu", g).Run(iters); } static void BM_SparseSegmentMeanGrad_Low(int iters, int size) { return SparseSegmentMeanGradHelper(iters, 1.0, size); } static void BM_SparseSegmentMeanGrad_High(int iters, int size) { return SparseSegmentMeanGradHelper(iters, 0.01, size); } BENCHMARK(BM_SparseSegmentMeanGrad_Low)->Arg(1000)->Arg(100000); BENCHMARK(BM_SparseSegmentMeanGrad_High)->Arg(1000)->Arg(100000); } // namespace tensorflow