Diffstat (limited to 'tensorflow/core/kernels/concat_op_test.cc')
-rw-r--r--  tensorflow/core/kernels/concat_op_test.cc  253
1 file changed, 253 insertions, 0 deletions
diff --git a/tensorflow/core/kernels/concat_op_test.cc b/tensorflow/core/kernels/concat_op_test.cc
new file mode 100644
index 0000000000..4ccc5b5b19
--- /dev/null
+++ b/tensorflow/core/kernels/concat_op_test.cc
@@ -0,0 +1,253 @@
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/graph/testlib.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/public/tensor.h"
+#include <gtest/gtest.h>
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace tensorflow {
+namespace {
+
+// For the benchmark, we set up two 2-dimensional tensors, each kDim1 x 'dim2'
+// in size, and concatenate them along 'concat_dimension'.
+template <typename T>
+static void ConcatHelper(int iters, int concat_dimension, int dim2) {
+ testing::StopTiming();
+ RequireDefaultOps();
+ Graph* g = new Graph(OpRegistry::Global());
+
+ DataType dt = DataTypeToEnum<T>::v();
+ const int kDim1 = 100;
+ Tensor concat_dim(DT_INT32, TensorShape({}));
+ concat_dim.scalar<int32>()() = concat_dimension;
+ Tensor in0(dt, TensorShape({kDim1, dim2}));
+ in0.flat<T>().setRandom();
+ Tensor in1(dt, TensorShape({kDim1, dim2}));
+ in1.flat<T>().setRandom();
+
+ Node* node;
+ TF_CHECK_OK(
+ NodeBuilder(g->NewName("n"), "Concat")
+ .Input(test::graph::Constant(g, concat_dim))
+ .Input({test::graph::Constant(g, in0), test::graph::Constant(g, in1)})
+ .Attr("N", 2)
+ .Attr("T", dt)
+ .Finalize(g, &node));
+
+  testing::BytesProcessed(static_cast<int64>(iters) *
+                          ((kDim1 * dim2) + (kDim1 * dim2)) * sizeof(T));
+  testing::UseRealTime();
+  testing::StartTiming();
+  test::Benchmark("cpu", g).Run(iters);
+}
+
+static void BM_ConcatDim0Float(int iters, int dim2) {
+ ConcatHelper<float>(iters, 0, dim2);
+}
+
+static void BM_ConcatDim1Float(int iters, int dim2) {
+ ConcatHelper<float>(iters, 1, dim2);
+}
+
+BENCHMARK(BM_ConcatDim0Float)->Arg(1000)->Arg(100000)->Arg(1000000);
+BENCHMARK(BM_ConcatDim1Float)->Arg(1000)->Arg(100000)->Arg(1000000);
+
+static void BM_ConcatDim1int16(int iters, int dim2) {
+ ConcatHelper<int16>(iters, 1, dim2);
+}
+static void BM_ConcatDim1bfloat16(int iters, int dim2) {
+ ConcatHelper<bfloat16>(iters, 1, dim2);
+}
+
+BENCHMARK(BM_ConcatDim1int16)->Arg(1000)->Arg(100000)->Arg(1000000);
+BENCHMARK(BM_ConcatDim1bfloat16)->Arg(1000)->Arg(100000)->Arg(1000000);
+
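+// Benchmarks concatenating kNumInputs tensors, each kDim1 x dim2 in size,
+// along "concat_dimension".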
+template <typename T>
+static void ConcatManyHelper(int iters, int concat_dimension, int dim2) {
+ testing::StopTiming();
+ RequireDefaultOps();
+ Graph* g = new Graph(OpRegistry::Global());
+
+ DataType dt = DataTypeToEnum<T>::v();
+ const int kDim1 = 40000;
+ const int kNumInputs = 64;
+ Tensor concat_dim(DT_INT32, TensorShape({}));
+ concat_dim.scalar<int32>()() = concat_dimension;
+ std::vector<NodeBuilder::NodeOut> inputs;
+ inputs.reserve(kNumInputs);
+ for (int i = 0; i < kNumInputs; ++i) {
+ Tensor in(dt, TensorShape({kDim1, dim2}));
+ in.flat<T>().setRandom();
+ inputs.push_back(test::graph::Constant(g, in));
+ }
+
+ Node* node;
+  TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Concat")
+                  .Input(test::graph::Constant(g, concat_dim))
+                  .Input(inputs)
+                  .Attr("N", kNumInputs)
+                  .Attr("T", dt)
+                  .Finalize(g, &node));
+  testing::BytesProcessed(static_cast<int64>(iters) * kDim1 * dim2 *
+                          kNumInputs * sizeof(T));
+  testing::UseRealTime();
+  testing::StartTiming();
+  test::Benchmark("cpu", g).Run(iters);
+}
+
+static void BM_ConcatManyDim1bfloat16(int iters, int dim2) {
+ ConcatManyHelper<bfloat16>(iters, 1, dim2);
+}
+
+BENCHMARK(BM_ConcatManyDim1bfloat16)->Arg(18)->Arg(34)->Arg(60);
+
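+// Baseline that concatenates two equal-sized float buffers with raw memcpy.
+// Note that 'concat_dimension' is unused: the Dim0 and Dim1 variants below
+// measure the same flat copy.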
+static void MemcpyAlternativeHelper(int iters, int concat_dimension, int dim2) {
+ testing::StopTiming();
+
+ const int kDim1 = 100;
+ std::vector<float> data1(kDim1 * dim2, 1.0f);
+ std::vector<float> data2(kDim1 * dim2, 2.0f);
+
+ testing::BytesProcessed(static_cast<int64>(iters) *
+ ((kDim1 * dim2) + (kDim1 * dim2)) * sizeof(float));
+ testing::StartTiming();
+  while (iters-- > 0) {
+ const int n0 = data1.size();
+ const int n1 = data2.size();
+ float* result = new float[n0 + n1];
+ memcpy(&result[0], &data1[0], n0 * sizeof(float));
+ memcpy(&result[n0], &data2[0], n1 * sizeof(float));
+ delete[] result;
+ }
+}
+
+static void BM_MemcpyAlternativeDim0(int iters, int dim2) {
+ MemcpyAlternativeHelper(iters, 0, dim2);
+}
+static void BM_MemcpyAlternativeDim1(int iters, int dim2) {
+ MemcpyAlternativeHelper(iters, 1, dim2);
+}
+
+BENCHMARK(BM_MemcpyAlternativeDim0)->Arg(1000)->Arg(100000)->Arg(1000000);
+BENCHMARK(BM_MemcpyAlternativeDim1)->Arg(1000)->Arg(100000)->Arg(1000000);
+
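+// The MemcpyManyAlternative benchmarks below hand-roll a dim-1 concat of
+// kNumCopies row blocks with memcpy, comparing two loop orders.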
+typedef Eigen::TensorMap<Eigen::Tensor<bfloat16, 1, Eigen::RowMajor>,
+ Eigen::Unaligned> EigenMap;
+static void MemcpyManyAlternative1(int iters, int dim2) {
+ testing::StopTiming();
+
+ const int kDim1 = 40000;
+ const int kNumCopies = 64;
+ const int size = kDim1 * dim2 * kNumCopies;
+ bfloat16* data = new bfloat16[size];
+ EigenMap map(data, size);
+ map.setRandom();
+
+ testing::BytesProcessed(static_cast<int64>(iters) * kDim1 * dim2 *
+ kNumCopies * sizeof(bfloat16));
+ testing::StartTiming();
+ while (iters-- > 0) {
+ std::vector<bfloat16*> inputs(kNumCopies);
+ for (int i = 0; i < kNumCopies; ++i) {
+ inputs[i] = &data[i * kDim1 * dim2];
+ }
+ bfloat16* result = new bfloat16[size];
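+    // Loop order 1: for each input j, copy all kDim1 of its rows into the
+    // strided output positions of column block j.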
+ for (int j = 0; j < kNumCopies; ++j) {
+ bfloat16* output = &result[j * dim2];
+ for (int i = 0; i < kDim1; ++i) {
+ if (i + 1 < kDim1) {
+ port::prefetch<port::PREFETCH_HINT_T0>(inputs[j] + dim2);
+ }
+ memcpy(output, inputs[j], dim2 * sizeof(bfloat16));
+ inputs[j] += dim2;
+ output += dim2 * kNumCopies;
+ }
+ }
+ delete[] result;
+ }
+ delete[] data;
+}
+
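+// Same copy as MemcpyManyAlternative1, but with the loop order swapped:
+// rows outer, inputs inner, so the output is written sequentially.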
+static void MemcpyManyAlternative2(int iters, int dim2) {
+ testing::StopTiming();
+
+ const int kDim1 = 40000;
+ const int kNumCopies = 64;
+ const int size = kDim1 * dim2 * kNumCopies;
+ bfloat16* data = new bfloat16[size];
+ EigenMap map(data, size);
+ map.setRandom();
+
+ testing::BytesProcessed(static_cast<int64>(iters) * kDim1 * dim2 *
+ kNumCopies * sizeof(bfloat16));
+ testing::StartTiming();
+ std::vector<bfloat16*> inputs(kNumCopies);
+  while (iters-- > 0) {
+ bfloat16* result = new bfloat16[size];
+ for (int i = 0; i < kNumCopies; ++i) {
+ inputs[i] = &data[i * kDim1 * dim2];
+ }
+ bfloat16* output = result;
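+    // Loop order 2: walk the output sequentially, copying one dim2-element
+    // row from each input in turn.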
+ for (int i = 0; i < kDim1; ++i) {
+ for (int j = 0; j < kNumCopies; ++j) {
+ if (j + 1 < kNumCopies) {
+ port::prefetch<port::PREFETCH_HINT_T0>(inputs[j + 1]);
+ }
+ memcpy(output, inputs[j], dim2 * sizeof(bfloat16));
+ inputs[j] += dim2;
+ output += dim2;
+ }
+ }
+ delete[] result;
+ }
+ delete[] data;
+}
+
+BENCHMARK(MemcpyManyAlternative1)
+ ->Arg(16)
+ ->Arg(17)
+ ->Arg(18)
+ ->Arg(32)
+ ->Arg(33)
+ ->Arg(34)
+ ->Arg(60)
+ ->Arg(64)
+ ->Arg(65);
+
+BENCHMARK(MemcpyManyAlternative2)
+ ->Arg(16)
+ ->Arg(17)
+ ->Arg(18)
+ ->Arg(32)
+ ->Arg(33)
+ ->Arg(34)
+ ->Arg(60)
+ ->Arg(64)
+ ->Arg(65);
+
+} // namespace
+} // namespace tensorflow