Add benchmark tests for depthwise conv forward GPU kernels

Benchmark                            Time(ns)    CPU(ns) Iterations
BM_ConvFloatDepthwiseFwdGPU_conv0     4800416    4937895        141  32.7G items/s 32_112_112_3_8_24_3_3_1_2_gpu
BM_ConvFloatDepthwiseFwdGPU_conv1    13550072   13922813        100  30.9G items/s 32_112_112_64_1_64_3_3_1_2_gpu
BM_ConvFloatDepthwiseFwdGPU_conv2     7032385    7324553        100  29.4G items/s 32_56_56_128_1_128_3_3_1_2_gpu
BM_ConvFloatDepthwiseFwdGPU_conv3     2285033    2425335        228  22.2G items/s 32_56_56_128_1_128_3_3_2_2_gpu
BM_ConvFloatDepthwiseFwdGPU_conv4     1743948    1858093        359  29.0G items/s 32_28_28_128_1_128_3_3_1_2_gpu
BM_ConvFloatDepthwiseFwdGPU_conv5     1784560    1897147        320  28.4G items/s 32_14_14_512_1_512_3_3_1_2_gpu
BM_ConvFloatDepthwiseFwdGPU_conv6      971179    1044185        562  25.8G items/s 32_7_7_1024_1_1024_3_3_1_2_gpu

Change: 117553964
author: Jianmin Chen <goog.jmchen@gmail.com> 2016-03-18 08:25:28 -0800
committer: TensorFlower Gardener <gardener@tensorflow.org> 2016-03-18 09:28:02 -0700
commit: a8d61ab8162b56a950b2b5ee8310d2d6c569ebac (patch)
tree: 0dd1787091024f12e1fc1fb048ea0c0d3bfa6118 /tensorflow/core/kernels/nn_ops_test.cc
parent: eb289f5171c97bfec011488ba4b7f24f2d7f5a38 (diff)
1 files changed, 17 insertions, 2 deletions
diff --git a/tensorflow/core/kernels/nn_ops_test.cc b/tensorflow/core/kernels/nn_ops_test.cc
index 05808840f9..dddb8bbb4b 100644
--- a/tensorflow/core/kernels/nn_ops_test.cc
+++ b/tensorflow/core/kernels/nn_ops_test.cc
@@ -492,6 +492,8 @@ static void BM_ConvFloatDepthwise(int iters, int batch, int rows, int cols,
 // OD: output_depth
 // KR: kernel_rows
 // KC: kernel_cols
+// STR: stride
+// PAD: padding
 
 #define BM_ConvFloatDepthwiseFwd(BS, R, C, ID, DM, OD, KR, KC, STR, PAD,    \
                                  LABEL)                                     \
@@ -509,12 +511,25 @@ static void BM_ConvFloatDepthwise(int iters, int batch, int rows, int cols,
         strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", DM, "_", OD, "_", \
                         KR, "_", KC, "_", STR, "_", PAD, "_cpu4"));         \
   }                                                                         \
+  static void BM_ConvFloatDepthwiseFwdGPU_##LABEL(int iters) {              \
+    BM_ConvFloatDepthwise(                                                  \
+        iters, BS, R, C, ID, DM, OD, KR, KC, DEPTHWISE_CONV_OP_FWD, 1, STR, \
+        PAD, true,                                                          \
+        strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", DM, "_", OD, "_", \
+                        KR, "_", KC, "_", STR, "_", PAD, "_gpu"));          \
+  }                                                                         \
   BENCHMARK(BM_ConvFloatDepthwiseFwdCPU1_##LABEL);                          \
-  BENCHMARK(BM_ConvFloatDepthwiseFwdCPU4_##LABEL)
+  BENCHMARK(BM_ConvFloatDepthwiseFwdCPU4_##LABEL);                          \
+  BENCHMARK(BM_ConvFloatDepthwiseFwdGPU_##LABEL);
 
-// TODO(andydavis,jmchen) Add more benchmarks.
+// The configurations below are mostly from mobilenet models.
 BM_ConvFloatDepthwiseFwd(32, 112, 112, 3, 8, 24, 3, 3, 1, SAME, conv0);
 BM_ConvFloatDepthwiseFwd(32, 112, 112, 64, 1, 64, 3, 3, 1, SAME, conv1);
+BM_ConvFloatDepthwiseFwd(32, 56, 56, 128, 1, 128, 3, 3, 1, SAME, conv2);
+BM_ConvFloatDepthwiseFwd(32, 56, 56, 128, 1, 128, 3, 3, 2, SAME, conv3);
+BM_ConvFloatDepthwiseFwd(32, 28, 28, 128, 1, 128, 3, 3, 1, SAME, conv4);
+BM_ConvFloatDepthwiseFwd(32, 14, 14, 512, 1, 512, 3, 3, 1, SAME, conv5);
+BM_ConvFloatDepthwiseFwd(32, 7, 7, 1024, 1, 1024, 3, 3, 1, SAME, conv6);
 
 static void BM_LRNFloat(int iters, int depth, int cols, int rows,
                         int batch_size, int range, int num_threads,
author	Jianmin Chen <goog.jmchen@gmail.com>	2016-03-18 08:25:28 -0800
committer	TensorFlower Gardener <gardener@tensorflow.org>	2016-03-18 09:28:02 -0700
commit	a8d61ab8162b56a950b2b5ee8310d2d6c569ebac (patch)
tree	0dd1787091024f12e1fc1fb048ea0c0d3bfa6118 /tensorflow/core/kernels/nn_ops_test.cc
parent	eb289f5171c97bfec011488ba4b7f24f2d7f5a38 (diff)