diff options
author | Jianmin Chen <goog.jmchen@gmail.com> | 2016-03-18 08:25:28 -0800 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2016-03-18 09:28:02 -0700 |
commit | a8d61ab8162b56a950b2b5ee8310d2d6c569ebac (patch) | |
tree | 0dd1787091024f12e1fc1fb048ea0c0d3bfa6118 /tensorflow/core/kernels/nn_ops_test.cc | |
parent | eb289f5171c97bfec011488ba4b7f24f2d7f5a38 (diff) |
Add benchmark tests for depthwise conv forward gpu kernels
Benchmark Time(ns) CPU(ns) Iterations
BM_ConvFloatDepthwiseFwdGPU_conv0 4800416 4937895 141 32.7G items/s 32_112_112_3_8_24_3_3_1_2_gpu
BM_ConvFloatDepthwiseFwdGPU_conv1 13550072 13922813 100 30.9G items/s 32_112_112_64_1_64_3_3_1_2_gpu
BM_ConvFloatDepthwiseFwdGPU_conv2 7032385 7324553 100 29.4G items/s 32_56_56_128_1_128_3_3_1_2_gpu
BM_ConvFloatDepthwiseFwdGPU_conv3 2285033 2425335 228 22.2G items/s 32_56_56_128_1_128_3_3_2_2_gpu
BM_ConvFloatDepthwiseFwdGPU_conv4 1743948 1858093 359 29.0G items/s 32_28_28_128_1_128_3_3_1_2_gpu
BM_ConvFloatDepthwiseFwdGPU_conv5 1784560 1897147 320 28.4G items/s 32_14_14_512_1_512_3_3_1_2_gpu
BM_ConvFloatDepthwiseFwdGPU_conv6 971179 1044185 562 25.8G items/s 32_7_7_1024_1_1024_3_3_1_2_gpu
Change: 117553964
Diffstat (limited to 'tensorflow/core/kernels/nn_ops_test.cc')
-rw-r--r-- | tensorflow/core/kernels/nn_ops_test.cc | 19 |
1 files changed, 17 insertions, 2 deletions
diff --git a/tensorflow/core/kernels/nn_ops_test.cc b/tensorflow/core/kernels/nn_ops_test.cc
index 05808840f9..dddb8bbb4b 100644
--- a/tensorflow/core/kernels/nn_ops_test.cc
+++ b/tensorflow/core/kernels/nn_ops_test.cc
@@ -492,6 +492,8 @@ static void BM_ConvFloatDepthwise(int iters, int batch, int rows, int cols,
 // OD: output_depth
 // KR: kernel_rows
 // KC: kernel_cols
+// STR: stride
+// PAD: padding
 
 #define BM_ConvFloatDepthwiseFwd(BS, R, C, ID, DM, OD, KR, KC, STR, PAD,    \
                                  LABEL)                                     \
@@ -509,12 +511,25 @@ static void BM_ConvFloatDepthwise(int iters, int batch, int rows, int cols,
         strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", DM, "_", OD, "_", \
                         KR, "_", KC, "_", STR, "_", PAD, "_cpu4"));         \
   }                                                                         \
+  static void BM_ConvFloatDepthwiseFwdGPU_##LABEL(int iters) {              \
+    BM_ConvFloatDepthwise(                                                  \
+        iters, BS, R, C, ID, DM, OD, KR, KC, DEPTHWISE_CONV_OP_FWD, 1, STR, \
+        PAD, true,                                                          \
+        strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", DM, "_", OD, "_", \
+                        KR, "_", KC, "_", STR, "_", PAD, "_gpu"));          \
+  }                                                                         \
   BENCHMARK(BM_ConvFloatDepthwiseFwdCPU1_##LABEL);                          \
-  BENCHMARK(BM_ConvFloatDepthwiseFwdCPU4_##LABEL)
+  BENCHMARK(BM_ConvFloatDepthwiseFwdCPU4_##LABEL);                          \
+  BENCHMARK(BM_ConvFloatDepthwiseFwdGPU_##LABEL);
 
-// TODO(andydavis,jmchen) Add more benchmarks.
+// The configurations below are mostly from mobilenet models.
 BM_ConvFloatDepthwiseFwd(32, 112, 112, 3, 8, 24, 3, 3, 1, SAME, conv0);
 BM_ConvFloatDepthwiseFwd(32, 112, 112, 64, 1, 64, 3, 3, 1, SAME, conv1);
+BM_ConvFloatDepthwiseFwd(32, 56, 56, 128, 1, 128, 3, 3, 1, SAME, conv2);
+BM_ConvFloatDepthwiseFwd(32, 56, 56, 128, 1, 128, 3, 3, 2, SAME, conv3);
+BM_ConvFloatDepthwiseFwd(32, 28, 28, 128, 1, 128, 3, 3, 1, SAME, conv4);
+BM_ConvFloatDepthwiseFwd(32, 14, 14, 512, 1, 512, 3, 3, 1, SAME, conv5);
+BM_ConvFloatDepthwiseFwd(32, 7, 7, 1024, 1, 1024, 3, 3, 1, SAME, conv6);
 
 static void BM_LRNFloat(int iters, int depth, int cols, int rows,
                         int batch_size, int range, int num_threads,