diff options
author | A. Unique TensorFlower <nobody@tensorflow.org> | 2016-04-18 07:27:10 -0800 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2016-04-18 08:33:06 -0700 |
commit | 3f27cffdae7cf9b1404d664b28be42b933908b0a (patch) | |
tree | c3f1bdfff301b22e27bd6beeab5ae1381e905826 /tensorflow/core/kernels/nn_ops_test.cc | |
parent | 0dfcea4c1611452dfa5d23e3845b1d8e7001bd50 (diff) |
Optimized implementation of depthwise conv backprop filter for CPU.
// OLD 1-thread
Benchmark Time(ns) CPU(ns) Iterations
------------------------------------------------------------------------
BM_ConvFloatDepthwiseBkFilterCPU1_conv0 281152179 280588497 100 588.2M items/s 32_112_112_3_8_24_3_3_1_2_cpu1
BM_ConvFloatDepthwiseBkFilterCPU1_conv1 760242956 758694909 100 580.1M items/s 32_112_112_64_1_64_3_3_1_2_cpu1
BM_ConvFloatDepthwiseBkFilterCPU1_conv2 383554418 382741182 100 574.9M items/s 32_56_56_128_1_128_3_3_1_2_cpu1
BM_ConvFloatDepthwiseBkFilterCPU1_conv3 98924384 98665676 100 557.2M items/s 32_56_56_128_1_128_3_3_2_2_cpu1
BM_ConvFloatDepthwiseBkFilterCPU1_conv4 94237506 94005920 100 585.0M items/s 32_28_28_128_1_128_3_3_1_2_cpu1
BM_ConvFloatDepthwiseBkFilterCPU1_conv5 106895864 106648144 100 515.7M items/s 32_14_14_512_1_512_3_3_1_2_cpu1
BM_ConvFloatDepthwiseBkFilterCPU1_conv6 69247718 69078442 100 398.0M items/s 32_7_7_1024_1_1024_3_3_1_2_cpu1
BM_ConvFloatDepthwiseBkFilterCPU1_conv7 70304661 70126053 100 588.1M items/s 32_112_112_3_8_24_3_3_2_2_cpu1
BM_ConvFloatDepthwiseBkFilterCPU1_conv8 67619710 67447142 100 611.4M items/s 32_112_112_3_8_24_3_3_2_1_cpu1
// NEW 1-thread
Benchmark Time(ns) CPU(ns) Iterations
------------------------------------------------------------------------
BM_ConvFloatDepthwiseBkFilterCPU1_conv0 59981294 59569328 100 2.7G items/s 32_112_112_3_8_24_3_3_1_2_cpu1
BM_ConvFloatDepthwiseBkFilterCPU1_conv1 165631344 165250674 100 2.6G items/s 32_112_112_64_1_64_3_3_1_2_cpu1
BM_ConvFloatDepthwiseBkFilterCPU1_conv2 76910026 76705735 100 2.8G items/s 32_56_56_128_1_128_3_3_1_2_cpu1
BM_ConvFloatDepthwiseBkFilterCPU1_conv3 21491439 21375872 100 2.5G items/s 32_56_56_128_1_128_3_3_2_2_cpu1
BM_ConvFloatDepthwiseBkFilterCPU1_conv4 18677714 18587209 100 2.9G items/s 32_28_28_128_1_128_3_3_1_2_cpu1
BM_ConvFloatDepthwiseBkFilterCPU1_conv5 23474236 23377934 100 2.3G items/s 32_14_14_512_1_512_3_3_1_2_cpu1
BM_ConvFloatDepthwiseBkFilterCPU1_conv6 17066829 16982791 100 1.6G items/s 32_7_7_1024_1_1024_3_3_1_2_cpu1
BM_ConvFloatDepthwiseBkFilterCPU1_conv7 14822571 14744419 100 2.7G items/s 32_112_112_3_8_24_3_3_2_2_cpu1
BM_ConvFloatDepthwiseBkFilterCPU1_conv8 14325480 14254559 100 2.8G items/s 32_112_112_3_8_24_3_3_2_1_cpu1
// NEW 4-thread
Benchmark Time(ns) CPU(ns) Iterations
------------------------------------------------------------------------
BM_ConvFloatDepthwiseBkFilterCPU4_conv0 21809044 69141049 100 7.4G items/s 32_112_112_3_8_24_3_3_1_2_cpu4
BM_ConvFloatDepthwiseBkFilterCPU4_conv1 57704422 192333505 100 7.5G items/s 32_112_112_64_1_64_3_3_1_2_cpu4
BM_ConvFloatDepthwiseBkFilterCPU4_conv2 29761264 91848609 100 7.2G items/s 32_56_56_128_1_128_3_3_1_2_cpu4
BM_ConvFloatDepthwiseBkFilterCPU4_conv3 9075773 26429821 100 5.9G items/s 32_56_56_128_1_128_3_3_2_2_cpu4
BM_ConvFloatDepthwiseBkFilterCPU4_conv4 7276754 22100190 100 7.4G items/s 32_28_28_128_1_128_3_3_1_2_cpu4
BM_ConvFloatDepthwiseBkFilterCPU4_conv5 6756189 24510067 100 8.0G items/s 32_14_14_512_1_512_3_3_1_2_cpu4
BM_ConvFloatDepthwiseBkFilterCPU4_conv6 4837993 17723279 142 5.6G items/s 32_7_7_1024_1_1024_3_3_1_2_cpu4
BM_ConvFloatDepthwiseBkFilterCPU4_conv7 6676347 19935585 100 6.0G items/s 32_112_112_3_8_24_3_3_2_2_cpu4
BM_ConvFloatDepthwiseBkFilterCPU4_conv8 5951583 17181079 100 6.8G items/s 32_112_112_3_8_24_3_3_2_1_cpu4
TESTED:
- passed opensource_build
- passed unit tests
Change: 120125325
Diffstat (limited to 'tensorflow/core/kernels/nn_ops_test.cc')
-rw-r--r-- | tensorflow/core/kernels/nn_ops_test.cc | 46 |
1 files changed, 43 insertions, 3 deletions
diff --git a/tensorflow/core/kernels/nn_ops_test.cc b/tensorflow/core/kernels/nn_ops_test.cc index 2e358af2ff..bfedbc40fe 100644 --- a/tensorflow/core/kernels/nn_ops_test.cc +++ b/tensorflow/core/kernels/nn_ops_test.cc @@ -472,6 +472,10 @@ static void BM_ConvFloatDepthwise(int iters, int batch, int rows, int cols, SetConstSizesOp("input_sizes", std::vector<int32>({batch, rows, cols, in_depth}), graph.add_node()); + SetConstSizesOp("filter_sizes", + std::vector<int32>( + {filter_rows, filter_cols, in_depth, depth_multiplier}), + graph.add_node()); // Now add the convolution op NodeDef* conv = graph.add_node(); @@ -495,7 +499,14 @@ static void BM_ConvFloatDepthwise(int iters, int batch, int rows, int cols, .Finalize(conv)); break; case DEPTHWISE_CONV_OP_BACKPROP_FILTER: - // TODO(andydavis,jmchen) Implement backprop filter. + TF_CHECK_OK(NodeDefBuilder("depthwise_conv2d_backprop_filter", + "DepthwiseConv2dNativeBackpropFilter") + .Input("input", 0, DT_FLOAT) + .Input("filter_sizes", 0, DT_INT32) + .Input("output_backprop", 0, DT_FLOAT) + .Attr("strides", {1, stride, stride, 1}) + .Attr("padding", padding == VALID ? 
"VALID" : "SAME") + .Finalize(conv)); break; } Graph* g = new Graph(OpRegistry::Global()); @@ -579,9 +590,33 @@ BM_ConvFloatDepthwiseFwd(32, 112, 112, 3, 8, 24, 3, 3, 2, VALID, conv8); strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", DM, "_", OD, "_", \ KR, "_", KC, "_", STR, "_", PAD, "_gpu")); \ } \ + static void BM_ConvFloatDepthwiseBkFilterCPU1_##LABEL(int iters) { \ + BM_ConvFloatDepthwise( \ + iters, BS, R, C, ID, DM, OD, KR, KC, \ + DEPTHWISE_CONV_OP_BACKPROP_FILTER, 1, STR, PAD, false, \ + strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", DM, "_", OD, "_", \ + KR, "_", KC, "_", STR, "_", PAD, "_cpu1")); \ + } \ + static void BM_ConvFloatDepthwiseBkFilterCPU4_##LABEL(int iters) { \ + BM_ConvFloatDepthwise( \ + iters, BS, R, C, ID, DM, OD, KR, KC, \ + DEPTHWISE_CONV_OP_BACKPROP_FILTER, 4, STR, PAD, false, \ + strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", DM, "_", OD, "_", \ + KR, "_", KC, "_", STR, "_", PAD, "_cpu4")); \ + } \ + static void BM_ConvFloatDepthwiseBkFilterGPU_##LABEL(int iters) { \ + BM_ConvFloatDepthwise( \ + iters, BS, R, C, ID, DM, OD, KR, KC, \ + DEPTHWISE_CONV_OP_BACKPROP_FILTER, 4, STR, PAD, true, \ + strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", DM, "_", OD, "_", \ + KR, "_", KC, "_", STR, "_", PAD, "_gpu")); \ + } \ BENCHMARK(BM_ConvFloatDepthwiseBkInCPU1_##LABEL); \ BENCHMARK(BM_ConvFloatDepthwiseBkInCPU4_##LABEL); \ - BENCHMARK(BM_ConvFloatDepthwiseBkInGPU_##LABEL); + BENCHMARK(BM_ConvFloatDepthwiseBkInGPU_##LABEL); \ + BENCHMARK(BM_ConvFloatDepthwiseBkFilterCPU1_##LABEL); \ + BENCHMARK(BM_ConvFloatDepthwiseBkFilterCPU4_##LABEL); \ + BENCHMARK(BM_ConvFloatDepthwiseBkFilterGPU_##LABEL) // The configurations below are mostly from mobilenet models. 
BM_ConvFloatDepthwiseBk(32, 112, 112, 3, 8, 24, 3, 3, 1, SAME, conv0); @@ -873,9 +908,12 @@ MaxPooling Op static void BM_MaxPool(int iters, int batch_size, int rows, int cols, int depth, int kernel_rows, int kernel_cols, int stride, Padding padding, int num_threads, const string& label) { + testing::UseRealTime(); // TODO XXX tensorflow::testing::StopTiming(); + SessionOptions options; + options.config.set_intra_op_parallelism_threads(num_threads); std::unique_ptr<Device> device( - DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0")); + DeviceFactory::NewDevice("CPU", options, "/job:a/replica:0/task:0")); thread::ThreadPool threadpool(Env::Default(), "test", num_threads); EigenThreadPoolWrapper wrapper(&threadpool); @@ -943,6 +981,7 @@ static void BM_MaxPool(int iters, int batch_size, int rows, int cols, int depth, BM_MaxPool_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_##PT##_##TH) // Labels are taken from the 2014-July-24 version of imagenet +/* TODO XXX BM_MaxPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, VALID, 1, "maxpool0_VALID"); BM_MaxPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, VALID, 1, "maxpool1_VALID"); BM_MaxPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, VALID, 1, "maxpool4_VALID"); @@ -951,6 +990,7 @@ BM_MaxPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, SAME, 1, "maxpool0_SAME"); BM_MaxPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, SAME, 1, "maxpool1_SAME"); BM_MaxPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, SAME, 1, "maxpool4_SAME"); BM_MaxPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, SAME, 1, "maxpool10_SAME"); +*/ BM_MaxPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, VALID, 4, "maxpool0_VALID"); BM_MaxPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, VALID, 4, "maxpool1_VALID"); BM_MaxPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, VALID, 4, "maxpool4_VALID"); |