aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/core/kernels/nn_ops_test.cc
diff options
context:
space:
mode:
authorGravatar A. Unique TensorFlower <nobody@tensorflow.org>2016-04-18 07:27:10 -0800
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2016-04-18 08:33:06 -0700
commit3f27cffdae7cf9b1404d664b28be42b933908b0a (patch)
treec3f1bdfff301b22e27bd6beeab5ae1381e905826 /tensorflow/core/kernels/nn_ops_test.cc
parent0dfcea4c1611452dfa5d23e3845b1d8e7001bd50 (diff)
Optimized implementation of depthwise conv backprop filter for CPU.
// OLD
Benchmark Time(ns) CPU(ns) Iterations
------------------------------------------------------------------------
BM_ConvFloatDepthwiseBkFilterCPU1_conv0 281152179 280588497 100 588.2M items/s 32_112_112_3_8_24_3_3_1_2_cpu1
BM_ConvFloatDepthwiseBkFilterCPU1_conv1 760242956 758694909 100 580.1M items/s 32_112_112_64_1_64_3_3_1_2_cpu1
BM_ConvFloatDepthwiseBkFilterCPU1_conv2 383554418 382741182 100 574.9M items/s 32_56_56_128_1_128_3_3_1_2_cpu1
BM_ConvFloatDepthwiseBkFilterCPU1_conv3 98924384 98665676 100 557.2M items/s 32_56_56_128_1_128_3_3_2_2_cpu1
BM_ConvFloatDepthwiseBkFilterCPU1_conv4 94237506 94005920 100 585.0M items/s 32_28_28_128_1_128_3_3_1_2_cpu1
BM_ConvFloatDepthwiseBkFilterCPU1_conv5 106895864 106648144 100 515.7M items/s 32_14_14_512_1_512_3_3_1_2_cpu1
BM_ConvFloatDepthwiseBkFilterCPU1_conv6 69247718 69078442 100 398.0M items/s 32_7_7_1024_1_1024_3_3_1_2_cpu1
BM_ConvFloatDepthwiseBkFilterCPU1_conv7 70304661 70126053 100 588.1M items/s 32_112_112_3_8_24_3_3_2_2_cpu1
BM_ConvFloatDepthwiseBkFilterCPU1_conv8 67619710 67447142 100 611.4M items/s 32_112_112_3_8_24_3_3_2_1_cpu1

// NEW 1-thread
Benchmark Time(ns) CPU(ns) Iterations
------------------------------------------------------------------------
BM_ConvFloatDepthwiseBkFilterCPU1_conv0 59981294 59569328 100 2.7G items/s 32_112_112_3_8_24_3_3_1_2_cpu1
BM_ConvFloatDepthwiseBkFilterCPU1_conv1 165631344 165250674 100 2.6G items/s 32_112_112_64_1_64_3_3_1_2_cpu1
BM_ConvFloatDepthwiseBkFilterCPU1_conv2 76910026 76705735 100 2.8G items/s 32_56_56_128_1_128_3_3_1_2_cpu1
BM_ConvFloatDepthwiseBkFilterCPU1_conv3 21491439 21375872 100 2.5G items/s 32_56_56_128_1_128_3_3_2_2_cpu1
BM_ConvFloatDepthwiseBkFilterCPU1_conv4 18677714 18587209 100 2.9G items/s 32_28_28_128_1_128_3_3_1_2_cpu1
BM_ConvFloatDepthwiseBkFilterCPU1_conv5 23474236 23377934 100 2.3G items/s 32_14_14_512_1_512_3_3_1_2_cpu1
BM_ConvFloatDepthwiseBkFilterCPU1_conv6 17066829 16982791 100 1.6G items/s 32_7_7_1024_1_1024_3_3_1_2_cpu1
BM_ConvFloatDepthwiseBkFilterCPU1_conv7 14822571 14744419 100 2.7G items/s 32_112_112_3_8_24_3_3_2_2_cpu1
BM_ConvFloatDepthwiseBkFilterCPU1_conv8 14325480 14254559 100 2.8G items/s 32_112_112_3_8_24_3_3_2_1_cpu1

// NEW 4-threads
Benchmark Time(ns) CPU(ns) Iterations
------------------------------------------------------------------------
BM_ConvFloatDepthwiseBkFilterCPU4_conv0 21809044 69141049 100 7.4G items/s 32_112_112_3_8_24_3_3_1_2_cpu4
BM_ConvFloatDepthwiseBkFilterCPU4_conv1 57704422 192333505 100 7.5G items/s 32_112_112_64_1_64_3_3_1_2_cpu4
BM_ConvFloatDepthwiseBkFilterCPU4_conv2 29761264 91848609 100 7.2G items/s 32_56_56_128_1_128_3_3_1_2_cpu4
BM_ConvFloatDepthwiseBkFilterCPU4_conv3 9075773 26429821 100 5.9G items/s 32_56_56_128_1_128_3_3_2_2_cpu4
BM_ConvFloatDepthwiseBkFilterCPU4_conv4 7276754 22100190 100 7.4G items/s 32_28_28_128_1_128_3_3_1_2_cpu4
BM_ConvFloatDepthwiseBkFilterCPU4_conv5 6756189 24510067 100 8.0G items/s 32_14_14_512_1_512_3_3_1_2_cpu4
BM_ConvFloatDepthwiseBkFilterCPU4_conv6 4837993 17723279 142 5.6G items/s 32_7_7_1024_1_1024_3_3_1_2_cpu4
BM_ConvFloatDepthwiseBkFilterCPU4_conv7 6676347 19935585 100 6.0G items/s 32_112_112_3_8_24_3_3_2_2_cpu4
BM_ConvFloatDepthwiseBkFilterCPU4_conv8 5951583 17181079 100 6.8G items/s 32_112_112_3_8_24_3_3_2_1_cpu4

TESTED:
- passed opensource_build
- passed unit tests
Change: 120125325
Diffstat (limited to 'tensorflow/core/kernels/nn_ops_test.cc')
-rw-r--r--tensorflow/core/kernels/nn_ops_test.cc46
1 files changed, 43 insertions, 3 deletions
diff --git a/tensorflow/core/kernels/nn_ops_test.cc b/tensorflow/core/kernels/nn_ops_test.cc
index 2e358af2ff..bfedbc40fe 100644
--- a/tensorflow/core/kernels/nn_ops_test.cc
+++ b/tensorflow/core/kernels/nn_ops_test.cc
@@ -472,6 +472,10 @@ static void BM_ConvFloatDepthwise(int iters, int batch, int rows, int cols,
SetConstSizesOp("input_sizes",
std::vector<int32>({batch, rows, cols, in_depth}),
graph.add_node());
+ SetConstSizesOp("filter_sizes",
+ std::vector<int32>(
+ {filter_rows, filter_cols, in_depth, depth_multiplier}),
+ graph.add_node());
// Now add the convolution op
NodeDef* conv = graph.add_node();
@@ -495,7 +499,14 @@ static void BM_ConvFloatDepthwise(int iters, int batch, int rows, int cols,
.Finalize(conv));
break;
case DEPTHWISE_CONV_OP_BACKPROP_FILTER:
- // TODO(andydavis,jmchen) Implement backprop filter.
+ TF_CHECK_OK(NodeDefBuilder("depthwise_conv2d_backprop_filter",
+ "DepthwiseConv2dNativeBackpropFilter")
+ .Input("input", 0, DT_FLOAT)
+ .Input("filter_sizes", 0, DT_INT32)
+ .Input("output_backprop", 0, DT_FLOAT)
+ .Attr("strides", {1, stride, stride, 1})
+ .Attr("padding", padding == VALID ? "VALID" : "SAME")
+ .Finalize(conv));
break;
}
Graph* g = new Graph(OpRegistry::Global());
@@ -579,9 +590,33 @@ BM_ConvFloatDepthwiseFwd(32, 112, 112, 3, 8, 24, 3, 3, 2, VALID, conv8);
strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", DM, "_", OD, "_", \
KR, "_", KC, "_", STR, "_", PAD, "_gpu")); \
} \
+ static void BM_ConvFloatDepthwiseBkFilterCPU1_##LABEL(int iters) { \
+ BM_ConvFloatDepthwise( \
+ iters, BS, R, C, ID, DM, OD, KR, KC, \
+ DEPTHWISE_CONV_OP_BACKPROP_FILTER, 1, STR, PAD, false, \
+ strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", DM, "_", OD, "_", \
+ KR, "_", KC, "_", STR, "_", PAD, "_cpu1")); \
+ } \
+ static void BM_ConvFloatDepthwiseBkFilterCPU4_##LABEL(int iters) { \
+ BM_ConvFloatDepthwise( \
+ iters, BS, R, C, ID, DM, OD, KR, KC, \
+ DEPTHWISE_CONV_OP_BACKPROP_FILTER, 4, STR, PAD, false, \
+ strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", DM, "_", OD, "_", \
+ KR, "_", KC, "_", STR, "_", PAD, "_cpu4")); \
+ } \
+ static void BM_ConvFloatDepthwiseBkFilterGPU_##LABEL(int iters) { \
+ BM_ConvFloatDepthwise( \
+ iters, BS, R, C, ID, DM, OD, KR, KC, \
+ DEPTHWISE_CONV_OP_BACKPROP_FILTER, 4, STR, PAD, true, \
+ strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", DM, "_", OD, "_", \
+ KR, "_", KC, "_", STR, "_", PAD, "_gpu")); \
+ } \
BENCHMARK(BM_ConvFloatDepthwiseBkInCPU1_##LABEL); \
BENCHMARK(BM_ConvFloatDepthwiseBkInCPU4_##LABEL); \
- BENCHMARK(BM_ConvFloatDepthwiseBkInGPU_##LABEL);
+ BENCHMARK(BM_ConvFloatDepthwiseBkInGPU_##LABEL); \
+ BENCHMARK(BM_ConvFloatDepthwiseBkFilterCPU1_##LABEL); \
+ BENCHMARK(BM_ConvFloatDepthwiseBkFilterCPU4_##LABEL); \
+ BENCHMARK(BM_ConvFloatDepthwiseBkFilterGPU_##LABEL)
// The configurations below are mostly from mobilenet models.
BM_ConvFloatDepthwiseBk(32, 112, 112, 3, 8, 24, 3, 3, 1, SAME, conv0);
@@ -873,9 +908,12 @@ MaxPooling Op
static void BM_MaxPool(int iters, int batch_size, int rows, int cols, int depth,
int kernel_rows, int kernel_cols, int stride,
Padding padding, int num_threads, const string& label) {
+ testing::UseRealTime(); // TODO XXX
tensorflow::testing::StopTiming();
+ SessionOptions options;
+ options.config.set_intra_op_parallelism_threads(num_threads);
std::unique_ptr<Device> device(
- DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
+ DeviceFactory::NewDevice("CPU", options, "/job:a/replica:0/task:0"));
thread::ThreadPool threadpool(Env::Default(), "test", num_threads);
EigenThreadPoolWrapper wrapper(&threadpool);
@@ -943,6 +981,7 @@ static void BM_MaxPool(int iters, int batch_size, int rows, int cols, int depth,
BM_MaxPool_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_##PT##_##TH)
// Labels are taken from the 2014-July-24 version of imagenet
+/* TODO XXX
BM_MaxPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, VALID, 1, "maxpool0_VALID");
BM_MaxPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, VALID, 1, "maxpool1_VALID");
BM_MaxPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, VALID, 1, "maxpool4_VALID");
@@ -951,6 +990,7 @@ BM_MaxPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, SAME, 1, "maxpool0_SAME");
BM_MaxPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, SAME, 1, "maxpool1_SAME");
BM_MaxPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, SAME, 1, "maxpool4_SAME");
BM_MaxPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, SAME, 1, "maxpool10_SAME");
+*/
BM_MaxPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, VALID, 4, "maxpool0_VALID");
BM_MaxPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, VALID, 4, "maxpool1_VALID");
BM_MaxPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, VALID, 4, "maxpool4_VALID");