diff options
author | Justin Lebar <jlebar@google.com> | 2017-02-07 10:56:37 -0800 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2017-02-07 11:17:29 -0800 |
commit | 7135d08d4e6067865d7b5f2907013c960a12ae4f (patch) | |
tree | 1bb78bc46cb823fe40121f7b77da10020520042a /tensorflow/compiler/xla/reference_util.cc | |
parent | 73899707ae5f1b35c88bcb5898d8bb05e7b69715 (diff) |
[XLA] Speed up ReferenceUtil::ConvArray4DGeneralDimensionsDilated.
Avoid 64-bit divides where possible. Per Agner Fog's manuals, 64-bit
idiv is at least 4x slower than 32-bit idiv. perf confirms that the
idivs are on the critical path.
http://www.agner.org/optimize/instruction_tables.pdf
Change: 146806085
Diffstat (limited to 'tensorflow/compiler/xla/reference_util.cc')
-rw-r--r-- | tensorflow/compiler/xla/reference_util.cc | 34 |
1 file changed, 27 insertions, 7 deletions
diff --git a/tensorflow/compiler/xla/reference_util.cc b/tensorflow/compiler/xla/reference_util.cc index 142d2c2163..bff59454e7 100644 --- a/tensorflow/compiler/xla/reference_util.cc +++ b/tensorflow/compiler/xla/reference_util.cc @@ -335,18 +335,38 @@ ReferenceUtil::ConvArray4DGeneralDimensionsDilated( result_dimensions[2], result_dimensions[3]); result->Fill(0.0); + const auto is_int32 = [](int64 x) { + return x >= std::numeric_limits<int32>::min() && + x <= std::numeric_limits<int32>::max(); + }; + + // 64-bit idiv/mod are much more expensive than 32-bit idiv/imod (at + // least on x86-64), so we avoid them where possible. + const auto fast_idiv64 = [&](int64 a, int64 b) { + if (is_int32(a) && is_int32(b)) { + return static_cast<int64>(static_cast<int32>(a) / static_cast<int32>(b)); + } + return a / b; + }; + const auto fast_imod64 = [&](int64 a, int64 b) { + if (is_int32(a) && is_int32(b)) { + return static_cast<int64>(static_cast<int32>(a) % static_cast<int32>(b)); + } + return a % b; + }; + // Lambda to access the lhs operand at the given 4D index. 
const auto lhs_element = [&](int64 batch, int64 feature, int64 height, int64 width) { - if (height % dy != 0 || width % dx != 0) { + if (fast_imod64(height, dy) != 0 || fast_imod64(width, dx) != 0) { return 0.0f; } std::array<int64, 4> index; index[dnums.batch_dimension()] = batch; index[dnums.feature_dimension()] = feature; - index[dnums.spatial_dimensions(0)] = height / dy; - index[dnums.spatial_dimensions(1)] = width / dx; + index[dnums.spatial_dimensions(0)] = fast_idiv64(height, dy); + index[dnums.spatial_dimensions(1)] = fast_idiv64(width, dx); return lhs(index[0], index[1], index[2], index[3]); }; @@ -354,13 +374,13 @@ ReferenceUtil::ConvArray4DGeneralDimensionsDilated( const auto rhs_element = [&](int64 kernel_output_feature, int64 kernel_input_feature, int64 height, int64 width) { - CHECK_EQ(height % dky, 0); - CHECK_EQ(width % dkx, 0); + CHECK_EQ(fast_imod64(height, dky), 0); + CHECK_EQ(fast_imod64(width, dkx), 0); std::array<int64, 4> index; index[dnums.kernel_output_feature_dimension()] = kernel_output_feature; index[dnums.kernel_input_feature_dimension()] = kernel_input_feature; - index[dnums.kernel_spatial_dimensions(0)] = height / dky; - index[dnums.kernel_spatial_dimensions(1)] = width / dkx; + index[dnums.kernel_spatial_dimensions(0)] = fast_idiv64(height, dky); + index[dnums.kernel_spatial_dimensions(1)] = fast_idiv64(width, dkx); return rhs(index[0], index[1], index[2], index[3]); }; |