path: root/tensorflow/compiler/xla/reference_util.cc
author     Justin Lebar <jlebar@google.com>  2017-02-07 10:56:37 -0800
committer  TensorFlower Gardener <gardener@tensorflow.org>  2017-02-07 11:17:29 -0800
commit  7135d08d4e6067865d7b5f2907013c960a12ae4f (patch)
tree    1bb78bc46cb823fe40121f7b77da10020520042a /tensorflow/compiler/xla/reference_util.cc
parent  73899707ae5f1b35c88bcb5898d8bb05e7b69715 (diff)
[XLA] Speed up ReferenceUtil::ConvArray4DGeneralDimensionsDilated.
Avoid 64-bit divides where possible. Per Agner Fog's manuals, 64-bit idiv is
at least 4x slower than 32-bit idiv. perf confirms that the idivs are on the
critical path.

http://www.agner.org/optimize/instruction_tables.pdf

Change: 146806085
Diffstat (limited to 'tensorflow/compiler/xla/reference_util.cc')
-rw-r--r--  tensorflow/compiler/xla/reference_util.cc  34
1 file changed, 27 insertions, 7 deletions
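As background for the hunks below, here is a minimal standalone sketch (not part of the commit) of the 32-bit fast path it introduces: when both operands fit in an int32, do the division in 32 bits and let the quotient widen back to int64; otherwise fall back to the plain 64-bit divide. The FitsInInt32 and FastIdiv64 names are illustrative; in the commit itself the same logic lives in the is_int32, fast_idiv64, and fast_imod64 lambdas.

#include <cstdint>
#include <cstdio>
#include <limits>

// Returns true if x is representable as a 32-bit signed integer.
inline bool FitsInInt32(int64_t x) {
  return x >= std::numeric_limits<int32_t>::min() &&
         x <= std::numeric_limits<int32_t>::max();
}

// Divides in 32 bits when both operands fit, falling back to a 64-bit divide
// otherwise; the 32-bit quotient widens back to int64_t implicitly.
inline int64_t FastIdiv64(int64_t a, int64_t b) {
  if (FitsInInt32(a) && FitsInInt32(b)) {
    return static_cast<int32_t>(a) / static_cast<int32_t>(b);
  }
  return a / b;
}

int main() {
  // Both operands fit in int32, so the 32-bit path is taken; prints 142857.
  std::printf("%lld\n", static_cast<long long>(FastIdiv64(1000000, 7)));
}

The modulus helper follows the same pattern with % in place of /.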
diff --git a/tensorflow/compiler/xla/reference_util.cc b/tensorflow/compiler/xla/reference_util.cc
index 142d2c2163..bff59454e7 100644
--- a/tensorflow/compiler/xla/reference_util.cc
+++ b/tensorflow/compiler/xla/reference_util.cc
@@ -335,18 +335,38 @@ ReferenceUtil::ConvArray4DGeneralDimensionsDilated(
result_dimensions[2], result_dimensions[3]);
result->Fill(0.0);
+ const auto is_int32 = [](int64 x) {
+ return x >= std::numeric_limits<int32>::min() &&
+ x <= std::numeric_limits<int32>::max();
+ };
+
+ // 64-bit idiv/mod are much more expensive than 32-bit idiv/mod (at least
+ // on x86-64), so we avoid them where possible.
+ const auto fast_idiv64 = [&](int64 a, int64 b) {
+ if (is_int32(a) && is_int32(b)) {
+ return static_cast<int64>(static_cast<int32>(a) / static_cast<int32>(b));
+ }
+ return a / b;
+ };
+ const auto fast_imod64 = [&](int64 a, int64 b) {
+ if (is_int32(a) && is_int32(b)) {
+ return static_cast<int64>(static_cast<int32>(a) % static_cast<int32>(b));
+ }
+ return a % b;
+ };
+
// Lambda to access the lhs operand at the given 4D index.
const auto lhs_element = [&](int64 batch, int64 feature, int64 height,
int64 width) {
- if (height % dy != 0 || width % dx != 0) {
+ if (fast_imod64(height, dy) != 0 || fast_imod64(width, dx) != 0) {
return 0.0f;
}
std::array<int64, 4> index;
index[dnums.batch_dimension()] = batch;
index[dnums.feature_dimension()] = feature;
- index[dnums.spatial_dimensions(0)] = height / dy;
- index[dnums.spatial_dimensions(1)] = width / dx;
+ index[dnums.spatial_dimensions(0)] = fast_idiv64(height, dy);
+ index[dnums.spatial_dimensions(1)] = fast_idiv64(width, dx);
return lhs(index[0], index[1], index[2], index[3]);
};
@@ -354,13 +374,13 @@ ReferenceUtil::ConvArray4DGeneralDimensionsDilated(
const auto rhs_element = [&](int64 kernel_output_feature,
int64 kernel_input_feature, int64 height,
int64 width) {
- CHECK_EQ(height % dky, 0);
- CHECK_EQ(width % dkx, 0);
+ CHECK_EQ(fast_imod64(height, dky), 0);
+ CHECK_EQ(fast_imod64(width, dkx), 0);
std::array<int64, 4> index;
index[dnums.kernel_output_feature_dimension()] = kernel_output_feature;
index[dnums.kernel_input_feature_dimension()] = kernel_input_feature;
- index[dnums.kernel_spatial_dimensions(0)] = height / dky;
- index[dnums.kernel_spatial_dimensions(1)] = width / dkx;
+ index[dnums.kernel_spatial_dimensions(0)] = fast_idiv64(height, dky);
+ index[dnums.kernel_spatial_dimensions(1)] = fast_idiv64(width, dkx);
return rhs(index[0], index[1], index[2], index[3]);
};
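The "at least 4x slower" figure in the commit message comes from Agner Fog's instruction tables. As a rough, hypothetical illustration only (not part of the commit), a dependent-divide micro-benchmark along the following lines can show that gap on x86-64; the TimeDivides name and the constants are made up for the sketch.

#include <chrono>
#include <cstdint>
#include <cstdio>

// Times `iters` divisions of a running accumulator by `divisor`. Each divide
// consumes the previous result, so the divides stay on the critical path,
// much like the index math in the convolution reference loop.
template <typename T>
double TimeDivides(T seed, T divisor, int iters) {
  T acc = seed;
  auto start = std::chrono::steady_clock::now();
  for (int i = 0; i < iters; ++i) {
    acc = acc / divisor + seed;
  }
  auto stop = std::chrono::steady_clock::now();
  // Print the accumulator so the loop is not optimized away.
  std::printf("acc=%lld\n", static_cast<long long>(acc));
  return std::chrono::duration<double>(stop - start).count();
}

int main(int argc, char**) {
  const int kIters = 100000000;
  // Derive the divisor from argc so the compiler cannot turn the divide into
  // a multiply by a compile-time constant.
  int64_t d = argc + 2;  // typically 3
  double t32 = TimeDivides<int32_t>(1 << 30, static_cast<int32_t>(d), kIters);
  double t64 = TimeDivides<int64_t>(int64_t{1} << 60, d, kIters);
  std::printf("32-bit: %.3fs  64-bit: %.3fs  ratio: %.2fx\n", t32, t64,
              t64 / t32);
}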