Added support for fast integer divisions by a constant

Sped up tensor slicing by a factor of 3 by using these fast integer divisions.
author: Benoit Steiner <benoit.steiner.goog@gmail.com> 2014-08-14 22:13:21 -0700
committer: Benoit Steiner <benoit.steiner.goog@gmail.com> 2014-08-14 22:13:21 -0700
commit: 33c702c79fe227a5b22229c26af276d359a6cb1d (patch)
tree: 86ca1888d05abb165b3ddbe892cfff7b5464573d /unsupported
parent: 756292f8aa124c842d1e6d9beeb0c416c0d9a7f3 (diff)
5 files changed, 177 insertions, 10 deletions
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 0775d440a..82552c3c2 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -34,6 +34,7 @@
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h"
 
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorBase.h"
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
new file mode 100644
index 000000000..cf97031be
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
@@ -0,0 +1,82 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H
+#define EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H
+
+
+namespace Eigen {
+
+/** \internal
+  *
+  * \class TensorIntDiv
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Fast integer division by a constant.
+  *
+  * See the paper from Granlund and Montgomery for explanation.
+  *   (at http://dx.doi.org/10.1145/773473.178249)
+  *
+  * \sa Tensor
+  */
+
+namespace internal {
+
+template <typename T>
+struct TensorIntDivisor {
+ public:
+  TensorIntDivisor() {
+    multiplier = 0;
+    shift1 = 0;
+    shift2 = 0;
+  }
+
+  // Must have 1 <= divider <= 2^31-1
+  TensorIntDivisor(const T divider) {
+    static const int N = 32;
+    eigen_assert(divider > 0);
+    eigen_assert(divider <= (1<<(N-1)) - 1);
+
+    // fast ln2
+    const int leading_zeros = __builtin_clz(divider);
+    const int l = N - (leading_zeros+1);
+
+    multiplier = (static_cast<uint64_t>(1) << (N+l)) / divider - (static_cast<uint64_t>(1) << N) + 1;
+    shift1 = (std::min)(1, l);
+    shift2 = (std::max)(0, l-1);
+  }
+
+  // Must have 0 <= numerator <= 2^32-1
+  T divide(const T numerator) const {
+    static const int N = 32;
+    eigen_assert(numerator >= 0);
+    eigen_assert(numerator <= (1ull<<N) - 1);
+
+    uint32_t t1 = (multiplier * numerator) >> 32;
+    uint32_t t = (static_cast<uint32_t>(numerator) - t1) >> shift1;
+    return (t1 + t) >> shift2;
+  }
+
+ private:
+  uint64_t multiplier;
+  int32_t shift1;
+  int32_t shift2;
+};
+
+
+template <typename T>
+static T operator / (const T& numerator, const TensorIntDivisor<T>& divisor) {
+  return divisor.divide(numerator);
+}
+
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index 2b1b503cf..ca3735d64 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -305,8 +305,10 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
     for (int i = 0; i < NumDims; ++i) {
       if (i > 0) {
         m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1];
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
       } else {
         m_outputStrides[0] = 1;
+        m_fastOutputStrides[0] = 1;
       }
     }
   }
@@ -331,7 +333,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
   {
     Index inputIndex = 0;
     for (int i = NumDims - 1; i > 0; --i) {
-      const Index idx = index / m_outputStrides[i];
+      const Index idx = index / m_fastOutputStrides[i];
       inputIndex += (idx + m_offsets[i]) * m_inputStrides[i];
       index -= idx * m_outputStrides[i];
     }
@@ -349,8 +351,8 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
     Index inputIndices[] = {0, 0};
     Index indices[] = {index, index + packetSize - 1};
     for (int i = NumDims - 1; i > 0; --i) {
-      const Index idx0 = indices[0] / m_outputStrides[i];
-      const Index idx1 = indices[1] / m_outputStrides[i];
+      const Index idx0 = indices[0] / m_fastOutputStrides[i];
+      const Index idx1 = indices[1] / m_fastOutputStrides[i];
       inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i];
       inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i];
       indices[0] -= idx0 * m_outputStrides[i];
@@ -379,6 +381,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
  private:
   Dimensions m_dimensions;
   array<Index, NumDims> m_outputStrides;
+  array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
   array<Index, NumDims> m_inputStrides;
   const StartIndices m_offsets;
   TensorEvaluator<ArgType, Device> m_impl;
@@ -418,9 +421,11 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
     for (int i = 0; i < NumDims; ++i) {
       if (i > 0) {
         m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1];
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
       } else {
         m_outputStrides[0] = 1;
-      }
+        m_fastOutputStrides[0] = 1;
+     }
     }
   }
 
@@ -444,7 +449,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
   {
     Index inputIndex = 0;
     for (int i = NumDims - 1; i > 0; --i) {
-      const Index idx = index / m_outputStrides[i];
+      const Index idx = index / m_fastOutputStrides[i];
       inputIndex += (idx + m_offsets[i]) * m_inputStrides[i];
       index -= idx * m_outputStrides[i];
     }
@@ -460,8 +465,8 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
     Index inputIndices[] = {0, 0};
     Index indices[] = {index, index + packetSize - 1};
     for (int i = NumDims - 1; i > 0; --i) {
-      const Index idx0 = indices[0] / m_outputStrides[i];
-      const Index idx1 = indices[1] / m_outputStrides[i];
+      const Index idx0 = indices[0] / m_fastOutputStrides[i];
+      const Index idx1 = indices[1] / m_fastOutputStrides[i];
       inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i];
       inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i];
       indices[0] -= idx0 * m_outputStrides[i];
@@ -489,7 +494,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
   {
     Index inputIndex = 0;
     for (int i = NumDims - 1; i > 0; --i) {
-      const Index idx = index / m_outputStrides[i];
+      const Index idx = index / m_fastOutputStrides[i];
       inputIndex += (idx + m_offsets[i]) * m_inputStrides[i];
       index -= idx * m_outputStrides[i];
     }
@@ -504,8 +509,8 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
     Index inputIndices[] = {0, 0};
     Index indices[] = {index, index + packetSize - 1};
     for (int i = NumDims - 1; i > 0; --i) {
-      const Index idx0 = indices[0] / m_outputStrides[i];
-      const Index idx1 = indices[1] / m_outputStrides[i];
+      const Index idx0 = indices[0] / m_fastOutputStrides[i];
+      const Index idx1 = indices[1] / m_fastOutputStrides[i];
       inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i];
       inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i];
       indices[0] -= idx0 * m_outputStrides[i];
@@ -532,6 +537,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
  private:
   Dimensions m_dimensions;
   array<Index, NumDims> m_outputStrides;
+  array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
   array<Index, NumDims> m_inputStrides;
   const StartIndices m_offsets;
   TensorEvaluator<ArgType, Device> m_impl;
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 520935105..e2204827e 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -106,6 +106,7 @@ if(EIGEN_TEST_CXX11)
   ei_add_test(cxx11_tensor_convolution "-std=c++0x")
   ei_add_test(cxx11_tensor_expr "-std=c++0x")
 #  ei_add_test(cxx11_tensor_fixed_size "-std=c++0x")
+  ei_add_test(cxx11_tensor_intdiv "-std=c++0x")
   ei_add_test(cxx11_tensor_lvalue "-std=c++0x")
   ei_add_test(cxx11_tensor_map "-std=c++0x")
 #  ei_add_test(cxx11_tensor_morphing "-std=c++0x")
diff --git a/unsupported/test/cxx11_tensor_intdiv.cpp b/unsupported/test/cxx11_tensor_intdiv.cpp
new file mode 100644
index 000000000..a510dc695
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_intdiv.cpp
@@ -0,0 +1,77 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+
+static void test_signed_32bit()
+{
+  for (int32_t i = 1; i < 25000; ++i) {
+    const Eigen::internal::TensorIntDivisor<int32_t> div(i);
+
+    for (int32_t j = 0; j < 25000; ++j) {
+      const int32_t fast_div = j / div;
+      const int32_t slow_div = j / i;
+      VERIFY_IS_EQUAL(fast_div, slow_div);
+    }
+  }
+}
+
+
+static void test_unsigned_32bit()
+{
+  for (uint32_t i = 1; i < 25000; ++i) {
+    const Eigen::internal::TensorIntDivisor<uint32_t> div(i);
+
+    for (uint32_t j = 0; j < 25000; ++j) {
+      const uint32_t fast_div = j / div;
+      const uint32_t slow_div = j / i;
+      VERIFY_IS_EQUAL(fast_div, slow_div);
+    }
+  }
+}
+
+
+static void test_signed_64bit()
+{
+  for (int64_t i = 2; i < 25000; ++i) {
+    const Eigen::internal::TensorIntDivisor<int64_t> div(i);
+
+    for (int64_t j = 0; j < 25000; ++j) {
+      const int64_t fast_div = j / div;
+      const int64_t slow_div = j / i;
+      VERIFY_IS_EQUAL(fast_div, slow_div);
+    }
+  }
+}
+
+
+static void test_unsigned_64bit()
+{
+  for (uint64_t i = 2; i < 25000; ++i) {
+    const Eigen::internal::TensorIntDivisor<uint64_t> div(i);
+
+    for (uint64_t j = 0; j < 25000; ++j) {
+      const uint64_t fast_div = j / div;
+      const uint64_t slow_div = j / i;
+      VERIFY_IS_EQUAL(fast_div, slow_div);
+    }
+  }
+}
+
+
+void test_cxx11_tensor_intdiv()
+{
+  CALL_SUBTEST(test_signed_32bit());
+  CALL_SUBTEST(test_unsigned_32bit());
+  CALL_SUBTEST(test_signed_64bit());
+  CALL_SUBTEST(test_unsigned_64bit());
+}
author	Benoit Steiner <benoit.steiner.goog@gmail.com>	2014-08-14 22:13:21 -0700
committer	Benoit Steiner <benoit.steiner.goog@gmail.com>	2014-08-14 22:13:21 -0700
commit	33c702c79fe227a5b22229c26af276d359a6cb1d (patch)
tree	86ca1888d05abb165b3ddbe892cfff7b5464573d /unsupported
parent	756292f8aa124c842d1e6d9beeb0c416c0d9a7f3 (diff)