Diffstat (limited to 'third_party')
-rw-r--r--  third_party/eigen3/Eigen/src/Core/util/Macros.h                               12
-rw-r--r--  third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h     22
-rw-r--r--  third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h         16
3 files changed, 28 insertions, 22 deletions
diff --git a/third_party/eigen3/Eigen/src/Core/util/Macros.h b/third_party/eigen3/Eigen/src/Core/util/Macros.h
index 729a451324..b531327afb 100644
--- a/third_party/eigen3/Eigen/src/Core/util/Macros.h
+++ b/third_party/eigen3/Eigen/src/Core/util/Macros.h
@@ -296,10 +296,14 @@
// 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always
// enable alignment, but it can be a cause of problems on some platforms, so we just disable it in
// certain common platform (compiler+architecture combinations) to avoid these problems.
-// Only static alignment is really problematic (relies on nonstandard compiler extensions that don't
-// work everywhere, for example don't work on GCC/ARM), try to keep heap alignment even
-// when we have to disable static alignment.
-#if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64)
+// Only static alignment is really problematic (relies on nonstandard compiler extensions),
+// try to keep heap alignment even when we have to disable static alignment.
+#if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64)
+#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
+#elif EIGEN_ARCH_ARM_OR_ARM64 && EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(4, 6)
+// Old versions of GCC on ARM, at least 4.4, were once seen to have buggy static alignment support.
+// Not sure which version fixed it, hopefully it doesn't affect 4.7, which is still somewhat in use.
+// 4.8 and newer seem definitely unaffected.
#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
#else
#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0
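
Note on the hunk above (not part of the patch): "static alignment" here means aligning stack or global objects through nonstandard compiler extensions, which is roughly what Eigen's EIGEN_ALIGN_TO_BOUNDARY macro expands to on GCC. A minimal sketch of what the macro guards against, assuming a GCC-style toolchain:

// Illustrative sketch only. On GCC the extension is __attribute__((aligned(n)));
// the patch keeps it enabled on ARM/ARM64 except for GCC <= 4.6, where it was buggy.
#if defined(__GNUC__)
  #define ALIGN16 __attribute__((aligned(16)))   // nonstandard, but widely supported
#elif defined(_MSC_VER)
  #define ALIGN16 __declspec(align(16))
#else
  #define ALIGN16                                 // no static alignment available
#endif

void kernel_with_aligned_stack_buffer() {
  ALIGN16 float buf[4] = {0.f, 1.f, 2.f, 3.f};   // 16-byte-aligned stack array,
                                                  // safe for aligned SIMD loads
  (void)buf;
}
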
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
index b6eeb73832..a62682c728 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
@@ -760,11 +760,15 @@ struct GpuDevice {
GpuDevice()
: stream_(perftools::gputools::MachineManager::singleton()->stream_for_device(0)),
allocator_(nullptr),
- stream_exec_(stream_->parent()) {}
+ stream_exec_(stream_->parent()),
+ device_descr_(&(stream_exec_->GetDeviceDescription())) {}
GpuDevice(perftools::gputools::Stream* stream,
const Allocator* alloc = nullptr)
- : stream_(stream), allocator_(alloc), stream_exec_(stream_->parent()) { }
+ : stream_(stream),
+ allocator_(alloc),
+ stream_exec_(stream_->parent()),
+ device_descr_(&(stream_exec_->GetDeviceDescription())) {}
EIGEN_STRONG_INLINE perftools::gputools::Stream* stream() const {
return stream_;
@@ -873,28 +877,25 @@ struct GpuDevice {
stream_->BlockHostUntilDone();
}
- // A gpu::DeviceDescription is cached inside a StreamExecutor, so these calls
- // aren't expensive/wasteful.
EIGEN_DEVICE_FUNC inline int getNumCudaMultiProcessors() const {
- return stream_exec_->GetDeviceDescription().core_count();
+ return device_descr_->core_count();
}
EIGEN_DEVICE_FUNC inline int maxCudaThreadsPerBlock() const {
- return stream_exec_->GetDeviceDescription().threads_per_block_limit();
+ return device_descr_->threads_per_block_limit();
}
EIGEN_DEVICE_FUNC inline int maxCudaThreadsPerMultiProcessor() const {
- return stream_exec_->GetDeviceDescription().threads_per_core_limit();
+ return device_descr_->threads_per_core_limit();
}
EIGEN_DEVICE_FUNC inline int sharedMemPerBlock() const {
- return stream_exec_->GetDeviceDescription().shared_memory_per_block();
+ return device_descr_->shared_memory_per_block();
}
EIGEN_DEVICE_FUNC inline int majorDeviceVersion() const {
int major, minor;
- if (stream_exec_->GetDeviceDescription().cuda_compute_capability(&major,
- &minor)) {
+ if (device_descr_->cuda_compute_capability(&major, &minor)) {
return major;
} else {
return 0;
@@ -906,6 +907,7 @@ struct GpuDevice {
private:
perftools::gputools::Stream* stream_;
perftools::gputools::StreamExecutor* stream_exec_;
+ const perftools::gputools::DeviceDescription* device_descr_;
const Allocator* allocator_;
};
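
Note on the change above (not part of the patch): the constructor now caches a pointer to the device description once, so the accessors no longer go through stream_exec_->GetDeviceDescription() on every query. A minimal sketch of the same caching pattern, with hypothetical names rather than the real StreamExecutor API, assuming the description outlives the wrapper:

// Hypothetical types for illustration only.
struct DeviceInfo {
  int core_count;
  int threads_per_block_limit;
};

class DeviceWrapper {
 public:
  // Fetch/cache the description once at construction.
  explicit DeviceWrapper(const DeviceInfo* info) : info_(info) {}

  int numMultiProcessors() const { return info_->core_count; }              // no repeated lookup
  int maxThreadsPerBlock() const { return info_->threads_per_block_limit; } // just a pointer read

 private:
  const DeviceInfo* info_;  // cached, not owned
};
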
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
index 3e90b08c99..6d63b23b2f 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
@@ -115,7 +115,7 @@ namespace {
}
-template <typename T>
+template <typename T, bool div_gt_one = false>
struct TensorIntDivisor {
public:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() {
@@ -166,7 +166,7 @@ struct TensorIntDivisor {
// Optimized version for signed 32 bit integers.
// Derived from Hacker's Delight.
template <>
-class TensorIntDivisor<int32_t> {
+class TensorIntDivisor<int32_t, true> {
public:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() {
magic = 0;
@@ -225,15 +225,15 @@ private:
};
-template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor<T>& divisor) {
+template <typename T, bool div_gt_one>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor<T, div_gt_one>& divisor) {
return divisor.divide(numerator);
}
#else
// Reverse to the old code since gcudacc doesn't support the code above.
-template <typename T>
+template <typename T, bool div_gt_one = false>
struct TensorIntDivisor {
public:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() {
@@ -285,7 +285,7 @@ struct TensorIntDivisor {
// Optimized version for signed 32 bit integers.
// Derived from Hacker's Delight.
template <>
-class TensorIntDivisor<int> {
+class TensorIntDivisor<int, true> {
public:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() {
magic = 0;
@@ -344,8 +344,8 @@ private:
};
-template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor<T>& divisor) {
+template <typename T, bool div_gt_one>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor<T, div_gt_one>& divisor) {
return divisor.divide(numerator);
}
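
Note on the change above (not part of the patch): the new div_gt_one template parameter lets callers that know the divisor is strictly greater than one opt into the optimized 32-bit specialization, while the default false keeps the generic path. A hedged usage sketch, assuming the declarations from this header are in scope (upstream Eigen places them in Eigen::internal; the exact namespace in this copy may differ):

#include <cstdint>

int32_t divide_many(const int32_t* vals, int n, int32_t d /* caller guarantees d > 1 */) {
  // Selects the Hacker's Delight-style int32 specialization because div_gt_one = true.
  Eigen::internal::TensorIntDivisor<int32_t, true> fast_div(d);
  int32_t sum = 0;
  for (int i = 0; i < n; ++i) {
    sum += vals[i] / fast_div;   // dispatches to the overloaded operator/ defined above
  }
  return sum;
}
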