diff options
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 30 |
1 files changed, 20 insertions, 10 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index a38af84d5..36718e26f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -109,6 +109,24 @@ struct TensorEvaluator const Device& m_device; }; +namespace { +template <typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T loadConstant(const T* address) { + return *address; +} +// Use the texture cache on CUDA devices whenever possible +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 +template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float loadConstant(const float* address) { + return __ldg(address); +} +template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double loadConstant(const double* address) { + return __ldg(address); +} +#endif +} + // Default evaluator for rvalues template<typename Derived, typename Device> @@ -150,11 +168,7 @@ struct TensorEvaluator<const Derived, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { eigen_assert(m_data); -#ifdef __CUDA_ARCH__ - return __ldg(m_data+index); -#else - return m_data[index]; -#endif + return loadConstant(m_data+index); } template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -167,11 +181,7 @@ struct TensorEvaluator<const Derived, Device> eigen_assert(m_data); const Index index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_dims.IndexOfColMajor(coords) : m_dims.IndexOfRowMajor(coords); -#ifdef __CUDA_ARCH__ - return __ldg(m_data+index); -#else - return m_data[index]; -#endif + return loadConstant(m_data+index); } EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; } |