author     Vijay Vasudevan <vrv@google.com>    2015-12-02 15:05:37 -0800
committer  Vijay Vasudevan <vrv@google.com>    2015-12-02 15:05:37 -0800
commit     bb7a7a8858dc18ba733ed64e0733e27a4224ece8 (patch)
tree       26dc98ddbbb220fd008de2925f482edf00a8c6bf /third_party
parent     bf6b536bde7d8060c489b51fedb58968b8cbfd7c (diff)
TensorFlow: upstream changes from Eigen to fix the build after changes in the last commit.
Diffstat (limited to 'third_party')
8 files changed, 312 insertions, 33 deletions
diff --git a/third_party/eigen3/Eigen/src/Core/functors/UnaryFunctors.h b/third_party/eigen3/Eigen/src/Core/functors/UnaryFunctors.h
index 2a22e5bc19..6feb229339 100644
--- a/third_party/eigen3/Eigen/src/Core/functors/UnaryFunctors.h
+++ b/third_party/eigen3/Eigen/src/Core/functors/UnaryFunctors.h
@@ -486,6 +486,39 @@ struct functor_traits<scalar_cube_op<Scalar> > {
   enum { Cost = 2*NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul };
 };
 
+/** \internal
+  * \brief Template functor to compute the signum of a scalar
+  * \sa class CwiseUnaryOp, Cwise::sign()
+  */
+template<typename Scalar,bool iscpx=(NumTraits<Scalar>::IsComplex!=0) > struct scalar_sign_op;
+template<typename Scalar>
+struct scalar_sign_op<Scalar,false> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const
+  {
+    return Scalar( (a>Scalar(0)) - (a<Scalar(0)) );
+  }
+};
+template<typename Scalar>
+struct scalar_sign_op<Scalar,true> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const
+  {
+    typename NumTraits<Scalar>::Real aa = std::abs(a);
+    return (aa==0) ? Scalar(0) : (a/aa);
+  }
+};
+template<typename Scalar>
+struct functor_traits<scalar_sign_op<Scalar> >
+{ enum {
+    Cost =
+        NumTraits<Scalar>::IsComplex
+        ? ( 8*NumTraits<Scalar>::MulCost ) // roughly
+        : ( 3*NumTraits<Scalar>::AddCost),
+    PacketAccess = false,
+  };
+};
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/Tensor b/third_party/eigen3/unsupported/Eigen/CXX11/Tensor
index 3904c72eef..2113b3a00a 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/Tensor
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/Tensor
@@ -59,7 +59,7 @@
 #include <curand_kernel.h>
 #endif  // defined(__CUDACC__)
 #else
-#include "perftools/gputools/executor/gcuda.h"
+#include "platforms/gpus/gcudacc/runtime/gcudacc_runtime.h"
 #ifdef __CUDACC__
 #include "third_party/gpus/cuda/curand_device/curand_kernel.h"
 #endif  // defined(__CUDACC__)
@@ -88,6 +88,7 @@
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h"
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index 35ebca151b..720c3b6a82 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -81,6 +81,12 @@ class TensorBase<Derived, ReadOnlyAccessors>
     }
 
     EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sign_op<Scalar>, const Derived>
+    sign() const {
+      return unaryExpr(internal::scalar_sign_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived>
     sqrt() const {
       return unaryExpr(internal::scalar_sqrt_op<Scalar>());
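For reference, the arithmetic the new functor performs, in isolation: `(a > 0) - (a < 0)` yields -1, 0, or +1 for real scalars, and `a / |a|` yields a unit-modulus complex number (or 0 at the origin). A minimal standalone sketch of both cases in plain C++, separate from the Eigen functor itself:

```cpp
#include <complex>
#include <iostream>

// Real case: (a > 0) - (a < 0) maps to -1, 0, or +1.
double sign_real(double a) { return double((a > 0.0) - (a < 0.0)); }

// Complex case: a / |a| is the unit-modulus direction of a, or 0 at the origin.
std::complex<double> sign_complex(const std::complex<double>& a) {
  const double aa = std::abs(a);
  return (aa == 0.0) ? std::complex<double>(0) : a / aa;
}

int main() {
  std::cout << sign_real(-3.5) << ' ' << sign_real(0.0) << ' '
            << sign_real(2.0) << '\n';            // -1 0 1
  std::cout << sign_complex({3.0, 4.0}) << '\n';  // (0.6,0.8)
}
```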
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
index a62682c728..48859fe5fa 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
@@ -757,11 +757,17 @@ static inline void setCudaSharedMemConfig(cudaSharedMemConfig cache_config) {
 }
 
 struct GpuDevice {
-  GpuDevice()
-      : stream_(perftools::gputools::MachineManager::singleton()->stream_for_device(0)),
-        allocator_(nullptr),
-        stream_exec_(stream_->parent()),
-        device_descr_(&(stream_exec_->GetDeviceDescription())) {}
+  // Default constructor: Get [cached] device 0 and its default stream.
+  GpuDevice() : allocator_(nullptr) {
+    perftools::gputools::Platform* platform =
+        perftools::gputools::MultiPlatformManager::PlatformWithName("cuda")
+            .ValueOrDie();
+    stream_exec_ = platform->ExecutorForDevice(0).ValueOrDie();
+    // TODO(rspringer): If we ever pull from an executor aside from 0, this will
+    // need to be preceded by a call to SetDevice(N);
+    stream_ = platforms::gpus::gcudacc::GetDefaultStream();
+    device_descr_ = &(stream_exec_->GetDeviceDescription());
+  }
 
   GpuDevice(perftools::gputools::Stream* stream,
             const Allocator* alloc = nullptr)
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 863c28ab43..b7cea143ff 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -418,11 +418,13 @@ inline void TensorExecutor<Expression, GpuDevice, false, Tileable>::run(
   TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
   const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
   if (needs_assign) {
-    const int num_blocks = device.getNumCudaMultiProcessors() *
-                           device.maxCudaThreadsPerMultiProcessor() /
-                           device.maxCudaThreadsPerBlock();
     const int block_size = device.maxCudaThreadsPerBlock();
+    const int max_blocks = device.getNumCudaMultiProcessors() *
+                           device.maxCudaThreadsPerMultiProcessor() / block_size;
     const Index size = array_prod(evaluator.dimensions());
+    // Create at least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
+    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
+
     LAUNCH_CUDA_KERNEL(
         (EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>),
@@ -438,11 +440,13 @@ inline void TensorExecutor<Expression, GpuDevice, true, Tileable>::run(
   TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
   const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
   if (needs_assign) {
-    const int num_blocks = device.getNumCudaMultiProcessors() *
-                           device.maxCudaThreadsPerMultiProcessor() /
-                           device.maxCudaThreadsPerBlock();
     const int block_size = device.maxCudaThreadsPerBlock();
+    const int max_blocks = device.getNumCudaMultiProcessors() *
+                           device.maxCudaThreadsPerMultiProcessor() / block_size;
     const Index size = array_prod(evaluator.dimensions());
+    // Create at least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
+    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
+
     LAUNCH_CUDA_KERNEL(
         (EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>),
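The launch-configuration change in the two hunks above clamps the grid size from both sides: never more blocks than the device can keep resident, and never fewer than one, so a size-0 tensor still produces a valid kernel launch. A host-side sketch of the same arithmetic, with illustrative (not queried) device limits:

```cpp
#include <algorithm>
#include <cassert>

// Mirrors the num_blocks computation in TensorExecutor::run above.
// The device limits passed in are illustrative, not queried from CUDA.
int compute_num_blocks(long long size, int block_size, int multiprocessors,
                       int max_threads_per_multiprocessor) {
  const long long max_blocks =
      static_cast<long long>(multiprocessors) *
      max_threads_per_multiprocessor / block_size;
  const long long wanted = (size + block_size - 1) / block_size;  // ceil
  // At least one block, even for empty tensors.
  return static_cast<int>(std::max(1LL, std::min(max_blocks, wanted)));
}

int main() {
  assert(compute_num_blocks(0, 1024, 24, 2048) == 1);         // empty tensor
  assert(compute_num_blocks(1 << 20, 1024, 24, 2048) == 48);  // capped at max_blocks
  assert(compute_num_blocks(3000, 1024, 24, 2048) == 3);      // ceil(3000/1024)
}
```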
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
index 6d63b23b2f..8330f65dde 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
@@ -59,13 +59,8 @@ namespace {
   template <typename T>
   struct DividerTraits {
-#if defined(__SIZEOF_INT128__) && !defined(__CUDACC__)
     typedef typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type type;
     static const int N = sizeof(T) * 8;
-#else
-    typedef uint32_t type;
-    static const int N = 32;
-#endif
   };
 
@@ -78,40 +73,39 @@ namespace {
 #endif
   }
 
+  template <typename T>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
 #if defined(__CUDA_ARCH__)
-  template <typename T>
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
     return __umul64hi(a, b);
-  }
-#else
-  template <typename T>
-  EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
-#if defined(__SIZEOF_INT128__) && !defined(__CUDACC__)
+#elif defined(__SIZEOF_INT128__)
     __uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b);
     return static_cast<uint64_t>(v >> 64);
 #else
-    EIGEN_STATIC_ASSERT(sizeof(T) == 4, YOU_MADE_A_PROGRAMMING_MISTAKE);
-    return (a * b) >> 32;
+    return (TensorUInt128<static_val<0>, uint64_t>(a) * TensorUInt128<static_val<0>, uint64_t>(b)).upper();
 #endif
   }
-#endif
 
   template <int N, typename T>
   struct DividerHelper {
-    static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t computeMultiplier (const int log_div, const T divider) {
+    static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t computeMultiplier(const int log_div, const T divider) {
       EIGEN_STATIC_ASSERT(N == 32, YOU_MADE_A_PROGRAMMING_MISTAKE);
       return (static_cast<uint64_t>(1) << (N+log_div)) / divider - (static_cast<uint64_t>(1) << N) + 1;
     }
   };
 
-#if defined(__SIZEOF_INT128__) && !defined(__CUDACC__)
   template <typename T>
   struct DividerHelper<64, T> {
-    static EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
+    static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
+#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__)
       return ((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
+#else
+      const uint64_t shift = 1ULL << log_div;
+      TensorUInt128<uint64_t, uint64_t> result = (TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider) - TensorUInt128<static_val<1>, static_val<0> >(1, 0) + TensorUInt128<static_val<0>, static_val<1> >(1));
+      return static_cast<uint64_t>(result);
+#endif
     }
   };
-#endif
 
 }
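`computeMultiplier` follows the standard round-up magic-number scheme: for a divisor `d` with `log_div = ceil(log2(d))`, the full (N+1)-bit multiplier is `floor(2^(N+log_div) / d) + 1`, of which only the low N bits are stored (hence the `- 2^N` in the formula). Division then reduces to a high multiply via `muluh` plus shifts. A self-contained check of the N == 32 case, as a sketch; it widens to 64 bits for the final add, whereas production code typically uses a sub-shift-add trick to stay in 32 bits:

```cpp
#include <cassert>
#include <cstdint>

// High 32 bits of a 32x32 multiply -- the role muluh plays for N == 32.
static uint32_t mulhi32(uint32_t a, uint32_t b) {
  return static_cast<uint32_t>((static_cast<uint64_t>(a) * b) >> 32);
}

int main() {
  const uint32_t divider = 7;
  int log_div = 0;
  while ((1ULL << log_div) < divider) ++log_div;  // ceil(log2(7)) == 3

  // Same formula as DividerHelper<32, T>::computeMultiplier above:
  // low 32 bits of floor(2^(32+log_div) / divider) + 1.
  const uint32_t multiplier = static_cast<uint32_t>(
      (static_cast<uint64_t>(1) << (32 + log_div)) / divider -
      (static_cast<uint64_t>(1) << 32) + 1);

  for (uint32_t x : {0u, 1u, 6u, 7u, 100u, 0xFFFFFFFFu}) {
    // Full multiplier is multiplier + 2^32, so the quotient is
    // (mulhi(multiplier, x) + x) >> log_div; widen so the add cannot overflow.
    const uint32_t q = static_cast<uint32_t>(
        (static_cast<uint64_t>(mulhi32(multiplier, x)) + x) >> log_div);
    assert(q == x / divider);
  }
}
```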
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index 2e59a147bc..efa2f358db 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -141,6 +141,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
         m_unshuffledInputStrides[i] =
             m_unshuffledInputStrides[i - 1] * input_dims[i - 1];
         m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
       }
     } else {
       m_unshuffledInputStrides[NumDims - 1] = 1;
@@ -149,6 +150,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
         m_unshuffledInputStrides[i] =
             m_unshuffledInputStrides[i + 1] * input_dims[i + 1];
         m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
       }
     }
 
@@ -319,14 +321,14 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       for (int i = NumDims - 1; i > 0; --i) {
-        const Index idx = index / m_outputStrides[i];
+        const Index idx = index / m_fastOutputStrides[i];
         inputIndex += idx * m_inputStrides[i];
         index -= idx * m_outputStrides[i];
       }
       return inputIndex + index * m_inputStrides[0];
     } else {
       for (int i = 0; i < NumDims - 1; ++i) {
-        const Index idx = index / m_outputStrides[i];
+        const Index idx = index / m_fastOutputStrides[i];
         inputIndex += idx * m_inputStrides[i];
         index -= idx * m_outputStrides[i];
       }
@@ -338,6 +340,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
   Dimensions m_dimensions;
   array<Index, NumDims> m_inverseShuffle;
   array<Index, NumDims> m_outputStrides;
+  array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
   array<Index, NumDims> m_inputStrides;
   array<Index, NumDims> m_unshuffledInputStrides;
   TensorEvaluator<ArgType, Device> m_impl;
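The index-remapping loop above recovers one output coordinate per dimension by dividing the flat index by the corresponding output stride; precomputing a `TensorIntDivisor` per stride moves the expensive division setup out of this per-coefficient hot path. The remapping itself, sketched with plain division and an illustrative column-major 2x3x4 shape:

```cpp
#include <array>
#include <cassert>

int main() {
  // Column-major 2x3x4 output: strides {1, 2, 6}. The input strides are what
  // a hypothetical shuffle would produce; both shapes are illustrative.
  const std::array<long, 3> out_strides = {1, 2, 6};
  const std::array<long, 3> in_strides = {12, 4, 1};

  long index = 17;  // flat index into the shuffled output
  long input_index = 0;
  for (int i = 2; i > 0; --i) {
    // The patch replaces this division with `index / m_fastOutputStrides[i]`.
    const long idx = index / out_strides[i];
    input_index += idx * in_strides[i];
    index -= idx * out_strides[i];
  }
  input_index += index * in_strides[0];

  // Output coords of 17 are (1, 2, 2); remapped through the input strides:
  assert(input_index == 1 * 12 + 2 * 4 + 2 * 1);  // == 22
}
```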
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
new file mode 100644
index 0000000000..44aff63702
--- /dev/null
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
@@ -0,0 +1,232 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_UINT128_H
+#define EIGEN_CXX11_TENSOR_TENSOR_UINT128_H
+
+namespace Eigen {
+namespace internal {
+
+template <uint64_t n>
+struct static_val {
+  static const uint64_t value = n;
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator uint64_t() const { return n; }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val() { }
+  template <typename T>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) {
+    eigen_assert(v == n);
+  }
+};
+
+
+template <typename HIGH = uint64_t, typename LOW = uint64_t>
+struct TensorUInt128
+{
+  HIGH high;
+  LOW low;
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+  TensorUInt128(int x) : high(0), low(x) {
+    eigen_assert(x >= 0);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+  TensorUInt128(int64_t x) : high(0), low(x) {
+    eigen_assert(x >= 0);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+  TensorUInt128(uint64_t x) : high(0), low(x) { }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+  TensorUInt128(uint64_t y, uint64_t x) : high(y), low(x) { }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator LOW() const {
+    return low;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LOW lower() const {
+    return low;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HIGH upper() const {
+    return high;
+  }
+};
+
+
+template <typename HL, typename LL, typename HR, typename LR>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+static bool operator == (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+{
+  return (lhs.high == rhs.high) & (lhs.low == rhs.low);
+}
+
+template <typename HL, typename LL, typename HR, typename LR>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+static bool operator != (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+{
+  return (lhs.high != rhs.high) | (lhs.low != rhs.low);
+}
+
+template <typename HL, typename LL, typename HR, typename LR>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+static bool operator >= (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+{
+  if (lhs.high != rhs.high) {
+    return lhs.high > rhs.high;
+  }
+  return lhs.low >= rhs.low;
+}
+
+template <typename HL, typename LL, typename HR, typename LR>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+static bool operator < (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+{
+  if (lhs.high != rhs.high) {
+    return lhs.high < rhs.high;
+  }
+  return lhs.low < rhs.low;
+}
+
+template <typename HL, typename LL, typename HR, typename LR>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+static TensorUInt128<uint64_t, uint64_t> operator + (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+{
+  TensorUInt128<uint64_t, uint64_t> result(lhs.high + rhs.high, lhs.low + rhs.low);
+  if (result.low < rhs.low) {
+    result.high += 1;
+  }
+  return result;
+}
+
+template <typename HL, typename LL, typename HR, typename LR>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+static TensorUInt128<uint64_t, uint64_t> operator - (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+{
+  TensorUInt128<uint64_t, uint64_t> result(lhs.high - rhs.high, lhs.low - rhs.low);
+  if (result.low > lhs.low) {
+    result.high -= 1;
+  }
+  return result;
+}
+
+
+template <typename HL, typename LL, typename HR, typename LR>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+static TensorUInt128<uint64_t, uint64_t> operator * (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+{
+  // Split each 128-bit integer into 4 32-bit integers, and then do the
+  // multiplications by hand as follow:
+  //   lhs      a  b  c  d
+  //   rhs      e  f  g  h
+  //           -----------
+  //           ah bh ch dh
+  //        bg cg dg
+  //     cf df
+  //  de
+  // The result is stored in 2 64bit integers, high and low.
+
+  const uint64_t LOW = 0x00000000FFFFFFFFLL;
+  const uint64_t HIGH = 0xFFFFFFFF00000000LL;
+
+  uint64_t d = lhs.low & LOW;
+  uint64_t c = (lhs.low & HIGH) >> 32LL;
+  uint64_t b = lhs.high & LOW;
+  uint64_t a = (lhs.high & HIGH) >> 32LL;
+
+  uint64_t h = rhs.low & LOW;
+  uint64_t g = (rhs.low & HIGH) >> 32LL;
+  uint64_t f = rhs.high & LOW;
+  uint64_t e = (rhs.high & HIGH) >> 32LL;
+
+  // Compute the low 32 bits of low
+  uint64_t acc = d * h;
+  uint64_t low = acc & LOW;
+  // Compute the high 32 bits of low. Add a carry every time we wrap around
+  acc >>= 32LL;
+  uint64_t carry = 0;
+  uint64_t acc2 = acc + c * h;
+  if (acc2 < acc) {
+    carry++;
+  }
+  acc = acc2 + d * g;
+  if (acc < acc2) {
+    carry++;
+  }
+  low |= (acc << 32LL);
+
+  // Carry forward the high bits of acc to initiate the computation of the
+  // low 32 bits of high
+  acc2 = (acc >> 32LL) | (carry << 32LL);
+  carry = 0;
+
+  acc = acc2 + b * h;
+  if (acc < acc2) {
+    carry++;
+  }
+  acc2 = acc + c * g;
+  if (acc2 < acc) {
+    carry++;
+  }
+  acc = acc2 + d * f;
+  if (acc < acc2) {
+    carry++;
+  }
+  uint64_t high = acc & LOW;
+
+  // Start to compute the high 32 bits of high.
+  acc2 = (acc >> 32LL) | (carry << 32LL);
+
+  acc = acc2 + a * h;
+  acc2 = acc + b * g;
+  acc = acc2 + c * f;
+  acc2 = acc + d * e;
+  high |= (acc2 << 32LL);
+
+  return TensorUInt128<uint64_t, uint64_t>(high, low);
+}
+
+template <typename HL, typename LL, typename HR, typename LR>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+static TensorUInt128<uint64_t, uint64_t> operator / (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+{
+  if (rhs == TensorUInt128<static_val<0>, static_val<1> >(1)) {
+    return TensorUInt128<uint64_t, uint64_t>(lhs.high, lhs.low);
+  } else if (lhs < rhs) {
+    return TensorUInt128<uint64_t, uint64_t>(0);
+  } else {
+    // calculate the biggest power of 2 times rhs that's less than or equal to lhs
+    TensorUInt128<uint64_t, uint64_t> power2(1);
+    TensorUInt128<uint64_t, uint64_t> d(rhs);
+    TensorUInt128<uint64_t, uint64_t> tmp(lhs - d);
+    while (lhs >= d) {
+      tmp = tmp - d;
+      d = d + d;
+      power2 = power2 + power2;
+    }
+
+    tmp = TensorUInt128<uint64_t, uint64_t>(lhs.high, lhs.low);
+    TensorUInt128<uint64_t, uint64_t> result(0);
+    while (power2 != TensorUInt128<static_val<0>, static_val<0> >(0)) {
+      if (tmp >= d) {
+        tmp = tmp - d;
+        result = result + power2;
+      }
+      // Shift right
+      power2 = TensorUInt128<uint64_t, uint64_t>(power2.high >> 1, (power2.low >> 1) | (power2.high << 63));
+      d = TensorUInt128<uint64_t, uint64_t>(d.high >> 1, (d.low >> 1) | (d.high << 63));
+    }
+
+    return result;
+  }
+}
+
+
+}  // namespace internal
+}  // namespace Eigen
+
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_UINT128_H
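The "a b c d / e f g h" limb decomposition in `operator*` above is the standard schoolbook 64x64 -> 128 multiply. A standalone cross-check of that scheme against the compiler's `__uint128_t` (a sketch, assuming a GCC/Clang-style compiler that provides `__uint128_t`; it is not the Eigen code itself):

```cpp
#include <cassert>
#include <cstdint>

// High 64 bits of a 64x64 multiply via 32-bit limbs, carried the same way as
// TensorUInt128's operator*.
static uint64_t mulhi64(uint64_t a, uint64_t b) {
  const uint64_t a_lo = a & 0xFFFFFFFFu, a_hi = a >> 32;
  const uint64_t b_lo = b & 0xFFFFFFFFu, b_hi = b >> 32;
  const uint64_t lo_lo = a_lo * b_lo;
  const uint64_t hi_lo = a_hi * b_lo;
  const uint64_t lo_hi = a_lo * b_hi;
  const uint64_t hi_hi = a_hi * b_hi;
  // Middle column: high half of lo_lo plus both cross products; this sum
  // cannot overflow 64 bits.
  const uint64_t cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFFu) + lo_hi;
  return hi_hi + (hi_lo >> 32) + (cross >> 32);
}

int main() {
  const uint64_t samples[] = {0u, 1u, 0xFFFFFFFFull, 0x0123456789ABCDEFull,
                              0xFFFFFFFFFFFFFFFFull};
  for (uint64_t a : samples) {
    for (uint64_t b : samples) {
      const __uint128_t full = static_cast<__uint128_t>(a) * b;
      assert(mulhi64(a, b) == static_cast<uint64_t>(full >> 64));
    }
  }
}
```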