author Ville Kallioniemi <ville.kallioniemi@gmail.com> 2016-02-01 19:32:31 -0700
committer Ville Kallioniemi <ville.kallioniemi@gmail.com> 2016-02-01 19:32:31 -0700
commit f0fdefa96fdbdafe0daaa47a2dd54b9e77cf9716 (patch)
tree e175666422c11be478eaaf68c934e1fc913fec6f /unsupported
parent 02db1228ed9ca3728ae0685a5e1602fe7299ae50 (diff)
parent 64ce78c2ec52aa2fd2e408c7c4160b06e8fc1a03 (diff)
Rebase to latest.
Diffstat (limited to 'unsupported')
-rw-r--r-- unsupported/Eigen/AlignedVector3 | 2
-rw-r--r-- unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h | 39
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 6
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 8
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h | 8
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h | 5
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h | 8
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 8
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 12
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 2
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 4
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h | 2
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 12
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h | 6
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h | 4
-rw-r--r-- unsupported/Eigen/src/Splines/SplineFitting.h | 2
-rw-r--r-- unsupported/test/CMakeLists.txt | 28
-rw-r--r-- unsupported/test/cxx11_tensor_argmax_cuda.cu (renamed from unsupported/test/cxx11_tensor_argmax_cuda.cpp) | 22
-rw-r--r-- unsupported/test/cxx11_tensor_contract_cuda.cu (renamed from unsupported/test/cxx11_tensor_contract_cuda.cpp) | 108
-rw-r--r-- unsupported/test/cxx11_tensor_cuda.cu (renamed from unsupported/test/cxx11_tensor_cuda.cpp) | 158
-rw-r--r-- unsupported/test/cxx11_tensor_device.cu (renamed from unsupported/test/cxx11_tensor_device.cpp) | 22
-rw-r--r-- unsupported/test/cxx11_tensor_empty.cpp | 40
-rw-r--r-- unsupported/test/cxx11_tensor_random_cuda.cu (renamed from unsupported/test/cxx11_tensor_random_cuda.cpp) | 0
-rw-r--r-- unsupported/test/cxx11_tensor_reduction_cuda.cu (renamed from unsupported/test/cxx11_tensor_reduction_cuda.cpp) | 7
-rw-r--r-- unsupported/test/cxx11_tensor_thread_pool.cpp | 85
25 files changed, 403 insertions, 195 deletions
diff --git a/unsupported/Eigen/AlignedVector3 b/unsupported/Eigen/AlignedVector3
index f5c40a189..135eec572 100644
--- a/unsupported/Eigen/AlignedVector3
+++ b/unsupported/Eigen/AlignedVector3
@@ -188,7 +188,7 @@ template<typename _Scalar> class AlignedVector3
}
template<typename Derived>
- inline bool isApprox(const MatrixBase<Derived>& other, RealScalar eps=NumTraits<Scalar>::dummy_precision()) const
+ inline bool isApprox(const MatrixBase<Derived>& other, const RealScalar& eps=NumTraits<Scalar>::dummy_precision()) const
{
return m_coeffs.template head<3>().isApprox(other,eps);
}
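
Note on the hunk above: passing eps by const reference matches DenseBase::isApprox and avoids copying the scalar; call sites are unchanged. A minimal usage sketch (values are hypothetical, not from this patch):

    #include <unsupported/Eigen/AlignedVector3>

    int main() {
      Eigen::AlignedVector3<float> a(1.0f, 2.0f, 3.0f);
      Eigen::Vector3f b(1.0f, 2.0f, 3.0f);
      // eps now binds by const reference; the call looks the same as before.
      return a.isApprox(b, 1e-5f) ? 0 : 1;
    }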
diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h
index 456b34d0b..89aeb03e7 100644
--- a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h
+++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h
@@ -25,6 +25,16 @@ template <typename T, size_t n> class array {
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { return values[index]; }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE T& front() { return values[0]; }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const T& front() const { return values[0]; }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE T& back() { return values[n-1]; }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const T& back() const { return values[n-1]; }
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
static std::size_t size() { return n; }
@@ -123,13 +133,33 @@ template <typename T> class array<T, 0> {
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE T& operator[] (size_t) {
eigen_assert(false && "Can't index a zero size array");
- return *static_cast<T*>(NULL);
+ return dummy;
}
-
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const T& operator[] (size_t) const {
eigen_assert(false && "Can't index a zero size array");
- return *static_cast<const T*>(NULL);
+ return dummy;
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE T& front() {
+ eigen_assert(false && "Can't index a zero size array");
+ return dummy;
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const T& front() const {
+ eigen_assert(false && "Can't index a zero size array");
+ return dummy;
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE T& back() {
+ eigen_assert(false && "Can't index a zero size array");
+ return dummy;
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const T& back() const {
+ eigen_assert(false && "Can't index a zero size array");
+ return dummy;
}
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::size_t size() { return 0; }
@@ -142,6 +172,9 @@ template <typename T> class array<T, 0> {
eigen_assert(l.size() == 0);
}
#endif
+
+ private:
+ T dummy;
};
namespace internal {
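
The emulated array class above stands in for std::array in CUDA device code; adding front()/back() (including the asserting zero-size overloads, which return a dummy member instead of dereferencing NULL) lets tensor code such as TensorReduction below call them unconditionally. A small sketch of the semantics, assuming a C++11 build of the Tensor module:

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <cassert>

    int main() {
      // Eigen::array is either std::array or the emulated class above;
      // both now expose front() and back().
      Eigen::array<int, 3> strides = {{1, 3, 15}};
      assert(strides.front() == 1);
      assert(strides.back() == 15);
      return 0;
    }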
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index 392acf302..cca716d6f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -129,6 +129,12 @@ class TensorBase<Derived, ReadOnlyAccessors>
}
EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_digamma_op<Scalar>, const Derived>
+ digamma() const {
+ return unaryExpr(internal::scalar_digamma_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_erf_op<Scalar>, const Derived>
erf() const {
return unaryExpr(internal::scalar_erf_op<Scalar>());
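
digamma() follows the same pattern as the neighbouring lgamma/erf/erfc ops: a lazily evaluated TensorCwiseUnaryOp over internal::scalar_digamma_op. A minimal sketch of the resulting API (values are illustrative):

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 1> t(3);
      t.setValues({0.5f, 1.0f, 2.0f});
      // Elementwise digamma (the derivative of lgamma), evaluated like
      // any other coefficient-wise unary expression.
      Eigen::Tensor<float, 1> d = t.digamma();
      return d.size() == 3 ? 0 : 1;
    }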
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 624e814e2..1adb68894 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -378,7 +378,7 @@ struct TensorContractionEvaluatorBase
}
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
- void evalGemv(Scalar* buffer) const {
+ EIGEN_DEVICE_FUNC void evalGemv(Scalar* buffer) const {
const Index rows = m_i_size;
const Index cols = m_k_size;
@@ -516,7 +516,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
Base(op, device) { }
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
- void evalProduct(Scalar* buffer) const {
+ EIGEN_DEVICE_FUNC void evalProduct(Scalar* buffer) const {
if (this->m_j_size == 1) {
this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
return;
@@ -582,10 +582,8 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
OutputMapper output(buffer, m);
- typedef typename internal::gemm_blocking_space<ColMajor, LhsScalar, RhsScalar, Dynamic, Dynamic, Dynamic> BlockingType;
-
// Sizes of the blocks to load in cache. See the Goto paper for details.
- BlockingType blocking(m, n, k, 1, true);
+ internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, 1);
const Index kc = blocking.kc();
const Index mc = numext::mini(m, blocking.mc());
const Index nc = numext::mini(n, blocking.nc());
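
The evaluator now derives its cache block sizes from the tensor-specific TensorContractionBlocking helper (annotated EIGEN_DEVICE_FUNC in the next file) instead of the generic gemm_blocking_space. A self-contained sketch of the tiling pattern this enables; the block sizes and loop order here are illustrative, not Eigen's exact choices:

    #include <algorithm>

    struct Blocking {  // stand-in for internal::TensorContractionBlocking
      long kc, mc, nc;
      Blocking(long k, long m, long n)
          : kc(std::min(k, 256L)), mc(std::min(m, 512L)), nc(std::min(n, 512L)) {}
    };

    int main() {
      const long m = 1024, n = 512, k = 300;
      Blocking blocking(k, m, n);
      long visited = 0;
      for (long j = 0; j < n; j += blocking.nc)      // shard by column
        for (long l = 0; l < k; l += blocking.kc)    // contracted dimension
          for (long i = 0; i < m; i += blocking.mc)  // rows
            ++visited;  // one packed GEMM block per iteration
      return visited > 0 ? 0 : 1;
    }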
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
index 78ed5038f..3d3f6904f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
@@ -28,7 +28,7 @@ class TensorContractionBlocking {
typedef typename LhsMapper::Scalar LhsScalar;
typedef typename RhsMapper::Scalar RhsScalar;
- TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) :
+ EIGEN_DEVICE_FUNC TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) :
kc_(k), mc_(m), nc_(n)
{
if (ShardingType == ShardByCol) {
@@ -41,9 +41,9 @@ class TensorContractionBlocking {
}
}
- EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
- EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
- EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
private:
Index kc_;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
index 9b6d18090..63c8ae126 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
@@ -426,15 +426,16 @@ class TensorContractionSubMapper {
};
-template<typename Scalar, typename Index, int side,
+template<typename Scalar_, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
int packet_size,
bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
class TensorContractionInputMapper
- : public BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> {
+ : public BaseTensorContractionMapper<Scalar_, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> {
public:
+ typedef Scalar_ Scalar;
typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Base;
typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper;
typedef SubMapper VectorMapper;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 576bea295..51a3b9490 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -176,10 +176,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
// compute block sizes (which depend on number of threads)
const Index num_threads = this->m_device.numThreads();
- Index mc = m;
- Index nc = n;
- Index kc = k;
- internal::computeProductBlockingSizes<LhsScalar,RhsScalar,1>(kc, mc, nc, num_threads);
+ internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, num_threads);
+ Index mc = blocking.mc();
+ Index nc = blocking.nc();
+ Index kc = blocking.kc();
eigen_assert(mc <= m);
eigen_assert(nc <= n);
eigen_assert(kc <= k);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 367a152a0..67c797802 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -21,7 +21,7 @@ namespace Eigen {
*/
namespace internal {
-template <typename Index, typename InputDims, size_t NumKernelDims, int Layout>
+template <typename Index, typename InputDims, int NumKernelDims, int Layout>
class IndexMapper {
public:
IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
@@ -123,7 +123,7 @@ class IndexMapper {
}
inputIndex += p * m_inputStrides[NumKernelDims];
} else {
- int limit = 0;
+ std::ptrdiff_t limit = 0;
if (NumKernelDims < NumDims) {
limit = NumDims - NumKernelDims - 1;
}
@@ -147,7 +147,7 @@ class IndexMapper {
}
outputIndex += p * m_outputStrides[NumKernelDims];
} else {
- int limit = 0;
+ std::ptrdiff_t limit = 0;
if (NumKernelDims < NumDims) {
limit = NumDims - NumKernelDims - 1;
}
@@ -206,7 +206,7 @@ class IndexMapper {
}
private:
- static const size_t NumDims = internal::array_size<InputDims>::value;
+ static const int NumDims = internal::array_size<InputDims>::value;
array<Index, NumDims> m_inputStrides;
array<Index, NumDims> m_outputStrides;
array<Index, NumDims> m_cudaInputStrides;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index 5abdc489b..e684ab8f7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -109,10 +109,12 @@ class CudaStreamDevice : public StreamInterface {
struct GpuDevice {
// The StreamInterface is not owned: the caller is
// responsible for its initialization and eventual destruction.
- explicit GpuDevice(const StreamInterface* stream) : stream_(stream) {
+ explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
+ eigen_assert(stream);
+ }
+ explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
eigen_assert(stream);
}
-
// TODO(bsteiner): This is an internal API, we should not expose it.
EIGEN_STRONG_INLINE const cudaStream_t& stream() const {
return stream_->stream();
@@ -246,6 +248,10 @@ struct GpuDevice {
#endif
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxBlocks() const {
+ return max_blocks_;
+ }
+
// This function checks if the CUDA runtime recorded an error for the
// underlying stream device.
inline bool ok() const {
@@ -259,7 +265,7 @@ struct GpuDevice {
private:
const StreamInterface* stream_;
-
+ int max_blocks_;
};
#ifndef __CUDA_ARCH__
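
GpuDevice now carries an optional cap on the number of CUDA blocks per launch, defaulting to INT_MAX and exposed via maxBlocks(); TensorExecutor (next file) folds it into its launch computation. A construction sketch, assuming an nvcc build with EIGEN_USE_GPU:

    #define EIGEN_USE_GPU
    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::CudaStreamDevice stream;  // wraps the default CUDA stream
      Eigen::GpuDevice device(&stream, /*num_blocks=*/1024);
      return device.maxBlocks() == 1024 ? 0 : 1;
    }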
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index e7daf7304..bd83d5de8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -136,7 +136,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
}
template<int LoadMode>
- EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
return internal::ploadt<Packet, LoadMode>(m_buffer + index);
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index d2ab70f2b..df15c6204 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -220,7 +220,7 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, false>::run(
if (needs_assign)
{
const int block_size = device.maxCudaThreadsPerBlock();
- const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size;
+ const int max_blocks = numext::maxi<int>(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size);
const Index size = array_prod(evaluator.dimensions());
// Create at least one block to ensure we won't crash if we're called with tensors of size 0.
const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
@@ -239,7 +239,7 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, true>::run(c
if (needs_assign)
{
const int block_size = device.maxCudaThreadsPerBlock();
- const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size;
+ const int max_blocks = numext::maxi<int>(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size);
const Index size = array_prod(evaluator.dimensions());
// Create at least one block to ensure we won't crash if we're called with tensors of size 0.
const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
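
The grid-size arithmetic above: take the ceiling of size / block_size, never exceed max_blocks, and never go below one block so zero-sized tensors still launch safely. A host-side sketch of the same computation with illustrative numbers:

    #include <algorithm>
    #include <cassert>

    long num_blocks_for(long size, long block_size, long max_blocks) {
      const long wanted = (size + block_size - 1) / block_size;  // ceiling division
      return std::max(std::min(max_blocks, wanted), 1L);         // clamp to [1, max_blocks]
    }

    int main() {
      assert(num_blocks_for(0, 1024, 65535) == 1);             // empty tensor: one block
      assert(num_blocks_for(1025, 1024, 65535) == 2);          // ceil(1025/1024)
      assert(num_blocks_for(1L << 30, 1024, 65535) == 65535);  // capped by max_blocks
      return 0;
    }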
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index c9b0b2f28..58b864787 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -106,7 +106,6 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
- m_impl.evalSubExprsIfNeeded(NULL);
const Index numValues = m_impl.dimensions().TotalSize();
m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType));
// Should initialize the memory in case we're dealing with non POD types.
@@ -119,7 +118,6 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
EvalTo evalToTmp(m_buffer, m_op);
const bool PacketAccess = internal::IsVectorizable<Device, const ArgType>::value;
internal::TensorExecutor<const EvalTo, Device, PacketAccess>::run(evalToTmp, m_device);
- m_impl.cleanup();
return true;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 09ee0c2c6..22aea5ea4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -345,8 +345,8 @@ template <typename Self, typename Op, typename Device>
struct InnerReducer {
static const bool HasOptimizedImplementation = false;
- static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
- assert(false && "Not implemented");
+ EIGEN_DEVICE_FUNC static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
+ eigen_assert(false && "Not implemented");
}
};
@@ -355,8 +355,8 @@ template <typename Self, typename Op, typename Device>
struct OuterReducer {
static const bool HasOptimizedImplementation = false;
- static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
- assert(false && "Not implemented");
+ EIGEN_DEVICE_FUNC static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
+ eigen_assert(false && "Not implemented");
}
};
@@ -463,7 +463,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
}
} else {
- m_outputStrides[NumOutputDims - 1] = 1;
+ m_outputStrides.back() = 1;
for (int i = NumOutputDims - 2; i >= 0; --i) {
m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
}
@@ -479,7 +479,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
input_strides[i] = input_strides[i-1] * input_dims[i-1];
}
} else {
- input_strides[NumInputDims - 1] = 1;
+ input_strides.back() = 1;
for (int i = NumInputDims - 2; i >= 0; --i) {
input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
index 98631fc7f..ed933b6ac 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
@@ -41,7 +41,10 @@ class TensorStorage<T, FixedDimensions, Options_>
private:
static const std::size_t Size = FixedDimensions::total_size;
- EIGEN_ALIGN_MAX T m_data[Size];
+ // Allocate an array of size at least one to prevent compiler warnings.
+ static const std::size_t MinSize = max_n_1<Size>::size;
+ EIGEN_ALIGN_MAX T m_data[MinSize];
+
FixedDimensions m_dimensions;
public:
@@ -105,7 +108,6 @@ class TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_>
EIGEN_DEVICE_FUNC void resize(Index size, const array<Index, NumIndices_>& nbDimensions)
{
- eigen_assert(size >= 1);
const Index currentSz = internal::array_prod(m_dimensions);
if(size != currentSz)
{
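
max_n_1 clamps a compile-time size to at least one, so the member declaration T m_data[MinSize] stays legal when the tensor has zero elements (zero-length arrays are a non-standard extension). A self-contained sketch of the idiom; Eigen's own definition may differ in detail:

    #include <cstddef>

    // Map N -> N, except 0 -> 1, so "T data[max_n_1<N>::size]" always compiles.
    template <std::size_t N> struct max_n_1 { static const std::size_t size = N; };
    template <>              struct max_n_1<0> { static const std::size_t size = 1; };

    static_assert(max_n_1<0>::size == 1, "zero elements still get one dummy slot");
    static_assert(max_n_1<7>::size == 7, "nonzero sizes pass through");

    int main() { return 0; }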
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
index 19352eb5e..0d34f7ee6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
@@ -34,11 +34,11 @@ struct TensorUInt128
LOW low;
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
- TensorUInt128(int x) : high(0), low(x) {
+ TensorUInt128(int32_t x) : high(0), low(x) {
eigen_assert(x >= 0);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
- TensorUInt128(unsigned int x) : high(0), low(x) { }
+ TensorUInt128(uint32_t x) : high(0), low(x) { }
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
TensorUInt128(long x) : high(0), low(x) {
eigen_assert(x >= 0);
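
Spelling the overloads as int32_t/uint32_t keeps them distinct from the long/unsigned long constructors on every platform. A simplified standalone sketch of the high/low emulation these constructors feed; this is illustrative, not internal::TensorUInt128 itself:

    #include <cstdint>
    #include <cassert>

    struct U128 {  // emulated unsigned 128-bit value: high and low 64-bit halves
      uint64_t high, low;
      U128(int32_t x) : high(0), low(static_cast<uint64_t>(x)) { assert(x >= 0); }
      U128(uint32_t x) : high(0), low(x) {}
      U128(uint64_t x) : high(0), low(x) {}
    };

    int main() {
      U128 a(42);                 // picks the int32_t overload
      U128 b(42u);                // picks the uint32_t overload
      U128 c(uint64_t(1) << 40);  // 64-bit values go straight into low
      return (a.low == b.low && c.high == 0) ? 0 : 1;
    }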
diff --git a/unsupported/Eigen/src/Splines/SplineFitting.h b/unsupported/Eigen/src/Splines/SplineFitting.h
index d3c245fa9..8e6a5aaed 100644
--- a/unsupported/Eigen/src/Splines/SplineFitting.h
+++ b/unsupported/Eigen/src/Splines/SplineFitting.h
@@ -167,7 +167,7 @@ namespace Eigen
derivativeKnots.data(), derivativeKnots.data() + derivativeKnots.size(),
temporaryKnots.data());
- // Number of control points (one for each point and derivative) plus spline order.
+ // Number of knots (one for each point and derivative) plus spline order.
DenseIndex numKnots = numParameters + numDerivatives + degree + 1;
knots.resize(numKnots);
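
A worked instance of the corrected comment, with hypothetical counts: interpolating 5 points with 2 derivative constraints at degree 3 needs

    numKnots = numParameters + numDerivatives + degree + 1
             = 5 + 2 + 3 + 1 = 11

knots: one per point and per derivative, plus the spline order.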
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 97257b183..eed724bcf 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -147,13 +147,27 @@ if(EIGEN_TEST_CXX11)
ei_add_test(cxx11_tensor_sugar "-std=c++0x")
ei_add_test(cxx11_tensor_fft "-std=c++0x")
ei_add_test(cxx11_tensor_ifft "-std=c++0x")
+ ei_add_test(cxx11_tensor_empty "-std=c++0x")
- # These tests needs nvcc
-# ei_add_test(cxx11_tensor_device "-std=c++0x")
-# ei_add_test(cxx11_tensor_cuda "-std=c++0x")
-# ei_add_test(cxx11_tensor_contract_cuda "-std=c++0x")
-# ei_add_test(cxx11_tensor_reduction_cuda "-std=c++0x")
-# ei_add_test(cxx11_tensor_random_cuda "-std=c++0x")
-# ei_add_test(cxx11_tensor_argmax_cuda "-std=c++0x")
+endif()
+# These tests need nvcc
+find_package(CUDA 7.0)
+if(CUDA_FOUND)
+ set(CUDA_PROPAGATE_HOST_FLAGS OFF)
+ if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+ set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE)
+ endif()
+ set(CUDA_NVCC_FLAGS "-std=c++11 --relaxed-constexpr -arch compute_30")
+ cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include")
+ set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
+
+ ei_add_test(cxx11_tensor_device)
+ ei_add_test(cxx11_tensor_cuda)
+ ei_add_test(cxx11_tensor_contract_cuda)
+ ei_add_test(cxx11_tensor_reduction_cuda)
+ ei_add_test(cxx11_tensor_random_cuda)
+ ei_add_test(cxx11_tensor_argmax_cuda)
+
+ unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
endif()
diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cpp b/unsupported/test/cxx11_tensor_argmax_cuda.cu
index d37490d15..45311d4f7 100644
--- a/unsupported/test/cxx11_tensor_argmax_cuda.cpp
+++ b/unsupported/test/cxx11_tensor_argmax_cuda.cu
@@ -56,6 +56,10 @@ void test_cuda_simple_argmax()
VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1);
VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0);
+
+ cudaFree(d_in);
+ cudaFree(d_out_max);
+ cudaFree(d_out_min);
}
template <int DataLayout>
@@ -141,6 +145,9 @@ void test_cuda_argmax_dim()
// Expect max to be in the last index of the reduced dimension
VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
}
+
+ cudaFree(d_in);
+ cudaFree(d_out);
}
}
@@ -227,15 +234,18 @@ void test_cuda_argmin_dim()
// Expect min to be in the last index of the reduced dimension
VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
}
+
+ cudaFree(d_in);
+ cudaFree(d_out);
}
}
void test_cxx11_tensor_cuda()
{
- CALL_SUBTEST(test_cuda_simple_argmax<RowMajor>());
- CALL_SUBTEST(test_cuda_simple_argmax<ColMajor>());
- CALL_SUBTEST(test_cuda_argmax_dim<RowMajor>());
- CALL_SUBTEST(test_cuda_argmax_dim<ColMajor>());
- CALL_SUBTEST(test_cuda_argmin_dim<RowMajor>());
- CALL_SUBTEST(test_cuda_argmin_dim<ColMajor>());
+ CALL_SUBTEST_1(test_cuda_simple_argmax<RowMajor>());
+ CALL_SUBTEST_1(test_cuda_simple_argmax<ColMajor>());
+ CALL_SUBTEST_2(test_cuda_argmax_dim<RowMajor>());
+ CALL_SUBTEST_2(test_cuda_argmax_dim<ColMajor>());
+ CALL_SUBTEST_3(test_cuda_argmin_dim<RowMajor>());
+ CALL_SUBTEST_3(test_cuda_argmin_dim<ColMajor>());
}
diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cpp b/unsupported/test/cxx11_tensor_contract_cuda.cu
index 035a093e6..6d1ef07f9 100644
--- a/unsupported/test/cxx11_tensor_contract_cuda.cpp
+++ b/unsupported/test/cxx11_tensor_contract_cuda.cu
@@ -22,16 +22,16 @@ using Eigen::Tensor;
typedef Tensor<float, 1>::DimensionPair DimPair;
template<int DataLayout>
-static void test_cuda_contraction(int m_size, int k_size, int n_size)
+void test_cuda_contraction(int m_size, int k_size, int n_size)
{
- cout<<"Calling with ("<<m_size<<","<<k_size<<","<<n_size<<")"<<std::endl;
+ std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
// with these dimensions, the output has 300 * 140 elements, which is
// more than 30 * 1024, which is the number of threads in blocks on
// a 15 SM GK110 GPU
- Tensor<float, 2, DataLayout> t_left(Eigen::array<int, 2>(m_size, k_size));
- Tensor<float, 2, DataLayout> t_right(Eigen::array<int, 2>(k_size, n_size));
- Tensor<float, 2, DataLayout> t_result(Eigen::array<int, 2>(m_size, n_size));
- Tensor<float, 2, DataLayout> t_result_gpu(Eigen::array<int, 2>(m_size, n_size));
+ Tensor<float, 2, DataLayout> t_left(m_size, k_size);
+ Tensor<float, 2, DataLayout> t_right(k_size, n_size);
+ Tensor<float, 2, DataLayout> t_result(m_size, n_size);
+ Tensor<float, 2, DataLayout> t_result_gpu(m_size, n_size);
Eigen::array<DimPair, 1> dims(DimPair(1, 0));
t_left.setRandom();
@@ -67,12 +67,16 @@ static void test_cuda_contraction(int m_size, int k_size, int n_size)
t_result = t_left.contract(t_right, dims);
cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
- for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) {
- if (fabs(t_result.data()[i] - t_result_gpu.data()[i]) >= 1e-4) {
- cout << "mismatch detected at index " << i << ": " << t_result.data()[i]
- << " vs " << t_result_gpu.data()[i] << endl;
- assert(false);
+ for (size_t i = 0; i < t_result.size(); i++) {
+ if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
+ continue;
}
+ if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {
+ continue;
+ }
+ std::cout << "mismatch detected at index " << i << ": " << t_result(i)
+ << " vs " << t_result_gpu(i) << std::endl;
+ assert(false);
}
cudaFree((void*)d_t_left);
@@ -80,41 +84,69 @@ static void test_cuda_contraction(int m_size, int k_size, int n_size)
cudaFree((void*)d_t_result);
}
-
-void test_cxx11_tensor_cuda()
-{
- cout<<"Calling contraction tests"<<std::endl;
- CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, 128, 128));
- CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, 128, 128));
+template<int DataLayout>
+void test_cuda_contraction_m() {
for (int k = 32; k < 256; k++) {
- CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, k, 128));
- CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, k, 128));
+ test_cuda_contraction<ColMajor>(k, 128, 128);
+ test_cuda_contraction<RowMajor>(k, 128, 128);
}
+}
+
+template<int DataLayout>
+void test_cuda_contraction_k() {
for (int k = 32; k < 256; k++) {
- CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, 128, k));
- CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, 128, k));
+ test_cuda_contraction<ColMajor>(128, k, 128);
+ test_cuda_contraction<RowMajor>(128, k, 128);
}
+}
+
+template<int DataLayout>
+void test_cuda_contraction_n() {
for (int k = 32; k < 256; k++) {
- CALL_SUBTEST(test_cuda_contraction<ColMajor>(k, 128, 128));
- CALL_SUBTEST(test_cuda_contraction<RowMajor>(k, 128, 128));
+ test_cuda_contraction<ColMajor>(128, 128, k);
+ test_cuda_contraction<RowMajor>(128, 128, k);
}
+}
- int m_sizes[] = {31, 39, 63, 64, 65,
- 127, 129, 255, 257, 511,
- 512, 513, 1023, 1024, 1025 };
- int n_sizes[] = {31, 39, 63, 64, 65,
- 127, 129, 255, 257, 511,
- 512, 513, 1023, 1024, 1025 };
-
- int k_sizes[] = { 31, 39, 63, 64, 65,
- 95, 96, 127, 129, 255,
- 257, 511, 512, 513, 1023,
- 1024, 1025};
- for (int i = 0; i <15; i++)
- for (int j = 0; j < 15; j++)
+template<int DataLayout>
+void test_cuda_contraction_sizes() {
+ int m_sizes[] = { 31, 39, 63, 64, 65,
+ 127, 129, 255, 257 , 511,
+ 512, 513, 1023, 1024, 1025};
+
+ int n_sizes[] = { 31, 39, 63, 64, 65,
+ 127, 129, 255, 257, 511,
+ 512, 513, 1023, 1024, 1025};
+
+ int k_sizes[] = { 31, 39, 63, 64, 65,
+ 95, 96, 127, 129, 255,
+ 257, 511, 512, 513, 1023,
+ 1024, 1025};
+
+ for (int i = 0; i < 15; i++) {
+ for (int j = 0; j < 15; j++) {
for (int k = 0; k < 17; k++) {
- CALL_SUBTEST(test_cuda_contraction<ColMajor>(m_sizes[i], n_sizes[j], k_sizes[k]));
- CALL_SUBTEST(test_cuda_contraction<RowMajor>(m_sizes[i], n_sizes[j], k_sizes[k]));
+ test_cuda_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]);
}
+ }
+ }
+}
+
+void test_cxx11_tensor_cuda()
+{
+ CALL_SUBTEST_1(test_cuda_contraction<ColMajor>(128, 128, 128));
+ CALL_SUBTEST_1(test_cuda_contraction<RowMajor>(128, 128, 128));
+
+ CALL_SUBTEST_2(test_cuda_contraction_m<ColMajor>());
+ CALL_SUBTEST_3(test_cuda_contraction_m<RowMajor>());
+
+ CALL_SUBTEST_4(test_cuda_contraction_k<ColMajor>());
+ CALL_SUBTEST_5(test_cuda_contraction_k<RowMajor>());
+
+ CALL_SUBTEST_6(test_cuda_contraction_n<ColMajor>());
+ CALL_SUBTEST_7(test_cuda_contraction_n<RowMajor>());
+
+ CALL_SUBTEST_8(test_cuda_contraction_sizes<ColMajor>());
+ CALL_SUBTEST_9(test_cuda_contraction_sizes<RowMajor>());
}
diff --git a/unsupported/test/cxx11_tensor_cuda.cpp b/unsupported/test/cxx11_tensor_cuda.cu
index 49e1894ab..60f9314a5 100644
--- a/unsupported/test/cxx11_tensor_cuda.cpp
+++ b/unsupported/test/cxx11_tensor_cuda.cu
@@ -63,6 +63,10 @@ void test_cuda_elementwise_small() {
out(Eigen::array<int, 1>(i)),
in1(Eigen::array<int, 1>(i)) + in2(Eigen::array<int, 1>(i)));
}
+
+ cudaFree(d_in1);
+ cudaFree(d_in2);
+ cudaFree(d_out);
}
void test_cuda_elementwise()
@@ -113,6 +117,11 @@ void test_cuda_elementwise()
}
}
}
+
+ cudaFree(d_in1);
+ cudaFree(d_in2);
+ cudaFree(d_in3);
+ cudaFree(d_out);
}
void test_cuda_reduction()
@@ -131,8 +140,7 @@ void test_cuda_reduction()
cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
- cudaStream_t stream;
- assert(cudaStreamCreate(&stream) == cudaSuccess);
+ Eigen::CudaStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113);
@@ -159,10 +167,13 @@ void test_cuda_reduction()
VERIFY_IS_APPROX(out(i,j), expected);
}
}
+
+ cudaFree(d_in1);
+ cudaFree(d_out);
}
template<int DataLayout>
-static void test_cuda_contraction()
+void test_cuda_contraction()
{
// with these dimensions, the output has 300 * 140 elements, which is
// more than 30 * 1024, which is the number of threads in blocks on
@@ -189,8 +200,7 @@ static void test_cuda_contraction()
cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);
- cudaStream_t stream;
- assert(cudaStreamCreate(&stream) == cudaSuccess);
+ Eigen::CudaStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31);
@@ -214,14 +224,18 @@ static void test_cuda_contraction()
for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) {
if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) {
- cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " << m_result.data()[i] << endl;
+ std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl;
assert(false);
}
}
+
+ cudaFree(d_t_left);
+ cudaFree(d_t_right);
+ cudaFree(d_t_result);
}
template<int DataLayout>
-static void test_cuda_convolution_1d()
+void test_cuda_convolution_1d()
{
Tensor<float, 4, DataLayout> input(74,37,11,137);
Tensor<float, 1, DataLayout> kernel(4);
@@ -243,8 +257,7 @@ static void test_cuda_convolution_1d()
cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
- cudaStream_t stream;
- assert(cudaStreamCreate(&stream) == cudaSuccess);
+ Eigen::CudaStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137);
@@ -269,9 +282,13 @@ static void test_cuda_convolution_1d()
}
}
}
+
+ cudaFree(d_input);
+ cudaFree(d_kernel);
+ cudaFree(d_out);
}
-static void test_cuda_convolution_inner_dim_col_major_1d()
+void test_cuda_convolution_inner_dim_col_major_1d()
{
Tensor<float, 4, ColMajor> input(74,9,11,7);
Tensor<float, 1, ColMajor> kernel(4);
@@ -293,8 +310,7 @@ static void test_cuda_convolution_inner_dim_col_major_1d()
cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
- cudaStream_t stream;
- assert(cudaStreamCreate(&stream) == cudaSuccess);
+ Eigen::CudaStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7);
@@ -319,9 +335,13 @@ static void test_cuda_convolution_inner_dim_col_major_1d()
}
}
}
+
+ cudaFree(d_input);
+ cudaFree(d_kernel);
+ cudaFree(d_out);
}
-static void test_cuda_convolution_inner_dim_row_major_1d()
+void test_cuda_convolution_inner_dim_row_major_1d()
{
Tensor<float, 4, RowMajor> input(7,9,11,74);
Tensor<float, 1, RowMajor> kernel(4);
@@ -343,8 +363,7 @@ static void test_cuda_convolution_inner_dim_row_major_1d()
cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
- cudaStream_t stream;
- assert(cudaStreamCreate(&stream) == cudaSuccess);
+ Eigen::CudaStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74);
@@ -369,10 +388,14 @@ static void test_cuda_convolution_inner_dim_row_major_1d()
}
}
}
+
+ cudaFree(d_input);
+ cudaFree(d_kernel);
+ cudaFree(d_out);
}
template<int DataLayout>
-static void test_cuda_convolution_2d()
+void test_cuda_convolution_2d()
{
Tensor<float, 4, DataLayout> input(74,37,11,137);
Tensor<float, 2, DataLayout> kernel(3,4);
@@ -394,8 +417,7 @@ static void test_cuda_convolution_2d()
cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
- cudaStream_t stream;
- assert(cudaStreamCreate(&stream) == cudaSuccess);
+ Eigen::CudaStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137);
@@ -430,10 +452,14 @@ static void test_cuda_convolution_2d()
}
}
}
+
+ cudaFree(d_input);
+ cudaFree(d_kernel);
+ cudaFree(d_out);
}
template<int DataLayout>
-static void test_cuda_convolution_3d()
+void test_cuda_convolution_3d()
{
Tensor<float, 5, DataLayout> input(Eigen::array<int, 5>(74,37,11,137,17));
Tensor<float, 3, DataLayout> kernel(3,4,2);
@@ -455,8 +481,7 @@ static void test_cuda_convolution_3d()
cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
- cudaStream_t stream;
- assert(cudaStreamCreate(&stream) == cudaSuccess);
+ Eigen::CudaStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17);
@@ -505,6 +530,10 @@ static void test_cuda_convolution_3d()
}
}
}
+
+ cudaFree(d_input);
+ cudaFree(d_kernel);
+ cudaFree(d_out);
}
@@ -542,6 +571,9 @@ void test_cuda_lgamma(const Scalar stddev)
VERIFY_IS_APPROX(out(i,j), (std::lgamma)(in(i,j)));
}
}
+
+ cudaFree(d_in);
+ cudaFree(d_out);
}
template <typename Scalar>
@@ -578,6 +610,9 @@ void test_cuda_erf(const Scalar stddev)
VERIFY_IS_APPROX(out(i,j), (std::erf)(in(i,j)));
}
}
+
+ cudaFree(d_in);
+ cudaFree(d_out);
}
template <typename Scalar>
@@ -614,51 +649,50 @@ void test_cuda_erfc(const Scalar stddev)
VERIFY_IS_APPROX(out(i,j), (std::erfc)(in(i,j)));
}
}
+
+ cudaFree(d_in);
+ cudaFree(d_out);
}
void test_cxx11_tensor_cuda()
{
- CALL_SUBTEST(test_cuda_elementwise_small());
- CALL_SUBTEST(test_cuda_elementwise());
- CALL_SUBTEST(test_cuda_reduction());
- CALL_SUBTEST(test_cuda_contraction<ColMajor>());
- CALL_SUBTEST(test_cuda_contraction<RowMajor>());
- CALL_SUBTEST(test_cuda_convolution_1d<ColMajor>());
- CALL_SUBTEST(test_cuda_convolution_1d<RowMajor>());
- CALL_SUBTEST(test_cuda_convolution_inner_dim_col_major_1d());
- CALL_SUBTEST(test_cuda_convolution_inner_dim_row_major_1d());
- CALL_SUBTEST(test_cuda_convolution_2d<ColMajor>());
- CALL_SUBTEST(test_cuda_convolution_2d<RowMajor>());
- CALL_SUBTEST(test_cuda_convolution_3d<ColMajor>());
- CALL_SUBTEST(test_cuda_convolution_3d<RowMajor>());
- CALL_SUBTEST(test_cuda_lgamma<float>(1.0f));
- CALL_SUBTEST(test_cuda_lgamma<float>(100.0f));
- CALL_SUBTEST(test_cuda_lgamma<float>(0.01f));
- CALL_SUBTEST(test_cuda_lgamma<float>(0.001f));
- CALL_SUBTEST(test_cuda_erf<float>(1.0f));
- CALL_SUBTEST(test_cuda_erf<float>(100.0f));
- CALL_SUBTEST(test_cuda_erf<float>(0.01f));
- CALL_SUBTEST(test_cuda_erf<float>(0.001f));
- CALL_SUBTEST(test_cuda_erfc<float>(1.0f));
+ CALL_SUBTEST_1(test_cuda_elementwise_small());
+ CALL_SUBTEST_1(test_cuda_elementwise());
+ CALL_SUBTEST_1(test_cuda_reduction());
+ CALL_SUBTEST_2(test_cuda_contraction<ColMajor>());
+ CALL_SUBTEST_2(test_cuda_contraction<RowMajor>());
+ CALL_SUBTEST_3(test_cuda_convolution_1d<ColMajor>());
+ CALL_SUBTEST_3(test_cuda_convolution_1d<RowMajor>());
+ CALL_SUBTEST_3(test_cuda_convolution_inner_dim_col_major_1d());
+ CALL_SUBTEST_3(test_cuda_convolution_inner_dim_row_major_1d());
+ CALL_SUBTEST_3(test_cuda_convolution_2d<ColMajor>());
+ CALL_SUBTEST_3(test_cuda_convolution_2d<RowMajor>());
+ CALL_SUBTEST_3(test_cuda_convolution_3d<ColMajor>());
+ CALL_SUBTEST_3(test_cuda_convolution_3d<RowMajor>());
+ CALL_SUBTEST_4(test_cuda_lgamma<float>(1.0f));
+ CALL_SUBTEST_4(test_cuda_lgamma<float>(100.0f));
+ CALL_SUBTEST_4(test_cuda_lgamma<float>(0.01f));
+ CALL_SUBTEST_4(test_cuda_lgamma<float>(0.001f));
+ CALL_SUBTEST_4(test_cuda_erf<float>(1.0f));
+ CALL_SUBTEST_4(test_cuda_erf<float>(100.0f));
+ CALL_SUBTEST_4(test_cuda_erf<float>(0.01f));
+ CALL_SUBTEST_4(test_cuda_erf<float>(0.001f));
+ CALL_SUBTEST_4(test_cuda_erfc<float>(1.0f));
// CALL_SUBTEST(test_cuda_erfc<float>(100.0f));
- CALL_SUBTEST(test_cuda_erfc<float>(5.0f)); // CUDA erfc lacks precision for large inputs
- CALL_SUBTEST(test_cuda_erfc<float>(0.01f));
- CALL_SUBTEST(test_cuda_erfc<float>(0.001f));
- CALL_SUBTEST(test_cuda_tanh<double>(1.0));
- CALL_SUBTEST(test_cuda_tanh<double>(100.0));
- CALL_SUBTEST(test_cuda_tanh<double>(0.01));
- CALL_SUBTEST(test_cuda_tanh<double>(0.001));
- CALL_SUBTEST(test_cuda_lgamma<double>(1.0));
- CALL_SUBTEST(test_cuda_lgamma<double>(100.0));
- CALL_SUBTEST(test_cuda_lgamma<double>(0.01));
- CALL_SUBTEST(test_cuda_lgamma<double>(0.001));
- CALL_SUBTEST(test_cuda_erf<double>(1.0));
- CALL_SUBTEST(test_cuda_erf<double>(100.0));
- CALL_SUBTEST(test_cuda_erf<double>(0.01));
- CALL_SUBTEST(test_cuda_erf<double>(0.001));
- CALL_SUBTEST(test_cuda_erfc<double>(1.0));
+ CALL_SUBTEST_4(test_cuda_erfc<float>(5.0f)); // CUDA erfc lacks precision for large inputs
+ CALL_SUBTEST_4(test_cuda_erfc<float>(0.01f));
+ CALL_SUBTEST_4(test_cuda_erfc<float>(0.001f));
+ CALL_SUBTEST_4(test_cuda_lgamma<double>(1.0));
+ CALL_SUBTEST_4(test_cuda_lgamma<double>(100.0));
+ CALL_SUBTEST_4(test_cuda_lgamma<double>(0.01));
+ CALL_SUBTEST_4(test_cuda_lgamma<double>(0.001));
+ CALL_SUBTEST_4(test_cuda_erf<double>(1.0));
+ CALL_SUBTEST_4(test_cuda_erf<double>(100.0));
+ CALL_SUBTEST_4(test_cuda_erf<double>(0.01));
+ CALL_SUBTEST_4(test_cuda_erf<double>(0.001));
+ CALL_SUBTEST_4(test_cuda_erfc<double>(1.0));
// CALL_SUBTEST(test_cuda_erfc<double>(100.0));
- CALL_SUBTEST(test_cuda_erfc<double>(5.0)); // CUDA erfc lacks precision for large inputs
- CALL_SUBTEST(test_cuda_erfc<double>(0.01));
- CALL_SUBTEST(test_cuda_erfc<double>(0.001));
+ CALL_SUBTEST_4(test_cuda_erfc<double>(5.0)); // CUDA erfc lacks precision for large inputs
+ CALL_SUBTEST_4(test_cuda_erfc<double>(0.01));
+ CALL_SUBTEST_4(test_cuda_erfc<double>(0.001));
}
diff --git a/unsupported/test/cxx11_tensor_device.cpp b/unsupported/test/cxx11_tensor_device.cu
index ed5dd7505..cbe9e6449 100644
--- a/unsupported/test/cxx11_tensor_device.cpp
+++ b/unsupported/test/cxx11_tensor_device.cu
@@ -109,19 +109,19 @@ struct GPUContext {
// The actual expression to evaluate
template <typename Context>
-static void test_contextual_eval(Context* context)
+void test_contextual_eval(Context* context)
{
context->out().device(context->device()) = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f);
}
template <typename Context>
-static void test_forced_contextual_eval(Context* context)
+void test_forced_contextual_eval(Context* context)
{
context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f);
}
template <typename Context>
-static void test_compound_assignment(Context* context)
+void test_compound_assignment(Context* context)
{
context->out().device(context->device()) = context->in1().constant(2.718f);
context->out().device(context->device()) += context->in1() + context->in2() * 3.14f;
@@ -129,7 +129,7 @@ static void test_compound_assignment(Context* context)
template <typename Context>
-static void test_contraction(Context* context)
+void test_contraction(Context* context)
{
Eigen::array<std::pair<int, int>, 2> dims;
dims[0] = std::make_pair(1, 1);
@@ -145,7 +145,7 @@ static void test_contraction(Context* context)
template <typename Context>
-static void test_1d_convolution(Context* context)
+void test_1d_convolution(Context* context)
{
Eigen::DSizes<int, 3> indices(0,0,0);
Eigen::DSizes<int, 3> sizes(40,49,70);
@@ -155,7 +155,7 @@ static void test_1d_convolution(Context* context)
}
template <typename Context>
-static void test_2d_convolution(Context* context)
+void test_2d_convolution(Context* context)
{
Eigen::DSizes<int, 3> indices(0,0,0);
Eigen::DSizes<int, 3> sizes(40,49,69);
@@ -165,7 +165,7 @@ static void test_2d_convolution(Context* context)
}
template <typename Context>
-static void test_3d_convolution(Context* context)
+void test_3d_convolution(Context* context)
{
Eigen::DSizes<int, 3> indices(0,0,0);
Eigen::DSizes<int, 3> sizes(39,49,69);
@@ -175,7 +175,7 @@ static void test_3d_convolution(Context* context)
}
-static void test_cpu() {
+void test_cpu() {
Eigen::Tensor<float, 3> in1(40,50,70);
Eigen::Tensor<float, 3> in2(40,50,70);
Eigen::Tensor<float, 3> out(40,50,70);
@@ -267,7 +267,7 @@ static void test_cpu() {
}
}
-static void test_gpu() {
+void test_gpu() {
Eigen::Tensor<float, 3> in1(40,50,70);
Eigen::Tensor<float, 3> in2(40,50,70);
Eigen::Tensor<float, 3> out(40,50,70);
@@ -383,6 +383,6 @@ static void test_gpu() {
void test_cxx11_tensor_device()
{
- CALL_SUBTEST(test_cpu());
- CALL_SUBTEST(test_gpu());
+ CALL_SUBTEST_1(test_cpu());
+ CALL_SUBTEST_2(test_gpu());
}
diff --git a/unsupported/test/cxx11_tensor_empty.cpp b/unsupported/test/cxx11_tensor_empty.cpp
new file mode 100644
index 000000000..9130fff35
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_empty.cpp
@@ -0,0 +1,40 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+
+static void test_empty_tensor()
+{
+ Tensor<float, 2> source;
+ Tensor<float, 2> tgt1 = source;
+ Tensor<float, 2> tgt2(source);
+ Tensor<float, 2> tgt3;
+ tgt3 = tgt1;
+ tgt3 = tgt2;
+}
+
+static void test_empty_fixed_size_tensor()
+{
+ TensorFixedSize<float, Sizes<0>> source;
+ TensorFixedSize<float, Sizes<0>> tgt1 = source;
+ TensorFixedSize<float, Sizes<0>> tgt2(source);
+ TensorFixedSize<float, Sizes<0>> tgt3;
+ tgt3 = tgt1;
+ tgt3 = tgt2;
+}
+
+
+void test_cxx11_tensor_empty()
+{
+ CALL_SUBTEST(test_empty_tensor());
+ CALL_SUBTEST(test_empty_fixed_size_tensor());
+}
diff --git a/unsupported/test/cxx11_tensor_random_cuda.cpp b/unsupported/test/cxx11_tensor_random_cuda.cu
index 5d091de15..5d091de15 100644
--- a/unsupported/test/cxx11_tensor_random_cuda.cpp
+++ b/unsupported/test/cxx11_tensor_random_cuda.cu
diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cpp b/unsupported/test/cxx11_tensor_reduction_cuda.cu
index 9e06eb126..cad0c08e0 100644
--- a/unsupported/test/cxx11_tensor_reduction_cuda.cpp
+++ b/unsupported/test/cxx11_tensor_reduction_cuda.cu
@@ -48,9 +48,12 @@ static void test_full_reductions() {
// Check that the CPU and GPU reductions return the same result.
VERIFY_IS_APPROX(full_redux(), full_redux_gpu());
+
+ gpu_device.deallocate(gpu_in_ptr);
+ gpu_device.deallocate(gpu_out_ptr);
}
void test_cxx11_tensor_reduction_cuda() {
- CALL_SUBTEST(test_full_reductions<ColMajor>());
- CALL_SUBTEST(test_full_reductions<RowMajor>());
+ CALL_SUBTEST_1(test_full_reductions<ColMajor>());
+ CALL_SUBTEST_2(test_full_reductions<RowMajor>());
}
diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp
index e28cf55e2..e46197464 100644
--- a/unsupported/test/cxx11_tensor_thread_pool.cpp
+++ b/unsupported/test/cxx11_tensor_thread_pool.cpp
@@ -17,7 +17,7 @@
using Eigen::Tensor;
-static void test_multithread_elementwise()
+void test_multithread_elementwise()
{
Tensor<float, 3> in1(2,3,7);
Tensor<float, 3> in2(2,3,7);
@@ -40,7 +40,7 @@ static void test_multithread_elementwise()
}
-static void test_multithread_compound_assignment()
+void test_multithread_compound_assignment()
{
Tensor<float, 3> in1(2,3,7);
Tensor<float, 3> in2(2,3,7);
@@ -64,7 +64,7 @@ static void test_multithread_compound_assignment()
}
template<int DataLayout>
-static void test_multithread_contraction()
+void test_multithread_contraction()
{
Tensor<float, 4, DataLayout> t_left(30, 50, 37, 31);
Tensor<float, 5, DataLayout> t_right(37, 31, 70, 2, 10);
@@ -91,15 +91,20 @@ static void test_multithread_contraction()
for (ptrdiff_t i = 0; i < t_result.size(); i++) {
VERIFY(&t_result.data()[i] != &m_result.data()[i]);
- if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) {
- std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl;
- assert(false);
+ if (fabs(t_result(i) - m_result(i)) < 1e-4) {
+ continue;
+ }
+ if (Eigen::internal::isApprox(t_result(i), m_result(i), 1e-4f)) {
+ continue;
}
+ std::cout << "mismatch detected at index " << i << ": " << t_result(i)
+ << " vs " << m_result(i) << std::endl;
+ assert(false);
}
}
template<int DataLayout>
-static void test_contraction_corner_cases()
+void test_contraction_corner_cases()
{
Tensor<float, 2, DataLayout> t_left(32, 500);
Tensor<float, 2, DataLayout> t_right(32, 28*28);
@@ -186,7 +191,7 @@ static void test_contraction_corner_cases()
}
template<int DataLayout>
-static void test_multithread_contraction_agrees_with_singlethread() {
+void test_multithread_contraction_agrees_with_singlethread() {
int contract_size = internal::random<int>(1, 5000);
Tensor<float, 3, DataLayout> left(internal::random<int>(1, 80),
@@ -229,7 +234,7 @@ static void test_multithread_contraction_agrees_with_singlethread() {
template<int DataLayout>
-static void test_multithreaded_reductions() {
+void test_multithreaded_reductions() {
const int num_threads = internal::random<int>(3, 11);
ThreadPool thread_pool(num_threads);
Eigen::ThreadPoolDevice thread_pool_device(&thread_pool, num_threads);
@@ -239,19 +244,19 @@ static void test_multithreaded_reductions() {
Tensor<float, 2, DataLayout> t1(num_rows, num_cols);
t1.setRandom();
- Tensor<float, 1, DataLayout> full_redux(1);
+ Tensor<float, 0, DataLayout> full_redux;
full_redux = t1.sum();
- Tensor<float, 1, DataLayout> full_redux_tp(1);
+ Tensor<float, 0, DataLayout> full_redux_tp;
full_redux_tp.device(thread_pool_device) = t1.sum();
// Check that the single threaded and the multi threaded reductions return
// the same result.
- VERIFY_IS_APPROX(full_redux(0), full_redux_tp(0));
+ VERIFY_IS_APPROX(full_redux(), full_redux_tp());
}
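
Storing a full reduction in a rank-0 tensor (rather than a rank-1 tensor of size one) matches what Tensor::sum() actually produces; a rank-0 tensor is a scalar and is read with operator() and no indices. A minimal sketch:

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 2> t(2, 3);
      t.setConstant(1.0f);
      Eigen::Tensor<float, 0> s = t.sum();  // full reduction -> rank-0 tensor
      return s() == 6.0f ? 0 : 1;           // scalar access: no indices
    }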
-static void test_memcpy() {
+void test_memcpy() {
for (int i = 0; i < 5; ++i) {
const int num_threads = internal::random<int>(3, 11);
@@ -270,7 +275,7 @@ static void test_memcpy() {
}
-static void test_multithread_random()
+void test_multithread_random()
{
Eigen::ThreadPool tp(2);
Eigen::ThreadPoolDevice device(&tp, 2);
@@ -278,26 +283,52 @@ static void test_multithread_random()
t.device(device) = t.random<Eigen::internal::NormalRandomGenerator<float>>();
}
+template<int DataLayout>
+void test_multithread_shuffle()
+{
+ Tensor<float, 4, DataLayout> tensor(17,5,7,11);
+ tensor.setRandom();
+
+ const int num_threads = internal::random<int>(2, 11);
+ ThreadPool threads(num_threads);
+ Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+ Tensor<float, 4, DataLayout> shuffle(7,5,11,17);
+ array<ptrdiff_t, 4> shuffles = {{2,1,3,0}};
+ shuffle.device(device) = tensor.shuffle(shuffles);
+
+ for (int i = 0; i < 17; ++i) {
+ for (int j = 0; j < 5; ++j) {
+ for (int k = 0; k < 7; ++k) {
+ for (int l = 0; l < 11; ++l) {
+ VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,j,l,i));
+ }
+ }
+ }
+ }
+}
+
void test_cxx11_tensor_thread_pool()
{
- CALL_SUBTEST(test_multithread_elementwise());
- CALL_SUBTEST(test_multithread_compound_assignment());
+ CALL_SUBTEST_1(test_multithread_elementwise());
+ CALL_SUBTEST_1(test_multithread_compound_assignment());
- CALL_SUBTEST(test_multithread_contraction<ColMajor>());
- CALL_SUBTEST(test_multithread_contraction<RowMajor>());
+ CALL_SUBTEST_2(test_multithread_contraction<ColMajor>());
+ CALL_SUBTEST_2(test_multithread_contraction<RowMajor>());
- CALL_SUBTEST(test_multithread_contraction_agrees_with_singlethread<ColMajor>());
- CALL_SUBTEST(test_multithread_contraction_agrees_with_singlethread<RowMajor>());
+ CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<ColMajor>());
+ CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<RowMajor>());
// Exercise various cases that have been problematic in the past.
- CALL_SUBTEST(test_contraction_corner_cases<ColMajor>());
- CALL_SUBTEST(test_contraction_corner_cases<RowMajor>());
-
- CALL_SUBTEST(test_multithreaded_reductions<ColMajor>());
- CALL_SUBTEST(test_multithreaded_reductions<RowMajor>());
+ CALL_SUBTEST_4(test_contraction_corner_cases<ColMajor>());
+ CALL_SUBTEST_4(test_contraction_corner_cases<RowMajor>());
- CALL_SUBTEST(test_memcpy());
+ CALL_SUBTEST_5(test_multithreaded_reductions<ColMajor>());
+ CALL_SUBTEST_5(test_multithreaded_reductions<RowMajor>());
- CALL_SUBTEST(test_multithread_random());
+ CALL_SUBTEST_6(test_memcpy());
+ CALL_SUBTEST_6(test_multithread_random());
+ CALL_SUBTEST_6(test_multithread_shuffle<ColMajor>());
+ CALL_SUBTEST_6(test_multithread_shuffle<RowMajor>());
}