diff options
author | Deanna Hood <deanna.m.hood@gmail.com> | 2015-04-20 14:01:35 -0400 |
---|---|---|
committer | Deanna Hood <deanna.m.hood@gmail.com> | 2015-04-20 14:01:35 -0400 |
commit | 0250f4a9f2f7bb54ed760cec045939a1bbe58170 (patch) | |
tree | 5e0372ddef57f51f7adb56bb9569037e6e9556ad /unsupported | |
parent | 0339502a4feb6340f1e9f6f6ca8b3ef4d263f366 (diff) | |
parent | 0eb220c00d9773c29c7d169ad0e20745b0ef21bb (diff) |
Merged default into unary-array-cwise-functors
Diffstat (limited to 'unsupported')
19 files changed, 740 insertions, 347 deletions
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 34107ae71..ae6c3fe7e 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -49,8 +49,8 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h" -#include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h" @@ -80,8 +80,8 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h" -#include "unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h" #include "unsupported/Eigen/CXX11/src/Tensor/Tensor.h" diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h index 494f95690..9dea2055a 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h @@ -266,16 +266,16 @@ array<t, n> repeat(t v) { } template<std::size_t I, class Head, class Tail> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(type_list<Head, Tail>& a) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(type_list<Head, Tail>&) { return get<I, type_list<Head, Tail> >::value; } template<std::size_t I, class Head, class Tail> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(const type_list<Head, Tail>& a) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(const type_list<Head, Tail>&) { return get<I, type_list<Head, Tail> >::value; } template <class NList> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NList::HeadType::type array_prod(const NList& l) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NList::HeadType::type array_prod(const NList&) { return arg_prod<NList>::value; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md index ed1026be2..87e57cebb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/README.md +++ b/unsupported/Eigen/CXX11/src/Tensor/README.md @@ -1157,7 +1157,41 @@ in TensorFunctors.h for information on how to implement a reduction operator. ## Convolutions -TBD: convolve(const KernelDerived& kernel, const Dimensions& dims) +### <Operation> convolve(const Kernel& kernel, const Dimensions& dims) + +Returns a tensor that is the output of the convolution of the input tensor with the kernel, +along the specified dimensions of the input tensor. The dimension size for dimensions of the output tensor +which were part of the convolution will be reduced by the formula: +output_dim_size = input_dim_size - kernel_dim_size + 1 (requires: input_dim_size >= kernel_dim_size). +The dimension sizes for dimensions that were not part of the convolution will remain the same. +Performance of the convolution can depend on the length of the stride(s) of the input tensor dimension(s) along which the +convolution is computed (the first dimension has the shortest stride for ColMajor, whereas RowMajor's shortest stride is +for the last dimension). + + // Compute convolution along the second and third dimension. + Tensor<float, 4, DataLayout> input(3, 3, 7, 11); + Tensor<float, 2, DataLayout> kernel(2, 2); + Tensor<float, 4, DataLayout> output(3, 2, 6, 11); + input.setRandom(); + kernel.setRandom(); + + Eigen::array<ptrdiff_t, 2> dims({1, 2}); // Specify second and third dimension for convolution. + output = input.convolve(kernel, dims); + + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 6; ++k) { + for (int l = 0; l < 11; ++l) { + const float result = output(i,j,k,l); + const float expected = input(i,j+0,k+0,l) * kernel(0,0) + + input(i,j+1,k+0,l) * kernel(1,0) + + input(i,j+0,k+1,l) * kernel(0,1) + + input(i,j+1,k+1,l) * kernel(1,1); + VERIFY_IS_APPROX(result, expected); + } + } + } + } ## Geometrical Operations diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 201b0fc9e..86e72c3a4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -520,48 +520,101 @@ class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyA } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorLayoutSwapOp<Derived> + const TensorLayoutSwapOp<const Derived> swap_layout() const { + return TensorLayoutSwapOp<const Derived>(derived()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorLayoutSwapOp<Derived> + swap_layout() { return TensorLayoutSwapOp<Derived>(derived()); } + template <typename Axis, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorConcatenationOp<const Axis, Derived, OtherDerived> + const TensorConcatenationOp<const Axis, const Derived, const OtherDerived> concatenate(const OtherDerived& other, const Axis& axis) const { - return TensorConcatenationOp<const Axis, Derived, OtherDerived>(derived(), other.derived(), axis); + return TensorConcatenationOp<const Axis, const Derived, const OtherDerived>(derived(), other, axis); } + template <typename Axis, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorConcatenationOp<const Axis, Derived, OtherDerived> + concatenate(const OtherDerived& other, const Axis& axis) { + return TensorConcatenationOp<const Axis, Derived, OtherDerived>(derived(), other, axis); + } + template <typename NewDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorReshapingOp<const NewDimensions, Derived> + const TensorReshapingOp<const NewDimensions, const Derived> reshape(const NewDimensions& newDimensions) const { + return TensorReshapingOp<const NewDimensions, const Derived>(derived(), newDimensions); + } + template <typename NewDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReshapingOp<const NewDimensions, Derived> + reshape(const NewDimensions& newDimensions) { return TensorReshapingOp<const NewDimensions, Derived>(derived(), newDimensions); } + template <typename StartIndices, typename Sizes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorSlicingOp<const StartIndices, const Sizes, Derived> + const TensorSlicingOp<const StartIndices, const Sizes, const Derived> slice(const StartIndices& startIndices, const Sizes& sizes) const { + return TensorSlicingOp<const StartIndices, const Sizes, const Derived>(derived(), startIndices, sizes); + } + template <typename StartIndices, typename Sizes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorSlicingOp<const StartIndices, const Sizes, Derived> + slice(const StartIndices& startIndices, const Sizes& sizes) { return TensorSlicingOp<const StartIndices, const Sizes, Derived>(derived(), startIndices, sizes); } + template <DenseIndex DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorChippingOp<DimId, Derived> + const TensorChippingOp<DimId, const Derived> chip(const Index offset) const { + return TensorChippingOp<DimId, const Derived>(derived(), offset, DimId); + } + template <Index DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorChippingOp<DimId, Derived> + chip(const Index offset) { return TensorChippingOp<DimId, Derived>(derived(), offset, DimId); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorChippingOp<Dynamic, Derived> + const TensorChippingOp<Dynamic, const Derived> chip(const Index offset, const Index dim) const { + return TensorChippingOp<Dynamic, const Derived>(derived(), offset, dim); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorChippingOp<Dynamic, Derived> + chip(const Index offset, const Index dim) { return TensorChippingOp<Dynamic, Derived>(derived(), offset, dim); } + template <typename ReverseDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorReverseOp<const ReverseDimensions, Derived> + const TensorReverseOp<const ReverseDimensions, const Derived> reverse(const ReverseDimensions& rev) const { + return TensorReverseOp<const ReverseDimensions, const Derived>(derived(), rev); + } + template <typename ReverseDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReverseOp<const ReverseDimensions, Derived> + reverse(const ReverseDimensions& rev) { return TensorReverseOp<const ReverseDimensions, Derived>(derived(), rev); } + template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorShufflingOp<const Shuffle, Derived> + const TensorShufflingOp<const Shuffle, const Derived> shuffle(const Shuffle& shuffle) const { + return TensorShufflingOp<const Shuffle, const Derived>(derived(), shuffle); + } + template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorShufflingOp<const Shuffle, Derived> + shuffle(const Shuffle& shuffle) { return TensorShufflingOp<const Shuffle, Derived>(derived(), shuffle); } + template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorStridingOp<const Strides, Derived> + const TensorStridingOp<const Strides, const Derived> stride(const Strides& strides) const { + return TensorStridingOp<const Strides, const Derived>(derived(), strides); + } + template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorStridingOp<const Strides, Derived> + stride(const Strides& strides) { return TensorStridingOp<const Strides, Derived>(derived(), strides); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index dc9586cbc..3b99ef069 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -157,6 +157,8 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> eigen_assert(NumInputDims > m_dim.actualDim()); const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + eigen_assert(op.offset() < input_dims[m_dim.actualDim()]); + int j = 0; for (int i = 0; i < NumInputDims; ++i) { if (i != m_dim.actualDim()) { @@ -246,7 +248,9 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { Scalar* result = m_impl.data(); - if (m_dim.actualDim() == NumDims && result) { + if (((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumDims) || + (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) && + result) { return result + m_inputOffset; } else { return NULL; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 591fd2464..1db5f1232 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -21,8 +21,8 @@ namespace Eigen { */ namespace internal { - -template <typename Index, typename InputDims, size_t NumKernelDims> class IndexMapper { +template <typename Index, typename InputDims, size_t NumKernelDims, int Layout> +class IndexMapper { public: IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims, const array<Index, NumKernelDims>& indices) { @@ -38,13 +38,19 @@ template <typename Index, typename InputDims, size_t NumKernelDims> class IndexM array<Index, NumDims> inputStrides; array<Index, NumDims> outputStrides; - for (int i = 0; i < NumDims; ++i) { - if (i > 0) { + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + inputStrides[0] = 1; + outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { inputStrides[i] = inputStrides[i-1] * input_dims[i-1]; outputStrides[i] = outputStrides[i-1] * dimensions[i-1]; - } else { - inputStrides[0] = 1; - outputStrides[0] = 1; + } + } else { + inputStrides[NumDims - 1] = 1; + outputStrides[NumDims - 1] = 1; + for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) { + inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1]; + outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1]; } } @@ -52,13 +58,20 @@ template <typename Index, typename InputDims, size_t NumKernelDims> class IndexM array<Index, NumDims> cudaOutputDimensions; array<Index, NumDims> tmp = dimensions; array<Index, NumDims> ordering; + const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 0 + : NumDims - NumKernelDims; for (int i = 0; i < NumKernelDims; ++i) { - ordering[i] = indices[i]; + const Index index = i + offset; + ordering[index] = indices[i]; tmp[indices[i]] = -1; - cudaInputDimensions[i] = input_dims[ordering[i]]; - cudaOutputDimensions[i] = dimensions[ordering[i]]; + cudaInputDimensions[index] = input_dims[indices[i]]; + cudaOutputDimensions[index] = dimensions[indices[i]]; } - int written = NumKernelDims; + + int written = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? NumKernelDims + : 0; for (int i = 0; i < NumDims; ++i) { if (tmp[i] >= 0) { ordering[written] = i; @@ -73,61 +86,123 @@ template <typename Index, typename InputDims, size_t NumKernelDims> class IndexM m_outputStrides[i] = outputStrides[ordering[i]]; } - for (int i = 0; i < NumDims; ++i) { - if (i > NumKernelDims) { - m_cudaInputStrides[i] = m_cudaInputStrides[i-1] * cudaInputDimensions[i-1]; - m_cudaOutputStrides[i] = m_cudaOutputStrides[i-1] * cudaOutputDimensions[i-1]; - } else { - m_cudaInputStrides[i] = 1; - m_cudaOutputStrides[i] = 1; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = 0; i < NumDims; ++i) { + if (i > NumKernelDims) { + m_cudaInputStrides[i] = + m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1]; + m_cudaOutputStrides[i] = + m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1]; + } else { + m_cudaInputStrides[i] = 1; + m_cudaOutputStrides[i] = 1; + } + } + } else { + for (int i = NumDims - 1; i >= 0; --i) { + if (i + 1 < offset) { + m_cudaInputStrides[i] = + m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1]; + m_cudaOutputStrides[i] = + m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1]; + } else { + m_cudaInputStrides[i] = 1; + m_cudaOutputStrides[i] = 1; + } } } } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const { Index inputIndex = 0; - for (int d = NumDims - 1; d > NumKernelDims; --d) { - const Index idx = p / m_cudaInputStrides[d]; - inputIndex += idx * m_inputStrides[d]; - p -= idx * m_cudaInputStrides[d]; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int d = NumDims - 1; d > NumKernelDims; --d) { + const Index idx = p / m_cudaInputStrides[d]; + inputIndex += idx * m_inputStrides[d]; + p -= idx * m_cudaInputStrides[d]; + } + inputIndex += p * m_inputStrides[NumKernelDims]; + } else { + int limit = 0; + if (NumKernelDims < NumDims) { + limit = NumDims - NumKernelDims - 1; + } + for (int d = 0; d < limit; ++d) { + const Index idx = p / m_cudaInputStrides[d]; + inputIndex += idx * m_inputStrides[d]; + p -= idx * m_cudaInputStrides[d]; + } + inputIndex += p * m_inputStrides[limit]; } - inputIndex += p * m_inputStrides[NumKernelDims]; return inputIndex; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const { Index outputIndex = 0; - for (int d = NumDims - 1; d > NumKernelDims; --d) { - const Index idx = p / m_cudaOutputStrides[d]; - outputIndex += idx * m_outputStrides[d]; - p -= idx * m_cudaOutputStrides[d]; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int d = NumDims - 1; d > NumKernelDims; --d) { + const Index idx = p / m_cudaOutputStrides[d]; + outputIndex += idx * m_outputStrides[d]; + p -= idx * m_cudaOutputStrides[d]; + } + outputIndex += p * m_outputStrides[NumKernelDims]; + } else { + int limit = 0; + if (NumKernelDims < NumDims) { + limit = NumDims - NumKernelDims - 1; + } + for (int d = 0; d < limit; ++d) { + const Index idx = p / m_cudaOutputStrides[d]; + outputIndex += idx * m_outputStrides[d]; + p -= idx * m_cudaOutputStrides[d]; + } + outputIndex += p * m_outputStrides[limit]; } - outputIndex += p * m_outputStrides[NumKernelDims]; return outputIndex; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const { - return i * m_inputStrides[0]; + const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_inputStrides[offset]; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const { - return i * m_outputStrides[0]; + const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_outputStrides[offset]; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const { - return i * m_inputStrides[0] + j*m_inputStrides[1]; + const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1]; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const { - return i * m_outputStrides[0] + j * m_outputStrides[1]; + const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1]; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const { - return i * m_inputStrides[0] + j*m_inputStrides[1] + k*m_inputStrides[2]; + const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] + + k * m_inputStrides[offset + 2]; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const { - return i * m_outputStrides[0] + j*m_outputStrides[1] + k*m_outputStrides[2]; + const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] + + k * m_outputStrides[offset + 2]; } private: @@ -237,35 +312,61 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device) { EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); - // Only column major tensors are supported for now. - EIGEN_STATIC_ASSERT((static_cast<int>(Layout) == static_cast<int>(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE); const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions(); const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions(); - m_inputStride[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_inputStride[i] = m_inputStride[i-1] * input_dims[i-1]; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_inputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1]; + } + } else { + m_inputStride[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1]; + } } m_dimensions = m_inputImpl.dimensions(); - for (int i = 0; i < NumKernelDims; ++i) { - const Index index = op.indices()[i]; - const Index input_dim = input_dims[index]; - const Index kernel_dim = kernel_dims[i]; - const Index result_dim = input_dim - kernel_dim + 1; - m_dimensions[index] = result_dim; - if (i > 0) { - m_kernelStride[i] = m_kernelStride[i-1] * kernel_dims[i-1]; - } else { - m_kernelStride[0] = 1; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + if (i > 0) { + m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1]; + } else { + m_kernelStride[0] = 1; + } + m_indexStride[i] = m_inputStride[index]; + } + + m_outputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1]; + } + } else { + for (int i = NumKernelDims - 1; i >= 0; --i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + if (i < NumKernelDims - 1) { + m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1]; + } else { + m_kernelStride[NumKernelDims - 1] = 1; + } + m_indexStride[i] = m_inputStride[index]; } - m_indexStride[i] = m_inputStride[index]; - } - m_outputStride[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_outputStride[i] = m_outputStride[i-1] * m_dimensions[i-1]; + m_outputStride[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1]; + } } } @@ -310,13 +411,24 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; Index indices[2] = {index, index+PacketSize-1}; Index startInputs[2] = {0, 0}; - for (int i = NumDims - 1; i > 0; --i) { - const Index idx0 = indices[0] / m_outputStride[i]; - const Index idx1 = indices[1] / m_outputStride[i]; - startInputs[0] += idx0 * m_inputStride[i]; - startInputs[1] += idx1 * m_inputStride[i]; - indices[0] -= idx0 * m_outputStride[i]; - indices[1] -= idx1 * m_outputStride[i]; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStride[i]; + const Index idx1 = indices[1] / m_outputStride[i]; + startInputs[0] += idx0 * m_inputStride[i]; + startInputs[1] += idx1 * m_inputStride[i]; + indices[0] -= idx0 * m_outputStride[i]; + indices[1] -= idx1 * m_outputStride[i]; + } + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / m_outputStride[i]; + const Index idx1 = indices[1] / m_outputStride[i]; + startInputs[0] += idx0 * m_inputStride[i]; + startInputs[1] += idx1 * m_inputStride[i]; + indices[0] -= idx0 * m_outputStride[i]; + indices[1] -= idx1 * m_outputStride[i]; + } } startInputs[0] += indices[0]; startInputs[1] += indices[1]; @@ -344,10 +456,18 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr private: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { Index startInput = 0; - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStride[i]; - startInput += idx * m_inputStride[i]; - index -= idx * m_outputStride[i]; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStride[i]; + startInput += idx * m_inputStride[i]; + index -= idx * m_outputStride[i]; + } + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStride[i]; + startInput += idx * m_inputStride[i]; + index -= idx * m_outputStride[i]; + } } startInput += index; return startInput; @@ -378,7 +498,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr } } - EIGEN_STRONG_INLINE void preloadKernel() { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() { // Don't make a local copy of the kernel unless we have to (i.e. it's an // expression that needs to be evaluated) const Scalar* in_place = m_kernelImpl.data(); @@ -431,11 +551,14 @@ struct GetKernelSize<Dynamic> { } }; - - - -template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSize> -__global__ void EigenConvolutionKernel1D(InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 1> indexMapper, const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int kernelSize, float* buffer) { +template <typename InputEvaluator, typename Index, typename InputDims, + int StaticKernelSize> +__global__ void EigenConvolutionKernel1D( + InputEvaluator eval, + const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout> + indexMapper, + const float* __restrict kernel, const int numPlanes, const int numX, + const int maxX, const int kernelSize, float* buffer) { extern __shared__ float s[]; const int first_x = blockIdx.x * maxX; @@ -453,7 +576,7 @@ __global__ void EigenConvolutionKernel1D(InputEvaluator eval, const internal::In #pragma unroll for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x); - s[i + plane_kernel_offset] = eval.coeff(tensor_index); + s[i + plane_kernel_offset] = eval.coeff(tensor_index); } __syncthreads(); @@ -476,9 +599,15 @@ __global__ void EigenConvolutionKernel1D(InputEvaluator eval, const internal::In } }; - -template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSizeX, int StaticKernelSizeY> -__global__ void EigenConvolutionKernel2D(InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 2> indexMapper, const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int numY, const int maxY, const int kernelSizeX, const int kernelSizeY, float* buffer) { +template <typename InputEvaluator, typename Index, typename InputDims, + int StaticKernelSizeX, int StaticKernelSizeY> +__global__ void EigenConvolutionKernel2D( + InputEvaluator eval, + const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout> + indexMapper, + const float* __restrict kernel, const int numPlanes, const int numX, + const int maxX, const int numY, const int maxY, const int kernelSizeX, + const int kernelSizeY, float* buffer) { extern __shared__ float s[]; const int first_x = blockIdx.x * maxX; @@ -538,9 +667,15 @@ __global__ void EigenConvolutionKernel2D(InputEvaluator eval, const internal::In } }; - template <typename InputEvaluator, typename Index, typename InputDims> -__global__ void EigenConvolutionKernel3D(InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 3> indexMapper, const float* __restrict kernel, const size_t numPlanes, const size_t numX, const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ, const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, const size_t kernelSizeZ, float* buffer) { +__global__ void EigenConvolutionKernel3D( + InputEvaluator eval, + const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout> + indexMapper, + const float* __restrict kernel, const size_t numPlanes, const size_t numX, + const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ, + const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, + const size_t kernelSizeZ, float* buffer) { extern __shared__ float s[]; // Load inputs to shared memory @@ -622,8 +757,6 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device) { EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); - // Only column major tensors are supported for now. - EIGEN_STATIC_ASSERT((static_cast<int>(Layout) == static_cast<int>(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE); const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions(); const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions(); @@ -712,10 +845,14 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr const int numX = dimensions()[m_indices[0]]; const int numP = dimensions().TotalSize() / numX; - int maxX; dim3 block_size; - if (m_indices[0] == 0) { + + const int single_stride_dim = + static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 0 + : m_inputImpl.dimensions().rank() - 1; + if (m_indices[0] == single_stride_dim) { // Maximum the reuse const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32; maxX = (std::min<int>)(inner_dim, numX); @@ -747,7 +884,8 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr const array<Index, 1> indices(m_indices[0]); const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]); - internal::IndexMapper<Index, InputDims, 1> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper( + m_inputImpl.dimensions(), kernel_dims, indices); switch(kernel_size) { case 4: { LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data); @@ -765,11 +903,15 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr } case 2: { - const int kernel_size_x = m_kernelImpl.dimensions()[0]; - const int kernel_size_y = m_kernelImpl.dimensions()[1]; - - const int numX = dimensions()[m_indices[0]]; - const int numY = dimensions()[m_indices[1]]; + const int idxX = + static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1; + const int idxY = + static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0; + const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; + const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; + + const int numX = dimensions()[m_indices[idxX]]; + const int numY = dimensions()[m_indices[idxY]]; const int numP = dimensions().TotalSize() / (numX*numY); const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x)); @@ -798,9 +940,11 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; - const array<Index, 2> indices(m_indices[0], m_indices[1]); - const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[0], m_kernelImpl.dimensions()[1]); - internal::IndexMapper<Index, InputDims, 2> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]); + const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX], + m_kernelImpl.dimensions()[idxY]); + internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper( + m_inputImpl.dimensions(), kernel_dims, indices); switch (kernel_size_x) { case 4: { switch (kernel_size_y) { @@ -837,13 +981,20 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr } case 3: { - const int kernel_size_x = m_kernelImpl.dimensions()[0]; - const int kernel_size_y = m_kernelImpl.dimensions()[1]; - const int kernel_size_z = m_kernelImpl.dimensions()[2]; - - const int numX = dimensions()[m_indices[0]]; - const int numY = dimensions()[m_indices[1]]; - const int numZ = dimensions()[m_indices[2]]; + const int idxX = + static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2; + const int idxY = + static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1; + const int idxZ = + static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0; + + const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; + const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; + const int kernel_size_z = m_kernelImpl.dimensions()[idxZ]; + + const int numX = dimensions()[m_indices[idxX]]; + const int numY = dimensions()[m_indices[idxY]]; + const int numZ = dimensions()[m_indices[idxZ]]; const int numP = dimensions().TotalSize() / (numX*numY*numZ); const int maxX = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX)); @@ -860,16 +1011,20 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr assert(shared_mem <= maxSharedMem); //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; - const array<Index, 3> indices(m_indices[0], m_indices[1], m_indices[2]); - const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[0], m_kernelImpl.dimensions()[1], m_kernelImpl.dimensions()[2]); - internal::IndexMapper<Index, InputDims, 3> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + const array<Index, 3> indices(m_indices[idxX], m_indices[idxY], + m_indices[idxZ]); + const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX], + m_kernelImpl.dimensions()[idxY], + m_kernelImpl.dimensions()[idxZ]); + internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper( + m_inputImpl.dimensions(), kernel_dims, indices); LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); break; } default: { - assert(false && "not supported yet"); + EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE); } } } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h index 649bdb308..17f10c07b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h @@ -21,8 +21,7 @@ namespace Eigen { * Example: * C.device(EIGEN_GPU) = A + B; * - * Todo: thread pools. - * Todo: operator +=, -=, *= and so on. + * Todo: operator *= and /=. */ template <typename ExpressionType, typename DeviceType> class TensorDevice { @@ -33,8 +32,7 @@ template <typename ExpressionType, typename DeviceType> class TensorDevice { EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign; Assign assign(m_expression, other); - static const bool Vectorize = TensorEvaluator<const Assign, DeviceType>::PacketAccess; - internal::TensorExecutor<const Assign, DeviceType, Vectorize>::run(assign, m_device); + internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device); return *this; } @@ -45,8 +43,18 @@ template <typename ExpressionType, typename DeviceType> class TensorDevice { Sum sum(m_expression, other); typedef TensorAssignOp<ExpressionType, const Sum> Assign; Assign assign(m_expression, sum); - static const bool Vectorize = TensorEvaluator<const Assign, DeviceType>::PacketAccess; - internal::TensorExecutor<const Assign, DeviceType, Vectorize>::run(assign, m_device); + internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device); + return *this; + } + + template<typename OtherDerived> + EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp<internal::scalar_difference_op<Scalar>, const ExpressionType, const OtherDerived> Difference; + Difference difference(m_expression, other); + typedef TensorAssignOp<ExpressionType, const Difference> Assign; + Assign assign(m_expression, difference); + internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device); return *this; } @@ -65,8 +73,7 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, ThreadPool EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign; Assign assign(m_expression, other); - static const bool Vectorize = TensorEvaluator<const Assign, ThreadPoolDevice>::PacketAccess; - internal::TensorExecutor<const Assign, ThreadPoolDevice, Vectorize>::run(assign, m_device); + internal::TensorExecutor<const Assign, ThreadPoolDevice>::run(assign, m_device); return *this; } @@ -77,8 +84,18 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, ThreadPool Sum sum(m_expression, other); typedef TensorAssignOp<ExpressionType, const Sum> Assign; Assign assign(m_expression, sum); - static const bool Vectorize = TensorEvaluator<const Assign, ThreadPoolDevice>::PacketAccess; - internal::TensorExecutor<const Assign, ThreadPoolDevice, Vectorize>::run(assign, m_device); + internal::TensorExecutor<const Assign, ThreadPoolDevice>::run(assign, m_device); + return *this; + } + + template<typename OtherDerived> + EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp<internal::scalar_difference_op<Scalar>, const ExpressionType, const OtherDerived> Difference; + Difference difference(m_expression, other); + typedef TensorAssignOp<ExpressionType, const Difference> Assign; + Assign assign(m_expression, difference); + internal::TensorExecutor<const Assign, ThreadPoolDevice>::run(assign, m_device); return *this; } @@ -99,7 +116,7 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, GpuDevice> EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign; Assign assign(m_expression, other); - internal::TensorExecutor<const Assign, GpuDevice, false>::run(assign, m_device); + internal::TensorExecutor<const Assign, GpuDevice>::run(assign, m_device); return *this; } @@ -110,13 +127,24 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, GpuDevice> Sum sum(m_expression, other); typedef TensorAssignOp<ExpressionType, const Sum> Assign; Assign assign(m_expression, sum); - internal::TensorExecutor<const Assign, GpuDevice, false>::run(assign, m_device); + internal::TensorExecutor<const Assign, GpuDevice>::run(assign, m_device); + return *this; + } + + template<typename OtherDerived> + EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp<internal::scalar_difference_op<Scalar>, const ExpressionType, const OtherDerived> Difference; + Difference difference(m_expression, other); + typedef TensorAssignOp<ExpressionType, const Difference> Assign; + Assign assign(m_expression, difference); + internal::TensorExecutor<const Assign, GpuDevice>::run(assign, m_device); return *this; } protected: const GpuDevice& m_device; - ExpressionType m_expression; + ExpressionType& m_expression; }; #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 5e805fd95..43917cbc3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -145,39 +145,39 @@ template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0 Sizes() { } template <typename DenseIndex> - explicit Sizes(const array<DenseIndex, Base::count>& indices) { + explicit Sizes(const array<DenseIndex, Base::count>& /*indices*/) { // todo: add assertion } #ifdef EIGEN_HAS_VARIADIC_TEMPLATES - template <typename... DenseIndex> Sizes(DenseIndex... indices) { } - explicit Sizes(std::initializer_list<std::size_t> l) { + template <typename... DenseIndex> Sizes(DenseIndex... /*indices*/) { } + explicit Sizes(std::initializer_list<std::size_t>) { // todo: add assertion } #else - EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0) { + EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex) { } - EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1) { + EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex) { } - EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) { + EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex, const DenseIndex) { } - EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) { + EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) { } - EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) { + EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) { } #endif - template <typename T> Sizes& operator = (const T& other) { + template <typename T> Sizes& operator = (const T&) { // to do: check the size of other return *this; } template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const { - return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count - 1, false>::run(indices, *static_cast<const Base*>(this); + return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count - 1, false>::run(indices, *static_cast<const Base*>(this)); } template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const { - return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count - 1, true>::run(indices, *static_cast<const Base*>(this); + return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count - 1, true>::run(indices, *static_cast<const Base*>(this)); } }; @@ -343,7 +343,7 @@ template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::s template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> struct array_size<Sizes<V1,V2,V3,V4,V5> > { static const size_t value = Sizes<V1,V2,V3,V4,V5>::count; }; -template <std::size_t n, std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes<V1,V2,V3,V4,V5>& a) { +template <std::size_t n, std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes<V1,V2,V3,V4,V5>&) { return get<n, typename Sizes<V1,V2,V3,V4,V5>::Base>::value; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index d084880de..9198c17ef 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -352,11 +352,12 @@ template<typename IfArgType, typename ThenArgType, typename ElseArgType, typenam struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>, Device> { typedef TensorSelectOp<IfArgType, ThenArgType, ElseArgType> XprType; + typedef typename XprType::Scalar Scalar; enum { IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned & TensorEvaluator<ElseArgType, Device>::IsAligned, - PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & TensorEvaluator<ElseArgType, Device>::PacketAccess/* & - TensorEvaluator<IfArgType>::PacketAccess*/, + PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & TensorEvaluator<ElseArgType, Device>::PacketAccess & + internal::packet_traits<Scalar>::HasBlend, Layout = TensorEvaluator<IfArgType, Device>::Layout, CoordAccess = false, // to be implemented }; @@ -373,7 +374,6 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType> } typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; typedef typename internal::traits<XprType>::Scalar CoeffReturnType; typedef typename internal::traits<XprType>::Packet PacketReturnType; typedef typename TensorEvaluator<IfArgType, Device>::Dimensions Dimensions; @@ -403,7 +403,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType> template<int LoadMode> EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { - static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; + const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; internal::Selector<PacketSize> select; for (Index i = 0; i < PacketSize; ++i) { select.select[i] = m_condImpl.coeff(index+i); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index 38586d067..25f085a59 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -77,7 +77,7 @@ template <typename T> struct MeanReducer } template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { - return (saccum + predux(vaccum)) / (scalarCount_ + packetCount_ * packet_traits<Packet>::size); + return (saccum + predux(vaccum)) / (scalarCount_ + packetCount_ * unpacket_traits<Packet>::size); } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h index a844a4d68..66ddfd554 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h @@ -54,7 +54,7 @@ struct traits<Tensor<Scalar_, NumIndices_, Options_> > static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor; enum { Options = Options_, - Flags = compute_tensor_flags<Scalar_, Options_>::ret | LvalueBit, + Flags = compute_tensor_flags<Scalar_, Options_>::ret | (is_const<Scalar_>::value ? 0 : LvalueBit), }; }; @@ -69,7 +69,7 @@ struct traits<TensorFixedSize<Scalar_, Dimensions, Options_> > static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor; enum { Options = Options_, - Flags = compute_tensor_flags<Scalar_, Options_>::ret | LvalueBit, + Flags = compute_tensor_flags<Scalar_, Options_>::ret | (is_const<Scalar_>::value ? 0: LvalueBit), }; }; @@ -86,7 +86,7 @@ struct traits<TensorMap<PlainObjectType, Options_> > static const int Layout = BaseTraits::Layout; enum { Options = Options_, - Flags = ((BaseTraits::Flags | LvalueBit) & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0), + Flags = (BaseTraits::Flags & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0), }; }; @@ -102,7 +102,7 @@ struct traits<TensorRef<PlainObjectType> > static const int Layout = BaseTraits::Layout; enum { Options = BaseTraits::Options, - Flags = ((BaseTraits::Flags | LvalueBit) & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0), + Flags = (BaseTraits::Flags & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0), }; }; diff --git a/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h b/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h index e1284c782..54e0c5d63 100644 --- a/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h +++ b/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h @@ -361,7 +361,6 @@ struct evaluator<DynamicSparseMatrix<_Scalar,_Options,_StorageIndex> > : evaluator_base<DynamicSparseMatrix<_Scalar,_Options,_StorageIndex> > { typedef _Scalar Scalar; - typedef _StorageIndex Index; typedef DynamicSparseMatrix<_Scalar,_Options,_StorageIndex> SparseMatrixType; typedef typename SparseMatrixType::InnerIterator InnerIterator; typedef typename SparseMatrixType::ReverseInnerIterator ReverseInnerIterator; @@ -378,6 +377,8 @@ struct evaluator<DynamicSparseMatrix<_Scalar,_Options,_StorageIndex> > operator const SparseMatrixType&() const { return *m_matrix; } Scalar coeff(Index row, Index col) const { return m_matrix->coeff(row,col); } + + Index nonZerosEstimate() const { return m_matrix->nonZeros(); } const SparseMatrixType *m_matrix; }; diff --git a/unsupported/Eigen/src/SparseExtra/MarketIO.h b/unsupported/Eigen/src/SparseExtra/MarketIO.h index 25ff4228d..100e617b2 100644 --- a/unsupported/Eigen/src/SparseExtra/MarketIO.h +++ b/unsupported/Eigen/src/SparseExtra/MarketIO.h @@ -18,7 +18,7 @@ namespace Eigen { namespace internal { template <typename Scalar> - inline bool GetMarketLine (std::stringstream& line, int& M, int& N, int& i, int& j, Scalar& value) + inline bool GetMarketLine (std::stringstream& line, Index& M, Index& N, Index& i, Index& j, Scalar& value) { line >> i >> j >> value; i--; @@ -31,7 +31,7 @@ namespace internal return false; } template <typename Scalar> - inline bool GetMarketLine (std::stringstream& line, int& M, int& N, int& i, int& j, std::complex<Scalar>& value) + inline bool GetMarketLine (std::stringstream& line, Index& M, Index& N, Index& i, Index& j, std::complex<Scalar>& value) { Scalar valR, valI; line >> i >> j >> valR >> valI; diff --git a/unsupported/test/cxx11_tensor_chipping.cpp b/unsupported/test/cxx11_tensor_chipping.cpp index d83417872..bfc2bad18 100644 --- a/unsupported/test/cxx11_tensor_chipping.cpp +++ b/unsupported/test/cxx11_tensor_chipping.cpp @@ -340,11 +340,9 @@ static void test_chip_as_lvalue() } } - -template<int DataLayout> -static void test_chip_raw_data() +static void test_chip_raw_data_col_major() { - Tensor<float, 5, DataLayout> tensor(2,3,5,7,11); + Tensor<float, 5, ColMajor> tensor(2,3,5,7,11); tensor.setRandom(); typedef TensorEvaluator<decltype(tensor.template chip<4>(3)), DefaultDevice> Evaluator4; @@ -353,12 +351,7 @@ static void test_chip_raw_data() for (int j = 0; j < 3; ++j) { for (int k = 0; k < 5; ++k) { for (int l = 0; l < 7; ++l) { - int chip_index; - if (DataLayout == ColMajor) { - chip_index = i + 2 * (j + 3 * (k + 5 * l)); - } else { - chip_index = 11 * (l + 7 * (k + 5 * (j + 3 * i))); - } + int chip_index = i + 2 * (j + 3 * (k + 5 * l)); VERIFY_IS_EQUAL(chip.data()[chip_index], tensor(i,j,k,l,3)); } } @@ -382,6 +375,41 @@ static void test_chip_raw_data() VERIFY_IS_EQUAL(chip3.data(), static_cast<float*>(0)); } +static void test_chip_raw_data_row_major() +{ + Tensor<float, 5, RowMajor> tensor(11,7,5,3,2); + tensor.setRandom(); + + typedef TensorEvaluator<decltype(tensor.template chip<0>(3)), DefaultDevice> Evaluator0; + auto chip = Evaluator0(tensor.template chip<0>(3), DefaultDevice()); + for (int i = 0; i < 7; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 2; ++l) { + int chip_index = l + 2 * (k + 3 * (j + 5 * i)); + VERIFY_IS_EQUAL(chip.data()[chip_index], tensor(3,i,j,k,l)); + } + } + } + } + + typedef TensorEvaluator<decltype(tensor.template chip<1>(0)), DefaultDevice> Evaluator1; + auto chip1 = Evaluator1(tensor.template chip<1>(0), DefaultDevice()); + VERIFY_IS_EQUAL(chip1.data(), static_cast<float*>(0)); + + typedef TensorEvaluator<decltype(tensor.template chip<2>(0)), DefaultDevice> Evaluator2; + auto chip2 = Evaluator2(tensor.template chip<2>(0), DefaultDevice()); + VERIFY_IS_EQUAL(chip2.data(), static_cast<float*>(0)); + + typedef TensorEvaluator<decltype(tensor.template chip<3>(0)), DefaultDevice> Evaluator3; + auto chip3 = Evaluator3(tensor.template chip<3>(0), DefaultDevice()); + VERIFY_IS_EQUAL(chip3.data(), static_cast<float*>(0)); + + typedef TensorEvaluator<decltype(tensor.template chip<4>(0)), DefaultDevice> Evaluator4; + auto chip4 = Evaluator4(tensor.template chip<4>(0), DefaultDevice()); + VERIFY_IS_EQUAL(chip4.data(), static_cast<float*>(0)); +} + void test_cxx11_tensor_chipping() { CALL_SUBTEST(test_simple_chip<ColMajor>()); @@ -392,6 +420,6 @@ void test_cxx11_tensor_chipping() CALL_SUBTEST(test_chip_in_expr<RowMajor>()); CALL_SUBTEST(test_chip_as_lvalue<ColMajor>()); CALL_SUBTEST(test_chip_as_lvalue<RowMajor>()); - CALL_SUBTEST(test_chip_raw_data<ColMajor>()); - CALL_SUBTEST(test_chip_raw_data<RowMajor>()); + CALL_SUBTEST(test_chip_raw_data_col_major()); + CALL_SUBTEST(test_chip_raw_data_row_major()); } diff --git a/unsupported/test/cxx11_tensor_const.cpp b/unsupported/test/cxx11_tensor_const.cpp index 0ffb02afd..ad9c9da39 100644 --- a/unsupported/test/cxx11_tensor_const.cpp +++ b/unsupported/test/cxx11_tensor_const.cpp @@ -13,8 +13,6 @@ using Eigen::Tensor; - - static void test_simple_assign() { Tensor<int, 3> random(2,3,7); @@ -33,7 +31,32 @@ static void test_simple_assign() } } + +static void test_assign_of_const_tensor() +{ + Tensor<int, 3> random(2,3,7); + random.setRandom(); + + TensorMap<Tensor<const int, 3> > constant1(random.data(), 2, 3, 7); + TensorMap<const Tensor<int, 3> > constant2(random.data(), 2, 3, 7); + const TensorMap<Tensor<int, 3> > constant3(random.data(), 2, 3, 7); + + Tensor<int, 2> result1 = constant1.chip(0, 2); + Tensor<int, 2> result2 = constant2.chip(0, 2); + Tensor<int, 2> result3 = constant3.chip(0, 2); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_EQUAL((result1(i,j)), random(i,j,0)); + VERIFY_IS_EQUAL((result2(i,j)), random(i,j,0)); + VERIFY_IS_EQUAL((result3(i,j)), random(i,j,0)); + } + } +} + + void test_cxx11_tensor_const() { CALL_SUBTEST(test_simple_assign()); + CALL_SUBTEST(test_assign_of_const_tensor()); } diff --git a/unsupported/test/cxx11_tensor_convolution.cpp b/unsupported/test/cxx11_tensor_convolution.cpp index 4672db463..3a12dae62 100644 --- a/unsupported/test/cxx11_tensor_convolution.cpp +++ b/unsupported/test/cxx11_tensor_convolution.cpp @@ -14,15 +14,16 @@ using Eigen::Tensor; using Eigen::DefaultDevice; +template <int DataLayout> static void test_evals() { - Tensor<float, 2> input(3, 3); - Tensor<float, 1> kernel(2); + Tensor<float, 2, DataLayout> input(3, 3); + Tensor<float, 1, DataLayout> kernel(2); input.setRandom(); kernel.setRandom(); - Tensor<float, 2> result(2,3); + Tensor<float, 2, DataLayout> result(2,3); result.setZero(); Eigen::array<Tensor<float, 2>::Index, 1> dims3({0}); @@ -41,15 +42,15 @@ static void test_evals() VERIFY_IS_APPROX(result(1,2), input(1,2)*kernel(0) + input(2,2)*kernel(1)); // index 5 } - +template <int DataLayout> static void test_expr() { - Tensor<float, 2> input(3, 3); - Tensor<float, 2> kernel(2, 2); + Tensor<float, 2, DataLayout> input(3, 3); + Tensor<float, 2, DataLayout> kernel(2, 2); input.setRandom(); kernel.setRandom(); - Tensor<float, 2> result(2,2); + Tensor<float, 2, DataLayout> result(2,2); Eigen::array<ptrdiff_t, 2> dims({0, 1}); result = input.convolve(kernel, dims); @@ -63,10 +64,10 @@ static void test_expr() input(2,1)*kernel(1,0) + input(2,2)*kernel(1,1)); } - +template <int DataLayout> static void test_modes() { - Tensor<float, 1> input(3); - Tensor<float, 1> kernel(3); + Tensor<float, 1, DataLayout> input(3); + Tensor<float, 1, DataLayout> kernel(3); input(0) = 1.0f; input(1) = 2.0f; input(2) = 3.0f; @@ -74,13 +75,13 @@ static void test_modes() { kernel(1) = 1.0f; kernel(2) = 0.0f; - const Eigen::array<ptrdiff_t, 1> dims{{0}}; + const Eigen::array<ptrdiff_t, 1> dims({0}); Eigen::array<std::pair<ptrdiff_t, ptrdiff_t>, 1> padding; // Emulate VALID mode (as defined in // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). padding[0] = std::make_pair(0, 0); - Tensor<float, 1> valid(1); + Tensor<float, 1, DataLayout> valid(1); valid = input.pad(padding).convolve(kernel, dims); VERIFY_IS_EQUAL(valid.dimension(0), 1); VERIFY_IS_APPROX(valid(0), 2.5f); @@ -88,7 +89,7 @@ static void test_modes() { // Emulate SAME mode (as defined in // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). padding[0] = std::make_pair(1, 1); - Tensor<float, 1> same(3); + Tensor<float, 1, DataLayout> same(3); same = input.pad(padding).convolve(kernel, dims); VERIFY_IS_EQUAL(same.dimension(0), 3); VERIFY_IS_APPROX(same(0), 1.0f); @@ -98,7 +99,7 @@ static void test_modes() { // Emulate FULL mode (as defined in // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). padding[0] = std::make_pair(2, 2); - Tensor<float, 1> full(5); + Tensor<float, 1, DataLayout> full(5); full = input.pad(padding).convolve(kernel, dims); VERIFY_IS_EQUAL(full.dimension(0), 5); VERIFY_IS_APPROX(full(0), 0.0f); @@ -108,18 +109,18 @@ static void test_modes() { VERIFY_IS_APPROX(full(4), 1.5f); } - +template <int DataLayout> static void test_strides() { - Tensor<float, 1> input(13); - Tensor<float, 1> kernel(3); + Tensor<float, 1, DataLayout> input(13); + Tensor<float, 1, DataLayout> kernel(3); input.setRandom(); kernel.setRandom(); - const Eigen::array<ptrdiff_t, 1> dims{{0}}; - const Eigen::array<ptrdiff_t, 1> stride_of_3{{3}}; - const Eigen::array<ptrdiff_t, 1> stride_of_2{{2}}; + const Eigen::array<ptrdiff_t, 1> dims({0}); + const Eigen::array<ptrdiff_t, 1> stride_of_3({3}); + const Eigen::array<ptrdiff_t, 1> stride_of_2({2}); - Tensor<float, 1> result; + Tensor<float, 1, DataLayout> result; result = input.stride(stride_of_3).convolve(kernel, dims).stride(stride_of_2); VERIFY_IS_EQUAL(result.dimension(0), 2); @@ -129,13 +130,14 @@ static void test_strides() { input(12)*kernel(2))); } - - - void test_cxx11_tensor_convolution() { - CALL_SUBTEST(test_evals()); - CALL_SUBTEST(test_expr()); - CALL_SUBTEST(test_modes()); - CALL_SUBTEST(test_strides()); + CALL_SUBTEST(test_evals<ColMajor>()); + CALL_SUBTEST(test_evals<RowMajor>()); + CALL_SUBTEST(test_expr<ColMajor>()); + CALL_SUBTEST(test_expr<RowMajor>()); + CALL_SUBTEST(test_modes<ColMajor>()); + CALL_SUBTEST(test_modes<RowMajor>()); + CALL_SUBTEST(test_strides<ColMajor>()); + CALL_SUBTEST(test_strides<RowMajor>()); } diff --git a/unsupported/test/cxx11_tensor_cuda.cpp b/unsupported/test/cxx11_tensor_cuda.cpp index 8c1ca1bf8..78934165f 100644 --- a/unsupported/test/cxx11_tensor_cuda.cpp +++ b/unsupported/test/cxx11_tensor_cuda.cpp @@ -117,11 +117,10 @@ void test_cuda_elementwise() } } - void test_cuda_reduction() { - Tensor<float, 4> in1(Eigen::array<int, 4>(72,53,97,113)); - Tensor<float, 2> out(Eigen::array<int, 2>(72,97)); + Tensor<float, 4> in1(72,53,97,113); + Tensor<float, 2> out(72,97); in1.setRandom(); std::size_t in1_bytes = in1.size() * sizeof(float); @@ -138,8 +137,8 @@ void test_cuda_reduction() assert(cudaStreamCreate(&stream) == cudaSuccess); Eigen::GpuDevice gpu_device(&stream); - Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, Eigen::array<int, 4>(72,53,97,113)); - Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, Eigen::array<int, 2>(72,97)); + Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113); + Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97); array<int, 2> reduction_axis; reduction_axis[0] = 1; @@ -156,10 +155,10 @@ void test_cuda_reduction() for (int k = 0; k < 53; ++k) { for (int l = 0; l < 113; ++l) { expected = - std::max<float>(expected, in1(Eigen::array<int, 4>(i, k, j, l))); + std::max<float>(expected, in1(i, k, j, l)); } } - VERIFY_IS_APPROX(out(Eigen::array<int, 2>(i,j)), expected); + VERIFY_IS_APPROX(out(i,j), expected); } } } @@ -170,7 +169,7 @@ static void test_cuda_contraction() // with these dimensions, the output has 300 * 140 elements, which is // more than 30 * 1024, which is the number of threads in blocks on // a 15 SM GK110 GPU - Tensor<float, 4, DataLayout> t_left(Eigen::array<int, 4>(6, 50, 3, 31)); + Tensor<float, 4, DataLayout> t_left(6, 50, 3, 31); Tensor<float, 5, DataLayout> t_right(Eigen::array<int, 5>(3, 31, 7, 20, 1)); Tensor<float, 5, DataLayout> t_result(Eigen::array<int, 5>(6, 50, 7, 20, 1)); @@ -196,12 +195,9 @@ static void test_cuda_contraction() assert(cudaStreamCreate(&stream) == cudaSuccess); Eigen::GpuDevice gpu_device(&stream); - Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > - gpu_t_left(d_t_left, Eigen::array<int, 4>(6, 50, 3, 31)); - Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > - gpu_t_right(d_t_right, Eigen::array<int, 5>(3, 31, 7, 20, 1)); - Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > - gpu_t_result(d_t_result, Eigen::array<int, 5>(6, 50, 7, 20, 1)); + Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31); + Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_right(d_t_right, 3, 31, 7, 20, 1); + Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_result(d_t_result, 6, 50, 7, 20, 1); typedef Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> > MapXf; MapXf m_left(t_left.data(), 300, 93); @@ -226,11 +222,12 @@ static void test_cuda_contraction() } } +template<int DataLayout> static void test_cuda_convolution_1d() { - Tensor<float, 4> input(Eigen::array<int, 4>(74,37,11,137)); - Tensor<float, 1> kernel(Eigen::array<int, 1>(4)); - Tensor<float, 4> out(Eigen::array<int, 4>(74,34,11,137)); + Tensor<float, 4, DataLayout> input(74,37,11,137); + Tensor<float, 1, DataLayout> kernel(4); + Tensor<float, 4, DataLayout> out(74,34,11,137); input = input.constant(10.0f) + input.random(); kernel = kernel.constant(7.0f) + kernel.random(); @@ -252,9 +249,9 @@ static void test_cuda_convolution_1d() assert(cudaStreamCreate(&stream) == cudaSuccess); Eigen::GpuDevice gpu_device(&stream); - Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_input(d_input, Eigen::array<int, 4>(74,37,11,137)); - Eigen::TensorMap<Eigen::Tensor<float, 1> > gpu_kernel(d_kernel, Eigen::array<int, 1>(4)); - Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_out(d_out, Eigen::array<int, 4>(74,34,11,137)); + Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137); + Eigen::TensorMap<Eigen::Tensor<float, 1, DataLayout> > gpu_kernel(d_kernel, 4); + Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out, 74,34,11,137); Eigen::array<int, 1> dims(1); gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); @@ -266,11 +263,9 @@ static void test_cuda_convolution_1d() for (int j = 0; j < 34; ++j) { for (int k = 0; k < 11; ++k) { for (int l = 0; l < 137; ++l) { - const float result = out(Eigen::array<int, 4>(i,j,k,l)); - const float expected = input(Eigen::array<int, 4>(i,j+0,k,l)) * kernel(Eigen::array<int, 1>(0)) + - input(Eigen::array<int, 4>(i,j+1,k,l)) * kernel(Eigen::array<int, 1>(1)) + - input(Eigen::array<int, 4>(i,j+2,k,l)) * kernel(Eigen::array<int, 1>(2)) + - input(Eigen::array<int, 4>(i,j+3,k,l)) * kernel(Eigen::array<int, 1>(3)); + const float result = out(i,j,k,l); + const float expected = input(i,j+0,k,l) * kernel(0) + input(i,j+1,k,l) * kernel(1) + + input(i,j+2,k,l) * kernel(2) + input(i,j+3,k,l) * kernel(3); VERIFY_IS_APPROX(result, expected); } } @@ -278,12 +273,11 @@ static void test_cuda_convolution_1d() } } - -static void test_cuda_convolution_2d() +static void test_cuda_convolution_inner_dim_col_major_1d() { - Tensor<float, 4> input(Eigen::array<int, 4>(74,37,11,137)); - Tensor<float, 2> kernel(Eigen::array<int, 2>(3,4)); - Tensor<float, 4> out(Eigen::array<int, 4>(74,35,8,137)); + Tensor<float, 4, ColMajor> input(74,9,11,7); + Tensor<float, 1, ColMajor> kernel(4); + Tensor<float, 4, ColMajor> out(71,9,11,7); input = input.constant(10.0f) + input.random(); kernel = kernel.constant(7.0f) + kernel.random(); @@ -305,46 +299,35 @@ static void test_cuda_convolution_2d() assert(cudaStreamCreate(&stream) == cudaSuccess); Eigen::GpuDevice gpu_device(&stream); - Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_input(d_input, Eigen::array<int, 4>(74,37,11,137)); - Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_kernel(d_kernel, Eigen::array<int, 2>(3,4)); - Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_out(d_out, Eigen::array<int, 4>(74,35,8,137)); + Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7); + Eigen::TensorMap<Eigen::Tensor<float, 1, ColMajor> > gpu_kernel(d_kernel,4); + Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_out(d_out,71,9,11,7); - Eigen::array<int, 2> dims(1,2); + Eigen::array<int, 1> dims(0); gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); - for (int i = 0; i < 74; ++i) { - for (int j = 0; j < 35; ++j) { - for (int k = 0; k < 8; ++k) { - for (int l = 0; l < 137; ++l) { - const float result = out(Eigen::array<int, 4>(i,j,k,l)); - const float expected = input(Eigen::array<int, 4>(i,j+0,k+0,l)) * kernel(Eigen::array<int, 2>(0,0)) + - input(Eigen::array<int, 4>(i,j+1,k+0,l)) * kernel(Eigen::array<int, 2>(1,0)) + - input(Eigen::array<int, 4>(i,j+2,k+0,l)) * kernel(Eigen::array<int, 2>(2,0)) + - input(Eigen::array<int, 4>(i,j+0,k+1,l)) * kernel(Eigen::array<int, 2>(0,1)) + - input(Eigen::array<int, 4>(i,j+1,k+1,l)) * kernel(Eigen::array<int, 2>(1,1)) + - input(Eigen::array<int, 4>(i,j+2,k+1,l)) * kernel(Eigen::array<int, 2>(2,1)) + - input(Eigen::array<int, 4>(i,j+0,k+2,l)) * kernel(Eigen::array<int, 2>(0,2)) + - input(Eigen::array<int, 4>(i,j+1,k+2,l)) * kernel(Eigen::array<int, 2>(1,2)) + - input(Eigen::array<int, 4>(i,j+2,k+2,l)) * kernel(Eigen::array<int, 2>(2,2)) + - input(Eigen::array<int, 4>(i,j+0,k+3,l)) * kernel(Eigen::array<int, 2>(0,3)) + - input(Eigen::array<int, 4>(i,j+1,k+3,l)) * kernel(Eigen::array<int, 2>(1,3)) + - input(Eigen::array<int, 4>(i,j+2,k+3,l)) * kernel(Eigen::array<int, 2>(2,3)); - VERIFY_IS_APPROX(result, expected); + for (int i = 0; i < 71; ++i) { + for (int j = 0; j < 9; ++j) { + for (int k = 0; k < 11; ++k) { + for (int l = 0; l < 7; ++l) { + const float result = out(i,j,k,l); + const float expected = input(i+0,j,k,l) * kernel(0) + input(i+1,j,k,l) * kernel(1) + + input(i+2,j,k,l) * kernel(2) + input(i+3,j,k,l) * kernel(3); + VERIFY_IS_APPROX(result, expected); } } } } } - -static void test_cuda_convolution_3d() +static void test_cuda_convolution_inner_dim_row_major_1d() { - Tensor<float, 5> input(Eigen::array<int, 5>(74,37,11,137,17)); - Tensor<float, 3> kernel(Eigen::array<int, 3>(3,4,2)); - Tensor<float, 5> out(Eigen::array<int, 5>(74,35,8,136,17)); + Tensor<float, 4, RowMajor> input(7,9,11,74); + Tensor<float, 1, RowMajor> kernel(4); + Tensor<float, 4, RowMajor> out(7,9,11,71); input = input.constant(10.0f) + input.random(); kernel = kernel.constant(7.0f) + kernel.random(); @@ -366,139 +349,166 @@ static void test_cuda_convolution_3d() assert(cudaStreamCreate(&stream) == cudaSuccess); Eigen::GpuDevice gpu_device(&stream); - Eigen::TensorMap<Eigen::Tensor<float, 5> > gpu_input(d_input, Eigen::array<int, 5>(74,37,11,137,17)); - Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_kernel(d_kernel, Eigen::array<int, 3>(3,4,2)); - Eigen::TensorMap<Eigen::Tensor<float, 5> > gpu_out(d_out, Eigen::array<int, 5>(74,35,8,136,17)); + Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74); + Eigen::TensorMap<Eigen::Tensor<float, 1, RowMajor> > gpu_kernel(d_kernel, 4); + Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_out(d_out, 7,9,11,71); - Eigen::array<int, 3> dims(1,2,3); + Eigen::array<int, 1> dims(3); gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); - for (int i = 0; i < 74; ++i) { - for (int j = 0; j < 35; ++j) { - for (int k = 0; k < 8; ++k) { - for (int l = 0; l < 136; ++l) { - for (int m = 0; m < 17; ++m) { - const float result = out(Eigen::array<int, 5>(i,j,k,l,m)); - const float expected = input(Eigen::array<int, 5>(i,j+0,k+0,l+0,m)) * kernel(Eigen::array<int, 3>(0,0,0)) + - input(Eigen::array<int, 5>(i,j+1,k+0,l+0,m)) * kernel(Eigen::array<int, 3>(1,0,0)) + - input(Eigen::array<int, 5>(i,j+2,k+0,l+0,m)) * kernel(Eigen::array<int, 3>(2,0,0)) + - input(Eigen::array<int, 5>(i,j+0,k+1,l+0,m)) * kernel(Eigen::array<int, 3>(0,1,0)) + - input(Eigen::array<int, 5>(i,j+1,k+1,l+0,m)) * kernel(Eigen::array<int, 3>(1,1,0)) + - input(Eigen::array<int, 5>(i,j+2,k+1,l+0,m)) * kernel(Eigen::array<int, 3>(2,1,0)) + - input(Eigen::array<int, 5>(i,j+0,k+2,l+0,m)) * kernel(Eigen::array<int, 3>(0,2,0)) + - input(Eigen::array<int, 5>(i,j+1,k+2,l+0,m)) * kernel(Eigen::array<int, 3>(1,2,0)) + - input(Eigen::array<int, 5>(i,j+2,k+2,l+0,m)) * kernel(Eigen::array<int, 3>(2,2,0)) + - input(Eigen::array<int, 5>(i,j+0,k+3,l+0,m)) * kernel(Eigen::array<int, 3>(0,3,0)) + - input(Eigen::array<int, 5>(i,j+1,k+3,l+0,m)) * kernel(Eigen::array<int, 3>(1,3,0)) + - input(Eigen::array<int, 5>(i,j+2,k+3,l+0,m)) * kernel(Eigen::array<int, 3>(2,3,0)) + - input(Eigen::array<int, 5>(i,j+0,k+0,l+1,m)) * kernel(Eigen::array<int, 3>(0,0,1)) + - input(Eigen::array<int, 5>(i,j+1,k+0,l+1,m)) * kernel(Eigen::array<int, 3>(1,0,1)) + - input(Eigen::array<int, 5>(i,j+2,k+0,l+1,m)) * kernel(Eigen::array<int, 3>(2,0,1)) + - input(Eigen::array<int, 5>(i,j+0,k+1,l+1,m)) * kernel(Eigen::array<int, 3>(0,1,1)) + - input(Eigen::array<int, 5>(i,j+1,k+1,l+1,m)) * kernel(Eigen::array<int, 3>(1,1,1)) + - input(Eigen::array<int, 5>(i,j+2,k+1,l+1,m)) * kernel(Eigen::array<int, 3>(2,1,1)) + - input(Eigen::array<int, 5>(i,j+0,k+2,l+1,m)) * kernel(Eigen::array<int, 3>(0,2,1)) + - input(Eigen::array<int, 5>(i,j+1,k+2,l+1,m)) * kernel(Eigen::array<int, 3>(1,2,1)) + - input(Eigen::array<int, 5>(i,j+2,k+2,l+1,m)) * kernel(Eigen::array<int, 3>(2,2,1)) + - input(Eigen::array<int, 5>(i,j+0,k+3,l+1,m)) * kernel(Eigen::array<int, 3>(0,3,1)) + - input(Eigen::array<int, 5>(i,j+1,k+3,l+1,m)) * kernel(Eigen::array<int, 3>(1,3,1)) + - input(Eigen::array<int, 5>(i,j+2,k+3,l+1,m)) * kernel(Eigen::array<int, 3>(2,3,1)); - VERIFY_IS_APPROX(result, expected); - } + for (int i = 0; i < 7; ++i) { + for (int j = 0; j < 9; ++j) { + for (int k = 0; k < 11; ++k) { + for (int l = 0; l < 71; ++l) { + const float result = out(i,j,k,l); + const float expected = input(i,j,k,l+0) * kernel(0) + input(i,j,k,l+1) * kernel(1) + + input(i,j,k,l+2) * kernel(2) + input(i,j,k,l+3) * kernel(3); + VERIFY_IS_APPROX(result, expected); } } } } } -static float* CudaCopyFloat(float* data, int size) { - const int nbytes = size * sizeof(float); - float* result = NULL; - if (cudaMalloc((void**)(&result), nbytes) != cudaSuccess) { - return NULL; - } else { - if (data != NULL) { - cudaMemcpy(result, data, nbytes, cudaMemcpyHostToDevice); - } - return result; - } -} - -static void test_cuda_constant_broadcast() +template<int DataLayout> +static void test_cuda_convolution_2d() { + Tensor<float, 4, DataLayout> input(74,37,11,137); + Tensor<float, 2, DataLayout> kernel(3,4); + Tensor<float, 4, DataLayout> out(74,35,8,137); + input = input.constant(10.0f) + input.random(); + kernel = kernel.constant(7.0f) + kernel.random(); + + std::size_t input_bytes = input.size() * sizeof(float); + std::size_t kernel_bytes = kernel.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_input; + float* d_kernel; + float* d_out; + cudaMalloc((void**)(&d_input), input_bytes); + cudaMalloc((void**)(&d_kernel), kernel_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + cudaStream_t stream; assert(cudaStreamCreate(&stream) == cudaSuccess); Eigen::GpuDevice gpu_device(&stream); - Tensor<float, 1> t1(10); - for (int i = 0; i < 10; ++i) { - t1(i) = 10.0f * i; - } - float* t1_cuda = CudaCopyFloat(t1.data(), t1.size()); - Eigen::TensorMap<Eigen::Tensor<float, 1> > t1_gpu(t1_cuda, 10); - - Tensor<float, 1> t2(1); - t2 = t2.constant(20.0f); - float* t2_cuda = CudaCopyFloat(t2.data(), t2.size()); - Eigen::TensorMap<Eigen::TensorFixedSize<float, Sizes<1> > > t2_gpu(t2_cuda, 1); + Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137); + Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_kernel(d_kernel,3,4); + Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out,74,35,8,137); - float* t3_cuda = CudaCopyFloat(NULL, 10); - Eigen::TensorMap<Eigen::Tensor<float, 1> > t3_gpu(t3_cuda, 10); - - t3_gpu.device(gpu_device) = - t1_gpu + t2_gpu.broadcast(Eigen::array<int, 1>(10)); + Eigen::array<int, 2> dims(1,2); + gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); - Eigen::Tensor<float, 1> t3(10); - cudaMemcpy(t3.data(), t3_gpu.data(), 10 * sizeof(float), - cudaMemcpyDeviceToHost); + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); - for (int i = 0; i < 10; ++i) { - VERIFY_IS_APPROX(t3(i), t1(i) + t2(0)); + for (int i = 0; i < 74; ++i) { + for (int j = 0; j < 35; ++j) { + for (int k = 0; k < 8; ++k) { + for (int l = 0; l < 137; ++l) { + const float result = out(i,j,k,l); + const float expected = input(i,j+0,k+0,l) * kernel(0,0) + + input(i,j+1,k+0,l) * kernel(1,0) + + input(i,j+2,k+0,l) * kernel(2,0) + + input(i,j+0,k+1,l) * kernel(0,1) + + input(i,j+1,k+1,l) * kernel(1,1) + + input(i,j+2,k+1,l) * kernel(2,1) + + input(i,j+0,k+2,l) * kernel(0,2) + + input(i,j+1,k+2,l) * kernel(1,2) + + input(i,j+2,k+2,l) * kernel(2,2) + + input(i,j+0,k+3,l) * kernel(0,3) + + input(i,j+1,k+3,l) * kernel(1,3) + + input(i,j+2,k+3,l) * kernel(2,3); + VERIFY_IS_APPROX(result, expected); + } + } + } } } - -void test_cuda_cast() +template<int DataLayout> +static void test_cuda_convolution_3d() { - Tensor<double, 3> in(Eigen::array<int, 3>(72,53,97)); - Tensor<float, 3> out(Eigen::array<int, 3>(72,53,97)); - in.setRandom(); + Tensor<float, 5, DataLayout> input(Eigen::array<int, 5>(74,37,11,137,17)); + Tensor<float, 3, DataLayout> kernel(3,4,2); + Tensor<float, 5, DataLayout> out(Eigen::array<int, 5>(74,35,8,136,17)); + input = input.constant(10.0f) + input.random(); + kernel = kernel.constant(7.0f) + kernel.random(); - std::size_t in_bytes = in.size() * sizeof(double); + std::size_t input_bytes = input.size() * sizeof(float); + std::size_t kernel_bytes = kernel.size() * sizeof(float); std::size_t out_bytes = out.size() * sizeof(float); - double* d_in; + float* d_input; + float* d_kernel; float* d_out; - cudaMalloc((void**)(&d_in), in_bytes); + cudaMalloc((void**)(&d_input), input_bytes); + cudaMalloc((void**)(&d_kernel), kernel_bytes); cudaMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_in, in.data(), in_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); cudaStream_t stream; assert(cudaStreamCreate(&stream) == cudaSuccess); Eigen::GpuDevice gpu_device(&stream); - Eigen::TensorMap<Eigen::Tensor<double, 3> > gpu_in(d_in, Eigen::array<int, 3>(72,53,97)); - Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<int, 3>(72,53,97)); + Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17); + Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > gpu_kernel(d_kernel,3,4,2); + Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_out(d_out,74,35,8,136,17); - gpu_out.device(gpu_device) = gpu_in.template cast<float>(); + Eigen::array<int, 3> dims(1,2,3); + gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); - for (int i = 0; i < 72; ++i) { - for (int j = 0; j < 53; ++j) { - for (int k = 0; k < 97; ++k) { - VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), static_cast<float>(in(Eigen::array<int, 3>(i,j,k)))); + for (int i = 0; i < 74; ++i) { + for (int j = 0; j < 35; ++j) { + for (int k = 0; k < 8; ++k) { + for (int l = 0; l < 136; ++l) { + for (int m = 0; m < 17; ++m) { + const float result = out(i,j,k,l,m); + const float expected = input(i,j+0,k+0,l+0,m) * kernel(0,0,0) + + input(i,j+1,k+0,l+0,m) * kernel(1,0,0) + + input(i,j+2,k+0,l+0,m) * kernel(2,0,0) + + input(i,j+0,k+1,l+0,m) * kernel(0,1,0) + + input(i,j+1,k+1,l+0,m) * kernel(1,1,0) + + input(i,j+2,k+1,l+0,m) * kernel(2,1,0) + + input(i,j+0,k+2,l+0,m) * kernel(0,2,0) + + input(i,j+1,k+2,l+0,m) * kernel(1,2,0) + + input(i,j+2,k+2,l+0,m) * kernel(2,2,0) + + input(i,j+0,k+3,l+0,m) * kernel(0,3,0) + + input(i,j+1,k+3,l+0,m) * kernel(1,3,0) + + input(i,j+2,k+3,l+0,m) * kernel(2,3,0) + + input(i,j+0,k+0,l+1,m) * kernel(0,0,1) + + input(i,j+1,k+0,l+1,m) * kernel(1,0,1) + + input(i,j+2,k+0,l+1,m) * kernel(2,0,1) + + input(i,j+0,k+1,l+1,m) * kernel(0,1,1) + + input(i,j+1,k+1,l+1,m) * kernel(1,1,1) + + input(i,j+2,k+1,l+1,m) * kernel(2,1,1) + + input(i,j+0,k+2,l+1,m) * kernel(0,2,1) + + input(i,j+1,k+2,l+1,m) * kernel(1,2,1) + + input(i,j+2,k+2,l+1,m) * kernel(2,2,1) + + input(i,j+0,k+3,l+1,m) * kernel(0,3,1) + + input(i,j+1,k+3,l+1,m) * kernel(1,3,1) + + input(i,j+2,k+3,l+1,m) * kernel(2,3,1); + VERIFY_IS_APPROX(result, expected); + } + } } } } } - void test_cxx11_tensor_cuda() { CALL_SUBTEST(test_cuda_elementwise_small()); @@ -506,9 +516,12 @@ void test_cxx11_tensor_cuda() CALL_SUBTEST(test_cuda_reduction()); CALL_SUBTEST(test_cuda_contraction<ColMajor>()); CALL_SUBTEST(test_cuda_contraction<RowMajor>()); - CALL_SUBTEST(test_cuda_convolution_1d()); - CALL_SUBTEST(test_cuda_convolution_2d()); - CALL_SUBTEST(test_cuda_convolution_3d()); - CALL_SUBTEST(test_cuda_constant_broadcast()); - CALL_SUBTEST(test_cuda_cast()); + CALL_SUBTEST(test_cuda_convolution_1d<ColMajor>()); + CALL_SUBTEST(test_cuda_convolution_1d<RowMajor>()); + CALL_SUBTEST(test_cuda_convolution_inner_dim_col_major_1d()); + CALL_SUBTEST(test_cuda_convolution_inner_dim_row_major_1d()); + CALL_SUBTEST(test_cuda_convolution_2d<ColMajor>()); + CALL_SUBTEST(test_cuda_convolution_2d<RowMajor>()); + CALL_SUBTEST(test_cuda_convolution_3d<ColMajor>()); + CALL_SUBTEST(test_cuda_convolution_3d<RowMajor>()); } diff --git a/unsupported/test/cxx11_tensor_index_list.cpp b/unsupported/test/cxx11_tensor_index_list.cpp index c4d4f244f..4ce5add32 100644 --- a/unsupported/test/cxx11_tensor_index_list.cpp +++ b/unsupported/test/cxx11_tensor_index_list.cpp @@ -255,6 +255,17 @@ static void test_mixed_index_list() VERIFY_IS_APPROX(result3(0), expected); } + +static void test_dim_check() +{ + Eigen::IndexList<Eigen::type2index<1>, int> dim1; + dim1.set(1, 2); + Eigen::IndexList<Eigen::type2index<1>, int> dim2; + dim2.set(1, 2); + VERIFY(dimensions_match(dim1, dim2)); +} + + #endif void test_cxx11_tensor_index_list() @@ -264,5 +275,6 @@ void test_cxx11_tensor_index_list() CALL_SUBTEST(test_type2index_list()); CALL_SUBTEST(test_dynamic_index_list()); CALL_SUBTEST(test_mixed_index_list()); + CALL_SUBTEST(test_dim_check()); #endif } diff --git a/unsupported/test/cxx11_tensor_ref.cpp b/unsupported/test/cxx11_tensor_ref.cpp index aa369f278..c8f105e3d 100644 --- a/unsupported/test/cxx11_tensor_ref.cpp +++ b/unsupported/test/cxx11_tensor_ref.cpp @@ -196,6 +196,45 @@ static void test_coeff_ref() } +static void test_nested_ops_with_ref() +{ + Tensor<float, 4> t(2, 3, 5, 7); + t.setRandom(); + TensorMap<Tensor<const float, 4> > m(t.data(), 2, 3, 5, 7); + array<std::pair<ptrdiff_t, ptrdiff_t>, 4> paddings; + paddings[0] = std::make_pair(0, 0); + paddings[1] = std::make_pair(2, 1); + paddings[2] = std::make_pair(3, 4); + paddings[3] = std::make_pair(0, 0); + DSizes<Eigen::DenseIndex, 4> shuffle_dims(0, 1, 2, 3); + TensorRef<Tensor<const float, 4> > ref(m.pad(paddings)); + array<std::pair<ptrdiff_t, ptrdiff_t>, 4> trivial; + trivial[0] = std::make_pair(0, 0); + trivial[1] = std::make_pair(0, 0); + trivial[2] = std::make_pair(0, 0); + trivial[3] = std::make_pair(0, 0); + Tensor<float, 4> padded = ref.shuffle(shuffle_dims).pad(trivial); + VERIFY_IS_EQUAL(padded.dimension(0), 2+0); + VERIFY_IS_EQUAL(padded.dimension(1), 3+3); + VERIFY_IS_EQUAL(padded.dimension(2), 5+7); + VERIFY_IS_EQUAL(padded.dimension(3), 7+0); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 6; ++j) { + for (int k = 0; k < 12; ++k) { + for (int l = 0; l < 7; ++l) { + if (j >= 2 && j < 5 && k >= 3 && k < 8) { + VERIFY_IS_EQUAL(padded(i,j,k,l), t(i,j-2,k-3,l)); + } else { + VERIFY_IS_EQUAL(padded(i,j,k,l), 0.0f); + } + } + } + } + } +} + + void test_cxx11_tensor_ref() { CALL_SUBTEST(test_simple_lvalue_ref()); @@ -205,4 +244,5 @@ void test_cxx11_tensor_ref() CALL_SUBTEST(test_ref_of_ref()); CALL_SUBTEST(test_ref_in_expr()); CALL_SUBTEST(test_coeff_ref()); + CALL_SUBTEST(test_nested_ops_with_ref()); } |