| author | Benoit Steiner <benoit.steiner.goog@gmail.com> | 2014-08-19 16:57:10 -0700 |
|---|---|---|
| committer | Benoit Steiner <benoit.steiner.goog@gmail.com> | 2014-08-19 16:57:10 -0700 |
| commit | 9ac3c821ea3b956634116bcdf80bfab7d9a00d91 (patch) | |
| tree | 110aada99764247e6ca34a5e1630e48a8becef8b /unsupported/Eigen/CXX11/src | |
| parent | 33c702c79fe227a5b22229c26af276d359a6cb1d (diff) | |
Improved the speed of convolutions when running on CUDA devices
Diffstat (limited to 'unsupported/Eigen/CXX11/src')
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 632 |
1 file changed, 622 insertions(+), 10 deletions(-)
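For context, the code path this commit optimizes is reached through the Tensor module's public `convolve()` API; the GPU evaluator added below is selected when such an expression is evaluated on a `GpuDevice`. A minimal CPU-side sketch (illustrative only, not part of the commit; the tensor sizes and the `main` wrapper are made up for the example):

```cpp
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // 3D input convolved with a 2D kernel along dimensions 0 and 1.
  Eigen::Tensor<float, 3> input(20, 30, 50);
  Eigen::Tensor<float, 2> kernel(3, 3);
  input.setRandom();
  kernel.setRandom();

  Eigen::array<ptrdiff_t, 2> dims;
  dims[0] = 0;
  dims[1] = 1;

  // Each convolved dimension shrinks by kernel_dim - 1, matching the
  // result_dim = input_dim - kernel_dim + 1 computation in the evaluators.
  Eigen::Tensor<float, 3> result(18, 28, 50);
  result = input.convolve(kernel, dims);

  // On a CUDA build, assigning the same expression through
  //   result.device(gpu_device) = input.convolve(kernel, dims);
  // dispatches to the GPU evaluator introduced in this patch.
  return 0;
}
```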
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 4158271c3..7d0a21c3b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -20,6 +20,126 @@ namespace Eigen {
   *
   */
 namespace internal {
+
+
+template <typename Index, typename InputDims, size_t NumKernelDims> class IndexMapper {
+ public:
+  IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
+              const array<Index, NumKernelDims>& indices) {
+
+    array<Index, NumDims> dimensions = input_dims;
+    for (int i = 0; i < NumKernelDims; ++i) {
+      const Index index = indices[i];
+      const Index input_dim = input_dims[index];
+      const Index kernel_dim = kernel_dims[i];
+      const Index result_dim = input_dim - kernel_dim + 1;
+      dimensions[index] = result_dim;
+    }
+
+    array<Index, NumDims> inputStrides;
+    array<Index, NumDims> outputStrides;
+    for (int i = 0; i < NumDims; ++i) {
+      if (i > 0) {
+        inputStrides[i] = inputStrides[i-1] * input_dims[i-1];
+        outputStrides[i] = outputStrides[i-1] * dimensions[i-1];
+      } else {
+        inputStrides[0] = 1;
+        outputStrides[0] = 1;
+      }
+    }
+
+    array<Index, NumDims> cudaInputDimensions;
+    array<Index, NumDims> cudaOutputDimensions;
+    array<Index, NumDims> tmp = dimensions;
+    array<Index, NumDims> ordering;
+    for (int i = 0; i < NumKernelDims; ++i) {
+      ordering[i] = indices[i];
+      tmp[indices[i]] = -1;
+      cudaInputDimensions[i] = input_dims[ordering[i]];
+      cudaOutputDimensions[i] = dimensions[ordering[i]];
+    }
+    int written = NumKernelDims;
+    for (int i = 0; i < NumDims; ++i) {
+      if (tmp[i] >= 0) {
+        ordering[written] = i;
+        cudaInputDimensions[written] = input_dims[i];
+        cudaOutputDimensions[written] = dimensions[i];
+        ++written;
+      }
+    }
+
+    for (int i = 0; i < NumDims; ++i) {
+      m_inputStrides[i] = inputStrides[ordering[i]];
+      m_outputStrides[i] = outputStrides[ordering[i]];
+    }
+
+    for (int i = 0; i < NumDims; ++i) {
+      if (i > NumKernelDims) {
+        m_cudaInputStrides[i] = m_cudaInputStrides[i-1] * cudaInputDimensions[i-1];
+        m_cudaOutputStrides[i] = m_cudaOutputStrides[i-1] * cudaOutputDimensions[i-1];
+      } else {
+        m_cudaInputStrides[i] = 1;
+        m_cudaOutputStrides[i] = 1;
+      }
+    }
+  }
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const {
+    Index inputIndex = 0;
+    for (int d = NumDims - 1; d > NumKernelDims; --d) {
+      const Index idx = p / m_cudaInputStrides[d];
+      inputIndex += idx * m_inputStrides[d];
+      p -= idx * m_cudaInputStrides[d];
+    }
+    inputIndex += p * m_inputStrides[NumKernelDims];
+    return inputIndex;
+  }
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const {
+    Index outputIndex = 0;
+    for (int d = NumDims - 1; d > NumKernelDims; --d) {
+      const Index idx = p / m_cudaOutputStrides[d];
+      outputIndex += idx * m_outputStrides[d];
+      p -= idx * m_cudaOutputStrides[d];
+    }
+    outputIndex += p * m_outputStrides[NumKernelDims];
+    return outputIndex;
+  }
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const {
+    return i * m_inputStrides[0];
+  }
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const {
+    return i * m_outputStrides[0];
+  }
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const {
+    return i * m_inputStrides[0] + j*m_inputStrides[1];
+  }
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const {
+    return i * m_outputStrides[0] + j * m_outputStrides[1];
+  }
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
+    return i * m_inputStrides[0] + j*m_inputStrides[1] + k*m_inputStrides[2];
+  }
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
+    return i * m_outputStrides[0] + j*m_outputStrides[1] + k*m_outputStrides[2];
+  }
+
+ private:
+  static const size_t NumDims = internal::array_size<InputDims>::value;
+  array<Index, NumDims> m_inputStrides;
+  array<Index, NumDims> m_outputStrides;
+  array<Index, NumDims> m_cudaInputStrides;
+  array<Index, NumDims> m_cudaOutputStrides;
+};
+
+
+
 template<typename Dimensions, typename InputXprType, typename KernelXprType>
 struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
 {
@@ -75,15 +195,15 @@ class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, Input
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
       : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}
 
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const Indices& indices() const { return m_indices; }
 
     /** \returns the nested expressions */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const typename internal::remove_all<typename InputXprType::Nested>::type&
     inputExpression() const { return m_input_xpr; }
 
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const typename internal::remove_all<typename KernelXprType::Nested>::type&
     kernelExpression() const { return m_kernel_xpr; }
 
@@ -99,8 +219,8 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
 {
   typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
 
-  static const int NumDims = TensorEvaluator<InputArgType, Device>::Dimensions::count;
-  static const int KernelDims = internal::array_size<Indices>::value;
+  static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
+  static const int NumKernelDims = internal::array_size<Indices>::value;
   typedef typename XprType::Index Index;
   typedef DSizes<Index, NumDims> Dimensions;
 
@@ -111,7 +231,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
   };
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_dimensions(op.inputExpression().dimensions())
+      : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device)
   {
     const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
     const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
@@ -124,7 +244,8 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
       }
     }
 
-    for (int i = 0; i < KernelDims; ++i) {
+    m_dimensions = m_inputImpl.dimensions();
+    for (int i = 0; i < NumKernelDims; ++i) {
       const Index index = op.indices()[i];
       const Index input_dim = input_dims[index];
       const Index kernel_dim = kernel_dims[i];
@@ -148,6 +269,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
     }
   }
 
+  typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename XprType::PacketReturnType PacketReturnType;
 
@@ -195,7 +317,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
     for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
       const Index input = firstIndex + j * m_indexStride[DimIndex];
       const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
-      if (DimIndex < KernelDims-1) {
+      if (DimIndex < NumKernelDims-1) {
         convolve(input, kernel, DimIndex+1, accum);
       } else {
 
@@ -207,17 +329,507 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
   Scalar* data() const { return NULL; }
 
  private:
+  // No copy, no assignment
+  TensorEvaluator(const TensorEvaluator&);
+  TensorEvaluator& operator = (const TensorEvaluator&);
+
   array<Index, NumDims> m_inputStride;
   array<Index, NumDims> m_outputStride;
-  array<Index, KernelDims> m_indexStride;
-  array<Index, KernelDims> m_kernelStride;
+  array<Index, NumKernelDims> m_indexStride;
+  array<Index, NumKernelDims> m_kernelStride;
   TensorEvaluator<InputArgType, Device> m_inputImpl;
   TensorEvaluator<KernelArgType, Device> m_kernelImpl;
   Dimensions m_dimensions;
 };
+
+
+// Use an optimized implementation of the evaluation code for GPUs whenever possible.
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+
+template <int StaticKernelSize>
+struct GetKernelSize {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const {
+    return StaticKernelSize;
+  }
+};
+template <>
+struct GetKernelSize<Eigen::Dynamic> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const {
+    return kernelSize;
+  }
+};
+
+
+
+
+template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSize>
+__global__ void EigenConvolutionKernel1D(InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 1> indexMapper, const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int kernelSize, float* buffer) {
+  extern __shared__ float s[];
+
+  const int first_x = blockIdx.x * maxX;
+  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
+  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
+  const int num_x_output = last_x - first_x + 1;
+
+  const int first_plane = blockIdx.y * blockDim.y;
+  const int plane_stride = blockDim.y * gridDim.y;
+
+  for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
+    // Load inputs to shared memory
+    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
+    const int plane_kernel_offset = threadIdx.y * num_x_input;
+    #pragma unroll
+    for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
+      const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x);
+      s[i + plane_kernel_offset] = eval.coeff(tensor_index);
+    }
+
+    __syncthreads();
+
+    // Compute the convolution
+    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
+
+    #pragma unroll
+    for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
+      const int kernel_offset = plane_kernel_offset + i;
+      float result = 0.0f;
+      #pragma unroll
+      for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
+        result += s[k + kernel_offset] * kernel[k];
+      }
+      const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x);
+      buffer[tensor_index] = result;
+    }
+    __syncthreads();
+  }
+};
+
+
+template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSizeX, int StaticKernelSizeY>
+__global__ void EigenConvolutionKernel2D(InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 2> indexMapper, const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int numY, const int maxY, const int kernelSizeX, const int kernelSizeY, float* buffer) {
+  extern __shared__ float s[];
+
+  const int first_x = blockIdx.x * maxX;
+  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
+  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
+  const int num_x_output = last_x - first_x + 1;
+
+  const int first_y = blockIdx.y * maxY;
+  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
+  const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
+  const int num_y_output = last_y - first_y + 1;
+
+  const int first_plane = blockIdx.z * blockDim.z;
+  const int plane_stride = blockDim.z * gridDim.z;
+
+  for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
+
+    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
+    const int plane_kernel_offset = threadIdx.z * num_y_input;
+
+    // Load inputs to shared memory
+    #pragma unroll
+    for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
+      const int input_offset = num_x_input * (j + plane_kernel_offset);
+      #pragma unroll
+      for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
+        const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y);
+        s[i + input_offset] = eval.coeff(tensor_index);
+      }
+    }
+
+    __syncthreads();
+
+    // Convolution
+    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
+
+    #pragma unroll
+    for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
+      #pragma unroll
+      for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
+        float result = 0.0f;
+        #pragma unroll
+        for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
+          const int kernel_offset = kernelSizeX * l;
+          const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
+          #pragma unroll
+          for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
+            result += s[k + input_offset] * kernel[k + kernel_offset];
+          }
+        }
+        const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
+        buffer[tensor_index] = result;
+      }
+    }
+
+    __syncthreads();
+  }
+};
+
+
+template <typename InputEvaluator, typename Index, typename InputDims>
+__global__ void EigenConvolutionKernel3D(InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 3> indexMapper, const float* __restrict kernel, const size_t numPlanes, const size_t numX, const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ, const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, const size_t kernelSizeZ, float* buffer) {
+  extern __shared__ float s[];
+
+  // Load inputs to shared memory
+  const int first_x = blockIdx.x * maxX;
+  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
+  const int num_x_input = last_x - first_x + kernelSizeX;
+
+  const int first_y = blockIdx.y * maxY;
+  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
+  const int num_y_input = last_y - first_y + kernelSizeY;
+
+  const int first_z = blockIdx.z * maxZ;
+  const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
+  const int num_z_input = last_z - first_z + kernelSizeZ;
+
+  for (int p = 0; p < numPlanes; ++p) {
+
+    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
+    const int plane_kernel_offset = 0;
+
+    for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
+      for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
+        for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
+          const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
+          s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
+        }
+      }
+    }
+
+    __syncthreads();
+
+    // Convolution
+    const int num_z_output = last_z - first_z + 1;
+    const int num_y_output = last_y - first_y + 1;
+    const int num_x_output = last_x - first_x + 1;
+    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
+
+    for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
+      for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
+        for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
+          float result = 0.0f;
+          for (int n = 0; n < kernelSizeZ; ++n) {
+            for (int m = 0; m < kernelSizeY; ++m) {
+              for (int l = 0; l < kernelSizeX; ++l) {
+                result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
+              }
+            }
+          }
+          const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
+          buffer[tensor_index] = result;
+        }
+      }
+    }
+    __syncthreads();
+  }
+};
+
+
+
+template<typename Indices, typename InputArgType, typename KernelArgType>
+struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
+{
+  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
+
+  static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
+  static const int NumKernelDims = internal::array_size<Indices>::value;
+  typedef typename XprType::Index Index;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;
+
+  enum {
+    IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
+    PacketAccess = false,
+  };
+
+  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device)
+      : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
+  {
+    const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
+    const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
+
+    m_dimensions = m_inputImpl.dimensions();
+    for (int i = 0; i < NumKernelDims; ++i) {
+      const Index index = op.indices()[i];
+      const Index input_dim = input_dims[index];
+      const Index kernel_dim = kernel_dims[i];
+      const Index result_dim = input_dim - kernel_dim + 1;
+      m_dimensions[index] = result_dim;
+    }
+  }
+
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename XprType::PacketReturnType PacketReturnType;
+  typedef typename InputArgType::Scalar Scalar;
+
+  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+    preloadKernel();
+    m_inputImpl.evalSubExprsIfNeeded(NULL);
+    if (data) {
+      executeEval(data);
+      return false;
+    } else {
+      m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
+      executeEval(m_buf);
+      return true;
+    }
+  }
+
+  EIGEN_STRONG_INLINE void cleanup() {
+    m_inputImpl.cleanup();
+    if (m_buf) {
+      m_device.deallocate(m_buf);
+      m_buf = NULL;
+    }
+    if (m_local_kernel) {
+      m_device.deallocate((void*)m_kernel);
+      m_local_kernel = false;
+    }
+    m_kernel = NULL;
+  }
+
+  EIGEN_STRONG_INLINE void preloadKernel() {
+    // Don't make a local copy of the kernel unless we have to (i.e. it's an
+    // expression that needs to be evaluated)
+    const Scalar* in_place = m_kernelImpl.data();
+    if (in_place) {
+      m_kernel = in_place;
+      m_local_kernel = false;
+    } else {
+      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
+      Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
+      typedef TensorEvalToOp<const KernelArgType> EvalTo;
+      EvalTo evalToTmp(local, m_kernelArg);
+      internal::TensorExecutor<const EvalTo, GpuDevice, TensorEvaluator<KernelArgType, GpuDevice>::PacketAccess>::run(evalToTmp, m_device);
+
+      m_kernel = local;
+      m_local_kernel = true;
+    }
+  }
+
+  static unsigned int ceil(unsigned int num, unsigned int denom) {
+    const unsigned int rounded_toward_zero = num / denom;
+    if (num > rounded_toward_zero * denom) {
+      return rounded_toward_zero + 1;
+    }
+    return rounded_toward_zero;
+  }
+
+  void executeEval(Scalar* data) const {
+    typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
+
+    const int maxSharedMem = sharedMemPerBlock();
+    const int maxThreadsPerBlock = maxCudaThreadsPerBlock();
+    const int maxBlocksPerProcessor = maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock;
+    const int numMultiProcessors = getNumCudaMultiProcessors();
+    const int warpSize = 32;
+
+    switch (NumKernelDims) {
+      case 1: {
+        const int kernel_size = m_kernelImpl.dimensions().TotalSize();
+
+        const int numX = dimensions()[m_indices[0]];
+        const int numP = dimensions().TotalSize() / numX;
+
+        int maxX;
+        dim3 block_size;
+        if (m_indices[0] == 0) {
+          // Maximum the reuse
+          const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
+          maxX = (std::min<int>)(inner_dim, numX);
+          const int maxP = (std::min<int>)(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
+          block_size.x = (std::min)(maxThreadsPerBlock, maxX);
+          block_size.y = (std::min<int>)(maxThreadsPerBlock / block_size.x, maxP);
+        }
+        else {
+          // Read as much as possible alongside the inner most dimension, that is the plane
+          const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
+          const int maxP = (std::min<int>)(inner_dim, numP);
+          maxX = (std::min<int>)(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
+
+          block_size.x = (std::min)(warpSize, maxX);
+          block_size.y = (std::min<int>)(maxThreadsPerBlock/block_size.x, maxP);
+        }
+
+        const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
+        assert(shared_mem <= maxSharedMem);
+
+        const int num_x_blocks = ceil(numX, maxX);
+        const int blocksPerProcessor = (std::min)(maxBlocksPerProcessor, maxSharedMem / shared_mem);
+        const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);
+
+        dim3 num_blocks(num_x_blocks, min<int>(num_y_blocks, ceil(numP, block_size.y)));
+
+
+        //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
+
+        const array<Index, 1> indices(m_indices[0]);
+        const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
+        internal::IndexMapper<Index, InputDims, 1> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
+        switch(kernel_size) {
+          case 4: {
+            EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4> <<<num_blocks, block_size, shared_mem, m_device.stream()>>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
+            break;
+          }
+          case 7: {
+            EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7> <<<num_blocks, block_size, shared_mem, m_device.stream()>>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
+            break;
+          }
+          default: {
+            EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Eigen::Dynamic> <<<num_blocks, block_size, shared_mem, m_device.stream()>>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
+          }
+        }
+        cudaError_t error = cudaGetLastError();
+        assert(error == cudaSuccess);
+        break;
+      }
+
+      case 2: {
+        const int kernel_size_x = m_kernelImpl.dimensions()[0];
+        const int kernel_size_y = m_kernelImpl.dimensions()[1];
+
+        const int numX = dimensions()[m_indices[0]];
+        const int numY = dimensions()[m_indices[1]];
+        const int numP = dimensions().TotalSize() / (numX*numY);
+
+        const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));
+
+        // Snap maxX to warp size
+        int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
+        const int maxX = (std::min<int>)(inner_dim, numX);
+        const int maxY = (std::min<int>)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
+        const int maxP = (std::min<int>)(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);
+
+        dim3 block_size;
+        block_size.x = (std::min)(1024, maxX);
+        block_size.y = (std::min<int>)(1024/block_size.x, maxY);
+        block_size.z = (std::min<int>)(1024/(block_size.x*block_size.y), maxP);
+
+        const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
+        assert(shared_mem <= maxSharedMem);
+
+        const int num_x_blocks = ceil(numX, maxX);
+        const int num_y_blocks = ceil(numY, maxY);
+        const int blocksPerProcessor = (std::min)(maxBlocksPerProcessor, maxSharedMem / shared_mem);
+        const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);
+
+        dim3 num_blocks(num_x_blocks, num_y_blocks, min<int>(num_z_blocks, ceil(numP, block_size.z)));
+
+
+        //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
+
+        const array<Index, 2> indices(m_indices[0], m_indices[1]);
+        const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[0], m_kernelImpl.dimensions()[1]);
+        internal::IndexMapper<Index, InputDims, 2> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
+        switch (kernel_size_x) {
+          case 4: {
+            switch (kernel_size_y) {
+              case 7: {
+                EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7> <<<num_blocks, block_size, shared_mem, m_device.stream()>>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
                break;
              }
              default: {
                EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Eigen::Dynamic> <<<num_blocks, block_size, shared_mem, m_device.stream()>>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
                break;
              }
            }
            break;
          }
          case 7: {
            switch (kernel_size_y) {
              case 4: {
                EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4> <<<num_blocks, block_size, shared_mem, m_device.stream()>>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
                break;
              }
              default: {
                EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Eigen::Dynamic> <<<num_blocks, block_size, shared_mem, m_device.stream()>>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
                break;
              }
            }
            break;
          }
          default: {
            EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Eigen::Dynamic, Eigen::Dynamic> <<<num_blocks, block_size, shared_mem, m_device.stream()>>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
            break;
          }
        }
        cudaError_t error = cudaGetLastError();
        assert(error == cudaSuccess);
        break;
      }

      case 3: {
        const int kernel_size_x = m_kernelImpl.dimensions()[0];
        const int kernel_size_y = m_kernelImpl.dimensions()[1];
        const int kernel_size_z = m_kernelImpl.dimensions()[2];

        const int numX = dimensions()[m_indices[0]];
        const int numY = dimensions()[m_indices[1]];
        const int numZ = dimensions()[m_indices[2]];
        const int numP = dimensions().TotalSize() / (numX*numY*numZ);

        const int maxX = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
        const int maxY = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
        const int maxZ = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));

        dim3 block_size;
        block_size.x = (std::min)(32, maxX);
        block_size.y = (std::min)(32, maxY);
        block_size.z = (std::min<int>)(1024/(block_size.x*block_size.y), maxZ);
        dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));

        const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
        assert(shared_mem <= maxSharedMem);

        //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
        const array<Index, 3> indices(m_indices[0], m_indices[1], m_indices[2]);
        const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[0], m_kernelImpl.dimensions()[1], m_kernelImpl.dimensions()[2]);
        internal::IndexMapper<Index, InputDims, 3> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);

        EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims> <<<num_blocks, block_size, shared_mem, m_device.stream()>>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
        cudaError_t error = cudaGetLastError();
        assert(error == cudaSuccess);
        break;
      }

      default: {
        assert(false && "not supported yet");
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    assert(m_buf);
    assert(index < m_dimensions.TotalSize());
    return m_buf[index];
  }

 private:
  // No assignment (copies are needed by the kernels)
  TensorEvaluator& operator = (const TensorEvaluator&);

  TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
  TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
  KernelArgType m_kernelArg;
  Indices m_indices;
  Dimensions m_dimensions;
  Scalar* m_buf;
  const Scalar* m_kernel;
  bool m_local_kernel;

  const GpuDevice& m_device;
};
#endif


 } // end namespace Eigen

 #endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
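The grid sizing in `executeEval()` above repeatedly uses an integer ceiling division (the static `ceil()` helper) to work out how many blocks are needed to cover each dimension. An equivalent one-line formulation, as a hedged sketch (the name `ceil_div` is illustrative, not from the patch):

```cpp
// Rounded-up integer division: how many tiles of size `denom` are needed to
// cover `num` elements, e.g. num_x_blocks = ceil_div(numX, maxX).
static unsigned int ceil_div(unsigned int num, unsigned int denom) {
  return (num + denom - 1) / denom;
}
```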