diff options
author | Benoit Steiner <benoit.steiner.goog@gmail.com> | 2014-09-03 11:38:13 -0700 |
---|---|---|
committer | Benoit Steiner <benoit.steiner.goog@gmail.com> | 2014-09-03 11:38:13 -0700 |
commit | b24fe22b1a4518f27ca064d496bfdb6c96d973ab (patch) | |
tree | 78a99bc733484a82c430d7724bdfafe026630a0e /unsupported/Eigen/CXX11/src/Tensor | |
parent | 2959045f2fe111f93b23517fd6f7afe49720a290 (diff) |
Improved the performance of the tensor convolution code by a factor of about 4.
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor')
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 139 | ||||
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 7 |
2 files changed, 107 insertions, 39 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 7d0a21c3b..4a5fd9c79 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -226,22 +226,18 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr enum { IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned, - PacketAccess = /*TensorEvaluator<InputArgType>::PacketAccess & TensorEvaluator<KernelArgType>::PacketAccess */ - false, + PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess, }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device) + : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernel(NULL), m_kernelArg(op.kernelExpression()), m_local_kernel(false), m_device(device) { const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions(); const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions(); - for (int i = 0; i < NumDims; ++i) { - if (i > 0) { - m_inputStride[i] = m_inputStride[i-1] * input_dims[i-1]; - } else { - m_inputStride[0] = 1; - } + m_inputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStride[i] = m_inputStride[i-1] * input_dims[i-1]; } m_dimensions = m_inputImpl.dimensions(); @@ -251,7 +247,6 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr const Index kernel_dim = kernel_dims[i]; const Index result_dim = input_dim - kernel_dim + 1; m_dimensions[index] = result_dim; - if (i > 0) { m_kernelStride[i] = m_kernelStride[i-1] * kernel_dims[i-1]; } else { @@ -260,16 +255,12 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr m_indexStride[i] = m_inputStride[index]; } - for (int i = 0; i < NumDims; ++i) { - if (i > 0) { - m_outputStride[i] = m_outputStride[i-1] * m_dimensions[i-1]; - } else { - m_outputStride[0] = 1; - } + m_outputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStride[i] = m_outputStride[i-1] * m_dimensions[i-1]; } } - typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; @@ -278,57 +269,126 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { m_inputImpl.evalSubExprsIfNeeded(NULL); - m_kernelImpl.evalSubExprsIfNeeded(NULL); + preloadKernel(); return true; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_inputImpl.cleanup(); - m_kernelImpl.cleanup(); + if (m_local_kernel) { + m_device.deallocate((void*)m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; } - void evalTo(typename XprType::Scalar* buffer) const { + void evalTo(typename XprType::Scalar* buffer) { + evalSubExprsIfNeeded(NULL); for (int i = 0; i < dimensions().TotalSize(); ++i) { buffer[i] += coeff(i); } + cleanup(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - Index startInput = 0; - for (int i = NumDims - 1; i >= 0; --i) { - const Index idx = index / m_outputStride[i]; - startInput += idx * m_inputStride[i]; - index -= idx * m_outputStride[i]; - } - CoeffReturnType result = CoeffReturnType(0); - convolve(startInput, 0, 0, result); + convolve(firstInput(index), 0, NumKernelDims-1, result); return result; } - /* TODO: vectorization template<int LoadMode> - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const { - assert(false); - }*/ + const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; + Index indices[2] = {index, index+PacketSize-1}; + Index startInputs[2] = {0, 0}; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStride[i]; + const Index idx1 = indices[1] / m_outputStride[i]; + startInputs[0] += idx0 * m_inputStride[i]; + startInputs[1] += idx1 * m_inputStride[i]; + indices[0] -= idx0 * m_outputStride[i]; + indices[1] -= idx1 * m_outputStride[i]; + } + startInputs[0] += indices[0]; + startInputs[1] += indices[1]; + + if (startInputs[1]-startInputs[0] == PacketSize-1) { + PacketReturnType result = internal::pset1<PacketReturnType>(0); + convolvePacket(startInputs[0], 0, NumKernelDims-1, result); + return result; + } else { + EIGEN_ALIGN_DEFAULT Scalar data[PacketSize]; + data[0] = Scalar(0); + convolve(startInputs[0], 0, NumKernelDims-1, data[0]); + for (int i = 1; i < PacketSize-1; ++i) { + data[i] = Scalar(0); + convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]); + } + data[PacketSize-1] = Scalar(0); + convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]); + return internal::pload<PacketReturnType>(data); + } + } + + Scalar* data() const { return NULL; } + + private: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { + Index startInput = 0; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStride[i]; + startInput += idx * m_inputStride[i]; + index -= idx * m_outputStride[i]; + } + startInput += index; + return startInput; + } EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const { for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { const Index input = firstIndex + j * m_indexStride[DimIndex]; const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; - if (DimIndex < NumKernelDims-1) { - convolve(input, kernel, DimIndex+1, accum); + if (DimIndex > 0) { + convolve(input, kernel, DimIndex-1, accum); } else { + accum += m_inputImpl.coeff(input) * m_kernel[kernel]; + } + } + } - accum += m_inputImpl.coeff(input) * m_kernelImpl.coeff(kernel); + template <typename Packet> + EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const { + for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { + const Index input = firstIndex + j * m_indexStride[DimIndex]; + const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; + if (DimIndex > 0) { + convolvePacket(input, kernel, DimIndex-1, accum); + } else { + accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum); } } } - Scalar* data() const { return NULL; } + EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. it's an + // expression that needs to be evaluated) + const Scalar* in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + Scalar* local = (Scalar*)m_device.allocate(kernel_sz); + typedef TensorEvalToOp<const KernelArgType> EvalTo; + EvalTo evalToTmp(local, m_kernelArg); + internal::TensorExecutor<const EvalTo, Device, TensorEvaluator<KernelArgType, Device>::PacketAccess>::run(evalToTmp, m_device); + + m_kernel = local; + m_local_kernel = true; + } + } - private: // No copy, no assignment TensorEvaluator(const TensorEvaluator&); TensorEvaluator& operator = (const TensorEvaluator&); @@ -341,6 +401,11 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr TensorEvaluator<InputArgType, Device> m_inputImpl; TensorEvaluator<KernelArgType, Device> m_kernelImpl; Dimensions m_dimensions; + + KernelArgType m_kernelArg; + const Scalar* m_kernel; + bool m_local_kernel; + const Device& m_device; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index db716a80e..587cbd5ca 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -108,8 +108,9 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device> EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { - m_impl.evalSubExprsIfNeeded(); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { @@ -134,6 +135,8 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device> return internal::ploadt<Packet, LoadMode>(m_buffer + index); } + Scalar* data() const { return NULL; } + private: TensorEvaluator<ArgType, Device> m_impl; const Device& m_device; |