aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar Benoit Steiner <benoit.steiner.goog@gmail.com>2014-09-03 11:38:13 -0700
committerGravatar Benoit Steiner <benoit.steiner.goog@gmail.com>2014-09-03 11:38:13 -0700
commitb24fe22b1a4518f27ca064d496bfdb6c96d973ab (patch)
tree78a99bc733484a82c430d7724bdfafe026630a0e
parent2959045f2fe111f93b23517fd6f7afe49720a290 (diff)
Improved the performance of the tensor convolution code by a factor of about 4.
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h139
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h7
2 files changed, 107 insertions, 39 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 7d0a21c3b..4a5fd9c79 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -226,22 +226,18 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
enum {
IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
- PacketAccess = /*TensorEvaluator<InputArgType>::PacketAccess & TensorEvaluator<KernelArgType>::PacketAccess */
- false,
+ PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device)
+ : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernel(NULL), m_kernelArg(op.kernelExpression()), m_local_kernel(false), m_device(device)
{
const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
- for (int i = 0; i < NumDims; ++i) {
- if (i > 0) {
- m_inputStride[i] = m_inputStride[i-1] * input_dims[i-1];
- } else {
- m_inputStride[0] = 1;
- }
+ m_inputStride[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ m_inputStride[i] = m_inputStride[i-1] * input_dims[i-1];
}
m_dimensions = m_inputImpl.dimensions();
@@ -251,7 +247,6 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
const Index kernel_dim = kernel_dims[i];
const Index result_dim = input_dim - kernel_dim + 1;
m_dimensions[index] = result_dim;
-
if (i > 0) {
m_kernelStride[i] = m_kernelStride[i-1] * kernel_dims[i-1];
} else {
@@ -260,16 +255,12 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
m_indexStride[i] = m_inputStride[index];
}
- for (int i = 0; i < NumDims; ++i) {
- if (i > 0) {
- m_outputStride[i] = m_outputStride[i-1] * m_dimensions[i-1];
- } else {
- m_outputStride[0] = 1;
- }
+ m_outputStride[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ m_outputStride[i] = m_outputStride[i-1] * m_dimensions[i-1];
}
}
-
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
@@ -278,57 +269,126 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
m_inputImpl.evalSubExprsIfNeeded(NULL);
- m_kernelImpl.evalSubExprsIfNeeded(NULL);
+ preloadKernel();
return true;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
m_inputImpl.cleanup();
- m_kernelImpl.cleanup();
+ if (m_local_kernel) {
+ m_device.deallocate((void*)m_kernel);
+ m_local_kernel = false;
+ }
+ m_kernel = NULL;
}
- void evalTo(typename XprType::Scalar* buffer) const {
+ void evalTo(typename XprType::Scalar* buffer) {
+ evalSubExprsIfNeeded(NULL);
for (int i = 0; i < dimensions().TotalSize(); ++i) {
buffer[i] += coeff(i);
}
+ cleanup();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
- Index startInput = 0;
- for (int i = NumDims - 1; i >= 0; --i) {
- const Index idx = index / m_outputStride[i];
- startInput += idx * m_inputStride[i];
- index -= idx * m_outputStride[i];
- }
-
CoeffReturnType result = CoeffReturnType(0);
- convolve(startInput, 0, 0, result);
+ convolve(firstInput(index), 0, NumKernelDims-1, result);
return result;
}
- /* TODO: vectorization
template<int LoadMode>
- EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
+ EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
{
- assert(false);
- }*/
+ const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+ Index indices[2] = {index, index+PacketSize-1};
+ Index startInputs[2] = {0, 0};
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx0 = indices[0] / m_outputStride[i];
+ const Index idx1 = indices[1] / m_outputStride[i];
+ startInputs[0] += idx0 * m_inputStride[i];
+ startInputs[1] += idx1 * m_inputStride[i];
+ indices[0] -= idx0 * m_outputStride[i];
+ indices[1] -= idx1 * m_outputStride[i];
+ }
+ startInputs[0] += indices[0];
+ startInputs[1] += indices[1];
+
+ if (startInputs[1]-startInputs[0] == PacketSize-1) {
+ PacketReturnType result = internal::pset1<PacketReturnType>(0);
+ convolvePacket(startInputs[0], 0, NumKernelDims-1, result);
+ return result;
+ } else {
+ EIGEN_ALIGN_DEFAULT Scalar data[PacketSize];
+ data[0] = Scalar(0);
+ convolve(startInputs[0], 0, NumKernelDims-1, data[0]);
+ for (int i = 1; i < PacketSize-1; ++i) {
+ data[i] = Scalar(0);
+ convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]);
+ }
+ data[PacketSize-1] = Scalar(0);
+ convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]);
+ return internal::pload<PacketReturnType>(data);
+ }
+ }
+
+ Scalar* data() const { return NULL; }
+
+ private:
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
+ Index startInput = 0;
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx = index / m_outputStride[i];
+ startInput += idx * m_inputStride[i];
+ index -= idx * m_outputStride[i];
+ }
+ startInput += index;
+ return startInput;
+ }
EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
const Index input = firstIndex + j * m_indexStride[DimIndex];
const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
- if (DimIndex < NumKernelDims-1) {
- convolve(input, kernel, DimIndex+1, accum);
+ if (DimIndex > 0) {
+ convolve(input, kernel, DimIndex-1, accum);
} else {
+ accum += m_inputImpl.coeff(input) * m_kernel[kernel];
+ }
+ }
+ }
- accum += m_inputImpl.coeff(input) * m_kernelImpl.coeff(kernel);
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
+ for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
+ const Index input = firstIndex + j * m_indexStride[DimIndex];
+ const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
+ if (DimIndex > 0) {
+ convolvePacket(input, kernel, DimIndex-1, accum);
+ } else {
+ accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum);
}
}
}
- Scalar* data() const { return NULL; }
+ EIGEN_STRONG_INLINE void preloadKernel() {
+ // Don't make a local copy of the kernel unless we have to (i.e. it's an
+ // expression that needs to be evaluated)
+ const Scalar* in_place = m_kernelImpl.data();
+ if (in_place) {
+ m_kernel = in_place;
+ m_local_kernel = false;
+ } else {
+ size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
+ Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
+ typedef TensorEvalToOp<const KernelArgType> EvalTo;
+ EvalTo evalToTmp(local, m_kernelArg);
+ internal::TensorExecutor<const EvalTo, Device, TensorEvaluator<KernelArgType, Device>::PacketAccess>::run(evalToTmp, m_device);
+
+ m_kernel = local;
+ m_local_kernel = true;
+ }
+ }
- private:
// No copy, no assignment
TensorEvaluator(const TensorEvaluator&);
TensorEvaluator& operator = (const TensorEvaluator&);
@@ -341,6 +401,11 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
TensorEvaluator<InputArgType, Device> m_inputImpl;
TensorEvaluator<KernelArgType, Device> m_kernelImpl;
Dimensions m_dimensions;
+
+ KernelArgType m_kernelArg;
+ const Scalar* m_kernel;
+ bool m_local_kernel;
+ const Device& m_device;
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index db716a80e..587cbd5ca 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -108,8 +108,9 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() {
- m_impl.evalSubExprsIfNeeded();
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ return true;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
@@ -134,6 +135,8 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
return internal::ploadt<Packet, LoadMode>(m_buffer + index);
}
+ Scalar* data() const { return NULL; }
+
private:
TensorEvaluator<ArgType, Device> m_impl;
const Device& m_device;