From 235e83aba608cf3d94b033bfbf551f8c136a3fab Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen <rmlarsen@google.com>
Date: Thu, 14 Apr 2016 13:57:35 -0700
Subject: Eigen cost model part 1. This implements a basic recursive framework
 to estimate the cost of evaluating tensor expressions.

---
 unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h  |   5 +
 unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h  |  25 ++-
 .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h    |  51 ++++--
 .../Eigen/CXX11/src/Tensor/TensorChipping.h        |  63 ++++---
 .../Eigen/CXX11/src/Tensor/TensorConcatenation.h   |  15 ++
 .../Eigen/CXX11/src/Tensor/TensorContraction.h     |   4 +
 .../Eigen/CXX11/src/Tensor/TensorConversion.h      |  17 +-
 .../Eigen/CXX11/src/Tensor/TensorConvolution.h     |  46 +++++-
 .../Eigen/CXX11/src/Tensor/TensorCostModel.h       | 181 +++++++++++++++++++++
 .../Eigen/CXX11/src/Tensor/TensorCustomOp.h        |  25 ++-
 unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h  |  15 +-
 .../Eigen/CXX11/src/Tensor/TensorEvaluator.h       |  36 ++++
 unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h     |  12 +-
 .../Eigen/CXX11/src/Tensor/TensorForcedEval.h      |  12 +-
 .../Eigen/CXX11/src/Tensor/TensorFunctors.h        |  13 ++
 .../Eigen/CXX11/src/Tensor/TensorGenerator.h       |   8 +
 .../Eigen/CXX11/src/Tensor/TensorImagePatch.h      |  31 ++--
 .../Eigen/CXX11/src/Tensor/TensorInflation.h       |  31 ++--
 .../Eigen/CXX11/src/Tensor/TensorLayoutSwap.h      |   4 +
 .../Eigen/CXX11/src/Tensor/TensorMorphing.h        |   9 +
 unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h |  62 +++++--
 unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h   |  30 ++--
 .../Eigen/CXX11/src/Tensor/TensorReduction.h       |  33 ++--
 unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h |  40 +++--
 .../Eigen/CXX11/src/Tensor/TensorShuffling.h       |  36 ++--
 .../Eigen/CXX11/src/Tensor/TensorStriding.h        |  47 +++---
 .../Eigen/CXX11/src/Tensor/TensorVolumePatch.h     |  27 +--
 27 files changed, 693 insertions(+), 185 deletions(-)
 create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
index f1ec04c49..babafe108 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
@@ -112,6 +112,11 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
     return CoeffReturnType(index, m_impl.coeff(index));
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+  costPerCoeff(bool vectorized) const {
+    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, 1);
+  }
+
   EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
 
 protected:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
index 199d2ce41..5abff0800 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
@@ -89,6 +89,12 @@ template<typename LeftArgType, typename RightArgType, typename Device>
 struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
 {
   typedef TensorAssignOp<LeftArgType, RightArgType> XprType;
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
   enum {
     IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned,
@@ -104,12 +110,6 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
     EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
   }
 
-  typedef typename XprType::Index
Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - typedef typename TensorEvaluator::Dimensions Dimensions; - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { // The dimensions of the lhs and the rhs tensors should be equal to prevent @@ -150,6 +150,19 @@ struct TensorEvaluator, Device> return m_leftImpl.template packet(index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + // We assume that evalPacket or evalScalar is called to perform the + // assignment and account for the cost of the write here, but reduce left + // cost by one load because we are using m_leftImpl.coeffRef. + TensorOpCost left = m_leftImpl.costPerCoeff(vectorized); + return m_rightImpl.costPerCoeff(vectorized) + + TensorOpCost( + numext::maxi(0.0, left.bytes_loaded() - sizeof(CoeffReturnType)), + left.bytes_stored(), left.compute_cycles()) + + TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_leftImpl.data(); } private: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index b6e6db12a..c771496e2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -101,6 +101,9 @@ struct TensorEvaluator, Device> typedef DSizes Dimensions; typedef typename XprType::Scalar Scalar; typedef typename TensorEvaluator::Dimensions InputDimensions; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; enum { IsAligned = false, @@ -140,9 +143,6 @@ struct TensorEvaluator, Device> } } - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { @@ -247,9 +247,8 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const { - const int packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); const Index originalIndex = index; @@ -284,12 +283,12 @@ struct TensorEvaluator, Device> // Todo: this could be extended to the second dimension if we're not // broadcasting alongside the first dimension, and so on. 
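+      // For now we only use a true packet load when the whole packet fits
+      // within the innermost input dimension; otherwise the code below falls
+      // back to gathering the packet coefficient by coefficient.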
- if (innermostLoc + packetSize <= m_impl.dimensions()[0]) { + if (innermostLoc + PacketSize <= m_impl.dimensions()[0]) { return m_impl.template packet(inputIndex); } else { - EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; values[0] = m_impl.coeff(inputIndex); - for (int i = 1; i < packetSize; ++i) { + for (int i = 1; i < PacketSize; ++i) { values[i] = coeffColMajor(originalIndex+i); } PacketReturnType rslt = internal::pload(values); @@ -300,9 +299,8 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const { - const int packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); const Index originalIndex = index; @@ -337,12 +335,12 @@ struct TensorEvaluator, Device> // Todo: this could be extended to the second dimension if we're not // broadcasting alongside the first dimension, and so on. - if (innermostLoc + packetSize <= m_impl.dimensions()[NumDims-1]) { + if (innermostLoc + PacketSize <= m_impl.dimensions()[NumDims-1]) { return m_impl.template packet(inputIndex); } else { - EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; values[0] = m_impl.coeff(inputIndex); - for (int i = 1; i < packetSize; ++i) { + for (int i = 1; i < PacketSize; ++i) { values[i] = coeffRowMajor(originalIndex+i); } PacketReturnType rslt = internal::pload(values); @@ -350,6 +348,29 @@ struct TensorEvaluator, Device> } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + double compute_cost = TensorOpCost::AddCost(); + if (NumDims > 0) { + for (int i = NumDims - 1; i > 0; --i) { + compute_cost += TensorOpCost::DivCost(); + if (internal::index_statically_eq()(i, 1)) { + compute_cost += + TensorOpCost::MulCost() + TensorOpCost::AddCost(); + } else { + if (!internal::index_statically_eq()(i, 1)) { + compute_cost += TensorOpCost::MulCost() + + TensorOpCost::ModCost() + + TensorOpCost::AddCost(); + } + } + compute_cost += + TensorOpCost::MulCost() + TensorOpCost::AddCost(); + } + } + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index c21a98fe0..2742dbb95 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -134,6 +134,10 @@ struct TensorEvaluator, Device> typedef typename XprType::Index Index; typedef DSizes Dimensions; typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + enum { // Alignment can't be guaranteed at compile time since it depends on the @@ -180,9 +184,6 @@ struct TensorEvaluator, Device> m_inputOffset = m_stride * op.offset(); } - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { @@ -202,17 +203,16 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == 0) || (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { // m_stride is equal to 1, so let's avoid the integer division. eigen_assert(m_stride == 1); Index inputIndex = index * m_inputStride + m_inputOffset; - EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; - for (int i = 0; i < packetSize; ++i) { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { values[i] = m_impl.coeff(inputIndex); inputIndex += m_inputStride; } @@ -226,13 +226,13 @@ struct TensorEvaluator, Device> } else { const Index idx = index / m_stride; const Index rem = index - idx * m_stride; - if (rem + packetSize <= m_stride) { + if (rem + PacketSize <= m_stride) { Index inputIndex = idx * m_inputStride + m_inputOffset + rem; return m_impl.template packet(inputIndex); } else { // Cross the stride boundary. Fallback to slow path. - EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; - for (int i = 0; i < packetSize; ++i) { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index); ++index; } @@ -242,6 +242,28 @@ struct TensorEvaluator, Device> } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + double cost = 0; + if ((static_cast(Layout) == static_cast(ColMajor) && + m_dim.actualDim() == 0) || + (static_cast(Layout) == static_cast(RowMajor) && + m_dim.actualDim() == NumInputDims - 1)) { + cost += TensorOpCost::MulCost() + TensorOpCost::AddCost(); + } else if ((static_cast(Layout) == static_cast(ColMajor) && + m_dim.actualDim() == NumInputDims - 1) || + (static_cast(Layout) == static_cast(RowMajor) && + m_dim.actualDim() == 0)) { + cost += TensorOpCost::AddCost(); + } else { + cost += 3 * TensorOpCost::MulCost() + TensorOpCost::DivCost() + + 3 * TensorOpCost::AddCost(); + } + + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, cost, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { CoeffReturnType* result = const_cast(m_impl.data()); if (((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumDims) || @@ -298,6 +320,9 @@ struct TensorEvaluator, Device> typedef typename XprType::Index Index; typedef DSizes Dimensions; typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; enum { IsAligned = false, @@ -309,9 +334,6 @@ struct TensorEvaluator, Device> : Base(op, device) { } - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { return this->m_impl.coeffRef(this->srcCoeff(index)); @@ -320,17 +342,16 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - static const int packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) if ((static_cast(this->Layout) == static_cast(ColMajor) && this->m_dim.actualDim() == 0) || (static_cast(this->Layout) == static_cast(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) { // m_stride is equal to 1, so let's avoid the integer division. eigen_assert(this->m_stride == 1); - EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; internal::pstore(values, x); Index inputIndex = index * this->m_inputStride + this->m_inputOffset; - for (int i = 0; i < packetSize; ++i) { + for (int i = 0; i < PacketSize; ++i) { this->m_impl.coeffRef(inputIndex) = values[i]; inputIndex += this->m_inputStride; } @@ -342,14 +363,14 @@ struct TensorEvaluator, Device> } else { const Index idx = index / this->m_stride; const Index rem = index - idx * this->m_stride; - if (rem + packetSize <= this->m_stride) { + if (rem + PacketSize <= this->m_stride) { const Index inputIndex = idx * this->m_inputStride + this->m_inputOffset + rem; this->m_impl.template writePacket(inputIndex, x); } else { // Cross stride boundary. Fallback to slow path. - EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; internal::pstore(values, x); - for (int i = 0; i < packetSize; ++i) { + for (int i = 0; i < PacketSize; ++i) { this->coeffRef(index) = values[i]; ++index; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 7738f18fb..839c6e3e5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -260,6 +260,21 @@ struct TensorEvaluator() + + 2 * TensorOpCost::MulCost() + + TensorOpCost::DivCost() + + TensorOpCost::ModCost()); + const double lhs_size = m_leftImpl.dimensions().TotalSize(); + const double rhs_size = m_rightImpl.dimensions().TotalSize(); + return (lhs_size / (lhs_size + rhs_size)) * + m_leftImpl.costPerCoeff(vectorized) + + (rhs_size / (lhs_size + rhs_size)) * + m_rightImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index f070ba61e..1627d4870 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -440,6 +440,10 @@ struct TensorContractionEvaluatorBase return m_result[index]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return internal::ploadt(m_result + index); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index a96776a77..a2f1f71f5 
100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -177,7 +177,6 @@ template struct ConversionSubExprEval struct TensorEvaluator, Device> @@ -190,6 +189,7 @@ struct TensorEvaluator, Device> typedef typename internal::remove_all::Scalar>::type SrcType; typedef typename PacketType::type PacketReturnType; typedef typename PacketType::type PacketSourceType; + static const int PacketSize = internal::unpacket_traits::size; enum { IsAligned = false, @@ -231,6 +231,21 @@ struct TensorEvaluator, Device> return converter.template packet(index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double cast_cost = TensorOpCost::CastCost(); + if (vectorized) { + const double SrcCoeffRatio = + internal::type_casting_traits::SrcCoeffRatio; + const double TgtCoeffRatio = + internal::type_casting_traits::TgtCoeffRatio; + return m_impl.costPerCoeff(vectorized) * (SrcCoeffRatio / PacketSize) + + TensorOpCost(0, 0, TgtCoeffRatio * (cast_cost / PacketSize)); + } else { + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, cast_cost); + } + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 4fe1fb943..680fc0a68 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -297,6 +297,11 @@ struct TensorEvaluator Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + enum { IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, @@ -367,10 +372,6 @@ struct TensorEvaluator::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { @@ -405,7 +406,6 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const { - const int PacketSize = internal::unpacket_traits::size; Index indices[2] = {index, index+PacketSize-1}; Index startInputs[2] = {0, 0}; if (static_cast(Layout) == static_cast(ColMajor)) { @@ -448,6 +448,23 @@ struct TensorEvaluator() + TensorOpCost::MulCost(); + const double firstIndex_compute_cost = + NumDims * + (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + + TensorOpCost::DivCost()); + return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + + kernel_size * (m_inputImpl.costPerCoeff(vectorized) + + m_kernelImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, convolve_compute_cost, vectorized, + PacketSize)); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } private: @@ -1044,6 +1061,25 @@ struct TensorEvaluator(m_buf+index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost + // model. + const double kernel_size = m_kernelImpl.dimensions().TotalSize(); + // We ignore the use of fused multiply-add. 
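+    // Each kernel tap then contributes one separate add and one multiply per
+    // output coefficient; the total is scaled by kernel_size below.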
+    const double convolve_compute_cost =
+        TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
+    const double firstIndex_compute_cost =
+        NumDims *
+        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
+         TensorOpCost::DivCost<Index>());
+    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
+           kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
+                          m_kernelImpl.costPerCoeff(vectorized) +
+                          TensorOpCost(0, 0, convolve_compute_cost, vectorized,
+                                       PacketSize));
+  }
+
 private:
   // No assignment (copies are needed by the kernels)
   TensorEvaluator& operator = (const TensorEvaluator&);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
new file mode 100644
index 000000000..366352853
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
@@ -0,0 +1,181 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Rasmus Munk Larsen <rmlarsen@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
+#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
+
+#if !defined(EIGEN_USE_GPU)
+#define EIGEN_USE_COST_MODEL
+#endif
+
+namespace Eigen {
+
+/** \class TensorCostModel
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief A cost model used to limit the number of threads used for
+  * evaluating tensor expressions.
+  *
+  */
+
+// Class storing the cost of evaluating a tensor expression in terms of the
+// estimated number of operand bytes loaded, bytes stored, and compute cycles.
+class TensorOpCost {
+ public:
+  // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple
+  // model based on minimal reciprocal throughput numbers from Intel or
+  // Agner Fog's tables would be better than what is there now.
+  template <typename ArgType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int MulCost() {
+    return internal::functor_traits<
+        internal::scalar_product_op<ArgType, ArgType> >::Cost;
+  }
+  template <typename ArgType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int AddCost() {
+    return internal::functor_traits<internal::scalar_sum_op<ArgType> >::Cost;
+  }
+  template <typename ArgType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int DivCost() {
+    return internal::functor_traits<
+        internal::scalar_quotient_op<ArgType, ArgType> >::Cost;
+  }
+  template <typename ArgType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int ModCost() {
+    return internal::functor_traits<internal::scalar_mod_op<ArgType> >::Cost;
+  }
+  template <typename SrcType, typename TargetType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int CastCost() {
+    return internal::functor_traits<
+        internal::scalar_cast_op<SrcType, TargetType> >::Cost;
+  }
+
+  TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
+  TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
+      : bytes_loaded_(bytes_loaded),
+        bytes_stored_(bytes_stored),
+        compute_cycles_(compute_cycles) {}
+
+  TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles,
+               bool vectorized, double packet_size)
+      : bytes_loaded_(bytes_loaded),
+        bytes_stored_(bytes_stored),
+        compute_cycles_(vectorized ? compute_cycles / packet_size
+                                   : compute_cycles) {
+    using std::isfinite;
+    eigen_assert(bytes_loaded >= 0 && (isfinite)(bytes_loaded));
+    eigen_assert(bytes_stored >= 0 && (isfinite)(bytes_stored));
+    eigen_assert(compute_cycles >= 0 && (isfinite)(compute_cycles));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const {
+    return bytes_loaded_;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const {
+    return bytes_stored_;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const {
+    return compute_cycles_;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost(
+      double load_cost, double store_cost, double compute_cost) const {
+    return load_cost * bytes_loaded_ + store_cost * bytes_stored_ +
+           compute_cost * compute_cycles_;
+  }
+
+  // TODO(rmlarsen): Define min in terms of total cost, not elementwise.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMin(
+      const TensorOpCost& rhs) {
+    bytes_loaded_ = numext::mini(bytes_loaded_, rhs.bytes_loaded());
+    bytes_stored_ = numext::mini(bytes_stored_, rhs.bytes_stored());
+    compute_cycles_ = numext::mini(compute_cycles_, rhs.compute_cycles());
+    return *this;
+  }
+
+  // TODO(rmlarsen): Define max in terms of total cost, not elementwise.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMax(
+      const TensorOpCost& rhs) {
+    bytes_loaded_ = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
+    bytes_stored_ = numext::maxi(bytes_stored_, rhs.bytes_stored());
+    compute_cycles_ = numext::maxi(compute_cycles_, rhs.compute_cycles());
+    return *this;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(
+      const TensorOpCost& rhs) {
+    bytes_loaded_ += rhs.bytes_loaded();
+    bytes_stored_ += rhs.bytes_stored();
+    compute_cycles_ += rhs.compute_cycles();
+    return *this;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs) {
+    bytes_loaded_ *= rhs;
+    bytes_stored_ *= rhs;
+    compute_cycles_ *= rhs;
+    return *this;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+(
+      TensorOpCost lhs, const TensorOpCost& rhs) {
+    lhs += rhs;
+    return lhs;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
+      TensorOpCost lhs, double rhs) {
+    lhs *= rhs;
+    return lhs;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
+      double lhs, TensorOpCost rhs) {
+    rhs *= lhs;
+    return rhs;
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) {
+    return os << "[bytes_loaded = " << tc.bytes_loaded()
+              << ", bytes_stored = " << tc.bytes_stored()
+              << ", compute_cycles = " << tc.compute_cycles() << "]";
+  }
+
+ private:
+  double bytes_loaded_;
+  double bytes_stored_;
+  double compute_cycles_;
+};
+
+// TODO(rmlarsen): Implement a policy that chooses an "optimal" number of threads
+// in [1:max_threads] instead of just switching multi-threading off for small
+// work units.
+template <typename Device>
+class TensorCostModel {
+ public:
+  // Costs in device cycles.
+  static const int kLoadCycles = 3;
+  static const int kStoreCycles = 3;
+  // Scaling from Eigen compute cost to device cycles.
+  static const int kDeviceCyclesPerComputeCycle = 1;
+
+  // Implements a simple "binary" policy: Return 1 if total cost is below
+  // kMinParallelCost and max_threads otherwise.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int numThreads(
+      double output_size, const TensorOpCost& cost_per_coeff, int max_threads) {
+    // Compute total cost C in device cycles.
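+    // Hypothetical worked example: a float expression with cost_per_coeff =
+    // TensorOpCost(4, 4, 1) evaluated over output_size = 1e6 coefficients
+    // gives total_cost = 1e6 * (3*4 + 3*4 + 1*1) = 2.5e7 > 1e6, so all
+    // max_threads threads would be used.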
+    const double total_cost =
+        output_size *
+        cost_per_coeff.total_cost(kLoadCycles, kStoreCycles,
+                                  kDeviceCyclesPerComputeCycle);
+    // Smallest work unit to parallelize.
+    const double kMinParallelCost = 1e6;
+    return total_cost < kMinParallelCost ? 1 : max_threads;
+  }
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
index b58e513b4..e020d076f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
@@ -83,8 +83,10 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
   typedef typename internal::traits<ArgType>::Index Index;
   static const int NumDims = internal::traits<ArgType>::NumDimensions;
   typedef DSizes<Index, NumDims> Dimensions;
-  typedef
-      typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
   enum {
     IsAligned = false,
@@ -101,9 +103,6 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
     m_dimensions = op.func().dimensions(op.expression());
   }
 
-  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
-  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
@@ -134,6 +133,11 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
     return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    // TODO(rmlarsen): Extend CustomOp API to return its cost estimate.
+    return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
+  }
+
   EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; }
 
 protected:
@@ -236,6 +240,9 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
   static const int NumDims = internal::traits<XprType>::NumDimensions;
   typedef DSizes<Index, NumDims> Dimensions;
   typedef typename XprType::Scalar Scalar;
+  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
   enum {
     IsAligned = false,
@@ -252,9 +259,6 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
     m_dimensions = op.func().dimensions(op.lhsExpression(), op.rhsExpression());
   }
 
-  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
-  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
@@ -284,6 +288,11 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
     return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    // TODO(rmlarsen): Extend CustomOp API to return its cost estimate.
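+    // Until then, model the precomputed result as a plain buffer read: one
+    // load of sizeof(CoeffReturnType) bytes per coefficient and no compute.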
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index 1fb27a65b..893542d29 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -88,6 +88,10 @@ struct TensorEvaluator, Device> typedef TensorEvalToOp XprType; typedef typename ArgType::Scalar Scalar; typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename XprType::Index Index; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; enum { IsAligned = true, @@ -104,10 +108,6 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() { } - typedef typename XprType::Index Index; - typedef typename internal::remove_const::type CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* scalar) { @@ -138,6 +138,13 @@ struct TensorEvaluator, Device> return internal::ploadt(m_buffer + index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // We assume that evalPacket or evalScalar is called to perform the + // assignment and account for the cost of the write here. + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_buffer; } private: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 947a8ed88..f1f9a90df 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -101,6 +101,11 @@ struct TensorEvaluator } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, + internal::unpacket_traits::size); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; } protected: @@ -219,6 +224,7 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename internal::traits::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } @@ -237,6 +243,12 @@ struct TensorEvaluator, Device> return m_functor.template packetOp(index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, + internal::unpacket_traits::size); + } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } private: @@ -270,6 +282,7 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename internal::traits::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const 
Dimensions& dimensions() const { return m_argImpl.dimensions(); } @@ -293,6 +306,12 @@ struct TensorEvaluator, Device> return m_functor.packetOp(m_argImpl.template packet(index)); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double functor_cost = internal::functor_traits::Cost; + return m_argImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } private: @@ -330,6 +349,7 @@ struct TensorEvaluator::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const @@ -358,6 +378,14 @@ struct TensorEvaluator(index), m_rightImpl.template packet(index)); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double functor_cost = internal::functor_traits::Cost; + return m_leftImpl.costPerCoeff(vectorized) + + m_rightImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } private: @@ -398,6 +426,7 @@ struct TensorEvaluator typedef typename XprType::Index Index; typedef typename internal::traits::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const @@ -435,6 +464,13 @@ struct TensorEvaluator m_elseImpl.template packet(index)); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + return m_condImpl.costPerCoeff(vectorized) + + m_thenImpl.costPerCoeff(vectorized) + .cwiseMax(m_elseImpl.costPerCoeff(vectorized)); + } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } private: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index d6db45ade..ece2ed91b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -129,6 +129,7 @@ struct TensorEvaluator, D typedef typename internal::conditional::type OutputScalar; typedef OutputScalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; enum { IsAligned = false, @@ -176,7 +177,6 @@ struct TensorEvaluator, D } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { if (m_data) { m_device.deallocate(m_data); @@ -189,11 +189,17 @@ struct TensorEvaluator, D return m_data[index]; } - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const { + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType + packet(Index index) const { return internal::ploadt(m_data + index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index 14f480901..d2b0b30a1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ 
b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -83,6 +83,10 @@ struct TensorEvaluator, Device> typedef TensorForcedEvalOp XprType; typedef typename ArgType::Scalar Scalar; typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; enum { IsAligned = true, @@ -95,10 +99,6 @@ struct TensorEvaluator, Device> : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) { } - typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { @@ -132,6 +132,10 @@ struct TensorEvaluator, Device> return internal::ploadt(m_buffer + index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return m_buffer; } private: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index b7c13f67f..ccaa757d1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -594,6 +594,8 @@ template <> class UniformRandomGenerator > { template struct functor_traits > { enum { + // Rough estimate. + Cost = 100 * NumTraits::MulCost, PacketAccess = UniformRandomGenerator::PacketAccess }; }; @@ -774,6 +776,8 @@ template class NormalRandomGenerator { template struct functor_traits > { enum { + // Rough estimate. + Cost = 100 * NumTraits::MulCost, PacketAccess = NormalRandomGenerator::PacketAccess }; }; @@ -807,6 +811,15 @@ class GaussianGenerator { array m_two_sigmas; }; +template +struct functor_traits > { + enum { + Cost = NumDims * (2 * NumTraits::AddCost + NumTraits::MulCost + + functor_traits >::Cost) + + functor_traits >::Cost, + PacketAccess = GaussianGenerator::PacketAccess + }; +}; } // end namespace internal } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index e4154bd0b..9e5791bd3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -145,6 +145,14 @@ struct TensorEvaluator, Device> return rslt; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): This is just a placeholder. Define interface to make + // generators return their cost. 
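+    // The placeholder charges one add and one multiply per generated
+    // coefficient and ignores the cost of the generator functor itself.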
+ return TensorOpCost(0, 0, TensorOpCost::AddCost() + + TensorOpCost::MulCost()); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 72594a05c..0b769f278 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -159,6 +159,9 @@ struct TensorEvaluator, Device> typedef TensorEvaluator, Device> Self; typedef TensorEvaluator Impl; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; enum { IsAligned = false, @@ -307,9 +310,6 @@ struct TensorEvaluator, Device> } } - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { @@ -362,15 +362,14 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const Index packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1) { return packetWithPossibleZero(index); } - const Index indices[2] = {index, index + packetSize - 1}; + const Index indices[2] = {index, index + PacketSize - 1}; const Index patchIndex = indices[0] / m_fastPatchStride; if (patchIndex != indices[1] / m_fastPatchStride) { return packetWithPossibleZero(index); @@ -434,12 +433,24 @@ struct TensorEvaluator, Device> Index rowInflateStride() const { return m_row_inflate_strides; } Index colInflateStride() const { return m_col_inflate_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + // We conservatively estimate the cost for the code path where the computed + // index is inside the original image and + // TensorEvaluator::CoordAccess is false. 
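+    // That index remapping costs a handful of integer divisions and
+    // multiplications per coefficient, tallied below.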
+ const double compute_cost = 3 * TensorOpCost::DivCost() + + 6 * TensorOpCost::MulCost() + + 8 * TensorOpCost::MulCost(); + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { - const int packetSize = internal::unpacket_traits::size; - EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; - for (int i = 0; i < packetSize; ++i) { + const int PacketSize = internal::unpacket_traits::size; + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } PacketReturnType rslt = internal::pload(values); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index 368e6f685..de2f67d74 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -81,6 +81,10 @@ struct TensorEvaluator, Device> typedef typename XprType::Index Index; static const int NumDims = internal::array_size::Dimensions>::value; typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; enum { IsAligned = /*TensorEvaluator::IsAligned*/ false, @@ -123,11 +127,6 @@ struct TensorEvaluator, Device> } } - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { @@ -190,18 +189,30 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; - for (int i = 0; i < packetSize; ++i) { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } PacketReturnType rslt = internal::pload(values); return rslt; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double compute_cost = NumDims * (3 * TensorOpCost::DivCost() + + 3 * TensorOpCost::MulCost() + + 2 * TensorOpCost::AddCost()); + const double input_size = m_impl.dimensions().TotalSize(); + const double output_size = m_dimensions.TotalSize(); + if (output_size == 0) + return TensorOpCost(); + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(sizeof(CoeffReturnType) * input_size / output_size, 0, + compute_cost, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index 9b85914ff..63a8476ef 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ 
b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -155,6 +155,10 @@ struct TensorEvaluator, Device> return m_impl.template packet(index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return m_impl.data(); } const TensorEvaluator& impl() const { return m_impl; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index a9c222ea0..bfa65a607 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -142,6 +142,10 @@ struct TensorEvaluator, Device> return m_impl.template packet(index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast(m_impl.data()); } const TensorEvaluator& impl() const { return m_impl; } @@ -449,6 +453,11 @@ struct TensorEvaluator, Devi } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { Scalar* result = m_impl.data(); if (result) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index a595a0175..88b838b27 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -87,6 +87,10 @@ struct TensorEvaluator, Device typedef typename XprType::Index Index; static const int NumDims = internal::array_size::value; typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; enum { IsAligned = false, @@ -129,10 +133,6 @@ struct TensorEvaluator, Device } } - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { @@ -224,21 +224,51 @@ struct TensorEvaluator, Device return m_impl.coeff(inputIndex); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + TensorOpCost cost = m_impl.costPerCoeff(vectorized); + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumDims; ++i) + updateCostPerDimension(cost, i, i == 0); + } else { + for (int i = NumDims - 1; i >= 0; --i) + updateCostPerDimension(cost, i, i == NumDims - 1); + } + return cost; + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + private: + void updateCostPerDimension(TensorOpCost& cost, int i, bool first) const { + const double in = static_cast(m_impl.dimensions()[i]); + const double out = in + m_padding[i].first + m_padding[i].second; + if (out == 0) + return; + const double reduction = in / out; + cost *= reduction; + if (first) { + cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost() + + reduction * (1 * TensorOpCost::AddCost())); + } else { + cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost() + + 2 * TensorOpCost::MulCost() + + reduction * (2 * 
TensorOpCost::MulCost() + + 1 * TensorOpCost::DivCost())); + } + } + protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const { - const int packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); const Index initialIndex = index; Index inputIndex = 0; for (int i = NumDims - 1; i > 0; --i) { const Index first = index; - const Index last = index + packetSize - 1; + const Index last = index + PacketSize - 1; const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i]; const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i]; const Index lastPaddedRight = m_outputStrides[i+1]; @@ -263,7 +293,7 @@ struct TensorEvaluator, Device } } - const Index last = index + packetSize - 1; + const Index last = index + PacketSize - 1; const Index first = index; const Index lastPaddedLeft = m_padding[0].first; const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second); @@ -288,16 +318,15 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const { - const int packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); const Index initialIndex = index; Index inputIndex = 0; for (int i = 0; i < NumDims - 1; ++i) { const Index first = index; - const Index last = index + packetSize - 1; + const Index last = index + PacketSize - 1; const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1]; const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1]; const Index lastPaddedRight = m_outputStrides[i]; @@ -322,7 +351,7 @@ struct TensorEvaluator, Device } } - const Index last = index + packetSize - 1; + const Index last = index + PacketSize - 1; const Index first = index; const Index lastPaddedLeft = m_padding[NumDims-1].first; const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second); @@ -347,9 +376,8 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { - const int packetSize = internal::unpacket_traits::size; - EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; - for (int i = 0; i < packetSize; ++i) { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } PacketReturnType rslt = internal::pload(values); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 0bf460f4e..a87e45330 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -85,6 +85,10 @@ struct TensorEvaluator, Device> static const int NumDims = internal::array_size::Dimensions>::value + 1; typedef DSizes Dimensions; typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = 
internal::unpacket_traits::size; + enum { IsAligned = false, @@ -137,9 +141,6 @@ struct TensorEvaluator, Device> } } - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { @@ -183,12 +184,11 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); Index output_stride_index = (static_cast(Layout) == static_cast(ColMajor)) ? NumDims - 1 : 0; - Index indices[2] = {index, index + packetSize - 1}; + Index indices[2] = {index, index + PacketSize - 1}; Index patchIndices[2] = {indices[0] / m_outputStrides[output_stride_index], indices[1] / m_outputStrides[output_stride_index]}; Index patchOffsets[2] = {indices[0] - patchIndices[0] * m_outputStrides[output_stride_index], @@ -229,15 +229,15 @@ struct TensorEvaluator, Device> inputIndices[0] += (patchIndices[0] + patchOffsets[0]); inputIndices[1] += (patchIndices[1] + patchOffsets[1]); - if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + if (inputIndices[1] - inputIndices[0] == PacketSize - 1) { PacketReturnType rslt = m_impl.template packet(inputIndices[0]); return rslt; } else { - EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; + EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize]; values[0] = m_impl.coeff(inputIndices[0]); - values[packetSize-1] = m_impl.coeff(inputIndices[1]); - for (int i = 1; i < packetSize-1; ++i) { + values[PacketSize-1] = m_impl.coeff(inputIndices[1]); + for (int i = 1; i < PacketSize-1; ++i) { values[i] = coeff(index+i); } PacketReturnType rslt = internal::pload(values); @@ -245,6 +245,14 @@ struct TensorEvaluator, Device> } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double compute_cost = NumDims * (TensorOpCost::DivCost() + + TensorOpCost::MulCost() + + 2 * TensorOpCost::AddCost()); + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 00f870328..1c9e7ab66 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -411,6 +411,9 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef TensorEvaluator, Device> Self; static const bool InputPacketAccess = TensorEvaluator::PacketAccess; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; enum { IsAligned = false, @@ -495,9 +498,6 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - typedef typename internal::remove_const::type CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - EIGEN_STRONG_INLINE 
EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) { m_impl.evalSubExprsIfNeeded(NULL); @@ -584,16 +584,15 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); - EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; if (ReducingInnerMostDims) { const Index num_values_to_reduce = (static_cast(Layout) == static_cast(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1]; const Index firstIndex = firstInput(index); - for (Index i = 0; i < packetSize; ++i) { + for (Index i = 0; i < PacketSize; ++i) { Op reducer(m_reducer); values[i] = internal::InnerMostDimReducer::reduce(*this, firstIndex + i * num_values_to_reduce, num_values_to_reduce, reducer); @@ -602,18 +601,18 @@ struct TensorEvaluator, Device> const Index firstIndex = firstInput(index); const int innermost_dim = (static_cast(Layout) == static_cast(ColMajor)) ? 0 : NumOutputDims - 1; // TBD: extend this the the n innermost dimensions that we preserve. - if (((firstIndex % m_dimensions[innermost_dim]) + packetSize - 1) < m_dimensions[innermost_dim]) { + if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) { Op reducer(m_reducer); typename Self::PacketReturnType accum = reducer.template initializePacket(); internal::InnerMostDimPreserver::reduce(*this, firstIndex, reducer, &accum); return reducer.finalizePacket(accum); } else { - for (int i = 0; i < packetSize; ++i) { + for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index + i); } } } else { - for (int i = 0; i < packetSize; ++i) { + for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index + i); } } @@ -621,6 +620,18 @@ struct TensorEvaluator, Device> return rslt; } + // Must be called after evalSubExprsIfNeeded(). 
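+  // (If a full reduction has already been materialized into m_result by
+  // evalSubExprsIfNeeded(), each coefficient is just a read of the cached
+  // result; otherwise the input cost is scaled by the number of values
+  // reduced per output coefficient.)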
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index 96d92038c..1a59cc8f7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -104,6 +104,10 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device>
   typedef typename XprType::Index Index;
   static const int NumDims = internal::array_size<ReverseDimensions>::value;
   typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
   enum {
     IsAligned = false,
@@ -135,10 +139,6 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device>
     }
   }
 
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   const Dimensions& dimensions() const { return m_dimensions; }
 
@@ -195,21 +195,33 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   PacketReturnType packet(Index index) const
   {
-    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
-    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
-    eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
     // TODO(ndjaitly): write a better packing routine that uses
     // local structure.
     EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type
-                                                            values[packetSize];
-    for (int i = 0; i < packetSize; ++i) {
+                                                            values[PacketSize];
+    for (int i = 0; i < PacketSize; ++i) {
       values[i] = coeff(index+i);
     }
     PacketReturnType rslt = internal::pload<PacketReturnType>(values);
     return rslt;
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
+                                     2 * TensorOpCost::MulCost<Index>() +
+                                     TensorOpCost::DivCost<Index>());
+    for (int i = 0; i < NumDims; ++i) {
+      if (m_reverse[i]) {
+        compute_cost += 2 * TensorOpCost::AddCost<Index>();
+      }
+    }
+    return m_impl.costPerCoeff(vectorized) +
+           TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
+  }
+
   EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
 
 protected:
@@ -246,6 +258,7 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   const Dimensions& dimensions() const { return this->m_dimensions; }
 
@@ -256,14 +269,13 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
   template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketReturnType& x) {
-    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
-    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
-    eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
     // This code is pilfered from TensorMorphing.h
-    EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
+    EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize];
     internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
-    for (int i = 0; i < packetSize; ++i) {
+    for (int i = 0; i < PacketSize; ++i) {
       this->coeffRef(index+i) = values[i];
     }
   }
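Note the false /* vectorized */ passed in the reverse evaluator's costPerCoeff(): its packet() assembles each packet coefficient by coefficient, so the index arithmetic is paid once per scalar rather than once per packet. With the sketch class from earlier, the flag's effect looks like this (a PacketSize of 4 is an assumption for the arithmetic):

    // 20 cycles of index arithmetic, charged per coefficient vs. per packet.
    TensorOpCost per_scalar(0, 0, 20.0, /*vectorized=*/false, /*packet_size=*/4);
    TensorOpCost per_packet(0, 0, 20.0, /*vectorized=*/true,  /*packet_size=*/4);
    // per_scalar.compute_cycles() == 20.0, per_packet.compute_cycles() == 5.0.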
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index c19833ea5..e76533710 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -104,6 +104,9 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
   static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
   typedef DSizes<Index, NumDims> Dimensions;
   typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
   enum {
     IsAligned = false,
@@ -145,9 +148,6 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
     }
   }
 
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
@@ -166,18 +166,25 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
 
   template <int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
-    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
-    eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
-    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
-    for (int i = 0; i < packetSize; ++i) {
+    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    for (int i = 0; i < PacketSize; ++i) {
      values[i] = coeff(index+i);
    }
    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
    return rslt;
  }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    const double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
+                                           2 * TensorOpCost::MulCost<Index>() +
+                                           TensorOpCost::DivCost<Index>());
+    return m_impl.costPerCoeff(vectorized) +
+           TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
+  }
+
   EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
 
 protected:
@@ -219,6 +226,9 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
   static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
   typedef DSizes<Index, NumDims> Dimensions;
   typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
   enum {
     IsAligned = false,
@@ -230,9 +240,6 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
       : Base(op, device)
   { }
 
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
   {
     return this->m_impl.coeffRef(this->srcCoeff(index));
@@ -241,12 +248,11 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
   template <int StoreMode> EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketReturnType& x)
   {
-    static const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
-    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
 
-    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
     internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
-    for (int i = 0; i < packetSize; ++i) {
+    for (int i = 0; i < PacketSize; ++i) {
       this->coeffRef(index+i) = values[i];
     }
   }
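Nothing in this patch consumes the estimates yet; a plausible consumer (sketched below, with all names and thresholds hypothetical) would convert a per-coefficient cost into a total cycle count and size thread pools or kernel launches accordingly:

    #include <algorithm>

    // Hypothetical: pick a thread count so each thread amortizes its
    // scheduling overhead. Neither the weighting of memory traffic nor the
    // 100k-cycle floor below comes from this patch.
    int suggested_threads(const TensorOpCost& cost_per_coeff, double output_size,
                          int max_threads) {
      const double cycles_per_coeff =
          cost_per_coeff.compute_cycles() +
          0.5 * (cost_per_coeff.bytes_loaded() + cost_per_coeff.bytes_stored());
      const double total_cycles = output_size * cycles_per_coeff;
      const double min_cycles_per_thread = 100 * 1000.0;  // assumed threshold
      const int threads = static_cast<int>(total_cycles / min_cycles_per_thread);
      return std::max(1, std::min(max_threads, threads));
    }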
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
index 085f8fd3d..23248c626 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
@@ -103,6 +103,10 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
   typedef typename XprType::Index Index;
   static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
   typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
   enum {
     IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
@@ -142,10 +146,6 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
     }
   }
 
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
@@ -164,12 +164,11 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
 
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
-    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
-    eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
     Index inputIndices[] = {0, 0};
-    Index indices[] = {index, index + packetSize - 1};
+    Index indices[] = {index, index + PacketSize - 1};
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx0 = indices[0] / m_outputStrides[i];
@@ -193,15 +192,15 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
       inputIndices[0] += indices[0] * m_inputStrides[NumDims-1];
       inputIndices[1] += indices[1] * m_inputStrides[NumDims-1];
     }
-    if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
+    if (inputIndices[1] - inputIndices[0] == PacketSize - 1) {
       PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
       return rslt;
     }
     else {
-      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
       values[0] = m_impl.coeff(inputIndices[0]);
-      values[packetSize-1] = m_impl.coeff(inputIndices[1]);
-      for (int i = 1; i < packetSize-1; ++i) {
+      values[PacketSize-1] = m_impl.coeff(inputIndices[1]);
+      for (int i = 1; i < PacketSize-1; ++i) {
        values[i] = coeff(index+i);
      }
      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
@@ -209,6 +208,14 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
    }
  }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    const double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
+                                           2 * TensorOpCost::MulCost<Index>() +
+                                           TensorOpCost::DivCost<Index>());
+    return m_impl.costPerCoeff(vectorized) +
+           TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
+  }
+
   EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
 
 protected:
@@ -266,6 +273,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
   {
@@ -275,12 +283,11 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
   template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketReturnType& x)
   {
-    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
-    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
-    eigen_assert(index+packetSize-1 < this->dimensions().TotalSize());
+    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+PacketSize-1 < this->dimensions().TotalSize());
 
     Index inputIndices[] = {0, 0};
-    Index indices[] = {index, index + packetSize - 1};
+    Index indices[] = {index, index + PacketSize - 1};
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx0 = indices[0] / this->m_outputStrides[i];
@@ -304,15 +311,15 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
       inputIndices[0] += indices[0] * this->m_inputStrides[NumDims-1];
       inputIndices[1] += indices[1] * this->m_inputStrides[NumDims-1];
     }
-    if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
+    if (inputIndices[1] - inputIndices[0] == PacketSize - 1) {
       this->m_impl.template writePacket<Unaligned>(inputIndices[0], x);
     }
     else {
-      EIGEN_ALIGN_MAX Scalar values[packetSize];
+      EIGEN_ALIGN_MAX Scalar values[PacketSize];
      internal::pstore<Scalar, PacketReturnType>(values, x);
      this->m_impl.coeffRef(inputIndices[0]) = values[0];
-      this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1];
-      for (int i = 1; i < packetSize-1; ++i) {
+      this->m_impl.coeffRef(inputIndices[1]) = values[PacketSize-1];
+      for (int i = 1; i < PacketSize-1; ++i) {
       this->coeffRef(index+i) = values[i];
     }
   }
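The shuffling, striding and reverse evaluators all charge NumDims * (2*Add + 2*Mul + Div): their srcCoeff()-style index remapping peels one dimension per loop iteration with an integer divide, a multiply and two additions/subtractions. Plugging in assumed scalar costs (Add = 1, Mul = 1, Div = 16 cycles; illustrative values, not the constants in TensorCostModel.h) for a rank-4 tensor:

    constexpr double kAddCost = 1;   // assumed TensorOpCost::AddCost<Index>()
    constexpr double kMulCost = 1;   // assumed TensorOpCost::MulCost<Index>()
    constexpr double kDivCost = 16;  // assumed TensorOpCost::DivCost<Index>()
    constexpr int kNumDims = 4;
    constexpr double kRemapCycles =
        kNumDims * (2 * kAddCost + 2 * kMulCost + kDivCost);  // 4 * 20 = 80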
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
index 5bdfbad46..f8db43dbe 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
@@ -171,6 +171,9 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, Device>
   static const int NumDims = NumInputDims + 1;
   typedef DSizes<Index, NumDims> Dimensions;
   typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
   enum {
     IsAligned = false,
@@ -336,9 +339,6 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, Device>
     }
   }
 
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
@@ -408,16 +408,15 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, Device>
 
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    const Index packetSize = internal::unpacket_traits<PacketReturnType>::size;
-    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
-    eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
     if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1 ||
         m_in_plane_strides != 1 || m_plane_inflate_strides != 1) {
       return packetWithPossibleZero(index);
     }
 
-    const Index indices[2] = {index, index + packetSize - 1};
+    const Index indices[2] = {index, index + PacketSize - 1};
     const Index patchIndex = indices[0] / m_fastPatchStride;
     if (patchIndex != indices[1] / m_fastPatchStride) {
       return packetWithPossibleZero(index);
     }
@@ -495,6 +494,14 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, Device>
     return packetWithPossibleZero(index);
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+  costPerCoeff(bool vectorized) const {
+    const double compute_cost =
+        10 * TensorOpCost::DivCost<Index>() + 21 * TensorOpCost::MulCost<Index>() +
+        8 * TensorOpCost::AddCost<Index>();
+    return TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
+  }
+
   EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
 
   const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
@@ -518,9 +525,9 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, Device>
 protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
   {
-    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
-    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
-    for (int i = 0; i < packetSize; ++i) {
+    const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    for (int i = 0; i < PacketSize; ++i) {
      values[i] = coeff(index+i);
    }
    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
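Taken together, the framework is a straightforward recursion: each evaluator adds its own term to (or, for reductions, scales) its child's estimate, so the cost of a pipeline such as input.stride(...).reverse(...) falls out bottom-up. A closing sketch with the mock class from the first example and the assumed per-operation costs above:

    // Leaf: one 4-byte float load per coefficient, no arithmetic.
    TensorOpCost leaf(4, 0, 0);
    // Striding over a rank-4 tensor: + 80 cycles of index remapping (see above).
    TensorOpCost strided = leaf + TensorOpCost(0, 0, 80);
    // Reversing two of the four dimensions: + 80 cycles of remapping plus
    // 2 extra adds per reversed dimension = 84.
    TensorOpCost reversed = strided + TensorOpCost(0, 0, 84);
    // reversed: 4 bytes loaded, 0 stored, 164 compute cycles per coefficient.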
--
cgit v1.2.3