diff options
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor')
36 files changed, 1037 insertions, 601 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h index f1ec04c49..babafe108 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -112,6 +112,11 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device> return CoeffReturnType(index, m_impl.coeff(index)); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, 1); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 199d2ce41..5abff0800 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -89,6 +89,12 @@ template<typename LeftArgType, typename RightArgType, typename Device> struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device> { typedef TensorAssignOp<LeftArgType, RightArgType> XprType; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned, @@ -104,12 +110,6 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device> EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); } - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions; - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { // The dimensions of the lhs and the rhs tensors should be equal to prevent @@ -150,6 +150,19 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device> return m_leftImpl.template packet<LoadMode>(index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + // We assume that evalPacket or evalScalar is called to perform the + // assignment and account for the cost of the write here, but reduce left + // cost by one load because we are using m_leftImpl.coeffRef. + TensorOpCost left = m_leftImpl.costPerCoeff(vectorized); + return m_rightImpl.costPerCoeff(vectorized) + + TensorOpCost( + numext::maxi(0.0, left.bytes_loaded() - sizeof(CoeffReturnType)), + left.bytes_stored(), left.compute_cycles()) + + TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_leftImpl.data(); } private: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 69d1802d5..1a34f3ccc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -334,6 +334,12 @@ class TensorBase<Derived, ReadOnlyAccessors> return binaryExpr(other.derived(), internal::scalar_boolean_or_op()); } + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_boolean_xor_op, const Derived, const OtherDerived> + operator^(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_boolean_xor_op()); + } + // Comparisons and tests. template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LT>, const Derived, const OtherDerived> diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index b6e6db12a..c771496e2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -101,6 +101,9 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> typedef DSizes<Index, NumDims> Dimensions; typedef typename XprType::Scalar Scalar; typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { IsAligned = false, @@ -140,9 +143,6 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> } } - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { @@ -247,9 +247,8 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const { - const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); const Index originalIndex = index; @@ -284,12 +283,12 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> // Todo: this could be extended to the second dimension if we're not // broadcasting alongside the first dimension, and so on. - if (innermostLoc + packetSize <= m_impl.dimensions()[0]) { + if (innermostLoc + PacketSize <= m_impl.dimensions()[0]) { return m_impl.template packet<Unaligned>(inputIndex); } else { - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; values[0] = m_impl.coeff(inputIndex); - for (int i = 1; i < packetSize; ++i) { + for (int i = 1; i < PacketSize; ++i) { values[i] = coeffColMajor(originalIndex+i); } PacketReturnType rslt = internal::pload<PacketReturnType>(values); @@ -300,9 +299,8 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const { - const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); const Index originalIndex = index; @@ -337,12 +335,12 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> // Todo: this could be extended to the second dimension if we're not // broadcasting alongside the first dimension, and so on. - if (innermostLoc + packetSize <= m_impl.dimensions()[NumDims-1]) { + if (innermostLoc + PacketSize <= m_impl.dimensions()[NumDims-1]) { return m_impl.template packet<Unaligned>(inputIndex); } else { - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; values[0] = m_impl.coeff(inputIndex); - for (int i = 1; i < packetSize; ++i) { + for (int i = 1; i < PacketSize; ++i) { values[i] = coeffRowMajor(originalIndex+i); } PacketReturnType rslt = internal::pload<PacketReturnType>(values); @@ -350,6 +348,29 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + double compute_cost = TensorOpCost::AddCost<Index>(); + if (NumDims > 0) { + for (int i = NumDims - 1; i > 0; --i) { + compute_cost += TensorOpCost::DivCost<Index>(); + if (internal::index_statically_eq<Broadcast>()(i, 1)) { + compute_cost += + TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>(); + } else { + if (!internal::index_statically_eq<InputDimensions>()(i, 1)) { + compute_cost += TensorOpCost::MulCost<Index>() + + TensorOpCost::ModCost<Index>() + + TensorOpCost::AddCost<Index>(); + } + } + compute_cost += + TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>(); + } + } + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index c21a98fe0..2742dbb95 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -134,6 +134,10 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> typedef typename XprType::Index Index; typedef DSizes<Index, NumDims> Dimensions; typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; + enum { // Alignment can't be guaranteed at compile time since it depends on the @@ -180,9 +184,6 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> m_inputOffset = m_stride * op.offset(); } - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { @@ -202,17 +203,16 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) || (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { // m_stride is equal to 1, so let's avoid the integer division. eigen_assert(m_stride == 1); Index inputIndex = index * m_inputStride + m_inputOffset; - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize]; - for (int i = 0; i < packetSize; ++i) { + EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { values[i] = m_impl.coeff(inputIndex); inputIndex += m_inputStride; } @@ -226,13 +226,13 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> } else { const Index idx = index / m_stride; const Index rem = index - idx * m_stride; - if (rem + packetSize <= m_stride) { + if (rem + PacketSize <= m_stride) { Index inputIndex = idx * m_inputStride + m_inputOffset + rem; return m_impl.template packet<LoadMode>(inputIndex); } else { // Cross the stride boundary. Fallback to slow path. - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize]; - for (int i = 0; i < packetSize; ++i) { + EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index); ++index; } @@ -242,6 +242,28 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + double cost = 0; + if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && + m_dim.actualDim() == 0) || + (static_cast<int>(Layout) == static_cast<int>(RowMajor) && + m_dim.actualDim() == NumInputDims - 1)) { + cost += TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>(); + } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && + m_dim.actualDim() == NumInputDims - 1) || + (static_cast<int>(Layout) == static_cast<int>(RowMajor) && + m_dim.actualDim() == 0)) { + cost += TensorOpCost::AddCost<Index>(); + } else { + cost += 3 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>() + + 3 * TensorOpCost::AddCost<Index>(); + } + + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, cost, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { CoeffReturnType* result = const_cast<CoeffReturnType*>(m_impl.data()); if (((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumDims) || @@ -298,6 +320,9 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device> typedef typename XprType::Index Index; typedef DSizes<Index, NumDims> Dimensions; typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { IsAligned = false, @@ -309,9 +334,6 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device> : Base(op, device) { } - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { return this->m_impl.coeffRef(this->srcCoeff(index)); @@ -320,17 +342,16 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device> template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - static const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == 0) || (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) { // m_stride is equal to 1, so let's avoid the integer division. eigen_assert(this->m_stride == 1); - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; internal::pstore<CoeffReturnType, PacketReturnType>(values, x); Index inputIndex = index * this->m_inputStride + this->m_inputOffset; - for (int i = 0; i < packetSize; ++i) { + for (int i = 0; i < PacketSize; ++i) { this->m_impl.coeffRef(inputIndex) = values[i]; inputIndex += this->m_inputStride; } @@ -342,14 +363,14 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device> } else { const Index idx = index / this->m_stride; const Index rem = index - idx * this->m_stride; - if (rem + packetSize <= this->m_stride) { + if (rem + PacketSize <= this->m_stride) { const Index inputIndex = idx * this->m_inputStride + this->m_inputOffset + rem; this->m_impl.template writePacket<StoreMode>(inputIndex, x); } else { // Cross stride boundary. Fallback to slow path. - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; internal::pstore<CoeffReturnType, PacketReturnType>(values, x); - for (int i = 0; i < packetSize; ++i) { + for (int i = 0; i < PacketSize; ++i) { this->coeffRef(index) = values[i]; ++index; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 7738f18fb..839c6e3e5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -260,6 +260,21 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy return rslt; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() + + 2 * TensorOpCost::MulCost<Index>() + + TensorOpCost::DivCost<Index>() + + TensorOpCost::ModCost<Index>()); + const double lhs_size = m_leftImpl.dimensions().TotalSize(); + const double rhs_size = m_rightImpl.dimensions().TotalSize(); + return (lhs_size / (lhs_size + rhs_size)) * + m_leftImpl.costPerCoeff(vectorized) + + (rhs_size / (lhs_size + rhs_size)) * + m_rightImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index f070ba61e..97182258d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -426,6 +426,99 @@ struct TensorContractionEvaluatorBase buffer, resIncr, alpha); } + template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> + EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + // define mr, nr, and all of my data mapper types + typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar; + typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar; + typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits; + + const Index nr = Traits::nr; + const Index mr = Traits::mr; + + typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator; + typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator; + + const Index lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size; + const Index rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size; + + typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, + LeftEvaluator, left_nocontract_t, + contract_t, lhs_packet_size, + lhs_inner_dim_contiguous, + false, Unaligned> LhsMapper; + + typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, + RightEvaluator, right_nocontract_t, + contract_t, rhs_packet_size, + rhs_inner_dim_contiguous, + rhs_inner_dim_reordered, Unaligned> RhsMapper; + + typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper; + + // Declare GEBP packing and kernel structs + internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, mr, Traits::LhsProgress, ColMajor> pack_lhs; + internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, nr, ColMajor> pack_rhs; + + internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, mr, nr, false, false> gebp; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + // Sizes of the blocks to load in cache. See the Goto paper for details. + internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, 1); + const Index kc = blocking.kc(); + const Index mc = numext::mini(m, blocking.mc()); + const Index nc = numext::mini(n, blocking.nc()); + const Index sizeA = mc * kc; + const Index sizeB = kc * nc; + + LhsScalar* blockA = static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar))); + RhsScalar* blockB = static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar))); + + for(Index i2=0; i2<m; i2+=mc) + { + const Index actual_mc = numext::mini(i2+mc,m)-i2; + for (Index k2 = 0; k2 < k; k2 += kc) { + // make sure we don't overshoot right edge of left matrix, then pack vertical panel + const Index actual_kc = numext::mini(k2 + kc, k) - k2; + pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc, 0, 0); + + // series of horizontal blocks + for (Index j2 = 0; j2 < n; j2 += nc) { + // make sure we don't overshoot right edge of right matrix, then pack block + const Index actual_nc = numext::mini(j2 + nc, n) - j2; + pack_rhs(blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc, 0, 0); + + // call gebp (matrix kernel) + // The parameters here are copied from Eigen's GEMM implementation + gebp(output.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, 1.0, -1, -1, 0, 0); + } + } + } + + this->m_device.deallocate(blockA); + this->m_device.deallocate(blockB); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_leftImpl.cleanup(); m_rightImpl.cleanup(); @@ -440,6 +533,10 @@ struct TensorContractionEvaluatorBase return m_result[index]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0); + } + template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return internal::ploadt<PacketReturnType, LoadMode>(m_result + index); @@ -529,100 +626,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT return; } - evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer); - } - - template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> - EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const { - // columns in left side, rows in right side - const Index k = this->m_k_size; - - // rows in left side - const Index m = this->m_i_size; - - // columns in right side - const Index n = this->m_j_size; - - // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) - this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); - - // define mr, nr, and all of my data mapper types - typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar; - typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar; - typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits; - - const Index nr = Traits::nr; - const Index mr = Traits::mr; - - typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator; - typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator; - - const Index lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size; - const Index rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size; - - typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, - LeftEvaluator, left_nocontract_t, - contract_t, lhs_packet_size, - lhs_inner_dim_contiguous, - false, Unaligned> LhsMapper; - - typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, - RightEvaluator, right_nocontract_t, - contract_t, rhs_packet_size, - rhs_inner_dim_contiguous, - rhs_inner_dim_reordered, Unaligned> RhsMapper; - - typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper; - - // Declare GEBP packing and kernel structs - internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, mr, Traits::LhsProgress, ColMajor> pack_lhs; - internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, nr, ColMajor> pack_rhs; - - internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, mr, nr, false, false> gebp; - - // initialize data mappers - LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, - this->m_left_contracting_strides, this->m_k_strides); - - RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, - this->m_right_contracting_strides, this->m_k_strides); - - OutputMapper output(buffer, m); - - // Sizes of the blocks to load in cache. See the Goto paper for details. - internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, 1); - const Index kc = blocking.kc(); - const Index mc = numext::mini(m, blocking.mc()); - const Index nc = numext::mini(n, blocking.nc()); - const Index sizeA = mc * kc; - const Index sizeB = kc * nc; - - LhsScalar* blockA = static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar))); - RhsScalar* blockB = static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar))); - - for(Index i2=0; i2<m; i2+=mc) - { - const Index actual_mc = numext::mini(i2+mc,m)-i2; - for (Index k2 = 0; k2 < k; k2 += kc) { - // make sure we don't overshoot right edge of left matrix, then pack vertical panel - const Index actual_kc = numext::mini(k2 + kc, k) - k2; - pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc, 0, 0); - - // series of horizontal blocks - for (Index j2 = 0; j2 < n; j2 += nc) { - // make sure we don't overshoot right edge of right matrix, then pack block - const Index actual_nc = numext::mini(j2 + nc, n) - j2; - pack_rhs(blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc, 0, 0); - - // call gebp (matrix kernel) - // The parameters here are copied from Eigen's GEMM implementation - gebp(output.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, 1.0, -1, -1, 0, 0); - } - } - } - - this->m_device.deallocate(blockA); - this->m_device.deallocate(blockB); + this->template evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer); } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h index 3d3f6904f..5cf7b4f71 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h @@ -35,9 +35,7 @@ class TensorContractionBlocking { computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, mc_, nc_, num_threads); } else { - if (kc_ && mc_ && nc_) { - mc_ = (((m / num_threads) + 15) / 16) * 16; - } + computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, nc_, mc_, num_threads); } } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index a96776a77..a2f1f71f5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -177,7 +177,6 @@ template <typename Eval, typename Scalar> struct ConversionSubExprEval<true, Eva }; - // Eval as rvalue template<typename TargetType, typename ArgType, typename Device> struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device> @@ -190,6 +189,7 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device> typedef typename internal::remove_all<typename internal::traits<ArgType>::Scalar>::type SrcType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; typedef typename PacketType<SrcType, Device>::type PacketSourceType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { IsAligned = false, @@ -231,6 +231,21 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device> return converter.template packet<LoadMode>(index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double cast_cost = TensorOpCost::CastCost<SrcType, TargetType>(); + if (vectorized) { + const double SrcCoeffRatio = + internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio; + const double TgtCoeffRatio = + internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio; + return m_impl.costPerCoeff(vectorized) * (SrcCoeffRatio / PacketSize) + + TensorOpCost(0, 0, TgtCoeffRatio * (cast_cost / PacketSize)); + } else { + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, cast_cost); + } + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 4fe1fb943..ff3c5662d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -297,6 +297,11 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr typedef typename XprType::Index Index; typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; + enum { IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned, PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess, @@ -367,10 +372,6 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr } } - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { @@ -405,7 +406,6 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr template<int LoadMode> EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const { - const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; Index indices[2] = {index, index+PacketSize-1}; Index startInputs[2] = {0, 0}; if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { @@ -448,6 +448,23 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double kernel_size = m_kernelImpl.dimensions().TotalSize(); + // We ignore the use of fused multiply-add. + const double convolve_compute_cost = + TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>(); + const double firstIndex_compute_cost = + NumDims * + (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + + TensorOpCost::DivCost<Index>()); + return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + + kernel_size * (m_inputImpl.costPerCoeff(vectorized) + + m_kernelImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, convolve_compute_cost, vectorized, + PacketSize)); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } private: @@ -773,6 +790,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType; typedef typename InputArgType::Scalar Scalar; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; } @@ -1044,6 +1062,25 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost + // model. + const double kernel_size = m_kernelImpl.dimensions().TotalSize(); + // We ignore the use of fused multiply-add. + const double convolve_compute_cost = + TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>(); + const double firstIndex_compute_cost = + NumDims * + (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + + TensorOpCost::DivCost<Index>()); + return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + + kernel_size * (m_inputImpl.costPerCoeff(vectorized) + + m_kernelImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, convolve_compute_cost, vectorized, + PacketSize)); + } + private: // No assignment (copies are needed by the kernels) TensorEvaluator& operator = (const TensorEvaluator&); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h new file mode 100644 index 000000000..4e8f86674 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h @@ -0,0 +1,214 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Rasmus Munk Larsen <rmlarsen@google.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H +#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H + +//#if !defined(EIGEN_USE_GPU) +//#define EIGEN_USE_COST_MODEL +//#endif + +namespace Eigen { + +/** \class TensorEvaluator + * \ingroup CXX11_Tensor_Module + * + * \brief A cost model used to limit the number of threads used for evaluating + * tensor expression. + * + */ + +// Class storing the cost of evaluating a tensor expression in terms of the +// estimated number of operand bytes loads, bytes stored, and compute cycles. +class TensorOpCost { + public: + // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple + // model based on minimal reciprocal throughput numbers from Intel or + // Agner Fog's tables would be better than what is there now. + template <typename ArgType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int MulCost() { + return internal::functor_traits< + internal::scalar_product_op<ArgType, ArgType>>::Cost; + } + template <typename ArgType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int AddCost() { + return internal::functor_traits<internal::scalar_sum_op<ArgType>>::Cost; + } + template <typename ArgType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int DivCost() { + return internal::functor_traits< + internal::scalar_quotient_op<ArgType, ArgType>>::Cost; + } + template <typename ArgType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int ModCost() { + return internal::functor_traits<internal::scalar_mod_op<ArgType>>::Cost; + } + template <typename SrcType, typename TargetType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int CastCost() { + return internal::functor_traits< + internal::scalar_cast_op<SrcType, TargetType>>::Cost; + } + + TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {} + TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles) + : bytes_loaded_(bytes_loaded), + bytes_stored_(bytes_stored), + compute_cycles_(compute_cycles) {} + + TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles, + bool vectorized, double packet_size) + : bytes_loaded_(bytes_loaded), + bytes_stored_(bytes_stored), + compute_cycles_(vectorized ? compute_cycles / packet_size + : compute_cycles) { + using std::isfinite; + eigen_assert(bytes_loaded >= 0 && (isfinite)(bytes_loaded)); + eigen_assert(bytes_stored >= 0 && (isfinite)(bytes_stored)); + eigen_assert(compute_cycles >= 0 && (isfinite)(compute_cycles)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const { + return bytes_loaded_; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const { + return bytes_stored_; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const { + return compute_cycles_; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost( + double load_cost, double store_cost, double compute_cost) const { + return load_cost * bytes_loaded_ + store_cost * bytes_stored_ + + compute_cost * compute_cycles_; + } + + // Drop memory access component. Intended for cases when memory accesses are + // sequential or are completely masked by computations. + EIGEN_DEVICE_FUNC void dropMemoryCost() { + bytes_loaded_ = 0; + bytes_stored_ = 0; + } + + // TODO(rmlarsen): Define min in terms of total cost, not elementwise. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMin( + const TensorOpCost& rhs) { + bytes_loaded_ = numext::mini(bytes_loaded_, rhs.bytes_loaded()); + bytes_stored_ = numext::mini(bytes_stored_, rhs.bytes_stored()); + compute_cycles_ = numext::mini(compute_cycles_, rhs.compute_cycles()); + return *this; + } + + // TODO(rmlarsen): Define max in terms of total cost, not elementwise. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMax( + const TensorOpCost& rhs) { + bytes_loaded_ = numext::maxi(bytes_loaded_, rhs.bytes_loaded()); + bytes_stored_ = numext::maxi(bytes_stored_, rhs.bytes_stored()); + compute_cycles_ = numext::maxi(compute_cycles_, rhs.compute_cycles()); + return *this; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=( + const TensorOpCost& rhs) { + bytes_loaded_ += rhs.bytes_loaded(); + bytes_stored_ += rhs.bytes_stored(); + compute_cycles_ += rhs.compute_cycles(); + return *this; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs) { + bytes_loaded_ *= rhs; + bytes_stored_ *= rhs; + compute_cycles_ *= rhs; + return *this; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+( + TensorOpCost lhs, const TensorOpCost& rhs) { + lhs += rhs; + return lhs; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*( + TensorOpCost lhs, double rhs) { + lhs *= rhs; + return lhs; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*( + double lhs, TensorOpCost rhs) { + rhs *= lhs; + return rhs; + } + + friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) { + return os << "[bytes_loaded = " << tc.bytes_loaded() + << ", bytes_stored = " << tc.bytes_stored() + << ", compute_cycles = " << tc.compute_cycles() << "]"; + } + + private: + double bytes_loaded_; + double bytes_stored_; + double compute_cycles_; +}; + +// TODO(rmlarsen): Implement a policy that chooses an "optimal" number of theads +// in [1:max_threads] instead of just switching multi-threading off for small +// work units. +template <typename Device> +class TensorCostModel { + public: + // Scaling from Eigen compute cost to device cycles. + static const int kDeviceCyclesPerComputeCycle = 1; + + // Costs in device cycles. + static const int kStartupCycles = 100000; + static const int kPerThreadCycles = 100000; + static const int kTaskSize = 40000; + + // Returns the number of threads in [1:max_threads] to use for + // evaluating an expression with the given output size and cost per + // coefficient. + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads( + double output_size, const TensorOpCost& cost_per_coeff, int max_threads) { + double cost = totalCost(output_size, cost_per_coeff); + int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9; + return numext::mini(max_threads, numext::maxi(1, threads)); + } + + // taskSize assesses parallel task size. + // Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task + // granularity needs to be increased to mitigate parallelization overheads. + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize( + double output_size, const TensorOpCost& cost_per_coeff) { + return totalCost(output_size, cost_per_coeff) / kTaskSize; + } + + private: + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost( + double output_size, const TensorOpCost& cost_per_coeff) { + // Cost of memory fetches from L2 cache. 64 is typical cache line size. + // 11 is L2 cache latency on Haswell. + // We don't know whether data is in L1, L2 or L3. But we are most interested + // in single-threaded computational time around 100us-10ms (smaller time + // is too small for parallelization, larger time is not intersting + // either because we are probably using all available threads already). + // And for the target time range, L2 seems to be what matters. Data set + // fitting into L1 is too small to take noticeable time. Data set fitting + // only into L3 presumably will take more than 10ms to load and process. + const double kLoadCycles = 1.0 / 64 * 11; + const double kStoreCycles = 1.0 / 64 * 11; + // Scaling from Eigen compute cost to device cycles. + return output_size * + cost_per_coeff.total_cost(kLoadCycles, kStoreCycles, + kDeviceCyclesPerComputeCycle); + } +}; + +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h index b58e513b4..e020d076f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h @@ -83,8 +83,10 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi typedef typename internal::traits<ArgType>::Index Index; static const int NumDims = internal::traits<ArgType>::NumDimensions; typedef DSizes<Index, NumDims> Dimensions; - typedef - typename internal::remove_const<typename ArgType::Scalar>::type Scalar; + typedef typename internal::remove_const<typename ArgType::Scalar>::type Scalar; + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { IsAligned = false, @@ -101,9 +103,6 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi m_dimensions = op.func().dimensions(op.expression()); } - typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { @@ -134,6 +133,11 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi return internal::ploadt<PacketReturnType, LoadMode>(m_result + index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): Extend CustomOp API to return its cost estimate. + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; } protected: @@ -236,6 +240,9 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, static const int NumDims = internal::traits<XprType>::NumDimensions; typedef DSizes<Index, NumDims> Dimensions; typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { IsAligned = false, @@ -252,9 +259,6 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, m_dimensions = op.func().dimensions(op.lhsExpression(), op.rhsExpression()); } - typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { @@ -284,6 +288,11 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, return internal::ploadt<PacketReturnType, LoadMode>(m_result + index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): Extend CustomOp API to return its cost estimate. + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index 821835cf3..1d2d162dc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -291,15 +291,9 @@ struct GpuDevice { int max_blocks_; }; -#ifndef __CUDA_ARCH__ #define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ assert(cudaGetLastError() == cudaSuccess); -#else -#define LAUNCH_CUDA_KERNEL(kernel, ...) \ - { const auto __attribute__((__unused__)) __makeTheKernelInstantiate = &(kernel); } \ - eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__); -#endif // FIXME: Should be device and kernel specific. diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h index 267f6f8e3..9d141395b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h @@ -44,6 +44,26 @@ struct DefaultDevice { #endif } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { +#ifndef __CUDA_ARCH__ + // Running on the host CPU + return l1CacheSize(); +#else + // Running on a CUDA device, return the amount of shared memory available. + return 48*1024; +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { +#ifndef __CUDA_ARCH__ + // Running single threaded on the host CPU + return l3CacheSize(); +#else + // Running on a CUDA device + return firstLevelCacheSize(); +#endif + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { #ifndef __CUDA_ARCH__ // Running single threaded on the host CPU diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index cd3dd214b..c02891465 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -12,145 +12,15 @@ namespace Eigen { -// This defines an interface that ThreadPoolDevice can take to use -// custom thread pools underneath. -class ThreadPoolInterface { - public: - virtual void Schedule(std::function<void()> fn) = 0; - - virtual ~ThreadPoolInterface() {} -}; - -// The implementation of the ThreadPool type ensures that the Schedule method -// runs the functions it is provided in FIFO order when the scheduling is done -// by a single thread. -// Environment provides a way to create threads and also allows to intercept -// task submission and execution. -template <typename Environment> -class ThreadPoolTempl : public ThreadPoolInterface { - public: - // Construct a pool that contains "num_threads" threads. - explicit ThreadPoolTempl(int num_threads, Environment env = Environment()) - : env_(env), threads_(num_threads), waiters_(num_threads) { - for (int i = 0; i < num_threads; i++) { - threads_.push_back(env.CreateThread([this]() { WorkerLoop(); })); - } - } - - // Wait until all scheduled work has finished and then destroy the - // set of threads. - ~ThreadPoolTempl() { - { - // Wait for all work to get done. - std::unique_lock<std::mutex> l(mu_); - while (!pending_.empty()) { - empty_.wait(l); - } - exiting_ = true; - - // Wakeup all waiters. - for (auto w : waiters_) { - w->ready = true; - w->task.f = nullptr; - w->cv.notify_one(); - } - } - - // Wait for threads to finish. - for (auto t : threads_) { - delete t; - } - } - - // Schedule fn() for execution in the pool of threads. The functions are - // executed in the order in which they are scheduled. - void Schedule(std::function<void()> fn) { - Task t = env_.CreateTask(std::move(fn)); - std::unique_lock<std::mutex> l(mu_); - if (waiters_.empty()) { - pending_.push_back(std::move(t)); - } else { - Waiter* w = waiters_.back(); - waiters_.pop_back(); - w->ready = true; - w->task = std::move(t); - w->cv.notify_one(); - } - } - - protected: - void WorkerLoop() { - std::unique_lock<std::mutex> l(mu_); - Waiter w; - Task t; - while (!exiting_) { - if (pending_.empty()) { - // Wait for work to be assigned to me - w.ready = false; - waiters_.push_back(&w); - while (!w.ready) { - w.cv.wait(l); - } - t = w.task; - w.task.f = nullptr; - } else { - // Pick up pending work - t = std::move(pending_.front()); - pending_.pop_front(); - if (pending_.empty()) { - empty_.notify_all(); - } - } - if (t.f) { - mu_.unlock(); - env_.ExecuteTask(t); - t.f = nullptr; - mu_.lock(); - } - } - } - - private: - typedef typename Environment::Task Task; - typedef typename Environment::EnvThread Thread; - - struct Waiter { - std::condition_variable cv; - Task task; - bool ready; - }; - - Environment env_; - std::mutex mu_; - MaxSizeVector<Thread*> threads_; // All threads - MaxSizeVector<Waiter*> waiters_; // Stack of waiting threads. - std::deque<Task> pending_; // Queue of pending work - std::condition_variable empty_; // Signaled on pending_.empty() - bool exiting_ = false; -}; - -struct StlThreadEnvironment { - struct Task { - std::function<void()> f; - }; - - // EnvThread constructor must start the thread, - // destructor must join the thread. - class EnvThread { - public: - EnvThread(std::function<void()> f) : thr_(f) {} - ~EnvThread() { thr_.join(); } - - private: - std::thread thr_; - }; - - EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(f); } - Task CreateTask(std::function<void()> f) { return Task{std::move(f)}; } - void ExecuteTask(const Task& t) { t.f(); } -}; - -typedef ThreadPoolTempl<StlThreadEnvironment> ThreadPool; +// Use the SimpleThreadPool by default. We'll switch to the new non blocking +// thread pool later. +#ifdef EIGEN_USE_NONBLOCKING_THREAD_POOL +template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>; +typedef NonBlockingThreadPool ThreadPool; +#else +template <typename Env> using ThreadPoolTempl = SimpleThreadPoolTempl<Env>; +typedef SimpleThreadPool ThreadPool; +#endif // Barrier is an object that allows one or more threads to wait until @@ -264,6 +134,15 @@ struct ThreadPoolDevice { return num_threads_; } + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { + return l1CacheSize(); + } + + EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { + // The l3 cache size is shared between all the cores. + return l3CacheSize() / num_threads_; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { // Should return an enum that encodes the ISA supported by the CPU return 1; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index 1fb27a65b..5c6748a43 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -88,10 +88,14 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device> typedef TensorEvalToOp<ArgType> XprType; typedef typename ArgType::Scalar Scalar; typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; + typedef typename XprType::Index Index; + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { IsAligned = true, - PacketAccess = true, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, // to be implemented RawAccess = true @@ -104,10 +108,6 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() { } - typedef typename XprType::Index Index; - typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* scalar) { @@ -138,6 +138,13 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device> return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // We assume that evalPacket or evalScalar is called to perform the + // assignment and account for the cost of the write here. + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_buffer; } private: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 947a8ed88..ae4ce3c90 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -101,6 +101,11 @@ struct TensorEvaluator } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, + internal::unpacket_traits<PacketReturnType>::size); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; } protected: @@ -184,6 +189,11 @@ struct TensorEvaluator<const Derived, Device> return loadConstant(m_data+index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, + internal::unpacket_traits<PacketReturnType>::size); + } + EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; } protected: @@ -219,6 +229,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device> typedef typename XprType::Scalar Scalar; typedef typename internal::traits<XprType>::Scalar CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } @@ -237,6 +248,12 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device> return m_functor.template packetOp<Index, PacketReturnType>(index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, + internal::unpacket_traits<PacketReturnType>::size); + } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } private: @@ -270,6 +287,7 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device> typedef typename XprType::Scalar Scalar; typedef typename internal::traits<XprType>::Scalar CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } @@ -293,6 +311,12 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device> return m_functor.packetOp(m_argImpl.template packet<LoadMode>(index)); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double functor_cost = internal::functor_traits<UnaryOp>::Cost; + return m_argImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } private: @@ -330,6 +354,7 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg typedef typename XprType::Scalar Scalar; typedef typename internal::traits<XprType>::Scalar CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const @@ -358,6 +383,14 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg return m_functor.packetOp(m_leftImpl.template packet<LoadMode>(index), m_rightImpl.template packet<LoadMode>(index)); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double functor_cost = internal::functor_traits<BinaryOp>::Cost; + return m_leftImpl.costPerCoeff(vectorized) + + m_rightImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } private: @@ -398,6 +431,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType> typedef typename XprType::Index Index; typedef typename internal::traits<XprType>::Scalar CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; typedef typename TensorEvaluator<IfArgType, Device>::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const @@ -425,7 +459,6 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType> template<int LoadMode> EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { - const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; internal::Selector<PacketSize> select; for (Index i = 0; i < PacketSize; ++i) { select.select[i] = m_condImpl.coeff(index+i); @@ -435,6 +468,13 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType> m_elseImpl.template packet<LoadMode>(index)); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + return m_condImpl.costPerCoeff(vectorized) + + m_thenImpl.costPerCoeff(vectorized) + .cwiseMax(m_elseImpl.costPerCoeff(vectorized)); + } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } private: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 4f4e07aaf..5c3d4d630 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -59,9 +59,16 @@ class TensorExecutor<Expression, DefaultDevice, true> { const Index size = array_prod(evaluator.dimensions()); const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size; + // Manually unroll this loop since compilers don't do it. + const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize; + for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) { + evaluator.evalPacket(i); + evaluator.evalPacket(i+PacketSize); + evaluator.evalPacket(i+2*PacketSize); + evaluator.evalPacket(i+3*PacketSize); + } const Index VectorizedSize = (size / PacketSize) * PacketSize; - - for (Index i = 0; i < VectorizedSize; i += PacketSize) { + for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) { evaluator.evalPacket(i); } for (Index i = VectorizedSize; i < size; ++i) { @@ -78,8 +85,9 @@ class TensorExecutor<Expression, DefaultDevice, true> #ifdef EIGEN_USE_THREADS template <typename Evaluator, typename Index, bool Vectorizable> struct EvalRange { - static void run(Evaluator evaluator, const Index first, const Index last) { - eigen_assert(last > first); + static void run(Evaluator* evaluator_in, const Index first, const Index last) { + Evaluator evaluator = *evaluator_in; + eigen_assert(last >= first); for (Index i = first; i < last; ++i) { evaluator.evalScalar(i); } @@ -88,28 +96,34 @@ struct EvalRange { template <typename Evaluator, typename Index> struct EvalRange<Evaluator, Index, true> { - static void run(Evaluator evaluator, const Index first, const Index last) { - eigen_assert(last > first); - + static void run(Evaluator* evaluator_in, const Index first, const Index last) { + Evaluator evaluator = *evaluator_in; + eigen_assert(last >= first); Index i = first; - static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size; + const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size; if (last - first >= PacketSize) { eigen_assert(first % PacketSize == 0); - Index lastPacket = last - (last % PacketSize); - for (; i < lastPacket; i += PacketSize) { + Index last_chunk_offset = last - 4 * PacketSize; + // Manually unroll this loop since compilers don't do it. + for (; i <= last_chunk_offset; i += 4*PacketSize) { + evaluator.evalPacket(i); + evaluator.evalPacket(i+PacketSize); + evaluator.evalPacket(i+2*PacketSize); + evaluator.evalPacket(i+3*PacketSize); + } + last_chunk_offset = last - PacketSize; + for (; i <= last_chunk_offset; i += PacketSize) { evaluator.evalPacket(i); } } - for (; i < last; ++i) { evaluator.evalScalar(i); } } }; -template<typename Expression, bool Vectorizable> -class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> -{ +template <typename Expression, bool Vectorizable> +class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> { public: typedef typename Expression::Index Index; static inline void run(const Expression& expr, const ThreadPoolDevice& device) @@ -119,24 +133,34 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { + const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1; const Index size = array_prod(evaluator.dimensions()); - - static const int PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1; - - int blocksz = std::ceil<int>(static_cast<float>(size)/device.numThreads()) + PacketSize - 1; - const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize))); - const unsigned int numblocks = static_cast<unsigned int>(size / blocksize); - - Barrier barrier(numblocks); - for (unsigned int i = 0; i < numblocks; ++i) { - device.enqueue_with_barrier(&barrier, &EvalRange<Evaluator, Index, Vectorizable>::run, evaluator, i*blocksize, (i+1)*blocksize); + size_t num_threads = device.numThreads(); +#ifdef EIGEN_USE_COST_MODEL + if (num_threads > 1) { + num_threads = TensorCostModel<ThreadPoolDevice>::numThreads( + size, evaluator.costPerCoeff(Vectorizable), num_threads); } - - if (static_cast<Index>(numblocks) * blocksize < size) { - EvalRange<Evaluator, Index, Vectorizable>::run(evaluator, numblocks * blocksize, size); +#endif + if (num_threads == 1) { + EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size); + } else { + Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1; + const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize))); + const Index numblocks = size / blocksize; + + Barrier barrier(numblocks); + for (int i = 0; i < numblocks; ++i) { + device.enqueue_with_barrier( + &barrier, &EvalRange<Evaluator, Index, Vectorizable>::run, + &evaluator, i * blocksize, (i + 1) * blocksize); + } + if (numblocks * blocksize < size) { + EvalRange<Evaluator, Index, Vectorizable>::run( + &evaluator, numblocks * blocksize, size); + } + barrier.Wait(); } - - barrier.Wait(); } evaluator.cleanup(); } @@ -147,98 +171,78 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> // GPU: the evaluation of the expression is offloaded to a GPU. #if defined(EIGEN_USE_GPU) -template <typename Expression> -class TensorExecutor<Expression, GpuDevice, false> { +template <typename Expression, bool Vectorizable> +class TensorExecutor<Expression, GpuDevice, Vectorizable> { public: typedef typename Expression::Index Index; - static EIGEN_DEVICE_FUNC void run(const Expression& expr, const GpuDevice& device); + static void run(const Expression& expr, const GpuDevice& device); }; -template <typename Expression> -class TensorExecutor<Expression, GpuDevice, true> { - public: - typedef typename Expression::Index Index; - static EIGEN_DEVICE_FUNC void run(const Expression& expr, const GpuDevice& device); -}; #if defined(__CUDACC__) +template <typename Evaluator, typename Index, bool Vectorizable> +struct EigenMetaKernelEval { + static __device__ EIGEN_ALWAYS_INLINE + void run(Evaluator& eval, Index first, Index last, Index step_size) { + for (Index i = first; i < last; i += step_size) { + eval.evalScalar(i); + } + } +}; + +template <typename Evaluator, typename Index> +struct EigenMetaKernelEval<Evaluator, Index, true> { + static __device__ EIGEN_ALWAYS_INLINE + void run(Evaluator& eval, Index first, Index last, Index step_size) { + const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size; + const Index vectorized_size = (last / PacketSize) * PacketSize; + const Index vectorized_step_size = step_size * PacketSize; + + // Use the vector path + for (Index i = first * PacketSize; i < vectorized_size; + i += vectorized_step_size) { + eval.evalPacket(i); + } + for (Index i = vectorized_size + first; i < last; i += step_size) { + eval.evalScalar(i); + } + } +}; template <typename Evaluator, typename Index> __global__ void __launch_bounds__(1024) -EigenMetaKernel_NonVectorizable(Evaluator memcopied_eval, Index size) { - // Cuda memcopies the kernel arguments. That's fine for POD, but for more - // complex types such as evaluators we should really conform to the C++ - // standard and call a proper copy constructor. - Evaluator eval(memcopied_eval); +EigenMetaKernel(Evaluator memcopied_eval, Index size) { const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; const Index step_size = blockDim.x * gridDim.x; - // Use the scalar path - for (Index i = first_index; i < size; i += step_size) { - eval.evalScalar(i); - } -} - -template <typename Evaluator, typename Index> -__global__ void -__launch_bounds__(1024) -EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) { // Cuda memcopies the kernel arguments. That's fine for POD, but for more // complex types such as evaluators we should really conform to the C++ // standard and call a proper copy constructor. Evaluator eval(memcopied_eval); - const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; - const Index step_size = blockDim.x * gridDim.x; - - // Use the vector path - const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size; - const Index vectorized_step_size = step_size * PacketSize; - const Index vectorized_size = (size / PacketSize) * PacketSize; - for (Index i = first_index * PacketSize; i < vectorized_size; - i += vectorized_step_size) { - eval.evalPacket(i); - } - for (Index i = vectorized_size + first_index; i < size; i += step_size) { - eval.evalScalar(i); - } + const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned; + EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size); } /*static*/ -template <typename Expression> -EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, false>::run(const Expression& expr, const GpuDevice& device) -{ +template <typename Expression, bool Vectorizable> +inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run( + const Expression& expr, const GpuDevice& device) { TensorEvaluator<Expression, GpuDevice> evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) - { + if (needs_assign) { const int block_size = device.maxCudaThreadsPerBlock(); - const int max_blocks = numext::mini<int>(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size); + const int max_blocks = device.getNumCudaMultiProcessors() * + device.maxCudaThreadsPerMultiProcessor() / block_size; const Index size = array_prod(evaluator.dimensions()); - // Create a least one block to ensure we won't crash if we're called with tensors of size 0. - const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1); - LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size); - } - evaluator.cleanup(); -} - + // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0. + const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1); -/*static*/ -template<typename Expression> -EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, true>::run(const Expression& expr, const GpuDevice& device) -{ - TensorEvaluator<Expression, GpuDevice> evaluator(expr, device); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) - { - const int block_size = device.maxCudaThreadsPerBlock(); - const int max_blocks = numext::mini<int>(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size); - const Index size = array_prod(evaluator.dimensions()); - // Create a least one block to ensure we won't crash if we're called with tensors of size 0. - const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1); - LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size); + LAUNCH_CUDA_KERNEL( + (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>), + num_blocks, block_size, 0, device, evaluator, size); } evaluator.cleanup(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index d6db45ade..ece2ed91b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -129,6 +129,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D typedef typename internal::conditional<FFTResultType == RealPart || FFTResultType == ImagPart, RealScalar, ComplexScalar>::type OutputScalar; typedef OutputScalar CoeffReturnType; typedef typename PacketType<OutputScalar, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { IsAligned = false, @@ -176,7 +177,6 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { if (m_data) { m_device.deallocate(m_data); @@ -189,11 +189,17 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D return m_data[index]; } - template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const { + template <int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType + packet(Index index) const { return internal::ploadt<PacketReturnType, LoadMode>(m_data + index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index 14f480901..1ce53ad69 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -83,10 +83,14 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device> typedef TensorForcedEvalOp<ArgType> XprType; typedef typename ArgType::Scalar Scalar; typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { IsAligned = true, - PacketAccess = (internal::packet_traits<Scalar>::size > 1), + PacketAccess = (PacketSize > 1), Layout = TensorEvaluator<ArgType, Device>::Layout, RawAccess = true }; @@ -95,10 +99,6 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device> : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) { } - typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { @@ -132,6 +132,10 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device> return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return m_buffer; } private: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index b7c13f67f..33cd00391 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -64,7 +64,7 @@ struct scalar_sigmoid_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const { const T one = T(1); - return one / (one + std::exp(-x)); + return one / (one + numext::exp(-x)); } template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -158,8 +158,8 @@ template <typename T> struct MeanReducer } protected: - int scalarCount_; - int packetCount_; + DenseIndex scalarCount_; + DenseIndex packetCount_; }; template <typename T> struct MaxReducer @@ -594,6 +594,8 @@ template <> class UniformRandomGenerator<std::complex<double> > { template <typename Scalar> struct functor_traits<UniformRandomGenerator<Scalar> > { enum { + // Rough estimate. + Cost = 100 * NumTraits<Scalar>::MulCost, PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess }; }; @@ -774,6 +776,8 @@ template <typename T> class NormalRandomGenerator { template <typename Scalar> struct functor_traits<NormalRandomGenerator<Scalar> > { enum { + // Rough estimate. + Cost = 100 * NumTraits<Scalar>::MulCost, PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess }; }; @@ -799,7 +803,7 @@ class GaussianGenerator { T offset = coordinates[i] - m_means[i]; tmp += offset * offset / m_two_sigmas[i]; } - return std::exp(-tmp); + return numext::exp(-tmp); } private: @@ -807,6 +811,15 @@ class GaussianGenerator { array<T, NumDims> m_two_sigmas; }; +template <typename T, typename Index, size_t NumDims> +struct functor_traits<GaussianGenerator<T, Index, NumDims> > { + enum { + Cost = NumDims * (2 * NumTraits<T>::AddCost + NumTraits<T>::MulCost + + functor_traits<scalar_quotient_op<T, T> >::Cost) + + functor_traits<scalar_exp_op<T> >::Cost, + PacketAccess = GaussianGenerator<T, Index, NumDims>::PacketAccess + }; +}; } // end namespace internal } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index e4154bd0b..8ff7d5815 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -145,6 +145,14 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device> return rslt; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool) const { + // TODO(rmlarsen): This is just a placeholder. Define interface to make + // generators return their cost. + return TensorOpCost(0, 0, TensorOpCost::AddCost<Scalar>() + + TensorOpCost::MulCost<Scalar>()); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 72594a05c..bafcc67bd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -159,6 +159,9 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> typedef TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> Self; typedef TensorEvaluator<ArgType, Device> Impl; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { IsAligned = false, @@ -307,9 +310,6 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> } } - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { @@ -362,15 +362,14 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const Index packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1) { return packetWithPossibleZero(index); } - const Index indices[2] = {index, index + packetSize - 1}; + const Index indices[2] = {index, index + PacketSize - 1}; const Index patchIndex = indices[0] / m_fastPatchStride; if (patchIndex != indices[1] / m_fastPatchStride) { return packetWithPossibleZero(index); @@ -434,12 +433,23 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> Index rowInflateStride() const { return m_row_inflate_strides; } Index colInflateStride() const { return m_col_inflate_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + // We conservatively estimate the cost for the code path where the computed + // index is inside the original image and + // TensorEvaluator<ArgType, Device>::CoordAccess is false. + const double compute_cost = 3 * TensorOpCost::DivCost<Index>() + + 6 * TensorOpCost::MulCost<Index>() + + 8 * TensorOpCost::MulCost<Index>(); + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { - const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize]; - for (int i = 0; i < packetSize; ++i) { + EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } PacketReturnType rslt = internal::pload<PacketReturnType>(values); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index 368e6f685..de2f67d74 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -81,6 +81,10 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device> typedef typename XprType::Index Index; static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false, @@ -123,11 +127,6 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device> } } - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { @@ -190,18 +189,30 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize]; - for (int i = 0; i < packetSize; ++i) { + EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } PacketReturnType rslt = internal::pload<PacketReturnType>(values); return rslt; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double compute_cost = NumDims * (3 * TensorOpCost::DivCost<Index>() + + 3 * TensorOpCost::MulCost<Index>() + + 2 * TensorOpCost::AddCost<Index>()); + const double input_size = m_impl.dimensions().TotalSize(); + const double output_size = m_dimensions.TotalSize(); + if (output_size == 0) + return TensorOpCost(); + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(sizeof(CoeffReturnType) * input_size / output_size, 0, + compute_cost, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index 9b85914ff..63a8476ef 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -155,6 +155,10 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device> return m_impl.template packet<LoadMode>(index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return m_impl.data(); } const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index 6af2d45d4..cd04716bd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -24,9 +24,17 @@ const T2& choose(Cond<false>, const T1&, const T2& second) { return second; } -template <typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + +template <typename T, typename X, typename Y> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T divup(const X x, const Y y) { + return static_cast<T>((x + y - 1) / y); +} + +template <typename T> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T divup(const T x, const T y) { - return (x + y - 1) / y; + return static_cast<T>((x + y - 1) / y); } template <size_t n> struct max_n_1 { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index a9c222ea0..bfa65a607 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -142,6 +142,10 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> return m_impl.template packet<LoadMode>(index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast<Scalar*>(m_impl.data()); } const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } @@ -449,6 +453,11 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { Scalar* result = m_impl.data(); if (result) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index a595a0175..88b838b27 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -87,6 +87,10 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device typedef typename XprType::Index Index; static const int NumDims = internal::array_size<PaddingDimensions>::value; typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { IsAligned = false, @@ -129,10 +133,6 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device } } - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { @@ -224,21 +224,51 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device return m_impl.coeff(inputIndex); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + TensorOpCost cost = m_impl.costPerCoeff(vectorized); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = 0; i < NumDims; ++i) + updateCostPerDimension(cost, i, i == 0); + } else { + for (int i = NumDims - 1; i >= 0; --i) + updateCostPerDimension(cost, i, i == NumDims - 1); + } + return cost; + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + private: + void updateCostPerDimension(TensorOpCost& cost, int i, bool first) const { + const double in = static_cast<double>(m_impl.dimensions()[i]); + const double out = in + m_padding[i].first + m_padding[i].second; + if (out == 0) + return; + const double reduction = in / out; + cost *= reduction; + if (first) { + cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost<Index>() + + reduction * (1 * TensorOpCost::AddCost<Index>())); + } else { + cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost<Index>() + + 2 * TensorOpCost::MulCost<Index>() + + reduction * (2 * TensorOpCost::MulCost<Index>() + + 1 * TensorOpCost::DivCost<Index>())); + } + } + protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const { - const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); const Index initialIndex = index; Index inputIndex = 0; for (int i = NumDims - 1; i > 0; --i) { const Index first = index; - const Index last = index + packetSize - 1; + const Index last = index + PacketSize - 1; const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i]; const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i]; const Index lastPaddedRight = m_outputStrides[i+1]; @@ -263,7 +293,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device } } - const Index last = index + packetSize - 1; + const Index last = index + PacketSize - 1; const Index first = index; const Index lastPaddedLeft = m_padding[0].first; const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second); @@ -288,16 +318,15 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const { - const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); const Index initialIndex = index; Index inputIndex = 0; for (int i = 0; i < NumDims - 1; ++i) { const Index first = index; - const Index last = index + packetSize - 1; + const Index last = index + PacketSize - 1; const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1]; const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1]; const Index lastPaddedRight = m_outputStrides[i]; @@ -322,7 +351,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device } } - const Index last = index + packetSize - 1; + const Index last = index + PacketSize - 1; const Index first = index; const Index lastPaddedLeft = m_padding[NumDims-1].first; const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second); @@ -347,9 +376,8 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { - const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize]; - for (int i = 0; i < packetSize; ++i) { + EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } PacketReturnType rslt = internal::pload<PacketReturnType>(values); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 0bf460f4e..a87e45330 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -85,6 +85,10 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value + 1; typedef DSizes<Index, NumDims> Dimensions; typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; + enum { IsAligned = false, @@ -137,9 +141,6 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> } } - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { @@ -183,12 +184,11 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); Index output_stride_index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? NumDims - 1 : 0; - Index indices[2] = {index, index + packetSize - 1}; + Index indices[2] = {index, index + PacketSize - 1}; Index patchIndices[2] = {indices[0] / m_outputStrides[output_stride_index], indices[1] / m_outputStrides[output_stride_index]}; Index patchOffsets[2] = {indices[0] - patchIndices[0] * m_outputStrides[output_stride_index], @@ -229,15 +229,15 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> inputIndices[0] += (patchIndices[0] + patchOffsets[0]); inputIndices[1] += (patchIndices[1] + patchOffsets[1]); - if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + if (inputIndices[1] - inputIndices[0] == PacketSize - 1) { PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]); return rslt; } else { - EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; + EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize]; values[0] = m_impl.coeff(inputIndices[0]); - values[packetSize-1] = m_impl.coeff(inputIndices[1]); - for (int i = 1; i < packetSize-1; ++i) { + values[PacketSize-1] = m_impl.coeff(inputIndices[1]); + for (int i = 1; i < PacketSize-1; ++i) { values[i] = coeff(index+i); } PacketReturnType rslt = internal::pload<PacketReturnType>(values); @@ -245,6 +245,14 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double compute_cost = NumDims * (TensorOpCost::DivCost<Index>() + + TensorOpCost::MulCost<Index>() + + 2 * TensorOpCost::AddCost<Index>()); + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 00f870328..885295f0a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -214,7 +214,7 @@ struct FullReducer { static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) { const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions()); - *output = InnerMostDimReducer<Self, Op>::reduce(self, 0, num_coeffs, reducer); + *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer); } }; @@ -222,18 +222,19 @@ struct FullReducer { #ifdef EIGEN_USE_THREADS // Multithreaded full reducers template <typename Self, typename Op, - bool vectorizable = (Self::InputPacketAccess & Op::PacketAccess)> + bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)> struct FullReducerShard { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer, typename Self::CoeffReturnType* output) { - *output = InnerMostDimReducer<Self, Op, vectorizable>::reduce( + *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce( self, firstIndex, numValuesToReduce, reducer); } }; -template <typename Self, typename Op> -struct FullReducer<Self, Op, ThreadPoolDevice, false> { +// Multithreaded full reducer +template <typename Self, typename Op, bool Vectorizable> +struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> { static const bool HasOptimizedImplementation = !Op::IsStateful; static const int PacketSize = unpacket_traits<typename Self::PacketReturnType>::size; @@ -247,79 +248,44 @@ struct FullReducer<Self, Op, ThreadPoolDevice, false> { *output = reducer.finalize(reducer.initialize()); return; } - const std::size_t num_threads = device.numThreads(); - if (num_threads == 1) { - *output = InnerMostDimReducer<Self, Op, false>::reduce(self, 0, num_coeffs, reducer); - return; - } else { - const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs) / num_threads); - const unsigned int numblocks = blocksize > 0 ? static_cast<unsigned int>(num_coeffs / blocksize) : 0; - eigen_assert(num_coeffs >= static_cast<Index>(numblocks) * blocksize); - - Barrier barrier(numblocks); - MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize()); - for (unsigned int i = 0; i < numblocks; ++i) { - device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, false>::run, self, - i * blocksize, blocksize, reducer, &shards[i]); - } - - typename Self::CoeffReturnType finalShard; - if (static_cast<Index>(numblocks) * blocksize < num_coeffs) { - finalShard = InnerMostDimReducer<Self, Op, false>::reduce( - self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer); - } else { - finalShard = reducer.initialize(); - } - barrier.Wait(); - for (unsigned int i = 0; i < numblocks; ++i) { - reducer.reduce(shards[i], &finalShard); - } - *output = reducer.finalize(finalShard); - } - } -}; - -template <typename Self, typename Op> -struct FullReducer<Self, Op, ThreadPoolDevice, true> { - static const bool HasOptimizedImplementation = !Op::IsStateful; - static const int PacketSize = - unpacket_traits<typename Self::PacketReturnType>::size; - - // launch one reducer per thread and accumulate the result. - static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device, - typename Self::CoeffReturnType* output) { - typedef typename Self::Index Index; - const Index num_coeffs = array_prod(self.m_impl.dimensions()); - if (num_coeffs == 0) { - *output = reducer.finalize(reducer.initialize()); - return; - } - const std::size_t num_threads = device.numThreads(); +#ifdef EIGEN_USE_COST_MODEL + const TensorOpCost cost = + self.m_impl.costPerCoeff(Vectorizable) + + TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable, + PacketSize); + const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads( + num_coeffs, cost, device.numThreads()); +#else + const int num_threads = device.numThreads(); +#endif if (num_threads == 1) { - *output = InnerMostDimReducer<Self, Op, true>::reduce(self, 0, num_coeffs, reducer); + *output = + InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer); return; } - const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs) / num_threads); - const unsigned int numblocks = blocksize > 0 ? static_cast<unsigned int>(num_coeffs / blocksize) : 0; - eigen_assert(num_coeffs >= static_cast<Index>(numblocks) * blocksize); + const Index blocksize = + std::floor<Index>(static_cast<float>(num_coeffs) / num_threads); + const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0; + eigen_assert(num_coeffs >= numblocks * blocksize); Barrier barrier(numblocks); MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize()); - for (unsigned int i = 0; i < numblocks; ++i) { - device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, true>::run, + for (Index i = 0; i < numblocks; ++i) { + device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run, self, i * blocksize, blocksize, reducer, &shards[i]); } typename Self::CoeffReturnType finalShard; - if (static_cast<Index>(numblocks) * blocksize < num_coeffs) { - finalShard = InnerMostDimReducer<Self, Op, true>::reduce( - self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer); + if (numblocks * blocksize < num_coeffs) { + finalShard = InnerMostDimReducer<Self, Op, Vectorizable>::reduce( + self, numblocks * blocksize, num_coeffs - numblocks * blocksize, + reducer); } else { finalShard = reducer.initialize(); } - barrier.Wait(); - for (unsigned int i = 0; i < numblocks; ++i) { + + for (Index i = 0; i < numblocks; ++i) { reducer.reduce(shards[i], &finalShard); } *output = reducer.finalize(finalShard); @@ -411,6 +377,9 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> typedef typename XprType::Scalar Scalar; typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> Self; static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess; + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { IsAligned = false, @@ -495,8 +464,13 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static bool size_large_enough(Index total_size) { +#ifndef EIGEN_USE_COST_MODEL + return total_size > 1024 * 1024; +#else + return true || total_size; +#endif + } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) { m_impl.evalSubExprsIfNeeded(NULL); @@ -504,7 +478,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> // Use the FullReducer if possible. if (RunningFullReduction && internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation && ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || - (!RunningOnGPU && (internal::array_prod(m_impl.dimensions()) > 1024 * 1024)))) { + (!RunningOnGPU && size_large_enough(internal::array_prod(m_impl.dimensions()))))) { bool need_assign = false; if (!data) { @@ -584,16 +558,15 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; if (ReducingInnerMostDims) { const Index num_values_to_reduce = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1]; const Index firstIndex = firstInput(index); - for (Index i = 0; i < packetSize; ++i) { + for (Index i = 0; i < PacketSize; ++i) { Op reducer(m_reducer); values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce, num_values_to_reduce, reducer); @@ -602,18 +575,18 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> const Index firstIndex = firstInput(index); const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumOutputDims - 1; // TBD: extend this the the n innermost dimensions that we preserve. - if (((firstIndex % m_dimensions[innermost_dim]) + packetSize - 1) < m_dimensions[innermost_dim]) { + if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) { Op reducer(m_reducer); typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>(); internal::InnerMostDimPreserver<NumReducedDims-1, Self, Op>::reduce(*this, firstIndex, reducer, &accum); return reducer.finalizePacket(accum); } else { - for (int i = 0; i < packetSize; ++i) { + for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index + i); } } } else { - for (int i = 0; i < packetSize; ++i) { + for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index + i); } } @@ -621,6 +594,18 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> return rslt; } + // Must be called after evalSubExprsIfNeeded(). + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + if (RunningFullReduction && m_result) { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } else { + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const double compute_cost = num_values_to_reduce * internal::functor_traits<Op>::Cost; + return m_impl.costPerCoeff(vectorized) * num_values_to_reduce + + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } private: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index c33d54d6e..fd2587dd5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -130,13 +130,18 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> { assert(false && "Should only be called on floats"); } - static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { + static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { typedef typename Self::Index Index; const Index num_coeffs = array_prod(self.m_impl.dimensions()); + // Don't crash when we're called with an input tensor of size 0. + if (num_coeffs == 0) { + return; + } + const int block_size = 256; const int num_per_thread = 128; - const int num_blocks = std::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread)); + const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread); if (num_blocks > 1) { // We initialize the outputs outside the reduction kernel when we can't be sure that there @@ -231,7 +236,7 @@ struct InnerReducer<Self, Op, GpuDevice> { return true; } - static EIGEN_DEVICE_FUNC bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { typedef typename Self::Index Index; // It's faster to use the usual code. @@ -310,7 +315,7 @@ struct OuterReducer<Self, Op, GpuDevice> { return true; } - static EIGEN_DEVICE_FUNC bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { typedef typename Self::Index Index; // It's faster to use the usual code. diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index 96d92038c..1a59cc8f7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -104,6 +104,10 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device typedef typename XprType::Index Index; static const int NumDims = internal::array_size<ReverseDimensions>::value; typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { IsAligned = false, @@ -135,10 +139,6 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device } } - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -195,21 +195,33 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); // TODO(ndjaitly): write a better packing routine that uses // local structure. EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type - values[packetSize]; - for (int i = 0; i < packetSize; ++i) { + values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } PacketReturnType rslt = internal::pload<PacketReturnType>(values); return rslt; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() + + 2 * TensorOpCost::MulCost<Index>() + + TensorOpCost::DivCost<Index>()); + for (int i = 0; i < NumDims; ++i) { + if (m_reverse[i]) { + compute_cost += 2 * TensorOpCost::AddCost<Index>(); + } + } + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: @@ -246,6 +258,7 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return this->m_dimensions; } @@ -256,14 +269,13 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device> template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); // This code is pilfered from TensorMorphing.h - EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; + EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize]; internal::pstore<CoeffReturnType, PacketReturnType>(values, x); - for (int i = 0; i < packetSize; ++i) { + for (int i = 0; i < PacketSize; ++i) { this->coeffRef(index+i) = values[i]; } } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index c19833ea5..e76533710 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -104,6 +104,9 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; typedef DSizes<Index, NumDims> Dimensions; typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { IsAligned = false, @@ -145,9 +148,6 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> } } - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { @@ -166,18 +166,25 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize]; - for (int i = 0; i < packetSize; ++i) { + EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } PacketReturnType rslt = internal::pload<PacketReturnType>(values); return rslt; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() + + 2 * TensorOpCost::MulCost<Index>() + + TensorOpCost::DivCost<Index>()); + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: @@ -219,6 +226,9 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device> static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; typedef DSizes<Index, NumDims> Dimensions; typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { IsAligned = false, @@ -230,9 +240,6 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device> : Base(op, device) { } - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { return this->m_impl.coeffRef(this->srcCoeff(index)); @@ -241,12 +248,11 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device> template <int StoreMode> EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - static const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; internal::pstore<CoeffReturnType, PacketReturnType>(values, x); - for (int i = 0; i < packetSize; ++i) { + for (int i = 0; i < PacketSize; ++i) { this->coeffRef(index+i) = values[i]; } } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 085f8fd3d..52b7d216a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -103,6 +103,10 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> typedef typename XprType::Index Index; static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false, @@ -142,10 +146,6 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> } } - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { @@ -164,12 +164,11 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); Index inputIndices[] = {0, 0}; - Index indices[] = {index, index + packetSize - 1}; + Index indices[] = {index, index + PacketSize - 1}; if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { const Index idx0 = indices[0] / m_outputStrides[i]; @@ -193,15 +192,15 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> inputIndices[0] += indices[0] * m_inputStrides[NumDims-1]; inputIndices[1] += indices[1] * m_inputStrides[NumDims-1]; } - if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + if (inputIndices[1] - inputIndices[0] == PacketSize - 1) { PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]); return rslt; } else { - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; values[0] = m_impl.coeff(inputIndices[0]); - values[packetSize-1] = m_impl.coeff(inputIndices[1]); - for (int i = 1; i < packetSize-1; ++i) { + values[PacketSize-1] = m_impl.coeff(inputIndices[1]); + for (int i = 1; i < PacketSize-1; ++i) { values[i] = coeff(index+i); } PacketReturnType rslt = internal::pload<PacketReturnType>(values); @@ -209,6 +208,20 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + double compute_cost = (NumDims - 1) * (TensorOpCost::AddCost<Index>() + + TensorOpCost::MulCost<Index>() + + TensorOpCost::DivCost<Index>()) + + TensorOpCost::MulCost<Index>(); + if (vectorized) { + compute_cost *= 2; // packet() computes two indices + } + const int innerDim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : (NumDims - 1); + return m_impl.costPerCoeff(vectorized && m_inputStrides[innerDim] == 1) + + // Computation is not vectorized per se, but it is done once per packet. + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: @@ -266,6 +279,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { @@ -275,12 +289,11 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device> template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < this->dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < this->dimensions().TotalSize()); Index inputIndices[] = {0, 0}; - Index indices[] = {index, index + packetSize - 1}; + Index indices[] = {index, index + PacketSize - 1}; if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { const Index idx0 = indices[0] / this->m_outputStrides[i]; @@ -304,15 +317,15 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device> inputIndices[0] += indices[0] * this->m_inputStrides[NumDims-1]; inputIndices[1] += indices[1] * this->m_inputStrides[NumDims-1]; } - if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + if (inputIndices[1] - inputIndices[0] == PacketSize - 1) { this->m_impl.template writePacket<Unaligned>(inputIndices[0], x); } else { - EIGEN_ALIGN_MAX Scalar values[packetSize]; + EIGEN_ALIGN_MAX Scalar values[PacketSize]; internal::pstore<Scalar, PacketReturnType>(values, x); this->m_impl.coeffRef(inputIndices[0]) = values[0]; - this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1]; - for (int i = 1; i < packetSize-1; ++i) { + this->m_impl.coeffRef(inputIndices[1]) = values[PacketSize-1]; + for (int i = 1; i < PacketSize-1; ++i) { this->coeffRef(index+i) = values[i]; } } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h index 3e56589c3..5950f38e2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h @@ -53,9 +53,7 @@ struct TensorUInt128 template<typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE explicit TensorUInt128(const T& x) : high(0), low(x) { - typedef typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type UnsignedT; - typedef typename conditional<sizeof(LOW) == 8, uint64_t, uint32_t>::type UnsignedLow; - eigen_assert(static_cast<UnsignedT>(x) <= static_cast<UnsignedLow>(NumTraits<LOW>::highest())); + eigen_assert((static_cast<typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type>(x) <= static_cast<typename conditional<sizeof(LOW) == 8, uint64_t, uint32_t>::type>(NumTraits<LOW>::highest()))); eigen_assert(x >= 0); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index 5bdfbad46..e735fc76f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -171,6 +171,9 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D static const int NumDims = NumInputDims + 1; typedef DSizes<Index, NumDims> Dimensions; typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { IsAligned = false, @@ -336,9 +339,6 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D } } - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { @@ -408,16 +408,15 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const Index packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1 || m_in_plane_strides != 1 || m_plane_inflate_strides != 1) { return packetWithPossibleZero(index); } - const Index indices[2] = {index, index + packetSize - 1}; + const Index indices[2] = {index, index + PacketSize - 1}; const Index patchIndex = indices[0] / m_fastPatchStride; if (patchIndex != indices[1] / m_fastPatchStride) { return packetWithPossibleZero(index); @@ -495,6 +494,14 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D return packetWithPossibleZero(index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double compute_cost = + 10 * TensorOpCost::DivCost<Index>() + 21 * TensorOpCost::MulCost<Index>() + + 8 * TensorOpCost::AddCost<Index>(); + return TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } @@ -518,9 +525,8 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { - const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize]; - for (int i = 0; i < packetSize; ++i) { + EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } PacketReturnType rslt = internal::pload<PacketReturnType>(values); |