// This file is part of Eigen, a lightweight C++ template library // for linear algebra. // // Copyright (C) 2014 Benoit Steiner // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. #ifndef EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H #define EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H namespace Eigen { /** \class TensorBroadcasting * \ingroup CXX11_Tensor_Module * * \brief Tensor broadcasting class. * * */ namespace internal { template struct traits > : public traits { typedef typename XprType::Scalar Scalar; typedef traits XprTraits; typedef typename XprTraits::StorageKind StorageKind; typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; typedef typename XprTraits::PointerType PointerType; }; template struct eval, Eigen::Dense> { typedef const TensorBroadcastingOp& type; }; template struct nested, 1, typename eval >::type> { typedef TensorBroadcastingOp type; }; template struct is_input_scalar { static const bool value = false; }; template <> struct is_input_scalar > { static const bool value = true; }; #ifndef EIGEN_EMULATE_CXX11_META_H template struct is_input_scalar > { static const bool value = (Sizes::total_size == 1); }; #endif } // end namespace internal template class TensorBroadcastingOp : public TensorBase, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits::Scalar Scalar; typedef typename Eigen::NumTraits::Real RealScalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBroadcastingOp(const XprType& expr, const Broadcast& broadcast) : m_xpr(expr), m_broadcast(broadcast) {} EIGEN_DEVICE_FUNC const Broadcast& broadcast() const { return m_broadcast; } EIGEN_DEVICE_FUNC const typename internal::remove_all::type& expression() const { return m_xpr; } protected: typename XprType::Nested m_xpr; const Broadcast m_broadcast; }; // Eval as rvalue template struct TensorEvaluator, Device> { typedef TensorBroadcastingOp XprType; typedef typename XprType::Index Index; static const int NumDims = internal::array_size::Dimensions>::value; typedef DSizes Dimensions; typedef typename XprType::Scalar Scalar; typedef typename TensorEvaluator::Dimensions InputDimensions; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; static const int PacketSize = internal::unpacket_traits::size; bool isCopy= false, nByOne = false, oneByN = false; enum { IsAligned = true, PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_broadcast(op.broadcast()),m_impl(op.expression(), device) { // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar // and store the result in a scalar. Instead one should reshape the scalar into a a N-D // tensor with N >= 1 of 1 element first and then broadcast. EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); const InputDimensions& input_dims = m_impl.dimensions(); isCopy = true; for (int i = 0; i < NumDims; ++i) { eigen_assert(input_dims[i] > 0); m_dimensions[i] = input_dims[i] * m_broadcast[i]; if (m_broadcast[i] != 1) { isCopy = false; } } if (static_cast(Layout) == static_cast(ColMajor)) { m_inputStrides[0] = 1; m_outputStrides[0] = 1; for (int i = 1; i < NumDims; ++i) { m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; } } else { m_inputStrides[NumDims-1] = 1; m_outputStrides[NumDims-1] = 1; for (int i = NumDims-2; i >= 0; --i) { m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; } } if (input_dims[0] == 1) { oneByN = true; for (int i = 1; i < NumDims; ++i) { if (m_broadcast[i] != 1) { oneByN = false; break; } } } else if (input_dims[NumDims-1] == 1) { nByOne = true; for (int i = 0; i < NumDims-1; ++i) { if (m_broadcast[i] != 1) { nByOne = false; break; } } } // Handle special format like NCHW, its input shape is '[1, N..., 1]' and // broadcast shape is '[N, 1..., N]' if (!oneByN && !nByOne) { if (input_dims[0] == 1 && input_dims[NumDims-1] == 1 && NumDims > 2) { nByOne = true; oneByN = true; for (int i = 1; i < NumDims-1; ++i) { if (m_broadcast[i] != 1) { nByOne = false; oneByN = false; break; } } } } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const { if (internal::is_input_scalar::type>::value) { return m_impl.coeff(0); } if (static_cast(Layout) == static_cast(ColMajor)) { if (isCopy) { return m_impl.coeff(index); } else { return coeffColMajor(index); } } else { if (isCopy) { return m_impl.coeff(index); } else { return coeffRowMajor(index); } } } // TODO: attempt to speed this up. The integer divisions and modulo are slow EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const { Index inputIndex = 0; for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; if (internal::index_statically_eq(i, 1)) { eigen_assert(idx < m_impl.dimensions()[i]); inputIndex += idx * m_inputStrides[i]; } else { if (internal::index_statically_eq(i, 1)) { eigen_assert(idx % m_impl.dimensions()[i] == 0); } else { inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; } } index -= idx * m_outputStrides[i]; } if (internal::index_statically_eq(0, 1)) { eigen_assert(index < m_impl.dimensions()[0]); inputIndex += index; } else { if (internal::index_statically_eq(0, 1)) { eigen_assert(index % m_impl.dimensions()[0] == 0); } else { inputIndex += (index % m_impl.dimensions()[0]); } } return m_impl.coeff(inputIndex); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const { Index inputIndex = 0; for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_outputStrides[i]; if (internal::index_statically_eq(i, 1)) { eigen_assert(idx < m_impl.dimensions()[i]); inputIndex += idx * m_inputStrides[i]; } else { if (internal::index_statically_eq(i, 1)) { eigen_assert(idx % m_impl.dimensions()[i] == 0); } else { inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; } } index -= idx * m_outputStrides[i]; } if (internal::index_statically_eq(NumDims-1, 1)) { eigen_assert(index < m_impl.dimensions()[NumDims-1]); inputIndex += index; } else { if (internal::index_statically_eq(NumDims-1, 1)) { eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0); } else { inputIndex += (index % m_impl.dimensions()[NumDims-1]); } } return m_impl.coeff(inputIndex); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const { if (internal::is_input_scalar::type>::value) { return internal::pset1(m_impl.coeff(0)); } if (static_cast(Layout) == static_cast(ColMajor)) { if (isCopy) { return m_impl.template packet(index); } else if (oneByN && !nByOne) { return packetNByOne(index); } else if (!oneByN && nByOne) { return packetOneByN(index); } else if (oneByN && nByOne) { return packetOneByNByOne(index); } else { return packetColMajor(index); } } else { if (isCopy) { return m_impl.template packet(index); } else if (oneByN && !nByOne) { return packetOneByN(index); } else if (!oneByN && nByOne) { return packetNByOne(index); } else if (oneByN && nByOne) { return packetOneByNByOne(index); } else { return packetRowMajor(index); } } } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByNByOne (Index index) const { EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; Index startDim, endDim; Index inputIndex, outputOffset, batchedIndex; if (static_cast(Layout) == static_cast(ColMajor)) { startDim = NumDims - 1; endDim = 1; } else { startDim = 0; endDim = NumDims - 2; } batchedIndex = index % m_outputStrides[startDim]; inputIndex = batchedIndex / m_outputStrides[endDim]; outputOffset = batchedIndex % m_outputStrides[endDim]; if (outputOffset + PacketSize <= m_outputStrides[endDim]) { values[0] = m_impl.coeff(inputIndex); return internal::pload1(values); } else { for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) { if (outputOffset + cur < m_outputStrides[endDim]) { values[i] = m_impl.coeff(inputIndex); } else { ++inputIndex; inputIndex = (inputIndex == m_inputStrides[startDim] ? 0 : inputIndex); values[i] = m_impl.coeff(inputIndex); outputOffset = 0; cur = 0; } } return internal::pload(values); } } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByN(Index index) const { EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); Index dim, inputIndex; if (static_cast(Layout) == static_cast(ColMajor)) { dim = NumDims - 1; } else { dim = 0; } inputIndex = index % m_inputStrides[dim]; if (inputIndex + PacketSize <= m_inputStrides[dim]) { return m_impl.template packet(inputIndex); } else { EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; for (int i = 0; i < PacketSize; ++i) { if (inputIndex > m_inputStrides[dim]-1) { inputIndex = 0; } values[i] = m_impl.coeff(inputIndex++); } return internal::pload(values); } } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetNByOne(Index index) const { EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; Index dim, inputIndex, outputOffset; if (static_cast(Layout) == static_cast(ColMajor)) { dim = 1; } else { dim = NumDims - 2; } inputIndex = index / m_outputStrides[dim]; outputOffset = index % m_outputStrides[dim]; if (outputOffset + PacketSize <= m_outputStrides[dim]) { values[0] = m_impl.coeff(inputIndex); return internal::pload1(values); } else { for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) { if (outputOffset + cur < m_outputStrides[dim]) { values[i] = m_impl.coeff(inputIndex); } else { values[i] = m_impl.coeff(++inputIndex); outputOffset = 0; cur = 0; } } return internal::pload(values); } } // Ignore the LoadMode and always use unaligned loads since we can't guarantee // the alignment at compile time. template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const { EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); const Index originalIndex = index; Index inputIndex = 0; for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; if (internal::index_statically_eq(i, 1)) { eigen_assert(idx < m_impl.dimensions()[i]); inputIndex += idx * m_inputStrides[i]; } else { if (internal::index_statically_eq(i, 1)) { eigen_assert(idx % m_impl.dimensions()[i] == 0); } else { inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; } } index -= idx * m_outputStrides[i]; } Index innermostLoc; if (internal::index_statically_eq(0, 1)) { eigen_assert(index < m_impl.dimensions()[0]); innermostLoc = index; } else { if (internal::index_statically_eq(0, 1)) { eigen_assert(index % m_impl.dimensions()[0] == 0); innermostLoc = 0; } else { innermostLoc = index % m_impl.dimensions()[0]; } } inputIndex += innermostLoc; // Todo: this could be extended to the second dimension if we're not // broadcasting alongside the first dimension, and so on. if (innermostLoc + PacketSize <= m_impl.dimensions()[0]) { return m_impl.template packet(inputIndex); } else { EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; values[0] = m_impl.coeff(inputIndex); for (int i = 1; i < PacketSize; ++i) { if (innermostLoc + i < m_impl.dimensions()[0]) { values[i] = m_impl.coeff(inputIndex+i); } else { values[i] = coeffColMajor(originalIndex+i); } } PacketReturnType rslt = internal::pload(values); return rslt; } } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const { EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); const Index originalIndex = index; Index inputIndex = 0; for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_outputStrides[i]; if (internal::index_statically_eq(i, 1)) { eigen_assert(idx < m_impl.dimensions()[i]); inputIndex += idx * m_inputStrides[i]; } else { if (internal::index_statically_eq(i, 1)) { eigen_assert(idx % m_impl.dimensions()[i] == 0); } else { inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; } } index -= idx * m_outputStrides[i]; } Index innermostLoc; if (internal::index_statically_eq(NumDims-1, 1)) { eigen_assert(index < m_impl.dimensions()[NumDims-1]); innermostLoc = index; } else { if (internal::index_statically_eq(NumDims-1, 1)) { eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0); innermostLoc = 0; } else { innermostLoc = index % m_impl.dimensions()[NumDims-1]; } } inputIndex += innermostLoc; // Todo: this could be extended to the second dimension if we're not // broadcasting alongside the first dimension, and so on. if (innermostLoc + PacketSize <= m_impl.dimensions()[NumDims-1]) { return m_impl.template packet(inputIndex); } else { EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; values[0] = m_impl.coeff(inputIndex); for (int i = 1; i < PacketSize; ++i) { if (innermostLoc + i < m_impl.dimensions()[NumDims-1]) { values[i] = m_impl.coeff(inputIndex+i); } else { values[i] = coeffRowMajor(originalIndex+i); } } PacketReturnType rslt = internal::pload(values); return rslt; } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { double compute_cost = TensorOpCost::AddCost(); if (!isCopy && NumDims > 0) { for (int i = NumDims - 1; i > 0; --i) { compute_cost += TensorOpCost::DivCost(); if (internal::index_statically_eq(i, 1)) { compute_cost += TensorOpCost::MulCost() + TensorOpCost::AddCost(); } else { if (!internal::index_statically_eq(i, 1)) { compute_cost += TensorOpCost::MulCost() + TensorOpCost::ModCost() + TensorOpCost::AddCost(); } } compute_cost += TensorOpCost::MulCost() + TensorOpCost::AddCost(); } } return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); } EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } const TensorEvaluator& impl() const { return m_impl; } Broadcast functor() const { return m_broadcast; } protected: const Broadcast m_broadcast; Dimensions m_dimensions; array m_outputStrides; array m_inputStrides; TensorEvaluator m_impl; }; } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H