| author | Eugene Zhulenev <ezhulenev@google.com> | 2018-07-23 15:50:55 -0700 |
|---|---|---|
| committer | Eugene Zhulenev <ezhulenev@google.com> | 2018-07-23 15:50:55 -0700 |
| commit | d55efa6f0f9ab9ec758c6b40204be476c01b7528 (patch) | |
| tree | a779840d7eb3990ff7da5681dcc3858a48d6fdb6 | |
| parent | 34a75c3c5cec4e2bfe5c68164f8c3372f6ae5ecb (diff) | |
TensorBlockIO
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h  | 546
-rw-r--r--  unsupported/test/cxx11_tensor_block_access.cpp    | 791

2 files changed, 1303 insertions(+), 34 deletions(-)
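The patch below adds strided block copy kernels (`TensorBlockCopyOp`), block reader/writer classes built on a shared `TensorBlockIO` base, and a cwise binary block kernel (`TensorBlockCwiseBinaryIO`), plus tests. As a reading aid, here is a minimal sketch of the round trip the new `test_block_io_copy_data_from_source_to_target` test performs; it is distilled from that test, not part of the commit, and the include path is an assumption that may vary with your Eigen installation:

```cpp
// Sketch (not in this commit): copy a tensor block-by-block through the new
// reader/writer classes and end up with an identical tensor.
#include <unsupported/Eigen/CXX11/Tensor>  // assumed install path
#include <vector>

using namespace Eigen;

void RoundTrip() {
  typedef internal::TensorBlock<float, Index, 5, ColMajor> TensorBlock;
  typedef internal::TensorBlockMapper<float, Index, 5, ColMajor> BlockMapper;
  typedef internal::TensorBlockReader<float, Index, 5, ColMajor, true> Reader;
  typedef internal::TensorBlockWriter<float, Index, 5, ColMajor, true> Writer;

  DSizes<Index, 5> dims(5, 7, 11, 17, 3);
  std::vector<float> src(dims.TotalSize(), 1.0f);
  std::vector<float> dst(dims.TotalSize(), 0.0f);

  BlockMapper mapper(dims, internal::TensorBlockShapeType::kUniformAllDims,
                     /*min_target_size=*/100);
  // Scratch buffer large enough for any block produced by this mapper.
  std::vector<float> scratch(mapper.block_dims_total_size());

  for (Index i = 0; i < mapper.total_block_count(); ++i) {
    TensorBlock block = mapper.GetBlockForIndex(i, scratch.data());
    Reader::Run(&block, src.data());  // tensor -> block scratch buffer
    Writer::Run(block, dst.data());   // block scratch buffer -> tensor
  }
  // dst now equals src.
}
```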
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
index 59535cd91..8ffc9d093 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
@@ -14,6 +14,32 @@
 namespace Eigen {
 namespace internal {
 
+namespace {
+
+// Helper template to choose between ColMajor and RowMajor values.
+template <int Layout>
+struct cond;
+
+template <>
+struct cond<ColMajor> {
+  template <typename T>
+  EIGEN_STRONG_INLINE const T& operator()(const T& col,
+                                          const T& /*row*/) const {
+    return col;
+  }
+};
+
+template <>
+struct cond<RowMajor> {
+  template <typename T>
+  EIGEN_STRONG_INLINE const T& operator()(const T& /*col*/,
+                                          const T& row) const {
+    return row;
+  }
+};
+
+}  // namespace
+
 /**
  * \class TensorBlockShapeType
  * \ingroup CXX11_Tensor_Module
@@ -82,6 +108,512 @@ class TensorBlock {
   Scalar* m_data;  // Not owned.
 };
 
+template <typename Scalar, typename Index, bool Vectorizable>
+struct TensorBlockCopyOp {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+      const Index num_coeff_to_copy, const Index dst_index,
+      const Index dst_stride, Scalar* EIGEN_RESTRICT dst_data,
+      const Index src_index, const Index src_stride,
+      const Scalar* EIGEN_RESTRICT src_data) {
+    for (Index i = 0; i < num_coeff_to_copy; ++i) {
+      dst_data[dst_index + i * dst_stride] =
+          src_data[src_index + i * src_stride];
+    }
+  }
+};
+
+// NOTE: Benchmarks run on an implementation of this that broke each of the
+// loops in these conditionals into its own template specialization (to
+// avoid conditionals in the caller's loop) did not show an improvement.
+template <typename Scalar, typename Index>
+struct TensorBlockCopyOp<Scalar, Index, true> {
+  typedef typename packet_traits<Scalar>::type Packet;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+      const Index num_coeff_to_copy, const Index dst_index,
+      const Index dst_stride, Scalar* EIGEN_RESTRICT dst_data,
+      const Index src_index, const Index src_stride,
+      const Scalar* EIGEN_RESTRICT src_data) {
+    if (src_stride == 1) {
+      const Index packet_size = internal::unpacket_traits<Packet>::size;
+      const Index vectorized_size =
+          (num_coeff_to_copy / packet_size) * packet_size;
+      if (dst_stride == 1) {
+        // LINEAR
+        for (Index i = 0; i < vectorized_size; i += packet_size) {
+          Packet p = internal::ploadu<Packet>(src_data + src_index + i);
+          internal::pstoreu<Scalar, Packet>(dst_data + dst_index + i, p);
+        }
+        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
+          dst_data[dst_index + i] = src_data[src_index + i];
+        }
+      } else {
+        // SCATTER
+        for (Index i = 0; i < vectorized_size; i += packet_size) {
+          Packet p = internal::ploadu<Packet>(src_data + src_index + i);
+          internal::pscatter<Scalar, Packet>(
+              dst_data + dst_index + i * dst_stride, p, dst_stride);
+        }
+        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
+          dst_data[dst_index + i * dst_stride] = src_data[src_index + i];
+        }
+      }
+    } else if (src_stride == 0) {
+      const Index packet_size = internal::unpacket_traits<Packet>::size;
+      const Index vectorized_size =
+          (num_coeff_to_copy / packet_size) * packet_size;
+      if (dst_stride == 1) {
+        // LINEAR
+        for (Index i = 0; i < vectorized_size; i += packet_size) {
+          Packet p = internal::pload1<Packet>(src_data + src_index);
+          internal::pstoreu<Scalar, Packet>(dst_data + dst_index + i, p);
+        }
+        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
+          dst_data[dst_index + i] = src_data[src_index];
+        }
+      } else {
+        // SCATTER
+        for (Index i = 0; i < vectorized_size; i += packet_size) {
+          Packet p = internal::pload1<Packet>(src_data + src_index);
+          internal::pscatter<Scalar, Packet>(
+              dst_data + dst_index + i * dst_stride, p, dst_stride);
+        }
+        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
+          dst_data[dst_index + i * dst_stride] = src_data[src_index];
+        }
+      }
+    } else {
+      if (dst_stride == 1) {
+        // GATHER
+        const Index packet_size = internal::unpacket_traits<Packet>::size;
+        const Index vectorized_size =
+            (num_coeff_to_copy / packet_size) * packet_size;
+        for (Index i = 0; i < vectorized_size; i += packet_size) {
+          Packet p = internal::pgather<Scalar, Packet>(
+              src_data + src_index + i * src_stride, src_stride);
+          internal::pstoreu<Scalar, Packet>(dst_data + dst_index + i, p);
+        }
+        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
+          dst_data[dst_index + i] = src_data[src_index + i * src_stride];
+        }
+      } else {
+        // RANDOM
+        for (Index i = 0; i < num_coeff_to_copy; ++i) {
+          dst_data[dst_index + i * dst_stride] =
+              src_data[src_index + i * src_stride];
+        }
+      }
+    }
+  }
+};
+
+/**
+ * \class TensorBlockIO
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor block IO class.
+ *
+ * This class is responsible for copying data between a tensor and a tensor
+ * block.
+ */
+template <typename Scalar, typename Index, int NumDims, int Layout,
+          bool Vectorizable, bool BlockRead>
+class TensorBlockIO {
+ public:
+  typedef typename internal::TensorBlock<Scalar, Index, NumDims, Layout>
+      TensorBlock;
+  typedef typename internal::TensorBlockCopyOp<Scalar, Index, Vectorizable>
+      TensorBlockCopyOp;
+
+ protected:
+  struct BlockIteratorState {
+    Index input_stride;
+    Index output_stride;
+    Index input_span;
+    Index output_span;
+    Index size;
+    Index count;
+  };
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Copy(
+      const TensorBlock& block, Index first_coeff_index,
+      const array<Index, NumDims>& tensor_to_block_dim_map,
+      const array<Index, NumDims>& tensor_strides, const Scalar* src_data,
+      Scalar* dst_data) {
+    // Find the innermost tensor dimension whose size is not 1. This is the
+    // effective inner dim. If all dimensions are of size 1, then fall back to
+    // using the actual innermost dim to avoid out-of-bound access.
+    Index num_size_one_inner_dims = 0;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = cond<Layout>()(i, NumDims - i - 1);
+      if (block.block_sizes()[tensor_to_block_dim_map[dim]] != 1) {
+        num_size_one_inner_dims = i;
+        break;
+      }
+    }
+    // Calculate strides and dimensions.
+    const Index tensor_stride1_dim = cond<Layout>()(
+        num_size_one_inner_dims, NumDims - num_size_one_inner_dims - 1);
+    const Index block_dim_for_tensor_stride1_dim =
+        NumDims == 0 ? 1 : tensor_to_block_dim_map[tensor_stride1_dim];
+    size_t block_inner_dim_size =
+        NumDims == 0 ? 1
+                     : block.block_sizes()[block_dim_for_tensor_stride1_dim];
+    for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) {
+      const int dim = cond<Layout>()(i, NumDims - i - 1);
+      const Index block_stride =
+          block.block_strides()[tensor_to_block_dim_map[dim]];
+      if (block_inner_dim_size == block_stride &&
+          block_stride == tensor_strides[dim]) {
+        block_inner_dim_size *=
+            block.block_sizes()[tensor_to_block_dim_map[dim]];
+        ++num_size_one_inner_dims;
+      } else {
+        break;
+      }
+    }
+
+    Index inputIndex;
+    Index outputIndex;
+    Index input_stride;
+    Index output_stride;
+
+    // Set up strides to read/write along the tensor's stride1 dimension.
+    if (BlockRead) {
+      inputIndex = first_coeff_index;
+      outputIndex = 0;
+      input_stride = NumDims == 0 ? 1 : tensor_strides[tensor_stride1_dim];
+      output_stride =
+          NumDims == 0
+              ? 1
+              : block.block_strides()[block_dim_for_tensor_stride1_dim];
+    } else {
+      inputIndex = 0;
+      outputIndex = first_coeff_index;
+      input_stride =
+          NumDims == 0
+              ? 1
+              : block.block_strides()[block_dim_for_tensor_stride1_dim];
+      output_stride = NumDims == 0 ? 1 : tensor_strides[tensor_stride1_dim];
+    }
+
+    const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
+    array<BlockIteratorState, at_least_1_dim> block_iter_state;
+
+    // Initialize block iterator state. Squeeze away any dimension of size 1.
+    int num_squeezed_dims = 0;
+    for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
+      const int dim = cond<Layout>()(i + 1, NumDims - i - 2);
+      const Index size = block.block_sizes()[tensor_to_block_dim_map[dim]];
+      if (size == 1) {
+        continue;
+      }
+      block_iter_state[num_squeezed_dims].size = size;
+      if (BlockRead) {
+        block_iter_state[num_squeezed_dims].input_stride = tensor_strides[dim];
+        block_iter_state[num_squeezed_dims].output_stride =
+            block.block_strides()[tensor_to_block_dim_map[dim]];
+      } else {
+        block_iter_state[num_squeezed_dims].input_stride =
+            block.block_strides()[tensor_to_block_dim_map[dim]];
+        block_iter_state[num_squeezed_dims].output_stride = tensor_strides[dim];
+      }
+      block_iter_state[num_squeezed_dims].input_span =
+          block_iter_state[num_squeezed_dims].input_stride *
+          (block_iter_state[num_squeezed_dims].size - 1);
+      block_iter_state[num_squeezed_dims].output_span =
+          block_iter_state[num_squeezed_dims].output_stride *
+          (block_iter_state[num_squeezed_dims].size - 1);
+      block_iter_state[num_squeezed_dims].count = 0;
+      ++num_squeezed_dims;
+    }
+
+    // Iterate copying data from src to dst.
+    const Index block_total_size =
+        NumDims == 0 ? 1 : block.block_sizes().TotalSize();
+    for (Index i = 0; i < block_total_size; i += block_inner_dim_size) {
+      TensorBlockCopyOp::Run(block_inner_dim_size, outputIndex, output_stride,
+                             dst_data, inputIndex, input_stride, src_data);
+      // Update index.
+      for (int j = 0; j < num_squeezed_dims; ++j) {
+        if (++block_iter_state[j].count < block_iter_state[j].size) {
+          inputIndex += block_iter_state[j].input_stride;
+          outputIndex += block_iter_state[j].output_stride;
+          break;
+        }
+        block_iter_state[j].count = 0;
+        inputIndex -= block_iter_state[j].input_span;
+        outputIndex -= block_iter_state[j].output_span;
+      }
+    }
+  }
+};
+
+/**
+ * \class TensorBlockReader
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor block reader class.
+ *
+ * This class is responsible for reading a tensor block.
+ *
+ */
+template <typename Scalar, typename Index, int NumDims, int Layout,
+          bool Vectorizable>
+class TensorBlockReader
+    : public TensorBlockIO<Scalar, Index, NumDims, Layout, Vectorizable, true> {
+ public:
+  typedef typename internal::TensorBlock<Scalar, Index, NumDims, Layout>
+      TensorBlock;
+  typedef TensorBlockIO<Scalar, Index, NumDims, Layout, Vectorizable, true>
+      Base;
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+      TensorBlock* block, const Scalar* src_data) {
+    array<Index, NumDims> tensor_to_block_dim_map;
+    for (int i = 0; i < NumDims; ++i) {
+      tensor_to_block_dim_map[i] = i;
+    }
+    Base::Copy(*block, block->first_coeff_index(), tensor_to_block_dim_map,
+               block->tensor_strides(), src_data, block->data());
+  }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+      TensorBlock* block, Index first_coeff_index,
+      const array<Index, NumDims>& tensor_to_block_dim_map,
+      const array<Index, NumDims>& tensor_strides, const Scalar* src_data) {
+    Base::Copy(*block, first_coeff_index, tensor_to_block_dim_map,
+               tensor_strides, src_data, block->data());
+  }
+};
+
+/**
+ * \class TensorBlockWriter
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor block writer class.
+ *
+ * This class is responsible for writing a tensor block.
+ *
+ */
+template <typename Scalar, typename Index, int NumDims, int Layout,
+          bool Vectorizable>
+class TensorBlockWriter : public TensorBlockIO<Scalar, Index, NumDims, Layout,
+                                               Vectorizable, false> {
+ public:
+  typedef typename internal::TensorBlock<Scalar, Index, NumDims, Layout>
+      TensorBlock;
+  typedef TensorBlockIO<Scalar, Index, NumDims, Layout, Vectorizable, false>
+      Base;
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+      const TensorBlock& block, Scalar* dst_data) {
+    array<Index, NumDims> tensor_to_block_dim_map;
+    for (int i = 0; i < NumDims; ++i) {
+      tensor_to_block_dim_map[i] = i;
+    }
+    Base::Copy(block, block.first_coeff_index(), tensor_to_block_dim_map,
+               block.tensor_strides(), block.data(), dst_data);
+  }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+      const TensorBlock& block, Index first_coeff_index,
+      const array<Index, NumDims>& tensor_to_block_dim_map,
+      const array<Index, NumDims>& tensor_strides, Scalar* dst_data) {
+    Base::Copy(block, first_coeff_index, tensor_to_block_dim_map,
+               tensor_strides, block.data(), dst_data);
+  }
+};
+
+/**
+ * \class TensorBlockCwiseBinaryOp
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Carries out a cwise binary op on a number of coefficients.
+ *
+ * This class reads strided inputs from left and right operands, and writes the
+ * result of the cwise binary op to the strided output array.
+ *
+ */
+template <bool Vectorizable>
+struct TensorBlockCwiseBinaryOp {
+  template <typename Index, typename BinaryFunctor, typename OutputScalar,
+            typename LeftScalar, typename RightScalar>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+      const BinaryFunctor& functor, const Index num_coeff,
+      const Index output_index, const Index output_stride,
+      OutputScalar* output_data, const Index left_index,
+      const Index left_stride, const LeftScalar* left_data,
+      const Index right_index, const Index right_stride,
+      const RightScalar* right_data) {
+    for (Index i = 0; i < num_coeff; ++i) {
+      output_data[output_index + i * output_stride] =
+          functor(left_data[left_index + i * left_stride],
+                  right_data[right_index + i * right_stride]);
+    }
+  }
+};
+
+template <>
+struct TensorBlockCwiseBinaryOp<true> {
+  template <typename Index, typename BinaryFunctor, typename OutputScalar,
+            typename LeftScalar, typename RightScalar>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+      const BinaryFunctor& functor, const Index num_coeff,
+      const Index output_index, const Index output_stride,
+      OutputScalar* output_data, const Index left_index,
+      const Index left_stride, const LeftScalar* left_data,
+      const Index right_index, const Index right_stride,
+      const RightScalar* right_data) {
+    EIGEN_STATIC_ASSERT(functor_traits<BinaryFunctor>::PacketAccess,
+                        YOU_MADE_A_PROGRAMMING_MISTAKE);
+    typedef typename packet_traits<OutputScalar>::type OutputPacket;
+    typedef typename packet_traits<LeftScalar>::type LeftPacket;
+    typedef typename packet_traits<RightScalar>::type RightPacket;
+    const Index packet_size = unpacket_traits<OutputPacket>::size;
+    EIGEN_STATIC_ASSERT(unpacket_traits<LeftPacket>::size == packet_size,
+                        YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT(unpacket_traits<RightPacket>::size == packet_size,
+                        YOU_MADE_A_PROGRAMMING_MISTAKE);
+    const Index vectorized_size = (num_coeff / packet_size) * packet_size;
+    if (output_stride != 1 || left_stride != 1 || right_stride != 1) {
+      TensorBlockCwiseBinaryOp<false>::Run(
+          functor, num_coeff, output_index, output_stride, output_data,
+          left_index, left_stride, left_data, right_index, right_stride,
+          right_data);
+      return;
+    }
+    // Vectorization for the most common case.
+    for (Index i = 0; i < vectorized_size; i += packet_size) {
+      LeftPacket l = internal::ploadu<LeftPacket>(left_data + left_index + i);
+      RightPacket r =
+          internal::ploadu<RightPacket>(right_data + right_index + i);
+      OutputPacket p = functor.packetOp(l, r);
+      internal::pstoreu<OutputScalar, OutputPacket>(
+          output_data + output_index + i, p);
+    }
+    for (Index i = vectorized_size; i < num_coeff; ++i) {
+      output_data[output_index + i] =
+          functor(left_data[left_index + i], right_data[right_index + i]);
+    }
+  }
+};
+
+/**
+ * \class TensorBlockCwiseBinaryIO
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor block IO class for carrying out cwise binary ops.
+ *
+ * This class carries out the binary op on given blocks.
+ *
+ */
+template <typename BinaryFunctor, typename Index, typename OutputScalar,
+          int NumDims, int Layout>
+struct TensorBlockCwiseBinaryIO {
+  typedef typename internal::TensorBlock<OutputScalar, Index, NumDims,
+                                         Layout>::Dimensions Dimensions;
+  typedef internal::TensorBlockCwiseBinaryOp<
+      functor_traits<BinaryFunctor>::PacketAccess>
+      TensorBlockCwiseBinaryOp;
+
+  struct BlockIteratorState {
+    Index output_stride, output_span;
+    Index left_stride, left_span;
+    Index right_stride, right_span;
+    Index size, count;
+  };
+
+  template <typename LeftScalar, typename RightScalar>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+      const BinaryFunctor& functor, const Dimensions& block_sizes,
+      const Dimensions& block_strides, OutputScalar* output_data,
+      const array<Index, NumDims>& left_strides, const LeftScalar* left_data,
+      const array<Index, NumDims>& right_strides,
+      const RightScalar* right_data) {
+    // Find the innermost dimension whose size is not 1. This is the effective
+    // inner dim. If all dimensions are of size 1, fall back to using the
+    // actual innermost dim to avoid out-of-bound access.
+    int num_size_one_inner_dims = 0;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = cond<Layout>()(i, NumDims - i - 1);
+      if (block_sizes[dim] != 1) {
+        num_size_one_inner_dims = i;
+        break;
+      }
+    }
+    // Calculate strides and dimensions.
+    const int inner_dim =
+        NumDims == 0 ? 1
+                     : cond<Layout>()(num_size_one_inner_dims,
+                                      NumDims - num_size_one_inner_dims - 1);
+    Index inner_dim_size = NumDims == 0 ? 1 : block_sizes[inner_dim];
+    for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) {
+      const int dim = cond<Layout>()(i, NumDims - i - 1);
+      // Merge multiple inner dims into one for larger inner dim size (i.e.
+      // fewer calls to TensorBlockCwiseBinaryOp::Run()).
+      if (inner_dim_size == block_strides[dim] &&
+          block_strides[dim] == left_strides[dim] &&
+          block_strides[dim] == right_strides[dim]) {
+        inner_dim_size *= block_sizes[dim];
+        ++num_size_one_inner_dims;
+      } else {
+        break;
+      }
+    }
+
+    Index output_index = 0, left_index = 0, right_index = 0;
+    const Index output_stride = NumDims == 0 ? 1 : block_strides[inner_dim];
+    const Index left_stride = NumDims == 0 ? 1 : left_strides[inner_dim];
+    const Index right_stride = NumDims == 0 ? 1 : right_strides[inner_dim];
+
+    const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
+    array<BlockIteratorState, at_least_1_dim> block_iter_state;
+
+    // Initialize block iterator state. Squeeze away any dimension of size 1.
+    int num_squeezed_dims = 0;
+    for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
+      const int dim = cond<Layout>()(i + 1, NumDims - i - 2);
+      const Index size = block_sizes[dim];
+      if (size == 1) {
+        continue;
+      }
+      auto& state = block_iter_state[num_squeezed_dims];
+      state.output_stride = block_strides[dim];
+      state.left_stride = left_strides[dim];
+      state.right_stride = right_strides[dim];
+      state.size = size;
+      state.output_span = state.output_stride * (size - 1);
+      state.left_span = state.left_stride * (size - 1);
+      state.right_span = state.right_stride * (size - 1);
+      state.count = 0;
+      ++num_squeezed_dims;
+    }
+
+    // Compute cwise binary op.
+    const Index block_total_size = NumDims == 0 ? 1 : block_sizes.TotalSize();
+    for (Index i = 0; i < block_total_size; i += inner_dim_size) {
+      TensorBlockCwiseBinaryOp::Run(functor, inner_dim_size, output_index,
+                                    output_stride, output_data, left_index,
+                                    left_stride, left_data, right_index,
+                                    right_stride, right_data);
+      // Update index.
+      for (int j = 0; j < num_squeezed_dims; ++j) {
+        auto& state = block_iter_state[j];
+        if (++state.count < state.size) {
+          output_index += state.output_stride;
+          left_index += state.left_stride;
+          right_index += state.right_stride;
+          break;
+        }
+        state.count = 0;
+        output_index -= state.output_span;
+        left_index -= state.left_span;
+        right_index -= state.right_span;
+      }
+    }
+  }
+};
+
 /**
  * \class TensorBlockMapper
  * \ingroup CXX11_Tensor_Module
@@ -90,7 +622,7 @@ class TensorBlock {
  *
  * This class is responsible for iterating over the blocks of a tensor.
  */
-template <typename Scalar, typename Index, std::size_t NumDims, int Layout>
+template <typename Scalar, typename Index, int NumDims, int Layout>
 class TensorBlockMapper {
  public:
   typedef typename internal::TensorBlock<Scalar, Index, NumDims, Layout>
@@ -190,10 +722,6 @@ class TensorBlockMapper {
   }
 
  private:
-  static int InnerDimIndex(Index i) {
-    return Layout == static_cast<int>(ColMajor) ? i : NumDims - i - 1;
-  }
-
   static Dimensions BlockDimensions(const Dimensions& tensor_dims,
                                     const TensorBlockShapeType block_shape,
                                     size_t min_target_size) {
@@ -228,7 +756,7 @@ class TensorBlockMapper {
       // Add any un-allocated coefficients to inner dimension(s).
       Index total_size = block_dim_sizes.TotalSize();
       for (int i = 0; i < NumDims; ++i) {
-        const int dim = InnerDimIndex(i);
+        const int dim = cond<Layout>()(i, NumDims - i - 1);
         if (block_dim_sizes[dim] < tensor_dims[dim]) {
           const Index total_size_other_dims =
               total_size / block_dim_sizes[dim];
@@ -245,7 +773,7 @@ class TensorBlockMapper {
     } else if (block_shape == TensorBlockShapeType::kSkewedInnerDims) {
       Index coeff_to_allocate = min_target_size;
       for (int i = 0; i < NumDims; ++i) {
-        const int dim = InnerDimIndex(i);
+        const int dim = cond<Layout>()(i, NumDims - i - 1);
         block_dim_sizes[dim] =
             numext::mini(coeff_to_allocate, tensor_dims[dim]);
         coeff_to_allocate =
@@ -284,7 +812,7 @@ class TensorBlockMapper {
  * processed together.
  *
  */
-template <typename Scalar, typename Index, std::size_t NumDims, int Layout>
+template <typename Scalar, typename Index, int NumDims, int Layout>
 class TensorSliceBlockMapper {
  public:
   typedef typename internal::TensorBlock<Scalar, Index, NumDims, Layout>
@@ -360,7 +888,7 @@ class TensorSliceBlockMapper {
         prev_dim = curr_dim;
       }
     } else {
-      for (int i = 0; i < static_cast<int>(NumDims) - 1; ++i) {
+      for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx = block_index / m_block_strides[i];
         coords[i] = m_tensor_slice_offsets[i] + idx * m_block_dim_sizes[i];
         sizes[i] = numext::mini(
diff --git a/unsupported/test/cxx11_tensor_block_access.cpp b/unsupported/test/cxx11_tensor_block_access.cpp
index 66e61aef1..15f2392a3 100644
--- a/unsupported/test/cxx11_tensor_block_access.cpp
+++ b/unsupported/test/cxx11_tensor_block_access.cpp
@@ -19,11 +19,33 @@ using Eigen::Index;
 using Eigen::RowMajor;
 using Eigen::ColMajor;
 
+using internal::TensorBlockShapeType;
+
 template<typename T>
 static const T& choose(int layout, const T& col, const T& row) {
   return layout == ColMajor ? col : row;
 }
 
+static const TensorBlockShapeType RandomShape() {
+  return internal::random<bool>()
+             ? internal::TensorBlockShapeType::kUniformAllDims
+             : internal::TensorBlockShapeType::kSkewedInnerDims;
+}
+
+template <int NumDims>
+static std::size_t RandomTargetSize(const DSizes<Index, NumDims>& dims) {
+  return internal::random<int>(1, dims.TotalSize());
+}
+
+template <typename T>
+static T* GenerateRandomData(const Index& size) {
+  T* data = new T[size];
+  for (int i = 0; i < size; ++i) {
+    data[i] = internal::random<T>();
+  }
+  return data;
+}
+
 template <int Layout>
 static void test_block_mapper_sanity()
 {
@@ -75,9 +97,7 @@
 template <typename T, int Layout, int NumDims>
 static void UpdateCoeffSet(
     const internal::TensorBlock<T, Index, 4, Layout>& block,
-    Index first_coeff_index,
-    int dim_index,
-    std::set<Index>* visited_coeffs) {
+    Index first_coeff_index, int dim_index, std::set<Index>* visited_coeffs) {
   const DSizes<Index, NumDims> block_sizes = block.block_sizes();
   const DSizes<Index, NumDims> tensor_strides = block.tensor_strides();
 
@@ -103,18 +123,11 @@ static void test_block_mapper_maps_every_element()
 
   DSizes<Index, 4> dims(5, 7, 11, 17);
 
-  auto total_coeffs = static_cast<int>(dims.TotalSize());
-
   // Keep track of elements indices available via block access.
   std::set<Index> coeff_set;
 
   // Try different combinations of block types and sizes.
-  auto block_shape_type =
-      internal::random<bool>()
-          ? internal::TensorBlockShapeType::kUniformAllDims
-          : internal::TensorBlockShapeType::kSkewedInnerDims;
-  auto block_target_size = internal::random<int>(1, total_coeffs);
-  TensorBlockMapper block_mapper(dims, block_shape_type, block_target_size);
+  TensorBlockMapper block_mapper(dims, RandomShape(), RandomTargetSize(dims));
 
   for (int i = 0; i < block_mapper.total_block_count(); ++i) {
     TensorBlock block = block_mapper.GetBlockForIndex(i, nullptr);
@@ -124,6 +137,7 @@ static void test_block_mapper_maps_every_element()
 
   // Verify that every coefficient in the original Tensor is accessible through
   // TensorBlock only once.
+  auto total_coeffs = static_cast<int>(dims.TotalSize());
   VERIFY_IS_EQUAL(coeff_set.size(), total_coeffs);
   VERIFY_IS_EQUAL(*coeff_set.begin(), static_cast<Index>(0));
   VERIFY_IS_EQUAL(*coeff_set.rbegin(), static_cast<Index>(total_coeffs - 1));
@@ -146,13 +160,6 @@ static void test_slice_block_mapper_maps_every_element()
 
   auto total_coeffs = static_cast<int>(tensor_slice_extents.TotalSize());
 
-  // Try different combinations of block types and sizes.
-  auto block_shape_type =
-      internal::random<bool>()
-          ? internal::TensorBlockShapeType::kUniformAllDims
-          : internal::TensorBlockShapeType::kSkewedInnerDims;
-  auto block_target_size = internal::random<int>(1, total_coeffs);
-
   // Pick a random dimension sizes for the tensor blocks.
   DSizes<Index, 4> block_sizes;
   for (int i = 0; i < 4; ++i) {
@@ -164,7 +171,7 @@ static void test_slice_block_mapper_maps_every_element()
                                      DimensionList<Index, 4>());
 
   for (int i = 0; i < block_mapper.total_block_count(); ++i) {
-    TensorBlock block = block_mapper.GetBlockForIndex(i, NULL);
+    TensorBlock block = block_mapper.GetBlockForIndex(i, nullptr);
     UpdateCoeffSet<T, Layout, 4>(block, block.first_coeff_index(),
                                  choose(Layout, 3, 0), &coeff_set);
   }
@@ -172,11 +179,745 @@ static void test_slice_block_mapper_maps_every_element()
   VERIFY_IS_EQUAL(coeff_set.size(), total_coeffs);
 }
 
+template <int Layout>
+static void test_block_io_copy_data_from_source_to_target()
+{
+  using T = float;
+
+  typedef internal::TensorBlock<T, Index, 5, Layout> TensorBlock;
+  typedef internal::TensorBlockMapper<T, Index, 5, Layout> TensorBlockMapper;
+
+  typedef internal::TensorBlockReader<T, Index, 5, Layout, true>
+      TensorBlockReader;
+  typedef internal::TensorBlockWriter<T, Index, 5, Layout, true>
+      TensorBlockWriter;
+
+  typedef std::vector<T, aligned_allocator<T>> DataVector;
+
+  DSizes<Index, 5> input_tensor_dims(5, 7, 11, 17, 3);
+  const auto input_tensor_size = input_tensor_dims.TotalSize();
+  DataVector input_data(input_tensor_size, 0);
+  for (int i = 0; i < input_tensor_size; ++i) {
+    input_data[i] = internal::random<T>();
+  }
+
+  DataVector output_data(input_tensor_size, 0);
+
+  TensorBlockMapper block_mapper(input_tensor_dims, RandomShape(),
+                                 RandomTargetSize(input_tensor_dims));
+
+  DataVector block_data(block_mapper.block_dims_total_size(), 0);
+  for (int i = 0; i < block_mapper.total_block_count(); ++i) {
+    TensorBlock block = block_mapper.GetBlockForIndex(i, block_data.data());
+    TensorBlockReader::Run(&block, input_data.data());
+    TensorBlockWriter::Run(block, output_data.data());
+  }
+
+  for (int i = 0; i < input_tensor_size; ++i) {
+    VERIFY_IS_EQUAL(input_data[i], output_data[i]);
+  }
+}
+
+template <int Layout, int NumDims>
+static int GetInputIndex(Index output_index,
+                         const array<Index, NumDims>& output_to_input_dim_map,
+                         const array<Index, NumDims>& input_strides,
+                         const array<Index, NumDims>& output_strides) {
+  int input_index = 0;
+  if (Layout == ColMajor) {
+    for (int i = NumDims - 1; i > 0; --i) {
+      const int idx = output_index / output_strides[i];
+      input_index += idx * input_strides[output_to_input_dim_map[i]];
+      output_index -= idx * output_strides[i];
+    }
+    return input_index +
+           output_index * input_strides[output_to_input_dim_map[0]];
+  } else {
+    for (int i = 0; i < NumDims - 1; ++i) {
+      const int idx = output_index / output_strides[i];
+      input_index += idx * input_strides[output_to_input_dim_map[i]];
+      output_index -= idx * output_strides[i];
+    }
+    return input_index +
+           output_index * input_strides[output_to_input_dim_map[NumDims - 1]];
+  }
+}
+
+template <int Layout, int NumDims>
+static array<Index, NumDims> ComputeStrides(
+    const array<Index, NumDims>& sizes) {
+  array<Index, NumDims> strides;
+  if (Layout == ColMajor) {
+    strides[0] = 1;
+    for (int i = 1; i < NumDims; ++i) {
+      strides[i] = strides[i - 1] * sizes[i - 1];
+    }
+  } else {
+    strides[NumDims - 1] = 1;
+    for (int i = NumDims - 2; i >= 0; --i) {
+      strides[i] = strides[i + 1] * sizes[i + 1];
+    }
+  }
+  return strides;
+}
+
+template <int Layout>
+static void test_block_io_copy_using_reordered_dimensions() {
+  typedef internal::TensorBlock<float, Index, 5, Layout> TensorBlock;
+  typedef internal::TensorBlockMapper<float, Index, 5, Layout>
+      TensorBlockMapper;
+
+  typedef internal::TensorBlockReader<float, Index, 5, Layout, false>
+      TensorBlockReader;
+  typedef internal::TensorBlockWriter<float, Index, 5, Layout, false>
+      TensorBlockWriter;
+
+  DSizes<Index, 5> input_tensor_dims(5, 7, 11, 17, 3);
+  const auto input_tensor_size = input_tensor_dims.TotalSize();
+
+  // Create a random input tensor.
+  auto* input_data = GenerateRandomData<float>(input_tensor_size);
+
+  // Create a random dimension re-ordering/shuffle.
+  std::vector<Index> shuffle = {0, 1, 2, 3, 4};
+  std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937());
+
+  DSizes<Index, 5> output_tensor_dims;
+  array<Index, 5> input_to_output_dim_map;
+  array<Index, 5> output_to_input_dim_map;
+  for (Index i = 0; i < 5; ++i) {
+    output_tensor_dims[shuffle[i]] = input_tensor_dims[i];
+    input_to_output_dim_map[i] = shuffle[i];
+    output_to_input_dim_map[shuffle[i]] = i;
+  }
+
+  // Random block shape and size.
+  TensorBlockMapper block_mapper(output_tensor_dims, RandomShape(),
+                                 RandomTargetSize(input_tensor_dims));
+
+  auto* block_data = new float[block_mapper.block_dims_total_size()];
+  auto* output_data = new float[input_tensor_size];
+
+  array<Index, 5> input_tensor_strides =
+      ComputeStrides<Layout, 5>(input_tensor_dims);
+  array<Index, 5> output_tensor_strides =
+      ComputeStrides<Layout, 5>(output_tensor_dims);
+
+  for (Index i = 0; i < block_mapper.total_block_count(); ++i) {
+    TensorBlock block = block_mapper.GetBlockForIndex(i, block_data);
+    const Index first_coeff_index = GetInputIndex<Layout, 5>(
+        block.first_coeff_index(), output_to_input_dim_map,
+        input_tensor_strides, output_tensor_strides);
+    TensorBlockReader::Run(&block, first_coeff_index, input_to_output_dim_map,
+                           input_tensor_strides, input_data);
+    TensorBlockWriter::Run(block, first_coeff_index, input_to_output_dim_map,
+                           input_tensor_strides, output_data);
+  }
+
+  for (int i = 0; i < input_tensor_size; ++i) {
+    VERIFY_IS_EQUAL(input_data[i], output_data[i]);
+  }
+
+  delete[] input_data;
+  delete[] block_data;
+  delete[] output_data;
+}
+
+template <int Layout>
+static void test_block_io_zero_stride()
+{
+  typedef internal::TensorBlock<float, Index, 5, Layout> TensorBlock;
+  typedef internal::TensorBlockReader<float, Index, 5, Layout, true>
+      TensorBlockReader;
+  typedef internal::TensorBlockWriter<float, Index, 5, Layout, true>
+      TensorBlockWriter;
+
+  DSizes<Index, 5> input_tensor_dims(1, 2, 1, 3, 1);
+  const auto input_tensor_size = input_tensor_dims.TotalSize();
+
+  // Create a random input tensor.
+  auto* input_data = GenerateRandomData<float>(input_tensor_size);
+
+  DSizes<Index, 5> output_tensor_dims(3, 2, 3, 3, 2);
+
+  DSizes<Index, 5> input_tensor_strides(
+      ComputeStrides<Layout, 5>(input_tensor_dims));
+  DSizes<Index, 5> output_tensor_strides(
+      ComputeStrides<Layout, 5>(output_tensor_dims));
+
+  DSizes<Index, 5> input_tensor_strides_with_zeros(input_tensor_strides);
+  input_tensor_strides_with_zeros[0] = 0;
+  input_tensor_strides_with_zeros[2] = 0;
+  input_tensor_strides_with_zeros[4] = 0;
+
+  // Verify that data was correctly read/written from/into the block.
+  const auto verify_is_equal = [&](const float* output_data) {
+    for (int i = 0; i < output_tensor_dims[0]; ++i) {
+      for (int j = 0; j < output_tensor_dims[1]; ++j) {
+        for (int k = 0; k < output_tensor_dims[2]; ++k) {
+          for (int l = 0; l < output_tensor_dims[3]; ++l) {
+            for (int m = 0; m < output_tensor_dims[4]; ++m) {
+              const Index output_offset =
+                  i * output_tensor_strides[0] + j * output_tensor_strides[1] +
+                  k * output_tensor_strides[2] + l * output_tensor_strides[3] +
+                  m * output_tensor_strides[4];
+              const Index input_offset =
+                  i % input_tensor_dims[0] * input_tensor_strides[0] +
+                  j % input_tensor_dims[1] * input_tensor_strides[1] +
+                  k % input_tensor_dims[2] * input_tensor_strides[2] +
+                  l % input_tensor_dims[3] * input_tensor_strides[3] +
+                  m % input_tensor_dims[4] * input_tensor_strides[4];
+              VERIFY_IS_EQUAL(output_data[output_offset],
+                              input_data[input_offset]);
+            }
+          }
+        }
+      }
+    }
+  };
+
+  {
+    auto* output_data = new float[output_tensor_dims.TotalSize()];
+    TensorBlock read_block(0, output_tensor_dims, output_tensor_strides,
+                           input_tensor_strides_with_zeros, output_data);
+    TensorBlockReader::Run(&read_block, input_data);
+    verify_is_equal(output_data);
+    delete[] output_data;
+  }
+
+  {
+    auto* output_data = new float[output_tensor_dims.TotalSize()];
+    TensorBlock write_block(0, output_tensor_dims,
+                            input_tensor_strides_with_zeros,
+                            output_tensor_strides, input_data);
+    TensorBlockWriter::Run(write_block, output_data);
+    verify_is_equal(output_data);
+    delete[] output_data;
+  }
+
+  delete[] input_data;
+}
+
+template <int Layout>
+static void test_block_io_squeeze_ones() {
+  typedef internal::TensorBlock<float, Index, 5, Layout> TensorBlock;
+  typedef internal::TensorBlockReader<float, Index, 5, Layout, true>
+      TensorBlockReader;
+  typedef internal::TensorBlockWriter<float, Index, 5, Layout, true>
+      TensorBlockWriter;
+
+  // Total size > 1.
+  {
+    DSizes<Index, 5> block_sizes(1, 2, 1, 2, 1);
+    const auto total_size = block_sizes.TotalSize();
+
+    // Create a random input tensor.
+    auto* input_data = GenerateRandomData<float>(total_size);
+    DSizes<Index, 5> strides(ComputeStrides<Layout, 5>(block_sizes));
+
+    {
+      auto* output_data = new float[block_sizes.TotalSize()];
+      TensorBlock read_block(0, block_sizes, strides, strides, output_data);
+      TensorBlockReader::Run(&read_block, input_data);
+      for (int i = 0; i < total_size; ++i) {
+        VERIFY_IS_EQUAL(output_data[i], input_data[i]);
+      }
+      delete[] output_data;
+    }
+
+    {
+      auto* output_data = new float[block_sizes.TotalSize()];
+      TensorBlock write_block(0, block_sizes, strides, strides, input_data);
+      TensorBlockWriter::Run(write_block, output_data);
+      for (int i = 0; i < total_size; ++i) {
+        VERIFY_IS_EQUAL(output_data[i], input_data[i]);
+      }
+      delete[] output_data;
+    }
+  }
+
+  // Total size == 1.
+  {
+    DSizes<Index, 5> block_sizes(1, 1, 1, 1, 1);
+    const auto total_size = block_sizes.TotalSize();
+
+    // Create a random input tensor.
+    auto* input_data = GenerateRandomData<float>(total_size);
+    DSizes<Index, 5> strides(ComputeStrides<Layout, 5>(block_sizes));
+
+    {
+      auto* output_data = new float[block_sizes.TotalSize()];
+      TensorBlock read_block(0, block_sizes, strides, strides, output_data);
+      TensorBlockReader::Run(&read_block, input_data);
+      for (int i = 0; i < total_size; ++i) {
+        VERIFY_IS_EQUAL(output_data[i], input_data[i]);
+      }
+      delete[] output_data;
+    }
+
+    {
+      auto* output_data = new float[block_sizes.TotalSize()];
+      TensorBlock write_block(0, block_sizes, strides, strides, input_data);
+      TensorBlockWriter::Run(write_block, output_data);
+      for (int i = 0; i < total_size; ++i) {
+        VERIFY_IS_EQUAL(output_data[i], input_data[i]);
+      }
+      delete[] output_data;
+    }
+  }
+}
+
+template <int Layout>
+static void test_block_cwise_binary_io_basic() {
+  typedef internal::scalar_sum_op<float> BinaryFunctor;
+  typedef internal::TensorBlockCwiseBinaryIO<BinaryFunctor, Index, float, 5,
+                                             Layout>
+      TensorBlockCwiseBinaryIO;
+
+  DSizes<Index, 5> block_sizes(2, 3, 5, 7, 11);
+  DSizes<Index, 5> strides(ComputeStrides<Layout, 5>(block_sizes));
+
+  const auto total_size = block_sizes.TotalSize();
+
+  // Create random input tensors.
+  auto* left_data = GenerateRandomData<float>(total_size);
+  auto* right_data = GenerateRandomData<float>(total_size);
+
+  auto* output_data = new float[total_size];
+  BinaryFunctor functor;
+  TensorBlockCwiseBinaryIO::Run(functor, block_sizes, strides, output_data,
+                                strides, left_data, strides, right_data);
+  for (int i = 0; i < total_size; ++i) {
+    VERIFY_IS_EQUAL(output_data[i], functor(left_data[i], right_data[i]));
+  }
+
+  delete[] left_data;
+  delete[] right_data;
+  delete[] output_data;
+}
+
+template <int Layout>
+static void test_block_cwise_binary_io_squeeze_ones() {
+  typedef internal::scalar_sum_op<float> BinaryFunctor;
+  typedef internal::TensorBlockCwiseBinaryIO<BinaryFunctor, Index, float, 5,
+                                             Layout>
+      TensorBlockCwiseBinaryIO;
+
+  DSizes<Index, 5> block_sizes(1, 2, 1, 3, 1);
+  DSizes<Index, 5> strides(ComputeStrides<Layout, 5>(block_sizes));
+
+  const auto total_size = block_sizes.TotalSize();
+
+  // Create random input tensors.
+  auto* left_data = GenerateRandomData<float>(total_size);
+  auto* right_data = GenerateRandomData<float>(total_size);
+
+  auto* output_data = new float[total_size];
+  BinaryFunctor functor;
+  TensorBlockCwiseBinaryIO::Run(functor, block_sizes, strides, output_data,
+                                strides, left_data, strides, right_data);
+  for (int i = 0; i < total_size; ++i) {
+    VERIFY_IS_EQUAL(output_data[i], functor(left_data[i], right_data[i]));
+  }
+
+  delete[] left_data;
+  delete[] right_data;
+  delete[] output_data;
+}
+
+template <int Layout>
+static void test_block_cwise_binary_io_zero_strides() {
+  typedef internal::scalar_sum_op<float> BinaryFunctor;
+  typedef internal::TensorBlockCwiseBinaryIO<BinaryFunctor, Index, float, 5,
+                                             Layout>
+      TensorBlockCwiseBinaryIO;
+
+  DSizes<Index, 5> left_sizes(1, 3, 1, 7, 1);
+  DSizes<Index, 5> left_strides(ComputeStrides<Layout, 5>(left_sizes));
+  left_strides[0] = 0;
+  left_strides[2] = 0;
+  left_strides[4] = 0;
+
+  DSizes<Index, 5> right_sizes(2, 1, 5, 1, 11);
+  DSizes<Index, 5> right_strides(ComputeStrides<Layout, 5>(right_sizes));
+  right_strides[1] = 0;
+  right_strides[3] = 0;
+
+  // Generate random data.
+  auto* left_data = GenerateRandomData<float>(left_sizes.TotalSize());
+  auto* right_data = GenerateRandomData<float>(right_sizes.TotalSize());
+
+  DSizes<Index, 5> output_sizes(2, 3, 5, 7, 11);
+  DSizes<Index, 5> output_strides(ComputeStrides<Layout, 5>(output_sizes));
+
+  const auto output_total_size = output_sizes.TotalSize();
+  auto* output_data = new float[output_total_size];
+
+  BinaryFunctor functor;
+  TensorBlockCwiseBinaryIO::Run(functor, output_sizes, output_strides,
+                                output_data, left_strides, left_data,
+                                right_strides, right_data);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          for (int m = 0; m < 11; ++m) {
+            Index output_index = i * output_strides[0] + j * output_strides[1] +
+                                 k * output_strides[2] + l * output_strides[3] +
+                                 m * output_strides[4];
+            Index left_index = i * left_strides[0] + j * left_strides[1] +
+                               k * left_strides[2] + l * left_strides[3] +
+                               m * left_strides[4];
+            Index right_index = i * right_strides[0] + j * right_strides[1] +
+                                k * right_strides[2] + l * right_strides[3] +
+                                m * right_strides[4];
+            VERIFY_IS_EQUAL(
+                output_data[output_index],
+                functor(left_data[left_index], right_data[right_index]));
+          }
+        }
+      }
+    }
+  }
+
+  delete[] left_data;
+  delete[] right_data;
+  delete[] output_data;
+}
+
+template <int Layout>
+static void test_uniform_block_shape()
+{
+  using T = int;
+  typedef internal::TensorBlock<T, Index, 5, Layout> TensorBlock;
+  typedef internal::TensorBlockMapper<T, Index, 5, Layout> TensorBlockMapper;
+
+  {
+    // Test shape 'UniformAllDims' with uniform 'max_coeff_count'.
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const size_t max_coeff_count = 5 * 5 * 5 * 5 * 5;
+    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
+                                   max_coeff_count);
+    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+    for (int i = 0; i < 5; ++i) {
+      VERIFY_IS_EQUAL(5, block.block_sizes()[i]);
+    }
+    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+  }
+
+  // Test shape 'UniformAllDims' with larger 'max_coeff_count' which spills
+  // partially into first inner-most dimension.
+  if (Layout == ColMajor) {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const size_t max_coeff_count = 7 * 5 * 5 * 5 * 5;
+    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
+                                   max_coeff_count);
+    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+    VERIFY_IS_EQUAL(7, block.block_sizes()[0]);
+    for (int i = 1; i < 5; ++i) {
+      VERIFY_IS_EQUAL(5, block.block_sizes()[i]);
+    }
+    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+  } else {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const size_t max_coeff_count = 5 * 5 * 5 * 5 * 6;
+    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
+                                   max_coeff_count);
+    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+    VERIFY_IS_EQUAL(6, block.block_sizes()[4]);
+    for (int i = 3; i >= 0; --i) {
+      VERIFY_IS_EQUAL(5, block.block_sizes()[i]);
+    }
+    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+  }
+
+  // Test shape 'UniformAllDims' with larger 'max_coeff_count' which spills
+  // fully into first inner-most dimension.
+  if (Layout == ColMajor) {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const size_t max_coeff_count = 11 * 5 * 5 * 5 * 5;
+    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
+                                   max_coeff_count);
+    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+    VERIFY_IS_EQUAL(11, block.block_sizes()[0]);
+    for (int i = 1; i < 5; ++i) {
+      VERIFY_IS_EQUAL(5, block.block_sizes()[i]);
+    }
+    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+  } else {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const size_t max_coeff_count = 5 * 5 * 5 * 5 * 7;
+    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
+                                   max_coeff_count);
+    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+    VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
+    for (int i = 3; i >= 0; --i) {
+      VERIFY_IS_EQUAL(5, block.block_sizes()[i]);
+    }
+    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+  }
+
+  // Test shape 'UniformAllDims' with larger 'max_coeff_count' which spills
+  // fully into first few inner-most dimensions.
+  if (Layout == ColMajor) {
+    DSizes<Index, 5> dims(7, 5, 6, 17, 7);
+    const size_t max_coeff_count = 7 * 5 * 6 * 7 * 5;
+    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
+                                   max_coeff_count);
+    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+    VERIFY_IS_EQUAL(7, block.block_sizes()[0]);
+    VERIFY_IS_EQUAL(5, block.block_sizes()[1]);
+    VERIFY_IS_EQUAL(6, block.block_sizes()[2]);
+    VERIFY_IS_EQUAL(7, block.block_sizes()[3]);
+    VERIFY_IS_EQUAL(5, block.block_sizes()[4]);
+    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+  } else {
+    DSizes<Index, 5> dims(7, 5, 6, 9, 7);
+    const size_t max_coeff_count = 5 * 5 * 5 * 6 * 7;
+    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
+                                   max_coeff_count);
+    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+    VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
+    VERIFY_IS_EQUAL(6, block.block_sizes()[3]);
+    VERIFY_IS_EQUAL(5, block.block_sizes()[2]);
+    VERIFY_IS_EQUAL(5, block.block_sizes()[1]);
+    VERIFY_IS_EQUAL(5, block.block_sizes()[0]);
+    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+  }
+
+  // Test shape 'UniformAllDims' with full allocation to all dims.
+  if (Layout == ColMajor) {
+    DSizes<Index, 5> dims(7, 5, 6, 17, 7);
+    const size_t max_coeff_count = 7 * 5 * 6 * 17 * 7;
+    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
+                                   max_coeff_count);
+    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+    VERIFY_IS_EQUAL(7, block.block_sizes()[0]);
+    VERIFY_IS_EQUAL(5, block.block_sizes()[1]);
+    VERIFY_IS_EQUAL(6, block.block_sizes()[2]);
+    VERIFY_IS_EQUAL(17, block.block_sizes()[3]);
+    VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
+    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+  } else {
+    DSizes<Index, 5> dims(7, 5, 6, 9, 7);
+    const size_t max_coeff_count = 7 * 5 * 6 * 9 * 7;
+    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
+                                   max_coeff_count);
+    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+    VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
+    VERIFY_IS_EQUAL(9, block.block_sizes()[3]);
+    VERIFY_IS_EQUAL(6, block.block_sizes()[2]);
+    VERIFY_IS_EQUAL(5, block.block_sizes()[1]);
+    VERIFY_IS_EQUAL(7, block.block_sizes()[0]);
+    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+  }
+}
+
+template <int Layout>
+static void test_skewed_inner_dim_block_shape()
+{
+  using T = int;
+  typedef internal::TensorBlock<T, Index, 5, Layout> TensorBlock;
+  typedef internal::TensorBlockMapper<T, Index, 5, Layout> TensorBlockMapper;
+
+  // Test shape 'SkewedInnerDims' with partial allocation to inner-most dim.
+  if (Layout == ColMajor) {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const size_t max_coeff_count = 10 * 1 * 1 * 1 * 1;
+    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
+                                   max_coeff_count);
+    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+    VERIFY_IS_EQUAL(10, block.block_sizes()[0]);
+    for (int i = 1; i < 5; ++i) {
+      VERIFY_IS_EQUAL(1, block.block_sizes()[i]);
+    }
+    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+  } else {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const size_t max_coeff_count = 1 * 1 * 1 * 1 * 6;
+    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
+                                   max_coeff_count);
+    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+    VERIFY_IS_EQUAL(6, block.block_sizes()[4]);
+    for (int i = 3; i >= 0; --i) {
+      VERIFY_IS_EQUAL(1, block.block_sizes()[i]);
+    }
+    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+  }
+
+  // Test shape 'SkewedInnerDims' with full allocation to inner-most dim.
+  if (Layout == ColMajor) {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const size_t max_coeff_count = 11 * 1 * 1 * 1 * 1;
+    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
+                                   max_coeff_count);
+    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+    VERIFY_IS_EQUAL(11, block.block_sizes()[0]);
+    for (int i = 1; i < 5; ++i) {
+      VERIFY_IS_EQUAL(1, block.block_sizes()[i]);
+    }
+    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+  } else {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const size_t max_coeff_count = 1 * 1 * 1 * 1 * 7;
+    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
+                                   max_coeff_count);
+    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+    VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
+    for (int i = 3; i >= 0; --i) {
+      VERIFY_IS_EQUAL(1, block.block_sizes()[i]);
+    }
+    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+  }
+
+  // Test shape 'SkewedInnerDims' with full allocation to inner-most dim,
+  // and partial allocation to second inner-dim.
+  if (Layout == ColMajor) {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const size_t max_coeff_count = 11 * 3 * 1 * 1 * 1;
+    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
+                                   max_coeff_count);
+    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+    VERIFY_IS_EQUAL(11, block.block_sizes()[0]);
+    VERIFY_IS_EQUAL(3, block.block_sizes()[1]);
+    for (int i = 2; i < 5; ++i) {
+      VERIFY_IS_EQUAL(1, block.block_sizes()[i]);
+    }
+    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+  } else {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const size_t max_coeff_count = 1 * 1 * 1 * 15 * 7;
+    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
+                                   max_coeff_count);
+    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+    VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
+    VERIFY_IS_EQUAL(15, block.block_sizes()[3]);
+    for (int i = 2; i >= 0; --i) {
+      VERIFY_IS_EQUAL(1, block.block_sizes()[i]);
+    }
+    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+  }
+
+  // Test shape 'SkewedInnerDims' with full allocation to inner-most dim,
+  // and partial allocation to third inner-dim.
+  if (Layout == ColMajor) {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const size_t max_coeff_count = 11 * 5 * 5 * 1 * 1;
+    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
+                                   max_coeff_count);
+    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+    VERIFY_IS_EQUAL(11, block.block_sizes()[0]);
+    VERIFY_IS_EQUAL(5, block.block_sizes()[1]);
+    VERIFY_IS_EQUAL(5, block.block_sizes()[2]);
+    for (int i = 3; i < 5; ++i) {
+      VERIFY_IS_EQUAL(1, block.block_sizes()[i]);
+    }
+    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+  } else {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const size_t max_coeff_count = 1 * 1 * 5 * 17 * 7;
+    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
+                                   max_coeff_count);
+    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+    VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
+    VERIFY_IS_EQUAL(17, block.block_sizes()[3]);
+    VERIFY_IS_EQUAL(5, block.block_sizes()[2]);
+    for (int i = 1; i >= 0; --i) {
+      VERIFY_IS_EQUAL(1, block.block_sizes()[i]);
+    }
+    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+  }
+
+  // Test shape 'SkewedInnerDims' with full allocation to all dims.
+  if (Layout == ColMajor) {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const size_t max_coeff_count = 11 * 5 * 6 * 17 * 7;
+    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
+                                   max_coeff_count);
+    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+    VERIFY_IS_EQUAL(11, block.block_sizes()[0]);
+    VERIFY_IS_EQUAL(5, block.block_sizes()[1]);
+    VERIFY_IS_EQUAL(6, block.block_sizes()[2]);
+    VERIFY_IS_EQUAL(17, block.block_sizes()[3]);
+    VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
+    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+  } else {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const size_t max_coeff_count = 11 * 5 * 6 * 17 * 7;
+    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
+                                   max_coeff_count);
+    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+    VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
+    VERIFY_IS_EQUAL(17, block.block_sizes()[3]);
+    VERIFY_IS_EQUAL(6, block.block_sizes()[2]);
+    VERIFY_IS_EQUAL(5, block.block_sizes()[1]);
+    VERIFY_IS_EQUAL(11, block.block_sizes()[0]);
+    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+  }
+}
+
+template <int Layout>
+static void test_empty_dims(const internal::TensorBlockShapeType block_shape)
+{
+  using T = int;
+
+  // Test blocking of tensors with zero dimensions:
+  //  - we must not crash on asserts and divisions by zero
+  //  - we must not return block with zero dimensions
+  //    (recipe for overflows/underflows, divisions by zero and NaNs later)
+  //  - total block count must be zero
+  {
+    typedef internal::TensorBlockMapper<T, Index, 1, Layout> TensorBlockMapper;
+    DSizes<Index, 1> dims(0);
+    for (int max_coeff_count = 0; max_coeff_count < 2; ++max_coeff_count) {
+      TensorBlockMapper block_mapper(dims, block_shape, max_coeff_count);
+      VERIFY_IS_EQUAL(block_mapper.total_block_count(), 0);
+      VERIFY(block_mapper.block_dims_total_size() >= 1);
+    }
+  }
+
+  {
+    typedef internal::TensorBlockMapper<T, Index, 2, Layout> TensorBlockMapper;
+    for (int dim1 = 0; dim1 < 3; ++dim1) {
+      for (int dim2 = 0; dim2 < 3; ++dim2) {
+        DSizes<Index, 2> dims(dim1, dim2);
+        for (int max_coeff_count = 0; max_coeff_count < 2; ++max_coeff_count) {
+          TensorBlockMapper block_mapper(dims, block_shape, max_coeff_count);
+          if (dim1 * dim2 == 0) {
+            VERIFY_IS_EQUAL(block_mapper.total_block_count(), 0);
+          }
+          VERIFY(block_mapper.block_dims_total_size() >= 1);
+        }
+      }
+    }
+  }
+}
+
+#define CALL_SUBTEST_LAYOUTS(NAME) \
+  CALL_SUBTEST(NAME<ColMajor>()); \
+  CALL_SUBTEST(NAME<RowMajor>())
+
+#define CALL_SUBTEST_LAYOUTS_WITH_ARG(NAME, ARG) \
+  CALL_SUBTEST(NAME<ColMajor>(ARG)); \
+  CALL_SUBTEST(NAME<RowMajor>(ARG))
+
 EIGEN_DECLARE_TEST(cxx11_tensor_assign) {
-  CALL_SUBTEST(test_block_mapper_sanity<ColMajor>());
-  CALL_SUBTEST(test_block_mapper_sanity<RowMajor>());
-  CALL_SUBTEST(test_block_mapper_maps_every_element<ColMajor>());
-  CALL_SUBTEST(test_block_mapper_maps_every_element<RowMajor>());
-  CALL_SUBTEST(test_slice_block_mapper_maps_every_element<ColMajor>());
-  CALL_SUBTEST(test_slice_block_mapper_maps_every_element<RowMajor>());
+  CALL_SUBTEST_LAYOUTS(test_block_mapper_sanity);
+  CALL_SUBTEST_LAYOUTS(test_block_mapper_maps_every_element);
+  CALL_SUBTEST_LAYOUTS(test_slice_block_mapper_maps_every_element);
+  CALL_SUBTEST_LAYOUTS(test_block_io_copy_data_from_source_to_target);
+  CALL_SUBTEST_LAYOUTS(test_block_io_copy_using_reordered_dimensions);
+  CALL_SUBTEST_LAYOUTS(test_block_io_zero_stride);
+  CALL_SUBTEST_LAYOUTS(test_block_io_squeeze_ones);
+  CALL_SUBTEST_LAYOUTS(test_block_cwise_binary_io_basic);
+  CALL_SUBTEST_LAYOUTS(test_block_cwise_binary_io_squeeze_ones);
+  CALL_SUBTEST_LAYOUTS(test_block_cwise_binary_io_zero_strides);
+  CALL_SUBTEST_LAYOUTS(test_uniform_block_shape);
+  CALL_SUBTEST_LAYOUTS(test_skewed_inner_dim_block_shape);
+
+  CALL_SUBTEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kUniformAllDims);
+  CALL_SUBTEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kSkewedInnerDims);
 }
+
+#undef CALL_SUBTEST_LAYOUTS
+#undef CALL_SUBTEST_LAYOUTS_WITH_ARG
\ No newline at end of file
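
For reference, here is a minimal sketch of driving the new `TensorBlockCwiseBinaryIO` kernel directly, distilled from `test_block_cwise_binary_io_basic` above. It is not part of the commit; the include path is an assumption, and dense column-major strides are computed inline instead of via the test's `ComputeStrides` helper. The zero-stride test above shows the same call with broadcasting inputs (stride 0 repeats a coefficient along a dimension).

```cpp
// Sketch (not in this commit): elementwise sum of two dense 5-D buffers
// through the block-level cwise binary kernel.
#include <unsupported/Eigen/CXX11/Tensor>  // assumed install path
#include <vector>

using namespace Eigen;

void BlockwiseSum() {
  typedef internal::scalar_sum_op<float> Sum;
  typedef internal::TensorBlockCwiseBinaryIO<Sum, Index, float, 5, ColMajor>
      BinaryIO;

  DSizes<Index, 5> sizes(2, 3, 5, 7, 11);

  // Dense column-major strides: strides[0] = 1, strides[i] = strides[i-1] * sizes[i-1].
  DSizes<Index, 5> strides;
  strides[0] = 1;
  for (int i = 1; i < 5; ++i) strides[i] = strides[i - 1] * sizes[i - 1];

  std::vector<float> lhs(sizes.TotalSize(), 1.0f);
  std::vector<float> rhs(sizes.TotalSize(), 2.0f);
  std::vector<float> out(sizes.TotalSize(), 0.0f);

  // All three operands share the same dense strides, so the kernel takes its
  // vectorized fast path (stride-1 inner dimension).
  BinaryIO::Run(Sum(), sizes, strides, out.data(),
                strides, lhs.data(), strides, rhs.data());
  // out[i] == 3.0f for all i.
}
```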