From eab7e52db217240d0320e2618eafa37f45158b83 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 7 Aug 2019 12:57:42 -0700 Subject: [Eigen] Vectorize evaluation of coefficient-wise functions over tensor blocks if the strides are known to be 1. Provides up to 20-25% speedup of the TF cross entropy op with AVX. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A few benchmark numbers: name old time/op new time/op delta BM_Xent_16_10000_cpu 448µs ± 3% 389µs ± 2% -13.21% (p=0.008 n=5+5) BM_Xent_32_10000_cpu 575µs ± 6% 454µs ± 3% -21.00% (p=0.008 n=5+5) BM_Xent_64_10000_cpu 933µs ± 4% 712µs ± 1% -23.71% (p=0.008 n=5+5) --- unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h | 85 ++++++++++++++++++++++-- 1 file changed, 78 insertions(+), 7 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h index f8942d527..49fb21dc8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -483,6 +483,7 @@ class TensorBlockWriter : public TensorBlockIO struct TensorBlockCwiseUnaryOp { template @@ -507,6 +508,31 @@ struct TensorBlockCwiseUnaryOp { } }; +template<> +struct TensorBlockCwiseUnaryOp { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const UnaryFunctor& functor, const StorageIndex num_coeff, + const StorageIndex output_index, const StorageIndex output_stride, + OutputScalar* output_data, const StorageIndex input_index, + const StorageIndex input_stride, const InputScalar* input_data) { + if (input_stride == 1 && output_stride == 1) { + typedef const Array Input; + typedef Array Output; + + const Map input(&input_data[input_index], num_coeff); + Map output(&output_data[output_index], num_coeff); + + output = CwiseUnaryOp >(input, functor); + } else { + TensorBlockCwiseUnaryOp::Run( + functor, num_coeff, output_index, output_stride, output_data, + input_index, input_stride, input_data); + } + } +}; + /** * \class TensorBlockCwiseUnaryIO * \ingroup CXX11_Tensor_Module @@ -521,6 +547,11 @@ struct TensorBlockCwiseUnaryIO { typedef typename TensorBlock::Dimensions Dimensions; + typedef TensorBlockCwiseUnaryOp< + packet_traits::Vectorizable && + functor_traits::PacketAccess> + TensorBlockCwiseUnaryOpImpl; + struct BlockIteratorState { StorageIndex output_stride, output_span; StorageIndex input_stride, input_span; @@ -595,9 +626,9 @@ struct TensorBlockCwiseUnaryIO { const StorageIndex block_total_size = NumDims == 0 ? 1 : block_sizes.TotalSize(); for (StorageIndex i = 0; i < block_total_size; i += inner_dim_size) { - TensorBlockCwiseUnaryOp::Run(functor, inner_dim_size, output_index, - output_stride, output_data, input_index, - input_stride, input_data); + TensorBlockCwiseUnaryOpImpl::Run(functor, inner_dim_size, output_index, + output_stride, output_data, input_index, + input_stride, input_data); // Update index. for (int j = 0; j < num_squeezed_dims; ++j) { BlockIteratorState& state = block_iter_state[j]; @@ -624,6 +655,7 @@ struct TensorBlockCwiseUnaryIO { * result of the cwise binary op to the strided output array. * */ +template struct TensorBlockCwiseBinaryOp { template @@ -654,6 +686,40 @@ struct TensorBlockCwiseBinaryOp { } }; +template<> +struct TensorBlockCwiseBinaryOp { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const BinaryFunctor& functor, const StorageIndex num_coeff, + const StorageIndex output_index, const StorageIndex output_stride, + OutputScalar* output_data, const StorageIndex left_index, + const StorageIndex left_stride, const LeftScalar* left_data, + const StorageIndex right_index, const StorageIndex right_stride, + const RightScalar* right_data) { + if (left_stride == 1 && right_stride == 1 && output_stride == 1) { + typedef const Array Lhs; + typedef const Array Rhs; + typedef Array Out; + + const LeftScalar* lhs_base = &left_data[left_index]; + const RightScalar* rhs_base = &right_data[right_index]; + OutputScalar* out_base = &output_data[output_index]; + + const Map lhs(lhs_base, num_coeff); + const Map rhs(rhs_base, num_coeff); + Map out(out_base, num_coeff); + + out = CwiseBinaryOp, Map >(lhs, rhs, functor); + } else { + TensorBlockCwiseBinaryOp::Run( + functor, num_coeff, output_index, output_stride, output_data, + left_index, left_stride, left_data, right_index, right_stride, + right_data); + } + } +}; + /** * \class TensorBlockCwiseBinaryIO * \ingroup CXX11_Tensor_Module @@ -668,6 +734,11 @@ template ::Dimensions Dimensions; + typedef TensorBlockCwiseBinaryOp< + packet_traits::Vectorizable && + functor_traits::PacketAccess> + TensorBlockCwiseBinaryOpImpl; + struct BlockIteratorState { StorageIndex output_stride, output_span; StorageIndex left_stride, left_span; @@ -748,10 +819,10 @@ struct TensorBlockCwiseBinaryIO { const StorageIndex block_total_size = NumDims == 0 ? 1 : block_sizes.TotalSize(); for (StorageIndex i = 0; i < block_total_size; i += inner_dim_size) { - TensorBlockCwiseBinaryOp::Run(functor, inner_dim_size, output_index, - output_stride, output_data, left_index, - left_stride, left_data, right_index, - right_stride, right_data); + TensorBlockCwiseBinaryOpImpl::Run(functor, inner_dim_size, output_index, + output_stride, output_data, left_index, + left_stride, left_data, right_index, + right_stride, right_data); // Update index. for (int j = 0; j < num_squeezed_dims; ++j) { BlockIteratorState& state = block_iter_state[j]; -- cgit v1.2.3