aboutsummaryrefslogtreecommitdiffhomepage
path: root/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
diff options
context:
space:
mode:
author: Rasmus Munk Larsen <rmlarsen@google.com> 2019-08-07 12:57:42 -0700
committer: Rasmus Munk Larsen <rmlarsen@google.com> 2019-08-07 12:57:42 -0700
commit: eab7e52db217240d0320e2618eafa37f45158b83 (patch)
tree: 30a915ab749df5fd599e4b4d6de867afd248c6c6 /unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
parent: 09871261653b4a373b2aed1561c38a7f5d21a21e (diff)
[Eigen] Vectorize evaluation of coefficient-wise functions over tensor blocks if the strides are known to be 1. Provides up to 20-25% speedup of the TF cross entropy op with AVX.
A few benchmark numbers:

name                  old time/op  new time/op  delta
BM_Xent_16_10000_cpu  448µs ± 3%   389µs ± 2%   -13.21% (p=0.008 n=5+5)
BM_Xent_32_10000_cpu  575µs ± 6%   454µs ± 3%   -21.00% (p=0.008 n=5+5)
BM_Xent_64_10000_cpu  933µs ± 4%   712µs ± 1%   -23.71% (p=0.008 n=5+5)
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h')
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h 85
1 files changed, 78 insertions, 7 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
index f8942d527..49fb21dc8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
@@ -483,6 +483,7 @@ class TensorBlockWriter : public TensorBlockIO<Scalar, StorageIndex, NumDims,
* result of the cwise unary op to the strided output array.
*
*/
+template <bool Vectorizable>
struct TensorBlockCwiseUnaryOp {
template <typename StorageIndex, typename UnaryFunctor,
typename OutputScalar, typename InputScalar>
@@ -507,6 +508,31 @@ struct TensorBlockCwiseUnaryOp {
}
};
+template<>  // Explicit specialization for the case where the bool template argument is true.
+struct TensorBlockCwiseUnaryOp<true> {
+ template <typename StorageIndex, typename UnaryFunctor,
+ typename OutputScalar, typename InputScalar>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(  // Applies `functor` to `num_coeff` input coefficients and writes the results to the output.
+ const UnaryFunctor& functor, const StorageIndex num_coeff,
+ const StorageIndex output_index, const StorageIndex output_stride,
+ OutputScalar* output_data, const StorageIndex input_index,
+ const StorageIndex input_stride, const InputScalar* input_data) {
+ if (input_stride == 1 && output_stride == 1) {  // Both ranges are unit-stride, so each can be viewed as a 1-D array of num_coeff elements.
+ typedef const Array<InputScalar, Dynamic, 1> Input;
+ typedef Array<OutputScalar, Dynamic, 1> Output;
+
+ const Map<Input> input(&input_data[input_index], num_coeff);  // Read-only view of the input starting at input_index.
+ Map<Output> output(&output_data[output_index], num_coeff);  // Writable view of the output starting at output_index.
+
+ output = CwiseUnaryOp<UnaryFunctor, Map<Input> >(input, functor);  // Assign the unary expression to the output view; per the commit message this is the vectorized path.
+ } else {  // At least one stride is not 1: forward the same arguments to the <false> variant.
+ TensorBlockCwiseUnaryOp<false>::Run(
+ functor, num_coeff, output_index, output_stride, output_data,
+ input_index, input_stride, input_data);
+ }
+ }
+};
+
/**
* \class TensorBlockCwiseUnaryIO
* \ingroup CXX11_Tensor_Module
@@ -521,6 +547,11 @@ struct TensorBlockCwiseUnaryIO {
typedef typename TensorBlock<OutputScalar, StorageIndex, NumDims,
Layout>::Dimensions Dimensions;
+ typedef TensorBlockCwiseUnaryOp<  // Compile-time choice of implementation: the <true> variant is used only if both traits below are true.
+ packet_traits<OutputScalar>::Vectorizable &&
+ functor_traits<UnaryFunctor>::PacketAccess>
+ TensorBlockCwiseUnaryOpImpl;
+
struct BlockIteratorState {
StorageIndex output_stride, output_span;
StorageIndex input_stride, input_span;
@@ -595,9 +626,9 @@ struct TensorBlockCwiseUnaryIO {
const StorageIndex block_total_size =
NumDims == 0 ? 1 : block_sizes.TotalSize();
for (StorageIndex i = 0; i < block_total_size; i += inner_dim_size) {
- TensorBlockCwiseUnaryOp::Run(functor, inner_dim_size, output_index,
- output_stride, output_data, input_index,
- input_stride, input_data);
+ TensorBlockCwiseUnaryOpImpl::Run(functor, inner_dim_size, output_index,
+ output_stride, output_data, input_index,
+ input_stride, input_data);
// Update index.
for (int j = 0; j < num_squeezed_dims; ++j) {
BlockIteratorState& state = block_iter_state[j];
@@ -624,6 +655,7 @@ struct TensorBlockCwiseUnaryIO {
* result of the cwise binary op to the strided output array.
*
*/
+template<bool Vectorizable>
struct TensorBlockCwiseBinaryOp {
template <typename StorageIndex, typename BinaryFunctor, typename OutputScalar,
typename LeftScalar, typename RightScalar>
@@ -654,6 +686,40 @@ struct TensorBlockCwiseBinaryOp {
}
};
+template<>  // Explicit specialization for the case where the bool template argument is true.
+struct TensorBlockCwiseBinaryOp<true> {
+ template <typename StorageIndex, typename BinaryFunctor, typename OutputScalar,
+ typename LeftScalar, typename RightScalar>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(  // Applies `functor` to `num_coeff` left/right coefficient pairs and writes the results to the output.
+ const BinaryFunctor& functor, const StorageIndex num_coeff,
+ const StorageIndex output_index, const StorageIndex output_stride,
+ OutputScalar* output_data, const StorageIndex left_index,
+ const StorageIndex left_stride, const LeftScalar* left_data,
+ const StorageIndex right_index, const StorageIndex right_stride,
+ const RightScalar* right_data) {
+ if (left_stride == 1 && right_stride == 1 && output_stride == 1) {  // All three ranges are unit-stride, so each can be viewed as a 1-D array.
+ typedef const Array<LeftScalar, Dynamic, 1> Lhs;
+ typedef const Array<RightScalar, Dynamic, 1> Rhs;
+ typedef Array<OutputScalar, Dynamic, 1> Out;
+
+ const LeftScalar* lhs_base = &left_data[left_index];  // First left operand element for this run.
+ const RightScalar* rhs_base = &right_data[right_index];  // First right operand element for this run.
+ OutputScalar* out_base = &output_data[output_index];  // First output element for this run.
+
+ const Map<Lhs> lhs(lhs_base, num_coeff);
+ const Map<Rhs> rhs(rhs_base, num_coeff);
+ Map<Out> out(out_base, num_coeff);
+
+ out = CwiseBinaryOp<BinaryFunctor, Map<Lhs>, Map<Rhs> >(lhs, rhs, functor);  // Assign the binary expression to the output view; per the commit message this is the vectorized path.
+ } else {  // At least one stride is not 1: forward the same arguments to the <false> variant.
+ TensorBlockCwiseBinaryOp<false>::Run(
+ functor, num_coeff, output_index, output_stride, output_data,
+ left_index, left_stride, left_data, right_index, right_stride,
+ right_data);
+ }
+ }
+};
+
/**
* \class TensorBlockCwiseBinaryIO
* \ingroup CXX11_Tensor_Module
@@ -668,6 +734,11 @@ template <typename BinaryFunctor, typename StorageIndex, typename OutputScalar,
struct TensorBlockCwiseBinaryIO {
typedef typename TensorBlock<OutputScalar, StorageIndex, NumDims, Layout>::Dimensions Dimensions;
+ typedef TensorBlockCwiseBinaryOp<  // Compile-time choice of implementation: the <true> variant is used only if both traits below are true.
+ packet_traits<OutputScalar>::Vectorizable &&
+ functor_traits<BinaryFunctor>::PacketAccess>
+ TensorBlockCwiseBinaryOpImpl;
+
struct BlockIteratorState {
StorageIndex output_stride, output_span;
StorageIndex left_stride, left_span;
@@ -748,10 +819,10 @@ struct TensorBlockCwiseBinaryIO {
const StorageIndex block_total_size =
NumDims == 0 ? 1 : block_sizes.TotalSize();
for (StorageIndex i = 0; i < block_total_size; i += inner_dim_size) {
- TensorBlockCwiseBinaryOp::Run(functor, inner_dim_size, output_index,
- output_stride, output_data, left_index,
- left_stride, left_data, right_index,
- right_stride, right_data);
+ TensorBlockCwiseBinaryOpImpl::Run(functor, inner_dim_size, output_index,
+ output_stride, output_data, left_index,
+ left_stride, left_data, right_index,
+ right_stride, right_data);
// Update index.
for (int j = 0; j < num_squeezed_dims; ++j) {
BlockIteratorState& state = block_iter_state[j];