From 83c0a16baf5ecac6288cd9b74536a82de8985b31 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Tue, 31 Jul 2018 15:56:31 -0700 Subject: Add block evaluation support to TensorOps --- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 437 ++++++++++++++++++++- 1 file changed, 431 insertions(+), 6 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 375fc0802..05c0990dc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -356,6 +356,70 @@ template __global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); #endif +template +class BlockReducer { + public: + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::CoeffReturnType CoeffReturnType; + typedef typename Self::PacketReturnType PacketReturnType; + explicit BlockReducer(const Op& reducer) : op_(reducer) { + accum_ = op_.initialize(); + } + void Reduce(Index index, Index num_values_to_reduce, Scalar* data) { + for (Index i = 0; i < num_values_to_reduce; ++i) { + op_.reduce(data[index + i], &accum_); + } + } + CoeffReturnType Finalize() { return op_.finalize(accum_); } + PacketReturnType FinalizePacket() { + // TODO(andydavis) This function should not be called for Scalar + // reductions: clean this up or add an assert here. + return PacketReturnType(); + } + + private: + CoeffReturnType accum_; + Op op_; +}; + +template +class BlockReducer { + public: + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::CoeffReturnType CoeffReturnType; + typedef typename Self::PacketReturnType PacketReturnType; + static const Index PacketSize = + internal::unpacket_traits::size; + + explicit BlockReducer(const Op& reducer) : op_(reducer) { + vaccum_ = op_.template initializePacket(); + accum_ = op_.initialize(); + } + void Reduce(Index index, Index num_values_to_reduce, Scalar* data) { + const Index vectorized_size = + (num_values_to_reduce / PacketSize) * PacketSize; + for (Index i = 0; i < vectorized_size; i += PacketSize) { + op_.reducePacket( + internal::ploadt(&data[index + i]), + &vaccum_); + } + for (Index i = vectorized_size; i < num_values_to_reduce; ++i) { + op_.reduce(data[index + i], &accum_); + } + } + CoeffReturnType Finalize() { return op_.finalizeBoth(accum_, vaccum_); } + PacketReturnType FinalizePacket() { return op_.finalizePacket(vaccum_); } + + private: + PacketReturnType vaccum_; + CoeffReturnType accum_; + Op op_; +}; + } // end namespace internal @@ -394,6 +458,7 @@ class TensorReductionOp : public TensorBase class MakePointer_, typename Device> struct TensorEvaluator, Device> { + typedef internal::reducer_traits ReducerTraits; typedef TensorReductionOp XprType; typedef typename XprType::Index Index; typedef ArgType ChildType; @@ -410,14 +475,19 @@ struct TensorEvaluator, static const int PacketSize = internal::unpacket_traits::size; enum { - IsAligned = false, + IsAligned = false, PacketAccess = Self::InputPacketAccess && Op::PacketAccess, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + BlockAccess = TensorEvaluator::BlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; + using ScalarNoConst = typename internal::remove_const::type; + + using 
OutputTensorBlock = internal::TensorBlock; + using InputTensorBlock = internal::TensorBlock; + static const bool ReducingInnerMostDims = internal::are_inner_most_dims::value; static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims::value; static const bool RunningFullReduction = (NumOutputDims==0); @@ -451,11 +521,13 @@ struct TensorEvaluator, m_outputStrides[0] = 1; for (int i = 1; i < NumOutputDims; ++i) { m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } } else { - m_outputStrides.back() = 1; + m_outputStrides[NumOutputDims - 1] = 1; for (int i = NumOutputDims - 2; i >= 0; --i) { m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } } } @@ -483,6 +555,7 @@ struct TensorEvaluator, ++reduceIndex; } else { m_preservedStrides[outputIndex] = input_strides[i]; + m_output_to_input_dim_map[outputIndex] = i; ++outputIndex; } } @@ -492,6 +565,16 @@ struct TensorEvaluator, if (NumOutputDims == 0) { m_preservedStrides[0] = internal::array_prod(input_dims); } + + m_numValuesToReduce = + NumOutputDims == 0 + ? internal::array_prod(input_dims) + : (static_cast(Layout) == static_cast(ColMajor)) + ? m_preservedStrides[0] + : m_preservedStrides[NumOutputDims - 1]; + + m_block_total_size_max = + numext::maxi(1, device.lastLevelCacheSize() / sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -686,6 +769,265 @@ struct TensorEvaluator, } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + resources->push_back(internal::TensorOpResourceRequirements( + internal::TensorBlockShapeType::kSkewedInnerDims, + m_block_total_size_max)); + m_impl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void block( + OutputTensorBlock* output_block) const { + // Special case full reductions to avoid input block copy below. + if (NumInputDims == NumReducedDims) { + eigen_assert(output_block->first_coeff_index() == 0); + eigen_assert(output_block->block_sizes().TotalSize() == 1); + Op reducer(m_reducer); + output_block->data()[0] = internal::InnerMostDimReducer::reduce( + *this, 0, m_numValuesToReduce, reducer); + return; + } + + // Calculate input tensor 'slice' required to reduce output block coeffs. + DSizes input_slice_sizes(m_impl.dimensions()); + for (int i = 0; i < NumOutputDims; ++i) { + // Clip preserved input dimensions by output block size. + input_slice_sizes[m_output_to_input_dim_map[i]] = + output_block->block_sizes()[i]; + } + + // Shard input tensor slice into blocks (because it could be large if we + // need to reduce along several dimensions to calculate required output + // coefficients). + const Index max_coeff_count = + numext::mini(((m_device.firstLevelCacheSize()) / sizeof(Scalar)), + input_slice_sizes.TotalSize()); + + // Calculate max output shard size needed to keep working set of reducers + // in L1, while leaving enough space for reducer overhead and 'PacketSize' + // reductions. + DSizes target_input_block_sizes; + CalculateTargetInputBlockShape(max_coeff_count, input_slice_sizes, + &target_input_block_sizes); + // Calculate indices for first preserved dimension. + const Index first_preserved_dim_output_index = + static_cast(Layout) == static_cast(ColMajor) + ? 
0 + : NumOutputDims - 1; + const Index first_preserved_dim_input_index = + m_output_to_input_dim_map[first_preserved_dim_output_index]; + const bool inner_most_dim_preserved = + first_preserved_dim_input_index == + (static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumInputDims - 1) | + PreservingInnerMostDims; + + // Calculate output block inner/outer dimension sizes. + const Index output_block_inner_dim_size = + output_block->block_sizes()[first_preserved_dim_output_index]; + const Index output_block_outer_dim_size = + output_block->block_sizes().TotalSize() / output_block_inner_dim_size; + // Calculate shard size for first preserved dimension. + const Index output_shard_size = + target_input_block_sizes[first_preserved_dim_input_index]; + const Index num_output_shards = + (output_block_inner_dim_size + output_shard_size - 1) / + output_shard_size; + + // Initialize 'tensor_slice_offsets' from input coords of output index. + DSizes tensor_slice_offsets; + GetInputCoordsForOutputIndex(output_block->first_coeff_index(), + &tensor_slice_offsets); + + // Store tensor slice offset in first preserved dimension to be used + // to update tensor slice extents in loop below. + const Index first_preserved_dim_offset_start = + tensor_slice_offsets[first_preserved_dim_input_index]; + + array block_iter_state; + + // Initialize state used to iterate through output coefficients + // and update 'tensor_slice_offsets' in outer preserved dims. + for (int i = 0; i < NumOutputDims - 1; ++i) { + const int dim = static_cast(Layout) == static_cast(ColMajor) + ? i + 1 + : NumOutputDims - i - 2; + block_iter_state[i].input_dim = m_output_to_input_dim_map[dim]; + block_iter_state[i].output_size = output_block->block_sizes()[dim]; + block_iter_state[i].output_count = 0; + } + + // Allocate input block memory. + ScalarNoConst* input_block_data = static_cast( + m_device.allocate(max_coeff_count * sizeof(Scalar))); + // Allocate reducer memory. + const bool packet_reductions_enabled = + (Self::InputPacketAccess & Self::ReducerTraits::PacketAccess); + const Index num_reducers = + (inner_most_dim_preserved && packet_reductions_enabled) + ? (output_shard_size / PacketSize + output_shard_size % PacketSize + + PacketSize) + : output_shard_size; + typedef internal::BlockReducer BlockReducer; + BlockReducer* reducers = static_cast( + m_device.allocate(num_reducers * sizeof(BlockReducer))); + + InputDimensions input_tensor_dims(m_impl.dimensions()); + for (Index output_outer_index = 0; + output_outer_index < output_block_outer_dim_size; + ++output_outer_index) { + for (Index output_shard_index = 0; output_shard_index < num_output_shards; + ++output_shard_index) { + // Initialize 'tensor_slice_extents' for this output shard. + DSizes tensor_slice_extents(input_slice_sizes); + for (int i = 0; i < NumInputDims; ++i) { + if (i == first_preserved_dim_input_index) { + // Clip first preserved dim size to output shard size. + tensor_slice_extents[i] = numext::mini( + output_shard_size, + input_slice_sizes[i] - (tensor_slice_offsets[i] - + first_preserved_dim_offset_start)); + + } else if (!m_reduced[i]) { + // Clip outer preserved dims to size 1, so that we reduce a + // contiguous set of output coefficients. + tensor_slice_extents[i] = 1; + } + } + + // Intialize output coefficient reducers. 
+ for (int i = 0; i < num_reducers; ++i) { + new (&reducers[i]) BlockReducer(m_reducer); + } + + using TensorSliceBlockMapper = + internal::TensorSliceBlockMapper; + + // TODO(andydavis) Consider removing 'input_block_stride_order' if we + // find that scattered reads are not worth supporting in + // TensorSliceBlockMapper. + TensorSliceBlockMapper block_mapper( + input_tensor_dims, tensor_slice_offsets, tensor_slice_extents, + target_input_block_sizes, DimensionList()); + + const Index num_outputs_to_update = + tensor_slice_extents[first_preserved_dim_input_index]; + const Index preserved_dim_vector_reducer_count = + (inner_most_dim_preserved && packet_reductions_enabled) + ? num_outputs_to_update / PacketSize + : 0; + const Index preserved_dim_vector_coeff_count = + inner_most_dim_preserved + ? preserved_dim_vector_reducer_count * PacketSize + : 0; + const Index preserved_dim_reducer_limit = + (inner_most_dim_preserved && packet_reductions_enabled) + ? (preserved_dim_vector_reducer_count + + num_outputs_to_update % PacketSize) + : num_outputs_to_update; + + const Index total_block_count = block_mapper.total_block_count(); + for (Index b = 0; b < total_block_count; ++b) { + InputTensorBlock input_block = + block_mapper.GetBlockForIndex(b, input_block_data); + // Read. + m_impl.block(&input_block); + + Index num_values_to_reduce = 1; + for (Index i = 0; i < NumInputDims; ++i) { + if (m_reduced[i]) { + num_values_to_reduce *= input_block.block_sizes()[i]; + } + } + // Reduce. + if (inner_most_dim_preserved) { + const Index input_outer_dim_size = + input_block.block_sizes().TotalSize() / num_outputs_to_update; + for (Index input_outer_dim_index = 0; + input_outer_dim_index < input_outer_dim_size; + ++input_outer_dim_index) { + const Index input_outer_dim_base = + input_outer_dim_index * num_outputs_to_update; + for (Index i = 0; i < preserved_dim_vector_reducer_count; ++i) { + reducers[i].Reduce(input_outer_dim_base + i * PacketSize, + PacketSize, input_block.data()); + } + const Index scalar_reducer_base = + input_outer_dim_base + preserved_dim_vector_coeff_count; + for (Index i = preserved_dim_vector_reducer_count; + i < preserved_dim_reducer_limit; ++i) { + reducers[i].Reduce(scalar_reducer_base + i - + preserved_dim_vector_reducer_count, + 1, input_block.data()); + } + } + } else { + for (Index i = 0; i < num_outputs_to_update; ++i) { + reducers[i].Reduce(i * num_values_to_reduce, num_values_to_reduce, + input_block.data()); + } + } + } + + // Finalize all reducers for this output shard. 
+ const Index output_base_index = + output_outer_index * output_block_inner_dim_size + + output_shard_index * output_shard_size; + if (inner_most_dim_preserved) { + EIGEN_ALIGN_MAX + typename internal::remove_const::type + values[PacketSize]; + for (Index i = 0; i < preserved_dim_vector_reducer_count; ++i) { + const Index reducer_base = output_base_index + i * PacketSize; + internal::pstore( + values, reducers[i].FinalizePacket()); + for (Index j = 0; j < PacketSize; ++j) { + output_block->data()[reducer_base + j] = values[j]; + } + } + const Index scalar_reducer_base = + output_base_index + preserved_dim_vector_coeff_count; + + for (Index i = preserved_dim_vector_reducer_count; + i < preserved_dim_reducer_limit; ++i) { + output_block->data()[scalar_reducer_base + i - + preserved_dim_vector_reducer_count] = + reducers[i].Finalize(); + } + } else { + for (int i = 0; i < num_outputs_to_update; ++i) { + output_block->data()[output_base_index + i] = + reducers[i].Finalize(); + } + } + + // Update 'tensor_slice_offsets' by num outputs for this output shard. + tensor_slice_offsets[first_preserved_dim_input_index] += + num_outputs_to_update; + } + // Update slice offset for inner preserved dim. + tensor_slice_offsets[first_preserved_dim_input_index] -= + output_block_inner_dim_size; + // Update slice offsets for remaining output dims. + for (int i = 0; i < NumOutputDims - 1; ++i) { + BlockIteratorState& b = block_iter_state[i]; + if (++b.output_count < b.output_size) { + ++tensor_slice_offsets[b.input_dim]; + break; + } + b.output_count = 0; + tensor_slice_offsets[b.input_dim] -= b.output_size - 1; + } + } + + // Free memory. + m_device.deallocate(input_block_data); + m_device.deallocate(reducers); + } + EIGEN_DEVICE_FUNC typename MakePointer_::Type data() const { return m_result; } #if defined(EIGEN_USE_SYCL) @@ -722,6 +1064,12 @@ struct TensorEvaluator, template friend struct internal::InnerReducer; + struct BlockIteratorState { + Index input_dim; + Index output_size; + Index output_count; + }; + // Returns the Index in the input tensor of the first value that needs to be // used to compute the reduction at output index "index". EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { @@ -764,16 +1112,90 @@ struct TensorEvaluator, return startInput; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void GetInputCoordsForOutputIndex( + Index index, + DSizes* coords) const { + for (int i = 0; i < NumInputDims; ++i) { + (*coords)[i] = 0; + } + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumOutputDims - 1; i > 0; --i) { + const Index idx = index / m_fastOutputStrides[i]; + (*coords)[m_output_to_input_dim_map[i]] = idx; + index -= idx * m_outputStrides[i]; + } + (*coords)[m_output_to_input_dim_map[0]] = index; + } else { + for (int i = 0; i < NumOutputDims - 1; ++i) { + const Index idx = index / m_fastOutputStrides[i]; + (*coords)[m_output_to_input_dim_map[i]] = idx; + index -= idx * m_outputStrides[i]; + } + (*coords)[m_output_to_input_dim_map[NumOutputDims-1]] = index; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void CalculateTargetInputBlockShape( + const Index max_coeff_count, + const DSizes& input_slice_sizes, + DSizes* target_input_block_sizes) const { + typedef typename internal::packet_traits::type Packet; + typedef internal::BlockReducer BlockReducer; + // TODO(andydavis) Compute reducer overhead correctly for the case where + // we are preserving the inner most dimension, and a single reducer + // reduces a packet's worth of output coefficients. 
+ const Index reducer_overhead = sizeof(BlockReducer) / sizeof(Scalar); + + Index coeff_to_allocate = max_coeff_count; + bool first_preserved_dim_allocated = false; + bool first_reduced_dim_allocated = false; + for (int i = 0; i < NumInputDims; ++i) { + const int dim = static_cast(Layout) == static_cast(ColMajor) + ? i + : NumInputDims - i - 1; + (*target_input_block_sizes)[dim] = 1; + if (m_reduced[dim]) { + // TODO(andydavis) Consider allocating to multiple reduced dimensions. + // Watch out for cases where reduced dimensions are not contiguous, + // which induces scattered reads. + if (!first_reduced_dim_allocated) { + (*target_input_block_sizes)[dim] = + numext::mini(input_slice_sizes[dim], coeff_to_allocate); + coeff_to_allocate /= (*target_input_block_sizes)[dim]; + first_reduced_dim_allocated = true; + } + } else if (!first_preserved_dim_allocated) { + // TODO(andydavis) Include output block size in this L1 working set + // calculation. + const Index allocated = max_coeff_count - coeff_to_allocate; + const Index alloc_size = numext::maxi( + static_cast(1), coeff_to_allocate / reducer_overhead); + (*target_input_block_sizes)[dim] = + numext::mini(input_slice_sizes[dim], alloc_size); + coeff_to_allocate = numext::maxi( + static_cast(1), + coeff_to_allocate / + ((*target_input_block_sizes)[dim] * reducer_overhead)); + first_preserved_dim_allocated = true; + } + } + } + // Bitmap indicating if an input dimension is reduced or not. array m_reduced; // Dimensions of the output of the operation. Dimensions m_dimensions; // Precomputed strides for the output tensor. array m_outputStrides; + array, NumOutputDims> m_fastOutputStrides; // Subset of strides of the input tensor for the non-reduced dimensions. // Indexed by output dimensions. static const int NumPreservedStrides = max_n_1::size; array m_preservedStrides; + // Map from output to input dimension index. + array m_output_to_input_dim_map; + // How many values go into each reduction + Index m_numValuesToReduce; // Subset of strides of the input tensor for the reduced dimensions. // Indexed by reduced dimensions. @@ -782,6 +1204,9 @@ struct TensorEvaluator, // Indexed by reduced dimensions. array m_reducedDims; + // Block size for tiled (aka TensorBlock) evaluation. + Index m_block_total_size_max; + // Evaluator for the input expression. TensorEvaluator m_impl; -- cgit v1.2.3 From 64abdf1d7eb17174f571751346dd0cbadcf3bc52 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Wed, 1 Aug 2018 12:35:19 -0700 Subject: Fix typo + get rid of redundant member variables for block sizes --- .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 6 +++--- unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 17 ++++++++--------- unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h | 11 +++++------ unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 12 +++--------- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 10 +++------- unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h | 18 ++++++++++-------- 6 files changed, 32 insertions(+), 42 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index b6dbe5a22..cca14aafd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -120,7 +120,7 @@ struct TensorEvaluator, Device> // Block based access to the XprType (input) tensor. 
using TensorBlock = internal::TensorBlock; using TensorBlockReader = internal::TensorBlockReader; - // We do block based broadcasting using a a trick with 2x tensor rank and 0 + // We do block based broadcasting using a trick with 2x tensor rank and 0 // strides. See block method implementation for details. using BroadcastDimensions = DSizes; using BroadcastTensorBlock = internal::TensorBlock; @@ -589,8 +589,8 @@ struct TensorEvaluator, Device> std::vector* resources) const { // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large // tensors. But this might need further tuning. - Index l1_cache_scalars = m_device.firstLevelCacheSize() / sizeof(Scalar); - Index block_total_size_max = numext::maxi(Index(1), l1_cache_scalars); + auto block_total_size_max = numext::maxi( + 1, m_device.firstLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( internal::TensorBlockShapeType::kSkewedInnerDims, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 7579ab507..aca2ead12 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -202,9 +202,6 @@ struct TensorEvaluator, Device> m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; } } - - m_block_total_size_max = - numext::maxi(1, device.lastLevelCacheSize() / sizeof(Scalar)); } } @@ -290,9 +287,11 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { + auto block_total_size_max = numext::maxi( + 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( internal::TensorBlockShapeType::kSkewedInnerDims, - m_block_total_size_max)); + block_total_size_max)); m_impl.getResourceRequirements(resources); } @@ -370,13 +369,14 @@ struct TensorEvaluator, Device> { Index inputIndex; if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == 0) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims - 1)) { // m_stride is equal to 1, so let's avoid the integer division. eigen_assert(m_stride == 1); inputIndex = index * m_inputStride + m_inputOffset; - } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims-1) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { - // m_stride is aways greater than index, so let's avoid the integer division. + } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims - 1) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { + // m_stride is aways greater than index, so let's avoid the integer + // division. 
eigen_assert(m_stride > index); inputIndex = index + m_inputOffset; } else { @@ -392,7 +392,6 @@ struct TensorEvaluator, Device> Index m_stride; Index m_inputOffset; Index m_inputStride; - Index m_block_total_size_max; DSizes m_inputStrides; TensorEvaluator m_impl; const internal::DimensionId m_dim; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 39759b6c3..a8247be90 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -259,7 +259,7 @@ struct TensorEvaluator, Device> #else EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device) #endif - : m_impl(op.expression(), device) + : m_device(device), m_impl(op.expression(), device) #ifdef EIGEN_USE_SYCL , m_op(op) #endif @@ -404,9 +404,6 @@ struct TensorEvaluator, Device> } else { m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[NumDims-1]); } - - m_block_total_size_max = - numext::maxi(1, device.lastLevelCacheSize() / sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -551,9 +548,11 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { + auto block_total_size_max = numext::maxi( + 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( internal::TensorBlockShapeType::kSkewedInnerDims, - m_block_total_size_max)); + block_total_size_max)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( @@ -743,8 +742,8 @@ struct TensorEvaluator, Device> internal::TensorIntDivisor m_fastOutputDepth; Scalar m_paddingValue; - Index m_block_total_size_max; + const Device& m_device; TensorEvaluator m_impl; #ifdef EIGEN_USE_SYCL // Required for SYCL in order to construct the expression tree on the device diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 2630311b8..6ddded0bd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -560,9 +560,6 @@ struct TensorEvaluator, Devi m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } } - - m_block_total_size_max = - numext::maxi(1, device.lastLevelCacheSize() / sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -672,9 +669,11 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { + auto block_total_size_max = numext::maxi( + 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( internal::TensorBlockShapeType::kSkewedInnerDims, - m_block_total_size_max)); + block_total_size_max)); m_impl.getResourceRequirements(resources); } @@ -761,7 +760,6 @@ struct TensorEvaluator, Devi Dimensions m_dimensions; bool m_is_identity; const StartIndices m_offsets; - Index m_block_total_size_max; }; @@ -1047,9 +1045,6 @@ struct TensorEvaluator(degenerate ? 
1 : m_outputStrides[i]); } } - m_block_total_size_max = numext::maxi(static_cast(1), - device.lastLevelCacheSize() / - sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -1128,7 +1123,6 @@ struct TensorEvaluator m_dimensions; DSizes m_offsets; // offset in a flattened shape const Strides m_strides; - std::size_t m_block_total_size_max; //use by sycl const StartIndices m_exprStartIndices; //use by sycl diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index c41783106..73675e7dd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -572,9 +572,6 @@ struct TensorEvaluator, : (static_cast(Layout) == static_cast(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1]; - - m_block_total_size_max = - numext::maxi(1, device.lastLevelCacheSize() / sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -771,9 +768,11 @@ struct TensorEvaluator, EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { + auto block_total_size_max = numext::maxi( + 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( internal::TensorBlockShapeType::kSkewedInnerDims, - m_block_total_size_max)); + block_total_size_max)); m_impl.getResourceRequirements(resources); } @@ -1204,9 +1203,6 @@ struct TensorEvaluator, // Indexed by reduced dimensions. array m_reducedDims; - // Block size for tiled (aka TensorBlock) evaluation. - Index m_block_total_size_max; - // Evaluator for the input expression. 
TensorEvaluator m_impl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 77f47bf64..f94c1380d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -124,8 +124,11 @@ struct TensorEvaluator, Device> using TensorBlock = internal::TensorBlock; using TensorBlockReader = internal::TensorBlockReader; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_shuffle(op.shufflePermutation()) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, + const Device& device) + : m_device(device), + m_impl(op.expression(), device), + m_shuffle(op.shufflePermutation()) { const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); const Shuffle& shuffle = op.shufflePermutation(); @@ -162,9 +165,6 @@ struct TensorEvaluator, Device> for (int i = 0; i < NumDims; ++i) { m_inputStrides[i] = m_unshuffledInputStrides[shuffle[i]]; } - - m_block_total_size_max = - numext::maxi(1, device.firstLevelCacheSize() / sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -226,9 +226,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { + auto block_total_size_max = numext::maxi( + 1, m_device.firstLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( - internal::TensorBlockShapeType::kUniformAllDims, - m_block_total_size_max)); + internal::TensorBlockShapeType::kUniformAllDims, block_total_size_max)); m_impl.getResourceRequirements(resources); } @@ -384,7 +385,8 @@ struct TensorEvaluator, Device> array, NumDims> m_fastOutputStrides; array m_inputStrides; array m_unshuffledInputStrides; - Index m_block_total_size_max; + + const Device& m_device; TensorEvaluator m_impl; /// required by sycl Shuffle m_shuffle; -- cgit v1.2.3 From 1b0373ae10687ecc51ad9a0bfd46aa4ee116ade1 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Wed, 1 Aug 2018 15:55:46 -0700 Subject: Replace all using declarations with typedefs in Tensor ops --- unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h | 20 +++++------ .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 17 +++++---- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 16 +++++---- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 40 ++++++++++++---------- .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 7 ++-- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 22 +++++++----- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 14 ++++---- .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 16 +++++---- 8 files changed, 88 insertions(+), 64 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h index 33c4ef5b7..1db8d6124 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -152,11 +152,11 @@ struct TensorBlockCopyOp { const Scalar* src_base = &src_data[src_index]; Scalar* dst_base = &dst_data[dst_index]; - using Src = const Eigen::Array; - using Dst = Eigen::Array; + typedef const Eigen::Array Src; + typedef Eigen::Array Dst; - using SrcMap = Eigen::Map>; - using DstMap = Eigen::Map>; + typedef Eigen::Map> SrcMap; + typedef Eigen::Map> DstMap; const SrcMap 
src(src_base, num_coeff_to_copy, InnerStride<>(src_stride)); DstMap dst(dst_base, num_coeff_to_copy, InnerStride<>(dst_stride)); @@ -401,13 +401,13 @@ struct TensorBlockCwiseBinaryOp { const StorageIndex left_stride, const LeftScalar* left_data, const StorageIndex right_index, const StorageIndex right_stride, const RightScalar* right_data) { - using Lhs = const Eigen::Array; - using Rhs = const Eigen::Array; - using Out = Eigen::Array; + typedef const Eigen::Array Lhs; + typedef const Eigen::Array Rhs; + typedef Eigen::Array Out; - using LhsMap = Eigen::Map>; - using RhsMap = Eigen::Map>; - using OutMap = Eigen::Map>; + typedef Eigen::Map> LhsMap; + typedef Eigen::Map> RhsMap; + typedef Eigen::Map> OutMap; const LeftScalar* lhs_base = &left_data[left_index]; const RightScalar* rhs_base = &right_data[right_index]; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index cca14aafd..a4d750885 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -115,16 +115,21 @@ struct TensorEvaluator, Device> RawAccess = false }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; // Block based access to the XprType (input) tensor. - using TensorBlock = internal::TensorBlock; - using TensorBlockReader = internal::TensorBlockReader; + typedef internal::TensorBlock + TensorBlock; + typedef internal::TensorBlockReader + TensorBlockReader; + // We do block based broadcasting using a trick with 2x tensor rank and 0 // strides. See block method implementation for details. - using BroadcastDimensions = DSizes; - using BroadcastTensorBlock = internal::TensorBlock; - using BroadcastTensorBlockReader = internal::TensorBlockReader; + typedef DSizes BroadcastDimensions; + typedef internal::TensorBlock + BroadcastTensorBlock; + typedef internal::TensorBlockReader + BroadcastTensorBlockReader; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index aca2ead12..b4c4162ef 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -152,10 +152,12 @@ struct TensorEvaluator, Device> RawAccess = false }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using InputTensorBlock = internal::TensorBlock; - using OutputTensorBlock = internal::TensorBlock; + typedef internal::TensorBlock + InputTensorBlock; + typedef internal::TensorBlock + OutputTensorBlock; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device), m_offset(op.offset()) @@ -426,10 +428,12 @@ struct TensorEvaluator, Device> RawAccess = false }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using InputTensorBlock = internal::TensorBlock; - using OutputTensorBlock = internal::TensorBlock; + typedef internal::TensorBlock + InputTensorBlock; + typedef internal::TensorBlock + OutputTensorBlock; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) diff --git 
a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 17008917a..f11241f83 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -36,7 +36,7 @@ template class TensorExecutor { public: - using StorageIndex = typename Expression::Index; + typedef typename Expression::Index StorageIndex; EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(const Expression& expr, @@ -60,7 +60,7 @@ template class TensorExecutor { public: - using StorageIndex = typename Expression::Index; + typedef typename Expression::Index StorageIndex; EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(const Expression& expr, @@ -102,21 +102,22 @@ template class TensorExecutor { public: - using Scalar = typename traits::Scalar; - using ScalarNoConst = typename remove_const::type; + typedef typename traits::Scalar Scalar; + typedef typename remove_const::type ScalarNoConst; - using Evaluator = TensorEvaluator; - using StorageIndex = typename traits::Index; + typedef TensorEvaluator Evaluator; + typedef typename traits::Index StorageIndex; static const int NumDims = traits::NumDimensions; EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) { - using TensorBlock = - TensorBlock; - using TensorBlockMapper = TensorBlockMapper; + typedef TensorBlock + TensorBlock; + typedef TensorBlockMapper + TensorBlockMapper; Evaluator evaluator(expr, device); Index total_size = array_prod(evaluator.dimensions()); @@ -221,7 +222,7 @@ struct EvalRange { template class TensorExecutor { public: - using StorageIndex = typename Expression::Index; + typedef typename Expression::Index StorageIndex; static EIGEN_STRONG_INLINE void run(const Expression& expr, const ThreadPoolDevice& device) { @@ -249,20 +250,21 @@ class TensorExecutor { template class TensorExecutor { public: - using Scalar = typename traits::Scalar; - using ScalarNoConst = typename remove_const::type; + typedef typename traits::Scalar Scalar; + typedef typename remove_const::type ScalarNoConst; - using Evaluator = TensorEvaluator; - using StorageIndex = typename traits::Index; + typedef TensorEvaluator Evaluator; + typedef typename traits::Index StorageIndex; static const int NumDims = traits::NumDimensions; static EIGEN_STRONG_INLINE void run(const Expression& expr, const ThreadPoolDevice& device) { - using TensorBlock = - TensorBlock; - using TensorBlockMapper = - TensorBlockMapper; + typedef TensorBlock + TensorBlock; + typedef TensorBlockMapper + TensorBlockMapper; Evaluator evaluator(expr, device); StorageIndex total_size = array_prod(evaluator.dimensions()); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index a8247be90..2b2f4a650 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -252,7 +252,8 @@ struct TensorEvaluator, Device> RawAccess = false }; - using OutputTensorBlock = internal::TensorBlock; + typedef internal::TensorBlock + OutputTensorBlock; #ifdef __SYCL_DEVICE_ONLY__ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType op, const Device& device) @@ -557,8 +558,8 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( OutputTensorBlock* output_block) const { - using ImagePatchCopyOp = internal::ImagePatchCopyOp; - using ImagePatchPaddingOp = 
internal::ImagePatchPaddingOp; + typedef internal::ImagePatchCopyOp ImagePatchCopyOp; + typedef internal::ImagePatchPaddingOp ImagePatchPaddingOp; // Calculate loop limits and various input/output dim sizes. const DSizes& block_sizes = output_block->block_sizes(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 6ddded0bd..d5b0c1237 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -123,11 +123,15 @@ struct TensorEvaluator, Device> RawAccess = TensorEvaluator::RawAccess }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using InputTensorBlock = internal::TensorBlock; - using OutputTensorBlock = internal::TensorBlock; - using OutputTensorBlockReader = internal::TensorBlockReader; + typedef internal::TensorBlock + InputTensorBlock; + typedef internal::TensorBlock + OutputTensorBlock; + typedef internal::TensorBlockReader + OutputTensorBlockReader; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dimensions(op.dimensions()) @@ -512,9 +516,10 @@ struct TensorEvaluator, Devi RawAccess = false }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using TensorBlock = internal::TensorBlock; + typedef internal::TensorBlock + TensorBlock; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) @@ -787,9 +792,10 @@ struct TensorEvaluator, Device> RawAccess = (NumDims == 1) & TensorEvaluator::RawAccess }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using TensorBlock = internal::TensorBlock; + typedef internal::TensorBlock + TensorBlock; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 73675e7dd..80f179ba4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -483,10 +483,12 @@ struct TensorEvaluator, RawAccess = false }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using OutputTensorBlock = internal::TensorBlock; - using InputTensorBlock = internal::TensorBlock; + typedef internal::TensorBlock + OutputTensorBlock; + typedef internal::TensorBlock + InputTensorBlock; static const bool ReducingInnerMostDims = internal::are_inner_most_dims::value; static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims::value; @@ -901,9 +903,9 @@ struct TensorEvaluator, new (&reducers[i]) BlockReducer(m_reducer); } - using TensorSliceBlockMapper = - internal::TensorSliceBlockMapper; + typedef internal::TensorSliceBlockMapper + TensorSliceBlockMapper; // TODO(andydavis) Consider removing 'input_block_stride_order' if we // find that scattered reads are not worth supporting in diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index f94c1380d..fbe69aabc 100644 --- 
a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -119,10 +119,12 @@ struct TensorEvaluator, Device> RawAccess = false }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using TensorBlock = internal::TensorBlock; - using TensorBlockReader = internal::TensorBlockReader; + typedef internal::TensorBlock + TensorBlock; + typedef internal::TensorBlockReader + TensorBlockReader; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -417,10 +419,12 @@ struct TensorEvaluator, Device> RawAccess = false }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using TensorBlock = internal::TensorBlock; - using TensorBlockWriter = internal::TensorBlockWriter; + typedef internal::TensorBlock + TensorBlock; + typedef internal::TensorBlockWriter + TensorBlockWriter; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) -- cgit v1.2.3 From f2209d06e428e0691de71f30fc2db4cb29191cd2 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Fri, 10 Aug 2018 16:53:36 -0700 Subject: Add block evaluationto CwiseUnaryOp and add PreferBlockAccess enum to all evaluators --- unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h | 2 + .../Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 18 +-- unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h | 141 +++++++++++++++++++++ .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 11 +- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 13 +- .../Eigen/CXX11/src/Tensor/TensorConcatenation.h | 2 + .../Eigen/CXX11/src/Tensor/TensorContraction.h | 1 + .../Eigen/CXX11/src/Tensor/TensorConversion.h | 1 + .../Eigen/CXX11/src/Tensor/TensorConvolution.h | 2 + .../Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h | 1 + .../Eigen/CXX11/src/Tensor/TensorCustomOp.h | 2 + unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 1 + .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 76 ++++++++--- unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 1 + .../Eigen/CXX11/src/Tensor/TensorFixedSize.h | 1 + .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 1 + .../CXX11/src/Tensor/TensorForwardDeclarations.h | 7 +- .../Eigen/CXX11/src/Tensor/TensorGenerator.h | 1 + .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 13 +- .../Eigen/CXX11/src/Tensor/TensorInflation.h | 1 + .../Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 2 + .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 46 ++++--- unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 1 + .../Eigen/CXX11/src/Tensor/TensorReduction.h | 13 +- unsupported/Eigen/CXX11/src/Tensor/TensorRef.h | 3 + unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h | 2 + unsupported/Eigen/CXX11/src/Tensor/TensorScan.h | 1 + .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 24 ++-- .../Eigen/CXX11/src/Tensor/TensorStriding.h | 2 + unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h | 1 + .../Eigen/CXX11/src/Tensor/TensorVolumePatch.h | 1 + unsupported/test/cxx11_tensor_block_access.cpp | 111 ++++++++++++++++ unsupported/test/cxx11_tensor_executor.cpp | 31 +++++ 35 files changed, 454 insertions(+), 82 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h 
index c0f33ba2d..ea3ab329d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -87,6 +87,7 @@ struct TensorEvaluator, Device> IsAligned = /*TensorEvaluator::IsAligned*/ false, PacketAccess = /*TensorEvaluator::PacketAccess*/ false, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -220,6 +221,7 @@ struct TensorEvaluator, Devi IsAligned = /*TensorEvaluator::IsAligned*/ false, PacketAccess = /*TensorEvaluator::PacketAccess*/ false, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator >, Device>::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h index 442639868..5110e99ee 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h @@ -109,6 +109,7 @@ struct TensorEvaluator, Sy IsAligned = false, PacketAccess = false, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index f1f877c16..9ec1ec726 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -102,14 +102,16 @@ struct TensorEvaluator, Device> static const int NumDims = XprType::NumDims; enum { - IsAligned = TensorEvaluator::IsAligned & - TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & - TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess & - TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - RawAccess = TensorEvaluator::RawAccess + IsAligned = TensorEvaluator::IsAligned & + TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & + TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess & + TensorEvaluator::BlockAccess, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess | + TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + RawAccess = TensorEvaluator::RawAccess }; typedef typename internal::TensorBlock< diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h index 877603421..4a3e1ac17 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -381,6 +381,147 @@ class TensorBlockWriter : public TensorBlockIO + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const UnaryFunctor& functor, const StorageIndex num_coeff, + const StorageIndex output_index, const StorageIndex output_stride, + OutputScalar* output_data, const StorageIndex input_index, + const StorageIndex input_stride, const InputScalar* input_data) { + typedef const Eigen::Array Input; + typedef Eigen::Array Output; + + typedef Eigen::Map> InputMap; + typedef Eigen::Map> OutputMap; + + const InputScalar* input_base = &input_data[input_index]; + OutputScalar* output_base = &output_data[output_index]; + + const InputMap input(input_base, num_coeff, InnerStride<>(input_stride)); + OutputMap output(output_base, num_coeff, InnerStride<>(output_stride)); + + output = Eigen::CwiseUnaryOp(input, functor); + } +}; + +/** + * \class TensorBlockCwiseUnaryIO + * \ingroup CXX11_Tensor_Module 
+ * + * \brief Tensor block IO class for carrying out cwise unary ops. + * + * This class carries out the unary op on given blocks. + */ +template +struct TensorBlockCwiseUnaryIO { + typedef typename internal::TensorBlock::Dimensions Dimensions; + + struct BlockIteratorState { + StorageIndex output_stride, output_span; + StorageIndex input_stride, input_span; + StorageIndex size, count; + }; + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const UnaryFunctor& functor, const Dimensions& block_sizes, + const Dimensions& block_strides, OutputScalar* output_data, + const array& input_strides, + const InputScalar* input_data) { + // Find the innermost dimension whose size is not 1. This is the effective + // inner dim. If all dimensions are of size 1, fallback to using the actual + // innermost dim to avoid out-of-bound access. + int num_size_one_inner_dims = 0; + for (int i = 0; i < NumDims; ++i) { + const int dim = cond()(i, NumDims - i - 1); + if (block_sizes[dim] != 1) { + num_size_one_inner_dims = i; + break; + } + } + // Calculate strides and dimensions. + const int inner_dim = + NumDims == 0 ? 1 + : cond()(num_size_one_inner_dims, + NumDims - num_size_one_inner_dims - 1); + StorageIndex inner_dim_size = NumDims == 0 ? 1 : block_sizes[inner_dim]; + for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) { + const int dim = cond()(i, NumDims - i - 1); + // Merge multiple inner dims into one for larger inner dim size (i.e. + // fewer calls to TensorBlockCwiseUnaryOp::Run()). + if (inner_dim_size == block_strides[dim] && + block_strides[dim] == input_strides[dim]) { + inner_dim_size *= block_sizes[dim]; + ++num_size_one_inner_dims; + } else { + break; + } + } + + StorageIndex output_index = 0, input_index = 0; + + const StorageIndex output_stride = + NumDims == 0 ? 1 : block_strides[inner_dim]; + const StorageIndex input_stride = + NumDims == 0 ? 1 : input_strides[inner_dim]; + + const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1; + array block_iter_state; + + // Initialize block iterator state. Squeeze away any dimension of size 1. + int num_squeezed_dims = 0; + for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) { + const int dim = cond()(i + 1, NumDims - i - 2); + const StorageIndex size = block_sizes[dim]; + if (size == 1) { + continue; + } + BlockIteratorState& state = block_iter_state[num_squeezed_dims]; + state.output_stride = block_strides[dim]; + state.input_stride = input_strides[dim]; + state.size = size; + state.output_span = state.output_stride * (size - 1); + state.input_span = state.input_stride * (size - 1); + state.count = 0; + ++num_squeezed_dims; + } + + // Compute cwise unary op. + const StorageIndex block_total_size = + NumDims == 0 ? 1 : block_sizes.TotalSize(); + for (StorageIndex i = 0; i < block_total_size; i += inner_dim_size) { + TensorBlockCwiseUnaryOp::Run(functor, inner_dim_size, output_index, + output_stride, output_data, input_index, + input_stride, input_data); + // Update index. 
+ for (int j = 0; j < num_squeezed_dims; ++j) { + auto& state = block_iter_state[j]; + if (++state.count < state.size) { + output_index += state.output_stride; + input_index += state.input_stride; + break; + } + state.count = 0; + output_index -= state.output_span; + input_index -= state.input_span; + } + } + } +}; + /** * \class TensorBlockCwiseBinaryOp * \ingroup CXX11_Tensor_Module diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index a4d750885..5e812b04d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -108,11 +108,12 @@ struct TensorEvaluator, Device> bool isCopy= false, nByOne = false, oneByN = false; enum { - IsAligned = true, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - RawAccess = false + IsAligned = true, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + RawAccess = false }; typedef typename internal::remove_const::type ScalarNoConst; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index b4c4162ef..76fab39e2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -144,12 +144,13 @@ struct TensorEvaluator, Device> enum { // Alignment can't be guaranteed at compile time since it depends on the // slice offsets. - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; typedef typename internal::remove_const::type ScalarNoConst; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 27c92d8f6..3863ee8c3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -123,6 +123,7 @@ struct TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = false }; @@ -308,6 +309,7 @@ template::PacketAccess & TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = false }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index e604456e8..c459fc649 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -241,6 +241,7 @@ struct TensorContractionEvaluatorBase IsAligned = true, PacketAccess = (PacketType::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = true diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index a7751eee1..1f613d3c7 100644 --- 
a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -196,6 +196,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = true, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = false }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index a07e32db0..2d0e6599f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -308,6 +308,7 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -780,6 +781,7 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = false, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h index d301d0c01..e79958fc9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -243,6 +243,7 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = false, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h index 47b5a5a5e..d71b2e34b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h @@ -93,6 +93,7 @@ struct TensorEvaluator, Devi IsAligned = false, PacketAccess = (PacketType::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -255,6 +256,7 @@ struct TensorEvaluator::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index 256d499f2..554ee5f59 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -108,6 +108,7 @@ struct TensorEvaluator, Device> IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = true diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 028902fea..e30f9ad8e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -43,6 +43,7 @@ struct TensorEvaluator IsAligned = Derived::IsAligned, PacketAccess = (PacketType::size > 1), BlockAccess = internal::is_arithmetic::type>::value, + PreferBlockAccess = false, Layout = Derived::Layout, CoordAccess = NumCoords > 0, RawAccess = true @@ -195,6 +196,7 @@ struct TensorEvaluator IsAligned = 
Derived::IsAligned, PacketAccess = (PacketType::size > 1), BlockAccess = internal::is_arithmetic::type>::value, + PreferBlockAccess = false, Layout = Derived::Layout, CoordAccess = NumCoords > 0, RawAccess = true @@ -288,6 +290,7 @@ struct TensorEvaluator, Device> IsAligned = true, PacketAccess = internal::functor_traits::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -351,27 +354,34 @@ struct TensorEvaluator, Device> typedef TensorCwiseUnaryOp XprType; enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & - internal::functor_traits::PacketAccess, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & + internal::functor_traits::PacketAccess, + BlockAccess = TensorEvaluator::PacketAccess, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - : m_functor(op.functor()), + : m_device(device), + m_functor(op.functor()), m_argImpl(op.nestedExpression(), device) { } typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const::type ScalarNoConst; typedef typename internal::traits::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; static const int PacketSize = PacketType::size; typedef typename TensorEvaluator::Dimensions Dimensions; + static const int NumDims = internal::array_size::value; + typedef internal::TensorBlock + TensorBlock; + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { @@ -399,6 +409,29 @@ struct TensorEvaluator, Device> TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + m_argImpl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + TensorBlock* output_block) const { + if (NumDims <= 0) { + output_block->data()[0] = coeff(0); + return; + } + internal::TensorBlockView arg_block(m_device, m_argImpl, + *output_block); + internal::TensorBlockCwiseUnaryIO::Run(m_functor, + output_block->block_sizes(), + output_block + ->block_strides(), + output_block->data(), + arg_block.block_strides(), + arg_block.data()); + } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } /// required by sycl in order to extract the accessor @@ -408,6 +441,7 @@ struct TensorEvaluator, Device> private: + const Device& m_device; const UnaryOp m_functor; TensorEvaluator m_argImpl; }; @@ -421,16 +455,18 @@ struct TensorEvaluator XprType; enum { - IsAligned = TensorEvaluator::IsAligned & - TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & - TensorEvaluator::PacketAccess & - internal::functor_traits::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess & - TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = TensorEvaluator::IsAligned & + TensorEvaluator::IsAligned, + PacketAccess = 
TensorEvaluator::PacketAccess & + TensorEvaluator::PacketAccess & + internal::functor_traits::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess & + TensorEvaluator::BlockAccess, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess | + TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) @@ -501,7 +537,7 @@ struct TensorEvaluatordata()[0] = coeff(0); + output_block->data()[0] = coeff(Index(0)); return; } internal::TensorBlockView left_block( @@ -543,6 +579,7 @@ struct TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -648,6 +685,7 @@ struct TensorEvaluator PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & PacketType::HasBlend, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index d6ab4d997..480cf1f39 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -136,6 +136,7 @@ struct TensorEvaluator, D IsAligned = false, PacketAccess = true, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index 1342e47a6..71ba56773 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -42,6 +42,7 @@ class TensorFixedSize : public TensorBase0), PacketAccess = (internal::packet_traits::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = Options_ & RowMajor ? RowMajor : ColMajor, CoordAccess = true, RawAccess = true diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index 2778bf5ec..edf9f85d8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -99,6 +99,7 @@ struct TensorEvaluator, Device> IsAligned = true, PacketAccess = (PacketType::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = true }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 0dd524a30..b8a57ab99 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -132,7 +132,12 @@ struct IsVectorizable { template struct IsTileable { - static const bool value = TensorEvaluator::BlockAccess; + // Check that block evaluation is supported and it's a preferred option (at + // least one sub-expression has much faster block evaluation, e.g. + // broadcasting). 
+ static const bool value = + TensorEvaluator::BlockAccess & + TensorEvaluator::PreferBlockAccess; }; template , Device> IsAligned = false, PacketAccess = (PacketType::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 2b2f4a650..1826d7022 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -244,12 +244,13 @@ struct TensorEvaluator, Device> static const int PacketSize = PacketType::size; enum { - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = true, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = false + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = true, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false }; typedef internal::TensorBlock diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index 64f2ad81f..e28565009 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -91,6 +91,7 @@ struct TensorEvaluator, Device> IsAligned = /*TensorEvaluator::IsAligned*/ false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index e3165fa10..998757d14 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -120,6 +120,7 @@ struct TensorEvaluator, Device> IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, CoordAccess = false, // to be implemented RawAccess = TensorEvaluator::RawAccess @@ -183,6 +184,7 @@ template IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, CoordAccess = false // to be implemented }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index b25c1eabc..6f9294ccf 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -111,16 +111,17 @@ struct TensorEvaluator, Device> static const int NumInputDims = internal::array_size::Dimensions>::value; enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess, + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, // TODO(andydavis, wuke) Enable BlockAccess for the general case when the // performance issue with block-based reshape is resolved. 
- BlockAccess = TensorEvaluator::BlockAccess && - TensorEvaluator::RawAccess && - NumInputDims > 0 && NumOutputDims > 0, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = TensorEvaluator::RawAccess + BlockAccess = TensorEvaluator::BlockAccess && + TensorEvaluator::RawAccess && + NumInputDims > 0 && NumOutputDims > 0, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess }; typedef typename internal::remove_const::type ScalarNoConst; @@ -349,6 +350,7 @@ template IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = TensorEvaluator::RawAccess @@ -508,12 +510,13 @@ struct TensorEvaluator, Devi enum { // Alignment can't be guaranteed at compile time since it depends on the // slice offsets and sizes. - IsAligned = /*TensorEvaluator::IsAligned*/false, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = false + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false }; typedef typename internal::remove_const::type ScalarNoConst; @@ -785,12 +788,13 @@ struct TensorEvaluator, Device> typedef Sizes Dimensions; enum { - IsAligned = /*TensorEvaluator::IsAligned*/false, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = (NumDims == 1) & TensorEvaluator::RawAccess + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = (NumDims == 1) & TensorEvaluator::RawAccess }; typedef typename internal::remove_const::type ScalarNoConst; @@ -972,6 +976,7 @@ struct TensorEvaluator::Layout, RawAccess = false }; @@ -1148,6 +1153,7 @@ struct TensorEvaluator::Layout, CoordAccess = TensorEvaluator::CoordAccess, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index aa1db3c73..59c1704ed 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -97,6 +97,7 @@ struct TensorEvaluator, Device IsAligned = true, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = true, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index a0a1ad8f4..4292fe0c2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -95,6 +95,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 80f179ba4..3d534eaa2 100644 --- 
a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -475,12 +475,13 @@ struct TensorEvaluator, static const int PacketSize = PacketType::size; enum { - IsAligned = false, - PacketAccess = Self::InputPacketAccess && Op::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = false, + PacketAccess = Self::InputPacketAccess && Op::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; typedef typename internal::remove_const::type ScalarNoConst; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h index a6cade50f..6e15e75f9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h @@ -137,6 +137,7 @@ template class TensorRef : public TensorBase, Device> IsAligned = false, PacketAccess = false, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorRef::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -414,6 +416,7 @@ struct TensorEvaluator, Device> : public TensorEvaluator, Device IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -255,6 +256,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h index b1135f297..641366d9d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h @@ -97,6 +97,7 @@ struct TensorEvaluator, Device> { IsAligned = false, PacketAccess = (PacketType::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = true diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 98f125408..a5b541a68 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -111,12 +111,13 @@ struct TensorEvaluator, Device> static const int PacketSize = PacketType::size; enum { - IsAligned = false, - PacketAccess = (PacketType::size > 1), - BlockAccess = TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = false, + PacketAccess = (PacketType::size > 1), + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; typedef typename internal::remove_const::type ScalarNoConst; @@ -412,11 +413,12 @@ struct TensorEvaluator, Device> static const int PacketSize = PacketType::size; enum { - IsAligned = false, - PacketAccess = (PacketType::size > 1), - BlockAccess = TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - RawAccess = false + IsAligned = false, + PacketAccess = (PacketType::size > 1), + BlockAccess = 
TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + RawAccess = false }; typedef typename internal::remove_const::type ScalarNoConst; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 4b69072f2..221dc96c9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -113,6 +113,7 @@ struct TensorEvaluator, Device> IsAligned = /*TensorEvaluator::IsAligned*/false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -275,6 +276,7 @@ struct TensorEvaluator, Device> IsAligned = /*TensorEvaluator::IsAligned*/false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h index ea53bb04b..9fc54a4c0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h @@ -96,6 +96,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index 3c7d8bbc0..c1b7a58ca 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -200,6 +200,7 @@ struct TensorEvaluator, D IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false diff --git a/unsupported/test/cxx11_tensor_block_access.cpp b/unsupported/test/cxx11_tensor_block_access.cpp index 6feeff231..746f62511 100644 --- a/unsupported/test/cxx11_tensor_block_access.cpp +++ b/unsupported/test/cxx11_tensor_block_access.cpp @@ -517,6 +517,114 @@ static void test_block_io_squeeze_ones() { } } +template +static void test_block_cwise_unary_io_basic() { + typedef internal::scalar_square_op UnaryFunctor; + typedef internal::TensorBlockCwiseUnaryIO + TensorBlockCwiseUnaryIO; + + DSizes block_sizes = RandomDims(); + DSizes strides(ComputeStrides(block_sizes)); + + const auto total_size = block_sizes.TotalSize(); + + // Create a random input tensors. + T* input_data = GenerateRandomData(total_size); + + T* output_data = new T[total_size]; + UnaryFunctor functor; + TensorBlockCwiseUnaryIO::Run(functor, block_sizes, strides, output_data, + strides, input_data); + for (int i = 0; i < total_size; ++i) { + VERIFY_IS_EQUAL(output_data[i], functor(input_data[i])); + } + + delete[] input_data; + delete[] output_data; +} + +template +static void test_block_cwise_unary_io_squeeze_ones() { + typedef internal::scalar_square_op UnaryFunctor; + typedef internal::TensorBlockCwiseUnaryIO + TensorBlockCwiseUnaryIO; + + DSizes block_sizes(1, 2, 1, 3, 1); + DSizes strides(ComputeStrides(block_sizes)); + + const auto total_size = block_sizes.TotalSize(); + + // Create a random input tensors. 
+ auto* input_data = GenerateRandomData(total_size); + + auto* output_data = new float[total_size]; + UnaryFunctor functor; + TensorBlockCwiseUnaryIO::Run(functor, block_sizes, strides, output_data, + strides, input_data); + for (int i = 0; i < total_size; ++i) { + VERIFY_IS_EQUAL(output_data[i], functor(input_data[i])); + } + + delete[] input_data; + delete[] output_data; +} + +template +static void test_block_cwise_unary_io_zero_strides() { + typedef internal::scalar_square_op UnaryFunctor; + typedef internal::TensorBlockCwiseUnaryIO + TensorBlockCwiseUnaryIO; + + DSizes rnd_dims = RandomDims<5>(); + + DSizes input_sizes = rnd_dims; + input_sizes[0] = 1; + input_sizes[2] = 1; + input_sizes[4] = 1; + + DSizes input_strides(ComputeStrides(input_sizes)); + input_strides[0] = 0; + input_strides[2] = 0; + input_strides[4] = 0; + + // Generate random data. + auto* input_data = GenerateRandomData(input_sizes.TotalSize()); + + DSizes output_sizes = rnd_dims; + DSizes output_strides(ComputeStrides(output_sizes)); + + const auto output_total_size = output_sizes.TotalSize(); + auto* output_data = new float[output_total_size]; + + UnaryFunctor functor; + TensorBlockCwiseUnaryIO::Run(functor, output_sizes, output_strides, + output_data, input_strides, input_data); + for (int i = 0; i < rnd_dims[0]; ++i) { + for (int j = 0; j < rnd_dims[1]; ++j) { + for (int k = 0; k < rnd_dims[2]; ++k) { + for (int l = 0; l < rnd_dims[3]; ++l) { + for (int m = 0; m < rnd_dims[4]; ++m) { + Index output_index = i * output_strides[0] + j * output_strides[1] + + k * output_strides[2] + l * output_strides[3] + + m * output_strides[4]; + Index input_index = i * input_strides[0] + j * input_strides[1] + + k * input_strides[2] + l * input_strides[3] + + m * input_strides[4]; + VERIFY_IS_EQUAL(output_data[output_index], + functor(input_data[input_index])); + } + } + } + } + } + + delete[] input_data; + delete[] output_data; +} + template static void test_block_cwise_binary_io_basic() { typedef internal::scalar_sum_op BinaryFunctor; @@ -982,6 +1090,9 @@ EIGEN_DECLARE_TEST(cxx11_tensor_block_access) { TEST_LAYOUTS_AND_DIMS(Data, test_block_io_copy_using_reordered_dimensions); TEST_LAYOUTS(test_block_io_zero_stride); TEST_LAYOUTS(test_block_io_squeeze_ones); + TEST_LAYOUTS_AND_DIMS(float, test_block_cwise_unary_io_basic); + TEST_LAYOUTS(test_block_cwise_unary_io_squeeze_ones); + TEST_LAYOUTS(test_block_cwise_unary_io_zero_strides); TEST_LAYOUTS_AND_DIMS(float, test_block_cwise_binary_io_basic); TEST_LAYOUTS(test_block_cwise_binary_io_squeeze_ones); TEST_LAYOUTS(test_block_cwise_binary_io_zero_strides); diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp index d16ae4d11..1dc18220c 100644 --- a/unsupported/test/cxx11_tensor_executor.cpp +++ b/unsupported/test/cxx11_tensor_executor.cpp @@ -29,6 +29,33 @@ static array RandomDims(int min_dim = 1, int max_dim = 20) { return dims; }; +template +static void test_execute_unary_expr(Device d) { + static constexpr int Options = 0 | Layout; + + // Pick a large enough tensor size to bypass small tensor block evaluation + // optimization. 
+ auto dims = RandomDims(50 / NumDims, 100 / NumDims); + + Tensor src(dims); + Tensor dst(dims); + + src.setRandom(); + const auto expr = src.square(); + + using Assign = TensorAssignOp; + using Executor = + internal::TensorExecutor; + + Executor::run(Assign(dst, expr), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + T square = src.coeff(i) * src.coeff(i); + VERIFY_IS_EQUAL(square, dst.coeff(i)); + } +} + template static void test_execute_binary_expr(Device d) @@ -445,6 +472,10 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) { Eigen::ThreadPool tp(num_threads); Eigen::ThreadPoolDevice tp_device(&tp, num_threads); + CALL_SUBTEST_COMBINATIONS(test_execute_unary_expr, float, 3); + CALL_SUBTEST_COMBINATIONS(test_execute_unary_expr, float, 4); + CALL_SUBTEST_COMBINATIONS(test_execute_unary_expr, float, 5); + CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr, float, 3); CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr, float, 4); CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr, float, 5); -- cgit v1.2.3 From 81b38a155adf5d527bce5c84cf90cd83c28da445 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Tue, 11 Sep 2018 13:32:32 -0700 Subject: Fix compilation of tiled evaluation code with c++03 --- unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h | 6 +++--- unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 11 +++++------ unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 5 ++--- unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h | 5 ++--- unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 5 ++--- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 5 ++--- unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h | 4 ++-- unsupported/test/cxx11_tensor_block_access.cpp | 14 +++++++------- unsupported/test/cxx11_tensor_shuffling.cpp | 10 +++++----- 9 files changed, 30 insertions(+), 35 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h index 6d90af2d3..13da36257 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -396,8 +396,8 @@ struct TensorBlockCwiseUnaryOp { typedef const Eigen::Array Input; typedef Eigen::Array Output; - typedef Eigen::Map> InputMap; - typedef Eigen::Map> OutputMap; + typedef Eigen::Map > InputMap; + typedef Eigen::Map > OutputMap; const InputScalar* input_base = &input_data[input_index]; OutputScalar* output_base = &output_data[output_index]; @@ -502,7 +502,7 @@ struct TensorBlockCwiseUnaryIO { input_stride, input_data); // Update index. for (int j = 0; j < num_squeezed_dims; ++j) { - auto& state = block_iter_state[j]; + BlockIteratorState& state = block_iter_state[j]; if (++state.count < state.size) { output_index += state.output_stride; input_index += state.input_stride; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 02d061a9c..e5cf93ab0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -596,12 +596,11 @@ struct TensorEvaluator, Device> std::vector* resources) const { // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large // tensors. But this might need further tuning. 
- auto block_total_size_max = numext::maxi( + Eigen::Index block_total_size_max = numext::maxi( 1, m_device.firstLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( - internal::TensorBlockShapeType::kSkewedInnerDims, - block_total_size_max)); + internal::kSkewedInnerDims, block_total_size_max)); m_impl.getResourceRequirements(resources); } @@ -617,8 +616,8 @@ struct TensorEvaluator, Device> // equal to m_dimensions for inner dims, a smaller than m_dimensions[i] size // for the first outer dim, and 1 for other outer dims. This is guaranteed // by MergeResourceRequirements() in TensorBlock.h. - const auto& output_block_sizes = output_block->block_sizes(); - const auto& output_block_strides = output_block->block_strides(); + const Dimensions& output_block_sizes = output_block->block_sizes(); + const Dimensions& output_block_strides = output_block->block_strides(); // Find where outer dims start. int outer_dim_start = 0; @@ -642,7 +641,7 @@ struct TensorEvaluator, Device> return; } - const auto& input_dims = m_impl.dimensions(); + const Dimensions& input_dims = m_impl.dimensions(); // Pre-fill input_block_sizes, broadcast_block_sizes, // broadcast_block_strides, and broadcast_tensor_strides. Later on we will diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 76fab39e2..b47fa9e8e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -290,11 +290,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { - auto block_total_size_max = numext::maxi( + Eigen::Index block_total_size_max = numext::maxi( 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( - internal::TensorBlockShapeType::kSkewedInnerDims, - block_total_size_max)); + internal::kSkewedInnerDims, block_total_size_max)); m_impl.getResourceRequirements(resources); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 1826d7022..965bd8f1e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -550,11 +550,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { - auto block_total_size_max = numext::maxi( + Eigen::Index block_total_size_max = numext::maxi( 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( - internal::TensorBlockShapeType::kSkewedInnerDims, - block_total_size_max)); + internal::kSkewedInnerDims, block_total_size_max)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 2f765acb7..16dc74afe 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -677,11 +677,10 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { - auto block_total_size_max = numext::maxi( + Eigen::Index block_total_size_max = numext::maxi( 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( - 
internal::TensorBlockShapeType::kSkewedInnerDims, - block_total_size_max)); + internal::kSkewedInnerDims, block_total_size_max)); m_impl.getResourceRequirements(resources); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 3d534eaa2..eeb2578fd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -771,11 +771,10 @@ struct TensorEvaluator, EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { - auto block_total_size_max = numext::maxi( + Eigen::Index block_total_size_max = numext::maxi( 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( - internal::TensorBlockShapeType::kSkewedInnerDims, - block_total_size_max)); + internal::kSkewedInnerDims, block_total_size_max)); m_impl.getResourceRequirements(resources); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index a5b541a68..e018d0ab2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -229,10 +229,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { - auto block_total_size_max = numext::maxi( + Eigen::Index block_total_size_max = numext::maxi( 1, m_device.firstLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( - internal::TensorBlockShapeType::kUniformAllDims, block_total_size_max)); + internal::kUniformAllDims, block_total_size_max)); m_impl.getResourceRequirements(resources); } diff --git a/unsupported/test/cxx11_tensor_block_access.cpp b/unsupported/test/cxx11_tensor_block_access.cpp index eec282ba7..ad12ae557 100644 --- a/unsupported/test/cxx11_tensor_block_access.cpp +++ b/unsupported/test/cxx11_tensor_block_access.cpp @@ -535,7 +535,7 @@ static void test_block_cwise_unary_io_basic() { DSizes block_sizes = RandomDims(); DSizes strides(ComputeStrides(block_sizes)); - const auto total_size = block_sizes.TotalSize(); + const Index total_size = block_sizes.TotalSize(); // Create a random input tensors. T* input_data = GenerateRandomData(total_size); @@ -562,12 +562,12 @@ static void test_block_cwise_unary_io_squeeze_ones() { DSizes block_sizes(1, 2, 1, 3, 1); DSizes strides(ComputeStrides(block_sizes)); - const auto total_size = block_sizes.TotalSize(); + const Index total_size = block_sizes.TotalSize(); // Create a random input tensors. - auto* input_data = GenerateRandomData(total_size); + float* input_data = GenerateRandomData(total_size); - auto* output_data = new float[total_size]; + float* output_data = new float[total_size]; UnaryFunctor functor; TensorBlockCwiseUnaryIO::Run(functor, block_sizes, strides, output_data, strides, input_data); @@ -599,13 +599,13 @@ static void test_block_cwise_unary_io_zero_strides() { input_strides[4] = 0; // Generate random data. 
- auto* input_data = GenerateRandomData(input_sizes.TotalSize()); + float* input_data = GenerateRandomData(input_sizes.TotalSize()); DSizes output_sizes = rnd_dims; DSizes output_strides(ComputeStrides(output_sizes)); - const auto output_total_size = output_sizes.TotalSize(); - auto* output_data = new float[output_total_size]; + const Index output_total_size = output_sizes.TotalSize(); + float* output_data = new float[output_total_size]; UnaryFunctor functor; TensorBlockCwiseUnaryIO::Run(functor, output_sizes, output_strides, diff --git a/unsupported/test/cxx11_tensor_shuffling.cpp b/unsupported/test/cxx11_tensor_shuffling.cpp index 467df39c7..062dd1c0f 100644 --- a/unsupported/test/cxx11_tensor_shuffling.cpp +++ b/unsupported/test/cxx11_tensor_shuffling.cpp @@ -81,12 +81,12 @@ static void test_expr_shuffling() Tensor expected; expected = tensor.shuffle(shuffles); - Tensor result(5,7,3,2); + Tensor result(5, 7, 3, 2); - array src_slice_dim{{2,3,1,7}}; - array src_slice_start{{0,0,0,0}}; - array dst_slice_dim{{1,7,3,2}}; - array dst_slice_start{{0,0,0,0}}; + array src_slice_dim({2, 3, 1, 7}); + array src_slice_start({0, 0, 0, 0}); + array dst_slice_dim({1, 7, 3, 2}); + array dst_slice_start({0, 0, 0, 0}); for (int i = 0; i < 5; ++i) { result.slice(dst_slice_start, dst_slice_dim) = -- cgit v1.2.3 From 01197e44527941c95f9a63e4f60ab3a989f12cbe Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Thu, 13 Sep 2018 15:03:36 -0700 Subject: Fix warnings --- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 2 -- unsupported/test/cxx11_tensor_executor.cpp | 8 ++++---- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index eeb2578fd..261dbffa8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -1140,7 +1140,6 @@ struct TensorEvaluator, const Index max_coeff_count, const DSizes& input_slice_sizes, DSizes* target_input_block_sizes) const { - typedef typename internal::packet_traits::type Packet; typedef internal::BlockReducer BlockReducer; // TODO(andydavis) Compute reducer overhead correctly for the case where // we are preserving the inner most dimension, and a single reducer @@ -1168,7 +1167,6 @@ struct TensorEvaluator, } else if (!first_preserved_dim_allocated) { // TODO(andydavis) Include output block size in this L1 working set // calculation. 
- const Index allocated = max_coeff_count - coeff_to_allocate; const Index alloc_size = numext::maxi( static_cast(1), coeff_to_allocate / reducer_overhead); (*target_input_block_sizes)[dim] = diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp index 8639e7b38..1bb99854c 100644 --- a/unsupported/test/cxx11_tensor_executor.cpp +++ b/unsupported/test/cxx11_tensor_executor.cpp @@ -27,7 +27,7 @@ static array RandomDims(int min_dim = 1, int max_dim = 20) { dims[i] = internal::random(min_dim, max_dim); } return dims; -}; +} template @@ -117,7 +117,7 @@ static void test_execute_broadcasting(Device d) for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); } -}; +} template @@ -155,7 +155,7 @@ static void test_execute_chipping_rvalue(Device d) { TEST_CHIPPING(5) #undef TEST_CHIPPING -}; +} template @@ -207,7 +207,7 @@ static void test_execute_chipping_lvalue(Device d) { TEST_CHIPPING(5) #undef TEST_CHIPPING -}; +} template -- cgit v1.2.3
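
Illustrative note (not part of the patch): the user-visible effect of the PreferBlockAccess / IsTileable changes above is that an expression is routed through the tiled (block-based) executor only when block evaluation is both supported and preferred by at least one sub-expression, e.g. broadcasting. A minimal sketch of such an expression using only Eigen's public unsupported Tensor API follows; the tensor sizes and broadcast factors are arbitrary, and whether the block path is actually taken still depends on the BlockAccess flags of every evaluator in the tree and on the device used.

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 3> src(32, 64, 16);
  src.setRandom();

  // Broadcasting reports PreferBlockAccess = true, and the unary cwise
  // evaluator now implements block(), so this assignment is a candidate for
  // tiled evaluation instead of coefficient-at-a-time evaluation.
  Eigen::array<Eigen::Index, 3> bcast{{2, 1, 1}};
  Eigen::Tensor<float, 3> dst = src.broadcast(bcast).square();

  return dst.size() > 0 ? 0 : 1;
}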
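
The cxx11_tensor_block_access tests added above (test_block_cwise_unary_io_*) exercise a strided block traversal: the output block is walked with its own strides while the input is read through independent strides, so an input dimension of size one can be broadcast by giving it stride 0. The following is a simplified, self-contained sketch of that traversal idea, not Eigen's internal TensorBlockCwiseUnaryIO implementation; it is just a plain column-major odometer loop.

#include <array>
#include <cstddef>

// Apply `f` to every coefficient of a NumDims-dimensional block.
// `sizes` are the block dimensions; `out` and `in` are addressed through
// their own stride arrays, so an input stride of 0 replays the same input
// data along that dimension (the "zero strides" case tested above).
template <typename Functor, typename T, std::size_t NumDims>
void RunUnaryBlock(const Functor& f,
                   const std::array<std::ptrdiff_t, NumDims>& sizes,
                   const std::array<std::ptrdiff_t, NumDims>& out_strides,
                   T* out,
                   const std::array<std::ptrdiff_t, NumDims>& in_strides,
                   const T* in) {
  std::ptrdiff_t total = 1;
  for (std::size_t d = 0; d < NumDims; ++d) total *= sizes[d];

  std::array<std::ptrdiff_t, NumDims> idx{};  // multi-dimensional counter
  for (std::ptrdiff_t n = 0; n < total; ++n) {
    std::ptrdiff_t out_off = 0, in_off = 0;
    for (std::size_t d = 0; d < NumDims; ++d) {
      out_off += idx[d] * out_strides[d];
      in_off  += idx[d] * in_strides[d];
    }
    out[out_off] = f(in[in_off]);
    // Advance the counter, first dimension fastest (column-major order).
    for (std::size_t d = 0; d < NumDims; ++d) {
      if (++idx[d] < sizes[d]) break;
      idx[d] = 0;
    }
  }
}

Calling this with a squaring functor and an input stride array containing zeros in the size-one dimensions reproduces, in spirit, the behaviour that test_block_cwise_unary_io_zero_strides verifies.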
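
On the "Fix compilation of tiled evaluation code with c++03" commit: the changes replace `auto` with explicit types and insert a space in nested template closings (`> >`) because C++03 has no `auto` type deduction and parses a closing `>>` token as a right-shift operator. A small illustrative snippet (not taken from the patch) of the C++03-safe spelling:

#include <Eigen/Core>

// C++03-safe: note the space in "> >"; in C++11 and later both "> >" and
// ">>" are accepted here.
typedef Eigen::Map<Eigen::Array<float, Eigen::Dynamic, 1> > FloatArrayMap;

void scaleInPlace(float* data, Eigen::Index n) {
  FloatArrayMap values(data, n);  // explicit type instead of `auto values = ...`
  values = values * 2.0f;
}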