From 83c0a16baf5ecac6288cd9b74536a82de8985b31 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Tue, 31 Jul 2018 15:56:31 -0700 Subject: Add block evaluation support to TensorOps --- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 437 ++++++++++++++++++++- 1 file changed, 431 insertions(+), 6 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 375fc0802..05c0990dc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -356,6 +356,70 @@ template __global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); #endif +template +class BlockReducer { + public: + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::CoeffReturnType CoeffReturnType; + typedef typename Self::PacketReturnType PacketReturnType; + explicit BlockReducer(const Op& reducer) : op_(reducer) { + accum_ = op_.initialize(); + } + void Reduce(Index index, Index num_values_to_reduce, Scalar* data) { + for (Index i = 0; i < num_values_to_reduce; ++i) { + op_.reduce(data[index + i], &accum_); + } + } + CoeffReturnType Finalize() { return op_.finalize(accum_); } + PacketReturnType FinalizePacket() { + // TODO(andydavis) This function should not be called for Scalar + // reductions: clean this up or add an assert here. + return PacketReturnType(); + } + + private: + CoeffReturnType accum_; + Op op_; +}; + +template +class BlockReducer { + public: + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::CoeffReturnType CoeffReturnType; + typedef typename Self::PacketReturnType PacketReturnType; + static const Index PacketSize = + internal::unpacket_traits::size; + + explicit BlockReducer(const Op& reducer) : op_(reducer) { + vaccum_ = op_.template initializePacket(); + accum_ = op_.initialize(); + } + void Reduce(Index index, Index num_values_to_reduce, Scalar* data) { + const Index vectorized_size = + (num_values_to_reduce / PacketSize) * PacketSize; + for (Index i = 0; i < vectorized_size; i += PacketSize) { + op_.reducePacket( + internal::ploadt(&data[index + i]), + &vaccum_); + } + for (Index i = vectorized_size; i < num_values_to_reduce; ++i) { + op_.reduce(data[index + i], &accum_); + } + } + CoeffReturnType Finalize() { return op_.finalizeBoth(accum_, vaccum_); } + PacketReturnType FinalizePacket() { return op_.finalizePacket(vaccum_); } + + private: + PacketReturnType vaccum_; + CoeffReturnType accum_; + Op op_; +}; + } // end namespace internal @@ -394,6 +458,7 @@ class TensorReductionOp : public TensorBase class MakePointer_, typename Device> struct TensorEvaluator, Device> { + typedef internal::reducer_traits ReducerTraits; typedef TensorReductionOp XprType; typedef typename XprType::Index Index; typedef ArgType ChildType; @@ -410,14 +475,19 @@ struct TensorEvaluator, static const int PacketSize = internal::unpacket_traits::size; enum { - IsAligned = false, + IsAligned = false, PacketAccess = Self::InputPacketAccess && Op::PacketAccess, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + BlockAccess = TensorEvaluator::BlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; + using ScalarNoConst = typename internal::remove_const::type; + + using 
OutputTensorBlock = internal::TensorBlock; + using InputTensorBlock = internal::TensorBlock; + static const bool ReducingInnerMostDims = internal::are_inner_most_dims::value; static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims::value; static const bool RunningFullReduction = (NumOutputDims==0); @@ -451,11 +521,13 @@ struct TensorEvaluator, m_outputStrides[0] = 1; for (int i = 1; i < NumOutputDims; ++i) { m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } } else { - m_outputStrides.back() = 1; + m_outputStrides[NumOutputDims - 1] = 1; for (int i = NumOutputDims - 2; i >= 0; --i) { m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } } } @@ -483,6 +555,7 @@ struct TensorEvaluator, ++reduceIndex; } else { m_preservedStrides[outputIndex] = input_strides[i]; + m_output_to_input_dim_map[outputIndex] = i; ++outputIndex; } } @@ -492,6 +565,16 @@ struct TensorEvaluator, if (NumOutputDims == 0) { m_preservedStrides[0] = internal::array_prod(input_dims); } + + m_numValuesToReduce = + NumOutputDims == 0 + ? internal::array_prod(input_dims) + : (static_cast(Layout) == static_cast(ColMajor)) + ? m_preservedStrides[0] + : m_preservedStrides[NumOutputDims - 1]; + + m_block_total_size_max = + numext::maxi(1, device.lastLevelCacheSize() / sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -686,6 +769,265 @@ struct TensorEvaluator, } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + resources->push_back(internal::TensorOpResourceRequirements( + internal::TensorBlockShapeType::kSkewedInnerDims, + m_block_total_size_max)); + m_impl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void block( + OutputTensorBlock* output_block) const { + // Special case full reductions to avoid input block copy below. + if (NumInputDims == NumReducedDims) { + eigen_assert(output_block->first_coeff_index() == 0); + eigen_assert(output_block->block_sizes().TotalSize() == 1); + Op reducer(m_reducer); + output_block->data()[0] = internal::InnerMostDimReducer::reduce( + *this, 0, m_numValuesToReduce, reducer); + return; + } + + // Calculate input tensor 'slice' required to reduce output block coeffs. + DSizes input_slice_sizes(m_impl.dimensions()); + for (int i = 0; i < NumOutputDims; ++i) { + // Clip preserved input dimensions by output block size. + input_slice_sizes[m_output_to_input_dim_map[i]] = + output_block->block_sizes()[i]; + } + + // Shard input tensor slice into blocks (because it could be large if we + // need to reduce along several dimensions to calculate required output + // coefficients). + const Index max_coeff_count = + numext::mini(((m_device.firstLevelCacheSize()) / sizeof(Scalar)), + input_slice_sizes.TotalSize()); + + // Calculate max output shard size needed to keep working set of reducers + // in L1, while leaving enough space for reducer overhead and 'PacketSize' + // reductions. + DSizes target_input_block_sizes; + CalculateTargetInputBlockShape(max_coeff_count, input_slice_sizes, + &target_input_block_sizes); + // Calculate indices for first preserved dimension. + const Index first_preserved_dim_output_index = + static_cast(Layout) == static_cast(ColMajor) + ? 
0 + : NumOutputDims - 1; + const Index first_preserved_dim_input_index = + m_output_to_input_dim_map[first_preserved_dim_output_index]; + const bool inner_most_dim_preserved = + first_preserved_dim_input_index == + (static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumInputDims - 1) | + PreservingInnerMostDims; + + // Calculate output block inner/outer dimension sizes. + const Index output_block_inner_dim_size = + output_block->block_sizes()[first_preserved_dim_output_index]; + const Index output_block_outer_dim_size = + output_block->block_sizes().TotalSize() / output_block_inner_dim_size; + // Calculate shard size for first preserved dimension. + const Index output_shard_size = + target_input_block_sizes[first_preserved_dim_input_index]; + const Index num_output_shards = + (output_block_inner_dim_size + output_shard_size - 1) / + output_shard_size; + + // Initialize 'tensor_slice_offsets' from input coords of output index. + DSizes tensor_slice_offsets; + GetInputCoordsForOutputIndex(output_block->first_coeff_index(), + &tensor_slice_offsets); + + // Store tensor slice offset in first preserved dimension to be used + // to update tensor slice extents in loop below. + const Index first_preserved_dim_offset_start = + tensor_slice_offsets[first_preserved_dim_input_index]; + + array block_iter_state; + + // Initialize state used to iterate through output coefficients + // and update 'tensor_slice_offsets' in outer preserved dims. + for (int i = 0; i < NumOutputDims - 1; ++i) { + const int dim = static_cast(Layout) == static_cast(ColMajor) + ? i + 1 + : NumOutputDims - i - 2; + block_iter_state[i].input_dim = m_output_to_input_dim_map[dim]; + block_iter_state[i].output_size = output_block->block_sizes()[dim]; + block_iter_state[i].output_count = 0; + } + + // Allocate input block memory. + ScalarNoConst* input_block_data = static_cast( + m_device.allocate(max_coeff_count * sizeof(Scalar))); + // Allocate reducer memory. + const bool packet_reductions_enabled = + (Self::InputPacketAccess & Self::ReducerTraits::PacketAccess); + const Index num_reducers = + (inner_most_dim_preserved && packet_reductions_enabled) + ? (output_shard_size / PacketSize + output_shard_size % PacketSize + + PacketSize) + : output_shard_size; + typedef internal::BlockReducer BlockReducer; + BlockReducer* reducers = static_cast( + m_device.allocate(num_reducers * sizeof(BlockReducer))); + + InputDimensions input_tensor_dims(m_impl.dimensions()); + for (Index output_outer_index = 0; + output_outer_index < output_block_outer_dim_size; + ++output_outer_index) { + for (Index output_shard_index = 0; output_shard_index < num_output_shards; + ++output_shard_index) { + // Initialize 'tensor_slice_extents' for this output shard. + DSizes tensor_slice_extents(input_slice_sizes); + for (int i = 0; i < NumInputDims; ++i) { + if (i == first_preserved_dim_input_index) { + // Clip first preserved dim size to output shard size. + tensor_slice_extents[i] = numext::mini( + output_shard_size, + input_slice_sizes[i] - (tensor_slice_offsets[i] - + first_preserved_dim_offset_start)); + + } else if (!m_reduced[i]) { + // Clip outer preserved dims to size 1, so that we reduce a + // contiguous set of output coefficients. + tensor_slice_extents[i] = 1; + } + } + + // Intialize output coefficient reducers. 
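        // The reducer buffer above is a raw device allocation, so each
        // reducer is constructed in place with placement new below, and the
        // buffer is simply handed back via m_device.deallocate() at the end
        // of this method. Illustrative numbers only: with
        // output_shard_size = 35 and PacketSize = 4, the packet-enabled
        // branch sizes the buffer as 35 / 4 + 35 % 4 + 4 = 8 + 3 + 4 = 15
        // reducers -- 8 packet reducers for the vectorizable prefix, 3
        // scalar reducers for the tail, plus PacketSize extra entries.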
+ for (int i = 0; i < num_reducers; ++i) { + new (&reducers[i]) BlockReducer(m_reducer); + } + + using TensorSliceBlockMapper = + internal::TensorSliceBlockMapper; + + // TODO(andydavis) Consider removing 'input_block_stride_order' if we + // find that scattered reads are not worth supporting in + // TensorSliceBlockMapper. + TensorSliceBlockMapper block_mapper( + input_tensor_dims, tensor_slice_offsets, tensor_slice_extents, + target_input_block_sizes, DimensionList()); + + const Index num_outputs_to_update = + tensor_slice_extents[first_preserved_dim_input_index]; + const Index preserved_dim_vector_reducer_count = + (inner_most_dim_preserved && packet_reductions_enabled) + ? num_outputs_to_update / PacketSize + : 0; + const Index preserved_dim_vector_coeff_count = + inner_most_dim_preserved + ? preserved_dim_vector_reducer_count * PacketSize + : 0; + const Index preserved_dim_reducer_limit = + (inner_most_dim_preserved && packet_reductions_enabled) + ? (preserved_dim_vector_reducer_count + + num_outputs_to_update % PacketSize) + : num_outputs_to_update; + + const Index total_block_count = block_mapper.total_block_count(); + for (Index b = 0; b < total_block_count; ++b) { + InputTensorBlock input_block = + block_mapper.GetBlockForIndex(b, input_block_data); + // Read. + m_impl.block(&input_block); + + Index num_values_to_reduce = 1; + for (Index i = 0; i < NumInputDims; ++i) { + if (m_reduced[i]) { + num_values_to_reduce *= input_block.block_sizes()[i]; + } + } + // Reduce. + if (inner_most_dim_preserved) { + const Index input_outer_dim_size = + input_block.block_sizes().TotalSize() / num_outputs_to_update; + for (Index input_outer_dim_index = 0; + input_outer_dim_index < input_outer_dim_size; + ++input_outer_dim_index) { + const Index input_outer_dim_base = + input_outer_dim_index * num_outputs_to_update; + for (Index i = 0; i < preserved_dim_vector_reducer_count; ++i) { + reducers[i].Reduce(input_outer_dim_base + i * PacketSize, + PacketSize, input_block.data()); + } + const Index scalar_reducer_base = + input_outer_dim_base + preserved_dim_vector_coeff_count; + for (Index i = preserved_dim_vector_reducer_count; + i < preserved_dim_reducer_limit; ++i) { + reducers[i].Reduce(scalar_reducer_base + i - + preserved_dim_vector_reducer_count, + 1, input_block.data()); + } + } + } else { + for (Index i = 0; i < num_outputs_to_update; ++i) { + reducers[i].Reduce(i * num_values_to_reduce, num_values_to_reduce, + input_block.data()); + } + } + } + + // Finalize all reducers for this output shard. 
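        // Each packet reducer accumulated PacketSize consecutive output
        // coefficients, so its finalized packet is stored to a small aligned
        // scratch array and copied out; each remaining scalar reducer owns
        // exactly one output coefficient. output_base_index locates this
        // shard inside the output block: the outer preserved dims select a
        // run of output_block_inner_dim_size coefficients, and the shard
        // index an offset of output_shard_size within that run.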
+ const Index output_base_index = + output_outer_index * output_block_inner_dim_size + + output_shard_index * output_shard_size; + if (inner_most_dim_preserved) { + EIGEN_ALIGN_MAX + typename internal::remove_const::type + values[PacketSize]; + for (Index i = 0; i < preserved_dim_vector_reducer_count; ++i) { + const Index reducer_base = output_base_index + i * PacketSize; + internal::pstore( + values, reducers[i].FinalizePacket()); + for (Index j = 0; j < PacketSize; ++j) { + output_block->data()[reducer_base + j] = values[j]; + } + } + const Index scalar_reducer_base = + output_base_index + preserved_dim_vector_coeff_count; + + for (Index i = preserved_dim_vector_reducer_count; + i < preserved_dim_reducer_limit; ++i) { + output_block->data()[scalar_reducer_base + i - + preserved_dim_vector_reducer_count] = + reducers[i].Finalize(); + } + } else { + for (int i = 0; i < num_outputs_to_update; ++i) { + output_block->data()[output_base_index + i] = + reducers[i].Finalize(); + } + } + + // Update 'tensor_slice_offsets' by num outputs for this output shard. + tensor_slice_offsets[first_preserved_dim_input_index] += + num_outputs_to_update; + } + // Update slice offset for inner preserved dim. + tensor_slice_offsets[first_preserved_dim_input_index] -= + output_block_inner_dim_size; + // Update slice offsets for remaining output dims. + for (int i = 0; i < NumOutputDims - 1; ++i) { + BlockIteratorState& b = block_iter_state[i]; + if (++b.output_count < b.output_size) { + ++tensor_slice_offsets[b.input_dim]; + break; + } + b.output_count = 0; + tensor_slice_offsets[b.input_dim] -= b.output_size - 1; + } + } + + // Free memory. + m_device.deallocate(input_block_data); + m_device.deallocate(reducers); + } + EIGEN_DEVICE_FUNC typename MakePointer_::Type data() const { return m_result; } #if defined(EIGEN_USE_SYCL) @@ -722,6 +1064,12 @@ struct TensorEvaluator, template friend struct internal::InnerReducer; + struct BlockIteratorState { + Index input_dim; + Index output_size; + Index output_count; + }; + // Returns the Index in the input tensor of the first value that needs to be // used to compute the reduction at output index "index". EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { @@ -764,16 +1112,90 @@ struct TensorEvaluator, return startInput; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void GetInputCoordsForOutputIndex( + Index index, + DSizes* coords) const { + for (int i = 0; i < NumInputDims; ++i) { + (*coords)[i] = 0; + } + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumOutputDims - 1; i > 0; --i) { + const Index idx = index / m_fastOutputStrides[i]; + (*coords)[m_output_to_input_dim_map[i]] = idx; + index -= idx * m_outputStrides[i]; + } + (*coords)[m_output_to_input_dim_map[0]] = index; + } else { + for (int i = 0; i < NumOutputDims - 1; ++i) { + const Index idx = index / m_fastOutputStrides[i]; + (*coords)[m_output_to_input_dim_map[i]] = idx; + index -= idx * m_outputStrides[i]; + } + (*coords)[m_output_to_input_dim_map[NumOutputDims-1]] = index; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void CalculateTargetInputBlockShape( + const Index max_coeff_count, + const DSizes& input_slice_sizes, + DSizes* target_input_block_sizes) const { + typedef typename internal::packet_traits::type Packet; + typedef internal::BlockReducer BlockReducer; + // TODO(andydavis) Compute reducer overhead correctly for the case where + // we are preserving the inner most dimension, and a single reducer + // reduces a packet's worth of output coefficients. 
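    // reducer_overhead below expresses the per-output reducer state in units
    // of Scalar so it can be charged against max_coeff_count, which is also a
    // count of scalars. Illustrative numbers only: with
    // max_coeff_count = 8192 and reducer_overhead = 8, the first preserved
    // dimension is capped at roughly 8192 / 8 = 1024 outputs before reducer
    // state alone would exhaust the L1 budget.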
+ const Index reducer_overhead = sizeof(BlockReducer) / sizeof(Scalar); + + Index coeff_to_allocate = max_coeff_count; + bool first_preserved_dim_allocated = false; + bool first_reduced_dim_allocated = false; + for (int i = 0; i < NumInputDims; ++i) { + const int dim = static_cast(Layout) == static_cast(ColMajor) + ? i + : NumInputDims - i - 1; + (*target_input_block_sizes)[dim] = 1; + if (m_reduced[dim]) { + // TODO(andydavis) Consider allocating to multiple reduced dimensions. + // Watch out for cases where reduced dimensions are not contiguous, + // which induces scattered reads. + if (!first_reduced_dim_allocated) { + (*target_input_block_sizes)[dim] = + numext::mini(input_slice_sizes[dim], coeff_to_allocate); + coeff_to_allocate /= (*target_input_block_sizes)[dim]; + first_reduced_dim_allocated = true; + } + } else if (!first_preserved_dim_allocated) { + // TODO(andydavis) Include output block size in this L1 working set + // calculation. + const Index allocated = max_coeff_count - coeff_to_allocate; + const Index alloc_size = numext::maxi( + static_cast(1), coeff_to_allocate / reducer_overhead); + (*target_input_block_sizes)[dim] = + numext::mini(input_slice_sizes[dim], alloc_size); + coeff_to_allocate = numext::maxi( + static_cast(1), + coeff_to_allocate / + ((*target_input_block_sizes)[dim] * reducer_overhead)); + first_preserved_dim_allocated = true; + } + } + } + // Bitmap indicating if an input dimension is reduced or not. array m_reduced; // Dimensions of the output of the operation. Dimensions m_dimensions; // Precomputed strides for the output tensor. array m_outputStrides; + array, NumOutputDims> m_fastOutputStrides; // Subset of strides of the input tensor for the non-reduced dimensions. // Indexed by output dimensions. static const int NumPreservedStrides = max_n_1::size; array m_preservedStrides; + // Map from output to input dimension index. + array m_output_to_input_dim_map; + // How many values go into each reduction + Index m_numValuesToReduce; // Subset of strides of the input tensor for the reduced dimensions. // Indexed by reduced dimensions. @@ -782,6 +1204,9 @@ struct TensorEvaluator, // Indexed by reduced dimensions. array m_reducedDims; + // Block size for tiled (aka TensorBlock) evaluation. + Index m_block_total_size_max; + // Evaluator for the input expression. TensorEvaluator m_impl; -- cgit v1.2.3 From 64abdf1d7eb17174f571751346dd0cbadcf3bc52 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Wed, 1 Aug 2018 12:35:19 -0700 Subject: Fix typo + get rid of redundant member variables for block sizes --- .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 6 +++--- unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 17 ++++++++--------- unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h | 11 +++++------ unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 12 +++--------- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 10 +++------- unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h | 18 ++++++++++-------- 6 files changed, 32 insertions(+), 42 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index b6dbe5a22..cca14aafd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -120,7 +120,7 @@ struct TensorEvaluator, Device> // Block based access to the XprType (input) tensor. 
using TensorBlock = internal::TensorBlock; using TensorBlockReader = internal::TensorBlockReader; - // We do block based broadcasting using a a trick with 2x tensor rank and 0 + // We do block based broadcasting using a trick with 2x tensor rank and 0 // strides. See block method implementation for details. using BroadcastDimensions = DSizes; using BroadcastTensorBlock = internal::TensorBlock; @@ -589,8 +589,8 @@ struct TensorEvaluator, Device> std::vector* resources) const { // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large // tensors. But this might need further tuning. - Index l1_cache_scalars = m_device.firstLevelCacheSize() / sizeof(Scalar); - Index block_total_size_max = numext::maxi(Index(1), l1_cache_scalars); + auto block_total_size_max = numext::maxi( + 1, m_device.firstLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( internal::TensorBlockShapeType::kSkewedInnerDims, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 7579ab507..aca2ead12 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -202,9 +202,6 @@ struct TensorEvaluator, Device> m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; } } - - m_block_total_size_max = - numext::maxi(1, device.lastLevelCacheSize() / sizeof(Scalar)); } } @@ -290,9 +287,11 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { + auto block_total_size_max = numext::maxi( + 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( internal::TensorBlockShapeType::kSkewedInnerDims, - m_block_total_size_max)); + block_total_size_max)); m_impl.getResourceRequirements(resources); } @@ -370,13 +369,14 @@ struct TensorEvaluator, Device> { Index inputIndex; if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == 0) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims - 1)) { // m_stride is equal to 1, so let's avoid the integer division. eigen_assert(m_stride == 1); inputIndex = index * m_inputStride + m_inputOffset; - } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims-1) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { - // m_stride is aways greater than index, so let's avoid the integer division. + } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims - 1) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { + // m_stride is aways greater than index, so let's avoid the integer + // division. 
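      // This branch covers chipping along the outer-most dimension (the last
      // dimension for ColMajor, the first for RowMajor): the chipped slice is
      // then a single contiguous run of the input, so the offset is added
      // directly instead of dividing by the stride.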
eigen_assert(m_stride > index); inputIndex = index + m_inputOffset; } else { @@ -392,7 +392,6 @@ struct TensorEvaluator, Device> Index m_stride; Index m_inputOffset; Index m_inputStride; - Index m_block_total_size_max; DSizes m_inputStrides; TensorEvaluator m_impl; const internal::DimensionId m_dim; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 39759b6c3..a8247be90 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -259,7 +259,7 @@ struct TensorEvaluator, Device> #else EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device) #endif - : m_impl(op.expression(), device) + : m_device(device), m_impl(op.expression(), device) #ifdef EIGEN_USE_SYCL , m_op(op) #endif @@ -404,9 +404,6 @@ struct TensorEvaluator, Device> } else { m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[NumDims-1]); } - - m_block_total_size_max = - numext::maxi(1, device.lastLevelCacheSize() / sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -551,9 +548,11 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { + auto block_total_size_max = numext::maxi( + 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( internal::TensorBlockShapeType::kSkewedInnerDims, - m_block_total_size_max)); + block_total_size_max)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( @@ -743,8 +742,8 @@ struct TensorEvaluator, Device> internal::TensorIntDivisor m_fastOutputDepth; Scalar m_paddingValue; - Index m_block_total_size_max; + const Device& m_device; TensorEvaluator m_impl; #ifdef EIGEN_USE_SYCL // Required for SYCL in order to construct the expression tree on the device diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 2630311b8..6ddded0bd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -560,9 +560,6 @@ struct TensorEvaluator, Devi m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } } - - m_block_total_size_max = - numext::maxi(1, device.lastLevelCacheSize() / sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -672,9 +669,11 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { + auto block_total_size_max = numext::maxi( + 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( internal::TensorBlockShapeType::kSkewedInnerDims, - m_block_total_size_max)); + block_total_size_max)); m_impl.getResourceRequirements(resources); } @@ -761,7 +760,6 @@ struct TensorEvaluator, Devi Dimensions m_dimensions; bool m_is_identity; const StartIndices m_offsets; - Index m_block_total_size_max; }; @@ -1047,9 +1045,6 @@ struct TensorEvaluator(degenerate ? 
1 : m_outputStrides[i]); } } - m_block_total_size_max = numext::maxi(static_cast(1), - device.lastLevelCacheSize() / - sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -1128,7 +1123,6 @@ struct TensorEvaluator m_dimensions; DSizes m_offsets; // offset in a flattened shape const Strides m_strides; - std::size_t m_block_total_size_max; //use by sycl const StartIndices m_exprStartIndices; //use by sycl diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index c41783106..73675e7dd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -572,9 +572,6 @@ struct TensorEvaluator, : (static_cast(Layout) == static_cast(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1]; - - m_block_total_size_max = - numext::maxi(1, device.lastLevelCacheSize() / sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -771,9 +768,11 @@ struct TensorEvaluator, EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { + auto block_total_size_max = numext::maxi( + 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( internal::TensorBlockShapeType::kSkewedInnerDims, - m_block_total_size_max)); + block_total_size_max)); m_impl.getResourceRequirements(resources); } @@ -1204,9 +1203,6 @@ struct TensorEvaluator, // Indexed by reduced dimensions. array m_reducedDims; - // Block size for tiled (aka TensorBlock) evaluation. - Index m_block_total_size_max; - // Evaluator for the input expression. 
TensorEvaluator m_impl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 77f47bf64..f94c1380d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -124,8 +124,11 @@ struct TensorEvaluator, Device> using TensorBlock = internal::TensorBlock; using TensorBlockReader = internal::TensorBlockReader; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_shuffle(op.shufflePermutation()) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, + const Device& device) + : m_device(device), + m_impl(op.expression(), device), + m_shuffle(op.shufflePermutation()) { const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); const Shuffle& shuffle = op.shufflePermutation(); @@ -162,9 +165,6 @@ struct TensorEvaluator, Device> for (int i = 0; i < NumDims; ++i) { m_inputStrides[i] = m_unshuffledInputStrides[shuffle[i]]; } - - m_block_total_size_max = - numext::maxi(1, device.firstLevelCacheSize() / sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -226,9 +226,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { + auto block_total_size_max = numext::maxi( + 1, m_device.firstLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( - internal::TensorBlockShapeType::kUniformAllDims, - m_block_total_size_max)); + internal::TensorBlockShapeType::kUniformAllDims, block_total_size_max)); m_impl.getResourceRequirements(resources); } @@ -384,7 +385,8 @@ struct TensorEvaluator, Device> array, NumDims> m_fastOutputStrides; array m_inputStrides; array m_unshuffledInputStrides; - Index m_block_total_size_max; + + const Device& m_device; TensorEvaluator m_impl; /// required by sycl Shuffle m_shuffle; -- cgit v1.2.3 From 1b0373ae10687ecc51ad9a0bfd46aa4ee116ade1 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Wed, 1 Aug 2018 15:55:46 -0700 Subject: Replace all using declarations with typedefs in Tensor ops --- unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h | 20 +++++------ .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 17 +++++---- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 16 +++++---- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 40 ++++++++++++---------- .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 7 ++-- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 22 +++++++----- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 14 ++++---- .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 16 +++++---- 8 files changed, 88 insertions(+), 64 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h index 33c4ef5b7..1db8d6124 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -152,11 +152,11 @@ struct TensorBlockCopyOp { const Scalar* src_base = &src_data[src_index]; Scalar* dst_base = &dst_data[dst_index]; - using Src = const Eigen::Array; - using Dst = Eigen::Array; + typedef const Eigen::Array Src; + typedef Eigen::Array Dst; - using SrcMap = Eigen::Map>; - using DstMap = Eigen::Map>; + typedef Eigen::Map> SrcMap; + typedef Eigen::Map> DstMap; const SrcMap 
src(src_base, num_coeff_to_copy, InnerStride<>(src_stride)); DstMap dst(dst_base, num_coeff_to_copy, InnerStride<>(dst_stride)); @@ -401,13 +401,13 @@ struct TensorBlockCwiseBinaryOp { const StorageIndex left_stride, const LeftScalar* left_data, const StorageIndex right_index, const StorageIndex right_stride, const RightScalar* right_data) { - using Lhs = const Eigen::Array; - using Rhs = const Eigen::Array; - using Out = Eigen::Array; + typedef const Eigen::Array Lhs; + typedef const Eigen::Array Rhs; + typedef Eigen::Array Out; - using LhsMap = Eigen::Map>; - using RhsMap = Eigen::Map>; - using OutMap = Eigen::Map>; + typedef Eigen::Map> LhsMap; + typedef Eigen::Map> RhsMap; + typedef Eigen::Map> OutMap; const LeftScalar* lhs_base = &left_data[left_index]; const RightScalar* rhs_base = &right_data[right_index]; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index cca14aafd..a4d750885 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -115,16 +115,21 @@ struct TensorEvaluator, Device> RawAccess = false }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; // Block based access to the XprType (input) tensor. - using TensorBlock = internal::TensorBlock; - using TensorBlockReader = internal::TensorBlockReader; + typedef internal::TensorBlock + TensorBlock; + typedef internal::TensorBlockReader + TensorBlockReader; + // We do block based broadcasting using a trick with 2x tensor rank and 0 // strides. See block method implementation for details. - using BroadcastDimensions = DSizes; - using BroadcastTensorBlock = internal::TensorBlock; - using BroadcastTensorBlockReader = internal::TensorBlockReader; + typedef DSizes BroadcastDimensions; + typedef internal::TensorBlock + BroadcastTensorBlock; + typedef internal::TensorBlockReader + BroadcastTensorBlockReader; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index aca2ead12..b4c4162ef 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -152,10 +152,12 @@ struct TensorEvaluator, Device> RawAccess = false }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using InputTensorBlock = internal::TensorBlock; - using OutputTensorBlock = internal::TensorBlock; + typedef internal::TensorBlock + InputTensorBlock; + typedef internal::TensorBlock + OutputTensorBlock; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device), m_offset(op.offset()) @@ -426,10 +428,12 @@ struct TensorEvaluator, Device> RawAccess = false }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using InputTensorBlock = internal::TensorBlock; - using OutputTensorBlock = internal::TensorBlock; + typedef internal::TensorBlock + InputTensorBlock; + typedef internal::TensorBlock + OutputTensorBlock; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) diff --git 
a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 17008917a..f11241f83 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -36,7 +36,7 @@ template class TensorExecutor { public: - using StorageIndex = typename Expression::Index; + typedef typename Expression::Index StorageIndex; EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(const Expression& expr, @@ -60,7 +60,7 @@ template class TensorExecutor { public: - using StorageIndex = typename Expression::Index; + typedef typename Expression::Index StorageIndex; EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(const Expression& expr, @@ -102,21 +102,22 @@ template class TensorExecutor { public: - using Scalar = typename traits::Scalar; - using ScalarNoConst = typename remove_const::type; + typedef typename traits::Scalar Scalar; + typedef typename remove_const::type ScalarNoConst; - using Evaluator = TensorEvaluator; - using StorageIndex = typename traits::Index; + typedef TensorEvaluator Evaluator; + typedef typename traits::Index StorageIndex; static const int NumDims = traits::NumDimensions; EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) { - using TensorBlock = - TensorBlock; - using TensorBlockMapper = TensorBlockMapper; + typedef TensorBlock + TensorBlock; + typedef TensorBlockMapper + TensorBlockMapper; Evaluator evaluator(expr, device); Index total_size = array_prod(evaluator.dimensions()); @@ -221,7 +222,7 @@ struct EvalRange { template class TensorExecutor { public: - using StorageIndex = typename Expression::Index; + typedef typename Expression::Index StorageIndex; static EIGEN_STRONG_INLINE void run(const Expression& expr, const ThreadPoolDevice& device) { @@ -249,20 +250,21 @@ class TensorExecutor { template class TensorExecutor { public: - using Scalar = typename traits::Scalar; - using ScalarNoConst = typename remove_const::type; + typedef typename traits::Scalar Scalar; + typedef typename remove_const::type ScalarNoConst; - using Evaluator = TensorEvaluator; - using StorageIndex = typename traits::Index; + typedef TensorEvaluator Evaluator; + typedef typename traits::Index StorageIndex; static const int NumDims = traits::NumDimensions; static EIGEN_STRONG_INLINE void run(const Expression& expr, const ThreadPoolDevice& device) { - using TensorBlock = - TensorBlock; - using TensorBlockMapper = - TensorBlockMapper; + typedef TensorBlock + TensorBlock; + typedef TensorBlockMapper + TensorBlockMapper; Evaluator evaluator(expr, device); StorageIndex total_size = array_prod(evaluator.dimensions()); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index a8247be90..2b2f4a650 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -252,7 +252,8 @@ struct TensorEvaluator, Device> RawAccess = false }; - using OutputTensorBlock = internal::TensorBlock; + typedef internal::TensorBlock + OutputTensorBlock; #ifdef __SYCL_DEVICE_ONLY__ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType op, const Device& device) @@ -557,8 +558,8 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( OutputTensorBlock* output_block) const { - using ImagePatchCopyOp = internal::ImagePatchCopyOp; - using ImagePatchPaddingOp = 
internal::ImagePatchPaddingOp; + typedef internal::ImagePatchCopyOp ImagePatchCopyOp; + typedef internal::ImagePatchPaddingOp ImagePatchPaddingOp; // Calculate loop limits and various input/output dim sizes. const DSizes& block_sizes = output_block->block_sizes(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 6ddded0bd..d5b0c1237 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -123,11 +123,15 @@ struct TensorEvaluator, Device> RawAccess = TensorEvaluator::RawAccess }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using InputTensorBlock = internal::TensorBlock; - using OutputTensorBlock = internal::TensorBlock; - using OutputTensorBlockReader = internal::TensorBlockReader; + typedef internal::TensorBlock + InputTensorBlock; + typedef internal::TensorBlock + OutputTensorBlock; + typedef internal::TensorBlockReader + OutputTensorBlockReader; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dimensions(op.dimensions()) @@ -512,9 +516,10 @@ struct TensorEvaluator, Devi RawAccess = false }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using TensorBlock = internal::TensorBlock; + typedef internal::TensorBlock + TensorBlock; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) @@ -787,9 +792,10 @@ struct TensorEvaluator, Device> RawAccess = (NumDims == 1) & TensorEvaluator::RawAccess }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using TensorBlock = internal::TensorBlock; + typedef internal::TensorBlock + TensorBlock; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 73675e7dd..80f179ba4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -483,10 +483,12 @@ struct TensorEvaluator, RawAccess = false }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using OutputTensorBlock = internal::TensorBlock; - using InputTensorBlock = internal::TensorBlock; + typedef internal::TensorBlock + OutputTensorBlock; + typedef internal::TensorBlock + InputTensorBlock; static const bool ReducingInnerMostDims = internal::are_inner_most_dims::value; static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims::value; @@ -901,9 +903,9 @@ struct TensorEvaluator, new (&reducers[i]) BlockReducer(m_reducer); } - using TensorSliceBlockMapper = - internal::TensorSliceBlockMapper; + typedef internal::TensorSliceBlockMapper + TensorSliceBlockMapper; // TODO(andydavis) Consider removing 'input_block_stride_order' if we // find that scattered reads are not worth supporting in diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index f94c1380d..fbe69aabc 100644 --- 
a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -119,10 +119,12 @@ struct TensorEvaluator, Device> RawAccess = false }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using TensorBlock = internal::TensorBlock; - using TensorBlockReader = internal::TensorBlockReader; + typedef internal::TensorBlock + TensorBlock; + typedef internal::TensorBlockReader + TensorBlockReader; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -417,10 +419,12 @@ struct TensorEvaluator, Device> RawAccess = false }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using TensorBlock = internal::TensorBlock; - using TensorBlockWriter = internal::TensorBlockWriter; + typedef internal::TensorBlock + TensorBlock; + typedef internal::TensorBlockWriter + TensorBlockWriter; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) -- cgit v1.2.3 From f2209d06e428e0691de71f30fc2db4cb29191cd2 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Fri, 10 Aug 2018 16:53:36 -0700 Subject: Add block evaluationto CwiseUnaryOp and add PreferBlockAccess enum to all evaluators --- unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h | 2 + .../Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 18 +-- unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h | 141 +++++++++++++++++++++ .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 11 +- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 13 +- .../Eigen/CXX11/src/Tensor/TensorConcatenation.h | 2 + .../Eigen/CXX11/src/Tensor/TensorContraction.h | 1 + .../Eigen/CXX11/src/Tensor/TensorConversion.h | 1 + .../Eigen/CXX11/src/Tensor/TensorConvolution.h | 2 + .../Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h | 1 + .../Eigen/CXX11/src/Tensor/TensorCustomOp.h | 2 + unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 1 + .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 76 ++++++++--- unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 1 + .../Eigen/CXX11/src/Tensor/TensorFixedSize.h | 1 + .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 1 + .../CXX11/src/Tensor/TensorForwardDeclarations.h | 7 +- .../Eigen/CXX11/src/Tensor/TensorGenerator.h | 1 + .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 13 +- .../Eigen/CXX11/src/Tensor/TensorInflation.h | 1 + .../Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 2 + .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 46 ++++--- unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 1 + .../Eigen/CXX11/src/Tensor/TensorReduction.h | 13 +- unsupported/Eigen/CXX11/src/Tensor/TensorRef.h | 3 + unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h | 2 + unsupported/Eigen/CXX11/src/Tensor/TensorScan.h | 1 + .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 24 ++-- .../Eigen/CXX11/src/Tensor/TensorStriding.h | 2 + unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h | 1 + .../Eigen/CXX11/src/Tensor/TensorVolumePatch.h | 1 + unsupported/test/cxx11_tensor_block_access.cpp | 111 ++++++++++++++++ unsupported/test/cxx11_tensor_executor.cpp | 31 +++++ 35 files changed, 454 insertions(+), 82 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h 
index c0f33ba2d..ea3ab329d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -87,6 +87,7 @@ struct TensorEvaluator, Device> IsAligned = /*TensorEvaluator::IsAligned*/ false, PacketAccess = /*TensorEvaluator::PacketAccess*/ false, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -220,6 +221,7 @@ struct TensorEvaluator, Devi IsAligned = /*TensorEvaluator::IsAligned*/ false, PacketAccess = /*TensorEvaluator::PacketAccess*/ false, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator >, Device>::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h index 442639868..5110e99ee 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h @@ -109,6 +109,7 @@ struct TensorEvaluator, Sy IsAligned = false, PacketAccess = false, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index f1f877c16..9ec1ec726 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -102,14 +102,16 @@ struct TensorEvaluator, Device> static const int NumDims = XprType::NumDims; enum { - IsAligned = TensorEvaluator::IsAligned & - TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & - TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess & - TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - RawAccess = TensorEvaluator::RawAccess + IsAligned = TensorEvaluator::IsAligned & + TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & + TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess & + TensorEvaluator::BlockAccess, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess | + TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + RawAccess = TensorEvaluator::RawAccess }; typedef typename internal::TensorBlock< diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h index 877603421..4a3e1ac17 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -381,6 +381,147 @@ class TensorBlockWriter : public TensorBlockIO + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const UnaryFunctor& functor, const StorageIndex num_coeff, + const StorageIndex output_index, const StorageIndex output_stride, + OutputScalar* output_data, const StorageIndex input_index, + const StorageIndex input_stride, const InputScalar* input_data) { + typedef const Eigen::Array Input; + typedef Eigen::Array Output; + + typedef Eigen::Map> InputMap; + typedef Eigen::Map> OutputMap; + + const InputScalar* input_base = &input_data[input_index]; + OutputScalar* output_base = &output_data[output_index]; + + const InputMap input(input_base, num_coeff, InnerStride<>(input_stride)); + OutputMap output(output_base, num_coeff, InnerStride<>(output_stride)); + + output = Eigen::CwiseUnaryOp(input, functor); + } +}; + +/** + * \class TensorBlockCwiseUnaryIO + * \ingroup CXX11_Tensor_Module 
+ * + * \brief Tensor block IO class for carrying out cwise unary ops. + * + * This class carries out the unary op on given blocks. + */ +template +struct TensorBlockCwiseUnaryIO { + typedef typename internal::TensorBlock::Dimensions Dimensions; + + struct BlockIteratorState { + StorageIndex output_stride, output_span; + StorageIndex input_stride, input_span; + StorageIndex size, count; + }; + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const UnaryFunctor& functor, const Dimensions& block_sizes, + const Dimensions& block_strides, OutputScalar* output_data, + const array& input_strides, + const InputScalar* input_data) { + // Find the innermost dimension whose size is not 1. This is the effective + // inner dim. If all dimensions are of size 1, fallback to using the actual + // innermost dim to avoid out-of-bound access. + int num_size_one_inner_dims = 0; + for (int i = 0; i < NumDims; ++i) { + const int dim = cond()(i, NumDims - i - 1); + if (block_sizes[dim] != 1) { + num_size_one_inner_dims = i; + break; + } + } + // Calculate strides and dimensions. + const int inner_dim = + NumDims == 0 ? 1 + : cond()(num_size_one_inner_dims, + NumDims - num_size_one_inner_dims - 1); + StorageIndex inner_dim_size = NumDims == 0 ? 1 : block_sizes[inner_dim]; + for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) { + const int dim = cond()(i, NumDims - i - 1); + // Merge multiple inner dims into one for larger inner dim size (i.e. + // fewer calls to TensorBlockCwiseUnaryOp::Run()). + if (inner_dim_size == block_strides[dim] && + block_strides[dim] == input_strides[dim]) { + inner_dim_size *= block_sizes[dim]; + ++num_size_one_inner_dims; + } else { + break; + } + } + + StorageIndex output_index = 0, input_index = 0; + + const StorageIndex output_stride = + NumDims == 0 ? 1 : block_strides[inner_dim]; + const StorageIndex input_stride = + NumDims == 0 ? 1 : input_strides[inner_dim]; + + const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1; + array block_iter_state; + + // Initialize block iterator state. Squeeze away any dimension of size 1. + int num_squeezed_dims = 0; + for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) { + const int dim = cond()(i + 1, NumDims - i - 2); + const StorageIndex size = block_sizes[dim]; + if (size == 1) { + continue; + } + BlockIteratorState& state = block_iter_state[num_squeezed_dims]; + state.output_stride = block_strides[dim]; + state.input_stride = input_strides[dim]; + state.size = size; + state.output_span = state.output_stride * (size - 1); + state.input_span = state.input_stride * (size - 1); + state.count = 0; + ++num_squeezed_dims; + } + + // Compute cwise unary op. + const StorageIndex block_total_size = + NumDims == 0 ? 1 : block_sizes.TotalSize(); + for (StorageIndex i = 0; i < block_total_size; i += inner_dim_size) { + TensorBlockCwiseUnaryOp::Run(functor, inner_dim_size, output_index, + output_stride, output_data, input_index, + input_stride, input_data); + // Update index. 
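      // Odometer-style update over the squeezed dimensions: the first
      // dimension whose count has not yet wrapped advances both indices by
      // one stride; every dimension before it is rolled back to its start by
      // subtracting the span it just covered.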
+ for (int j = 0; j < num_squeezed_dims; ++j) { + auto& state = block_iter_state[j]; + if (++state.count < state.size) { + output_index += state.output_stride; + input_index += state.input_stride; + break; + } + state.count = 0; + output_index -= state.output_span; + input_index -= state.input_span; + } + } + } +}; + /** * \class TensorBlockCwiseBinaryOp * \ingroup CXX11_Tensor_Module diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index a4d750885..5e812b04d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -108,11 +108,12 @@ struct TensorEvaluator, Device> bool isCopy= false, nByOne = false, oneByN = false; enum { - IsAligned = true, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - RawAccess = false + IsAligned = true, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + RawAccess = false }; typedef typename internal::remove_const::type ScalarNoConst; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index b4c4162ef..76fab39e2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -144,12 +144,13 @@ struct TensorEvaluator, Device> enum { // Alignment can't be guaranteed at compile time since it depends on the // slice offsets. - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; typedef typename internal::remove_const::type ScalarNoConst; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 27c92d8f6..3863ee8c3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -123,6 +123,7 @@ struct TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = false }; @@ -308,6 +309,7 @@ template::PacketAccess & TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = false }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index e604456e8..c459fc649 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -241,6 +241,7 @@ struct TensorContractionEvaluatorBase IsAligned = true, PacketAccess = (PacketType::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = true diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index a7751eee1..1f613d3c7 100644 --- 
a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -196,6 +196,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = true, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = false }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index a07e32db0..2d0e6599f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -308,6 +308,7 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -780,6 +781,7 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = false, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h index d301d0c01..e79958fc9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -243,6 +243,7 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = false, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h index 47b5a5a5e..d71b2e34b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h @@ -93,6 +93,7 @@ struct TensorEvaluator, Devi IsAligned = false, PacketAccess = (PacketType::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -255,6 +256,7 @@ struct TensorEvaluator::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index 256d499f2..554ee5f59 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -108,6 +108,7 @@ struct TensorEvaluator, Device> IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = true diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 028902fea..e30f9ad8e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -43,6 +43,7 @@ struct TensorEvaluator IsAligned = Derived::IsAligned, PacketAccess = (PacketType::size > 1), BlockAccess = internal::is_arithmetic::type>::value, + PreferBlockAccess = false, Layout = Derived::Layout, CoordAccess = NumCoords > 0, RawAccess = true @@ -195,6 +196,7 @@ struct TensorEvaluator IsAligned = 
Derived::IsAligned, PacketAccess = (PacketType::size > 1), BlockAccess = internal::is_arithmetic::type>::value, + PreferBlockAccess = false, Layout = Derived::Layout, CoordAccess = NumCoords > 0, RawAccess = true @@ -288,6 +290,7 @@ struct TensorEvaluator, Device> IsAligned = true, PacketAccess = internal::functor_traits::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -351,27 +354,34 @@ struct TensorEvaluator, Device> typedef TensorCwiseUnaryOp XprType; enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & - internal::functor_traits::PacketAccess, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & + internal::functor_traits::PacketAccess, + BlockAccess = TensorEvaluator::PacketAccess, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - : m_functor(op.functor()), + : m_device(device), + m_functor(op.functor()), m_argImpl(op.nestedExpression(), device) { } typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const::type ScalarNoConst; typedef typename internal::traits::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; static const int PacketSize = PacketType::size; typedef typename TensorEvaluator::Dimensions Dimensions; + static const int NumDims = internal::array_size::value; + typedef internal::TensorBlock + TensorBlock; + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { @@ -399,6 +409,29 @@ struct TensorEvaluator, Device> TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + m_argImpl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + TensorBlock* output_block) const { + if (NumDims <= 0) { + output_block->data()[0] = coeff(0); + return; + } + internal::TensorBlockView arg_block(m_device, m_argImpl, + *output_block); + internal::TensorBlockCwiseUnaryIO::Run(m_functor, + output_block->block_sizes(), + output_block + ->block_strides(), + output_block->data(), + arg_block.block_strides(), + arg_block.data()); + } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } /// required by sycl in order to extract the accessor @@ -408,6 +441,7 @@ struct TensorEvaluator, Device> private: + const Device& m_device; const UnaryOp m_functor; TensorEvaluator m_argImpl; }; @@ -421,16 +455,18 @@ struct TensorEvaluator XprType; enum { - IsAligned = TensorEvaluator::IsAligned & - TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & - TensorEvaluator::PacketAccess & - internal::functor_traits::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess & - TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = TensorEvaluator::IsAligned & + TensorEvaluator::IsAligned, + PacketAccess = 
TensorEvaluator::PacketAccess & + TensorEvaluator::PacketAccess & + internal::functor_traits::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess & + TensorEvaluator::BlockAccess, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess | + TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) @@ -501,7 +537,7 @@ struct TensorEvaluatordata()[0] = coeff(0); + output_block->data()[0] = coeff(Index(0)); return; } internal::TensorBlockView left_block( @@ -543,6 +579,7 @@ struct TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -648,6 +685,7 @@ struct TensorEvaluator PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & PacketType::HasBlend, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index d6ab4d997..480cf1f39 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -136,6 +136,7 @@ struct TensorEvaluator, D IsAligned = false, PacketAccess = true, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index 1342e47a6..71ba56773 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -42,6 +42,7 @@ class TensorFixedSize : public TensorBase0), PacketAccess = (internal::packet_traits::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = Options_ & RowMajor ? RowMajor : ColMajor, CoordAccess = true, RawAccess = true diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index 2778bf5ec..edf9f85d8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -99,6 +99,7 @@ struct TensorEvaluator, Device> IsAligned = true, PacketAccess = (PacketType::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = true }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 0dd524a30..b8a57ab99 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -132,7 +132,12 @@ struct IsVectorizable { template struct IsTileable { - static const bool value = TensorEvaluator::BlockAccess; + // Check that block evaluation is supported and it's a preferred option (at + // least one sub-expression has much faster block evaluation, e.g. + // broadcasting). 
+ static const bool value = + TensorEvaluator::BlockAccess & + TensorEvaluator::PreferBlockAccess; }; template , Device> IsAligned = false, PacketAccess = (PacketType::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 2b2f4a650..1826d7022 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -244,12 +244,13 @@ struct TensorEvaluator, Device> static const int PacketSize = PacketType::size; enum { - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = true, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = false + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = true, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false }; typedef internal::TensorBlock diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index 64f2ad81f..e28565009 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -91,6 +91,7 @@ struct TensorEvaluator, Device> IsAligned = /*TensorEvaluator::IsAligned*/ false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index e3165fa10..998757d14 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -120,6 +120,7 @@ struct TensorEvaluator, Device> IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, CoordAccess = false, // to be implemented RawAccess = TensorEvaluator::RawAccess @@ -183,6 +184,7 @@ template IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, CoordAccess = false // to be implemented }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index b25c1eabc..6f9294ccf 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -111,16 +111,17 @@ struct TensorEvaluator, Device> static const int NumInputDims = internal::array_size::Dimensions>::value; enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess, + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, // TODO(andydavis, wuke) Enable BlockAccess for the general case when the // performance issue with block-based reshape is resolved. 
- BlockAccess = TensorEvaluator::BlockAccess && - TensorEvaluator::RawAccess && - NumInputDims > 0 && NumOutputDims > 0, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = TensorEvaluator::RawAccess + BlockAccess = TensorEvaluator::BlockAccess && + TensorEvaluator::RawAccess && + NumInputDims > 0 && NumOutputDims > 0, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess }; typedef typename internal::remove_const::type ScalarNoConst; @@ -349,6 +350,7 @@ template IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = TensorEvaluator::RawAccess @@ -508,12 +510,13 @@ struct TensorEvaluator, Devi enum { // Alignment can't be guaranteed at compile time since it depends on the // slice offsets and sizes. - IsAligned = /*TensorEvaluator::IsAligned*/false, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = false + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false }; typedef typename internal::remove_const::type ScalarNoConst; @@ -785,12 +788,13 @@ struct TensorEvaluator, Device> typedef Sizes Dimensions; enum { - IsAligned = /*TensorEvaluator::IsAligned*/false, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = (NumDims == 1) & TensorEvaluator::RawAccess + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = (NumDims == 1) & TensorEvaluator::RawAccess }; typedef typename internal::remove_const::type ScalarNoConst; @@ -972,6 +976,7 @@ struct TensorEvaluator::Layout, RawAccess = false }; @@ -1148,6 +1153,7 @@ struct TensorEvaluator::Layout, CoordAccess = TensorEvaluator::CoordAccess, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index aa1db3c73..59c1704ed 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -97,6 +97,7 @@ struct TensorEvaluator, Device IsAligned = true, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = true, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index a0a1ad8f4..4292fe0c2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -95,6 +95,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 80f179ba4..3d534eaa2 100644 --- 
a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -475,12 +475,13 @@ struct TensorEvaluator, static const int PacketSize = PacketType::size; enum { - IsAligned = false, - PacketAccess = Self::InputPacketAccess && Op::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = false, + PacketAccess = Self::InputPacketAccess && Op::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; typedef typename internal::remove_const::type ScalarNoConst; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h index a6cade50f..6e15e75f9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h @@ -137,6 +137,7 @@ template class TensorRef : public TensorBase, Device> IsAligned = false, PacketAccess = false, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorRef::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -414,6 +416,7 @@ struct TensorEvaluator, Device> : public TensorEvaluator, Device IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -255,6 +256,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h index b1135f297..641366d9d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h @@ -97,6 +97,7 @@ struct TensorEvaluator, Device> { IsAligned = false, PacketAccess = (PacketType::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = true diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 98f125408..a5b541a68 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -111,12 +111,13 @@ struct TensorEvaluator, Device> static const int PacketSize = PacketType::size; enum { - IsAligned = false, - PacketAccess = (PacketType::size > 1), - BlockAccess = TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = false, + PacketAccess = (PacketType::size > 1), + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; typedef typename internal::remove_const::type ScalarNoConst; @@ -412,11 +413,12 @@ struct TensorEvaluator, Device> static const int PacketSize = PacketType::size; enum { - IsAligned = false, - PacketAccess = (PacketType::size > 1), - BlockAccess = TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - RawAccess = false + IsAligned = false, + PacketAccess = (PacketType::size > 1), + BlockAccess = 
TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + RawAccess = false }; typedef typename internal::remove_const::type ScalarNoConst; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 4b69072f2..221dc96c9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -113,6 +113,7 @@ struct TensorEvaluator, Device> IsAligned = /*TensorEvaluator::IsAligned*/false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -275,6 +276,7 @@ struct TensorEvaluator, Device> IsAligned = /*TensorEvaluator::IsAligned*/false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h index ea53bb04b..9fc54a4c0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h @@ -96,6 +96,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index 3c7d8bbc0..c1b7a58ca 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -200,6 +200,7 @@ struct TensorEvaluator, D IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false diff --git a/unsupported/test/cxx11_tensor_block_access.cpp b/unsupported/test/cxx11_tensor_block_access.cpp index 6feeff231..746f62511 100644 --- a/unsupported/test/cxx11_tensor_block_access.cpp +++ b/unsupported/test/cxx11_tensor_block_access.cpp @@ -517,6 +517,114 @@ static void test_block_io_squeeze_ones() { } } +template +static void test_block_cwise_unary_io_basic() { + typedef internal::scalar_square_op UnaryFunctor; + typedef internal::TensorBlockCwiseUnaryIO + TensorBlockCwiseUnaryIO; + + DSizes block_sizes = RandomDims(); + DSizes strides(ComputeStrides(block_sizes)); + + const auto total_size = block_sizes.TotalSize(); + + // Create a random input tensors. + T* input_data = GenerateRandomData(total_size); + + T* output_data = new T[total_size]; + UnaryFunctor functor; + TensorBlockCwiseUnaryIO::Run(functor, block_sizes, strides, output_data, + strides, input_data); + for (int i = 0; i < total_size; ++i) { + VERIFY_IS_EQUAL(output_data[i], functor(input_data[i])); + } + + delete[] input_data; + delete[] output_data; +} + +template +static void test_block_cwise_unary_io_squeeze_ones() { + typedef internal::scalar_square_op UnaryFunctor; + typedef internal::TensorBlockCwiseUnaryIO + TensorBlockCwiseUnaryIO; + + DSizes block_sizes(1, 2, 1, 3, 1); + DSizes strides(ComputeStrides(block_sizes)); + + const auto total_size = block_sizes.TotalSize(); + + // Create a random input tensors. 
+ auto* input_data = GenerateRandomData(total_size); + + auto* output_data = new float[total_size]; + UnaryFunctor functor; + TensorBlockCwiseUnaryIO::Run(functor, block_sizes, strides, output_data, + strides, input_data); + for (int i = 0; i < total_size; ++i) { + VERIFY_IS_EQUAL(output_data[i], functor(input_data[i])); + } + + delete[] input_data; + delete[] output_data; +} + +template +static void test_block_cwise_unary_io_zero_strides() { + typedef internal::scalar_square_op UnaryFunctor; + typedef internal::TensorBlockCwiseUnaryIO + TensorBlockCwiseUnaryIO; + + DSizes rnd_dims = RandomDims<5>(); + + DSizes input_sizes = rnd_dims; + input_sizes[0] = 1; + input_sizes[2] = 1; + input_sizes[4] = 1; + + DSizes input_strides(ComputeStrides(input_sizes)); + input_strides[0] = 0; + input_strides[2] = 0; + input_strides[4] = 0; + + // Generate random data. + auto* input_data = GenerateRandomData(input_sizes.TotalSize()); + + DSizes output_sizes = rnd_dims; + DSizes output_strides(ComputeStrides(output_sizes)); + + const auto output_total_size = output_sizes.TotalSize(); + auto* output_data = new float[output_total_size]; + + UnaryFunctor functor; + TensorBlockCwiseUnaryIO::Run(functor, output_sizes, output_strides, + output_data, input_strides, input_data); + for (int i = 0; i < rnd_dims[0]; ++i) { + for (int j = 0; j < rnd_dims[1]; ++j) { + for (int k = 0; k < rnd_dims[2]; ++k) { + for (int l = 0; l < rnd_dims[3]; ++l) { + for (int m = 0; m < rnd_dims[4]; ++m) { + Index output_index = i * output_strides[0] + j * output_strides[1] + + k * output_strides[2] + l * output_strides[3] + + m * output_strides[4]; + Index input_index = i * input_strides[0] + j * input_strides[1] + + k * input_strides[2] + l * input_strides[3] + + m * input_strides[4]; + VERIFY_IS_EQUAL(output_data[output_index], + functor(input_data[input_index])); + } + } + } + } + } + + delete[] input_data; + delete[] output_data; +} + template static void test_block_cwise_binary_io_basic() { typedef internal::scalar_sum_op BinaryFunctor; @@ -982,6 +1090,9 @@ EIGEN_DECLARE_TEST(cxx11_tensor_block_access) { TEST_LAYOUTS_AND_DIMS(Data, test_block_io_copy_using_reordered_dimensions); TEST_LAYOUTS(test_block_io_zero_stride); TEST_LAYOUTS(test_block_io_squeeze_ones); + TEST_LAYOUTS_AND_DIMS(float, test_block_cwise_unary_io_basic); + TEST_LAYOUTS(test_block_cwise_unary_io_squeeze_ones); + TEST_LAYOUTS(test_block_cwise_unary_io_zero_strides); TEST_LAYOUTS_AND_DIMS(float, test_block_cwise_binary_io_basic); TEST_LAYOUTS(test_block_cwise_binary_io_squeeze_ones); TEST_LAYOUTS(test_block_cwise_binary_io_zero_strides); diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp index d16ae4d11..1dc18220c 100644 --- a/unsupported/test/cxx11_tensor_executor.cpp +++ b/unsupported/test/cxx11_tensor_executor.cpp @@ -29,6 +29,33 @@ static array RandomDims(int min_dim = 1, int max_dim = 20) { return dims; }; +template +static void test_execute_unary_expr(Device d) { + static constexpr int Options = 0 | Layout; + + // Pick a large enough tensor size to bypass small tensor block evaluation + // optimization. 
+ auto dims = RandomDims(50 / NumDims, 100 / NumDims); + + Tensor src(dims); + Tensor dst(dims); + + src.setRandom(); + const auto expr = src.square(); + + using Assign = TensorAssignOp; + using Executor = + internal::TensorExecutor; + + Executor::run(Assign(dst, expr), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + T square = src.coeff(i) * src.coeff(i); + VERIFY_IS_EQUAL(square, dst.coeff(i)); + } +} + template static void test_execute_binary_expr(Device d) @@ -445,6 +472,10 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) { Eigen::ThreadPool tp(num_threads); Eigen::ThreadPoolDevice tp_device(&tp, num_threads); + CALL_SUBTEST_COMBINATIONS(test_execute_unary_expr, float, 3); + CALL_SUBTEST_COMBINATIONS(test_execute_unary_expr, float, 4); + CALL_SUBTEST_COMBINATIONS(test_execute_unary_expr, float, 5); + CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr, float, 3); CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr, float, 4); CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr, float, 5); -- cgit v1.2.3 From c64fe9ea1f9f5943864cd9ca27d3fcca07453f82 Mon Sep 17 00:00:00 2001 From: Deven Desai Date: Thu, 30 Aug 2018 20:22:16 +0000 Subject: Updates to fix HIP-clang specific compile errors. Compiling the eigen unittests with hip-clang (HIP with clang as the underlying compiler instead of hcc or nvcc), results in compile errors. The changes in this commit fix those compile errors. The main change is to convert a few instances of "__device__" to "EIGEN_DEVICE_FUNC" --- Eigen/src/Core/arch/GPU/PacketMathHalf.h | 74 +++++++++++----------- test/gpu_common.h | 6 +- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 2 +- 3 files changed, 41 insertions(+), 41 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h') diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index b0a72e1f9..c4feda87d 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -43,7 +43,7 @@ template<> struct packet_traits : default_packet_traits template<> struct unpacket_traits { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; }; -template<> __device__ EIGEN_STRONG_INLINE half2 pset1(const Eigen::half& from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1(const Eigen::half& from) { #if defined(EIGEN_HIP_DEVICE_COMPILE) @@ -58,29 +58,29 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pset1(const Eigen::half& #endif } -template<> __device__ EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) { return *reinterpret_cast(from); } -template<> __device__ EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) { return __halves2half2(from[0], from[1]); } -template<> __device__ EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) { return __halves2half2(from[0], from[0]); } -template<> __device__ EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const half2& from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const half2& from) { *reinterpret_cast(to) = from; } -template<> __device__ EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2& from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2& 
from) { to[0] = __low2half(from); to[1] = __high2half(from); } template<> - __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { #if defined(EIGEN_HIP_DEVICE_COMPILE) @@ -102,7 +102,7 @@ template<> } template<> -__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { #if defined(EIGEN_HIP_DEVICE_COMPILE) @@ -123,20 +123,20 @@ __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::ha #endif } -template<> __device__ EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, Index stride) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, Index stride) { return __halves2half2(from[0*stride], from[1*stride]); } -template<> __device__ EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const half2& from, Index stride) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const half2& from, Index stride) { to[stride*0] = __low2half(from); to[stride*1] = __high2half(from); } -template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) { return __low2half(a); } -template<> __device__ EIGEN_STRONG_INLINE half2 pabs(const half2& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) { half2 result; unsigned temp = *(reinterpret_cast(&(a))); *(reinterpret_cast(&(result))) = temp & 0x7FFF7FFF; @@ -144,7 +144,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pabs(const half2& a) { } -__device__ EIGEN_STRONG_INLINE void +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { __half a1 = __low2half(kernel.packet[0]); __half a2 = __high2half(kernel.packet[0]); @@ -154,7 +154,7 @@ ptranspose(PacketBlock& kernel) { kernel.packet[1] = __halves2half2(a2, b2); } -template<> __device__ EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { #if defined(EIGEN_HIP_DEVICE_COMPILE) return __halves2half2(a, __hadd(a, __float2half(1.0f))); @@ -171,7 +171,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plset(const Eigen::half& #endif } -template<> __device__ EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { #if defined(EIGEN_HIP_DEVICE_COMPILE) return __hadd2(a, b); @@ -193,7 +193,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 padd(const half2& a, cons #endif } -template<> __device__ EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) { #if defined(EIGEN_HIP_DEVICE_COMPILE) return __hsub2(a, b); @@ -215,7 +215,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psub(const half2& a, cons #endif } -template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { #if defined(EIGEN_HIP_DEVICE_COMPILE) return __hneg2(a); @@ -233,9 +233,9 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { #endif } -template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } 
-template<> __device__ EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) { #if defined(EIGEN_HIP_DEVICE_COMPILE) return __hmul2(a, b); @@ -257,7 +257,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmul(const half2& a, cons #endif } -template<> __device__ EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) { #if defined(EIGEN_HIP_DEVICE_COMPILE) return __hfma2(a, b, c); @@ -281,7 +281,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmadd(const half2& a, con #endif } -template<> __device__ EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) { #if defined(EIGEN_HIP_DEVICE_COMPILE) #if defined(EIGEN_HAS_OLD_HIP_FP16) @@ -303,7 +303,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pdiv(const half2& a, cons #endif } -template<> __device__ EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) { float a1 = __low2float(a); float a2 = __high2float(a); float b1 = __low2float(b); @@ -313,7 +313,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmin(const half2& a, cons return __halves2half2(r1, r2); } -template<> __device__ EIGEN_STRONG_INLINE half2 pmax(const half2& a, const half2& b) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, const half2& b) { float a1 = __low2float(a); float a2 = __high2float(a); float b1 = __low2float(b); @@ -323,7 +323,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmax(const half2& a, cons return __halves2half2(r1, r2); } -template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) { #if defined(EIGEN_HIP_DEVICE_COMPILE) return __hadd(__low2half(a), __high2half(a)); @@ -341,7 +341,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux(const half2& #endif } -template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) { #if defined(EIGEN_HIP_DEVICE_COMPILE) __half first = __low2half(a); @@ -363,7 +363,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max(const ha #endif } -template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) { #if defined(EIGEN_HIP_DEVICE_COMPILE) __half first = __low2half(a); @@ -385,7 +385,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min(const ha #endif } -template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) { #if defined(EIGEN_HIP_DEVICE_COMPILE) return __hmul(__low2half(a), __high2half(a)); @@ -403,7 +403,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul(const ha #endif } -template<> __device__ EIGEN_STRONG_INLINE half2 plog1p(const half2& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p(const half2& a) { float a1 = __low2float(a); float a2 = __high2float(a); float r1 = log1pf(a1); @@ -411,7 +411,7 @@ template<> 
__device__ EIGEN_STRONG_INLINE half2 plog1p(const half2& a) { return __floats2half2_rn(r1, r2); } -template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) { float a1 = __low2float(a); float a2 = __high2float(a); float r1 = expm1f(a1); @@ -422,29 +422,29 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) { #if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \ defined(EIGEN_HIP_DEVICE_COMPILE) -template<> __device__ EIGEN_STRONG_INLINE +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) { return h2log(a); } -template<> __device__ EIGEN_STRONG_INLINE +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { return h2exp(a); } -template<> __device__ EIGEN_STRONG_INLINE +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { return h2sqrt(a); } -template<> __device__ EIGEN_STRONG_INLINE +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { return h2rsqrt(a); } #else -template<> __device__ EIGEN_STRONG_INLINE half2 plog(const half2& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) { float a1 = __low2float(a); float a2 = __high2float(a); float r1 = logf(a1); @@ -452,7 +452,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plog(const half2& a) { return __floats2half2_rn(r1, r2); } -template<> __device__ EIGEN_STRONG_INLINE half2 pexp(const half2& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { float a1 = __low2float(a); float a2 = __high2float(a); float r1 = expf(a1); @@ -460,7 +460,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexp(const half2& a) { return __floats2half2_rn(r1, r2); } -template<> __device__ EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { float a1 = __low2float(a); float a2 = __high2float(a); float r1 = sqrtf(a1); @@ -468,7 +468,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { return __floats2half2_rn(r1, r2); } -template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { float a1 = __low2float(a); float a2 = __high2float(a); float r1 = rsqrtf(a1); diff --git a/test/gpu_common.h b/test/gpu_common.h index 3aac49e96..79d4ea694 100644 --- a/test/gpu_common.h +++ b/test/gpu_common.h @@ -61,9 +61,9 @@ void run_on_gpu(const Kernel& ker, int n, const Input& in, Output& out) gpuDeviceSynchronize(); #ifdef EIGEN_USE_HIP - hipLaunchKernelGGL(run_on_gpu_meta_kernel::type, - typename std::decay::type>, + hipLaunchKernelGGL(HIP_KERNEL_NAME(run_on_gpu_meta_kernel::type, + typename std::decay::type>), dim3(Grids), dim3(Blocks), 0, 0, ker, n, d_in, d_out); #else run_on_gpu_meta_kernel<<>>(ker, n, d_in, d_out); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 01d3863da..f3f1640b0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -14,7 +14,7 @@ // clang is incompatible with the CUDA syntax wrt making a kernel a class friend, // so we'll use a macro to make clang happy. 
#ifndef KERNEL_FRIEND -#if defined(__clang__) && defined(__CUDA__) +#if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__)) #define KERNEL_FRIEND friend __global__ #else #define KERNEL_FRIEND friend -- cgit v1.2.3 From 81b38a155adf5d527bce5c84cf90cd83c28da445 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Tue, 11 Sep 2018 13:32:32 -0700 Subject: Fix compilation of tiled evaluation code with c++03 --- unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h | 6 +++--- unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 11 +++++------ unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 5 ++--- unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h | 5 ++--- unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 5 ++--- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 5 ++--- unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h | 4 ++-- unsupported/test/cxx11_tensor_block_access.cpp | 14 +++++++------- unsupported/test/cxx11_tensor_shuffling.cpp | 10 +++++----- 9 files changed, 30 insertions(+), 35 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h index 6d90af2d3..13da36257 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -396,8 +396,8 @@ struct TensorBlockCwiseUnaryOp { typedef const Eigen::Array Input; typedef Eigen::Array Output; - typedef Eigen::Map> InputMap; - typedef Eigen::Map> OutputMap; + typedef Eigen::Map > InputMap; + typedef Eigen::Map > OutputMap; const InputScalar* input_base = &input_data[input_index]; OutputScalar* output_base = &output_data[output_index]; @@ -502,7 +502,7 @@ struct TensorBlockCwiseUnaryIO { input_stride, input_data); // Update index. for (int j = 0; j < num_squeezed_dims; ++j) { - auto& state = block_iter_state[j]; + BlockIteratorState& state = block_iter_state[j]; if (++state.count < state.size) { output_index += state.output_stride; input_index += state.input_stride; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 02d061a9c..e5cf93ab0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -596,12 +596,11 @@ struct TensorEvaluator, Device> std::vector* resources) const { // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large // tensors. But this might need further tuning. - auto block_total_size_max = numext::maxi( + Eigen::Index block_total_size_max = numext::maxi( 1, m_device.firstLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( - internal::TensorBlockShapeType::kSkewedInnerDims, - block_total_size_max)); + internal::kSkewedInnerDims, block_total_size_max)); m_impl.getResourceRequirements(resources); } @@ -617,8 +616,8 @@ struct TensorEvaluator, Device> // equal to m_dimensions for inner dims, a smaller than m_dimensions[i] size // for the first outer dim, and 1 for other outer dims. This is guaranteed // by MergeResourceRequirements() in TensorBlock.h. - const auto& output_block_sizes = output_block->block_sizes(); - const auto& output_block_strides = output_block->block_strides(); + const Dimensions& output_block_sizes = output_block->block_sizes(); + const Dimensions& output_block_strides = output_block->block_strides(); // Find where outer dims start. 
int outer_dim_start = 0; @@ -642,7 +641,7 @@ struct TensorEvaluator, Device> return; } - const auto& input_dims = m_impl.dimensions(); + const Dimensions& input_dims = m_impl.dimensions(); // Pre-fill input_block_sizes, broadcast_block_sizes, // broadcast_block_strides, and broadcast_tensor_strides. Later on we will diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 76fab39e2..b47fa9e8e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -290,11 +290,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { - auto block_total_size_max = numext::maxi( + Eigen::Index block_total_size_max = numext::maxi( 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( - internal::TensorBlockShapeType::kSkewedInnerDims, - block_total_size_max)); + internal::kSkewedInnerDims, block_total_size_max)); m_impl.getResourceRequirements(resources); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 1826d7022..965bd8f1e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -550,11 +550,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { - auto block_total_size_max = numext::maxi( + Eigen::Index block_total_size_max = numext::maxi( 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( - internal::TensorBlockShapeType::kSkewedInnerDims, - block_total_size_max)); + internal::kSkewedInnerDims, block_total_size_max)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 2f765acb7..16dc74afe 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -677,11 +677,10 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { - auto block_total_size_max = numext::maxi( + Eigen::Index block_total_size_max = numext::maxi( 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( - internal::TensorBlockShapeType::kSkewedInnerDims, - block_total_size_max)); + internal::kSkewedInnerDims, block_total_size_max)); m_impl.getResourceRequirements(resources); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 3d534eaa2..eeb2578fd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -771,11 +771,10 @@ struct TensorEvaluator, EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { - auto block_total_size_max = numext::maxi( + Eigen::Index block_total_size_max = numext::maxi( 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( - internal::TensorBlockShapeType::kSkewedInnerDims, - block_total_size_max)); + internal::kSkewedInnerDims, block_total_size_max)); 
m_impl.getResourceRequirements(resources); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index a5b541a68..e018d0ab2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -229,10 +229,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { - auto block_total_size_max = numext::maxi( + Eigen::Index block_total_size_max = numext::maxi( 1, m_device.firstLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( - internal::TensorBlockShapeType::kUniformAllDims, block_total_size_max)); + internal::kUniformAllDims, block_total_size_max)); m_impl.getResourceRequirements(resources); } diff --git a/unsupported/test/cxx11_tensor_block_access.cpp b/unsupported/test/cxx11_tensor_block_access.cpp index eec282ba7..ad12ae557 100644 --- a/unsupported/test/cxx11_tensor_block_access.cpp +++ b/unsupported/test/cxx11_tensor_block_access.cpp @@ -535,7 +535,7 @@ static void test_block_cwise_unary_io_basic() { DSizes block_sizes = RandomDims(); DSizes strides(ComputeStrides(block_sizes)); - const auto total_size = block_sizes.TotalSize(); + const Index total_size = block_sizes.TotalSize(); // Create a random input tensors. T* input_data = GenerateRandomData(total_size); @@ -562,12 +562,12 @@ static void test_block_cwise_unary_io_squeeze_ones() { DSizes block_sizes(1, 2, 1, 3, 1); DSizes strides(ComputeStrides(block_sizes)); - const auto total_size = block_sizes.TotalSize(); + const Index total_size = block_sizes.TotalSize(); // Create a random input tensors. - auto* input_data = GenerateRandomData(total_size); + float* input_data = GenerateRandomData(total_size); - auto* output_data = new float[total_size]; + float* output_data = new float[total_size]; UnaryFunctor functor; TensorBlockCwiseUnaryIO::Run(functor, block_sizes, strides, output_data, strides, input_data); @@ -599,13 +599,13 @@ static void test_block_cwise_unary_io_zero_strides() { input_strides[4] = 0; // Generate random data. 
- auto* input_data = GenerateRandomData(input_sizes.TotalSize()); + float* input_data = GenerateRandomData(input_sizes.TotalSize()); DSizes output_sizes = rnd_dims; DSizes output_strides(ComputeStrides(output_sizes)); - const auto output_total_size = output_sizes.TotalSize(); - auto* output_data = new float[output_total_size]; + const Index output_total_size = output_sizes.TotalSize(); + float* output_data = new float[output_total_size]; UnaryFunctor functor; TensorBlockCwiseUnaryIO::Run(functor, output_sizes, output_strides, diff --git a/unsupported/test/cxx11_tensor_shuffling.cpp b/unsupported/test/cxx11_tensor_shuffling.cpp index 467df39c7..062dd1c0f 100644 --- a/unsupported/test/cxx11_tensor_shuffling.cpp +++ b/unsupported/test/cxx11_tensor_shuffling.cpp @@ -81,12 +81,12 @@ static void test_expr_shuffling() Tensor expected; expected = tensor.shuffle(shuffles); - Tensor result(5,7,3,2); + Tensor result(5, 7, 3, 2); - array src_slice_dim{{2,3,1,7}}; - array src_slice_start{{0,0,0,0}}; - array dst_slice_dim{{1,7,3,2}}; - array dst_slice_start{{0,0,0,0}}; + array src_slice_dim({2, 3, 1, 7}); + array src_slice_start({0, 0, 0, 0}); + array dst_slice_dim({1, 7, 3, 2}); + array dst_slice_start({0, 0, 0, 0}); for (int i = 0; i < 5; ++i) { result.slice(dst_slice_start, dst_slice_dim) = -- cgit v1.2.3 From 01197e44527941c95f9a63e4f60ab3a989f12cbe Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Thu, 13 Sep 2018 15:03:36 -0700 Subject: Fix warnings --- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 2 -- unsupported/test/cxx11_tensor_executor.cpp | 8 ++++---- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index eeb2578fd..261dbffa8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -1140,7 +1140,6 @@ struct TensorEvaluator, const Index max_coeff_count, const DSizes& input_slice_sizes, DSizes* target_input_block_sizes) const { - typedef typename internal::packet_traits::type Packet; typedef internal::BlockReducer BlockReducer; // TODO(andydavis) Compute reducer overhead correctly for the case where // we are preserving the inner most dimension, and a single reducer @@ -1168,7 +1167,6 @@ struct TensorEvaluator, } else if (!first_preserved_dim_allocated) { // TODO(andydavis) Include output block size in this L1 working set // calculation. 
-      const Index allocated = max_coeff_count - coeff_to_allocate;
       const Index alloc_size = numext::maxi(
           static_cast(1), coeff_to_allocate / reducer_overhead);
       (*target_input_block_sizes)[dim] =
diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp
index 8639e7b38..1bb99854c 100644
--- a/unsupported/test/cxx11_tensor_executor.cpp
+++ b/unsupported/test/cxx11_tensor_executor.cpp
@@ -27,7 +27,7 @@ static array RandomDims(int min_dim = 1, int max_dim = 20) {
     dims[i] = internal::random(min_dim, max_dim);
   }
   return dims;
-};
+}
 
 template
@@ -117,7 +117,7 @@ static void test_execute_broadcasting(Device d)
   for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
     VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
   }
-};
+}
 
 template
@@ -155,7 +155,7 @@ static void test_execute_chipping_rvalue(Device d) {
   TEST_CHIPPING(5)
 
 #undef TEST_CHIPPING
-};
+}
 
 template
@@ -207,7 +207,7 @@ static void test_execute_chipping_lvalue(Device d) {
   TEST_CHIPPING(5)
 
 #undef TEST_CHIPPING
-};
+}
 
 template
-- cgit v1.2.3
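A minimal standalone sketch of the idea behind the IsTileable change in the first patch of this series: tiled (block) evaluation is selected only when an expression both supports block access and at least one sub-expression prefers it (e.g. broadcasting). The struct and function names below are illustrative stand-ins, not Eigen's actual internals, and this code is not part of any patch above.

#include <iostream>

// Illustrative stand-in for the evaluator flags touched throughout this series.
struct EvaluatorTraits {
  bool block_access;         // the evaluator can produce/consume tensor blocks
  bool prefer_block_access;  // some sub-expression is markedly faster when tiled
};

// Mirrors the updated IsTileable logic: tiling is used only when block
// evaluation is both supported and preferred.
bool is_tileable(const EvaluatorTraits& t) {
  return t.block_access && t.prefer_block_access;
}

int main() {
  const EvaluatorTraits broadcast_expr = {true, true};    // e.g. broadcasting: tiling pays off
  const EvaluatorTraits plain_unary_expr = {true, false}; // block access supported, but not preferred
  std::cout << std::boolalpha
            << "broadcast tileable: " << is_tileable(broadcast_expr) << "\n"
            << "plain unary tileable: " << is_tileable(plain_unary_expr) << "\n";
  return 0;
}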