From 966c2a7bb62a8b5b9ecd349730ffcd3b5719837d Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev
Date: Fri, 27 Jul 2018 12:45:17 -0700
Subject: Rename Index to StorageIndex + use Eigen::Array and Eigen::Map when
 possible

---
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h        | 185 +++++++++++----------
 1 file changed, 94 insertions(+), 91 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 024de3696..ac5afd891 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -36,15 +36,16 @@
 template <typename Expression, typename Device, bool Vectorizable, bool Tileable>
 class TensorExecutor {
  public:
-  typedef typename Expression::Index Index;
+  using StorageIndex = typename Expression::Index;
+
   EIGEN_DEVICE_FUNC
   static inline void run(const Expression& expr,
                          const Device& device = Device()) {
     TensorEvaluator<Expression, Device> evaluator(expr, device);
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
     if (needs_assign) {
-      const Index size = array_prod(evaluator.dimensions());
-      for (Index i = 0; i < size; ++i) {
+      const StorageIndex size = array_prod(evaluator.dimensions());
+      for (StorageIndex i = 0; i < size; ++i) {
         evaluator.evalScalar(i);
       }
     }
@@ -56,35 +57,36 @@ class TensorExecutor {
  * Process all the data with a single cpu thread, using vectorized instructions.
  */
 template <typename Expression>
-class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/true, /*Tileable*/false> {
+class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/ true,
+                     /*Tileable*/ false> {
  public:
-  typedef typename Expression::Index Index;
+  using StorageIndex = typename Expression::Index;
 
   EIGEN_DEVICE_FUNC
-  static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice())
-  {
+  static inline void run(const Expression& expr,
+                         const DefaultDevice& device = DefaultDevice()) {
     TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
-    if (needs_assign)
-    {
-      const Index size = array_prod(evaluator.dimensions());
+    if (needs_assign) {
+      const StorageIndex size = array_prod(evaluator.dimensions());
       const int PacketSize = unpacket_traits<typename TensorEvaluator<
           Expression, DefaultDevice>::PacketReturnType>::size;
 
       // Give compiler a strong possibility to unroll the loop. But don't insist
       // on unrolling, because if the function is expensive compiler should not
       // unroll the loop at the expense of inlining.
-      const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
-      for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
-        for (Index j = 0; j < 4; j++) {
+      const StorageIndex UnrolledSize =
+          (size / (4 * PacketSize)) * 4 * PacketSize;
+      for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
+        for (StorageIndex j = 0; j < 4; j++) {
           evaluator.evalPacket(i + j * PacketSize);
         }
       }
-      const Index VectorizedSize = (size / PacketSize) * PacketSize;
-      for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
+      const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
+      for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
         evaluator.evalPacket(i);
       }
-      for (Index i = VectorizedSize; i < size; ++i) {
+      for (StorageIndex i = VectorizedSize; i < size; ++i) {
         evaluator.evalScalar(i);
       }
     }
@@ -97,42 +99,41 @@ class TensorExecutor
  */
 template <typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, DefaultDevice, Vectorizable, /*Tileable*/true> {
+class TensorExecutor<Expression, DefaultDevice, Vectorizable,
+                     /*Tileable*/ true> {
  public:
-  typedef typename Expression::Index Index;
+  using Scalar = typename traits<Expression>::Scalar;
+  using ScalarNoConst = typename remove_const<Scalar>::type;
+
+  using Evaluator = TensorEvaluator<Expression, DefaultDevice>;
+  using StorageIndex = typename traits<Expression>::Index;
+
+  static const int NumDims = traits<Expression>::NumDimensions;
 
   EIGEN_DEVICE_FUNC
   static inline void run(const Expression& expr,
                          const DefaultDevice& device = DefaultDevice()) {
-    using Evaluator = TensorEvaluator<Expression, DefaultDevice>;
-
-    using Index = typename traits<Expression>::Index;
-    const int NumDims = traits<Expression>::NumDimensions;
-
-    using Scalar = typename traits<Expression>::Scalar;
-    using ScalarNoConst = typename remove_const<Scalar>::type;
-
     using TensorBlock =
-        TensorBlock<ScalarNoConst, Index, NumDims, Evaluator::Layout>;
-    using TensorBlockMapper =
-        TensorBlockMapper<ScalarNoConst, Index, NumDims, Evaluator::Layout>;
+        TensorBlock<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout>;
+    using TensorBlockMapper = TensorBlockMapper<ScalarNoConst, StorageIndex,
+                                                NumDims, Evaluator::Layout>;
 
     Evaluator evaluator(expr, device);
-    std::size_t total_size = array_prod(evaluator.dimensions());
-    std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
+    Index total_size = array_prod(evaluator.dimensions());
+    Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
     if (total_size < cache_size) {
       // TODO(andydavis) Reduce block management overhead for small tensors.
       // TODO(wuke) Do not do this when evaluating TensorBroadcastingOp.
       internal::TensorExecutor<Expression, DefaultDevice, Vectorizable,
-                               false>::run(expr, device);
+                               /*Tileable*/ false>::run(expr, device);
       return;
     }
 
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
     if (needs_assign) {
       // Size tensor blocks to fit in cache (or requested target block size).
-      size_t block_total_size = numext::mini(cache_size, total_size);
+      Index block_total_size = numext::mini(cache_size, total_size);
       TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims;
       // Query expression tree for desired block size/shape.
       std::vector<TensorOpResourceRequirements> resources;
@@ -146,8 +147,8 @@ class TensorExecutor
       Scalar* data = static_cast<Scalar*>(
           device.allocate(block_total_size * sizeof(Scalar)));
 
-      const Index total_block_count = block_mapper.total_block_count();
-      for (Index i = 0; i < total_block_count; ++i) {
+      const StorageIndex total_block_count = block_mapper.total_block_count();
+      for (StorageIndex i = 0; i < total_block_count; ++i) {
         TensorBlock block = block_mapper.GetBlockForIndex(i, data);
         evaluator.evalBlock(&block);
       }
@@ -162,37 +163,38 @@ class TensorExecutor
  * executed on a single core.
  */
 #ifdef EIGEN_USE_THREADS
-template <typename Evaluator, typename Index, bool Vectorizable>
+template <typename Evaluator, typename StorageIndex, bool Vectorizable>
 struct EvalRange {
-  static void run(Evaluator* evaluator_in, const Index first, const Index last) {
+  static void run(Evaluator* evaluator_in, const StorageIndex first,
+                  const StorageIndex last) {
     Evaluator evaluator = *evaluator_in;
     eigen_assert(last >= first);
-    for (Index i = first; i < last; ++i) {
+    for (StorageIndex i = first; i < last; ++i) {
       evaluator.evalScalar(i);
     }
   }
 
-  static Index alignBlockSize(Index size) {
-    return size;
-  }
+  static StorageIndex alignBlockSize(StorageIndex size) { return size; }
 };
 
-template <typename Evaluator, typename Index>
-struct EvalRange<Evaluator, Index, true> {
-  static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+template <typename Evaluator, typename StorageIndex>
+struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
+  static const int PacketSize =
+      unpacket_traits<typename Evaluator::PacketReturnType>::size;
 
-  static void run(Evaluator* evaluator_in, const Index first, const Index last) {
+  static void run(Evaluator* evaluator_in, const StorageIndex first,
+                  const StorageIndex last) {
     Evaluator evaluator = *evaluator_in;
     eigen_assert(last >= first);
-    Index i = first;
+    StorageIndex i = first;
     if (last - first >= PacketSize) {
       eigen_assert(first % PacketSize == 0);
-      Index last_chunk_offset = last - 4 * PacketSize;
+      StorageIndex last_chunk_offset = last - 4 * PacketSize;
       // Give compiler a strong possibility to unroll the loop. But don't insist
       // on unrolling, because if the function is expensive compiler should not
       // unroll the loop at the expense of inlining.
-      for (; i <= last_chunk_offset; i += 4*PacketSize) {
-        for (Index j = 0; j < 4; j++) {
+      for (; i <= last_chunk_offset; i += 4 * PacketSize) {
+        for (StorageIndex j = 0; j < 4; j++) {
           evaluator.evalPacket(i + j * PacketSize);
         }
       }
@@ -206,7 +208,7 @@ struct EvalRange {
     }
   }
 
-  static Index alignBlockSize(Index size) {
+  static StorageIndex alignBlockSize(StorageIndex size) {
     // Align block size to packet size and account for unrolling in run above.
     if (size >= 16 * PacketSize) {
       return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
@@ -219,24 +221,24 @@ struct EvalRange {
 template <typename Expression, bool Vectorizable, bool Tileable>
 class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> {
  public:
-  typedef typename Expression::Index Index;
+  using StorageIndex = typename Expression::Index;
 
   static inline void run(const Expression& expr,
                          const ThreadPoolDevice& device) {
     typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
-    typedef EvalRange<Evaluator, Index, Vectorizable> EvalRange;
+    typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
 
     Evaluator evaluator(expr, device);
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
     if (needs_assign) {
-      const Index PacketSize =
+      const StorageIndex PacketSize =
           Vectorizable
               ? unpacket_traits<typename Evaluator::PacketReturnType>::size
              : 1;
-      const Index size = array_prod(evaluator.dimensions());
+      const StorageIndex size = array_prod(evaluator.dimensions());
       device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
                          EvalRange::alignBlockSize,
-                         [&evaluator](Index first, Index last) {
+                         [&evaluator](StorageIndex first, StorageIndex last) {
                            EvalRange::run(&evaluator, first, last);
                          });
     }
@@ -247,24 +249,24 @@ class TensorExecutor {
 template <typename Expression, bool Vectorizable>
 class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ true> {
  public:
-  typedef typename Expression::Index Index;
+  using Scalar = typename traits<Expression>::Scalar;
+  using ScalarNoConst = typename remove_const<Scalar>::type;
 
-  static inline void run(const Expression& expr,
-                         const ThreadPoolDevice& device) {
-    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
-    typedef typename internal::remove_const<
-        typename traits<Expression>::Scalar>::type Scalar;
-    typedef typename traits<Expression>::Index Index;
+  using Evaluator = TensorEvaluator<Expression, ThreadPoolDevice>;
+  using StorageIndex = typename traits<Expression>::Index;
 
-    static const int NumDims = traits<Expression>::NumDimensions;
+  static const int NumDims = traits<Expression>::NumDimensions;
 
-    typedef TensorBlock<Scalar, Index, NumDims, Evaluator::Layout> TensorBlock;
-    typedef TensorBlockMapper<Scalar, Index, NumDims, Evaluator::Layout>
-        TensorBlockMapper;
+  static inline void run(const Expression& expr,
+                         const ThreadPoolDevice& device) {
+    using TensorBlock =
+        TensorBlock<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout>;
+    using TensorBlockMapper =
+        TensorBlockMapper<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout>;
 
     Evaluator evaluator(expr, device);
-    std::size_t total_size = array_prod(evaluator.dimensions());
-    std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
+    StorageIndex total_size = array_prod(evaluator.dimensions());
+    StorageIndex cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
     if (total_size < cache_size) {
       // TODO(andydavis) Reduce block management overhead for small tensors.
       internal::TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
                                false>::run(expr, device);
       evaluator.cleanup();
       return;
     }
 
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
     if (needs_assign) {
       TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims;
       Index block_total_size = 0;
       // Query expression tree for desired block size/shape.
       std::vector<internal::TensorOpResourceRequirements> resources;
       evaluator.getResourceRequirements(&resources);
@@ -296,15 +298,16 @@ class TensorExecutor
       void* buf = device.allocate((num_threads + 1) * aligned_blocksize);
       device.parallelFor(
           block_mapper.total_block_count(), cost * block_size,
-          [=, &device, &evaluator, &block_mapper](Index first, Index last) {
+          [=, &device, &evaluator, &block_mapper](StorageIndex first,
+                                                  StorageIndex last) {
            // currentThreadId() returns -1 if called from a thread not in the
            // thread pool, such as the main thread dispatching Eigen
            // expressions.
            const int thread_idx = device.currentThreadId();
            eigen_assert(thread_idx >= -1 && thread_idx < num_threads);
            Scalar* thread_buf = reinterpret_cast<Scalar*>(
                static_cast<char*>(buf) + aligned_blocksize * (thread_idx + 1));
-            for (Index i = first; i < last; ++i) {
+            for (StorageIndex i = first; i < last; ++i) {
              auto block = block_mapper.GetBlockForIndex(i, thread_buf);
              evaluator.evalBlock(&block);
            }
@@ -324,51 +327,51 @@ class TensorExecutor
 template <typename Expression, bool Vectorizable, bool Tileable>
 class TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable> {
  public:
-  typedef typename Expression::Index Index;
+  typedef typename Expression::Index StorageIndex;
   static void run(const Expression& expr, const GpuDevice& device);
 };
 
 #if defined(EIGEN_GPUCC)
 
-template <typename Evaluator, typename Index, bool Vectorizable>
+template <typename Evaluator, typename StorageIndex, bool Vectorizable>
 struct EigenMetaKernelEval {
   static __device__ EIGEN_ALWAYS_INLINE
-  void run(Evaluator& eval, Index first, Index last, Index step_size) {
-    for (Index i = first; i < last; i += step_size) {
+  void run(Evaluator& eval, StorageIndex first, StorageIndex last, StorageIndex step_size) {
+    for (StorageIndex i = first; i < last; i += step_size) {
       eval.evalScalar(i);
     }
   }
 };
 
-template <typename Evaluator, typename Index>
-struct EigenMetaKernelEval<Evaluator, Index, true> {
+template <typename Evaluator, typename StorageIndex>
+struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
   static __device__ EIGEN_ALWAYS_INLINE
-  void run(Evaluator& eval, Index first, Index last, Index step_size) {
-    const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
-    const Index vectorized_size = (last / PacketSize) * PacketSize;
-    const Index vectorized_step_size = step_size * PacketSize;
+  void run(Evaluator& eval, StorageIndex first, StorageIndex last, StorageIndex step_size) {
+    const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+    const StorageIndex vectorized_size = (last / PacketSize) * PacketSize;
+    const StorageIndex vectorized_step_size = step_size * PacketSize;
 
     // Use the vector path
-    for (Index i = first * PacketSize; i < vectorized_size;
+    for (StorageIndex i = first * PacketSize; i < vectorized_size;
          i += vectorized_step_size) {
       eval.evalPacket(i);
     }
-    for (Index i = vectorized_size + first; i < last; i += step_size) {
+    for (StorageIndex i = vectorized_size + first; i < last; i += step_size) {
       eval.evalScalar(i);
    }
  }
 };
 
-template <typename Evaluator, typename Index>
+template <typename Evaluator, typename StorageIndex>
 __global__ void
 __launch_bounds__(1024)
-EigenMetaKernel(Evaluator eval, Index size) {
+EigenMetaKernel(Evaluator eval, StorageIndex size) {
 
-  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
-  const Index step_size = blockDim.x * gridDim.x;
+  const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
+  const StorageIndex step_size = blockDim.x * gridDim.x;
 
   const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
-  EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size);
+  EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(eval, first_index, size, step_size);
 }
 
 /*static*/
@@ -382,12 +385,12 @@ inline void TensorExecutor::run(
     const int block_size = device.maxGpuThreadsPerBlock();
     const int max_blocks = device.getNumGpuMultiProcessors() *
                            device.maxGpuThreadsPerMultiProcessor() / block_size;
-    const Index size = array_prod(evaluator.dimensions());
+    const StorageIndex size = array_prod(evaluator.dimensions());
     // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
    const int num_blocks = numext::maxi(numext::mini(max_blocks, divup(size, block_size)), 1);
 
    LAUNCH_GPU_KERNEL(
-        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
+        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
        num_blocks, block_size, 0, device, evaluator, size);
   }
   evaluator.cleanup();
--
cgit v1.2.3
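
Not part of the patch: a minimal sketch of how expressions reach the executors touched above. Everything below (the include path, tensor sizes, the 4-thread pool) is an illustrative assumption; the point is only that the Index -> StorageIndex rename is internal to TensorExecutor, so calling code like this is unaffected.

// Illustrative only -- not from the commit. Assumes the unsupported Tensor
// module headers; sizes and thread count are arbitrary.
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> a(64, 64), b(64, 64), c(64, 64);
  a.setRandom();
  b.setRandom();

  // Single-threaded path: assignment is dispatched through
  // internal::TensorExecutor<Expr, DefaultDevice, ...>::run.
  c = a + b;

  // Multi-threaded path: device() assignment goes through the
  // ThreadPoolDevice specializations modified in this patch.
  Eigen::ThreadPool pool(4);
  Eigen::ThreadPoolDevice device(&pool, 4);
  c.device(device) = a * b + b;

  return 0;
}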