diff options
author | Eugene Zhulenev <ezhulenev@google.com> | 2018-07-27 12:45:17 -0700 |
---|---|---|
committer | Eugene Zhulenev <ezhulenev@google.com> | 2018-07-27 12:45:17 -0700 |
commit | 966c2a7bb62a8b5b9ecd349730ffcd3b5719837d (patch) | |
tree | 83e61bb77a5340f529c336afaa69cc78d654d599 /unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | |
parent | 6913221c43c6ad41b1fbfc0d263d2764abd11ad2 (diff) |
Rename Index to StorageIndex + use Eigen::Array and Eigen::Map when possible
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h')
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 185 |
1 files changed, 94 insertions, 91 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 024de3696..ac5afd891 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -36,15 +36,16 @@ template <typename Expression, typename Device, bool Vectorizable, bool Tileable> class TensorExecutor { public: - typedef typename Expression::Index Index; + using StorageIndex = typename Expression::Index; + EIGEN_DEVICE_FUNC static inline void run(const Expression& expr, const Device& device = Device()) { TensorEvaluator<Expression, Device> evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { - const Index size = array_prod(evaluator.dimensions()); - for (Index i = 0; i < size; ++i) { + const StorageIndex size = array_prod(evaluator.dimensions()); + for (StorageIndex i = 0; i < size; ++i) { evaluator.evalScalar(i); } } @@ -56,35 +57,36 @@ class TensorExecutor { * Process all the data with a single cpu thread, using vectorized instructions. */ template <typename Expression> -class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/ true, /*Tilable*/ false> { +class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/ true, + /*Tileable*/ false> { public: - typedef typename Expression::Index Index; + using StorageIndex = typename Expression::Index; EIGEN_DEVICE_FUNC - static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) - { + static inline void run(const Expression& expr, + const DefaultDevice& device = DefaultDevice()) { TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) - { - const Index size = array_prod(evaluator.dimensions()); + if (needs_assign) { + const StorageIndex size = array_prod(evaluator.dimensions()); const int PacketSize = unpacket_traits<typename TensorEvaluator< Expression, DefaultDevice>::PacketReturnType>::size; // Give compiler a strong possibility to unroll the loop. But don't insist // on unrolling, because if the function is expensive compiler should not // unroll the loop at the expense of inlining. - const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize; - for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) { - for (Index j = 0; j < 4; j++) { + const StorageIndex UnrolledSize = + (size / (4 * PacketSize)) * 4 * PacketSize; + for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) { + for (StorageIndex j = 0; j < 4; j++) { evaluator.evalPacket(i + j * PacketSize); } } - const Index VectorizedSize = (size / PacketSize) * PacketSize; - for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) { + const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize; + for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) { evaluator.evalPacket(i); } - for (Index i = VectorizedSize; i < size; ++i) { + for (StorageIndex i = VectorizedSize; i < size; ++i) { evaluator.evalScalar(i); } } @@ -97,42 +99,41 @@ class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/ true, /*Tilable * sizing a block to fit L1 cache we get better cache performance. */ template <typename Expression, bool Vectorizable> -class TensorExecutor<Expression, DefaultDevice, Vectorizable, /*Tilable*/ true> { +class TensorExecutor<Expression, DefaultDevice, Vectorizable, + /*Tileable*/ true> { public: - typedef typename Expression::Index Index; + using Scalar = typename traits<Expression>::Scalar; + using ScalarNoConst = typename remove_const<Scalar>::type; + + using Evaluator = TensorEvaluator<Expression, DefaultDevice>; + using StorageIndex = typename traits<Expression>::Index; + + static const int NumDims = traits<Expression>::NumDimensions; EIGEN_DEVICE_FUNC static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) { - using Evaluator = TensorEvaluator<Expression, DefaultDevice>; - - using Index = typename traits<Expression>::Index; - const int NumDims = traits<Expression>::NumDimensions; - - using Scalar = typename traits<Expression>::Scalar; - using ScalarNoConst = typename remove_const<Scalar>::type; - using TensorBlock = - TensorBlock<ScalarNoConst, Index, NumDims, Evaluator::Layout>; - using TensorBlockMapper = - TensorBlockMapper<ScalarNoConst, Index, NumDims, Evaluator::Layout>; + TensorBlock<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout>; + using TensorBlockMapper = TensorBlockMapper<ScalarNoConst, StorageIndex, + NumDims, Evaluator::Layout>; Evaluator evaluator(expr, device); - std::size_t total_size = array_prod(evaluator.dimensions()); - std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar); + Index total_size = array_prod(evaluator.dimensions()); + Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar); if (total_size < cache_size) { // TODO(andydavis) Reduce block management overhead for small tensors. // TODO(wuke) Do not do this when evaluating TensorBroadcastingOp. internal::TensorExecutor<Expression, DefaultDevice, Vectorizable, - false>::run(expr, device); + /*Tileable*/ false>::run(expr, device); return; } const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { // Size tensor blocks to fit in cache (or requested target block size). - size_t block_total_size = numext::mini(cache_size, total_size); + Index block_total_size = numext::mini(cache_size, total_size); TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims; // Query expression tree for desired block size/shape. std::vector<TensorOpResourceRequirements> resources; @@ -146,8 +147,8 @@ class TensorExecutor<Expression, DefaultDevice, Vectorizable, /*Tilable*/ true> Scalar* data = static_cast<Scalar*>( device.allocate(block_total_size * sizeof(Scalar))); - const Index total_block_count = block_mapper.total_block_count(); - for (Index i = 0; i < total_block_count; ++i) { + const StorageIndex total_block_count = block_mapper.total_block_count(); + for (StorageIndex i = 0; i < total_block_count; ++i) { TensorBlock block = block_mapper.GetBlockForIndex(i, data); evaluator.evalBlock(&block); } @@ -162,37 +163,38 @@ class TensorExecutor<Expression, DefaultDevice, Vectorizable, /*Tilable*/ true> * executed on a single core. */ #ifdef EIGEN_USE_THREADS -template <typename Evaluator, typename Index, bool Vectorizable> +template <typename Evaluator, typename StorageIndex, bool Vectorizable> struct EvalRange { - static void run(Evaluator* evaluator_in, const Index first, const Index last) { + static void run(Evaluator* evaluator_in, const StorageIndex first, + const StorageIndex last) { Evaluator evaluator = *evaluator_in; eigen_assert(last >= first); - for (Index i = first; i < last; ++i) { + for (StorageIndex i = first; i < last; ++i) { evaluator.evalScalar(i); } } - static Index alignBlockSize(Index size) { - return size; - } + static StorageIndex alignBlockSize(StorageIndex size) { return size; } }; -template <typename Evaluator, typename Index> -struct EvalRange<Evaluator, Index, /*Vectorizable*/ true> { - static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size; +template <typename Evaluator, typename StorageIndex> +struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> { + static const int PacketSize = + unpacket_traits<typename Evaluator::PacketReturnType>::size; - static void run(Evaluator* evaluator_in, const Index first, const Index last) { + static void run(Evaluator* evaluator_in, const StorageIndex first, + const StorageIndex last) { Evaluator evaluator = *evaluator_in; eigen_assert(last >= first); - Index i = first; + StorageIndex i = first; if (last - first >= PacketSize) { eigen_assert(first % PacketSize == 0); - Index last_chunk_offset = last - 4 * PacketSize; + StorageIndex last_chunk_offset = last - 4 * PacketSize; // Give compiler a strong possibility to unroll the loop. But don't insist // on unrolling, because if the function is expensive compiler should not // unroll the loop at the expense of inlining. - for (; i <= last_chunk_offset; i += 4*PacketSize) { - for (Index j = 0; j < 4; j++) { + for (; i <= last_chunk_offset; i += 4 * PacketSize) { + for (StorageIndex j = 0; j < 4; j++) { evaluator.evalPacket(i + j * PacketSize); } } @@ -206,7 +208,7 @@ struct EvalRange<Evaluator, Index, /*Vectorizable*/ true> { } } - static Index alignBlockSize(Index size) { + static StorageIndex alignBlockSize(StorageIndex size) { // Align block size to packet size and account for unrolling in run above. if (size >= 16 * PacketSize) { return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1); @@ -219,24 +221,24 @@ struct EvalRange<Evaluator, Index, /*Vectorizable*/ true> { template <typename Expression, bool Vectorizable, bool Tileable> class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> { public: - typedef typename Expression::Index Index; + using StorageIndex = typename Expression::Index; static inline void run(const Expression& expr, const ThreadPoolDevice& device) { typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator; - typedef EvalRange<Evaluator, Index, Vectorizable> EvalRange; + typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange; Evaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); if (needs_assign) { - const Index PacketSize = + const StorageIndex PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1; - const Index size = array_prod(evaluator.dimensions()); + const StorageIndex size = array_prod(evaluator.dimensions()); device.parallelFor(size, evaluator.costPerCoeff(Vectorizable), EvalRange::alignBlockSize, - [&evaluator](Index first, Index last) { + [&evaluator](StorageIndex first, StorageIndex last) { EvalRange::run(&evaluator, first, last); }); } @@ -247,24 +249,24 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> { template <typename Expression, bool Vectorizable> class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ true> { public: - typedef typename Expression::Index Index; + using Scalar = typename traits<Expression>::Scalar; + using ScalarNoConst = typename remove_const<Scalar>::type; - static inline void run(const Expression& expr, - const ThreadPoolDevice& device) { - typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator; - typedef typename internal::remove_const< - typename traits<Expression>::Scalar>::type Scalar; - typedef typename traits<Expression>::Index Index; + using Evaluator = TensorEvaluator<Expression, ThreadPoolDevice>; + using StorageIndex = typename traits<Expression>::Index; - static const int NumDims = traits<Expression>::NumDimensions; + static const int NumDims = traits<Expression>::NumDimensions; - typedef TensorBlock<Scalar, Index, NumDims, Evaluator::Layout> TensorBlock; - typedef TensorBlockMapper<Scalar, Index, NumDims, Evaluator::Layout> - TensorBlockMapper; + static inline void run(const Expression& expr, + const ThreadPoolDevice& device) { + using TensorBlock = + TensorBlock<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout>; + using TensorBlockMapper = + TensorBlockMapper<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout>; Evaluator evaluator(expr, device); - std::size_t total_size = array_prod(evaluator.dimensions()); - std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar); + StorageIndex total_size = array_prod(evaluator.dimensions()); + StorageIndex cache_size = device.firstLevelCacheSize() / sizeof(Scalar); if (total_size < cache_size) { // TODO(andydavis) Reduce block management overhead for small tensors. internal::TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, @@ -276,7 +278,7 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); if (needs_assign) { TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims; - size_t block_total_size = 0; + Index block_total_size = 0; // Query expression tree for desired block size/shape. std::vector<internal::TensorOpResourceRequirements> resources; evaluator.getResourceRequirements(&resources); @@ -296,15 +298,16 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr void* buf = device.allocate((num_threads + 1) * aligned_blocksize); device.parallelFor( block_mapper.total_block_count(), cost * block_size, - [=, &device, &evaluator, &block_mapper](Index first, Index last) { + [=, &device, &evaluator, &block_mapper](StorageIndex first, + StorageIndex last) { // currentThreadId() returns -1 if called from a thread not in the - // threadpool, such as the main thread dispatching Eigen + // thread pool, such as the main thread dispatching Eigen // expressions. const int thread_idx = device.currentThreadId(); eigen_assert(thread_idx >= -1 && thread_idx < num_threads); Scalar* thread_buf = reinterpret_cast<Scalar*>( static_cast<char*>(buf) + aligned_blocksize * (thread_idx + 1)); - for (Index i = first; i < last; ++i) { + for (StorageIndex i = first; i < last; ++i) { auto block = block_mapper.GetBlockForIndex(i, thread_buf); evaluator.evalBlock(&block); } @@ -324,51 +327,51 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr template <typename Expression, bool Vectorizable, bool Tileable> class TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable> { public: - typedef typename Expression::Index Index; + typedef typename Expression::Index StorageIndex; static void run(const Expression& expr, const GpuDevice& device); }; #if defined(EIGEN_GPUCC) -template <typename Evaluator, typename Index, bool Vectorizable> +template <typename Evaluator, typename StorageIndex, bool Vectorizable> struct EigenMetaKernelEval { static __device__ EIGEN_ALWAYS_INLINE - void run(Evaluator& eval, Index first, Index last, Index step_size) { - for (Index i = first; i < last; i += step_size) { + void run(Evaluator& eval, StorageIndex first, StorageIndex last, StorageIndex step_size) { + for (StorageIndex i = first; i < last; i += step_size) { eval.evalScalar(i); } } }; -template <typename Evaluator, typename Index> -struct EigenMetaKernelEval<Evaluator, Index, true> { +template <typename Evaluator, typename StorageIndex> +struct EigenMetaKernelEval<Evaluator, StorageIndex, true> { static __device__ EIGEN_ALWAYS_INLINE - void run(Evaluator& eval, Index first, Index last, Index step_size) { - const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size; - const Index vectorized_size = (last / PacketSize) * PacketSize; - const Index vectorized_step_size = step_size * PacketSize; + void run(Evaluator& eval, StorageIndex first, StorageIndex last, StorageIndex step_size) { + const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size; + const StorageIndex vectorized_size = (last / PacketSize) * PacketSize; + const StorageIndex vectorized_step_size = step_size * PacketSize; // Use the vector path - for (Index i = first * PacketSize; i < vectorized_size; + for (StorageIndex i = first * PacketSize; i < vectorized_size; i += vectorized_step_size) { eval.evalPacket(i); } - for (Index i = vectorized_size + first; i < last; i += step_size) { + for (StorageIndex i = vectorized_size + first; i < last; i += step_size) { eval.evalScalar(i); } } }; -template <typename Evaluator, typename Index> +template <typename Evaluator, typename StorageIndex> __global__ void __launch_bounds__(1024) -EigenMetaKernel(Evaluator eval, Index size) { +EigenMetaKernel(Evaluator eval, StorageIndex size) { - const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; - const Index step_size = blockDim.x * gridDim.x; + const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x; + const StorageIndex step_size = blockDim.x * gridDim.x; const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned; - EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size); + EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(eval, first_index, size, step_size); } /*static*/ @@ -382,12 +385,12 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable>::run( const int block_size = device.maxGpuThreadsPerBlock(); const int max_blocks = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / block_size; - const Index size = array_prod(evaluator.dimensions()); + const StorageIndex size = array_prod(evaluator.dimensions()); // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0. const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1); LAUNCH_GPU_KERNEL( - (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>), + (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>), num_blocks, block_size, 0, device, evaluator, size); } evaluator.cleanup(); |