path: root/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
author    Eugene Zhulenev <ezhulenev@google.com>  2018-07-27 12:45:17 -0700
committer Eugene Zhulenev <ezhulenev@google.com>  2018-07-27 12:45:17 -0700
commit    966c2a7bb62a8b5b9ecd349730ffcd3b5719837d (patch)
tree      83e61bb77a5340f529c336afaa69cc78d654d599 /unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
parent    6913221c43c6ad41b1fbfc0d263d2764abd11ad2 (diff)
Rename Index to StorageIndex + use Eigen::Array and Eigen::Map when possible
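The rename keeps the expression's coefficient index type under the name StorageIndex (taken from the expression or its traits), so it no longer shadows Eigen::Index. The second part of the message, reusing Eigen::Array and Eigen::Map, does not appear in this file's hunks; the following is only a minimal sketch of that idea with made-up names, not code from the patch:

    #include <Eigen/Core>

    // Sketch only: wrap a raw buffer with Eigen::Map over an Eigen::Array so a
    // coefficient-wise expression (which Eigen may vectorize) replaces a
    // hand-written scalar loop.
    void scale_buffer(float* buf, Eigen::Index n) {
      Eigen::Map<Eigen::Array<float, Eigen::Dynamic, 1>> a(buf, n);
      a *= 2.0f;  // same effect as: for (Eigen::Index i = 0; i < n; ++i) buf[i] *= 2.0f;
    }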
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h')
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h  |  185
1 file changed, 94 insertions(+), 91 deletions(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 024de3696..ac5afd891 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -36,15 +36,16 @@ template <typename Expression, typename Device, bool Vectorizable,
bool Tileable>
class TensorExecutor {
public:
- typedef typename Expression::Index Index;
+ using StorageIndex = typename Expression::Index;
+
EIGEN_DEVICE_FUNC
static inline void run(const Expression& expr,
const Device& device = Device()) {
TensorEvaluator<Expression, Device> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
- const Index size = array_prod(evaluator.dimensions());
- for (Index i = 0; i < size; ++i) {
+ const StorageIndex size = array_prod(evaluator.dimensions());
+ for (StorageIndex i = 0; i < size; ++i) {
evaluator.evalScalar(i);
}
}
@@ -56,35 +57,36 @@ class TensorExecutor {
* Process all the data with a single cpu thread, using vectorized instructions.
*/
template <typename Expression>
-class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/ true, /*Tilable*/ false> {
+class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/ true,
+ /*Tileable*/ false> {
public:
- typedef typename Expression::Index Index;
+ using StorageIndex = typename Expression::Index;
EIGEN_DEVICE_FUNC
- static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice())
- {
+ static inline void run(const Expression& expr,
+ const DefaultDevice& device = DefaultDevice()) {
TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
- if (needs_assign)
- {
- const Index size = array_prod(evaluator.dimensions());
+ if (needs_assign) {
+ const StorageIndex size = array_prod(evaluator.dimensions());
const int PacketSize = unpacket_traits<typename TensorEvaluator<
Expression, DefaultDevice>::PacketReturnType>::size;
// Give compiler a strong possibility to unroll the loop. But don't insist
// on unrolling, because if the function is expensive compiler should not
// unroll the loop at the expense of inlining.
- const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
- for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
- for (Index j = 0; j < 4; j++) {
+ const StorageIndex UnrolledSize =
+ (size / (4 * PacketSize)) * 4 * PacketSize;
+ for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
+ for (StorageIndex j = 0; j < 4; j++) {
evaluator.evalPacket(i + j * PacketSize);
}
}
- const Index VectorizedSize = (size / PacketSize) * PacketSize;
- for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
+ const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
+ for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
evaluator.evalPacket(i);
}
- for (Index i = VectorizedSize; i < size; ++i) {
+ for (StorageIndex i = VectorizedSize; i < size; ++i) {
evaluator.evalScalar(i);
}
}
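The comment in the hunk above describes the single-thread vectorized strategy: a loop unrolled four packets at a time, then a plain packet loop, then a scalar tail. A standalone sketch of that three-phase structure, with evalPacket/evalScalar left as placeholder comments and loop bounds assumed non-negative:

    void three_phase_eval(long size, long PacketSize) {
      const long UnrolledSize   = (size / (4 * PacketSize)) * 4 * PacketSize;
      const long VectorizedSize = (size / PacketSize) * PacketSize;
      long i = 0;
      for (; i < UnrolledSize; i += 4 * PacketSize)       // unrolled by 4 packets
        for (long j = 0; j < 4; ++j) { /* evalPacket(i + j * PacketSize) */ }
      for (; i < VectorizedSize; i += PacketSize) { /* evalPacket(i) */ }  // packet loop
      for (; i < size; ++i) { /* evalScalar(i) */ }                        // scalar tail
    }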
@@ -97,42 +99,41 @@ class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/ true, /*Tilable
* sizing a block to fit L1 cache we get better cache performance.
*/
template <typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, DefaultDevice, Vectorizable, /*Tilable*/ true> {
+class TensorExecutor<Expression, DefaultDevice, Vectorizable,
+ /*Tileable*/ true> {
public:
- typedef typename Expression::Index Index;
+ using Scalar = typename traits<Expression>::Scalar;
+ using ScalarNoConst = typename remove_const<Scalar>::type;
+
+ using Evaluator = TensorEvaluator<Expression, DefaultDevice>;
+ using StorageIndex = typename traits<Expression>::Index;
+
+ static const int NumDims = traits<Expression>::NumDimensions;
EIGEN_DEVICE_FUNC
static inline void run(const Expression& expr,
const DefaultDevice& device = DefaultDevice()) {
- using Evaluator = TensorEvaluator<Expression, DefaultDevice>;
-
- using Index = typename traits<Expression>::Index;
- const int NumDims = traits<Expression>::NumDimensions;
-
- using Scalar = typename traits<Expression>::Scalar;
- using ScalarNoConst = typename remove_const<Scalar>::type;
-
using TensorBlock =
- TensorBlock<ScalarNoConst, Index, NumDims, Evaluator::Layout>;
- using TensorBlockMapper =
- TensorBlockMapper<ScalarNoConst, Index, NumDims, Evaluator::Layout>;
+ TensorBlock<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout>;
+ using TensorBlockMapper = TensorBlockMapper<ScalarNoConst, StorageIndex,
+ NumDims, Evaluator::Layout>;
Evaluator evaluator(expr, device);
- std::size_t total_size = array_prod(evaluator.dimensions());
- std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
+ Index total_size = array_prod(evaluator.dimensions());
+ Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
if (total_size < cache_size) {
// TODO(andydavis) Reduce block management overhead for small tensors.
// TODO(wuke) Do not do this when evaluating TensorBroadcastingOp.
internal::TensorExecutor<Expression, DefaultDevice, Vectorizable,
- false>::run(expr, device);
+ /*Tileable*/ false>::run(expr, device);
return;
}
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
// Size tensor blocks to fit in cache (or requested target block size).
- size_t block_total_size = numext::mini(cache_size, total_size);
+ Index block_total_size = numext::mini(cache_size, total_size);
TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims;
// Query expression tree for desired block size/shape.
std::vector<TensorOpResourceRequirements> resources;
@@ -146,8 +147,8 @@ class TensorExecutor<Expression, DefaultDevice, Vectorizable, /*Tilable*/ true>
Scalar* data = static_cast<Scalar*>(
device.allocate(block_total_size * sizeof(Scalar)));
- const Index total_block_count = block_mapper.total_block_count();
- for (Index i = 0; i < total_block_count; ++i) {
+ const StorageIndex total_block_count = block_mapper.total_block_count();
+ for (StorageIndex i = 0; i < total_block_count; ++i) {
TensorBlock block = block_mapper.GetBlockForIndex(i, data);
evaluator.evalBlock(&block);
}
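The tiled executor above sizes blocks against the first-level cache, falls back to the untiled executor when the whole tensor already fits, and then walks the tensor block by block through a single scratch buffer. A hedged sketch of that control flow, with illustrative names rather than the real TensorBlockMapper API:

    #include <algorithm>

    void tiled_eval_sketch(long total_size, long l1_bytes, long scalar_bytes) {
      const long cache_size = l1_bytes / scalar_bytes;  // coefficients that fit in L1
      if (total_size < cache_size) {
        // Small tensor: block management overhead would dominate, run untiled.
        return;
      }
      const long block_total_size = std::min(cache_size, total_size);
      // scratch = allocate(block_total_size * scalar_bytes);
      // for (long i = 0; i < total_block_count; ++i)
      //   evalBlock(block_mapper.GetBlockForIndex(i, scratch));
    }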
@@ -162,37 +163,38 @@ class TensorExecutor<Expression, DefaultDevice, Vectorizable, /*Tilable*/ true>
* executed on a single core.
*/
#ifdef EIGEN_USE_THREADS
-template <typename Evaluator, typename Index, bool Vectorizable>
+template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EvalRange {
- static void run(Evaluator* evaluator_in, const Index first, const Index last) {
+ static void run(Evaluator* evaluator_in, const StorageIndex first,
+ const StorageIndex last) {
Evaluator evaluator = *evaluator_in;
eigen_assert(last >= first);
- for (Index i = first; i < last; ++i) {
+ for (StorageIndex i = first; i < last; ++i) {
evaluator.evalScalar(i);
}
}
- static Index alignBlockSize(Index size) {
- return size;
- }
+ static StorageIndex alignBlockSize(StorageIndex size) { return size; }
};
-template <typename Evaluator, typename Index>
-struct EvalRange<Evaluator, Index, /*Vectorizable*/ true> {
- static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+template <typename Evaluator, typename StorageIndex>
+struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
+ static const int PacketSize =
+ unpacket_traits<typename Evaluator::PacketReturnType>::size;
- static void run(Evaluator* evaluator_in, const Index first, const Index last) {
+ static void run(Evaluator* evaluator_in, const StorageIndex first,
+ const StorageIndex last) {
Evaluator evaluator = *evaluator_in;
eigen_assert(last >= first);
- Index i = first;
+ StorageIndex i = first;
if (last - first >= PacketSize) {
eigen_assert(first % PacketSize == 0);
- Index last_chunk_offset = last - 4 * PacketSize;
+ StorageIndex last_chunk_offset = last - 4 * PacketSize;
// Give compiler a strong possibility to unroll the loop. But don't insist
// on unrolling, because if the function is expensive compiler should not
// unroll the loop at the expense of inlining.
- for (; i <= last_chunk_offset; i += 4*PacketSize) {
- for (Index j = 0; j < 4; j++) {
+ for (; i <= last_chunk_offset; i += 4 * PacketSize) {
+ for (StorageIndex j = 0; j < 4; j++) {
evaluator.evalPacket(i + j * PacketSize);
}
}
@@ -206,7 +208,7 @@ struct EvalRange<Evaluator, Index, /*Vectorizable*/ true> {
}
}
- static Index alignBlockSize(Index size) {
+ static StorageIndex alignBlockSize(StorageIndex size) {
// Align block size to packet size and account for unrolling in run above.
if (size >= 16 * PacketSize) {
return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
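A worked example of the rounding above, assuming PacketSize == 4 (for instance float with 128-bit packets): a requested block size of 1000 is at least 16 * PacketSize, so it is rounded up to the next multiple of 4 * PacketSize = 16.

    // (1000 + 4*4 - 1) & ~(4*4 - 1) rounds 1000 up to 1008, a multiple of 16.
    static_assert(((1000 + 15) & ~15) == 1008, "round up to a multiple of 16");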
@@ -219,24 +221,24 @@ struct EvalRange<Evaluator, Index, /*Vectorizable*/ true> {
template <typename Expression, bool Vectorizable, bool Tileable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> {
public:
- typedef typename Expression::Index Index;
+ using StorageIndex = typename Expression::Index;
static inline void run(const Expression& expr,
const ThreadPoolDevice& device) {
typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
- typedef EvalRange<Evaluator, Index, Vectorizable> EvalRange;
+ typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
Evaluator evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
if (needs_assign) {
- const Index PacketSize =
+ const StorageIndex PacketSize =
Vectorizable
? unpacket_traits<typename Evaluator::PacketReturnType>::size
: 1;
- const Index size = array_prod(evaluator.dimensions());
+ const StorageIndex size = array_prod(evaluator.dimensions());
device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
EvalRange::alignBlockSize,
- [&evaluator](Index first, Index last) {
+ [&evaluator](StorageIndex first, StorageIndex last) {
EvalRange::run(&evaluator, first, last);
});
}
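device.parallelFor splits [0, size) into chunks, sized from the cost estimate and aligned by EvalRange::alignBlockSize, and invokes the lambda with half-open [first, last) ranges on pool threads. A hedged serial stand-in for that contract, not ThreadPoolDevice's real implementation:

    #include <algorithm>

    // Calls f(first, last) over consecutive chunks of [0, size); the real device
    // runs these calls on worker threads and picks block_size from the cost model.
    template <typename F>
    void parallel_for_sketch(long size, long block_size, F f) {
      for (long first = 0; first < size; first += block_size)
        f(first, std::min(first + block_size, size));
    }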
@@ -247,24 +249,24 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> {
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ true> {
public:
- typedef typename Expression::Index Index;
+ using Scalar = typename traits<Expression>::Scalar;
+ using ScalarNoConst = typename remove_const<Scalar>::type;
- static inline void run(const Expression& expr,
- const ThreadPoolDevice& device) {
- typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
- typedef typename internal::remove_const<
- typename traits<Expression>::Scalar>::type Scalar;
- typedef typename traits<Expression>::Index Index;
+ using Evaluator = TensorEvaluator<Expression, ThreadPoolDevice>;
+ using StorageIndex = typename traits<Expression>::Index;
- static const int NumDims = traits<Expression>::NumDimensions;
+ static const int NumDims = traits<Expression>::NumDimensions;
- typedef TensorBlock<Scalar, Index, NumDims, Evaluator::Layout> TensorBlock;
- typedef TensorBlockMapper<Scalar, Index, NumDims, Evaluator::Layout>
- TensorBlockMapper;
+ static inline void run(const Expression& expr,
+ const ThreadPoolDevice& device) {
+ using TensorBlock =
+ TensorBlock<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout>;
+ using TensorBlockMapper =
+ TensorBlockMapper<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout>;
Evaluator evaluator(expr, device);
- std::size_t total_size = array_prod(evaluator.dimensions());
- std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
+ StorageIndex total_size = array_prod(evaluator.dimensions());
+ StorageIndex cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
if (total_size < cache_size) {
// TODO(andydavis) Reduce block management overhead for small tensors.
internal::TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
@@ -276,7 +278,7 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr
const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
if (needs_assign) {
TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims;
- size_t block_total_size = 0;
+ Index block_total_size = 0;
// Query expression tree for desired block size/shape.
std::vector<internal::TensorOpResourceRequirements> resources;
evaluator.getResourceRequirements(&resources);
@@ -296,15 +298,16 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr
void* buf = device.allocate((num_threads + 1) * aligned_blocksize);
device.parallelFor(
block_mapper.total_block_count(), cost * block_size,
- [=, &device, &evaluator, &block_mapper](Index first, Index last) {
+ [=, &device, &evaluator, &block_mapper](StorageIndex first,
+ StorageIndex last) {
// currentThreadId() returns -1 if called from a thread not in the
- // threadpool, such as the main thread dispatching Eigen
+ // thread pool, such as the main thread dispatching Eigen
// expressions.
const int thread_idx = device.currentThreadId();
eigen_assert(thread_idx >= -1 && thread_idx < num_threads);
Scalar* thread_buf = reinterpret_cast<Scalar*>(
static_cast<char*>(buf) + aligned_blocksize * (thread_idx + 1));
- for (Index i = first; i < last; ++i) {
+ for (StorageIndex i = first; i < last; ++i) {
auto block = block_mapper.GetBlockForIndex(i, thread_buf);
evaluator.evalBlock(&block);
}
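The scratch buffer above holds num_threads + 1 blocks; because currentThreadId() returns -1 on a thread outside the pool, thread_idx + 1 maps the main thread to slot 0 and pool threads 0..num_threads-1 to slots 1..num_threads, so no two callers share a block. A small worked example with assumed sizes:

    // Assumed: num_threads == 4, aligned_blocksize == 4096 bytes.
    for (int thread_idx = -1; thread_idx < 4; ++thread_idx) {
      const long offset = 4096L * (thread_idx + 1);  // 0, 4096, 8192, 12288, 16384
    }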
@@ -324,51 +327,51 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr
template <typename Expression, bool Vectorizable, bool Tileable>
class TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable> {
public:
- typedef typename Expression::Index Index;
+ typedef typename Expression::Index StorageIndex;
static void run(const Expression& expr, const GpuDevice& device);
};
#if defined(EIGEN_GPUCC)
-template <typename Evaluator, typename Index, bool Vectorizable>
+template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EigenMetaKernelEval {
static __device__ EIGEN_ALWAYS_INLINE
- void run(Evaluator& eval, Index first, Index last, Index step_size) {
- for (Index i = first; i < last; i += step_size) {
+ void run(Evaluator& eval, StorageIndex first, StorageIndex last, StorageIndex step_size) {
+ for (StorageIndex i = first; i < last; i += step_size) {
eval.evalScalar(i);
}
}
};
-template <typename Evaluator, typename Index>
-struct EigenMetaKernelEval<Evaluator, Index, true> {
+template <typename Evaluator, typename StorageIndex>
+struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
static __device__ EIGEN_ALWAYS_INLINE
- void run(Evaluator& eval, Index first, Index last, Index step_size) {
- const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
- const Index vectorized_size = (last / PacketSize) * PacketSize;
- const Index vectorized_step_size = step_size * PacketSize;
+ void run(Evaluator& eval, StorageIndex first, StorageIndex last, StorageIndex step_size) {
+ const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+ const StorageIndex vectorized_size = (last / PacketSize) * PacketSize;
+ const StorageIndex vectorized_step_size = step_size * PacketSize;
// Use the vector path
- for (Index i = first * PacketSize; i < vectorized_size;
+ for (StorageIndex i = first * PacketSize; i < vectorized_size;
i += vectorized_step_size) {
eval.evalPacket(i);
}
- for (Index i = vectorized_size + first; i < last; i += step_size) {
+ for (StorageIndex i = vectorized_size + first; i < last; i += step_size) {
eval.evalScalar(i);
}
}
};
-template <typename Evaluator, typename Index>
+template <typename Evaluator, typename StorageIndex>
__global__ void
__launch_bounds__(1024)
-EigenMetaKernel(Evaluator eval, Index size) {
+EigenMetaKernel(Evaluator eval, StorageIndex size) {
- const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
- const Index step_size = blockDim.x * gridDim.x;
+ const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
+ const StorageIndex step_size = blockDim.x * gridDim.x;
const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
- EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size);
+ EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(eval, first_index, size, step_size);
}
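EigenMetaKernel uses a grid-stride loop: a thread starts at first_index and advances by step_size = blockDim.x * gridDim.x, so the grid covers [0, size) even when it holds fewer threads than there are coefficients. A host-side sketch (plain C++, not device code) that simulates the same partitioning on the scalar path:

    // Every i in [0, size) is visited by exactly one simulated thread.
    void grid_stride_sketch(long size, long total_threads) {
      for (long t = 0; t < total_threads; ++t)           // one pass per "thread"
        for (long i = t; i < size; i += total_threads)   // its strided slice
          ;                                              // eval.evalScalar(i)
    }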
/*static*/
@@ -382,12 +385,12 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable>::run(
const int block_size = device.maxGpuThreadsPerBlock();
const int max_blocks = device.getNumGpuMultiProcessors() *
device.maxGpuThreadsPerMultiProcessor() / block_size;
- const Index size = array_prod(evaluator.dimensions());
+ const StorageIndex size = array_prod(evaluator.dimensions());
// Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
LAUNCH_GPU_KERNEL(
- (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
+ (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
num_blocks, block_size, 0, device, evaluator, size);
}
evaluator.cleanup();
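The launch size above is num_blocks = max(min(max_blocks, divup(size, block_size)), 1). A worked example with assumed device numbers (block_size = 1024 threads, 20 multiprocessors at 2048 threads each, so max_blocks = 40): for size = 100000, divup gives 98 blocks, which the min clamps to 40, and the outer max only matters for size == 0, where it keeps a single block so the launch does not fail.

    // divup(100000, 1024) == 98; min(40, 98) == 40; max(40, 1) == 40 blocks launched.
    static_assert((100000 + 1024 - 1) / 1024 == 98, "divup(100000, 1024)");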