Diffstat (limited to 'third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h')
-rw-r--r-- third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 642
1 file changed, 642 insertions(+), 0 deletions(-)
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
new file mode 100644
index 0000000000..d052dcdf69
--- /dev/null
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -0,0 +1,642 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Manjunath Kudlur <keveman@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
+#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
+
+#if defined(EIGEN_USE_GPU)
+
+namespace Eigen {
+namespace internal {
+
+template <typename OutExpr, typename InExpr, typename Op, typename Indices,
+          bool Tileable>
+class TensorExecutor<
+    const TensorAssignOp<
+        OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>,
+    GpuDevice, false, Tileable> {
+ public:
+  typedef const TensorAssignOp<
+      OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>
+      Expression;
+  static void run(const Expression& expr, const GpuDevice& device);
+};
+
+template <typename OutExpr, typename InExpr, typename Op, typename Indices,
+          bool Tileable>
+class TensorExecutor<
+    const TensorAssignOp<
+        OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>,
+    GpuDevice, true, Tileable> {
+ public:
+  typedef const TensorAssignOp<
+      OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>
+      Expression;
+  static void run(const Expression& expr, const GpuDevice& device);
+};
+
+template <typename InExpr, typename Op, typename Indices, bool Tileable>
+class TensorExecutor<const TensorEvalToOp<const TensorReductionOp<
+                         Op, const Indices, const InExpr> >,
+                     GpuDevice, false, Tileable> {
+ public:
+  typedef const TensorEvalToOp<
+      const TensorReductionOp<Op, const Indices, const InExpr> > Expression;
+  static void run(const Expression& expr, const GpuDevice& device);
+};
+
+template <typename InExpr, typename Op, typename Indices, bool Tileable>
+class TensorExecutor<const TensorEvalToOp<const TensorReductionOp<
+                         Op, const Indices, const InExpr> >,
+                     GpuDevice, true, Tileable> {
+ public:
+  typedef const TensorEvalToOp<
+      const TensorReductionOp<Op, const Indices, const InExpr> > Expression;
+  static void run(const Expression& expr, const GpuDevice& device);
+};
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#if defined(__CUDACC__)
+
+namespace Eigen {
+
+namespace internal {
+
+namespace {
+
+#define DIVUP(x, y) (((x) + (y)-1) / (y))
+
+// Initialize output[0..size-1] with val
+template <typename Output>
+__global__ void InitVector(const float val, int size, Output output) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  for (int i = idx; i < size; i += gridDim.x * blockDim.x) {
+    output.coeffRef(i) = val;
+  }
+}
+
+// -----------------------------------------------------------------------------
+// Column Reduction kernels
+// -----------------------------------------------------------------------------
+template <int GRID_DIM, int BLOCK_DIM, int NUM_PER_THREAD, typename Input,
+          typename Output, typename Reducer>
+__global__ void ColumnReduceKernel(Reducer reducer, const Input input,
+                                   int rows, int cols, Output output) {
+  assert(blockDim.x == BLOCK_DIM);
+  assert(blockDim.y == 1);
+  assert(blockDim.z == 1);
+
+  assert(gridDim.x == GRID_DIM);
+  assert(gridDim.y == 1);
+  assert(gridDim.z == 1);
+
+  typedef typename Input::Index Index;
+
+  const Index num_input_points = DIVUP(rows, NUM_PER_THREAD) * cols;
+  const int bx = blockIdx.x;
+  const int tx = threadIdx.x;
+
+  for (Index i = bx * BLOCK_DIM + tx; i < num_input_points;
+       i += BLOCK_DIM * GRID_DIM) {
+    const Index input_col = i % cols;
+    const Index input_row_begin =
+        ((i / cols) % DIVUP(rows, NUM_PER_THREAD)) * NUM_PER_THREAD;
+    float reduced_val = reducer.bottom_value();
+    for (int j = 0; j < NUM_PER_THREAD; ++j) {
+      float val = ((input_col < cols) && (input_row_begin + j < rows))
+                      ? input.coeff((input_row_begin + j) * cols + input_col)
+                      : reducer.bottom_value();
+      reduced_val = reducer(reduced_val, val);
+    }
+#if __CUDA_ARCH__ >= 300
+    reducer.atomic_reduce(&output.coeffRef(input_col), reduced_val);
+#endif
+  }
+}
+
+// -----------------------------------------------------------------------------
+// Row Reduction kernels
+// -----------------------------------------------------------------------------
+template <int GRID_DIM, int BLOCK_DIM, int NUM_PER_THREAD, typename Input,
+          typename Output, typename Reducer>
+__global__ void RowReduceKernel(Reducer reducer, const Input input, int rows,
+                                int cols, Output output) {
+  assert(BLOCK_DIM % 32 == 0);
+  assert(blockDim.x == BLOCK_DIM);
+  assert(blockDim.y == 1);
+  assert(blockDim.z == 1);
+
+  assert(gridDim.x == GRID_DIM);
+  assert(gridDim.y == 1);
+  assert(gridDim.z == 1);
+
+  const int unroll_times = 16;
+  assert(NUM_PER_THREAD % unroll_times == 0);
+
+  typedef typename Input::Index Index;
+
+  __shared__ float temp[BLOCK_DIM];
+
+  const Index input_col_blocks = DIVUP(cols, BLOCK_DIM * NUM_PER_THREAD);
+  const Index num_input_blocks = input_col_blocks * rows;
+
+  const int bx = blockIdx.x;
+  const int tx = threadIdx.x;
+
+  for (Index i = bx; i < num_input_blocks; i += GRID_DIM) {
+    const Index col_block = i % input_col_blocks;
+    const Index row_block = i / input_col_blocks;
+    const Index col_begin = col_block * BLOCK_DIM * NUM_PER_THREAD + tx;
+    const Index row = row_block;
+    float reduced_val = reducer.bottom_value();
+    if (row < rows) {
+      for (Index j = 0; j < NUM_PER_THREAD; j += unroll_times) {
+        const Index last_col = col_begin + BLOCK_DIM * (j + unroll_times - 1);
+        if (last_col >= cols) {
+          // We can skip the last iteration of the loop since we know
+          // that col >= cols there.
+#pragma unroll
+          for (int k = 0; k < unroll_times - 1; ++k) {
+            const Index col = col_begin + BLOCK_DIM * (j + k);
+            const float val = (col < cols ? input.coeff(row * cols + col)
+                                          : reducer.bottom_value());
+            reduced_val = reducer(reduced_val, val);
+          }
+          break;  // col >= cols for all later iterations.
+        } else {
+          // Faster version of the loop with no branches after unrolling.
+#pragma unroll
+          for (int k = 0; k < unroll_times; ++k) {
+            const Index col = col_begin + BLOCK_DIM * (j + k);
+            reduced_val = reducer(reduced_val, input.coeff(row * cols + col));
+          }
+        }
+      }
+    }
+    temp[tx] = reduced_val;
+
+    __syncthreads();
+    const int warp_id = tx & 31;
+    if (warp_id < 16) temp[tx] = reducer(temp[tx], temp[tx + 16]);
+    if (warp_id < 8) temp[tx] = reducer(temp[tx], temp[tx + 8]);
+    if (warp_id < 4) temp[tx] = reducer(temp[tx], temp[tx + 4]);
+    if (warp_id < 2) temp[tx] = reducer(temp[tx], temp[tx + 2]);
+    if (warp_id < 1) temp[tx] = reducer(temp[tx], temp[tx + 1]);
+
+    if (warp_id == 0) {
+      if (row < rows) {
+#if __CUDA_ARCH__ >= 300
+        reducer.atomic_reduce(&output.coeffRef(row), temp[tx]);
+#endif
+      }
+    }
+
+    __syncthreads();
+  }
+}
+
+template <typename Input, typename Output, typename Reducer>
+void ColumnReduceCuda(Reducer reducer, const GpuDevice& device,
+                      const Input input, int rows, int cols, Output output) {
+  const int block_size = 256;
+  const int grid_size = 128;
+  const int num_per_thread = 16;
+  LAUNCH_CUDA_KERNEL(InitVector, 32, 1024, 0, device, reducer.bottom_value(),
+                     cols, output);
+  LAUNCH_CUDA_KERNEL(
+      (ColumnReduceKernel<grid_size, block_size, num_per_thread>), grid_size,
+      block_size, 0, device, reducer, input, rows, cols, output);
+}
+
+template <typename Input, typename Output, typename Reducer>
+void RowReduceCuda(Reducer reducer, const GpuDevice& device, const Input input,
+                   int rows, int cols, Output output) {
+  const int block_size = 256;
+  const int grid_size = 32;
+  const int num_per_thread = 128;
+  LAUNCH_CUDA_KERNEL(InitVector, 32, 1024, 0, device, reducer.bottom_value(),
+                     rows, output);
+  LAUNCH_CUDA_KERNEL((RowReduceKernel<grid_size, block_size, num_per_thread>),
+                     grid_size, block_size, 0, device, reducer, input, rows,
+                     cols, output);
+}
+
+// Provides arbitrary sum reductions, applying a function to the
+// right-hand argument before summing.
+template <typename F>
+struct FnSumReducer {
+  __host__ __device__ FnSumReducer(F f) : f_(f) {}
+  __host__ __device__ float bottom_value() { return 0.0f; }
+  __device__ float operator()(float x, float y) const { return x + f_(y); }
+  __device__ void atomic_reduce(float* x, float y) const { atomicAdd(x, y); }
+
+  F f_;
+};
+
+// Identity is used for the basic SumReduction
+struct Identity {
+  __device__ float operator()(float x) const { return x; }
+};
+
+struct CudaSumReducer : FnSumReducer<Identity> {
+  __host__ __device__ CudaSumReducer() : FnSumReducer(Identity()) {}
+};
+
+struct CudaMaxReducer {
+  // nvcc doesn't recognize numeric_limits<float>::lowest for some reason.
+  CudaMaxReducer() {
+    bottom_value_ = -3.40282347E+38F;  // std::numeric_limits<float>::lowest();
+  }
+  __host__ __device__ float bottom_value() { return bottom_value_; }
+  __device__ float operator()(float x, float y) const { return fmax(x, y); }
+
+  // This is equivalent to atomicMax(x, y), but CUDA does not have atomicMax
+  // for the float data type. Instead, this atomically compare-and-swaps y
+  // into *x. If the value at x (either as originally read or as returned by
+  // the CAS) is already larger than y, it declares success and finishes;
+  // otherwise it retries the CAS against the freshly returned value.
+  __device__ void atomic_reduce(float* x, float y) {
+    unsigned int old_val = *reinterpret_cast<unsigned int*>(x);
+    while (*reinterpret_cast<float*>(&old_val) < y) {
+      unsigned int current_val =
+          atomicCAS(reinterpret_cast<unsigned int*>(x), old_val,
+                    *reinterpret_cast<unsigned int*>(&y));
+      if (old_val == current_val) {
+        break;
+      }
+      old_val = current_val;
+    }
+  }
+  float bottom_value_;
+};
+
+}  // end namespace
+
+template <typename Op>
+struct IsFloatSumReduction {
+  static const bool value = false;
+};
+
+template <>
+struct IsFloatSumReduction<SumReducer<float> > {
+  static const bool value = true;
+};
+
+template <typename Op>
+struct IsFloatMaxReduction {
+  static const bool value = false;
+};
+
+template <>
+struct IsFloatMaxReduction<MaxReducer<float> > {
+  static const bool value = true;
+};
+
+template <typename Op>
+struct SumOrMaxOfFloat {
+  static const bool value =
+      IsFloatSumReduction<Op>::value || IsFloatMaxReduction<Op>::value;
+};
+
+enum ReductionType { ROW_REDUCE, COL_REDUCE, UNOPTIMIZED };
+
+template <typename Op, typename Expr, typename ReductionExpr>
+ReductionType GetReductionType(const Expr& expr,
+                               const ReductionExpr& reduction_expr,
+                               const GpuDevice& device, std::size_t* rows,
+                               std::size_t* cols) {
+  typedef TensorEvaluator<const Expr, GpuDevice> EvalExpr;
+  typedef TensorEvaluator<const ReductionExpr, GpuDevice> ReductionEvalExpr;
+
+  if (device.majorDeviceVersion() < 3) {
+    return UNOPTIMIZED;
+  }
+  const EvalExpr eval_expr(expr, device);
+
+  // We only have fast reductions for sum/max of float.
+  if (!SumOrMaxOfFloat<Op>::value) {
+    return UNOPTIMIZED;
+  }
+
+  // For sum/max of float, if we are doing a full reduction, we can
+  // use the ROW_REDUCE optimization.
+  if (ReductionEvalExpr::NumReducedDims == ReductionEvalExpr::NumInputDims) {
+    *rows = 1;
+    *cols = array_prod(eval_expr.dimensions());
+    return ROW_REDUCE;
+  }
+
+  if (ReductionEvalExpr::NumReducedDims > 1) {
+    return UNOPTIMIZED;
+  }
+
+  const int dim = reduction_expr.dims()[0];
+  if (static_cast<int>(ReductionEvalExpr::Layout) ==
+      static_cast<int>(RowMajor)) {
+    if (dim == ReductionEvalExpr::NumInputDims - 1) {
+      *rows = array_prod(eval_expr.dimensions()) /
+              eval_expr.dimensions()[ReductionEvalExpr::NumInputDims - 1];
+      *cols = eval_expr.dimensions()[ReductionEvalExpr::NumInputDims - 1];
+      if (*cols < 32) return UNOPTIMIZED;
+      return ROW_REDUCE;
+    } else if (dim == 0) {
+      *rows = eval_expr.dimensions()[0];
+      *cols = array_prod(eval_expr.dimensions()) / eval_expr.dimensions()[0];
+      if (*rows < 32) return UNOPTIMIZED;
+      return COL_REDUCE;
+    }
+  } else if (static_cast<int>(ReductionEvalExpr::Layout) ==
+             static_cast<int>(ColMajor)) {
+    if (dim == ReductionEvalExpr::NumInputDims - 1) {
+      *rows = eval_expr.dimensions()[ReductionEvalExpr::NumInputDims - 1];
+      *cols = array_prod(eval_expr.dimensions()) /
+              eval_expr.dimensions()[ReductionEvalExpr::NumInputDims - 1];
+      if (*rows < 32) return UNOPTIMIZED;
+      return COL_REDUCE;
+    } else if (dim == 0) {
+      *rows = array_prod(eval_expr.dimensions()) / eval_expr.dimensions()[0];
+      *cols = eval_expr.dimensions()[0];
+      if (*cols < 32) return UNOPTIMIZED;
+      return ROW_REDUCE;
+    }
+  }
+  return UNOPTIMIZED;
+}
+
+template <typename Expression, typename Index, bool Vectorizable>
+struct LaunchKernel;
+
+template <typename Expression, typename Index>
+struct LaunchKernel<Expression, Index, true> {
+  static void launch(int num_blocks, int block_size, const GpuDevice& device,
+                     const TensorEvaluator<Expression, GpuDevice>& evaluator,
+                     Index size) {
+    LAUNCH_CUDA_KERNEL(
+        (EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>,
+                                      Index>),
+        num_blocks, block_size, 0, device, evaluator, size);
+  }
+};
+
+template <typename Expression, typename Index>
+struct LaunchKernel<Expression, Index, false> {
+  static void launch(int num_blocks, int block_size, const GpuDevice& device,
+                     const TensorEvaluator<Expression, GpuDevice>& evaluator,
+                     Index size) {
+    LAUNCH_CUDA_KERNEL(
+        (EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>,
+                                         Index>),
+        num_blocks, block_size, 0, device, evaluator, size);
+  }
+};
+
+template <typename F, typename LHS, typename RHS, bool Compatible>
+struct LaunchRowReduce;
+
+template <typename F, typename LHS, typename RHS>
+struct LaunchRowReduce<F, LHS, RHS, true> {
+  static void launch(const GpuDevice& device, RHS input, std::size_t rows,
+                     std::size_t cols, LHS output) {
+    RowReduceCuda(F(), device, input, rows, cols, output);
+  }
+};
+
+template <typename F, typename LHS, typename RHS>
+struct LaunchRowReduce<F, LHS, RHS, false> {
+  static void launch(const GpuDevice& device, RHS input, std::size_t rows,
+                     std::size_t cols, LHS output) {}
+};
+
+template <typename F, typename LHS, typename RHS, bool Compatible>
+struct LaunchColReduce;
+
+template <typename F, typename LHS, typename RHS>
+struct LaunchColReduce<F, LHS, RHS, true> {
+  static void launch(const GpuDevice& device, RHS input, std::size_t rows,
+                     std::size_t cols, LHS output) {
+    ColumnReduceCuda(F(), device, input, rows, cols, output);
+  }
+};
+
+template <typename F, typename LHS, typename RHS>
+struct LaunchColReduce<F, LHS, RHS, false> {
+  static void launch(const GpuDevice& device, RHS input, std::size_t rows,
+                     std::size_t cols, LHS output) {}
+};
+
+template <typename Expression, typename Device, bool Vectorizable>
+class TensorAssignExecutorHelper;
+
+template <typename OutExpr, typename InExpr, typename Op, typename Indices,
+          bool Vectorizable>
+class TensorAssignExecutorHelper<
+    const TensorAssignOp<
+        OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>,
+    GpuDevice, Vectorizable> {
+ public:
+  typedef const TensorAssignOp<
+      OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>
+      Expression;
+
+  typedef typename Expression::Index Index;
+  typedef TensorEvaluator<OutExpr, GpuDevice> LHSEval;
+  typedef TensorEvaluator<const InExpr, GpuDevice> RHSEval;
+  static inline void run(const Expression& expr, const GpuDevice& device) {
+    std::size_t rows, cols;
+    const ReductionType reduction_type =
+        GetReductionType<Op>(expr.rhsExpression().expression(),
+                             expr.rhsExpression(), device, &rows, &cols);
+    if (reduction_type == UNOPTIMIZED) {
+      TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
+      const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+      if (needs_assign) {
+        const int num_blocks = device.getNumCudaMultiProcessors() *
+                               device.maxCudaThreadsPerMultiProcessor() /
+                               device.maxCudaThreadsPerBlock();
+        const int block_size = device.maxCudaThreadsPerBlock();
+        const Index size = array_prod(evaluator.dimensions());
+        LaunchKernel<Expression, Index, Vectorizable>::launch(
+            num_blocks, block_size, device, evaluator, size);
+      }
+      evaluator.cleanup();
+    } else {
+      LHSEval output(expr.lhsExpression(), device);
+      RHSEval input(expr.rhsExpression().expression(), device);
+      bool lhs_needs_assign = output.evalSubExprsIfNeeded(NULL);
+      bool rhs_needs_assign = input.evalSubExprsIfNeeded(NULL);
+      if (lhs_needs_assign && rhs_needs_assign) {
+        const bool Compatible =
+            IsFloatSumReduction<Op>::value || IsFloatMaxReduction<Op>::value;
+        if (reduction_type == ROW_REDUCE) {
+          if (IsFloatSumReduction<Op>::value) {
+            LaunchRowReduce<CudaSumReducer, LHSEval, RHSEval,
+                            Compatible>::launch(device, input, rows, cols,
+                                                output);
+          } else if (IsFloatMaxReduction<Op>::value) {
+            LaunchRowReduce<CudaMaxReducer, LHSEval, RHSEval,
+                            Compatible>::launch(device, input, rows, cols,
+                                                output);
+          } else {
+            // Unsupported reduction type
+            assert(false && "Unsupported reduction function for ROW_REDUCE");
+          }
+        } else {
+          if (IsFloatSumReduction<Op>::value) {
+            LaunchColReduce<CudaSumReducer, LHSEval, RHSEval,
+                            Compatible>::launch(device, input, rows, cols,
+                                                output);
+          } else if (IsFloatMaxReduction<Op>::value) {
+            LaunchColReduce<CudaMaxReducer, LHSEval, RHSEval,
+                            Compatible>::launch(device, input, rows, cols,
+                                                output);
+          } else {
+            // Unsupported reduction type
+            assert(false && "Unsupported reduction function for COL_REDUCE");
+          }
+        }
+      }
+      input.cleanup();
+      output.cleanup();
+    }
+  }
+};
+
+template <typename OutExpr, typename InExpr, typename Op, typename Indices,
+          bool Tileable>
+inline void TensorExecutor<
+    const TensorAssignOp<
+        OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>,
+    GpuDevice, false, Tileable>::run(const Expression& expr,
+                                     const GpuDevice& device) {
+  TensorAssignExecutorHelper<
+      const TensorAssignOp<
+          OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>,
+      GpuDevice, false>::run(expr, device);
+}
+
+template <typename OutExpr, typename InExpr, typename Op, typename Indices,
+          bool Tileable>
+inline void TensorExecutor<
+    const TensorAssignOp<
+        OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>,
+    GpuDevice, true, Tileable>::run(const Expression& expr,
+                                    const GpuDevice& device) {
+  TensorAssignExecutorHelper<
+      const TensorAssignOp<
+          OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>,
+      GpuDevice, true>::run(expr, device);
+}
+
+template <typename T, typename Index>
+struct PtrWrapper {
+  EIGEN_DEVICE_FUNC PtrWrapper(T* ptr) : m_ptr(ptr) {}
+  EIGEN_DEVICE_FUNC T& coeffRef(Index i) { return *(m_ptr + i); }
+  T* m_ptr;
+};
+
+template <typename Expression, typename Device, bool Vectorizable>
+class TensorEvalToExecutorHelper;
+
+template <typename InExpr, typename Op, typename Indices, bool Vectorizable>
+class TensorEvalToExecutorHelper<const TensorEvalToOp<const TensorReductionOp<
+                                     Op, const Indices, const InExpr> >,
+                                 GpuDevice, Vectorizable> {
+ public:
+  typedef const TensorEvalToOp<const TensorReductionOp<
+      Op, const Indices, const InExpr> > Expression;
+  typedef typename Expression::Index Index;
+  typedef TensorEvaluator<const InExpr, GpuDevice> RHSEval;
+
+  static inline void run(const Expression& expr, const GpuDevice& device) {
+    std::size_t rows, cols;
+    const ReductionType reduction_type =
+        GetReductionType<Op>(expr.expression().expression(), expr.expression(),
+                             device, &rows, &cols);
+    if (reduction_type == UNOPTIMIZED) {
+      TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
+      const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+      if (needs_assign) {
+        const int num_blocks = device.getNumCudaMultiProcessors() *
+                               device.maxCudaThreadsPerMultiProcessor() /
+                               device.maxCudaThreadsPerBlock();
+        const int block_size = device.maxCudaThreadsPerBlock();
+        const Index size = array_prod(evaluator.dimensions());
+        LaunchKernel<Expression, Index, Vectorizable>::launch(
+            num_blocks, block_size, device, evaluator, size);
+      }
+      evaluator.cleanup();
+    } else {
+      typedef typename internal::remove_const<typename Expression::Scalar>::type Scalar;
+      PtrWrapper<Scalar, Index> output(expr.buffer());
+      TensorEvaluator<const InExpr, GpuDevice> input(
+          expr.expression().expression(), device);
+      typedef PtrWrapper<Scalar, Index> LHSEval;
+      typedef TensorEvaluator<const InExpr, GpuDevice> RHSEval;
+      bool rhs_needs_assign = input.evalSubExprsIfNeeded(NULL);
+      if (rhs_needs_assign) {
+        const bool Compatible =
+            IsFloatSumReduction<Op>::value || IsFloatMaxReduction<Op>::value;
+        if (reduction_type == ROW_REDUCE) {
+          if (IsFloatSumReduction<Op>::value) {
+            LaunchRowReduce<CudaSumReducer, LHSEval, RHSEval,
+                            Compatible>::launch(device, input, rows, cols,
+                                                output);
+          } else if (IsFloatMaxReduction<Op>::value) {
+            LaunchRowReduce<CudaMaxReducer, LHSEval, RHSEval,
+                            Compatible>::launch(device, input, rows, cols,
+                                                output);
+          }
+        } else {
+          if (IsFloatSumReduction<Op>::value) {
+            LaunchColReduce<CudaSumReducer, LHSEval, RHSEval,
+                            Compatible>::launch(device, input, rows, cols,
+                                                output);
+          } else if (IsFloatMaxReduction<Op>::value) {
+            LaunchColReduce<CudaMaxReducer, LHSEval, RHSEval,
+                            Compatible>::launch(device, input, rows, cols,
+                                                output);
+          }
+        }
+      }
+      input.cleanup();
+    }
+  }
+};
+
+template <typename InExpr, typename Op, typename Indices, bool Tileable>
+inline void
+TensorExecutor<const TensorEvalToOp<
+                   const TensorReductionOp<Op, const Indices, const InExpr> >,
+               GpuDevice, false, Tileable>::run(const Expression& expr,
+                                                const GpuDevice& device) {
+  TensorEvalToExecutorHelper<const TensorEvalToOp<const TensorReductionOp<
+                                 Op, const Indices, const InExpr> >,
+                             GpuDevice, false>::run(expr, device);
+}
+
+template <typename InExpr, typename Op, typename Indices, bool Tileable>
+inline void
+TensorExecutor<const TensorEvalToOp<
+                   const TensorReductionOp<Op, const Indices, const InExpr> >,
+               GpuDevice, true, Tileable>::run(const Expression& expr,
+                                               const GpuDevice& device) {
+  TensorEvalToExecutorHelper<const TensorEvalToOp<const TensorReductionOp<
+                                 Op, const Indices, const InExpr> >,
+                             GpuDevice, true>::run(expr, device);
+}
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // __CUDACC__
+#endif  // EIGEN_USE_GPU
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
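
Both reduction paths in this patch combine per-block partial results into the output buffer with atomics, which is why ColumnReduceCuda and RowReduceCuda each launch InitVector first: the output must already hold the reduction's identity (0.0f for sum, -FLT_MAX for max) before any atomic lands. A minimal standalone sketch of the same grid-stride idiom over a raw pointer (FillKernel is a hypothetical name; the file's version goes through the evaluator's coeffRef interface):

// Each thread starts at its global index and advances by the total
// number of threads in the grid, so any launch shape covers any size.
__global__ void FillKernel(float val, int size, float* out) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  for (int i = idx; i < size; i += gridDim.x * blockDim.x) {
    out[i] = val;
  }
}

// Usage, mirroring the 32 x 1024 InitVector launch above:
//   FillKernel<<<32, 1024>>>(0.0f, n, d_out);              // sum identity
//   FillKernel<<<32, 1024>>>(-3.40282347E+38F, n, d_out);  // max identity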
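ColumnReduceKernel flattens its work into a single grid-stride loop: each work item covers NUM_PER_THREAD consecutive rows of one column, and the flat index is decoded with the modulo/divide arithmetic shown in the kernel. A small host-side sketch of that decoding, under the same conventions (DecomposeColumnWorkItem is a hypothetical helper, not part of the file):

#include <cassert>

// Decode ColumnReduceKernel's flat work index i into a (column,
// starting-row) pair; each work item spans num_per_thread rows.
inline void DecomposeColumnWorkItem(long i, int rows, int cols,
                                    int num_per_thread, int* col,
                                    int* row_begin) {
  const int row_blocks = (rows + num_per_thread - 1) / num_per_thread;  // DIVUP
  assert(i < static_cast<long>(row_blocks) * cols);
  *col = static_cast<int>(i % cols);
  *row_begin = static_cast<int>(((i / cols) % row_blocks) * num_per_thread);
}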
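RowReduceKernel's shared-memory step reduces each 32-thread segment of temp[] with a five-level tree and no __syncthreads() between levels, relying on the warp-synchronous execution of pre-Volta GPUs (the same assumption behind the __CUDA_ARCH__ >= 300 guards). The same step in isolation, specialized to a sum for readability (a sketch, not the file's generic-reducer version):

// Reduce one 32-element segment of shared memory into its first lane.
// volatile keeps the compiler from caching temp[] in registers, which
// the implicit warp-synchronous scheme depends on.
__device__ float WarpSegmentSum(volatile float* temp, int tx) {
  const int lane = tx & 31;  // same role as warp_id in the kernel
  if (lane < 16) temp[tx] += temp[tx + 16];
  if (lane < 8)  temp[tx] += temp[tx + 8];
  if (lane < 4)  temp[tx] += temp[tx + 4];
  if (lane < 2)  temp[tx] += temp[tx + 2];
  if (lane < 1)  temp[tx] += temp[tx + 1];
  return temp[tx];  // meaningful on lane 0, matching the warp_id == 0 check
}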
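CudaMaxReducer::atomic_reduce emulates the float atomicMax that CUDA lacks by compare-and-swapping on the value's bit pattern. A standalone version of the same idea, using the __float_as_int/__int_as_float intrinsics instead of pointer casts (AtomicMaxFloat is a hypothetical name; behavior matches for non-NaN inputs):

// Atomically set *addr = max(*addr, value).
__device__ void AtomicMaxFloat(float* addr, float value) {
  int old_bits = __float_as_int(*addr);
  while (__int_as_float(old_bits) < value) {
    const int assumed = old_bits;
    old_bits = atomicCAS(reinterpret_cast<int*>(addr), assumed,
                         __float_as_int(value));
    if (old_bits == assumed) break;  // our value was installed
    // Otherwise another thread raced us; loop and compare against its value.
  }
}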
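Seen from the user's side of the Eigen Tensor API, GetReductionType means: reducing the innermost dimension of a row-major 2-D tensor takes the ROW_REDUCE path, reducing the outermost dimension takes COL_REDUCE, a full reduction is treated as a single row, and everything else (more than one reduced dimension short of a full reduction, non-float scalars, reducers other than sum/max, reduced extents under 32, compute capability below 3.0) falls back to the generic kernel. A hedged host-side illustration; the shapes follow the code above, and the GpuDevice plumbing that would actually route these through this file is omitted:

#include <unsupported/Eigen/CXX11/Tensor>

void ClassifyExamples() {
  Eigen::Tensor<float, 2, Eigen::RowMajor> t(1024, 512);
  Eigen::array<int, 1> dim0{{0}};  // outermost dim: COL_REDUCE candidate
  Eigen::array<int, 1> dim1{{1}};  // innermost dim: ROW_REDUCE candidate

  Eigen::Tensor<float, 1, Eigen::RowMajor> col_sums = t.sum(dim0);
  Eigen::Tensor<float, 1, Eigen::RowMajor> row_maxes = t.maximum(dim1);
  Eigen::Tensor<float, 0, Eigen::RowMajor> total = t.sum();  // full: ROW_REDUCE
}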