// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Manjunath Kudlur
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H

#if defined(EIGEN_USE_GPU)

namespace Eigen {
namespace internal {

// Executor specializations (non-vectorized and vectorized) for assigning the
// result of a reduction to an output expression on the GPU.
template <typename OutExpr, typename InExpr, typename Op, typename Dims,
          bool Tileable>
class TensorExecutor<
    const TensorAssignOp<OutExpr,
                         TensorReductionOp<Op, Dims, const InExpr> const>,
    GpuDevice, false, Tileable> {
 public:
  typedef const TensorAssignOp<
      OutExpr, TensorReductionOp<Op, Dims, const InExpr> const> Expression;
  static void run(const Expression& expr, const GpuDevice& device);
};

template <typename OutExpr, typename InExpr, typename Op, typename Dims,
          bool Tileable>
class TensorExecutor<
    const TensorAssignOp<OutExpr,
                         TensorReductionOp<Op, Dims, const InExpr> const>,
    GpuDevice, true, Tileable> {
 public:
  typedef const TensorAssignOp<
      OutExpr, TensorReductionOp<Op, Dims, const InExpr> const> Expression;
  static void run(const Expression& expr, const GpuDevice& device);
};

// Executor specializations for evaluating a reduction into a preallocated
// buffer (TensorEvalToOp) on the GPU.
template <typename InExpr, typename Op, typename Dims, bool Tileable>
class TensorExecutor<
    const TensorEvalToOp<const TensorReductionOp<Op, Dims, const InExpr> >,
    GpuDevice, false, Tileable> {
 public:
  typedef const TensorEvalToOp<
      const TensorReductionOp<Op, Dims, const InExpr> > Expression;
  static void run(const Expression& expr, const GpuDevice& device);
};

template <typename InExpr, typename Op, typename Dims, bool Tileable>
class TensorExecutor<
    const TensorEvalToOp<const TensorReductionOp<Op, Dims, const InExpr> >,
    GpuDevice, true, Tileable> {
 public:
  typedef const TensorEvalToOp<
      const TensorReductionOp<Op, Dims, const InExpr> > Expression;
  static void run(const Expression& expr, const GpuDevice& device);
};

}  // end namespace internal
}  // end namespace Eigen

#if defined(__CUDACC__)

namespace Eigen {
namespace internal {

namespace {

#define DIVUP(x, y) (((x) + (y) - 1) / (y))

// Initialize output[0..size-1] with val
template <typename Output>
__global__ void InitVector(const float val, int size, Output output) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  for (int i = idx; i < size; i += gridDim.x * blockDim.x) {
    output.coeffRef(i) = val;
  }
}

// -----------------------------------------------------------------------------
// Column Reduction kernels
// -----------------------------------------------------------------------------
template <int GRID_DIM, int BLOCK_DIM, int NUM_PER_THREAD, typename Input,
          typename Output, typename Reducer>
__global__ void ColumnReduceKernel(Reducer reducer, const Input input,
                                   int rows, int cols, Output output) {
  assert(blockDim.x == BLOCK_DIM);
  assert(blockDim.y == 1);
  assert(blockDim.z == 1);

  assert(gridDim.x == GRID_DIM);
  assert(gridDim.y == 1);
  assert(gridDim.z == 1);

  typedef typename Input::Index Index;

  const Index num_input_points = DIVUP(rows, NUM_PER_THREAD) * cols;
  const int bx = blockIdx.x;
  const int tx = threadIdx.x;
  for (Index i = bx * BLOCK_DIM + tx; i < num_input_points;
       i += BLOCK_DIM * GRID_DIM) {
    const Index input_col = i % cols;
    const Index input_row_begin =
        ((i / cols) % DIVUP(rows, NUM_PER_THREAD)) * NUM_PER_THREAD;
    float reduced_val = reducer.bottom_value();
    for (int j = 0; j < NUM_PER_THREAD; ++j) {
      float val = ((input_col < cols) && (input_row_begin + j < rows))
                      ? input.coeff((input_row_begin + j) * cols + input_col)
                      : reducer.bottom_value();
      reduced_val = reducer(reduced_val, val);
    }
#if __CUDA_ARCH__ >= 300
    reducer.atomic_reduce(&output.coeffRef(input_col), reduced_val);
#endif
  }
}

// -----------------------------------------------------------------------------
// Row Reduction kernels
// -----------------------------------------------------------------------------
template <int GRID_DIM, int BLOCK_DIM, int NUM_PER_THREAD, typename Input,
          typename Output, typename Reducer>
__global__ void RowReduceKernel(Reducer reducer, const Input input, int rows,
                                int cols, Output output) {
  assert(BLOCK_DIM % 32 == 0);
  assert(blockDim.x == BLOCK_DIM);
  assert(blockDim.y == 1);
  assert(blockDim.z == 1);

  assert(gridDim.x == GRID_DIM);
  assert(gridDim.y == 1);
  assert(gridDim.z == 1);

  const int unroll_times = 16;
  assert(NUM_PER_THREAD % unroll_times == 0);

  typedef typename Input::Index Index;

  __shared__ float temp[BLOCK_DIM];

  const Index input_col_blocks = DIVUP(cols, BLOCK_DIM * NUM_PER_THREAD);
  const Index num_input_blocks = input_col_blocks * rows;

  const int bx = blockIdx.x;
  const int tx = threadIdx.x;

  for (Index i = bx; i < num_input_blocks; i += GRID_DIM) {
    const Index col_block = i % input_col_blocks;
    const Index row_block = i / input_col_blocks;
    const Index col_begin = col_block * BLOCK_DIM * NUM_PER_THREAD + tx;
    const Index row = row_block;
    float reduced_val = reducer.bottom_value();
    if (row < rows) {
      for (Index j = 0; j < NUM_PER_THREAD; j += unroll_times) {
        const Index last_col = col_begin + BLOCK_DIM * (j + unroll_times - 1);
        if (last_col >= cols) {
          // We can skip the last iteration of the loop since we know
          // that col >= cols there.
#pragma unroll
          for (int k = 0; k < unroll_times - 1; ++k) {
            const Index col = col_begin + BLOCK_DIM * (j + k);
            const float val = (col < cols ? input.coeff(row * cols + col)
                                          : reducer.bottom_value());
            reduced_val = reducer(reduced_val, val);
          }
          break;  // col >= cols for all later iterations.
        } else {
          // Faster version of the loop with no branches after unrolling.
#pragma unroll
          for (int k = 0; k < unroll_times; ++k) {
            const Index col = col_begin + BLOCK_DIM * (j + k);
            reduced_val = reducer(reduced_val, input.coeff(row * cols + col));
          }
        }
      }
    }

    temp[tx] = reduced_val;

    __syncthreads();
    const int warp_id = tx & 31;
    if (warp_id < 16) temp[tx] = reducer(temp[tx], temp[tx + 16]);
    if (warp_id < 8) temp[tx] = reducer(temp[tx], temp[tx + 8]);
    if (warp_id < 4) temp[tx] = reducer(temp[tx], temp[tx + 4]);
    if (warp_id < 2) temp[tx] = reducer(temp[tx], temp[tx + 2]);
    if (warp_id < 1) temp[tx] = reducer(temp[tx], temp[tx + 1]);

    if (warp_id == 0) {
      if (row < rows) {
#if __CUDA_ARCH__ >= 300
        reducer.atomic_reduce(&output.coeffRef(row), temp[tx]);
#endif
      }
    }

    __syncthreads();
  }
}

template <typename Input, typename Output, typename Reducer>
void ColumnReduceCuda(Reducer reducer, const GpuDevice& device,
                      const Input input, int rows, int cols, Output output) {
  const int block_size = 256;
  const int grid_size = 128;
  const int num_per_thread = 16;
  LAUNCH_CUDA_KERNEL(InitVector<Output>, 32, 1024, 0, device,
                     reducer.bottom_value(), cols, output);
  LAUNCH_CUDA_KERNEL((ColumnReduceKernel<grid_size, block_size, num_per_thread,
                                         Input, Output, Reducer>),
                     grid_size, block_size, 0, device, reducer, input, rows,
                     cols, output);
}

template <typename Input, typename Output, typename Reducer>
void RowReduceCuda(Reducer reducer, const GpuDevice& device, const Input input,
                   int rows, int cols, Output output) {
  const int block_size = 256;
  const int grid_size = 32;
  const int num_per_thread = 128;
  LAUNCH_CUDA_KERNEL(InitVector<Output>, 32, 1024, 0, device,
                     reducer.bottom_value(), rows, output);
  LAUNCH_CUDA_KERNEL((RowReduceKernel<grid_size, block_size, num_per_thread,
                                      Input, Output, Reducer>),
                     grid_size, block_size, 0, device, reducer, input, rows,
                     cols, output);
}

// Provides arbitrary sum reductions, applying a function across the
// right argument being reduced prior to summing
template <typename F>
struct FnSumReducer {
  __host__ __device__ FnSumReducer(F f) : f_(f) {}
  __host__ __device__ float bottom_value() { return 0.0f; }
  __device__ float operator()(float x, float y) const { return x + f_(y); }
  __device__ void atomic_reduce(float* x, float y) const { atomicAdd(x, y); }

  F f_;
};

// Identity is used for the basic SumReduction
struct Identity {
  __device__ float operator()(float x) const { return x; }
};

struct CudaSumReducer : FnSumReducer<Identity> {
  __host__ __device__ CudaSumReducer() : FnSumReducer(Identity()) {}
};

struct CudaMaxReducer {
  // nvcc doesn't recognize numeric_limits<float>::lowest for some reason.
  CudaMaxReducer() {
    bottom_value_ = -3.40282347E+38F;  // std::numeric_limits<float>::lowest();
  }
  __host__ __device__ float bottom_value() { return bottom_value_; }
  __device__ float operator()(float x, float y) const { return fmax(x, y); }
  // This is equivalent to atomicMax(x, y), but CUDA does not have atomicMax
  // for the float data type. Instead, this atomically compares-and-swaps the
  // old value at x with y. If the old value returned by the CAS operation was
  // already larger than y, or than what was read before, it declares success
  // and finishes; otherwise it repeats the procedure.
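  // Note: atomicCAS has no float overload, so the float bits are reinterpreted
  // as a 32-bit unsigned int for the compare-and-swap below.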
  __device__ void atomic_reduce(float* x, float y) {
    unsigned int old_val = *reinterpret_cast<unsigned int*>(x);
    while (*reinterpret_cast<float*>(&old_val) < y) {
      unsigned int current_val =
          atomicCAS(reinterpret_cast<unsigned int*>(x), old_val,
                    *reinterpret_cast<unsigned int*>(&y));
      if (old_val == current_val) {
        break;
      }
      old_val = current_val;
    }
  }
  float bottom_value_;
};

}  // end namespace

template <typename Op>
struct IsFloatSumReduction {
  static const bool value = false;
};

template <>
struct IsFloatSumReduction<SumReducer<float> > {
  static const bool value = true;
};

template <typename Op>
struct IsFloatMaxReduction {
  static const bool value = false;
};

template <>
struct IsFloatMaxReduction<MaxReducer<float> > {
  static const bool value = true;
};

template <typename Op>
struct SumOrMaxOfFloat {
  static const bool value =
      IsFloatSumReduction<Op>::value || IsFloatMaxReduction<Op>::value;
};

enum ReductionType { ROW_REDUCE, COL_REDUCE, UNOPTIMIZED };

// Decides whether the reduction expression maps onto the optimized row/column
// kernels above, and if so computes the 2D (rows, cols) shape to use.
template <typename Op, typename Expr, typename ReductionExpr>
ReductionType GetReductionType(const Expr& expr,
                               const ReductionExpr& reduction_expr,
                               const GpuDevice& device, std::size_t* rows,
                               std::size_t* cols) {
  typedef TensorEvaluator<const Expr, GpuDevice> EvalExpr;
  typedef TensorEvaluator<const ReductionExpr, GpuDevice> ReductionEvalExpr;

  if (device.majorDeviceVersion() < 3) {
    return UNOPTIMIZED;
  }
  const EvalExpr eval_expr(expr, device);

  // We only have fast reductions for sum/max of float.
  if (!SumOrMaxOfFloat<Op>::value) {
    return UNOPTIMIZED;
  }

  // For sum/max of float, if we are doing a full reduction, we can
  // use the ROW_REDUCE optimization.
  if (ReductionEvalExpr::NumReducedDims == ReductionEvalExpr::NumInputDims) {
    *rows = 1;
    *cols = array_prod(eval_expr.dimensions());
    return ROW_REDUCE;
  }

  if (ReductionEvalExpr::NumReducedDims > 1) {
    return UNOPTIMIZED;
  }

  const int dim = reduction_expr.dims()[0];
  if (static_cast<int>(ReductionEvalExpr::Layout) ==
      static_cast<int>(RowMajor)) {
    if (dim == ReductionEvalExpr::NumInputDims - 1) {
      *rows = array_prod(eval_expr.dimensions()) /
              eval_expr.dimensions()[ReductionEvalExpr::NumInputDims - 1];
      *cols = eval_expr.dimensions()[ReductionEvalExpr::NumInputDims - 1];
      if (*cols < 32) return UNOPTIMIZED;
      return ROW_REDUCE;
    } else if (dim == 0) {
      *rows = eval_expr.dimensions()[0];
      *cols = array_prod(eval_expr.dimensions()) / eval_expr.dimensions()[0];
      if (*rows < 32) return UNOPTIMIZED;
      return COL_REDUCE;
    }
  } else if (static_cast<int>(ReductionEvalExpr::Layout) ==
             static_cast<int>(ColMajor)) {
    if (dim == ReductionEvalExpr::NumInputDims - 1) {
      *rows = eval_expr.dimensions()[ReductionEvalExpr::NumInputDims - 1];
      *cols = array_prod(eval_expr.dimensions()) /
              eval_expr.dimensions()[ReductionEvalExpr::NumInputDims - 1];
      if (*rows < 32) return UNOPTIMIZED;
      return COL_REDUCE;
    } else if (dim == 0) {
      *rows = array_prod(eval_expr.dimensions()) / eval_expr.dimensions()[0];
      *cols = eval_expr.dimensions()[0];
      if (*cols < 32) return UNOPTIMIZED;
      return ROW_REDUCE;
    }
  }
  return UNOPTIMIZED;
}

template <typename Expression, typename Index, bool Vectorizable>
struct LaunchKernel;

template <typename Expression, typename Index>
struct LaunchKernel<Expression, Index, true> {
  static void launch(int num_blocks, int block_size, const GpuDevice& device,
                     const TensorEvaluator<Expression, GpuDevice>& evaluator,
                     Index size) {
    LAUNCH_CUDA_KERNEL(
        (EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>,
                                      Index>),
        num_blocks, block_size, 0, device, evaluator, size);
  }
};

template <typename Expression, typename Index>
struct LaunchKernel<Expression, Index, false> {
  static void launch(int num_blocks, int block_size, const GpuDevice& device,
                     const TensorEvaluator<Expression, GpuDevice>& evaluator,
                     Index size) {
    LAUNCH_CUDA_KERNEL(
        (EigenMetaKernel_NonVectorizable<
            TensorEvaluator<Expression, GpuDevice>, Index>),
        num_blocks, block_size, 0, device, evaluator, size);
  }
};

template <typename LHS, typename RHS, typename F, bool Compatible>
struct LaunchRowReduce;

template <typename LHS, typename RHS, typename F>
struct LaunchRowReduce<LHS, RHS, F, true> {
  static void launch(const GpuDevice& device, RHS input, std::size_t rows,
                     std::size_t cols, LHS output) {
    RowReduceCuda(F(), device, input, rows, cols, output);
  }
};
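// The `false` specializations of LaunchRowReduce/LaunchColReduce below are
// intentional no-ops: they keep the runtime-dispatched branches in the
// executor helpers compilable when the reduction is not a float sum/max,
// in which case no optimized kernel is ever launched.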
template <typename LHS, typename RHS, typename F>
struct LaunchRowReduce<LHS, RHS, F, false> {
  static void launch(const GpuDevice& device, RHS input, std::size_t rows,
                     std::size_t cols, LHS output) {}
};

template <typename LHS, typename RHS, typename F, bool Compatible>
struct LaunchColReduce;

template <typename LHS, typename RHS, typename F>
struct LaunchColReduce<LHS, RHS, F, true> {
  static void launch(const GpuDevice& device, RHS input, std::size_t rows,
                     std::size_t cols, LHS output) {
    ColumnReduceCuda(F(), device, input, rows, cols, output);
  }
};

template <typename LHS, typename RHS, typename F>
struct LaunchColReduce<LHS, RHS, F, false> {
  static void launch(const GpuDevice& device, RHS input, std::size_t rows,
                     std::size_t cols, LHS output) {}
};

template <typename Expression, typename Device, bool Vectorizable>
class TensorAssignExecutorHelper;

template <typename OutExpr, typename InExpr, typename Op, typename Dims,
          bool Vectorizable>
class TensorAssignExecutorHelper<
    const TensorAssignOp<OutExpr,
                         TensorReductionOp<Op, Dims, const InExpr> const>,
    GpuDevice, Vectorizable> {
 public:
  typedef const TensorAssignOp<
      OutExpr, TensorReductionOp<Op, Dims, const InExpr> const> Expression;

  typedef typename Expression::Index Index;

  typedef TensorEvaluator<OutExpr, GpuDevice> LHSEval;
  typedef TensorEvaluator<const InExpr, GpuDevice> RHSEval;

  static inline void run(const Expression& expr, const GpuDevice& device) {
    std::size_t rows, cols;
    const ReductionType reduction_type =
        GetReductionType<Op>(expr.rhsExpression().expression(),
                             expr.rhsExpression(), device, &rows, &cols);
    if (reduction_type == UNOPTIMIZED) {
      TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
      const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
      if (needs_assign) {
        const int num_blocks = device.getNumCudaMultiProcessors() *
                               device.maxCudaThreadsPerMultiProcessor() /
                               device.maxCudaThreadsPerBlock();
        const int block_size = device.maxCudaThreadsPerBlock();
        const Index size = array_prod(evaluator.dimensions());
        LaunchKernel<Expression, Index, Vectorizable>::launch(
            num_blocks, block_size, device, evaluator, size);
      }
      evaluator.cleanup();
    } else {
      LHSEval output(expr.lhsExpression(), device);
      RHSEval input(expr.rhsExpression().expression(), device);
      bool lhs_needs_assign = output.evalSubExprsIfNeeded(NULL);
      bool rhs_needs_assign = input.evalSubExprsIfNeeded(NULL);
      if (lhs_needs_assign && rhs_needs_assign) {
        const bool Compatible =
            IsFloatSumReduction<Op>::value || IsFloatMaxReduction<Op>::value;
        if (reduction_type == ROW_REDUCE) {
          if (IsFloatSumReduction<Op>::value) {
            LaunchRowReduce<LHSEval, RHSEval, CudaSumReducer,
                            Compatible>::launch(device, input, rows, cols,
                                                output);
          } else if (IsFloatMaxReduction<Op>::value) {
            LaunchRowReduce<LHSEval, RHSEval, CudaMaxReducer,
                            Compatible>::launch(device, input, rows, cols,
                                                output);
          } else {
            // Unsupported reduction type
            assert(false && "Unsupported reduction function for ROW_REDUCE");
          }
        } else {
          if (IsFloatSumReduction<Op>::value) {
            LaunchColReduce<LHSEval, RHSEval, CudaSumReducer,
                            Compatible>::launch(device, input, rows, cols,
                                                output);
          } else if (IsFloatMaxReduction<Op>::value) {
            LaunchColReduce<LHSEval, RHSEval, CudaMaxReducer,
                            Compatible>::launch(device, input, rows, cols,
                                                output);
          } else {
            // Unsupported reduction type
            assert(false && "Unsupported reduction function for COL_REDUCE");
          }
        }
      }
      input.cleanup();
      output.cleanup();
    }
  }
};

template <typename OutExpr, typename InExpr, typename Op, typename Dims,
          bool Tileable>
inline void TensorExecutor<
    const TensorAssignOp<OutExpr,
                         TensorReductionOp<Op, Dims, const InExpr> const>,
    GpuDevice, false, Tileable>::run(const Expression& expr,
                                     const GpuDevice& device) {
  TensorAssignExecutorHelper<
      const TensorAssignOp<OutExpr,
                           TensorReductionOp<Op, Dims, const InExpr> const>,
      GpuDevice, false>::run(expr, device);
}

template <typename OutExpr, typename InExpr, typename Op, typename Dims,
          bool Tileable>
inline void TensorExecutor<
    const TensorAssignOp<OutExpr,
                         TensorReductionOp<Op, Dims, const InExpr> const>,
    GpuDevice, true, Tileable>::run(const Expression& expr,
                                    const GpuDevice& device) {
  TensorAssignExecutorHelper<
      const TensorAssignOp<OutExpr,
                           TensorReductionOp<Op, Dims, const InExpr> const>,
      GpuDevice, true>::run(expr, device);
}

template <typename T, typename Index>
struct PtrWrapper {
  EIGEN_DEVICE_FUNC PtrWrapper(T* ptr) : m_ptr(ptr) {}
  EIGEN_DEVICE_FUNC T& coeffRef(Index i) { return *(m_ptr + i); }
  T* m_ptr;
};

template <typename Expression, typename Device, bool Vectorizable>
class TensorEvalToExecutorHelper;

template <typename InExpr, typename Op, typename Dims, bool Vectorizable>
class TensorEvalToExecutorHelper<
    const TensorEvalToOp<const TensorReductionOp<Op, Dims, const InExpr> >,
    GpuDevice, Vectorizable> {
 public:
  typedef const TensorEvalToOp<
      const TensorReductionOp<Op, Dims, const InExpr> > Expression;

  typedef typename Expression::Index Index;

  typedef TensorEvaluator<const InExpr, GpuDevice> RHSEval;

  static inline void run(const Expression& expr, const GpuDevice& device) {
    std::size_t rows, cols;
    const ReductionType reduction_type =
        GetReductionType<Op>(expr.expression().expression(), expr.expression(),
                             device, &rows, &cols);
    if (reduction_type == UNOPTIMIZED) {
      TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
      const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
      if (needs_assign) {
        const int num_blocks = device.getNumCudaMultiProcessors() *
                               device.maxCudaThreadsPerMultiProcessor() /
                               device.maxCudaThreadsPerBlock();
        const int block_size = device.maxCudaThreadsPerBlock();
        const Index size = array_prod(evaluator.dimensions());
        LaunchKernel<Expression, Index, Vectorizable>::launch(
            num_blocks, block_size, device, evaluator, size);
      }
      evaluator.cleanup();
    } else {
      typedef typename internal::remove_const<
          typename Expression::Scalar>::type Scalar;
      PtrWrapper<Scalar, Index> output(expr.buffer());
      TensorEvaluator<const InExpr, GpuDevice> input(
          expr.expression().expression(), device);
      typedef PtrWrapper<Scalar, Index> LHSEval;
      typedef TensorEvaluator<const InExpr, GpuDevice> RHSEval;
      bool rhs_needs_assign = input.evalSubExprsIfNeeded(NULL);
      if (rhs_needs_assign) {
        const bool Compatible =
            IsFloatSumReduction<Op>::value || IsFloatMaxReduction<Op>::value;
        if (reduction_type == ROW_REDUCE) {
          if (IsFloatSumReduction<Op>::value) {
            LaunchRowReduce<LHSEval, RHSEval, CudaSumReducer,
                            Compatible>::launch(device, input, rows, cols,
                                                output);
          } else if (IsFloatMaxReduction<Op>::value) {
            LaunchRowReduce<LHSEval, RHSEval, CudaMaxReducer,
                            Compatible>::launch(device, input, rows, cols,
                                                output);
          }
        } else {
          if (IsFloatSumReduction<Op>::value) {
            LaunchColReduce<LHSEval, RHSEval, CudaSumReducer,
                            Compatible>::launch(device, input, rows, cols,
                                                output);
          } else if (IsFloatMaxReduction<Op>::value) {
            LaunchColReduce<LHSEval, RHSEval, CudaMaxReducer,
                            Compatible>::launch(device, input, rows, cols,
                                                output);
          }
        }
      }
      input.cleanup();
    }
  }
};

template <typename InExpr, typename Op, typename Dims, bool Tileable>
inline void TensorExecutor<
    const TensorEvalToOp<const TensorReductionOp<Op, Dims, const InExpr> >,
    GpuDevice, false, Tileable>::run(const Expression& expr,
                                     const GpuDevice& device) {
  TensorEvalToExecutorHelper<
      const TensorEvalToOp<const TensorReductionOp<Op, Dims, const InExpr> >,
      GpuDevice, false>::run(expr, device);
}

template <typename InExpr, typename Op, typename Dims, bool Tileable>
inline void TensorExecutor<
    const TensorEvalToOp<const TensorReductionOp<Op, Dims, const InExpr> >,
    GpuDevice, true, Tileable>::run(const Expression& expr,
                                    const GpuDevice& device) {
  TensorEvalToExecutorHelper<
      const TensorEvalToOp<const TensorReductionOp<Op, Dims, const InExpr> >,
      GpuDevice, true>::run(expr, device);
}

}  // end namespace internal
}  // end namespace Eigen

#endif  // __CUDACC__
#endif  // EIGEN_USE_GPU
#endif  // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
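// Usage sketch (illustrative only, not part of this header): a float sum/max
// reduction assigned on a GpuDevice is the kind of expression that routes
// through the TensorExecutor specializations above. The device pointers
// `d_in`/`d_out` and the sizes below are placeholders.
//
//   Eigen::CudaStreamDevice stream;
//   Eigen::GpuDevice gpu_device(&stream);
//   Eigen::TensorMap<Eigen::Tensor<float, 2> > in(d_in, 128, 256);
//   Eigen::TensorMap<Eigen::Tensor<float, 1> > out(d_out, 256);
//   Eigen::array<int, 1> reduce_dim;
//   reduce_dim[0] = 0;
//   out.device(gpu_device) = in.sum(reduce_dim);   // or in.maximum(reduce_dim)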