author     Gael Guennebaud <g.gael@free.fr>   2018-07-13 16:04:27 +0200
committer  Gael Guennebaud <g.gael@free.fr>   2018-07-13 16:04:27 +0200
commit     06eb24cf4d7d54e56abfb37ea062a7cb0c887550
tree       a25c3aeb41414fc3f8bebee82a94c5d798dbb7ec /unsupported/Eigen/CXX11/src/Tensor
parent     5fd03ddbfb91a6d641903229ed1428bc82756c4f
Introduce gpu_assert for assertions in device code, and disable them when compiling with clang-cuda.
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor')

 unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
 unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
 unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
 unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
 unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
 unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h

6 files changed, 44 insertions(+), 43 deletions(-)
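The heart of the change is the hunk in TensorGpuHipCudaDefines.h below; as a standalone sketch, the dispatch it introduces looks like this. On HIP device compilation, and under clang-cuda (which defines EIGEN_CUDACC but reports EIGEN_CUDACC_VER as 0, the test this patch uses to detect it), device-side assert is unavailable, so gpu_assert expands to nothing; everywhere else it forwards to the standard assert.

    // Sketch of the gpu_assert dispatch, as introduced by this commit.
    #if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDACC) && (EIGEN_CUDACC_VER == 0))
    #define gpu_assert(COND)               // no device-side assert available
    #else
    #define gpu_assert(COND) assert(COND)  // forward to the standard assert
    #endif

The rest of the patch is the mechanical conversion of GPU-related assert call sites to gpu_assert.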
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 3110887e1..25131600d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -903,7 +903,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
         }
         const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
-        assert(shared_mem <= maxSharedMem);
+        gpu_assert(shared_mem <= maxSharedMem);
 
         const int num_x_blocks = ceil(numX, maxX);
         const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
@@ -960,7 +960,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
         block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);
         const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
-        assert(shared_mem <= maxSharedMem);
+        gpu_assert(shared_mem <= maxSharedMem);
 
         const int num_x_blocks = ceil(numX, maxX);
         const int num_y_blocks = ceil(numY, maxY);
@@ -1040,7 +1040,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
         dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
         const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
-        assert(shared_mem <= maxSharedMem);
+        gpu_assert(shared_mem <= maxSharedMem);
 
         //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
         const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
index da88bcb3b..65403905a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
@@ -352,7 +352,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
       size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y;
       m_device.parallel_for_setup(numX, numP, tileSize_x,tileSize_y,range_x,range_y, GRange_x, GRange_y );
       const size_t shared_mem =(tileSize_x +kernel_size -1)*(tileSize_y);
-      assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
+      gpu_assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
       auto global_range=cl::sycl::range<2>(GRange_x, GRange_y);  // global range
       auto local_range=cl::sycl::range<2>(tileSize_x, tileSize_y);  // local range
       InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh);
@@ -377,7 +377,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
       size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y, range_z, GRange_z, tileSize_z;
       m_device.parallel_for_setup(numX, numY, numP, tileSize_x, tileSize_y, tileSize_z, range_x, range_y, range_z, GRange_x, GRange_y, GRange_z );
       const size_t shared_mem =(tileSize_x +kernel_size_x -1)*(tileSize_y +kernel_size_y -1) * tileSize_z;
-      assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
+      gpu_assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
       auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z);  // global range
       auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z);  // local range
       InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh);
@@ -408,7 +408,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
       size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y, range_z, GRange_z, tileSize_z;
       m_device.parallel_for_setup(numX, numY, numZ, tileSize_x, tileSize_y, tileSize_z, range_x, range_y, range_z, GRange_x, GRange_y, GRange_z );
       const size_t shared_mem =(tileSize_x +kernel_size_x -1)*(tileSize_y +kernel_size_y -1) * (tileSize_z +kernel_size_y -1);
-      assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
+      gpu_assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
       auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z);  // global range
       auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z);  // local range
       InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
index 64ef32793..0c036833f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
@@ -68,7 +68,7 @@ static void initializeDeviceProp() {
         std::cerr << "Failed to get the number of GPU devices: "
                   << gpuGetErrorString(status)
                   << std::endl;
-        assert(status == gpuSuccess);
+        gpu_assert(status == gpuSuccess);
       }
       m_deviceProperties = new gpuDeviceProp_t[num_devices];
       for (int i = 0; i < num_devices; ++i) {
@@ -79,7 +79,7 @@ static void initializeDeviceProp() {
                     << ": "
                     << gpuGetErrorString(status)
                     << std::endl;
-          assert(status == gpuSuccess);
+          gpu_assert(status == gpuSuccess);
         }
       }
 
@@ -124,8 +124,8 @@ class GpuStreamDevice : public StreamInterface {
       int num_devices;
       gpuError_t err = gpuGetDeviceCount(&num_devices);
       EIGEN_UNUSED_VARIABLE(err)
-      assert(err == gpuSuccess);
-      assert(device < num_devices);
+      gpu_assert(err == gpuSuccess);
+      gpu_assert(device < num_devices);
       device_ = device;
     }
     initializeDeviceProp();
@@ -144,20 +144,20 @@ class GpuStreamDevice : public StreamInterface {
   virtual void* allocate(size_t num_bytes) const {
     gpuError_t err = gpuSetDevice(device_);
     EIGEN_UNUSED_VARIABLE(err)
-    assert(err == gpuSuccess);
+    gpu_assert(err == gpuSuccess);
     void* result;
     err = gpuMalloc(&result, num_bytes);
-    assert(err == gpuSuccess);
-    assert(result != NULL);
+    gpu_assert(err == gpuSuccess);
+    gpu_assert(result != NULL);
     return result;
   }
   virtual void deallocate(void* buffer) const {
     gpuError_t err = gpuSetDevice(device_);
     EIGEN_UNUSED_VARIABLE(err)
-    assert(err == gpuSuccess);
-    assert(buffer != NULL);
+    gpu_assert(err == gpuSuccess);
+    gpu_assert(buffer != NULL);
     err = gpuFree(buffer);
-    assert(err == gpuSuccess);
+    gpu_assert(err == gpuSuccess);
   }
 
   virtual void* scratchpad() const {
@@ -173,7 +173,7 @@ class GpuStreamDevice : public StreamInterface {
       semaphore_ = reinterpret_cast<unsigned int*>(scratch);
       gpuError_t err = gpuMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
       EIGEN_UNUSED_VARIABLE(err)
-      assert(err == gpuSuccess);
+      gpu_assert(err == gpuSuccess);
     }
     return semaphore_;
   }
@@ -220,7 +220,7 @@ struct GpuDevice {
     gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToDevice,
                                     stream_->stream());
     EIGEN_UNUSED_VARIABLE(err)
-    assert(err == gpuSuccess);
+    gpu_assert(err == gpuSuccess);
 #else
     EIGEN_UNUSED_VARIABLE(dst);
     EIGEN_UNUSED_VARIABLE(src);
@@ -233,21 +233,21 @@ struct GpuDevice {
     gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyHostToDevice,
                                     stream_->stream());
     EIGEN_UNUSED_VARIABLE(err)
-    assert(err == gpuSuccess);
+    gpu_assert(err == gpuSuccess);
   }
 
   EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
     gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToHost,
                                     stream_->stream());
     EIGEN_UNUSED_VARIABLE(err)
-    assert(err == gpuSuccess);
+    gpu_assert(err == gpuSuccess);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
 #ifndef EIGEN_GPU_COMPILE_PHASE
     gpuError_t err = gpuMemsetAsync(buffer, c, n, stream_->stream());
     EIGEN_UNUSED_VARIABLE(err)
-    assert(err == gpuSuccess);
+    gpu_assert(err == gpuSuccess);
 #else
     eigen_assert(false && "The default device should be used instead to generate kernel code");
 #endif
@@ -276,10 +276,10 @@ struct GpuDevice {
       std::cerr << "Error detected in GPU stream: "
                 << gpuGetErrorString(err)
                 << std::endl;
-      assert(err == gpuSuccess);
+      gpu_assert(err == gpuSuccess);
     }
 #else
-    assert(false && "The default device should be used instead to generate kernel code");
+    gpu_assert(false && "The default device should be used instead to generate kernel code");
 #endif
   }
 
@@ -326,13 +326,13 @@ struct GpuDevice {
 #define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)             \
   hipLaunchKernelGGL(kernel, dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), __VA_ARGS__); \
-  assert(hipGetLastError() == hipSuccess);
+  gpu_assert(hipGetLastError() == hipSuccess);
 
 #else
 
 #define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)             \
   (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__);  \
-  assert(cudaGetLastError() == cudaSuccess);
+  gpu_assert(cudaGetLastError() == cudaSuccess);
 
 #endif
@@ -342,7 +342,7 @@ static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig co
 #ifndef EIGEN_GPU_COMPILE_PHASE
   gpuError_t status = gpuDeviceSetSharedMemConfig(config);
   EIGEN_UNUSED_VARIABLE(status)
-  assert(status == gpuSuccess);
+  gpu_assert(status == gpuSuccess);
 #else
   EIGEN_UNUSED_VARIABLE(config)
 #endif
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
index 9966955f7..5438ebe71 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
@@ -78,10 +78,11 @@
 #endif
 
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-// HIPCC does not support the use of assert on the GPU side.
-#undef assert
-#define assert(COND)
+#if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDACC) && (EIGEN_CUDACC_VER==0))
+// clang-cuda and HIPCC do not support the use of assert on the GPU side.
+#define gpu_assert(COND)
+#else
+#define gpu_assert(COND) assert(COND)
 #endif
 
 #endif  // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
index 5a547141a..787cbd031 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
@@ -19,7 +19,7 @@ EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
 #if defined(EIGEN_GPU_COMPILE_PHASE)
   // We don't support 3d kernels since we currently only use 1 and
   // 2d kernels.
-  assert(threadIdx.z == 0);
+  gpu_assert(threadIdx.z == 0);
   return clock64() +
       blockIdx.x * blockDim.x + threadIdx.x +
       gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
index a691e530a..cd20df505 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
@@ -60,10 +60,10 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
       }
     }
   }
   else {
-    assert(0 && "Wordsize not supported");
+    gpu_assert(0 && "Wordsize not supported");
   }
 #else // EIGEN_CUDA_ARCH >= 300
-  assert(0 && "Shouldn't be called on unsupported device");
+  gpu_assert(0 && "Shouldn't be called on unsupported device");
 #endif // EIGEN_CUDA_ARCH >= 300
 }
@@ -105,7 +105,7 @@ __device__ inline void atomicReduce(float* output, float accum, SumReducer<float
 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
   atomicAdd(output, accum);
 #else // EIGEN_CUDA_ARCH >= 300
-  assert(0 && "Shouldn't be called on unsupported device");
+  gpu_assert(0 && "Shouldn't be called on unsupported device");
 #endif // EIGEN_CUDA_ARCH >= 300
 }
@@ -196,7 +196,7 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
 #endif
   }
 #else // EIGEN_CUDA_ARCH >= 300
-  assert(0 && "Shouldn't be called on unsupported device");
+  gpu_assert(0 && "Shouldn't be called on unsupported device");
 #endif // EIGEN_CUDA_ARCH >= 300
 }
@@ -304,7 +304,7 @@ __global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2
 template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
 struct FullReductionLauncher {
   static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
-    assert(false && "Should only be called on doubles, floats and half floats");
+    gpu_assert(false && "Should only be called on doubles, floats and half floats");
   }
 };
@@ -337,7 +337,7 @@ struct FullReductionLauncher<
 template <typename Self, typename Op>
 struct FullReductionLauncher<Self, Op, Eigen::half, false> {
   static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
-    assert(false && "Should not be called since there is no packet accessor");
+    gpu_assert(false && "Should not be called since there is no packet accessor");
   }
 };
@@ -388,7 +388,7 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
   template <typename OutputType>
   static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
-    assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
+    gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
     const Index num_coeffs = array_prod(self.m_impl.dimensions());
     // Don't crash when we're called with an input tensor of size 0.
     if (num_coeffs == 0) {
@@ -479,7 +479,7 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
     }
   }
 #else // EIGEN_CUDA_ARCH >= 300
-  assert(0 && "Shouldn't be called on unsupported device");
+  gpu_assert(0 && "Shouldn't be called on unsupported device");
 #endif // EIGEN_CUDA_ARCH >= 300
 }
@@ -601,7 +601,7 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
 template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
 struct InnerReductionLauncher {
   static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
-    assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
+    gpu_assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
     return true;
   }
 };
@@ -648,7 +648,7 @@ struct InnerReductionLauncher<
 template <typename Self, typename Op>
 struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
   static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
-    assert(false && "Should not be called since there is no packet accessor");
+    gpu_assert(false && "Should not be called since there is no packet accessor");
     return true;
   }
 };
@@ -709,7 +709,7 @@ struct InnerReducer<Self, Op, GpuDevice> {
   template <typename OutputType>
   static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
-    assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
+    gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
     const Index num_coeffs = array_prod(self.m_impl.dimensions());
     // Don't crash when we're called with an input tensor of size 0.
     if (num_coeffs == 0) {
@@ -777,7 +777,7 @@ struct OuterReducer<Self, Op, GpuDevice> {
   EIGEN_DEVICE_FUNC
 #endif
   bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
-    assert(false && "Should only be called to reduce doubles or floats on a gpu device");
+    gpu_assert(false && "Should only be called to reduce doubles or floats on a gpu device");
     return true;
   }
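For illustration only, here is a minimal host-side sketch of the call-site pattern the patch applies throughout TensorDeviceGpu.h. The function name gpu_allocate is invented for this example, and plain CUDA runtime calls stand in for Eigen's gpu* wrappers; it is not code from the commit.

    #include <cassert>
    #include <cstddef>
    #include <cuda_runtime.h>

    // Host-side fallback, matching the #else branch of the macro above.
    #ifndef gpu_assert
    #define gpu_assert(COND) assert(COND)
    #endif

    // Hypothetical allocator mirroring GpuStreamDevice::allocate: each CUDA
    // runtime call is checked through gpu_assert rather than a bare assert,
    // so the checks expand to nothing in configurations (HIP device code,
    // clang-cuda) where assert cannot be used.
    void* gpu_allocate(size_t num_bytes) {
      void* result = nullptr;
      cudaError_t err = cudaMalloc(&result, num_bytes);
      gpu_assert(err == cudaSuccess);
      gpu_assert(result != NULL);
      return result;
    }

Routing every GPU-side check through one macro keeps the call sites unchanged across CUDA, HIP, and SYCL builds while concentrating the compiler-specific policy in a single #if in TensorGpuHipCudaDefines.h.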