diff options
author | Benoit Steiner <benoit.steiner.goog@gmail.com> | 2014-07-08 16:30:48 -0700
---|---|---
committer | Benoit Steiner <benoit.steiner.goog@gmail.com> | 2014-07-08 16:30:48 -0700
commit | c285fda7f40ca161e6c8e66481d9a68e50613c48 (patch) |
tree | 46d8f2bebd2e5264bda969f00a4483a2a8d076f3 /unsupported/Eigen/CXX11/src/Tensor |
parent | 7d53633e05986c61ce90e7fc36862d529c0cc036 (diff) |
Extended the functionality of the TensorDeviceType classes
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor')
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h | 59 |
1 file changed, 56 insertions, 3 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h index 142edda14..b9c8c19fe 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h @@ -21,6 +21,12 @@ struct DefaultDevice { EIGEN_STRONG_INLINE void deallocate(void* buffer) const { internal::aligned_free(buffer); } + EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { + ::memcpy(dst, src, n); + } + EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { + ::memset(buffer, c, n); + } }; @@ -28,7 +34,7 @@ struct DefaultDevice { // We should really use a thread pool here but first we need to find a portable thread pool library. #ifdef EIGEN_USE_THREADS struct ThreadPoolDevice { - ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : /*pool_(pool), */num_threads_(num_cores) { } + ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : /*pool_(pool), */num_threads_(num_cores) { } size_t numThreads() const { return num_threads_; } EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { @@ -37,6 +43,12 @@ struct ThreadPoolDevice { EIGEN_STRONG_INLINE void deallocate(void* buffer) const { internal::aligned_free(buffer); } + EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { + ::memcpy(dst, src, n); + } + EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { + ::memset(buffer, c, n); + } private: // todo: NUMA, ... 
@@ -47,20 +59,61 @@ struct ThreadPoolDevice { // GPU offloading #ifdef EIGEN_USE_GPU +static int m_numMultiProcessors = 0; +static int m_maxThreadsPerBlock = 0; +static int m_maxThreadsPerMultiProcessor = 0; + +static inline int getNumCudaMultiProcessors() { + if (m_numMultiProcessors == 0) { + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 0); + m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock; + m_maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor; + m_numMultiProcessors = deviceProp.multiProcessorCount; + } + return m_numMultiProcessors; +} +static inline int maxCudaThreadsPerBlock() { + if (m_maxThreadsPerBlock == 0) { + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 0); + m_numMultiProcessors = deviceProp.multiProcessorCount; + m_maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor; + m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock; + } + return m_maxThreadsPerBlock; +} +static inline int maxCudaThreadsPerMultiProcessor() { + if (m_maxThreadsPerBlock == 0) { + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 0); + m_numMultiProcessors = deviceProp.multiProcessorCount; + m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock; + m_maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor; + } + return m_maxThreadsPerMultiProcessor; +} + struct GpuDevice { // The cudastream is not owned: the caller is responsible for its initialization and eventual destruction. 
GpuDevice(const cudaStream_t* stream) : stream_(stream) { eigen_assert(stream); } EIGEN_STRONG_INLINE const cudaStream_t& stream() const { return *stream_; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + /*EIGEN_DEVICE_FUNC*/ EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { void* result; cudaMalloc(&result, num_bytes); return result; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + /*EIGEN_DEVICE_FUNC */EIGEN_STRONG_INLINE void deallocate(void* buffer) const { cudaFree(buffer); } + EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { + cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, *stream_); + } + EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { + cudaMemsetAsync(buffer, c, n, *stream_); + } private: // TODO: multigpu. |