author     Benoit Steiner <benoit.steiner.goog@gmail.com>  2015-07-15 12:38:34 -0700
committer  Benoit Steiner <benoit.steiner.goog@gmail.com>  2015-07-15 12:38:34 -0700
commit     e892524efe7e8adbd43bf4c1c150f4f4ebf27d1d (patch)
tree       4bc0c43b27ec1ee6a595baf0e56904da5b309855 /unsupported/Eigen
parent     f5aa64086228ca9ccfa27e6086667fd0bdbad22c (diff)
Added support for multi gpu configuration to the GpuDevice class
Diffstat (limited to 'unsupported/Eigen')
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h    8
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h   237
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h       8
3 files changed, 197 insertions, 56 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 88db9d410..64b0315ee 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -835,10 +835,10 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
   void executeEval(Scalar* data) const {
     typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
 
-    const int maxSharedMem = sharedMemPerBlock();
-    const int maxThreadsPerBlock = maxCudaThreadsPerBlock();
-    const int maxBlocksPerProcessor = maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock;
-    const int numMultiProcessors = getNumCudaMultiProcessors();
+    const int maxSharedMem = m_device.sharedMemPerBlock();
+    const int maxThreadsPerBlock = m_device.maxCudaThreadsPerBlock();
+    const int maxBlocksPerProcessor = m_device.maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock;
+    const int numMultiProcessors = m_device.getNumCudaMultiProcessors();
     const int warpSize = 32;
 
     switch (NumKernelDims) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
index a4419c665..346c0b6fa 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
@@ -15,16 +15,22 @@ namespace Eigen {
 
 // Default device for the machine (typically a single cpu core)
 struct DefaultDevice {
-  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
     return internal::aligned_malloc(num_bytes);
   }
-  EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
     internal::aligned_free(buffer);
   }
-  EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
     ::memcpy(dst, src, n);
   }
-  EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
+    memcpy(dst, src, n);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
+    memcpy(dst, src, n);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
     ::memset(buffer, c, n);
   }
 
@@ -208,6 +214,7 @@ static EIGEN_STRONG_INLINE void wait_until_ready(Notification* n) {
 
 // Build a thread pool device on top the an existing pool of threads.
 struct ThreadPoolDevice {
+  // The ownership of the thread pool remains with the caller.
   ThreadPoolDevice(ThreadPoolInterface* pool, size_t num_cores) : pool_(pool), num_threads_(num_cores) { }
 
   EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
@@ -221,6 +228,12 @@ struct ThreadPoolDevice {
   EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
     ::memcpy(dst, src, n);
   }
+  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
+    memcpy(dst, src, n);
+  }
+  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
+    memcpy(dst, src, n);
+  }
 
   EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
     ::memset(buffer, c, n);
@@ -259,81 +272,164 @@ struct ThreadPoolDevice {
 
 // GPU offloading
 #ifdef EIGEN_USE_GPU
-static cudaDeviceProp m_deviceProperties;
+
+// This defines an interface that GPUDevice can take to use
+// CUDA streams underneath.
+class StreamInterface {
+ public:
+  virtual ~StreamInterface() {}
+
+  virtual const cudaStream_t& stream() const = 0;
+  virtual const cudaDeviceProp& deviceProperties() const = 0;
+
+  // Allocate memory on the actual device where the computation will run
+  virtual void* allocate(size_t num_bytes) const = 0;
+  virtual void deallocate(void* buffer) const = 0;
+};
+
+static cudaDeviceProp* m_deviceProperties;
 static bool m_devicePropInitialized = false;
 
 static void initializeDeviceProp() {
   if (!m_devicePropInitialized) {
-    assert(cudaGetDeviceProperties(&m_deviceProperties, 0) == cudaSuccess);
-    m_devicePropInitialized = true;
+    if (!m_devicePropInitialized) {
+      int num_devices;
+      cudaError_t status = cudaGetDeviceCount(&num_devices);
+      eigen_check(status == cudaSuccess);
+      m_deviceProperties = new cudaDeviceProp[num_devices];
+      for (int i = 0; i < num_devices; ++i) {
+        status = cudaGetDeviceProperties(&m_deviceProperties[i], i);
+        eigen_check(status == cudaSuccess);
+      }
+      m_devicePropInitialized = true;
+    }
   }
 }
 
-static inline int getNumCudaMultiProcessors() {
-  initializeDeviceProp();
-  return m_deviceProperties.multiProcessorCount;
-}
-static inline int maxCudaThreadsPerBlock() {
-  initializeDeviceProp();
-  return m_deviceProperties.maxThreadsPerBlock;
-}
-static inline int maxCudaThreadsPerMultiProcessor() {
-  initializeDeviceProp();
-  return m_deviceProperties.maxThreadsPerMultiProcessor;
-}
-static inline int sharedMemPerBlock() {
-  initializeDeviceProp();
-  return m_deviceProperties.sharedMemPerBlock;
-}
+static const cudaStream_t default_stream = cudaStreamDefault;
 
-static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
-  cudaError_t status = cudaDeviceSetSharedMemConfig(config);
-  assert(status == cudaSuccess);
-}
+class CudaStreamDevice : public StreamInterface {
+ public:
+  // Use the default stream on the current device
+  CudaStreamDevice() : stream_(&default_stream) {
+    cudaGetDevice(&device_);
+    initializeDeviceProp();
+  }
+  // Use the default stream on the specified device
+  CudaStreamDevice(int device) : stream_(&default_stream), device_(device) {
+    initializeDeviceProp();
+  }
+  // Use the specified stream. Note that it's the
+  // caller responsibility to ensure that the stream can run on
+  // the specified device. If no device is specified the code
+  // assumes that the stream is associated to the current gpu device.
+  CudaStreamDevice(const cudaStream_t* stream, int device = -1)
+      : stream_(stream), device_(device) {
+    if (device < 0) {
+      cudaGetDevice(&device_);
+    } else {
+      int num_devices;
+      cudaError_t err = cudaGetDeviceCount(&num_devices);
+      eigen_check(err == cudaSuccess);
+      eigen_check(device < num_devices);
+      device_ = device;
+    }
+    initializeDeviceProp();
+  }
+
+  const cudaStream_t& stream() const { return *stream_; }
+  const cudaDeviceProp& deviceProperties() const {
+    return m_deviceProperties[device_];
+  }
+  virtual void* allocate(size_t num_bytes) const {
+    cudaError_t err = cudaSetDevice(device_);
+    eigen_check(err == cudaSuccess);
+    void* result;
+    err = cudaMalloc(&result, num_bytes);
+    eigen_check(err == cudaSuccess);
+    eigen_check(result != NULL);
+    return result;
+  }
+  virtual void deallocate(void* buffer) const {
+    cudaError_t err = cudaSetDevice(device_);
+    eigen_check(err == cudaSuccess);
+    assert(buffer != NULL);
+    err = cudaFree(buffer);
+    assert(err == cudaSuccess);
+  }
+
+ private:
+  const cudaStream_t* stream_;
+  int device_;
+};
 
-// Cuda stream to use when no stream is specified explicitely.
-static const cudaStream_t default_stream = cudaStreamDefault;
 
 struct GpuDevice {
-  // The cudastream is not owned: the caller is responsible for its initialization and eventual destruction.
-  GpuDevice(const cudaStream_t* stream = &default_stream) : stream_(stream) { eigen_assert(stream); }
+  // The StreamInterface is not owned: the caller is
+  // responsible for its initialization and eventual destruction.
+  explicit GpuDevice(const StreamInterface* stream) : stream_(stream) {
+    eigen_assert(stream);
+  }
 
-  EIGEN_STRONG_INLINE const cudaStream_t& stream() const { return *stream_; }
+  // TODO(bsteiner): This is an internal API, we should not expose it.
+  EIGEN_STRONG_INLINE const cudaStream_t& stream() const {
+    return stream_->stream();
+  }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
 #ifndef __CUDA_ARCH__
-    void* result;
-    assert(cudaMalloc(&result, num_bytes) == cudaSuccess);
-    assert(result != NULL);
-    return result;
+    return stream_->allocate(num_bytes);
 #else
-    assert(false && "The default device should be used instead to generate kernel code");
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
     return NULL;
 #endif
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
 #ifndef __CUDA_ARCH__
-    assert(buffer != NULL);
-    assert(cudaFree(buffer) == cudaSuccess);
+    stream_->deallocate(buffer);
+
 #else
-    assert(false && "The default device should be used instead to generate kernel code");
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
 #endif
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
 #ifndef __CUDA_ARCH__
-    assert(cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, *stream_) == cudaSuccess);
+    cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,
+                                      stream_->stream());
+    assert(err == cudaSuccess);
 #else
-    assert(false && "The default device should be used instead to generate kernel code");
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
+#ifndef __CUDA_ARCH__
+    cudaError_t err =
+        cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream());
+    assert(err == cudaSuccess);
+#else
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
+#ifndef __CUDA_ARCH__
+    cudaError_t err =
+        cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream());
+    assert(err == cudaSuccess);
+#else
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
 #endif
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
 #ifndef __CUDA_ARCH__
-    assert(cudaMemsetAsync(buffer, c, n, *stream_) == cudaSuccess);
+    cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream());
+    assert(err == cudaSuccess);
 #else
-    assert(false && "The default device should be used instead to generate kernel code");
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
 #endif
   }
 
@@ -342,21 +438,66 @@ struct GpuDevice {
     return 32;
   }
 
-  inline int majorDeviceVersion() const { return m_deviceProperties.major; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
+    // FIXME
+    return 48*1024;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+    // We won't try to take advantage of the l2 cache for the time being, and
+    // there is no l3 cache on cuda devices.
+    return firstLevelCacheSize();
+  }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
-    cudaStreamSynchronize(*stream_);
+#ifndef __CUDA_ARCH__
+    cudaError_t err = cudaStreamSynchronize(stream_->stream());
+    assert(err == cudaSuccess);
+#else
+    assert(false && "The default device should be used instead to generate kernel code");
+#endif
+  }
+
+  inline int getNumCudaMultiProcessors() const {
+    return stream_->deviceProperties().multiProcessorCount;
+  }
+  inline int maxCudaThreadsPerBlock() const {
+    return stream_->deviceProperties().maxThreadsPerBlock;
+  }
+  inline int maxCudaThreadsPerMultiProcessor() const {
+    return stream_->deviceProperties().maxThreadsPerMultiProcessor;
+  }
+  inline int sharedMemPerBlock() const {
+    return stream_->deviceProperties().sharedMemPerBlock;
+  }
+  inline int majorDeviceVersion() const {
+    return stream_->deviceProperties().major;
+  }
+
+  // This function checks if the CUDA runtime recorded an error for the
+  // underlying stream device.
+  inline bool ok() const {
+    cudaError_t error = cudaStreamQuery(stream_->stream());
+    return (error == cudaSuccess) || (error == cudaErrorNotReady);
   }
 
  private:
-  // TODO: multigpu.
-  const cudaStream_t* stream_;
+  const StreamInterface* stream_;
+
 };
+
 
 #define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)            \
   (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__);  \
   assert(cudaGetLastError() == cudaSuccess);
+
+// FIXME: Should be device and kernel specific.
+static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
+  cudaError_t status = cudaDeviceSetSharedMemConfig(config);
+  eigen_check(status == cudaSuccess);
+}
+
 #endif
 
 } // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 24606b0c8..a795f8eaa 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -205,8 +205,8 @@ class TensorExecutor<Expression, GpuDevice, false>
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
     if (needs_assign)
     {
-      const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock();
-      const int block_size = maxCudaThreadsPerBlock();
+      const int num_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / device.maxCudaThreadsPerBlock();
+      const int block_size = device.maxCudaThreadsPerBlock();
       const Index size = array_prod(evaluator.dimensions());
       LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
     }
@@ -225,8 +225,8 @@ class TensorExecutor<Expression, GpuDevice, true>
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
     if (needs_assign)
     {
-      const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock();
-      const int block_size = maxCudaThreadsPerBlock();
+      const int num_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / device.maxCudaThreadsPerBlock();
+      const int block_size = device.maxCudaThreadsPerBlock();
       const Index size = array_prod(evaluator.dimensions());
       LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
     }
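
After this change a GpuDevice no longer binds implicitly to device 0: the caller picks the GPU (and optionally the stream) through the StreamInterface passed to its constructor. The sketch below is not part of the commit; it is a minimal usage example, assuming the standard Eigen tensor expression API (Tensor, TensorMap, .device()) from unsupported/Eigen/CXX11/Tensor, and the function name scale_on_gpu, its parameters, and the doubling expression are made up for illustration.

// Hypothetical usage sketch, not part of this commit.
#define EIGEN_USE_GPU
#include <cassert>
#include <unsupported/Eigen/CXX11/Tensor>

void scale_on_gpu(int gpu_id, const float* host_in, float* host_out, int n) {
  // One stream wrapper per GPU; here the default stream of gpu_id is used.
  Eigen::CudaStreamDevice stream(gpu_id);
  Eigen::GpuDevice device(&stream);

  // Device buffers come from the stream's allocator, i.e. cudaMalloc on gpu_id.
  float* d_in  = static_cast<float*>(device.allocate(n * sizeof(float)));
  float* d_out = static_cast<float*>(device.allocate(n * sizeof(float)));
  device.memcpyHostToDevice(d_in, host_in, n * sizeof(float));

  Eigen::TensorMap<Eigen::Tensor<float, 1> > in(d_in, n);
  Eigen::TensorMap<Eigen::Tensor<float, 1> > out(d_out, n);
  out.device(device) = in * 2.0f;  // kernel is launched on the wrapped stream

  device.memcpyDeviceToHost(host_out, d_out, n * sizeof(float));
  device.synchronize();            // wait for the async copies and the kernel
  assert(device.ok());             // the stream recorded no CUDA error

  device.deallocate(d_in);
  device.deallocate(d_out);
}

Running one such block per device id, each with its own CudaStreamDevice, is the multi-GPU configuration the commit title refers to: every GpuDevice now queries its own stream for device properties instead of the former global, device-0 cudaDeviceProp.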