From c3859a2b584730ed1cb155a6d9bc422592d8d26b Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Mon, 9 May 2016 17:05:53 -0700
Subject: Added the ability to use a scratch buffer in cuda kernels

---
 .../Eigen/CXX11/src/Tensor/TensorDeviceCuda.h      | 31 +++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)

(limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h')

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index 1d2d162dc..2fa6cad34 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -24,6 +24,9 @@ class StreamInterface {
   // Allocate memory on the actual device where the computation will run
   virtual void* allocate(size_t num_bytes) const = 0;
   virtual void deallocate(void* buffer) const = 0;
+
+  // Return a pointer to a scratchpad buffer of size 1k (1024 bytes in CudaStreamDevice)
+  virtual void* scratchpad() const = 0;
 };
 
 static cudaDeviceProp* m_deviceProperties;
@@ -62,12 +65,12 @@ static const cudaStream_t default_stream = cudaStreamDefault;
 class CudaStreamDevice : public StreamInterface {
  public:
   // Use the default stream on the current device
-  CudaStreamDevice() : stream_(&default_stream) {
+  CudaStreamDevice() : stream_(&default_stream), scratch_(NULL) {  // scratch_ stays NULL until scratchpad() allocates it
     cudaGetDevice(&device_);
     initializeDeviceProp();
   }
   // Use the default stream on the specified device
-  CudaStreamDevice(int device) : stream_(&default_stream), device_(device) {
+  CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL) {  // scratch_ stays NULL until scratchpad() allocates it
     initializeDeviceProp();
   }
   // Use the specified stream. Note that it's the
@@ -75,7 +78,7 @@ class CudaStreamDevice : public StreamInterface {
   // the specified device. If no device is specified the code
   // assumes that the stream is associated to the current gpu device.
   CudaStreamDevice(const cudaStream_t* stream, int device = -1)
-      : stream_(stream), device_(device) {
+      : stream_(stream), device_(device), scratch_(NULL) {
     if (device < 0) {
       cudaGetDevice(&device_);
     } else {
@@ -89,6 +92,12 @@ class CudaStreamDevice : public StreamInterface {
     initializeDeviceProp();
   }
 
+  virtual ~CudaStreamDevice() {  // releases the scratchpad buffer if scratchpad() ever allocated one
+    if (scratch_) {  // still NULL when scratchpad() was never called on this object
+      deallocate(scratch_);  // hands the pointer obtained from allocate(1024) back to deallocate()
+    }
+  }
+
   const cudaStream_t& stream() const { return *stream_; }
   const cudaDeviceProp& deviceProperties() const {
     return m_deviceProperties[device_];
@@ -112,9 +121,17 @@ class CudaStreamDevice : public StreamInterface {
     assert(err == cudaSuccess);
   }
 
+  virtual void* scratchpad() const {  // returns the scratch buffer, allocating it on the first call
+    if (scratch_ == NULL) {  // NOTE(review): no locking around this lazy init - confirm callers never race here
+      scratch_ = allocate(1024);  // 1024 bytes requested via allocate(); pointer cached in the mutable member
+    }
+    return scratch_;  // cached pointer; the destructor passes it to deallocate()
+  }
+
  private:
   const cudaStream_t* stream_;
   int device_;
+  mutable void* scratch_;  // scratch buffer pointer; mutable because the const method scratchpad() assigns it lazily
 };
 
 struct GpuDevice {
@@ -143,10 +160,18 @@ struct GpuDevice {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
 #ifndef __CUDA_ARCH__
     stream_->deallocate(buffer);
+#else  // __CUDA_ARCH__ defined: this branch is compiled instead of the stream_ call above
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif  // __CUDA_ARCH__
+  }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* scratchpad() const {
+#ifndef __CUDA_ARCH__
+    return stream_->scratchpad();  // __CUDA_ARCH__ not defined: forward to the stream's scratchpad()
 #else
     eigen_assert(false && "The default device should be used instead to generate kernel code");
 #endif
+    return NULL;  // only reachable in the __CUDA_ARCH__ branch, after the assert; the other branch has already returned
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
-- 
cgit v1.2.3