aboutsummaryrefslogtreecommitdiffhomepage
path: root/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
diff options
context:
space:
mode:
authorGravatar Benoit Steiner <benoit.steiner.goog@gmail.com>2016-05-09 17:05:53 -0700
committerGravatar Benoit Steiner <benoit.steiner.goog@gmail.com>2016-05-09 17:05:53 -0700
commitc3859a2b584730ed1cb155a6d9bc422592d8d26b (patch)
tree811a5b34b580b4737c8469c9734f50ccdadd43aa /unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
parentba95e43ea25fd0c6066630ac121559c2e0ba7728 (diff)
Added the ability to use a scratch buffer in cuda kernels
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h')
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h31
1 files changed, 28 insertions, 3 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index 1d2d162dc..2fa6cad34 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -24,6 +24,9 @@ class StreamInterface {
// Allocate memory on the actual device where the computation will run
virtual void* allocate(size_t num_bytes) const = 0;
virtual void deallocate(void* buffer) const = 0;
+
+ // Return a scratchpad buffer of size 1k
+ virtual void* scratchpad() const = 0;
};
static cudaDeviceProp* m_deviceProperties;
@@ -62,12 +65,12 @@ static const cudaStream_t default_stream = cudaStreamDefault;
class CudaStreamDevice : public StreamInterface {
public:
// Use the default stream on the current device
- CudaStreamDevice() : stream_(&default_stream) {
+ CudaStreamDevice() : stream_(&default_stream), scratch_(NULL) {
cudaGetDevice(&device_);
initializeDeviceProp();
}
// Use the default stream on the specified device
- CudaStreamDevice(int device) : stream_(&default_stream), device_(device) {
+ CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL) {
initializeDeviceProp();
}
// Use the specified stream. Note that it's the
@@ -75,7 +78,7 @@ class CudaStreamDevice : public StreamInterface {
// the specified device. If no device is specified the code
// assumes that the stream is associated to the current gpu device.
CudaStreamDevice(const cudaStream_t* stream, int device = -1)
- : stream_(stream), device_(device) {
+ : stream_(stream), device_(device), scratch_(NULL) {
if (device < 0) {
cudaGetDevice(&device_);
} else {
@@ -89,6 +92,12 @@ class CudaStreamDevice : public StreamInterface {
initializeDeviceProp();
}
+ virtual ~CudaStreamDevice() {
+ if (scratch_) {
+ deallocate(scratch_);
+ }
+ }
+
const cudaStream_t& stream() const { return *stream_; }
const cudaDeviceProp& deviceProperties() const {
return m_deviceProperties[device_];
@@ -112,9 +121,17 @@ class CudaStreamDevice : public StreamInterface {
assert(err == cudaSuccess);
}
+ virtual void* scratchpad() const {
+ if (scratch_ == NULL) {
+ scratch_ = allocate(1024);
+ }
+ return scratch_;
+ }
+
private:
const cudaStream_t* stream_;
int device_;
+ mutable void* scratch_;
};
struct GpuDevice {
@@ -143,10 +160,18 @@ struct GpuDevice {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
#ifndef __CUDA_ARCH__
stream_->deallocate(buffer);
+#else
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* scratchpad() const {
+#ifndef __CUDA_ARCH__
+ return stream_->scratchpad();
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
+ return NULL;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {