path: root/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h')
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h  190
1 file changed, 190 insertions(+), 0 deletions(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
new file mode 100644
index 000000000..efd207507
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
@@ -0,0 +1,190 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H
+
+
+namespace Eigen {
+
+// Default device for the machine (typically a single cpu core)
+struct DefaultDevice {
+ EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+ return internal::aligned_malloc(num_bytes);
+ }
+ EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
+ internal::aligned_free(buffer);
+ }
+ EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+ ::memcpy(dst, src, n);
+ }
+ EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
+ ::memset(buffer, c, n);
+ }
+
+ EIGEN_STRONG_INLINE size_t numThreads() const {
+ return 1;
+ }
+};
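+
+// A minimal usage sketch (illustrative only; the buffer name and size are
+// hypothetical). Devices expose allocate/deallocate/memcpy/memset so that the
+// tensor evaluators can remain device-agnostic:
+//   DefaultDevice device;
+//   float* buf = static_cast<float*>(device.allocate(64 * sizeof(float)));
+//   device.memset(buf, 0, 64 * sizeof(float));
+//   device.deallocate(buf);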
+
+
+// Multiple cpu cores
+// We should really use a thread pool here but first we need to find a portable thread pool library.
+#ifdef EIGEN_USE_THREADS
+
+typedef std::future<void> Future;
+typedef std::promise<void> Promise;
+
+static EIGEN_STRONG_INLINE void wait_until_ready(const Future* f) {
+ f->wait();
+}
+static EIGEN_STRONG_INLINE void get_when_ready(Future* f) {
+ f->get();
+}
+
+
+struct ThreadPoolDevice {
+ ThreadPoolDevice(size_t num_cores) : num_threads_(num_cores) { }
+
+ EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+ return internal::aligned_malloc(num_bytes);
+ }
+
+ EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
+ internal::aligned_free(buffer);
+ }
+
+ EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+ ::memcpy(dst, src, n);
+ }
+
+ EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
+ ::memset(buffer, c, n);
+ }
+
+ EIGEN_STRONG_INLINE size_t numThreads() const {
+ return num_threads_;
+ }
+
+ template <class Function, class... Args>
+ EIGEN_STRONG_INLINE Future enqueue(Function&& f, Args&&... args) const {
+ return std::async(std::launch::async, f, args...);
+ }
+ template <class Function, class... Args>
+ EIGEN_STRONG_INLINE void enqueueNoFuture(Function&& f, Args&&... args) const {
+ // Note: the future returned by std::async blocks in its destructor, so this
+ // call currently waits for the task to finish. A proper thread pool (see the
+ // comment above) would make it truly fire-and-forget.
+ std::async(std::launch::async, f, args...);
+ }
+
+ private:
+ size_t num_threads_;
+};
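+
+// A hedged usage sketch (the lambda and thread count are illustrative, not part
+// of this header): work is submitted through enqueue() and the returned Future
+// is waited on with the helpers above.
+//   ThreadPoolDevice device(4);
+//   Future f = device.enqueue([](int x) { /* process x */ }, 42);
+//   wait_until_ready(&f);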
+
+#endif
+
+
+// GPU offloading
+#ifdef EIGEN_USE_GPU
+static cudaDeviceProp m_deviceProperties;
+static bool m_devicePropInitialized = false;
+
+static void initializeDeviceProp() {
+ if (!m_devicePropInitialized) {
+ // Perform the query outside of assert() so it still runs when NDEBUG is defined.
+ cudaError_t status = cudaGetDeviceProperties(&m_deviceProperties, 0);
+ assert(status == cudaSuccess);
+ (void)status;
+ m_devicePropInitialized = true;
+ }
+}
+
+static inline int getNumCudaMultiProcessors() {
+ initializeDeviceProp();
+ return m_deviceProperties.multiProcessorCount;
+}
+static inline int maxCudaThreadsPerBlock() {
+ initializeDeviceProp();
+ return m_deviceProperties.maxThreadsPerBlock;
+}
+static inline int maxCudaThreadsPerMultiProcessor() {
+ initializeDeviceProp();
+ return m_deviceProperties.maxThreadsPerMultiProcessor;
+}
+static inline int sharedMemPerBlock() {
+ initializeDeviceProp();
+ return m_deviceProperties.sharedMemPerBlock;
+}
+
+static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
+ cudaError_t status = cudaDeviceSetSharedMemConfig(config);
+ assert(status == cudaSuccess);
+ (void)status; // avoid an unused-variable warning when NDEBUG is defined
+}
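+
+// Illustrative sketch (the block/grid heuristic is an assumption, not mandated
+// by this header): the helpers above are typically combined to derive a launch
+// configuration, e.g. for n independent elements:
+//   const int block_size = maxCudaThreadsPerBlock();
+//   const int max_blocks = getNumCudaMultiProcessors() *
+//                          maxCudaThreadsPerMultiProcessor() / block_size;
+//   const int num_blocks = std::min<int>(max_blocks, (n + block_size - 1) / block_size);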
+
+struct GpuDevice {
+ // The cudastream is not owned: the caller is responsible for its initialization and eventual destruction.
+ GpuDevice(const cudaStream_t* stream) : stream_(stream) { eigen_assert(stream); }
+
+ EIGEN_STRONG_INLINE const cudaStream_t& stream() const { return *stream_; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+#ifndef __CUDA_ARCH__
+ void* result;
+ // Keep the allocation outside of assert() so it isn't compiled out with NDEBUG.
+ cudaError_t err = cudaMalloc(&result, num_bytes);
+ assert(err == cudaSuccess);
+ (void)err;
+ assert(result != NULL);
+ return result;
+#else
+ assert(false && "The default device should be used instead to generate kernel code");
+ return NULL;
+#endif
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
+#ifndef __CUDA_ARCH__
+ assert(buffer != NULL);
+ // Keep cudaFree() outside of assert() so the buffer is still released with NDEBUG.
+ cudaError_t err = cudaFree(buffer);
+ assert(err == cudaSuccess);
+ (void)err;
+#else
+ assert(false && "The default device should be used instead to generate kernel code");
+#endif
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+#ifndef __CUDA_ARCH__
+ // Keep the copy outside of assert() so it still executes with NDEBUG.
+ cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, *stream_);
+ assert(err == cudaSuccess);
+ (void)err;
+#else
+ assert(false && "The default device should be used instead to generate kernel code");
+#endif
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
+#ifndef __CUDA_ARCH__
+ // Keep the memset outside of assert() so it still executes with NDEBUG.
+ cudaError_t err = cudaMemsetAsync(buffer, c, n, *stream_);
+ assert(err == cudaSuccess);
+ (void)err;
+#else
+ assert(false && "The default device should be used instead to generate kernel code");
+#endif
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
+ // FIXME
+ return 32;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
+ cudaStreamSynchronize(*stream_);
+ }
+
+ private:
+ // TODO: multigpu.
+ const cudaStream_t* stream_;
+};
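+
+// A hedged sketch of the intended ownership model (names are illustrative):
+// the caller creates and eventually destroys the stream, GpuDevice only
+// borrows it.
+//   cudaStream_t stream;
+//   cudaError_t status = cudaStreamCreate(&stream);
+//   assert(status == cudaSuccess);
+//   GpuDevice gpu_device(&stream);
+//   // ... launch work on gpu_device ...
+//   gpu_device.synchronize();
+//   cudaStreamDestroy(stream);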
+
+#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
+ (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
+ assert(cudaGetLastError() == cudaSuccess);
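+
+// Illustrative use of the macro (my_kernel and its arguments are hypothetical):
+//   LAUNCH_CUDA_KERNEL(my_kernel, num_blocks, block_size, 0, gpu_device, data, n);
+// This expands to a <<<grid, block, shared_mem, stream>>> launch on the
+// device's stream, followed by a cudaGetLastError() check.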
+
+#endif
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H