Diffstat (limited to 'third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h')
-rw-r--r--  third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h  920
1 files changed, 920 insertions, 0 deletions
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
new file mode 100644
index 0000000000..b6eeb73832
--- /dev/null
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
@@ -0,0 +1,920 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H
+
+namespace Eigen {
+
+// Default device for the machine (typically a single cpu core)
+struct DefaultDevice {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+ return internal::aligned_malloc(num_bytes);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
+ internal::aligned_free(buffer);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+ ::memcpy(dst, src, n);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
+ memcpy(dst, src, n);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
+ memcpy(dst, src, n);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
+ ::memset(buffer, c, n);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
+#ifndef __CUDA_ARCH__
+ // Running on the host CPU
+ return 1;
+#else
+ // Running on a CUDA device
+ return 32;
+#endif
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t memcpyThreshold() const {
+ return 2 * numThreads();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
+#ifndef __CUDA_ARCH__
+ // Running on the host CPU
+ return l1CacheSize();
+#else
+ // Running on a CUDA device, return the amount of shared memory available.
+ return 48*1024;
+#endif
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+#ifndef __CUDA_ARCH__
+ // Running single threaded on the host CPU
+ return l3CacheSize();
+#else
+ // Running on a CUDA device
+ return firstLevelCacheSize();
+#endif
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+#ifndef __CUDA_ARCH__
+ // Running single threaded on the host CPU
+ // Should return an enum that encodes the ISA supported by the CPU
+ return 1;
+#else
+ // Running on a CUDA device
+ return __CUDA_ARCH__ / 100;
+#endif
+ }
+};
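+
+// Illustrative usage sketch (not part of the original file): a host-side
+// buffer round trip through the DefaultDevice API above. The buffer size is
+// an arbitrary example value.
+//
+//   Eigen::DefaultDevice device;
+//   const size_t bytes = 256 * sizeof(float);
+//   float* src = static_cast<float*>(device.allocate(bytes));
+//   float* dst = static_cast<float*>(device.allocate(bytes));
+//   device.memset(src, 0, bytes);
+//   device.memcpy(dst, src, bytes);
+//   device.deallocate(dst);
+//   device.deallocate(src);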
+
+// Multiple cpu cores
+#ifdef EIGEN_USE_THREADS
+
+#if __cplusplus > 199711
+// This defines an interface that ThreadPoolDevice can take to use
+// custom thread pools underneath.
+class ThreadPoolInterface {
+ public:
+ virtual void Schedule(std::function<void()> fn) = 0;
+
+ virtual ~ThreadPoolInterface() {}
+};
+#endif
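+
+// Illustrative sketch (not part of the original file): a minimal
+// ThreadPoolInterface implementation that runs each closure inline on the
+// calling thread. A real implementation would hand the closure to worker
+// threads instead.
+//
+//   class InlineThreadPool : public Eigen::ThreadPoolInterface {
+//    public:
+//     void Schedule(std::function<void()> fn) final { fn(); }
+//   };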
+
+// The implementation of the ThreadPool type ensures that the Schedule method
+// runs the functions it is provided in FIFO order when the scheduling is done
+// by a single thread.
+#ifdef EIGEN_USE_CUSTOM_THREAD_POOL
+class ThreadPool : public ThreadPoolInterface {
+ public:
+ // Construct a pool that contains "num_threads" threads.
+ explicit ThreadPool(int num_threads) : threads_(num_threads), waiters_(num_threads) {
+ for (int i = 0; i < num_threads; i++) {
+ threads_.push_back(new std::thread([this]() { WorkerLoop(); }));
+ }
+ }
+
+ // Wait until all scheduled work has finished and then destroy the
+ // set of threads.
+ ~ThreadPool() {
+ {
+ // Wait for all work to get done.
+ std::unique_lock<std::mutex> l(mu_);
+ while (!pending_.empty()) {
+ empty_.wait(l);
+ }
+ exiting_ = true;
+
+ // Wake up all waiters.
+ for (auto w : waiters_) {
+ w->ready = true;
+ w->work = nullptr;
+ w->cv.notify_one();
+ }
+ }
+
+ // Wait for threads to finish.
+ for (auto t : threads_) {
+ t->join();
+ delete t;
+ }
+ }
+
+ // Schedule fn() for execution in the pool of threads. The functions are
+ // executed in the order in which they are scheduled.
+ void Schedule(std::function<void()> fn) final {
+ std::unique_lock<std::mutex> l(mu_);
+ if (waiters_.empty()) {
+ pending_.push_back(fn);
+ } else {
+ Waiter* w = waiters_.back();
+ waiters_.pop_back();
+ w->ready = true;
+ w->work = fn;
+ w->cv.notify_one();
+ }
+ }
+
+ protected:
+ void WorkerLoop() {
+ std::unique_lock<std::mutex> l(mu_);
+ Waiter w;
+ while (!exiting_) {
+ std::function<void()> fn;
+ if (pending_.empty()) {
+ // Wait for work to be assigned to me
+ w.ready = false;
+ waiters_.push_back(&w);
+ while (!w.ready) {
+ w.cv.wait(l);
+ }
+ fn = w.work;
+ w.work = nullptr;
+ } else {
+ // Pick up pending work
+ fn = pending_.front();
+ pending_.pop_front();
+ if (pending_.empty()) {
+ empty_.notify_all();
+ }
+ }
+ if (fn) {
+ mu_.unlock();
+ fn();
+ mu_.lock();
+ }
+ }
+ }
+
+ private:
+ struct Waiter {
+ std::condition_variable cv;
+ std::function<void()> work;
+ bool ready;
+ };
+
+ std::mutex mu_;
+ FixedSizeVector<std::thread*> threads_; // All threads
+ FixedSizeVector<Waiter*> waiters_; // Stack of waiting threads.
+ std::deque<std::function<void()>> pending_; // Queue of pending work
+ std::condition_variable empty_; // Signaled on pending_.empty()
+ bool exiting_ = false;
+};
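+
+// Illustrative usage sketch (not part of the original file): schedules a few
+// closures on the custom ThreadPool. Since the scheduling below happens from
+// a single thread, the closures are picked up in FIFO order as documented
+// above. Assumes <atomic> is available.
+//
+//   Eigen::ThreadPool pool(4 /* num_threads */);
+//   std::atomic<int> counter(0);
+//   for (int i = 0; i < 8; ++i) {
+//     pool.Schedule([&counter]() { counter++; });
+//   }
+//   // The destructor waits for all scheduled work before tearing down the pool.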
+
+
+// Notification is an object that allows a user to wait for another
+// thread to signal that an event has occurred.
+//
+// Multiple threads can wait on the same Notification object,
+// but only one caller may call Notify() on the object.
+class Notification {
+ public:
+ Notification() : notified_(false) {}
+ ~Notification() {}
+
+ void Notify() {
+ std::unique_lock<std::mutex> l(mu_);
+ eigen_assert(!notified_);
+ notified_ = true;
+ cv_.notify_all();
+ }
+
+ void WaitForNotification() {
+ std::unique_lock<std::mutex> l(mu_);
+ while (!notified_) {
+ cv_.wait(l);
+ }
+ }
+
+ private:
+ std::mutex mu_;
+ std::condition_variable cv_;
+ bool notified_;
+};
+
+#else
+
+// Notification is an object that allows a user to wait for another
+// thread to signal that an event has occurred.
+//
+// Multiple threads can wait on the same Notification object,
+// but only one caller may call Notify() on the object.
+class Notification {
+ public:
+ Notification() : notified_(false) {}
+ ~Notification() {}
+
+ void Notify() {
+ tensorflow::mutex_lock l(mu_);
+ eigen_assert(!notified_);
+ notified_ = true;
+ cv_.notify_all();
+ }
+
+ void WaitForNotification() {
+ tensorflow::mutex_lock l(mu_);
+ while (!notified_) {
+ cv_.wait(l);
+ }
+ }
+
+ private:
+ tensorflow::mutex mu_;
+ tensorflow::condition_variable cv_;
+ bool notified_;
+};
+#endif
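+
+// Illustrative usage sketch (not part of the original file): one thread
+// blocks on a Notification until another thread signals it.
+//
+//   Eigen::Notification done;
+//   std::thread worker([&done]() {
+//     // ... do some work ...
+//     done.Notify();
+//   });
+//   done.WaitForNotification();  // blocks until the worker calls Notify()
+//   worker.join();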
+
+// Runs an arbitrary function and then calls Notify() on the passed in
+// Notification.
+template <typename Function, typename... Args> struct FunctionWrapper
+{
+ static void run(Notification* n, Function f, Args... args) {
+ f(args...);
+ n->Notify();
+ }
+};
+
+static EIGEN_STRONG_INLINE void wait_until_ready(Notification* n) {
+ if (n) {
+ n->WaitForNotification();
+ }
+}
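+
+// Illustrative sketch (not part of the original file): the pattern used by
+// ThreadPoolDevice::enqueue below, spelled out by hand. FunctionWrapper::run
+// invokes the user function and then notifies the Notification, so that
+// wait_until_ready() unblocks once the work is finished. The lambda and the
+// argument value are placeholders.
+//
+//   Eigen::Notification* n = new Eigen::Notification();
+//   auto work = [](int x) { /* ... use x ... */ };
+//   std::function<void()> task =
+//       std::bind(&Eigen::FunctionWrapper<decltype(work), int>::run, n, work, 42);
+//   task();                      // normally handed to a thread pool instead
+//   Eigen::wait_until_ready(n);  // returns immediately once Notify() has run
+//   delete n;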
+
+
+struct MemcpyExecutor {
+ typedef MemcpyExecutor Self;
+
+ MemcpyExecutor(void *dst, const void *src) :
+ m_dst(static_cast<char *>(dst)), m_src(static_cast<const char *>(src)) { }
+
+ static EIGEN_STRONG_INLINE void run(const MemcpyExecutor* exec, size_t idx, size_t block_size) {
+ ::memcpy(&(exec->m_dst[idx]), &(exec->m_src[idx]), block_size);
+ }
+
+ private:
+ char* m_dst;
+ const char* m_src;
+};
+
+struct MemsetExecutor {
+ typedef MemsetExecutor Self;
+
+ MemsetExecutor(void *buffer, int val) :
+ m_buffer(static_cast<char *>(buffer)), m_val(val) { }
+
+ static EIGEN_STRONG_INLINE void run(const MemsetExecutor* exec, size_t idx, size_t block_size) {
+ ::memset(&(exec->m_buffer[idx]), exec->m_val, block_size);
+ }
+
+ private:
+ char* m_buffer;
+ const int m_val;
+};
+
+
+struct ThreadPoolDevice {
+ // The ownership of the thread pool remains with the caller.
+ ThreadPoolDevice(ThreadPoolInterface* pool, size_t num_cores)
+ : pool_(pool), num_threads_(num_cores) {}
+
+ EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+ return internal::aligned_malloc(num_bytes);
+ }
+
+ EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
+ internal::aligned_free(buffer);
+ }
+
+ EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+#ifdef __ANDROID__
+ ::memcpy(dst, src, n);
+#else
+ if (n <= 32768) {
+ ::memcpy(dst, src, n);
+ } else {
+ MemcpyExecutor memcpy_executor(dst, src);
+ execute(memcpy_executor, n);
+ }
+#endif
+ }
+
+ EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
+ memcpy(dst, src, n);
+ }
+
+ EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
+ memcpy(dst, src, n);
+ }
+
+ EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
+#ifdef __ANDROID__
+ ::memset(buffer, c, n);
+#else
+ if (n <= 32768) {
+ ::memset(buffer, c, n);
+ } else {
+ MemsetExecutor memset_executor(buffer, c);
+ execute(memset_executor, n);
+ }
+#endif
+ }
+
+ EIGEN_STRONG_INLINE size_t numThreads() const {
+ return num_threads_;
+ }
+
+ EIGEN_STRONG_INLINE size_t memcpyThreshold() const {
+ return 2 * numThreads();
+ }
+
+ EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
+ return l1CacheSize();
+ }
+
+ EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+ // The l3 cache size is shared between all the cores.
+ return l3CacheSize() / num_threads_;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+ // Should return an enum that encodes the ISA supported by the CPU
+ return 1;
+ }
+
+ template <class Function, class... Args>
+ EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const {
+ Notification* n = new Notification();
+ std::function<void()> func =
+ std::bind(&FunctionWrapper<Function, Args...>::run, n, f, args...);
+ pool_->Schedule(func);
+ return n;
+ }
+
+ template <class Function, class... Args>
+ EIGEN_STRONG_INLINE void enqueue_and_forget(Function&& f, Args&&... args) const {
+ std::function<void()> func = std::bind(f, args...);
+ pool_->Schedule(func);
+ }
+
+ private:
+ template<typename Executor>
+ EIGEN_STRONG_INLINE void execute(const Executor& exec, size_t n) const {
+ // don't spawn a thread to process fewer than 1024 bytes (threshold chosen
+ // after a small amount of experimentation)
+ // TODO: make block_size a multiple of packet_size and align everything
+ const size_t block_size = numext::maxi(static_cast<size_t>(1024), n / numThreads());
+ const size_t block_count = n / block_size;
+ eigen_assert(block_count <= numThreads());
+
+ FixedSizeVector<Notification*> results(block_count);
+ for (size_t block_idx = 0; block_idx < block_count; block_idx++) {
+ results.push_back(enqueue(&Executor::run, &exec, block_idx * block_size, block_size));
+ }
+
+ if (block_count * block_size < n) {
+ Executor::run(&exec, block_count * block_size, n - block_count * block_size);
+ }
+
+ // wait for threads to finish
+ for (size_t block_idx = 0; block_idx < block_count; block_idx++) {
+ results[block_idx]->WaitForNotification();
+ delete results[block_idx];
+ }
+ }
+
+ // todo: NUMA, ...
+ size_t num_threads_;
+ ThreadPoolInterface* pool_;
+};
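+
+// Illustrative usage sketch (not part of the original file): wraps a thread
+// pool in a ThreadPoolDevice and enqueues a closure, blocking on the returned
+// Notification. The ThreadPool below requires EIGEN_USE_CUSTOM_THREAD_POOL;
+// any ThreadPoolInterface implementation works. Sizes are example values.
+//
+//   Eigen::ThreadPool pool(4);
+//   Eigen::ThreadPoolDevice device(&pool, 4 /* num_cores */);
+//   float* buf = static_cast<float*>(device.allocate(1024 * sizeof(float)));
+//   Eigen::Notification* n = device.enqueue([](float* p) { p[0] = 1.0f; }, buf);
+//   n->WaitForNotification();
+//   delete n;
+//   device.deallocate(buf);
+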
+#endif
+
+
+// GPU offloading
+#ifdef EIGEN_USE_GPU
+
+// An interface abstracting away device specific memory allocator.
+class Allocator {
+ public:
+ virtual ~Allocator() {}
+ EIGEN_DEVICE_FUNC virtual void* allocate(size_t num_bytes) const = 0;
+ EIGEN_DEVICE_FUNC virtual void deallocate(void* buffer) const = 0;
+};
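+
+// Illustrative sketch (not part of the original file): a trivial Allocator
+// that forwards to cudaMalloc/cudaFree. A real allocator would typically pool
+// or otherwise track device memory.
+//
+//   class CudaMallocAllocator : public Eigen::Allocator {
+//    public:
+//     void* allocate(size_t num_bytes) const override {
+//       void* ptr = nullptr;
+//       cudaMalloc(&ptr, num_bytes);
+//       return ptr;
+//     }
+//     void deallocate(void* buffer) const override { cudaFree(buffer); }
+//   };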
+
+#if !defined(__GCUDACC__) && !defined(__GCUDACC_HOST__)
+
+// This defines an interface that GPUDevice can take to use
+// CUDA streams underneath.
+class StreamInterface {
+ public:
+ virtual ~StreamInterface() {}
+
+ virtual const cudaStream_t& stream() const = 0;
+ virtual const cudaDeviceProp& deviceProperties() const = 0;
+
+ // Allocate memory on the actual device where the computation will run
+ virtual void* allocate(size_t num_bytes) const = 0;
+ virtual void deallocate(void* buffer) const = 0;
+};
+
+static cudaDeviceProp* m_deviceProperties;
+static bool m_devicePropInitialized = false;
+static tensorflow::mutex m_devicePropInitMutex(tensorflow::LINKER_INITIALIZED);
+
+static void initializeDeviceProp() {
+ if (!m_devicePropInitialized) {
+ tensorflow::mutex_lock l(m_devicePropInitMutex);
+ if (!m_devicePropInitialized) {
+ int num_devices;
+ cudaError_t status = cudaGetDeviceCount(&num_devices);
+ eigen_check(status == cudaSuccess);
+ m_deviceProperties = new cudaDeviceProp[num_devices];
+ for (int i = 0; i < num_devices; ++i) {
+ status = cudaGetDeviceProperties(&m_deviceProperties[i], i);
+ eigen_check(status == cudaSuccess);
+ }
+ m_devicePropInitialized = true;
+ }
+ }
+}
+
+static const cudaStream_t default_stream = cudaStreamDefault;
+
+class CudaStreamDevice : public StreamInterface {
+ public:
+ // Use the default stream on the current device
+ CudaStreamDevice() : stream_(&default_stream) {
+ cudaGetDevice(&device_);
+ initializeDeviceProp();
+ }
+ // Use the default stream on the specified device
+ CudaStreamDevice(int device) : stream_(&default_stream), device_(device) {
+ initializeDeviceProp();
+ }
+ // Use the specified stream. Note that it's the
+ // caller's responsibility to ensure that the stream can run on
+ // the specified device. If no device is specified, the code
+ // assumes that the stream is associated with the current gpu device.
+ CudaStreamDevice(const cudaStream_t* stream, int device = -1)
+ : stream_(stream), device_(device) {
+ if (device < 0) {
+ cudaGetDevice(&device_);
+ } else {
+ int num_devices;
+ cudaError_t err = cudaGetDeviceCount(&num_devices);
+ eigen_check(err == cudaSuccess);
+ eigen_check(device < num_devices);
+ device_ = device;
+ }
+ initializeDeviceProp();
+ }
+
+ const cudaStream_t& stream() const { return *stream_; }
+ const cudaDeviceProp& deviceProperties() const {
+ return m_deviceProperties[device_];
+ }
+ virtual void* allocate(size_t num_bytes) const {
+ cudaError_t err = cudaSetDevice(device_);
+ eigen_check(err == cudaSuccess);
+ void* result;
+ err = cudaMalloc(&result, num_bytes);
+ eigen_check(err == cudaSuccess);
+ eigen_check(result != NULL);
+ return result;
+ }
+ virtual void deallocate(void* buffer) const {
+ cudaError_t err = cudaSetDevice(device_);
+ eigen_check(err == cudaSuccess);
+ assert(buffer != NULL);
+ err = cudaFree(buffer);
+ assert(err == cudaSuccess);
+ }
+
+ private:
+ const cudaStream_t* stream_;
+ int device_;
+};
+
+static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
+ cudaError_t status = cudaDeviceSetSharedMemConfig(config);
+ eigen_check(status == cudaSuccess);
+}
+
+struct GpuDevice {
+ // Neither the cuda stream nor the allocator is owned: the caller is
+ // responsible for their initialization and eventual destruction.
+ explicit GpuDevice(const StreamInterface* stream) : stream_(stream) {
+ eigen_assert(stream);
+ }
+
+ // TODO(bsteiner): This is an internal API, we should not expose it.
+ EIGEN_STRONG_INLINE const cudaStream_t& stream() const {
+ return stream_->stream();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+#ifndef __CUDA_ARCH__
+ return stream_->allocate(num_bytes);
+#else
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
+ return NULL;
+#endif
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
+#ifndef __CUDA_ARCH__
+ stream_->deallocate(buffer);
+#else
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+#ifndef __CUDA_ARCH__
+ cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,
+ stream_->stream());
+ assert(err == cudaSuccess);
+#else
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
+#ifndef __CUDA_ARCH__
+ cudaError_t err =
+ cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream());
+ assert(err == cudaSuccess);
+#else
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
+#ifndef __CUDA_ARCH__
+ cudaError_t err =
+ cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream());
+ assert(err == cudaSuccess);
+#else
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
+#ifndef __CUDA_ARCH__
+ cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream());
+ assert(err == cudaSuccess);
+#else
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
+ // FIXME
+ return 32;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t memcpyThreshold() const {
+ return 4 * 1024 * 1024;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
+ // FIXME
+ return 48*1024;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+ // We won't try to take advantage of the l2 cache for the time being, and
+ // there is no l3 cache on cuda devices.
+ return firstLevelCacheSize();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
+#ifndef __CUDA_ARCH__
+ cudaError_t err = cudaStreamSynchronize(stream_->stream());
+ assert(err == cudaSuccess);
+#else
+ assert(false && "The default device should be used instead to generate kernel code");
+#endif
+ }
+
+ inline int getNumCudaMultiProcessors() const {
+ return stream_->deviceProperties().multiProcessorCount;
+ }
+ inline int maxCudaThreadsPerBlock() const {
+ return stream_->deviceProperties().maxThreadsPerBlock;
+ }
+ inline int maxCudaThreadsPerMultiProcessor() const {
+ return stream_->deviceProperties().maxThreadsPerMultiProcessor;
+ }
+ inline int sharedMemPerBlock() const {
+ return stream_->deviceProperties().sharedMemPerBlock;
+ }
+ inline int majorDeviceVersion() const {
+ return stream_->deviceProperties().major;
+ }
+
+ // This function checks if the CUDA runtime recorded an error for the
+ // underlying stream device.
+ inline bool ok() const {
+ cudaError_t error = cudaStreamQuery(stream_->stream());
+ return (error == cudaSuccess) || (error == cudaErrorNotReady);
+ }
+
+ private:
+ const StreamInterface* stream_;
+};
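+
+// Illustrative usage sketch (not part of the original file): builds a
+// GpuDevice on top of a CudaStreamDevice, copies host data to the device and
+// waits for the asynchronous copy to finish. The element count is an example
+// value; assumes <vector> is available.
+//
+//   Eigen::CudaStreamDevice stream;   // default stream on the current device
+//   Eigen::GpuDevice device(&stream);
+//   const size_t bytes = 1024 * sizeof(float);
+//   std::vector<float> host(1024, 1.0f);
+//   float* dev_ptr = static_cast<float*>(device.allocate(bytes));
+//   device.memcpyHostToDevice(dev_ptr, host.data(), bytes);
+//   device.synchronize();
+//   device.deallocate(dev_ptr);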
+
+inline void assertCudaOk() {
+ cudaError_t err = cudaGetLastError();
+
+ assert(err != cudaErrorMissingConfiguration);
+ assert(err != cudaErrorMemoryAllocation);
+ assert(err != cudaErrorInitializationError);
+ assert(err != cudaErrorLaunchFailure);
+ assert(err != cudaErrorPriorLaunchFailure);
+ assert(err != cudaErrorLaunchTimeout);
+ assert(err != cudaErrorLaunchOutOfResources);
+ assert(err != cudaErrorInvalidDeviceFunction);
+ assert(err != cudaErrorInvalidConfiguration);
+ assert(err != cudaErrorInvalidDevice);
+ assert(err != cudaErrorInvalidValue);
+ assert(err != cudaErrorInvalidPitchValue);
+ assert(err != cudaErrorInvalidSymbol);
+ assert(err != cudaErrorMapBufferObjectFailed);
+ assert(err != cudaErrorUnmapBufferObjectFailed);
+ assert(err != cudaErrorInvalidHostPointer);
+ assert(err != cudaErrorInvalidDevicePointer);
+ assert(err != cudaErrorInvalidTexture);
+ assert(err != cudaErrorInvalidTextureBinding);
+ assert(err != cudaErrorInvalidChannelDescriptor);
+ assert(err != cudaErrorInvalidMemcpyDirection);
+ assert(err != cudaErrorAddressOfConstant);
+ assert(err != cudaErrorTextureFetchFailed);
+ assert(err != cudaErrorTextureNotBound);
+ assert(err != cudaErrorSynchronizationError);
+ assert(err != cudaErrorInvalidFilterSetting);
+ assert(err != cudaErrorInvalidNormSetting);
+ assert(err != cudaErrorMixedDeviceExecution);
+ assert(err != cudaErrorCudartUnloading);
+ assert(err != cudaErrorUnknown);
+ assert(err != cudaErrorNotYetImplemented);
+ assert(err != cudaErrorMemoryValueTooLarge);
+ assert(err != cudaErrorInvalidResourceHandle);
+ assert(err != cudaErrorNotReady);
+ assert(err != cudaErrorInsufficientDriver);
+ assert(err != cudaErrorSetOnActiveProcess);
+ assert(err != cudaErrorInvalidSurface);
+ assert(err != cudaErrorNoDevice);
+ assert(err != cudaErrorECCUncorrectable);
+ assert(err != cudaErrorSharedObjectSymbolNotFound);
+ assert(err != cudaErrorSharedObjectInitFailed);
+ assert(err != cudaErrorUnsupportedLimit);
+ assert(err != cudaErrorDuplicateVariableName);
+ assert(err != cudaErrorDuplicateTextureName);
+ assert(err != cudaErrorDuplicateSurfaceName);
+ assert(err != cudaErrorDevicesUnavailable);
+ assert(err != cudaErrorInvalidKernelImage);
+ assert(err != cudaErrorNoKernelImageForDevice);
+ assert(err != cudaErrorIncompatibleDriverContext);
+ assert(err != cudaErrorPeerAccessAlreadyEnabled);
+ assert(err != cudaErrorPeerAccessNotEnabled);
+ assert(err != cudaErrorDeviceAlreadyInUse);
+ assert(err != cudaErrorProfilerDisabled);
+ assert(err != cudaErrorProfilerNotInitialized);
+ assert(err != cudaErrorProfilerAlreadyStarted);
+ assert(err != cudaErrorProfilerAlreadyStopped);
+ assert(err != cudaErrorAssert);
+ assert(err != cudaErrorTooManyPeers);
+ assert(err != cudaErrorHostMemoryAlreadyRegistered);
+ assert(err != cudaErrorHostMemoryNotRegistered);
+ assert(err != cudaErrorOperatingSystem);
+ assert(err != cudaErrorStartupFailure);
+ assert(err != cudaErrorApiFailureBase);
+
+ // catch error types introduced after this function was written
+ assert(err == cudaSuccess);
+}
+
+#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, \
+ ...) \
+ do { \
+ (kernel)<<<(gridsize), (blocksize), (sharedmem), (device).stream()>>>( \
+ __VA_ARGS__); \
+ assertCudaOk(); \
+ } while (false)
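+
+// Illustrative usage sketch (not part of the original file): launches a
+// hypothetical kernel through the macro above. The kernel, grid/block sizes
+// and arguments are placeholders.
+//
+//   __global__ void scale_kernel(float* data, int n, float factor) {
+//     int i = blockIdx.x * blockDim.x + threadIdx.x;
+//     if (i < n) data[i] *= factor;
+//   }
+//
+//   // Given an Eigen::GpuDevice `device` and a device pointer `dev_ptr`:
+//   LAUNCH_CUDA_KERNEL(scale_kernel, /*gridsize=*/32, /*blocksize=*/256,
+//                      /*sharedmem=*/0, device, dev_ptr, 8192, 2.0f);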
+
+#else // __GCUDACC__
+
+// The following is the version of GpuDevice for StreamExecutor
+// (go/gpuexecutor), a GPU runtime that supports both CUDA and OpenCL.
+// StreamExecutor is being developed as an open-source replacement for the CUDA
+// runtime and is the runtime used when compiling with gcudacc. Differences
+// between the CUDA runtime and StreamExecutor are abstracted away behind
+// GpuDevice.
+
+// TODO(jpienaar): Temporary workaround until b/18409724 is addressed.
+enum cudaSharedMemConfig
+{
+ cudaSharedMemBankSizeDefault = 0,
+ cudaSharedMemBankSizeFourByte = 1,
+ cudaSharedMemBankSizeEightByte = 2
+};
+
+static inline void setCudaSharedMemConfig(cudaSharedMemConfig cache_config) {
+ // TODO(jpienaar): fix when implemented (b/18409724)
+}
+
+struct GpuDevice {
+ GpuDevice()
+ : stream_(perftools::gputools::MachineManager::singleton()->stream_for_device(0)),
+ allocator_(nullptr),
+ stream_exec_(stream_->parent()) {}
+
+ GpuDevice(perftools::gputools::Stream* stream,
+ const Allocator* alloc = nullptr)
+ : stream_(stream), allocator_(alloc), stream_exec_(stream_->parent()) { }
+
+ EIGEN_STRONG_INLINE perftools::gputools::Stream* stream() const {
+ return stream_;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+ if (allocator_ != nullptr) return allocator_->allocate(num_bytes);
+#ifndef __CUDA_ARCH__
+ perftools::gputools::DeviceMemory<char> mem =
+ stream_exec_->AllocateArray<char>(num_bytes);
+ return mem.opaque();
+#else
+ assert(false &&
+ "The default device should be used instead to generate kernel code");
+ return nullptr;
+#endif
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
+ if (allocator_ != nullptr) {
+ allocator_->deallocate(buffer);
+ return;
+ }
+#ifndef __CUDA_ARCH__
+ perftools::gputools::DeviceMemoryBase gpu_mem(buffer);
+ stream_exec_->Deallocate(&gpu_mem);
+#else
+ assert(false &&
+ "The default device should be used instead to generate kernel code");
+#endif
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src,
+ size_t n) const {
+#ifndef __CUDA_ARCH__
+ perftools::gputools::DeviceMemoryBase gpu_to(dst);
+ if (!stream_->ThenMemcpy(&gpu_to, perftools::gputools::DeviceMemoryBase(
+ const_cast<void*>(src)),
+ n).ok()) {
+ assert(false &&
+ "failed during enqueue of 'copy perftools::gputools to "
+ "perftools::gputools'");
+ }
+#else
+ assert(false &&
+ "The default device should be used instead to generate kernel code");
+#endif
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
+#ifndef __CUDA_ARCH__
+ perftools::gputools::DeviceMemoryBase gpu_to(dst);
+ if (!stream_->ThenMemcpy(&gpu_to, src, n).ok()) {
+ assert(false && "failed while enqueuing memcpy from host to device");
+ }
+#else
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
+#ifndef __CUDA_ARCH__
+ if (!stream_->ThenMemcpy(dst, perftools::gputools::DeviceMemoryBase(
+ const_cast<void*>(src)),
+ n).ok()) {
+ assert(false && "failed while enqueuing memcpy from device to host");
+ }
+#else
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+ }
+
+ EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
+#ifndef __CUDA_ARCH__
+ perftools::gputools::DeviceMemoryBase gpu_buffer{buffer};
+ if (!stream_exec_->Memset32(stream_, &gpu_buffer, c, n)) {
+ assert(false && "GPU memset failed.");
+ }
+#else
+ assert(false &&
+ "The default device should be used instead to generate kernel code");
+#endif
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
+ // FIXME
+ return 32;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t memcpyThreshold() const {
+ return 4 * 1024 * 1024;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
+ // FIXME
+ return 48*1024;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+ // We won't try to take advantage of the l2 cache for the time being, and
+ // there is no l3 cache on cuda devices.
+ return firstLevelCacheSize();
+ }
+
+ EIGEN_STRONG_INLINE void synchronize() const {
+ stream_->BlockHostUntilDone();
+ }
+
+ // A gpu::DeviceDescription is cached inside a StreamExecutor, so these calls
+ // aren't expensive/wasteful.
+ EIGEN_DEVICE_FUNC inline int getNumCudaMultiProcessors() const {
+ return stream_exec_->GetDeviceDescription().core_count();
+ }
+
+ EIGEN_DEVICE_FUNC inline int maxCudaThreadsPerBlock() const {
+ return stream_exec_->GetDeviceDescription().threads_per_block_limit();
+ }
+
+ EIGEN_DEVICE_FUNC inline int maxCudaThreadsPerMultiProcessor() const {
+ return stream_exec_->GetDeviceDescription().threads_per_core_limit();
+ }
+
+ EIGEN_DEVICE_FUNC inline int sharedMemPerBlock() const {
+ return stream_exec_->GetDeviceDescription().shared_memory_per_block();
+ }
+
+ EIGEN_DEVICE_FUNC inline int majorDeviceVersion() const {
+ int major, minor;
+ if (stream_exec_->GetDeviceDescription().cuda_compute_capability(&major,
+ &minor)) {
+ return major;
+ } else {
+ return 0;
+ }
+ }
+
+ inline bool ok() const { return stream_->ok(); }
+
+ private:
+ perftools::gputools::Stream* stream_;
+ perftools::gputools::StreamExecutor* stream_exec_;
+ const Allocator* allocator_;
+};
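+
+// Illustrative usage sketch (not part of the original file): under gcudacc the
+// GpuDevice wraps an existing StreamExecutor stream. The stream pointer below
+// stands for one obtained from the surrounding runtime.
+//
+//   perftools::gputools::Stream* stream = /* obtained from the runtime */;
+//   Eigen::GpuDevice device(stream);
+//   void* dev_ptr = device.allocate(1024);
+//   device.synchronize();
+//   device.deallocate(dev_ptr);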
+
+#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)\
+ (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
+ CHECK((device).stream()->ok());
+#endif // __GCUDACC__
+
+#endif // EIGEN_USE_GPU
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H