diff options
Diffstat (limited to 'third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h')
-rw-r--r-- | third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h | 920 |
1 files changed, 920 insertions, 0 deletions
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h new file mode 100644 index 0000000000..b6eeb73832 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h @@ -0,0 +1,920 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H + +namespace Eigen { + +// Default device for the machine (typically a single cpu core) +struct DefaultDevice { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return internal::aligned_malloc(num_bytes); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + internal::aligned_free(buffer); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { + ::memcpy(dst, src, n); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { + memcpy(dst, src, n); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { + memcpy(dst, src, n); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { + ::memset(buffer, c, n); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { +#ifndef __CUDA_ARCH__ + // Running on the host CPU + return 1; +#else + // Running on a CUDA device + return 32; +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t memcpyThreshold() const { + return 2 * numThreads(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { +#ifndef __CUDA_ARCH__ + // Running on the host CPU + return l1CacheSize(); +#else + // Running on a CUDA device, return the amount of shared memory available. + return 48*1024; +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { +#ifndef __CUDA_ARCH__ + // Running single threaded on the host CPU + return l3CacheSize(); +#else + // Running on a CUDA device + return firstLevelCacheSize(); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { +#ifndef __CUDA_ARCH__ + // Running single threaded on the host CPU + // Should return an enum that encodes the ISA supported by the CPU + return 1; +#else + // Running on a CUDA device + return __CUDA_ARCH__ / 100; +#endif + } +}; + +// Multiple cpu cores +#ifdef EIGEN_USE_THREADS + +#if __cplusplus > 199711 +// This defines an interface that ThreadPoolDevice can take to use +// custom thread pools underneath. +class ThreadPoolInterface { + public: + virtual void Schedule(std::function<void()> fn) = 0; + + virtual ~ThreadPoolInterface() {} +}; +#endif + +// The implementation of the ThreadPool type ensures that the Schedule method +// runs the functions it is provided in FIFO order when the scheduling is done +// by a single thread. +#ifdef EIGEN_USE_CUSTOM_THREAD_POOL +class ThreadPool : public ThreadPoolInterface { + public: + // Construct a pool that contains "num_threads" threads. + explicit ThreadPool(int num_threads) : threads_(num_threads), waiters_(num_threads) { + for (int i = 0; i < num_threads; i++) { + threads_.push_back(new std::thread([this]() { WorkerLoop(); })); + } + } + + // Wait until all scheduled work has finished and then destroy the + // set of threads. + ~ThreadPool() { + { + // Wait for all work to get done. + std::unique_lock<std::mutex> l(mu_); + while (!pending_.empty()) { + empty_.wait(l); + } + exiting_ = true; + + // Wakeup all waiters. + for (auto w : waiters_) { + w->ready = true; + w->work = nullptr; + w->cv.notify_one(); + } + } + + // Wait for threads to finish. + for (auto t : threads_) { + t->join(); + delete t; + } + } + + // Schedule fn() for execution in the pool of threads. The functions are + // executed in the order in which they are scheduled. + void Schedule(std::function<void()> fn) final { + std::unique_lock<std::mutex> l(mu_); + if (waiters_.empty()) { + pending_.push_back(fn); + } else { + Waiter* w = waiters_.back(); + waiters_.pop_back(); + w->ready = true; + w->work = fn; + w->cv.notify_one(); + } + } + + protected: + void WorkerLoop() { + std::unique_lock<std::mutex> l(mu_); + Waiter w; + while (!exiting_) { + std::function<void()> fn; + if (pending_.empty()) { + // Wait for work to be assigned to me + w.ready = false; + waiters_.push_back(&w); + while (!w.ready) { + w.cv.wait(l); + } + fn = w.work; + w.work = nullptr; + } else { + // Pick up pending work + fn = pending_.front(); + pending_.pop_front(); + if (pending_.empty()) { + empty_.notify_all(); + } + } + if (fn) { + mu_.unlock(); + fn(); + mu_.lock(); + } + } + } + + private: + struct Waiter { + std::condition_variable cv; + std::function<void()> work; + bool ready; + }; + + std::mutex mu_; + FixedSizeVector<std::thread*> threads_; // All threads + FixedSizeVector<Waiter*> waiters_; // Stack of waiting threads. + std::deque<std::function<void()>> pending_; // Queue of pending work + std::condition_variable empty_; // Signaled on pending_.empty() + bool exiting_ = false; +}; + + +// Notification is an object that allows a user to to wait for another +// thread to signal a notification that an event has occurred. +// +// Multiple threads can wait on the same Notification object. +// but only one caller must call Notify() on the object. +class Notification { + public: + Notification() : notified_(false) {} + ~Notification() {} + + void Notify() { + std::unique_lock<std::mutex> l(mu_); + eigen_assert(!notified_); + notified_ = true; + cv_.notify_all(); + } + + void WaitForNotification() { + std::unique_lock<std::mutex> l(mu_); + while (!notified_) { + cv_.wait(l); + } + } + + private: + std::mutex mu_; + std::condition_variable cv_; + bool notified_; +}; + +#else + +// Notification is an object that allows a user to to wait for another +// thread to signal a notification that an event has occurred. +// +// Multiple threads can wait on the same Notification object. +// but only one caller must call Notify() on the object. +class Notification { + public: + Notification() : notified_(false) {} + ~Notification() {} + + void Notify() { + tensorflow::mutex_lock l(mu_); + eigen_assert(!notified_); + notified_ = true; + cv_.notify_all(); + } + + void WaitForNotification() { + tensorflow::mutex_lock l(mu_); + while (!notified_) { + cv_.wait(l); + } + } + + private: + tensorflow::mutex mu_; + tensorflow::condition_variable cv_; + bool notified_; +}; +#endif + +// Runs an arbitrary function and then calls Notify() on the passed in +// Notification. +template <typename Function, typename... Args> struct FunctionWrapper +{ + static void run(Notification* n, Function f, Args... args) { + f(args...); + n->Notify(); + } +}; + +static EIGEN_STRONG_INLINE void wait_until_ready(Notification* n) { + if (n) { + n->WaitForNotification(); + } +} + + +struct MemcpyExecutor { + typedef MemcpyExecutor Self; + + MemcpyExecutor(void *dst, const void *src) : + m_dst(static_cast<char *>(dst)), m_src(static_cast<const char *>(src)) { } + + static EIGEN_STRONG_INLINE void run(const MemcpyExecutor* exec, size_t idx, size_t block_size) { + ::memcpy(&(exec->m_dst[idx]), &(exec->m_src[idx]), block_size); + } + + private: + char* m_dst; + const char* m_src; +}; + +struct MemsetExecutor { + typedef MemsetExecutor Self; + + MemsetExecutor(void *buffer, int val) : + m_buffer(static_cast<char *>(buffer)), m_val(val) { } + + static EIGEN_STRONG_INLINE void run(const MemsetExecutor* exec, size_t idx, size_t block_size) { + ::memset(&(exec->m_buffer[idx]), exec->m_val, block_size); + } + + private: + char* m_buffer; + const int m_val; +}; + + +struct ThreadPoolDevice { + // The ownership of the thread pool remains with the caller. + ThreadPoolDevice(ThreadPoolInterface* pool, size_t num_cores) + : pool_(pool), num_threads_(num_cores) {} + + EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return internal::aligned_malloc(num_bytes); + } + + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + internal::aligned_free(buffer); + } + + EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { +#ifdef __ANDROID__ + ::memcpy(dst, src, n); +#else + if (n <= 32768) { + ::memcpy(dst, src, n); + } else { + MemcpyExecutor memcpy_executor(dst, src); + execute(memcpy_executor, n); + } +#endif + } + + EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { + memcpy(dst, src, n); + } + + EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { + memcpy(dst, src, n); + } + + EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { +#ifdef __ANDROID__ + ::memset(buffer, c, n); +#else + if (n <= 32768) { + ::memset(buffer, c, n); + } else { + MemsetExecutor memset_executor(buffer, c); + execute(memset_executor, n); + } +#endif + } + + EIGEN_STRONG_INLINE size_t numThreads() const { + return num_threads_; + } + + EIGEN_STRONG_INLINE size_t memcpyThreshold() const { + return 2 * numThreads(); + } + + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { + return l1CacheSize(); + } + + EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { + // The l3 cache size is shared between all the cores. + return l3CacheSize() / num_threads_; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { + // Should return an enum that encodes the ISA supported by the CPU + return 1; + } + + template <class Function, class... Args> + EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const { + Notification* n = new Notification(); + std::function<void()> func = + std::bind(&FunctionWrapper<Function, Args...>::run, n, f, args...); + pool_->Schedule(func); + return n; + } + + template <class Function, class... Args> + EIGEN_STRONG_INLINE void enqueue_and_forget(Function&& f, Args&&... args) const { + std::function<void()> func = std::bind(f, args...); + pool_->Schedule(func); + } + + private: + template<typename Executor> + EIGEN_STRONG_INLINE void execute(const Executor& exec, size_t n) const { + // don't spawn a thread to process fewer than 1024 bytes (chosen by small amount of + // experimentation) + // TODO: make block_size a multiple of packet_size and align everything + const size_t block_size = numext::maxi(static_cast<size_t>(1024), n / numThreads()); + const size_t block_count = n / block_size; + eigen_assert(block_count <= numThreads()); + + FixedSizeVector<Notification*> results(block_count); + for (size_t block_idx = 0; block_idx < block_count; block_idx++) { + results.push_back(enqueue(&Executor::run, &exec, block_idx * block_size, block_size)); + } + + if (block_count * block_size < n) { + Executor::run(&exec, block_count * block_size, n - block_count * block_size); + } + + // wait for threads to finish + for (size_t block_idx = 0; block_idx < block_count; block_idx++) { + results[block_idx]->WaitForNotification(); + delete results[block_idx]; + } + } + + // todo: NUMA, ... + size_t num_threads_; + ThreadPoolInterface* pool_; +}; +#endif + + +// GPU offloading +#ifdef EIGEN_USE_GPU + +// An interface abstracting away device specific memory allocator. +class Allocator { + public: + virtual ~Allocator() {} + EIGEN_DEVICE_FUNC virtual void* allocate(size_t num_bytes) const = 0; + EIGEN_DEVICE_FUNC virtual void deallocate(void* buffer) const = 0; +}; + +#if !defined(__GCUDACC__) && !defined(__GCUDACC_HOST__) + +// This defines an interface that GPUDevice can take to use +// CUDA streams underneath. +class StreamInterface { + public: + virtual ~StreamInterface() {} + + virtual const cudaStream_t& stream() const = 0; + virtual const cudaDeviceProp& deviceProperties() const = 0; + + // Allocate memory on the actual device where the computation will run + virtual void* allocate(size_t num_bytes) const = 0; + virtual void deallocate(void* buffer) const = 0; +}; + +static cudaDeviceProp* m_deviceProperties; +static bool m_devicePropInitialized = false; +static tensorflow::mutex m_devicePropInitMutex(tensorflow::LINKER_INITIALIZED); + +static void initializeDeviceProp() { + if (!m_devicePropInitialized) { + tensorflow::mutex_lock l(m_devicePropInitMutex); + if (!m_devicePropInitialized) { + int num_devices; + cudaError_t status = cudaGetDeviceCount(&num_devices); + eigen_check(status == cudaSuccess); + m_deviceProperties = new cudaDeviceProp[num_devices]; + for (int i = 0; i < num_devices; ++i) { + status = cudaGetDeviceProperties(&m_deviceProperties[i], i); + eigen_check(status == cudaSuccess); + } + m_devicePropInitialized = true; + } + } +} + +static const cudaStream_t default_stream = cudaStreamDefault; + +class CudaStreamDevice : public StreamInterface { + public: + // Use the default stream on the current device + CudaStreamDevice() : stream_(&default_stream) { + cudaGetDevice(&device_); + initializeDeviceProp(); + } + // Use the default stream on the specified device + CudaStreamDevice(int device) : stream_(&default_stream), device_(device) { + initializeDeviceProp(); + } + // Use the specified stream. Note that it's the + // caller responsibility to ensure that the stream can run on + // the specified device. If no device is specified the code + // assumes that the stream is associated to the current gpu device. + CudaStreamDevice(const cudaStream_t* stream, int device = -1) + : stream_(stream), device_(device) { + if (device < 0) { + cudaGetDevice(&device_); + } else { + int num_devices; + cudaError_t err = cudaGetDeviceCount(&num_devices); + eigen_check(err == cudaSuccess); + eigen_check(device < num_devices); + device_ = device; + } + initializeDeviceProp(); + } + + const cudaStream_t& stream() const { return *stream_; } + const cudaDeviceProp& deviceProperties() const { + return m_deviceProperties[device_]; + } + virtual void* allocate(size_t num_bytes) const { + cudaError_t err = cudaSetDevice(device_); + eigen_check(err == cudaSuccess); + void* result; + err = cudaMalloc(&result, num_bytes); + eigen_check(err == cudaSuccess); + eigen_check(result != NULL); + return result; + } + virtual void deallocate(void* buffer) const { + cudaError_t err = cudaSetDevice(device_); + eigen_check(err == cudaSuccess); + assert(buffer != NULL); + err = cudaFree(buffer); + assert(err == cudaSuccess); + } + + private: + const cudaStream_t* stream_; + int device_; +}; + +static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) { + cudaError_t status = cudaDeviceSetSharedMemConfig(config); + eigen_check(status == cudaSuccess); +} + +struct GpuDevice { + // Neither the cudastream nor the allocator is not owned: the caller is + // responsible for their initialization and eventual destruction. + explicit GpuDevice(const StreamInterface* stream) : stream_(stream) { + eigen_assert(stream); + } + + // TODO(bsteiner): This is an internal API, we should not expose it. + EIGEN_STRONG_INLINE const cudaStream_t& stream() const { + return stream_->stream(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { +#ifndef __CUDA_ARCH__ + return stream_->allocate(num_bytes); +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); + return NULL; +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { +#ifndef __CUDA_ARCH__ + stream_->deallocate(buffer); +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { +#ifndef __CUDA_ARCH__ + cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, + stream_->stream()); + assert(err == cudaSuccess); +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { +#ifndef __CUDA_ARCH__ + cudaError_t err = + cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream()); + assert(err == cudaSuccess); +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { +#ifndef __CUDA_ARCH__ + cudaError_t err = + cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream()); + assert(err == cudaSuccess); +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { +#ifndef __CUDA_ARCH__ + cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream()); + assert(err == cudaSuccess); +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { + // FIXME + return 32; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t memcpyThreshold() const { + return 4 * 1024 * 1024; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { + // FIXME + return 48*1024; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { + // We won't try to take advantage of the l2 cache for the time being, and + // there is no l3 cache on cuda devices. + return firstLevelCacheSize(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { +#ifndef __CUDA_ARCH__ + cudaError_t err = cudaStreamSynchronize(stream_->stream()); + assert(err == cudaSuccess); +#else + assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + inline int getNumCudaMultiProcessors() const { + return stream_->deviceProperties().multiProcessorCount; + } + inline int maxCudaThreadsPerBlock() const { + return stream_->deviceProperties().maxThreadsPerBlock; + } + inline int maxCudaThreadsPerMultiProcessor() const { + return stream_->deviceProperties().maxThreadsPerMultiProcessor; + } + inline int sharedMemPerBlock() const { + return stream_->deviceProperties().sharedMemPerBlock; + } + inline int majorDeviceVersion() const { + return stream_->deviceProperties().major; + } + + // This function checks if the CUDA runtime recorded an error for the + // underlying stream device. + inline bool ok() const { + cudaError_t error = cudaStreamQuery(stream_->stream()); + return (error == cudaSuccess) || (error == cudaErrorNotReady); + } + + private: + const StreamInterface* stream_; +}; + +inline void assertCudaOk() { + cudaError_t err = cudaGetLastError(); + + assert(err != cudaErrorMissingConfiguration); + assert(err != cudaErrorMemoryAllocation); + assert(err != cudaErrorInitializationError); + assert(err != cudaErrorLaunchFailure); + assert(err != cudaErrorPriorLaunchFailure); + assert(err != cudaErrorLaunchTimeout); + assert(err != cudaErrorLaunchOutOfResources); + assert(err != cudaErrorInvalidDeviceFunction); + assert(err != cudaErrorInvalidConfiguration); + assert(err != cudaErrorInvalidDevice); + assert(err != cudaErrorInvalidValue); + assert(err != cudaErrorInvalidPitchValue); + assert(err != cudaErrorInvalidSymbol); + assert(err != cudaErrorMapBufferObjectFailed); + assert(err != cudaErrorUnmapBufferObjectFailed); + assert(err != cudaErrorInvalidHostPointer); + assert(err != cudaErrorInvalidDevicePointer); + assert(err != cudaErrorInvalidTexture); + assert(err != cudaErrorInvalidTextureBinding); + assert(err != cudaErrorInvalidChannelDescriptor); + assert(err != cudaErrorInvalidMemcpyDirection); + assert(err != cudaErrorAddressOfConstant); + assert(err != cudaErrorTextureFetchFailed); + assert(err != cudaErrorTextureNotBound); + assert(err != cudaErrorSynchronizationError); + assert(err != cudaErrorInvalidFilterSetting); + assert(err != cudaErrorInvalidNormSetting); + assert(err != cudaErrorMixedDeviceExecution); + assert(err != cudaErrorCudartUnloading); + assert(err != cudaErrorUnknown); + assert(err != cudaErrorNotYetImplemented); + assert(err != cudaErrorMemoryValueTooLarge); + assert(err != cudaErrorInvalidResourceHandle); + assert(err != cudaErrorNotReady); + assert(err != cudaErrorInsufficientDriver); + assert(err != cudaErrorSetOnActiveProcess); + assert(err != cudaErrorInvalidSurface); + assert(err != cudaErrorNoDevice); + assert(err != cudaErrorECCUncorrectable); + assert(err != cudaErrorSharedObjectSymbolNotFound); + assert(err != cudaErrorSharedObjectInitFailed); + assert(err != cudaErrorUnsupportedLimit); + assert(err != cudaErrorDuplicateVariableName); + assert(err != cudaErrorDuplicateTextureName); + assert(err != cudaErrorDuplicateSurfaceName); + assert(err != cudaErrorDevicesUnavailable); + assert(err != cudaErrorInvalidKernelImage); + assert(err != cudaErrorNoKernelImageForDevice); + assert(err != cudaErrorIncompatibleDriverContext); + assert(err != cudaErrorPeerAccessAlreadyEnabled); + assert(err != cudaErrorPeerAccessNotEnabled); + assert(err != cudaErrorDeviceAlreadyInUse); + assert(err != cudaErrorProfilerDisabled); + assert(err != cudaErrorProfilerNotInitialized); + assert(err != cudaErrorProfilerAlreadyStarted); + assert(err != cudaErrorProfilerAlreadyStopped); + assert(err != cudaErrorAssert); + assert(err != cudaErrorTooManyPeers); + assert(err != cudaErrorHostMemoryAlreadyRegistered); + assert(err != cudaErrorHostMemoryNotRegistered); + assert(err != cudaErrorOperatingSystem); + assert(err != cudaErrorStartupFailure); + assert(err != cudaErrorApiFailureBase); + + // catch errors types introduced after this function was written + assert(err == cudaSuccess); +} + +#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, \ + ...) \ + do { \ + (kernel)<<<(gridsize), (blocksize), (sharedmem), (device).stream()>>>( \ + __VA_ARGS__); \ + assertCudaOk(); \ + } while (false) + +#else // __GCUDACC__ + +// The following is the version of GpuDevice for StreamExecutor +// (go/gpuexecutor) a GPU runtime that supports both CUDA and OpenCL. +// StreamExecutor is being developed as an open-source replacement for the CUDA +// runtime and is the runtime used when compiling with gcudacc. Differences +// between the CUDA runtime and StreamExecutor are abstracted away behind +// GpuDevice. + +// TODO(jpienaar): Temporary workaround until b/18409724 is addressed. +enum cudaSharedMemConfig +{ + cudaSharedMemBankSizeDefault = 0, + cudaSharedMemBankSizeFourByte = 1, + cudaSharedMemBankSizeEightByte = 2 +}; + +static inline void setCudaSharedMemConfig(cudaSharedMemConfig cache_config) { + // TODO(jpienaar): fix when implemented (b/18409724) +} + +struct GpuDevice { + GpuDevice() + : stream_(perftools::gputools::MachineManager::singleton()->stream_for_device(0)), + allocator_(nullptr), + stream_exec_(stream_->parent()) {} + + GpuDevice(perftools::gputools::Stream* stream, + const Allocator* alloc = nullptr) + : stream_(stream), allocator_(alloc), stream_exec_(stream_->parent()) { } + + EIGEN_STRONG_INLINE perftools::gputools::Stream* stream() const { + return stream_; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + if (allocator_ != nullptr) return allocator_->allocate(num_bytes); +#ifndef __CUDA_ARCH__ + perftools::gputools::DeviceMemory<char> mem = + stream_exec_->AllocateArray<char>(num_bytes); + return mem.opaque(); +#else + assert(false && + "The default device should be used instead to generate kernel code"); + return nullptr; +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + if (allocator_ != nullptr) { + allocator_->deallocate(buffer); + return; + } +#ifndef __CUDA_ARCH__ + perftools::gputools::DeviceMemoryBase gpu_mem(buffer); + stream_exec_->Deallocate(&gpu_mem); +#else + assert(false && + "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, + size_t n) const { +#ifndef __CUDA_ARCH__ + perftools::gputools::DeviceMemoryBase gpu_to(dst); + if (!stream_->ThenMemcpy(&gpu_to, perftools::gputools::DeviceMemoryBase( + const_cast<void*>(src)), + n).ok()) { + assert(false && + "failed during enqueue of 'copy perftools::gputools to " + "perftools::gputools'"); + } +#else + assert(false && + "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { +#ifndef __CUDA_ARCH__ + perftools::gputools::DeviceMemoryBase gpu_to(dst); + if (!stream_->ThenMemcpy(&gpu_to, src, n).ok()) { + assert(false && "failed while enqueuing memcpy from host to device"); + } +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { +#ifndef __CUDA_ARCH__ + if (!stream_->ThenMemcpy(dst, perftools::gputools::DeviceMemoryBase( + const_cast<void*>(src)), + n).ok()) { + assert(false && "failed while enqueuing memcpy from device to host"); + } +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { +#ifndef __CUDA_ARCH__ + perftools::gputools::DeviceMemoryBase gpu_buffer{buffer}; + if (!stream_exec_->Memset32(stream_, &gpu_buffer, c, n)) { + assert(false && "GPU memset failed."); + } +#else + assert(false && + "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { + // FIXME + return 32; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t memcpyThreshold() const { + return 4 * 1024 * 1024; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { + // FIXME + return 48*1024; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { + // We won't try to take advantage of the l2 cache for the time being, and + // there is no l3 cache on cuda devices. + return firstLevelCacheSize(); + } + + EIGEN_STRONG_INLINE void synchronize() const { + stream_->BlockHostUntilDone(); + } + + // A gpu::DeviceDescription is cached inside a StreamExecutor, so these calls + // aren't expensive/wasteful. + EIGEN_DEVICE_FUNC inline int getNumCudaMultiProcessors() const { + return stream_exec_->GetDeviceDescription().core_count(); + } + + EIGEN_DEVICE_FUNC inline int maxCudaThreadsPerBlock() const { + return stream_exec_->GetDeviceDescription().threads_per_block_limit(); + } + + EIGEN_DEVICE_FUNC inline int maxCudaThreadsPerMultiProcessor() const { + return stream_exec_->GetDeviceDescription().threads_per_core_limit(); + } + + EIGEN_DEVICE_FUNC inline int sharedMemPerBlock() const { + return stream_exec_->GetDeviceDescription().shared_memory_per_block(); + } + + EIGEN_DEVICE_FUNC inline int majorDeviceVersion() const { + int major, minor; + if (stream_exec_->GetDeviceDescription().cuda_compute_capability(&major, + &minor)) { + return major; + } else { + return 0; + } + } + + inline bool ok() const { return stream_->ok(); } + + private: + perftools::gputools::Stream* stream_; + perftools::gputools::StreamExecutor* stream_exec_; + const Allocator* allocator_; +}; + +#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)\ + (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ + CHECK((device).stream()->ok()); +#endif // __GCUDACC__ + +#endif // EIGEN_USE_GPU +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H |