diff options
Diffstat (limited to 'tensorflow/stream_executor/cuda/cuda_gpu_executor.h')
-rw-r--r-- | tensorflow/stream_executor/cuda/cuda_gpu_executor.h | 270 |
1 file changed, 270 insertions, 0 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h new file mode 100644 index 0000000000..fda89b9738 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h @@ -0,0 +1,270 @@ +// The CUDA implementation of the StreamExecutorInterface functionality. +// CUDA inclusions are ideally confined to this implementation file. +// +// The notions from the StreamExecutor basically correspond to the CUDA streams +// programming model provided by the libcuda.so driver APIs, so we don't have +// to do much more than wrap the calls to the libraries appropriately. +#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_ +#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_ + +#include <map> +#include <set> + +#include "tensorflow/stream_executor/cuda/cuda_kernel.h" +#include "tensorflow/stream_executor/event.h" +#include "tensorflow/stream_executor/lib/status.h" +#include "tensorflow/stream_executor/lib/statusor.h" +#include "tensorflow/stream_executor/platform.h" +#include "tensorflow/stream_executor/platform/mutex.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/platform/thread_annotations.h" +#include "tensorflow/stream_executor/stream_executor_internal.h" + +namespace perftools { +namespace gputools { +namespace blas { +class BlasSupport; +} +namespace internal { +class RngSupport; +} // namespace internal +} // namespace gputools +} // namespace perftools + +namespace perftools { +namespace gputools { +namespace cuda { + +// CUDA-platform implementation of the platform-agnostic +// StreamExecutorInferface. +class CUDAExecutor : public internal::StreamExecutorInterface { + public: + // sub_platform indicates the subplatform used in this executor; it must + // be a CUDA type. 
+ explicit CUDAExecutor(const PluginConfig &plugin_config) + : device_(0), + context_(nullptr), + device_ordinal_(0), + cc_major_(0), + cc_minor_(0), + plugin_config_(plugin_config) {} + + // See the corresponding StreamExecutor methods for method comments on the + // following overrides. + + ~CUDAExecutor() override; + + port::Status Init(int device_ordinal, DeviceOptions device_options) override; + + bool GetKernel(const MultiKernelLoaderSpec &spec, + KernelBase *kernel) override; + + bool Launch(Stream *stream, const ThreadDim &thread_dims, + const BlockDim &block_dims, const KernelBase &k, + const std::vector<KernelArg> &args) override; + + void *Allocate(uint64 size) override; + + void *AllocateSubBuffer(DeviceMemoryBase *mem, uint64 offset_bytes, + uint64 size_bytes) override; + + void Deallocate(DeviceMemoryBase *mem) override; + + // CUDA allocation/registration functions are necessary because the driver + // internally sets up buffers for DMA operations (and page locks them). + // There's no external interface for us to otherwise control these DMA + // settings. 
+ void *HostMemoryAllocate(uint64 size) override { + return CUDADriver::HostAllocate(context_, size); + } + + void HostMemoryDeallocate(void *location) override { + return CUDADriver::HostDeallocate(context_, location); + } + + bool HostMemoryRegister(void *location, uint64 size) override; + + bool HostMemoryUnregister(void *location) override; + + bool SynchronizeAllActivity() override; + + bool SynchronousMemZero(DeviceMemoryBase *location, uint64 size) override; + + bool SynchronousMemSet(DeviceMemoryBase *location, int value, + uint64 size) override; + + bool SynchronousMemcpy(DeviceMemoryBase *gpu_dst, const void *host_src, + uint64 size) override; + + bool SynchronousMemcpy(void *host_dst, const DeviceMemoryBase &gpu_src, + uint64 size) override; + + bool SynchronousMemcpyDeviceToDevice(DeviceMemoryBase *gpu_dst, + const DeviceMemoryBase &gpu_src, + uint64 size) override; + + bool MemZero(Stream *stream, DeviceMemoryBase *location, + uint64 size) override; + bool Memset32(Stream *stream, DeviceMemoryBase *location, uint32 pattern, + uint64 size) override; + + bool Memcpy(Stream *stream, void *host_dst, const DeviceMemoryBase &gpu_src, + uint64 size) override; + + bool Memcpy(Stream *stream, DeviceMemoryBase *gpu_dst, const void *host_src, + uint64 size) override; + + bool MemcpyDeviceToDevice(Stream *stream, DeviceMemoryBase *gpu_dst, + const DeviceMemoryBase &gpu_src, + uint64 size) override; + + bool HostCallback(Stream *stream, std::function<void()> callback) override; + + bool AllocateStream(Stream *stream) override; + + void DeallocateStream(Stream *stream) override; + + bool CreateStreamDependency(Stream *dependent, Stream *other) override; + + bool AllocateTimer(Timer *timer) override; + + void DeallocateTimer(Timer *timer) override; + + bool StartTimer(Stream *stream, Timer *timer) override; + + bool StopTimer(Stream *stream, Timer *timer) override; + + port::Status AllocateEvent(Event *event) override; + + port::Status DeallocateEvent(Event *event) 
override; + + port::Status RecordEvent(Stream *stream, Event *event) override; + + port::Status WaitForEvent(Stream *stream, Event *event) override; + + Event::Status PollForEventStatus(Event *event) override; + + bool BlockHostUntilDone(Stream *stream) override; + + int PlatformDeviceCount() override { return CUDADriver::GetDeviceCount(); } + + port::Status EnablePeerAccessTo(StreamExecutorInterface *other) override; + + bool CanEnablePeerAccessTo(StreamExecutorInterface *other) override; + + SharedMemoryConfig GetDeviceSharedMemoryConfig() override; + + port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override; + + bool DeviceMemoryUsage(int64 *free, int64 *total) const override; + + // Search for the symbol and returns a device pointer and size. + // Returns false if symbol does not exist. + bool GetSymbol(const string& symbol_name, void **mem, size_t *bytes) override; + + DeviceDescription *PopulateDeviceDescription() const override; + + // Populates the block_dim_limit by querying the device driver API. If an + // error occurs at any point while asking the driver for block dim limits, it + // will be only partially populated as a result, and an error will be logged. + bool FillBlockDimLimit(BlockDim *block_dim_limit) const; + + KernelArg DeviceMemoryToKernelArg( + const DeviceMemoryBase &gpu_mem) const override; + + bool SupportsBlas() const override; + + blas::BlasSupport *CreateBlas() override; + + bool SupportsFft() const override; + + fft::FftSupport *CreateFft() override; + + bool SupportsRng() const override; + + rng::RngSupport *CreateRng() override; + + bool SupportsDnn() const override; + + dnn::DnnSupport *CreateDnn() override; + + void *CudaContextHack() override; + + CUcontext cuda_context(); + + private: + // Attempts to find a more specific version of the file indicated by + // filename by looking for compute-capability-specific suffixed versions; i.e. 
+ // looking for "foo.ptx" will check to see if "foo.ptx.cc30.ptx" is present if + // we're on a compute capability 3.0 machine. + bool FindOnDiskForComputeCapability(port::StringPiece filename, + port::StringPiece canonical_suffix, + string *found_filename) const; + + // Host callback landing routine invoked by CUDA. + // data: User-provided callback provided to HostCallback() above, captured + // as a std::function<void()>. Allocated/initialized inside + // HostCallback() and owned and deleted by this call. + static void InternalHostCallback(CUstream stream, CUresult status, + void *data); + + // Collects metadata for the specified kernel. + bool GetKernelMetadata(CUDAKernel *cuda_kernel, + KernelMetadata *kernel_metadata); + + // Determines if the given kernel's occupancy could be improved by only + // slightly reducing its register usage. If so, a message is emitted to the + // INFO log. The warning threshold is controlled by the flag + // register_occupancy_warning_threshold. + void OccupancyCheck(const KernelBase &kernel, const ThreadDim &thread_dims, + const BlockDim &block_dims); + + // Guards the on-disk-module mapping. + mutex disk_modules_mu_; + + // Mapping from filename to CUmodule, if it was already retrieved. + // Multiple CUfunctions are usually obtained from a single CUmodule so we + // attempt to hit in this mapping first, before retrieving it. + std::map<string, CUmodule> disk_modules_ GUARDED_BY(disk_modules_mu_); + + // Guards the in-memory-module mapping. + mutex in_memory_modules_mu_; + + std::map<const char *, CUmodule> in_memory_modules_ + GUARDED_BY(in_memory_modules_mu_); + + // Guards the launched kernel set. + mutex launched_kernels_mu_; + + // Keeps track of the set of launched kernels. Currently used to suppress the + // occupancy check on subsequent launches. + std::set<CUfunction> launched_kernels_ GUARDED_BY(launched_kernels_mu_); + + // Handle for the CUDA device being operated on. Immutable + // post-initialization. 
+ CUdevice device_; + + // Handle for session with the library/driver. Immutable post-initialization. + CUcontext context_; + + // The device ordinal value that this executor was initialized with; recorded + // for use in getting device metadata. Immutable post-initialization. + int device_ordinal_; + + // The major verion of the compute capability for device_. + int cc_major_; + + // The minor verion of the compute capability for device_. + int cc_minor_; + + // The plugin configuration associated with this instance. + PluginConfig plugin_config_; + + SE_DISALLOW_COPY_AND_ASSIGN(CUDAExecutor); +}; + +} // namespace cuda +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_ |