Diffstat (limited to 'tensorflow/stream_executor/cuda/cuda_gpu_executor.h')
-rw-r--r--  tensorflow/stream_executor/cuda/cuda_gpu_executor.h  270
1 file changed, 270 insertions(+), 0 deletions(-)
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
new file mode 100644
index 0000000000..fda89b9738
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
@@ -0,0 +1,270 @@
+// The CUDA implementation of the StreamExecutorInterface functionality.
+// CUDA inclusions are ideally confined to this implementation file.
+//
+// The notions from the StreamExecutor correspond closely to the CUDA streams
+// programming model provided by the libcuda.so driver APIs, so we don't have
+// to do much more than wrap the driver calls appropriately.
+#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_
+#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_
+
+#include <map>
+#include <set>
+
+#include "tensorflow/stream_executor/cuda/cuda_kernel.h"
+#include "tensorflow/stream_executor/event.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
+#include "tensorflow/stream_executor/platform.h"
+#include "tensorflow/stream_executor/platform/mutex.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/platform/thread_annotations.h"
+#include "tensorflow/stream_executor/stream_executor_internal.h"
+
+namespace perftools {
+namespace gputools {
+namespace blas {
+class BlasSupport;
+}
+namespace internal {
+class RngSupport;
+} // namespace internal
+} // namespace gputools
+} // namespace perftools
+
+namespace perftools {
+namespace gputools {
+namespace cuda {
+
+// CUDA-platform implementation of the platform-agnostic
+// StreamExecutorInterface.
+class CUDAExecutor : public internal::StreamExecutorInterface {
+ public:
+  // plugin_config specifies which plugin implementations (e.g. BLAS, FFT,
+  // RNG, DNN) this executor should prefer.
+ explicit CUDAExecutor(const PluginConfig &plugin_config)
+ : device_(0),
+ context_(nullptr),
+ device_ordinal_(0),
+ cc_major_(0),
+ cc_minor_(0),
+ plugin_config_(plugin_config) {}
+
+ // See the corresponding StreamExecutor methods for method comments on the
+ // following overrides.
+
+ ~CUDAExecutor() override;
+
+ port::Status Init(int device_ordinal, DeviceOptions device_options) override;
+
+ bool GetKernel(const MultiKernelLoaderSpec &spec,
+ KernelBase *kernel) override;
+
+ bool Launch(Stream *stream, const ThreadDim &thread_dims,
+ const BlockDim &block_dims, const KernelBase &k,
+ const std::vector<KernelArg> &args) override;
+
+ void *Allocate(uint64 size) override;
+
+ void *AllocateSubBuffer(DeviceMemoryBase *mem, uint64 offset_bytes,
+ uint64 size_bytes) override;
+
+ void Deallocate(DeviceMemoryBase *mem) override;
+
+ // CUDA allocation/registration functions are necessary because the driver
+ // internally sets up buffers for DMA operations (and page locks them).
+ // There's no external interface for us to otherwise control these DMA
+ // settings.
+ void *HostMemoryAllocate(uint64 size) override {
+ return CUDADriver::HostAllocate(context_, size);
+ }
+
+ void HostMemoryDeallocate(void *location) override {
+ return CUDADriver::HostDeallocate(context_, location);
+ }
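+
+  // A minimal sketch of the driver call this presumably wraps (illustrative
+  // only; the real wrapper lives in CUDADriver and also activates context_).
+  // cuMemHostAlloc page-locks the buffer so the device can DMA to and from
+  // it directly:
+  //
+  //   void *ptr = nullptr;
+  //   CUresult res = cuMemHostAlloc(&ptr, size, CU_MEMHOSTALLOC_PORTABLE);
+  //   return res == CUDA_SUCCESS ? ptr : nullptr;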
+
+ bool HostMemoryRegister(void *location, uint64 size) override;
+
+ bool HostMemoryUnregister(void *location) override;
+
+ bool SynchronizeAllActivity() override;
+
+ bool SynchronousMemZero(DeviceMemoryBase *location, uint64 size) override;
+
+ bool SynchronousMemSet(DeviceMemoryBase *location, int value,
+ uint64 size) override;
+
+ bool SynchronousMemcpy(DeviceMemoryBase *gpu_dst, const void *host_src,
+ uint64 size) override;
+
+ bool SynchronousMemcpy(void *host_dst, const DeviceMemoryBase &gpu_src,
+ uint64 size) override;
+
+ bool SynchronousMemcpyDeviceToDevice(DeviceMemoryBase *gpu_dst,
+ const DeviceMemoryBase &gpu_src,
+ uint64 size) override;
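+
+  // Illustrative sketch (not the actual implementation): each synchronous
+  // copy presumably reduces to the matching libcuda call; e.g. the
+  // host-to-device variant maps onto cuMemcpyHtoD:
+  //
+  //   CUdeviceptr dst = reinterpret_cast<CUdeviceptr>(gpu_dst->opaque());
+  //   return cuMemcpyHtoD(dst, host_src, size) == CUDA_SUCCESS;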
+
+ bool MemZero(Stream *stream, DeviceMemoryBase *location,
+ uint64 size) override;
+ bool Memset32(Stream *stream, DeviceMemoryBase *location, uint32 pattern,
+ uint64 size) override;
+
+ bool Memcpy(Stream *stream, void *host_dst, const DeviceMemoryBase &gpu_src,
+ uint64 size) override;
+
+ bool Memcpy(Stream *stream, DeviceMemoryBase *gpu_dst, const void *host_src,
+ uint64 size) override;
+
+ bool MemcpyDeviceToDevice(Stream *stream, DeviceMemoryBase *gpu_dst,
+ const DeviceMemoryBase &gpu_src,
+ uint64 size) override;
+
+ bool HostCallback(Stream *stream, std::function<void()> callback) override;
+
+ bool AllocateStream(Stream *stream) override;
+
+ void DeallocateStream(Stream *stream) override;
+
+ bool CreateStreamDependency(Stream *dependent, Stream *other) override;
+
+ bool AllocateTimer(Timer *timer) override;
+
+ void DeallocateTimer(Timer *timer) override;
+
+ bool StartTimer(Stream *stream, Timer *timer) override;
+
+ bool StopTimer(Stream *stream, Timer *timer) override;
+
+ port::Status AllocateEvent(Event *event) override;
+
+ port::Status DeallocateEvent(Event *event) override;
+
+ port::Status RecordEvent(Stream *stream, Event *event) override;
+
+ port::Status WaitForEvent(Stream *stream, Event *event) override;
+
+ Event::Status PollForEventStatus(Event *event) override;
+
+ bool BlockHostUntilDone(Stream *stream) override;
+
+ int PlatformDeviceCount() override { return CUDADriver::GetDeviceCount(); }
+
+ port::Status EnablePeerAccessTo(StreamExecutorInterface *other) override;
+
+ bool CanEnablePeerAccessTo(StreamExecutorInterface *other) override;
+
+ SharedMemoryConfig GetDeviceSharedMemoryConfig() override;
+
+ port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override;
+
+ bool DeviceMemoryUsage(int64 *free, int64 *total) const override;
+
+  // Searches for the named symbol and, if found, returns its device pointer
+  // and size via the out-parameters. Returns false if the symbol does not
+  // exist.
+ bool GetSymbol(const string& symbol_name, void **mem, size_t *bytes) override;
+
+ DeviceDescription *PopulateDeviceDescription() const override;
+
+  // Populates block_dim_limit by querying the device driver API. If an error
+  // occurs while querying the driver, block_dim_limit may be only partially
+  // populated, and an error will be logged.
+ bool FillBlockDimLimit(BlockDim *block_dim_limit) const;
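+
+  // The query presumably goes through cuDeviceGetAttribute, roughly as
+  // follows (a sketch for illustration, not the actual code):
+  //
+  //   int x = 0;
+  //   if (cuDeviceGetAttribute(&x, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
+  //                            device_) == CUDA_SUCCESS) {
+  //     block_dim_limit->x = x;
+  //   } else {
+  //     LOG(ERROR) << "failed to query max block dim x";
+  //   }
+  //   // ...and likewise for CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_{Y,Z}.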
+
+ KernelArg DeviceMemoryToKernelArg(
+ const DeviceMemoryBase &gpu_mem) const override;
+
+ bool SupportsBlas() const override;
+
+ blas::BlasSupport *CreateBlas() override;
+
+ bool SupportsFft() const override;
+
+ fft::FftSupport *CreateFft() override;
+
+ bool SupportsRng() const override;
+
+ rng::RngSupport *CreateRng() override;
+
+ bool SupportsDnn() const override;
+
+ dnn::DnnSupport *CreateDnn() override;
+
+ void *CudaContextHack() override;
+
+ CUcontext cuda_context();
+
+ private:
+  // Attempts to find a more specific version of the file indicated by
+  // filename by looking for compute-capability-suffixed versions; e.g.
+  // a search for "foo.ptx" will check whether "foo.ptx.cc30.ptx" is present
+  // if we're on a compute capability 3.0 machine.
+ bool FindOnDiskForComputeCapability(port::StringPiece filename,
+ port::StringPiece canonical_suffix,
+ string *found_filename) const;
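+
+  // For example (a sketch, assuming a port::StrCat-style helper is
+  // available): on a compute capability 3.0 device, the candidate name is
+  // built from the capability digits and checked for existence:
+  //
+  //   string candidate = port::StrCat(filename, ".cc", cc_major_, cc_minor_,
+  //                                   ".", canonical_suffix);
+  //   if (std::ifstream(candidate.c_str()).good()) {
+  //     *found_filename = candidate;
+  //     return true;
+  //   }
+  //   return false;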
+
+ // Host callback landing routine invoked by CUDA.
+  // data: the user-provided callback passed to HostCallback() above,
+  //   captured as a std::function<void()>. It is allocated/initialized
+  //   inside HostCallback() and owned and deleted by this routine.
+ static void InternalHostCallback(CUstream stream, CUresult status,
+ void *data);
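+
+  // A plausible shape for this plumbing (a sketch, assuming it forwards to
+  // cuStreamAddCallback; AsCUDAStreamValue is a hypothetical helper here):
+  //
+  //   bool HostCallback(Stream *stream, std::function<void()> callback) {
+  //     auto *owned = new std::function<void()>(std::move(callback));
+  //     return cuStreamAddCallback(AsCUDAStreamValue(stream),
+  //                                InternalHostCallback, owned,
+  //                                0 /* flags */) == CUDA_SUCCESS;
+  //   }
+  //
+  //   /* static */ void InternalHostCallback(CUstream stream, CUresult status,
+  //                                          void *data) {
+  //     auto *callback = static_cast<std::function<void()> *>(data);
+  //     (*callback)();
+  //     delete callback;  // Allocated in HostCallback(); deleted here.
+  //   }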
+
+ // Collects metadata for the specified kernel.
+ bool GetKernelMetadata(CUDAKernel *cuda_kernel,
+ KernelMetadata *kernel_metadata);
+
+ // Determines if the given kernel's occupancy could be improved by only
+ // slightly reducing its register usage. If so, a message is emitted to the
+ // INFO log. The warning threshold is controlled by the flag
+ // register_occupancy_warning_threshold.
+ void OccupancyCheck(const KernelBase &kernel, const ThreadDim &thread_dims,
+ const BlockDim &block_dims);
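+
+  // A sketch of the kind of query involved (illustrative only; the real
+  // heuristic is driven by register counts and the flag above, and cufunc /
+  // threads_per_block stand in for the launch being checked). The driver
+  // reports a kernel's register usage and achievable occupancy:
+  //
+  //   int regs = 0;
+  //   cuFuncGetAttribute(&regs, CU_FUNC_ATTRIBUTE_NUM_REGS, cufunc);
+  //   int blocks_per_sm = 0;
+  //   cuOccupancyMaxActiveBlocksPerMultiprocessor(
+  //       &blocks_per_sm, cufunc, threads_per_block, 0 /* dynamic smem */);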
+
+ // Guards the on-disk-module mapping.
+ mutex disk_modules_mu_;
+
+  // Mapping from filename to CUmodule, if it was already retrieved.
+  // Multiple CUfunctions are usually obtained from a single CUmodule, so we
+  // consult this mapping first before reloading a module from disk.
+ std::map<string, CUmodule> disk_modules_ GUARDED_BY(disk_modules_mu_);
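+
+  // The intended access pattern is a guarded lookup-or-load cache; a sketch
+  // (illustrative, not the actual code):
+  //
+  //   mutex_lock lock{disk_modules_mu_};
+  //   auto it = disk_modules_.find(filename);
+  //   if (it == disk_modules_.end()) {
+  //     CUmodule module;
+  //     if (cuModuleLoad(&module, filename.c_str()) != CUDA_SUCCESS) {
+  //       return false;
+  //     }
+  //     it = disk_modules_.insert({filename, module}).first;
+  //   }
+  //   // it->second now holds the cached CUmodule.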
+
+ // Guards the in-memory-module mapping.
+ mutex in_memory_modules_mu_;
+
+ std::map<const char *, CUmodule> in_memory_modules_
+ GUARDED_BY(in_memory_modules_mu_);
+
+ // Guards the launched kernel set.
+ mutex launched_kernels_mu_;
+
+ // Keeps track of the set of launched kernels. Currently used to suppress the
+ // occupancy check on subsequent launches.
+ std::set<CUfunction> launched_kernels_ GUARDED_BY(launched_kernels_mu_);
+
+ // Handle for the CUDA device being operated on. Immutable
+ // post-initialization.
+ CUdevice device_;
+
+ // Handle for session with the library/driver. Immutable post-initialization.
+ CUcontext context_;
+
+ // The device ordinal value that this executor was initialized with; recorded
+ // for use in getting device metadata. Immutable post-initialization.
+ int device_ordinal_;
+
+  // The major version of the compute capability for device_.
+ int cc_major_;
+
+  // The minor version of the compute capability for device_.
+ int cc_minor_;
+
+ // The plugin configuration associated with this instance.
+ PluginConfig plugin_config_;
+
+ SE_DISALLOW_COPY_AND_ASSIGN(CUDAExecutor);
+};
+
+} // namespace cuda
+} // namespace gputools
+} // namespace perftools
+
+#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_