diff options
Diffstat (limited to 'tensorflow/stream_executor/cuda/cuda_gpu_executor.h')
-rw-r--r-- | tensorflow/stream_executor/cuda/cuda_gpu_executor.h | 270 |
1 file changed, 270 insertions, 0 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h new file mode 100644 index 0000000000..fda89b9738 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h @@ -0,0 +1,270 @@ +// The CUDA implementation of the StreamExecutorInterface functionality. +// CUDA inclusions are ideally confined to this implementation file. +// +// The notions from the StreamExecutor basically correspond to the CUDA streams +// programming model provided by the libcuda.so driver APIs, so we don't have +// to do much more than wrap the calls to the libraries appropriately. +#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_ +#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_ + +#include <map> +#include <set> + +#include "tensorflow/stream_executor/cuda/cuda_kernel.h" +#include "tensorflow/stream_executor/event.h" +#include "tensorflow/stream_executor/lib/status.h" +#include "tensorflow/stream_executor/lib/statusor.h" +#include "tensorflow/stream_executor/platform.h" +#include "tensorflow/stream_executor/platform/mutex.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/platform/thread_annotations.h" +#include "tensorflow/stream_executor/stream_executor_internal.h" + +namespace perftools { +namespace gputools { +namespace blas { +class BlasSupport; +} +namespace internal { +class RngSupport; +} // namespace internal +} // namespace gputools +} // namespace perftools + +namespace perftools { +namespace gputools { +namespace cuda { + +// CUDA-platform implementation of the platform-agnostic +// StreamExecutorInferface. +class CUDAExecutor : public internal::StreamExecutorInterface { + public: + // sub_platform indicates the subplatform used in this executor; it must + // be a CUDA type. 
+ explicit CUDAExecutor(const PluginConfig &plugin_config) + : device_(0), + context_(nullptr), + device_ordinal_(0), + cc_major_(0), + cc_minor_(0), + plugin_config_(plugin_config) {} + + // See the corresponding StreamExecutor methods for method comments on the + // following overrides. + + ~CUDAExecutor() override; + + port::Status Init(int device_ordinal, DeviceOptions device_options) override; + + bool GetKernel(const MultiKernelLoaderSpec &spec, + KernelBase *kernel) override; + + bool Launch(Stream *stream, const ThreadDim &thread_dims, + const BlockDim &block_dims, const KernelBase &k, + const std::vector<KernelArg> &args) override; + + void *Allocate(uint64 size) override; + + void *AllocateSubBuffer(DeviceMemoryBase *mem, uint64 offset_bytes, + uint64 size_bytes) override; + + void Deallocate(DeviceMemoryBase *mem) override; + + // CUDA allocation/registration functions are necessary because the driver + // internally sets up buffers for DMA operations (and page locks them). + // There's no external interface for us to otherwise control these DMA + // settings. 
+ void *HostMemoryAllocate(uint64 size) override { + return CUDADriver::HostAllocate(context_, size); + } + + void HostMemoryDeallocate(void *location) override { + return CUDADriver::HostDeallocate(context_, location); + } + + bool HostMemoryRegister(void *location, uint64 size) override; + + bool HostMemoryUnregister(void *location) override; + + bool SynchronizeAllActivity() override; + + bool SynchronousMemZero(DeviceMemoryBase *location, uint64 size) override; + + bool SynchronousMemSet(DeviceMemoryBase *location, int value, + uint64 size) override; + + bool SynchronousMemcpy(DeviceMemoryBase *gpu_dst, const void *host_src, + uint64 size) override; + + bool SynchronousMemcpy(void *host_dst, const DeviceMemoryBase &gpu_src, + uint64 size) override; + + bool SynchronousMemcpyDeviceToDevice(DeviceMemoryBase *gpu_dst, + const DeviceMemoryBase &gpu_src, + uint64 size) override; + + bool MemZero(Stream *stream, DeviceMemoryBase *location, + uint64 size) override; + bool Memset32(Stream *stream, DeviceMemoryBase *location, uint32 pattern, + uint64 size) override; + + bool Memcpy(Stream *stream, void *host_dst, const DeviceMemoryBase &gpu_src, + uint64 size) override; + + bool Memcpy(Stream *stream, DeviceMemoryBase *gpu_dst, const void *host_src, + uint64 size) override; + + bool MemcpyDeviceToDevice(Stream *stream, DeviceMemoryBase *gpu_dst, + const DeviceMemoryBase &gpu_src, + uint64 size) override; + + bool HostCallback(Stream *stream, std::function<void()> callback) override; + + bool AllocateStream(Stream *stream) override; + + void DeallocateStream(Stream *stream) override; + + bool CreateStreamDependency(Stream *dependent, Stream *other) override; + + bool AllocateTimer(Timer *timer) override; + + void DeallocateTimer(Timer *timer) override; + + bool StartTimer(Stream *stream, Timer *timer) override; + + bool StopTimer(Stream *stream, Timer *timer) override; + + port::Status AllocateEvent(Event *event) override; + + port::Status DeallocateEvent(Event *event) 
override; + + port::Status RecordEvent(Stream *stream, Event *event) override; + + port::Status WaitForEvent(Stream *stream, Event *event) override; + + Event::Status PollForEventStatus(Event *event) override; + + bool BlockHostUntilDone(Stream *stream) override; + + int PlatformDeviceCount() override { return CUDADriver::GetDeviceCount(); } + + port::Status EnablePeerAccessTo(StreamExecutorInterface *other) override; + + bool CanEnablePeerAccessTo(StreamExecutorInterface *other) override; + + SharedMemoryConfig GetDeviceSharedMemoryConfig() override; + + port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override; + + bool DeviceMemoryUsage(int64 *free, int64 *total) const override; + + // Search for the symbol and returns a device pointer and size. + // Returns false if symbol does not exist. + bool GetSymbol(const string& symbol_name, void **mem, size_t *bytes) override; + + DeviceDescription *PopulateDeviceDescription() const override; + + // Populates the block_dim_limit by querying the device driver API. If an + // error occurs at any point while asking the driver for block dim limits, it + // will be only partially populated as a result, and an error will be logged. + bool FillBlockDimLimit(BlockDim *block_dim_limit) const; + + KernelArg DeviceMemoryToKernelArg( + const DeviceMemoryBase &gpu_mem) const override; + + bool SupportsBlas() const override; + + blas::BlasSupport *CreateBlas() override; + + bool SupportsFft() const override; + + fft::FftSupport *CreateFft() override; + + bool SupportsRng() const override; + + rng::RngSupport *CreateRng() override; + + bool SupportsDnn() const override; + + dnn::DnnSupport *CreateDnn() override; + + void *CudaContextHack() override; + + CUcontext cuda_context(); + + private: + // Attempts to find a more specific version of the file indicated by + // filename by looking for compute-capability-specific suffixed versions; i.e. 
+ // looking for "foo.ptx" will check to see if "foo.ptx.cc30.ptx" is present if + // we're on a compute capability 3.0 machine. + bool FindOnDiskForComputeCapability(port::StringPiece filename, + port::StringPiece canonical_suffix, + string *found_filename) const; + + // Host callback landing routine invoked by CUDA. + // data: User-provided callback provided to HostCallback() above, captured + // as a std::function<void()>. Allocated/initialized inside + // HostCallback() and owned and deleted by this call. + static void InternalHostCallback(CUstream stream, CUresult status, + void *data); + + // Collects metadata for the specified kernel. + bool GetKernelMetadata(CUDAKernel *cuda_kernel, + KernelMetadata *kernel_metadata); + + // Determines if the given kernel's occupancy could be improved by only + // slightly reducing its register usage. If so, a message is emitted to the + // INFO log. The warning threshold is controlled by the flag + // register_occupancy_warning_threshold. + void OccupancyCheck(const KernelBase &kernel, const ThreadDim &thread_dims, + const BlockDim &block_dims); + + // Guards the on-disk-module mapping. + mutex disk_modules_mu_; + + // Mapping from filename to CUmodule, if it was already retrieved. + // Multiple CUfunctions are usually obtained from a single CUmodule so we + // attempt to hit in this mapping first, before retrieving it. + std::map<string, CUmodule> disk_modules_ GUARDED_BY(disk_modules_mu_); + + // Guards the in-memory-module mapping. + mutex in_memory_modules_mu_; + + std::map<const char *, CUmodule> in_memory_modules_ + GUARDED_BY(in_memory_modules_mu_); + + // Guards the launched kernel set. + mutex launched_kernels_mu_; + + // Keeps track of the set of launched kernels. Currently used to suppress the + // occupancy check on subsequent launches. + std::set<CUfunction> launched_kernels_ GUARDED_BY(launched_kernels_mu_); + + // Handle for the CUDA device being operated on. Immutable + // post-initialization. 
+ CUdevice device_; + + // Handle for session with the library/driver. Immutable post-initialization. + CUcontext context_; + + // The device ordinal value that this executor was initialized with; recorded + // for use in getting device metadata. Immutable post-initialization. + int device_ordinal_; + + // The major verion of the compute capability for device_. + int cc_major_; + + // The minor verion of the compute capability for device_. + int cc_minor_; + + // The plugin configuration associated with this instance. + PluginConfig plugin_config_; + + SE_DISALLOW_COPY_AND_ASSIGN(CUDAExecutor); +}; + +} // namespace cuda +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_ |