// tensorflow/stream_executor/cuda/cuda_stream.h

// Defines the CUDAStream type - the CUDA-specific implementation of the generic
// StreamExecutor Stream interface.

#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_STREAM_H_
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_STREAM_H_

#include "tensorflow/stream_executor/cuda/cuda_driver.h"
#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"

namespace perftools {
namespace gputools {
namespace cuda {

// Forward declaration. Within this header CUDAExecutor is only referred to
// through pointers (the constructor argument, parent(), and parent_), so the
// declaration alone is enough for the code below.
class CUDAExecutor;

// Wraps a CUstream in order to satisfy the platform-independent
// StreamInterface.
//
// Thread-safe post-initialization.
class CUDAStream : public internal::StreamInterface {
 public:
  explicit CUDAStream(CUDAExecutor *parent)
      : parent_(parent), cuda_stream_(nullptr), completed_event_(nullptr) {}

  // Note: teardown is handled by a parent's call to DeallocateStream.
  ~CUDAStream() override {}

  void *CudaStreamHack() override { return cuda_stream_; }
  void **CudaStreamMemberHack() override {
    return reinterpret_cast<void **>(&cuda_stream_);
  }

  // Explicitly initialize the CUDA resources associated with this stream, used
  // by StreamExecutor::AllocateStream().
  bool Init();

  // Explicitly destroy the CUDA resources associated with this stream, used by
  // StreamExecutor::DeallocateStream().
  void Destroy();

  // Returns true if no work is pending or executing on the stream.
  bool IsIdle() const;

  // Retrieves an event which indicates that all work enqueued into the stream
  // has completed. Ownership of the event is not transferred to the caller, the
  // event is owned by this stream.
  bool GetOrCreateCompletedEvent(CUevent *completed_event);

  // Returns the CUstream value for passing to the CUDA API.
  //
  // Precond: this CUDAStream has been allocated (otherwise passing a nullptr
  // into the NVIDIA library causes difficult-to-understand faults).
  CUstream cuda_stream() const {
    DCHECK(cuda_stream_ != nullptr);
    return const_cast<CUstream>(cuda_stream_);
  }

  CUDAExecutor *parent() const { return parent_; }

 private:
  mutex mu_;              // mutex that guards the completion event.
  CUDAExecutor *parent_;  // Executor that spawned this stream.
  CUstream cuda_stream_;  // Wrapped CUDA stream handle.

  // Event that indicates this stream has completed.
  CUevent completed_event_ GUARDED_BY(mu_);
};

}  // namespace cuda
}  // namespace gputools
}  // namespace perftools

#endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_STREAM_H_