aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/stream_executor/device_memory.h
blob: 5a5334e0f5f6e8744b92188de14d7fea0f2ff9a0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Suite of types that represent device memory allocations. These are
// allocated by the StreamExecutor interface, which produces values appropriate
// for the underlying platform (whether it be CUDA or OpenCL).
//
// The untyped base class (like a device void*) is DeviceMemoryBase, which can
// be specialized for a given allocation type (like a device T*) using
// DeviceMemory<T>.

#ifndef TENSORFLOW_STREAM_EXECUTOR_DEVICE_MEMORY_H_
#define TENSORFLOW_STREAM_EXECUTOR_DEVICE_MEMORY_H_

#include <stddef.h>

#include <functional>

#include "tensorflow/stream_executor/lib/casts.h"
#include "tensorflow/stream_executor/platform/port.h"

namespace perftools {
namespace gputools {

// Temporarily pull stream_executor into perftools::gputools while we migrate
// code to the new namespace.  TODO(b/77980417): Remove this once we've
// completed the migration.
// NOTE(review): this using-directive requires that `namespace stream_executor`
// has already been declared -- presumably by platform/port.h above; confirm
// before reordering the includes.
using namespace stream_executor;  // NOLINT[build/namespaces]

}  // namespace gputools
}  // namespace perftools

namespace stream_executor {

class StreamExecutor;

// void*-analogous device memory allocation. For the typed variation, see
// DeviceMemory<T>.
//
// This is effectively a two-tuple of a pointer and size; however, note that the
// pointer may not be to the virtual address itself -- in OpenCL the pointer is
// to a cl_mem handle that describes the device allocation. Therefore,
// DeviceMemoryBase::opaque does not necessarily produce a pointer that can be
// referenced directly, so use it with caution.
//
// Thread-compatible.
class DeviceMemoryBase {
 public:
  // Default constructor instantiates a null-pointed, zero-sized device memory
  // region. An opaque pointer may be provided -- see header for details on the
  // opacity of that pointer.
  explicit DeviceMemoryBase(void *opaque = nullptr, uint64 size = 0,
                            bool is_sub_buffer = false)
      : opaque_(opaque), size_(size), is_sub_buffer_(is_sub_buffer) {}

  // Returns whether the backing memory is the null pointer.
  // A `== nullptr` convenience method is also provided.
  bool is_null() const { return opaque_ == nullptr; }
  bool operator==(std::nullptr_t other) const { return is_null(); }
  bool operator!=(std::nullptr_t other) const { return !is_null(); }

  // Provides a total order between device memory values so that this object
  // can be used as a key in an ordered map.
  //
  // std::less is used instead of the built-in `<` because relational
  // comparison of pointers into distinct allocations is unspecified;
  // std::less<const void *> is guaranteed to impose a strict total order
  // over all pointer values.
  bool operator<(const DeviceMemoryBase &other) const {
    return std::less<const void *>()(opaque(), other.opaque());
  }

  // Returns the size, in bytes, for the backing memory.
  uint64 size() const { return size_; }

  // Warning: note that the pointer returned is not necessarily directly to
  // device virtual address space, but is platform-dependent.
  void *opaque() { return opaque_; }
  const void *opaque() const { return opaque_; }

  // Returns true if this is an offset into another primary allocation.
  bool is_sub_buffer() const { return is_sub_buffer_; }

  // Returns whether the two DeviceMemoryBase segments are identical (both in
  // their opaque pointer and size). Note: the is_sub_buffer flag is not part
  // of this comparison.
  bool IsSameAs(const DeviceMemoryBase &other) const {
    return opaque() == other.opaque() && size() == other.size();
  }

 protected:
  friend class StreamExecutor;

  // Resets the internal values of the opaque pointer and number of bytes in
  // the memory region, just as in the constructor. Note: is_sub_buffer_ is
  // not modified here.
  void Reset(void *opaque, uint64 bytes) {
    opaque_ = opaque;
    size_ = bytes;
  }

 private:
  void *opaque_;  // Platform-dependent value representing allocated memory.
  uint64 size_;   // Size in bytes of this allocation.
  bool is_sub_buffer_;  // Is this a primary allocation or a sub-buffer?
};

// Typed wrapper around "void *"-like DeviceMemoryBase.
//
// For example, DeviceMemory<int> is a simple wrapper around DeviceMemoryBase
// that represents one or more integers in Device memory.
//
// Thread-compatible.
template <typename ElemT>
class DeviceMemory final : public DeviceMemoryBase {
 public:
  // Default constructor: a null-pointed, zero-sized region, mirroring the
  // DeviceMemoryBase default state.
  DeviceMemory() : DeviceMemoryBase(nullptr, 0) {}

  // Implicit conversion from nullptr, so a DeviceMemory<T> can be constructed
  // and compared like a pointer.
  DeviceMemory(std::nullptr_t) : DeviceMemory() {}

  // Constructs a typed view over an untyped region; effectively a cast from a
  // device void*. The const_cast exists only to reach the non-const opaque()
  // accessor -- the pointee itself is not mutated here.
  explicit DeviceMemory(const DeviceMemoryBase &other)
      : DeviceMemoryBase(const_cast<DeviceMemoryBase &>(other).opaque(),
                         other.size(), other.is_sub_buffer()) {}

  // Number of ElemT values that fit in this allocation (truncating integer
  // division).
  uint64 ElementCount() const {
    const uint64 byte_count = size();
    return byte_count / sizeof(ElemT);
  }

  // True iff this allocation holds exactly one ElemT.
  bool IsScalar() const { return ElementCount() == 1; }

  // Builds a typed region from an opaque pointer and a quantity of *bytes*
  // (not elements). Broken out as a named factory so that byte counts are
  // never silently confused with element counts at call sites.
  static DeviceMemory<ElemT> MakeFromByteSize(void *opaque, uint64 bytes) {
    return DeviceMemory<ElemT>(opaque, bytes);
  }

  // Clobbers the current state with a new opaque pointer and byte size, in
  // MakeFromByteSize fashion.
  void ResetFromByteSize(void *opaque, uint64 bytes) {
    // TODO(leary) when NVCC is eliminated we can add this check (and the
    // logging include it requires).
    // CHECK_EQ(0, bytes % sizeof(ElemT));
    DeviceMemoryBase::Reset(opaque, bytes);
  }

  // ------------------------------------------------------------

 protected:
  // Byte-size constructor, kept protected because a byte count could easily
  // be mistaken for an element count; external callers must go through
  // MakeFromByteSize to state the byte-size intent explicitly.
  DeviceMemory(void *opaque, uint64 size) : DeviceMemoryBase(opaque, size) {}
};

// A class to encapsulate the type and size of a dynamic shared memory
// buffer. Because the buffer exists solely on the device and is not copyable
// to the host, memory objects of this type do not maintain buffer pointers
// on the host.
template <typename ElemT>
class SharedDeviceMemory final : public DeviceMemoryBase {
 public:
  // Size, in bytes, of a single element of the buffer.
  static constexpr size_t kElemSize = sizeof(ElemT);

  // Describes a buffer of elem_count elements. The opaque pointer is always
  // null, since the buffer exists only on the device.
  explicit SharedDeviceMemory(uint64 elem_count)
      : DeviceMemoryBase(nullptr, elem_count * kElemSize) {}

  // Number of ElemT elements that constitute this allocation.
  uint64 ElementCount() const { return size() / kElemSize; }

  // True iff this allocation holds exactly one element.
  bool IsScalar() const { return ElementCount() == 1; }
};

// Similar to the typed DeviceMemory, but is the unique owner of its
// memory, if any. ScopedDeviceMemory is thread-compatible. It is also
// movable and uncopyable to represent unique ownership.
template <typename ElemT>
class ScopedDeviceMemory {
 public:
  // Default construction initializes the internal state to nullptr.  This
  // mirrors the std::unique_ptr<> functionality, where default construction
  // produces a nullptr unique_ptr, which can be assigned later.
  ScopedDeviceMemory();

  // Parameters:
  //  parent: Executor used to deallocate memory when this instance goes
  //          out of scope.
  //  value: Already-allocated device memory value for this scoped mechanism to
  //         deallocate. This memory must have been allocated by parent.
  ScopedDeviceMemory(StreamExecutor *parent, DeviceMemoryBase value);

  // Constructor overload that places a literal array into device memory.
  ScopedDeviceMemory(StreamExecutor *parent,
                     std::initializer_list<ElemT> values);

  // Moves ownership of the memory from other to the constructed
  // object.
  //
  // Postcondition: other == nullptr.
  ScopedDeviceMemory(ScopedDeviceMemory &&other) noexcept
      : ScopedDeviceMemory(other.parent_, other.Release()) {}

  // Releases the memory that was provided in the constructor, through the
  // "parent" StreamExecutor.
  ~ScopedDeviceMemory();

  // Moves ownership of the memory from other to this object, freeing any
  // memory this object currently owns (via Reset).
  //
  // Marked noexcept for consistency with the move constructor; deallocation
  // through the parent StreamExecutor is assumed not to throw.
  //
  // Postcondition: other == nullptr.
  ScopedDeviceMemory &operator=(ScopedDeviceMemory &&other) noexcept {
    if (this != &other) {  // Guard against self-move-assignment.
      Reset(other.Release());
      parent_ = other.parent_;
    }
    return *this;
  }

  // Returns the memory that backs this scoped allocation converted to
  // DeviceMemory<T> apparent type. This is useful for cases where the
  // DeviceMemory must be passed by const-ref, as the ScopedDeviceMemory doesn't
  // allow copying, for scoped-object-lifetime reasons.
  const DeviceMemory<ElemT> &cref() const { return wrapped_; }

  // Returns a pointer to the DeviceMemory<T> apparent type for use in mutable
  // operations. The value returned should not be used outside the scope of this
  // ScopedDeviceMemory object's lifetime.
  DeviceMemory<ElemT> *ptr() { return &wrapped_; }
  const DeviceMemory<ElemT> *ptr() const { return &wrapped_; }

  // Smart-pointer-like operators for the wrapped DeviceMemory.
  // This reference must not be used outside the lifetime of this
  // ScopedDeviceMemory.
  const DeviceMemory<ElemT> &operator*() const { return cref(); }
  DeviceMemory<ElemT> *operator->() { return ptr(); }
  const DeviceMemory<ElemT> *operator->() const { return ptr(); }
  bool operator==(std::nullptr_t other) const { return wrapped_.is_null(); }
  bool operator!=(std::nullptr_t other) const { return !wrapped_.is_null(); }

  // Analogous to std::unique_ptr::reset, frees the existing memory held in
  // this scoped memory container and replaces it with updated. Ownership
  // of updated is transferred to this object.
  void Reset(DeviceMemory<ElemT> updated);
  void Reset(std::nullptr_t);

  // Analogous to std::unique_ptr::release, releases ownership of the held
  // memory and transfers it to the caller without deallocating it.
  //
  // Postcondition: *this == nullptr
  DeviceMemory<ElemT> Release() {
    auto tmp = wrapped_;
    wrapped_.ResetFromByteSize(nullptr, 0);
    return tmp;
  }

 private:
  DeviceMemory<ElemT> wrapped_;  // Value we wrap with scoped-release.
  StreamExecutor *parent_;       // See constructor.

  SE_DISALLOW_COPY_AND_ASSIGN(ScopedDeviceMemory);
};

// Host-side representation of packed-and-aligned vector datatypes on the device
// side. Since these can appear in device kernel signatures, we support
// launching them with these datatypes in launch signatures.

// Two packed floats, mirroring the device-side float2 layout.
struct Float2 {
  float x;
  float y;
};

// Four packed floats, expressed as two Float2 halves.
struct Float4 {
  Float2 xz;
  Float2 yw;
};

// Two packed doubles, mirroring the device-side double2 layout.
struct Double2 {
  double x;
  double y;
};

// The host-side structs must match the device-side sizes exactly (no padding),
// or kernel launch argument marshaling would be corrupted.
static_assert(sizeof(Float2) == 2 * sizeof(float), "Float2 must be packed");
static_assert(sizeof(Float4) == 4 * sizeof(float), "Float4 must be packed");
static_assert(sizeof(Double2) == 2 * sizeof(double), "Double2 must be packed");

}  // namespace stream_executor

#endif  // TENSORFLOW_STREAM_EXECUTOR_DEVICE_MEMORY_H_