diff options
Diffstat (limited to 'tensorflow/stream_executor/device_memory.h')
-rw-r--r-- | tensorflow/stream_executor/device_memory.h | 284 |
1 file changed, 284 insertions, 0 deletions
diff --git a/tensorflow/stream_executor/device_memory.h b/tensorflow/stream_executor/device_memory.h new file mode 100644 index 0000000000..9e88180316 --- /dev/null +++ b/tensorflow/stream_executor/device_memory.h @@ -0,0 +1,284 @@ +// Suite of types that represent device memory allocations. These are +// allocated by the StreamExecutor interface, which produces values appropriate +// for the underlying platform (whether it be CUDA or OpenCL). +// +// The untyped base class (like a device void*) is DeviceMemoryBase, which can +// be specialized for a given allocation type (like a device T*) using +// DeviceMemory<T>. + +#ifndef TENSORFLOW_STREAM_EXECUTOR_DEVICE_MEMORY_H_ +#define TENSORFLOW_STREAM_EXECUTOR_DEVICE_MEMORY_H_ + +#include <stddef.h> + +#include "tensorflow/stream_executor/lib/casts.h" +#include "tensorflow/stream_executor/platform/port.h" + +namespace perftools { +namespace gputools { + +class StreamExecutor; + +// void*-analogous device memory allocation. For the typed variation, see +// DeviceMemory<T>. +// +// This is effectively a two-tuple of a pointer and size; however, note that the +// pointer may not be to the virtual address itself -- in OpenCL the pointer is +// to a cl_mem handle that describes the device allocation. Therefore, +// DeviceMemoryBase::opaque does not necessarily produce a pointer that can be +// referenced directly, so use it with caution. +// +// Thread-compatible. +class DeviceMemoryBase { + public: + // Default constructor instantiates a null-pointed, zero-sized device memory + // region. An opaque pointer may be provided -- see header for details on the + // opacity of that pointer. + explicit DeviceMemoryBase(void *opaque = nullptr, uint64 size = 0, + bool is_sub_buffer = false) + : opaque_(opaque), size_(size), is_sub_buffer_(is_sub_buffer) {} + + // Returns whether the backing memory is the null pointer. + // A `== nullptr` convenience method is also provided. 
+ bool is_null() const { return opaque_ == nullptr; } + bool operator==(std::nullptr_t other) const { return is_null(); } + bool operator!=(std::nullptr_t other) const { return !is_null(); } + + // Provides a partial order between device memory values. + // + // This operator is provided so that this object can be used as a key in an + // ordered map. + bool operator<(const DeviceMemoryBase &other) const { + return opaque() < other.opaque(); + } + + // Returns the size, in bytes, for the backing memory. + uint64 size() const { return size_; } + + // Warning: note that the pointer returned is not necessarily directly to + // device virtual address space, but is platform-dependent. + void *opaque() { return opaque_; } + const void *opaque() const { return opaque_; } + + // Returns true if this is an offset into another primary allocation. + bool is_sub_buffer() const { return is_sub_buffer_; } + + // Returns whether the two DeviceMemoryBase segments are identical (both in + // their opaque pointer and size). + bool IsSameAs(const DeviceMemoryBase &other) const { + return opaque() == other.opaque() && size() == other.size(); + } + + protected: + friend class StreamExecutor; + + // Resets the internal values of the opaque pointer and number of bytes in the + // memory region, just as in the constructor. + void Reset(void *opaque, uint64 bytes) { + opaque_ = opaque; + size_ = bytes; + } + + private: + void *opaque_; // Platform-dependent value representing allocated memory. + uint64 size_; // Size in bytes of this allocation. + bool is_sub_buffer_; // Is this a primary allocation or a sub-buffer? +}; + +// Typed wrapper around "void *"-like DeviceMemoryBase. +// +// For example, DeviceMemory<int> is a simple wrapper around DeviceMemoryBase +// that represents one or more integers in Device memory. +// +// Thread-compatible. 
+template <typename ElemT> +class DeviceMemory final : public DeviceMemoryBase { + public: + // Default constructor instantiates a null-pointed, zero-sized memory region. + DeviceMemory() : DeviceMemoryBase(nullptr, 0) {} + + // Typed device memory regions may be constructed from untyped device memory + // regions, this effectively amounts to a cast from a void*. + explicit DeviceMemory(const DeviceMemoryBase &other) + : DeviceMemoryBase(const_cast<DeviceMemoryBase &>(other).opaque(), + other.size(), other.is_sub_buffer()) {} + + static constexpr size_t kElemSize = sizeof(ElemT); + + // Returns the number of elements of type ElemT that constitute this + // allocation. + uint64 ElementCount() const { return size() / kElemSize; } + + // Returns whether this is a single-element allocation. + bool IsScalar() const { return ElementCount() == 1; } + + // Create a typed area of DeviceMemory with a given opaque pointer and the + // quantity of bytes in the allocation. This function is broken out to + // distinguish bytes from an element count. + static DeviceMemory<ElemT> MakeFromByteSize(void *opaque, uint64 bytes) { + return DeviceMemory<ElemT>(opaque, bytes); + } + + // Resets the DeviceMemory data, in MakeFromByteSize fashion. + // This simply clobbers the prior values. + void ResetFromByteSize(void *opaque, uint64 bytes) { + // TODO(leary) when NVCC is eliminated we can add this check (and the + // logging include it requires). + // CHECK_EQ(0, bytes % kElemSize); + DeviceMemoryBase::Reset(opaque, bytes); + } + + // ------------------------------------------------------------ + // DO NOT USE - FASTR TEAM-INTERNAL FUNCTIONS + // Used internally by gcudacc. +#ifdef __GCUDACC__ + // Implicit conversion operators needed to support mixed mode. Since buffer + // sizes aren't used in the CUDA launching process, and since the constructed + // objects are all temporary, this is safe. + // Linter warning disabled as we require an implicit conversion. 
+ DeviceMemory(const ElemT *opaque) : // NOLINT + DeviceMemoryBase(reinterpret_cast<void *>(const_cast<ElemT *>(opaque)), + 0) {} + + operator ElemT *() { return reinterpret_cast<ElemT *>(opaque()); } + operator const ElemT *() { + return const_cast<const ElemT *>(reinterpret_cast<ElemT *>(opaque())); + } +#endif + // ------------------------------------------------------------ + + protected: + // This constructor is solely used from derived classes; it is made protected + // because it accepts a byte-size instead of an element count, which could + // potentially be misused given the ElementCount() nature of this interface. + // + // In order to specify the desire to use byte size instead of element count + // explicitly, use MakeFromByteSize. + DeviceMemory(void *opaque, uint64 size) : DeviceMemoryBase(opaque, size) {} +}; + +// A class to encapsulate the type and size of a dynamic shared memory +// buffer. Because the buffer exists solely on the device and is not copyable +// to the host, memory objects of this type do not maintain buffer pointers +// on the host. +template <typename ElemT> +class SharedDeviceMemory final : public DeviceMemoryBase { + public: + explicit SharedDeviceMemory(uint64 elem_count) + : DeviceMemoryBase(nullptr, elem_count * kElemSize) {} + + static constexpr size_t kElemSize = sizeof(ElemT); + + // Returns the number of elements of type ElemT that constitute this + // allocation. + uint64 ElementCount() const { return size() / kElemSize; } + + // Returns whether this is a single-element allocation. + bool IsScalar() const { return ElementCount() == 1; } +}; + +// Similar to the typed DeviceMemory, but is the unique owner of its +// memory, if any. ScopedDeviceMemory is thread-compatible. It is also +// movable and uncopyable to represent unique ownership. +template <typename ElemT> +class ScopedDeviceMemory { + public: + // Parameters: + // parent: Executor used to deallocate memory when this instance goes + // out of scope. 
+ // value: Already-allocated device memory value for this scoped mechanism to + // deallocate. This memory must have been allocated by parent. + ScopedDeviceMemory(StreamExecutor *parent, DeviceMemoryBase value); + + // Constructor overload that places a literal array into device memory + ScopedDeviceMemory(StreamExecutor *parent, + std::initializer_list<ElemT> values); + + // Moves ownership of the memory from other to the constructed + // object. + // + // Postcondition: other == nullptr. + ScopedDeviceMemory(ScopedDeviceMemory &&other) noexcept: + ScopedDeviceMemory(other.parent_, other.Release()) {} + + // Releases the memory that was provided in the constructor, through the + // "parent" StreamExecutor. + ~ScopedDeviceMemory(); + + // Moves ownership of the memory from other to this object. + // + // Postcondition: other == nullptr. + ScopedDeviceMemory& operator=(ScopedDeviceMemory &&other) { + Reset(other.Release()); + parent_ = other.parent_; + return *this; + } + + // Returns the memory that backs this scoped allocation converted to + // DeviceMemory<T> apparent type. This is useful for cases where the + // DeviceMemory must be passed by const-ref, as the ScopedDeviceMemory doesn't + // allow copying, for scoped-object-lifetime reasons. + const DeviceMemory<ElemT> &cref() const { return wrapped_; } + + // Returns a pointer to the DeviceMemory<T> apparent type for use in mutable + // operations. The value returned should not be used outside the scope of this + // ScopedDeviceMemory object's lifetime. + DeviceMemory<ElemT> *ptr() { return &wrapped_; } + const DeviceMemory<ElemT> *ptr() const { return &wrapped_; } + + // Smart-pointer-like operators for the wrapped DeviceMemory. + // This reference must not be used outside the lifetime of this + // ScopedDeviceMemory. 
+ const DeviceMemory<ElemT> &operator*() const { return cref(); } + DeviceMemory<ElemT> *operator->() { return ptr(); } + const DeviceMemory<ElemT> *operator->() const { return ptr(); } + bool operator==(std::nullptr_t other) const { return wrapped_.is_null(); } + bool operator!=(std::nullptr_t other) const { return !wrapped_.is_null(); } + + // Analogous to std::unique_ptr::reset, frees the existing memory held in + // this scoped memory container and replaces it with updated. Ownership + // of updated is transferred to this object. + void Reset(DeviceMemory<ElemT> updated); + void Reset(std::nullptr_t); + + // Analogous to std::unique_ptr::release, releases ownership of the held + // memory and transfers it to the caller. + // + // Postcondition: *this == nullptr + DeviceMemory<ElemT> Release() { + auto tmp = wrapped_; + wrapped_.ResetFromByteSize(nullptr, 0); + return tmp; + } + + private: + DeviceMemory<ElemT> wrapped_; // Value we wrap with scoped-release. + StreamExecutor *parent_; // See constructor. + + SE_DISALLOW_COPY_AND_ASSIGN(ScopedDeviceMemory); +}; + +// Host-side representation of packed-and-aligned vector datatypes on the device +// side. Since these can appear in device kernel signatures, we support +// launching them with these datatypes in launch signatures. + +struct Float2 { + float x, y; +}; + +struct Float4 { + Float2 xz, yw; +}; + +struct Double2 { + double x, y; +}; + +static_assert(sizeof(Float2) == 2 * sizeof(float), "Float2 must be packed"); +static_assert(sizeof(Float4) == 4 * sizeof(float), "Float4 must be packed"); +static_assert(sizeof(Double2) == 2 * sizeof(double), "Double2 must be packed"); + +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_DEVICE_MEMORY_H_ |