#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_REGION_ALLOCATOR_H_
#define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_REGION_ALLOCATOR_H_

#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "tensorflow/stream_executor/stream_executor.h"
#include "tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h"
#include "tensorflow/core/common_runtime/gpu/visitable_allocator.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/port.h"
#include "tensorflow/core/platform/thread_annotations.h"

namespace tensorflow {

class GPURegionAllocator : public VisitableAllocator {
 public:
  // 'device_id' must be a valid device on the machine.
  //
  // 'total_bytes' is the maximum number of bytes this allocator
  // should allocate.  This may be less than the total memory
  // available on the device.
  explicit GPURegionAllocator(int device_id, size_t total_bytes);
  ~GPURegionAllocator() override;

  string Name() override { return "gpu_region"; }
  void* AllocateRaw(size_t alignment, size_t num_bytes) override;
  void DeallocateRaw(void* ptr) override;
  void AddAllocVisitor(Visitor visitor) override;
  // Does nothing, because regions are never freed.
  void AddFreeVisitor(Visitor visitor) override {}
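
  // Example (an illustrative sketch): assuming Visitor is
  // std::function<void(void*, size_t)>, as declared in
  // visitable_allocator.h, a caller could register a visitor that
  // logs each region as it is reserved:
  //
  //   allocator.AddAllocVisitor([](void* ptr, size_t num_bytes) {
  //     LOG(INFO) << "region " << ptr << ": " << num_bytes << " bytes";
  //   });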

  bool TracksAllocationSizes() override;
  size_t RequestedSize(void* ptr) override;
  size_t AllocatedSize(void* ptr) override;
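
  // Typical usage (a minimal sketch; in TensorFlow proper a
  // higher-level component constructs and owns the allocator):
  //
  //   GPURegionAllocator a(/*device_id=*/0, /*total_bytes=*/1ULL << 30);
  //   void* p = a.AllocateRaw(/*alignment=*/32, /*num_bytes=*/1024);
  //   CHECK_GE(a.AllocatedSize(p), a.RequestedSize(p));
  //   a.DeallocateRaw(p);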

 private:
  // A Chunk is the header on a single piece of memory given back
  // in response to an AllocateRaw() call.
  struct Chunk {
    char* ptr;               // Pointer to the granted GPU buffer.
    size_t size;             // Full size of the GPU buffer.
    size_t bytes_allocated;  // Bytes requested by the client.
    bool in_use;
    Chunk* prev;  // Used for chaining in the owning Pool's ring.
    Chunk* next;
    Chunk()
        : ptr(nullptr),
          size(0),
          bytes_allocated(0),
          in_use(false),
          prev(nullptr),
          next(nullptr) {}
  };

  // A Pool is a collection of same-sized Chunks.
  struct Pool {
    int num_chunks;             // total chunks in this pool
    int num_free;               // total free chunks in this pool
    int64 cumulative_malloced;  // number of chunks malloced so far
    int64 cumulative_freed;     // number of chunks freed so far

    // Doubly-linked ring of chunks; all free chunks precede all
    // granted (in-use) chunks.
    Chunk* first;
    Chunk* last;
    Pool()
        : num_chunks(0),
          num_free(0),
          cumulative_malloced(0),
          cumulative_freed(0),
          first(nullptr),
          last(nullptr) {}

    string ToString() const {
      return strings::StrCat("chunks: ", num_chunks, " free: ", num_free,
                             " cumulative malloc: ", cumulative_malloced,
                             " cumulative freed: ", cumulative_freed);
    }
  };
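
  // Illustrative ring layout (a sketch of the invariant above): with
  // two free and two in-use chunks,
  //
  //   first -> [free] <-> [free] <-> [in_use] <-> [in_use] <- last
  //
  // so a free chunk, if one exists, is always found at 'first'.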

  // A Region is a single area of GPU memory that has been
  // reserved by this class and carved up into Chunks.
  struct Region {
    char* ptr;   // base GPU ptr
    char* next;  // frontier of unused part of region
    size_t size;
    Region() : ptr(nullptr), next(nullptr), size(0) {}
  };
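
  // Carving a chunk out of a region just advances the region's
  // frontier pointer; a minimal sketch, assuming 'chunk_size' still
  // fits between 'next' and the end of the region:
  //
  //   if (r->next + chunk_size <= r->ptr + r->size) {
  //     c->ptr = r->next;
  //     c->size = chunk_size;
  //     r->next += chunk_size;
  //   }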

  // Calculates the size of the chunk to use for an allocation of
  // 'bytes' bytes.  The minimum chunk size is 16 bytes, for
  // alignment.  Larger sizes are rounded up so that fewer
  // size-specific pools are needed.
  static size_t ChunkSize(size_t bytes);
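
  // One plausible policy (an illustrative sketch only; the .cc file
  // holds the actual policy): round requests up to the next power of
  // two, with a 16-byte floor.
  //
  //   size_t chunk_size = 16;
  //   while (chunk_size < bytes) chunk_size *= 2;
  //   return chunk_size;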

  void* AllocateRawInternal(size_t alignment, size_t num_bytes,
                            bool dump_log_on_failure);
  void DeallocateRawInternal(void* ptr);
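
  // AllocateRaw() can delegate to retry_helper_ so that a failed
  // allocation is retried while other threads release memory.  A
  // sketch, assuming GPUAllocatorRetry::AllocateRaw accepts an
  // allocation function plus a maximum wait time (kMaxMillisToWait is
  // a hypothetical constant):
  //
  //   return retry_helper_.AllocateRaw(
  //       [this](size_t a, size_t nb, bool verbose) {
  //         return AllocateRawInternal(a, nb, verbose);
  //       },
  //       kMaxMillisToWait, alignment, num_bytes);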

  bool ExpandPool(Pool* p, size_t chunk_size, size_t requested_size,
                  bool dump_log_on_failure) EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Inspects region maps and crashes with debug information if there
  // are any memory leaks as detected by the region allocator.
  void CheckForMemoryLeaks() LOCKS_EXCLUDED(lock_);

  void DumpMemoryLog() EXCLUSIVE_LOCKS_REQUIRED(lock_);

  perftools::gputools::StreamExecutor* stream_exec_;  // Not owned.

  typedef std::unordered_map<size_t, Pool> PoolMap;
  typedef std::unordered_map<void*, Chunk*> ChunkMap;

  GPUAllocatorRetry retry_helper_;
  mutable mutex lock_;
  PoolMap pools_ GUARDED_BY(lock_);

  // Owns regions.
  std::vector<Region*> regions_ GUARDED_BY(lock_);

  // Maps from GPU ptr to Chunk owning it.
  //
  // Owns chunks.
  ChunkMap chunk_map_ GUARDED_BY(lock_);

  // Visitors to invoke once on each region, as soon as possible
  // after it is reserved.
  std::vector<Visitor> region_visitors_ GUARDED_BY(lock_);

  const int device_id_;

  // Total amount of memory (in bytes) available to this allocator.
  const size_t total_bytes_;

  // Total amount of memory allocated to regions.
  size_t allocated_memory_ = 0;

  // Size (in bytes) used when reserving new regions.
  size_t region_size_ = 0;

  TF_DISALLOW_COPY_AND_ASSIGN(GPURegionAllocator);
};

}  // namespace tensorflow

#endif  // TENSORFLOW_COMMON_RUNTIME_GPU_GPU_REGION_ALLOCATOR_H_