#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_PROCESS_STATE_H_ #define TENSORFLOW_COMMON_RUNTIME_GPU_PROCESS_STATE_H_ #include #include #include #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/platform/port.h" #include "tensorflow/core/platform/thread_annotations.h" namespace tensorflow { class Allocator; class VisitableAllocator; class PoolAllocator; // Singleton that manages per-process state, e.g. allocation // of shared resources. class ProcessState { public: static ProcessState* singleton(); // Descriptor for memory allocation attributes, used by optional // runtime correctness analysis logic. struct MemDesc { enum MemLoc { CPU, GPU }; MemLoc loc; int dev_index; bool gpu_registered; bool nic_registered; MemDesc() : loc(CPU), dev_index(0), gpu_registered(false), nic_registered(false) {} string DebugString(); }; // Records the number of GPUs available in the local process. // It is a fatal error to call this with a value != to the value // in a prior call. void SetGPUCount(int c); // Returns number of GPUs available in local process, as set by // SetGPUCount(); Returns 0 if SetGPUCount has not been called. int GPUCount() const; // Returns what we know about the memory at ptr. // If we know nothing, it's called CPU 0 with no other attributes. MemDesc PtrType(const void* ptr); // Returns the one CPUAllocator used for the given numa_node. // TEMPORY: ignores numa_node. Allocator* GetCPUAllocator(int numa_node); // Returns the one GPU allocator used for the indexed GPU. // Note that this is a system GPU index, not (necessarily) a brain // device index. // // 'total_bytes' is the total number of bytes that should be made // available to the allocator. The first call to this function for // a given gpu_id creates the allocator, so only the total_bytes // used on that first call is used. // // REQUIRES: gpu_id must be a valid ordinal for a GPU available in the // current system environment. Otherwise returns nullptr. Allocator* GetGPUAllocator(int gpu_id, size_t total_bytes); Allocator* GetCUDAHostAllocator(int numa_node); // Registers a function to be called once on every new Region // allocated by every GPURegionAllocator proximate to the specified // bus. The AllocVisitor is provided with a memory pointer and the // size of the area it identifies. The pointer is not guaranteed to // be valid after the call terminates. The intention is for this // interface to be used for network device memory registration. // "bus_id" is platform-specific. On many platforms it // should be 0. On machines with multiple PCIe buses, it should be // the index of one of the PCIe buses. If the the bus_id is invalid, // results are undefined. typedef std::function AllocVisitor; void AddGPUAllocVisitor(int bus_id, AllocVisitor visitor); typedef std::unordered_map MDMap; protected: ProcessState(); static ProcessState* instance_; mutex mu_; int gpu_count_; std::vector cpu_allocators_ GUARDED_BY(mu_); std::vector gpu_allocators_ GUARDED_BY(mu_); std::vector> gpu_visitors_ GUARDED_BY(mu_); std::vector cuda_host_allocators_ GUARDED_BY(mu_); virtual ~ProcessState(); // Optional RecordingAllocators that wrap the corresponding // Allocators for runtime attribute use analysis. MDMap mem_desc_map_; std::vector cpu_al_ GUARDED_BY(mu_); std::vector gpu_al_ GUARDED_BY(mu_); std::vector cuda_al_ GUARDED_BY(mu_); }; namespace internal { class RecordingAllocator : public Allocator { public: RecordingAllocator(ProcessState::MDMap* mm, Allocator* a, ProcessState::MemDesc md, mutex* mu) : mm_(mm), a_(a), md_(md), mu_(mu) {} string Name() override { return a_->Name(); } void* AllocateRaw(size_t alignment, size_t num_bytes) override { void* p = a_->AllocateRaw(alignment, num_bytes); mutex_lock l(*mu_); (*mm_)[p] = md_; return p; } void DeallocateRaw(void* p) override { mutex_lock l(*mu_); auto iter = mm_->find(p); mm_->erase(iter); a_->DeallocateRaw(p); } bool TracksAllocationSizes() override { return a_->TracksAllocationSizes(); } size_t RequestedSize(void* p) override { return a_->RequestedSize(p); } size_t AllocatedSize(void* p) override { return a_->AllocatedSize(p); } ProcessState::MDMap* mm_; // not owned Allocator* a_; // not owned ProcessState::MemDesc md_; mutex* mu_; }; } // namespace internal } // namespace tensorflow #endif // TENSORFLOW_COMMON_RUNTIME_GPU_PROCESS_STATE_H_