Diffstat (limited to 'tensorflow/core/common_runtime')
68 files changed, 12927 insertions, 0 deletions
diff --git a/tensorflow/core/common_runtime/device.cc b/tensorflow/core/common_runtime/device.cc new file mode 100644 index 0000000000..2e3e7b6597 --- /dev/null +++ b/tensorflow/core/common_runtime/device.cc @@ -0,0 +1,37 @@ +#include "tensorflow/core/common_runtime/device.h" + +#include "tensorflow/core/framework/op_segment.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/random/random.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/port.h" + +namespace tensorflow { + +Device::Device(Env* env, const DeviceAttributes& device_attributes, + Allocator* device_allocator) + : DeviceBase(env), device_attributes_(device_attributes) { + CHECK(DeviceNameUtils::ParseFullName(name(), &parsed_name_)) + << "Invalid device name: " << name(); + rmgr_ = new ResourceMgr(parsed_name_.job); +} + +Device::~Device() { delete rmgr_; } + +// static +DeviceAttributes Device::BuildDeviceAttributes( + const string& name, DeviceType device, Bytes memory_limit, + BusAdjacency bus_adjacency, const string& physical_device_desc) { + DeviceAttributes da; + da.set_name(name); + do { + da.set_incarnation(random::New64()); + } while (da.incarnation() == 0); // This proto field must not be zero + da.set_device_type(device.type()); + da.set_memory_limit(memory_limit.value()); + da.set_bus_adjacency(bus_adjacency); + da.set_physical_device_desc(physical_device_desc); + return da; +} + +} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/device.h b/tensorflow/core/common_runtime/device.h new file mode 100644 index 0000000000..ff3404fea4 --- /dev/null +++ b/tensorflow/core/common_runtime/device.h @@ -0,0 +1,128 @@ +// A Device is a something that can perform computations as part of a +// model. Devices can be local (runs computation on this machine), or +// remote (contacts a device local to another machine using an RPC to +// do the work). Devices are registered in a DeviceSet, which is also +// responsible for the Device <-> id mapping. +// +// Device names +// * Every Device should have a unique name with the format: +// /job:___/replica:___/task:___/(gpu|cpu):___ +// An example name would be "/job:train/replica:0/task:3/gpu:2". +// * Task numbers are within the specified replica, so there are as +// many "task zeros" as replicas. + +#ifndef TENSORFLOW_COMMON_RUNTIME_DEVICE_H_ +#define TENSORFLOW_COMMON_RUNTIME_DEVICE_H_ + +#include <memory> +#include <string> +#include <vector> + +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/control_flow.h" +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/op_segment.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/graph/types.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/public/status.h" +#include "tensorflow/core/util/device_name_utils.h" + +namespace tensorflow { + +class Device : public DeviceBase { + public: + Device(Env* env, const DeviceAttributes& device_attributes, + Allocator* device_allocator); + ~Device() override; + + // Full name of this device (see top comment). 
+ const string& name() const { return device_attributes_.name(); } + + // Parsed name of this device + const DeviceNameUtils::ParsedName parsed_name() const { return parsed_name_; } + + // Describes what kind of device this is. This is intended to be + // human-readable and not computer-parsed, except that two devices + // with the same device_type() are expected to perform similarly + // (both from a computation and communication perspective). + const string& device_type() const { return device_attributes_.device_type(); } + + // Returns an aggregation of device attributes. + const DeviceAttributes& attributes() const override { + return device_attributes_; + } + + // Performs the actual compute function. + // + // Subclasses may override this function if they wish to perform + // some initialization before each compute. + virtual void Compute(OpKernel* op_kernel, OpKernelContext* context) { + op_kernel->Compute(context); + } + + // Asynchronous kernel's compute. + virtual void ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context, + AsyncOpKernel::DoneCallback done) { + op_kernel->ComputeAsync(context, done); + } + + // Blocks until all operations queued on the device at the time of + // the call have completed. Returns any error pending on the device + // at completion. + virtual Status Sync() = 0; + + // Fill in the context map for the graph. Default behavior is to do + // nothing. + // + // The caller takes ownership over the DeviceContext objects given + // by the device. + virtual Status FillContextMap(const Graph* graph, + DeviceContextMap* device_context_map) { + return Status::OK(); + } + + // Returns the op segment of this device. The caller can reuse op + // kernels registered for the same session running on this device. + OpSegment* op_segment() { return &op_seg_; } + + // Returns the resource manager associated w/ this device. + ResourceMgr* resource_manager() { return rmgr_; } + + // Summarizes the status of this Device, for debugging. + string DebugString() const { return device_attributes_.DebugString(); } + + // Assembles the parameter components into a complete DeviceAttributes value. + static DeviceAttributes BuildDeviceAttributes( + const string& name, DeviceType device, Bytes memory_limit, + BusAdjacency bus_adjacency, const string& physical_device_desc); + + static DeviceAttributes BuildDeviceAttributes(const string& name, + DeviceType device, + Bytes memory_limit, + BusAdjacency bus_adjacency) { + // Pass in an empty string as physical device name. + return BuildDeviceAttributes(name, device, memory_limit, bus_adjacency, ""); + } + + private: + const DeviceAttributes device_attributes_; + DeviceNameUtils::ParsedName parsed_name_; + + // op_seg_ maps session handle and op name to OpKernel objects. + OpSegment op_seg_; + + // Resources associated w/ this device. E.g., shared variables, etc. 
+ ResourceMgr* rmgr_ = nullptr; + + TF_DISALLOW_COPY_AND_ASSIGN(Device); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMMON_RUNTIME_DEVICE_H_ diff --git a/tensorflow/core/common_runtime/device_factory.cc b/tensorflow/core/common_runtime/device_factory.cc new file mode 100644 index 0000000000..7d391bde1d --- /dev/null +++ b/tensorflow/core/common_runtime/device_factory.cc @@ -0,0 +1,106 @@ +#include "tensorflow/core/common_runtime/device_factory.h" + +#include <memory> +#include <string> +#include <unordered_map> + +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/public/session_options.h" + +namespace tensorflow { + +namespace { + +static mutex* get_device_factory_lock() { + static mutex device_factory_lock; + return &device_factory_lock; +} + +struct FactoryItem { + std::unique_ptr<DeviceFactory> factory; + int priority; +}; + +std::unordered_map<string, FactoryItem>& device_factories() { + static std::unordered_map<string, FactoryItem>* factories = + new std::unordered_map<string, FactoryItem>; + return *factories; +} +} // namespace + +void DeviceFactory::Register(const string& device_type, DeviceFactory* factory, + int priority) { + mutex_lock l(*get_device_factory_lock()); + std::unique_ptr<DeviceFactory> factory_ptr(factory); + std::unordered_map<string, FactoryItem>& factories = device_factories(); + auto iter = factories.find(device_type); + if (iter == factories.end()) { + factories[device_type] = {std::move(factory_ptr), priority}; + } else { + if (iter->second.priority < priority) { + iter->second = {std::move(factory_ptr), priority}; + } else if (iter->second.priority == priority) { + LOG(FATAL) << "Duplicate registration of device factory for type " + << device_type << " with the same priority " << priority; + } + } +} + +DeviceFactory* DeviceFactory::GetFactory(const string& device_type) { + mutex_lock l(*get_device_factory_lock()); // could use reader lock + auto it = device_factories().find(device_type); + if (it == device_factories().end()) { + return nullptr; + } + return it->second.factory.get(); +} + +void DeviceFactory::AddDevices(const SessionOptions& options, + const string& name_prefix, + std::vector<Device*>* devices) { + // CPU first. + auto cpu_factory = GetFactory("CPU"); + if (!cpu_factory) { + LOG(FATAL) + << "CPU Factory not registered. Did you link in threadpool_device?"; + } + size_t init_size = devices->size(); + cpu_factory->CreateDevices(options, name_prefix, devices); + if (devices->size() == init_size) { + LOG(FATAL) << "No CPU devices are available in this process"; + } + + // Then GPU. + auto gpu_factory = GetFactory("GPU"); + if (gpu_factory) { + gpu_factory->CreateDevices(options, name_prefix, devices); + } + + // Then the rest. 
+ mutex_lock l(*get_device_factory_lock()); + for (auto& p : device_factories()) { + auto factory = p.second.factory.get(); + if (factory != cpu_factory && factory != gpu_factory) { + factory->CreateDevices(options, name_prefix, devices); + } + } +} + +Device* DeviceFactory::NewDevice(const string& type, + const SessionOptions& options, + const string& name_prefix) { + auto device_factory = GetFactory(type); + if (!device_factory) { + return nullptr; + } + SessionOptions opt = options; + (*opt.config.mutable_device_count())[type] = 1; + std::vector<Device*> devices; + device_factory->CreateDevices(opt, name_prefix, &devices); + CHECK_EQ(devices.size(), 1); + return devices[0]; +} + +} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/device_factory.h b/tensorflow/core/common_runtime/device_factory.h new file mode 100644 index 0000000000..57b625b3e5 --- /dev/null +++ b/tensorflow/core/common_runtime/device_factory.h @@ -0,0 +1,69 @@ +#ifndef TENSORFLOW_COMMON_RUNTIME_DEVICE_FACTORY_H_ +#define TENSORFLOW_COMMON_RUNTIME_DEVICE_FACTORY_H_ + +#include <string> +#include <vector> +#include "tensorflow/core/platform/port.h" + +namespace tensorflow { + +class Device; +struct SessionOptions; + +class DeviceFactory { + public: + virtual ~DeviceFactory() {} + static void Register(const string& device_type, DeviceFactory* factory, + int priority); + static DeviceFactory* GetFactory(const string& device_type); + + // Append to "*devices" all suitable devices, respecting + // any device type specific properties/counts listed in "options". + // + // CPU devices are added first. + static void AddDevices(const SessionOptions& options, + const string& name_prefix, + std::vector<Device*>* devices); + + // Helper for tests. Create a single device of type "type". The + // returned device is always numbered zero, so if creating multiple + // devices of the same type, supply distinct name_prefix arguments. + static Device* NewDevice(const string& type, const SessionOptions& options, + const string& name_prefix); + + // Most clients should call AddDevices() instead. + virtual void CreateDevices(const SessionOptions& options, + const string& name_prefix, + std::vector<Device*>* devices) = 0; +}; + +namespace dfactory { + +template <class Factory> +class Registrar { + public: + // Multiple registrations for the same device type with different priorities + // are allowed. The registration with the highest priority will be used. + explicit Registrar(const string& device_type, int priority = 0) { + DeviceFactory::Register(device_type, new Factory(), priority); + } +}; + +} // namespace dfactory + +#define REGISTER_LOCAL_DEVICE_FACTORY(device_type, device_factory, ...) \ + INTERNAL_REGISTER_LOCAL_DEVICE_FACTORY(device_type, device_factory, \ + __COUNTER__, ##__VA_ARGS__) + +#define INTERNAL_REGISTER_LOCAL_DEVICE_FACTORY(device_type, device_factory, \ + ctr, ...) 
\ + static ::tensorflow::dfactory::Registrar<device_factory> \ + INTERNAL_REGISTER_LOCAL_DEVICE_FACTORY_NAME(ctr)(device_type, \ + ##__VA_ARGS__) + +// __COUNTER__ must go through another macro to be properly expanded +#define INTERNAL_REGISTER_LOCAL_DEVICE_FACTORY_NAME(ctr) ___##ctr##__object_ + +} // namespace tensorflow + +#endif // TENSORFLOW_COMMON_RUNTIME_DEVICE_FACTORY_H_ diff --git a/tensorflow/core/common_runtime/device_mgr.cc b/tensorflow/core/common_runtime/device_mgr.cc new file mode 100644 index 0000000000..4fa13f6b4b --- /dev/null +++ b/tensorflow/core/common_runtime/device_mgr.cc @@ -0,0 +1,90 @@ +#include "tensorflow/core/common_runtime/device_mgr.h" + +#include "tensorflow/core/common_runtime/local_device.h" +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/util/device_name_utils.h" + +namespace tensorflow { + +DeviceMgr::DeviceMgr(const std::vector<Device*>& devices) { + for (Device* d : devices) { + devices_.push_back(d); + + // Register under both the full name and the local name. + device_map_[d->name()] = d; + device_map_[DeviceNameUtils::LocalName(d->name())] = d; + device_type_counts_[d->device_type()]++; + } +} + +DeviceMgr::~DeviceMgr() { + for (auto p : devices_) delete p; +} + +void DeviceMgr::ListDeviceAttributes( + std::vector<DeviceAttributes>* devices) const { + devices->reserve(devices_.size()); + for (Device* dev : devices_) { + devices->emplace_back(dev->attributes()); + } +} + +std::vector<Device*> DeviceMgr::ListDevices() const { + return std::vector<Device*>(devices_.begin(), devices_.end()); +} + +string DeviceMgr::DebugString() const { + string out; + for (Device* dev : devices_) { + strings::StrAppend(&out, dev->name(), "\n"); + } + return out; +} + +string DeviceMgr::DeviceMappingString() const { + string out; + for (Device* dev : devices_) { + if (!dev->attributes().physical_device_desc().empty()) { + strings::StrAppend(&out, dev->name(), " -> ", + dev->attributes().physical_device_desc(), "\n"); + } + } + return out; +} + +Status DeviceMgr::LookupDevice(const string& name, Device** device) const { + Status s; + auto iter = device_map_.find(name); + if (iter == device_map_.end()) { + return errors::InvalidArgument(name, " unknown device."); + } + *device = iter->second; + return Status::OK(); +} + +void DeviceMgr::ClearContainers(gtl::ArraySlice<string> containers) const { + Status s; + for (Device* dev : devices_) { + if (containers.empty()) { + s.Update(dev->resource_manager()->Cleanup( + dev->resource_manager()->default_container())); + } else { + for (const string& c : containers) { + s.Update(dev->resource_manager()->Cleanup(c)); + } + } + if (!s.ok()) { + LOG(WARNING) << s; + } + } +} + +int DeviceMgr::NumDeviceType(const string& type) const { + auto iter = device_type_counts_.find(type); + if (iter != device_type_counts_.end()) return iter->second; + return 0; +} + +} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/device_mgr.h b/tensorflow/core/common_runtime/device_mgr.h new file mode 100644 index 0000000000..c57d0222aa --- /dev/null +++ b/tensorflow/core/common_runtime/device_mgr.h @@ -0,0 +1,55 @@ +#ifndef TENSORFLOW_COMMON_RUNTIME_DEVICE_MGR_H_ +#define TENSORFLOW_COMMON_RUNTIME_DEVICE_MGR_H_ + +#include <string> +#include <unordered_map> +#include <unordered_set> +#include <vector> + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" 
+#include "tensorflow/core/public/status.h" + +namespace tensorflow { + +class DeviceAttributes; + +class DeviceMgr { + public: + // TODO(zhifengc): Other initialization information. + explicit DeviceMgr(const std::vector<Device*>& devices); + ~DeviceMgr(); + + // Returns attributes of all devices. + void ListDeviceAttributes(std::vector<DeviceAttributes>* devices) const; + + std::vector<Device*> ListDevices() const; + + // Returns a string listing all devices. + string DebugString() const; + + // Returns a string of all the device mapping. + string DeviceMappingString() const; + + // Assigns *device with pointer to Device of the given name. + // Accepts either a full device name, or just the replica-local suffix. + Status LookupDevice(const string& name, Device** device) const; + + // Clears given containers of all devices if 'container' is + // non-empty. Otherwise, clears default containers of all devices. + void ClearContainers(gtl::ArraySlice<string> containers) const; + + int NumDeviceType(const string& type) const; + + private: + typedef gtl::InlinedVector<Device*, 8> DeviceVec; + DeviceVec devices_; + std::unordered_map<string, Device*> device_map_; + std::unordered_map<string, int> device_type_counts_; + + TF_DISALLOW_COPY_AND_ASSIGN(DeviceMgr); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMMON_RUNTIME_DEVICE_MGR_H_ diff --git a/tensorflow/core/common_runtime/device_set.cc b/tensorflow/core/common_runtime/device_set.cc new file mode 100644 index 0000000000..3b0465d9a6 --- /dev/null +++ b/tensorflow/core/common_runtime/device_set.cc @@ -0,0 +1,68 @@ +#include "tensorflow/core/common_runtime/device_set.h" + +#include <set> +#include <utility> +#include <vector> + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/lib/gtl/map_util.h" + +namespace tensorflow { + +DeviceSet::DeviceSet() {} + +DeviceSet::~DeviceSet() {} + +void DeviceSet::AddDevice(Device* device) { + devices_.push_back(device); + device_by_name_.insert({device->name(), device}); +} + +void DeviceSet::FindMatchingDevices(const DeviceNameUtils::ParsedName& spec, + std::vector<Device*>* devices) const { + // TODO(jeff): If we are going to repeatedly lookup the set of devices + // for the same spec, maybe we should have a cache of some sort + devices->clear(); + for (Device* d : devices_) { + if (DeviceNameUtils::IsCompleteSpecification(spec, d->parsed_name())) { + devices->push_back(d); + } + } +} + +Device* DeviceSet::FindDeviceByName(const string& name) const { + return gtl::FindPtrOrNull(device_by_name_, name); +} + +// Higher result implies lower priority. +static int Order(const DeviceType& d) { + if (StringPiece(d.type()) == DEVICE_CPU) { + return 3; + } else if (StringPiece(d.type()) == DEVICE_GPU) { + return 2; + } else { + return 1; + } +} + +static bool ByPriority(const DeviceType& a, const DeviceType& b) { + // Order by "order number"; break ties lexicographically. 
+ return std::make_pair(Order(a), StringPiece(a.type())) < + std::make_pair(Order(b), StringPiece(b.type())); +} + +std::vector<DeviceType> DeviceSet::PrioritizedDeviceTypeList() const { + std::vector<DeviceType> result; + std::set<string> seen; + for (Device* d : devices_) { + auto t = d->device_type(); + if (seen.insert(t).second) { + result.emplace_back(DeviceType(t)); + } + } + std::sort(result.begin(), result.end(), ByPriority); + return result; +} + +} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/device_set.h b/tensorflow/core/common_runtime/device_set.h new file mode 100644 index 0000000000..130d965891 --- /dev/null +++ b/tensorflow/core/common_runtime/device_set.h @@ -0,0 +1,64 @@ +#ifndef TENSORFLOW_COMMON_RUNTIME_DEVICE_SET_H_ +#define TENSORFLOW_COMMON_RUNTIME_DEVICE_SET_H_ + +#include <memory> +#include <unordered_map> +#include <vector> + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/util/device_name_utils.h" + +namespace tensorflow { + +// DeviceSet is a container class for managing the various types of +// devices used by a model. +class DeviceSet { + public: + DeviceSet(); + ~DeviceSet(); + + // Does not take ownership of 'device'. + void AddDevice(Device* device); + + // Set the device designated as the "client". This device + // must also be registered via AddDevice(). + void set_client_device(Device* device) { client_device_ = device; } + + // Returns a pointer to the device designated as the "client". + Device* client_device() const { return client_device_; } + + // Return the list of devices in this set. + const std::vector<Device*>& devices() const { return devices_; } + + // Given a DeviceNameUtils::ParsedName (which may have some + // wildcards for different components), fills "*devices" with all + // devices in "*this" that match "spec". + void FindMatchingDevices(const DeviceNameUtils::ParsedName& spec, + std::vector<Device*>* devices) const; + + // Finds the device with the given "fullname". Returns nullptr if + // not found. + Device* FindDeviceByName(const string& fullname) const; + + // Return the list of unique device types in this set, ordered + // with more preferable devices earlier. + std::vector<DeviceType> PrioritizedDeviceTypeList() const; + + private: + // Not owned. + std::vector<Device*> devices_; + + // Fullname -> device* for device in devices_. + std::unordered_map<string, Device*> device_by_name_; + + // client_device_ points to an element of devices_ that we consider + // to be the client device (in this local process). + Device* client_device_ = nullptr; + + TF_DISALLOW_COPY_AND_ASSIGN(DeviceSet); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMMON_RUNTIME_DEVICE_SET_H_ diff --git a/tensorflow/core/common_runtime/device_set_test.cc b/tensorflow/core/common_runtime/device_set_test.cc new file mode 100644 index 0000000000..1b80a5b697 --- /dev/null +++ b/tensorflow/core/common_runtime/device_set_test.cc @@ -0,0 +1,65 @@ +#include "tensorflow/core/common_runtime/device_set.h" + +#include "tensorflow/core/public/status.h" +#include <gtest/gtest.h> + +namespace tensorflow { +namespace { + +// Return a fake device with the specified type and name. 
+static Device* Dev(const char* type, const char* name) { + class FakeDevice : public Device { + public: + explicit FakeDevice(const DeviceAttributes& attr) + : Device(nullptr, attr, nullptr) {} + Status Sync() override { return Status::OK(); } + Allocator* GetAllocator(AllocatorAttributes) override { return nullptr; } + }; + DeviceAttributes attr; + attr.set_name(name); + attr.set_device_type(type); + return new FakeDevice(attr); +} + +class DeviceSetTest : public testing::Test { + public: + void AddDevice(const char* type, const char* name) { + Device* d = Dev(type, name); + owned_.emplace_back(d); + devices_.AddDevice(d); + } + + std::vector<DeviceType> types() const { + return devices_.PrioritizedDeviceTypeList(); + } + + private: + DeviceSet devices_; + std::vector<std::unique_ptr<Device>> owned_; +}; + +TEST_F(DeviceSetTest, PrioritizedDeviceTypeList) { + EXPECT_EQ(std::vector<DeviceType>{}, types()); + + AddDevice("CPU", "/job:a/replica:0/task:0/cpu:0"); + EXPECT_EQ(std::vector<DeviceType>{DeviceType(DEVICE_CPU)}, types()); + + AddDevice("CPU", "/job:a/replica:0/task:0/cpu:1"); + EXPECT_EQ(std::vector<DeviceType>{DeviceType(DEVICE_CPU)}, types()); + + AddDevice("GPU", "/job:a/replica:0/task:0/gpu:0"); + EXPECT_EQ( + (std::vector<DeviceType>{DeviceType(DEVICE_GPU), DeviceType(DEVICE_CPU)}), + types()); + + AddDevice("T1", "/job:a/replica:0/task:0/device:T1:0"); + AddDevice("T1", "/job:a/replica:0/task:0/device:T1:1"); + AddDevice("T2", "/job:a/replica:0/task:0/device:T2:0"); + EXPECT_EQ( + (std::vector<DeviceType>{DeviceType("T1"), DeviceType("T2"), + DeviceType(DEVICE_GPU), DeviceType(DEVICE_CPU)}), + types()); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/eigen_thread_pool.h b/tensorflow/core/common_runtime/eigen_thread_pool.h new file mode 100644 index 0000000000..2554f3521b --- /dev/null +++ b/tensorflow/core/common_runtime/eigen_thread_pool.h @@ -0,0 +1,22 @@ +#ifndef TENSORFLOW_COMMON_RUNTIME_EIGEN_THREAD_POOL_H_ +#define TENSORFLOW_COMMON_RUNTIME_EIGEN_THREAD_POOL_H_ + +#include "tensorflow/core/lib/core/threadpool.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +class EigenThreadPoolWrapper : public Eigen::ThreadPoolInterface { + public: + explicit EigenThreadPoolWrapper(thread::ThreadPool* pool) : pool_(pool) {} + ~EigenThreadPoolWrapper() override {} + + void Schedule(std::function<void()> fn) override { pool_->Schedule(fn); } + + private: + thread::ThreadPool* pool_ = nullptr; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMMON_RUNTIME_EIGEN_THREAD_POOL_H_ diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc new file mode 100644 index 0000000000..7f2473f93b --- /dev/null +++ b/tensorflow/core/common_runtime/executor.cc @@ -0,0 +1,2118 @@ +#include "tensorflow/core/common_runtime/executor.h" + +#include <atomic> +#include <memory> +#include <string> +#include <unordered_map> +#include <unordered_set> +#include <vector> +#include <deque> + +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/cancellation.h" +#include "tensorflow/core/framework/control_flow.h" +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/op_segment.h" +#include "tensorflow/core/framework/step_stats.pb.h" +#include 
"tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/graph/edgeset.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/notification.h" +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/lib/gtl/stl_util.h" +#include "tensorflow/core/lib/hash/hash.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/stringprintf.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/tracing.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/util/tensor_slice_reader_cache.h" + +namespace tensorflow { + +namespace { + +// 1-D, 0 element tensor. +static const Tensor* const kEmptyTensor = new Tensor; + +bool IsInitializationOp(const Node* node) { + return node->op_def().allows_uninitialized_input(); +} + +// Sets the timeline_label field of *node_stats, using data from *node. +// Returns true iff the node is a transfer node. +// TODO(tucker): merge with the DetailText function in session.cc +// in a common location. +bool SetTimelineLabel(const Node* node, NodeExecStats* node_stats) { + bool is_transfer_node = false; + string memory; + for (auto& all : node_stats->memory()) { + int64 tot = all.total_bytes(); + if (tot >= 0.1 * 1048576.0) { + int64 peak = all.peak_bytes(); + if (peak > 0) { + memory = + strings::StrCat(memory, "[", all.allocator_name(), + strings::Printf(" %.1fMB %.1fMB] ", tot / 1048576.0, + peak / 1048576.0)); + } else { + memory = strings::StrCat(memory, "[", all.allocator_name(), + strings::Printf(" %.1fMB] ", tot / 1048576.0)); + } + } + } + const NodeDef& def = node->def(); + string text = ""; + if (IsSend(node)) { + string tensor_name; + TF_CHECK_OK(GetNodeAttr(def, "tensor_name", &tensor_name)); + string recv_device; + TF_CHECK_OK(GetNodeAttr(def, "recv_device", &recv_device)); + text = strings::StrCat(memory, def.name(), " = ", def.op(), "(", + tensor_name, " @", recv_device); + is_transfer_node = true; + } else if (IsRecv(node)) { + string tensor_name; + TF_CHECK_OK(GetNodeAttr(def, "tensor_name", &tensor_name)); + string send_device; + TF_CHECK_OK(GetNodeAttr(def, "send_device", &send_device)); + text = strings::StrCat(memory, def.name(), " = ", def.op(), "(", + tensor_name, " @", send_device); + is_transfer_node = true; + } else { + text = strings::StrCat( + memory, def.name(), " = ", def.op(), "(", + str_util::Join( + std::vector<StringPiece>(def.input().begin(), def.input().end()), + ", "), + ")"); + } + node_stats->set_timeline_label(text); + return is_transfer_node; +} + +// Helper routines for collecting step stats. 
+namespace nodestats { +inline int64 NowInUsec() { return Env::Default()->NowMicros(); } + +void SetScheduled(NodeExecStats* nt, int64 t) { nt->set_scheduled_micros(t); } + +void SetAllStart(NodeExecStats* nt) { nt->set_all_start_micros(NowInUsec()); } + +void SetOpStart(NodeExecStats* nt) { + DCHECK_NE(nt->all_start_micros(), 0); + nt->set_op_start_rel_micros(NowInUsec() - nt->all_start_micros()); +} + +void SetOpEnd(NodeExecStats* nt) { + DCHECK_NE(nt->all_start_micros(), 0); + nt->set_op_end_rel_micros(NowInUsec() - nt->all_start_micros()); +} + +void SetAllEnd(NodeExecStats* nt) { + DCHECK_NE(nt->all_start_micros(), 0); + nt->set_all_end_rel_micros(NowInUsec() - nt->all_start_micros()); +} + +void SetOutput(NodeExecStats* nt, int slot, AllocationType allocation_type, + const Tensor* v) { + DCHECK(v); + NodeOutput* no = nt->add_output(); + no->set_slot(slot); + no->set_allocation_type(allocation_type); + v->FillDescription(no->mutable_tensor_description()); +} + +void SetMemory(NodeExecStats* nt, OpKernelContext* ctx) { + for (const auto& allocator_pair : ctx->wrapped_allocators()) { + AllocatorMemoryUsed* memory = nt->add_memory(); + // retrieving the sizes from the wrapped allocator removes the + // executor's reference to it, so allocator_pair.second must not + // be dereferenced again after this statement + auto sizes = allocator_pair.second->GetSizesAndUnRef(); + memory->set_allocator_name(allocator_pair.first->Name()); + int tb = sizes.first; + memory->set_total_bytes(tb); + if (allocator_pair.first->TracksAllocationSizes()) { + memory->set_peak_bytes(sizes.second); + } + } +} +} // namespace nodestats + +struct NodeItem { + // A graph node. + const Node* node = nullptr; + + // The kernel for this node. + OpKernel* kernel = nullptr; + + // ExecutorImpl::tensors_[input_start] is the 1st positional input + // for this node. + int input_start = 0; +}; + +// Map from std::pair<node_id, output_index> to attributes. +struct pairhash { + public: + template <typename T, typename U> + std::size_t operator()(const std::pair<T, U>& x) const { + return std::hash<T>()(x.first) ^ std::hash<U>()(x.second); + } +}; +typedef std::unordered_map<std::pair<int, int>, AllocatorAttributes, pairhash> + DevAttrMap; + +typedef gtl::InlinedVector<TensorValue, 4> TensorValueVec; +typedef gtl::InlinedVector<DeviceContext*, 4> DeviceContextVec; +typedef gtl::InlinedVector<AllocatorAttributes, 4> AllocatorAttributeVec; + +class ExecutorImpl : public Executor { + public: + ExecutorImpl(const LocalExecutorParams& p, const Graph* g) + : params_(p), graph_(g) { + CHECK(p.create_kernel != nullptr); + CHECK(p.delete_kernel != nullptr); + } + + ~ExecutorImpl() override { + for (NodeItem& item : nodes_) { + params_.delete_kernel(item.kernel); + } + delete graph_; + } + + Status Initialize(); + + // Infer memory allocation attributes of a node n's output, + // based on its use node dst. Note that dst might not be directly + // connected to n by a single edge, but might be a downstream + // consumer of n's output by reference. *attr is updated with any + // necessary attributes. + Status InferAllocAttr(const Node* n, const Node* dst, + const DeviceNameUtils::ParsedName& local_dev_name, + AllocatorAttributes* attr); + + // Process all Nodes in the current graph, attempting to infer the + // memory allocation attributes to be used wherever they may allocate + // a tensor buffer. 
+ Status SetAllocAttrs(); + + void RunAsync(const Args& args, DoneCallback done) override; + + private: + friend class ExecutorState; + friend class SimpleExecutorState; + + // Owned. + LocalExecutorParams params_; + const Graph* graph_; + std::vector<NodeItem> nodes_; // nodes_.size == graph_.num_node_ids(). + int total_tensors_ = 0; // total_tensors_ = sum(nodes_[*].num_inputs()) + + // The number of inputs for each frame in this graph. This is static + // information of the graph. + std::unordered_map<string, int> frame_input_count_; + + DevAttrMap alloc_attr_; + + TF_DISALLOW_COPY_AND_ASSIGN(ExecutorImpl); +}; + +Status ExecutorImpl::Initialize() { + const int num_nodes = graph_->num_node_ids(); + nodes_.resize(num_nodes); + + Status s; + total_tensors_ = 0; + + // Preprocess every node in the graph to create an instance of op + // kernel for each node; + for (const Node* n : graph_->nodes()) { + const int id = n->id(); + NodeItem* item = &nodes_[id]; + item->node = n; + item->input_start = total_tensors_; + total_tensors_ += n->num_inputs(); + s = params_.create_kernel(n->def(), &item->kernel); + if (!s.ok()) { + s = AttachDef(s, n->def()); + LOG(ERROR) << "Executor failed to create kernel. " << s; + break; + } + CHECK(item->kernel); + + // Initialize static information about the frames in the graph. + if (IsEnter(n)) { + string frame_name; + s = GetNodeAttr(n->def(), "frame_name", &frame_name); + if (!s.ok()) return s; + ++frame_input_count_[frame_name]; + } + } + if (params_.has_control_flow) { + VLOG(2) << "Graph has control flow."; + } + if (!s.ok()) return s; + return SetAllocAttrs(); +} + +Status ExecutorImpl::SetAllocAttrs() { + Status s; + Device* device = params_.device; + DeviceNameUtils::ParsedName local_dev_name = device->parsed_name(); + + for (const Node* n : graph_->nodes()) { + // Examine the out edges of each node looking for special use + // cases that may affect memory allocation attributes. + for (auto e : n->out_edges()) { + AllocatorAttributes attr; + s = InferAllocAttr(n, e->dst(), local_dev_name, &attr); + if (!s.ok()) return s; + if (attr.value != 0) { + VLOG(2) << "node " << n->name() << " gets attr " << attr.value + << " for output " << e->src_output(); + alloc_attr_[std::make_pair(n->id(), e->src_output())].Merge(attr); + } else { + VLOG(2) << "default output attr for node " << n->name() << " output " + << e->src_output(); + } + } + } + return s; +} + +Status ExecutorImpl::InferAllocAttr( + const Node* n, const Node* dst, + const DeviceNameUtils::ParsedName& local_dev_name, + AllocatorAttributes* attr) { + Status s; + if (IsSend(dst)) { + string dst_name; + s = GetNodeAttr(dst->def(), "recv_device", &dst_name); + if (!s.ok()) return s; + DeviceNameUtils::ParsedName parsed_dst_name; + if (!DeviceNameUtils::ParseFullName(dst_name, &parsed_dst_name)) { + s = errors::Internal("Bad recv_device attr '", dst_name, "' in node ", + n->name()); + return s; + } + if (!DeviceNameUtils::IsSameAddressSpace(parsed_dst_name, local_dev_name)) { + // Value is going to be the source of an RPC. + attr->set_nic_compatible(true); + VLOG(2) << "node " << n->name() << " is the source of an RPC out"; + } else if (local_dev_name.type == "CPU" && parsed_dst_name.type == "GPU") { + // Value is going to be the source of a local DMA from CPU to GPU. 
+ attr->set_gpu_compatible(true); + VLOG(2) << "node " << n->name() << " is the source of a cpu->gpu copy"; + } else { + VLOG(2) << "default alloc case local type " << local_dev_name.type + << " remote type " << parsed_dst_name.type; + } + } else if (dst->type_string() == "ToFloat") { + for (auto e : dst->out_edges()) { + s = InferAllocAttr(n, e->dst(), local_dev_name, attr); + if (!s.ok()) return s; + } + } + return s; +} + +// The state associated with one invokation of ExecutorImpl::Run. +// ExecutorState dispatches nodes when they become ready and keeps +// track of how many predecessors of a node have not done (pending_). +class ExecutorState { + public: + ExecutorState(const Executor::Args& args, ExecutorImpl* impl); + ~ExecutorState(); + + void RunAsync(Executor::DoneCallback done); + + private: + typedef ExecutorState ME; + + // Either a tensor pointer (pass-by-reference) or a tensor (pass-by-value). + // TODO(yuanbyu): A better way to do "has_value"? + struct Entry { + Tensor val = *kEmptyTensor; // A tensor value. + Tensor* ref = nullptr; // A tensor reference. + mutex* ref_mu = nullptr; // mutex for *ref if ref is not nullptr. + bool has_value = false; // Whether the value exists + + // Every entry carries an optional DeviceContext containing + // Device-specific information about how the Tensor was produced. + DeviceContext* device_context = nullptr; + + // The attributes of the allocator that creates the tensor. + AllocatorAttributes alloc_attr; + }; + + // Contains a map from node id to the DeviceContext object that was + // assigned by the device at the beginning of a step. + DeviceContextMap device_context_map_; + + struct IterationState { + // The state of an iteration. + + // The pending count for each graph node. One copy per iteration. + // Iteration i can be garbage collected when it is done. + // TODO(yuanbyu): This vector currently has size of the number of nodes + // in this partition. This is not efficient if the subgraph for the frame + // is only a small subset of the partition. We should make the vector + // size to be only the size of the frame subgraph. + std::vector<int>* pending_count; + + // The dead input count for each graph node. One copy per iteration. + std::vector<int>* dead_count; + + // One copy per iteration. For iteration k, i-th node's j-th input is in + // input_tensors[k][impl_->nodes[i].input_start + j]. An entry is either + // a tensor pointer (pass-by-reference) or a tensor (pass-by-value). + // + // NOTE: No need to protect input_tensors[i] by any locks because it + // is resized once. Each element of tensors_ is written once by the + // source node of an edge and is cleared by the destination of the same + // edge. The latter node is never run concurrently with the former node. + std::vector<Entry>* input_tensors; + + // The number of outstanding ops for each iteration. + int outstanding_ops; + + // The number of outstanding frames for each iteration. + int outstanding_frame_count; + + ~IterationState() { + delete pending_count; + delete dead_count; + delete input_tensors; + } + }; + + struct FrameState { + // A new frame is created for each loop. Execution starts at iteration 0. + // When a value at iteration 0 passes through a NextIteration node, + // iteration 1 is created and starts running. Note that iteration 0 may + // still be running so multiple iterations may run in parallel. The + // frame maintains the state of iterations in several data structures + // such as pending_count and input_tensors. 
When iteration 0 completes, + // we garbage collect the state of iteration 0. + // + // A frame instance is considered "done" and can be garbage collected + // if all its inputs have entered and all its iterations are "done". + // + // A frame manages the live iterations of an iterative computation. + // Iteration i is considered "done" when there are no outstanding ops, + // frames at iteration i are done, all recvs for this iteration are + // completed, and iteration i-1 is done. For iteration 0, we instead + // wait for there to be no more pending inputs of the frame. + // + // Frames and iterations are garbage collected once they are done. + // The state we need to keep around is highly dependent on the + // parallelism enabled by the scheduler. We may want to have the + // scheduler dynamically control the outstanding number of live + // parallel frames and iterations. To reduce the state space, the + // scheduler might want to schedule ops in inner frames first and + // lower iterations first. + // + // This frame state is mostly initialized lazily on demand so we + // don't introduce unnecessary overhead. + + // The name of this frame, which is the concatenation of its parent + // frame name, the iteration of the parent frame when this frame was + // created, and the value of the attr 'frame_name'. + string frame_name; + + // The unique id for this frame. Generated by fingerprinting + // frame_name. + uint64 frame_id; + + // The iteration id of its parent frame when this frame is created. + // -1 if there is no parent frame. The frame_name/parent_iter pair + // uniquely identifies this FrameState. + int64 parent_iter = -1; + + // The FrameState of its parent frame. + FrameState* parent_frame = nullptr; + + // The highest iteration number we have reached so far in this frame. + int64 iteration_count = 0; + + // The number of inputs this frame is still waiting. + int num_pending_inputs = 0; + + // The number of outstanding iterations. + int num_outstanding_iterations = 0; + + // The maximum allowed number of parallel iterations. + int max_parallel_iterations = 1; + + // The iteration states of this frame. + std::vector<IterationState*> iterations; + + // The NextIteration nodes to enter a new iteration. If the number of + // outstanding iterations reaches the limit, we will defer the start of + // the next iteration until the number of outstanding iterations falls + // below the limit. + std::vector<std::pair<const Node*, Entry>> next_iter_roots; + + // The values of the loop invariants for this loop. They are added into + // this list as they "enter" the frame. When a loop invariant enters, + // we make it available to all active iterations. When the frame starts + // a new iteration, we make all the current loop invariants available + // to the new iteration. + std::vector<std::pair<const Node*, Entry>> inv_values; + + // The list of dead exit nodes for the current highest iteration. We + // will only "execute" the dead exits of the final iteration. + std::vector<const Node*> dead_exits; + + IterationState* GetIteration(int64 iter) { + int index = iter % iterations.size(); + return iterations[index]; + } + + void SetIteration(int64 iter, IterationState* state) { + int index = iter % iterations.size(); + iterations[index] = state; + } + + ~FrameState() { + for (size_t i = 0; i < iterations.size(); ++i) { + delete iterations[i]; + iterations[i] = nullptr; + } + } + }; + + // A tagged node: <frame*, iter, node*>. 
+ struct TaggedNode { + const Node* node = nullptr; + FrameState* input_frame = nullptr; + int64 input_iter = -1; + bool is_dead = false; + + TaggedNode(const Node* t_node, FrameState* in_frame, int64 in_iter, + bool dead) { + node = t_node; + input_frame = in_frame; + input_iter = in_iter; + is_dead = dead; + } + }; + + typedef gtl::InlinedVector<TaggedNode, 8> TaggedNodeSeq; + typedef gtl::InlinedVector<Entry, 4> EntryVector; + + // Not owned. + Rendezvous* rendezvous_; + StepStatsCollector* stats_collector_; + // QUESTION: Make it a checkpoint::TensorSliceReaderCacheWrapper instead of a + // pointer? (avoids having to delete). + checkpoint::TensorSliceReaderCacheWrapper* slice_reader_cache_; + FunctionCallFrame* call_frame_; + const ExecutorImpl* impl_; + CancellationManager* cancellation_manager_; + Executor::Args::Runner runner_; + + // Owned. + + // Step-local resource manager. + ResourceMgr step_resource_manager_; + + // The root frame in which the execution of this step is started. + FrameState* root_frame_; + + // Invoked when the execution finishes. + Executor::DoneCallback done_cb_; + + std::atomic_int_fast32_t num_outstanding_ops_; + + mutex mu_; + Status status_ GUARDED_BY(mu_); + + // Mapping from frame name to outstanding frames. A new frame is created + // at some iteration of an active frame. So the unique key for the new + // child frame is composed of the name of the parent frame, the iteration + // number at which the parent frame is creating the new frame, and the + // name of the new frame from nodedef. + std::unordered_map<string, FrameState*> outstanding_frames_ GUARDED_BY(mu_); + + // The unique name of a frame. + inline string MakeFrameName(FrameState* frame, int64 iter_id, string name) { + return strings::StrCat(frame->frame_name, ";", iter_id, ";", name); + } + + // Initialize the pending count for a graph. + static void InitializePending(const Graph* graph, std::vector<int>* pending); + + // Find an existing or create a new child frame in the frame 'frame' at + // iteration 'iter'. + void FindOrCreateChildFrame(FrameState* frame, int64 iter, const Node* node, + FrameState** child) EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Increments the iteration id. If this is a new iteration, initialize it. + void IncrementIteration(FrameState* frame, TaggedNodeSeq* ready) + EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Returns true if the computation in the frame is completed. + bool IsFrameDone(FrameState* frame) EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Returns true if the iteration of the frame is completed. + bool IsIterationDone(FrameState* frame, int64 iter) + EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Get the output frame/iter of a node. Create new frame/iteration if + // needed. If there are dead roots for the new iteration, we need to + // "execute" them so ad them to the ready queue. Returns true if + // we need to check for the completion of output frame/iter. + bool SetOutputFrameIter(const TaggedNode& tagged_node, + const EntryVector& outputs, FrameState** frame, + int64* iter, TaggedNodeSeq* ready) + EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Cleanup frames and iterations + void CleanupFramesIterations(FrameState* frame, int64 iter, + TaggedNodeSeq* ready) + EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Activate all the deferred NextIteration nodes in a new iteration. + void ActivateNexts(FrameState* frame, int64 iter, TaggedNodeSeq* ready) + EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Activate all the current loop invariants in a new iteration. 
+ void ActivateLoopInvs(FrameState* frame, int64 iter, TaggedNodeSeq* ready) + EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Add a new loop invariant and make it available to all active iterations. + void AddLoopInv(FrameState* frame, const Node* node, const Entry& value, + TaggedNodeSeq* ready) EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Activate the successors of a node. + void ActivateNode(const Node* node, const bool is_dead, FrameState* frame, + int64 iter, const EntryVector& outputs, + TaggedNodeSeq* ready) EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Process a ready node in current thread. + void Process(TaggedNode node, int64 scheduled_usec); + + // Before invoking item->kernel, fills in its "inputs". + Status PrepareInputs(const NodeItem& item, Entry* first_input, + TensorValueVec* inputs, + DeviceContextVec* input_device_contexts, + AllocatorAttributeVec* input_alloc_attrs, + bool* is_input_dead); + + // After item->kernel computation is done, processes its outputs. + Status ProcessOutputs(const NodeItem& item, OpKernelContext* ctx, + EntryVector* outputs, NodeExecStats* stats); + + // After processing the outputs, propagates the outputs to their dsts. + void PropagateOutputs(const TaggedNode& tagged_node, + const EntryVector& outputs, TaggedNodeSeq* ready); + + // "node" just finishes. Takes ownership of "stats". Returns true if + // execution has completed. + bool NodeDone(const Status& s, const Node* node, const TaggedNodeSeq& ready, + NodeExecStats* stats, std::deque<TaggedNode>* inline_ready); + + // Call Process() on all nodes in 'inline_ready'. + void ProcessInline(const std::deque<TaggedNode>& inline_ready); + + // Schedule all the expensive nodes in 'ready', and put all the inexpensive + // nodes in 'ready' into 'inline_ready'. + void ScheduleReady(const TaggedNodeSeq& ready, + std::deque<TaggedNode>* inline_ready); + + // One thread of control finishes. + void Finish(); +}; + +ExecutorState::ExecutorState(const Executor::Args& args, ExecutorImpl* impl) + : rendezvous_(args.rendezvous), + stats_collector_(args.stats_collector), + slice_reader_cache_(new checkpoint::TensorSliceReaderCacheWrapper), + call_frame_(args.call_frame), + impl_(impl), + cancellation_manager_(args.cancellation_manager), + runner_(args.runner), + num_outstanding_ops_(0) { + // We start the entire execution in iteration 0 of the root frame + // so let us create the root frame and the state for iteration 0. + // Initialize the frame. + root_frame_ = new FrameState; + root_frame_->frame_name = "_root"; // assume to be unique + root_frame_->frame_id = 0; // must be 0 + root_frame_->num_pending_inputs = 0; + root_frame_->num_outstanding_iterations = 1; + root_frame_->max_parallel_iterations = 1; // enough for root frame + root_frame_->iterations.resize(root_frame_->max_parallel_iterations); + + VLOG(2) << "Create frame: " << root_frame_->frame_name; + + // Initialize the iteration. + IterationState* iter_state = new IterationState; + root_frame_->iterations[0] = iter_state; + iter_state->outstanding_ops = 0; + iter_state->outstanding_frame_count = 0; + iter_state->pending_count = new std::vector<int>; + iter_state->dead_count = new std::vector<int>(impl->graph_->num_node_ids()); + iter_state->input_tensors = new std::vector<Entry>(impl_->total_tensors_); + + // Initialize the executor state. 
+ outstanding_frames_.insert({root_frame_->frame_name, root_frame_}); +} + +ExecutorState::~ExecutorState() { + for (auto name_frame : outstanding_frames_) { + delete name_frame.second; + } + + for (auto it : device_context_map_) { + it.second->Unref(); + } + + delete slice_reader_cache_; +} + +void ExecutorState::InitializePending(const Graph* graph, + std::vector<int>* pending) { + pending->resize(graph->num_node_ids()); + for (const Node* n : graph->nodes()) { + const int id = n->id(); + const int num_in_edges = n->in_edges().size(); + if (IsMerge(n)) { + // merge waits all control inputs so we initialize the pending + // count to be the number of control edges. + int32 num_control_edges = 0; + for (const Edge* edge : n->in_edges()) { + if (edge->IsControlEdge()) { + num_control_edges++; + } + } + // Use bit 0 to indicate if there is a ready live data input. + (*pending)[id] = num_control_edges << 1; + } else { + (*pending)[id] = num_in_edges; + } + } +} + +void ExecutorState::RunAsync(Executor::DoneCallback done) { + const Graph* graph = impl_->graph_; + TaggedNodeSeq ready; + + { + // Initialize the executor state. We grab the mutex here just to + // keep the thread safety analysis happy. + mutex_lock l(mu_); + std::vector<int>* pending = root_frame_->iterations[0]->pending_count; + InitializePending(graph, pending); + } + + // Ask the device to fill in the device context map. + Device* device = impl_->params_.device; + device->FillContextMap(graph, &device_context_map_); + + // Initialize the ready queue. + for (const Node* n : graph->nodes()) { + const int num_in_edges = n->in_edges().size(); + if (num_in_edges == 0) { + ready.push_back(TaggedNode{n, root_frame_, 0, false}); + } + } + if (ready.empty()) { + done(Status::OK()); + } else { + num_outstanding_ops_ = ready.size(); + root_frame_->iterations[0]->outstanding_ops = ready.size(); + done_cb_ = done; + // Schedule to run all the ready ops in thread pool. + ScheduleReady(ready, nullptr); + } +} + +namespace { + +// This function is provided for use by OpKernelContext when allocating +// the index'th output of node. It provides access to the +// AllocatorAttributes computed during initialization to determine in +// which memory region the tensor should be allocated. +AllocatorAttributes OutputAttributes(const DevAttrMap* attr_map, + const Node* node, + const OpKernel* op_kernel, int index) { + DCHECK_GE(index, 0); + + AllocatorAttributes attr; + int nid = node->id(); + const auto& iter = attr_map->find(std::make_pair(nid, index)); + if (iter != attr_map->end()) { + attr = iter->second; + VLOG(2) << "nondefault attr " << attr.value << " for node " << node->name() + << " output " << index; + } else { + VLOG(2) << "default attr for node " << node->name() << " output " << index; + } + + DCHECK_LT(index, op_kernel->output_memory_types().size()); + bool on_host = op_kernel->output_memory_types()[index] == HOST_MEMORY; + attr.set_on_host(on_host); + return attr; +} + +// Helpers to make a copy of 'p' and makes a copy of the input type +// vector and the device context vector. +// +// NOTE: We need to make a copy of p.input for asynchronous kernel +// because OpKernelContext methods like input_type(i) needs the param +// points to valid input type vector. It's not an issue for sync +// kernels because the type vector is kept on the stack. 
+OpKernelContext::Params* CopyParams(const OpKernelContext::Params& p) { + OpKernelContext::Params* ret = new OpKernelContext::Params; + *ret = p; + ret->inputs = new TensorValueVec(*p.inputs); + ret->input_device_contexts = new DeviceContextVec(*p.input_device_contexts); + ret->input_alloc_attrs = new AllocatorAttributeVec(*p.input_alloc_attrs); + return ret; +} + +// Helpers to delete 'p' and copies made by CopyParams. +void DeleteParams(OpKernelContext::Params* p) { + delete p->inputs; + delete p->input_device_contexts; + delete p->input_alloc_attrs; + delete p; +} + +} // namespace + +void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_usec) { + const std::vector<NodeItem>& nodes = impl_->nodes_; + TaggedNodeSeq ready; + std::deque<TaggedNode> inline_ready; + + // Parameters passed to OpKernel::Compute. + TensorValueVec inputs; + DeviceContextVec input_device_contexts; + AllocatorAttributeVec input_alloc_attrs; + + OpKernelContext::Params params; + Device* device = impl_->params_.device; + params.device = device; + // track allocations if and only if we are collecting statistics + params.track_allocations = (stats_collector_ != nullptr); + params.rendezvous = rendezvous_; + params.cancellation_manager = cancellation_manager_; + params.call_frame = call_frame_; + params.function_library = impl_->params_.function_library; + params.resource_manager = device->resource_manager(); + params.step_resource_manager = &step_resource_manager_; + params.slice_reader_cache = slice_reader_cache_; + params.inputs = &inputs; + params.input_device_contexts = &input_device_contexts; + params.input_alloc_attrs = &input_alloc_attrs; + + Status s; + NodeExecStats* stats = nullptr; + EntryVector outputs; + bool completed = false; + inline_ready.push_back(tagged_node); + while (!inline_ready.empty()) { + tagged_node = inline_ready.front(); + inline_ready.pop_front(); + const Node* node = tagged_node.node; + FrameState* input_frame = tagged_node.input_frame; + int64 input_iter = tagged_node.input_iter; + const int id = node->id(); + const NodeItem& item = nodes[id]; + + // Set the device_context for this node id, if it exists. + auto dc_it = device_context_map_.find(id); + if (dc_it != device_context_map_.end()) { + params.op_device_context = dc_it->second; + } + + if (stats_collector_) { + stats = new NodeExecStats; + stats->set_node_name(node->name()); + nodestats::SetScheduled(stats, scheduled_usec); + nodestats::SetAllStart(stats); + } + + VLOG(1) << "Process node: " << id << " " << SummarizeNodeDef(node->def()); + + std::vector<Entry>* input_tensors; + { + // Need the lock because the iterations vector could be resized by + // another thread. + mutex_lock l(mu_); + input_tensors = input_frame->GetIteration(input_iter)->input_tensors; + } + Entry* first_input = input_tensors->data() + item.input_start; + outputs.clear(); + outputs.resize(node->num_outputs()); + + // Only execute this node if it is not dead or it is a send/recv + // transfer node. For transfer nodes, we need to propagate the "dead" + // bit even when the node is dead. + AsyncOpKernel* async = nullptr; + if (!tagged_node.is_dead || IsTransferNode(node)) { + // Prepares inputs. + bool is_input_dead = false; + s = PrepareInputs(item, first_input, &inputs, &input_device_contexts, + &input_alloc_attrs, &is_input_dead); + if (!s.ok()) { + // Continue to process the nodes in 'inline_ready'. + completed = NodeDone(s, item.node, ready, stats, &inline_ready); + continue; + } + + // Set up compute params. 
+ OpKernel* op_kernel = item.kernel; + params.op_kernel = op_kernel; + params.frame_iter = FrameAndIter(input_frame->frame_id, input_iter); + params.is_input_dead = is_input_dead; + params.output_alloc_attr = [this, node, op_kernel](int index) { + return OutputAttributes(&impl_->alloc_attr_, node, op_kernel, index); + }; + + async = op_kernel->AsAsync(); + if (async) { + // Asynchronous computes. + auto pcopy = CopyParams(params); + auto ctx = new OpKernelContext(*pcopy); + auto done = [this, tagged_node, item, first_input, ctx, stats, + pcopy]() { + VLOG(2) << this << " Async kernel done: " + << SummarizeNodeDef(item.node->def()); + if (stats_collector_) nodestats::SetOpEnd(stats); + EntryVector outputs; + Status s = ProcessOutputs(item, ctx, &outputs, stats); + if (stats_collector_) nodestats::SetMemory(stats, ctx); + // Clears inputs. + int num_inputs = tagged_node.node->num_inputs(); + for (int i = 0; i < num_inputs; ++i) { + (first_input + i)->val = *kEmptyTensor; + } + TaggedNodeSeq ready; + if (s.ok()) { + PropagateOutputs(tagged_node, outputs, &ready); + } + // Schedule to run all the ready ops in thread pool. + bool completed = NodeDone(s, item.node, ready, stats, nullptr); + delete ctx; + DeleteParams(pcopy); + if (completed) Finish(); + }; + if (stats_collector_) nodestats::SetOpStart(stats); + device->ComputeAsync(async, ctx, done); + } else { + // Synchronous computes. + OpKernelContext ctx(params); + if (stats_collector_) nodestats::SetOpStart(stats); + device->Compute(CHECK_NOTNULL(op_kernel), &ctx); + if (stats_collector_) nodestats::SetOpEnd(stats); + + // Processes outputs. + s = ProcessOutputs(item, &ctx, &outputs, stats); + if (stats_collector_) nodestats::SetMemory(stats, &ctx); + } + } + + if (!async) { + // Clears inputs. + int num_inputs = node->num_inputs(); + for (int i = 0; i < num_inputs; ++i) { + (first_input + i)->val = *kEmptyTensor; + } + // Propagates outputs. + if (s.ok()) { + PropagateOutputs(tagged_node, outputs, &ready); + } + if (stats_collector_) { + scheduled_usec = nodestats::NowInUsec(); + } + // Postprocess. + completed = NodeDone(s, item.node, ready, stats, &inline_ready); + } + } // while !inline_ready.empty() + + // This thread of computation is done if completed = true. + if (completed) Finish(); +} + +Status ExecutorState::PrepareInputs(const NodeItem& item, Entry* first_input, + TensorValueVec* inputs, + DeviceContextVec* input_device_contexts, + AllocatorAttributeVec* input_alloc_attrs, + bool* is_input_dead) { + const Node* node = item.node; + + inputs->clear(); + inputs->resize(node->num_inputs()); + input_device_contexts->clear(); + input_device_contexts->resize(node->num_inputs()); + input_alloc_attrs->clear(); + input_alloc_attrs->resize(node->num_inputs()); + + *is_input_dead = false; + + bool is_merge = IsMerge(node); + for (int i = 0; i < node->num_inputs(); ++i) { + const bool expect_ref = IsRefType(node->input_type(i)); + Entry* entry = first_input + i; + (*input_device_contexts)[i] = entry->device_context; + (*input_alloc_attrs)[i] = entry->alloc_attr; + + // i-th input. + TensorValue* inp = &(*inputs)[i]; + + // Only merge and transfer nodes can have no-value inputs. 
+ if (!entry->has_value) { + if (!is_merge) { + DCHECK(IsTransferNode(node)); + inp->tensor = &entry->val; + *is_input_dead = true; + } + continue; + } + if (entry->ref == nullptr) { + if (expect_ref) { + return AttachDef( + errors::InvalidArgument(i, "-th input expects a ref type"), + item.kernel->def()); + } + inp->tensor = &entry->val; + } else { + if (!entry->ref->IsInitialized() && !IsInitializationOp(item.node)) { + return AttachDef( + errors::FailedPrecondition("Attempting to use uninitialized value ", + item.kernel->def().input(i)), + item.kernel->def()); + } + if (expect_ref) { + inp->mutex_if_ref = entry->ref_mu; + inp->tensor = entry->ref; + } else { + // Automatically deref the tensor ref when the op expects a + // tensor but is given a ref to a tensor. Need to deref it + // under the mutex. + { + mutex_lock l(*(entry->ref_mu)); + entry->val = *entry->ref; + } + inp->tensor = &entry->val; + } + } + } + return Status::OK(); +} + +Status ExecutorState::ProcessOutputs(const NodeItem& item, OpKernelContext* ctx, + EntryVector* outputs, + NodeExecStats* stats) { + const Node* node = item.node; + outputs->clear(); + outputs->resize(node->num_outputs()); + + Status s = ctx->status(); + if (!s.ok()) { + s = AttachDef(s, item.kernel->def()); + LOG(WARNING) << this << " Compute status: " << s; + return s; + } + + // Get the device_context for this node id, if it exists. + DeviceContext* device_context = nullptr; + auto dc_it = device_context_map_.find(node->id()); + if (dc_it != device_context_map_.end()) { + device_context = dc_it->second; + } + + for (int i = 0; i < node->num_outputs(); ++i) { + TensorValue val = ctx->release_output(i); + // Only Switch and Recv nodes can generate new dead outputs + if (*ctx->is_output_dead() || val.tensor == nullptr) { + DCHECK(IsSwitch(node) || IsRecv(node)); + } else { + Entry* out = &((*outputs)[i]); + out->has_value = true; + + // Set the device context of the output entry. + out->device_context = device_context; + + // Set the allocator attributes of the output entry. + out->alloc_attr = ctx->output_alloc_attr(i); + + // Sanity check of output tensor types. + DataType dtype = val->dtype(); + if (val.is_ref()) dtype = MakeRefType(dtype); + if (dtype == node->output_type(i)) { + if (val.is_ref()) { + out->ref = val.tensor; + out->ref_mu = val.mutex_if_ref; + } else { + out->val = *val.tensor; + } + if (stats_collector_ && val.tensor->IsInitialized()) { + nodestats::SetOutput(stats, i, ctx->output_allocation_type(i), + val.tensor); + } + } else { + s.Update(errors::Internal("Output ", i, " of type ", + DataTypeString(dtype), + " does not match declared output type ", + DataTypeString(node->output_type(i)), + " for node ", SummarizeNodeDef(node->def()))); + } + } + if (!val.is_ref()) { + // If OpKernelContext returns outputs via pass-by-value, we + // don't need this trouble. + delete val.tensor; + } + } + return s; +} + +void ExecutorState::PropagateOutputs(const TaggedNode& tagged_node, + const EntryVector& outputs, + TaggedNodeSeq* ready) { + FrameState* input_frame = tagged_node.input_frame; + int64 input_iter = tagged_node.input_iter; + + // Propagates outputs along out edges, and puts newly ready nodes + // into the ready queue. + ready->clear(); + + { + FrameState* output_frame = input_frame; + int64 output_iter = input_iter; + + mutex_lock l(mu_); + // Sets the output_frame and output_iter of node. 
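A brief aside on the automatic dereference near the end of PrepareInputs() above: when the kernel expects a plain value but the producer handed over a reference, the executor copies the referenced tensor while holding the ref's mutex and feeds the kernel that snapshot. A minimal model of the same idea with standard types (all names here are hypothetical):

#include <mutex>

struct RefEntryModel {
  int* ref = nullptr;            // stands in for Tensor* ref
  std::mutex* ref_mu = nullptr;  // guards *ref
  int val = 0;                   // stands in for the by-value Tensor slot
};

const int* DerefForValueInput(RefEntryModel* entry) {
  {
    std::lock_guard<std::mutex> l(*entry->ref_mu);
    entry->val = *entry->ref;  // snapshot taken under the producer's mutex
  }
  return &entry->val;          // the kernel reads a stable copy
}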
+ bool maybe_completed = SetOutputFrameIter( + tagged_node, outputs, &output_frame, &output_iter, ready); + if (output_frame != nullptr) { + // Continue to process the out nodes: + ActivateNode(tagged_node.node, tagged_node.is_dead, output_frame, + output_iter, outputs, ready); + } + + // At this point, this node is completely done. + input_frame->GetIteration(input_iter)->outstanding_ops--; + CleanupFramesIterations(input_frame, input_iter, ready); + + // The execution of a node such as Enter may cause the completion of + // output_frame:output_iter, so perform cleanup if output_frame:output_iter + // is indeed completed. + if (maybe_completed) { + CleanupFramesIterations(output_frame, output_iter, ready); + } + } +} + +void ExecutorState::ActivateNode(const Node* node, const bool is_dead, + FrameState* output_frame, int64 output_iter, + const EntryVector& outputs, + TaggedNodeSeq* ready) { + const std::vector<NodeItem>& nodes = impl_->nodes_; + IterationState* output_iter_state = output_frame->GetIteration(output_iter); + std::vector<int>* pending = output_iter_state->pending_count; + std::vector<int>* dead_count = output_iter_state->dead_count; + for (const Edge* e : node->out_edges()) { + const Node* dst_node = e->dst(); + const int dst_id = dst_node->id(); + const int src_slot = e->src_output(); + + bool dst_dead = false; + bool dst_ready = false; + bool dst_need_input = !e->IsControlEdge(); + if (IsMerge(dst_node)) { + // A merge node is ready if a) all control edges are enabled and a + // live data input becomes available, or b) all control edges are + // enabled and all data inputs are dead. + if (e->IsControlEdge()) { + (*pending)[dst_id] -= 2; + int count = (*pending)[dst_id]; + dst_dead = ((*dead_count)[dst_id] == dst_node->num_inputs()); + dst_ready = (count == 1) || ((count == 0) && dst_dead); + } else { + if (outputs[src_slot].has_value) { + // This is a live data input. + int count = (*pending)[dst_id]; + (*pending)[dst_id] |= 0x1; + dst_ready = (count == 0); + } else { + // This is a dead data input. + ++(*dead_count)[dst_id]; + dst_dead = ((*dead_count)[dst_id] == dst_node->num_inputs()); + dst_ready = ((*pending)[dst_id] == 0) && dst_dead; + } + // This input for dst is not needed if !dst_ready. We suppress the + // propagation to make the thread safety analysis happy. + dst_need_input = dst_ready; + } + } else { + // A non-merge node is ready if all its inputs are ready. We wait + // for all inputs to come in even if we know the node is dead. This + // ensures that all input tensors get cleaned up. + if (is_dead || (!e->IsControlEdge() && !outputs[src_slot].has_value)) { + ++(*dead_count)[dst_id]; + } + dst_dead = (*dead_count)[dst_id] > 0; + dst_ready = (--(*pending)[dst_id] == 0); + } + + if (dst_need_input) { + const NodeItem& dst_item = nodes[dst_id]; + const int dst_slot = e->dst_input(); + std::vector<Entry>* input_tensors = output_iter_state->input_tensors; + int dst_loc = dst_item.input_start + dst_slot; + (*input_tensors)[dst_loc] = outputs[src_slot]; + } + + // Add dst to the ready queue if it's ready + if (dst_ready) { + dst_dead = dst_dead && !IsControlTrigger(dst_node); + ready->push_back( + TaggedNode(dst_node, output_frame, output_iter, dst_dead)); + output_iter_state->outstanding_ops++; + } + } +} + +void ExecutorState::ActivateNexts(FrameState* frame, int64 iter, + TaggedNodeSeq* ready) { + // Propagate the deferred NextIteration nodes to the new iteration. 
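The Merge-node arithmetic in ActivateNode() above packs two facts into one pending counter: every incoming control edge contributes 2, and bit 0 records that a live data input has arrived, so a count of exactly 1 means "all control edges done, live value available". (This assumes the counter starts at twice the number of control edges, which matches the arithmetic here but is set up elsewhere in the file.) A self-contained walk-through for a Merge node with two control edges:

#include <cassert>

int main() {
  const int num_control_edges = 2;
  int pending = num_control_edges * 2;  // bit 0 stays free for the data input

  // First control edge fires: 4 -> 2. Not ready yet.
  pending -= 2;
  assert(pending != 1);

  // A live data input arrives while the count is nonzero: not ready yet,
  // but bit 0 now remembers that a value is available.
  int count = pending;
  pending |= 0x1;  // pending is now 3
  assert(count != 0);

  // Second control edge fires: 3 -> 1, the exact "ready" state.
  pending -= 2;
  assert(pending == 1);
  return 0;
}

The dead-input branch, which works off the separate dead_count vector, is left out of this sketch.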
+ for (auto& node_entry : frame->next_iter_roots) { + const Node* node = node_entry.first; + const Entry& entry = node_entry.second; + const bool is_dead = !entry.has_value; + ActivateNode(node, is_dead, frame, iter, {entry}, ready); + } + frame->next_iter_roots.clear(); +} + +void ExecutorState::ActivateLoopInvs(FrameState* frame, int64 iter, + TaggedNodeSeq* ready) { + // Propagate loop invariants to the new iteration. + for (auto& node_entry : frame->inv_values) { + const Node* node = node_entry.first; + const Entry& entry = node_entry.second; + const bool is_dead = !entry.has_value; + ActivateNode(node, is_dead, frame, iter, {entry}, ready); + } +} + +void ExecutorState::AddLoopInv(FrameState* frame, const Node* node, + const Entry& entry, TaggedNodeSeq* ready) { + // Store this value. + frame->inv_values.push_back({node, entry}); + + // Make this value available to all iterations. + bool is_dead = !entry.has_value; + for (int i = 1; i <= frame->iteration_count; ++i) { + ActivateNode(node, is_dead, frame, i, {entry}, ready); + } +} + +bool ExecutorState::NodeDone(const Status& s, const Node* node, + const TaggedNodeSeq& ready, NodeExecStats* stats, + std::deque<TaggedNode>* inline_ready) { + if (stats_collector_) { + nodestats::SetAllEnd(stats); + if (!SetTimelineLabel(node, stats)) { + // Only record non-transfer nodes. + stats_collector_->Save(impl_->params_.device->name(), stats); + } else { + delete stats; + } + } + + Rendezvous* captured_rendezvous = nullptr; // Will be set on error. + if (!s.ok()) { + // Some error happened. This thread of computation is done. + mutex_lock l(mu_); + if (status_.ok()) { + captured_rendezvous = rendezvous_; + if (captured_rendezvous) captured_rendezvous->Ref(); + status_ = s; + } + } + if (captured_rendezvous) { + // If we captured the rendezvous_ pointer, we are in an error condition. + // Use captured_rendezvous, in case "this" is deleted by another thread. + TRACEPRINTF("StartAbort: %s", s.ToString().c_str()); + captured_rendezvous->StartAbort(s); + captured_rendezvous->Unref(); + } + + bool completed = false; + int ready_size = ready.size(); + if (ready_size == 0 || !s.ok()) { + completed = (num_outstanding_ops_.fetch_sub(1) == 1); + } else if (ready_size > 1) { + num_outstanding_ops_.fetch_add(ready_size - 1, std::memory_order_relaxed); + } + + // Schedule the ready nodes in 'ready'. + if (s.ok()) { + ScheduleReady(ready, inline_ready); + } + return completed; +} + +void ExecutorState::ProcessInline(const std::deque<TaggedNode>& inline_ready) { + if (inline_ready.empty()) return; + int64 scheduled_usec = 0; + if (stats_collector_) { + scheduled_usec = nodestats::NowInUsec(); + } + for (auto& tagged_node : inline_ready) { + Process(tagged_node, scheduled_usec); + } +} + +void ExecutorState::ScheduleReady(const TaggedNodeSeq& ready, + std::deque<TaggedNode>* inline_ready) { + if (ready.empty()) return; + + int64 scheduled_usec = 0; + if (stats_collector_) { + scheduled_usec = nodestats::NowInUsec(); + } + if (inline_ready == nullptr) { + // Schedule to run all the ready ops in thread pool. + for (auto& tagged_node : ready) { + runner_(std::bind(&ME::Process, this, tagged_node, scheduled_usec)); + } + return; + } + const std::vector<NodeItem>& nodes = impl_->nodes_; + const TaggedNode* curr_expensive_node = nullptr; + for (auto& tagged_node : ready) { + const NodeItem& item = nodes[tagged_node.node->id()]; + if (tagged_node.is_dead || !item.kernel->IsExpensive()) { + // Inline this inexpensive node. 
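NodeDone() above keeps the whole step alive with a single outstanding-op counter: a node that finishes with k ready successors changes the count by k - 1, and the thread that drives it to zero is the one that calls Finish(). A compilable miniature of just that accounting (the names are the editor's, not TensorFlow's):

#include <atomic>
#include <cassert>

bool NodeDoneModel(std::atomic<int>* outstanding, int ready_size) {
  if (ready_size == 0) {
    // fetch_sub returns the previous value; 1 means this was the last op.
    return outstanding->fetch_sub(1) == 1;
  }
  if (ready_size > 1) {
    outstanding->fetch_add(ready_size - 1, std::memory_order_relaxed);
  }
  return false;  // ready_size == 1 leaves the count unchanged
}

int main() {
  std::atomic<int> outstanding{1};          // one root node scheduled
  assert(!NodeDoneModel(&outstanding, 2));  // root fans out to two nodes
  assert(!NodeDoneModel(&outstanding, 0));  // first successor finishes
  assert(NodeDoneModel(&outstanding, 0));   // last one finishes -> step done
  return 0;
}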
+ inline_ready->push_back(tagged_node); + } else { + if (curr_expensive_node) { + // Dispatch to another thread since there is plenty of work to + // do for this thread. + runner_(std::bind(&ME::Process, this, *curr_expensive_node, + scheduled_usec)); + } + curr_expensive_node = &tagged_node; + } + } + if (curr_expensive_node) { + if (inline_ready->empty()) { + // Tail recursion optimization + inline_ready->push_back(*curr_expensive_node); + } else { + // There are inline nodes to run already. We dispatch this expensive + // node to other thread. + runner_( + std::bind(&ME::Process, this, *curr_expensive_node, scheduled_usec)); + } + } +} + +void ExecutorState::Finish() { + mu_.lock(); + auto status = status_; + auto done_cb = done_cb_; + auto runner = runner_; + mu_.unlock(); + delete this; + CHECK(done_cb != nullptr); + runner([done_cb, status]() { done_cb(status); }); +} + +bool ExecutorState::IsFrameDone(FrameState* frame) { + return (frame->num_pending_inputs == 0 && + frame->num_outstanding_iterations == 0); +} + +bool ExecutorState::IsIterationDone(FrameState* frame, int64 iter) { + IterationState* iter_state = frame->GetIteration(iter); + if (iter_state->outstanding_ops == 0 && + iter_state->outstanding_frame_count == 0) { + if (iter == 0) { + // The enclosing frame has no pending input. + return frame->num_pending_inputs == 0; + } else { + // The preceding iteration is deleted (and therefore done). + return (frame->GetIteration(iter - 1) == nullptr); + } + } + return false; +} + +void ExecutorState::FindOrCreateChildFrame(FrameState* frame, int64 iter, + const Node* node, + FrameState** child) { + // Get the child frame name. + string enter_name; + Status s = GetNodeAttr(node->def(), "frame_name", &enter_name); + CHECK(s.ok()) << s; + const string child_name = MakeFrameName(frame, iter, enter_name); + + auto it = outstanding_frames_.find(child_name); + if (it != outstanding_frames_.end()) { + *child = it->second; + } else { + // Need to create a new frame instance. + VLOG(2) << "Create frame: " << child_name; + + FrameState* temp = new FrameState; + temp->frame_name = child_name; + temp->frame_id = Hash64(child_name); + temp->parent_frame = frame; + temp->parent_iter = iter; + s = GetNodeAttr(node->def(), "parallel_iterations", + &temp->max_parallel_iterations); + CHECK(s.ok()) << s; + // 'iterations' is a fixed-length circular buffer. 
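The policy in ScheduleReady() above is: queue inexpensive nodes for inline execution on the current thread, hand expensive nodes to the runner, and if the only work produced is a single expensive node, keep it inline instead of bouncing it to another thread. A standalone model of that decision logic (ReadyNode and the runner signature are simplifications by the editor):

#include <deque>
#include <functional>
#include <vector>

struct ReadyNode {
  int id;
  bool expensive;
};

void ScheduleReadyModel(const std::vector<ReadyNode>& ready,
                        std::deque<int>* inline_ready,
                        const std::function<void(int)>& runner) {
  const ReadyNode* curr_expensive = nullptr;
  for (const ReadyNode& n : ready) {
    if (!n.expensive) {
      inline_ready->push_back(n.id);  // cheap: run it on this thread
    } else {
      // This thread already holds an expensive node; ship the older one out.
      if (curr_expensive != nullptr) runner(curr_expensive->id);
      curr_expensive = &n;
    }
  }
  if (curr_expensive != nullptr) {
    if (inline_ready->empty()) {
      inline_ready->push_back(curr_expensive->id);  // "tail recursion" case
    } else {
      runner(curr_expensive->id);  // there is inline work to do already
    }
  }
}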
+ temp->iterations.resize(temp->max_parallel_iterations + 1); + IterationState* iter_state = new IterationState; + temp->iterations[0] = iter_state; + + iter_state->outstanding_ops = 0; + iter_state->outstanding_frame_count = 0; + iter_state->pending_count = new std::vector<int>; + InitializePending(impl_->graph_, iter_state->pending_count); + iter_state->dead_count = + new std::vector<int>(impl_->graph_->num_node_ids()); + iter_state->input_tensors = new std::vector<Entry>(impl_->total_tensors_); + + auto frame_pending = impl_->frame_input_count_.find(enter_name); + DCHECK(frame_pending != impl_->frame_input_count_.end()); + temp->num_pending_inputs = frame_pending->second; + temp->num_outstanding_iterations = 1; + *child = temp; + + frame->GetIteration(iter)->outstanding_frame_count++; + outstanding_frames_[child_name] = temp; + } +} + +void ExecutorState::IncrementIteration(FrameState* frame, + TaggedNodeSeq* ready) { + frame->iteration_count++; + int64 next_iter = frame->iteration_count; + + VLOG(2) << "Create iteration: [" << frame->frame_name << ", " << next_iter + << "]"; + + IterationState* iter_state = new IterationState; + frame->SetIteration(next_iter, iter_state); + frame->num_outstanding_iterations++; + frame->dead_exits.clear(); + + iter_state->outstanding_ops = 0; + iter_state->outstanding_frame_count = 0; + iter_state->pending_count = new std::vector<int>; + InitializePending(impl_->graph_, iter_state->pending_count); + iter_state->dead_count = new std::vector<int>(impl_->graph_->num_node_ids()); + iter_state->input_tensors = new std::vector<Entry>(impl_->total_tensors_); + + // Activate the successors of the deferred roots in the new iteration. + ActivateNexts(frame, next_iter, ready); + + // Activate the loop invariants in the new iteration. + ActivateLoopInvs(frame, next_iter, ready); +} + +bool ExecutorState::SetOutputFrameIter(const TaggedNode& tagged_node, + const EntryVector& outputs, + FrameState** output_frame, + int64* output_iter, + TaggedNodeSeq* ready) { + const Node* node = tagged_node.node; + FrameState* input_frame = tagged_node.input_frame; + int64 input_iter = tagged_node.input_iter; + bool is_dead = tagged_node.is_dead; + bool is_enter = IsEnter(node); + + if (is_enter) { + FindOrCreateChildFrame(input_frame, input_iter, node, output_frame); + // Propagate if this is a loop invariant. + bool is_constant; + Status s = GetNodeAttr(node->def(), "is_constant", &is_constant); + CHECK(s.ok()) << s; + if (is_constant) { + AddLoopInv(*output_frame, node, outputs[0], ready); + } + --(*output_frame)->num_pending_inputs; + *output_iter = 0; + } else if (IsExit(node)) { + if (is_dead) { + // Stop and remember this node if it is a dead exit. + if (input_iter == input_frame->iteration_count) { + input_frame->dead_exits.push_back(node); + } + *output_frame = nullptr; + } else { + *output_frame = input_frame->parent_frame; + *output_iter = input_frame->parent_iter; + } + } else if (IsNextIteration(node)) { + if (is_dead) { + // Stop the deadness propagation + *output_frame = nullptr; + } else { + if (input_iter == input_frame->iteration_count && + input_frame->num_outstanding_iterations == + input_frame->max_parallel_iterations) { + // Reached the maximum for parallel iterations. + input_frame->next_iter_roots.push_back({node, outputs[0]}); + *output_frame = nullptr; + } else { + // If this is a new iteration, start it. 
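A note on the storage set up in FindOrCreateChildFrame() and IncrementIteration() above: a frame keeps only max_parallel_iterations + 1 IterationState slots even though iteration numbers grow without bound, which works because the accessors index the vector modulo its size. GetIteration()/SetIteration() are not part of this hunk, so the following is the editor's sketch of their assumed shape, not a quote of the implementation:

#include <vector>

struct IterationStateModel {
  int outstanding_ops = 0;
};

struct FrameModel {
  // Sized to max_parallel_iterations + 1, like frame->iterations above.
  std::vector<IterationStateModel*> iterations;

  IterationStateModel* GetIteration(long long iter) {
    return iterations[iter % iterations.size()];
  }
  void SetIteration(long long iter, IterationStateModel* state) {
    iterations[iter % iterations.size()] = state;
  }
};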
+ if (input_iter == input_frame->iteration_count) { + IncrementIteration(input_frame, ready); + } + *output_iter = input_iter + 1; + } + } + } + return is_enter; +} + +void ExecutorState::CleanupFramesIterations(FrameState* frame, int64 iter, + TaggedNodeSeq* ready) { + int64 curr_iter = iter; + while (curr_iter <= frame->iteration_count && + IsIterationDone(frame, curr_iter)) { + // Delete the iteration curr_iter + VLOG(2) << "Delete iteration [" << frame->frame_name << ", " << curr_iter + << "]."; + + delete frame->GetIteration(curr_iter); + frame->SetIteration(curr_iter, nullptr); + --frame->num_outstanding_iterations; + ++curr_iter; + + // If there is a deferred iteration, start it. + if (frame->next_iter_roots.size() > 0) { + IncrementIteration(frame, ready); + } + } + + if (IsFrameDone(frame)) { + FrameState* parent_frame = frame->parent_frame; + int64 parent_iter = frame->parent_iter; + + // Propagate all the dead exits to the parent frame. + for (const Node* node : frame->dead_exits) { + auto parent_iter_state = parent_frame->GetIteration(parent_iter); + std::vector<int>* pending = parent_iter_state->pending_count; + std::vector<int>* dead_count = parent_iter_state->dead_count; + for (const Edge* e : node->out_edges()) { + const Node* dst_node = e->dst(); + const int dst_id = dst_node->id(); + + bool dst_dead = true; + bool dst_ready = false; + // We know this is a dead input to dst + if (IsMerge(dst_node)) { + if (e->IsControlEdge()) { + (*pending)[dst_id] -= 2; + int count = (*pending)[dst_id]; + dst_dead = ((*dead_count)[dst_id] == dst_node->num_inputs()); + dst_ready = (count == 1) || ((count == 0) && dst_dead); + } else { + ++(*dead_count)[dst_id]; + dst_dead = ((*dead_count)[dst_id] == dst_node->num_inputs()); + dst_ready = ((*pending)[dst_id] == 0) && dst_dead; + } + } else { + ++(*dead_count)[dst_id]; + dst_ready = (--(*pending)[dst_id] == 0); + } + if (dst_ready) { + ready->push_back( + TaggedNode(dst_node, parent_frame, parent_iter, dst_dead)); + parent_iter_state->outstanding_ops++; + } + } + } + + // Delete the frame + const string& frame_name = frame->frame_name; + VLOG(2) << "Delete frame " << frame_name; + outstanding_frames_.erase(frame_name); + delete frame; + + // Cleanup recursively + if (parent_frame != nullptr) { + parent_frame->GetIteration(parent_iter)->outstanding_frame_count--; + CleanupFramesIterations(parent_frame, parent_iter, ready); + } + } +} + +// When ExecutorImpl graph has no control flow nodes, +// SimpleExecutorState is used instead of ExecutorState. It maintains +// fewer internal state and is convenient for experimenting with async +// op kernels. +class SimpleExecutorState { + public: + SimpleExecutorState(const Executor::Args& args, ExecutorImpl* impl); + ~SimpleExecutorState() { + for (auto it : device_context_map_) { + it.second->Unref(); + } + delete slice_reader_cache_; + } + void RunAsync(Executor::DoneCallback done); + + private: + typedef SimpleExecutorState ME; + + // Not owned. + Rendezvous* rendezvous_; + StepStatsCollector* stats_collector_; + checkpoint::TensorSliceReaderCacheWrapper* slice_reader_cache_; + FunctionCallFrame* call_frame_; + const ExecutorImpl* impl_; + CancellationManager* cancellation_manager_; + Executor::Args::Runner runner_; + + // Owned. + + // i-th node's j-th input is in tensors_[impl_->nodes[i].input_start + // + j]. The output is either a tensor pointer (pass-by-reference) + // or a tensor (pass-by-value). + // + // NOTE: Not protected by mu_ because tensors_ is resized once. 
Each + // element of tensors_ is written once by the source node of an edge + // and is cleared by the destination of the same edge. The latter + // node is never run concurrently with the former node. + struct Entry { + Tensor val = *kEmptyTensor; // A tensor value. + Tensor* ref = nullptr; // A tensor reference. + mutex* ref_mu = nullptr; // mutex for *ref if ref is not nullptr. + + // Every entry carries an optional DeviceContext containing + // Device-specific information about how the Tensor was produced. + DeviceContext* device_context = nullptr; + + // The attributes of the allocator that creates the tensor. + AllocatorAttributes alloc_attr; + }; + + // Contains a map from node id to the DeviceContext object that was + // assigned by the device at the beginning of a step. + DeviceContextMap device_context_map_; + + std::vector<Entry> input_tensors_; + + // Step-local resource manager. + ResourceMgr step_resource_manager_; + + // Invoked when the execution finishes. + Executor::DoneCallback done_cb_; + + // How many active threads of computation are being used. Same as + // the number of pending Process() functions. + std::atomic_int_fast32_t num_active_; + + mutex mu_; + Status status_ GUARDED_BY(mu_); + + // i-th kernel is still waiting for pending[i] inputs. + class CountDown { + public: + CountDown() : v_(0) {} + void Set(int32 v) { v_.store(v); } + bool Dec() { + return v_.load(std::memory_order_acquire) == 1 || v_.fetch_sub(1) == 1; + } + + private: + std::atomic_int_fast32_t v_; + }; + std::vector<CountDown> pending_; + + // Process Node identified by "id" in current thread. "scheduled_usec" + // indicates when the node becomes ready and gets scheduled. + void Process(int id, int64 scheduled_usec); + + // Before invoking item->kernel, fills in its "inputs". + Status PrepareInputs(const NodeItem& item, TensorValueVec* inputs, + DeviceContextVec* input_device_contexts); + + // After item->kernel computation is done, processes its outputs + // and returns nodes that become "ready". + typedef gtl::InlinedVector<int, 8> ReadyNodeIds; + Status ProcessOutputs(const NodeItem& item, OpKernelContext* ctx, + ReadyNodeIds* ready, NodeExecStats* stats); + + // "node" just finishes. Takes ownership of "stats". Returns true if + // execution has completed. + bool NodeDone(const Status& s, const Node* node, const ReadyNodeIds& ready, + NodeExecStats* stats, std::deque<int>* inline_ready); + + // Call Process() on all nodes in 'inline_ready'. + void ProcessInline(const std::deque<int>& inline_ready); + + // Schedule all the expensive nodes in 'ready', and put all the inexpensive + // nodes in 'ready' into 'inline_ready'. + void ScheduleReady(const ReadyNodeIds& ready, std::deque<int>* inline_ready); + + // One thread of control finishes. 
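The CountDown helper above is essentially all of the per-node scheduling state SimpleExecutorState keeps: the counter is Set() to the node's in-edge count, each finished predecessor calls Dec(), and the caller that gets true back is the one that schedules the node. A compilable usage model:

#include <atomic>
#include <cassert>

class CountDownModel {
 public:
  void Set(int v) { v_.store(v); }
  bool Dec() {
    // Fast path: observing 1 means every other input has already arrived.
    return v_.load(std::memory_order_acquire) == 1 || v_.fetch_sub(1) == 1;
  }

 private:
  std::atomic<int> v_{0};
};

int main() {
  CountDownModel pending;
  pending.Set(3);          // the node has three in-edges
  assert(!pending.Dec());  // first input ready
  assert(!pending.Dec());  // second input ready
  assert(pending.Dec());   // last input ready -> schedule the node
  return 0;
}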
+ void Finish(); + + TF_DISALLOW_COPY_AND_ASSIGN(SimpleExecutorState); +}; + +SimpleExecutorState::SimpleExecutorState(const Executor::Args& args, + ExecutorImpl* impl) + : rendezvous_(args.rendezvous), + stats_collector_(args.stats_collector), + slice_reader_cache_(new checkpoint::TensorSliceReaderCacheWrapper), + call_frame_(args.call_frame), + impl_(impl), + cancellation_manager_(args.cancellation_manager), + runner_(args.runner), + num_active_(0), + pending_(impl_->nodes_.size()) {} + +void SimpleExecutorState::ProcessInline(const std::deque<int>& inline_ready) { + if (inline_ready.empty()) return; + int64 scheduled_usec = 0; + if (stats_collector_) { + scheduled_usec = nodestats::NowInUsec(); + } + for (int id : inline_ready) { + Process(id, scheduled_usec); + } +} + +void SimpleExecutorState::ScheduleReady(const ReadyNodeIds& ready, + std::deque<int>* inline_ready) { + if (ready.empty()) return; + + int64 scheduled_usec = 0; + if (stats_collector_) { + scheduled_usec = nodestats::NowInUsec(); + } + if (inline_ready == nullptr) { + // Schedule to run all the ready ops in thread pool. + for (auto id : ready) { + runner_(std::bind(&ME::Process, this, id, scheduled_usec)); + } + return; + } + const std::vector<NodeItem>& nodes = impl_->nodes_; + int curr_expensive_node = -1; + for (auto id : ready) { + if (!nodes[id].kernel->IsExpensive()) { + // Inline this inexpensive node. + inline_ready->push_back(id); + } else { + if (curr_expensive_node != -1) { + // Dispatch to another thread since there is plenty of work to + // do for this thread. + runner_( + std::bind(&ME::Process, this, curr_expensive_node, scheduled_usec)); + } + curr_expensive_node = id; + } + } + if (curr_expensive_node != -1) { + if (inline_ready->empty()) { + // Tail recursion optimization + inline_ready->push_back(curr_expensive_node); + } else { + // There are inline nodes to run already. We dispatch this expensive + // node to other thread. + runner_( + std::bind(&ME::Process, this, curr_expensive_node, scheduled_usec)); + } + } +} + +void SimpleExecutorState::RunAsync(Executor::DoneCallback done) { + const Graph* graph = impl_->graph_; + ReadyNodeIds ready; + + // Ask the device to fill in the device context map. + Device* device = impl_->params_.device; + device->FillContextMap(graph, &device_context_map_); + + for (const Node* n : graph->nodes()) { + const int id = n->id(); + const int num_in_edges = n->in_edges().size(); + pending_[id].Set(num_in_edges); + if (num_in_edges == 0) { + ready.push_back(id); + } + } + if (ready.empty()) { + done(Status::OK()); + } else { + num_active_ = ready.size(); + done_cb_ = done; + input_tensors_.resize(impl_->total_tensors_); + // Schedule to run all the ready ops in thread pool. + ScheduleReady(ready, nullptr); + } +} + +Status SimpleExecutorState::PrepareInputs( + const NodeItem& item, TensorValueVec* inputs, + DeviceContextVec* input_device_contexts) { + const Node* node = item.node; + + inputs->clear(); + inputs->resize(node->num_inputs()); + input_device_contexts->clear(); + input_device_contexts->resize(node->num_inputs()); + + for (int i = 0; i < node->num_inputs(); ++i) { + const bool expect_ref = IsRefType(node->input_type(i)); + Entry* entry = input_tensors_.data() + item.input_start + i; + (*input_device_contexts)[i] = entry->device_context; + + // i-th input. 
+ TensorValue* inp = &(*inputs)[i]; + + if (entry->ref == nullptr) { + if (expect_ref) { + return AttachDef( + errors::InvalidArgument(i, "-th input expects a ref type"), + item.kernel->def()); + } + inp->tensor = &entry->val; + } else { + if (!entry->ref->IsInitialized() && !IsInitializationOp(item.node)) { + return AttachDef( + errors::FailedPrecondition("Attempting to use uninitialized value ", + item.kernel->def().input(i)), + item.kernel->def()); + } + if (expect_ref) { + inp->mutex_if_ref = entry->ref_mu; + inp->tensor = entry->ref; + } else { + // Automatically deref the tensor ref when the op expects a + // tensor but is given a ref to a tensor. Need to deref it + // under the mutex. + { + mutex_lock l(*(entry->ref_mu)); + entry->val = *entry->ref; + } + inp->tensor = &entry->val; + } + } + } + return Status::OK(); +} + +void SimpleExecutorState::Process(int id, int64 scheduled_usec) { + const std::vector<NodeItem>& nodes = impl_->nodes_; + ReadyNodeIds ready; + std::deque<int> inline_ready; + + // Parameters passed to OpKernel::Compute. + TensorValueVec inputs; + DeviceContextVec input_device_contexts; + + OpKernelContext::Params params; + Device* device = impl_->params_.device; + params.device = device; + // track allocations if and only if we are collecting statistics + params.track_allocations = (stats_collector_ != nullptr); + params.rendezvous = rendezvous_; + params.cancellation_manager = cancellation_manager_; + params.call_frame = call_frame_; + params.function_library = impl_->params_.function_library; + params.resource_manager = device->resource_manager(); + params.step_resource_manager = &step_resource_manager_; + params.slice_reader_cache = slice_reader_cache_; + params.inputs = &inputs; + params.input_device_contexts = &input_device_contexts; + params.frame_iter = FrameAndIter(0, 0); + + Status s; + NodeExecStats* stats = nullptr; + bool completed = false; + inline_ready.push_back(id); + while (!inline_ready.empty()) { + id = inline_ready.front(); + inline_ready.pop_front(); + const NodeItem& item = nodes[id]; + const Node* node = item.node; + + // Set the device_context for this node id, if it exists. + auto dc_it = device_context_map_.find(id); + if (dc_it != device_context_map_.end()) { + params.op_device_context = dc_it->second; + } + + if (stats_collector_) { + stats = new NodeExecStats; + stats->set_node_name(node->name()); + nodestats::SetScheduled(stats, scheduled_usec); + nodestats::SetAllStart(stats); + } + + VLOG(1) << "Process node: " << id << " " << SummarizeNodeDef(node->def()); + + // Prepares inputs. + s = PrepareInputs(item, &inputs, &input_device_contexts); + if (!s.ok()) { + // Continue to process the nodes in 'inline_ready'. + completed = NodeDone(s, item.node, ready, stats, &inline_ready); + continue; + } + + OpKernel* op_kernel = item.kernel; + params.op_kernel = op_kernel; + params.output_alloc_attr = [this, node, op_kernel](int index) { + return OutputAttributes(&impl_->alloc_attr_, node, op_kernel, index); + }; + + // Asynchronous computes. 
+ AsyncOpKernel* async = op_kernel->AsAsync(); + if (async) { + auto pcopy = CopyParams(params); + auto ctx = new OpKernelContext(*pcopy); + auto done = [this, item, ctx, stats, pcopy]() { + VLOG(2) << this + << " Async kernel done: " << SummarizeNodeDef(item.node->def()); + if (stats_collector_) nodestats::SetOpEnd(stats); + ReadyNodeIds ready; + Status s = ProcessOutputs(item, ctx, &ready, stats); + if (stats_collector_) nodestats::SetMemory(stats, ctx); + // Schedule to run all the ready ops in thread pool. + bool completed = NodeDone(s, item.node, ready, stats, nullptr); + delete ctx; + DeleteParams(pcopy); + if (completed) Finish(); + }; + if (stats_collector_) nodestats::SetOpStart(stats); + device->ComputeAsync(async, ctx, done); + } else { + // Synchronous computes. + OpKernelContext ctx(params); + if (stats_collector_) nodestats::SetOpStart(stats); + device->Compute(CHECK_NOTNULL(op_kernel), &ctx); + if (stats_collector_) nodestats::SetOpEnd(stats); + + s = ProcessOutputs(item, &ctx, &ready, stats); + if (stats_collector_) nodestats::SetMemory(stats, &ctx); + if (stats_collector_) { + scheduled_usec = nodestats::NowInUsec(); + } + completed = NodeDone(s, node, ready, stats, &inline_ready); + } + } // while !inline_ready.empty() + + // This thread of computation is done if completed = true. + if (completed) Finish(); +} + +bool SimpleExecutorState::NodeDone(const Status& s, const Node* node, + const ReadyNodeIds& ready, + NodeExecStats* stats, + std::deque<int>* inline_ready) { + if (stats_collector_) { + nodestats::SetAllEnd(stats); + if (!SetTimelineLabel(node, stats)) { + // Only record non-transfer nodes. + stats_collector_->Save(impl_->params_.device->name(), stats); + } else { + delete stats; + } + } + + Rendezvous* captured_rendezvous = nullptr; // Will be set on error. + if (!s.ok()) { + // Some error happened. This thread of computation is done. + mutex_lock l(mu_); + if (status_.ok()) { + captured_rendezvous = rendezvous_; + if (captured_rendezvous) captured_rendezvous->Ref(); + status_ = s; + } + } + if (captured_rendezvous) { + // If we captured the rendezvous_ pointer, we are in an error condition. + // Use captured_rendezvous, in case "this" is deleted by another thread. + TRACEPRINTF("StartAbort: %s", s.ToString().c_str()); + captured_rendezvous->StartAbort(s); + captured_rendezvous->Unref(); + } + + bool completed = false; + int ready_size = ready.size(); + if (ready_size == 0 || !s.ok()) { + completed = (num_active_.fetch_sub(1) == 1); + } else if (ready_size > 1) { + num_active_.fetch_add(ready_size - 1, std::memory_order_relaxed); + } + + // Schedule the ready nodes in 'ready'. + if (s.ok()) { + ScheduleReady(ready, inline_ready); + } + return completed; +} + +void SimpleExecutorState::Finish() { + mu_.lock(); + auto ret = status_; + auto done_cb = done_cb_; + auto runner = runner_; + mu_.unlock(); + delete this; + CHECK(done_cb != nullptr); + runner([done_cb, ret]() { done_cb(ret); }); +} + +Status SimpleExecutorState::ProcessOutputs(const NodeItem& item, + OpKernelContext* ctx, + ReadyNodeIds* ready, + NodeExecStats* stats) { + Status s = ctx->status(); + if (!s.ok()) { + s = AttachDef(s, item.kernel->def()); + LOG(WARNING) << this << " Compute status: " << s; + return s; + } + + // Processes outputs. + gtl::InlinedVector<Entry, 4> outputs; + const Node* node = item.node; + outputs.resize(node->num_outputs()); + + // Get the device_context for this node id, if it exists. 
+ DeviceContext* device_context = nullptr; + auto dc_it = device_context_map_.find(node->id()); + if (dc_it != device_context_map_.end()) { + device_context = dc_it->second; + } + + for (int i = 0; i < node->num_outputs(); ++i) { + TensorValue val = ctx->release_output(i); + // Sanity check of output tensor types. + DataType dtype = val->dtype(); + if (val.is_ref()) dtype = MakeRefType(dtype); + if (dtype == node->output_type(i)) { + Entry* out = &(outputs[i]); + if (val.is_ref()) { + out->ref = val.tensor; + out->ref_mu = val.mutex_if_ref; + } else { + out->val = *val.tensor; + } + + // Set the device context of the output entry. + out->device_context = device_context; + + // Set the allocator attributes of the output entry. + out->alloc_attr = ctx->output_alloc_attr(i); + + if (stats_collector_ && val.tensor->IsInitialized()) { + nodestats::SetOutput(stats, i, ctx->output_allocation_type(i), + val.tensor); + } + } else { + s.Update( + errors::Internal("Output ", i, " of type ", DataTypeString(dtype), + " does not match declared output type ", + DataTypeString(node->output_type(i)), + " for operation ", SummarizeNodeDef(node->def()))); + } + if (!val.is_ref()) { + // If OpKernelContext returns outputs via pass-by-value, we + // don't need this trouble. + delete val.tensor; + } + } + if (!s.ok()) return s; + + // Clears inputs. + for (int i = 0; i < node->num_inputs(); ++i) { + input_tensors_[item.input_start + i].val = *kEmptyTensor; + } + + // Propagates outputs along out edges. + ready->clear(); + const std::vector<NodeItem>& nodes = impl_->nodes_; + for (const Edge* e : node->out_edges()) { + const int src_slot = e->src_output(); + const int dst_id = e->dst()->id(); + const NodeItem& dst_item = nodes[dst_id]; + if (!e->IsControlEdge()) { + const int dst_slot = e->dst_input(); + input_tensors_[dst_item.input_start + dst_slot] = outputs[src_slot]; + } + if (pending_[dst_id].Dec()) { + ready->push_back(dst_id); + } + } + return Status::OK(); +} + +// NOTE(yuanbyu): Use the executor that supports control flow by default. +const bool use_control_flow_executor = true; +void ExecutorImpl::RunAsync(const Args& args, DoneCallback done) { + if (params_.has_control_flow || use_control_flow_executor) { + (new ExecutorState(args, this))->RunAsync(done); + } else { + (new SimpleExecutorState(args, this))->RunAsync(done); + } +} + +} // end namespace + +Status NewLocalExecutor(const LocalExecutorParams& params, const Graph* graph, + Executor** executor) { + ExecutorImpl* impl = new ExecutorImpl(params, graph); + Status s = impl->Initialize(); + if (s.ok()) { + *executor = impl; + } else { + delete impl; + } + return s; +} + +Status CreateNonCachedKernel(Device* device, FunctionLibraryRuntime* flib, + const NodeDef& ndef, OpKernel** kernel) { + auto device_type = DeviceType(device->attributes().device_type()); + auto allocator = device->GetAllocator(AllocatorAttributes()); + return CreateOpKernel(device_type, device, allocator, flib, ndef, kernel); +} + +void DeleteNonCachedKernel(OpKernel* kernel) { delete kernel; } + +Status CreateCachedKernel(Device* device, const string& session, + FunctionLibraryRuntime* flib, const NodeDef& ndef, + OpKernel** kernel) { + auto op_seg = device->op_segment(); + auto create_fn = [device, flib, &ndef](OpKernel** kernel) { + return CreateNonCachedKernel(device, flib, ndef, kernel); + }; + return op_seg->FindOrCreate(session, ndef.name(), kernel, create_fn); +} + +// Deletes "kernel". 
+void DeleteCachedKernel(Device* device, const string& session, + OpKernel* kernel) { + // Do nothing. +} + +} // end namespace tensorflow diff --git a/tensorflow/core/common_runtime/executor.h b/tensorflow/core/common_runtime/executor.h new file mode 100644 index 0000000000..82bcbab836 --- /dev/null +++ b/tensorflow/core/common_runtime/executor.h @@ -0,0 +1,209 @@ +#ifndef TENSORFLOW_COMMON_RUNTIME_EXECUTOR_H_ +#define TENSORFLOW_COMMON_RUNTIME_EXECUTOR_H_ + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/framework/rendezvous.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/lib/core/notification.h" +#include "tensorflow/core/public/status.h" +#include "tensorflow/core/public/tensor.h" + +namespace tensorflow { + +class StepStatsCollector; + +// Executor runs a graph computation. +// Example: +// Graph* graph = ...; +// ... construct graph ... +// Executor* executor; +// TF_CHECK_OK(NewSimpleExecutor(my_device, graph, &executor)); +// Rendezvous* rendezvous = NewNaiveRendezvous(); +// TF_CHECK_OK(rendezvous->Send("input", some_input_tensor)); +// TF_CHECK_OK(executor->Run({ExecutorOpts, rendezvous, nullptr})); +// TF_CHECK_OK(rendezvous->Recv("input", &output_tensor)); +// ... ... +// +// Multiple threads can call Executor::Run concurrently. +class Executor { + public: + virtual ~Executor() {} + + // RunAsync() executes the graph computation. "done" is run when the + // graph computation completes. If any error happens during the + // computation, "done" is run and the error is passed to "done". + // + // RunAsync() is given a few arguments in Args. The caller must + // ensure objects passed in Args (rendezvous, stats_collector, etc.) + // are alive at least until done is invoked. All pointers to the + // argument objects can be nullptr. + // + // RunAsync() uses the given "rendezvous", if not null, as the + // mechanism to communicate inputs and outputs of the underlying + // graph computation. + // + // RunAsync() calls "stats_collector", if not null, to keep track of + // stats. This allows us to collect statistics and traces on demand. + // + // RunAsync() is provided a "call_frame", if the executor is used + // for executing a function, is used to pass arguments and return + // values between the caller and the callee. + // + // RunAsync() uses "cancellation_manager", if not nullptr, to + // register callbacks that should be called if the graph computation + // is cancelled. Note that the callbacks merely unblock any + // long-running computation, and a cancelled step will terminate by + // returning/calling the DoneCallback as usual. + // + // RunAsync() dispatches closures to "runner". Typically, "runner" + // is backed up by a bounded threadpool. + struct Args { + Rendezvous* rendezvous = nullptr; + StepStatsCollector* stats_collector = nullptr; + FunctionCallFrame* call_frame = nullptr; + CancellationManager* cancellation_manager = nullptr; + + typedef std::function<void()> Closure; + typedef std::function<void(Closure)> Runner; + Runner runner = nullptr; + }; + typedef std::function<void(const Status&)> DoneCallback; + virtual void RunAsync(const Args& args, DoneCallback done) = 0; + + // Synchronous wrapper for RunAsync(). + Status Run(const Args& args) { + Status ret; + Notification n; + RunAsync(args, [&ret, &n](const Status& s) { + ret = s; + n.Notify(); + }); + n.WaitForNotification(); + return ret; + } +}; + +// Creates an Executor that computes the given "graph". 
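One thing the usage comment above the Executor class does not show is how Args::runner is usually supplied. A short sketch; the pool object and its Schedule() method stand in for whatever thread pool the caller owns, and executor is assumed to come from the NewLocalExecutor() declared below:

Executor::Args args;
args.runner = [&pool](Executor::Args::Closure closure) {
  pool.Schedule(std::move(closure));  // hypothetical thread-pool API
};
Status s = executor->Run(args);  // blocking wrapper around RunAsync()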
+// +// If successful, returns the constructed executor in "*executor". The +// caller keeps the ownership of "device". The returned executor takes +// the ownership of "graph". Otherwise, returns an error status. +// +// "params" provides a set of context for the executor. We expect that +// different context would provide different implementations. +struct LocalExecutorParams { + Device* device; + + // The library runtime support. + FunctionLibraryRuntime* function_library; + + // True iff the computation contains control flow nodes. + bool has_control_flow; + + // create_kernel returns an instance of op kernel based on NodeDef. + // delete_kernel is called for every kernel used by the executor + // when the executor is deleted. + std::function<Status(const NodeDef&, OpKernel**)> create_kernel; + std::function<void(OpKernel*)> delete_kernel; +}; +::tensorflow::Status NewLocalExecutor(const LocalExecutorParams& params, + const Graph* graph, Executor** executor); + +// A class to help run multiple executors in parallel and wait until +// all of them are complete. +// +// ExecutorBarrier deletes itself after the function returned by Get() +// is called. +class ExecutorBarrier { + public: + typedef std::function<void(const Status&)> StatusCallback; + + // Create an ExecutorBarrier for 'num' different executors. + // + // 'r' is the shared Rendezvous object that is used to communicate + // state. If any of the executors experiences an error, the + // rendezvous object will be aborted exactly once. + // + // 'done' is called after the last executor completes, and + // ExecutorBarrier is deleted. + ExecutorBarrier(int num, Rendezvous* r, StatusCallback done) + : rendez_(r), done_cb_(done), pending_(num) {} + + ~ExecutorBarrier() {} + + // Returns a closure that Executors must call when they are done + // computing, passing the status of their execution as an argument. + StatusCallback Get() { + return std::bind(&ExecutorBarrier::WhenDone, this, std::placeholders::_1); + } + + private: + Rendezvous* rendez_ = nullptr; + StatusCallback done_cb_ = nullptr; + + mutable mutex mu_; + int pending_ GUARDED_BY(mu_) = 0; + Status status_ GUARDED_BY(mu_); + + void WhenDone(const Status& s) { + bool error = false; + StatusCallback done = nullptr; + Status status; + { + mutex_lock l(mu_); + // If we are the first error encountered, mark the status + // appropriately and later trigger an abort of the Rendezvous + // object by this thread only. + if (status_.ok() && !s.ok()) { + error = true; + status_ = s; + } + + // If this is the last call to WhenDone, call the final callback + // below. + if (--pending_ == 0) { + CHECK(done_cb_ != nullptr); + done = done_cb_; + done_cb_ = nullptr; + } + status = status_; + } + if (error) { + rendez_->StartAbort(status); + } + if (done != nullptr) { + delete this; + done(status); + } + } + + TF_DISALLOW_COPY_AND_ASSIGN(ExecutorBarrier); +}; + +// A few helpers to facilitate create/delete kernels. + +// Creates a kernel based on "ndef" on device "device". The kernel can +// access the functions in the "flib". The caller takes ownership of +// returned "*kernel". +Status CreateNonCachedKernel(Device* device, FunctionLibraryRuntime* flib, + const NodeDef& ndef, OpKernel** kernel); + +// Deletes "kernel" returned by CreateKernel. +void DeleteNonCachedKernel(OpKernel* kernel); + +// Creates a kernel based on "ndef" on device "device". The kernel can +// access the functions in the "flib". The caller does not take +// ownership of returned "*kernel". 
If a kernel has been created for +// ndef.name(), returns the same kernel instance. +Status CreateCachedKernel(Device* device, const string& session, + FunctionLibraryRuntime* flib, const NodeDef& ndef, + OpKernel** kernel); + +// Deletes "kernel" returned by CreateCachedKernel. +void DeleteCachedKernel(Device* device, const string& session, + OpKernel* kernel); + +} // end namespace tensorflow + +#endif // TENSORFLOW_COMMON_RUNTIME_EXECUTOR_H_ diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc new file mode 100644 index 0000000000..2b1a041235 --- /dev/null +++ b/tensorflow/core/common_runtime/function.cc @@ -0,0 +1,1335 @@ +#include "tensorflow/core/common_runtime/function.h" + +#include <deque> + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/executor.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/graph/algorithm.h" +#include "tensorflow/core/graph/graph_constructor.h" +#include "tensorflow/core/graph/optimizer_cse.h" +#include "tensorflow/core/lib/gtl/map_util.h" + +namespace tensorflow { + +// A few string constant used throughout this module. +static const char* const kArgOp = "_Arg"; +static const char* const kRetOp = "_Retval"; +static const char* const kGradientOp = "SymbolicGradient"; +static const char* const kNodeLabel = "Func"; + +// Represents the index-th output of a node. +struct Endpoint { + Node* node; + int index; + + // Returns the string name represents this endpoint. + string name() const { + if (index == 0) { + return node->name(); + } else { + return strings::StrCat(node->name(), ":", index); + } + } + + DataType dtype() const { return node->output_type(index); } +}; + +struct EndpointHash { + uint64 operator()(const Endpoint& x) const { + return Hash64(reinterpret_cast<const char*>(&x.node), sizeof(Node*), + x.index); + } +}; + +struct EndpointEq { + bool operator()(const Endpoint& x, const Endpoint& y) const { + return (x.node == y.node) && (x.index == y.index); + } +}; + +// The following Add* routines are used to add a few graph nodes while +// functions are transformed. 
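A quick worked example for Endpoint::name() above, since the Add* helpers that follow feed its result directly into NodeDef::add_input(): output 0 of a node named "n" is written bare as "n", while output 2 is written as "n:2", matching the usual node:output input syntax of a NodeDef.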
+static Node* AddNoOp(Graph* g) { + NodeDef ndef; + ndef.set_name(g->NewName(kNodeLabel)); + ndef.set_op("NoOp"); + Status s; + Node* ret = g->AddNode(ndef, &s); + TF_CHECK_OK(s); + return ret; +} + +static Node* AddIdentity(Graph* g, Endpoint input) { + DCHECK_LT(0, input.dtype()); + DCHECK_LT(input.dtype(), DT_FLOAT_REF); + NodeDef ndef; + ndef.set_name(g->NewName(kNodeLabel)); + ndef.set_op("Identity"); + ndef.add_input(input.name()); + AddNodeAttr("T", input.dtype(), &ndef); + Status s; + Node* ret = g->AddNode(ndef, &s); + TF_CHECK_OK(s); + g->AddEdge(input.node, input.index, ret, 0); + return ret; +} + +static Node* AddArg(Graph* g, DataType dtype, int index) { + DCHECK_LT(0, dtype); + DCHECK_LT(dtype, DT_FLOAT_REF); + NodeDef ndef; + ndef.set_name(g->NewName(kNodeLabel)); + ndef.set_op(kArgOp); + AddNodeAttr("T", dtype, &ndef); + AddNodeAttr("index", index, &ndef); + Status s; + Node* ret = g->AddNode(ndef, &s); + TF_CHECK_OK(s); + return ret; +} + +static Node* AddRet(Graph* g, Endpoint input, int index) { + DCHECK_LT(0, input.dtype()); + DCHECK_LT(input.dtype(), DT_FLOAT_REF); + NodeDef ndef; + ndef.set_name(g->NewName(kNodeLabel)); + ndef.set_op(kRetOp); + ndef.add_input(input.name()); + AddNodeAttr("T", input.dtype(), &ndef); + AddNodeAttr("index", index, &ndef); + Status s; + Node* ret = g->AddNode(ndef, &s); + TF_CHECK_OK(s); + g->AddEdge(input.node, input.index, ret, 0); + return ret; +} + +static Node* AddZerosLike(Graph* g, Endpoint input) { + DCHECK_LT(0, input.dtype()); + DCHECK_LT(input.dtype(), DT_FLOAT_REF); + NodeDef ndef; + ndef.set_name(g->NewName(kNodeLabel)); + ndef.set_op("ZerosLike"); + ndef.add_input(input.name()); + AddNodeAttr("T", input.dtype(), &ndef); + Status s; + Node* ret = g->AddNode(ndef, &s); + TF_CHECK_OK(s); + g->AddEdge(input.node, input.index, ret, 0); + return ret; +} + +static Node* AddSymGrad(Graph* g, Node* n, gtl::ArraySlice<Endpoint> grads) { + const int num_x = n->num_inputs(); + const int num_y = n->num_outputs(); + CHECK_EQ(num_y, grads.size()); + + NodeDef ndef; + ndef.set_name(g->NewName(kNodeLabel)); + ndef.set_op(kGradientOp); + + // The gradient node should have num_x + num_y inputs. + std::vector<Endpoint> n_inputs(num_x); + for (const Edge* e : n->in_edges()) { + if (e->IsControlEdge()) continue; + n_inputs[e->dst_input()] = {e->src(), e->src_output()}; + } + DataTypeVector in_types; + for (const Endpoint& ep : n_inputs) { + ndef.add_input(ep.name()); + in_types.push_back(ep.dtype()); + } + for (const Endpoint& ep : grads) { + ndef.add_input(ep.name()); + in_types.push_back(ep.dtype()); + } + CHECK_EQ(ndef.input_size(), num_x + num_y); + + AddNodeAttr("Tin", in_types, &ndef); + + // The gradient node's outputs have the same types as the node 'n's + // inputs. 
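// For example (editor's illustration): if 'n' has num_x = 2 inputs and
// num_y = 1 output, the SymbolicGradient node built by AddSymGrad takes
// num_x + num_y = 3 inputs (the two forward inputs followed by the gradient
// of the single output), and its "Tout" lists n's 2 input types, i.e. the
// node emits one gradient per forward input.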
+ AddNodeAttr("Tout", n->input_types(), &ndef); + NameAttrList func; + func.set_name(n->type_string()); + *(func.mutable_attr()) = n->def().attr(); + AddNodeAttr("f", func, &ndef); + Status s; + Node* ret = g->AddNode(ndef, &s); + TF_CHECK_OK(s); + return ret; +} + +class ArgOp : public OpKernel { + public: + explicit ArgOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("index", &index_)); + } + + void Compute(OpKernelContext* ctx) override { + auto frame = ctx->call_frame(); + OP_REQUIRES(ctx, frame != nullptr, errors::Internal("no call frame")); + Tensor val; + OP_REQUIRES_OK(ctx, frame->GetArg(index_, &val)); + OP_REQUIRES(ctx, val.dtype() == dtype_, + errors::InvalidArgument( + "Type mismatch: actual ", DataTypeString(val.dtype()), + " vs. expect ", DataTypeString(dtype_))); + ctx->set_output(0, val); + } + + private: + int index_; + DataType dtype_; + + TF_DISALLOW_COPY_AND_ASSIGN(ArgOp); +}; + +REGISTER_KERNEL_BUILDER(Name("_Arg").Device(DEVICE_CPU), ArgOp); +REGISTER_KERNEL_BUILDER(Name("_Arg").Device(DEVICE_GPU), ArgOp); + +class RetvalOp : public OpKernel { + public: + explicit RetvalOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("index", &index_)); + } + + void Compute(OpKernelContext* ctx) override { + const Tensor& val = ctx->input(0); + OP_REQUIRES(ctx, val.dtype() == dtype_, + errors::InvalidArgument( + "Type mismatch: actual ", DataTypeString(val.dtype()), + " vs. expect ", DataTypeString(dtype_))); + auto frame = ctx->call_frame(); + OP_REQUIRES(ctx, frame != nullptr, errors::Internal("no call frame")); + OP_REQUIRES_OK(ctx, frame->SetRetval(index_, val)); + } + + private: + int index_; + DataType dtype_; + + TF_DISALLOW_COPY_AND_ASSIGN(RetvalOp); +}; + +REGISTER_KERNEL_BUILDER(Name("_Retval").Device(DEVICE_CPU), RetvalOp); +REGISTER_KERNEL_BUILDER(Name("_Retval").Device(DEVICE_GPU), RetvalOp); + +static const FunctionLibraryRuntime::Handle kInvalidHandle = -1; + +class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime { + public: + FunctionLibraryRuntimeImpl(Device* device, Runner runner, + const FunctionLibraryDefinition* lib_def); + + ~FunctionLibraryRuntimeImpl() override; + + Status Instantiate(const string& function_name, + const InstantiateAttrValueMap& attrs, + Handle* handle) override; + + const FunctionBody* GetFunctionBody(Handle handle) override; + + Status CreateKernel(const NodeDef& ndef, OpKernel** kernel) override; + + void Run(const Options& opts, Handle handle, gtl::ArraySlice<Tensor> args, + std::vector<Tensor>* rets, DoneCallback done) override; + + bool IsDefined(const string& function_name) override; + + private: + typedef FunctionLibraryRuntimeImpl ME; + + Device* const device_; + Runner runner_ = nullptr; + const FunctionLibraryDefinition* const lib_def_; + std::function<Status(const string&, const OpDef**)> get_func_sig_; + std::function<Status(const NodeDef&, OpKernel**)> create_kernel_; + + mutable mutex mu_; + + // Maps function instantiation to a handle. The key is a + // canonicalized representation of the function name and + // instantiation attrs. The handle is an index into the items_. + std::unordered_map<string, Handle> table_ GUARDED_BY(mu_); + + // func_graphs_ never shrinks or reorders its members. 
+ std::vector<FunctionBody*> func_graphs_ GUARDED_BY(mu_); + + // The instantiated and transformed function is encoded as a Graph + // object, and an executor is created for the graph. + struct Item : public core::RefCounted { + Executor* exec = nullptr; + + ~Item() override { delete this->exec; } + }; + std::vector<Item*> items_; + + Status FunctionDefToBody(const FunctionDef& fdef, + const InstantiateAttrValueMap& attrs, + FunctionBody** fbody); + Status CreateItem(Handle handle, Item** item); + Status GetOrCreateItem(Handle handle, Item** item); + Status InstantiateSymbolicGradient(const InstantiateAttrValueMap& attrs, + FunctionBody** g_body); + + TF_DISALLOW_COPY_AND_ASSIGN(FunctionLibraryRuntimeImpl); +}; + +FunctionLibraryRuntimeImpl::FunctionLibraryRuntimeImpl( + Device* device, Runner runner, const FunctionLibraryDefinition* lib_def) + : device_(device), runner_(runner), lib_def_(lib_def) { + get_func_sig_ = [this](const string& op, const OpDef** sig) { + Status s; + *sig = lib_def_->LookUp(op, &s); + return s; + }; + create_kernel_ = [this](const NodeDef& ndef, OpKernel** kernel) { + return CreateKernel(ndef, kernel); + }; +} + +FunctionLibraryRuntimeImpl::~FunctionLibraryRuntimeImpl() { + for (FunctionBody* p : func_graphs_) delete p; + for (Item* item : items_) + if (item) item->Unref(); +} + +// An asynchronous op kernel which executes an instantiated function +// defined in a library. +class CallOp : public AsyncOpKernel { + public: + CallOp(FunctionLibraryRuntime::Handle handle, OpKernelConstruction* ctx) + : AsyncOpKernel(ctx), handle_(handle) {} + + ~CallOp() override {} + + void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override { + FunctionLibraryRuntime* lib = ctx->function_library(); + OP_REQUIRES_ASYNC(ctx, lib != nullptr, + errors::Internal("No function library is provided."), + done); + FunctionLibraryRuntime::Options opts; + std::vector<Tensor> args; + args.reserve(ctx->num_inputs()); + for (int i = 0; i < ctx->num_inputs(); ++i) { + args.push_back(ctx->input(i)); + } + std::vector<Tensor>* rets = new std::vector<Tensor>; + lib->Run(opts, handle_, args, rets, + [ctx, done, rets](const Status& status) { + if (!status.ok()) { + ctx->SetStatus(status); + } else { + CHECK_EQ(rets->size(), ctx->num_outputs()); + for (size_t i = 0; i < rets->size(); ++i) { + ctx->set_output(i, (*rets)[i]); + } + } + delete rets; + done(); + }); + } + + private: + FunctionLibraryRuntime::Handle handle_; + + TF_DISALLOW_COPY_AND_ASSIGN(CallOp); +}; + +const FunctionBody* FunctionLibraryRuntimeImpl::GetFunctionBody(Handle h) { + mutex_lock l(mu_); + CHECK_LE(0, h); + CHECK_LT(h, func_graphs_.size()); + return func_graphs_[h]; +} + +Status FunctionLibraryRuntimeImpl::CreateKernel(const NodeDef& ndef, + OpKernel** kernel) { + if (ndef.op() != kGradientOp && (lib_def_->Find(ndef.op()) == nullptr)) { + return CreateNonCachedKernel(device_, this, ndef, kernel); + } + + // Try to instantiate this function for the func/attr. Maybe its + // cached already. + Handle handle; + TF_RETURN_IF_ERROR(Instantiate(ndef.op(), ndef.attr(), &handle)); + + const FunctionBody* fbody = GetFunctionBody(handle); + CHECK_NOTNULL(fbody); + + // Constructs a CallOp kernel for running the instantiated function. 
+  Status s;
+  auto device_type = DeviceType(device_->attributes().device_type());
+  OpKernelConstruction construction(
+      device_type, device_, device_->GetAllocator(AllocatorAttributes()), &ndef,
+      &fbody->fdef.signature(), this, fbody->arg_types, fbody->ret_types, &s);
+  *kernel = new CallOp(handle, &construction);
+  if (!s.ok()) {
+    delete *kernel;
+  }
+  return s;
+}
+
+Status FunctionLibraryRuntimeImpl::FunctionDefToBody(
+    const FunctionDef& fdef, const InstantiateAttrValueMap& attrs,
+    FunctionBody** fbody) {
+  // Instantiates the function template into a graph def.
+  InstantiationResult result;
+  TF_RETURN_IF_ERROR(InstantiateFunction(fdef, attrs, get_func_sig_, &result));
+
+  Graph* graph = new Graph(lib_def_);
+  GraphConstructorOptions opts;
+  opts.allow_internal_ops = true;
+  opts.expect_device_spec = false;
+  Status s = ConvertGraphDefToGraph(opts, result.gdef, graph);
+  if (!s.ok()) {
+    delete graph;
+  } else {
+    *fbody = new FunctionBody(fdef, result.arg_types, result.ret_types, graph);
+  }
+  return s;
+}
+
+Status FunctionLibraryRuntimeImpl::InstantiateSymbolicGradient(
+    const InstantiateAttrValueMap& attrs, FunctionBody** g_body) {
+  const AttrValue* f = gtl::FindOrNull(attrs, "f");
+  if (f == nullptr) {
+    return errors::InvalidArgument("SymbolicGradient is missing attr: f");
+  }
+  const auto& func = f->func();
+  const FunctionDef* fdef = lib_def_->Find(func.name());
+  if (fdef == nullptr) {
+    // f is a primitive op.
+    gradient::Creator creator;
+    TF_RETURN_IF_ERROR(gradient::GetOpGradientCreator(func.name(), &creator));
+    if (creator == nullptr) {
+      return errors::InvalidArgument("No gradient is defined for ",
+                                     func.name());
+    }
+    FunctionDef grad_fdef;
+    TF_RETURN_IF_ERROR(creator(AttrSlice(&func.attr()), &grad_fdef));
+    TF_RETURN_IF_ERROR(FunctionDefToBody(grad_fdef, func.attr(), g_body));
+  } else {
+    // f is a user-defined function.
+ Handle f_handle; + TF_RETURN_IF_ERROR(Instantiate(func.name(), func.attr(), &f_handle)); + const FunctionBody* f_body = GetFunctionBody(f_handle); + CHECK_NOTNULL(f_body); + *g_body = SymbolicGradient(*f_body); + } + return Status::OK(); +} + +Status FunctionLibraryRuntimeImpl::Instantiate( + const string& function_name, const InstantiateAttrValueMap& attrs, + Handle* handle) { + const string key = Canonicalize(function_name, attrs); + { + mutex_lock l(mu_); + *handle = gtl::FindWithDefault(table_, key, kInvalidHandle); + if (*handle != kInvalidHandle) { + return Status::OK(); + } + } + + Status s; + FunctionBody* fbody = nullptr; + if (function_name == kGradientOp) { + TF_RETURN_IF_ERROR(InstantiateSymbolicGradient(attrs, &fbody)); + } else { + const FunctionDef* fdef = lib_def_->Find(function_name); + if (fdef == nullptr) { + return errors::NotFound("Function ", function_name, " is not defined."); + } + TF_RETURN_IF_ERROR(FunctionDefToBody(*fdef, attrs, &fbody)); + } + + { + mutex_lock l(mu_); + *handle = gtl::FindWithDefault(table_, key, kInvalidHandle); + if (*handle != kInvalidHandle) { + delete fbody; + } else { + *handle = func_graphs_.size(); + table_.insert({key, *handle}); + func_graphs_.push_back(fbody); + items_.resize(func_graphs_.size()); + } + } + return Status::OK(); +} + +static void DumpGraph(const char* label, const Graph* g) { + if (VLOG_IS_ON(1)) { + LOG(INFO) << label << ": " << std::endl << DebugString(g); + } +} + +static void SimplifyGraph(Graph* g) { + if (RemoveListArrayConverter(g)) { + DumpGraph("RemoveListArrayConverter", g); + } + bool changed; + do { + changed = false; + if (RemoveDeadNodes(g)) { + changed = true; + DumpGraph("RemoveDeadNodes", g); + } + if (RemoveIdentityNodes(g)) { + changed = true; + DumpGraph("RemoveIdentityNodes", g); + } + FixupSourceAndSinkEdges(g); + OptimizeCSE(g, nullptr); + DumpGraph("OptimizeCSE", g); + } while (changed); +} + +void OptimizeGraph(FunctionLibraryRuntime* lib, Graph** g) { + DumpGraph("Initial", *g); + const int kNumInlineRounds = 10; + for (int i = 0; i < kNumInlineRounds; ++i) { + if (!ExpandInlineFunctions(lib, *g)) break; + DumpGraph("ExpandInlineFunctions", *g); + SimplifyGraph(*g); + } + + // Makes a copy so that we densify node ids. + Graph* copy = new Graph((*g)->op_registry()); + CopyGraph(**g, copy); + delete *g; + *g = copy; + DumpGraph("ReCopy", *g); +} + +Status FunctionLibraryRuntimeImpl::CreateItem(Handle handle, Item** item) { + const FunctionBody* fbody = GetFunctionBody(handle); + CHECK_NOTNULL(fbody); + Graph* g = new Graph(lib_def_); + CopyGraph(*fbody->graph, g); + OptimizeGraph(this, &g); + + // Creates an executor based on the g. This must be done without + // holding mu_ because create_kernel_ calls back into the library. + LocalExecutorParams params; + params.device = device_; + params.function_library = this; + params.has_control_flow = false; + params.create_kernel = create_kernel_; + params.delete_kernel = [](OpKernel* kernel) { + DeleteNonCachedKernel(kernel); + }; + Executor* exec; + TF_RETURN_IF_ERROR(NewLocalExecutor(params, g, &exec)); + + *item = new Item; + (*item)->exec = exec; + return Status::OK(); +} + +Status FunctionLibraryRuntimeImpl::GetOrCreateItem(Handle handle, Item** item) { + { + mutex_lock l(mu_); + if (handle >= items_.size()) { + return errors::NotFound("Function handle ", handle, + " is not valid. 
Likely an internal error."); + } + *item = items_[handle]; + if (*item != nullptr) { + (*item)->Ref(); + return Status::OK(); + } + } + // NOTE: We need to call CreateItem out of mu_ because creating an + // executor needs to call CreateKernel. + TF_RETURN_IF_ERROR(CreateItem(handle, item)); + + { + mutex_lock l(mu_); + if (items_[handle] == nullptr) { + // Install *item in items_. + items_[handle] = *item; + (*item)->Ref(); + } + } + return Status::OK(); +} + +void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle, + gtl::ArraySlice<Tensor> args, + std::vector<Tensor>* rets, + DoneCallback done) { + if (opts.cancellation_manager && opts.cancellation_manager->IsCancelled()) { + return done(errors::Cancelled("")); + } + const FunctionBody* fbody = GetFunctionBody(handle); + FunctionCallFrame* frame = + new FunctionCallFrame(fbody->arg_types, fbody->ret_types); + Status s = frame->SetArgs(args); + if (!s.ok()) { + delete frame; + return done(s); + } + Item* item = nullptr; + s = GetOrCreateItem(handle, &item); + if (!s.ok()) { + delete frame; + return done(s); + } + Executor::Args exec_args; + exec_args.call_frame = frame; + exec_args.cancellation_manager = opts.cancellation_manager; + exec_args.runner = runner_; + item->exec->RunAsync( + // Executor args + exec_args, + // Done callback. + [item, frame, rets, done](const Status& status) { + item->Unref(); + Status s = status; + if (s.ok()) { + s = frame->GetRetvals(rets); + } + delete frame; + done(s); + }); +} + +bool FunctionLibraryRuntimeImpl::IsDefined(const string& function_name) { + return lib_def_->Find(function_name) != nullptr; +} + +FunctionLibraryRuntime* NewFunctionLibraryRuntime( + Device* device, Runner runner, const FunctionLibraryDefinition* lib_def) { + return new FunctionLibraryRuntimeImpl(device, runner, lib_def); +} + +bool RemoveDeadNodes(Graph* g) { + std::vector<bool> visited(g->num_node_ids(), false); + visited[Graph::kSourceId] = true; + visited[Graph::kSinkId] = true; + std::deque<Node*> q; + for (auto n : g->nodes()) { + if (n->op_def().is_stateful()) { + visited[n->id()] = true; + } else if (n->type_string() == kArgOp) { + visited[n->id()] = true; + } else if (n->type_string() == kRetOp) { + visited[n->id()] = true; + q.push_back(n); + } + } + while (!q.empty()) { + const Node* n = q.front(); + q.pop_front(); + visited[n->id()] = true; + for (auto e : n->in_edges()) { + q.push_back(e->src()); + } + } + bool removed_any = false; + for (Node* n : g->nodes()) { + if (!visited[n->id()]) { + g->RemoveNode(n); + removed_any = true; + } + } + return removed_any; +} + +namespace { +// If 'edges' contains only 1 non-control edge, returns it. Otherwise, +// returns a nullptr. 
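+// For example, if a node's in_edges() hold exactly one data edge, that
+// edge is returned; a second data edge, or any control edge, yields
+// nullptr.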
+const Edge* GetTheOnlyDataEdge(const EdgeSet& edges) { + const Edge* ret = nullptr; + for (const Edge* e : edges) { + if (e->IsControlEdge() || ret) return nullptr; + ret = e; + } + return ret; +} +} // end namespace + +bool RemoveIdentityNodes(Graph* g) { + bool removed_any = false; + gtl::InlinedVector<Node*, 8> matches; + for (Node* n : g->nodes()) { + if ((n->type_string() == "Identity") && GetTheOnlyDataEdge(n->in_edges())) { + matches.push_back(n); + } + } + if (!matches.empty()) { + for (Node* n : matches) { + const Edge* in = GetTheOnlyDataEdge(n->in_edges()); + for (const Edge* out : n->out_edges()) { + if (out->IsControlEdge()) { + g->AddControlEdge(in->src(), out->dst()); + } else { + g->AddEdge(in->src(), in->src_output(), out->dst(), out->dst_input()); + } + } + g->RemoveNode(n); + removed_any = true; + } + } + return removed_any; +} + +bool RemoveListArrayConverter(Graph* g) { + gtl::InlinedVector<Node*, 8> matches; + for (Node* n : g->nodes()) { + if ((n->type_string() == "_ListToArray") || + (n->type_string() == "_ArrayToList")) { + matches.push_back(n); + } + } + bool removed_any = false; + if (!matches.empty()) { + for (Node* n : matches) { + if (n->num_inputs() != n->num_outputs()) { + continue; // Not expected. Skip. + } + gtl::InlinedVector<Node*, 8> identity_nodes(n->num_inputs(), nullptr); + + // Process input edges first. + Node* input_control_node = nullptr; + for (const Edge* e : n->in_edges()) { + if (e->IsControlEdge()) { + if (input_control_node == nullptr) { + // If node "n" has any control dependencies, adds a no-op + // node (input_control_node) which the additional Identity + // nodes depends on and the input_control_node depends on + // the node "n"s control dependencies. + input_control_node = AddNoOp(g); + } + g->AddControlEdge(e->src(), input_control_node); + } else { + const int index = e->dst_input(); + Node** id_node = &identity_nodes[index]; + if (*id_node != nullptr) { + LOG(ERROR) + << "RemoveListArrayConverter unexpected duplicated input: " + << e->dst_input(); + return removed_any; + } + *id_node = AddIdentity(g, {e->src(), e->src_output()}); + } + } + + // If node "n" has any control dependencies, the added identity + // nodes should have control dependencies on input_control_node. + if (input_control_node != nullptr) { + for (Node* id : identity_nodes) { + g->AddControlEdge(input_control_node, id); + } + } + + Node* output_control_node = nullptr; + for (const Edge* e : n->out_edges()) { + if (e->IsControlEdge()) { + if (output_control_node == nullptr) { + // If node "n" is control-depended upon by other nodes, + // adds a no-op node (output_control_node) which those + // nodes will depend on and output_control_node depends on + // all Identity nodes. + output_control_node = AddNoOp(g); + } + g->AddControlEdge(output_control_node, e->dst()); + } else { + Node* id_node = identity_nodes[e->src_output()]; + if (id_node == nullptr) { + LOG(ERROR) << "RemoveListArrayConverter unexpected missing input: " + << e->src_output(); + return removed_any; + } + CHECK(id_node); + g->AddEdge(id_node, 0, e->dst(), e->dst_input()); + } + } + + // If any nodes have control dependencies on node "n", those + // nodes should have control dependencies on + // output_control_node. 
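+      // For example, a node that had a control edge from "n" now gets its
+      // control edge from output_control_node, which in turn depends on
+      // every Identity node added above.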
+ if (output_control_node != nullptr) { + for (Node* id : identity_nodes) { + g->AddControlEdge(id, output_control_node); + } + } + + g->RemoveNode(n); + removed_any = true; + } + } + return removed_any; +} + +// Returns true iff the function '*fbody' can be inlined at 'node' +// based on the type signature of 'node' and 'fbody'. +static bool ValidateInlining(const Node* node, const FunctionBody* fbody) { + if (static_cast<size_t>(node->num_inputs()) != fbody->arg_types.size()) { + return false; + } + if (static_cast<size_t>(node->num_inputs()) != fbody->arg_nodes.size()) { + return false; + } + if (static_cast<size_t>(node->num_outputs()) != fbody->ret_types.size()) { + return false; + } + if (static_cast<size_t>(node->num_outputs()) != fbody->ret_nodes.size()) { + return false; + } + for (int i = 0; i < node->num_inputs(); ++i) { + if (node->input_type(i) != fbody->arg_types[i]) return false; + } + for (int i = 0; i < node->num_outputs(); ++i) { + if (node->output_type(i) != fbody->ret_types[i]) return false; + } + return true; +} + +// Given a "caller" in "graph", which is a function call of a function +// to "fbody". Replaces the "caller" with fbody->graph and connects +// edges properly. +static void InlineFunctionBody(Graph* g, Node* caller, + const FunctionBody* fbody) { + if (!ValidateInlining(caller, fbody)) { + LOG(WARNING) << "Inlining mismatch: " << caller->DebugString() << " vs. " + << DebugString(fbody->graph); + return; + } + + // Duplicate fbody->graph into 'g'. First, we copy the nodes of + // fbody->graph into 'g' except the source and sink nodes. We copy + // edges among nodes in 'fbody->graph'. + // + // If 'x' is a node in fbody->graph and its copy in 'g' is 'y', we + // remember 'y' in node_map[x->id()]. + std::vector<Node*> node_map(fbody->graph->num_node_ids()); + for (Node* n : fbody->graph->nodes()) { + if (n->IsSource() || n->IsSink()) continue; + CHECK(n->IsOp()); + node_map[n->id()] = g->CopyNode(n); + } + for (const Edge* e : fbody->graph->edges()) { + if (e->src()->IsSource() || e->src()->IsSink() || e->dst()->IsSource() || + e->dst()->IsSink()) { + continue; + } + Node* src_copy = node_map[e->src()->id()]; + Node* dst_copy = node_map[e->dst()->id()]; + g->AddEdge(src_copy, e->src_output(), dst_copy, e->dst_input()); + } + + // Connect input edges. + // + // For data edges coming into "caller", we first compute the + // <src>:<src_output> for the i-th input in "inputs". We create one + // Identity node for each input. Then, we connect inputs[i] to to + // the i-th identity node added. The nodes that previously connects + // to the j-th output of i-th arg node are reconnected to th i-th + // identity node. + // + // If "caller" has any input control dependencies, we add a NoOp + // node "input_control_node". This "input_control_node" depends on + // what "caller" depends on, and the added identity nodes depend on + // "input_control_node". 
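+  // For example, a caller whose two data inputs come from nodes a and b
+  // gets two fresh Identity nodes fed by a and b; consumers of the
+  // inlined body's _Arg(0) and _Arg(1) are rewired to read from those
+  // Identity nodes instead.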
+ std::vector<Endpoint> inputs(caller->num_inputs()); + Node* input_control_node = nullptr; + for (const Edge* e : caller->in_edges()) { + if (e->IsControlEdge()) { + if (input_control_node == nullptr) { + input_control_node = AddNoOp(g); + } + g->AddControlEdge(e->src(), input_control_node); + } else { + inputs[e->dst_input()] = {e->src(), e->src_output()}; + } + } + for (std::size_t i = 0; i < fbody->arg_nodes.size(); ++i) { + Node* arg = node_map[fbody->arg_nodes[i]->id()]; + Node* n = AddIdentity(g, inputs[i]); + if (input_control_node) { + g->AddControlEdge(input_control_node, n); + } + for (const Edge* e : arg->out_edges()) { + if (e->IsControlEdge()) { + g->AddControlEdge(n, e->dst()); + } else { + g->AddEdge(n, 0, e->dst(), e->dst_input()); + } + } + node_map[fbody->arg_nodes[i]->id()] = n; + g->RemoveNode(arg); // 'arg' is disconnected. + } + + // Connect output edges. + // + // For i-th return node in fbody->graph, we add in "g" an identity + // node (outputs[i-th]). We then reconnect every incoming edge into + // the i-th return node to the added identity node. + // + // For every data edge coming out of "callee"s i-th output, we + // reconnect it to the i-th identity added above. + // + // If "callee" is control-depended upon by any other nodes, we add a + // NoOp node "output_control_node". "output_control_node" depends on + // all identity nodes added above. And nodes previously depend on + // "callee" is changed to depend on "output_control_node". + std::vector<Node*> outputs(caller->num_inputs()); + for (std::size_t i = 0; i < fbody->ret_nodes.size(); ++i) { + Node* ret = node_map[fbody->ret_nodes[i]->id()]; + Endpoint data; // Data input for the ret node. + for (const Edge* e : ret->in_edges()) { + if (!e->IsControlEdge()) { + data = {e->src(), e->src_output()}; + break; + } + } + CHECK(data.node != nullptr); + Node* n = AddIdentity(g, data); + outputs[i] = n; + for (const Edge* e : ret->in_edges()) { + if (e->IsControlEdge()) { + g->AddControlEdge(e->src(), n); + } + } + g->RemoveNode(ret); // 'ret' is disconnected. + } + Node* output_control_node = nullptr; + for (const Edge* e : caller->out_edges()) { + if (e->IsControlEdge()) { + if (output_control_node == nullptr) { + output_control_node = AddNoOp(g); + for (Node* n : outputs) { + g->AddControlEdge(n, output_control_node); + } + } + g->AddControlEdge(output_control_node, e->dst()); + } else { + g->AddEdge(outputs[e->src_output()], 0, e->dst(), e->dst_input()); + } + } + g->RemoveNode(caller); // 'caller' is replaced with inlined nodes. +} + +bool ExpandInlineFunctions(FunctionLibraryRuntime* lib, Graph* graph) { + std::vector<std::pair<Node*, const FunctionBody*>> candidates; + for (Node* node : graph->nodes()) { + VLOG(3) << "Expanding " << node->DebugString(); + FunctionLibraryRuntime::Handle handle; + Status s = + lib->Instantiate(node->type_string(), node->def().attr(), &handle); + if (!s.ok()) { + // Either "node" is a primitive op, or the instantiation failed. + if (errors::IsNotFound(s)) { + VLOG(2) << "ExpandInlineFunctions " << s; + } else { + LOG(ERROR) << "ExpandInlineFunctions " << s; + } + continue; + } + const FunctionBody* fbody = lib->GetFunctionBody(handle); + CHECK_NOTNULL(fbody); + candidates.push_back({node, fbody}); + } + for (const auto& p : candidates) { + InlineFunctionBody(graph, p.first, p.second); + } + return !candidates.empty(); +} + +// TODO(zhifengc): Maybe this should be the default Graph::AsGraphDef. +// and stash the original NodeDef name as an attr for documentation +// purpose. 
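+// For example, the node with id 7 is emitted as "n7"; an input taken from
+// its second output is written "n7:1", and a control input as "^n7".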
+static void ToGraphDef(const Graph* g, GraphDef* gdef) { + // We visit nodes in forward topological sort order, which is a + // possible execution order of the graph. + std::vector<int> pending(g->num_node_ids()); + std::deque<const Node*> ready; + for (const Node* n : g->nodes()) { + pending[n->id()] = n->in_edges().size(); + if (pending[n->id()] == 0) ready.push_back(n); + } + gtl::InlinedVector<const Edge*, 4> inputs; + gdef->Clear(); + while (!ready.empty()) { + const Node* n = ready.front(); + ready.pop_front(); + for (const Edge* e : n->out_edges()) { + const Node* next = e->dst(); + if (--pending[next->id()] == 0) { + ready.push_back(next); + } + } + if (!n->IsOp()) continue; + NodeDef* ndef = gdef->add_node(); + ndef->set_name(strings::StrCat("n", n->id())); + ndef->set_op(n->type_string()); + *(ndef->mutable_attr()) = n->def().attr(); + inputs.clear(); + inputs.resize(n->num_inputs()); + for (const Edge* e : n->in_edges()) { + if (e->IsControlEdge()) { + inputs.push_back(e); + } else { + if (inputs[e->dst_input()] == nullptr) { + inputs[e->dst_input()] = e; + } else { + LOG(WARNING) << "Malformed graph node. multiple input edges: " + << n->DebugString(); + } + } + } + // node->name() is merely NodeDef::name, which are not guaranteed + // to be unique and stable after optimization rewrites. Therefore, + // we use "n<node id>" instead. + for (const Edge* e : inputs) { + if (e == nullptr) { + ndef->add_input("unknown"); + } else if (!e->src()->IsOp()) { + } else if (e->IsControlEdge()) { + ndef->add_input(strings::StrCat("^n", e->src()->id())); + } else if (e->src_output() == 0) { + ndef->add_input(strings::StrCat("n", e->src()->id())); + } else { + ndef->add_input( + strings::StrCat("n", e->src()->id(), ":", e->src_output())); + } + } + } +} + +string DebugString(const Graph* g) { + GraphDef gdef; + ToGraphDef(g, &gdef); + return DebugString(gdef); +} + +FunctionBody::FunctionBody(const FunctionDef& f, DataTypeSlice arg_t, + DataTypeSlice ret_t, Graph* g) + : fdef(f), + graph(g), + arg_types(arg_t.begin(), arg_t.end()), + ret_types(ret_t.begin(), ret_t.end()) { + this->arg_nodes.resize(arg_types.size()); + this->ret_nodes.resize(ret_types.size()); + for (Node* n : this->graph->nodes()) { + gtl::InlinedVector<Node*, 4>* node_vec; + if (n->type_string() == kRetOp) { + node_vec = &this->ret_nodes; + } else if (n->type_string() == kArgOp) { + node_vec = &this->arg_nodes; + } else { + continue; + } + int index; + TF_CHECK_OK(GetNodeAttr(n->def(), "index", &index)); + CHECK_LE(0, index); + CHECK_LT(index, node_vec->size()); + (*node_vec)[index] = n; + } +} + +FunctionBody::~FunctionBody() { delete this->graph; } + +class SymbolicGradientHelper { + public: + explicit SymbolicGradientHelper(const FunctionBody& f) : fbody_(&f) {} + + ~SymbolicGradientHelper() { delete gbody_; } + + FunctionBody* Compute(); + + private: + const FunctionBody* fbody_; + FunctionBody* gbody_ = nullptr; + + // A vector of output endpoints which represents backpropagated + // gradients + typedef std::vector<Endpoint> BackpropedGradients; + + // backprops_ is a map from an output endpoint to its accumulated + // gradients. When an output endpoint has accumulated all its + // gradients, we add a node which sums them up. + std::unordered_map<Endpoint, BackpropedGradients, EndpointHash, EndpointEq> + backprops_; + + // pending[i] is count-down counter for i-th node's expected + // backprops. When pending[i] becomes zero, we collected all + // backprop gradients for all output endpoint of the ith-node. 
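+  // For example, a node with three outgoing data edges starts at three and
+  // becomes ready only after a gradient (or zero) has been propagated back
+  // along each of those edges.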
+ std::vector<int> pending_; + + // 'ready' keeps track of nodes that have been completely + // backpropped. Initially, for every output y of the function f, we + // add dy as an input of the the gradient function. + std::deque<Node*> ready_; + + // Makes a copy of fbody_ in gbody_. + void Copy(); + + // Initialize pending_ and ready_. + void InitBackprop(); + + // In the original function body, there is a forward edge from 'src' + // to 'dst', when the backprop algorithm constructs the node + // 'dst_grad' which computes the gradient, we need to propagate it + // to 'src'. + void BackpropAlongEdge(const Endpoint& dst_grad, const Endpoint& src); + void BackpropZerosAlongEdge(const Endpoint& src); + + Endpoint SumGradients(const Endpoint& src); + + TF_DISALLOW_COPY_AND_ASSIGN(SymbolicGradientHelper); +}; + +void SymbolicGradientHelper::Copy() { + const Graph& src = *(fbody_->graph); + gbody_->graph = new Graph(src.op_registry()); + Graph* dst = gbody_->graph; + + std::vector<Node*> node_map(src.num_node_ids()); + + // Copy the nodes. + node_map[src.source_node()->id()] = dst->source_node(); + node_map[src.sink_node()->id()] = dst->sink_node(); + for (Node* n : src.nodes()) { + if (n->IsSource() || n->IsSink()) continue; + CHECK(n->IsOp()); + node_map[n->id()] = dst->CopyNode(n); + } + + // Copy the edges. + for (const Edge* e : src.edges()) { + Node* src_copy = node_map[e->src()->id()]; + Node* dst_copy = node_map[e->dst()->id()]; + dst->AddEdge(src_copy, e->src_output(), dst_copy, e->dst_input()); + } + + // Save inputs in copied graph. + CHECK_EQ(fbody_->arg_types.size(), fbody_->arg_nodes.size()); + gbody_->arg_types = fbody_->arg_types; + for (std::size_t i = 0; i < fbody_->arg_nodes.size(); ++i) { + gbody_->arg_nodes.push_back(node_map[fbody_->arg_nodes[i]->id()]); + } + + // Save outputs in copied graph. + CHECK_EQ(fbody_->ret_types.size(), fbody_->ret_nodes.size()); + gbody_->ret_types = fbody_->ret_types; + for (std::size_t i = 0; i < fbody_->ret_nodes.size(); ++i) { + gbody_->ret_nodes.push_back(node_map[fbody_->ret_nodes[i]->id()]); + } +} + +void SymbolicGradientHelper::BackpropAlongEdge(const Endpoint& dst_grad, + const Endpoint& src) { + CHECK_NOTNULL(src.node); + auto iter = backprops_.find(src); + if (iter != backprops_.end()) { + auto* grads = &iter->second; + grads->push_back(dst_grad); + if (--pending_[src.node->id()] == 0) { + ready_.push_back(src.node); + } + } +} + +void SymbolicGradientHelper::BackpropZerosAlongEdge(const Endpoint& src) { + CHECK_NOTNULL(src.node); + auto iter = backprops_.find(src); + if (iter != backprops_.end()) { + if (--pending_[src.node->id()] == 0) { + ready_.push_back(src.node); + } + } +} + +void SymbolicGradientHelper::InitBackprop() { + Graph* g = gbody_->graph; + pending_.resize(g->num_node_ids(), 0); + { + backprops_.clear(); + std::unordered_set<Node*> visited; + std::deque<Node*> queue; + for (Node* n : gbody_->arg_nodes) { + queue.push_back(n); + } + + // Going forward to figure out which endpoints need backprop-ed. + // A node's endpoints need to be backprop-ed only if one of the + // arg node can reach the node via data edges. 
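+    // This is a forward breadth-first walk over data edges starting from
+    // the _Arg nodes; nodes that are never reached keep a zero pending
+    // count and get no entries in backprops_.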
+ while (!queue.empty()) { + Node* n = queue.front(); + queue.pop_front(); + visited.insert(n); + for (int i = 0; i < n->num_outputs(); ++i) { + backprops_[{n, i}].clear(); + } + int num_expected_backprops = 0; + for (const Edge* e : n->out_edges()) { + if (e->IsControlEdge()) continue; + ++num_expected_backprops; + if (visited.find(e->dst()) == visited.end()) { + queue.push_back(e->dst()); + } + } + pending_[n->id()] = num_expected_backprops; + } + } + + { + const int num_y = gbody_->ret_nodes.size(); + for (int i = 0; i < num_y; ++i) { + Node* y = gbody_->ret_nodes[i]; + DCHECK_EQ(y->type_string(), kRetOp); + const DataType dtype = y->input_type(0); + const int index = gbody_->arg_nodes.size(); + Node* dy = AddArg(g, dtype, index); + gbody_->arg_types.push_back(dtype); + gbody_->arg_nodes.push_back(dy); + + // What's the input to y? + Endpoint y_in{nullptr, 0}; + for (const Edge* e : y->in_edges()) { + if (!e->IsControlEdge()) { + y_in = {e->src(), e->src_output()}; + break; + } + } + CHECK_NOTNULL(y_in.node); + BackpropAlongEdge({dy, 0}, y_in); + } + } +} + +Endpoint SymbolicGradientHelper::SumGradients(const Endpoint& src) { + Graph* g = gbody_->graph; + const DataType dtype = src.dtype(); + auto iter = backprops_.find(src); + CHECK(iter != backprops_.end()); + const auto& grads = iter->second; + if (grads.empty()) { + // Nothing propagated back. The best we can come up is zeros. + Node* zero_like = AddZerosLike(g, src); + return {zero_like, 0}; + } + if (grads.size() == 1) { + // Just one backprop edge. + return grads[0]; + } + // Otherwise, adds backprop-ed gradients. + NodeDef ndef; + ndef.set_name(g->NewName(kNodeLabel)); + ndef.set_op("AddN"); // N-way Add + for (const Endpoint& ep : grads) { + ndef.add_input(ep.name()); + } + AddNodeAttr("N", static_cast<int64>(grads.size()), &ndef); + AddNodeAttr("T", dtype, &ndef); + Status s; + Node* add = gbody_->graph->AddNode(ndef, &s); + TF_CHECK_OK(s); + for (size_t i = 0; i < grads.size(); ++i) { + const Endpoint& ep = grads[i]; + g->AddEdge(ep.node, ep.index, add, i); + } + return {add, 0}; +} + +static bool IsPrimitiveOpWithNoGrad(const string& func) { + gradient::Creator creator; + Status s = gradient::GetOpGradientCreator(func, &creator); + return s.ok() && (creator == nullptr); +} + +FunctionBody* SymbolicGradientHelper::Compute() { + CHECK(gbody_ == nullptr); + gbody_ = new FunctionBody; + + // Copy fbody_ into gbody_. + Copy(); + + // Initialize backprops. + InitBackprop(); + + // Backward propagation. + gtl::InlinedVector<Endpoint, 8> dy; + Graph* g = gbody_->graph; + while (!ready_.empty()) { + // n has collected all gradients. + Node* n = ready_.front(); + ready_.pop_front(); + + if (n->type_string() == kArgOp) { + // We'll handle the _Arg node after backprop is done. + continue; + } + + // "n" has num_x inputs and num_y outputs. + const int num_x = n->num_inputs(); + const int num_y = n->num_outputs(); + + // dy[i] is the sum of i-th output's backpropped gradients. + dy.clear(); + dy.resize(num_y, {nullptr, 0}); + for (int i = 0; i < num_y; ++i) { + dy[i] = SumGradients({n, i}); + } + + if (IsPrimitiveOpWithNoGrad(n->type_string())) { + // No grad defined for this op. Backprops zeros along the in + // edges. + for (const Edge* e : n->in_edges()) { + if (e->IsControlEdge()) continue; + BackpropZerosAlongEdge({e->src(), e->src_output()}); + } + continue; + } + + // Adds a gradient node with num_x + num_y inputs and num_x + // outputs. 
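+    // For example (hypothetical two-input op): if n computes y = op(x0, x1),
+    // the gradient node consumes (x0, x1, dy) and produces (dx0, dx1),
+    // which are then propagated back along n's input edges below.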
+ Node* grad = AddSymGrad(g, n, dy); + for (const Edge* e : n->in_edges()) { + if (e->IsControlEdge()) continue; + g->AddEdge(e->src(), e->src_output(), grad, e->dst_input()); + } + for (int i = 0; i < num_y; ++i) { + g->AddEdge(dy[i].node, dy[i].index, grad, num_x + i); + } + + // Backprops along the in edges. + for (const Edge* e : n->in_edges()) { + if (e->IsControlEdge()) continue; + BackpropAlongEdge({grad, e->dst_input()}, {e->src(), e->src_output()}); + } + } + + // The gradient's retval nodes. + for (Node* n : gbody_->ret_nodes) { + g->RemoveNode(n); + } + gbody_->ret_types = fbody_->arg_types; + gbody_->ret_nodes.clear(); + for (size_t i = 0; i < fbody_->arg_types.size(); ++i) { + Endpoint grad = SumGradients({gbody_->arg_nodes[i], 0}); + Node* ret = AddRet(g, grad, i); + gbody_->ret_nodes.push_back(ret); + } + + auto ret = gbody_; + gbody_ = nullptr; + return ret; +} + +FunctionBody* SymbolicGradient(const FunctionBody& f) { + return SymbolicGradientHelper(f).Compute(); +} + +} // end namespace tensorflow diff --git a/tensorflow/core/common_runtime/function.h b/tensorflow/core/common_runtime/function.h new file mode 100644 index 0000000000..634b31232a --- /dev/null +++ b/tensorflow/core/common_runtime/function.h @@ -0,0 +1,100 @@ +#ifndef TENSORFLOW_COMMON_RUNTIME_FUNCTION_H_ +#define TENSORFLOW_COMMON_RUNTIME_FUNCTION_H_ + +#include <functional> + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/graph/graph.h" + +namespace tensorflow { + +// Creates a FunctionLibraryRuntime, which instantiates functions +// defined in "lib_def" and executes functions on the "device". +// +// The returned object does not take ownerships of "device" or +// "lib_def". The caller must ensure "device" and "lib_def" outlives +// the returned object. +typedef std::function<void()> Closure; +typedef std::function<void(Closure)> Runner; +FunctionLibraryRuntime* NewFunctionLibraryRuntime( + Device* device, Runner runner, const FunctionLibraryDefinition* lib_def); + +// FunctionLibraryRuntime::GetFunctionBody returns a description of an +// instantiated function that is represented as a Graph with arg/ret +// nodes annotated. +struct FunctionBody { + FunctionDef fdef; + Graph* graph = nullptr; // owned. + DataTypeVector arg_types; + DataTypeVector ret_types; + gtl::InlinedVector<Node*, 4> arg_nodes; + gtl::InlinedVector<Node*, 4> ret_nodes; + + FunctionBody() {} + FunctionBody(const FunctionDef& f, DataTypeSlice arg_types, + DataTypeSlice ret_types, Graph* g); + ~FunctionBody(); +}; + +// Debugging facility. Returns a debug string for a graph +// representing an instantiated function. +string DebugString(const Graph* instantiated_func_graph); + +// A few hand-crafted optimization on the instantiated function body +// (a Graph*). + +// Removes nodes that are +// 1. not stateful; and +// 2. not _Arg; and +// 3. not reachable from _Retval. +// Returns true iff any node is removed from "g". +bool RemoveDeadNodes(Graph* g); + +// Find a pattern: +// src -(in)-> node -(out)-> dst, where +// 1) node is an identity node; +// 2) in is the only incoming data edge; +// 3) out is the only outgoing data edge; +// +// Rewrites the above pattern with src->dst and relevant data +// dependencies updated. Repeat the process until no such pattern +// left. +bool RemoveIdentityNodes(Graph* g); + +// Rewrites _ListToArray and _ArrayToList to a set of Identity nodes. 
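+// Each data input of such a converter is forwarded through its own
+// Identity node; incoming and outgoing control edges are preserved via
+// added NoOp nodes.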
+bool RemoveListArrayConverter(Graph* g); + +// For each node in "graph", if "lib" indicates that the node is a +// function call, inline the function body. Returns true if at least +// one node is inlined. +// +// This routine goes through "graph" nodes once and applies the +// inlining. The caller may decide to apply the inlining on "graph" +// multiple times by calling ExpandInlineFunctions a few times. +bool ExpandInlineFunctions(FunctionLibraryRuntime* lib, Graph* graph); + +// Applies graph rewrite optimzation such as inlining, dead code +// removal, etc. +// +// **g is a graph constructed based on the runtime library 'lib'. +// OptimizeGraph mutates **g extensively and replaces '*g' with a +// complete copy. Therefore, the caller should not keep any references +// to nodes *g. +void OptimizeGraph(FunctionLibraryRuntime* lib, Graph** g); + +// Given a numerical function "f", returns another numerical function +// "g", such that if "f" takes N inputs and produces M outputs, "g" +// takes N + M inputs and produces N outputs. I.e., if +// (y1, y2, ..., y_M) = f(x1, x2, ..., x_N), +// g is a function which is +// (dL/dx1, dL/dx2, ..., dL/dx_N) = g(x1, x2, ..., x_N, +// dL/dy1, dL/dy2, ..., dL/dy_M), +// where L is a scalar-value function of (...x_i...). +// +// TODO(zhifengc): Asks math expert to say the comment again. +FunctionBody* SymbolicGradient(const FunctionBody& f); + +} // end namespace tensorflow + +#endif // TENSORFLOW_COMMON_RUNTIME_FUNCTION_H_ diff --git a/tensorflow/core/common_runtime/gpu/dma_helper.h b/tensorflow/core/common_runtime/gpu/dma_helper.h new file mode 100644 index 0000000000..7b0750f405 --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/dma_helper.h @@ -0,0 +1,18 @@ +#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_DMA_HELPER_H_ +#define TENSORFLOW_COMMON_RUNTIME_GPU_DMA_HELPER_H_ + +#include "tensorflow/core/public/tensor.h" + +// For internal use only. Visibility should be limited to brain/framework. 
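+//
+// Sketch of intended use (hypothetical caller): check
+// DMAHelper::CanUseDMA(&t) before handing DMAHelper::base(&t) or
+// DMAHelper::buffer(&t) to a raw device copy routine; tensors that are
+// not DMA-able need a different copy path.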
+ +namespace tensorflow { +class DMAHelper { + public: + static bool CanUseDMA(const Tensor* t) { return t->CanUseDMA(); } + static const void* base(const Tensor* t) { return t->base<const void>(); } + static void* base(Tensor* t) { return t->base<void>(); } + static TensorBuffer* buffer(Tensor* t) { return t->buf_; } + static const TensorBuffer* buffer(const Tensor* t) { return t->buf_; } +}; +} // namespace tensorflow +#endif // TENSORFLOW_COMMON_RUNTIME_GPU_DMA_HELPER_H_ diff --git a/tensorflow/core/common_runtime/gpu/gpu_allocator_retry.cc b/tensorflow/core/common_runtime/gpu/gpu_allocator_retry.cc new file mode 100644 index 0000000000..742459c63b --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/gpu_allocator_retry.cc @@ -0,0 +1,49 @@ +#include "tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h" +#include "tensorflow/core/public/env.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/port.h" + +namespace tensorflow { + +GPUAllocatorRetry::GPUAllocatorRetry() : env_(Env::Default()) {} + +void* GPUAllocatorRetry::AllocateRaw( + std::function<void*(size_t alignment, size_t num_bytes, + bool verbose_failure)> alloc_func, + int max_millis_to_wait, size_t alignment, size_t num_bytes) { + if (num_bytes == 0) { + LOG(WARNING) << "Request to allocate 0 bytes"; + return nullptr; + } + uint64 deadline_micros = env_->NowMicros() + max_millis_to_wait * 1000; + void* ptr = nullptr; + while (ptr == nullptr) { + ptr = alloc_func(alignment, num_bytes, false); + if (ptr == nullptr) { + uint64 now = env_->NowMicros(); + if (now < deadline_micros) { + mutex_lock l(mu_); + WaitForMilliseconds(&l, &memory_returned_, + (deadline_micros - now) / 1000); + } else { + return alloc_func(alignment, num_bytes, true); + } + } + } + return ptr; +} + +void GPUAllocatorRetry::DeallocateRaw(std::function<void(void*)> dealloc_func, + void* ptr) { + if (ptr == nullptr) { + LOG(ERROR) << "Request to free nullptr"; + return; + } + dealloc_func(ptr); + { + mutex_lock l(mu_); + memory_returned_.notify_all(); + } +} + +} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h b/tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h new file mode 100644 index 0000000000..a3298ab222 --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h @@ -0,0 +1,36 @@ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ALLOCATOR_RETRY_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ALLOCATOR_RETRY_H_ + +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/public/env.h" + +namespace tensorflow { + +// A retrying wrapper for a memory allocator. +class GPUAllocatorRetry { + public: + GPUAllocatorRetry(); + + // Call 'alloc_func' to obtain memory. On first call, + // 'verbose_failure' will be false. If return value is nullptr, + // then wait up to 'max_millis_to_wait' milliseconds, retrying each + // time a call to DeallocateRaw() is detected, until either a good + // pointer is returned or the deadline is exhausted. If the + // deadline is exahusted, try one more time with 'verbose_failure' + // set to true. The value returned is either the first good pointer + // obtained from 'alloc_func' or nullptr. + void* AllocateRaw(std::function<void*(size_t alignment, size_t num_bytes, + bool verbose_failure)> alloc_func, + int max_millis_to_wait, size_t alignment, size_t bytes); + + // Calls dealloc_func(ptr) and then notifies any threads blocked in + // AllocateRaw() that would like to retry. 
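+  //
+  // Typical wrapping pattern (sketch; "MyAllocator", "TryAllocateOnce" and
+  // "FreeOnce" are illustrative names only):
+  //   void* MyAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
+  //     return retry_.AllocateRaw(
+  //         [this](size_t a, size_t nb, bool verbose) {
+  //           return TryAllocateOnce(a, nb, verbose);
+  //         },
+  //         10000 /* max_millis_to_wait */, alignment, num_bytes);
+  //   }
+  //   void MyAllocator::DeallocateRaw(void* ptr) {
+  //     retry_.DeallocateRaw([this](void* p) { FreeOnce(p); }, ptr);
+  //   }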
+ void DeallocateRaw(std::function<void(void* ptr)> dealloc_func, void* ptr); + + private: + Env* env_; + mutex mu_; + condition_variable memory_returned_; +}; +} // namespace tensorflow +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ALLOCATOR_RETRY_H_ diff --git a/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc b/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc new file mode 100644 index 0000000000..db1c58cc65 --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc @@ -0,0 +1,175 @@ +#include "tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h" + +#include "tensorflow/core/lib/core/notification.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/public/env.h" +#include <gtest/gtest.h> + +namespace tensorflow { +namespace { + +class FakeAllocator { + public: + FakeAllocator(size_t cap, int millis_to_wait) + : memory_capacity_(cap), millis_to_wait_(millis_to_wait) {} + + // Allocate just keeps track of the number of outstanding allocations, + // not their sizes. Assume a constant size for each. + void* AllocateRaw(size_t alignment, size_t num_bytes) { + return retry_.AllocateRaw( + [this](size_t a, size_t nb, bool v) { + mutex_lock l(mu_); + if (memory_capacity_ > 0) { + --memory_capacity_; + return good_ptr_; + } else { + return static_cast<void*>(nullptr); + } + }, + millis_to_wait_, alignment, num_bytes); + } + + void DeallocateRaw(void* ptr) { + retry_.DeallocateRaw( + [this](void* p) { + mutex_lock l(mu_); + ++memory_capacity_; + }, + ptr); + } + + private: + GPUAllocatorRetry retry_; + void* good_ptr_ = reinterpret_cast<void*>(0xdeadbeef); + mutex mu_; + size_t memory_capacity_ GUARDED_BY(mu_); + int millis_to_wait_; +}; + +class GPUAllocatorRetryTest : public ::testing::Test { + protected: + GPUAllocatorRetryTest() {} + + void LaunchConsumerThreads(int num_consumers, int cap_needed) { + consumer_count_.resize(num_consumers, 0); + for (int i = 0; i < num_consumers; ++i) { + consumers_.push_back(Env::Default()->StartThread( + ThreadOptions(), "anon_thread", [this, i, cap_needed]() { + do { + void* ptr = nullptr; + for (int j = 0; j < cap_needed; ++j) { + ptr = alloc_->AllocateRaw(16, 1); + if (ptr == nullptr) { + mutex_lock l(mu_); + has_failed_ = true; + return; + } + } + ++consumer_count_[i]; + for (int j = 0; j < cap_needed; ++j) { + alloc_->DeallocateRaw(ptr); + } + } while (!notifier_.HasBeenNotified()); + })); + } + } + + // Wait up to wait_micros microseconds for has_failed_ to equal expected, + // then terminate all threads. + void JoinConsumerThreads(bool expected, int wait_micros) { + while (wait_micros > 0) { + { + mutex_lock l(mu_); + if (has_failed_ == expected) break; + } + int interval_micros = std::min(1000, wait_micros); + Env::Default()->SleepForMicroseconds(interval_micros); + wait_micros -= interval_micros; + } + notifier_.Notify(); + for (auto c : consumers_) { + // Blocks until thread terminates. + delete c; + } + } + + std::unique_ptr<FakeAllocator> alloc_; + std::vector<Thread*> consumers_; + std::vector<int> consumer_count_; + Notification notifier_; + mutex mu_; + bool has_failed_ GUARDED_BY(mu_) = false; + int count_ GUARDED_BY(mu_) = 0; +}; + +// Verifies correct retrying when memory is slightly overcommitted but +// we allow retry. 
+TEST_F(GPUAllocatorRetryTest, RetrySuccess) { + // Support up to 2 allocations simultaneously, waits up to 10 msec for + // a chance to alloc. + alloc_.reset(new FakeAllocator(2, 10000)); + // Launch 3 consumers, each of whom needs 1 unit at a time. + LaunchConsumerThreads(3, 1); + // This should be enough time for each consumer to be satisfied many times. + Env::Default()->SleepForMicroseconds(50000); + JoinConsumerThreads(false, 0); + for (int i = 0; i < 3; ++i) { + LOG(INFO) << "Consumer " << i << " is " << consumer_count_[i]; + } + { + mutex_lock l(mu_); + EXPECT_FALSE(has_failed_); + } + EXPECT_GT(consumer_count_[0], 0); + EXPECT_GT(consumer_count_[1], 0); + EXPECT_GT(consumer_count_[2], 0); +} + +// Verifies OutOfMemory failure when memory is slightly overcommitted +// and retry is not allowed. +TEST_F(GPUAllocatorRetryTest, NoRetryFail) { + // Support up to 2 allocations simultaneously, waits up to 0 msec for + // a chance to alloc. + alloc_.reset(new FakeAllocator(2, 0)); + // Launch 3 consumers, each of whom needs 1 unit at a time. + LaunchConsumerThreads(3, 1); + Env::Default()->SleepForMicroseconds(50000); + // Will wait up to 10 seconds for proper race condition to occur, resulting + // in failure. + JoinConsumerThreads(true, 10000000); + for (int i = 0; i < 3; ++i) { + LOG(INFO) << "Consumer " << i << " is " << consumer_count_[i]; + } + { + mutex_lock l(mu_); + EXPECT_TRUE(has_failed_); + } +} + +// Verifies OutOfMemory failure when retry is allowed but memory capacity +// is too low even for retry. +TEST_F(GPUAllocatorRetryTest, RetryInsufficientFail) { + // Support up to 2 allocations simultaneously, waits up to 10 msec for + // a chance to alloc. + alloc_.reset(new FakeAllocator(2, 10000)); + // Launch 3 consumers, each of whom needs 2 units at a time. We expect + // deadlock where 2 consumers each hold 1 unit, and timeout trying to + // get the second. + LaunchConsumerThreads(3, 2); + Env::Default()->SleepForMicroseconds(50000); + // Will wait up to 10 seconds for proper race condition to occur, resulting + // in failure. 
+ JoinConsumerThreads(true, 10000000); + for (int i = 0; i < 3; ++i) { + LOG(INFO) << "Consumer " << i << " is " << consumer_count_[i]; + } + { + mutex_lock l(mu_); + EXPECT_TRUE(has_failed_); + } +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc new file mode 100644 index 0000000000..3df833594f --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc @@ -0,0 +1,397 @@ +#include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h" + +#include "tensorflow/stream_executor/multi_platform_manager.h" +#include "tensorflow/stream_executor/stream_executor.h" +#include "tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h" +#include "tensorflow/core/common_runtime/gpu/gpu_init.h" +#include "tensorflow/core/lib/core/bits.h" +#include "tensorflow/core/lib/gtl/stl_util.h" +#include "tensorflow/core/lib/strings/numbers.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/port.h" + +namespace gpu = ::perftools::gputools; + +namespace tensorflow { + +GPUBFCAllocator::GPUBFCAllocator(int device_id, size_t total_memory) + : device_id_(device_id) { + // Get a pointer to the stream_executor for this device + stream_exec_ = GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie(); + + // Allocate the requested amount of memory. + gpu_memory_size_ = total_memory; + + LOG(INFO) << "Allocating " << strings::HumanReadableNumBytes(gpu_memory_size_) + << " bytes."; + gpu::DeviceMemory<char> gpu_mem = + stream_exec_->AllocateArray<char>(gpu_memory_size_); + + QCHECK(gpu_mem != nullptr) + << " Could not allocate GPU device memory for device " << device_id + << ". Tried to allocate " + << strings::HumanReadableNumBytes(gpu_memory_size_); + base_ptr_ = gpu_mem.opaque(); + LOG(INFO) << "GPU " << device_id << " memory begins at " << base_ptr_ + << " extends to " + << static_cast<void*>( + (static_cast<char*>(base_ptr_) + gpu_memory_size_)); + + // Create a bunch of bins of various good sizes. + + // Covers allocations of exactly 256 bytes (the minimum size). + bins_.insert(std::make_pair(256, new Bin(256))); + + // We create bins to fit all possible ranges that cover the + // gpu_memory_size_ starting from allocations up to 1024 bytes to + // allocations up to (and including) the memory limit. + for (size_t bin_size = 1024; bin_size < gpu_memory_size_ * 2; bin_size *= 2) { + LOG(INFO) << "Creating bin of max chunk size " + << strings::HumanReadableNumBytes(bin_size); + bins_.insert(std::make_pair(bin_size, new Bin(bin_size))); + } + + // Create one large chunk for the whole memory space that will + // be chunked later. + GPUBFCAllocator::Chunk* c = new GPUBFCAllocator::Chunk(); + c->ptr = gpu_mem.opaque(); + c->size = gpu_memory_size_; + c->in_use = false; + c->prev = nullptr; + c->next = nullptr; + + ptr_to_chunk_map_.insert(std::make_pair(c->ptr, c)); + + // Insert the chunk into the right bin. + ReassignChunkToBin(c); +} + +GPUBFCAllocator::~GPUBFCAllocator() { + // Return memory back. 
+ if (base_ptr_) { + gpu::DeviceMemoryBase gpu_ptr{base_ptr_}; + stream_exec_->Deallocate(&gpu_ptr); + } + + gtl::STLDeleteValues(&bins_); +} + +void* GPUBFCAllocator::AllocateRaw(size_t unused_alignment, size_t num_bytes) { + static const int64 kMaxMillisToWait = 10000; // 10 seconds + return retry_helper_.AllocateRaw( + [this](size_t a, size_t nb, bool v) { + return AllocateRawInternal(a, nb, v); + }, + kMaxMillisToWait, unused_alignment, num_bytes); +} + +void* GPUBFCAllocator::AllocateRawInternal(size_t unused_alignment, + size_t num_bytes, + bool dump_log_on_failure) { + if (num_bytes == 0) { + LOG(ERROR) << "tried to allocate 0 bytes"; + return nullptr; + } + // First, always allocate memory of at least 256 bytes, and always + // allocate multiples of 256 bytes so all memory addresses are + // nicely byte aligned. + size_t rounded_bytes = (256 * ((num_bytes + 255) / 256)); + DCHECK_EQ(0, rounded_bytes % 256); + + // The BFC allocator tries to find the best fit first. + // + // First identify the first bin that could satisfy rounded_bytes. + auto it = bins_.lower_bound(rounded_bytes); + if (it == bins_.end()) { + LOG(ERROR) << " Asked for " << rounded_bytes << " but largest bin was " + << bins_.rbegin()->first; + return nullptr; + } + + mutex_lock l(lock_); + for (; it != bins_.end(); ++it) { + // Start searching from the first bin for the smallest chunk that fits + // rounded_bytes. + Bin* b = it->second; + for (GPUBFCAllocator::Chunk* chunk : b->chunks) { + if (!chunk->in_use && chunk->size > rounded_bytes) { + // We found an existing chunk that fits us that wasn't in use. + chunk->in_use = true; + + // If we can break the size of the chunk into two reasonably + // large pieces, do so. + // + // TODO(vrv): What should be the criteria when deciding when + // to split? + if (chunk->size >= rounded_bytes * 2) { + SplitChunk(chunk, rounded_bytes); + } + + // The requested size of the returned chunk is what the user + // has allocated. + chunk->requested_size = num_bytes; + + VLOG(4) << "Returning: " << chunk->ptr; + return chunk->ptr; + } + } + } + + // We searched all bins for an existing free chunk to use and + // couldn't find one. This means we must have run out of memory, + // Dump the memory log for analysis. + if (dump_log_on_failure) { + DumpMemoryLog(rounded_bytes); + LOG(WARNING) << "Ran out of memory trying to allocate " + << strings::HumanReadableNumBytes(num_bytes) + << ". See logs for memory state"; + } + return nullptr; +} + +void GPUBFCAllocator::SplitChunk(GPUBFCAllocator::Chunk* c, size_t num_bytes) { + // Create a new chunk starting num_bytes after c + GPUBFCAllocator::Chunk* new_chunk = new GPUBFCAllocator::Chunk(); + new_chunk->ptr = static_cast<void*>(static_cast<char*>(c->ptr) + num_bytes); + VLOG(6) << "Adding to chunk map: " << new_chunk->ptr; + ptr_to_chunk_map_.insert(std::make_pair(new_chunk->ptr, new_chunk)); + + // Set the new sizes of the chunks. + new_chunk->size = c->size - num_bytes; + c->size = num_bytes; + + // The new chunk is not in use. + new_chunk->in_use = false; + + // Maintain the pointers. 
+ // c <-> c_neighbor becomes + // c <-> new_chunk <-> c_neighbor + GPUBFCAllocator::Chunk* c_neighbor = c->next; + new_chunk->prev = c; + new_chunk->next = c_neighbor; + c->next = new_chunk; + if (c_neighbor) { + c_neighbor->prev = new_chunk; + } + + // Maintain the bins + ReassignChunkToBin(new_chunk); + ReassignChunkToBin(c); +} + +void GPUBFCAllocator::DeallocateRaw(void* ptr) { + retry_helper_.DeallocateRaw([this](void* p) { DeallocateRawInternal(p); }, + ptr); +} + +void GPUBFCAllocator::DeallocateRawInternal(void* ptr) { + if (ptr == nullptr) { + LOG(ERROR) << "tried to deallocate nullptr"; + return; + } + mutex_lock l(lock_); + + // Find the chunk from the ptr. + auto it = ptr_to_chunk_map_.find(ptr); + CHECK(it != ptr_to_chunk_map_.end()) + << "Asked to deallocate a pointer we never allocated: " << ptr; + + GPUBFCAllocator::Chunk* c = it->second; + VLOG(6) << "Chunk at " << c->ptr << " no longer in use"; + // Mark the chunk as no longer in use + c->in_use = false; + + // Consider coalescing it. + MaybeCoalesce(c); +} + +// Merges c1 and c2 when c1->next is c2 and c2->prev is c1. +// We merge c2 into c1. +void GPUBFCAllocator::Merge(GPUBFCAllocator::Chunk* c1, + GPUBFCAllocator::Chunk* c2) { + // We can only merge chunks that are not in use. + DCHECK(!c1->in_use && !c2->in_use); + + // c1's prev doesn't change, still points to the same ptr, and is + // still not in use. + + // Fix up neighbor pointers + // + // c1 <-> c2 <-> c3 should become + // c1 <-> c3 + GPUBFCAllocator::Chunk* c3 = c2->next; + c1->next = c3; + CHECK(c2->prev == c1); + if (c3 != nullptr) { + c3->prev = c1; + } + + // Set the new size + c1->size += c2->size; + + // Delete c2 and cleanup all state + RemoveChunkFromBin(c2); +} + +void GPUBFCAllocator::ReassignChunkToBin(GPUBFCAllocator::Chunk* c) { + auto it = bins_.lower_bound(c->size); + CHECK(it != bins_.end()) << " Tried to reassign to non-existent bin for size " + << c->size; + + Bin* new_bin = it->second; + + // If the bin has not changed, do nothing. + Bin* old_bin = c->bin; + if (old_bin != nullptr && new_bin == old_bin) { + return; + } + + // The bin has changed. Add the chunk to the new bin and remove + // the chunk from the old bin. + new_bin->chunks.insert(c); + c->bin = new_bin; + + if (old_bin == nullptr) { + return; + } + + // Remove chunk from old bin + for (auto it = old_bin->chunks.begin(); it != old_bin->chunks.end(); ++it) { + if (*it == c) { + old_bin->chunks.erase(it); + return; + } + } + CHECK(false) << "Could not find chunk in old bin"; +} + +void GPUBFCAllocator::RemoveChunkFromBin(GPUBFCAllocator::Chunk* c) { + Bin* b = c->bin; + for (auto it = b->chunks.begin(); it != b->chunks.end(); ++it) { + Chunk* other_c = *it; + if (other_c->ptr == c->ptr) { + b->chunks.erase(it); + VLOG(4) << "Removing: " << c->ptr; + ptr_to_chunk_map_.erase(c->ptr); + delete c; + return; + } + } + + CHECK(false) << "Could not find chunk in bin"; +} + +void GPUBFCAllocator::MaybeCoalesce(GPUBFCAllocator::Chunk* c) { + // This chunk is no longer in-use, consider coalescing the chunk + // with adjacent chunks. + Chunk* chunk_to_reassign = nullptr; + + // If the next chunk is free, coalesce the two, if the result would + // fit in an existing bin. 
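+  // For example, an adjacent free 1KB chunk and a just-freed 3KB chunk are
+  // merged into a single 4KB chunk, which ReassignChunkToBin() then moves
+  // into the matching bin.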
+ if (c->next && !c->next->in_use) { + VLOG(8) << "Chunk at " << c->next->ptr << " merging with c " << c->ptr; + + chunk_to_reassign = c; + + // Deletes c->next + Merge(c, c->next); + } + + // If the previous chunk is free, coalesce the two + if (c->prev && !c->prev->in_use) { + VLOG(8) << "Chunk at " << c->ptr << " merging into c->prev " + << c->prev->ptr; + + chunk_to_reassign = c->prev; + + // Deletes c + Merge(c->prev, c); + } + + // Reassign the final merged chunk into the right bin. + if (chunk_to_reassign) { + ReassignChunkToBin(chunk_to_reassign); + } +} + +void GPUBFCAllocator::AddAllocVisitor(Visitor visitor) { + VLOG(1) << "AddVisitor"; + mutex_lock l(lock_); + region_visitors_.push_back(visitor); + visitor(base_ptr_, gpu_memory_size_); +} + +bool GPUBFCAllocator::TracksAllocationSizes() { return true; } + +size_t GPUBFCAllocator::RequestedSize(void* ptr) { + mutex_lock l(lock_); + auto it = ptr_to_chunk_map_.find(ptr); + CHECK(it != ptr_to_chunk_map_.end()) + << "Asked for requested size of pointer we never allocated: " << ptr; + GPUBFCAllocator::Chunk* c = it->second; + return c->requested_size; +} + +size_t GPUBFCAllocator::AllocatedSize(void* ptr) { + mutex_lock l(lock_); + auto it = ptr_to_chunk_map_.find(ptr); + CHECK(it != ptr_to_chunk_map_.end()) + << "Asked for allocated size of pointer we never allocated: " << ptr; + GPUBFCAllocator::Chunk* c = it->second; + return c->size; +} + +void GPUBFCAllocator::DumpMemoryLog(size_t num_bytes) { + // For each bin: tally up the total number of chunks and bytes. + for (auto bit : bins_) { + Bin* b = bit.second; + + size_t total_bytes_in_use = 0; + size_t total_bytes_in_bin = 0; + size_t total_requested_bytes_in_use = 0; + size_t total_requested_bytes_in_bin = 0; + size_t total_chunks_in_use = 0; + size_t total_chunks_in_bin = 0; + for (Chunk* c : b->chunks) { + total_bytes_in_bin += c->size; + total_requested_bytes_in_bin += c->requested_size; + ++total_chunks_in_bin; + if (c->in_use) { + total_bytes_in_use += c->size; + total_requested_bytes_in_use += c->requested_size; + ++total_chunks_in_use; + } + } + + LOG(INFO) << "Bin (" << b->bin_size + << "): \tTotal Chunks: " << total_chunks_in_bin + << ", Chunks in use: " << total_chunks_in_use << " " + << strings::HumanReadableNumBytes(total_bytes_in_bin) + << " allocated for chunks. " + << strings::HumanReadableNumBytes(total_requested_bytes_in_bin) + << " client-requested for chunks. " + << strings::HumanReadableNumBytes(total_bytes_in_use) + << " in use in bin. " + << strings::HumanReadableNumBytes(total_requested_bytes_in_use) + << " client-requested in use in bin."; + } + + // Find the bin that we would have liked to allocate in, so we + // can get some further analysis about fragmentation. 
+ auto it = bins_.lower_bound(num_bytes); + if (it != bins_.end()) { + Bin* b = it->second; + + LOG(INFO) << "Bin for " << strings::HumanReadableNumBytes(num_bytes) + << " was " << strings::HumanReadableNumBytes(b->bin_size) + << ", Chunk State: "; + + for (Chunk* c : b->chunks) { + LOG(INFO) << c->DebugString(true); + } + } +} + +} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h new file mode 100644 index 0000000000..3d1601e132 --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h @@ -0,0 +1,156 @@ +#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_BFC_ALLOCATOR_H_ +#define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_BFC_ALLOCATOR_H_ + +#include <memory> +#include <string> +#include <unordered_map> +#include <vector> + +#include "tensorflow/stream_executor/stream_executor.h" +#include "tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h" +#include "tensorflow/core/common_runtime/gpu/visitable_allocator.h" +#include "tensorflow/core/lib/gtl/stl_util.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/platform/thread_annotations.h" + +namespace tensorflow { + +// A GPU memory allocator that implements a 'best-fit with coalescing' +// algorithm. This is essentially a very simple version of Doug Lea's +// malloc (dlmalloc). +// +// The goal of this allocator is to support defragmentation via +// coalescing. One assumption we make is that the process using this +// allocator owns pretty much all of the GPU memory, and that nearly +// all requests to allocate GPU memory go through this interface. +class GPUBFCAllocator : public VisitableAllocator { + public: + // 'device_id' refers to the StreamExecutor ID of the device within + // the process and must reference a valid ID in the process. + explicit GPUBFCAllocator(int device_id, size_t total_memory); + ~GPUBFCAllocator() override; + + string Name() override { return "gpu_bfc"; } + void* AllocateRaw(size_t alignment, size_t num_bytes) override; + void DeallocateRaw(void* ptr) override; + + void AddAllocVisitor(Visitor visitor) override; + + // Does nothing, because gpu memory is never freed. + void AddFreeVisitor(Visitor visitor) override {} + + bool TracksAllocationSizes() override; + + size_t RequestedSize(void* ptr) override; + + size_t AllocatedSize(void* ptr) override; + + private: + struct Bin; + + void* AllocateRawInternal(size_t alignment, size_t num_bytes, + bool dump_log_on_failure); + void DeallocateRawInternal(void* ptr); + + // Chunks point to GPU memory. Their prev/next pointers form a + // doubly-linked list of addresses sorted by GPU base address that + // must be contiguous. Chunks contain information about whether + // they are in use or whether they are free, and contain a pointer + // to the bin they are in. + struct Chunk { + size_t size = 0; // Full size of GPU buffer. + + // We sometimes give chunks that are larger than needed to reduce + // fragmentation. requested_size keeps track of what the client + // actually wanted so we can understand whether our splitting + // strategy is efficient. + size_t requested_size = 0; + + bool in_use = false; + void* ptr = nullptr; // pointer to granted GPU subbuffer. + + // If not null, the memory referred to by 'prev' is directly + // preceding the memory used by this chunk. 
E.g., It should start + // at 'ptr - prev->size' + Chunk* prev = nullptr; + + // If not null, the memory referred to by 'next' is directly + // following the memory used by this chunk. E.g., It should be at + // 'ptr + size' + Chunk* next = nullptr; + + // What bin are we in? + Bin* bin = nullptr; + + string DebugString(bool recurse) { + string dbg; + strings::StrAppend(&dbg, " Size: ", strings::HumanReadableNumBytes(size), + " | Requested Size: ", + strings::HumanReadableNumBytes(requested_size), + " | in_use: ", in_use); + if (recurse && prev) { + strings::StrAppend(&dbg, ", prev: ", prev->DebugString(false)); + } + if (recurse && next) { + strings::StrAppend(&dbg, ", next: ", next->DebugString(false)); + } + return dbg; + } + }; + + Chunk* AllocateNewChunk(size_t num_bytes); + void SplitChunk(Chunk* c, size_t num_bytes); + void Merge(Chunk* c1, Chunk* c2); + void MaybeCoalesce(Chunk* c); + + void ReassignChunkToBin(Chunk* c); + void RemoveChunkFromBin(Chunk* c); + + void DumpMemoryLog(size_t num_bytes); + + // A Bin is a collection of similar-sized Chunks. + struct Bin { + // All chunks in this bin have >= bin_size memory. + size_t bin_size = 0; + + struct ChunkComparator { + bool operator()(Chunk* a, Chunk* b) { return a->size < b->size; } + }; + + // List of chunks within the bin, sorted by chunk size. + std::multiset<Chunk*, ChunkComparator> chunks; + + explicit Bin(size_t bs) : bin_size(bs) {} + + ~Bin() { gtl::STLDeleteElements(&chunks); } + }; + + GPUAllocatorRetry retry_helper_; + + // Structures immutable after construction + const int device_id_; + // The base pointer where all the GPU memory begins. + void* base_ptr_ = nullptr; + size_t gpu_memory_size_ = 0; + + // Map from bin size to Bin + // After construction, the bin map is never resized. + std::map<size_t, Bin*> bins_; + + perftools::gputools::StreamExecutor* stream_exec_; // Not owned. + + // Structures mutable after construction + mutable mutex lock_; + // Not owned. + std::unordered_map<void*, Chunk*> ptr_to_chunk_map_; + + // Called once on each region, ASAP. + std::vector<Visitor> region_visitors_; + + TF_DISALLOW_COPY_AND_ASSIGN(GPUBFCAllocator); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMMON_RUNTIME_GPU_GPU_BFC_ALLOCATOR_H_ diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc new file mode 100644 index 0000000000..7b5e8aec1d --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc @@ -0,0 +1,166 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h" + +#include <algorithm> +#include <vector> + +#include "tensorflow/stream_executor/stream_executor.h" +#include <gtest/gtest.h> +#include "tensorflow/core/common_runtime/gpu/gpu_init.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/lib/random/simple_philox.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/port.h" + +namespace gpu = ::perftools::gputools; + +namespace tensorflow { +namespace { + +TEST(GPUBFCAllocatorTest, NoDups) { + GPUBFCAllocator a(0, 1 << 30); + // Allocate a lot of raw pointers + std::vector<void*> ptrs; + for (int s = 1; s < 1024; s++) { + void* raw = a.AllocateRaw(1, s); + ptrs.push_back(raw); + } + + std::sort(ptrs.begin(), ptrs.end()); + + // Make sure none of them are equal, and that none of them overlap. 
+ for (int i = 0; i < ptrs.size(); i++) { + if (i > 0) { + ASSERT_NE(ptrs[i], ptrs[i - 1]); // No dups + size_t req_size = a.RequestedSize(ptrs[i - 1]); + ASSERT_GT(req_size, 0); + ASSERT_GE(static_cast<char*>(ptrs[i]) - static_cast<char*>(ptrs[i - 1]), + req_size); + } + } + + for (int i = 0; i < ptrs.size(); i++) { + a.DeallocateRaw(ptrs[i]); + } +} + +TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) { + GPUBFCAllocator a(0, 1 << 30); + // Allocate 256 raw pointers of sizes between 100 bytes and about + // a meg + random::PhiloxRandom philox(123, 17); + random::SimplePhilox rand(&philox); + + std::vector<void*> initial_ptrs; + for (int s = 1; s < 256; s++) { + size_t size = std::min<size_t>( + std::max<size_t>(rand.Rand32() % 1048576, 100), 1048576); + void* raw = a.AllocateRaw(1, size); + + initial_ptrs.push_back(raw); + } + + // Deallocate half of the memory, and keep track of the others. + std::vector<void*> existing_ptrs; + for (int i = 0; i < initial_ptrs.size(); i++) { + if (i % 2 == 1) { + a.DeallocateRaw(initial_ptrs[i]); + } else { + existing_ptrs.push_back(initial_ptrs[i]); + } + } + + // Allocate a lot of raw pointers + for (int s = 1; s < 256; s++) { + size_t size = std::min<size_t>( + std::max<size_t>(rand.Rand32() % 1048576, 100), 1048576); + void* raw = a.AllocateRaw(1, size); + existing_ptrs.push_back(raw); + } + + std::sort(existing_ptrs.begin(), existing_ptrs.end()); + // Make sure none of them are equal + for (int i = 0; i < existing_ptrs.size(); i++) { + if (i > 0) { + CHECK_NE(existing_ptrs[i], existing_ptrs[i - 1]); // No dups + + size_t req_size = a.RequestedSize(existing_ptrs[i - 1]); + ASSERT_GT(req_size, 0); + + // Check that they don't overlap. + ASSERT_GE(static_cast<char*>(existing_ptrs[i]) - + static_cast<char*>(existing_ptrs[i - 1]), + req_size); + } + } + + for (int i = 0; i < existing_ptrs.size(); i++) { + a.DeallocateRaw(existing_ptrs[i]); + } +} + +TEST(GPUBFCAllocatorTest, ExerciseCoalescing) { + GPUBFCAllocator a(0, 1 << 30); + + float* first_ptr = a.Allocate<float>(1024); + a.Deallocate(first_ptr); + for (int i = 0; i < 1024; ++i) { + // Allocate several buffers of different sizes, and then clean them + // all up. We should be able to repeat this endlessly without + // causing fragmentation and growth. + float* t1 = a.Allocate<float>(1024); + + int64* t2 = a.Allocate<int64>(1048576); + double* t3 = a.Allocate<double>(2048); + float* t4 = a.Allocate<float>(10485760); + + a.Deallocate(t1); + a.Deallocate(t2); + a.Deallocate(t3); + a.Deallocate(t4); + } + + // At the end, we should have coalesced all memory into one region + // starting at the beginning, so validate that allocating a pointer + // starts from this region. 
+ float* first_ptr_after = a.Allocate<float>(1024); + EXPECT_EQ(first_ptr, first_ptr_after); + a.Deallocate(first_ptr_after); +} + +TEST(GPUBFCAllocatorTest, AllocateZeroBufSize) { + GPUBFCAllocator a(0, 1 << 30); + float* ptr = a.Allocate<float>(0); + EXPECT_EQ(nullptr, ptr); +} + +TEST(GPUBFCAllocatorTest, TracksSizes) { + GPUBFCAllocator a(0, 1 << 30); + EXPECT_EQ(true, a.TracksAllocationSizes()); +} + +TEST(GPUBFCAllocatorTest, AllocatedVsRequested) { + GPUBFCAllocator a(0, 1 << 30); + float* t1 = a.Allocate<float>(1); + EXPECT_EQ(4, a.RequestedSize(t1)); + EXPECT_EQ(256, a.AllocatedSize(t1)); + a.Deallocate(t1); +} + +TEST(GPUBFCAllocatorTest, TestCustomMemoryLimit) { + // Configure a 1MiB byte limit + GPUBFCAllocator a(0, 1 << 20); + + float* first_ptr = a.Allocate<float>(1 << 6); + float* second_ptr = a.Allocate<float>(1 << 20); + + EXPECT_NE(nullptr, first_ptr); + EXPECT_EQ(nullptr, second_ptr); + a.Deallocate(first_ptr); +} + +} // namespace +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc new file mode 100644 index 0000000000..5ec405cd80 --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc @@ -0,0 +1,186 @@ +#include "tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h" + +#include "tensorflow/core/common_runtime/gpu/gpu_init.h" +#include "tensorflow/stream_executor/multi_platform_manager.h" +#include "tensorflow/stream_executor/stream_executor.h" + +namespace gpu = ::perftools::gputools; + +namespace tensorflow { + +#define MASK_WORDS 2 +#define MASK_BYTES (MASK_WORDS * sizeof(int64)) + +namespace { + +static int64* NewMask(int64 word) { + int64* m = new int64[MASK_WORDS]; + for (int i = 0; i < MASK_WORDS; ++i) { + m[i] = word; + } + return m; +} + +static int64* before_mask = NewMask(0xabababababababab); +static int64* after_mask = NewMask(0xcdcdcdcdcdcdcdcd); + +bool CheckMask(perftools::gputools::StreamExecutor* exec, void* ptr, + int64* mask) { + gpu::DeviceMemory<int64> gpu_ptr{gpu::DeviceMemoryBase{ptr, MASK_BYTES}}; + int64 tmp[MASK_WORDS]; + + if (!exec->SynchronousMemcpy(&tmp, gpu_ptr, MASK_BYTES)) { + LOG(FATAL) << "Could not copy debug mask"; + } + + bool ok = true; + for (int i = 0; i < MASK_WORDS; ++i) { + ok &= (mask[i] == tmp[i]); + if (!ok) { + LOG(ERROR) << "i=" << i + << " mask=" << reinterpret_cast<const void*>(mask[i]) + << " field=" << reinterpret_cast<const void*>(tmp[i]); + } + } + + return ok; +} + +void InitMask(perftools::gputools::StreamExecutor* exec, void* ptr, + int64* mask) { + gpu::DeviceMemory<int64> gpu_ptr{gpu::DeviceMemoryBase{ptr, MASK_BYTES}}; + if (!exec->SynchronousMemcpy(&gpu_ptr, mask, MASK_BYTES)) { + LOG(FATAL) << "Could not copy debug mask"; + } +} + +} // namespace + +// ----------------------------------------------------------------------------- +// GPUDebugAllocator +// ----------------------------------------------------------------------------- +GPUDebugAllocator::GPUDebugAllocator(VisitableAllocator* allocator, + int device_id) + : base_allocator_(allocator) { + stream_exec_ = GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie(); +} + +GPUDebugAllocator::~GPUDebugAllocator() { delete base_allocator_; } + +void* GPUDebugAllocator::AllocateRaw(size_t alignment, size_t num_bytes) { + num_bytes += (2 * MASK_BYTES); + + void* allocated_ptr = base_allocator_->AllocateRaw(alignment, num_bytes); + + // Return the pointer after the header + void* rv = 
static_cast<char*>(allocated_ptr) + MASK_BYTES; + + // Write the header at allocated_ptr + InitMask(stream_exec_, allocated_ptr, before_mask); + + // Write the footer at the end. + size_t req_size = base_allocator_->RequestedSize(allocated_ptr); + InitMask(stream_exec_, + static_cast<char*>(allocated_ptr) + req_size - MASK_BYTES, + after_mask); + return rv; +} +void GPUDebugAllocator::DeallocateRaw(void* ptr) { + CHECK(CheckHeader(ptr)) << "before_mask has been overwritten"; + CHECK(CheckFooter(ptr)) << "after_mask has been overwritten"; + + // Backtrack to the beginning of the header. + ptr = static_cast<void*>(static_cast<char*>(ptr) - MASK_BYTES); + // Deallocate the memory + base_allocator_->DeallocateRaw(ptr); +} + +void GPUDebugAllocator::AddAllocVisitor(Visitor visitor) { + return base_allocator_->AddAllocVisitor(visitor); +} + +void GPUDebugAllocator::AddFreeVisitor(Visitor visitor) { + return base_allocator_->AddFreeVisitor(visitor); +} + +bool GPUDebugAllocator::TracksAllocationSizes() { return true; } + +size_t GPUDebugAllocator::RequestedSize(void* ptr) { + auto req_size = + base_allocator_->RequestedSize(static_cast<char*>(ptr) - MASK_BYTES); + return req_size - 2 * MASK_BYTES; +} + +size_t GPUDebugAllocator::AllocatedSize(void* ptr) { + return base_allocator_->AllocatedSize(static_cast<char*>(ptr) - MASK_BYTES); +} + +bool GPUDebugAllocator::CheckHeader(void* ptr) { + return CheckMask(stream_exec_, static_cast<char*>(ptr) - MASK_BYTES, + before_mask); +} + +bool GPUDebugAllocator::CheckFooter(void* ptr) { + char* original_ptr = static_cast<char*>(ptr) - MASK_BYTES; + size_t req_size = base_allocator_->RequestedSize(original_ptr); + return CheckMask(stream_exec_, original_ptr + req_size - MASK_BYTES, + after_mask); +} + +// ----------------------------------------------------------------------------- +// GPUNanResetAllocator +// ----------------------------------------------------------------------------- +GPUNanResetAllocator::GPUNanResetAllocator(VisitableAllocator* allocator, + int device_id) + : base_allocator_(allocator) { + stream_exec_ = GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie(); +} + +GPUNanResetAllocator::~GPUNanResetAllocator() { delete base_allocator_; } + +void* GPUNanResetAllocator::AllocateRaw(size_t alignment, size_t num_bytes) { + void* allocated_ptr = base_allocator_->AllocateRaw(alignment, num_bytes); + + // Initialize the buffer to Nans + size_t req_size = base_allocator_->RequestedSize(allocated_ptr); + std::vector<float> nans(req_size / sizeof(float), std::nanf("")); + gpu::DeviceMemory<float> nan_ptr{ + gpu::DeviceMemoryBase{static_cast<float*>(allocated_ptr), req_size}}; + + if (!stream_exec_->SynchronousMemcpy(&nan_ptr, &nans[0], req_size)) { + LOG(ERROR) << "Could not initialize to NaNs"; + } + + return allocated_ptr; +} +void GPUNanResetAllocator::DeallocateRaw(void* ptr) { + // Reset the buffer to Nans + size_t req_size = base_allocator_->RequestedSize(ptr); + std::vector<float> nans(req_size / sizeof(float), std::nanf("")); + gpu::DeviceMemory<float> nan_ptr{ + gpu::DeviceMemoryBase{static_cast<float*>(ptr), req_size}}; + if (!stream_exec_->SynchronousMemcpy(&nan_ptr, &nans[0], req_size)) { + LOG(ERROR) << "Could not initialize to NaNs"; + } + + // Deallocate the memory + base_allocator_->DeallocateRaw(ptr); +} + +void GPUNanResetAllocator::AddAllocVisitor(Visitor visitor) { + return base_allocator_->AddAllocVisitor(visitor); +} + +void GPUNanResetAllocator::AddFreeVisitor(Visitor visitor) { + return 
base_allocator_->AddFreeVisitor(visitor); +} + +size_t GPUNanResetAllocator::RequestedSize(void* ptr) { + return base_allocator_->RequestedSize(ptr); +} + +size_t GPUNanResetAllocator::AllocatedSize(void* ptr) { + return base_allocator_->AllocatedSize(ptr); +} + +} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h new file mode 100644 index 0000000000..c9b564ffc4 --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h @@ -0,0 +1,68 @@ +#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_DEBUG_ALLOCATOR_H_ +#define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_DEBUG_ALLOCATOR_H_ + +#include <memory> +#include <string> +#include <unordered_map> +#include <vector> + +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/common_runtime/gpu/visitable_allocator.h" +#include "tensorflow/stream_executor/stream_executor.h" + +namespace tensorflow { + +// An allocator that wraps a GPU allocator and adds debugging +// functionality that verifies that users do not write outside their +// allocated memory. +class GPUDebugAllocator : public VisitableAllocator { + public: + explicit GPUDebugAllocator(VisitableAllocator* allocator, int device_id); + ~GPUDebugAllocator() override; + string Name() override { return "gpu_debug"; } + void* AllocateRaw(size_t alignment, size_t num_bytes) override; + void DeallocateRaw(void* ptr) override; + void AddAllocVisitor(Visitor visitor) override; + void AddFreeVisitor(Visitor visitor) override; + bool TracksAllocationSizes() override; + size_t RequestedSize(void* ptr) override; + size_t AllocatedSize(void* ptr) override; + + // For testing. + bool CheckHeader(void* ptr); + bool CheckFooter(void* ptr); + + private: + VisitableAllocator* base_allocator_ = nullptr; // owned + + perftools::gputools::StreamExecutor* stream_exec_; // Not owned. + + TF_DISALLOW_COPY_AND_ASSIGN(GPUDebugAllocator); +}; + +// An allocator that wraps a GPU allocator and resets the memory on +// allocation and free to 'NaN', helping to identify cases where the +// user forgets to initialize the memory. +class GPUNanResetAllocator : public VisitableAllocator { + public: + explicit GPUNanResetAllocator(VisitableAllocator* allocator, int device_id); + ~GPUNanResetAllocator() override; + string Name() override { return "gpu_nan_reset"; } + void* AllocateRaw(size_t alignment, size_t num_bytes) override; + void DeallocateRaw(void* ptr) override; + void AddAllocVisitor(Visitor visitor) override; + void AddFreeVisitor(Visitor visitor) override; + size_t RequestedSize(void* ptr) override; + size_t AllocatedSize(void* ptr) override; + + private: + VisitableAllocator* base_allocator_ = nullptr; // owned + + perftools::gputools::StreamExecutor* stream_exec_; // Not owned. 
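+
+  // Note: when combined with GPUDebugAllocator, this NaN-reset wrapper is
+  // meant to be the outer-most allocator, as exercised in
+  // gpu_debug_allocator_test.cc:
+  //   new GPUNanResetAllocator(
+  //       new GPUDebugAllocator(new GPUBFCAllocator(device_id, 1 << 30),
+  //                             device_id),
+  //       device_id);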
+ + TF_DISALLOW_COPY_AND_ASSIGN(GPUNanResetAllocator); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMMON_RUNTIME_GPU_GPU_DEBUG_ALLOCATOR_H_ diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc new file mode 100644 index 0000000000..5f63906576 --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc @@ -0,0 +1,207 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h" + +#include <algorithm> +#include <vector> + +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/common_runtime/gpu/gpu_init.h" +#include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h" +#include "tensorflow/stream_executor/multi_platform_manager.h" +#include "tensorflow/stream_executor/stream_executor.h" +#include <gtest/gtest.h> + +namespace gpu = ::perftools::gputools; + +namespace tensorflow { + +TEST(GPUDebugAllocatorTest, OverwriteDetection_None) { + const int device_id = 0; + GPUDebugAllocator a(new GPUBFCAllocator(device_id, 1 << 30), device_id); + auto stream_exec = + GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie(); + + for (int s : {8}) { + std::vector<int64> cpu_array(s); + memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64)); + int64* gpu_array = a.Allocate<int64>(cpu_array.size()); + gpu::DeviceMemory<int64> gpu_array_ptr{gpu::DeviceMemoryBase{gpu_array}}; + ASSERT_TRUE(stream_exec->SynchronousMemcpy(&gpu_array_ptr, &cpu_array[0], + s * sizeof(int64))); + EXPECT_TRUE(a.CheckHeader(gpu_array)); + EXPECT_TRUE(a.CheckFooter(gpu_array)); + + // Confirm no error on free. + a.DeallocateRaw(gpu_array); + } +} + +TEST(GPUDebugAllocatorTest, OverwriteDetection_Header) { + for (int s : {8, 211}) { + EXPECT_DEATH( + { + const int device_id = 0; + GPUDebugAllocator a(new GPUBFCAllocator(device_id, 1 << 30), + device_id); + auto stream_exec = + GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie(); + + std::vector<int64> cpu_array(s); + memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64)); + int64* gpu_array = a.Allocate<int64>(cpu_array.size()); + + gpu::DeviceMemory<int64> gpu_array_ptr{ + gpu::DeviceMemoryBase{gpu_array}}; + ASSERT_TRUE(stream_exec->SynchronousMemcpy( + &gpu_array_ptr, &cpu_array[0], cpu_array.size() * sizeof(int64))); + + gpu::DeviceMemory<int64> gpu_hdr_ptr{ + gpu::DeviceMemoryBase{gpu_array - 1}}; + // Clobber first word of the header. + float pi = 3.1417; + ASSERT_TRUE( + stream_exec->SynchronousMemcpy(&gpu_hdr_ptr, &pi, sizeof(float))); + + // Expect error on free. + a.Deallocate(gpu_array); + }, + ""); + } +} + +TEST(GPUDebugAllocatorTest, OverwriteDetection_Footer) { + for (int s : {8, 22}) { + EXPECT_DEATH( + { + const int device_id = 0; + GPUDebugAllocator a(new GPUBFCAllocator(device_id, 1 << 30), + device_id); + auto stream_exec = + GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie(); + + std::vector<int64> cpu_array(s); + memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64)); + int64* gpu_array = a.Allocate<int64>(cpu_array.size()); + + gpu::DeviceMemory<int64> gpu_array_ptr{ + gpu::DeviceMemoryBase{gpu_array}}; + ASSERT_TRUE(stream_exec->SynchronousMemcpy( + &gpu_array_ptr, &cpu_array[0], cpu_array.size() * sizeof(int64))); + + // Clobber word of the footer. 
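+          // AllocateRaw() lays the buffer out as
+          //   [before_mask | s int64 elements | after_mask],
+          // so the footer begins right past the user data, at gpu_array + s.
+          // Overwriting its first word should make CheckFooter() fail and the
+          // Deallocate() below die on that CHECK.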
+ gpu::DeviceMemory<int64> gpu_ftr_ptr{ + gpu::DeviceMemoryBase{gpu_array + s}}; + float pi = 3.1417; + ASSERT_TRUE( + stream_exec->SynchronousMemcpy(&gpu_ftr_ptr, &pi, sizeof(float))); + + // Expect error on free. + a.Deallocate(gpu_array); + }, + ""); + } +} + +TEST(GPUDebugAllocatorTest, ResetToNan) { + const int device_id = 0; + GPUNanResetAllocator a(new GPUBFCAllocator(device_id, 1 << 30), device_id); + auto stream_exec = + GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie(); + + std::vector<float> cpu_array(1024); + std::vector<float> cpu_array_result(1024); + + // Allocate 1024 floats + float* gpu_array = a.Allocate<float>(cpu_array.size()); + gpu::DeviceMemory<float> gpu_array_ptr{gpu::DeviceMemoryBase{gpu_array}}; + ASSERT_TRUE(stream_exec->SynchronousMemcpy(&cpu_array[0], gpu_array_ptr, + cpu_array.size() * sizeof(float))); + for (float f : cpu_array) { + ASSERT_FALSE(std::isfinite(f)); + } + + // Set one of the fields to 1.0. + cpu_array[0] = 1.0; + ASSERT_TRUE(stream_exec->SynchronousMemcpy(&gpu_array_ptr, &cpu_array[0], + cpu_array.size() * sizeof(float))); + // Copy the data back and verify. + ASSERT_TRUE( + stream_exec->SynchronousMemcpy(&cpu_array_result[0], gpu_array_ptr, + cpu_array_result.size() * sizeof(float))); + ASSERT_EQ(1.0, cpu_array_result[0]); + + // Free the array + a.Deallocate(gpu_array); + + // All values should be reset to nan. + ASSERT_TRUE( + stream_exec->SynchronousMemcpy(&cpu_array_result[0], gpu_array_ptr, + cpu_array_result.size() * sizeof(float))); + for (float f : cpu_array_result) { + ASSERT_FALSE(std::isfinite(f)); + } +} + +TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) { + const int device_id = 0; + // NaN reset must be the outer-most allocator. + GPUNanResetAllocator a( + new GPUDebugAllocator(new GPUBFCAllocator(device_id, 1 << 30), device_id), + device_id); + auto stream_exec = + GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie(); + + std::vector<float> cpu_array(1024); + std::vector<float> cpu_array_result(1024); + + // Allocate 1024 floats + float* gpu_array = a.Allocate<float>(cpu_array.size()); + gpu::DeviceMemory<float> gpu_array_ptr{gpu::DeviceMemoryBase{gpu_array}}; + ASSERT_TRUE(stream_exec->SynchronousMemcpy(&cpu_array[0], gpu_array_ptr, + cpu_array.size() * sizeof(float))); + for (float f : cpu_array) { + ASSERT_FALSE(std::isfinite(f)); + } + + // Set one of the fields to 1.0. + cpu_array[0] = 1.0; + ASSERT_TRUE(stream_exec->SynchronousMemcpy(&gpu_array_ptr, &cpu_array[0], + cpu_array.size() * sizeof(float))); + // Copy the data back and verify. + ASSERT_TRUE( + stream_exec->SynchronousMemcpy(&cpu_array_result[0], gpu_array_ptr, + cpu_array_result.size() * sizeof(float))); + ASSERT_EQ(1.0, cpu_array_result[0]); + + // Free the array + a.Deallocate(gpu_array); + + // All values should be reset to nan. 
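+  // The underlying BFC region stays mapped after the free, so this copy from
+  // the just-deallocated pointer still succeeds. NaN is not finite, so
+  // !std::isfinite() confirms every element was overwritten by the reset.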
+ ASSERT_TRUE( + stream_exec->SynchronousMemcpy(&cpu_array_result[0], gpu_array_ptr, + cpu_array_result.size() * sizeof(float))); + for (float f : cpu_array_result) { + ASSERT_FALSE(std::isfinite(f)); + } +} + +TEST(GPUDebugAllocatorTest, TracksSizes) { + GPUDebugAllocator a(new GPUBFCAllocator(0, 1 << 30), 0); + EXPECT_EQ(true, a.TracksAllocationSizes()); +} + +TEST(GPUDebugAllocatorTest, AllocatedVsRequested) { + GPUNanResetAllocator a( + new GPUDebugAllocator(new GPUBFCAllocator(0, 1 << 30), 0), 0); + float* t1 = a.Allocate<float>(1); + EXPECT_EQ(4, a.RequestedSize(t1)); + EXPECT_EQ(256, a.AllocatedSize(t1)); + a.Deallocate(t1); +} + +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc new file mode 100644 index 0000000000..26d34645f1 --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc @@ -0,0 +1,651 @@ +// TODO(opensource): Use a more generic sounding preprocessor name than +// GOOGLE_CUDA +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/common_runtime/gpu/gpu_device.h" + +#include <stdlib.h> +#include <string.h> + +//#include "base/commandlineflags.h" +#include "tensorflow/stream_executor/cuda/cuda_activation.h" +#include "tensorflow/stream_executor/multi_platform_manager.h" +#include "tensorflow/stream_executor/stream.h" +#include "tensorflow/stream_executor/stream_executor.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" +#include "tensorflow/core/common_runtime/gpu/gpu_init.h" +#include "tensorflow/core/common_runtime/gpu/gpu_stream_util.h" +#include "tensorflow/core/common_runtime/gpu/gpu_util.h" +#include "tensorflow/core/common_runtime/gpu/process_state.h" +#include "tensorflow/core/common_runtime/gpu_device_context.h" +#include "tensorflow/core/common_runtime/local_device.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/graph/types.h" +#include "tensorflow/core/lib/gtl/stl_util.h" +#include "tensorflow/core/lib/strings/numbers.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/platform/tracing.h" +#include "tensorflow/core/public/session_options.h" +#include "tensorflow/core/public/status.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/util/device_name_utils.h" + +#if defined(PLATFORM_GOOGLE) +DEFINE_bool(brain_gpu_sync_every_op, false, + "If true, call GPUUtil::Sync() between every dispatched opkernel."); + +DEFINE_int32(brain_gpu_max_streams, 1, + "Max number of GPU streams to use for computation."); +#else +// TODO(opensource): These should be made options in some options struct, +// rather than flags. +bool FLAGS_brain_gpu_sync_every_op = false; +tensorflow::int32 FLAGS_brain_gpu_max_streams = 1; +#endif + +namespace gpu = ::perftools::gputools; + +namespace tensorflow { + +// Eigen Ops directly allocate memory only for temporary buffers used +// during OpKernel::Compute(). The recommended way of allocating such +// memory is via OpKernelContext::allocate_temp(). 
However, Eigen Ops +// don't have access to OpKernelContext, instead they get access to +// memory directly through the device allocator. As an Open Source +// project, Eigen assumes allocator semantics similar to those of the +// CUDA memory allocator, and may not work correctly due to race +// conditions if used with some other allocator. For safety, we need +// to delay deallocation calls out of Eigen until all events on the +// corresponding stream have completed. The following two classes +// serve this purpose in two different compilation environments. + +#if defined(__GCUDACC__) || defined(__GCUDACC_HOST__) +class EigenAllocator : public ::Eigen::Allocator { + public: + explicit EigenAllocator(gpu::Stream* stream, ::tensorflow::Allocator* alloc, + EventMgr* em) + : stream_(stream), allocator_(alloc), em_(em) {} + + void* allocate(size_t num_bytes) const override { + void* ret = allocator_->AllocateRaw(32 /* alignment */, num_bytes); + // Eigen doesn't typically check the return pointer from allocate, + // so we do it here and die with a more helpful error message. + if (ret == nullptr) { + LOG(FATAL) << "EigenAllocator for GPU ran out of memory when allocating " + << num_bytes << ". See error logs for more detailed info."; + } + return ret; + } + + void deallocate(void* buffer) const override { + em_->ThenDeleteBuffer(stream_, {allocator_, buffer}); + } + + private: + gpu::Stream* stream_; // Not owned. + ::tensorflow::Allocator* allocator_; // Not owned. + ::tensorflow::EventMgr* em_; // Not owned. + + TF_DISALLOW_COPY_AND_ASSIGN(EigenAllocator); +}; + +#else +class EigenCudaStreamDevice : public ::Eigen::StreamInterface { + public: + EigenCudaStreamDevice(const cudaStream_t* cuda_stream, int gpu_id, + ::tensorflow::Allocator* alloc) + : stream_(cuda_stream), allocator_(alloc) { + Eigen::initializeDeviceProp(); + device_prop_ = &Eigen::m_deviceProperties[gpu_id]; + } + + const cudaStream_t& stream() const override { return *stream_; } + const cudaDeviceProp& deviceProperties() const override { + return *device_prop_; + } + + void* allocate(size_t num_bytes) const override { + void* ret = allocator_->AllocateRaw(32 /* alignment */, num_bytes); + if (ret == nullptr) { + LOG(FATAL) << "EigenAllocator for GPU ran out of memory when allocating " + << num_bytes << ". See error logs for more detailed info."; + } + + return ret; + } + void deallocate(void* buffer) const override { + AsyncFreeData* afData = new AsyncFreeData(allocator_, buffer); + cudaError_t err = cudaStreamAddCallback(*stream_, asyncFree, afData, 0); + CHECK_EQ(err, cudaSuccess); + } + + private: + struct AsyncFreeData { + AsyncFreeData(::tensorflow::Allocator* a, void* p) + : allocator_(a), address_(p) {} + ::tensorflow::Allocator* allocator_; + void* address_; + }; + + static void CUDART_CB asyncFree(cudaStream_t stream, cudaError_t status, + void* userData) { + AsyncFreeData* data = static_cast<AsyncFreeData*>(userData); + data->allocator_->DeallocateRaw(data->address_); + delete data; + } + + const cudaStream_t* stream_; // Not owned. + const cudaDeviceProp* device_prop_; // Not owned. + ::tensorflow::Allocator* allocator_; // Not owned. 
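+
+  // Note: deallocate() above does not release memory immediately; it queues
+  // asyncFree() as a stream callback, so DeallocateRaw() runs only after all
+  // work enqueued on the stream before the free has completed. This provides
+  // the delayed deallocation described at the top of this file.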
+ + TF_DISALLOW_COPY_AND_ASSIGN(EigenCudaStreamDevice); +}; + +#endif + +BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name, + Bytes memory_limit, BusAdjacency bus_adjacency, + int gpu_id, const string& physical_device_desc, + Allocator* gpu_allocator, Allocator* cpu_allocator) + : LocalDevice(options, Device::BuildDeviceAttributes( + name, DEVICE_GPU, memory_limit, bus_adjacency, + physical_device_desc), + gpu_allocator), + gpu_allocator_(gpu_allocator), + cpu_allocator_(cpu_allocator), + gpu_id_(gpu_id) { + gpu::StreamExecutor* executor = + GPUMachineManager()->ExecutorForDevice(gpu_id_).ValueOrDie(); + if (!executor) { + LOG(ERROR) << "Failed to get StreamExecutor for device " << gpu_id_; + return; + } + em_.reset(new EventMgr(executor)); + + if (FLAGS_brain_gpu_max_streams < 1) { + LOG(FATAL) << "Invalid value for brain_gpu_max_streams."; + } + + // Create the specified number of GPU streams + for (int i = 0; i < FLAGS_brain_gpu_max_streams; i++) { + auto stream = new gpu::Stream(executor); + stream->Init(); + VLOG(2) << "Created stream[" << i << "] = " << stream; + streams_.push_back(stream); + device_contexts_.push_back(new GPUDeviceContext(i, stream)); + } + gpu_device_info_ = new GpuDeviceInfo; + gpu_device_info_->stream = streams_[0]; + gpu_device_info_->default_context = device_contexts_[0]; + gpu_device_info_->event_mgr = em_.get(); + set_tensorflow_gpu_device_info(gpu_device_info_); +} + +BaseGPUDevice::~BaseGPUDevice() { + delete gpu_device_info_; + for (auto ctx : device_contexts_) ctx->Unref(); + gtl::STLDeleteElements(&streams_); +} + +Status BaseGPUDevice::FillContextMap(const Graph* graph, + DeviceContextMap* device_context_map) { + VLOG(2) << "FillContextMap"; + + const auto num_streams = streams_.size(); + // Special case for single stream. + if (num_streams == 1) { + return Status::OK(); + } + const int64 before = Env::Default()->NowMicros(); + gpu_stream_util::AssignStreamsOpts opts; + opts.max_streams = num_streams; + std::unordered_map<int, int> node_to_stream_id; + TF_RETURN_IF_ERROR( + gpu_stream_util::AssignStreams(graph, opts, &node_to_stream_id)); + int64 elapsed = Env::Default()->NowMicros() - before; + VLOG(3) << "AssignStreams took " << elapsed << "us"; + + // Fill in the context map. It is OK for this map to contain + // duplicate DeviceContexts so long as we increment the refcount. + for (Node* n : graph->nodes()) { + auto mapped_stream = node_to_stream_id[n->id()]; + CHECK_LE(mapped_stream, num_streams); + auto ctx = device_contexts_[mapped_stream]; + VLOG(3) << "Assigned stream " << node_to_stream_id[n->id()] + << " ==> stream[" << ctx->stream_id() << "] for node id " << n->id() + << " " << n->type_string() << " " << n->name(); + ctx->Ref(); + device_context_map->insert(std::make_pair(n->id(), ctx)); + } + + return Status::OK(); +} + +void BaseGPUDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) { + // ScopedActivity is cheap when tracing is not active, but we + // can avoid computing the Hash64. + // TODO(pbar) This would no longer be needed if Ops have a unique id. + const uint64 id = port::Tracing::IsActive() ? 
Hash64(op_kernel->name()) : 0; + port::Tracing::ScopedActivity region(port::Tracing::EventCategory::kCompute, + id); + + GPUDeviceContext* gpu_device_context = device_contexts_[0]; + if (context->op_device_context() != nullptr) { + gpu_device_context = + static_cast<GPUDeviceContext*>(context->op_device_context()); + } + gpu::Stream* stream = gpu_device_context->stream(); + const auto stream_id = gpu_device_context->stream_id(); + + VLOG(1) << "GpuDevice::Compute " << op_kernel->name() << " op " + << op_kernel->def().op() << " on GPU" << gpu_id_ << " stream[" + << stream_id << "]"; + + // NOTE(tucker): We need to discriminate between Eigen GPU + // operations and all others. If an operation is Eigen + // implemented (or otherwise tries to launch a cuda kernel + // directly), we need to establish a stacked-scoped environment + // that directs it to execute on the proper device. Otherwise we + // expect the Op to use StreamExecutor directly and correctly. The + // way we make this discrimination is quite hacky: At the moment + // the only non-Eigen GPU Op is the recv-op, which is known to be + // asynchronous. + if (op_kernel->type_string() == "_Recv") { + context->SetStatus(errors::Internal( + "Invalid synchronous 'Compute' on GPU for '_Recv' op")); + } else { + const string label = + strings::StrCat(op_kernel->name(), ":", op_kernel->type_string()); + port::Tracing::ScopedAnnotation annotation(label); + + const auto num_streams = streams_.size(); + if (num_streams > 1) { + // If this op's device context is different from the other contexts, + // we must wait on the stream. + for (int i = 0; i < context->num_inputs(); ++i) { + const GPUDeviceContext* idc = + static_cast<GPUDeviceContext*>(context->input_device_context(i)); + OP_REQUIRES(context, idc != nullptr, + errors::Internal("Input device context ", i, + " was not set properly.")); + if (VLOG_IS_ON(2)) { + const void* base; + size_t len; + if (context->has_input(i)) { + if (IsRefType(context->input_dtype(i))) { + Tensor tensor = context->mutable_input(i, false); + base = DMAHelper::base(&tensor); + len = tensor.TotalBytes(); + } else { + const Tensor& tensor = context->input(i); + base = DMAHelper::base(&tensor); + len = tensor.TotalBytes(); + } + VLOG(2) << "Input " << i << " " << base << " " << len; + VLOG(2) << " stream[" << stream_id << "].ThenWaitFor(stream[" + << idc->stream_id() << "])" + << ((idc->stream() == stream) ? " not needed" : ""); + } + } + if (idc->stream() != stream) stream->ThenWaitFor(idc->stream()); + } + } + gpu::cuda::ScopedActivateExecutorContext scoped_activation{ + stream->parent(), gpu::cuda::MultiOpActivation::kYes}; + // Keep a copy of the inputs before Compute runs, in case they get + // deleted. TODO(misard) this will be fixed when the tracking is + // done right. + std::vector<Tensor>* tensor_refs = nullptr; + if (!FLAGS_brain_gpu_sync_every_op) { + tensor_refs = new std::vector<Tensor>; + tensor_refs->reserve(context->num_inputs() + context->num_outputs()); + for (int ii = 0; ii < context->num_inputs(); ++ii) { + if (context->has_input(ii)) { + if (IsRefType(context->input_dtype(ii))) { + Tensor in = context->mutable_input(ii, false); + tensor_refs->push_back(in); + } else { + const Tensor& in = context->input(ii); + tensor_refs->push_back(in); + } + } + } + } + op_kernel->Compute(context); + if (context->status().ok()) { + if (FLAGS_brain_gpu_sync_every_op) { + // Note: GPUUtil::Sync() only syncs the default stream. + // We need to either sync the stream used by this op, or + // all streams. 
Given that this flag is typically used for + // debugging it makes more sense to sync all GPU activity. + context->SetStatus(GPUUtil::SyncAll(this)); + } else { + // The GPU kernel has been queued, but may not complete for some + // time. As soon as this function completes, the caller will + // discard its refs on the inputs, outputs and any scratch + // tensors it created. Create additional refs here that will be + // held until the kernel completes. + for (int ii = 0; ii < context->num_temps(); ++ii) { + Tensor* temp = context->temp(ii); + VLOG(2) << "Saving ref to temp Tensor @ " << DMAHelper::base(temp); + tensor_refs->push_back(*temp); + } + for (int ii = 0; ii < context->num_outputs(); ++ii) { + Tensor* temp = context->mutable_output(ii); + if (nullptr != temp) { + tensor_refs->push_back(*temp); + } + } + em_->ThenDeleteTensors(stream, tensor_refs); + } + } else { + if (!FLAGS_brain_gpu_sync_every_op) { + delete tensor_refs; + } + } + } +} + +Status BaseGPUDevice::Sync() { return GPUUtil::Sync(this); } + +void BaseGPUDevice::ComputeAsync(AsyncOpKernel* op_kernel, + OpKernelContext* context, + AsyncOpKernel::DoneCallback done) { + GPUDeviceContext* gpu_device_context = device_contexts_[0]; + if (context->op_device_context() != nullptr) { + gpu_device_context = + static_cast<GPUDeviceContext*>(context->op_device_context()); + } + const auto stream_id = gpu_device_context->stream_id(); + + VLOG(1) << "GpuDevice::ComputeAsync " << op_kernel->name() << " op " + << op_kernel->def().op() << " on GPU" << gpu_id_ << " stream[" + << stream_id << "]"; + + port::Tracing::TraceMe activity( + strings::StrCat(op_kernel->name(), ":", op_kernel->type_string())); + op_kernel->ComputeAsync(context, done); +} + +Status BaseGPUDevice::MakeTensorFromProto(const TensorProto& tensor_proto, + const AllocatorAttributes alloc_attrs, + Tensor* tensor) { + AllocatorAttributes attr; + attr.set_on_host(true); + attr.set_gpu_compatible(true); + Allocator* host_alloc = GetAllocator(attr); + Tensor parsed(tensor_proto.dtype()); + if (!parsed.FromProto(host_alloc, tensor_proto)) { + return errors::InvalidArgument("Cannot parse tensor from proto: ", + tensor_proto.DebugString()); + } + Status status; + if (alloc_attrs.on_host()) { + *tensor = parsed; + } else { + if (!DMAHelper::CanUseDMA(&parsed)) { + return errors::Internal("GPU copy from non-DMA ", + DataTypeString(parsed.dtype()), " tensor"); + } + Tensor copy(GetAllocator(alloc_attrs), parsed.dtype(), parsed.shape()); + port::Tracing::ScopedAnnotation annotation("MakeTensorFromProto"); + Notification n; + device_contexts_[0]->CopyCPUTensorToDevice(&parsed, this, ©, + [&n, &status](const Status& s) { + status = s; + n.Notify(); + }); + n.WaitForNotification(); + *tensor = copy; + } + return status; +} + +namespace { +#if defined(__GCUDACC__) || defined(__GCUDACC_HOST__) +class ConcretePerOpGpuDevice : public PerOpGpuDevice { + public: + explicit ConcretePerOpGpuDevice(gpu::Stream* stream, + EigenAllocator* allocator) + : device_(stream, allocator), allocator_(allocator) {} + ~ConcretePerOpGpuDevice() { delete allocator_; } + + const Eigen::GpuDevice& device() const override { return device_; } + + private: + Eigen::GpuDevice device_; + EigenAllocator* allocator_; +}; +#else +class ConcretePerOpGpuDevice : public PerOpGpuDevice { + public: + explicit ConcretePerOpGpuDevice(EigenCudaStreamDevice* stream_device) + : device_(stream_device), stream_device_(stream_device) {} + ~ConcretePerOpGpuDevice() { delete stream_device_; } + + const Eigen::GpuDevice& device() const 
override { return device_; } + + private: + Eigen::GpuDevice device_; + EigenCudaStreamDevice* stream_device_; +}; +#endif +} // namespace + +const PerOpGpuDevice* BaseGPUDevice::NewDevice(int stream_id, + Allocator* allocator) { +#if defined(__GCUDACC__) || defined(__GCUDACC_HOST__) + auto ea = new EigenAllocator(streams_[stream_id], allocator, em_.get()); + return new ConcretePerOpGpuDevice(streams_[stream_id], ea); +#else + const cudaStream_t* cuda_stream = reinterpret_cast<const cudaStream_t*>( + streams_[stream_id]->implementation()->CudaStreamMemberHack()); + auto es = new EigenCudaStreamDevice(cuda_stream, gpu_id_, allocator); + return new ConcretePerOpGpuDevice(es); +#endif +} + +const PerOpGpuDevice* BaseGPUDevice::MakeGpuDevice(DeviceContext* dc, + Allocator* allocator) { + if (dc) { + const GPUDeviceContext* gpu_dc = static_cast<GPUDeviceContext*>(dc); + const int stream_id = gpu_dc->stream_id(); + VLOG(1) << " eigen_gpu_device(" << dc << ") => stream[" << stream_id + << "]"; + CHECK_LT(stream_id, streams_.size()); + return NewDevice(stream_id, allocator); + } else { + return NewDevice(0, allocator); + } +} + +void BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options, + const string& name_prefix, + std::vector<Device*>* devices) { + int n = INT_MAX; + auto iter = options.config.device_count().find("GPU"); + if (iter != options.config.device_count().end()) { + n = iter->second; + } + std::vector<int> valid_gpu_ids; + GetValidDeviceIds(&valid_gpu_ids); + if (static_cast<size_t>(n) > valid_gpu_ids.size()) { + n = valid_gpu_ids.size(); + } + for (int i = 0; i < n; i++) { + devices->push_back(CreateGPUDevice( + options, strings::StrCat(name_prefix, "/gpu:", i), valid_gpu_ids[i])); + } +} + +namespace { +int64 MinSystemMemory(int64 available_memory) { + // We use the following heuristic for now: + // + // If the available_memory is < 2GiB, we allocate 200MiB to system memory. + // Otherwise, allocate 300MiB to system memory. + // + // In the future we could be more sophisticated by using a table of + // devices. + if (available_memory < (1LL << 31)) { + // 200MiB + return 209715200LL; + } else { + // max(300 MiB, 0.95 * available_memory) + return std::max(314572800LL, static_cast<int64>(available_memory * 0.05)); + } +} +} // namespace + +static string GetShortDeviceDescription(int device_id, + const gpu::DeviceDescription& desc) { + return strings::StrCat("device: ", device_id, ", name: ", desc.name(), + ", pci bus id: ", desc.pci_bus_id()); +} + +LocalDevice* BaseGPUDeviceFactory::CreateGPUDevice( + const SessionOptions& options, const string& name, int gpu_id) { + CHECK_GE(gpu_id, 0); + + // Look up the device, to see its attributes. 
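+  // Then decide how much of its memory to hand to the per-GPU allocator:
+  // everything except MinSystemMemory() when per_process_gpu_memory_fraction
+  // is unset (0), otherwise that fraction of the available memory.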
+ gpu::Platform* gpu_platform = GPUMachineManager(); + CHECK_LT(gpu_id, gpu_platform->VisibleDeviceCount()); + gpu::StreamExecutor* se = + gpu_platform->ExecutorForDevice(gpu_id).ValueOrDie(); + const gpu::DeviceDescription& desc = se->GetDeviceDescription(); + + int64 total_memory, available_memory; + CHECK(se->DeviceMemoryUsage(&available_memory, &total_memory)); + + int64 allocated_memory = available_memory; + double config_memory_fraction = + options.config.gpu_options().per_process_gpu_memory_fraction(); + if (config_memory_fraction == 0) { + const int64 min_system_memory = MinSystemMemory(available_memory); + if (min_system_memory < allocated_memory) { + allocated_memory -= min_system_memory; + } + } else { + allocated_memory *= config_memory_fraction; + } + + Bytes allocated_bytes = static_cast<Bytes>(allocated_memory); + + // Get GPU BusAdjacency from its reported NUMA affinity. + // Because GPUs are virtualized in some environments, we can't just + // use the GPU id. + BusAdjacency bus_adjacency = BUS_ANY; + switch (desc.numa_node()) { + case 0: + bus_adjacency = BUS_0; + break; + case 1: + bus_adjacency = BUS_1; + break; + default: + bus_adjacency = BUS_ANY; + } + VLOG(1) << "GPUDevice id " << gpu_id << " on bus " << bus_adjacency + << " numa: " << desc.numa_node() << " pci: " << desc.pci_bus_id(); + + ProcessState* process_state = ProcessState::singleton(); + return CreateGPUDevice( + options, name, allocated_bytes, bus_adjacency, gpu_id, + GetShortDeviceDescription(gpu_id, desc), + process_state->GetGPUAllocator(gpu_id, allocated_memory), + process_state->GetCPUAllocator(desc.numa_node())); +} + +static int GetMinGPUMultiprocessorCount() { + static const int kDefaultMinGPUMultiprocessorCount = 8; + + const char* tf_min_gpu_core_count = getenv("TF_MIN_GPU_MULTIPROCESSOR_COUNT"); + + if (tf_min_gpu_core_count == nullptr || + strcmp(tf_min_gpu_core_count, "") == 0) { + return kDefaultMinGPUMultiprocessorCount; + } + + int min_gpu_core_count = -1; + if (strings::safe_strto32(tf_min_gpu_core_count, &min_gpu_core_count)) { + if (min_gpu_core_count >= 0) { + return min_gpu_core_count; + } + } + + LOG(ERROR) << "Invalid minimum GPU multiprocessor count: [" + << tf_min_gpu_core_count << "]. " + << "Using the default value: " + << kDefaultMinGPUMultiprocessorCount; + return kDefaultMinGPUMultiprocessorCount; +} + +void BaseGPUDeviceFactory::GetValidDeviceIds(std::vector<int>* ids) { + auto gpu_manager = GPUMachineManager(); + int min_gpu_core_count = GetMinGPUMultiprocessorCount(); + if (gpu_manager) { + auto visible_device_count = gpu_manager->VisibleDeviceCount(); + for (int i = 0; i < gpu_manager->VisibleDeviceCount(); ++i) { + auto exec_status = gpu_manager->ExecutorForDevice(i); + if (!exec_status.ok()) { + continue; + } + gpu::StreamExecutor* se = exec_status.ValueOrDie(); + const gpu::DeviceDescription& desc = se->GetDeviceDescription(); + int major, minor; + if (!desc.cuda_compute_capability(&major, &minor)) { + continue; + } + // Only consider GPUs with compute capability >= 3.5 (Kepler or + // higher) + if (major < 3 || (major == 3 && minor < 5)) { + LOG(INFO) << "Ignoring gpu device " + << "(" << GetShortDeviceDescription(i, desc) << ") " + << "with Cuda compute capability " << major << "." << minor + << ". The minimum required Cuda capability is 3.5."; + continue; + } + + // TensorFlow currently places computation on devices assuming + // they have similar capability. 
+ // + // If there are multiple GPUs available on the machine, only + // consider GPUs with 8 or more multiprocessors. + // + // TODO(vrv): In the medium term: we should only filter out GPUs + // that are slow relative to the fastest GPU. In the long term, + // TensorFlow should support automatic placement based on + // capability. + if (visible_device_count > 1) { + if (desc.core_count() < min_gpu_core_count) { + LOG(INFO) << "Ignoring gpu device " + << "(" << GetShortDeviceDescription(i, desc) << ") " + << "with Cuda multiprocessor count: " << desc.core_count() + << ". The minimum required count is " << min_gpu_core_count + << ". You can adjust this requirement with the env var " + "TF_MIN_GPU_MULTIPROCESSOR_COUNT."; + continue; + } + } + + int new_id = ids->size(); + ids->push_back(i); + + LOG(INFO) << "Creating TensorFlow device (/gpu:" << new_id << ") -> " + << "(" << GetShortDeviceDescription(i, desc) << ")"; + } + } +} + +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h new file mode 100644 index 0000000000..a415224d95 --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/gpu_device.h @@ -0,0 +1,94 @@ +#if !GOOGLE_CUDA +#error This file must only be included when building with Cuda support +#endif + +#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_DEVICE_H_ +#define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_DEVICE_H_ + +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/gpu_device_context.h" +#include "tensorflow/core/common_runtime/local_device.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/public/session_options.h" +#include "tensorflow/core/public/status.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" +#include "tensorflow/stream_executor/stream.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +class EigenAllocator; + +class BaseGPUDevice : public LocalDevice { + public: + BaseGPUDevice(const SessionOptions& options, const string& name, + Bytes memory_limit, BusAdjacency bus_adjacency, int gpu_id, + const string& physical_device_desc, Allocator* gpu_allocator, + Allocator* cpu_allocator); + + ~BaseGPUDevice() override; + + // GPU devices require the Op Compute method to save a reference to + // any temporary tensors that are allocated until the Op execution + // completes. + bool SaveTemporaryTensors() const override { return true; } + + Status FillContextMap(const Graph* graph, + DeviceContextMap* device_context_map); + + void Compute(OpKernel* op_kernel, OpKernelContext* context) override; + + Status Sync() override; + + void ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context, + AsyncOpKernel::DoneCallback done) override; + + Status MakeTensorFromProto(const TensorProto& tensor_proto, + const AllocatorAttributes alloc_attrs, + Tensor* tensor) override; + + // The caller owns the returned device. 
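+  // The returned PerOpGpuDevice wraps an Eigen::GpuDevice bound to the stream
+  // selected by 'dc' (stream 0 when 'dc' is null).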
+ const PerOpGpuDevice* MakeGpuDevice(DeviceContext* dc, + Allocator* allocator) override; + + protected: + Allocator* gpu_allocator_; // not owned + Allocator* cpu_allocator_; // not owned + + private: + std::vector<gpu::Stream*> streams_; + std::vector<GPUDeviceContext*> device_contexts_; + GpuDeviceInfo* gpu_device_info_ = nullptr; + mutex trace_mu_; + int gpu_id_ = -1; + std::unique_ptr<EventMgr> em_; + + const PerOpGpuDevice* NewDevice(int stream_id, Allocator* allocator); +}; + +class BaseGPUDeviceFactory : public DeviceFactory { + public: + void CreateDevices(const SessionOptions& options, const string& name_prefix, + std::vector<Device*>* devices) override; + + private: + LocalDevice* CreateGPUDevice(const SessionOptions& options, + const string& name, int gpu_id); + + virtual LocalDevice* CreateGPUDevice(const SessionOptions& options, + const string& name, Bytes memory_limit, + BusAdjacency bus_adjacency, int gpu_id, + const string& physical_device_desc, + Allocator* gpu_allocator, + Allocator* cpu_allocator) = 0; + + void GetValidDeviceIds(std::vector<int>* ids); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMMON_RUNTIME_GPU_GPU_DEVICE_H_ diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc b/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc new file mode 100644 index 0000000000..240ac47499 --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc @@ -0,0 +1,52 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/common_runtime/gpu/gpu_device.h" +#include "tensorflow/core/common_runtime/gpu/process_state.h" + +namespace tensorflow { + +void RequireGPUDevice() {} + +class GPUDevice : public BaseGPUDevice { + public: + GPUDevice(const SessionOptions& options, const string& name, + Bytes memory_limit, BusAdjacency bus_adjacency, int gpu_id, + const string& physical_device_desc, Allocator* gpu_allocator, + Allocator* cpu_allocator) + : BaseGPUDevice(options, name, memory_limit, bus_adjacency, gpu_id, + physical_device_desc, gpu_allocator, cpu_allocator) {} + + Allocator* GetAllocator(AllocatorAttributes attr) override { + if (attr.on_host()) { + ProcessState* ps = ProcessState::singleton(); + if (attr.gpu_compatible()) { + return ps->GetCUDAHostAllocator(0); + } else { + return cpu_allocator_; + } + } else { + return gpu_allocator_; + } + } +}; + +class GPUDeviceFactory : public BaseGPUDeviceFactory { + private: + LocalDevice* CreateGPUDevice(const SessionOptions& options, + const string& name, Bytes memory_limit, + BusAdjacency bus_adjacency, int gpu_id, + const string& physical_device_desc, + Allocator* gpu_allocator, + Allocator* cpu_allocator) override { + return new GPUDevice(options, name, memory_limit, bus_adjacency, gpu_id, + physical_device_desc, gpu_allocator, cpu_allocator); + } +}; + +REGISTER_LOCAL_DEVICE_FACTORY("GPU", GPUDeviceFactory); + +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc new file mode 100644 index 0000000000..29d6281733 --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc @@ -0,0 +1,132 @@ +#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" + +#include "tensorflow/stream_executor/event.h" +#include "tensorflow/stream_executor/stream.h" + +namespace gpu = ::perftools::gputools; + +namespace tensorflow { + +EventMgr::EventMgr(gpu::StreamExecutor* se) + : exec_(se), + // threadpool_ has 1 thread for the polling loop, and one to 
execute + // event callback functions. Maybe we should have more? + threadpool_(Env::Default(), "GPU_Event_Manager", 2) { + threadpool_.Schedule([this]() { PollLoop(); }); +} + +EventMgr::~EventMgr() { + stop_polling_.Notify(); + // Shut down the backup polling loop. + polling_stopped_.WaitForNotification(); + + // Events are owned by this object. + for (auto& e : free_events_) { + delete e; + } + while (!used_events_.empty()) { + delete used_events_[0].event; + delete used_events_[0].mem; + if (used_events_[0].bufrec.buf) { + used_events_[0].bufrec.alloc->DeallocateRaw(used_events_[0].bufrec.buf); + } + if (used_events_[0].func != nullptr) + threadpool_.Schedule(used_events_[0].func); + used_events_.pop_front(); + } +} + +// This polling loop runs at a relatively low frequency. Most calls to +// PollEvents() should come directly from Compute() via +// ThenDeleteTensors(). This function's purpose is to ensure that +// even if no more GPU operations are being requested, we still +// eventually clear the queue. It seems to prevent some tensorflow +// programs from stalling for reasons not yet understood. +void EventMgr::PollLoop() { + while (!stop_polling_.HasBeenNotified()) { + Env::Default()->SleepForMicroseconds(1 * 1000); + { + mutex_lock l(mu_); + PollEvents(true); + } + } + polling_stopped_.Notify(); +} + +void EventMgr::QueueInUse(gpu::Stream* stream, InUse iu) { + VLOG(2) << "QueueInUse free_events_ " << free_events_.size() + << " used_events_ " << used_events_.size(); + // Events are created on demand, and repeatedly reused. There is no + // limit placed here on the number of allocated Events. + if (free_events_.empty()) { + free_events_.push_back(new gpu::Event(exec_)); + free_events_.back()->Init(); + } + gpu::Event* e = free_events_.back(); + free_events_.pop_back(); + stream->ThenRecordEvent(e); + iu.event = e; + used_events_.push_back(iu); +} + +// This function must be called periodically to check whether pending +// events have recorded, and then retire them. Initial observations +// suggest that typical behavior in a TensorFlow program is to have +// 0-3 events pending most of the time, but there are occasionally +// spikes of up to several hundred outstanding. +// +// NOTE: If all events are on the same stream, no later event will +// complete before an earlier event, except possibly if the earlier +// event transitions to an error state, so there's no advantage in +// looking past the first kPending event. However, if we're using +// multiple streams there may be some gain in looking deeper. +// As a compromise, PollEvent() calls that are triggered by the queueing +// of a single event never look past the first kPending event. Calls +// coming from the dedicated polling thread always sweep the full queue. +// +// Note that allowing the queue to grow very long could cause overall +// GPU memory use to spike needlessly. An alternative strategy would +// be to throttle new Op execution until the pending event queue +// clears. +void EventMgr::PollEvents(bool is_dedicated_poller) { + VLOG(2) << "PollEvents free_events_ " << free_events_.size() + << " used_events_ " << used_events_.size(); + // Sweep the remaining events in order. If this is the dedicated + // polling thread, check the entire set. Otherwise, just sweep up to + // the first non-complete record that is still pending. 
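+  // Completed records are only marked within the loop (their event is
+  // returned to free_events_ and the slot cleared); they are popped from the
+  // front of the deque afterwards so the queue stays in FIFO order.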
+ for (auto& iu : used_events_) { + if (iu.event == nullptr) continue; + gpu::Event::Status s = iu.event->PollForStatus(); + switch (s) { + case gpu::Event::Status::kUnknown: + case gpu::Event::Status::kError: + // We don't expect to see these. Someday maybe propagate + // a Status error, but for now fail hard. + LOG(FATAL) << "Unexpected Event status: " << static_cast<int>(s); + break; + case gpu::Event::Status::kPending: + if (!is_dedicated_poller) return; // quit processing queue + break; + case gpu::Event::Status::kComplete: + delete iu.mem; + if (iu.bufrec.buf) iu.bufrec.alloc->DeallocateRaw(iu.bufrec.buf); + // The function must be called in another thread, outside of + // the mutex held here. + if (iu.func != nullptr) threadpool_.Schedule(iu.func); + free_events_.push_back(iu.event); + // Mark this InUse record as completed. + iu.event = nullptr; + } + } + // Then clear any completed InUse records from the front of the queue. + while (!used_events_.empty()) { + InUse& iu = used_events_.front(); + if (iu.event == nullptr) { + used_events_.pop_front(); + } else { + break; + } + } +} + +} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h new file mode 100644 index 0000000000..f9436566d4 --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h @@ -0,0 +1,118 @@ +#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_EVENT_MGR_H_ +#define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_EVENT_MGR_H_ + +#include <deque> +#include <vector> +#include "tensorflow/core/lib/core/notification.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/public/tensor.h" + +namespace perftools { +namespace gputools { +class Event; +class Stream; +class StreamExecutor; +} // namespace gputools +} // namespace perftools + +namespace tensorflow { + +// An object to keep track of pending Events in the StreamExecutor streams +// and associated Tensors that cannot safely be deleted until the associated +// Events are recorded. +class EventMgr { + public: + explicit EventMgr(perftools::gputools::StreamExecutor* se); + + ~EventMgr(); + + // Takes ownership of *tensors and deletes it as soon as all events + // currently enqueued on *stream have completed. + inline void ThenDeleteTensors(perftools::gputools::Stream* stream, + std::vector<Tensor>* tensors) { + mutex_lock l(mu_); + QueueTensors(stream, tensors); + PollEvents(false); + } + + struct BufRec { + Allocator* alloc; + void* buf; + }; + + // Takes ownership of *bufrec.buf and calls bufrec.alloc->DeallocateRaw() + // on it as soon as all events currently enqueued on *stream have completed. + inline void ThenDeleteBuffer(perftools::gputools::Stream* stream, + BufRec bufrec) { + mutex_lock l(mu_); + QueueBuffer(stream, bufrec); + PollEvents(false); + } + + inline void ThenExecute(perftools::gputools::Stream* stream, + std::function<void()> func) { + mutex_lock l(mu_); + QueueFunc(stream, func); + PollEvents(false); + } + + private: + friend class TEST_EventMgrHelper; + mutex mu_; + perftools::gputools::StreamExecutor* exec_; + + struct InUse { + perftools::gputools::Event* event; + std::vector<Tensor>* mem; + BufRec bufrec; + std::function<void()> func; + }; + + // Stream-enqueue an unused Event and save with it a collection of + // Tensors and/or a BufRec to be deleted only after the Event + // records. 
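+  // Requires mu_ to be held; the event is taken from free_events_ (or newly
+  // created), recorded on 'stream', and appended to used_events_.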
+ void QueueInUse(perftools::gputools::Stream* stream, InUse in_use) + EXCLUSIVE_LOCKS_REQUIRED(mu_); + + void QueueTensors(perftools::gputools::Stream* stream, + std::vector<Tensor>* tensors) + EXCLUSIVE_LOCKS_REQUIRED(mu_) { + QueueInUse(stream, {nullptr, tensors, BufRec(), nullptr}); + } + + void QueueBuffer(perftools::gputools::Stream* stream, BufRec bufrec) + EXCLUSIVE_LOCKS_REQUIRED(mu_) { + QueueInUse(stream, {nullptr, nullptr, bufrec, nullptr}); + } + + void QueueFunc(perftools::gputools::Stream* stream, + std::function<void()> func) EXCLUSIVE_LOCKS_REQUIRED(mu_) { + QueueInUse(stream, {nullptr, nullptr, BufRec(), func}); + } + + // This function should be called at roughly the same tempo as + // QueueTensors() to check whether pending events have recorded, + // and then retire them. + void PollEvents(bool is_dedicated_poller) EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // An internal polling loop that runs at a low frequency to clear + // straggler Events. + void PollLoop(); + + // A stack of unused events + std::vector<perftools::gputools::Event*> free_events_ GUARDED_BY(mu_); + + // A FIFO queue of InUse events and associated tensors. + std::deque<InUse> used_events_ GUARDED_BY(mu_); + + Notification stop_polling_; + Notification polling_stopped_; + + // The main PollLoop for the event manager runs in this threadpool. + thread::ThreadPool threadpool_; +}; + +} // namespace tensorflow +#endif // TENSORFLOW_COMMON_RUNTIME_GPU_GPU_EVENT_MGR_H_ diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc new file mode 100644 index 0000000000..30ca1ff187 --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc @@ -0,0 +1,152 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" + +#include "tensorflow/core/common_runtime/gpu/gpu_init.h" +#include "tensorflow/stream_executor/multi_platform_manager.h" +#include "tensorflow/stream_executor/stream_executor.h" +#include <gtest/gtest.h> + +namespace gpu = ::perftools::gputools; + +namespace tensorflow { + +class TEST_EventMgrHelper { + public: + explicit TEST_EventMgrHelper(EventMgr* em) : em_(em) {} + + int queue_size() { + mutex_lock l(em_->mu_); + return em_->used_events_.size(); + } + + int free_size() { + mutex_lock l(em_->mu_); + return em_->free_events_.size(); + } + + void QueueTensors(perftools::gputools::Stream* stream, + std::vector<Tensor>* tensors) { + mutex_lock l(em_->mu_); + em_->QueueTensors(stream, tensors); + } + + void PollEvents(bool is_dedicated_poller) { + mutex_lock l(em_->mu_); + em_->PollEvents(is_dedicated_poller); + } + + private: + EventMgr* em_; +}; + +namespace { + +TEST(EventMgr, Empty) { + auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie(); + EventMgr em(stream_exec); + TEST_EventMgrHelper th(&em); + EXPECT_EQ(0, th.queue_size()); + EXPECT_EQ(0, th.free_size()); +} + +// Delaying polling until after several enqueings should grow the +// total number of allocated events. Once we have enough events for +// the max simultaneously pending, we should not allocate any more. 
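+// Concretely: five QueueTensors() calls with no intervening poll should leave
+// five used events and none free; a single PollEvents(false) sweep then moves
+// all five to the free list, and later rounds reuse them without growth.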
+TEST(EventMgr, DelayedPolling) { + auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie(); + EventMgr em(stream_exec); + TEST_EventMgrHelper th(&em); + EXPECT_EQ(0, th.queue_size()); + std::vector<Tensor>* v = nullptr; + std::unique_ptr<gpu::Stream> stream(new gpu::Stream(stream_exec)); + CHECK(stream.get()); + stream->Init(); + for (int i = 0; i < 5; ++i) { + v = new std::vector<Tensor>; + th.QueueTensors(stream.get(), v); + EXPECT_EQ(i + 1, th.queue_size()); + EXPECT_EQ(0, th.free_size()); + } + th.PollEvents(false); + EXPECT_EQ(0, th.queue_size()); + EXPECT_EQ(5, th.free_size()); + for (int j = 0; j < 2; ++j) { + for (int i = 0; i < 5; ++i) { + v = new std::vector<Tensor>; + th.QueueTensors(stream.get(), v); + EXPECT_EQ(i + 1, th.queue_size()); + EXPECT_EQ(4 - i, th.free_size()); + } + th.PollEvents(false); + EXPECT_EQ(0, th.queue_size()); + EXPECT_EQ(5, th.free_size()); + } +} + +// Immediate polling should require only one event to be allocated. +TEST(EventMgr, ImmediatePolling) { + auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie(); + EventMgr em(stream_exec); + TEST_EventMgrHelper th(&em); + EXPECT_EQ(0, th.queue_size()); + EXPECT_EQ(0, th.free_size()); + std::vector<Tensor>* v = nullptr; + std::unique_ptr<gpu::Stream> stream(new gpu::Stream(stream_exec)); + CHECK(stream.get()); + stream->Init(); + for (int i = 0; i < 5; ++i) { + v = new std::vector<Tensor>; + em.ThenDeleteTensors(stream.get(), v); + EXPECT_EQ(0, th.queue_size()); + EXPECT_EQ(1, th.free_size()); + } +} + +// If we delay polling by more than 1 second, the backup polling loop +// should clear the queue. +TEST(EventMgr, LongDelayedPolling) { + auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie(); + EventMgr em(stream_exec); + TEST_EventMgrHelper th(&em); + EXPECT_EQ(0, th.queue_size()); + EXPECT_EQ(0, th.free_size()); + std::vector<Tensor>* v = nullptr; + std::unique_ptr<gpu::Stream> stream(new gpu::Stream(stream_exec)); + CHECK(stream.get()); + stream->Init(); + for (int i = 0; i < 5; ++i) { + v = new std::vector<Tensor>; + th.QueueTensors(stream.get(), v); + EXPECT_EQ(1 + i, th.queue_size()); + EXPECT_EQ(0, th.free_size()); + } + sleep(1); + EXPECT_EQ(0, th.queue_size()); + EXPECT_EQ(5, th.free_size()); +} + +// Deleting the EventMgr when events are still pending should shut +// down gracefully. 
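+// The destructor notifies the polling thread, waits for it to stop, and then
+// cleans up whatever is still queued (deleting tensors/buffers and scheduling
+// any pending callbacks).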
+TEST(EventMgr, NonEmptyShutdown) { + auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie(); + EventMgr em(stream_exec); + TEST_EventMgrHelper th(&em); + EXPECT_EQ(0, th.queue_size()); + EXPECT_EQ(0, th.free_size()); + std::vector<Tensor>* v = nullptr; + std::unique_ptr<gpu::Stream> stream(new gpu::Stream(stream_exec)); + CHECK(stream.get()); + stream->Init(); + for (int i = 0; i < 5; ++i) { + v = new std::vector<Tensor>; + th.QueueTensors(stream.get(), v); + EXPECT_EQ(1 + i, th.queue_size()); + EXPECT_EQ(0, th.free_size()); + } +} + +} // namespace +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/common_runtime/gpu/gpu_init.cc b/tensorflow/core/common_runtime/gpu/gpu_init.cc new file mode 100644 index 0000000000..631a47eb91 --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/gpu_init.cc @@ -0,0 +1,147 @@ +#include "tensorflow/core/common_runtime/gpu/gpu_init.h" + +#include <string> + +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/stream_executor/multi_platform_manager.h" +#include "tensorflow/stream_executor/stream_executor.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/strings/numbers.h" +#include "tensorflow/core/lib/strings/strcat.h" + +namespace gpu = ::perftools::gputools; + +namespace tensorflow { + +namespace { + +std::unique_ptr<std::map<std::pair<int, int>, bool>> GetPeerAccessMap( + gpu::Platform* platform, int device_count) { + auto* map = new std::map<std::pair<int, int>, bool>; + for (int i = 0; i < device_count; ++i) { + for (int j = 0; j < device_count; ++j) { + gpu::StreamExecutor* from = platform->ExecutorForDevice(i).ValueOrDie(); + gpu::StreamExecutor* to = platform->ExecutorForDevice(j).ValueOrDie(); + (*map)[{i, j}] = from->CanEnablePeerAccessTo(to); + } + } + + return std::unique_ptr<std::map<std::pair<int, int>, bool>>{map}; +} + +Status EnablePeerAccess(gpu::Platform* platform, int device_count) { + for (int i = 0; i < device_count; ++i) { + for (int j = 0; j < device_count; ++j) { + gpu::StreamExecutor* from = platform->ExecutorForDevice(i).ValueOrDie(); + gpu::StreamExecutor* to = platform->ExecutorForDevice(j).ValueOrDie(); + + if (from->CanEnablePeerAccessTo(to)) { + auto status = from->EnablePeerAccessTo(to); + if (!status.ok()) { + return errors::Internal(status.ToString()); + } + } else { + LOG(INFO) << "cannot enable peer access from device ordinal " << i + << " to device ordinal " << j; + } + } + } + return Status::OK(); +} + +static void InitGPU() { + auto result = gpu::MultiPlatformManager::PlatformWithName("CUDA"); + if (!result.ok()) { + LOG(WARNING) + << "Not initializing the GPU, could not create GPU MachineManager. " + << "Error: " << result.status(); + return; + } + + gpu::Platform* platform = result.ValueOrDie(); + + int dev_count = platform->VisibleDeviceCount(); + + if (dev_count == 0) { + LOG(INFO) << "No GPU devices available on machine."; + return; + } + + for (int i = 0; i < dev_count; ++i) { + auto stream_exec = platform->ExecutorForDevice(i).ValueOrDie(); + int64 free_bytes; + int64 total_bytes; + if (!stream_exec->DeviceMemoryUsage(&free_bytes, &total_bytes)) { + // Logs internally on failure. + free_bytes = 0; + total_bytes = 0; + } + const auto& description = stream_exec->GetDeviceDescription(); + int cc_major; + int cc_minor; + if (!description.cuda_compute_capability(&cc_major, &cc_minor)) { + // Logs internally on failure. 
+ cc_major = 0; + cc_minor = 0; + } + LOG(INFO) << "Found device " << i << " with properties: " + << "\nname: " << description.name() << "\nmajor: " << cc_major + << " minor: " << cc_minor << " memoryClockRate (GHz) " + << description.clock_rate_ghz() << "\npciBusID " + << description.pci_bus_id() << "\nTotal memory: " + << strings::HumanReadableNumBytes(total_bytes) + << "\nFree memory: " + << strings::HumanReadableNumBytes(free_bytes); + } + + // Enable peer access + + auto status = EnablePeerAccess(platform, dev_count); + if (!status.ok()) { + LOG(FATAL) << "could not enable peer access for GPU devices: " << status; + } + + // Print out a matrix showing which devices can DMA to one + // another. + auto access_map = GetPeerAccessMap(platform, dev_count); + string line_buf = "DMA: "; + for (int i = 0; i < dev_count; ++i) { + strings::StrAppend(&line_buf, i, " "); + } + LOG(INFO) << line_buf; + for (int i = 0; i < dev_count; ++i) { + line_buf = strings::StrCat(i, ": "); + for (int j = 0; j < dev_count; ++j) { + if ((*access_map)[{i, j}]) { + line_buf.append("Y "); + } else { + line_buf.append("N "); + } + } + LOG(INFO) << line_buf; + } +} + +static bool InitModule() { + InitGPU(); + return true; +} + +} // namespace + +gpu::Platform* GPUMachineManager() { + // Create the machine manager singleton and initialize the GPUs only + // once. + static bool init = InitModule(); + CHECK(init); // Avoids compiler warning that init is unused. + + auto result = gpu::MultiPlatformManager::PlatformWithName("CUDA"); + if (!result.ok()) { + return nullptr; + } + + return result.ValueOrDie(); +} + +} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/gpu/gpu_init.h b/tensorflow/core/common_runtime/gpu/gpu_init.h new file mode 100644 index 0000000000..d126a8b1ca --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/gpu_init.h @@ -0,0 +1,19 @@ +#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_INIT_H_ +#define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_INIT_H_ + +namespace perftools { +namespace gputools { +class Platform; +} // namespace gputools +} // namespace perftools + +namespace tensorflow { + +// Returns the GPU machine manager singleton, creating it and +// initializing the GPUs on the machine if needed the first time it is +// called. 
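As a point of reference, a minimal caller-side sketch (the function name is illustrative; it assumes the CUDA platform is available), mirroring how the unit tests above obtain an executor:

#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
#include "tensorflow/stream_executor/stream_executor.h"

namespace gpu = ::perftools::gputools;

void ExampleEnumerateGpus() {
  gpu::Platform* platform = tensorflow::GPUMachineManager();
  if (platform == nullptr) return;  // CUDA platform could not be initialized.
  for (int i = 0; i < platform->VisibleDeviceCount(); ++i) {
    gpu::StreamExecutor* se = platform->ExecutorForDevice(i).ValueOrDie();
    (void)se;  // Use `se` to create streams or allocate memory on device i.
  }
}

The declaration this comment describes follows.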
+perftools::gputools::Platform* GPUMachineManager(); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMMON_RUNTIME_GPU_GPU_INIT_H_ diff --git a/tensorflow/core/common_runtime/gpu/gpu_region_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_region_allocator.cc new file mode 100644 index 0000000000..08ff55e221 --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/gpu_region_allocator.cc @@ -0,0 +1,371 @@ +#include "tensorflow/core/common_runtime/gpu/gpu_region_allocator.h" + +//#include "base/commandlineflags.h" +#include "tensorflow/stream_executor/multi_platform_manager.h" +#include "tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h" +#include "tensorflow/core/common_runtime/gpu/gpu_init.h" +#include "tensorflow/core/lib/core/bits.h" +#include "tensorflow/core/lib/gtl/stl_util.h" +#include "tensorflow/core/lib/strings/numbers.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/port.h" + +#if defined(PLATFORM_GOOGLE) +DEFINE_bool(brain_gpu_region_allocator_heap_check_on_destruction, true, + "If true, the CUDA gpu manager checks that all allocated " + "memory through the GPU memory pool implementation has been " + "freed."); + +DEFINE_int64(brain_gpu_region_allocator_region_size, 0, + "If > 0, sets the default chunk-size allocatable from GPU memory. " + "Else defaults to entire GPU memory."); + +#else +bool FLAGS_brain_gpu_region_allocator_heap_check_on_destruction = true; +tensorflow::int64 FLAGS_brain_gpu_region_allocator_region_size = 0; +#endif + +namespace gpu = ::perftools::gputools; + +namespace tensorflow { + +GPURegionAllocator::GPURegionAllocator(int device_id, size_t total_bytes) + : device_id_(device_id), total_bytes_(total_bytes) { + // Get a pointer to the stream_executor for this device + stream_exec_ = GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie(); + + // Set the region size based on explicit user request, or based on + // total GPU capacity. 
+ if (FLAGS_brain_gpu_region_allocator_region_size > 0) { + region_size_ = FLAGS_brain_gpu_region_allocator_region_size; + } else { + region_size_ = static_cast<size_t>(total_bytes_); + } + + LOG(INFO) << "Setting region size to " << region_size_; +} + +GPURegionAllocator::~GPURegionAllocator() { + if (FLAGS_brain_gpu_region_allocator_heap_check_on_destruction) { + CheckForMemoryLeaks(); + } + + gtl::STLDeleteValues(&chunk_map_); + + for (auto r : regions_) { + gpu::DeviceMemoryBase gpu_ptr{r->ptr}; + stream_exec_->Deallocate(&gpu_ptr); + delete r; + } +} + +void* GPURegionAllocator::AllocateRaw(size_t alignment, size_t num_bytes) { + static const int64 kMaxMillisToWait = 10000; // 10 seconds + return retry_helper_.AllocateRaw( + [this](size_t a, size_t nb, bool v) { + return AllocateRawInternal(a, nb, v); + }, + kMaxMillisToWait, alignment, num_bytes); +} + +void* GPURegionAllocator::AllocateRawInternal(size_t alignment, + size_t num_bytes, + bool dump_log_on_failure) { + if (num_bytes == 0) { + LOG(ERROR) << "tried to allocate 0 bytes"; + return nullptr; + } + size_t chunk_size = ChunkSize(num_bytes); + + VLOG(2) << "chunk_size " << chunk_size << " from num_bytes " + << strings::HumanReadableNumBytes(num_bytes); + mutex_lock l(lock_); + Pool* pool = &pools_[chunk_size]; + if (pool->num_free == 0) { + if (!ExpandPool(pool, chunk_size, num_bytes, dump_log_on_failure)) { + if (dump_log_on_failure) { + LOG(WARNING) << "Out of GPU memory, see memory state dump above"; + } + return nullptr; + } + } + CHECK_LT(0, pool->num_free); + CHECK(pool->first); + CHECK(pool->last); + Chunk* c = pool->first; + CHECK(c); + CHECK(!c->in_use); + + c->in_use = true; + // Move c to the back of the queue. + if (c->next != nullptr) { + pool->first = c->next; + pool->first->prev = nullptr; + c->next = nullptr; + } + + if (pool->last != c) { + pool->last->next = c; + c->prev = pool->last; + pool->last = c; + } + pool->num_free--; + pool->cumulative_malloced++; + + void* rv = c->ptr; + c->bytes_allocated = num_bytes; + + VLOG(2) << "new ptr " << rv; + return rv; +} + +void GPURegionAllocator::DeallocateRaw(void* ptr) { + retry_helper_.DeallocateRaw([this](void* p) { DeallocateRawInternal(p); }, + ptr); +} + +void GPURegionAllocator::DeallocateRawInternal(void* ptr) { + VLOG(2) << "DeallocateRaw: " << ptr; + if (ptr == nullptr) { + LOG(ERROR) << "tried to deallocate nullptr"; + return; + } + + mutex_lock l(lock_); + ChunkMap::const_iterator iter = chunk_map_.find(ptr); + CHECK(iter != chunk_map_.end()); + + Chunk* c = iter->second; + VLOG(2) << "chunk of size " << c->size << " at " << c; + + Pool* pool = &(pools_[c->size]); + // Move chunk to head of queue, and mark free. + DCHECK(c->in_use); + c->in_use = false; + if (c->prev) c->prev->next = c->next; + if (c->next) c->next->prev = c->prev; + if (pool->first == c) pool->first = c->next; + if (pool->last == c) pool->last = c->prev; + c->next = pool->first; + c->prev = nullptr; + if (c->next) c->next->prev = c; + pool->first = c; + if (pool->last == nullptr) pool->last = c; + pool->num_free++; + pool->cumulative_freed++; +} + +bool GPURegionAllocator::ExpandPool(Pool* pool, size_t chunk_size, + size_t requested_size, + bool dump_log_on_failure) { + VLOG(1) << "ExpandPool of " << chunk_size << " from " << pool->num_chunks + << " current members"; + DCHECK_NE(0, chunk_size); + // If chunk_size is < 4096, double the pool size. Otherwise + // just increase by one. 
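To make the expansion policy concrete, here is a standalone paraphrase of the initial chunk-count computation for an empty pool (illustrative only; the authoritative logic is the ExpandPool body that follows):

#include <algorithm>
#include <cstddef>

// How many chunks a single expansion adds when the pool is empty.
size_t InitialChunksForEmptyPool(size_t chunk_size) {
  // Small chunks are batched so one expansion covers about 4KB;
  // anything larger than 4KB is added one chunk at a time.
  size_t n = (chunk_size > 4096) ? 1 : 4096 / chunk_size;
  // The aggregate expansion is capped at 1MB for larger chunk sizes.
  if (n * chunk_size > (1 << 20)) {
    n = std::max<size_t>(1, (1 << 20) / chunk_size);
  }
  return n;  // e.g. 16 chunks of 256 bytes, or a single 64KB chunk.
}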
+ int num_chunks = pool->num_chunks; + if (num_chunks == 0) { + if (chunk_size > 4096) { + num_chunks = 1; + } else { + num_chunks = 4096 / chunk_size; + } + } + // For larger chunks, limit the amount of expansion. + size_t aggregate_size = num_chunks * chunk_size; + if (aggregate_size > (1 << 20)) { + num_chunks = static_cast<int>( + std::max(static_cast<size_t>(1), (1 << 20) / chunk_size)); + } + while (num_chunks > 0) { + Region* r = (regions_.empty() ? nullptr : regions_.back()); + if (r == nullptr || + (((r->ptr + r->size) - r->next) < static_cast<int64>(chunk_size))) { + // Current region is not large enough to accommodate another chunk. + while (r == nullptr || (((r->ptr + r->size) - r->next) < + static_cast<int64>(chunk_size))) { + // Get another region. + size_t this_region_size = std::max(region_size_, chunk_size); + + // Check if we would exceed our limit. + if (allocated_memory_ + this_region_size > total_bytes_) { + if (dump_log_on_failure) DumpMemoryLog(); + return false; + } + + // Perform the allocation, still checking that the allocator + // has not run out of memory. + gpu::DeviceMemory<char> gpu_mem = + stream_exec_->AllocateArray<char>(this_region_size); + if (gpu_mem == nullptr) { + if (dump_log_on_failure) DumpMemoryLog(); + return false; + } + + // We never release memory once expanded. + allocated_memory_ += this_region_size; + + Region* nr = new Region; + nr->ptr = static_cast<char*>(gpu_mem.opaque()); + + if (VLOG_IS_ON(2)) { + int64 free_bytes; + int64 total_bytes; + if (stream_exec_->DeviceMemoryUsage(&free_bytes, &total_bytes)) { + VLOG(2) << "free " << free_bytes << " total " << total_bytes; + } else { + // Note: stream_exec call also logs internally on failure. + VLOG(2) << "could not retrieve memory usage"; + } + } + VLOG(1) << "new Region of size " << this_region_size << " at " + << static_cast<void*>(nr->ptr) << " on device " << device_id_; + r = nr; + r->size = this_region_size; + r->next = r->ptr; + regions_.push_back(r); + + for (auto visitor : region_visitors_) { + visitor(r->ptr, r->size); + } + } + } else { + // Allocate a new chunk and push on front of Pool. + Chunk* c = new Chunk; + c->ptr = r->next; + chunk_map_[c->ptr] = c; + c->size = chunk_size; + r->next += chunk_size; + c->next = pool->first; + if (c->next != nullptr) c->next->prev = c; + pool->first = c; + if (pool->last == nullptr) pool->last = c; + pool->num_chunks++; + pool->num_free++; + --num_chunks; + } + } + + return true; +} + +void GPURegionAllocator::CheckForMemoryLeaks() { + std::vector<string> errors; + mutex_lock l(lock_); // could use reader lock + for (auto pool_map : pools_) { + const Pool& p = pool_map.second; + Chunk* curr_chunk = p.first; + while (curr_chunk != nullptr) { + if (curr_chunk->in_use) { + errors.push_back( + strings::StrCat("Unfreed chunk of size ", curr_chunk->size)); + } + curr_chunk = curr_chunk->next; + } + } + if (!errors.empty()) { + LOG(FATAL) << "GPU Memory leaks:\n" << str_util::Join(errors, "\n"); + } +} + +// Since there's no merging of chunks once allocated, we want to +// maximize their reusablity (which argues for fewer, larger sizes), +// while minimizing waste (which argues for tight-fitting sizes). +// +// The smallest unit of allocation is 256 bytes. +// NOTE(tucker): akrizhevsky says that nvidia's memory manager always +// aligns to 256 bytes, and doing so results in significant speedup. +// +// Up to 2^16 bytes we only allocate in powers of 2. 
+// +// Above that, we pick a max-waste which is the largest power +// of 2 <= 1/16 of the requested size, then round up to the nearest +// multiple of max_waste. +// +// static +size_t GPURegionAllocator::ChunkSize(size_t bytes) { + if (bytes <= 256) { + return 256; + } else if (bytes <= (1 << 16)) { + return 1uLL << Log2Ceiling64(bytes); + } else { + // 1/16th of requested size + size_t max_waste = 1uLL << (Log2Ceiling64(bytes) - 4); + return (bytes + max_waste) & (~(max_waste - 1)); + } +} + +void GPURegionAllocator::AddAllocVisitor(Visitor visitor) { + VLOG(1) << "AddVisitor"; + mutex_lock l(lock_); + region_visitors_.push_back(visitor); + for (auto region : regions_) { + visitor(region->ptr, region->size); + } +} + +void GPURegionAllocator::DumpMemoryLog() { + size_t region_bytes = 0; + for (auto r : regions_) { + region_bytes += r->size; + } + size_t chunk_bytes = 0; + std::vector<size_t> chunk_sizes; + for (auto i : pools_) { + chunk_sizes.push_back(i.first); + } + std::sort(chunk_sizes.begin(), chunk_sizes.end()); + for (auto i : chunk_sizes) { + int32 chunks_in_use = 0; + const Pool& p = pools_[i]; + chunk_bytes += i * p.num_chunks; + + if (p.num_chunks > 0) { + // Iterate backwards (allocated chunks are last). + Chunk* curr_chunk = p.last; + while (curr_chunk != nullptr) { + if (curr_chunk->in_use) { + ++chunks_in_use; + } + curr_chunk = curr_chunk->prev; + if (curr_chunk == p.first) { + break; + } + } + } + + LOG(INFO) << "Chunk size: " << i << " (" + << strings::HumanReadableNumBytes(i) << ") Pool: " << p.ToString() + << "\nNumber of chunks: " << p.num_chunks + << ", in_use chunks: " << chunks_in_use; + } + + LOG(INFO) << "Aggregate Region Memory: " << region_bytes << " (" + << strings::HumanReadableNumBytes(region_bytes) << ")"; + LOG(INFO) << "Aggregate Chunk Memory: " << chunk_bytes << " (" + << strings::HumanReadableNumBytes(chunk_bytes) << ")"; +} + +bool GPURegionAllocator::TracksAllocationSizes() { return true; } + +size_t GPURegionAllocator::RequestedSize(void* ptr) { + mutex_lock l(lock_); + auto it = chunk_map_.find(ptr); + CHECK(it != chunk_map_.end()) + << "Asked for requested size of pointer we never allocated: " << ptr; + auto c = it->second; + return c->bytes_allocated; +} + +size_t GPURegionAllocator::AllocatedSize(void* ptr) { + mutex_lock l(lock_); + auto it = chunk_map_.find(ptr); + CHECK(it != chunk_map_.end()) + << "Asked for allocated size of pointer we never allocated: " << ptr; + auto c = it->second; + return c->size; +} + +} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/gpu/gpu_region_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_region_allocator.h new file mode 100644 index 0000000000..1a250b6ede --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/gpu_region_allocator.h @@ -0,0 +1,146 @@ +#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_REGION_ALLOCATOR_H_ +#define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_REGION_ALLOCATOR_H_ + +#include <memory> +#include <string> +#include <unordered_map> +#include <vector> + +#include "tensorflow/stream_executor/stream_executor.h" +#include "tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h" +#include "tensorflow/core/common_runtime/gpu/visitable_allocator.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/platform/thread_annotations.h" + +namespace tensorflow { + +class GPURegionAllocator : public VisitableAllocator { + public: + // 'device_id' must be a valid device on the machine. 
+ // + // total_bytes is how many bytes this allocator should allocate up + // to. This may be less than the total available. + explicit GPURegionAllocator(int device_id, size_t total_bytes); + ~GPURegionAllocator() override; + + string Name() override { return "gpu_region"; } + void* AllocateRaw(size_t alignment, size_t num_bytes) override; + void DeallocateRaw(void* ptr) override; + void AddAllocVisitor(Visitor visitor) override; + // Does nothing, because regions are never freed. + void AddFreeVisitor(Visitor visitor) override {} + + bool TracksAllocationSizes() override; + size_t RequestedSize(void* ptr) override; + size_t AllocatedSize(void* ptr) override; + + private: + // A Chunk is the header on a single piece of memory given back + // in response to an AllocateRaw() call. + struct Chunk { + char* ptr; // pointer to granted GPU buffer. + size_t size; // Full size of GPU buffer. + size_t bytes_allocated; // Bytes asked for by client. + bool in_use; + Chunk* prev; // Used for chaining in pool. + Chunk* next; + Chunk() + : ptr(nullptr), + size(0), + bytes_allocated(0), + in_use(false), + prev(nullptr), + next(nullptr) {} + }; + + // A Pool is a collection of same-sized Chunks. + struct Pool { + int num_chunks; // total chunks in this pool + int num_free; // total free chunks in this pool + int64 cumulative_malloced; // number of chunks malloced so far + int64 cumulative_freed; // number of chunks freed so far + + // double-linked ring of chunks; all free chunks precede all + // granted chunks + Chunk* first; + Chunk* last; + Pool() + : num_chunks(0), + num_free(0), + cumulative_malloced(0), + cumulative_freed(0), + first(nullptr), + last(nullptr) {} + + string ToString() const { + return strings::StrCat("chunks: ", num_chunks, " free: ", num_free, + " cumulative malloc: ", cumulative_malloced, + " cumulative freed: ", cumulative_freed); + } + }; + + // A Region is a single area of GPU memory that has been + // reserved by this class and carved up into Chunks. + struct Region { + char* ptr; // base GPU ptr + char* next; // frontier of unused part of region + size_t size; + Region() : ptr(nullptr), size(0) {} + }; + + // Calculate size of chunk for an allocation of this size. + // Min chunk size is 16, for alignment. + // For larger sizes, we round up somewhat so there are fewer + // size-specific pools. + static size_t ChunkSize(size_t bytes); + + void* AllocateRawInternal(size_t alignment, size_t num_bytes, + bool dump_log_on_failure); + void DeallocateRawInternal(void* ptr); + + bool ExpandPool(Pool* p, size_t chunk_size, size_t requested_size, + bool dump_log_on_failure) EXCLUSIVE_LOCKS_REQUIRED(lock_); + + // Inspects region maps and crashes with debug information if there + // are any memory leaks as detected by the region allocator. + void CheckForMemoryLeaks() LOCKS_EXCLUDED(lock_); + + void DumpMemoryLog() EXCLUSIVE_LOCKS_REQUIRED(lock_); + + perftools::gputools::StreamExecutor* stream_exec_; // Not owned. + + typedef std::unordered_map<size_t, Pool> PoolMap; + typedef std::unordered_map<void*, Chunk*> ChunkMap; + + GPUAllocatorRetry retry_helper_; + mutable mutex lock_; + PoolMap pools_ GUARDED_BY(lock_); + + // Owns regions. + std::vector<Region*> regions_ GUARDED_BY(lock_); + + // Maps from GPU ptr to Chunk owning it. + // + // Owns chunks. + ChunkMap chunk_map_ GUARDED_BY(lock_); + + // Called once on each region, ASAP. 
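A minimal sketch of registering an allocation visitor (the function name is illustrative, and it assumes the Visitor type accepts a base pointer and a byte count, as the call sites in the .cc file above do):

#include "tensorflow/core/common_runtime/gpu/gpu_region_allocator.h"
#include "tensorflow/core/platform/logging.h"

void ExampleRegisterVisitor(tensorflow::GPURegionAllocator* allocator) {
  // Runs once for every region already reserved and once for each
  // region reserved later, e.g. to register the memory with a NIC
  // for RDMA or simply to log it.
  allocator->AddAllocVisitor([](void* ptr, size_t num_bytes) {
    LOG(INFO) << "GPU region at " << ptr << " covering " << num_bytes
              << " bytes";
  });
}

The member that stores these callbacks follows.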
+ std::vector<Visitor> region_visitors_ GUARDED_BY(lock_); + + const int device_id_; + + // Total amount of memory (in bytes) available to this Allocator + const size_t total_bytes_; + + // Total amount of memory allocated to regions. + size_t allocated_memory_ = 0; + + size_t region_size_ = 0; + + TF_DISALLOW_COPY_AND_ASSIGN(GPURegionAllocator); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMMON_RUNTIME_GPU_GPU_REGION_ALLOCATOR_H_ diff --git a/tensorflow/core/common_runtime/gpu/gpu_region_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_region_allocator_test.cc new file mode 100644 index 0000000000..07b0dd57f6 --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/gpu_region_allocator_test.cc @@ -0,0 +1,71 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/common_runtime/gpu/gpu_region_allocator.h" + +#include <algorithm> +#include <vector> + +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/common_runtime/gpu/gpu_init.h" +#include "tensorflow/stream_executor/stream_executor.h" +#include <gtest/gtest.h> + +namespace gpu = ::perftools::gputools; + +namespace tensorflow { +namespace { + +TEST(GPURegionAllocatorTest, Simple) { + GPURegionAllocator a(0, 1 << 26); + std::vector<void*> ptrs; + for (int s = 1; s < 1024; s++) { + void* raw = a.AllocateRaw(1, s); + ptrs.push_back(raw); + } + std::sort(ptrs.begin(), ptrs.end()); + for (int i = 0; i < ptrs.size(); i++) { + if (i > 0) { + CHECK_NE(ptrs[i], ptrs[i - 1]); // No dups + } + a.DeallocateRaw(ptrs[i]); + } + float* t1 = a.Allocate<float>(1024); + double* t2 = a.Allocate<double>(1048576); + a.Deallocate(t1); + a.Deallocate(t2); +} + +TEST(GPURegionAllocatorTest, CheckMemLeak) { + EXPECT_DEATH( + { + GPURegionAllocator a(0, 1 << 26); + float* t1 = a.Allocate<float>(1024); + if (t1) { + LOG(INFO) << "Not deallocating"; + } + }, + ""); +} + +TEST(GPURegionAllocatorTest, TracksSizes) { + GPURegionAllocator a(0, 1 << 26); + EXPECT_EQ(true, a.TracksAllocationSizes()); +} + +TEST(GPURegionAllocatorTest, AllocatedVsRequested) { + GPURegionAllocator a(0, 1 << 26); + float* t1 = a.Allocate<float>(1); + EXPECT_EQ(sizeof(float), a.RequestedSize(t1)); + + // Minimum allocation size if 256 + EXPECT_EQ(256, a.AllocatedSize(t1)); + + a.Deallocate(t1); +} + +} // namespace +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/common_runtime/gpu/gpu_stream_util.cc b/tensorflow/core/common_runtime/gpu/gpu_stream_util.cc new file mode 100644 index 0000000000..ca86c7fa06 --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/gpu_stream_util.cc @@ -0,0 +1,97 @@ +#include "tensorflow/core/common_runtime/gpu/gpu_stream_util.h" + +#include <set> +#include <string> +#include <unordered_set> +#include <vector> + +#include "tensorflow/core/graph/algorithm.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/strings/strcat.h" + +namespace tensorflow { +namespace gpu_stream_util { + +Status AssignStreams(const Graph* graph, const AssignStreamsOpts& opts, + std::unordered_map<int, int>* node_to_stream_id) { + VLOG(1) << "AssignStreams"; + Status status; + + // Sanity check arguments. 
+ if (graph == nullptr) + status.Update(errors::InvalidArgument("Bad graph argument supplied.")); + if (node_to_stream_id == nullptr) { + status.Update( + errors::InvalidArgument("Bad node_to_stream_id argument supplied.")); + } + if ((opts.max_streams < 1) || (opts.send_stream >= opts.max_streams) || + (opts.recv_stream >= opts.max_streams) || + (opts.const_stream >= opts.max_streams) || + (opts.compute_stream >= opts.max_streams)) { + status.Update(errors::InvalidArgument("Bad graph argument supplied.")); + } + TF_RETURN_IF_ERROR(status); + + // Topologically sort the nodes. + std::vector<Node*> order; + GetReversePostOrder(*graph, &order); + if (VLOG_IS_ON(2)) { + for (Node* n : order) { + const int node_id = n->id(); + VLOG(2) << "Node " << node_id << " " << n->type_string() << " " + << n->name() << " " << n->in_edges().size() << " inputs"; + for (const Edge* e : n->in_edges()) { + VLOG(2) << " Edge from " << e->src()->id() << " " << e->src()->name() + << " fanout " << e->src()->out_edges().size(); + } + } + } + // We perform stream assigmnent assuming a large number of + // stream IDs and then map these down to the required number of streams + // using simple round-robin. + // Stream Assignment strategy: + // 1. Nodes with zero inputs are always be executed on a + // fresh stream. + // 2. Try to execute a node on the same stream as one of its + // inputs to avoid inter-stream dependencies. + // 3. If any input comes from a node with a large fanout then + // perhaps an indication that it is shared between parallel + // streams of work. We choose a new stream here so that all consumers + // of the tensor are likely to run in parallel. + int highest_stream_id = -1; + for (Node* n : order) { + VLOG(3) << "Inspecting node " << n->DebugString(); + const int node_id = n->id(); + const string& op = n->type_string(); + + // Determine a suitable stream to use. + int stream_id = highest_stream_id + 1; + for (const Edge* e : n->in_edges()) { + const int fanout = e->src()->out_edges().size(); + if (fanout == 1) { + stream_id = (*node_to_stream_id)[e->src()->id()]; + break; + } + } + // Override stream for specific op types. + if (op == "_Send") { + if (opts.send_stream >= 0) stream_id = opts.send_stream; + } else if (op == "_Recv") { + if (opts.recv_stream >= 0) stream_id = opts.recv_stream; + } else if (op == "Const") { + if (opts.const_stream >= 0) stream_id = opts.const_stream; + } else { + if (opts.compute_stream >= 0) stream_id = opts.compute_stream; + } + + (*node_to_stream_id)[node_id] = stream_id % opts.max_streams; + highest_stream_id = std::max(stream_id, highest_stream_id); + } + VLOG(1) << "Identified " << highest_stream_id << " candidate streams for " + << order.size() << " nodes."; + + return Status::OK(); +} + +} // namespace gpu_stream_util +} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/gpu/gpu_stream_util.h b/tensorflow/core/common_runtime/gpu/gpu_stream_util.h new file mode 100644 index 0000000000..e1c623382c --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/gpu_stream_util.h @@ -0,0 +1,30 @@ +#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_STREAM_UTIL_H_ +#define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_STREAM_UTIL_H_ + +#include <unordered_map> + +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/public/status.h" + +namespace tensorflow { +namespace gpu_stream_util { + +struct AssignStreamsOpts { + int32 max_streams = 1; + // The following options specify a stream to use for specific op + // types. 
The value -1 allows ops to be assigned to any stream. + int32 send_stream = -1; + int32 recv_stream = -1; + int32 const_stream = -1; + int32 compute_stream = -1; +}; + +// Given the input graph, assigns every node in the graph with a +// stream_id that should be used. +Status AssignStreams(const Graph* graph, const AssignStreamsOpts& opts, + std::unordered_map<int, int>* node_to_stream_id); + +} // namespace gpu_stream_util +} // namespace tensorflow + +#endif // TENSORFLOW_COMMON_RUNTIME_GPU_GPU_STREAM_UTIL_H_ diff --git a/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc b/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc new file mode 100644 index 0000000000..5c426caaef --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc @@ -0,0 +1,137 @@ +#include "tensorflow/core/common_runtime/gpu/gpu_stream_util.h" + +#include <gtest/gtest.h> +#include "tensorflow/cc/ops/array_ops.h" +#include "tensorflow/cc/ops/sendrecv_ops.h" +#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/graph/graph_def_builder.h" +#include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/core/status_test_util.h" + +namespace tensorflow { +namespace { + +class GpuStreamUtilTest : public OpsTestBase { + protected: + void SetUp() override { RequireDefaultOps(); } +}; + +TEST_F(GpuStreamUtilTest, BogusOpts) { + GraphDefBuilder b(GraphDefBuilder::kFailImmediately); + Graph g(OpRegistry::Global()); + ASSERT_OK(b.ToGraph(&g)); + std::unordered_map<int, int> node_to_stream_id; + gpu_stream_util::AssignStreamsOpts opts; + Status status; + status = gpu_stream_util::AssignStreams(nullptr, opts, &node_to_stream_id); + EXPECT_FALSE(status.ok()); + status = gpu_stream_util::AssignStreams(&g, opts, nullptr); + EXPECT_FALSE(status.ok()); + opts.max_streams = 0; + status = gpu_stream_util::AssignStreams(&g, opts, &node_to_stream_id); + EXPECT_FALSE(status.ok()); + opts.max_streams = 1; + opts.compute_stream = 5; + status = gpu_stream_util::AssignStreams(&g, opts, &node_to_stream_id); + EXPECT_FALSE(status.ok()); +} + +TEST_F(GpuStreamUtilTest, EmptyGraph) { + GraphDefBuilder b(GraphDefBuilder::kFailImmediately); + Graph g(OpRegistry::Global()); + ASSERT_OK(b.ToGraph(&g)); + std::unordered_map<int, int> node_to_stream_id; + gpu_stream_util::AssignStreamsOpts opts; + ASSERT_OK(gpu_stream_util::AssignStreams(&g, opts, &node_to_stream_id)); + EXPECT_EQ(2, node_to_stream_id.size()); // _SOURCE and _SINK +} + +TEST_F(GpuStreamUtilTest, SimpleGraphOneStream) { + GraphDefBuilder b(GraphDefBuilder::kFailImmediately); + ops::MatMul(ops::Const(Tensor(DT_FLOAT), b.opts()), + ops::Const(Tensor(DT_FLOAT), b.opts()), b.opts()); + Graph g(OpRegistry::Global()); + ASSERT_OK(b.ToGraph(&g)); + + std::unordered_map<int, int> node_to_stream_id; + gpu_stream_util::AssignStreamsOpts opts; + ASSERT_OK(gpu_stream_util::AssignStreams(&g, opts, &node_to_stream_id)); + + // There should be 5 nodes assigned. + EXPECT_EQ(5, node_to_stream_id.size()); + + // All of them should have stream 0. 
+ for (const auto& it : node_to_stream_id) { + EXPECT_EQ(0, it.second); + } +} + +TEST_F(GpuStreamUtilTest, SimpleGraphManyStreams) { + GraphDefBuilder b(GraphDefBuilder::kFailImmediately); + ops::MatMul(ops::Const(Tensor(DT_FLOAT), b.opts()), + ops::Const(Tensor(DT_FLOAT), b.opts()), b.opts()); + Graph g(OpRegistry::Global()); + ASSERT_OK(b.ToGraph(&g)); + + std::unordered_map<int, int> node_to_stream_id; + gpu_stream_util::AssignStreamsOpts opts; + opts.max_streams = 3; + ASSERT_OK(gpu_stream_util::AssignStreams(&g, opts, &node_to_stream_id)); + + // There should be 5 nodes assigned. + EXPECT_EQ(5, node_to_stream_id.size()); + + // All of them should have a stream in the range [0..max_streams). + for (const auto& it : node_to_stream_id) { + EXPECT_GE(it.second, 0); + EXPECT_LT(it.second, opts.max_streams); + } +} + +TEST_F(GpuStreamUtilTest, StreamOverrides) { + GraphDefBuilder b(GraphDefBuilder::kFailImmediately); + ops::_Recv(DT_FLOAT, "input", "/cpu:0", 0, "/gpu:0", + b.opts().WithName("input")); + auto n = ops::MatMul(ops::Const(Tensor(DT_FLOAT), b.opts()), + ops::Const(Tensor(DT_FLOAT), b.opts()), b.opts()); + ops::_Send(n, "output", "/gpu:0", 0, "/cpu:0", b.opts().WithName("output")); + Graph g(OpRegistry::Global()); + ASSERT_OK(b.ToGraph(&g)); + + // Perform stream assignment using a large number of streams, but with + // op types constrained to specific streams. + std::unordered_map<int, int> node_to_stream_id; + gpu_stream_util::AssignStreamsOpts opts; + opts.max_streams = 100; + opts.const_stream = 90; + opts.send_stream = 91; + opts.recv_stream = 92; + opts.compute_stream = 93; + ASSERT_OK(gpu_stream_util::AssignStreams(&g, opts, &node_to_stream_id)); + + // There should be 7 nodes assigned. + EXPECT_EQ(7, node_to_stream_id.size()); // including _SOURCE and _SINK + + // Nodes should be assigned to streams by op type. + for (const auto& it : node_to_stream_id) { + Node* n = g.FindNodeId(it.first); + const string op = n->type_string(); + const int stream = it.second; + if (op == "Const") { + EXPECT_EQ(stream, 90); + } else if (op == "_Send") { + EXPECT_EQ(stream, 91); + } else if (op == "_Recv") { + EXPECT_EQ(stream, 92); + } else { // Compute. 
+ EXPECT_EQ(stream, 93); + } + } +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.cc b/tensorflow/core/common_runtime/gpu/gpu_util.cc new file mode 100644 index 0000000000..a6a3ce01fc --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/gpu_util.cc @@ -0,0 +1,345 @@ +#include "tensorflow/core/common_runtime/gpu/gpu_util.h" + +//#include "base/commandlineflags.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/gpu_device_context.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/refcount.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/gtl/stl_util.h" +#include "tensorflow/core/lib/hash/hash.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/lib/strings/stringprintf.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/tensor_coding.h" +#include "tensorflow/core/platform/tracing.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/common_runtime/gpu/dma_helper.h" +#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" +#include "tensorflow/core/common_runtime/gpu/process_state.h" +#include "tensorflow/core/util/util.h" +#include "tensorflow/stream_executor/stream.h" +#include "tensorflow/stream_executor/stream_executor.h" + +#include "tensorflow/core/platform/stream_executor_util.h" + +#if defined(PLATFORM_GOOGLE) +DEFINE_int64(brain_gpu_util_debug_string_maxlen, 128, + "When dumping gpu memory, prints up to this many bytes."); + +DECLARE_bool(record_mem_types); +#else +tensorflow::int64 FLAGS_brain_gpu_util_debug_string_maxlen = 128; +bool FLAGS_EXPERIMENTAL_brain_gpu_multi_stream = false; +extern bool FLAGS_record_mem_types; +#endif + +using perftools::gputools::DeviceMemoryBase; +using perftools::gputools::DeviceMemory; +using perftools::gputools::Stream; + +namespace tensorflow { + +namespace gpu = ::perftools::gputools; + +/*static*/ +void GPUUtil::SetProtoFromGPU(const Tensor& tensor, Device* dev, + const DeviceContext* device_context, + TensorProto* proto, bool is_dead, + StatusCallback done) { + VLOG(1) << "SetProtoFromGPU device_context " << device_context; + // Tensor values need to be copied from GPU to CPU ram so that + // we can build the protobuf response for a RecvTensor RPC. + // "device context" identifies the stream where the _Send op executed. + CHECK(device_context); + gpu::Stream* stream = + static_cast<const GPUDeviceContext*>(device_context)->stream(); + + if (!DMAHelper::CanUseDMA(&tensor)) { + done(errors::Internal(strings::StrCat( + "GPU copy from non-DMA ", DataTypeString(tensor.dtype()), "tensor"))); + return; + } + proto->set_dtype(tensor.dtype()); + tensor.shape().AsProto(proto->mutable_tensor_shape()); + // Prepare a Cord with the right data buf size, and DMA the + // data over from the GPU buffer. Note that 0-size tensors + // do not have a backing buffer. + const size_t num_bytes = is_dead ? 
0 : tensor.TotalBytes(); + if (num_bytes > 0) { + port::Tracing::ScopedAnnotation annotation("SetProtoFromGPU"); + Allocator* alloc = ProcessState::singleton()->GetCUDAHostAllocator(0); + char* mb = alloc->Allocate<char>(num_bytes); + const char* src_ptr = + reinterpret_cast<const char*>(DMAHelper::base(&tensor)); + DeviceMemoryBase gpu_src_ptr(const_cast<char*>(src_ptr), num_bytes); + stream->ThenMemcpy(mb, gpu_src_ptr, num_bytes); + // Use of tensor may outlive stack scope, so keep a ref. + Tensor* tensor_ref = new Tensor(tensor); + dev->tensorflow_gpu_device_info()->event_mgr->ThenExecute( + stream, [stream, done, proto, mb, num_bytes, alloc, tensor_ref]() { + if (!stream->ok()) { + done(errors::Internal("SetProtoFromGPU: GPU Memcpy failed")); + // TODO(pbar) We currently have no way to recover the + // worker from a GPU stream in the error state. Until + // there is a way to reset the CUDA driver, it is + // preferable to crash the process and restart. Tracked + // under b/23717097 + LOG(FATAL) << "SetProtoFromGPU: GPU Memcpy failed"; + return; + } + delete tensor_ref; + port::CopyFromArray(proto->mutable_tensor_content(), mb, num_bytes); + alloc->Deallocate<char>(mb); + done(Status::OK()); + }); + } else { + done(Status::OK()); + } +} + +typedef ProcessState::MemDesc PMD; + +/*static*/ +void GPUUtil::CopyViaDMA(const string& edge_name, + DeviceContext* send_dev_context, + DeviceContext* recv_dev_context, Device* src, + Device* dst, AllocatorAttributes src_alloc_attr, + AllocatorAttributes dst_alloc_attr, + const Tensor* input, Tensor* output, + StatusCallback done) { + port::Tracing::ScopedAnnotation annotation(edge_name); + VLOG(1) << "CopyViaDMA " << edge_name; + size_t total_bytes = input->TotalBytes(); + // Note that 0-size tensors have no backing buffer. + if (total_bytes > 0) { + const void* src_ptr = DMAHelper::base(input); + void* dst_ptr = DMAHelper::base(output); + VLOG(2) << "src_ptr " << src_ptr << " dst_ptr " << dst_ptr; + if (FLAGS_record_mem_types) { + ProcessState::MemDesc smd = ProcessState::singleton()->PtrType(src_ptr); + ProcessState::MemDesc dmd = ProcessState::singleton()->PtrType(dst_ptr); + VLOG(0) << "Src " << smd.DebugString() << " Dst " << dmd.DebugString(); + if (smd.loc == PMD::CPU && dmd.loc == PMD::GPU && (!smd.gpu_registered)) { + LOG(WARNING) << "CPU -> GPU no reg for " << edge_name; + } + if (dmd.loc == PMD::CPU && smd.loc == PMD::GPU && (!dmd.gpu_registered)) { + LOG(WARNING) << "GPU -> CPU no reg for " << edge_name; + } + } + + auto src_device_type = src->attributes().device_type(); + auto dst_device_type = dst->attributes().device_type(); + + bool non_cpu_src = (!src_alloc_attr.on_host() && + src_device_type != DeviceType(DEVICE_CPU).type()); + bool non_cpu_dst = (!dst_alloc_attr.on_host() && + dst_device_type != DeviceType(DEVICE_CPU).type()); + if (non_cpu_src) { + gpu::Stream* stream = send_dev_context->stream(); + if (stream == nullptr) { + done(errors::Internal("Failed to find device stream")); + return; + } + auto* src_dev_info = src->tensorflow_gpu_device_info(); + CHECK(src_dev_info); + + if (non_cpu_dst) { + // Device to device copy + DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes); + stream->ThenMemcpy( + &gpu_dst_ptr, + DeviceMemoryBase{const_cast<void*>(src_ptr), total_bytes}, + total_bytes); + if (dst_device_type == DeviceType(DEVICE_GPU).type()) { + // Use of input may outlive stack scope, so keep a ref. 
+ Tensor* input_ref = new Tensor(*input); + src_dev_info->event_mgr->ThenExecute( + stream, [done, stream, input_ref]() { + delete input_ref; + if (!stream->ok()) { + done(errors::Internal("GPU->GPU Memcpy failed")); + } else { + done(Status::OK()); + } + }); + } + send_dev_context->MaintainLifetimeOnStream(input, stream); + } else { + // Device to host copy. + return send_dev_context->CopyDeviceTensorToCPU(input, edge_name, src, + output, done); + } + } else if (non_cpu_dst) { + // Host to Device copy. + // Note that this is already an async copy. + recv_dev_context->CopyCPUTensorToDevice(input, dst, output, done); + } else { + memcpy(dst_ptr, src_ptr, total_bytes); + done(Status::OK()); + } + } else { + // buffer is empty + done(Status::OK()); + } +} + +void GPUUtil::CopyGPUTensorToCPU(Device* gpu_device, + const DeviceContext* device_context, + const Tensor* gpu_tensor, Tensor* cpu_tensor, + StatusCallback done) { + VLOG(1) << "CopyGPUTensorToCPU"; + size_t total_bytes = gpu_tensor->TotalBytes(); + // Note that 0-size tensors have no backing buffer. + if (total_bytes > 0) { + const void* src_ptr = DMAHelper::base(gpu_tensor); + void* dst_ptr = DMAHelper::base(cpu_tensor); + CHECK(dst_ptr); + auto* stream = gpu_device->tensorflow_gpu_device_info()->stream; + if (device_context) { + stream = static_cast<const GPUDeviceContext*>(device_context)->stream(); + } + stream->ThenMemcpy( + dst_ptr, DeviceMemoryBase{const_cast<void*>(src_ptr), total_bytes}, + total_bytes); + stream->BlockHostUntilDone(); + if (!stream->ok()) { + done(errors::Internal("CopyGPUTensorToCPU: GPU->CPU Memcpy failed")); + return; + } + } + + done(Status::OK()); +} + +/* static */ +void GPUUtil::CopyCPUTensorToGPU(const Tensor* cpu_tensor, + const DeviceContext* device_context, + Device* gpu_device, Tensor* gpu_tensor, + StatusCallback done) { + VLOG(1) << "CopyCPUTensorToGPU"; + CHECK(DeviceType(gpu_device->attributes().device_type()) == + DeviceType(DEVICE_GPU)); + + auto* dev_info = gpu_device->tensorflow_gpu_device_info(); + if (!dev_info) { + done(errors::Internal("Failed to find dest device GPUDeviceInfo")); + return; + } + if (cpu_tensor->TotalBytes() != gpu_tensor->TotalBytes()) { + done(errors::Internal( + strings::StrCat("Can't copy ", cpu_tensor->TotalBytes(), + " bytes of a tensor into another with ", + gpu_tensor->TotalBytes(), " bytes buffer."))); + return; + } + const int64 total_bytes = cpu_tensor->TotalBytes(); + // Note that 0-size tensors have no backing buffer. + if (total_bytes > 0) { + const void* src_ptr = DMAHelper::base(cpu_tensor); + void* dst_ptr = DMAHelper::base(gpu_tensor); + DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes); + + CHECK(device_context); + auto* stream = + static_cast<const GPUDeviceContext*>(device_context)->stream(); + stream->ThenMemcpy(&gpu_dst_ptr, src_ptr, total_bytes); + auto* dev_info = gpu_device->tensorflow_gpu_device_info(); + // Use of cpu_tensor may outlive stack scope, so keep a ref. 
+ Tensor* input_ref = new Tensor(*cpu_tensor); + dev_info->event_mgr->ThenExecute(stream, [stream, done, input_ref]() { + delete input_ref; + if (!stream->ok()) { + done(errors::Internal("CopyCPUTensorToGPU: GPU Memcpy failed")); + } else { + done(Status::OK()); + } + }); + } else { + // empty tensor case + done(Status::OK()); + } +} + +Status GPUUtil::Sync(Device* gpu_device) { + VLOG(1) << "GPUUtil::Sync"; + auto* dev_info = gpu_device->tensorflow_gpu_device_info(); + if (!dev_info) { + return errors::Internal("Failed to find dest device GPUDeviceInfo"); + } + dev_info->stream->BlockHostUntilDone(); + if (!dev_info->stream->ok()) { + LOG(FATAL) << "GPU sync failed"; + } + return Status::OK(); +} + +Status GPUUtil::SyncAll(Device* gpu_device) { + VLOG(1) << "GPUUtil::SyncAll"; + auto* dev_info = gpu_device->tensorflow_gpu_device_info(); + if (!dev_info) { + return errors::Internal("Failed to find dest device GPUDeviceInfo"); + } + if (!dev_info->stream->parent()->SynchronizeAllActivity() || + !dev_info->stream->ok()) { + LOG(FATAL) << "GPU sync failed"; + } + return Status::OK(); +} + +string GPUUtil::MemoryDebugString(const Device* device, Tensor* tensor) { + string ret; + CHECK(tensor); + const int64 num_bytes = std::min<int64>( + FLAGS_brain_gpu_util_debug_string_maxlen, tensor->TotalBytes()); + void* ptr = (num_bytes > 0) ? DMAHelper::base(tensor) : nullptr; + strings::Appendf(&ret, "%p:", ptr); + if (num_bytes > 0) { + auto* dev_info = device->tensorflow_gpu_device_info(); + if (!dev_info) { + strings::StrAppend( + &ret, PrintMemory(reinterpret_cast<const char*>(ptr), num_bytes)); + } else { + string buf; + buf.resize(num_bytes); + DeviceMemoryBase gpu_ptr(ptr, num_bytes); + Status s = dev_info->stream->parent()->SynchronousMemcpyD2H( + gpu_ptr, num_bytes, gtl::string_as_array(&buf)); + strings::StrAppend(&ret, + PrintMemory(gtl::string_as_array(&buf), num_bytes)); + } + } + return ret; +} + +// TODO(pbar) Checksum is called from places without a valid device context. +uint64 GPUUtil::Checksum(Device* gpu_device, + const DeviceContext* device_context, + const Tensor& tensor) { + Tensor copy(tensor.dtype(), tensor.shape()); + Status s; + Notification n; + CopyGPUTensorToCPU(gpu_device, device_context, &tensor, ©, + [&s, &n](Status status) { + s.Update(status); + n.Notify(); + }); + n.WaitForNotification(); + CHECK(s.ok()) << s; + return Checksum(copy); +} + +uint64 GPUUtil::Checksum(const Tensor& tensor) { + const float* fptr = reinterpret_cast<const float*>(DMAHelper::base(&tensor)); + size_t num_bytes = tensor.TotalBytes(); + size_t num_floats = num_bytes / sizeof(float); + for (size_t i = 0; i < num_floats; ++i) { + CHECK(!std::isnan(fptr[i])) << " i " << i; + } + // TODO(tucker): consider using crc32c instead. 
+ return Hash64(reinterpret_cast<const char*>(DMAHelper::base(&tensor)), + tensor.TotalBytes(), 0); +} + +} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.h b/tensorflow/core/common_runtime/gpu/gpu_util.h new file mode 100644 index 0000000000..1d8c3a054d --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/gpu_util.h @@ -0,0 +1,89 @@ +#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_UTIL_H_ +#define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_UTIL_H_ + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/public/status.h" +#include "tensorflow/core/common_runtime/gpu/dma_helper.h" +#include "tensorflow/stream_executor/device_memory.h" + +#include "tensorflow/stream_executor/stream.h" + +namespace tensorflow { + +class RecvTensorResponse; +class TensorProto; + +namespace gpu = ::perftools::gputools; + +class GPUUtil { + public: + // "tensor" is GPU-local. "dev" is the hosting GPU. + // "device_context" should be the context of the GPU "_Send" op + // which provides the Tensor. + // Sets all necessasry fields of "proto" by transferring value + // bytes from GPU to CPU RAM. "is_dead" indicates that the + // tensor is dead with an uninit value. + static void SetProtoFromGPU(const Tensor& tensor, Device* dev, + const DeviceContext* device_context, + TensorProto* proto, bool is_dead, + StatusCallback done); + + // Copies "input" to "output" between devices accessible to the + // local process via some DMA-like method. "edge_name" is the name + // of the tensor being copied, for debugging purposes. Depending on + // the type of devices and memory in use, the copy may be performed + // synchronously or asynchronously. 'done' will be invoked only + // after the copy is actually complete. + static void CopyViaDMA(const string& edge_name, + DeviceContext* send_dev_context, + DeviceContext* recv_dev_context, Device* src, + Device* dst, const AllocatorAttributes src_alloc_attr, + const AllocatorAttributes dst_alloc_attr, + const Tensor* input, Tensor* output, + StatusCallback done); + + // Copies the data in 'gpu_tensor' into 'cpu_tensor'. + // 'gpu_tensor''s backing memory must be on 'gpu_device' and + // 'cpu_tensor' must be allocated to be of the same size as + // 'gpu_tensor'. Synchronous: may block. + static void CopyGPUTensorToCPU(Device* gpu_device, + const DeviceContext* device_context, + const Tensor* gpu_tensor, Tensor* cpu_tensor, + StatusCallback done); + + // Blocks until all operations queued on the stream associated with + // "gpu_device" at the time of the call have completed. Returns any + // error pending on the stream at completion. + static Status Sync(Device* gpu_device); + + // Blocks until all operations queued on all streams associated with the + // corresponding GPU device at the time of call have completed. + // Returns any error pending on the stream at completion. + static Status SyncAll(Device* gpu_device); + + // For debugging purpose, given a "device" and a "tensor" allocated + // on the device, return a string printing each byte in the tensor + // (up to a limit). "device" can be either a CPU or a GPU device. + static string MemoryDebugString(const Device* device, Tensor* tensor); + + static perftools::gputools::DeviceMemory<float> AsGPUFloat(const Tensor& t); + + // Computes a checksum over the contents of "tensor", which is allocated + // on "gpu_device". 
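The Checksum implementation above shows the recipe for waiting synchronously on these callback-based copies; a standalone sketch of the same pattern follows (the helper name and the Notification header path are assumptions, not part of this diff):

#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
#include "tensorflow/core/lib/core/notification.h"  // Assumed location of Notification.

// Blocks until an asynchronous GPU->CPU copy has completed and
// returns its status, mirroring GPUUtil::Checksum above.
tensorflow::Status CopyToHostBlocking(tensorflow::Device* gpu_device,
                                      const tensorflow::DeviceContext* ctx,
                                      const tensorflow::Tensor& src,
                                      tensorflow::Tensor* dst) {
  tensorflow::Status s;
  tensorflow::Notification n;
  tensorflow::GPUUtil::CopyGPUTensorToCPU(
      gpu_device, ctx, &src, dst, [&s, &n](tensorflow::Status status) {
        s.Update(status);
        n.Notify();
      });
  n.WaitForNotification();
  return s;
}

The Checksum declarations described by the comment above follow.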
+ static uint64 Checksum(Device* gpu_device, + const DeviceContext* device_context, + const Tensor& tensor); + + // Computes a checksum over the contents of "tensor", which is allocated + // in local CPU RAM. + static uint64 Checksum(const Tensor& tensor); + + static void CopyCPUTensorToGPU(const Tensor* cpu_tensor, + const DeviceContext* device_context, + Device* gpu_device, Tensor* gpu_tensor, + StatusCallback done); +}; + +} // namespace tensorflow +#endif // TENSORFLOW_COMMON_RUNTIME_GPU_GPU_UTIL_H_ diff --git a/tensorflow/core/common_runtime/gpu/gpu_util_platform_specific.cc b/tensorflow/core/common_runtime/gpu/gpu_util_platform_specific.cc new file mode 100644 index 0000000000..f1b1174a28 --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/gpu_util_platform_specific.cc @@ -0,0 +1,24 @@ +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/gpu_device_context.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/common_runtime/gpu/gpu_util.h" +#include "tensorflow/stream_executor/stream.h" + +namespace tensorflow { + +void GPUDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor, + Device* device, + Tensor* device_tensor, + StatusCallback done) const { + GPUUtil::CopyCPUTensorToGPU(cpu_tensor, this, device, device_tensor, done); +} + +void GPUDeviceContext::CopyDeviceTensorToCPU(const Tensor* device_tensor, + const string& tensor_name, + Device* device, Tensor* cpu_tensor, + StatusCallback done) { + GPUUtil::CopyGPUTensorToCPU(device, this, device_tensor, cpu_tensor, done); +} + +} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator.cc b/tensorflow/core/common_runtime/gpu/pool_allocator.cc new file mode 100644 index 0000000000..52deb7fce2 --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/pool_allocator.cc @@ -0,0 +1,269 @@ +#include "tensorflow/core/common_runtime/gpu/pool_allocator.h" + +#include <errno.h> +#include <strings.h> +#include <sys/mman.h> // for munmap + +#include <map> + +#include "tensorflow/core/lib/strings/numbers.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/port.h" +//#include "prodkernel/api/base/numa.h" + +namespace tensorflow { + +PoolAllocator::PoolAllocator(size_t pool_size_limit, bool auto_resize, + SubAllocator* allocator, + RoundUpInterface* size_rounder, string name) + : name_(name), + has_size_limit_(pool_size_limit > 0), + auto_resize_(auto_resize), + pool_size_limit_(pool_size_limit), + allocator_(allocator), + size_rounder_(size_rounder), + allocation_begun_(false) { + if (auto_resize) { + CHECK_LT(0, pool_size_limit) + << "size limit must be > 0 if auto_resize is true."; + } +} + +PoolAllocator::~PoolAllocator() { Clear(); } + +namespace { +// Pools contain Chunks allocatated from the underlying Allocator. +// Chunk alignment is always on kPoolAlignment boundaries. Each Chunk +// begins with a descriptor (ChunkPrefix) that gives its size and a +// pointer to itself. The pointer returned to the user is just past +// the ChunkPrefix. If the user asks for a larger alignment, we will +// increase the size of the chunk, then adjust the returned user +// pointer and also re-write the ChunkPrefix.chunk_ptr value +// immediately before it. This way the Chunk address and size can be +// recovered from the returned user pointer, regardless of alignment. +// Note that this deferencing of the pointers means that we cannot +// handle GPU memory, only CPU memory. 
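To illustrate the layout described above on a 64-bit build where sizeof(ChunkPrefix) is 16 bytes: a chunk starting at 0x1000 stores {num_bytes, chunk_ptr = 0x1000} at 0x1000, so the default user pointer is 0x1010. For a request with alignment 64, PrepareChunk advances 0x1010 by 64 to 0x1050, masks down to 0x1040, and rewrites chunk_ptr in the prefix slot at 0x1030; FindPrefix(0x1040) then steps back one prefix and follows chunk_ptr to recover 0x1000.

  chunk:  [ ChunkPrefix | padding ... second prefix | user bytes ... ]
          ^ 0x1000                  ^ 0x1030         ^ 0x1040 (returned)

The struct and the PrepareChunk/FindPrefix helpers follow.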
+struct ChunkPrefix { + size_t num_bytes; + void* chunk_ptr; +}; +// kPoolAlignment cannot be less than the size of ChunkPrefix. +static const int kPoolAlignment = sizeof(ChunkPrefix); + +void* PrepareChunk(void* chunk, size_t alignment, size_t num_bytes) { + ChunkPrefix* cp = reinterpret_cast<ChunkPrefix*>(chunk); + cp->num_bytes = num_bytes; + cp->chunk_ptr = chunk; + void* user_ptr = reinterpret_cast<void*>(cp + 1); + if (alignment > kPoolAlignment) { + // Move user_ptr forward to the first satisfying offset, and write + // chunk_ptr just before it. + size_t aligned_ptr = reinterpret_cast<size_t>(user_ptr) + alignment; + user_ptr = reinterpret_cast<void*>(aligned_ptr & ~(alignment - 1)); + (reinterpret_cast<ChunkPrefix*>(user_ptr) - 1)->chunk_ptr = chunk; + } + // Safety check that user_ptr is always past the ChunkPrefix. + CHECK_GE(user_ptr, reinterpret_cast<ChunkPrefix*>(chunk) + 1); + return user_ptr; +} + +ChunkPrefix* FindPrefix(void* user_ptr) { + ChunkPrefix* cp = reinterpret_cast<ChunkPrefix*>(user_ptr) - 1; + return reinterpret_cast<ChunkPrefix*>(cp->chunk_ptr); +} +} // namespace + +void* PoolAllocator::AllocateRaw(size_t alignment, size_t num_bytes) { + if (!allocation_begun_) allocation_begun_ = true; + if (num_bytes == 0) return nullptr; + + // If alignment is larger than kPoolAlignment, increase num_bytes so that we + // are guaranteed to be able to return an aligned ptr by advancing user_ptr + // without overrunning the end of the chunk. + if (alignment > kPoolAlignment) { + num_bytes += alignment; + } + num_bytes += sizeof(ChunkPrefix); + num_bytes = size_rounder_->RoundUp(num_bytes); + PtrRecord* pr = nullptr; + if (has_size_limit_) { + { + mutex_lock lock(mutex_); + auto iter = pool_.find(num_bytes); + if (iter == pool_.end()) { + allocated_count_++; + // Deliberately fall out of lock scope before + // calling the allocator. No further modification + // to the pool will be performed. + } else { + get_from_pool_count_++; + pr = iter->second; + RemoveFromList(pr); + pool_.erase(iter); + // Fall out of lock scope and do the result without the lock held. 
+ } + } + } + if (pr != nullptr) { + void* r = pr->ptr; + delete pr; + return PrepareChunk(r, alignment, num_bytes); + } else { + void* ptr = allocator_->Alloc(kPoolAlignment, num_bytes); + for (auto v : alloc_visitors_) { + v(ptr, num_bytes); + } + return PrepareChunk(ptr, alignment, num_bytes); + } +} + +void PoolAllocator::DeallocateRaw(void* ptr) { + if (ptr == nullptr) return; + ChunkPrefix* cp = FindPrefix(ptr); + CHECK_LE((void*)cp, (void*)ptr); + if (!has_size_limit_ && !auto_resize_) { + for (auto v : free_visitors_) { + v(cp, cp->num_bytes); + } + allocator_->Free(cp, cp->num_bytes); + } else { + mutex_lock lock(mutex_); + ++put_count_; + while (pool_.size() >= pool_size_limit_) { + EvictOne(); + } + PtrRecord* pr = new PtrRecord; + pr->num_bytes = cp->num_bytes; + pr->ptr = cp; + AddToList(pr); + pool_.insert(std::make_pair(cp->num_bytes, pr)); + } +} + +void PoolAllocator::Clear() { + if (has_size_limit_) { + mutex_lock lock(mutex_); + for (auto iter : pool_) { + PtrRecord* pr = iter.second; + for (auto v : free_visitors_) { + v(pr->ptr, pr->num_bytes); + } + allocator_->Free(pr->ptr, pr->num_bytes); + delete pr; + } + pool_.clear(); + get_from_pool_count_ = 0; + put_count_ = 0; + allocated_count_ = 0; + evicted_count_ = 0; + lru_head_ = nullptr; + lru_tail_ = nullptr; + } +} + +void PoolAllocator::RemoveFromList(PtrRecord* pr) { + if (pr->prev == nullptr) { + DCHECK_EQ(lru_head_, pr); + lru_head_ = nullptr; + } else { + pr->prev->next = pr->next; + } + if (pr->next == nullptr) { + DCHECK_EQ(lru_tail_, pr); + lru_tail_ = pr->prev; + } else { + pr->next->prev = pr->prev; + if (lru_head_ == nullptr) { + lru_head_ = pr->next; + } + } +} + +void PoolAllocator::AddToList(PtrRecord* pr) { + pr->prev = nullptr; + if (lru_head_ == nullptr) { + CHECK(lru_tail_ == nullptr); + lru_tail_ = pr; + pr->next = nullptr; + } else { + pr->next = lru_head_; + pr->next->prev = pr; + } + lru_head_ = pr; +} + +void PoolAllocator::EvictOne() { + DCHECK(lru_tail_ != nullptr); + PtrRecord* prec = lru_tail_; + RemoveFromList(prec); + auto iter = pool_.find(prec->num_bytes); + while (iter->second != prec) { + ++iter; + DCHECK(iter != pool_.end()); + } + pool_.erase(iter); + for (auto v : free_visitors_) { + v(prec->ptr, prec->num_bytes); + } + allocator_->Free(prec->ptr, prec->num_bytes); + delete prec; + ++evicted_count_; + // Auto-resizing, and warning messages. + static const double kTolerable = 2e-3; + static const int kCheckInterval = 1000; + static const double kIncreaseFactor = 1.1; + static const int kMinPoolSize = 100; + if (0 == evicted_count_ % kCheckInterval) { + const double eviction_rate = + evicted_count_ / static_cast<double>(put_count_); + const int64 alloc_request_count = allocated_count_ + get_from_pool_count_; + const double alloc_rate = + allocated_count_ / static_cast<double>(alloc_request_count); + static int log_counter = 0; + // (counter increment not thread safe but it's just for logging, so we + // don't care). + bool should_log = ((log_counter++ % 10) == 0); + if (should_log) { + LOG(WARNING) << "PoolAllocator: After " << alloc_request_count + << " get requests, put_count=" << put_count_ + << " evicted_count=" << evicted_count_ + << " eviction_rate=" << eviction_rate + << " and unsatisfied allocation rate=" << alloc_rate; + } + if (auto_resize_ && (eviction_rate > kTolerable) && + (alloc_rate > kTolerable)) { + size_t new_size_limit = (pool_size_limit_ < kMinPoolSize) + ? 
kMinPoolSize + : (kIncreaseFactor * pool_size_limit_); + if (should_log) { + LOG(INFO) << "Raising pool_size_limit_ from " << pool_size_limit_ + << " to " << new_size_limit; + } + pool_size_limit_ = new_size_limit; + // Reset all the counters so that ratios are relative to new sizes + // at next test interval. + put_count_ = 0; + allocated_count_ = 0; + evicted_count_ = 0; + get_from_pool_count_ = 0; + } + } +} + +void PoolAllocator::AddAllocVisitor(Visitor visitor) { + mutex_lock lock(mutex_); + CHECK(!allocation_begun_) + << "AddAllocVisitor may not be called after pool allocation " + << "has begun."; + alloc_visitors_.push_back(visitor); +} + +void PoolAllocator::AddFreeVisitor(Visitor visitor) { + mutex_lock lock(mutex_); + CHECK(!allocation_begun_) + << "AddFreeVisitor may not be called after pool allocation " + << "has begun."; + free_visitors_.push_back(visitor); +} + +} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator.h b/tensorflow/core/common_runtime/gpu/pool_allocator.h new file mode 100644 index 0000000000..d10aabe88a --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/pool_allocator.h @@ -0,0 +1,202 @@ +#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_POOL_ALLOCATOR_H_ +#define TENSORFLOW_COMMON_RUNTIME_GPU_POOL_ALLOCATOR_H_ + +// Simple LRU pool allocators for various flavors of CPU RAM that +// implement the VisitableAllocator interface. GPU memory is managed +// by GPURegionAllocator. + +#include <atomic> +#include <map> +#include <memory> +#include "tensorflow/core/lib/core/bits.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/common_runtime/gpu/visitable_allocator.h" +#include "tensorflow/stream_executor/stream_executor.h" + +namespace tensorflow { + +// Interface of an object that does the underlying alloc/free of memory. +class SubAllocator { + public: + virtual ~SubAllocator() {} + virtual void* Alloc(size_t alignment, size_t num_bytes) = 0; + virtual void Free(void* ptr, size_t num_bytes) = 0; +}; + +// Interface of an object that rounds up integers. +class RoundUpInterface { + public: + virtual ~RoundUpInterface() {} + virtual size_t RoundUp(size_t num_bytes) = 0; +}; + +// Size-limited pool of memory buffers obtained from a SubAllocator +// instance. Pool eviction policy is LRU. +class PoolAllocator : public VisitableAllocator { + public: + // "pool_size_limit" is the maximum number of returned, re-usable + // memory buffers to keep in the pool. If pool_size_limit == 0, the + // pool is effectively a thin wrapper around the allocator. + // If "auto_resize" is true, then the pool_size_limit will gradually + // be raised so that deallocations happen very rarely, if at all. + // Transitory start-up objects may deallocate, but the long-term + // working-set should not. Auto-resizing can raise pool_size_limit + // but will never lower it. + // "allocator" is the object that performs the underlying memory + // malloc/free operations. This object takes ownership of allocator. + PoolAllocator(size_t pool_size_limit, bool auto_resize, + SubAllocator* allocator, RoundUpInterface* size_rounder, + string name); + ~PoolAllocator() override; + + string Name() override { return name_; } + + void* AllocateRaw(size_t alignment, size_t num_bytes) override; + + void DeallocateRaw(void* ptr) override; + + // REQUIRES: The following functions may only be called prior + // to the first Allocate*() call. Once allocation has begun, it is + // illegal to register another visitor. 
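+ // For example (illustrative sketch only, not part of this change): + // pool->AddAllocVisitor([](void* ptr, size_t num_bytes) { + // // e.g. register the range [ptr, ptr + num_bytes) with a NIC driver. + // });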
+ + void AddAllocVisitor(Visitor visitor) override; + + void AddFreeVisitor(Visitor visitor) override; + + // Allocate an unused memory region of size "num_bytes". Fetch from + // the pool if available, otherwise call allocator_. + void* Get(size_t num_bytes); + + // Return a no-longer needed memory region to the pool. It is an error + // to dereference "ptr" after this call. If the pool is full, the least + // recently used region will be deallocated. + void Put(void* ptr, size_t num_bytes); + + // Reset the pool to empty. + void Clear(); + + // The following accessors permit monitoring the effectiveness of + // the pool at avoiding repeated malloc/frees on the underlying + // allocator. Read locks are not taken on the theory that value + // consistency with other threads is not important. + + // Number of Get() requests satisfied from pool. + int64 get_from_pool_count() const NO_THREAD_SAFETY_ANALYSIS { + return get_from_pool_count_; + } + // Number of Put() requests. + int64 put_count() const NO_THREAD_SAFETY_ANALYSIS { return put_count_; } + // Number of Get() requests requiring a fresh allocation. + int64 allocated_count() const NO_THREAD_SAFETY_ANALYSIS { + return allocated_count_; + } + // Number of pool evictions. + int64 evicted_count() const NO_THREAD_SAFETY_ANALYSIS { + return evicted_count_; + } + // Current size limit. + size_t size_limit() const NO_THREAD_SAFETY_ANALYSIS { + return pool_size_limit_; + } + + private: + struct PtrRecord { + void* ptr; + size_t num_bytes; + PtrRecord* prev; + PtrRecord* next; + }; + + // Remove "pr" from the double-linked LRU list. + void RemoveFromList(PtrRecord* pr) EXCLUSIVE_LOCKS_REQUIRED(mutex_); + + // Add "pr" to the head of the double-linked LRU list. + void AddToList(PtrRecord* pr) EXCLUSIVE_LOCKS_REQUIRED(mutex_); + + // Delete the least recently used record. + void EvictOne() EXCLUSIVE_LOCKS_REQUIRED(mutex_); + + const string name_; + const bool has_size_limit_; + const bool auto_resize_; + size_t pool_size_limit_; + std::unique_ptr<SubAllocator> allocator_; + std::unique_ptr<RoundUpInterface> size_rounder_; + mutex mutex_; + std::multimap<const size_t, PtrRecord*> pool_ GUARDED_BY(mutex_); + PtrRecord* lru_head_ GUARDED_BY(mutex_) = nullptr; + PtrRecord* lru_tail_ GUARDED_BY(mutex_) = nullptr; + int64 get_from_pool_count_ GUARDED_BY(mutex_) = 0; + int64 put_count_ GUARDED_BY(mutex_) = 0; + int64 allocated_count_ GUARDED_BY(mutex_) = 0; + int64 evicted_count_ GUARDED_BY(mutex_) = 0; + // Write access to these is guarded by mutex_, but not read + // access. They may only be modified prior to the first + // allocation. Later attempts to modify will fail. + std::vector<Visitor> alloc_visitors_; + std::vector<Visitor> free_visitors_; + std::atomic<bool> allocation_begun_; +}; + +// Do-nothing rounder. Passes through sizes unchanged. +class NoopRounder : public RoundUpInterface { + public: + size_t RoundUp(size_t num_bytes) override { return num_bytes; } +}; + +// Power of 2 rounder: rounds up to nearest power of 2 size.
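+// For example, RoundUp(9) == 16 and RoundUp(16) == 16 (see the Pow2Rounder +// test in pool_allocator_test.cc below).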
+class Pow2Rounder : public RoundUpInterface { + public: + size_t RoundUp(size_t num_bytes) override { + return 1uLL << Log2Ceiling64(num_bytes); + } +}; + +class BasicCPUAllocator : public SubAllocator { + public: + ~BasicCPUAllocator() override {} + + void* Alloc(size_t alignment, size_t num_bytes) override { + return port::aligned_malloc(num_bytes, alignment); + } + void Free(void* ptr, size_t num_bytes) override { free(ptr); } +}; + +// Allocator for pinned CPU RAM that is made known to CUDA for the +// purpose of efficient DMA with a GPU. +class CUDAHostAllocator : public SubAllocator { + public: + // Note: stream_exec cannot be null. + explicit CUDAHostAllocator(perftools::gputools::StreamExecutor* stream_exec) + : stream_exec_(stream_exec) { + CHECK(stream_exec_ != nullptr); + } + ~CUDAHostAllocator() override {} + + void* Alloc(size_t alignment, size_t num_bytes) override { + void* ptr = nullptr; + if (num_bytes > 0) { + ptr = stream_exec_->HostMemoryAllocate(num_bytes); + if (ptr == nullptr) { + LOG(FATAL) << "could not allocate pinned host memory of size: " + << num_bytes; + } + } + return ptr; + } + + void Free(void* ptr, size_t num_bytes) override { + if (ptr != nullptr) { + stream_exec_->HostMemoryDeallocate(ptr); + } + } + + private: + perftools::gputools::StreamExecutor* stream_exec_; // not owned, non-null + + TF_DISALLOW_COPY_AND_ASSIGN(CUDAHostAllocator); +}; + +} // namespace tensorflow +#endif // TENSORFLOW_COMMON_RUNTIME_GPU_POOL_ALLOCATOR_H_ diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc b/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc new file mode 100644 index 0000000000..ca409b2b4c --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc @@ -0,0 +1,203 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/common_runtime/gpu/pool_allocator.h" + +#include "tensorflow/stream_executor/multi_platform_manager.h" +#include "tensorflow/stream_executor/platform.h" +#include <gtest/gtest.h> + +namespace gpu = ::perftools::gputools; + +namespace tensorflow { +namespace { + +TEST(PoolAllocatorTest, ZeroSizeBuffers) { + gpu::Platform* platform = + gpu::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie(); + PoolAllocator pool( + 2 /*pool_size_limit*/, false /*auto_resize*/, + new CUDAHostAllocator( + platform->GetExecutor(gpu::StreamExecutorConfig(/*ordinal=*/0)) + .ValueOrDie()), + new NoopRounder, "pool"); + + EXPECT_EQ(nullptr, pool.AllocateRaw(4 /*alignment*/, 0 /*num_bytes*/)); + pool.DeallocateRaw(nullptr); // Should not crash. + EXPECT_EQ(0, pool.get_from_pool_count()); + EXPECT_EQ(0, pool.put_count()); + EXPECT_EQ(0, pool.allocated_count()); + EXPECT_EQ(0, pool.evicted_count()); +} + +TEST(PoolAllocatorTest, ZeroSizePool) { + gpu::Platform* platform = + gpu::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie(); + PoolAllocator pool( + 0 /*pool_size_limit*/, false /*auto_resize*/, + new CUDAHostAllocator( + platform->GetExecutor(gpu::StreamExecutorConfig(/*ordinal=*/0)) + .ValueOrDie()), + new NoopRounder, "pool"); + + EXPECT_EQ(0, pool.get_from_pool_count()); + EXPECT_EQ(0, pool.put_count()); + EXPECT_EQ(0, pool.allocated_count()); + EXPECT_EQ(0, pool.evicted_count()); + + // All allocations should bypass the pool and return valid pointers. 
+ for (int i = 0; i < 3; ++i) { + void* p0 = pool.AllocateRaw(4, 0); + void* p4 = pool.AllocateRaw(4, 4); + void* p12 = pool.AllocateRaw(4, 12); + EXPECT_EQ(nullptr, p0); + EXPECT_NE(nullptr, p4); + EXPECT_NE(nullptr, p12); + pool.DeallocateRaw(p0); + pool.DeallocateRaw(p4); + pool.DeallocateRaw(p12); + } + EXPECT_EQ(0, pool.get_from_pool_count()); + EXPECT_EQ(0, pool.put_count()); + EXPECT_EQ(0, pool.allocated_count()); + EXPECT_EQ(0, pool.evicted_count()); +} + +TEST(PoolAllocatorTest, Alignment) { + gpu::Platform* platform = + gpu::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie(); + PoolAllocator pool( + 0 /*pool_size_limit*/, false /*auto_resize*/, + new CUDAHostAllocator( + platform->GetExecutor(gpu::StreamExecutorConfig(/*ordinal=*/0)) + .ValueOrDie()), + new NoopRounder, "pool"); + for (int i = 0; i < 16; ++i) { + size_t alignment = 1 << i; + void* p = pool.AllocateRaw(alignment, 111); + EXPECT_TRUE(p != nullptr); + EXPECT_EQ(0, reinterpret_cast<int64>(p) & (alignment - 1)) + << "ptr: " << p << " alignment " << alignment; + // Intentionally don't deallocate, to test that destruction of + // the PoolAllocator frees all pending memory. + } +} + +TEST(PoolAllocatorTest, AutoResize) { + PoolAllocator pool(2 /*pool_size_limit*/, true /*auto_resize*/, + new BasicCPUAllocator, new NoopRounder, "pool"); + + // Alloc/dealloc 10 sizes just a few times, confirming pool size + // stays at 2. + for (int i = 0; i < 10; ++i) { + void* p = pool.AllocateRaw(4, 64 << i); + pool.DeallocateRaw(p); + } + EXPECT_EQ(0, pool.get_from_pool_count()); + EXPECT_EQ(10, pool.allocated_count()); + EXPECT_EQ(10, pool.put_count()); + EXPECT_EQ(8, pool.evicted_count()); + EXPECT_EQ(2, pool.size_limit()); + + // Then repeat 1200 times. Pool size limit should jump to 100. + for (int j = 0; j < 120; ++j) { + for (int i = 0; i < 10; ++i) { + void* p = pool.AllocateRaw(4, 64 << i); + pool.DeallocateRaw(p); + } + } + EXPECT_EQ(100, pool.size_limit()); +} + +TEST(PoolAllocatorTest, CudaHostAllocator) { + gpu::Platform* platform = + gpu::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie(); + PoolAllocator pool( + 2 /*pool_size_limit*/, false /*auto_resize*/, + new CUDAHostAllocator( + platform->GetExecutor(gpu::StreamExecutorConfig(/*ordinal=*/0)) + .ValueOrDie()), + new NoopRounder, "pool"); + + // Repeatedly Get a 16-byte value, confirming that there's only + // one real allocation. + void* p1_16 = pool.AllocateRaw(4, 16); + EXPECT_EQ(0, pool.get_from_pool_count()); + EXPECT_EQ(1, pool.allocated_count()); + EXPECT_NE(nullptr, p1_16); + pool.DeallocateRaw(p1_16); + // Pool contents {16} + EXPECT_EQ(1, pool.put_count()); + void* p2_16 = pool.AllocateRaw(4, 16); // Get it again. + EXPECT_EQ(1, pool.get_from_pool_count()); + EXPECT_EQ(1, pool.allocated_count()); + EXPECT_EQ(p1_16, p2_16); // Same pointer value + pool.DeallocateRaw(p2_16); // Put it back. + // Pool contents {16} + EXPECT_EQ(2, pool.put_count()); + + // Get two more values of different sizes. + void* p3_4 = pool.AllocateRaw(4, 4); + EXPECT_EQ(2, pool.allocated_count()); + EXPECT_NE(p1_16, p3_4); // Different pointer value + EXPECT_NE(nullptr, p3_4); + pool.DeallocateRaw(p3_4); // Put it back. Pool is now full. + // Pool contents {4, 16} + EXPECT_EQ(3, pool.put_count()); + void* p4_2 = pool.AllocateRaw(4, 2); // Get a third size buffer. + EXPECT_NE(nullptr, p4_2); + EXPECT_EQ(0, pool.evicted_count()); + + // The pool is full: when we put back p4_2, the 16-byte buffer + // should be evicted since it was least recently inserted. 
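+ // (The eviction happens inside DeallocateRaw: the pool already holds {4, 16} + // and pool_size_limit is 2, so putting the 2-byte buffer back evicts the LRU + // entry, which is the 16-byte buffer.)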
+ pool.DeallocateRaw(p4_2); + // Pool contents {2, 4} + EXPECT_EQ(4, pool.put_count()); + EXPECT_EQ(1, pool.evicted_count()); + + // Re-getting and putting size 2 or 4 should not alter pool size or + // num-evicted. + void* p5_4 = pool.AllocateRaw(4, 4); + EXPECT_NE(nullptr, p5_4); + pool.DeallocateRaw(p5_4); + void* p6_2 = pool.AllocateRaw(4, 2); + EXPECT_NE(nullptr, p6_2); + pool.DeallocateRaw(p6_2); + EXPECT_EQ(3, pool.get_from_pool_count()); + EXPECT_EQ(6, pool.put_count()); + EXPECT_EQ(3, pool.allocated_count()); + EXPECT_EQ(1, pool.evicted_count()); + + pool.Clear(); + EXPECT_EQ(0, pool.get_from_pool_count()); + EXPECT_EQ(0, pool.put_count()); + EXPECT_EQ(0, pool.allocated_count()); + EXPECT_EQ(0, pool.evicted_count()); +} + +TEST(PoolAllocatorTest, Pow2Rounder) { + Pow2Rounder rounder; + EXPECT_EQ(1, rounder.RoundUp(1)); + EXPECT_EQ(2, rounder.RoundUp(2)); + EXPECT_EQ(16, rounder.RoundUp(9)); + EXPECT_EQ(16, rounder.RoundUp(16)); + EXPECT_EQ(65536, rounder.RoundUp(41234)); + EXPECT_EQ(65536, rounder.RoundUp(65535)); + EXPECT_EQ(65536, rounder.RoundUp(65536)); +} + +TEST(PoolAllocatorTest, Name) { + gpu::Platform* platform = + gpu::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie(); + PoolAllocator pool( + 2 /*pool_size_limit*/, false /*auto_resize*/, + new CUDAHostAllocator( + platform->GetExecutor(gpu::StreamExecutorConfig(/*ordinal=*/0)) + .ValueOrDie()), + new NoopRounder, "pool"); + EXPECT_EQ("pool", pool.Name()); +} + +} // namespace +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/common_runtime/gpu/process_state.cc b/tensorflow/core/common_runtime/gpu/process_state.cc new file mode 100644 index 0000000000..70ac6130c2 --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/process_state.cc @@ -0,0 +1,220 @@ +#include "tensorflow/core/common_runtime/gpu/process_state.h" + +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/common_runtime/gpu/gpu_init.h" +#include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h" +#include "tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h" +#include "tensorflow/core/common_runtime/gpu/gpu_region_allocator.h" +#include "tensorflow/core/common_runtime/gpu/pool_allocator.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/stream_executor/multi_platform_manager.h" + +#if defined(PLATFORM_GOOGLE) +DEFINE_bool(record_mem_types, false, + "If true, record attributes of memory allocations and " + "dynamically check for appropriate use of registered memory."
+ "Should only be true for debugging or diagnosis of " + "performance issues."); +DEFINE_bool(brain_mem_reg_cuda_dma, true, + "If true, register CPU RAM used to copy to/from GPU RAM " + "with the CUDA driver."); +DEFINE_bool(brain_gpu_use_bfc_allocator, false, + "If true, uses the Best-Fit GPU allocator."); +DEFINE_bool(brain_gpu_region_allocator_debug, false, + "If true, checks for memory overwrites by writing " + "distinctive patterns on both ends of allocated memory."); +DEFINE_bool(brain_gpu_region_allocator_reset_to_nan, false, + "If true, initializes all new Malloc buffers to NaN, " + "and resets the buffer to NaN upon Free."); + +#else +bool FLAGS_record_mem_types = false; +bool FLAGS_brain_mem_reg_cuda_dma = true; +bool FLAGS_brain_gpu_region_allocator_debug = false; +bool FLAGS_brain_gpu_region_allocator_reset_to_nan = false; +bool FLAGS_brain_gpu_use_bfc_allocator = false; +#endif + +namespace gpu = ::perftools::gputools; + +namespace tensorflow { + +ProcessState* ProcessState::instance_ = nullptr; + +/*static*/ ProcessState* ProcessState::singleton() { + if (instance_ == nullptr) { + instance_ = new ProcessState; + } + + return instance_; +} + +ProcessState::ProcessState() : gpu_count_(0) { + CHECK(instance_ == nullptr); + instance_ = this; +} + +ProcessState::~ProcessState() { + for (auto p : gpu_allocators_) { + delete p; + } + instance_ = nullptr; +} + +string ProcessState::MemDesc::DebugString() { + return strings::StrCat((loc == CPU ? "CPU " : "GPU "), dev_index, ", dma: ", + gpu_registered, ", nic: ", nic_registered); +} + +ProcessState::MemDesc ProcessState::PtrType(const void* ptr) { + if (FLAGS_record_mem_types) { + auto iter = mem_desc_map_.find(ptr); + if (iter != mem_desc_map_.end()) { + return iter->second; + } + } + return MemDesc(); +} + +void ProcessState::SetGPUCount(int c) { + CHECK(gpu_count_ == 0 || gpu_count_ == c) + << "Cannot call SetGPUCount with a non-zero value " + << "not equal to prior set value."; + gpu_count_ = c; +} + +int ProcessState::GPUCount() const { return gpu_count_; } + +Allocator* ProcessState::GetGPUAllocator(int gpu_id, size_t total_bytes) { +#if GOOGLE_CUDA + mutex_lock lock(mu_); + gpu::Platform* gpu_platform = GPUMachineManager(); + + // Verify that gpu_id is legitimate. + CHECK_LT(gpu_id, gpu_platform->VisibleDeviceCount()) + << "gpu_id is outside discovered device range"; + + if (gpu_id >= static_cast<int64>(gpu_allocators_.size())) { + gpu_allocators_.resize(gpu_id + 1); + if (FLAGS_record_mem_types) gpu_al_.resize(gpu_id + 1); + } + + if (gpu_allocators_[gpu_id] == nullptr) { + VisitableAllocator* gpu_allocator; + + if (FLAGS_brain_gpu_use_bfc_allocator) { + gpu_allocator = new GPUBFCAllocator(gpu_id, total_bytes); + } else { + gpu_allocator = new GPURegionAllocator(gpu_id, total_bytes); + } + + if (FLAGS_brain_gpu_region_allocator_debug) { + gpu_allocator = new GPUDebugAllocator(gpu_allocator, gpu_id); + } + if (FLAGS_brain_gpu_region_allocator_reset_to_nan) { + gpu_allocator = new GPUNanResetAllocator(gpu_allocator, gpu_id); + } + + gpu_allocators_[gpu_id] = gpu_allocator; + + // If there are any pending AllocVisitors for this bus, add + // them now. 
+ gpu::StreamExecutor* se = + gpu_platform->ExecutorForDevice(gpu_id).ValueOrDie(); + int bus_id = se->GetDeviceDescription().numa_node(); + if (bus_id < static_cast<int64>(gpu_visitors_.size())) { + for (auto v : gpu_visitors_[bus_id]) { + gpu_allocators_[gpu_id]->AddAllocVisitor(v); + } + } + if (FLAGS_record_mem_types) { + MemDesc md; + md.loc = MemDesc::GPU; + md.dev_index = gpu_id; + md.gpu_registered = false; + md.nic_registered = true; + if (static_cast<int64>(gpu_al_.size()) <= gpu_id) + gpu_al_.resize(gpu_id + 1); + gpu_al_[gpu_id] = new internal::RecordingAllocator( + &mem_desc_map_, gpu_allocators_[gpu_id], md, &mu_); + } + } + if (FLAGS_record_mem_types) return gpu_al_[gpu_id]; + return gpu_allocators_[gpu_id]; +#else + LOG(FATAL) << "GPUAllocator unavailable. Not compiled with --config=cuda."; + return nullptr; +#endif // GOOGLE_CUDA +} + +Allocator* ProcessState::GetCPUAllocator(int numa_node) { + // Although we're temporarily ignoring numa_node, check for legality. + CHECK_GE(numa_node, 0); + // TODO(tucker): actually maintain separate CPUAllocators for + // different numa_nodes. For now, just one. + numa_node = 0; + mutex_lock lock(mu_); + while (cpu_allocators_.size() <= static_cast<size_t>(numa_node)) { + cpu_allocators_.push_back(new PoolAllocator( + 100 /*pool_size_limit*/, true /*auto_resize*/, new BasicCPUAllocator(), + new NoopRounder, "cpu_pool")); + } + return cpu_allocators_[0]; +} + +Allocator* ProcessState::GetCUDAHostAllocator(int numa_node) { + if (gpu_count_ == 0 || !FLAGS_brain_mem_reg_cuda_dma) { + return GetCPUAllocator(numa_node); + } + // Although we're temporarily ignoring numa_node, check for legality. + CHECK_GE(numa_node, 0); + // TODO(tucker): actually maintain separate CPUAllocators for + // different numa_nodes. For now, just one. + numa_node = 0; + mutex_lock lock(mu_); + while (static_cast<int>(cuda_host_allocators_.size()) <= numa_node) { + // CUDAHost allocation is the same across all GPUs, so just get the + // executor for the first device.
+ gpu::Platform* gpu_platform = GPUMachineManager(); + gpu::StreamExecutor* se = gpu_platform->ExecutorForDevice(0).ValueOrDie(); + CHECK(se); + cuda_host_allocators_.push_back(new PoolAllocator( + 100 /*pool_size_limit*/, true /*auto_resize*/, + new CUDAHostAllocator(se), new Pow2Rounder, "cuda_host")); + if (FLAGS_record_mem_types) { + MemDesc md; + md.loc = MemDesc::CPU; + md.dev_index = 0; + md.gpu_registered = true; + md.nic_registered = false; + cuda_al_.push_back(new internal::RecordingAllocator( + &mem_desc_map_, cuda_host_allocators_.back(), md, &mu_)); + } + } + if (FLAGS_record_mem_types) return cuda_al_[0]; + return cuda_host_allocators_[0]; +} + +void ProcessState::AddGPUAllocVisitor(int bus_id, AllocVisitor visitor) { +#if GOOGLE_CUDA + mutex_lock lock(mu_); + gpu::Platform* gpu_platform = GPUMachineManager(); + for (int gpu_id = 0; gpu_id < static_cast<int64>(gpu_allocators_.size()); + ++gpu_id) { + gpu::StreamExecutor* se = + gpu_platform->ExecutorForDevice(gpu_id).ValueOrDie(); + if (gpu_allocators_[gpu_id] && + se->GetDeviceDescription().numa_node() == bus_id) { + gpu_allocators_[gpu_id]->AddAllocVisitor(visitor); + } + } + while (bus_id >= static_cast<int64>(gpu_visitors_.size())) { + gpu_visitors_.push_back(std::vector<AllocVisitor>()); + } + gpu_visitors_[bus_id].push_back(visitor); +#endif // GOOGLE_CUDA +} + +} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/gpu/process_state.h b/tensorflow/core/common_runtime/gpu/process_state.h new file mode 100644 index 0000000000..527d12c10d --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/process_state.h @@ -0,0 +1,140 @@ +#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_PROCESS_STATE_H_ +#define TENSORFLOW_COMMON_RUNTIME_GPU_PROCESS_STATE_H_ + +#include <functional> +#include <unordered_map> +#include <vector> + +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/platform/thread_annotations.h" + +namespace tensorflow { + +class Allocator; +class VisitableAllocator; +class PoolAllocator; + +// Singleton that manages per-process state, e.g. allocation +// of shared resources. +class ProcessState { + public: + static ProcessState* singleton(); + + // Descriptor for memory allocation attributes, used by optional + // runtime correctness analysis logic. + struct MemDesc { + enum MemLoc { CPU, GPU }; + MemLoc loc; + int dev_index; + bool gpu_registered; + bool nic_registered; + MemDesc() + : loc(CPU), + dev_index(0), + gpu_registered(false), + nic_registered(false) {} + string DebugString(); + }; + + // Records the number of GPUs available in the local process. + // It is a fatal error to call this with a value different from + // the value set in a prior call. + void SetGPUCount(int c); + + // Returns number of GPUs available in local process, as set by + // SetGPUCount(). Returns 0 if SetGPUCount has not been called. + int GPUCount() const; + + // Returns what we know about the memory at ptr. + // If we know nothing, it's called CPU 0 with no other attributes. + MemDesc PtrType(const void* ptr); + + // Returns the one CPUAllocator used for the given numa_node. + // TEMPORARY: ignores numa_node. + Allocator* GetCPUAllocator(int numa_node); + + // Returns the one GPU allocator used for the indexed GPU. + // Note that this is a system GPU index, not (necessarily) a brain + // device index. + // + // 'total_bytes' is the total number of bytes that should be made + // available to the allocator.
The first call to this function for + // a given gpu_id creates the allocator, so only the total_bytes + // passed on that first call has any effect. + // + // REQUIRES: gpu_id must be a valid ordinal for a GPU available in the + // current system environment. Otherwise returns nullptr. + Allocator* GetGPUAllocator(int gpu_id, size_t total_bytes); + + Allocator* GetCUDAHostAllocator(int numa_node); + + // Registers a function to be called once on every new Region + // allocated by every GPURegionAllocator proximate to the specified + // bus. The AllocVisitor is provided with a memory pointer and the + // size of the area it identifies. The pointer is not guaranteed to + // be valid after the call terminates. The intention is for this + // interface to be used for network device memory registration. + // "bus_id" is platform-specific. On many platforms it + // should be 0. On machines with multiple PCIe buses, it should be + // the index of one of the PCIe buses. If the bus_id is invalid, + // results are undefined. + typedef std::function<void(void*, size_t)> AllocVisitor; + void AddGPUAllocVisitor(int bus_id, AllocVisitor visitor); + + typedef std::unordered_map<const void*, MemDesc> MDMap; + + protected: + ProcessState(); + + static ProcessState* instance_; + + mutex mu_; + int gpu_count_; + + std::vector<PoolAllocator*> cpu_allocators_ GUARDED_BY(mu_); + std::vector<VisitableAllocator*> gpu_allocators_ GUARDED_BY(mu_); + std::vector<std::vector<AllocVisitor>> gpu_visitors_ GUARDED_BY(mu_); + std::vector<PoolAllocator*> cuda_host_allocators_ GUARDED_BY(mu_); + + virtual ~ProcessState(); + + // Optional RecordingAllocators that wrap the corresponding + // Allocators for runtime attribute use analysis. + MDMap mem_desc_map_; + std::vector<Allocator*> cpu_al_ GUARDED_BY(mu_); + std::vector<Allocator*> gpu_al_ GUARDED_BY(mu_); + std::vector<Allocator*> cuda_al_ GUARDED_BY(mu_); +}; + +namespace internal { +class RecordingAllocator : public Allocator { + public: + RecordingAllocator(ProcessState::MDMap* mm, Allocator* a, + ProcessState::MemDesc md, mutex* mu) + : mm_(mm), a_(a), md_(md), mu_(mu) {} + + string Name() override { return a_->Name(); } + void* AllocateRaw(size_t alignment, size_t num_bytes) override { + void* p = a_->AllocateRaw(alignment, num_bytes); + mutex_lock l(*mu_); + (*mm_)[p] = md_; + return p; + } + void DeallocateRaw(void* p) override { + mutex_lock l(*mu_); + auto iter = mm_->find(p); + mm_->erase(iter); + a_->DeallocateRaw(p); + } + bool TracksAllocationSizes() override { return a_->TracksAllocationSizes(); } + size_t RequestedSize(void* p) override { return a_->RequestedSize(p); } + size_t AllocatedSize(void* p) override { return a_->AllocatedSize(p); } + ProcessState::MDMap* mm_; // not owned + Allocator* a_; // not owned + ProcessState::MemDesc md_; + mutex* mu_; +}; +} // namespace internal +} // namespace tensorflow +#endif // TENSORFLOW_COMMON_RUNTIME_GPU_PROCESS_STATE_H_ diff --git a/tensorflow/core/common_runtime/gpu/visitable_allocator.h b/tensorflow/core/common_runtime/gpu/visitable_allocator.h new file mode 100644 index 0000000000..23feed9aab --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/visitable_allocator.h @@ -0,0 +1,30 @@ +#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_VISITABLE_ALLOCATOR_H_ +#define TENSORFLOW_COMMON_RUNTIME_GPU_VISITABLE_ALLOCATOR_H_ + +#include <functional> +#include "tensorflow/core/framework/allocator.h" + +namespace tensorflow { + +// Subclass VisitableAllocator instead of Allocator when a memory +// allocator needs to enable some kind
of registration/deregistration +// of memory areas. +class VisitableAllocator : public Allocator { + public: + // Visitor gets called with a pointer to a memory area and its + // size in bytes. + typedef std::function<void(void*, size_t)> Visitor; + + // Register a visitor guaranteed to be called exactly once on each + // chunk of memory newly allocated from the underlying device. + // Typically, chunks will be reused and possibly sub-divided by a + // pool manager, so the calls will happen only once per process + // execution, not once per tensor (re)allocation. + virtual void AddAllocVisitor(Visitor visitor) = 0; + + // Register a visitor guaranteed to be called on each chunk of + // memory returned to the underlying device. + virtual void AddFreeVisitor(Visitor visitor) = 0; +}; +} // namespace tensorflow +#endif // TENSORFLOW_COMMON_RUNTIME_GPU_VISITABLE_ALLOCATOR_H_ diff --git a/tensorflow/core/common_runtime/gpu_device_context.h b/tensorflow/core/common_runtime/gpu_device_context.h new file mode 100644 index 0000000000..03fd9a97c3 --- /dev/null +++ b/tensorflow/core/common_runtime/gpu_device_context.h @@ -0,0 +1,45 @@ +#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_DEVICE_CONTEXT_H_ +#define TENSORFLOW_COMMON_RUNTIME_GPU_DEVICE_CONTEXT_H_ + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/framework/device_base.h" + +namespace perftools { +namespace gputools { +class Stream; +} // namespace gputools +} // namespace perftools + +namespace tensorflow { + +namespace gpu = ::perftools::gputools; + +class GPUDeviceContext : public DeviceContext { + public: + GPUDeviceContext(int stream_id, gpu::Stream* stream) + : stream_id_(stream_id), stream_(stream) {} + + ~GPUDeviceContext() override {} + + gpu::Stream* stream() const override { return stream_; } + int stream_id() const { return stream_id_; } + + void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device, + Tensor* device_tensor, + StatusCallback done) const override; + + void CopyDeviceTensorToCPU(const Tensor* device_tensor, + const string& edge_name, Device* device, + Tensor* cpu_tensor, StatusCallback done) override; + + void MaintainLifetimeOnStream( + const Tensor* t, perftools::gputools::Stream* stream) const override {} + + private: + int stream_id_; + gpu::Stream* stream_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMMON_RUNTIME_GPU_DEVICE_CONTEXT_H_ diff --git a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc new file mode 100644 index 0000000000..28afc95c1b --- /dev/null +++ b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc @@ -0,0 +1,160 @@ +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/framework/op_segment.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/util/device_name_utils.h" +#include "tensorflow/core/lib/core/notification.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/session_options.h" + +#if defined(PLATFORM_GOOGLE) 
+DECLARE_bool(brain_gpu_use_bfc_allocator); +#else +extern bool FLAGS_brain_gpu_use_bfc_allocator; +#endif + +namespace tensorflow { +namespace test { + +Benchmark::Benchmark(const string& device, Graph* g, + const SessionOptions* options, Graph* init) { + RequireDefaultOps(); + + FLAGS_brain_gpu_use_bfc_allocator = true; + + SessionOptions default_options; + if (!options) { + options = &default_options; + } + + testing::StopTiming(); + string t = str_util::Uppercase(device); + device_ = + DeviceFactory::NewDevice(t, *options, "/job:localhost/replica:0/task:0"); + CHECK(device_) << "Could not create a " << device << " device"; + + pool_ = new thread::ThreadPool(options->env, "blocking", + port::NumSchedulableCPUs()); + + auto runner = [this](std::function<void()> closure) { + pool_->Schedule(closure); + }; + + rendez_ = NewLocalRendezvous(); + + if (init) { + Executor* init_exec; + TF_CHECK_OK(NewLocalExecutor( + { + device_, nullptr, false, + [this](const NodeDef& ndef, OpKernel** kernel) { + return CreateNonCachedKernel(device_, nullptr, ndef, kernel); + }, + [](OpKernel* kernel) { DeleteNonCachedKernel(kernel); }, + }, + init, &init_exec)); + Executor::Args args; + args.rendezvous = rendez_; + args.runner = runner; + TF_CHECK_OK(init_exec->Run(args)); + delete init_exec; + } + + TF_CHECK_OK(NewLocalExecutor( + { + device_, + nullptr, + false, + [this](const NodeDef& ndef, OpKernel** kernel) { + return CreateNonCachedKernel(device_, nullptr, ndef, kernel); + }, + [](OpKernel* kernel) { DeleteNonCachedKernel(kernel); }, + }, + g, &exec_)); +} + +Benchmark::~Benchmark() { + if (device_) { + rendez_->Unref(); + delete exec_; + delete device_; + delete pool_; + } +} + +void Benchmark::Run(int iters) { RunWithArgs({}, {}, iters); } + +string GetRendezvousKey(const Node* node) { + string send_device; + TF_CHECK_OK(GetNodeAttr(node->def(), "send_device", &send_device)); + string recv_device; + TF_CHECK_OK(GetNodeAttr(node->def(), "recv_device", &recv_device)); + string tensor_name; + TF_CHECK_OK(GetNodeAttr(node->def(), "tensor_name", &tensor_name)); + uint64 send_device_incarnation; + TF_CHECK_OK(GetNodeAttr(node->def(), "send_device_incarnation", + reinterpret_cast<int64*>(&send_device_incarnation))); + return Rendezvous::CreateKey(send_device, send_device_incarnation, + recv_device, tensor_name, FrameAndIter(0, 0)); +} + +void Benchmark::RunWithArgs( + const std::vector<std::pair<const Node*, Tensor>>& inputs, + const std::vector<const Node*>& outputs, int iters) { + if (device_) { + // Gets inputs' and outputs' rendezvous keys. + std::vector<std::pair<string, Tensor>> in; + for (const auto& p : inputs) { + in.push_back({GetRendezvousKey(p.first), p.second}); + } + std::vector<string> out; + for (const auto& n : outputs) { + out.push_back(GetRendezvousKey(n)); + } + Tensor unused; // In benchmark, we don't care the return value. 
+ bool is_dead; + + // Warm up + Executor::Args args; + args.rendezvous = rendez_; + args.runner = [this](std::function<void()> closure) { + pool_->Schedule(closure); + }; + for (int i = 0; i < 3; ++i) { + for (const auto& p : in) { + rendez_->Send(p.first, Rendezvous::Args(), p.second, false); + } + TF_CHECK_OK(exec_->Run(args)); + for (const string& key : out) { + rendez_->Recv(key, Rendezvous::Args(), &unused, &is_dead); + } + } + TF_CHECK_OK(device_->Sync()); + + testing::StartTiming(); + while (iters-- > 0) { + for (const auto& p : in) { + rendez_->Send(p.first, Rendezvous::Args(), p.second, false); + } + TF_CHECK_OK(exec_->Run(args)); + for (const string& key : out) { + rendez_->Recv(key, Rendezvous::Args(), &unused, &is_dead); + } + } + + TF_CHECK_OK(device_->Sync()); + testing::StopTiming(); + } +} + +} // end namespace test +} // end namespace tensorflow diff --git a/tensorflow/core/common_runtime/kernel_benchmark_testlib.h b/tensorflow/core/common_runtime/kernel_benchmark_testlib.h new file mode 100644 index 0000000000..5ebe13e1d4 --- /dev/null +++ b/tensorflow/core/common_runtime/kernel_benchmark_testlib.h @@ -0,0 +1,52 @@ +#ifndef TENSORFLOW_COMMON_RUNTIME_KERNEL_BENCHMARK_TESTLIB_H_ +#define TENSORFLOW_COMMON_RUNTIME_KERNEL_BENCHMARK_TESTLIB_H_ + +#include <string> +#include <vector> + +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/common_runtime/executor.h" +#include "tensorflow/core/graph/testlib.h" +#include "tensorflow/core/public/tensor.h" + +namespace tensorflow { + +class Device; +class SessionOptions; + +namespace test { + +class Benchmark { + public: + // "device" must be either "cpu" or "gpu". Takes ownership of "g" + // and "init". + Benchmark(const string& device, Graph* g, + const SessionOptions* options = nullptr, Graph* init = nullptr); + ~Benchmark(); + + // Executes the graph for "iters" times. + void Run(int iters); + + // If "g" contains send/recv nodes, before each execution, we send + // inputs to the corresponding recv nodes in the graph, after each + // execution, we recv outputs from the corresponding send nodes in + // the graph. In the benchmark, we throw away values returned by the + // graph. 
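+ // Typical use (a sketch; "g", "in", "out" and "t" are hypothetical values + // built by the caller): + // test::Benchmark("cpu", g).RunWithArgs({{in, t}}, {out}, iters);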
+ void RunWithArgs(const std::vector<std::pair<const Node*, Tensor>>& inputs, + const std::vector<const Node*>& outputs, int iters); + + private: + thread::ThreadPool* pool_ = nullptr; + thread::ThreadPool* non_blocking_pool_ = nullptr; + Device* device_ = nullptr; + Rendezvous* rendez_ = nullptr; + Executor* exec_ = nullptr; + + TF_DISALLOW_COPY_AND_ASSIGN(Benchmark); +}; + +} // end namespace test +} // end namespace tensorflow + +#endif // TENSORFLOW_COMMON_RUNTIME_KERNEL_BENCHMARK_TESTLIB_H_ diff --git a/tensorflow/core/common_runtime/local_device.cc b/tensorflow/core/common_runtime/local_device.cc new file mode 100644 index 0000000000..6a75346805 --- /dev/null +++ b/tensorflow/core/common_runtime/local_device.cc @@ -0,0 +1,51 @@ +#define EIGEN_USE_THREADS + +#include "tensorflow/core/common_runtime/eigen_thread_pool.h" +#include "tensorflow/core/common_runtime/local_device.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/public/session_options.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +namespace { + +DeviceBase::CpuWorkerThreads eigen_worker_threads; +Eigen::ThreadPoolInterface* eigen_thread_pool = nullptr; +Eigen::ThreadPoolDevice* eigen_device = nullptr; + +static bool InitModule(const SessionOptions& options) { + int32 intra_op_parallelism_threads = + options.config.intra_op_parallelism_threads(); + if (intra_op_parallelism_threads == 0) { + intra_op_parallelism_threads = port::NumSchedulableCPUs(); + } + LOG(INFO) << "Local device intra op parallelism threads: " + << intra_op_parallelism_threads; + eigen_worker_threads.num_threads = intra_op_parallelism_threads; + eigen_worker_threads.workers = new thread::ThreadPool( + options.env, "Eigen", intra_op_parallelism_threads); + eigen_thread_pool = new EigenThreadPoolWrapper(eigen_worker_threads.workers); + eigen_device = new Eigen::ThreadPoolDevice(eigen_thread_pool, + eigen_worker_threads.num_threads); + return true; +} +} // end namespace + +// LocalDevice ---------------------------------------------------------------- + +LocalDevice::LocalDevice(const SessionOptions& options, + const DeviceAttributes& attributes, + Allocator* device_allocator) + : Device(options.env, attributes, device_allocator) { + // All ThreadPoolDevices in the process will use this single fixed + // sized threadpool for numerical computations. + static bool init = InitModule(options); + CHECK(init); // Avoids compiler warning that init is unused. + set_tensorflow_cpu_worker_threads(&eigen_worker_threads); + set_eigen_cpu_device(eigen_device); +} + +} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/local_device.h b/tensorflow/core/common_runtime/local_device.h new file mode 100644 index 0000000000..fc4cfc2dfc --- /dev/null +++ b/tensorflow/core/common_runtime/local_device.h @@ -0,0 +1,27 @@ +#ifndef TENSORFLOW_COMMON_RUNTIME_LOCAL_DEVICE_H_ +#define TENSORFLOW_COMMON_RUNTIME_LOCAL_DEVICE_H_ + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/framework/device_attributes.pb.h" + +namespace tensorflow { + +class SessionOptions; + +// This class is shared by ThreadPoolDevice and GPUDevice and +// initializes a shared Eigen compute device used by both. This +// should eventually be removed once we refactor ThreadPoolDevice and +// GPUDevice into more 'process-wide' abstractions. 
+class LocalDevice : public Device { + public: + LocalDevice(const SessionOptions& options, const DeviceAttributes& attributes, + Allocator* device_allocator); + ~LocalDevice() override {} + + private: + TF_DISALLOW_COPY_AND_ASSIGN(LocalDevice); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMMON_RUNTIME_LOCAL_DEVICE_H_ diff --git a/tensorflow/core/common_runtime/local_session.cc b/tensorflow/core/common_runtime/local_session.cc new file mode 100644 index 0000000000..ab6993b8a2 --- /dev/null +++ b/tensorflow/core/common_runtime/local_session.cc @@ -0,0 +1,500 @@ +#include "tensorflow/core/common_runtime/local_session.h" + +#include <string> +#include <vector> + +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/executor.h" +#include "tensorflow/core/common_runtime/rendezvous_mgr.h" +#include "tensorflow/core/common_runtime/session_factory.h" +#include "tensorflow/core/common_runtime/simple_placer.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/graph/algorithm.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/graph/graph_constructor.h" +#include "tensorflow/core/graph/graph_partition.h" +#include "tensorflow/core/graph/subgraph.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/notification.h" +#include "tensorflow/core/lib/core/refcount.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/gtl/stl_util.h" +#include "tensorflow/core/lib/random/random.h" +#include "tensorflow/core/lib/strings/numbers.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/public/status.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/util/device_name_utils.h" + +namespace tensorflow { + +namespace { + +thread::ThreadPool* kernel_thread_pool_ = nullptr; +static bool InitModule(const SessionOptions& options) { + int32 inter_op_parallelism_threads = + options.config.inter_op_parallelism_threads(); + if (inter_op_parallelism_threads == 0) { + // Default to using the number of cores available in the process. + inter_op_parallelism_threads = port::NumSchedulableCPUs(); + } + LOG(INFO) << "Local session inter op parallelism threads: " + << inter_op_parallelism_threads; + kernel_thread_pool_ = new thread::ThreadPool(options.env, "Compute", + inter_op_parallelism_threads); + return true; +} + +// TODO(vrv): Figure out how to unify the many different functions +// that generate RendezvousKey, since many of them have to be +// consistent with each other. +string GetRendezvousKey(const string& tensor_name, + const DeviceAttributes& device_info, + const FrameAndIter& frame_iter) { + return strings::StrCat(device_info.name(), ";", + strings::FpToString(device_info.incarnation()), ";", + device_info.name(), ";", tensor_name, ";", + frame_iter.frame_id, ":", frame_iter.iter_id); +} + +// NOTE: On Android with a single device, there is never +// a risk of an OpKernel blocking indefinitely: +// +// 1) No operations do I/O that depends on other simultaneous kernels, +// +// 2) Recv nodes always complete immediately: The inputs are sent into +// the local rendezvous before we start the executor, so the +// corresponding recvs will not block.
+// +// Based on these assumptions, we can use the same thread pool for +// both "non-blocking" and "blocking" OpKernels on Android. +// +// This may change down the road when we add support for multiple +// devices that run concurrently, in which case we will need to +// revisit this decision. +void SchedClosure(std::function<void()> c) { +// TODO(sanjay): Get rid of __ANDROID__ path +#ifdef __ANDROID__ + // On Android, there is no implementation of ThreadPool that takes + // std::function, only Closure, which we cannot easily convert. + // + // Instead, we just run the function in-line, which is currently + // safe given the reasoning above. + c(); +#else + kernel_thread_pool_->Schedule(c); +#endif // __ANDROID__ +} + +} // namespace + +LocalSession::LocalSession(const SessionOptions& options, + const DeviceMgr* device_mgr) + : options_(options), + device_mgr_(device_mgr), + cancellation_manager_(new CancellationManager()) { + static bool init = InitModule(options); + CHECK(init); // Avoids compiler warning that init is unused. + session_handle_ = strings::FpToString(random::New64()); + int devices_added = 0; + if (options.config.log_device_placement()) { + const string mapping_str = device_mgr_->DeviceMappingString(); + printf("Device mapping:\n%s", mapping_str.c_str()); + LOG(INFO) << "Device mapping:\n" << mapping_str; + } + for (auto d : device_mgr_->ListDevices()) { + devices_.push_back(d); + device_set_.AddDevice(d); + d->op_segment()->AddHold(session_handle_); + + // The first device added is special: it is the 'client device' (a + // CPU device) from which we feed and fetch Tensors. + if (devices_added == 0) { + device_set_.set_client_device(d); + } + ++devices_added; + } +} + +LocalSession::~LocalSession() { + for (auto d : device_mgr_->ListDevices()) { + d->op_segment()->RemoveHold(session_handle_); + } + for (auto it : executors_) { + delete it.second; + } + delete cancellation_manager_; +} + +Status LocalSession::Create(const GraphDef& graph) { + mutex_lock l(graph_def_lock_); + if (graph_created_) { + return errors::AlreadyExists( + "A Graph has already been created for this session."); + } + return ExtendLocked(graph); +} + +Status LocalSession::Extend(const GraphDef& graph) { + mutex_lock l(graph_def_lock_); + return ExtendLocked(graph); +} + +Status LocalSession::ExtendLocked(const GraphDef& graph) { + graph_created_ = true; // In case this is first call + graph_def_.MergeFrom(graph); + return Status::OK(); +} + +Status LocalSession::Run(const std::vector<std::pair<string, Tensor>>& inputs, + const std::vector<string>& output_names, + const std::vector<string>& target_nodes, + std::vector<Tensor>* outputs) { + { + mutex_lock l(graph_def_lock_); + if (!graph_created_) { + return errors::InvalidArgument( + "Session was not created with a graph before Run()!"); + } + } + + // Extract the inputs names for this run of the session. + std::vector<string> input_tensor_names; + input_tensor_names.reserve(inputs.size()); + for (const auto& it : inputs) { + input_tensor_names.push_back(it.first); + } + + // Check if we already have an executor for these arguments. + ExecutorsAndKeys* executors_and_keys; + Status s = GetOrCreateExecutors(input_tensor_names, output_names, + target_nodes, &executors_and_keys); + if (!s.ok()) { + return s; + } + + IntraProcessRendezvous* rendez = + new IntraProcessRendezvous(device_mgr_.get()); + core::ScopedUnref rendez_unref(rendez); + + // Insert the input tensors into the local rendezvous by their + // rendezvous key. 
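+ // (Each key has the form produced by GetRendezvousKey() above, i.e. + // "<device>;<incarnation>;<device>;<tensor_name>;<frame_id>:<iter_id>".)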
+ for (const auto& input : inputs) { + const string& input_key = executors_and_keys->input_keys[input.first]; + s = rendez->Send(input_key, Rendezvous::Args(), input.second, false); + if (!s.ok()) { + rendez->StartAbort(s); + return s; + } + } + + // Start parallel Executors. + Notification executors_done; + const int num_executors = executors_and_keys->device_executors.size(); + ExecutorBarrier* barrier = new ExecutorBarrier( + num_executors, rendez, [&executors_done, &s](const Status& ret) { + s = ret; + executors_done.Notify(); + }); + + Executor::Args args; + args.rendezvous = rendez; + args.cancellation_manager = cancellation_manager_; + args.runner = SchedClosure; + + for (auto device_executor : executors_and_keys->device_executors) { + Executor* exec = device_executor.second; + exec->RunAsync(args, barrier->Get()); + } + + executors_done.WaitForNotification(); + + TF_RETURN_IF_ERROR(s); + + if (!output_names.empty()) { + outputs->resize(output_names.size()); + } + + // Get the outputs from the rendezvous + for (size_t output_offset = 0; output_offset < output_names.size(); + ++output_offset) { + const string& output_key = + executors_and_keys->output_keys[output_names[output_offset]]; + Tensor output_tensor; + bool is_dead; + + // Fetch data from the Rendezvous. + s = rendez->Recv(output_key, Rendezvous::Args(), &output_tensor, &is_dead); + if (is_dead) { + s = errors::InvalidArgument("The tensor returned for ", + output_names[output_offset], + " was not valid."); + } + if (!s.ok()) { + rendez->StartAbort(s); + outputs->clear(); + return s; + } + + (*outputs)[output_offset] = output_tensor; + } + + return s; +} + +Status LocalSession::GetOrCreateExecutors( + gtl::ArraySlice<string> inputs, gtl::ArraySlice<string> outputs, + gtl::ArraySlice<string> target_nodes, + ExecutorsAndKeys** executors_and_keys) { + // Sort the inputs and outputs, so we don't create separate + // executors when a user passes in the same inputs/outputs in + // different orders. + // + // We could consider some other signature instead of sorting that + // preserves the same property to avoid the sort in the future. + std::vector<string> inputs_sorted(inputs.begin(), inputs.end()); + std::vector<string> outputs_sorted(outputs.begin(), outputs.end()); + std::vector<string> tn_sorted(target_nodes.begin(), target_nodes.end()); + std::sort(inputs_sorted.begin(), inputs_sorted.end()); + std::sort(outputs_sorted.begin(), outputs_sorted.end()); + std::sort(tn_sorted.begin(), tn_sorted.end()); + + const string key = strings::StrCat(str_util::Join(inputs_sorted, ","), "->", + str_util::Join(outputs_sorted, ","), "/", + str_util::Join(tn_sorted, ",")); + + // See if we already have the executors for this run. + { + mutex_lock l(executor_lock_); // could use reader lock + auto it = executors_.find(key); + if (it != executors_.end()) { + *executors_and_keys = it->second; + return Status::OK(); + } + } + + // The executor_lock_ is intentionally released while executor is + // being created. 
+ std::unordered_map<string, Graph*> graphs; + Status s = CreateGraphs(inputs, outputs, target_nodes, &graphs); + if (!s.ok()) { + return s; + } + + bool has_control_flow = false; + for (const auto& graph : graphs) { + for (const Node* n : graph.second->nodes()) { + if (IsControlFlow(n)) { + has_control_flow = true; + break; + } + } + if (has_control_flow) break; + } + + std::unique_ptr<ExecutorsAndKeys> ek(new ExecutorsAndKeys); + + for (const auto& graph : graphs) { + const string& partition_name = graph.first; + Graph* partition_graph = graph.second; + + Device* d; + s = device_mgr_->LookupDevice(partition_name, &d); + if (!s.ok()) { + return s; + } + + LocalExecutorParams params; + params.has_control_flow = has_control_flow; + params.device = d; + params.create_kernel = [this, d](const NodeDef& ndef, OpKernel** kernel) { + return CreateCachedKernel(d, session_handle_, nullptr, ndef, kernel); + }; + params.delete_kernel = [this, d](OpKernel* kernel) { + DeleteCachedKernel(d, session_handle_, kernel); + }; + + Executor* tmp_exec; + s = NewLocalExecutor(params, partition_graph, &tmp_exec); + if (!s.ok()) { + return s; + } + ek->device_executors.insert(std::make_pair(graph.first, tmp_exec)); + } + + // Compute the rendezvous keys to avoid recomputing them every time. + // + // We always use the first device as the device name portion of the + // key, even if we're feeding another graph. + for (const string& input : inputs) { + ek->input_keys[input] = GetRendezvousKey( + input, device_set_.client_device()->attributes(), FrameAndIter(0, 0)); + } + for (const string& output : outputs) { + ek->output_keys[output] = GetRendezvousKey( + output, device_set_.client_device()->attributes(), FrameAndIter(0, 0)); + } + + // Reacquire the lock, try to insert into the map. + mutex_lock l(executor_lock_); + const bool inserted = executors_.insert(std::make_pair(key, ek.get())).second; + if (!inserted) { + // Another thread created the entry before us, so delete the + // one we created and return the already created one. + auto it = executors_.find(key); + *executors_and_keys = it->second; + } else { + *executors_and_keys = ek.release(); + } + + return Status::OK(); +} + +void LocalSession::SaveStatefulNodes(Graph* graph) { + for (Node* n : graph->nodes()) { + if (n->op_def().is_stateful()) { + VLOG(2) << "Saving " << n->DebugString(); + stateful_placements_[n->name()] = n->assigned_device_name(); + } + } +} + +void LocalSession::RestoreStatefulNodes(Graph* graph) { + for (Node* n : graph->nodes()) { + if (n->op_def().is_stateful()) { + auto iter = stateful_placements_.find(n->name()); + if (iter != stateful_placements_.end()) { + n->set_assigned_device_name(iter->second); + VLOG(2) << "Restored " << n->DebugString(); + } + } + } +} + +Status LocalSession::CreateGraphs(gtl::ArraySlice<string> feeds, + gtl::ArraySlice<string> fetches, + gtl::ArraySlice<string> target_nodes, + std::unordered_map<string, Graph*>* outputs) { + Graph graph(OpRegistry::Global()); + GraphConstructorOptions opts; + + { + mutex_lock l(graph_def_lock_); + TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(opts, graph_def_, &graph)); + } + + TF_RETURN_IF_ERROR(subgraph::RewriteGraphForExecution( + &graph, feeds, fetches, target_nodes, + device_set_.client_device()->attributes())); + + // Run the simple placer after rewriting the graph. 
+ std::unordered_map<string, int32> node_name_to_cost_map; + for (Node* n : graph.nodes()) { + node_name_to_cost_map[n->name()] = n->cost_id(); + } + SimplePlacer placer(&graph, &device_set_, &node_name_to_cost_map, &options_); + + { + mutex_lock l(mu_); + // Restore stateful nodes. + RestoreStatefulNodes(&graph); + TF_RETURN_IF_ERROR(placer.Run()); + // Save stateful nodes. + SaveStatefulNodes(&graph); + } + + // Partition the graph across devices. + std::unordered_map<string, GraphDef> partitions; + PartitionOptions popts; + popts.node_to_loc = [](const Node* node) { + return node->assigned_device_name(); + }; + popts.new_name = [this](const string& prefix) { + mutex_lock l(mu_); + return strings::StrCat(prefix, "/_", name_counter_++); + }; + popts.get_incarnation = [](const string& name) { + // The local session does not have changing incarnation numbers. + // Just return '1'. + return 1; + }; + popts.control_flow_added = false; + TF_RETURN_IF_ERROR(Partition(popts, &graph, &partitions)); + + std::vector<string> device_names; + for (auto device : devices_) { + // Extract the LocalName from the device. + device_names.push_back(DeviceNameUtils::LocalName(device->name())); + } + + // Check for valid partitions. + for (const auto& partition : partitions) { + const string& local_partition_name = + DeviceNameUtils::LocalName(partition.first); + if (std::count(device_names.begin(), device_names.end(), + local_partition_name) == 0) { + return errors::InvalidArgument( + "Creating a partition for ", local_partition_name, + " which doesn't exist in the list of available devices. Available " + "devices: ", + str_util::Join(device_names, ",")); + } + } + + for (const auto& partition : partitions) { + const string& partition_name = partition.first; + + const GraphDef& graph_def = partition.second; + VLOG(2) << "Created " << graph_def.DebugString() << " for " + << partition_name; + + Graph* device_graph = new Graph(OpRegistry::Global()); + GraphConstructorOptions device_opts; + // There are internal operations (e.g., send/recv) that we now + // allow. + device_opts.allow_internal_ops = true; + device_opts.expect_device_spec = true; + Status s = + ConvertGraphDefToGraph(device_opts, graph_def, device_graph); + if (!s.ok()) { + delete device_graph; + // Also delete other graphs created during the loop. 
+ gtl::STLDeleteValues(outputs); + return s; + } + outputs->insert(std::make_pair(partition_name, device_graph)); + } + + return Status::OK(); +} + +::tensorflow::Status LocalSession::Close() { + cancellation_manager_->StartCancel(); + return ::tensorflow::Status::OK(); +} + +class LocalSessionFactory : public SessionFactory { + public: + LocalSessionFactory() {} + + Session* NewSession(const SessionOptions& options) override { + std::vector<Device*> devices; + DeviceFactory::AddDevices(options, "/job:localhost/replica:0/task:0", + &devices); + return new LocalSession(options, new DeviceMgr(devices)); + } +}; + +class LocalSessionRegistrar { + public: + LocalSessionRegistrar() { + SessionFactory::Register("LOCAL_SESSION", new LocalSessionFactory()); + } +}; +static LocalSessionRegistrar registrar; + +} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/local_session.h b/tensorflow/core/common_runtime/local_session.h new file mode 100644 index 0000000000..453cfdde47 --- /dev/null +++ b/tensorflow/core/common_runtime/local_session.h @@ -0,0 +1,109 @@ +#ifndef TENSORFLOW_COMMON_RUNTIME_LOCAL_SESSION_H_ +#define TENSORFLOW_COMMON_RUNTIME_LOCAL_SESSION_H_ + +#include <memory> +#include <string> +#include <unordered_map> +#include <vector> + +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/device_set.h" +#include "tensorflow/core/common_runtime/executor.h" +#include "tensorflow/core/framework/cancellation.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/public/session.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/public/status.h" + +namespace tensorflow { + +class Device; + +class LocalSession : public Session { + public: + // Takes ownership of 'device_mgr'. + LocalSession(const SessionOptions& options, const DeviceMgr* device_mgr); + ~LocalSession() override; + + ::tensorflow::Status Create(const GraphDef& graph) override; + ::tensorflow::Status Extend(const GraphDef& graph) override; + ::tensorflow::Status Run(const std::vector<std::pair<string, Tensor>>& inputs, + const std::vector<string>& output_names, + const std::vector<string>& target_nodes, + std::vector<Tensor>* outputs) override; + ::tensorflow::Status Close() override; + + private: + struct ExecutorsAndKeys { + std::unordered_map<string, Executor*> device_executors; + std::unordered_map<string, string> input_keys; + std::unordered_map<string, string> output_keys; + + ~ExecutorsAndKeys() { + for (auto it : device_executors) { + delete it.second; + } + } + }; + + // Retrieves an already existing set of executors to run 'inputs' and + // 'outputs', or creates and caches them for future use. + ::tensorflow::Status GetOrCreateExecutors( + gtl::ArraySlice<string> inputs, gtl::ArraySlice<string> outputs, + gtl::ArraySlice<string> target_nodes, + ExecutorsAndKeys** executors_and_keys); + + // Creates several graphs given the existing graph_def_ and the + // input feeds and fetches, given 'devices'. + ::tensorflow::Status CreateGraphs( + gtl::ArraySlice<string> feeds, gtl::ArraySlice<string> fetches, + gtl::ArraySlice<string> target_nodes, + std::unordered_map<string, Graph*>* outputs); + + ::tensorflow::Status ExtendLocked(const GraphDef& graph) + EXCLUSIVE_LOCKS_REQUIRED(graph_def_lock_); + + const SessionOptions options_; + + // Device structures. 
+ const std::unique_ptr<const DeviceMgr> device_mgr_; + std::vector<Device*> devices_; // not owned + DeviceSet device_set_; + + string session_handle_; + bool graph_created_ GUARDED_BY(graph_def_lock_) = false; + + mutex graph_def_lock_; + GraphDef graph_def_ GUARDED_BY(graph_def_lock_); + + mutex executor_lock_; // protects executors_ + // Holds mappings from signature to the executors that process + // it. The reason for a level of indirection around mapped_type is + // to guarantee address stability. + std::unordered_map<string, ExecutorsAndKeys*> executors_ + GUARDED_BY(executor_lock_); + + CancellationManager* cancellation_manager_; + + // Saves and restores device placements for stateful nodes. + mutex mu_; + void SaveStatefulNodes(Graph* graph) EXCLUSIVE_LOCKS_REQUIRED(mu_); + void RestoreStatefulNodes(Graph* graph) EXCLUSIVE_LOCKS_REQUIRED(mu_); + // Map of placed stateful nodes, i.e. nodes for which is_stateful() + // is true, such as "params" and "queue" nodes. Once placed these + // nodes can not be moved to a different device. Maps node names to + // device names. + std::unordered_map<string, string> stateful_placements_ GUARDED_BY(mu_); + + // For generating unique names. + int64 name_counter_ GUARDED_BY(mu_) = 0; + + TF_DISALLOW_COPY_AND_ASSIGN(LocalSession); +}; + +} // end namespace tensorflow + +#endif // TENSORFLOW_COMMON_RUNTIME_LOCAL_SESSION_H_ diff --git a/tensorflow/core/common_runtime/local_session_test.cc b/tensorflow/core/common_runtime/local_session_test.cc new file mode 100644 index 0000000000..9325fe44c3 --- /dev/null +++ b/tensorflow/core/common_runtime/local_session_test.cc @@ -0,0 +1,314 @@ +#include "tensorflow/core/common_runtime/local_session.h" + +#include <map> +#include <string> +#include <unordered_map> +#include <vector> + +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/graph/testlib.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/public/session_options.h" +#include "tensorflow/core/public/status.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/util/device_name_utils.h" +#include <gtest/gtest.h> + +namespace tensorflow { +namespace { + +Session* CreateSession() { + SessionOptions options; + (*options.config.mutable_device_count())["CPU"] = 2; + return NewSession(options); +} + +class LocalSessionMinusAXTest : public ::testing::Test { + public: + void Initialize(std::initializer_list<float> a_values) { + RequireDefaultOps(); + Graph graph(OpRegistry::Global()); + + Tensor a_tensor(DT_FLOAT, TensorShape({2, 2})); + test::FillValues<float>(&a_tensor, a_values); + Node* a = test::graph::Constant(&graph, a_tensor); + a->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:0"); + + Tensor x_tensor(DT_FLOAT, TensorShape({2, 1})); + test::FillValues<float>(&x_tensor, {1, 1}); + Node* x = test::graph::Constant(&graph, x_tensor); + x->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:1"); + x_ = x->name(); + + // y = A * x + Node* y = test::graph::Matmul(&graph, a, x, false, false); + y->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:0"); + y_ = y->name(); + + Node* y_neg = test::graph::Unary(&graph, "Neg", y); + y_neg_ = y_neg->name(); + 
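The executors_ map declared above caches one ExecutorsAndKeys per run signature so that repeated Run() calls with the same feeds, fetches, and targets reuse the already-built executors. Neither the key scheme nor the executor construction appears in this hunk, so the sketch below only shows the lookup-then-insert idiom, with illustrative stand-in types (ExecutorCache, CachedExecutors) and an assumed key format:

#include <unordered_map>
#include <vector>

#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/port.h"

namespace tensorflow {

struct CachedExecutors {};  // stands in for ExecutorsAndKeys

class ExecutorCache {
 public:
  CachedExecutors* GetOrCreate(const std::vector<string>& inputs,
                               const std::vector<string>& outputs,
                               const std::vector<string>& targets) {
    // Flatten the run signature into a single lookup key.
    const string key = strings::StrCat(str_util::Join(inputs, ","), "->",
                                       str_util::Join(outputs, ","), "/",
                                       str_util::Join(targets, ","));
    mutex_lock l(mu_);
    auto it = cache_.find(key);
    if (it != cache_.end()) return it->second;
    // Mapped values are heap pointers so their addresses stay stable even if
    // the hash map rehashes while other callers still hold on to them.
    CachedExecutors* created = new CachedExecutors;
    cache_[key] = created;
    return created;
  }

 private:
  mutex mu_;
  std::unordered_map<string, CachedExecutors*> cache_;  // owned values
};

}  // namespace tensorflow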
y_neg->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:1"); + + test::graph::ToGraphDef(&graph, &def_); + } + + string x_; + string y_; + string y_neg_; + GraphDef def_; +}; + +TEST_F(LocalSessionMinusAXTest, RunSimpleNetwork) { + Initialize({3, 2, -1, 0}); + std::unique_ptr<Session> session(CreateSession()); + ASSERT_TRUE(session != nullptr); + ASSERT_OK(session->Create(def_)); + std::vector<std::pair<string, Tensor>> inputs; + + // Request two targets: one fetch output and one non-fetched output. + std::vector<string> output_names = {y_ + ":0"}; + std::vector<string> target_nodes = {y_neg_}; + std::vector<Tensor> outputs; + Status s = session->Run(inputs, output_names, target_nodes, &outputs); + ASSERT_OK(s); + + ASSERT_EQ(1, outputs.size()); + // The first output should be initiailzed and have the correct + // output. + auto mat = outputs[0].matrix<float>(); + ASSERT_TRUE(outputs[0].IsInitialized()); + EXPECT_FLOAT_EQ(5.0, mat(0, 0)); +} + +TEST_F(LocalSessionMinusAXTest, TestFeed) { + Initialize({1, 2, 3, 4}); + std::unique_ptr<Session> session(CreateSession()); + ASSERT_TRUE(session != nullptr); + + ASSERT_OK(session->Create(def_)); + + // Fill in the input and ask for the output + // + // Note that the input being fed is on the second device. + Tensor t(DT_FLOAT, TensorShape({2, 1})); + t.matrix<float>()(0, 0) = 5; + t.matrix<float>()(1, 0) = 6; + std::vector<std::pair<string, Tensor>> inputs = {{x_, t}}; + std::vector<string> output_names = {y_ + ":0"}; + std::vector<Tensor> outputs; + + // Run the graph + Status s = session->Run(inputs, output_names, {}, &outputs); + ASSERT_OK(s); + + ASSERT_EQ(1, outputs.size()); + auto mat = outputs[0].matrix<float>(); + + // Expect outputs to be; 1*5 + 2*6, 3*5 + 4*6 + EXPECT_FLOAT_EQ(17.0, mat(0, 0)); + EXPECT_FLOAT_EQ(39.0, mat(1, 0)); +} + +TEST_F(LocalSessionMinusAXTest, TestConcurrency) { + Initialize({1, 2, 3, 4}); + std::unique_ptr<Session> session(CreateSession()); + ASSERT_TRUE(session != nullptr); + ASSERT_OK(session->Create(def_)); + + // Fill in the input and ask for the output + thread::ThreadPool* tp = new thread::ThreadPool(Env::Default(), "test", 4); + + // Run the graph 1000 times in 4 different threads concurrently. + std::vector<string> output_names = {y_ + ":0"}; + auto fn = [&session, output_names]() { + for (int i = 0; i < 1000; ++i) { + std::vector<std::pair<string, Tensor>> inputs; + std::vector<Tensor> outputs; + // Run the graph + Status s = session->Run(inputs, output_names, {}, &outputs); + ASSERT_TRUE(s.ok()); + ASSERT_EQ(1, outputs.size()); + auto mat = outputs[0].matrix<float>(); + EXPECT_FLOAT_EQ(3.0, mat(0, 0)); + } + }; + + for (int i = 0; i < 4; ++i) { + tp->Schedule(fn); + } + + // Wait for the functions to finish. + delete tp; +} + +TEST_F(LocalSessionMinusAXTest, TwoCreateCallsFails) { + Initialize({1, 2, 3, 4}); + std::unique_ptr<Session> session(CreateSession()); + ASSERT_TRUE(session != nullptr); + ASSERT_OK(session->Create(def_)); + + // Second is not. 
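For reference, the values asserted in RunSimpleNetwork and TestFeed above follow directly from the constants built in Initialize(): with A = [[3, 2], [-1, 0]] and x = [1, 1]^T, y = A x = [5, -1]^T, so the single fetched element mat(0, 0) is 5; with A = [[1, 2], [3, 4]] and the fed x = [5, 6]^T, y = [1*5 + 2*6, 3*5 + 4*6]^T = [17, 39]^T, matching the two EXPECT_FLOAT_EQ checks.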
+ ASSERT_FALSE(session->Create(def_).ok()); +} + +TEST_F(LocalSessionMinusAXTest, ForgetToCreate) { + Initialize({1, 2, 3, 4}); + std::unique_ptr<Session> session(CreateSession()); + ASSERT_TRUE(session != nullptr); + std::vector<std::pair<string, Tensor>> inputs; + std::vector<Tensor> outputs; + ASSERT_FALSE(session->Run(inputs, {y_ + ":0"}, {y_neg_}, &outputs).ok()); +} + +TEST_F(LocalSessionMinusAXTest, InvalidDevice) { + GraphDef def; + Graph graph(OpRegistry::Global()); + + Tensor a_tensor(DT_FLOAT, TensorShape({2, 2})); + a_tensor.flat<float>().setRandom(); + Node* a = test::graph::Constant(&graph, a_tensor); + a->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:0"); + Tensor x_tensor(DT_FLOAT, TensorShape({2, 1})); + x_tensor.flat<float>().setRandom(); + Node* x = test::graph::Constant(&graph, x_tensor); + x->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:1"); + // Skip placing y. + Node* y = test::graph::Matmul(&graph, a, x, false, false); + y->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:2"); + + test::graph::ToGraphDef(&graph, &def); + + std::unique_ptr<Session> session(CreateSession()); + ASSERT_TRUE(session != nullptr); + ASSERT_OK(session->Create(def)); + std::vector<std::pair<string, Tensor>> inputs; + std::vector<string> output_names = {y->name() + ":0"}; + std::vector<Tensor> outputs; + + // Should return an error. + ASSERT_FALSE(session->Run(inputs, output_names, {}, &outputs).ok()); + + // Fix placement and run again + def.Clear(); + y->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:1"); + test::graph::ToGraphDef(&graph, &def); + session.reset(CreateSession()); + ASSERT_OK(session->Create(def)); + ASSERT_OK(session->Run(inputs, output_names, {}, &outputs)); +} + +TEST(LocalSessionTest, KeepsStateAcrossRunsOfSession) { + GraphDef def; + Graph g(OpRegistry::Global()); + Node* var = test::graph::Var(&g, DT_FLOAT, TensorShape({10})); + var->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:0"); + + Tensor twenty(DT_FLOAT, TensorShape({10})); + for (int i = 0; i < 10; ++i) { + twenty.flat<float>()(i) = 20.0; + } + + Node* twenty_node = test::graph::Constant(&g, twenty); + twenty_node->set_assigned_device_name( + "/job:localhost/replica:0/task:0/cpu:0"); + + Node* init = test::graph::Assign(&g, var, twenty_node); + init->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:0"); + + test::graph::ToGraphDef(&g, &def); + + std::unique_ptr<Session> session(CreateSession()); + ASSERT_TRUE(session != nullptr); + ASSERT_OK(session->Create(def)); + + std::vector<std::pair<string, Tensor>> inputs; + std::vector<Tensor> outputs; + + // Initialize the variable + Status s = session->Run(inputs, {init->name()}, {}, &outputs); + ASSERT_OK(s); + + // Get the variable's data + s = session->Run(inputs, {var->name() + ":0"}, {}, &outputs); + ASSERT_OK(s); + ASSERT_EQ(1, outputs.size()); + ASSERT_TRUE(outputs[0].IsInitialized()); + EXPECT_EQ(20.0, outputs[0].flat<float>()(0)); +} + +TEST(LocalSessionTest, MultipleFeedTest) { + GraphDef def; + Graph g(OpRegistry::Global()); + Node* var = test::graph::Var(&g, DT_FLOAT, TensorShape({10})); + var->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:0"); + + Tensor first_value(DT_FLOAT, TensorShape({})); + first_value.scalar<float>()() = 1.0; + Node* first_const = test::graph::Constant(&g, first_value); + Node* first_identity = test::graph::Identity(&g, first_const); + + Tensor second_value(DT_FLOAT, TensorShape({})); + second_value.scalar<float>()() = 2.0; + 
Node* second_const = test::graph::Constant(&g, second_value); + Node* second_identity = test::graph::Identity(&g, second_const); + + test::graph::ToGraphDef(&g, &def); + + std::unique_ptr<Session> session(CreateSession()); + ASSERT_TRUE(session != nullptr); + ASSERT_OK(session->Create(def)); + + std::vector<Tensor> outputs; + + // Fetch without feeding. + Status s = session->Run( + {}, {first_identity->name() + ":0", second_identity->name() + ":0"}, {}, + &outputs); + ASSERT_TRUE(s.ok()); + ASSERT_EQ(2, outputs.size()); + ASSERT_EQ(1.0, outputs[0].flat<float>()(0)); + ASSERT_EQ(2.0, outputs[1].flat<float>()(0)); + + s = session->Run( + {}, {second_identity->name() + ":0", first_identity->name() + ":0"}, {}, + &outputs); + ASSERT_TRUE(s.ok()); + ASSERT_EQ(2, outputs.size()); + ASSERT_EQ(2.0, outputs[0].flat<float>()(0)); + ASSERT_EQ(1.0, outputs[1].flat<float>()(0)); + + Tensor value_11(DT_FLOAT, TensorShape({})); + value_11.scalar<float>()() = 11.0; + Tensor value_22(DT_FLOAT, TensorShape({})); + value_22.scalar<float>()() = 22.0; + + // Feed [first_const, second_const] + s = session->Run( + {{first_const->name(), value_11}, {second_const->name(), value_22}}, + {first_identity->name() + ":0", second_identity->name() + ":0"}, {}, + &outputs); + ASSERT_TRUE(s.ok()); + ASSERT_EQ(2, outputs.size()); + ASSERT_EQ(11.0, outputs[0].flat<float>()(0)); + ASSERT_EQ(22.0, outputs[1].flat<float>()(0)); + + // Feed [second_const, first_const] + s = session->Run( + {{second_const->name(), value_22}, {first_const->name(), value_11}}, + {first_identity->name() + ":0", second_identity->name() + ":0"}, {}, + &outputs); + ASSERT_TRUE(s.ok()); + ASSERT_EQ(2, outputs.size()); + ASSERT_EQ(11.0, outputs[0].flat<float>()(0)); + ASSERT_EQ(22.0, outputs[1].flat<float>()(0)); +} + +} // namespace + +} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/rendezvous_mgr.cc b/tensorflow/core/common_runtime/rendezvous_mgr.cc new file mode 100644 index 0000000000..111dea6d4c --- /dev/null +++ b/tensorflow/core/common_runtime/rendezvous_mgr.cc @@ -0,0 +1,170 @@ +#include "tensorflow/core/common_runtime/rendezvous_mgr.h" + +#include <unordered_set> + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#if (!defined(PLATFORM_POSIX_ANDROID) && !defined(PLATFORM_GOOGLE_ANDROID)) && \ + (defined(PLATFORM_GOOGLE) || GOOGLE_CUDA) +#include "tensorflow/core/common_runtime/gpu/gpu_util.h" +#endif +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/notification.h" +#include "tensorflow/core/lib/strings/numbers.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/port.h" + +namespace tensorflow { + +namespace { + +void CopyTensorBetweenDevices(const string& id, DeviceContext* send_dev_context, + DeviceContext* recv_dev_context, Device* src, + Device* dst, + const AllocatorAttributes src_alloc_attr, + const AllocatorAttributes dst_alloc_attr, + const Tensor* input, Tensor* output, + std::function<void(const Status&)> done) { + if (src->attributes().device_type() != dst->attributes().device_type()) { + done(errors::Unimplemented( + "Copy between device types not yet implemented: src=", src->name(), + " dst=", dst->name())); + } else if (src->attributes().device_type() != "CPU") { + done(errors::Unimplemented( + "Copy between non-CPU devices not yet implemented")); + } + *output = *input; + 
done(Status::OK()); +} + +#if (!defined(PLATFORM_POSIX_ANDROID) && !defined(PLATFORM_GOOGLE_ANDROID)) && \ + (defined(PLATFORM_GOOGLE) || GOOGLE_CUDA) +constexpr auto CopyTensorBetweenDevicesFunc = &GPUUtil::CopyViaDMA; +#else +constexpr auto CopyTensorBetweenDevicesFunc = &CopyTensorBetweenDevices; +#endif + +} // end namespace + +IntraProcessRendezvous::IntraProcessRendezvous(const DeviceMgr* device_mgr) + : device_mgr_(device_mgr), local_(NewLocalRendezvous()) {} + +IntraProcessRendezvous::~IntraProcessRendezvous() { local_->Unref(); } + +Status IntraProcessRendezvous::Send(const string& key, + const Rendezvous::Args& args, + const Tensor& val, const bool is_dead) { + VLOG(1) << "IntraProcessRendezvous Send " << this << " " << key; + { + mutex_lock l(mu_); + if (!status_.ok()) return status_; + } + Rendezvous::ParsedKey parsed; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(key, &parsed)); + + // Buffers "val" and "device_context" in local_. + return local_->Send(key, args, val, is_dead); +} + +Status IntraProcessRendezvous::ParseKey(const string& key, bool is_src, + Rendezvous::ParsedKey* parsed) { + { + mutex_lock l(mu_); + if (!status_.ok()) return status_; + } + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(key, parsed)); + return Status::OK(); +} + +void IntraProcessRendezvous::SameWorkerRecvDone( + const Rendezvous::ParsedKey& parsed, const Rendezvous::Args& send_args, + const Rendezvous::Args& recv_args, const Tensor& in, Tensor* out, + StatusCallback done) { + // Do a quick copy (sharing the underlying buffer) if both tensors + // are on host memory. + const bool src_host = + (send_args.alloc_attrs.on_host() || parsed.src.type == "CPU"); + const bool dst_host = + (recv_args.alloc_attrs.on_host() || parsed.dst.type == "CPU"); + if (src_host && dst_host) { + *out = in; + done(Status::OK()); + return; + } + + // This copy must involve a non-CPU device. Hence, "in" must support DMA + // (e.g., string tensors do not work on GPU). + if (!DataTypeCanUseMemcpy(in.dtype())) { + done(errors::InvalidArgument("Non-DMA-safe ", DataTypeString(in.dtype()), + " tensor may not be copied from/to a GPU.")); + return; + } + + Device* src_device; + Status s = device_mgr_->LookupDevice(parsed.src_device, &src_device); + if (!s.ok()) { + done(s); + return; + } + Device* dst_device; + s = device_mgr_->LookupDevice(parsed.dst_device, &dst_device); + if (!s.ok()) { + done(s); + return; + } + + AllocatorAttributes attr = recv_args.alloc_attrs; + attr.set_gpu_compatible(send_args.alloc_attrs.gpu_compatible() || + recv_args.alloc_attrs.gpu_compatible()); + Allocator* out_allocator = dst_device->GetAllocator(attr); + Tensor copy(out_allocator, in.dtype(), in.shape()); + *out = copy; + + CopyTensorBetweenDevicesFunc(parsed.edge_name, send_args.device_context, + recv_args.device_context, src_device, dst_device, + send_args.alloc_attrs, recv_args.alloc_attrs, + &in, out, done); +} + +void IntraProcessRendezvous::RecvAsync(const string& key, + const Rendezvous::Args& recv_args, + DoneCallback done) { + VLOG(1) << "IntraProcessRendezvous Recv " << this << " " << key; + + Rendezvous::ParsedKey parsed; + Status s = ParseKey(key, false /*!is_src*/, &parsed); + if (!s.ok()) { + done(s, Args(), recv_args, Tensor(), false); + return; + } + + // Recv the tensor from local_. 
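The src_host && dst_host fast path in SameWorkerRecvDone above relies on Tensor's copy semantics: assigning one Tensor to another shares the underlying reference-counted buffer rather than copying elements, so "*out = in" hands the consumer the producer's buffer directly. A minimal illustration (the shape and dtype are arbitrary):

#include "tensorflow/core/framework/types.pb.h"
#include "tensorflow/core/public/tensor.h"
#include "tensorflow/core/public/tensor_shape.h"

namespace tensorflow {

void ShareBufferExample() {
  Tensor in(DT_FLOAT, TensorShape({2, 2}));
  Tensor out = in;  // shares the underlying buffer; no element-wise copy
  // Both tensors now alias the same storage, which is what makes the
  // host-to-host branch above effectively free.
}

}  // namespace tensorflow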
+ local_->RecvAsync(key, recv_args, [this, parsed, done]( + const Status& status, + const Rendezvous::Args& send_args, + const Rendezvous::Args& recv_args, + const Tensor& in, bool is_dead) { + Status s = status; + Tensor* out = new Tensor; + StatusCallback final_callback = [done, send_args, recv_args, out, + is_dead](const Status& s) { + done(s, send_args, recv_args, *out, is_dead); + delete out; + }; + + if (s.ok()) { + SameWorkerRecvDone(parsed, send_args, recv_args, in, out, final_callback); + } else { + final_callback(s); + } + }); +} + +void IntraProcessRendezvous::StartAbort(const Status& s) { + CHECK(!s.ok()); + local_->StartAbort(s); +} + +} // end namespace tensorflow diff --git a/tensorflow/core/common_runtime/rendezvous_mgr.h b/tensorflow/core/common_runtime/rendezvous_mgr.h new file mode 100644 index 0000000000..eaae65f956 --- /dev/null +++ b/tensorflow/core/common_runtime/rendezvous_mgr.h @@ -0,0 +1,73 @@ +#ifndef TENSORFLOW_COMMON_RUNTIME_RENDEZVOUS_MGR_H_ +#define TENSORFLOW_COMMON_RUNTIME_RENDEZVOUS_MGR_H_ + +#include <string> +#include <unordered_map> + +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/framework/rendezvous.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/public/status.h" +#include "tensorflow/core/public/tensor.h" + +namespace tensorflow { + +// IntraProcessRendezvous is a Rendezvous which expects all producers +// and consumers to be devices immediately accessible within the +// process. That is, it will never be necessary to perform an RPC to +// communicate with either. +// +// Buffering of Tensor values is delegated to a "local" Rendezvous +// obtained from NewLocalRendezvous(). This class just adds +// functionality to coordinate multiple process-local devices. +class IntraProcessRendezvous : public Rendezvous { + public: + explicit IntraProcessRendezvous(const DeviceMgr* device_mgr); + + // Forwards to local_, where the Tensor "val" will be buffered and + // any waiting callback stored. + Status Send(const string& key, const Rendezvous::Args& args, + const Tensor& val, const bool is_dead) override; + + // This method is called only by the RecvOp. It tests to see + // whether the value will be produced by a local or remote device + // and handles accordingly. In the local case it forwards to + // local_, in the remote case it initiates an RPC request. + void RecvAsync(const string& key, const Rendezvous::Args& args, + DoneCallback done) override; + + void StartAbort(const Status& status) override; + + private: + const DeviceMgr* device_mgr_; + Rendezvous* local_; // Owns a Ref on this object. + + mutable mutex mu_; + + // Status given by StartAbort() if any. + Status status_ GUARDED_BY(mu_); + + ~IntraProcessRendezvous() override; + + // Parses "key" into "parsed". If "is_src" is true, checks that the + // rendezvous key's source is in this process. If "is_src" is false, + // checks that the rendezvous key's destination is in this process. + Status ParseKey(const string& key, bool is_src, + Rendezvous::ParsedKey* parsed); + + // Callback handling the case when a rendezvous has been + // accomplished in local_ and the consumer is local to this process. + // Tensor "in" will be copied into "out". The key "parsed" encodes + // the src and dst devices. 
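RecvAsync above heap-allocates the output tensor and threads ownership through the final callback, so the result outlives the enclosing stack frame and is released only after the caller's DoneCallback has run. A standalone sketch of that idiom with simplified stand-in types (std::string for Tensor, bool for Status; all names illustrative):

#include <functional>
#include <iostream>
#include <string>

// Stands in for the asynchronous copy performed by SameWorkerRecvDone.
void ProduceAsync(std::string* out, std::function<void(bool)> done) {
  *out = "payload";
  done(true);
}

void RecvAsyncSketch(std::function<void(bool, const std::string&)> done) {
  // Heap-allocate the result so it survives until the final callback runs;
  // the callback forwards it to the caller and then frees it.
  std::string* out = new std::string;
  auto final_callback = [done, out](bool ok) {
    done(ok, *out);
    delete out;
  };
  ProduceAsync(out, final_callback);
}

int main() {
  RecvAsyncSketch([](bool ok, const std::string& v) {
    std::cout << (ok ? v : "error") << "\n";
  });
  return 0;
}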
+ typedef std::function<void(const Status&)> StatusCallback; + void SameWorkerRecvDone(const Rendezvous::ParsedKey& parsed, + const Rendezvous::Args& send_args, + const Rendezvous::Args& recv_args, const Tensor& in, + Tensor* out, StatusCallback done); + + TF_DISALLOW_COPY_AND_ASSIGN(IntraProcessRendezvous); +}; + +} // end namespace tensorflow + +#endif // TENSORFLOW_COMMON_RUNTIME_RENDEZVOUS_MGR_H_ diff --git a/tensorflow/core/common_runtime/session.cc b/tensorflow/core/common_runtime/session.cc new file mode 100644 index 0000000000..6d1ab5cea4 --- /dev/null +++ b/tensorflow/core/common_runtime/session.cc @@ -0,0 +1,51 @@ +#include <string> + +#include "tensorflow/core/common_runtime/session_factory.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/public/session.h" + +namespace tensorflow { + +namespace { +Status GetFactory(const SessionOptions& options, SessionFactory** ret) { + string runtime_type = "LOCAL_SESSION"; + if (!options.target.empty()) { + // Use the service based session. + runtime_type = "REMOTE_SESSION"; + } + *ret = SessionFactory::GetFactory(runtime_type); + if (!*ret) { + return errors::NotFound("Could not find session factory for ", + runtime_type); + } + return Status::OK(); +} +} // end namespace + +Session* NewSession(const SessionOptions& options) { + SessionFactory* factory; + Status s = GetFactory(options, &factory); + if (!s.ok()) { + LOG(ERROR) << s; + return nullptr; + } + return factory->NewSession(options); +} + +Status NewSession(const SessionOptions& options, Session** out_session) { + SessionFactory* factory; + Status s = GetFactory(options, &factory); + if (!s.ok()) { + *out_session = nullptr; + LOG(ERROR) << s; + return s; + } + *out_session = factory->NewSession(options); + if (!*out_session) { + return errors::Internal("Failed to create session."); + } + return Status::OK(); +} + +} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/session_factory.cc b/tensorflow/core/common_runtime/session_factory.cc new file mode 100644 index 0000000000..666b99812d --- /dev/null +++ b/tensorflow/core/common_runtime/session_factory.cc @@ -0,0 +1,41 @@ +#include "tensorflow/core/common_runtime/session_factory.h" + +#include <unordered_map> + +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/port.h" +namespace tensorflow { +namespace { + +static mutex* get_session_factory_lock() { + static mutex session_factory_lock; + return &session_factory_lock; +} + +typedef std::unordered_map<string, SessionFactory*> SessionFactories; +SessionFactories* session_factories() { + static SessionFactories* factories = new SessionFactories; + return factories; +} + +} // namespace + +void SessionFactory::Register(const string& runtime_type, + SessionFactory* factory) { + mutex_lock l(*get_session_factory_lock()); + if (!session_factories()->insert({runtime_type, factory}).second) { + LOG(ERROR) << "Two session factories are being registered " + << "under" << runtime_type; + } +} + +SessionFactory* SessionFactory::GetFactory(const string& runtime_type) { + mutex_lock l(*get_session_factory_lock()); // could use reader lock + auto it = session_factories()->find(runtime_type); + if (it == session_factories()->end()) { + return nullptr; + } + return it->second; +} + +} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/session_factory.h b/tensorflow/core/common_runtime/session_factory.h new file mode 100644 index 0000000000..f770ba93ff --- /dev/null 
+++ b/tensorflow/core/common_runtime/session_factory.h @@ -0,0 +1,25 @@ +#ifndef TENSORFLOW_COMMON_RUNTIME_SESSION_FACTORY_H_ +#define TENSORFLOW_COMMON_RUNTIME_SESSION_FACTORY_H_ + +#include <string> + +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/public/status.h" + +namespace tensorflow { + +class Session; +class SessionOptions; + +class SessionFactory { + public: + virtual Session* NewSession(const SessionOptions& options) = 0; + virtual ~SessionFactory() {} + static void Register(const string& runtime_type, SessionFactory* factory); + static SessionFactory* GetFactory(const string& runtime_type); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMMON_RUNTIME_SESSION_FACTORY_H_ diff --git a/tensorflow/core/common_runtime/session_options.cc b/tensorflow/core/common_runtime/session_options.cc new file mode 100644 index 0000000000..ef585efb5c --- /dev/null +++ b/tensorflow/core/common_runtime/session_options.cc @@ -0,0 +1,9 @@ +#include "tensorflow/core/public/session_options.h" + +#include "tensorflow/core/public/env.h" + +namespace tensorflow { + +SessionOptions::SessionOptions() : env(Env::Default()) {} + +} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/session_test.cc b/tensorflow/core/common_runtime/session_test.cc new file mode 100644 index 0000000000..82b5d7ffb0 --- /dev/null +++ b/tensorflow/core/common_runtime/session_test.cc @@ -0,0 +1,17 @@ +#include "tensorflow/core/public/session.h" + +#include "tensorflow/core/public/session_options.h" +#include <gtest/gtest.h> + +namespace tensorflow { +namespace { + +TEST(SessionTest, InvalidTargetReturnsNull) { + SessionOptions options; + options.target = "invalid target"; + + EXPECT_EQ(nullptr, tensorflow::NewSession(options)); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/simple_placer.cc b/tensorflow/core/common_runtime/simple_placer.cc new file mode 100644 index 0000000000..1cd1db29db --- /dev/null +++ b/tensorflow/core/common_runtime/simple_placer.cc @@ -0,0 +1,559 @@ +#include "tensorflow/core/common_runtime/simple_placer.h" + +#include <memory> +#include <utility> +#include <vector> + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/stringpiece.h" + +namespace tensorflow { + +namespace { + +// Returns a list of devices sorted by name from 'devices' whose type is in +// 'supported_device_types'. This function searches in order of the device +// types in 'supported_device_types' and returns the *first* subset of devices +// that match. +// +// For example, if suported_device_types contains {GPU, CPU} and +// 'devices' contains CPU and GPU devices, the returned vector will +// include *only* GPU devices, since that is higher in the priority +// order in 'supported_device_types'. 
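The registry in session_factory.cc above keeps both the mutex and the factory map behind function-local statics, so they are constructed on first use and remain valid no matter the order in which static registrars (such as LocalSessionRegistrar) run in other translation units. A minimal sketch of the same first-use idiom (names illustrative):

#include <string>
#include <unordered_map>

struct Factory {};  // stands in for SessionFactory

std::unordered_map<std::string, Factory*>* Registry() {
  // Constructed the first time any registrar calls Registry(); intentionally
  // never deleted so it stays valid for the life of the process.
  static auto* registry = new std::unordered_map<std::string, Factory*>;
  return registry;
}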
+std::vector<Device*> FilterSupportedDevices( + const std::vector<Device*>& devices, + const DeviceTypeVector& supported_device_types) { + std::vector<Device*> filtered_devices; + auto device_sort = [](const Device* a, const Device* b) { + return a->name() < b->name(); + }; + for (DeviceType d : supported_device_types) { + for (Device* device : devices) { + if (DeviceType(device->attributes().device_type()) == d) { + filtered_devices.emplace_back(device); + } + } + + // If there are any devices under this device type, return this + // subset. + if (!filtered_devices.empty()) { + std::sort(filtered_devices.begin(), filtered_devices.end(), device_sort); + return filtered_devices; + } + } + + std::sort(filtered_devices.begin(), filtered_devices.end(), device_sort); + return filtered_devices; +} + +bool HasColocatedNodeName(const Node& node) { + return StringPiece(node.def().device()).starts_with("@"); +} + +Status ParseColocatedNodeName(const Node& node, + string* out_colocated_node_name) { + StringPiece device(node.def().device()); + if (!device.Consume("@")) { + return errors::InvalidArgument("Malformed colocated node name: '", device, + "'"); + } + // TODO(mrry): Validate that the node name is a valid node name. + *out_colocated_node_name = device.ToString(); + return Status::OK(); +} + +// This class maintains the connected components of a colocation +// constraint graph, and uses this information to assign a satisfying +// device placement to the nodes of the graph. +// +// The typical usage pattern is: +// +// Graph graph = ...; +// DeviceSet device_set = ...; +// ColocationGraph colocation_graph(graph, device_set); +// +// // Add all the nodes of graph to colocation_graph. +// for (Node* node : graph.nodes()) { +// TF_RETURN_IF_ERROR(colocation_graph.AddNode(*node)); +// } +// +// // Add one or more colocation constraint. +// Node node_1 = *graph.FindNodeId(...); +// Node node_2 = *graph.FindNodeId(...); +// TF_RETURN_IF_ERROR(colocation_graph.ColocateNodes(node_1, node_2)); +// +// // Assign devices based on the accumulated constraints. +// for (Node* node : graph.nodes()) { +// TF_RETURN_IF_ERROR(colocation_graph.AssignDevice(node)); +// } +// +// The implementation uses the union-find algorithm to maintain the +// connected components efficiently and incrementally as edges +// (implied by ColocationGraph::ColocateNodes() invocations) are added. +class ColocationGraph { + public: + ColocationGraph(Graph* graph, const DeviceSet* device_set, + const SessionOptions* options) + : device_set_(device_set), + device_types_(device_set->PrioritizedDeviceTypeList()), + options_(options) { + members_.reserve(graph->num_node_ids()); + } + + // Adds the given node to this ColocationGraph as a singleton. + // + // NOTE: The implementation assumes that the ids of nodes passed to + // this method are dense and zero-based; the memory used will be linear in + // the largest node ID. + // NOTE: If this method returns an error, *this is left in an undefined + // state. + Status AddNode(const Node& node) { + Member member; + TF_RETURN_IF_ERROR(InitializeMember(node, &member)); + CHECK_GE(member.parent, 0); + members_.resize(member.parent + 1); + members_[member.parent] = std::move(member); + return Status::OK(); + } + + // Merge the (possibly disjoint) sets containing nodes "x" and + // "y". Returns OK if the all nodes in the union of these sets can + // be placed on the same device type. + // + // NOTE: If this method returns an error, *this is left in an undefined + // state. 
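As a worked example of the priority rule in FilterSupportedDevices above (device names are hypothetical): with devices {cpu:0, cpu:1, gpu:0} and supported_device_types listing GPU before CPU, the first pass over GPU already yields a non-empty subset, so only {gpu:0} is returned, sorted by name, and the CPU devices are never considered; if no GPU were present, the CPU pass would run and both CPU devices would be returned instead.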
+ Status ColocateNodes(const Node& x, const Node& y) { + int x_root = FindRoot(x.id()); + int y_root = FindRoot(y.id()); + if (x_root != y_root) { + // Merge the sets by swinging the parent pointer of the smaller + // tree to point to the root of the larger tree. Together with + // path compression in ColocationGraph::FindRoot, this ensures + // that we do not experience pathological performance on graphs + // such as chains. + int new_root, old_root; + if (members_[x_root].rank < members_[y_root].rank) { + // The tree rooted at x_root is shallower, so connect it to + // y_root. The rank of y_root is unchanged because its new + // child has strictly less rank. + members_[x_root].parent = y_root; + new_root = y_root; + old_root = x_root; + } else if (members_[x_root].rank > members_[y_root].rank) { + // The tree rooted at y_root is shallower, so connect it to + // x_root. The rank of x_root is unchanged because its new + // child has strictly less rank. + members_[y_root].parent = x_root; + new_root = x_root; + old_root = y_root; + } else { + // Both trees have the same rank, so break the tie by choosing + // x_root as the new root. + members_[y_root].parent = x_root; + // Increment the rank of the tree rooted at x_root, because it + // is now strictly deeper than before. + ++members_[x_root].rank; + new_root = x_root; + old_root = y_root; + } + + // Merge the partial device specifications, and ensure that they are + // compatible. NULL options_ is treated as allowing soft placement. + // TODO(mrry): Consider enriching the error message by pointing + // out which nodes have the explicit partial device + // specifications that caused this conflict. + TF_RETURN_IF_ERROR(DeviceNameUtils::MergeDevNames( + &members_[new_root].device_name, members_[old_root].device_name, + options_ == nullptr || options_->config.allow_soft_placement())); + + // Ensure that the common root has at least one supported device + // type, by computing the intersection of + // members_[new_root].supported_device_types and + // members_[old_root].supported_device_types. + MergeSupportedDevices(&members_[new_root].supported_device_types, + members_[old_root].supported_device_types); + if (members_[x_root].supported_device_types.size() == 0) { + return errors::InvalidArgument( + "Cannot colocate nodes '", x.name(), "' and '", y.name(), + "' because no device type supports both of those nodes and the " + "other nodes colocated with them"); + } + } + return Status::OK(); + } + + // For the given node, subject to the constraints previously given + // to this ColocationGraph, set its assigned_device_name. Returns OK + // if a satisfying device can be found, otherwise an error. + Status AssignDevice(Node* node) { + int node_root = FindRoot(node->id()); + if (members_[node_root].assigned_device == nullptr) { + // We have not yet assigned a device for the colocated node set containing + // n, so we do so now using the constraints on the root node. + + // "devices" will contain the set of feasible placements for the + // colocated node set containing n. + std::vector<Device*> devices; + if (DeviceNameUtils::HasSomeDetails(members_[node_root].device_name)) { + // The root node has a (possibly partial) device + // specification, so enumerate the physical devices that + // conform to it. + device_set_->FindMatchingDevices(members_[node_root].device_name, + &devices); + + if (!devices.empty()) { + // Filter devices into those that are compatible with the root + // node (and its children). 
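ColocateNodes and FindRoot above are a textbook disjoint-set forest: union by rank keeps the trees shallow, and path compression flattens them further on every lookup. The same bookkeeping, stripped of the device-name and device-type merging, fits in a few lines (a standalone sketch, not part of the patch itself):

#include <numeric>
#include <vector>

class DisjointSets {
 public:
  explicit DisjointSets(int n) : parent_(n), rank_(n, 0) {
    std::iota(parent_.begin(), parent_.end(), 0);  // every id starts as a root
  }

  int FindRoot(int i) {
    if (parent_[i] != i) {
      parent_[i] = FindRoot(parent_[i]);  // path compression
    }
    return parent_[i];
  }

  void Union(int x, int y) {
    int xr = FindRoot(x);
    int yr = FindRoot(y);
    if (xr == yr) return;
    // Union by rank: attach the shallower tree under the deeper one.
    if (rank_[xr] < rank_[yr]) {
      parent_[xr] = yr;
    } else if (rank_[xr] > rank_[yr]) {
      parent_[yr] = xr;
    } else {
      parent_[yr] = xr;  // tie: x becomes the root...
      ++rank_[xr];       // ...and its rank grows by one
    }
  }

 private:
  std::vector<int> parent_;
  std::vector<int> rank_;
};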
+ devices = FilterSupportedDevices( + devices, members_[node_root].supported_device_types); + } + + // Perform soft placement if allow_soft_placement is set. options_ + // being NULL is treated as allowing soft placement. + if (devices.empty() && + (options_ == nullptr || options_->config.allow_soft_placement())) { + // The soft_device_name is the same as the node's device name + // without specifying the device type or ID. + DeviceNameUtils::ParsedName soft_device_name = + members_[node_root].device_name; + soft_device_name.type.clear(); + soft_device_name.has_type = false; + soft_device_name.has_id = false; + device_set_->FindMatchingDevices(soft_device_name, &devices); + if (!devices.empty()) { + devices = FilterSupportedDevices( + devices, members_[node_root].supported_device_types); + } + } + + if (devices.empty()) { + // Return an error when a physical device that matches an explicit + // device specification is not found. This ensures that we don't + // assign a node to GPU when the user wanted to force it on CPU. + DeviceNameUtils::ParsedName specified_device_name; + if (DeviceNameUtils::ParseFullName(node->def().device(), + &specified_device_name) && + specified_device_name == members_[node_root].device_name) { + // The specified device and merged set device match, and + // will appear in the GraphDef (for debugging), so just + // print the specified device. + return errors::InvalidArgument( + "Could not satisfy explicit device specification '", + node->def().device(), "'"); + } else { + // The specified device may be a valid device but the + // merged set device is different, so print both. + return errors::InvalidArgument( + "Could not satisfy explicit device specification '", + node->def().device(), + "' because the node was colocated with a group of nodes that " + "required incompatible device '", + DeviceNameUtils::ParsedNameToString( + members_[node_root].device_name), + "'"); + } + } + } else { + // The device is completely unspecified, so enumerate the devices that + // support all of the nodes in the set. + if (device_set_->devices().empty()) { + return errors::Internal("No devices are registered"); + } + devices = FilterSupportedDevices( + device_set_->devices(), members_[node_root].supported_device_types); + + if (devices.empty()) { + return errors::InvalidArgument( + "Node had no OpKernel registered to support this operation: ", + "Operation was ", node->type_string(), " and inputs were ", + DataTypeVectorString(node->input_types())); + } + } + + // Returns the first device in sorted devices list so we will always + // choose the same device. + members_[node_root].assigned_device = devices[0]; + } + node->set_assigned_device_name(members_[node_root].assigned_device->name()); + + // Log placement if log_device_placement is set. + if (options_ && options_->config.log_device_placement()) { + printf("%s: %s\n", node->name().c_str(), + node->assigned_device_name().c_str()); + LOG(INFO) << node->name() << ": " << node->assigned_device_name(); + } + + return Status::OK(); + } + + private: + // Represents a node in the disjoint node set forest, and the + // accumulated constraints on the device used by that node. + struct Member { + Member() = default; + // The id of the node that is the parent of this one, or its own + // id if it is a root. parent <= 0 indicates that this member is invalid. + int parent = -1; + // A proxy for the depth of the tree that is used to prefer + // connecting smaller trees to larger trees when merging disjoint + // sets. 
+ int rank = 0; + // The intersection of all device types supported by this node, + // and those of all of its children, in priority order + // of the preferred device. + DeviceTypeVector supported_device_types; + // The merged form of the device requested for this node, with + // those of all of its children. + DeviceNameUtils::ParsedName device_name; + // If this node is a root, stores the Device to which this node + // and all of its children have been assigned, or nullptr if this + // has not yet been computed by GetAssignedDevice(). + Device* assigned_device = nullptr; + }; + + Status InitializeMember(const Node& node, Member* member) { + const int id = node.id(); + if (id < 0) { + return errors::InvalidArgument("Node id was not positive: ", id); + } + member->parent = id; + TF_RETURN_IF_ERROR(SupportedDeviceTypesForNode( + device_types_, node.def(), &member->supported_device_types)); + + if (!node.assigned_device_name().empty()) { + // This node has already been assigned to a device, so we + // respect this placement, after sanity-checking it. The + // device_name and supported_device_types for this node reflect + // the assigned device, so any nodes colocated with this node + // will be assigned to the same device (assuming this is + // possible). + // NOTE: Since any assignment must have been performed by + // the TensorFlow runtime, we consider errors in this branch to + // be INTERNAL. + if (!DeviceNameUtils::ParseFullName(node.assigned_device_name(), + &member->device_name)) { + return errors::Internal("Malformed assigned device '", + node.assigned_device_name(), "'"); + } + std::vector<Device*> devices; + const Device* assigned_device = + device_set_->FindDeviceByName(node.assigned_device_name()); + if (assigned_device == nullptr) { + return errors::Internal("Assigned device '", + node.assigned_device_name(), + "' does not match any device"); + } + + for (DeviceType d : member->supported_device_types) { + if (DeviceType(assigned_device->attributes().device_type()) == d) { + return Status::OK(); + } + } + + return errors::Internal("Assigned device '", node.assigned_device_name(), + "' does not have registered OpKernel support " + "for ", + node.def().op()); + } else { + // This node has not yet been assigned to a device, so we + // calculate any constraints due to the set of registered + // kernels and any (partial) user-provided device specification + // in the NodeDef. + + // If no kernels are registered for this op type, fail with an error. + if (member->supported_device_types.empty()) { + return errors::InvalidArgument( + "No OpKernel was registered to support " + "Op '", + node.def().op(), "' with these attrs"); + } + + // If the NodeDef contains a device that is *not* a colocated node name + // (i.e. it does not begin with '@') then we interpret it as a (partial) + // device specification. + string colocated_node_name; + if (!node.def().device().empty() && !HasColocatedNodeName(node)) { + // The user has specified a device in the NodeDef, try to find a + // valid device matching their specification in the set of + // devices. + // NOTE: The full name may specify a device that is not in + // n.supported_device_types(), but we check that in AssignDevice(). + if (!DeviceNameUtils::ParseFullName(node.def().device(), + &member->device_name)) { + return errors::InvalidArgument("Malformed device specification '", + node.def().device(), "'"); + } + } + } + return Status::OK(); + } + + // Updates target to contain the intersection of the device types in + // "target" and "other". 
+ static void MergeSupportedDevices(DeviceTypeVector* target, + const DeviceTypeVector& other) { + DeviceTypeVector temp = *target; + target->clear(); + + // Iterate in priority order. + for (DeviceType device_type : temp) { + bool found = false; + for (DeviceType other_device_type : other) { + if (device_type == other_device_type) { + found = true; + break; + } + } + if (found) { + target->push_back(device_type); + } + } + } + + // Returns the root node of the disjoint tree to which the node with the + // given id is connected. + int FindRoot(int node_id) { + DCHECK_GE(members_[node_id].parent, 0); + if (members_[node_id].parent != node_id) { + // NOTE: Compress paths from node_id to its root, so that future + // calls to FindRoot and ColocateNodes are more efficient. + members_[node_id].parent = FindRoot(members_[node_id].parent); + } + return members_[node_id].parent; + } + + std::vector<Member> members_; + const DeviceSet* device_set_; // Not owned. + const std::vector<DeviceType> device_types_; + const SessionOptions* options_; // Not owned; +}; + +} // namespace + +SimplePlacer::SimplePlacer(Graph* graph, const DeviceSet* devices, + const NodeNameToIdMap* name_to_id_map, + const SessionOptions* options) + : graph_(graph), + devices_(devices), + name_to_id_map_(name_to_id_map), + options_(options) {} + +SimplePlacer::SimplePlacer(Graph* graph, const DeviceSet* devices, + const NodeNameToIdMap* name_to_id_map) + : graph_(graph), devices_(devices), name_to_id_map_(name_to_id_map) { + options_ = nullptr; +} + +SimplePlacer::~SimplePlacer() {} + +Status SimplePlacer::Run() { + if (devices_->devices().empty()) { + return errors::FailedPrecondition("No devices are registered"); + } + + ColocationGraph colocation_graph(graph_, devices_, options_); + Status status; + + // 1. First add all of the nodes. Note that steps (1) and (2) + // requires two passes over the nodes because the graph (and hence + // the constraints) may not be acyclic. + for (Node* node : graph_->nodes()) { + // Skip the source and sink nodes. + if (!node->IsOp()) { + continue; + } + status = colocation_graph.AddNode(*node); + if (!status.ok()) return AttachDef(status, node->def()); + } + + // 2. Enumerate the constraint edges, and use them to update the disjoint + // node set. + for (Node* node : graph_->nodes()) { + if (!node->IsOp()) { + continue; + } + + // 2(a). If node n specifies a colocation constraint as its device name, + // add an edge from the colocated node to n. + if (HasColocatedNodeName(*node)) { + string colocated_node_name; + status = ParseColocatedNodeName(*node, &colocated_node_name); + if (!status.ok()) { + return AttachDef(status, node->def()); + } + Node* colocated_node; + status = GetNodeByName(colocated_node_name, &colocated_node); + if (!status.ok()) { + return AttachDef( + errors::InvalidArgument("Colocated node named in device '", + colocated_node_name, "' does not exist"), + node->def()); + } + status = colocation_graph.ColocateNodes(*colocated_node, *node); + if (!status.ok()) { + return AttachDef( + errors::InvalidArgument( + "Cannot satisfy colocation constraint named in device '", + colocated_node_name, "': ", status.error_message()), + node->def()); + } + } + + // 2(b). If `node` has an input edge with reference type, add an + // edge from the source of that edge to `node`. 
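Step 2(a) above is what gives the "@<node name>" device syntax its meaning: the named node and the node carrying the "@" reference end up in the same connected component and therefore on the same device. A hedged sketch of building such a pair with the GraphDefBuilder helpers and the dummy ops registered in the placer test below ("TestInput", "VariableGPU", "TestAssign"); the helper function itself is illustrative:

#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/graph/graph_def_builder.h"
#include "tensorflow/core/public/status.h"

namespace tensorflow {

Status BuildColocatedPair(Graph* g) {
  GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
  Node* input = ops::SourceOp("TestInput", b.opts().WithName("in"));
  Node* var = ops::SourceOp("VariableGPU", b.opts().WithName("var"));
  // "@var" asks the placer to put "assign" on whatever device "var" gets.
  ops::BinaryOp("TestAssign", var, input,
                b.opts().WithName("assign").WithDevice("@var"));
  return b.ToGraph(g);
}

}  // namespace tensorflow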
+ for (const auto& edge : node->in_edges()) { + if (!edge->IsControlEdge() && + IsRefType(node->input_type(edge->dst_input()))) { + status = colocation_graph.ColocateNodes(*edge->src(), *node); + if (!status.ok()) { + return AttachDef( + errors::InvalidArgument("Cannot satisfy colocation constraint " + "implied by reference connection: ", + status.error_message()), + node->def()); + } + } + } + } + + // 3. For each node, assign a device based on the constraints in the + // disjoint node set. + for (Node* node : graph_->nodes()) { + // Skip the source and sink nodes. + if (!node->IsOp()) { + continue; + } + // Skip nodes that already have an assigned name. + if (!node->assigned_device_name().empty()) { + continue; + } + + status = colocation_graph.AssignDevice(node); + if (!status.ok()) { + return AttachDef( + errors::InvalidArgument("Cannot assign a device to node '", + node->name(), "': ", status.error_message()), + node->def()); + } + } + return Status::OK(); +} + +Status SimplePlacer::GetNodeByName(const string& name, Node** out_node) const { + NodeNameToIdMap::const_iterator iter = name_to_id_map_->find(name); + if (iter != name_to_id_map_->end()) { + *out_node = graph_->FindNodeId(iter->second); + if (*out_node) { + return Status::OK(); + } + } + return errors::NotFound(name); +} + +} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/simple_placer.h b/tensorflow/core/common_runtime/simple_placer.h new file mode 100644 index 0000000000..4b3df50c72 --- /dev/null +++ b/tensorflow/core/common_runtime/simple_placer.h @@ -0,0 +1,81 @@ +#ifndef TENSORFLOW_COMMON_RUNTIME_SIMPLE_PLACER_H_ +#define TENSORFLOW_COMMON_RUNTIME_SIMPLE_PLACER_H_ + +#include <string> +#include <unordered_map> + +#include "tensorflow/core/common_runtime/device_set.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/public/status.h" +#include "tensorflow/core/util/device_name_utils.h" +#include "tensorflow/core/public/session_options.h" + +namespace tensorflow { + +// A placement algorithm that assigns the nodes of the given Graph to +// devices the given DeviceSet, respecting the following constraints: +// +// 1. Existing device assignments remain unchanged. +// 2. Requested (partial or complete) device specifications in the +// are granted. +// 3. Nodes connected by edges of a reference type are colocated on +// the same device. +// 4. Given nodes "A" and "B", if node "B" has the device specification +// "@A", nodes "A" and "B" will be colocated on the same device. +// +// The implementation builds a constraint graph with the same set of +// nodes, and edges that represent colocation constraints between +// nodes. Each connected component in the resulting constraint graph +// is then assigned to a single device. +// +// TODO(mrry): "Soft" constraints, such as "place node 'x' as close as +// possible to node 'y' while respecting the other constraints"? +// TODO(mrry): Create a common interface for this and the other +// placement algorithms so that they may be injected into the graph +// builder. +class SimplePlacer { + public: + // A map from graph node names to numerical IDs (in a Graph object). + typedef std::unordered_map<string, int> NodeNameToIdMap; + + // Creates an instance of the SimplePlacer algorithm for the given + // Graph "graph" (nodes in which may or may not be assigned) on the + // given DeviceSet "devices". The "name_to_id_map" maps the names of + // nodes in "g" to their numerical ID. 
+ // + // REQUIRES: for all mappings (k, v) in "name_to_id_map", + // graph.FindNodeId(v)->name() == k. + // + // The "graph", "devices", and "name_to_id_map" pointer arguments + // are borrowed by this SimplePlacer, and must outlive it. + SimplePlacer(Graph* graph, const DeviceSet* devices, + const NodeNameToIdMap* name_to_id_map, + const SessionOptions* options); + + SimplePlacer(Graph* graph, const DeviceSet* devices, + const NodeNameToIdMap* name_to_id_map); + + ~SimplePlacer(); + + // Assigns each node in this SimplePlacer's graph to a device in its + // set of devices. + // + // This method is not thread-safe. + // Run() may be invoked at most once. + Status Run(); + + private: + Status GetNodeByName(const string& name, Node** out_node) const; + + Graph* const graph_; // Not owned. + const DeviceSet* const devices_; // Not owned. + const NodeNameToIdMap* const name_to_id_map_; // Not owned. + const SessionOptions* options_; // Not owned. + + TF_DISALLOW_COPY_AND_ASSIGN(SimplePlacer); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMMON_RUNTIME_SIMPLE_PLACER_H_ diff --git a/tensorflow/core/common_runtime/simple_placer_test.cc b/tensorflow/core/common_runtime/simple_placer_test.cc new file mode 100644 index 0000000000..3139962d7e --- /dev/null +++ b/tensorflow/core/common_runtime/simple_placer_test.cc @@ -0,0 +1,863 @@ +#include "tensorflow/core/common_runtime/simple_placer.h" + +#include <memory> +#include <string> +#include <utility> +#include <vector> + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_set.h" +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/kernel_def_builder.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/graph/graph_def_builder.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/core/error_codes.pb.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include <gtest/gtest.h> + +namespace tensorflow { + +namespace { + +//////////////////////////////////////////////////////////////////////////////// +// +// Op, kernel, and device registrations to set up the environment. +// +// The SimplePlacer uses information about the op (input types), +// kernel (device constraints), and available devices to make +// placement decisions. To avoid depending on the full runtime, we +// define dummy implementations of these, and register them with the +// runtime. +// +//////////////////////////////////////////////////////////////////////////////// + +// A dummy OpKernel that is used to register ops on different devices. +class DummyOp : public OpKernel { + public: + explicit DummyOp(OpKernelConstruction* context) : OpKernel(context) {} + void Compute(OpKernelContext* context) override {} +}; + +// A fake device that has specific device attributes, used to simulate +// the presence of a CPU or a GPU (without depending on that part of +// the runtime. 
+class FakeDevice : public Device { + private: + explicit FakeDevice(const DeviceAttributes& device_attributes) + : Device(nullptr, device_attributes, nullptr) {} + + public: + Status Sync() override { return errors::Unimplemented("FakeDevice::Sync()"); } + + Allocator* GetAllocator(AllocatorAttributes attr) override { return nullptr; } + + static std::unique_ptr<Device> MakeCPU(const string& name) { + DeviceAttributes device_attributes; + device_attributes.set_name(name); + device_attributes.set_device_type(DeviceType(DEVICE_CPU).type()); + return std::unique_ptr<Device>(new FakeDevice(device_attributes)); + } + + static std::unique_ptr<Device> MakeGPU(const string& name) { + DeviceAttributes device_attributes; + device_attributes.set_name(name); + device_attributes.set_device_type(DeviceType(DEVICE_GPU).type()); + return std::unique_ptr<Device>(new FakeDevice(device_attributes)); + } +}; + +// Register the following ops so they can be added to a Graph, and +// kernels so that they can be placed on particular device types. +REGISTER_OP("TestVariable").Output("o: Ref(float)"); +REGISTER_KERNEL_BUILDER(Name("TestVariable").Device(DEVICE_CPU), DummyOp); +REGISTER_KERNEL_BUILDER(Name("TestVariable").Device(DEVICE_GPU), DummyOp); + +REGISTER_OP("VariableCPU").Output("o: Ref(float)"); +REGISTER_KERNEL_BUILDER(Name("VariableCPU").Device(DEVICE_CPU), DummyOp); + +REGISTER_OP("VariableGPU").Output("o: Ref(float)"); +REGISTER_KERNEL_BUILDER(Name("VariableGPU").Device(DEVICE_GPU), DummyOp); + +REGISTER_OP("VariableNoKernels").Output("o: Ref(float)"); + +REGISTER_OP("TestAdd").Input("a: float").Input("b: float").Output("o: float"); +REGISTER_KERNEL_BUILDER(Name("TestAdd").Device(DEVICE_CPU), DummyOp); +REGISTER_KERNEL_BUILDER(Name("TestAdd").Device(DEVICE_GPU), DummyOp); + +REGISTER_OP("TestRelu").Input("i: float").Output("o: float"); +REGISTER_KERNEL_BUILDER(Name("TestRelu").Device(DEVICE_CPU), DummyOp); +REGISTER_KERNEL_BUILDER(Name("TestRelu").Device(DEVICE_GPU), DummyOp); + +REGISTER_OP("ReluGPU").Input("i: float").Output("o: float"); +REGISTER_KERNEL_BUILDER(Name("ReluGPU").Device(DEVICE_GPU), DummyOp); + +REGISTER_OP("TestAssign").Input("i: Ref(float)").Input("v: float"); +REGISTER_KERNEL_BUILDER(Name("TestAssign").Device(DEVICE_CPU), DummyOp); +REGISTER_KERNEL_BUILDER(Name("TestAssign").Device(DEVICE_GPU), DummyOp); + +REGISTER_OP("AssignCPU").Input("i: Ref(float)").Input("v: float"); +REGISTER_KERNEL_BUILDER(Name("AssignCPU").Device(DEVICE_CPU), DummyOp); + +REGISTER_OP("AssignGPU").Input("i: Ref(float)").Input("v: float"); +REGISTER_KERNEL_BUILDER(Name("AssignGPU").Device(DEVICE_GPU), DummyOp); + +REGISTER_OP("TestInput").Output("a: float").Output("b: float"); +REGISTER_KERNEL_BUILDER(Name("TestInput").Device(DEVICE_CPU), DummyOp); + +REGISTER_OP("TestDevice").Output("a: float").Output("b: float"); +REGISTER_KERNEL_BUILDER(Name("TestDevice").Device(DEVICE_GPU), DummyOp); + +REGISTER_OP("TestDeviceEnforce").Input("a: Ref(float)").Output("b: float"); +REGISTER_KERNEL_BUILDER(Name("TestDeviceEnforce").Device(DEVICE_CPU), DummyOp); +REGISTER_KERNEL_BUILDER(Name("TestDeviceEnforce").Device(DEVICE_GPU), DummyOp); + +//////////////////////////////////////////////////////////////////////////////// +// +// A SimplePlacerTest method has three phases: +// +// 1. Build a TensorFlow graph, with no (or partial) device assignments. +// 2. Attempt to compute a placement using the SimplePlacer. +// 3. 
EITHER: test that the constraints implied by the graph are respected; +// or that an appropriate error was reported. +// +//////////////////////////////////////////////////////////////////////////////// +class SimplePlacerTest : public ::testing::Test { + protected: + SimplePlacerTest() { + RequireDefaultOps(); + // Build a set of 10 GPU and 10 CPU devices. + // NOTE: this->local_devices_ owns the device objects; + // this->devices_ contains borrowed pointers to the device + // objects. + for (int i = 0; i < 10; ++i) { + local_devices_.emplace_back(FakeDevice::MakeCPU( + strings::StrCat("/job:a/replica:0/task:0/cpu:", i))); + devices_.AddDevice(local_devices_.back().get()); + // Insert the GPUs in reverse order. + local_devices_.emplace_back(FakeDevice::MakeGPU( + strings::StrCat("/job:a/replica:0/task:0/gpu:", 9 - i))); + devices_.AddDevice(local_devices_.back().get()); + } + } + + // Builds the given graph, and (if successful) indexes the node + // names for use in placement, and later lookup. + Status BuildGraph(const GraphDefBuilder& builder, Graph* out_graph) { + TF_RETURN_IF_ERROR(builder.ToGraph(out_graph)); + nodes_by_name_.clear(); + for (Node* node : out_graph->nodes()) { + nodes_by_name_[node->name()] = node->id(); + } + return Status::OK(); + } + + // Invokes the SimplePlacer on "graph". If no DeviceSet is specified, the + // placement will use the default DeviceSet (of 10 CPU and 10 GPU devices). + // + // REQUIRES: "*graph" was produced by the most recent call to BuildGraph. + Status Place(Graph* graph, DeviceSet* devices, SessionOptions* options) { + SimplePlacer placer(graph, devices, &nodes_by_name_, options); + return placer.Run(); + } + + Status Place(Graph* graph, DeviceSet* devices) { + return Place(graph, devices, nullptr); + } + + Status Place(Graph* graph, SessionOptions* options) { + return Place(graph, &devices_, options); + } + + Status Place(Graph* graph) { return Place(graph, &devices_, nullptr); } + + // Returns the node in "graph" with the given name. + // + // REQUIRES: "graph" was produced by the most recent call to BuildGraph. + Node* GetNodeByName(const Graph& graph, const string& name) { + const auto search = nodes_by_name_.find(name); + CHECK(search != nodes_by_name_.end()) << "Unknown node name: " << name; + return graph.FindNodeId(search->second); + } + + protected: + std::vector<std::unique_ptr<Device>> local_devices_; + DeviceSet devices_; + SimplePlacer::NodeNameToIdMap nodes_by_name_; + + Status ReferenceTestHelper(const string& variable_op_type, + const string& assign_op_type, + DeviceType expected_device_type); +}; + +#define EXPECT_COLOCATED(g, name_a, name_b) \ + do { \ + Graph& g_ = (g); \ + EXPECT_EQ(GetNodeByName(g_, (name_a))->assigned_device_name(), \ + GetNodeByName(g_, (name_b))->assigned_device_name()); \ + } while (0) + +#define EXPECT_DEVICE_TYPE(g, name, expected_device_type) \ + EXPECT_EQ(DeviceType(expected_device_type).type(), \ + devices_.FindDeviceByName( \ + GetNodeByName((g), (name))->assigned_device_name()) \ + ->attributes() \ + .device_type()) + +#define EXPECT_DEVICE_CONTAINS(g, name, device_substr) \ + EXPECT_TRUE(StringPiece(GetNodeByName((g), (name))->assigned_device_name()) \ + .contains(device_substr)) + +// Test that a graph with no constraints will successfully assign nodes to the +// "best available" device (i.e. prefer GPU over CPU). +TEST_F(SimplePlacerTest, TestNoConstraints) { + Graph g(OpRegistry::Global()); + { // Scope for temporary variables used to construct g. 
+ GraphDefBuilder b(GraphDefBuilder::kFailImmediately); + Node* input = ops::SourceOp("TestInput", b.opts().WithName("in")); + ops::UnaryOp("TestRelu", ops::NodeOut(input, 0), b.opts().WithName("n1")); + ops::UnaryOp("TestRelu", ops::NodeOut(input, 1), b.opts().WithName("n2")); + EXPECT_OK(BuildGraph(b, &g)); + } + + EXPECT_OK(Place(&g)); + EXPECT_DEVICE_TYPE(g, "in", DEVICE_CPU); + EXPECT_DEVICE_TYPE(g, "n1", DEVICE_GPU); + EXPECT_DEVICE_TYPE(g, "n2", DEVICE_GPU); +} + +// Test that a graph with device type and reference constraints on +// some of the ops will successfully assign nodes to the constrained +// device, and colocate nodes with reference connections. +TEST_F(SimplePlacerTest, TestDeviceTypeConstraints) { + Graph g(OpRegistry::Global()); + { // Scope for temporary variables used to construct g. + GraphDefBuilder b(GraphDefBuilder::kFailImmediately); + Node* input = ops::SourceOp("TestInput", b.opts().WithName("in")); + Node* var_cpu = ops::SourceOp("VariableCPU", b.opts().WithName("var_cpu")); + ops::BinaryOp("AssignCPU", var_cpu, input, b.opts().WithName("assign_cpu")); + Node* var_gpu = ops::SourceOp("VariableGPU", b.opts().WithName("var_gpu")); + ops::BinaryOp("AssignGPU", var_gpu, input, b.opts().WithName("assign_gpu")); + EXPECT_OK(BuildGraph(b, &g)); + } + + EXPECT_OK(Place(&g)); + EXPECT_DEVICE_TYPE(g, "in", DEVICE_CPU); + EXPECT_DEVICE_TYPE(g, "var_cpu", DEVICE_CPU); + EXPECT_DEVICE_TYPE(g, "assign_cpu", DEVICE_CPU); + EXPECT_COLOCATED(g, "var_cpu", "assign_cpu"); + EXPECT_DEVICE_TYPE(g, "var_gpu", DEVICE_GPU); + EXPECT_DEVICE_TYPE(g, "assign_gpu", DEVICE_GPU); + EXPECT_COLOCATED(g, "var_gpu", "assign_gpu"); +} + +// Test that a graph with partial device specifications on the ops +// will successfully +TEST_F(SimplePlacerTest, TestPartialSpec) { + Graph g(OpRegistry::Global()); + { // Scope for temporary variables used to construct g. + GraphDefBuilder b(GraphDefBuilder::kFailImmediately); + ops::SourceOp("TestInput", b.opts().WithName("in").WithDevice("/job:a")); + ops::SourceOp("TestVariable", + b.opts().WithName("var").WithDevice("/job:a")); + EXPECT_OK(BuildGraph(b, &g)); + } + + EXPECT_OK(Place(&g)); + EXPECT_DEVICE_TYPE(g, "in", DEVICE_CPU); + EXPECT_DEVICE_CONTAINS(g, "in", "/job:a"); + EXPECT_DEVICE_TYPE(g, "var", DEVICE_GPU); + EXPECT_DEVICE_CONTAINS(g, "var", "/job:a"); +} + +// Test that a node with an assigned device is not relocated. +TEST_F(SimplePlacerTest, TestAssignedDevicePreserved) { + Graph g(OpRegistry::Global()); + { // Scope for temporary variables used to construct g. + GraphDefBuilder b(GraphDefBuilder::kFailImmediately); + ops::SourceOp("TestInput", b.opts().WithName("in")); + EXPECT_OK(BuildGraph(b, &g)); + } + + GetNodeByName(g, "in") + ->set_assigned_device_name("/job:a/replica:0/task:0/cpu:7"); + + EXPECT_OK(Place(&g)); + EXPECT_EQ("/job:a/replica:0/task:0/cpu:7", + GetNodeByName(g, "in")->assigned_device_name()); +} + +// Test that a graph with partial device specifications for CPU-only ops +// will be relocated to CPU. +TEST_F(SimplePlacerTest, TestPartialSpecGpuToCpu) { + Graph g(OpRegistry::Global()); + { // Scope for temporary variables used to construct g. 
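+    // Both ops below request "/gpu:0", but "TestInput" only has a CPU kernel,
+    // so this placement can only succeed because allow_soft_placement (set
+    // further down) lets the placer move "in" onto a CPU device.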
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    ops::SourceOp("TestInput", b.opts().WithName("in").WithDevice("/gpu:0"));
+    ops::SourceOp("TestVariable",
+                  b.opts().WithName("var").WithDevice("/gpu:0"));
+    EXPECT_OK(BuildGraph(b, &g));
+  }
+
+  SessionOptions options;
+  options.config.set_allow_soft_placement(true);
+  EXPECT_OK(Place(&g, &options));
+  EXPECT_DEVICE_TYPE(g, "in", DEVICE_CPU);
+  EXPECT_DEVICE_CONTAINS(g, "in", "/cpu");
+  EXPECT_DEVICE_TYPE(g, "var", DEVICE_GPU);
+  EXPECT_DEVICE_CONTAINS(g, "var", "/gpu:0");
+}
+
+// Test that placement fails for a node that was assigned to a GPU device
+// but has no OpKernel registered for that device type.
+TEST_F(SimplePlacerTest, TestAssignedGpuDeviceToCpuDevice) {
+  Graph g(OpRegistry::Global());
+  { // Scope for temporary variables used to construct g.
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    ops::SourceOp("TestInput", b.opts().WithName("in"));
+    EXPECT_OK(BuildGraph(b, &g));
+  }
+
+  GetNodeByName(g, "in")
+      ->set_assigned_device_name("/job:a/replica:0/task:0/gpu:0");
+
+  Status s = Place(&g);
+  EXPECT_EQ(error::INTERNAL, s.code());
+  EXPECT_TRUE(
+      StringPiece(s.error_message())
+          .contains("Assigned device '/job:a/replica:0/task:0/gpu:0' "
+                    "does not have registered OpKernel support for TestInput"));
+}
+
+// Test that graphs with reference connections are correctly placed.
+
+// Build a graph containing a Variable op of "variable_op_type" and an
+// Assign op of "assign_op_type", and expect all of the ops to be
+// placed on a device of type "expected_device_type".
+Status SimplePlacerTest::ReferenceTestHelper(const string& variable_op_type,
+                                             const string& assign_op_type,
+                                             DeviceType expected_device_type) {
+  Graph g(OpRegistry::Global());
+  { // Scope for temporary variables used to construct g.
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* input = ops::SourceOp("TestInput", b.opts().WithName("in"));
+    // Build ten variable-and-assignment pairs.
+    for (int i = 0; i < 10; ++i) {
+      Node* var = ops::SourceOp(variable_op_type,
+                                b.opts().WithName(strings::StrCat("var_", i)));
+      ops::BinaryOp(assign_op_type, var, input,
+                    b.opts().WithName(strings::StrCat("assign_", i)));
+    }
+    EXPECT_OK(BuildGraph(b, &g));
+  }
+
+  TF_RETURN_IF_ERROR(Place(&g));
+
+  for (int i = 0; i < 10; ++i) {
+    EXPECT_COLOCATED(g, strings::StrCat("var_", i),
+                     strings::StrCat("assign_", i));
+    EXPECT_DEVICE_TYPE(g, strings::StrCat("var_", i), expected_device_type);
+    EXPECT_DEVICE_TYPE(g, strings::StrCat("assign_", i), expected_device_type);
+  }
+
+  return Status::OK();
+}
+
+// Test all 3^2 = 9 combinations of Variable and Assignment op types
+// (unconstrained, CPU-only, and GPU-only).
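+//
+// For reference, the outcome expected by the test below for each pair, given
+// the kernels registered at the top of this file, is:
+//
+//                   TestAssign    AssignCPU    AssignGPU
+//   TestVariable    GPU           CPU          GPU
+//   VariableCPU     CPU           CPU          error
+//   VariableGPU     GPU           error        GPU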
+TEST_F(SimplePlacerTest, TestReferenceConnection) {
+  EXPECT_OK(ReferenceTestHelper("TestVariable", "TestAssign", DEVICE_GPU));
+  EXPECT_OK(ReferenceTestHelper("TestVariable", "AssignCPU", DEVICE_CPU));
+  EXPECT_OK(ReferenceTestHelper("TestVariable", "AssignGPU", DEVICE_GPU));
+  EXPECT_OK(ReferenceTestHelper("VariableCPU", "TestAssign", DEVICE_CPU));
+  EXPECT_OK(ReferenceTestHelper("VariableCPU", "AssignCPU", DEVICE_CPU));
+  {
+    Status s = ReferenceTestHelper("VariableCPU", "AssignGPU", DEVICE_CPU);
+    EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
+    EXPECT_TRUE(StringPiece(s.error_message())
+                    .contains("no device type supports both of those nodes"));
+  }
+  EXPECT_OK(ReferenceTestHelper("VariableGPU", "TestAssign", DEVICE_GPU));
+  {
+    Status s = ReferenceTestHelper("VariableGPU", "AssignCPU", DEVICE_CPU);
+    EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
+    EXPECT_TRUE(StringPiece(s.error_message())
+                    .contains("no device type supports both of those nodes"));
+  }
+  EXPECT_OK(ReferenceTestHelper("VariableGPU", "AssignGPU", DEVICE_GPU));
+}
+
+// Test the handling of '@node_name' colocation constraints, when
+// these are arranged in multiple chains.
+TEST_F(SimplePlacerTest, TestColocatedChain) {
+  Graph g(OpRegistry::Global());
+  { // Scope for temporary variables used to construct g.
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* input = ops::SourceOp("TestInput", b.opts().WithName("in"));
+    Node* last_node = input;
+    for (int i = 0; i < 100; ++i) {
+      if (i % 10 == 0) {
+        // Every ten nodes, start a new chain.
+        last_node = ops::UnaryOp("TestRelu", last_node,
+                                 b.opts().WithName(strings::StrCat("n_", i)));
+      } else {
+        // Chain each successive node to the previous one.
+        last_node =
+            ops::UnaryOp("TestRelu", last_node,
+                         b.opts()
+                             .WithName(strings::StrCat("n_", i))
+                             .WithDevice(strings::StrCat("@n_", i - 1)));
+      }
+    }
+    EXPECT_OK(BuildGraph(b, &g));
+  }
+
+  EXPECT_OK(Place(&g));
+  for (int i = 0; i < 100; ++i) {
+    if (i % 10 != 0) {
+      // Every node in a chain should be colocated with that chain's head,
+      // n_{i - (i % 10)}.
+      EXPECT_COLOCATED(g, strings::StrCat("n_", i - (i % 10)),
+                       strings::StrCat("n_", i));
+    }
+  }
+}
+
+// Test the handling of '@node_name' colocation constraints, when the
+// chains are shuffled.
+TEST_F(SimplePlacerTest, TestColocatedChainWithLongRangeColocations) {
+  Graph g(OpRegistry::Global());
+  { // Scope for temporary variables used to construct g.
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* input = ops::SourceOp("TestInput", b.opts().WithName("in"));
+    Node* last_node = input;
+    for (int i = 0; i < 10; ++i) {
+      // Start ten chains.
+      last_node = ops::UnaryOp("TestRelu", last_node,
+                               b.opts().WithName(strings::StrCat("n_", i)));
+    }
+    for (int i = 10; i < 100; ++i) {
+      // Add each node to the (i % 10)^th chain.
+      last_node = ops::UnaryOp("TestRelu", last_node,
+                               b.opts()
+                                   .WithName(strings::StrCat("n_", i))
+                                   .WithDevice(strings::StrCat("@n_", i % 10)));
+    }
+    EXPECT_OK(BuildGraph(b, &g));
+  }
+
+  EXPECT_OK(Place(&g));
+  for (int i = 10; i < 100; ++i) {
+    EXPECT_COLOCATED(g, strings::StrCat("n_", i % 10),
+                     strings::StrCat("n_", i));
+  }
+}
+
+TEST_F(SimplePlacerTest, TestColocationAndReferenceConnections) {
+  Graph g(OpRegistry::Global());
+  { // Scope for temporary variables used to construct g.
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* input = ops::SourceOp("TestInput", b.opts().WithName("in"));
+    for (int i = 0; i < 10; ++i) {
+      // Declare ten variable and assignment pairs.
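+      // Each assign takes its variable by reference, so the placer must put
+      // every var_i/assign_i pair on the same device.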
+ Node* var = ops::SourceOp("TestVariable", + b.opts().WithName(strings::StrCat("var_", i))); + ops::BinaryOp("TestAssign", var, input, + b.opts().WithName(strings::StrCat("assign_", i))); + } + for (int i = 10; i < 100; ++i) { + // Create a variable colocated with some existing variable, and + // an assignment colocated with a possibly-different variable. + Node* var = ops::SourceOp( + "TestVariable", b.opts() + .WithName(strings::StrCat("var_", i)) + .WithDevice(strings::StrCat("@var_", i % 6))); + ops::BinaryOp("TestAssign", var, input, + b.opts() + .WithName(strings::StrCat("assign_", i)) + .WithDevice(strings::StrCat("@assign_", i % 3))); + } + EXPECT_OK(BuildGraph(b, &g)); + } + + EXPECT_OK(Place(&g)); + for (int i = 0; i < 10; ++i) { + EXPECT_COLOCATED(g, strings::StrCat("var_", i), + strings::StrCat("assign_", i)); + } + for (int i = 10; i < 100; ++i) { + EXPECT_COLOCATED(g, strings::StrCat("var_", i), + strings::StrCat("assign_", i)); + EXPECT_COLOCATED(g, strings::StrCat("var_", i), + strings::StrCat("var_", i % 6)); + EXPECT_COLOCATED(g, strings::StrCat("assign_", i), + strings::StrCat("assign_", i % 3)); + } +} + +// Test that placement fails when no devices are registered. +TEST_F(SimplePlacerTest, TestEmptyDeviceSet) { + Graph g(OpRegistry::Global()); + { // Scope for temporary variables used to construct g. + GraphDefBuilder b(GraphDefBuilder::kFailImmediately); + ops::SourceOp("TestInput", b.opts().WithName("in")); + EXPECT_OK(BuildGraph(b, &g)); + } + + DeviceSet empty; + + Status s = Place(&g, &empty); + EXPECT_TRUE( + StringPiece(s.error_message()).contains("No devices are registered")); +} + +// Test that placement fails when the requested device forces an +// indirect constraint to be violated. +TEST_F(SimplePlacerTest, TestHeterogeneousDeviceSetFailure) { + Graph g(OpRegistry::Global()); + { // Scope for temporary variables used to construct g. + GraphDefBuilder b(GraphDefBuilder::kFailImmediately); + Node* in = ops::SourceOp("TestInput", b.opts().WithName("in")); + Node* var = ops::SourceOp("VariableGPU", b.opts().WithName("var")); + ops::BinaryOp("TestAssign", var, in, + b.opts().WithName("assign").WithDevice("/job:b/task:1")); + EXPECT_OK(BuildGraph(b, &g)); + } + + DeviceSet heterogeneous; + std::unique_ptr<Device> gpu( + FakeDevice::MakeGPU("/job:b/replica:0/task:0/gpu:0")); + heterogeneous.AddDevice(gpu.get()); + std::unique_ptr<Device> cpu( + FakeDevice::MakeCPU("/job:b/replica:0/task:1/cpu:0")); + heterogeneous.AddDevice(cpu.get()); + Status s = Place(&g, &heterogeneous); + EXPECT_EQ(error::INVALID_ARGUMENT, s.code()); + EXPECT_TRUE(StringPiece(s.error_message()) + .contains("colocated with a group of nodes that required " + "incompatible device")); +} + +// Test that placement fails when an unknown device is requested. +TEST_F(SimplePlacerTest, TestUnknownDevice) { + Graph g(OpRegistry::Global()); + { // Scope for temporary variables used to construct g. + GraphDefBuilder b(GraphDefBuilder::kFailImmediately); + ops::SourceOp("TestInput", b.opts().WithName("in").WithDevice("/job:foo")); + EXPECT_OK(BuildGraph(b, &g)); + } + + Status s = Place(&g); + EXPECT_EQ(error::INVALID_ARGUMENT, s.code()); + EXPECT_TRUE( + StringPiece(s.error_message()) + .contains( + "Could not satisfy explicit device specification '/job:foo'")); +} + +// Test that placement fails when the combination of partial +// constraints leads to an unknown device. 
+TEST_F(SimplePlacerTest, TestUnknownMergedDevice) { + Graph g(OpRegistry::Global()); + { // Scope for temporary variables used to construct g. + GraphDefBuilder b(GraphDefBuilder::kFailImmediately); + ops::SourceOp("TestInput", b.opts().WithName("in").WithDevice("/job:foo")); + EXPECT_OK(BuildGraph(b, &g)); + } + + Status s = Place(&g); + EXPECT_EQ(error::INVALID_ARGUMENT, s.code()); + EXPECT_TRUE( + StringPiece(s.error_message()) + .contains( + "Could not satisfy explicit device specification '/job:foo'")); +} + +// Test that placement fails when the previously-assigned device for a +// node is unknown. +TEST_F(SimplePlacerTest, TestUnknownAssignedDevice) { + Graph g(OpRegistry::Global()); + { // Scope for temporary variables used to construct g. + GraphDefBuilder b(GraphDefBuilder::kFailImmediately); + ops::SourceOp("TestInput", b.opts().WithName("in")); + EXPECT_OK(BuildGraph(b, &g)); + } + + GetNodeByName(g, "in")->set_assigned_device_name("/job:foo"); + + Status s = Place(&g); + EXPECT_EQ(error::INTERNAL, s.code()); + EXPECT_TRUE( + StringPiece(s.error_message()) + .contains("Assigned device '/job:foo' does not match any device")); +} + +// Test that placement fails when an op with no registered kernels is +// requested. +TEST_F(SimplePlacerTest, TestNoKernelsRegistered) { + Graph g(OpRegistry::Global()); + { // Scope for temporary variables used to construct g. + GraphDefBuilder b(GraphDefBuilder::kFailImmediately); + ops::SourceOp("VariableNoKernels", b.opts().WithName("var")); + EXPECT_OK(BuildGraph(b, &g)); + } + + Status s = Place(&g); + EXPECT_EQ(error::INVALID_ARGUMENT, s.code()); + EXPECT_TRUE( + StringPiece(s.error_message()) + .contains( + "No OpKernel was registered to support Op 'VariableNoKernels'")); +} + +// Test that placement fails when a kernel is registered but no known +// device supports it. +TEST_F(SimplePlacerTest, TestNoDevicesRegistered) { + Graph g(OpRegistry::Global()); + { // Scope for temporary variables used to construct g. + GraphDefBuilder b(GraphDefBuilder::kFailImmediately); + ops::SourceOp("VariableGPU", b.opts().WithName("var")); + EXPECT_OK(BuildGraph(b, &g)); + } + + DeviceSet cpu_only; + std::unique_ptr<Device> cpu( + FakeDevice::MakeCPU("/job:a/replica:0/task:0/cpu:0")); + cpu_only.AddDevice(cpu.get()); + + Status s = Place(&g, &cpu_only); + EXPECT_EQ(error::INVALID_ARGUMENT, s.code()); + EXPECT_TRUE(StringPiece(s.error_message()) + .contains("No OpKernel was registered to support " + "Op 'VariableGPU'")); +} + +// Test that placement fails when a requested device is malformed. +TEST_F(SimplePlacerTest, TestMalformedDeviceSpecification) { + Graph g(OpRegistry::Global()); + { // Scope for temporary variables used to construct g. + GraphDefBuilder b(GraphDefBuilder::kFailImmediately); + ops::SourceOp("TestInput", b.opts().WithName("in").WithDevice("/foo:bar")); + EXPECT_OK(BuildGraph(b, &g)); + } + + Status s = Place(&g); + EXPECT_EQ(error::INVALID_ARGUMENT, s.code()); + EXPECT_TRUE(StringPiece(s.error_message()) + .contains("Malformed device specification '/foo:bar'")); +} + +// Test that placement fails when a previously-assigned device is malformed. +TEST_F(SimplePlacerTest, TestMalformedAssignedDevice) { + Graph g(OpRegistry::Global()); + { // Scope for temporary variables used to construct g. 
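+    // "/foo:bar" does not parse as a device name. Because it is installed via
+    // set_assigned_device_name() below (i.e. it is treated as system-provided
+    // state rather than a user request), the failure is reported as INTERNAL
+    // rather than INVALID_ARGUMENT.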
+ GraphDefBuilder b(GraphDefBuilder::kFailImmediately); + ops::SourceOp("TestInput", b.opts().WithName("in")); + EXPECT_OK(BuildGraph(b, &g)); + } + + GetNodeByName(g, "in")->set_assigned_device_name("/foo:bar"); + + Status s = Place(&g); + EXPECT_EQ(error::INTERNAL, s.code()); + EXPECT_TRUE(StringPiece(s.error_message()) + .contains("Malformed assigned device '/foo:bar'")); +} + +// Test that placement fails when a device was previously assigned to +// a node, but it does not uniquely identify a particular device. +TEST_F(SimplePlacerTest, TestNonUniqueAssignedDevice) { + Graph g(OpRegistry::Global()); + { // Scope for temporary variables used to construct g. + GraphDefBuilder b(GraphDefBuilder::kFailImmediately); + ops::SourceOp("TestInput", b.opts().WithName("in")); + EXPECT_OK(BuildGraph(b, &g)); + } + + GetNodeByName(g, "in")->set_assigned_device_name("/job:a"); + + Status s = Place(&g); + EXPECT_EQ(error::INTERNAL, s.code()); + EXPECT_TRUE( + StringPiece(s.error_message()) + .contains("Assigned device '/job:a' does not match any device")); +} + +// Test that placement fails when a node requests colocation with another +// node that does not exist. +TEST_F(SimplePlacerTest, TestUnknownColocatedNode) { + Graph g(OpRegistry::Global()); + { // Scope for temporary variables used to construct g. + GraphDefBuilder b(GraphDefBuilder::kFailImmediately); + ops::SourceOp("TestInput", b.opts().WithName("in").WithDevice("@foo")); + EXPECT_OK(BuildGraph(b, &g)); + } + + Status s = Place(&g); + EXPECT_EQ(error::INVALID_ARGUMENT, s.code()); + EXPECT_TRUE(StringPiece(s.error_message()).contains("'foo' does not exist")); +} + +// Test that placement fails when a node requests colocation with a +// malformed node name. +TEST_F(SimplePlacerTest, TestMalformedColocatedNode) { + Graph g(OpRegistry::Global()); + { // Scope for temporary variables used to construct g. + GraphDefBuilder b(GraphDefBuilder::kFailImmediately); + ops::SourceOp("TestInput", b.opts().WithName("in").WithDevice("@")); + EXPECT_OK(BuildGraph(b, &g)); + } + + Status s = Place(&g); + EXPECT_EQ(error::INVALID_ARGUMENT, s.code()); + EXPECT_TRUE(StringPiece(s.error_message()) + .contains("node named in device '' does not exist")); +} + +// Test that ops request to be placed on non-existent devices will be relocated +// to existing device of the same type if allow_soft_placement is set. +TEST_F(SimplePlacerTest, TestNonexistentGpuAllowSoftPlacement) { + Graph g(OpRegistry::Global()); + { // Scope for temporary variables used to construct g. + GraphDefBuilder b(GraphDefBuilder::kFailImmediately); + ops::SourceOp("TestDevice", b.opts().WithName("in").WithDevice("/gpu:11")); + EXPECT_OK(BuildGraph(b, &g)); + } + + SessionOptions options; + options.config.set_allow_soft_placement(true); + EXPECT_OK(Place(&g, &options)); + EXPECT_DEVICE_CONTAINS(g, "in", "/gpu:0"); +} + +// Test that ops request to be placed on non-existent devices will fail if +// allow_soft_placement is not set. +TEST_F(SimplePlacerTest, TestNonexistentGpuNoAllowSoftPlacement) { + Graph g(OpRegistry::Global()); + { // Scope for temporary variables used to construct g. 
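+    // "/gpu:11" does not exist (the fixture only registers gpu:0 through
+    // gpu:9). Without allow_soft_placement the placer must fail rather than
+    // substitute another GPU.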
+ GraphDefBuilder b(GraphDefBuilder::kFailImmediately); + ops::SourceOp("TestDevice", b.opts().WithName("in").WithDevice("/gpu:11")); + EXPECT_OK(BuildGraph(b, &g)); + } + + SessionOptions options; + Status s = Place(&g, &options); + EXPECT_EQ(error::INVALID_ARGUMENT, s.code()); + EXPECT_TRUE( + StringPiece(s.error_message()) + .contains( + "Could not satisfy explicit device specification '/gpu:11'")); +} + +// Test that placement fails when a node requests an explicit device that is not +// supported by the registered kernels if allow_soft_placement is no set. +TEST_F(SimplePlacerTest, TestUnsupportedDeviceNoAllowSoftPlacement) { + Graph g(OpRegistry::Global()); + { // Scope for temporary variables used to construct g. + GraphDefBuilder b(GraphDefBuilder::kFailImmediately); + ops::SourceOp("VariableGPU", b.opts().WithName("var").WithDevice("/cpu:0")); + EXPECT_OK(BuildGraph(b, &g)); + } + + SessionOptions options; + Status s = Place(&g, &options); + EXPECT_EQ(error::INVALID_ARGUMENT, s.code()); + EXPECT_TRUE( + StringPiece(s.error_message()) + .contains( + "Could not satisfy explicit device specification '/cpu:0'")); +} + +TEST_F(SimplePlacerTest, TestUnsupportedDeviceAllowSoftPlacement) { + Graph g(OpRegistry::Global()); + { // Scope for temporary variables used to construct g. + GraphDefBuilder b(GraphDefBuilder::kFailImmediately); + ops::SourceOp("VariableGPU", b.opts().WithName("var").WithDevice("/cpu:0")); + EXPECT_OK(BuildGraph(b, &g)); + } + + SessionOptions options; + options.config.set_allow_soft_placement(true); + EXPECT_OK(Place(&g, &options)); +} + +// Test that a graph with device type and reference constraints on +// some of the ops will successfully assign nodes to the constrained +// device, and colocate nodes with reference connections. +TEST_F(SimplePlacerTest, TestDeviceTypeConstraintsAllowSoftPlacement) { + Graph g(OpRegistry::Global()); + { // Scope for temporary variables used to construct g. + GraphDefBuilder b(GraphDefBuilder::kFailImmediately); + // var_gpu has ref output and runs on GPU. + // force_gpu takes var_gpu and requested CPU. + // Verify that both are placed on GPU. + Node* var_gpu = ops::SourceOp("VariableGPU", b.opts().WithName("var_gpu")); + ops::UnaryOp("TestDeviceEnforce", var_gpu, + b.opts().WithName("force_gpu").WithDevice("/cpu:0")); + // var_cpu has ref output and runs on CPU. + // force_cpu takes var_cpu and requested GPU. + // Verify that both are placed on CPU. + Node* var_cpu = ops::SourceOp("VariableCPU", b.opts().WithName("var_cpu")); + ops::UnaryOp("TestDeviceEnforce", var_cpu, + b.opts().WithName("force_cpu").WithDevice("/gpu:0")); + EXPECT_OK(BuildGraph(b, &g)); + } + + SessionOptions options; + options.config.set_allow_soft_placement(true); + EXPECT_OK(Place(&g, &options)); + EXPECT_DEVICE_TYPE(g, "var_gpu", DEVICE_GPU); + EXPECT_DEVICE_TYPE(g, "force_gpu", DEVICE_GPU); + EXPECT_COLOCATED(g, "var_gpu", "force_gpu"); + EXPECT_DEVICE_TYPE(g, "var_cpu", DEVICE_CPU); + EXPECT_DEVICE_TYPE(g, "force_cpu", DEVICE_CPU); + EXPECT_COLOCATED(g, "var_cpu", "force_cpu"); +} + +// Test that placement fails when two nodes have a reference connection +// constraint, and each node requires a mutually incompatible device. +TEST_F(SimplePlacerTest, TestUnsatisfiableConstraintWithReferenceConnections) { + Graph g(OpRegistry::Global()); + { // Scope for temporary variables used to construct g. 
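+    // "VariableGPU" only has a GPU kernel and "AssignCPU" only has a CPU
+    // kernel; the reference edge between them forces colocation, so no single
+    // device type can satisfy both and placement must fail.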
+ GraphDefBuilder b(GraphDefBuilder::kFailImmediately); + Node* var = ops::SourceOp("VariableGPU", b.opts().WithName("var")); + Node* input = ops::SourceOp("TestInput", b.opts().WithName("in")); + ops::BinaryOp("AssignCPU", var, input, b.opts().WithName("assign")); + EXPECT_OK(BuildGraph(b, &g)); + } + + Status s = Place(&g); + EXPECT_EQ(error::INVALID_ARGUMENT, s.code()); + EXPECT_TRUE(StringPiece(s.error_message()) + .contains("Cannot colocate nodes 'var' and 'assign'")); +} + +// Test that placement fails when two nodes have an explicit +// colocation constraint, and each node requires a mutually +// incompatible device. +TEST_F(SimplePlacerTest, TestUnsatisfiableConstraintWithColocatedNodes) { + Graph g(OpRegistry::Global()); + { // Scope for temporary variables used to construct g. + GraphDefBuilder b(GraphDefBuilder::kFailImmediately); + Node* input = ops::SourceOp("TestInput", + b.opts().WithName("in").WithDevice("/gpu:0")); + Node* relu_1 = ops::UnaryOp("TestRelu", input, + b.opts().WithName("relu_1").WithDevice("@in")); + ops::UnaryOp("ReluGPU", relu_1, + b.opts().WithName("relu_2").WithDevice("@relu_1")); + EXPECT_OK(BuildGraph(b, &g)); + } + + Status s = Place(&g); + EXPECT_EQ(error::INVALID_ARGUMENT, s.code()); + EXPECT_TRUE(StringPiece(s.error_message()) + .contains("Cannot colocate nodes 'relu_1' and 'relu_2'")); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc new file mode 100644 index 0000000000..4806e69c67 --- /dev/null +++ b/tensorflow/core/common_runtime/threadpool_device.cc @@ -0,0 +1,55 @@ +#include "tensorflow/core/common_runtime/threadpool_device.h" + +#include "tensorflow/core/common_runtime/local_device.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/graph/types.h" +#include "tensorflow/core/lib/hash/hash.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/platform/tracing.h" +#include "tensorflow/core/public/session_options.h" + +namespace tensorflow { + +ThreadPoolDevice::ThreadPoolDevice(const SessionOptions& options, + const string& name, Bytes memory_limit, + BusAdjacency bus_adjacency, + Allocator* allocator) + : LocalDevice(options, Device::BuildDeviceAttributes( + name, DEVICE_CPU, memory_limit, bus_adjacency), + allocator), + allocator_(allocator) {} + +ThreadPoolDevice::~ThreadPoolDevice() {} + +void ThreadPoolDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) { + if (port::Tracing::IsActive()) { + // TODO(pbar) We really need a useful identifier of the graph node. 
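+    // For now, hash the kernel's name to get a stable identifier for the
+    // tracing activity that wraps this Compute() call.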
+ const uint64 id = Hash64(op_kernel->name()); + port::Tracing::ScopedActivity region(port::Tracing::EventCategory::kCompute, + id); + op_kernel->Compute(context); + } else { + op_kernel->Compute(context); + } +} + +Allocator* ThreadPoolDevice::GetAllocator(AllocatorAttributes attr) { + return allocator_; +} + +Status ThreadPoolDevice::MakeTensorFromProto( + const TensorProto& tensor_proto, const AllocatorAttributes alloc_attrs, + Tensor* tensor) { + Tensor parsed(tensor_proto.dtype()); + if (!parsed.FromProto(cpu_allocator(), tensor_proto)) { + return errors::InvalidArgument("Cannot parse tensor from proto: ", + tensor_proto.DebugString()); + } + *tensor = parsed; + return Status::OK(); +} + +} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/threadpool_device.h b/tensorflow/core/common_runtime/threadpool_device.h new file mode 100644 index 0000000000..5b0347231f --- /dev/null +++ b/tensorflow/core/common_runtime/threadpool_device.h @@ -0,0 +1,31 @@ +#ifndef TENSORFLOW_COMMON_RUNTIME_THREADPOOL_DEVICE_H_ +#define TENSORFLOW_COMMON_RUNTIME_THREADPOOL_DEVICE_H_ + +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/local_device.h" + +namespace tensorflow { + +// CPU device implementation. +class ThreadPoolDevice : public LocalDevice { + public: + ThreadPoolDevice(const SessionOptions& options, const string& name, + Bytes memory_limit, BusAdjacency bus_adjacency, + Allocator* allocator); + ~ThreadPoolDevice() override; + + void Compute(OpKernel* op_kernel, OpKernelContext* context) override; + Allocator* GetAllocator(AllocatorAttributes attr) override; + Status MakeTensorFromProto(const TensorProto& tensor_proto, + const AllocatorAttributes alloc_attrs, + Tensor* tensor) override; + + Status Sync() override { return Status::OK(); } + + private: + Allocator* allocator_; // Not owned +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMMON_RUNTIME_THREADPOOL_DEVICE_H_ diff --git a/tensorflow/core/common_runtime/threadpool_device_factory.cc b/tensorflow/core/common_runtime/threadpool_device_factory.cc new file mode 100644 index 0000000000..ee6319abad --- /dev/null +++ b/tensorflow/core/common_runtime/threadpool_device_factory.cc @@ -0,0 +1,31 @@ +// Register a factory that provides CPU devices. +#include "tensorflow/core/common_runtime/threadpool_device.h" + +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/public/session_options.h" + +namespace tensorflow { + +// TODO(zhifengc/tucker): Figure out the bytes of available RAM. +class ThreadPoolDeviceFactory : public DeviceFactory { + public: + void CreateDevices(const SessionOptions& options, const string& name_prefix, + std::vector<Device*>* devices) override { + // TODO(zhifengc/tucker): Figure out the number of available CPUs + // and/or NUMA configuration. + int n = 1; + auto iter = options.config.device_count().find("CPU"); + if (iter != options.config.device_count().end()) { + n = iter->second; + } + for (int i = 0; i < n; i++) { + string name = strings::StrCat(name_prefix, "/cpu:", i); + devices->push_back(new ThreadPoolDevice(options, name, Bytes(256 << 20), + BUS_ANY, cpu_allocator())); + } + } +}; +REGISTER_LOCAL_DEVICE_FACTORY("CPU", ThreadPoolDeviceFactory); + +} // namespace tensorflow |
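+
+// A rough usage sketch (illustrative only, not part of this change): the
+// number of CPU devices this factory creates is driven by the session
+// configuration, e.g.
+//
+//   SessionOptions options;
+//   (*options.config.mutable_device_count())["CPU"] = 2;
+//
+// would make CreateDevices() emit two devices named
+// "<name_prefix>/cpu:0" and "<name_prefix>/cpu:1".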