Diffstat (limited to 'tensorflow/core/common_runtime/gpu/gpu_device.cc')
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_device.cc  651
1 file changed, 651 insertions, 0 deletions
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
new file mode 100644
index 0000000000..26d34645f1
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -0,0 +1,651 @@
+// TODO(opensource): Use a more generic sounding preprocessor name than
+// GOOGLE_CUDA
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/common_runtime/gpu/gpu_device.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+//#include "base/commandlineflags.h"
+#include "tensorflow/stream_executor/cuda/cuda_activation.h"
+#include "tensorflow/stream_executor/multi_platform_manager.h"
+#include "tensorflow/stream_executor/stream.h"
+#include "tensorflow/stream_executor/stream_executor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_stream_util.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
+#include "tensorflow/core/common_runtime/gpu/process_state.h"
+#include "tensorflow/core/common_runtime/gpu_device_context.h"
+#include "tensorflow/core/common_runtime/local_device.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/graph/types.h"
+#include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/platform/tracing.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/util/device_name_utils.h"
+
+#if defined(PLATFORM_GOOGLE)
+DEFINE_bool(brain_gpu_sync_every_op, false,
+            "If true, call GPUUtil::Sync() between every dispatched opkernel.");
+
+DEFINE_int32(brain_gpu_max_streams, 1,
+             "Max number of GPU streams to use for computation.");
+#else
+// TODO(opensource): These should be made options in some options struct,
+// rather than flags.
+bool FLAGS_brain_gpu_sync_every_op = false;
+tensorflow::int32 FLAGS_brain_gpu_max_streams = 1;
+#endif
+
+namespace gpu = ::perftools::gputools;
+
+namespace tensorflow {
+
+// Eigen Ops directly allocate memory only for temporary buffers used
+// during OpKernel::Compute(). The recommended way of allocating such
+// memory is via OpKernelContext::allocate_temp(). However, Eigen Ops
+// don't have access to OpKernelContext, instead they get access to
+// memory directly through the device allocator. As an Open Source
+// project, Eigen assumes allocator semantics similar to those of the
+// CUDA memory allocator, and may not work correctly due to race
+// conditions if used with some other allocator. For safety, we need
+// to delay deallocation calls out of Eigen until all events on the
+// corresponding stream have completed. The following two classes
+// serve this purpose in two different compilation environments.
+
+#if defined(__GCUDACC__) || defined(__GCUDACC_HOST__)
+class EigenAllocator : public ::Eigen::Allocator {
+ public:
+  explicit EigenAllocator(gpu::Stream* stream, ::tensorflow::Allocator* alloc,
+                          EventMgr* em)
+      : stream_(stream), allocator_(alloc), em_(em) {}
+
+  void* allocate(size_t num_bytes) const override {
+    void* ret = allocator_->AllocateRaw(32 /* alignment */, num_bytes);
+    // Eigen doesn't typically check the return pointer from allocate,
+    // so we do it here and die with a more helpful error message.
+    if (ret == nullptr) {
+      LOG(FATAL) << "EigenAllocator for GPU ran out of memory when allocating "
+                 << num_bytes << ". See error logs for more detailed info.";
+    }
+    return ret;
+  }
+
+  void deallocate(void* buffer) const override {
+    em_->ThenDeleteBuffer(stream_, {allocator_, buffer});
+  }
+
+ private:
+  gpu::Stream* stream_;                 // Not owned.
+  ::tensorflow::Allocator* allocator_;  // Not owned.
+  ::tensorflow::EventMgr* em_;          // Not owned.
+
+  TF_DISALLOW_COPY_AND_ASSIGN(EigenAllocator);
+};
+
+#else
+class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
+ public:
+  EigenCudaStreamDevice(const cudaStream_t* cuda_stream, int gpu_id,
+                        ::tensorflow::Allocator* alloc)
+      : stream_(cuda_stream), allocator_(alloc) {
+    Eigen::initializeDeviceProp();
+    device_prop_ = &Eigen::m_deviceProperties[gpu_id];
+  }
+
+  const cudaStream_t& stream() const override { return *stream_; }
+  const cudaDeviceProp& deviceProperties() const override {
+    return *device_prop_;
+  }
+
+  void* allocate(size_t num_bytes) const override {
+    void* ret = allocator_->AllocateRaw(32 /* alignment */, num_bytes);
+    if (ret == nullptr) {
+      LOG(FATAL) << "EigenAllocator for GPU ran out of memory when allocating "
+                 << num_bytes << ". See error logs for more detailed info.";
+    }
+
+    return ret;
+  }
+  void deallocate(void* buffer) const override {
+    AsyncFreeData* afData = new AsyncFreeData(allocator_, buffer);
+    cudaError_t err = cudaStreamAddCallback(*stream_, asyncFree, afData, 0);
+    CHECK_EQ(err, cudaSuccess);
+  }
+
+ private:
+  struct AsyncFreeData {
+    AsyncFreeData(::tensorflow::Allocator* a, void* p)
+        : allocator_(a), address_(p) {}
+    ::tensorflow::Allocator* allocator_;
+    void* address_;
+  };
+
+  static void CUDART_CB asyncFree(cudaStream_t stream, cudaError_t status,
+                                  void* userData) {
+    AsyncFreeData* data = static_cast<AsyncFreeData*>(userData);
+    data->allocator_->DeallocateRaw(data->address_);
+    delete data;
+  }
+
+  const cudaStream_t* stream_;          // Not owned.
+  const cudaDeviceProp* device_prop_;   // Not owned.
+  ::tensorflow::Allocator* allocator_;  // Not owned.
+
+  TF_DISALLOW_COPY_AND_ASSIGN(EigenCudaStreamDevice);
+};
+
+#endif
+
+BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
+                             Bytes memory_limit, BusAdjacency bus_adjacency,
+                             int gpu_id, const string& physical_device_desc,
+                             Allocator* gpu_allocator, Allocator* cpu_allocator)
+    : LocalDevice(options, Device::BuildDeviceAttributes(
+                               name, DEVICE_GPU, memory_limit, bus_adjacency,
+                               physical_device_desc),
+                  gpu_allocator),
+      gpu_allocator_(gpu_allocator),
+      cpu_allocator_(cpu_allocator),
+      gpu_id_(gpu_id) {
+  gpu::StreamExecutor* executor =
+      GPUMachineManager()->ExecutorForDevice(gpu_id_).ValueOrDie();
+  if (!executor) {
+    LOG(ERROR) << "Failed to get StreamExecutor for device " << gpu_id_;
+    return;
+  }
+  em_.reset(new EventMgr(executor));
+
+  if (FLAGS_brain_gpu_max_streams < 1) {
+    LOG(FATAL) << "Invalid value for brain_gpu_max_streams.";
+  }
+
+  // Create the specified number of GPU streams
+  for (int i = 0; i < FLAGS_brain_gpu_max_streams; i++) {
+    auto stream = new gpu::Stream(executor);
+    stream->Init();
+    VLOG(2) << "Created stream[" << i << "] = " << stream;
+    streams_.push_back(stream);
+    device_contexts_.push_back(new GPUDeviceContext(i, stream));
+  }
+  gpu_device_info_ = new GpuDeviceInfo;
+  gpu_device_info_->stream = streams_[0];
+  gpu_device_info_->default_context = device_contexts_[0];
+  gpu_device_info_->event_mgr = em_.get();
+  set_tensorflow_gpu_device_info(gpu_device_info_);
+}
+
+BaseGPUDevice::~BaseGPUDevice() {
+  delete gpu_device_info_;
+  for (auto ctx : device_contexts_) ctx->Unref();
+  gtl::STLDeleteElements(&streams_);
+}
+
+Status BaseGPUDevice::FillContextMap(const Graph* graph,
+                                     DeviceContextMap* device_context_map) {
+  VLOG(2) << "FillContextMap";
+
+  const auto num_streams = streams_.size();
+  // Special case for single stream.
+  if (num_streams == 1) {
+    return Status::OK();
+  }
+  const int64 before = Env::Default()->NowMicros();
+  gpu_stream_util::AssignStreamsOpts opts;
+  opts.max_streams = num_streams;
+  std::unordered_map<int, int> node_to_stream_id;
+  TF_RETURN_IF_ERROR(
+      gpu_stream_util::AssignStreams(graph, opts, &node_to_stream_id));
+  int64 elapsed = Env::Default()->NowMicros() - before;
+  VLOG(3) << "AssignStreams took " << elapsed << "us";
+
+  // Fill in the context map. It is OK for this map to contain
+  // duplicate DeviceContexts so long as we increment the refcount.
+  for (Node* n : graph->nodes()) {
+    auto mapped_stream = node_to_stream_id[n->id()];
+    CHECK_LE(mapped_stream, num_streams);
+    auto ctx = device_contexts_[mapped_stream];
+    VLOG(3) << "Assigned stream " << node_to_stream_id[n->id()]
+            << " ==> stream[" << ctx->stream_id() << "] for node id " << n->id()
+            << " " << n->type_string() << " " << n->name();
+    ctx->Ref();
+    device_context_map->insert(std::make_pair(n->id(), ctx));
+  }
+
+  return Status::OK();
+}
+
+void BaseGPUDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) {
+  // ScopedActivity is cheap when tracing is not active, but we
+  // can avoid computing the Hash64.
+  // TODO(pbar) This would no longer be needed if Ops have a unique id.
+  const uint64 id = port::Tracing::IsActive() ?
+      Hash64(op_kernel->name()) : 0;
+  port::Tracing::ScopedActivity region(port::Tracing::EventCategory::kCompute,
+                                       id);
+
+  GPUDeviceContext* gpu_device_context = device_contexts_[0];
+  if (context->op_device_context() != nullptr) {
+    gpu_device_context =
+        static_cast<GPUDeviceContext*>(context->op_device_context());
+  }
+  gpu::Stream* stream = gpu_device_context->stream();
+  const auto stream_id = gpu_device_context->stream_id();
+
+  VLOG(1) << "GpuDevice::Compute " << op_kernel->name() << " op "
+          << op_kernel->def().op() << " on GPU" << gpu_id_ << " stream["
+          << stream_id << "]";
+
+  // NOTE(tucker): We need to discriminate between Eigen GPU
+  // operations and all others. If an operation is Eigen
+  // implemented (or otherwise tries to launch a cuda kernel
+  // directly), we need to establish a stacked-scoped environment
+  // that directs it to execute on the proper device. Otherwise we
+  // expect the Op to use StreamExecutor directly and correctly. The
+  // way we make this discrimination is quite hacky: At the moment
+  // the only non-Eigen GPU Op is the recv-op, which is known to be
+  // asynchronous.
+  if (op_kernel->type_string() == "_Recv") {
+    context->SetStatus(errors::Internal(
+        "Invalid synchronous 'Compute' on GPU for '_Recv' op"));
+  } else {
+    const string label =
+        strings::StrCat(op_kernel->name(), ":", op_kernel->type_string());
+    port::Tracing::ScopedAnnotation annotation(label);
+
+    const auto num_streams = streams_.size();
+    if (num_streams > 1) {
+      // If this op's device context is different from the other contexts,
+      // we must wait on the stream.
+      for (int i = 0; i < context->num_inputs(); ++i) {
+        const GPUDeviceContext* idc =
+            static_cast<GPUDeviceContext*>(context->input_device_context(i));
+        OP_REQUIRES(context, idc != nullptr,
+                    errors::Internal("Input device context ", i,
+                                     " was not set properly."));
+        if (VLOG_IS_ON(2)) {
+          const void* base;
+          size_t len;
+          if (context->has_input(i)) {
+            if (IsRefType(context->input_dtype(i))) {
+              Tensor tensor = context->mutable_input(i, false);
+              base = DMAHelper::base(&tensor);
+              len = tensor.TotalBytes();
+            } else {
+              const Tensor& tensor = context->input(i);
+              base = DMAHelper::base(&tensor);
+              len = tensor.TotalBytes();
+            }
+            VLOG(2) << "Input " << i << " " << base << " " << len;
+            VLOG(2) << " stream[" << stream_id << "].ThenWaitFor(stream["
+                    << idc->stream_id() << "])"
+                    << ((idc->stream() == stream) ? " not needed" : "");
+          }
+        }
+        if (idc->stream() != stream) stream->ThenWaitFor(idc->stream());
+      }
+    }
+    gpu::cuda::ScopedActivateExecutorContext scoped_activation{
+        stream->parent(), gpu::cuda::MultiOpActivation::kYes};
+    // Keep a copy of the inputs before Compute runs, in case they get
+    // deleted. TODO(misard) this will be fixed when the tracking is
+    // done right.
+    std::vector<Tensor>* tensor_refs = nullptr;
+    if (!FLAGS_brain_gpu_sync_every_op) {
+      tensor_refs = new std::vector<Tensor>;
+      tensor_refs->reserve(context->num_inputs() + context->num_outputs());
+      for (int ii = 0; ii < context->num_inputs(); ++ii) {
+        if (context->has_input(ii)) {
+          if (IsRefType(context->input_dtype(ii))) {
+            Tensor in = context->mutable_input(ii, false);
+            tensor_refs->push_back(in);
+          } else {
+            const Tensor& in = context->input(ii);
+            tensor_refs->push_back(in);
+          }
+        }
+      }
+    }
+    op_kernel->Compute(context);
+    if (context->status().ok()) {
+      if (FLAGS_brain_gpu_sync_every_op) {
+        // Note: GPUUtil::Sync() only syncs the default stream.
+        // We need to either sync the stream used by this op, or
+        // all streams.
+        // Given that this flag is typically used for
+        // debugging it makes more sense to sync all GPU activity.
+        context->SetStatus(GPUUtil::SyncAll(this));
+      } else {
+        // The GPU kernel has been queued, but may not complete for some
+        // time. As soon as this function completes, the caller will
+        // discard its refs on the inputs, outputs and any scratch
+        // tensors it created. Create additional refs here that will be
+        // held until the kernel completes.
+        for (int ii = 0; ii < context->num_temps(); ++ii) {
+          Tensor* temp = context->temp(ii);
+          VLOG(2) << "Saving ref to temp Tensor @ " << DMAHelper::base(temp);
+          tensor_refs->push_back(*temp);
+        }
+        for (int ii = 0; ii < context->num_outputs(); ++ii) {
+          Tensor* temp = context->mutable_output(ii);
+          if (nullptr != temp) {
+            tensor_refs->push_back(*temp);
+          }
+        }
+        em_->ThenDeleteTensors(stream, tensor_refs);
+      }
+    } else {
+      if (!FLAGS_brain_gpu_sync_every_op) {
+        delete tensor_refs;
+      }
+    }
+  }
+}
+
+Status BaseGPUDevice::Sync() { return GPUUtil::Sync(this); }
+
+void BaseGPUDevice::ComputeAsync(AsyncOpKernel* op_kernel,
+                                 OpKernelContext* context,
+                                 AsyncOpKernel::DoneCallback done) {
+  GPUDeviceContext* gpu_device_context = device_contexts_[0];
+  if (context->op_device_context() != nullptr) {
+    gpu_device_context =
+        static_cast<GPUDeviceContext*>(context->op_device_context());
+  }
+  const auto stream_id = gpu_device_context->stream_id();
+
+  VLOG(1) << "GpuDevice::ComputeAsync " << op_kernel->name() << " op "
+          << op_kernel->def().op() << " on GPU" << gpu_id_ << " stream["
+          << stream_id << "]";
+
+  port::Tracing::TraceMe activity(
+      strings::StrCat(op_kernel->name(), ":", op_kernel->type_string()));
+  op_kernel->ComputeAsync(context, done);
+}
+
+Status BaseGPUDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
+                                          const AllocatorAttributes alloc_attrs,
+                                          Tensor* tensor) {
+  AllocatorAttributes attr;
+  attr.set_on_host(true);
+  attr.set_gpu_compatible(true);
+  Allocator* host_alloc = GetAllocator(attr);
+  Tensor parsed(tensor_proto.dtype());
+  if (!parsed.FromProto(host_alloc, tensor_proto)) {
+    return errors::InvalidArgument("Cannot parse tensor from proto: ",
+                                   tensor_proto.DebugString());
+  }
+  Status status;
+  if (alloc_attrs.on_host()) {
+    *tensor = parsed;
+  } else {
+    if (!DMAHelper::CanUseDMA(&parsed)) {
+      return errors::Internal("GPU copy from non-DMA ",
+                              DataTypeString(parsed.dtype()), " tensor");
+    }
+    Tensor copy(GetAllocator(alloc_attrs), parsed.dtype(), parsed.shape());
+    port::Tracing::ScopedAnnotation annotation("MakeTensorFromProto");
+    Notification n;
+    device_contexts_[0]->CopyCPUTensorToDevice(&parsed, this, &copy,
+                                               [&n, &status](const Status& s) {
+                                                 status = s;
+                                                 n.Notify();
+                                               });
+    n.WaitForNotification();
+    *tensor = copy;
+  }
+  return status;
+}
+
+namespace {
+#if defined(__GCUDACC__) || defined(__GCUDACC_HOST__)
+class ConcretePerOpGpuDevice : public PerOpGpuDevice {
+ public:
+  explicit ConcretePerOpGpuDevice(gpu::Stream* stream,
+                                  EigenAllocator* allocator)
+      : device_(stream, allocator), allocator_(allocator) {}
+  ~ConcretePerOpGpuDevice() { delete allocator_; }
+
+  const Eigen::GpuDevice& device() const override { return device_; }
+
+ private:
+  Eigen::GpuDevice device_;
+  EigenAllocator* allocator_;
+};
+#else
+class ConcretePerOpGpuDevice : public PerOpGpuDevice {
+ public:
+  explicit ConcretePerOpGpuDevice(EigenCudaStreamDevice* stream_device)
+      : device_(stream_device), stream_device_(stream_device) {}
+  ~ConcretePerOpGpuDevice() { delete stream_device_; }
+
+  const Eigen::GpuDevice& device() const
+      override { return device_; }
+
+ private:
+  Eigen::GpuDevice device_;
+  EigenCudaStreamDevice* stream_device_;
+};
+#endif
+}  // namespace
+
+const PerOpGpuDevice* BaseGPUDevice::NewDevice(int stream_id,
+                                               Allocator* allocator) {
+#if defined(__GCUDACC__) || defined(__GCUDACC_HOST__)
+  auto ea = new EigenAllocator(streams_[stream_id], allocator, em_.get());
+  return new ConcretePerOpGpuDevice(streams_[stream_id], ea);
+#else
+  const cudaStream_t* cuda_stream = reinterpret_cast<const cudaStream_t*>(
+      streams_[stream_id]->implementation()->CudaStreamMemberHack());
+  auto es = new EigenCudaStreamDevice(cuda_stream, gpu_id_, allocator);
+  return new ConcretePerOpGpuDevice(es);
+#endif
+}
+
+const PerOpGpuDevice* BaseGPUDevice::MakeGpuDevice(DeviceContext* dc,
+                                                   Allocator* allocator) {
+  if (dc) {
+    const GPUDeviceContext* gpu_dc = static_cast<GPUDeviceContext*>(dc);
+    const int stream_id = gpu_dc->stream_id();
+    VLOG(1) << " eigen_gpu_device(" << dc << ") => stream[" << stream_id
+            << "]";
+    CHECK_LT(stream_id, streams_.size());
+    return NewDevice(stream_id, allocator);
+  } else {
+    return NewDevice(0, allocator);
+  }
+}
+
+void BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options,
+                                         const string& name_prefix,
+                                         std::vector<Device*>* devices) {
+  int n = INT_MAX;
+  auto iter = options.config.device_count().find("GPU");
+  if (iter != options.config.device_count().end()) {
+    n = iter->second;
+  }
+  std::vector<int> valid_gpu_ids;
+  GetValidDeviceIds(&valid_gpu_ids);
+  if (static_cast<size_t>(n) > valid_gpu_ids.size()) {
+    n = valid_gpu_ids.size();
+  }
+  for (int i = 0; i < n; i++) {
+    devices->push_back(CreateGPUDevice(
+        options, strings::StrCat(name_prefix, "/gpu:", i), valid_gpu_ids[i]));
+  }
+}
+
+namespace {
+int64 MinSystemMemory(int64 available_memory) {
+  // We use the following heuristic for now:
+  //
+  // If the available_memory is < 2GiB, we allocate 200MiB to system memory.
+  // Otherwise, allocate 300MiB to system memory.
+  //
+  // In the future we could be more sophisticated by using a table of
+  // devices.
+  if (available_memory < (1LL << 31)) {
+    // 200MiB
+    return 209715200LL;
+  } else {
+    // max(300 MiB, 0.05 * available_memory)
+    return std::max(314572800LL, static_cast<int64>(available_memory * 0.05));
+  }
+}
+}  // namespace
+
+static string GetShortDeviceDescription(int device_id,
+                                        const gpu::DeviceDescription& desc) {
+  return strings::StrCat("device: ", device_id, ", name: ", desc.name(),
+                         ", pci bus id: ", desc.pci_bus_id());
+}
+
+LocalDevice* BaseGPUDeviceFactory::CreateGPUDevice(
+    const SessionOptions& options, const string& name, int gpu_id) {
+  CHECK_GE(gpu_id, 0);
+
+  // Look up the device, to see its attributes.
+  gpu::Platform* gpu_platform = GPUMachineManager();
+  CHECK_LT(gpu_id, gpu_platform->VisibleDeviceCount());
+  gpu::StreamExecutor* se =
+      gpu_platform->ExecutorForDevice(gpu_id).ValueOrDie();
+  const gpu::DeviceDescription& desc = se->GetDeviceDescription();
+
+  int64 total_memory, available_memory;
+  CHECK(se->DeviceMemoryUsage(&available_memory, &total_memory));
+
+  int64 allocated_memory = available_memory;
+  double config_memory_fraction =
+      options.config.gpu_options().per_process_gpu_memory_fraction();
+  if (config_memory_fraction == 0) {
+    const int64 min_system_memory = MinSystemMemory(available_memory);
+    if (min_system_memory < allocated_memory) {
+      allocated_memory -= min_system_memory;
+    }
+  } else {
+    allocated_memory *= config_memory_fraction;
+  }
+
+  Bytes allocated_bytes = static_cast<Bytes>(allocated_memory);
+
+  // Get GPU BusAdjacency from its reported NUMA affinity.
+  // Because GPUs are virtualized in some environments, we can't just
+  // use the GPU id.
+  BusAdjacency bus_adjacency = BUS_ANY;
+  switch (desc.numa_node()) {
+    case 0:
+      bus_adjacency = BUS_0;
+      break;
+    case 1:
+      bus_adjacency = BUS_1;
+      break;
+    default:
+      bus_adjacency = BUS_ANY;
+  }
+  VLOG(1) << "GPUDevice id " << gpu_id << " on bus " << bus_adjacency
+          << " numa: " << desc.numa_node() << " pci: " << desc.pci_bus_id();
+
+  ProcessState* process_state = ProcessState::singleton();
+  return CreateGPUDevice(
+      options, name, allocated_bytes, bus_adjacency, gpu_id,
+      GetShortDeviceDescription(gpu_id, desc),
+      process_state->GetGPUAllocator(gpu_id, allocated_memory),
+      process_state->GetCPUAllocator(desc.numa_node()));
+}
+
+static int GetMinGPUMultiprocessorCount() {
+  static const int kDefaultMinGPUMultiprocessorCount = 8;
+
+  const char* tf_min_gpu_core_count = getenv("TF_MIN_GPU_MULTIPROCESSOR_COUNT");
+
+  if (tf_min_gpu_core_count == nullptr ||
+      strcmp(tf_min_gpu_core_count, "") == 0) {
+    return kDefaultMinGPUMultiprocessorCount;
+  }
+
+  int min_gpu_core_count = -1;
+  if (strings::safe_strto32(tf_min_gpu_core_count, &min_gpu_core_count)) {
+    if (min_gpu_core_count >= 0) {
+      return min_gpu_core_count;
+    }
+  }
+
+  LOG(ERROR) << "Invalid minimum GPU multiprocessor count: ["
+             << tf_min_gpu_core_count << "]. "
+             << "Using the default value: "
+             << kDefaultMinGPUMultiprocessorCount;
+  return kDefaultMinGPUMultiprocessorCount;
+}
+
+void BaseGPUDeviceFactory::GetValidDeviceIds(std::vector<int>* ids) {
+  auto gpu_manager = GPUMachineManager();
+  int min_gpu_core_count = GetMinGPUMultiprocessorCount();
+  if (gpu_manager) {
+    auto visible_device_count = gpu_manager->VisibleDeviceCount();
+    for (int i = 0; i < gpu_manager->VisibleDeviceCount(); ++i) {
+      auto exec_status = gpu_manager->ExecutorForDevice(i);
+      if (!exec_status.ok()) {
+        continue;
+      }
+      gpu::StreamExecutor* se = exec_status.ValueOrDie();
+      const gpu::DeviceDescription& desc = se->GetDeviceDescription();
+      int major, minor;
+      if (!desc.cuda_compute_capability(&major, &minor)) {
+        continue;
+      }
+      // Only consider GPUs with compute capability >= 3.5 (Kepler or
+      // higher)
+      if (major < 3 || (major == 3 && minor < 5)) {
+        LOG(INFO) << "Ignoring gpu device "
+                  << "(" << GetShortDeviceDescription(i, desc) << ") "
+                  << "with Cuda compute capability " << major << "." << minor
+                  << ". The minimum required Cuda capability is 3.5.";
+        continue;
+      }
+
+      // TensorFlow currently places computation on devices assuming
+      // they have similar capability.
+      //
+      // If there are multiple GPUs available on the machine, only
+      // consider GPUs with 8 or more multiprocessors.
+      //
+      // TODO(vrv): In the medium term: we should only filter out GPUs
+      // that are slow relative to the fastest GPU. In the long term,
+      // TensorFlow should support automatic placement based on
+      // capability.
+      if (visible_device_count > 1) {
+        if (desc.core_count() < min_gpu_core_count) {
+          LOG(INFO) << "Ignoring gpu device "
+                    << "(" << GetShortDeviceDescription(i, desc) << ") "
+                    << "with Cuda multiprocessor count: " << desc.core_count()
+                    << ". The minimum required count is " << min_gpu_core_count
+                    << ". You can adjust this requirement with the env var "
+                       "TF_MIN_GPU_MULTIPROCESSOR_COUNT.";
+          continue;
+        }
+      }
+
+      int new_id = ids->size();
+      ids->push_back(i);
+
+      LOG(INFO) << "Creating TensorFlow device (/gpu:" << new_id << ") -> "
+                << "(" << GetShortDeviceDescription(i, desc) << ")";
+    }
+  }
+}
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
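
A note on the deferred-deallocation idea used by EigenCudaStreamDevice::deallocate() in the diff: the buffer is not freed immediately; a callback is enqueued on the stream so the memory is only reclaimed once every kernel already queued on that stream has finished. Below is a minimal, self-contained sketch of the same pattern, not part of the diff above; the names SimplePool and DeferredFree are illustrative only. CUDA forbids calling CUDA API functions (such as cudaFree) from inside a stream callback, which is why the callback hands the pointer back to a host-side pool, just as the real code hands it back to the TensorFlow allocator.

#include <cuda_runtime.h>
#include <mutex>
#include <vector>

// Host-side pool that device pointers are returned to once it is safe.
struct SimplePool {
  std::mutex mu;
  std::vector<void*> reusable;  // pointers that may be handed out again
  void Release(void* p) {
    std::lock_guard<std::mutex> l(mu);
    reusable.push_back(p);
  }
};

struct DeferredFree {
  SimplePool* pool;
  void* ptr;
};

// Runs on a CUDA runtime thread once all work enqueued on the stream
// *before* the callback has completed.
static void CUDART_CB ReturnToPool(cudaStream_t, cudaError_t status, void* arg) {
  DeferredFree* d = static_cast<DeferredFree*>(arg);
  if (status == cudaSuccess) d->pool->Release(d->ptr);
  delete d;
}

// "Deallocate" a buffer that kernels already enqueued on `stream` may still use.
void DeallocateAfterStream(SimplePool* pool, cudaStream_t stream, void* ptr) {
  cudaStreamAddCallback(stream, ReturnToPool, new DeferredFree{pool, ptr}, 0);
}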
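
The multi-stream path in Compute() calls stream->ThenWaitFor(idc->stream()) so that a kernel launched on one stream does not consume inputs still being produced on another. The sketch below, again hypothetical and not part of the diff, expresses the same ordering constraint with the raw CUDA event API, which is roughly what such a stream-to-stream wait amounts to: the consumer stream waits for the current tail of the producer stream without blocking the host.

#include <cuda_runtime.h>

void MakeConsumerWaitForProducer(cudaStream_t producer, cudaStream_t consumer) {
  cudaEvent_t done;
  cudaEventCreateWithFlags(&done, cudaEventDisableTiming);
  cudaEventRecord(done, producer);         // marks everything enqueued so far
  cudaStreamWaitEvent(consumer, done, 0);  // consumer waits for that point
  cudaEventDestroy(done);                  // resources released once the event completes
}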
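
Finally, the memory sizing in CreateGPUDevice() is easy to check by hand: when per_process_gpu_memory_fraction is left at 0, the device gets everything the driver reports as free minus MinSystemMemory(); otherwise it simply gets the requested fraction of free memory. A small host-only program that reproduces the arithmetic with made-up numbers (assumed values, not part of the diff):

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Mirrors MinSystemMemory() from the diff above.
int64_t MinSystemMemory(int64_t available) {
  if (available < (1LL << 31)) return 209715200;  // < 2 GiB free: reserve 200 MiB
  // Otherwise reserve max(300 MiB, 5% of free memory) for the system.
  return std::max<int64_t>(314572800, static_cast<int64_t>(available * 0.05));
}

int main() {
  const int64_t available = 4LL << 30;  // pretend the driver reports 4 GiB free
  // per_process_gpu_memory_fraction == 0: subtract the system reserve.
  std::printf("fraction 0.0 -> %lld bytes for TensorFlow\n",
              static_cast<long long>(available - MinSystemMemory(available)));
  // per_process_gpu_memory_fraction == 0.4: take that share of free memory.
  std::printf("fraction 0.4 -> %lld bytes for TensorFlow\n",
              static_cast<long long>(available * 0.4));
  return 0;
}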