diff options
author | Justin Lebar <jlebar@google.com> | 2018-03-15 02:22:17 -0700 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2018-03-15 02:25:56 -0700 |
commit | b08c54271084b05ea822b3348a3a448a9fe3b898 (patch) | |
tree | 49e71a99b2f6c974edd73761e4d5928c9a914b1f /tensorflow/stream_executor | |
parent | 9037e241de1e64044ff55ab539ccc1fb013c178a (diff) |
[SE] [XLA:GPU] Inform --xla_hlo_profile of the GPU's memory bandwidth.
Add a memory_bandwidth() property to StreamExecutor's DeviceDescription,
and use this in the GPU's --xla_hlo_profile.
PiperOrigin-RevId: 189157407
Diffstat (limited to 'tensorflow/stream_executor')
-rw-r--r-- | tensorflow/stream_executor/cuda/cuda_driver.cc | 13 | ||||
-rw-r--r-- | tensorflow/stream_executor/cuda/cuda_driver.h | 10 | ||||
-rw-r--r-- | tensorflow/stream_executor/cuda/cuda_gpu_executor.cc | 12 | ||||
-rw-r--r-- | tensorflow/stream_executor/device_description.cc | 3 | ||||
-rw-r--r-- | tensorflow/stream_executor/device_description.h | 9 |
5 files changed, 46 insertions, 1 deletion
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc index a017ff64d4..58e1e58c59 100644 --- a/tensorflow/stream_executor/cuda/cuda_driver.cc +++ b/tensorflow/stream_executor/cuda/cuda_driver.cc @@ -1503,6 +1503,19 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device, return true; } +/* static */ port::StatusOr<int> CUDADriver::GetDeviceAttribute( + CUdevice_attribute attribute, CUdevice device) { + int val; + CUresult res = cuDeviceGetAttribute(&val, attribute, device); + if (res != CUDA_SUCCESS) { + return port::Status{ + port::error::INTERNAL, + port::Printf("failed to get device attribute %d for device %d: %s", + attribute, device, ToString(res).c_str())}; + } + return val; +} + /* static */ bool CUDADriver::IsEccEnabled(CUdevice device, bool *result) { int value = -1; CUresult res = diff --git a/tensorflow/stream_executor/cuda/cuda_driver.h b/tensorflow/stream_executor/cuda/cuda_driver.h index 4002ba2021..fa9172b3f0 100644 --- a/tensorflow/stream_executor/cuda/cuda_driver.h +++ b/tensorflow/stream_executor/cuda/cuda_driver.h @@ -400,12 +400,20 @@ class CUDADriver { // Returns a grab-bag of device properties in a caller-owned device_properties // structure for device_ordinal via cuDeviceGetProperties. - // This call is deprecated in the NVIDIA driver API. + // + // This call is deprecated in the NVIDIA driver API; its replacement is + // GetDeviceAttribute // // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1g65a5b4e25186bd257df80b98c98cffe6 static bool GetDeviceProperties(CUdevprop *device_properties, int device_ordinal); + // Gets a specific integer-valued property about the given device. 
+ // + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266 + static port::StatusOr<int> GetDeviceAttribute(CUdevice_attribute attribute, + CUdevice device); + // Returns whether ECC is enabled for the given CUdevice via // cuDeviceGetattribute with CU_DEVICE_ATTRIBUTE_ECC_ENABLED. // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266 diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc index 4bbd531e14..5ecaf46b8c 100644 --- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc +++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc @@ -1103,6 +1103,18 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const { builder.set_device_memory_size(device_memory_size); } + port::StatusOr<int> mem_clock_khz = CUDADriver::GetDeviceAttribute( + CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device_ordinal_); + port::StatusOr<int> mem_bus_width_bits = CUDADriver::GetDeviceAttribute( + CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device_ordinal_); + if (mem_clock_khz.ok() && mem_bus_width_bits.ok()) { + // Times 2 because HBM is DDR memory; it gets two data bits per each data + // lane. 
+ builder.set_memory_bandwidth(2 * int64_t{mem_clock_khz.ValueOrDie()} * + 1000 * + int64_t{mem_bus_width_bits.ValueOrDie()} / 8); + } + { BlockDim block_dim_limit; FillBlockDimLimit(&block_dim_limit); diff --git a/tensorflow/stream_executor/device_description.cc b/tensorflow/stream_executor/device_description.cc index a98143e34b..52f5319a3b 100644 --- a/tensorflow/stream_executor/device_description.cc +++ b/tensorflow/stream_executor/device_description.cc @@ -50,6 +50,7 @@ DeviceDescription::DeviceDescription() shared_memory_alloc_granularity_(1), device_address_bits_(kUninitializedUint64), device_memory_size_(kUninitializedUint64), + memory_bandwidth_(kUninitializedUint64), shared_memory_per_core_(kUninitializedUint64), shared_memory_per_block_(kUninitializedUint64), clock_rate_ghz_(-1.0), @@ -85,6 +86,8 @@ std::unique_ptr<std::map<string, string>> DeviceDescription::ToMap() const { result["Device Address Bits"] = port::StrCat(device_address_bits()); result["Device Memory Size"] = port::HumanReadableNumBytes::ToString(device_memory_size()); + result["Memory Bandwidth"] = port::StrCat( + port::HumanReadableNumBytes::ToString(memory_bandwidth_), "/s"); result["Shared Memory Per Core"] = port::HumanReadableNumBytes::ToString(shared_memory_per_core_); diff --git a/tensorflow/stream_executor/device_description.h b/tensorflow/stream_executor/device_description.h index f2b35bcb43..fcf0928096 100644 --- a/tensorflow/stream_executor/device_description.h +++ b/tensorflow/stream_executor/device_description.h @@ -140,6 +140,11 @@ class DeviceDescription { // Returns the device memory size in bytes. uint64 device_memory_size() const { return device_memory_size_; } + // Returns the device's memory bandwidth in bytes/sec. (This is for + // reads/writes to/from the device's own memory, not for transfers between the + // host and device.) + uint64 memory_bandwidth() const { return memory_bandwidth_; } + // Returns the device's core clock rate in GHz. 
float clock_rate_ghz() const { return clock_rate_ghz_; } @@ -212,6 +217,7 @@ class DeviceDescription { uint64 device_address_bits_; uint64 device_memory_size_; + uint64 memory_bandwidth_; // Shared memory limits on a given device. uint64 shared_memory_per_core_; @@ -305,6 +311,9 @@ class DeviceDescriptionBuilder { void set_device_memory_size(uint64 value) { device_description_->device_memory_size_ = value; } + void set_memory_bandwidth(uint64 value) { + device_description_->memory_bandwidth_ = value; + } void set_shared_memory_per_core(int64 value) { device_description_->shared_memory_per_core_ = value; |