about | summary | refs | log | tree | commit | diff | homepage
path: root/tensorflow/stream_executor
diff options
context:
space:
mode:
author: Justin Lebar <jlebar@google.com> 2018-03-15 02:22:17 -0700
committer: TensorFlower Gardener <gardener@tensorflow.org> 2018-03-15 02:25:56 -0700
commit: b08c54271084b05ea822b3348a3a448a9fe3b898 (patch)
tree: 49e71a99b2f6c974edd73761e4d5928c9a914b1f /tensorflow/stream_executor
parent: 9037e241de1e64044ff55ab539ccc1fb013c178a (diff)
[SE] [XLA:GPU] Inform --xla_hlo_profile of the GPU's memory bandwidth.
Add a memory_bandwidth() property to StreamExecutor's DeviceDescription, and use this in the GPU's --xla_hlo_profile.

PiperOrigin-RevId: 189157407
Diffstat (limited to 'tensorflow/stream_executor')
-rw-r--r-- tensorflow/stream_executor/cuda/cuda_driver.cc 13
-rw-r--r-- tensorflow/stream_executor/cuda/cuda_driver.h 10
-rw-r--r-- tensorflow/stream_executor/cuda/cuda_gpu_executor.cc 12
-rw-r--r-- tensorflow/stream_executor/device_description.cc 3
-rw-r--r-- tensorflow/stream_executor/device_description.h 9
5 files changed, 46 insertions, 1 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index a017ff64d4..58e1e58c59 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -1503,6 +1503,19 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
return true;
}
+/* static */ port::StatusOr<int> CUDADriver::GetDeviceAttribute(
+ CUdevice_attribute attribute, CUdevice device) {
+ int val;
+ CUresult res = cuDeviceGetAttribute(&val, attribute, device);
+ if (res != CUDA_SUCCESS) {
+ return port::Status{
+ port::error::INTERNAL,
+ port::Printf("failed to get device attribute %d for device %d: %s",
+ attribute, device, ToString(res).c_str())};
+ }
+ return val;
+}
+
/* static */ bool CUDADriver::IsEccEnabled(CUdevice device, bool *result) {
int value = -1;
CUresult res =
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.h b/tensorflow/stream_executor/cuda/cuda_driver.h
index 4002ba2021..fa9172b3f0 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.h
+++ b/tensorflow/stream_executor/cuda/cuda_driver.h
@@ -400,12 +400,20 @@ class CUDADriver {
// Returns a grab-bag of device properties in a caller-owned device_properties
// structure for device_ordinal via cuDeviceGetProperties.
- // This call is deprecated in the NVIDIA driver API.
+ //
+ // This call is deprecated in the NVIDIA driver API; its replacement is
+ // GetDeviceAttribute.
//
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1g65a5b4e25186bd257df80b98c98cffe6
static bool GetDeviceProperties(CUdevprop *device_properties,
int device_ordinal);
+ // Gets a specific integer-valued property about the given device.
+ //
+ // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
+ static port::StatusOr<int> GetDeviceAttribute(CUdevice_attribute attribute,
+ CUdevice device);
+
// Returns whether ECC is enabled for the given CUdevice via
// cuDeviceGetAttribute with CU_DEVICE_ATTRIBUTE_ECC_ENABLED.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 4bbd531e14..5ecaf46b8c 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -1103,6 +1103,18 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
builder.set_device_memory_size(device_memory_size);
}
+ port::StatusOr<int> mem_clock_khz = CUDADriver::GetDeviceAttribute(
+ CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device_ordinal_);
+ port::StatusOr<int> mem_bus_width_bits = CUDADriver::GetDeviceAttribute(
+ CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device_ordinal_);
+ if (mem_clock_khz.ok() && mem_bus_width_bits.ok()) {
+ // Times 2 because HBM is DDR memory; it transfers two data bits per clock
+ // cycle on each data lane.
+ builder.set_memory_bandwidth(2 * int64_t{mem_clock_khz.ValueOrDie()} *
+ 1000 *
+ int64_t{mem_bus_width_bits.ValueOrDie()} / 8);
+ }
+
{
BlockDim block_dim_limit;
FillBlockDimLimit(&block_dim_limit);
diff --git a/tensorflow/stream_executor/device_description.cc b/tensorflow/stream_executor/device_description.cc
index a98143e34b..52f5319a3b 100644
--- a/tensorflow/stream_executor/device_description.cc
+++ b/tensorflow/stream_executor/device_description.cc
@@ -50,6 +50,7 @@ DeviceDescription::DeviceDescription()
shared_memory_alloc_granularity_(1),
device_address_bits_(kUninitializedUint64),
device_memory_size_(kUninitializedUint64),
+ memory_bandwidth_(kUninitializedUint64),
shared_memory_per_core_(kUninitializedUint64),
shared_memory_per_block_(kUninitializedUint64),
clock_rate_ghz_(-1.0),
@@ -85,6 +86,8 @@ std::unique_ptr<std::map<string, string>> DeviceDescription::ToMap() const {
result["Device Address Bits"] = port::StrCat(device_address_bits());
result["Device Memory Size"] =
port::HumanReadableNumBytes::ToString(device_memory_size());
+ result["Memory Bandwidth"] = port::StrCat(
+ port::HumanReadableNumBytes::ToString(memory_bandwidth_), "/s");
result["Shared Memory Per Core"] =
port::HumanReadableNumBytes::ToString(shared_memory_per_core_);
diff --git a/tensorflow/stream_executor/device_description.h b/tensorflow/stream_executor/device_description.h
index f2b35bcb43..fcf0928096 100644
--- a/tensorflow/stream_executor/device_description.h
+++ b/tensorflow/stream_executor/device_description.h
@@ -140,6 +140,11 @@ class DeviceDescription {
// Returns the device memory size in bytes.
uint64 device_memory_size() const { return device_memory_size_; }
+ // Returns the device's memory bandwidth in bytes/sec. (This is for
+ // reads/writes to/from the device's own memory, not for transfers between the
+ // host and device.)
+ uint64 memory_bandwidth() const { return memory_bandwidth_; }
+
// Returns the device's core clock rate in GHz.
float clock_rate_ghz() const { return clock_rate_ghz_; }
@@ -212,6 +217,7 @@ class DeviceDescription {
uint64 device_address_bits_;
uint64 device_memory_size_;
+ uint64 memory_bandwidth_;
// Shared memory limits on a given device.
uint64 shared_memory_per_core_;
@@ -305,6 +311,9 @@ class DeviceDescriptionBuilder {
void set_device_memory_size(uint64 value) {
device_description_->device_memory_size_ = value;
}
+ void set_memory_bandwidth(uint64 value) {
+ device_description_->memory_bandwidth_ = value;
+ }
void set_shared_memory_per_core(int64 value) {
device_description_->shared_memory_per_core_ = value;