aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/python/profiler
diff options
context:
space:
mode:
authorGravatar A. Unique TensorFlower <gardener@tensorflow.org>2017-11-13 22:34:51 -0800
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2017-11-13 22:40:38 -0800
commit43c428ada3cd80b8d269c34f3724aaaec08f12de (patch)
tree174c3349a19099cbba7576d6185be7a8a891313d /tensorflow/python/profiler
parent2c26c98f8d1f15d064c76548393137f058043dc1 (diff)
OOM error with allocation information.
PiperOrigin-RevId: 175637128
Diffstat (limited to 'tensorflow/python/profiler')
-rw-r--r--tensorflow/python/profiler/BUILD1
-rw-r--r--tensorflow/python/profiler/model_analyzer_test.py59
2 files changed, 60 insertions, 0 deletions
diff --git a/tensorflow/python/profiler/BUILD b/tensorflow/python/profiler/BUILD
index 26cc5f0b74..519b05975f 100644
--- a/tensorflow/python/profiler/BUILD
+++ b/tensorflow/python/profiler/BUILD
@@ -53,6 +53,7 @@ cuda_py_test(
"//tensorflow/python:client",
"//tensorflow/python:client_testlib",
"//tensorflow/python:framework_for_generated_wrappers",
+ "//tensorflow/python:distributed_framework_test_lib",
"//tensorflow/python:platform",
"//tensorflow/python:variables",
],
diff --git a/tensorflow/python/profiler/model_analyzer_test.py b/tensorflow/python/profiler/model_analyzer_test.py
index 17c87bea92..698f8906d4 100644
--- a/tensorflow/python/profiler/model_analyzer_test.py
+++ b/tensorflow/python/profiler/model_analyzer_test.py
@@ -28,6 +28,8 @@ from tensorflow.core.protobuf import config_pb2
from tensorflow.core.protobuf import rewriter_config_pb2
from tensorflow.python.client import session
from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import random_ops
from tensorflow.python.ops import variables
from tensorflow.python.platform import gfile
from tensorflow.python.platform import test
@@ -635,6 +637,63 @@ class PrintModelAnalysisTest(test.TestCase):
self._trainLoop(x, 10, time_dir, time_steps,
memory_dir, memory_steps, profile_dir, dump_steps)
+ def testOOM(self):
+ if not test.is_gpu_available():
+ return
+ ops.reset_default_graph()
+ with ops.device('/device:GPU:0'):
+ a = random_ops.random_normal([1, 10000, 20000], name='test_random1')
+ b = random_ops.random_normal([30000, 10000, 1], name='test_random2')
+ c = a * b
+
+ try:
+ with session.Session() as sess:
+ sess.run(c, options=config_pb2.RunOptions(
+ report_tensor_allocations_upon_oom=True))
+ except Exception as e: # pylint: disable=broad-except
+ exception_str = '%s' % e
+ # This trace reports allocations for the two random tensors.
+ self.assertTrue(
+ 'OOM when allocating tensor with shape[30000,10000,20000]' in
+ exception_str)
+ mat = re.search('(.*)GiB from test_random2/RandomStandardNormal',
+ exception_str)
+ self.assertGreater(float(mat.group(1)), 0.0)
+ mat = re.search('(.*)MiB from test_random1/RandomStandardNormal',
+ exception_str)
+ self.assertGreater(float(mat.group(1)), 0.0)
+
+ def testDistributedOOM(self):
+ if not test.is_gpu_available():
+ return
+ ops.reset_default_graph()
+
+ workers, _ = test_util.create_local_cluster(2, 0)
+
+ with ops.device('/job:worker/replica:0/task:0/gpu:0'):
+ a = random_ops.random_normal([1, 10000, 20000], name='test_random1')
+ with ops.device('/job:worker/replica:0/task:1/gpu:0'):
+ b = random_ops.random_normal([30000, 10000, 1], name='test_random2')
+ c = a * b
+
+ try:
+ with session.Session(workers[1].target) as sess:
+ sess.run(c, options=config_pb2.RunOptions(
+ report_tensor_allocations_upon_oom=True))
+ except Exception as e: # pylint: disable=broad-except
+ exception_str = '%s' % e
+ # test_random2 is reported because it's allocated in worker 1.
+ self.assertTrue('Current usage from device: '
+ '/job:worker/replica:0/task:1/device:GPU:0, '
+ 'allocator: GPU_0_bfc' in exception_str)
+ mat = re.search('(.*)GiB from test_random2/RandomStandardNormal',
+ exception_str)
+ self.assertGreater(float(mat.group(1)), 0.0)
+ # test_random1 is not reported because it's allocated in worker 0.
+ mat = re.search('(.*)MiB from test_random1/RandomStandardNormal',
+ exception_str)
+ self.assertTrue(mat is None)
+
if __name__ == '__main__':
test.main()