diff options
author | 2017-11-13 22:34:51 -0800 | |
---|---|---|
committer | 2017-11-13 22:40:38 -0800 | |
commit | 43c428ada3cd80b8d269c34f3724aaaec08f12de (patch) | |
tree | 174c3349a19099cbba7576d6185be7a8a891313d /tensorflow/python/profiler | |
parent | 2c26c98f8d1f15d064c76548393137f058043dc1 (diff) |
OOM error with allocation information.
PiperOrigin-RevId: 175637128
Diffstat (limited to 'tensorflow/python/profiler')
-rw-r--r-- | tensorflow/python/profiler/BUILD | 1 | ||||
-rw-r--r-- | tensorflow/python/profiler/model_analyzer_test.py | 59 |
2 files changed, 60 insertions, 0 deletions
diff --git a/tensorflow/python/profiler/BUILD b/tensorflow/python/profiler/BUILD index 26cc5f0b74..519b05975f 100644 --- a/tensorflow/python/profiler/BUILD +++ b/tensorflow/python/profiler/BUILD @@ -53,6 +53,7 @@ cuda_py_test( "//tensorflow/python:client", "//tensorflow/python:client_testlib", "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:distributed_framework_test_lib", "//tensorflow/python:platform", "//tensorflow/python:variables", ], diff --git a/tensorflow/python/profiler/model_analyzer_test.py b/tensorflow/python/profiler/model_analyzer_test.py index 17c87bea92..698f8906d4 100644 --- a/tensorflow/python/profiler/model_analyzer_test.py +++ b/tensorflow/python/profiler/model_analyzer_test.py @@ -28,6 +28,8 @@ from tensorflow.core.protobuf import config_pb2 from tensorflow.core.protobuf import rewriter_config_pb2 from tensorflow.python.client import session from tensorflow.python.framework import ops +from tensorflow.python.framework import test_util +from tensorflow.python.ops import random_ops from tensorflow.python.ops import variables from tensorflow.python.platform import gfile from tensorflow.python.platform import test @@ -635,6 +637,63 @@ class PrintModelAnalysisTest(test.TestCase): self._trainLoop(x, 10, time_dir, time_steps, memory_dir, memory_steps, profile_dir, dump_steps) + def testOOM(self): + if not test.is_gpu_available(): + return + ops.reset_default_graph() + with ops.device('/device:GPU:0'): + a = random_ops.random_normal([1, 10000, 20000], name='test_random1') + b = random_ops.random_normal([30000, 10000, 1], name='test_random2') + c = a * b + + try: + with session.Session() as sess: + sess.run(c, options=config_pb2.RunOptions( + report_tensor_allocations_upon_oom=True)) + except Exception as e: # pylint: disable=broad-except + exception_str = '%s' % e + # This trace reports allocations for two random tensors. 
+ self.assertTrue( + 'OOM when allocating tensor with shape[30000,10000,20000]' in + exception_str) + mat = re.search('(.*)GiB from test_random2/RandomStandardNormal', + exception_str) + self.assertGreater(float(mat.group(1)), 0.0) + mat = re.search('(.*)MiB from test_random1/RandomStandardNormal', + exception_str) + self.assertGreater(float(mat.group(1)), 0.0) + + def testDistributedOOM(self): + if not test.is_gpu_available(): + return + ops.reset_default_graph() + + workers, _ = test_util.create_local_cluster(2, 0) + + with ops.device('/job:worker/replica:0/task:0/gpu:0'): + a = random_ops.random_normal([1, 10000, 20000], name='test_random1') + with ops.device('/job:worker/replica:0/task:1/gpu:0'): + b = random_ops.random_normal([30000, 10000, 1], name='test_random2') + c = a * b + + try: + with session.Session(workers[1].target) as sess: + sess.run(c, options=config_pb2.RunOptions( + report_tensor_allocations_upon_oom=True)) + except Exception as e: # pylint: disable=broad-except + exception_str = '%s' % e + # test_random2 is reported because it's allocated in worker 1. + self.assertTrue('Current usage from device: ' + '/job:worker/replica:0/task:1/device:GPU:0, ' + 'allocator: GPU_0_bfc' in exception_str) + mat = re.search('(.*)GiB from test_random2/RandomStandardNormal', + exception_str) + self.assertGreater(float(mat.group(1)), 0.0) + # test_random1 is not reported because it's allocated in worker 0. + mat = re.search('(.*)MiB from test_random1/RandomStandardNormal', + exception_str) + self.assertTrue(mat is None) + if __name__ == '__main__': test.main() |