diff options
author | Smit Hinsu <hinsu@google.com> | 2018-06-18 20:51:50 -0700 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2018-06-18 20:54:44 -0700 |
commit | 60b78d6152e6f8d985f3086930ff986c140c36bf (patch) | |
tree | 769d6396ac7937fcba20b28a9613378eef016609 /tensorflow/contrib/nccl | |
parent | e8d37d9d27b59d54fb48e6b379093840bbd54f13 (diff) |
Load NCCL lib on-demand to facilitate default NCCL version upgrade to 2
Without on-demand loading, changing the default version to NCCL 2 would
require all TF users to download the NCCL library. With on-demand
loading, only users of the nccl ops need to download and install the
NCCL lib.
PiperOrigin-RevId: 201109554
Diffstat (limited to 'tensorflow/contrib/nccl')
-rw-r--r-- | tensorflow/contrib/nccl/BUILD | 40 | ||||
-rw-r--r-- | tensorflow/contrib/nccl/python/ops/nccl_dependency_test.py | 59 | ||||
-rw-r--r-- | tensorflow/contrib/nccl/python/ops/nccl_ops.py | 39 |
3 files changed, 123 insertions, 15 deletions
diff --git a/tensorflow/contrib/nccl/BUILD b/tensorflow/contrib/nccl/BUILD index 334e70318d..7cfdf0f607 100644 --- a/tensorflow/contrib/nccl/BUILD +++ b/tensorflow/contrib/nccl/BUILD @@ -97,18 +97,19 @@ tf_gen_op_wrapper_py( deps = [":nccl_ops_op_lib"], ) +# Test only nccl ops lib without dso to test behavior when NCCL lib is not +# installed. See nccl_dependency_test for more details. +# +# Users should use the public nccl_py lib that also adds the dso. tf_custom_op_py_library( - name = "nccl_py", + name = "nccl_ops_lib_without_dso", srcs = [ "__init__.py", "python/ops/nccl_ops.py", ], - dso = [":python/ops/_nccl_ops.so"], kernels = if_cuda([":nccl_kernels"]) + [ ":nccl_ops_op_lib", ], - srcs_version = "PY2AND3", - visibility = ["//visibility:public"], deps = [ ":nccl_ops", "//tensorflow/contrib/util:util_py", @@ -120,6 +121,15 @@ tf_custom_op_py_library( ], ) +tf_custom_op_py_library( + name = "nccl_py", + dso = [":python/ops/_nccl_ops.so"], + visibility = ["//visibility:public"], + deps = [ + ":nccl_ops_lib_without_dso", + ], +) + cuda_py_test( name = "nccl_ops_test", size = "small", @@ -141,3 +151,25 @@ cuda_py_test( "notap", ], ) + +cuda_py_test( + name = "nccl_dependency_test", + size = "small", + srcs = ["python/ops/nccl_dependency_test.py"], + additional_deps = [ + ":nccl_ops_lib_without_dso", + "//tensorflow/python:constant_op", + "//tensorflow/python:errors", + "//tensorflow/python:framework_ops", + "//tensorflow/python:util", + "//tensorflow/python:client_testlib", + "//tensorflow/python:platform_test", + ], + # Disable this test internally as static linking is used internally and only + # run for OSS to verify that NCCL is an optional dynamic dependency. 
+ tags = [ + "manual", + "noguitar", + "notap", + ], +) diff --git a/tensorflow/contrib/nccl/python/ops/nccl_dependency_test.py b/tensorflow/contrib/nccl/python/ops/nccl_dependency_test.py new file mode 100644 index 0000000000..c766080dbe --- /dev/null +++ b/tensorflow/contrib/nccl/python/ops/nccl_dependency_test.py @@ -0,0 +1,59 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Dependency test for nccl to test behavior when NCCL is not installed.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib import nccl +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import errors_impl +from tensorflow.python.framework import ops +from tensorflow.python.platform import test +from tensorflow.python.util import tf_inspect + + +class NcclDependencyTest(test.TestCase): + """Verifies that importing nccl ops lib does not fail even if NCCL is not + installed but nccl ops throws an exception on use if NCCL is not installed. 
+ """ + + def test_nccl_ops(self): + """Tests behavior of nccl ops when NCCL is not installed.""" + + public_methods = [ + m[0] + for m in tf_inspect.getmembers(nccl, tf_inspect.isfunction) + if not m[0].startswith('_') + ] + for method_name in public_methods: + with ops.device('/device:CPU:0'): + tensor = constant_op.constant(1) + + if method_name == 'broadcast': + arg = tensor + else: + arg = [tensor] + + nccl_op = getattr(nccl, method_name) + with ops.device('/device:CPU:0'): + with self.assertRaisesRegexp(errors_impl.NotFoundError, + r'cannot open shared object file'): + nccl_op(arg) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/contrib/nccl/python/ops/nccl_ops.py b/tensorflow/contrib/nccl/python/ops/nccl_ops.py index 794372a1f4..029b01412d 100644 --- a/tensorflow/contrib/nccl/python/ops/nccl_ops.py +++ b/tensorflow/contrib/nccl/python/ops/nccl_ops.py @@ -26,8 +26,10 @@ from tensorflow.python.framework import device from tensorflow.python.framework import ops from tensorflow.python.platform import resource_loader -_nccl_ops_so = loader.load_op_library( - resource_loader.get_path_to_datafile('_nccl_ops.so')) + +_nccl_ops_so = None +_module_lock = threading.Lock() +_shared_name_counter = 0 def all_sum(tensors): @@ -180,7 +182,7 @@ def broadcast(tensor): A tensor with the value of `src_tensor`, which can be used as input to ops on other GPU devices. 
""" - _check_graph_mode() + _validate_and_load_nccl_so() _check_device(tensor) with ops.device(tensor.device): @@ -212,7 +214,7 @@ def _apply_all_reduce(reduction, tensors): """Helper function for all_* functions.""" if not tensors: raise ValueError('Must pass >0 tensors to all reduce operations') - _check_graph_mode() + _validate_and_load_nccl_so() shared_name = _get_shared_name() res = [] @@ -234,7 +236,7 @@ def _apply_reduce(reduction, tensors): """Helper function for reduce_* functions.""" if not tensors: raise ValueError('Must pass >0 tensors to reduce operations') - _check_graph_mode() + _validate_and_load_nccl_so() for t in tensors: _check_device(t) @@ -246,14 +248,10 @@ def _apply_reduce(reduction, tensors): return result -_lock = threading.Lock() -_shared_name_counter = 0 - - def _get_shared_name(): global _shared_name_counter - with _lock: + with _module_lock: val = _shared_name_counter _shared_name_counter += 1 return 'c%s' % val @@ -266,6 +264,25 @@ def _check_device(tensor, expected=None): raise ValueError('Expected device %s, got %s' % (expected, tensor.device)) -def _check_graph_mode(): +def _maybe_load_nccl_ops_so(): + """Loads nccl ops so if it hasn't been loaded already.""" + + with _module_lock: + global _nccl_ops_so + if not _nccl_ops_so: + _nccl_ops_so = loader.load_op_library( + resource_loader.get_path_to_datafile('_nccl_ops.so')) + + +def _validate_and_load_nccl_so(): + """Validates calling context and loads nccl ops so file. + + Raises: + ValueError: Ops are not supported. + errors_impl.NotFoundError: nccl library is not installed. + """ + if context.executing_eagerly(): raise ValueError('Nccl ops are not supported in eager mode') + + _maybe_load_nccl_ops_so() |