aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/contrib/nccl
diff options
context:
space:
mode:
authorGravatar Smit Hinsu <hinsu@google.com>2018-06-18 20:51:50 -0700
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2018-06-18 20:54:44 -0700
commit60b78d6152e6f8d985f3086930ff986c140c36bf (patch)
tree769d6396ac7937fcba20b28a9613378eef016609 /tensorflow/contrib/nccl
parente8d37d9d27b59d54fb48e6b379093840bbd54f13 (diff)
Load NCCL lib on-demand to facilitate default NCCL version upgrade to 2
Without on-demand loading, changing the default version to NCCL 2 would require all TF users to download the NCCL library. With on-demand loading, only users of the nccl ops need to download and install the NCCL lib. PiperOrigin-RevId: 201109554
Diffstat (limited to 'tensorflow/contrib/nccl')
-rw-r--r--tensorflow/contrib/nccl/BUILD40
-rw-r--r--tensorflow/contrib/nccl/python/ops/nccl_dependency_test.py59
-rw-r--r--tensorflow/contrib/nccl/python/ops/nccl_ops.py39
3 files changed, 123 insertions, 15 deletions
diff --git a/tensorflow/contrib/nccl/BUILD b/tensorflow/contrib/nccl/BUILD
index 334e70318d..7cfdf0f607 100644
--- a/tensorflow/contrib/nccl/BUILD
+++ b/tensorflow/contrib/nccl/BUILD
@@ -97,18 +97,19 @@ tf_gen_op_wrapper_py(
deps = [":nccl_ops_op_lib"],
)
+# Test only nccl ops lib without dso to test behavior when NCCL lib is not
+# installed. See nccl_dependency_test for more details.
+#
+# Users should use the public nccl_py lib that also adds the dso.
tf_custom_op_py_library(
- name = "nccl_py",
+ name = "nccl_ops_lib_without_dso",
srcs = [
"__init__.py",
"python/ops/nccl_ops.py",
],
- dso = [":python/ops/_nccl_ops.so"],
kernels = if_cuda([":nccl_kernels"]) + [
":nccl_ops_op_lib",
],
- srcs_version = "PY2AND3",
- visibility = ["//visibility:public"],
deps = [
":nccl_ops",
"//tensorflow/contrib/util:util_py",
@@ -120,6 +121,15 @@ tf_custom_op_py_library(
],
)
+# Public nccl library: adds the _nccl_ops.so dso on top of
+# :nccl_ops_lib_without_dso.
+tf_custom_op_py_library(
+ name = "nccl_py",
+ dso = [":python/ops/_nccl_ops.so"],
+ visibility = ["//visibility:public"],
+ deps = [
+ ":nccl_ops_lib_without_dso",
+ ],
+)
+
cuda_py_test(
name = "nccl_ops_test",
size = "small",
@@ -141,3 +151,25 @@ cuda_py_test(
"notap",
],
)
+
+# Runs python/ops/nccl_dependency_test.py against :nccl_ops_lib_without_dso,
+# i.e. the nccl ops python lib built without its dso.
+cuda_py_test(
+ name = "nccl_dependency_test",
+ size = "small",
+ srcs = ["python/ops/nccl_dependency_test.py"],
+ additional_deps = [
+ ":nccl_ops_lib_without_dso",
+ "//tensorflow/python:constant_op",
+ "//tensorflow/python:errors",
+ "//tensorflow/python:framework_ops",
+ "//tensorflow/python:util",
+ "//tensorflow/python:client_testlib",
+ "//tensorflow/python:platform_test",
+ ],
+ # Disable this test internally as static linking is used internally and only
+ # run for OSS to verify that NCCL is an optional dynamic dependency.
+ tags = [
+ "manual",
+ "noguitar",
+ "notap",
+ ],
+)
diff --git a/tensorflow/contrib/nccl/python/ops/nccl_dependency_test.py b/tensorflow/contrib/nccl/python/ops/nccl_dependency_test.py
new file mode 100644
index 0000000000..c766080dbe
--- /dev/null
+++ b/tensorflow/contrib/nccl/python/ops/nccl_dependency_test.py
@@ -0,0 +1,59 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Dependency test for nccl to test behavior when NCCL is not installed."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib import nccl
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import test
+from tensorflow.python.util import tf_inspect
+
+
+class NcclDependencyTest(test.TestCase):
+ """Verifies that importing the nccl ops lib does not fail even if NCCL is not
+ installed, but that nccl ops throw an exception on use if NCCL is not
+ installed.
+ """
+
+ def test_nccl_ops(self):
+ """Tests behavior of nccl ops when NCCL is not installed."""
+
+ # Collect the name of every public (non-underscore) function in the nccl
+ # module so that each one is exercised below.
+ public_methods = [
+ m[0]
+ for m in tf_inspect.getmembers(nccl, tf_inspect.isfunction)
+ if not m[0].startswith('_')
+ ]
+ for method_name in public_methods:
+ with ops.device('/device:CPU:0'):
+ tensor = constant_op.constant(1)
+
+ # broadcast is called with a single tensor; every other public op is
+ # called with a list of tensors.
+ if method_name == 'broadcast':
+ arg = tensor
+ else:
+ arg = [tensor]
+
+ nccl_op = getattr(nccl, method_name)
+ with ops.device('/device:CPU:0'):
+ # Calling the op is expected to fail with NotFoundError: this test is
+ # built against the nccl ops lib without its dso (see the BUILD target).
+ with self.assertRaisesRegexp(errors_impl.NotFoundError,
+ r'cannot open shared object file'):
+ nccl_op(arg)
+
+
+if __name__ == '__main__':
+ test.main()
diff --git a/tensorflow/contrib/nccl/python/ops/nccl_ops.py b/tensorflow/contrib/nccl/python/ops/nccl_ops.py
index 794372a1f4..029b01412d 100644
--- a/tensorflow/contrib/nccl/python/ops/nccl_ops.py
+++ b/tensorflow/contrib/nccl/python/ops/nccl_ops.py
@@ -26,8 +26,10 @@ from tensorflow.python.framework import device
from tensorflow.python.framework import ops
from tensorflow.python.platform import resource_loader
-_nccl_ops_so = loader.load_op_library(
- resource_loader.get_path_to_datafile('_nccl_ops.so'))
+
+# Result of loader.load_op_library for '_nccl_ops.so'; None until it is loaded
+# by _maybe_load_nccl_ops_so.
+_nccl_ops_so = None
+# Held while loading the ops library and while updating _shared_name_counter.
+_module_lock = threading.Lock()
+# Counter used by _get_shared_name to build names of the form 'c<N>'.
+_shared_name_counter = 0
def all_sum(tensors):
@@ -180,7 +182,7 @@ def broadcast(tensor):
A tensor with the value of `src_tensor`, which can be used as input to
ops on other GPU devices.
"""
- _check_graph_mode()
+ _validate_and_load_nccl_so()
_check_device(tensor)
with ops.device(tensor.device):
@@ -212,7 +214,7 @@ def _apply_all_reduce(reduction, tensors):
"""Helper function for all_* functions."""
if not tensors:
raise ValueError('Must pass >0 tensors to all reduce operations')
- _check_graph_mode()
+ _validate_and_load_nccl_so()
shared_name = _get_shared_name()
res = []
@@ -234,7 +236,7 @@ def _apply_reduce(reduction, tensors):
"""Helper function for reduce_* functions."""
if not tensors:
raise ValueError('Must pass >0 tensors to reduce operations')
- _check_graph_mode()
+ _validate_and_load_nccl_so()
for t in tensors:
_check_device(t)
@@ -246,14 +248,10 @@ def _apply_reduce(reduction, tensors):
return result
-_lock = threading.Lock()
-_shared_name_counter = 0
-
-
def _get_shared_name():
+ """Returns the next name of the form 'c<N>', incrementing the module counter under the lock."""
global _shared_name_counter
- with _lock:
+ with _module_lock:
val = _shared_name_counter
_shared_name_counter += 1
return 'c%s' % val
@@ -266,6 +264,25 @@ def _check_device(tensor, expected=None):
raise ValueError('Expected device %s, got %s' % (expected, tensor.device))
-def _check_graph_mode():
+def _maybe_load_nccl_ops_so():
+ """Loads the nccl ops shared object ('_nccl_ops.so') if it hasn't been loaded already."""
+
+ # Both the "already loaded?" check and the load itself run while holding
+ # _module_lock.
+ with _module_lock:
+ global _nccl_ops_so
+ if not _nccl_ops_so:
+ _nccl_ops_so = loader.load_op_library(
+ resource_loader.get_path_to_datafile('_nccl_ops.so'))
+
+
+def _validate_and_load_nccl_so():
+ """Validates calling context and loads nccl ops so file.
+
+ Raises:
+ ValueError: If called while executing eagerly; nccl ops are not supported in eager mode.
+ errors_impl.NotFoundError: nccl library is not installed.
+ """
+
if context.executing_eagerly():
raise ValueError('Nccl ops are not supported in eager mode')
+
+ _maybe_load_nccl_ops_so()