third_party/nccl/build_defs.bzl.tpl


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210

"""Repository rule for NCCL."""

load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts")

def _gen_nccl_h_impl(ctx):
    """Expands the nccl.h template with fixed NCCL version fields."""

    # Placeholder -> value map for the version fields of the header template.
    version_fields = {
        "${nccl:Major}": "2",
        "${nccl:Minor}": "3",
        "${nccl:Patch}": "5",
        "${nccl:Suffix}": "",
        "${nccl:Version}": "2305",
    }
    ctx.actions.expand_template(
        template = ctx.file.template,
        output = ctx.outputs.output,
        substitutions = version_fields,
    )

gen_nccl_h = rule(
    implementation = _gen_nccl_h_impl,
    attrs = {
        # Single file containing the ${nccl:...} placeholders.
        "template": attr.label(allow_single_file = True),
        # File the expanded header is written to.
        "output": attr.output(),
    },
)
"""Creates the NCCL header file."""


def _process_srcs_impl(ctx):
    """Copies each source file with patched includes; .cu files become .cc.

    Files that are not source files (i.e. generated ones) are forwarded
    unchanged. For `.cu` sources the output name is the `prefix` attribute,
    followed by the original basename, followed by `.cc`.
    """

    # Textual patches applied to every source file.
    patches = {
        "\"collectives.h": "\"collectives/collectives.h",
        "\"../collectives.h": "\"collectives/collectives.h",
        "#if __CUDACC_VER_MAJOR__":
            "#if defined __CUDACC_VER_MAJOR__ && __CUDACC_VER_MAJOR__",
        # Substitutions are applied in order.
        "std::nullptr_t": "nullptr_t",
        "nullptr_t": "std::nullptr_t",
    }

    outputs = []
    for src in ctx.files.srcs:
        if src.is_source:
            out_name = src.basename
            if src.extension == "cu":
                out_name = ctx.attr.prefix + out_name + ".cc"
            patched = ctx.actions.declare_file(out_name, sibling = src)
            ctx.actions.expand_template(
                template = src,
                output = patched,
                substitutions = patches,
            )
            outputs.append(patched)
        else:
            # Process only once, specifically "src/nccl.h".
            outputs.append(src)
    return [DefaultInfo(files = depset(outputs))]

_process_srcs = rule(
    implementation = _process_srcs_impl,
    attrs = {
        # Files to patch.
        "srcs": attr.label_list(allow_files = True),
        # Prepended to the output name of .cu sources only.
        "prefix": attr.string(default = ""),
    },
)
"""Processes the NCCL srcs so they can be compiled with bazel and clang."""


def nccl_library(name, srcs=None, hdrs=None, prefix=None, **kwargs):
    """Creates a cc_library from NCCL sources run through _process_srcs.

    Args:
      name: name of the cc_library; "<name>_srcs" and "<name>_hdrs" are used
        for the intermediate processing targets.
      srcs: source files to process and compile.
      hdrs: header files to process and expose.
      prefix: forwarded to the processing of srcs (not of hdrs).
      **kwargs: passed through to native.cc_library unchanged.
    """
    srcs_target = name + "_srcs"
    hdrs_target = name + "_hdrs"

    # Both processing targets are always declared, even for empty inputs.
    _process_srcs(name = srcs_target, srcs = srcs, prefix = prefix)
    _process_srcs(name = hdrs_target, srcs = hdrs)

    # Only reference a processing target when it actually has inputs.
    library_srcs = []
    if srcs:
        library_srcs = [srcs_target]
    library_hdrs = []
    if hdrs:
        library_hdrs = [hdrs_target]

    native.cc_library(
        name = name,
        srcs = library_srcs,
        hdrs = library_hdrs,
        **kwargs
    )


def rdc_copts():
    """Returns copts for compiling relocatable device code.

    The result is the default CUDA copts, followed by compiler-specific
    relocatable-device-code flags, followed by "-fvisibility=hidden".
    """

    # The global functions can not have a lower register count than the
    # device functions. This is enforced by setting a fixed register count.
    # https://github.com/NVIDIA/nccl/blob/f93fe9bfd94884cec2ba711897222e0df5569a53/makefiles/common.mk#L48
    register_limit = "-maxrregcount=96"

    base_copts = cuda_default_copts()

    nvcc_flags = [
        "-nvcc_options",
        "relocatable-device-code=true",
        "-nvcc_options",
        "ptxas-options=" + register_limit,
    ]
    clang_flags = [
        "-fcuda-rdc",
        "-Xcuda-ptxas",
        register_limit,
    ]
    compiler_flags = select({
        "@local_config_cuda//cuda:using_nvcc": nvcc_flags,
        "@local_config_cuda//cuda:using_clang": clang_flags,
        "//conditions:default": [],
    })

    return base_copts + compiler_flags + ["-fvisibility=hidden"]


def _filter_impl(ctx):
    """Forwards only the input files whose path ends with the suffix attr."""
    wanted_suffix = ctx.attr.suffix
    matching = []
    for candidate in ctx.files.srcs:
        if candidate.path.endswith(wanted_suffix):
            matching.append(candidate)
    return [DefaultInfo(files = depset(matching))]

_filter = rule(
    implementation = _filter_impl,
    attrs = {
        # Files to choose from.
        "srcs": attr.label_list(allow_files = True),
        # Path ending a file must have to be kept.
        "suffix": attr.string(),
    },
)
"""Filters the srcs to the ones ending with suffix."""


def _gen_link_src_impl(ctx):
    """Expands the link stub template with the paths of the two headers.

    The REGISTERLINKBINARYFILE and FATBINFILE tokens in the template are
    replaced by the double-quoted short paths of the `register_hdr` and
    `fatbin_hdr` files respectively.
    """
    quoted_register_hdr = '"%s"' % ctx.file.register_hdr.short_path
    quoted_fatbin_hdr = '"%s"' % ctx.file.fatbin_hdr.short_path
    ctx.actions.expand_template(
        template = ctx.file.template,
        output = ctx.outputs.output,
        substitutions = {
            "REGISTERLINKBINARYFILE": quoted_register_hdr,
            "FATBINFILE": quoted_fatbin_hdr,
        },
    )

_gen_link_src = rule(
    implementation = _gen_link_src_impl,
    attrs = {
        # Header substituted for REGISTERLINKBINARYFILE.
        "register_hdr": attr.label(allow_single_file = True),
        # Header substituted for FATBINFILE.
        "fatbin_hdr": attr.label(allow_single_file = True),
        # Stub source containing the two tokens.
        "template": attr.label(allow_single_file = True),
        # Generated source file.
        "output": attr.output(),
    },
)
"""Patches the include directives for the link.stub file."""


def device_link(name, srcs):
    """Links separately compiled relocatable device code into a cc_library.

    Args:
      name: name of the generated cc_library; also the prefix of every
        intermediate target and file created by this macro.
      srcs: targets providing the archives to device-link; only files whose
        path ends in ".pic.a" are used.
    """

    # The architecture list placeholder is substituted when this template
    # file is instantiated.
    gpu_architectures = %{gpu_architectures}
    if not gpu_architectures:
        # The register header below is taken from the last loop iteration, so
        # an empty list would otherwise surface as an unassigned-variable
        # error that hides the real cause.
        fail("device_link('%s') requires at least one GPU architecture" % name)

    # From .a and .pic.a archives, just use the latter.
    _filter(
        name = name + "_pic_a",
        srcs = srcs,
        suffix = ".pic.a",
    )

    # Device-link to cubins for each architecture.
    nvlink = "@local_config_nccl//:nvlink"
    images = []
    cubins = []
    for arch in gpu_architectures:
        cubin = "%s_%s.cubin" % (name, arch)
        register_hdr = "%s_%s.h" % (name, arch)
        cmd = ("$(location %s) --cpu-arch=X86_64 " % nvlink +
               "--arch=%s $(SRCS) " % arch +
               "--register-link-binaries=$(location %s) " % register_hdr +
               "--output-file=$(location %s)" % cubin)
        native.genrule(
            name = "%s_%s" % (name, arch),
            outs = [register_hdr, cubin],
            srcs = [name + "_pic_a"],
            cmd = cmd,
            tools = [nvlink],
        )
        images.append("--image=profile=%s,file=$(location %s)" % (arch, cubin))
        cubins.append(cubin)

    # Generate fatbin header from all cubins. The intermediate fatbin is a
    # declared output so that fatbinary writes it to a Bazel-managed location
    # instead of an undeclared file in the action's working directory.
    tmp_fatbin = name + ".fatbin"
    fatbin_hdr = name + ".fatbin.h"
    fatbinary = "@local_config_nccl//:cuda/bin/fatbinary"
    cmd = ("PATH=$$CUDA_TOOLKIT_PATH/bin:$$PATH " +  # for bin2c
           "$(location %s) -64 --cmdline=--compile-only --link " % fatbinary +
           "--compress-all %s " % " ".join(images) +
           "--create=$(location %s) " % tmp_fatbin +
           "--embedded-fatbin=$(location %s)" % fatbin_hdr)
    native.genrule(
        name = name + "_fatbin_h",
        outs = [tmp_fatbin, fatbin_hdr],
        srcs = cubins,
        cmd = cmd,
        tools = [fatbinary],
    )

    # Generate the source file #including the headers generated above.
    _gen_link_src(
        name = name + "_cc",
        # Include just the last one, they are equivalent.
        register_hdr = register_hdr,
        fatbin_hdr = fatbin_hdr,
        template = "@local_config_nccl//:cuda/bin/crt/link.stub",
        output = name + ".cc",
    )

    # Compile the source file into the cc_library.
    native.cc_library(
        name = name,
        srcs = [name + "_cc"],
        textual_hdrs = [register_hdr, fatbin_hdr],
        deps = [
            "@local_config_cuda//cuda:cuda_headers",
            "@local_config_cuda//cuda:cudart_static",
        ],
    )