"""Repository rule for NCCL."""

load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts")

def _gen_nccl_h_impl(ctx):
    """Implementation of gen_nccl_h: expands the template into the header.

    The NCCL version placeholders found in the template are replaced with
    the fixed values below (major 2, minor 3, patch 5, empty suffix,
    version code 2305).

    Args:
      ctx: Rule context; reads `ctx.file.template`, writes
        `ctx.outputs.output`.
    """
    version_substitutions = {
        "${nccl:Major}": "2",
        "${nccl:Minor}": "3",
        "${nccl:Patch}": "5",
        "${nccl:Suffix}": "",
        "${nccl:Version}": "2305",
    }
    ctx.actions.expand_template(
        template = ctx.file.template,
        output = ctx.outputs.output,
        substitutions = version_substitutions,
    )
# Rule wrapping _gen_nccl_h_impl: writes `output` by expanding `template`.
gen_nccl_h = rule(
    attrs = {
        # File produced by the rule.
        "output": attr.output(),
        # Single template file whose version placeholders get substituted.
        "template": attr.label(allow_single_file = True),
    },
    implementation = _gen_nccl_h_impl,
)
"""Creates the NCCL header file."""


def _process_srcs_impl(ctx):
    """Patches source files and renames .cu files to <prefix><name>.cu.cc.

    Every source file in `srcs` is copied next to the original (same
    basename, or `prefix + basename + ".cc"` for files with the `cu`
    extension) with the textual substitutions below applied. Files that are
    not source files (i.e. generated ones) are forwarded untouched.

    Args:
      ctx: Rule context; reads `ctx.files.srcs` and `ctx.attr.prefix`.

    Returns:
      A DefaultInfo whose files are the patched copies plus the forwarded
      generated files, in input order.
    """

    # The same substitutions apply to every file, so build them once.
    # Substitutions are applied in order, which the last two entries rely on:
    # first strip any existing "std::" qualifier, then qualify every
    # occurrence, so already-qualified names do not end up doubly qualified.
    patches = {
        "\"collectives.h": "\"collectives/collectives.h",
        "\"../collectives.h": "\"collectives/collectives.h",
        "#if __CUDACC_VER_MAJOR__":
            "#if defined __CUDACC_VER_MAJOR__ && __CUDACC_VER_MAJOR__",
        "std::nullptr_t": "nullptr_t",
        "nullptr_t": "std::nullptr_t",
    }

    outputs = []
    for src in ctx.files.srcs:
        if src.is_source:
            out_name = src.basename
            if src.extension == "cu":
                out_name = ctx.attr.prefix + out_name + ".cc"
            patched = ctx.actions.declare_file(out_name, sibling = src)
            ctx.actions.expand_template(
                template = src,
                output = patched,
                substitutions = patches,
            )
            outputs.append(patched)
        else:
            # Process only once, specifically "src/nccl.h".
            outputs.append(src)
    return [DefaultInfo(files = depset(outputs))]
# Rule wrapping _process_srcs_impl.
_process_srcs = rule(
    attrs = {
        # Prepended to the output name of `.cu` files; empty by default.
        "prefix": attr.string(default = ""),
        # Files to patch; any file type is accepted.
        "srcs": attr.label_list(allow_files = True),
    },
    implementation = _process_srcs_impl,
)
"""Processes the NCCL srcs so they can be compiled with bazel and clang."""


def nccl_library(name, srcs=None, hdrs=None, prefix=None, **kwargs):
    """Creates a cc_library from srcs and hdrs run through _process_srcs.

    Two helper targets, `<name>_srcs` and `<name>_hdrs`, are always declared;
    the cc_library only references a helper when the corresponding argument
    is non-empty.

    Args:
      name: Name of the cc_library; also the prefix of the helper targets.
      srcs: Source files to process and compile.
      hdrs: Header files to process and expose.
      prefix: Forwarded to the `prefix` attribute of the `<name>_srcs`
        helper target.
      **kwargs: Forwarded unchanged to native.cc_library.
    """
    srcs_target = name + "_srcs"
    hdrs_target = name + "_hdrs"

    _process_srcs(name = srcs_target, srcs = srcs, prefix = prefix)
    _process_srcs(name = hdrs_target, srcs = hdrs)

    library_srcs = []
    if srcs:
        library_srcs = [srcs_target]

    library_hdrs = []
    if hdrs:
        library_hdrs = [hdrs_target]

    native.cc_library(
        name = name,
        srcs = library_srcs,
        hdrs = library_hdrs,
        **kwargs
    )


def rdc_copts():
    """Returns copts for compiling relocatable device code.

    The result is the default CUDA copts, followed by compiler-specific
    relocatable-device-code flags (nvcc or clang, nothing otherwise),
    followed by `-fvisibility=hidden`.
    """

    # The global functions can not have a lower register count than the
    # device functions. This is enforced by setting a fixed register count.
    # https://github.com/NVIDIA/nccl/blob/f93fe9bfd94884cec2ba711897222e0df5569a53/makefiles/common.mk#L48
    maxrregcount = "-maxrregcount=96"

    nvcc_flags = [
        "-nvcc_options",
        "relocatable-device-code=true",
        "-nvcc_options",
        "ptxas-options=" + maxrregcount,
    ]
    clang_flags = [
        "-fcuda-rdc",
        "-Xcuda-ptxas",
        maxrregcount,
    ]
    compiler_flags = select({
        "@local_config_cuda//cuda:using_nvcc": nvcc_flags,
        "@local_config_cuda//cuda:using_clang": clang_flags,
        "//conditions:default": [],
    })

    return cuda_default_copts() + compiler_flags + ["-fvisibility=hidden"]


def _filter_impl(ctx):
    """Keeps only the files from `srcs` whose path ends with `suffix`.

    Args:
      ctx: Rule context; reads `ctx.files.srcs` and `ctx.attr.suffix`.

    Returns:
      A DefaultInfo whose files are the matching inputs, in input order.
    """
    wanted_suffix = ctx.attr.suffix
    matching = []
    for candidate in ctx.files.srcs:
        if candidate.path.endswith(wanted_suffix):
            matching.append(candidate)
    return [DefaultInfo(files = depset(matching))]
# Rule wrapping _filter_impl.
_filter = rule(
    attrs = {
        # Path suffix a file must end with to be kept.
        "suffix": attr.string(),
        # Candidate files; any file type is accepted.
        "srcs": attr.label_list(allow_files = True),
    },
    implementation = _filter_impl,
)
"""Filters the srcs to the ones ending with suffix."""


def _gen_link_src_impl(ctx):
    """Expands the link stub template into the output source file.

    The tokens REGISTERLINKBINARYFILE and FATBINFILE in the template are
    replaced with the double-quoted short paths of `register_hdr` and
    `fatbin_hdr` respectively.

    Args:
      ctx: Rule context; reads `ctx.file.template`, `ctx.file.register_hdr`
        and `ctx.file.fatbin_hdr`, writes `ctx.outputs.output`.
    """
    quoted_register_hdr = "\"" + ctx.file.register_hdr.short_path + "\""
    quoted_fatbin_hdr = "\"" + ctx.file.fatbin_hdr.short_path + "\""
    ctx.actions.expand_template(
        template = ctx.file.template,
        output = ctx.outputs.output,
        substitutions = {
            "REGISTERLINKBINARYFILE": quoted_register_hdr,
            "FATBINFILE": quoted_fatbin_hdr,
        },
    )
# Rule wrapping _gen_link_src_impl.
_gen_link_src = rule(
    attrs = {
        # Template to expand (a single file).
        "template": attr.label(allow_single_file = True),
        # Header whose short path replaces REGISTERLINKBINARYFILE.
        "register_hdr": attr.label(allow_single_file = True),
        # Header whose short path replaces FATBINFILE.
        "fatbin_hdr": attr.label(allow_single_file = True),
        # Source file produced by the rule.
        "output": attr.output(),
    },
    implementation = _gen_link_src_impl,
)
"""Patches the include directives for the link.stub file."""


def device_link(name, srcs):
    """Links separately compiled relocatable device code into a cc_library.

    Declares the following targets:
      <name>_pic_a: the files from `srcs` whose path ends with `.pic.a`.
      <name>_<arch>: one genrule per GPU architecture, running nvlink to
        produce `<name>_<arch>.cubin` and `<name>_<arch>.h`.
      <name>_fatbin_h: genrule running fatbinary over all cubins to produce
        `<name>.fatbin.h`.
      <name>_cc: `<name>.cc`, expanded from the CUDA `link.stub` template.
      <name>: cc_library compiling `<name>.cc`.

    Args:
      name: Name of the resulting cc_library; also the prefix of every
        intermediate target and file.
      srcs: Labels providing the archives to device-link.
    """
    architectures = %{gpu_architectures}
    if not architectures:
        # The generated source includes the register header of the last
        # architecture, so an empty list cannot work. Fail with a clear
        # message instead of an unbound-variable error further down.
        fail("device_link(name = \"%s\"): no GPU architectures configured" %
             name)

    # From .a and .pic.a archives, just use the latter.
    pic_archives = name + "_pic_a"
    _filter(
        name = pic_archives,
        srcs = srcs,
        suffix = ".pic.a",
    )

    # Device-link to cubins for each architecture.
    nvlink = "@local_config_nccl//:nvlink"
    images = []
    cubins = []
    register_hdrs = []
    for arch in architectures:
        cubin = "%s_%s.cubin" % (name, arch)
        register_hdr = "%s_%s.h" % (name, arch)
        cmd = ("$(location %s) --cpu-arch=X86_64 " % nvlink +
               "--arch=%s $(SRCS) " % arch +
               "--register-link-binaries=$(location %s) " % register_hdr +
               "--output-file=$(location %s)" % cubin)
        native.genrule(
            name = "%s_%s" % (name, arch),
            outs = [register_hdr, cubin],
            srcs = [pic_archives],
            cmd = cmd,
            tools = [nvlink],
        )
        images.append("--image=profile=%s,file=$(location %s)" % (arch, cubin))
        cubins.append(cubin)
        register_hdrs.append(register_hdr)

    # Generate fatbin header from all cubins.
    # NOTE(review): after %-formatting, "%%{name}" below becomes a literal
    # percent sign followed by "{name}", so the intermediate fatbin is not
    # named after `name`. Confirm whether this is intended.
    fatbin_hdr = name + ".fatbin.h"
    fatbinary = "@local_config_nccl//:cuda/bin/fatbinary"
    cmd = ("PATH=$$CUDA_TOOLKIT_PATH/bin:$$PATH " +  # for bin2c
           "$(location %s) -64 --cmdline=--compile-only --link " % fatbinary +
           "--compress-all %s --create=%%{name}.fatbin " % " ".join(images) +
           "--embedded-fatbin=$@")
    native.genrule(
        name = name + "_fatbin_h",
        outs = [fatbin_hdr],
        srcs = cubins,
        cmd = cmd,
        tools = [fatbinary],
    )

    # Include just the last register header, they are equivalent.
    link_register_hdr = register_hdrs[-1]

    # Generate the source file #including the headers generated above.
    _gen_link_src(
        name = name + "_cc",
        register_hdr = link_register_hdr,
        fatbin_hdr = fatbin_hdr,
        template = "@local_config_nccl//:cuda/bin/crt/link.stub",
        output = name + ".cc",
    )

    # Compile the source file into the cc_library.
    native.cc_library(
        name = name,
        srcs = [name + "_cc"],
        textual_hdrs = [link_register_hdr, fatbin_hdr],
        deps = [
            "@local_config_cuda//cuda:cuda_headers",
            "@local_config_cuda//cuda:cudart_static",
        ],
    )