From a2e50847230e72c1782aefe9a3b728d799bee505 Mon Sep 17 00:00:00 2001 From: Jon Brandvein Date: Thu, 23 Feb 2017 20:40:21 +0000 Subject: Add overview documentation on depsets -- PiperOrigin-RevId: 148377512 MOS_MIGRATED_REVID=148377512 --- site/_layouts/documentation.html | 1 + site/versions/master/docs/skylark/concepts.md | 12 -- site/versions/master/docs/skylark/depsets.md | 292 ++++++++++++++++++++++++++ site/versions/master/docs/skylark/rules.md | 28 +-- 4 files changed, 309 insertions(+), 24 deletions(-) create mode 100644 site/versions/master/docs/skylark/depsets.md diff --git a/site/_layouts/documentation.html b/site/_layouts/documentation.html index 775ca27852..d53f07ea4a 100644 --- a/site/_layouts/documentation.html +++ b/site/_layouts/documentation.html @@ -87,6 +87,7 @@ version_prefix: /versions/master
  • Language
  • Macros
  • Rules
  • +
  • Depsets
  • Aspects
  • Repository rules
  • Challenges of writing rules
  • diff --git a/site/versions/master/docs/skylark/concepts.md b/site/versions/master/docs/skylark/concepts.md index fb6f0e2f9c..c8e2f5e5de 100644 --- a/site/versions/master/docs/skylark/concepts.md +++ b/site/versions/master/docs/skylark/concepts.md @@ -119,18 +119,6 @@ The following items are upcoming changes. dictionaries. This will be going away. The same result can be achieved using `dict(a.items() + b.items())`. -* The `|` operator is defined for depsets as a synonym for `+`. This will be - going away; use `+` instead. - -* The structure of the set that you get back from using the `+` or `|` - operator is changing. Previously `a + b`, where `a` is a set, would include - as its direct items all of `a`'s direct items. Under the upcoming way, the - result will only include `a` as a single transitive entity. This will alter - the visible iteration order of the returned set. Most notably, `set([1, - 2]) + set([3, 4] + set([5, 6])` will return elements in the order `1 2 3 4 5 - 6` instead of `3 4 5 6 1 2`. This change is associated with a fix that - improves set union to be O(1) time. - These changes concern the `load()` syntax in particular. * Currently a `load()` statement can appear anywhere in a file so long as it is diff --git a/site/versions/master/docs/skylark/depsets.md b/site/versions/master/docs/skylark/depsets.md new file mode 100644 index 0000000000..a29043e431 --- /dev/null +++ b/site/versions/master/docs/skylark/depsets.md @@ -0,0 +1,292 @@ +--- +layout: documentation +title: Depsets +--- + +# Depsets + +Depsets are a specialized data structure for efficiently collecting data across +a target’s transitive dependencies. Since this use case concerns the [analysis +phase](concepts.md#evaluation-model), depsets are useful for authors of rules +and aspects, but probably not macros. + +The main feature of depsets is that they support a time- and space-efficient +merge operation, whose cost is independent of the size of the existing contents. +Depsets also have well-defined ordering semantics. + +Example uses of depsets include: + +* storing the paths of all object files for a program’s libraries, which can + then be passed to a linker action + +* for an interpreted language, storing the transitive source files that will + be included in an executable's runfiles + + +## Full example + +Suppose we have a hypothetical interpreted language Foo. In order to build each +`foo_binary` we need to know all the \*.foo files that it directly or indirectly +depends on. + +```python +# //mypackage:BUILD + +load(":foo.bzl", "foo_library", "foo_binary") + +# Our hypothetical Foo compiler. +py_binary( + name = "foocc", + srcs = ["foocc.py"], +) + +foo_library( + name = "a", + srcs = ["a.foo", "a_impl.foo"], +) + +foo_library( + name = "b", + srcs = ["b.foo", "b_impl.foo"], + deps = [":a"], +) + +foo_library( + name = "c", + srcs = ["c.foo", "c_impl.foo"], + deps = [":a"], +) + +foo_binary( + name = "d", + srcs = ["d.foo"], + deps = [":b", ":c"], +) +``` + +```python +# //mypackage:foocc.py + +# "Foo compiler" that just concatenates its inputs to form its output. +import sys + +if __name__ == "__main__": + assert len(sys.argv) >= 1 + output = open(sys.argv[1], "wt") + for path in sys.argv[2:]: + input = open(path, "rt") + output.write(input.read()) +``` + +Here, the transitive sources of the binary `d` are all of the \*.foo files in +the `srcs` fields of `a`, `b`, `c`, and `d`. In order for the `foo_binary` +target to know about any file besides `d.foo`, the `foo_library` targets need to +pass them along in a provider. Each library receives the providers from its own +dependencies, adds its own immediate sources, and passes on a new provider with +the augmented contents. The `foo_binary` rule does the same, except that instead +of returning a provider, it uses the complete list of sources to construct a +command line for an action. + +Here’s a complete implementation of the `foo_library` and `foo_binary` rules. + +```python +# //mypackage/foo.bzl + +# A provider with one field, transitive_sources. +FooFiles = provider() + +def get_transitive_srcs(srcs, deps): + """Obtain the source files for a target and its transitive dependencies. + + Args: + srcs: a list of source files + deps: a list of targets that are direct dependencies + Returns: + a collection of the transitive sources + """ + trans_srcs = depset() + for dep in deps: + trans_srcs += dep[FooFiles].transitive_sources + trans_srcs += srcs + return trans_srcs + +def _foo_library_impl(ctx): + trans_srcs = get_transitive_srcs(ctx.files.srcs, ctx.attr.deps) + return [FooFiles(transitive_sources=trans_srcs)] + +foo_library = rule( + implementation = _foo_library_impl, + attrs = { + "srcs": attr.label_list(allow_files=True), + "deps": attr.label_list(), + }, +) + +def _foo_binary_impl(ctx): + foocc = ctx.executable._foocc + out = ctx.outputs.out + trans_srcs = get_transitive_srcs(ctx.files.srcs, ctx.attr.deps) + srcs_list = trans_srcs.to_list() + cmd_string = (foocc.path + " " + out.path + " " + + " ".join([src.path for src in srcs_list])) + ctx.action(command=cmd_string, + inputs=srcs_list + [foocc], + outputs=[out]) + +foo_binary = rule( + implementation = _foo_binary_impl, + attrs = { + "srcs": attr.label_list(allow_files=True), + "deps": attr.label_list(), + "_foocc": attr.label(default=Label("//mypackage:foocc"), + allow_files=True, executable=True, cfg="host") + }, + outputs = {"out": "%{name}.out"}, +) +``` + +You can test this by copying these files into a fresh package, renaming the +labels appropriately, creating the source \*.foo files with dummy content, and +building the `d` target. + +## Description and operations + +Conceptually, a depset is a directed acyclic graph (DAG) that typically looks +similar to the target graph. It is constructed from the leaves up to the root. +Each target in a dependency chain can add its own contents on top of the +previous without having to read or copy them. + +Each node in the DAG holds a list of direct elements and a list of child nodes. +The contents of the depset are the transitive elements, i.e. the direct elements +of all the nodes. A new depset with direct elements but no children can be +created using the [depset](lib/globals.html#depset) constructor. Given an +existing depset, the `+` operator can be used to form a new depset that has +additional contents. Specifically, for the operation `a + b` where `a` is a +depset, the result is a copy of `a` where: + +* if `b` is a depset, then `b` is appended to `a`’s list of children; and + otherwise, + +* if `b` is an iterable, then `b`’s elements are appended to `a`’s list of + direct elements. + +In all cases, the original depset is left unmodified because depsets are +immutable. The returned value shares most of its internal structure with the old +depset. + +To retrieve the contents of a depset, use the +[to_list()](lib/depset.html#to_list) method. It returns a list of all transitive +elements, not including duplicates. There is no way to directly inspect the +precise structure of the DAG, although this structure does affect the order in +which the elements are returned. + +## Order + +The `to_list` operation performs a traversal over the DAG. The kind of traversal +depends on the *order* that was specified at the time the depset was +constructed. It is useful for Bazel to support multiple orders because sometimes +tools care about the order of their inputs. For example, a linker action may +need to ensure that if `B` depends on `A`, then `A.o` comes before `B.o` on the +linker’s command line. Other tools might have the opposite requirement. + +Three traversal orders are supported: `postorder`, `preorder`, and +`topological`. The first two work exactly like [tree +traversals](https://en.wikipedia.org/wiki/Tree_traversal#Depth-first_search) +except that they operate on DAGs and skip already visited nodes. The third order +works as a topological sort from root to leaves, essentially the same as +preorder except that shared children are listed only after all of their parents. +Preorder and postorder operate as left-to-right traversals. For topological +order, there is no left-to-right guarantee, and even the +all-parents-before-child guarantee does not apply in the case that there are +duplicate elements in different nodes of the DAG. + +Due to how traversals are implemented, the order must be specified at the time +the depset is created with the constructor’s `order` keyword argument. If this +argument is omitted, the depset has the special `default` order, in which case +there are no guarantees about the order of any of its elements. For safety, +depsets with different orders cannot be merged with the `+` operator unless one +of them uses the default order; the resulting depset’s order is the same as the +left operand. Note that when two depsets of different order are merged in this +way, the child may appear to have had its elements rearranged when it is +traversed via the parent. + +## Performance + +To see the motivation for using depsets, consider what would have happened if we +had implemented `get_transitive_srcs()` without them. A naive way of writing +this function would be to collect the sources in a list. + +```python +def get_transitive_srcs(srcs, deps): + trans_srcs = [] + for dep in deps: + trans_srcs += dep[FooFiles].transitive_sources + trans_srcs += srcs + return trans_srcs +``` + +However, this does not take into account duplicates, so the source files for `a` +will appear twice on the command line and twice in the contents of the output +file. + +The next alternative is using a general set, which can be simulated by a +dictionary where the keys are the elements and all the keys map to `None`. + +```python +def get_transitive_srcs(srcs, deps): + trans_srcs = {} + for dep in deps: + for file in dep[FooFiles].transitive_sources: + trans_srcs[file] = None + for file in srcs: + trans_srcs[file] = None + return trans_srcs +``` + +This gets rid of the duplicates, but it makes the order of the command line +arguments (and therefore the contents of the files) unspecified, although still +deterministic. + +Moreover, both this approach and the list-based are asymptotically worse than +the depset-based approach. Consider the case where there is a long chain of +dependencies on Foo libraries. Processing every rule requires copying all of the +transitive sources that came before it into a new data structure. This means +that the time and space cost for analyzing an individual library or binary +target is proportional to its own height in the chain. For a chain of length n, +foolib_1 ← foolib_2 ← … ← foolib_n, the overall cost is effectively the +[triangle sum](https://en.wikipedia.org/wiki/Triangular_number) 1 + 2 + … + n, +which is O(n^2). This cost is wasteful because the library rule’s behavior is +not actually affected by the transitive sources. + +Generally speaking, depsets should be used whenever you are accumulating more +and more information through your transitive dependencies. This helps ensure +that your build scales well as your target graph grows deeper. The exact +advantage will depend on how deep the target graph is and how many elements per +target are added. + +To actually get the performance advantage, it’s important to not retrieve the +contents of the depset unnecessarily in library rules. One call to `to_list()` +at the end in a binary rule is fine, since the overall cost is just O(n). It’s +when many non-terminal targets try to call `to_list()` that we start to get into +quadratic behavior. + +## Upcoming changes + +The API for depsets is being updated to be more consistent. Here are some recent +and/or upcoming changes. + +* The name “set” has been replaced by “depset”. Do not use the `set` + constructor in new code; it is deprecated and will be removed. The traversal + orders have undergone a similar renaming; their old names will be removed as + well. + +* Depset contents should be retrieved using `to_list()`, not by iterating over + the depset itself. Direct iteration over depsets is deprecated and will be + removed. + +* The `|` operator is defined for depsets as a synonym for `+`. This will be + going away; use `+` instead. + +* (Pending approval) The `+` operator will be deprecated in favor of a new + syntax based on function calls. This avoids confusion regarding how `+` + treats direct elements vs children. diff --git a/site/versions/master/docs/skylark/rules.md b/site/versions/master/docs/skylark/rules.md index 6e8023d2e5..525fe59b26 100644 --- a/site/versions/master/docs/skylark/rules.md +++ b/site/versions/master/docs/skylark/rules.md @@ -329,18 +329,22 @@ use `ctx.host_fragments` instead. ## Providers -Providers are used to access information from other rules. A rule depending on -another rule has access to the data the latter provides. These data can be e.g. -output files, the libraries the dependent rule is using to link or compile, or -anything the depending rule should know about. Using providers is the only way -to exchange data between rules. - -A rule can only access data provided by its direct dependencies, not that of -transitive dependencies: if rule `top` depends on `middle`, and `middle` depends -on `bottom`, then `middle` is a direct dependency of `top` and `bottom` is a -transitive dependency of `top`. In this scenario `top` can only access data -provided by `middle`. If `middle` also provides the data that `bottom` provided -to it, then and only then can `top` access it. +Providers are pieces of information that a rule exposes to other rules that +depend on it. This data can include output files, libraries, parameters to pass +on a tool's command line, or anything else the depending rule should know about. +Providers are the only mechanism to exchange data between rules, and can be +thought of as part of a rule's public interface (loosely analogous to a +function's return value). + +A rule can only see the providers of its direct dependencies. If there is a rule +`top` that depends on `middle`, and `middle` depends on `bottom`, then we say +that `middle` is a direct dependency of `top`, while `bottom` is a transitive +dependency of `top`. In this case, `top` can see the providers of `middle`. The +only way for `top` to see any information from `bottom` is if `middle` +re-exports this information in its own providers; this is how transitive +information can be accumulated from all dependencies. In such cases, consider +using [depsets](depsets.md) to hold the data more efficiently without excessive +copying. The following data types can be passed using providers: -- cgit v1.2.3