// Copyright 2018 The Bazel Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.google.devtools.build.lib.skyframe.serialization;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableSet;
import com.google.devtools.build.lib.skyframe.serialization.ObjectCodec.MemoizationStrategy;
import com.google.protobuf.CodedInputStream;
import com.google.protobuf.CodedOutputStream;
import java.io.IOException;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashMap;
import java.util.IdentityHashMap;
import javax.annotation.Nullable;
/**
* A framework for serializing and deserializing with memo tables. Memoization is useful both for
* performance and, in the case of cyclic data structures, to help avoid infinite recursion.
*
*
The memo table associates each value with an integer identifier.
*
*
*
On the sending end: The first time a value is to be serialized, a new id is created
* and a mapping for it is added to the table. The id is emitted on the wire alongside the
* value's serialized representation. If the same value surfaces again later on, instead of
* reserializing it, we just emit a backreference consisting of its id.
*
On the receiving end: Each deserialized value is stored in the memo table along with
* the id it was associated with. When a backreference is read, the value in the memo table is
* returned instead of deserializing a new copy of that same value.
*
*
*
Cyclic data structures can occur either naturally in complex types, or as a result of a
* pathological Skylark program. An example of the former is how a user-defined Skylark function
* implicitly refers to its global frame, which in turn refers to the functions defined in that
* frame. An example of the latter is a list that the user mutates to contain itself as an element.
* Such pathological values in Skylark are technically allowed, but they are not useful since
* Skylark prohibits recursive function calls. They can also expose implementation bugs in code that
* is not expecting them (b/30310522).
*
*
Ideally, to handle cyclic data structures, the serializer should add the value to the memo
* table before actually performing the serialization. For example, to handle the recursive
* list {@code L = ["A", L]}, the serializer should do the following:
*
*
*
add a memo mapping from {@code L} to a fresh id, {@code k}
*
emit {@code k} to the wire
*
serialize {@code L}, which means it has to
*
*
write its length, {@code 2}
*
serialize the string {@code "A"}
*
serialize {@code L}, but since this matches the entry added in Step 1, it just emits
* {@code k} as a backreference
*
*
*
* The problem is that on the other end of the wire, the deserializer needs to associate a value
* with the memo entry for k before {@code L} has been fully formed. To solve this, we associate k
* with a new empty list as the initial value, then allow the deserialization logic to mutate this
* list to form {@code L}. It is important that this is done by mutating the initial list rather
* than by replacing it with another list, since each backreference to k creates an actual Java
* reference to the initial object.
*
*
However, this strategy does not work for all types of values. There is no way to deserialize
* the recursive tuple {@code T = ("A", T)}, because tuples are immutable and therefore cannot be
* instantiated before all of their elements have been. Rather than restrict serialization to only
* mutable types, or add a special way for deserializers to modify seemingly immutable types, we
* simply don't memoize immutable types until after they are fully constructed. This means that
* {@code T} is not serializable. But that's okay, because objects like {@code T} should not even be
* able to exist (barring a hidden API or reflection). In general, all cycles must go through at
* least one mutable type of value.
*
*
Aside from mutability, there is another potential problem: One of the types' constructors may
* enforce an invariant that is not satisfied at the time the constructor is invoked, even though it
* may be satisfied once construction of all objects is complete. For instance, suppose a type
* {@code Foo} has a constructor that takes a non-empty list. Then we would fail to deserialize
* {@code L = [Foo(L)]}, since {@code L} is initially empty. Such a list could legally be formed by
* putting other elements in {@code L} before creating {@code Foo}, and then later removing those
* other elements. But there's no general way for a deserializer to know how to do that. Therefore,
* it is the caller's responsibility to ensure the following property:
*
*
*
* For any value that is to be serialized, if the value has children that directly or indirectly
* contain the value, then the value must be constructible even when those children are in a
* semi-constructed state.
*
*
*
* where "semi-constructed state" means any state that can be produced by the codecs for those
* children. Other serialization systems address this issue by providing multiple hooks for types to
* setup their invariants, but we keep the API relatively simple.
*
*
Round-tripping a value through memoized serialization and deserialization is guaranteed to
* preserve the object graph, i.e., to not duplicate a value. For mutable types, a value can only be
* serialized and deserialized at most once because it is memoized before recursing over its
* children. For immutable types, although a value can be serialized multiple times, upon
* deserialization only one copy is retained in the memo table. This is conceptually similar to how
* Python's Pickle library
* handles tuples, although in that case they use an abstract machine whereas we do not.
*
*
*
* where {@code Payload} is the serialized representation of the value. {@code Payload} may itself
* contain complete memo-aware encodings of the value's children.
*/
// TODO(brandjon): Maybe make this more robust against a pathological cycle of immutable objects, so
// that instead of failing with a stack overflow, we detect the cycle and throw
// SerializationException. This requires just a little extra memo tracking for the MEMOIZE_AFTER
// case.
class Memoizer {
private Memoizer() {}
/** A context for serializing; wraps a memo table. Not thread-safe. */
static class Serializer {
private final SerializingMemoTable memo = new SerializingMemoTable();
/**
* Serializes an object using the given codec and current memo table state.
*
* @throws SerializationException on a logical error during serialization
* @throws IOException on {@link IOException} during serialization
*/
void serialize(
SerializationContext context,
T obj,
ObjectCodec super T> codec,
CodedOutputStream codedOut)
throws SerializationException, IOException {
MemoizationStrategy strategy = codec.getStrategy();
if (strategy == MemoizationStrategy.DO_NOT_MEMOIZE) {
codec.serialize(context, obj, codedOut);
} else {
// The caller already checked the table, so this is definitely a new value.
serializeMemoContent(context, obj, codec, codedOut, strategy);
}
}
@Nullable
Integer getMemoizedIndex(Object obj) {
return memo.lookupNullable(obj);
}
// Corresponds to MemoContent in the abstract grammar.
private void serializeMemoContent(
SerializationContext context,
T obj,
ObjectCodec codec,
CodedOutputStream codedOut,
MemoizationStrategy strategy)
throws SerializationException, IOException {
switch(strategy) {
case MEMOIZE_BEFORE: {
int id = memo.memoize(obj);
codedOut.writeInt32NoTag(id);
codec.serialize(context, obj, codedOut);
break;
}
case MEMOIZE_AFTER: {
codec.serialize(context, obj, codedOut);
// If serializing the children caused the parent object itself to be serialized due to a
// cycle, then there's now a memo entry for the parent. Don't overwrite it with a new id.
Integer cylicallyCreatedId = memo.lookupNullable(obj);
int id = (cylicallyCreatedId != null) ? cylicallyCreatedId : memo.memoize(obj);
codedOut.writeInt32NoTag(id);
break;
}
default:
throw new AssertionError("Unreachable (strategy=" + strategy + ")");
}
}
private static class SerializingMemoTable {
private final IdentityHashMap