diff options
Diffstat (limited to 'src/main/java/com/google/devtools/build/lib/util/CompactStringIndexer.java')
-rw-r--r-- | src/main/java/com/google/devtools/build/lib/util/CompactStringIndexer.java | 546 |
1 files changed, 546 insertions, 0 deletions
diff --git a/src/main/java/com/google/devtools/build/lib/util/CompactStringIndexer.java b/src/main/java/com/google/devtools/build/lib/util/CompactStringIndexer.java new file mode 100644 index 0000000000..698758dba6 --- /dev/null +++ b/src/main/java/com/google/devtools/build/lib/util/CompactStringIndexer.java @@ -0,0 +1,546 @@ +// Copyright 2014 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +package com.google.devtools.build.lib.util; + +import static java.nio.charset.StandardCharsets.UTF_8; + +import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; +import com.google.devtools.build.lib.concurrent.ThreadSafety.ThreadCompatible; +import com.google.devtools.build.lib.concurrent.ThreadSafety.ThreadSafe; + +import java.util.ArrayList; + +/** + * Provides memory-efficient bidirectional mapping String <-> unique integer. + * Uses byte-wise compressed prefix trie internally. + * <p> + * Class allows index retrieval for the given string, addition of the new + * index and string retrieval for the given index. It also allows efficient + * serialization of the internal data structures. + * <p> + * Internally class stores list of nodes with each node containing byte[] + * representation of compressed trie node: + * <pre> + * varint32 parentIndex; // index of the parent node + * varint32 keylen; // length of the node key + * byte[keylen] key; // node key data + * repeated jumpEntry { // Zero or more jump entries, referencing child nodes + * byte key // jump key (first byte of the child node key) + * varint32 nodeIndex // child index + * } + * <p> + * Note that jumpEntry key byte is actually duplicated in the child node + * instance. This is done to improve performance of the index->string + * lookup (so we can avoid jump table parsing during this lookup). + * <p> + * Root node of the trie must have parent id pointing to itself. + * <p> + * TODO(bazel-team): (2010) Consider more fine-tuned locking mechanism - e.g. + * distinguishing between read and write locks. + */ +@ThreadSafe +public class CompactStringIndexer extends AbstractIndexer { + + private static final int NOT_FOUND = -1; + + private ArrayList<byte[]> nodes; // Compressed prefix trie nodes. + private int rootId; // Root node id. + + /* + * Creates indexer instance. + */ + public CompactStringIndexer (int expectedCapacity) { + Preconditions.checkArgument(expectedCapacity > 0); + nodes = Lists.newArrayListWithExpectedSize(expectedCapacity); + rootId = NOT_FOUND; + } + + /** + * Allocates new node index. Must be called only from + * synchronized methods. + */ + private int allocateIndex() { + nodes.add(null); + return nodes.size() - 1; + } + + /** + * Replaces given node record with the new one. Must be called only from + * synchronized methods. + * <p> + * Subclasses can override this method to be notified when an update actually + * takes place. + */ + @ThreadCompatible + protected void updateNode(int index, byte[] content) { + nodes.set(index, content); + } + + /** + * Returns parent id for the given node content. + * + * @return parent node id + */ + private int getParentId(byte[] content) { + int[] intHolder = new int[1]; + VarInt.getVarInt(content, 0, intHolder); + return intHolder[0]; + } + + /** + * Creates new node using specified key suffix. Must be called from + * synchronized methods. + * + * @param parentNode parent node id + * @param key original key that is being added to the indexer + * @param offset node key offset in the original key. + * + * @return new node id corresponding to the given key + */ + private int createNode(int parentNode, byte[] key, int offset) { + int index = allocateIndex(); + + int len = key.length - offset; + Preconditions.checkState(len >= 0); + + // Content consists of parent id, key length and key. There are no jump records. + byte[] content = new byte[VarInt.varIntSize(parentNode) + VarInt.varIntSize(len) + len]; + // Add parent id. + int contentOffset = VarInt.putVarInt(parentNode, content, 0); + // Add node key length. + contentOffset = VarInt.putVarInt(len, content, contentOffset); + // Add node key content. + System.arraycopy(key, offset, content, contentOffset, len); + + updateNode(index, content); + return index; + } + + /** + * Updates jump entry index in the given node. + * + * @param node node id to update + * @param oldIndex old jump entry index + * @param newIndex updated jump entry index + */ + private void updateJumpEntry(int node, int oldIndex, int newIndex) { + byte[] content = nodes.get(node); + int[] intHolder = new int[1]; + int offset = VarInt.getVarInt(content, 0, intHolder); // parent id + offset = VarInt.getVarInt(content, offset, intHolder); // key length + offset += intHolder[0]; // Offset now points to the first jump entry. + while (offset < content.length) { + int next = VarInt.getVarInt(content, offset + 1, intHolder); // jump index + if (intHolder[0] == oldIndex) { + // Substitute oldIndex value with newIndex. + byte[] newContent = + new byte[content.length + VarInt.varIntSize(newIndex) - VarInt.varIntSize(oldIndex)]; + System.arraycopy(content, 0, newContent, 0, offset + 1); + offset = VarInt.putVarInt(newIndex, newContent, offset + 1); + System.arraycopy(content, next, newContent, offset, content.length - next); + updateNode(node, newContent); + return; + } else { + offset = next; + } + } + StringBuilder builder = new StringBuilder().append("Index ").append(oldIndex) + .append(" is not present in the node ").append(node).append(", "); + dumpNodeContent(builder, content); + throw new IllegalArgumentException(builder.toString()); + } + + /** + * Creates new branch node content at the predefined location, splitting + * prefix from the given node and optionally adding another child node + * jump entry. + * + * @param originalNode node that will be split + * @param newBranchNode new branch node id + * @param splitOffset offset at which to split original node key + * @param indexKey optional additional jump key + * @param childIndex optional additional jump index. Optional jump entry will + * be skipped if this index is set to NOT_FOUND. + */ + private void createNewBranchNode(int originalNode, int newBranchNode, int splitOffset, + byte indexKey, int childIndex) { + byte[] content = nodes.get(originalNode); + int[] intHolder = new int[1]; + int keyOffset = VarInt.getVarInt(content, 0, intHolder); // parent id + + // If original node is a root node, new branch node will become new root. So set parent id + // appropriately (for root node it is set to the node's own id). + int parentIndex = (originalNode == intHolder[0] ? newBranchNode : intHolder[0]); + + keyOffset = VarInt.getVarInt(content, keyOffset, intHolder); // key length + Preconditions.checkState(intHolder[0] >= splitOffset); + // Calculate new content size. + int newSize = VarInt.varIntSize(parentIndex) + + VarInt.varIntSize(splitOffset) + splitOffset + + 1 + VarInt.varIntSize(originalNode) + + (childIndex != NOT_FOUND ? 1 + VarInt.varIntSize(childIndex) : 0); + // New content consists of parent id, new key length, truncated key and two jump records. + byte[] newContent = new byte[newSize]; + // Add parent id. + int contentOffset = VarInt.putVarInt(parentIndex, newContent, 0); + // Add adjusted key length. + contentOffset = VarInt.putVarInt(splitOffset, newContent, contentOffset); + // Add truncated key content and first jump key. + System.arraycopy(content, keyOffset, newContent, contentOffset, splitOffset + 1); + // Add index for the first jump key. + contentOffset = VarInt.putVarInt(originalNode, newContent, contentOffset + splitOffset + 1); + // If requested, add additional jump entry. + if (childIndex != NOT_FOUND) { + // Add second jump key. + newContent[contentOffset] = indexKey; + // Add index for the second jump key. + VarInt.putVarInt(childIndex, newContent, contentOffset + 1); + } + updateNode(newBranchNode, newContent); + } + + /** + * Inject newly created branch node into the trie data structure. Method + * will update parent node jump entry to point to the new branch node (or + * will update root id if branch node becomes new root) and will truncate + * key prefix from the original node that was split (that prefix now + * resides in the branch node). + * + * @param originalNode node that will be split + * @param newBranchNode new branch node id + * @param commonPrefixLength how many bytes should be split into the new branch node. + */ + private void injectNewBranchNode(int originalNode, int newBranchNode, int commonPrefixLength) { + byte[] content = nodes.get(originalNode); + + int parentId = getParentId(content); + if (originalNode == parentId) { + rootId = newBranchNode; // update root index + } else { + updateJumpEntry(parentId, originalNode, newBranchNode); + } + + // Truncate prefix from the original node and set its parent to the our new branch node. + int[] intHolder = new int[1]; + int suffixOffset = VarInt.getVarInt(content, 0, intHolder); // parent id + suffixOffset = VarInt.getVarInt(content, suffixOffset, intHolder); // key length + int len = intHolder[0] - commonPrefixLength; + Preconditions.checkState(len >= 0); + suffixOffset += commonPrefixLength; + // New content consists of parent id, new key length and duplicated key suffix. + byte[] newContent = new byte[VarInt.varIntSize(newBranchNode) + VarInt.varIntSize(len) + + (content.length - suffixOffset)]; + // Add parent id. + int contentOffset = VarInt.putVarInt(newBranchNode, newContent, 0); + // Add new key length. + contentOffset = VarInt.putVarInt(len, newContent, contentOffset); + // Add key and jump table. + System.arraycopy(content, suffixOffset, newContent, contentOffset, + content.length - suffixOffset); + updateNode(originalNode, newContent); + } + + /** + * Adds new child node (that uses specified key suffix) to the given + * current node. + * Example: + * <pre> + * Had "ab". Adding "abcd". + * + * 1:"ab",'c'->2 + * 1:"ab" -> \ + * 2:"cd" + * </pre> + */ + private int addChildNode(int parentNode, byte[] key, int keyOffset) { + int child = createNode(parentNode, key, keyOffset); + + byte[] content = nodes.get(parentNode); + // Add jump table entry to the parent node. + int entryOffset = content.length; + // New content consists of original content and additional jump record. + byte[] newContent = new byte[entryOffset + 1 + VarInt.varIntSize(child)]; + // Copy original content. + System.arraycopy(content, 0, newContent, 0, entryOffset); + // Add jump key. + newContent[entryOffset] = key[keyOffset]; + // Add jump index. + VarInt.putVarInt(child, newContent, entryOffset + 1); + + updateNode(parentNode, newContent); + return child; + } + + /** + * Splits node into two at the specified offset. + * Example: + * <pre> + * Had "abcd". Adding "ab". + * + * 2:"ab",'c'->1 + * 1:"abcd" -> \ + * 1:"cd" + * </pre> + */ + private int splitNodeSuffix(int nodeToSplit, int commonPrefixLength) { + int newBranchNode = allocateIndex(); + // Create new node with truncated key. + createNewBranchNode(nodeToSplit, newBranchNode, commonPrefixLength, (byte) 0, NOT_FOUND); + injectNewBranchNode(nodeToSplit, newBranchNode, commonPrefixLength); + + return newBranchNode; + } + + /** + * Splits node into two at the specified offset and adds another leaf. + * Example: + * <pre> + * Had "abcd". Adding "abef". + * + * 3:"ab",'c'->1,'e'->2 + * 1:"abcd" -> / \ + * 1:"cd" 2:"ef" + * </pre> + */ + private int addBranch(int nodeToSplit, byte[] key, int offset, int commonPrefixLength) { + int newBranchNode = allocateIndex(); + int child = createNode(newBranchNode, key, offset + commonPrefixLength); + // Create new node with the truncated key and reference to the new child node. + createNewBranchNode(nodeToSplit, newBranchNode, commonPrefixLength, + key[offset + commonPrefixLength], child); + injectNewBranchNode(nodeToSplit, newBranchNode, commonPrefixLength); + + return child; + } + + private int findOrCreateIndexInternal(int node, byte[] key, int offset, + boolean createIfNotFound) { + byte[] content = nodes.get(node); + int[] intHolder = new int[1]; + int contentOffset = VarInt.getVarInt(content, 0, intHolder); // parent id + contentOffset = VarInt.getVarInt(content, contentOffset, intHolder); // key length + int skyKeyLen = intHolder[0]; + int remainingKeyLen = key.length - offset; + int minKeyLen = remainingKeyLen > skyKeyLen ? skyKeyLen : remainingKeyLen; + + // Compare given key/offset content with the node key. Skip first key byte for recursive + // calls - this byte is equal to the byte in the jump entry and was already compared. + for (int i = (offset > 0 ? 1 : 0); i < minKeyLen; i++) { // compare key + if (key[offset + i] != content[contentOffset + i]) { + // Mismatch found somewhere in the middle of the node key. If requested, node + // should be split and another leaf added for the new key. + return createIfNotFound ? addBranch(node, key, offset, i) : NOT_FOUND; + } + } + + if (remainingKeyLen > minKeyLen) { + // Node key matched portion of the key - find appropriate jump entry. If found - recursion. + // If not - mismatch (we will add new child node if requested). + contentOffset += skyKeyLen; + while (contentOffset < content.length) { + if (key[offset + skyKeyLen] == content[contentOffset]) { // compare index value + VarInt.getVarInt(content, contentOffset + 1, intHolder); + // Found matching jump entry - recursively compare the child. + return findOrCreateIndexInternal(intHolder[0], key, offset + skyKeyLen, + createIfNotFound); + } else { + // Jump entry key does not match. Skip rest of the entry data. + contentOffset = VarInt.getVarInt(content, contentOffset + 1, intHolder); + } + } + // There are no matching jump entries - report mismatch or create a new leaf if necessary. + return createIfNotFound ? addChildNode(node, key, offset + skyKeyLen) : NOT_FOUND; + } else if (skyKeyLen > minKeyLen) { + // Key suffix is a subset of the node key. Report mismatch or split the node if requested). + return createIfNotFound ? splitNodeSuffix(node, minKeyLen) : NOT_FOUND; + } else { + // Node key exactly matches key suffix - return associated index value. + return node; + } + } + + private synchronized int findOrCreateIndex(byte[] key, boolean createIfNotFound) { + if (rootId == NOT_FOUND) { + // Root node does not seem to exist - create it if needed. + if (createIfNotFound) { + rootId = createNode(0, key, 0); + Preconditions.checkState(rootId == 0); + return 0; + } else { + return NOT_FOUND; + } + } + return findOrCreateIndexInternal(rootId, key, 0, createIfNotFound); + } + + private byte[] reconstructKeyInternal(int node, int suffixSize) { + byte[] content = nodes.get(node); + Preconditions.checkNotNull(content); + int[] intHolder = new int[1]; + int contentOffset = VarInt.getVarInt(content, 0, intHolder); // parent id + int parentNode = intHolder[0]; + contentOffset = VarInt.getVarInt(content, contentOffset, intHolder); // key length + int len = intHolder[0]; + byte[] key; + if (node != parentNode) { + // We haven't reached root node yet. Make a recursive call, adjusting suffix length. + key = reconstructKeyInternal(parentNode, suffixSize + len); + } else { + // We are in a root node. Finally allocate array for the key. It will be filled up + // on our way back from recursive call tree. + key = new byte[suffixSize + len]; + } + // Fill appropriate portion of the full key with the node key content. + System.arraycopy(content, contentOffset, key, key.length - suffixSize - len, len); + return key; + } + + private byte[] reconstructKey(int node) { + return reconstructKeyInternal(node, 0); + } + + /* (non-Javadoc) + * @see com.google.devtools.build.lib.util.StringIndexer#clear() + */ + @Override + public synchronized void clear() { + nodes.clear(); + } + + /* (non-Javadoc) + * @see com.google.devtools.build.lib.util.StringIndexer#size() + */ + @Override + public synchronized int size() { + return nodes.size(); + } + + protected int getOrCreateIndexForBytes(byte[] bytes) { + return findOrCreateIndex(bytes, true); + } + + protected synchronized boolean addBytes(byte[] bytes) { + int count = nodes.size(); + int index = getOrCreateIndexForBytes(bytes); + return index >= count; + } + + protected int getIndexForBytes(byte[] bytes) { + return findOrCreateIndex(bytes, false); + } + + /* (non-Javadoc) + * @see com.google.devtools.build.lib.util.StringIndexer#getOrCreateIndex(java.lang.String) + */ + @Override + public int getOrCreateIndex(String s) { + return getOrCreateIndexForBytes(string2bytes(s)); + } + + /* (non-Javadoc) + * @see com.google.devtools.build.lib.util.StringIndexer#getIndex(java.lang.String) + */ + @Override + public int getIndex(String s) { + return getIndexForBytes(string2bytes(s)); + } + + /* (non-Javadoc) + * @see com.google.devtools.build.lib.util.StringIndexer#addString(java.lang.String) + */ + @Override + public boolean addString(String s) { + return addBytes(string2bytes(s)); + } + + protected synchronized byte[] getBytesForIndex(int i) { + Preconditions.checkArgument(i >= 0); + if (i >= nodes.size()) { + return null; + } + return reconstructKey(i); + } + + /* (non-Javadoc) + * @see com.google.devtools.build.lib.util.StringIndexer#getStringForIndex(int) + */ + @Override + public String getStringForIndex(int i) { + byte[] bytes = getBytesForIndex(i); + return bytes != null ? bytes2string(bytes) : null; + } + + private void dumpNodeContent(StringBuilder builder, byte[] content) { + int[] intHolder = new int[1]; + int offset = VarInt.getVarInt(content, 0, intHolder); + builder.append("parent: ").append(intHolder[0]); + offset = VarInt.getVarInt(content, offset, intHolder); + int len = intHolder[0]; + builder.append(", len: ").append(len).append(", key: \"") + .append(new String(content, offset, len, UTF_8)).append('"'); + offset += len; + while (offset < content.length) { + builder.append(", '").append(new String(content, offset, 1, UTF_8)).append("': "); + offset = VarInt.getVarInt(content, offset + 1, intHolder); + builder.append(intHolder[0]); + } + builder.append(", size: ").append(content.length); + } + + private int dumpContent(StringBuilder builder, int node, int indent, boolean[] seen) { + for(int i = 0; i < indent; i++) { + builder.append(" "); + } + builder.append(node).append(": "); + if (node >= nodes.size()) { + builder.append("OUT_OF_BOUNDS\n"); + return 0; + } else if (seen[node]) { + builder.append("ALREADY_SEEN\n"); + return 0; + } + seen[node] = true; + byte[] content = nodes.get(node); + if (content == null) { + builder.append("NULL\n"); + return 0; + } + dumpNodeContent(builder, content); + builder.append("\n"); + int contentSize = content.length; + + int[] intHolder = new int[1]; + int contentOffset = VarInt.getVarInt(content, 0, intHolder); // parent id + contentOffset = VarInt.getVarInt(content, contentOffset, intHolder); // key length + contentOffset += intHolder[0]; + while (contentOffset < content.length) { + contentOffset = VarInt.getVarInt(content, contentOffset + 1, intHolder); + contentSize += dumpContent(builder, intHolder[0], indent + 1, seen); + } + return contentSize; + } + + @Override + public synchronized String toString() { + StringBuilder builder = new StringBuilder(); + builder.append("size = ").append(nodes.size()).append("\n"); + if (nodes.size() > 0) { + int contentSize = dumpContent(builder, rootId, 0, new boolean[nodes.size()]); + builder.append("contentSize = ").append(contentSize).append("\n"); + } + return builder.toString(); + } + +} |