aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/core/lib/io/block_builder.cc
diff options
context:
space:
mode:
Diffstat (limited to 'tensorflow/core/lib/io/block_builder.cc')
-rw-r--r--tensorflow/core/lib/io/block_builder.cc107
1 files changed, 107 insertions, 0 deletions
diff --git a/tensorflow/core/lib/io/block_builder.cc b/tensorflow/core/lib/io/block_builder.cc
new file mode 100644
index 0000000000..d94048d744
--- /dev/null
+++ b/tensorflow/core/lib/io/block_builder.cc
@@ -0,0 +1,107 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// BlockBuilder generates blocks where keys are prefix-compressed:
+//
+// When we store a key, we drop the prefix shared with the previous
+// string. This helps reduce the space requirement significantly.
+// Furthermore, once every K keys, we do not apply the prefix
+// compression and store the entire key. We call this a "restart
+// point". The tail end of the block stores the offsets of all of the
+// restart points, and can be used to do a binary search when looking
+// for a particular key. Values are stored as-is (without compression)
+// immediately following the corresponding key.
+//
+// An entry for a particular key-value pair has the form:
+// shared_bytes: varint32
+// unshared_bytes: varint32
+// value_length: varint32
+// key_delta: char[unshared_bytes]
+// value: char[value_length]
+// shared_bytes == 0 for restart points.
+//
+// The trailer of the block has the form:
+// restarts: uint32[num_restarts]
+// num_restarts: uint32
+// restarts[i] contains the offset within the block of the ith restart point.
+
+#include "tensorflow/core/lib/io/block_builder.h"
+
+#include <algorithm>
+#include <assert.h>
+#include "tensorflow/core/lib/io/table_builder.h"
+#include "tensorflow/core/lib/core/coding.h"
+
+namespace tensorflow {
+namespace table {
+
+BlockBuilder::BlockBuilder(const Options* options)
+ : options_(options), restarts_(), counter_(0), finished_(false) {
+ assert(options->block_restart_interval >= 1);
+ restarts_.push_back(0); // First restart point is at offset 0
+}
+
+void BlockBuilder::Reset() {
+ buffer_.clear();
+ restarts_.clear();
+ restarts_.push_back(0); // First restart point is at offset 0
+ counter_ = 0;
+ finished_ = false;
+ last_key_.clear();
+}
+
+size_t BlockBuilder::CurrentSizeEstimate() const {
+ return (buffer_.size() + // Raw data buffer
+ restarts_.size() * sizeof(uint32) + // Restart array
+ sizeof(uint32)); // Restart array length
+}
+
+StringPiece BlockBuilder::Finish() {
+ // Append restart array
+ for (size_t i = 0; i < restarts_.size(); i++) {
+ core::PutFixed32(&buffer_, restarts_[i]);
+ }
+ core::PutFixed32(&buffer_, restarts_.size());
+ finished_ = true;
+ return StringPiece(buffer_);
+}
+
+void BlockBuilder::Add(const StringPiece& key, const StringPiece& value) {
+ StringPiece last_key_piece(last_key_);
+ assert(!finished_);
+ assert(counter_ <= options_->block_restart_interval);
+ assert(buffer_.empty() // No values yet?
+ || key.compare(last_key_piece) > 0);
+ size_t shared = 0;
+ if (counter_ < options_->block_restart_interval) {
+ // See how much sharing to do with previous string
+ const size_t min_length = std::min(last_key_piece.size(), key.size());
+ while ((shared < min_length) && (last_key_piece[shared] == key[shared])) {
+ shared++;
+ }
+ } else {
+ // Restart compression
+ restarts_.push_back(buffer_.size());
+ counter_ = 0;
+ }
+ const size_t non_shared = key.size() - shared;
+
+ // Add "<shared><non_shared><value_size>" to buffer_
+ core::PutVarint32(&buffer_, shared);
+ core::PutVarint32(&buffer_, non_shared);
+ core::PutVarint32(&buffer_, value.size());
+
+ // Add string delta to buffer_ followed by value
+ buffer_.append(key.data() + shared, non_shared);
+ buffer_.append(value.data(), value.size());
+
+ // Update state
+ last_key_.resize(shared);
+ last_key_.append(key.data() + shared, non_shared);
+ assert(StringPiece(last_key_) == key);
+ counter_++;
+}
+
+} // namespace table
+} // namespace tensorflow