diff options
author | Eugene Brevdo <ebrevdo@google.com> | 2016-08-11 17:53:31 -0800 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2016-08-11 19:03:16 -0700 |
commit | fc218ef4d53e697c1a97d90a57a7437255575f89 (patch) | |
tree | 375c793f7d9169fc4320b665dff2e80adc7f0d0a /tensorflow/core/util/presized_cuckoo_map.h | |
parent | 71ee173c80679289e5995d540c6199ef3747cb3a (diff) |
Implement PresizedCuckooMap. Original design and implementation by
dga@ (Dave Andersen). PresizedCuckooMap is a fast map implementation
using Cuckoo hashing. See header for details on when to use this.
Change: 130058678
Diffstat (limited to 'tensorflow/core/util/presized_cuckoo_map.h')
-rw-r--r-- | tensorflow/core/util/presized_cuckoo_map.h | 328 |
1 file changed, 328 insertions, 0 deletions
diff --git a/tensorflow/core/util/presized_cuckoo_map.h b/tensorflow/core/util/presized_cuckoo_map.h new file mode 100644 index 0000000000..b488d32e03 --- /dev/null +++ b/tensorflow/core/util/presized_cuckoo_map.h @@ -0,0 +1,328 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_UTIL_PRESIZED_CUCKOO_MAP_H_ +#define TENSORFLOW_UTIL_PRESIZED_CUCKOO_MAP_H_ + +#include <algorithm> +#include <vector> +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { + +// Class for efficiently storing key->value mappings when the size is +// known in advance and the keys are pre-hashed into uint64s. +// Keys should have "good enough" randomness (be spread across the +// entire 64 bit space). +// +// Important: Clients wishing to use deterministic keys must +// ensure that their keys fall in the range 0 .. (uint64max-1); +// the table uses 2^64-1 as the "not occupied" flag. +// +// Inserted keys must be unique, and there are no update +// or delete functions (until some subsequent use of this table +// requires them). +// +// Threads must synchronize their access to a PresizedCuckooMap. +// +// The cuckoo hash table is 4-way associative (each "bucket" has 4 +// "slots" for key/value entries). 
Uses breadth-first-search to find +// a good cuckoo path with less data movement (see +// http://www.cs.cmu.edu/~dga/papers/cuckoo-eurosys14.pdf ) + +template <class value> +class PresizedCuckooMap { + public: + // The key type is fixed as a pre-hashed key for this specialized use. + typedef uint64 key_type; + + explicit PresizedCuckooMap(uint64 num_entries) : cpq_(new CuckooPathQueue) { + double n(num_entries); + n /= kLoadFactor; + num_buckets_ = (static_cast<uint64>(n) / kSlotsPerBucket); + // Very small cuckoo tables don't work, because the probability + // of having same-bucket hashes is large. We compromise for those + // uses by having a larger static starting size. + num_buckets_ += 32; + Bucket empty_bucket; + for (int i = 0; i < kSlotsPerBucket; i++) { + empty_bucket.keys[i] = kUnusedSlot; + } + buckets_.resize(num_buckets_, empty_bucket); +#if !defined(__GCUDACC__) && !defined(__GCUDACC_HOST__) + buckets_divisor_ = Eigen::internal::TensorIntDivisor<uint64>(num_buckets_); +#endif + } + + // Returns false if k is already in table or if the table + // is full; true otherwise. + bool InsertUnique(const key_type k, const value& v) { + uint64 tk = key_transform(k); + uint64 b1 = fast_mod_by_buckets(tk); + uint64 b2 = fast_mod_by_buckets(h2(tk)); + + // Merged find and duplicate checking. + uint64 target_bucket = 0; + int target_slot = kNoSpace; + + for (auto bucket : {b1, b2}) { + Bucket* bptr = &buckets_[bucket]; + for (int slot = 0; slot < kSlotsPerBucket; slot++) { + if (bptr->keys[slot] == k) { // Duplicates are not allowed. + return false; + } else if (target_slot == kNoSpace && bptr->keys[slot] == kUnusedSlot) { + target_bucket = bucket; + target_slot = slot; + } + } + } + + if (target_slot != kNoSpace) { + InsertInternal(tk, v, target_bucket, target_slot); + return true; + } + + return CuckooInsert(tk, v, b1, b2); + } + + // Returns true if found. Sets *out = value. 
+ bool Find(const key_type k, value* out) const { + uint64 tk = key_transform(k); + return FindInBucket(k, fast_mod_by_buckets(tk), out) || + FindInBucket(k, fast_mod_by_buckets(h2(tk)), out); + } + + private: + static constexpr int kSlotsPerBucket = 4; + + // The load factor is chosen slightly conservatively for speed and + // to avoid the need for a table rebuild on insertion failure. + // 0.94 is achievable, but 0.85 is faster and keeps the code simple + // at the cost of a small amount of memory. + // NOTE: 0 < kLoadFactor <= 1.0 + static constexpr double kLoadFactor = 0.85; + + // Cuckoo insert: The maximum number of entries to scan should be ~400 + // (Source: Personal communication with Michael Mitzenmacher; empirical + // experiments validate.). After trying 400 candidate locations, declare + // the table full - it's probably full of unresolvable cycles. Less than + // 400 reduces max occupancy; much more results in very poor performance + // around the full point. For (2,4) a max BFS path len of 5 results in ~682 + // nodes to visit, calculated below, and is a good value. + + static constexpr uint8 kMaxBFSPathLen = 5; + + // Constants for BFS cuckoo path search: + // The visited list must be maintained for all but the last level of search + // in order to trace back the path. The BFS search has two roots + // and each can go to a total depth (including the root) of 5. + // The queue must be sized for 2 * \sum_{k=0...4}{kSlotsPerBucket^k} = 682. + // The visited queue, however, does not need to hold the deepest level, + // and so it is sized 2 * \sum{k=0...3}{kSlotsPerBucket^k} = 170 + static constexpr int kMaxQueueSize = 682; + static constexpr int kVisitedListSize = 170; + + static constexpr int kNoSpace = -1; // SpaceAvailable return + static constexpr uint64 kUnusedSlot = ~(0ULL); + + // Buckets are organized with key_types clustered for access speed + // and for compactness while remaining aligned. 
+ struct Bucket { + key_type keys[kSlotsPerBucket]; + value values[kSlotsPerBucket]; + }; + + // Insert uses the BFS optimization (search before moving) to reduce + // the number of cache lines dirtied during search. + + struct CuckooPathEntry { + uint64 bucket; + int depth; + int parent; // To index in the visited array. + int parent_slot; // Which slot in our parent did we come from? -1 == root. + }; + + // CuckooPathQueue is a trivial circular queue for path entries. + // The caller is responsible for not inserting more than kMaxQueueSize + // entries. Each PresizedCuckooMap has one (heap-allocated) CuckooPathQueue + // that it reuses across inserts. + class CuckooPathQueue { + public: + CuckooPathQueue() : head_(0), tail_(0) {} + + void push_back(CuckooPathEntry e) { + queue_[tail_] = e; + tail_ = (tail_ + 1) % kMaxQueueSize; + } + + CuckooPathEntry pop_front() { + CuckooPathEntry& e = queue_[head_]; + head_ = (head_ + 1) % kMaxQueueSize; + return e; + } + + bool empty() { return head_ == tail_; } + + bool full() { return ((tail_ + 1) % kMaxQueueSize) == head_; } + + void reset() { head_ = tail_ = 0; } + + private: + CuckooPathEntry queue_[kMaxQueueSize]; + int head_; + int tail_; + }; + + typedef std::array<CuckooPathEntry, kMaxBFSPathLen> CuckooPath; + + // Callers are expected to have pre-hashed the keys into a uint64 + // and are expected to be able to handle (a very low rate) of + // collisions, OR must ensure that their keys are always in + // the range 0 - (uint64max - 1). This transforms 'not found flag' + // keys into something else. + inline uint64 key_transform(const key_type k) const { + return k + (k == kUnusedSlot); + } + + // h2 performs a very quick mix of h to generate the second bucket hash. + // Assumes there is plenty of remaining entropy in the initial h. 
+ inline uint64 h2(uint64 h) const { + const uint64 m = 0xc6a4a7935bd1e995; + return m * ((h >> 32) | (h << 32)); + } + + // alt_bucket identifies the "other" bucket for key k, whether + // other is "the one that isn't bucket b" + inline uint64 alt_bucket(key_type k, uint64 b) const { + if (fast_mod_by_buckets(k) != b) { + return fast_mod_by_buckets(k); + } + return fast_mod_by_buckets(h2(k)); + } + + inline void InsertInternal(key_type k, const value& v, uint64 b, int slot) { + Bucket* bptr = &buckets_[b]; + bptr->keys[slot] = k; + bptr->values[slot] = v; + } + + // For the associative cuckoo table, check all of the slots in + // the bucket to see if the key is present. + bool FindInBucket(key_type k, uint64 b, value* out) const { + const Bucket& bref = buckets_[b]; + for (int i = 0; i < kSlotsPerBucket; i++) { + if (bref.keys[i] == k) { + *out = bref.values[i]; + return true; + } + } + return false; + } + + // returns either kNoSpace or the index of an + // available slot (0 <= slot < kSlotsPerBucket) + inline int SpaceAvailable(uint64 bucket) const { + const Bucket& bref = buckets_[bucket]; + for (int i = 0; i < kSlotsPerBucket; i++) { + if (bref.keys[i] == kUnusedSlot) { + return i; + } + } + return kNoSpace; + } + + inline void CopyItem(uint64 src_bucket, int src_slot, uint64 dst_bucket, + int dst_slot) { + Bucket& src_ref = buckets_[src_bucket]; + Bucket& dst_ref = buckets_[dst_bucket]; + dst_ref.keys[dst_slot] = src_ref.keys[src_slot]; + dst_ref.values[dst_slot] = src_ref.values[src_slot]; + } + + bool CuckooInsert(key_type k, const value& v, uint64 b1, uint64 b2) { + int visited_end = 0; + cpq_->reset(); + + cpq_->push_back({b1, 1, 0, 0}); // Note depth starts at 1. + cpq_->push_back({b2, 1, 0, 0}); + + while (!cpq_->empty()) { + CuckooPathEntry e = cpq_->pop_front(); + int free_slot; + free_slot = SpaceAvailable(e.bucket); + if (free_slot != kNoSpace) { + while (e.depth > 1) { + // "copy" instead of "swap" because one entry is always zero. 
+ // After, write target key/value over top of last copied entry. + CuckooPathEntry parent = visited_[e.parent]; + CopyItem(parent.bucket, e.parent_slot, e.bucket, free_slot); + free_slot = e.parent_slot; + e = parent; + } + InsertInternal(k, v, e.bucket, free_slot); + return true; + } else { + if (e.depth < (kMaxBFSPathLen)) { + auto parent_index = visited_end; + visited_[visited_end] = e; + visited_end++; + // Don't always start with the same slot, to even out the path depth. + int start_slot = (k + e.bucket) % kSlotsPerBucket; + const Bucket& bref = buckets_[e.bucket]; + for (int i = 0; i < kSlotsPerBucket; i++) { + int slot = (start_slot + i) % kSlotsPerBucket; + uint64 next_bucket = alt_bucket(bref.keys[slot], e.bucket); + // Optimization: Avoid single-step cycles (from e, don't + // add a child node that is actually e's parent). + uint64 e_parent_bucket = visited_[e.parent].bucket; + if (next_bucket != e_parent_bucket) { + cpq_->push_back({next_bucket, e.depth + 1, parent_index, slot}); + } + } + } + } + } + + LOG(WARNING) << "Cuckoo path finding failed: Table too small?"; + return false; + } + + inline uint64 fast_mod_by_buckets(uint64 x) const { +// Omitting the optimized bucket mod for CUDA platforms +// until Eigen supports 2^63 divisors on GPU. +#if !defined(__GCUDACC__) && !defined(__GCUDACC_HOST__) + x &= ~(1ULL << 63); // Fast div can only handle 2^63-1 + return x - num_buckets_ * (x / buckets_divisor_); +#else + return x % num_buckets_; +#endif + } + + // Set upon initialization: num_entries / kLoadFactor / kSlotsPerBucket. + uint64 num_buckets_; + std::vector<Bucket> buckets_; + Eigen::internal::TensorIntDivisor<uint64> buckets_divisor_; // for fast mod + + const std::unique_ptr<CuckooPathQueue> cpq_; + CuckooPathEntry visited_[kVisitedListSize]; + + TF_DISALLOW_COPY_AND_ASSIGN(PresizedCuckooMap); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_UTIL_PRESIZED_CUCKOO_MAP_H_ |