diff options
author | Eugene Brevdo <ebrevdo@google.com> | 2016-08-11 17:53:31 -0800 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2016-08-11 19:03:16 -0700 |
commit | fc218ef4d53e697c1a97d90a57a7437255575f89 (patch) | |
tree | 375c793f7d9169fc4320b665dff2e80adc7f0d0a /tensorflow/core/util/presized_cuckoo_map.h | |
parent | 71ee173c80679289e5995d540c6199ef3747cb3a (diff) |
Implement PresizedCuckooMap. Original design and implementation by
dga@ (Dave Andersen). PresizedCuckooMap is a fast map implementation
using Cuckoo hashing. See header for details on when to use this.
Change: 130058678
Diffstat (limited to 'tensorflow/core/util/presized_cuckoo_map.h')
-rw-r--r-- | tensorflow/core/util/presized_cuckoo_map.h | 328 |
1 file changed, 328 insertions, 0 deletions
diff --git a/tensorflow/core/util/presized_cuckoo_map.h b/tensorflow/core/util/presized_cuckoo_map.h new file mode 100644 index 0000000000..b488d32e03 --- /dev/null +++ b/tensorflow/core/util/presized_cuckoo_map.h @@ -0,0 +1,328 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_UTIL_PRESIZED_CUCKOO_MAP_H_ +#define TENSORFLOW_UTIL_PRESIZED_CUCKOO_MAP_H_ + +#include <algorithm> +#include <vector> +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { + +// Class for efficiently storing key->value mappings when the size is +// known in advance and the keys are pre-hashed into uint64s. +// Keys should have "good enough" randomness (be spread across the +// entire 64 bit space). +// +// Important: Clients wishing to use deterministic keys must +// ensure that their keys fall in the range 0 .. (uint64max-1); +// the table uses 2^64-1 as the "not occupied" flag. +// +// Inserted keys must be unique, and there are no update +// or delete functions (until some subsequent use of this table +// requires them). +// +// Threads must synchronize their access to a PresizedCuckooMap. +// +// The cuckoo hash table is 4-way associative (each "bucket" has 4 +// "slots" for key/value entries). 
Uses breadth-first-search to find +// a good cuckoo path with less data movement (see +// http://www.cs.cmu.edu/~dga/papers/cuckoo-eurosys14.pdf ) + +template <class value> +class PresizedCuckooMap { + public: + // The key type is fixed as a pre-hashed key for this specialized use. + typedef uint64 key_type; + + explicit PresizedCuckooMap(uint64 num_entries) : cpq_(new CuckooPathQueue) { + double n(num_entries); + n /= kLoadFactor; + num_buckets_ = (static_cast<uint64>(n) / kSlotsPerBucket); + // Very small cuckoo tables don't work, because the probability + // of having same-bucket hashes is large. We compromise for those + // uses by having a larger static starting size. + num_buckets_ += 32; + Bucket empty_bucket; + for (int i = 0; i < kSlotsPerBucket; i++) { + empty_bucket.keys[i] = kUnusedSlot; + } + buckets_.resize(num_buckets_, empty_bucket); +#if !defined(__GCUDACC__) && !defined(__GCUDACC_HOST__) + buckets_divisor_ = Eigen::internal::TensorIntDivisor<uint64>(num_buckets_); +#endif + } + + // Returns false if k is already in table or if the table + // is full; true otherwise. + bool InsertUnique(const key_type k, const value& v) { + uint64 tk = key_transform(k); + uint64 b1 = fast_mod_by_buckets(tk); + uint64 b2 = fast_mod_by_buckets(h2(tk)); + + // Merged find and duplicate checking. + uint64 target_bucket = 0; + int target_slot = kNoSpace; + + for (auto bucket : {b1, b2}) { + Bucket* bptr = &buckets_[bucket]; + for (int slot = 0; slot < kSlotsPerBucket; slot++) { + if (bptr->keys[slot] == k) { // Duplicates are not allowed. + return false; + } else if (target_slot == kNoSpace && bptr->keys[slot] == kUnusedSlot) { + target_bucket = bucket; + target_slot = slot; + } + } + } + + if (target_slot != kNoSpace) { + InsertInternal(tk, v, target_bucket, target_slot); + return true; + } + + return CuckooInsert(tk, v, b1, b2); + } + + // Returns true if found. Sets *out = value. 
+ bool Find(const key_type k, value* out) const { + uint64 tk = key_transform(k); + return FindInBucket(k, fast_mod_by_buckets(tk), out) || + FindInBucket(k, fast_mod_by_buckets(h2(tk)), out); + } + + private: + static constexpr int kSlotsPerBucket = 4; + + // The load factor is chosen slightly conservatively for speed and + // to avoid the need for a table rebuild on insertion failure. + // 0.94 is achievable, but 0.85 is faster and keeps the code simple + // at the cost of a small amount of memory. + // NOTE: 0 < kLoadFactor <= 1.0 + static constexpr double kLoadFactor = 0.85; + + // Cuckoo insert: The maximum number of entries to scan should be ~400 + // (Source: Personal communication with Michael Mitzenmacher; empirical + // experiments validate.). After trying 400 candidate locations, declare + // the table full - it's probably full of unresolvable cycles. Less than + // 400 reduces max occupancy; much more results in very poor performance + // around the full point. For (2,4) a max BFS path len of 5 results in ~682 + // nodes to visit, calculated below, and is a good value. + + static constexpr uint8 kMaxBFSPathLen = 5; + + // Constants for BFS cuckoo path search: + // The visited list must be maintained for all but the last level of search + // in order to trace back the path. The BFS search has two roots + // and each can go to a total depth (including the root) of 5. + // The queue must be sized for 2 * \sum_{k=0...4}{kSlotsPerBucket^k} = 682. + // The visited queue, however, does not need to hold the deepest level, + // and so it is sized 2 * \sum{k=0...3}{kSlotsPerBucket^k} = 170 + static constexpr int kMaxQueueSize = 682; + static constexpr int kVisitedListSize = 170; + + static constexpr int kNoSpace = -1; // SpaceAvailable return + static constexpr uint64 kUnusedSlot = ~(0ULL); + + // Buckets are organized with key_types clustered for access speed + // and for compactness while remaining aligned. 
+ struct Bucket { + key_type keys[kSlotsPerBucket]; + value values[kSlotsPerBucket]; + }; + + // Insert uses the BFS optimization (search before moving) to reduce + // the number of cache lines dirtied during search. + + struct CuckooPathEntry { + uint64 bucket; + int depth; + int parent; // To index in the visited array. + int parent_slot; // Which slot in our parent did we come from? -1 == root. + }; + + // CuckooPathQueue is a trivial circular queue for path entries. + // The caller is responsible for not inserting more than kMaxQueueSize + // entries. Each PresizedCuckooMap has one (heap-allocated) CuckooPathQueue + // that it reuses across inserts. + class CuckooPathQueue { + public: + CuckooPathQueue() : head_(0), tail_(0) {} + + void push_back(CuckooPathEntry e) { + queue_[tail_] = e; + tail_ = (tail_ + 1) % kMaxQueueSize; + } + + CuckooPathEntry pop_front() { + CuckooPathEntry& e = queue_[head_]; + head_ = (head_ + 1) % kMaxQueueSize; + return e; + } + + bool empty() { return head_ == tail_; } + + bool full() { return ((tail_ + 1) % kMaxQueueSize) == head_; } + + void reset() { head_ = tail_ = 0; } + + private: + CuckooPathEntry queue_[kMaxQueueSize]; + int head_; + int tail_; + }; + + typedef std::array<CuckooPathEntry, kMaxBFSPathLen> CuckooPath; + + // Callers are expected to have pre-hashed the keys into a uint64 + // and are expected to be able to handle (a very low rate) of + // collisions, OR must ensure that their keys are always in + // the range 0 - (uint64max - 1). This transforms 'not found flag' + // keys into something else. + inline uint64 key_transform(const key_type k) const { + return k + (k == kUnusedSlot); + } + + // h2 performs a very quick mix of h to generate the second bucket hash. + // Assumes there is plenty of remaining entropy in the initial h. 
+ inline uint64 h2(uint64 h) const { + const uint64 m = 0xc6a4a7935bd1e995; + return m * ((h >> 32) | (h << 32)); + } + + // alt_bucket identifies the "other" bucket for key k, whether + // other is "the one that isn't bucket b" + inline uint64 alt_bucket(key_type k, uint64 b) const { + if (fast_mod_by_buckets(k) != b) { + return fast_mod_by_buckets(k); + } + return fast_mod_by_buckets(h2(k)); + } + + inline void InsertInternal(key_type k, const value& v, uint64 b, int slot) { + Bucket* bptr = &buckets_[b]; + bptr->keys[slot] = k; + bptr->values[slot] = v; + } + + // For the associative cuckoo table, check all of the slots in + // the bucket to see if the key is present. + bool FindInBucket(key_type k, uint64 b, value* out) const { + const Bucket& bref = buckets_[b]; + for (int i = 0; i < kSlotsPerBucket; i++) { + if (bref.keys[i] == k) { + *out = bref.values[i]; + return true; + } + } + return false; + } + + // returns either kNoSpace or the index of an + // available slot (0 <= slot < kSlotsPerBucket) + inline int SpaceAvailable(uint64 bucket) const { + const Bucket& bref = buckets_[bucket]; + for (int i = 0; i < kSlotsPerBucket; i++) { + if (bref.keys[i] == kUnusedSlot) { + return i; + } + } + return kNoSpace; + } + + inline void CopyItem(uint64 src_bucket, int src_slot, uint64 dst_bucket, + int dst_slot) { + Bucket& src_ref = buckets_[src_bucket]; + Bucket& dst_ref = buckets_[dst_bucket]; + dst_ref.keys[dst_slot] = src_ref.keys[src_slot]; + dst_ref.values[dst_slot] = src_ref.values[src_slot]; + } + + bool CuckooInsert(key_type k, const value& v, uint64 b1, uint64 b2) { + int visited_end = 0; + cpq_->reset(); + + cpq_->push_back({b1, 1, 0, 0}); // Note depth starts at 1. + cpq_->push_back({b2, 1, 0, 0}); + + while (!cpq_->empty()) { + CuckooPathEntry e = cpq_->pop_front(); + int free_slot; + free_slot = SpaceAvailable(e.bucket); + if (free_slot != kNoSpace) { + while (e.depth > 1) { + // "copy" instead of "swap" because one entry is always zero. 
+ // After, write target key/value over top of last copied entry. + CuckooPathEntry parent = visited_[e.parent]; + CopyItem(parent.bucket, e.parent_slot, e.bucket, free_slot); + free_slot = e.parent_slot; + e = parent; + } + InsertInternal(k, v, e.bucket, free_slot); + return true; + } else { + if (e.depth < (kMaxBFSPathLen)) { + auto parent_index = visited_end; + visited_[visited_end] = e; + visited_end++; + // Don't always start with the same slot, to even out the path depth. + int start_slot = (k + e.bucket) % kSlotsPerBucket; + const Bucket& bref = buckets_[e.bucket]; + for (int i = 0; i < kSlotsPerBucket; i++) { + int slot = (start_slot + i) % kSlotsPerBucket; + uint64 next_bucket = alt_bucket(bref.keys[slot], e.bucket); + // Optimization: Avoid single-step cycles (from e, don't + // add a child node that is actually e's parent). + uint64 e_parent_bucket = visited_[e.parent].bucket; + if (next_bucket != e_parent_bucket) { + cpq_->push_back({next_bucket, e.depth + 1, parent_index, slot}); + } + } + } + } + } + + LOG(WARNING) << "Cuckoo path finding failed: Table too small?"; + return false; + } + + inline uint64 fast_mod_by_buckets(uint64 x) const { +// Omitting the optimized bucket mod for CUDA platforms +// until Eigen supports 2^63 divisors on GPU. +#if !defined(__GCUDACC__) && !defined(__GCUDACC_HOST__) + x &= ~(1ULL << 63); // Fast div can only handle 2^63-1 + return x - num_buckets_ * (x / buckets_divisor_); +#else + return x % num_buckets_; +#endif + } + + // Set upon initialization: num_entries / kLoadFactor / kSlotsPerBucket. + uint64 num_buckets_; + std::vector<Bucket> buckets_; + Eigen::internal::TensorIntDivisor<uint64> buckets_divisor_; // for fast mod + + const std::unique_ptr<CuckooPathQueue> cpq_; + CuckooPathEntry visited_[kVisitedListSize]; + + TF_DISALLOW_COPY_AND_ASSIGN(PresizedCuckooMap); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_UTIL_PRESIZED_CUCKOO_MAP_H_ |