diff options
author | 2016-08-11 17:53:31 -0800 | |
---|---|---|
committer | 2016-08-11 19:03:16 -0700 | |
commit | fc218ef4d53e697c1a97d90a57a7437255575f89 (patch) | |
tree | 375c793f7d9169fc4320b665dff2e80adc7f0d0a /tensorflow | |
parent | 71ee173c80679289e5995d540c6199ef3747cb3a (diff) |
Implement PresizedCuckooMap. Original design and implementation by
dga@ (Dave Andersen). PresizedCuckooMap is a fast map implementation
using Cuckoo hashing. See header for details on when to use this.
Change: 130058678
Diffstat (limited to 'tensorflow')
-rw-r--r-- | tensorflow/core/BUILD | 2 | ||||
-rw-r--r-- | tensorflow/core/util/presized_cuckoo_map.h | 328 | ||||
-rw-r--r-- | tensorflow/core/util/presized_cuckoo_map_test.cc | 156 |
3 files changed, 486 insertions, 0 deletions
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 0d8891faa9..32fb8762aa 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -950,6 +950,7 @@ tf_cuda_library( "framework/tracking_allocator.h", # only needed for tests "framework/unique_tensor_references.h", "util/command_line_flags.h", + "util/presized_cuckoo_map.h", "util/tensor_slice_set.h", "util/tensor_slice_util.h", ], @@ -1452,6 +1453,7 @@ tf_cc_tests( "util/events_writer_test.cc", "util/example_proto_helper_test.cc", "util/memmapped_file_system_test.cc", + "util/presized_cuckoo_map_test.cc", "util/reporter_test.cc", "util/saved_tensor_slice_util_test.cc", "util/sparse/sparse_tensor_test.cc", diff --git a/tensorflow/core/util/presized_cuckoo_map.h b/tensorflow/core/util/presized_cuckoo_map.h new file mode 100644 index 0000000000..b488d32e03 --- /dev/null +++ b/tensorflow/core/util/presized_cuckoo_map.h @@ -0,0 +1,328 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_UTIL_PRESIZED_CUCKOO_MAP_H_ +#define TENSORFLOW_UTIL_PRESIZED_CUCKOO_MAP_H_ + +#include <algorithm> +#include <vector> +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { + +// Class for efficiently storing key->value mappings when the size is +// known in advance and the keys are pre-hashed into uint64s. +// Keys should have "good enough" randomness (be spread across the +// entire 64 bit space). +// +// Important: Clients wishing to use deterministic keys must +// ensure that their keys fall in the range 0 .. (uint64max-1); +// the table uses 2^64-1 as the "not occupied" flag. +// +// Inserted keys must be unique, and there are no update +// or delete functions (until some subsequent use of this table +// requires them). +// +// Threads must synchronize their access to a PresizedCuckooMap. +// +// The cuckoo hash table is 4-way associative (each "bucket" has 4 +// "slots" for key/value entries). Uses breadth-first-search to find +// a good cuckoo path with less data movement (see +// http://www.cs.cmu.edu/~dga/papers/cuckoo-eurosys14.pdf ) + +template <class value> +class PresizedCuckooMap { + public: + // The key type is fixed as a pre-hashed key for this specialized use. + typedef uint64 key_type; + + explicit PresizedCuckooMap(uint64 num_entries) : cpq_(new CuckooPathQueue) { + double n(num_entries); + n /= kLoadFactor; + num_buckets_ = (static_cast<uint64>(n) / kSlotsPerBucket); + // Very small cuckoo tables don't work, because the probability + // of having same-bucket hashes is large. We compromise for those + // uses by having a larger static starting size. + num_buckets_ += 32; + Bucket empty_bucket; + for (int i = 0; i < kSlotsPerBucket; i++) { + empty_bucket.keys[i] = kUnusedSlot; + } + buckets_.resize(num_buckets_, empty_bucket); +#if !defined(__GCUDACC__) && !defined(__GCUDACC_HOST__) + buckets_divisor_ = Eigen::internal::TensorIntDivisor<uint64>(num_buckets_); +#endif + } + + // Returns false if k is already in table or if the table + // is full; true otherwise. + bool InsertUnique(const key_type k, const value& v) { + uint64 tk = key_transform(k); + uint64 b1 = fast_mod_by_buckets(tk); + uint64 b2 = fast_mod_by_buckets(h2(tk)); + + // Merged find and duplicate checking. + uint64 target_bucket = 0; + int target_slot = kNoSpace; + + for (auto bucket : {b1, b2}) { + Bucket* bptr = &buckets_[bucket]; + for (int slot = 0; slot < kSlotsPerBucket; slot++) { + if (bptr->keys[slot] == k) { // Duplicates are not allowed. + return false; + } else if (target_slot == kNoSpace && bptr->keys[slot] == kUnusedSlot) { + target_bucket = bucket; + target_slot = slot; + } + } + } + + if (target_slot != kNoSpace) { + InsertInternal(tk, v, target_bucket, target_slot); + return true; + } + + return CuckooInsert(tk, v, b1, b2); + } + + // Returns true if found. Sets *out = value. + bool Find(const key_type k, value* out) const { + uint64 tk = key_transform(k); + return FindInBucket(k, fast_mod_by_buckets(tk), out) || + FindInBucket(k, fast_mod_by_buckets(h2(tk)), out); + } + + private: + static constexpr int kSlotsPerBucket = 4; + + // The load factor is chosen slightly conservatively for speed and + // to avoid the need for a table rebuild on insertion failure. + // 0.94 is achievable, but 0.85 is faster and keeps the code simple + // at the cost of a small amount of memory. + // NOTE: 0 < kLoadFactor <= 1.0 + static constexpr double kLoadFactor = 0.85; + + // Cuckoo insert: The maximum number of entries to scan should be ~400 + // (Source: Personal communication with Michael Mitzenmacher; empirical + // experiments validate.). After trying 400 candidate locations, declare + // the table full - it's probably full of unresolvable cycles. Less than + // 400 reduces max occupancy; much more results in very poor performance + // around the full point. For (2,4) a max BFS path len of 5 results in ~682 + // nodes to visit, calculated below, and is a good value. + + static constexpr uint8 kMaxBFSPathLen = 5; + + // Constants for BFS cuckoo path search: + // The visited list must be maintained for all but the last level of search + // in order to trace back the path. The BFS search has two roots + // and each can go to a total depth (including the root) of 5. + // The queue must be sized for 2 * \sum_{k=0...4}{kSlotsPerBucket^k} = 682. + // The visited queue, however, does not need to hold the deepest level, + // and so it is sized 2 * \sum{k=0...3}{kSlotsPerBucket^k} = 170 + static constexpr int kMaxQueueSize = 682; + static constexpr int kVisitedListSize = 170; + + static constexpr int kNoSpace = -1; // SpaceAvailable return + static constexpr uint64 kUnusedSlot = ~(0ULL); + + // Buckets are organized with key_types clustered for access speed + // and for compactness while remaining aligned. + struct Bucket { + key_type keys[kSlotsPerBucket]; + value values[kSlotsPerBucket]; + }; + + // Insert uses the BFS optimization (search before moving) to reduce + // the number of cache lines dirtied during search. + + struct CuckooPathEntry { + uint64 bucket; + int depth; + int parent; // To index in the visited array. + int parent_slot; // Which slot in our parent did we come from? -1 == root. + }; + + // CuckooPathQueue is a trivial circular queue for path entries. + // The caller is responsible for not inserting more than kMaxQueueSize + // entries. Each PresizedCuckooMap has one (heap-allocated) CuckooPathQueue + // that it reuses across inserts. + class CuckooPathQueue { + public: + CuckooPathQueue() : head_(0), tail_(0) {} + + void push_back(CuckooPathEntry e) { + queue_[tail_] = e; + tail_ = (tail_ + 1) % kMaxQueueSize; + } + + CuckooPathEntry pop_front() { + CuckooPathEntry& e = queue_[head_]; + head_ = (head_ + 1) % kMaxQueueSize; + return e; + } + + bool empty() { return head_ == tail_; } + + bool full() { return ((tail_ + 1) % kMaxQueueSize) == head_; } + + void reset() { head_ = tail_ = 0; } + + private: + CuckooPathEntry queue_[kMaxQueueSize]; + int head_; + int tail_; + }; + + typedef std::array<CuckooPathEntry, kMaxBFSPathLen> CuckooPath; + + // Callers are expected to have pre-hashed the keys into a uint64 + // and are expected to be able to handle (a very low rate) of + // collisions, OR must ensure that their keys are always in + // the range 0 - (uint64max - 1). This transforms 'not found flag' + // keys into something else. + inline uint64 key_transform(const key_type k) const { + return k + (k == kUnusedSlot); + } + + // h2 performs a very quick mix of h to generate the second bucket hash. + // Assumes there is plenty of remaining entropy in the initial h. + inline uint64 h2(uint64 h) const { + const uint64 m = 0xc6a4a7935bd1e995; + return m * ((h >> 32) | (h << 32)); + } + + // alt_bucket identifies the "other" bucket for key k, whether + // other is "the one that isn't bucket b" + inline uint64 alt_bucket(key_type k, uint64 b) const { + if (fast_mod_by_buckets(k) != b) { + return fast_mod_by_buckets(k); + } + return fast_mod_by_buckets(h2(k)); + } + + inline void InsertInternal(key_type k, const value& v, uint64 b, int slot) { + Bucket* bptr = &buckets_[b]; + bptr->keys[slot] = k; + bptr->values[slot] = v; + } + + // For the associative cuckoo table, check all of the slots in + // the bucket to see if the key is present. + bool FindInBucket(key_type k, uint64 b, value* out) const { + const Bucket& bref = buckets_[b]; + for (int i = 0; i < kSlotsPerBucket; i++) { + if (bref.keys[i] == k) { + *out = bref.values[i]; + return true; + } + } + return false; + } + + // returns either kNoSpace or the index of an + // available slot (0 <= slot < kSlotsPerBucket) + inline int SpaceAvailable(uint64 bucket) const { + const Bucket& bref = buckets_[bucket]; + for (int i = 0; i < kSlotsPerBucket; i++) { + if (bref.keys[i] == kUnusedSlot) { + return i; + } + } + return kNoSpace; + } + + inline void CopyItem(uint64 src_bucket, int src_slot, uint64 dst_bucket, + int dst_slot) { + Bucket& src_ref = buckets_[src_bucket]; + Bucket& dst_ref = buckets_[dst_bucket]; + dst_ref.keys[dst_slot] = src_ref.keys[src_slot]; + dst_ref.values[dst_slot] = src_ref.values[src_slot]; + } + + bool CuckooInsert(key_type k, const value& v, uint64 b1, uint64 b2) { + int visited_end = 0; + cpq_->reset(); + + cpq_->push_back({b1, 1, 0, 0}); // Note depth starts at 1. + cpq_->push_back({b2, 1, 0, 0}); + + while (!cpq_->empty()) { + CuckooPathEntry e = cpq_->pop_front(); + int free_slot; + free_slot = SpaceAvailable(e.bucket); + if (free_slot != kNoSpace) { + while (e.depth > 1) { + // "copy" instead of "swap" because one entry is always zero. + // After, write target key/value over top of last copied entry. + CuckooPathEntry parent = visited_[e.parent]; + CopyItem(parent.bucket, e.parent_slot, e.bucket, free_slot); + free_slot = e.parent_slot; + e = parent; + } + InsertInternal(k, v, e.bucket, free_slot); + return true; + } else { + if (e.depth < (kMaxBFSPathLen)) { + auto parent_index = visited_end; + visited_[visited_end] = e; + visited_end++; + // Don't always start with the same slot, to even out the path depth. + int start_slot = (k + e.bucket) % kSlotsPerBucket; + const Bucket& bref = buckets_[e.bucket]; + for (int i = 0; i < kSlotsPerBucket; i++) { + int slot = (start_slot + i) % kSlotsPerBucket; + uint64 next_bucket = alt_bucket(bref.keys[slot], e.bucket); + // Optimization: Avoid single-step cycles (from e, don't + // add a child node that is actually e's parent). + uint64 e_parent_bucket = visited_[e.parent].bucket; + if (next_bucket != e_parent_bucket) { + cpq_->push_back({next_bucket, e.depth + 1, parent_index, slot}); + } + } + } + } + } + + LOG(WARNING) << "Cuckoo path finding failed: Table too small?"; + return false; + } + + inline uint64 fast_mod_by_buckets(uint64 x) const { +// Omitting the optimized bucket mod for CUDA platforms +// until Eigen supports 2^63 divisors on GPU. +#if !defined(__GCUDACC__) && !defined(__GCUDACC_HOST__) + x &= ~(1ULL << 63); // Fast div can only handle 2^63-1 + return x - num_buckets_ * (x / buckets_divisor_); +#else + return x % num_buckets_; +#endif + } + + // Set upon initialization: num_entries / kLoadFactor / kSlotsPerBucket. + uint64 num_buckets_; + std::vector<Bucket> buckets_; + Eigen::internal::TensorIntDivisor<uint64> buckets_divisor_; // for fast mod + + const std::unique_ptr<CuckooPathQueue> cpq_; + CuckooPathEntry visited_[kVisitedListSize]; + + TF_DISALLOW_COPY_AND_ASSIGN(PresizedCuckooMap); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_UTIL_PRESIZED_CUCKOO_MAP_H_ diff --git a/tensorflow/core/util/presized_cuckoo_map_test.cc b/tensorflow/core/util/presized_cuckoo_map_test.cc new file mode 100644 index 0000000000..64ad315518 --- /dev/null +++ b/tensorflow/core/util/presized_cuckoo_map_test.cc @@ -0,0 +1,156 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/fingerprint.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/util/presized_cuckoo_map.h" + +namespace tensorflow { +namespace { + +TEST(PresizedCuckooMapTest, Basic) { + PresizedCuckooMap<int> pscm(1000); + EXPECT_TRUE(pscm.InsertUnique(1, 2)); + int out; + EXPECT_TRUE(pscm.Find(1, &out)); + EXPECT_EQ(out, 2); +} + +TEST(PresizedCuckooMapTest, TooManyItems) { + static constexpr int kTableSize = 1000; + PresizedCuckooMap<int> pscm(kTableSize); + for (uint64 i = 0; i < kTableSize; i++) { + EXPECT_TRUE(pscm.InsertUnique(i, i)); + } + // Try to over-fill the table. A few of these + // inserts will succeed, but should start failing. + uint64 failed_at = 0; + for (uint64 i = kTableSize; i < (2 * kTableSize); i++) { + if (!pscm.InsertUnique(i, i)) { + failed_at = i; + break; + } + } + // Requirement 1: Table must return failure when it's full. + EXPECT_NE(failed_at, 0); + // Requirement 2: Table must preserve all items inserted prior + // to the failure. + for (uint64 i = 0; i < failed_at; i++) { + int out; + EXPECT_TRUE(pscm.Find(i, &out)); + EXPECT_EQ(out, i); + } +} + +TEST(PresizedCuckooMapTest, ZeroSizeMap) { + PresizedCuckooMap<int> pscm(0); + int out; + for (uint64 i = 0; i < 100; i++) { + EXPECT_FALSE(pscm.Find(i, &out)); + } +} + +void RunFill(int64 table_size) { + PresizedCuckooMap<int> pscm(table_size); + for (int64 i = 0; i < table_size; i++) { + uint64 key = + Fingerprint64(string(reinterpret_cast<char *>(&i), sizeof(int64))); + EXPECT_TRUE(pscm.InsertUnique(key, i)); + } + for (int64 i = 0; i < table_size; i++) { + uint64 key = + Fingerprint64(string(reinterpret_cast<char *>(&i), sizeof(int64))); + int out; + EXPECT_TRUE(pscm.Find(key, &out)); + EXPECT_EQ(out, i); + } +} + +TEST(PresizedCuckooMapTest, Fill) { + for (int64 table_size = 10; table_size <= 5000000; table_size *= 71) { + RunFill(table_size); + } +} + +TEST(PresizedCuckooMapTest, Duplicates) { + static constexpr int kSmallTableSize = 1000; + PresizedCuckooMap<int> pscm(kSmallTableSize); + + for (uint64 i = 0; i < kSmallTableSize; i++) { + uint64 key = + Fingerprint64(string(reinterpret_cast<char *>(&i), sizeof(uint64))); + EXPECT_TRUE(pscm.InsertUnique(key, i)); + } + + for (uint64 i = 0; i < kSmallTableSize; i++) { + uint64 key = + Fingerprint64(string(reinterpret_cast<char *>(&i), sizeof(uint64))); + EXPECT_FALSE(pscm.InsertUnique(key, i)); + } +} + +static void CalculateKeys(uint64 num, std::vector<uint64> *dst) { + dst->resize(num); + for (uint64 i = 0; i < num; i++) { + uint64 key = + Fingerprint64(string(reinterpret_cast<char *>(&i), sizeof(uint64))); + dst->at(i) = key; + } +} + +static void BM_CuckooFill(int iters, int arg) { + uint64 table_size = arg; + testing::StopTiming(); + std::vector<uint64> calculated_keys; + CalculateKeys(table_size, &calculated_keys); + testing::StartTiming(); + for (int iter = 0; iter < iters; iter++) { + PresizedCuckooMap<int> pscm(table_size); + for (uint64 i = 0; i < table_size; i++) { + pscm.InsertUnique(calculated_keys[i], i); + } + } +} + +BENCHMARK(BM_CuckooFill)->Arg(1000)->Arg(10000000); + +static void BM_CuckooRead(int iters, int arg) { + uint64 table_size = arg; + testing::StopTiming(); + std::vector<uint64> calculated_keys; + CalculateKeys(table_size, &calculated_keys); + PresizedCuckooMap<int> pscm(table_size); + for (uint64 i = 0; i < table_size; i++) { + pscm.InsertUnique(calculated_keys[i], i); + } + testing::StartTiming(); + uint64_t defeat_optimization = 0; + for (int i = 0; i < iters; i++) { + uint64 key_index = i % table_size; // May slow down bench! + int out = 0; + pscm.Find(calculated_keys[key_index], &out); + defeat_optimization += out; + } + if (defeat_optimization == 0) { + printf("Preventing the compiler from eliding the inner loop\n"); + } +} + +BENCHMARK(BM_CuckooRead)->Arg(1000)->Arg(10000000); + +} // namespace +} // namespace tensorflow |