1 //===--- OnDiskHashTable.h - On-Disk Hash Table Implementation --*- C++ -*-===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
11 /// \brief Defines facilities for reading and writing on-disk hash tables.
13 //===----------------------------------------------------------------------===//
14 #ifndef LLVM_SUPPORT_ON_DISK_HASH_TABLE_H
15 #define LLVM_SUPPORT_ON_DISK_HASH_TABLE_H
17 #include "llvm/Support/Allocator.h"
18 #include "llvm/Support/AlignOf.h"
19 #include "llvm/Support/DataTypes.h"
20 #include "llvm/Support/EndianStream.h"
21 #include "llvm/Support/Host.h"
22 #include "llvm/Support/MathExtras.h"
23 #include "llvm/Support/raw_ostream.h"
29 /// \brief Generates an on disk hash table.
31 /// This needs an \c Info that handles storing values into the hash table's
32 /// payload and computes the hash for a given key. This should provide the
33 /// following interface:
36 /// class ExampleInfo {
38 /// typedef ExampleKey key_type; // Must be copy constructible
39 /// typedef ExampleKey &key_type_ref;
40 /// typedef ExampleData data_type; // Must be copy constructible
41 /// typedef ExampleData &data_type_ref;
43 /// /// Calculate the hash for Key
44 /// static unsigned ComputeHash(key_type_ref Key);
45 /// /// Return the lengths, in bytes, of the given Key/Data pair.
46 /// static std::pair<unsigned, unsigned>
47 /// EmitKeyDataLength(raw_ostream &Out, key_type_ref Key, data_type_ref Data);
48 /// /// Write Key to Out. KeyLen is the length from EmitKeyDataLength.
49 /// static void EmitKey(raw_ostream &Out, key_type_ref Key, unsigned KeyLen);
50 /// /// Write Data to Out. DataLen is the length from EmitKeyDataLength.
51 /// static void EmitData(raw_ostream &Out, key_type_ref Key,
52 /// data_type_ref Data, unsigned DataLen);
55 template <typename Info> class OnDiskChainedHashTableGenerator {
58 llvm::BumpPtrAllocator BA;
60 /// \brief A single item in the hash table.
63 typename Info::key_type Key;
64 typename Info::data_type Data;
68 Item(typename Info::key_type_ref Key, typename Info::data_type_ref Data,
70 : Key(Key), Data(Data), Next(0), Hash(InfoObj.ComputeHash(Key)) {}
73 /// \brief A linked list of values in a particular hash bucket.
86 /// \brief Insert an item into the appropriate hash bucket.
87 void insert(Bucket *Buckets, size_t Size, Item *E) {
88 Bucket &B = Buckets[E->Hash & (Size - 1)];
94 /// \brief Resize the hash table, moving the old entries into the new buckets.
95 void resize(size_t NewSize) {
96 Bucket *NewBuckets = (Bucket *)std::calloc(NewSize, sizeof(Bucket));
97 // Populate NewBuckets with the old entries.
98 for (unsigned I = 0; I < NumBuckets; ++I)
99 for (Item *E = Buckets[I].Head; E;) {
102 insert(NewBuckets, NewSize, E);
107 NumBuckets = NewSize;
108 Buckets = NewBuckets;
112 /// \brief Insert an entry into the table.
113 void insert(typename Info::key_type_ref Key,
114 typename Info::data_type_ref Data) {
116 insert(Key, Data, InfoObj);
119 /// \brief Insert an entry into the table.
121 /// Uses the provided Info instead of a stack allocated one.
122 void insert(typename Info::key_type_ref Key,
123 typename Info::data_type_ref Data, Info &InfoObj) {
126 if (4 * NumEntries >= 3 * NumBuckets)
127 resize(NumBuckets * 2);
128 insert(Buckets, NumBuckets,
129 new (BA.Allocate<Item>()) Item(Key, Data, InfoObj));
132 /// \brief Emit the table to Out, which must not be at offset 0.
133 uint32_t Emit(raw_ostream &Out) {
135 return Emit(Out, InfoObj);
138 /// \brief Emit the table to Out, which must not be at offset 0.
140 /// Uses the provided Info instead of a stack allocated one.
141 uint32_t Emit(raw_ostream &Out, Info &InfoObj) {
142 using namespace llvm::support;
143 endian::Writer<little> LE(Out);
145 // Emit the payload of the table.
146 for (unsigned I = 0; I < NumBuckets; ++I) {
147 Bucket &B = Buckets[I];
151 // Store the offset for the data of this bucket.
153 assert(B.Off && "Cannot write a bucket at offset 0. Please add padding.");
155 // Write out the number of items in the bucket.
156 LE.write<uint16_t>(B.Length);
157 assert(B.Length != 0 && "Bucket has a head but zero length?");
159 // Write out the entries in the bucket.
160 for (Item *I = B.Head; I; I = I->Next) {
161 LE.write<uint32_t>(I->Hash);
162 const std::pair<unsigned, unsigned> &Len =
163 InfoObj.EmitKeyDataLength(Out, I->Key, I->Data);
164 InfoObj.EmitKey(Out, I->Key, Len.first);
165 InfoObj.EmitData(Out, I->Key, I->Data, Len.second);
169 // Pad with zeros so that we can start the hashtable at an aligned address.
170 uint32_t TableOff = Out.tell();
171 uint64_t N = llvm::OffsetToAlignment(TableOff, alignOf<uint32_t>());
174 LE.write<uint8_t>(0);
176 // Emit the hashtable itself.
177 LE.write<uint32_t>(NumBuckets);
178 LE.write<uint32_t>(NumEntries);
179 for (unsigned I = 0; I < NumBuckets; ++I)
180 LE.write<uint32_t>(Buckets[I].Off);
185 OnDiskChainedHashTableGenerator() {
188 // Note that we do not need to run the constructors of the individual
189 // Bucket objects since 'calloc' returns bytes that are all 0.
190 Buckets = (Bucket *)std::calloc(NumBuckets, sizeof(Bucket));
193 ~OnDiskChainedHashTableGenerator() { std::free(Buckets); }
196 /// \brief Provides lookup on an on disk hash table.
198 /// This needs an \c Info that handles reading values from the hash table's
199 /// payload and computes the hash for a given key. This should provide the
200 /// following interface:
203 /// class ExampleLookupInfo {
205 /// typedef ExampleData data_type;
206 /// typedef ExampleInternalKey internal_key_type; // The stored key type.
207 /// typedef ExampleKey external_key_type; // The type to pass to find().
209 /// /// Compare two keys for equality.
210 /// static bool EqualKey(internal_key_type &Key1, internal_key_type &Key2);
211 /// /// Calculate the hash for the given key.
212 /// static unsigned ComputeHash(internal_key_type &IKey);
213 /// /// Translate from the semantic type of a key in the hash table to the
214 /// /// type that is actually stored and used for hashing and comparisons.
215 /// /// The internal and external types are often the same, in which case this
216 /// /// can simply return the passed in value.
217 /// static const internal_key_type &GetInternalKey(external_key_type &EKey);
218 /// /// Read the key and data length from Buffer, leaving it pointing at the
219 /// /// following byte.
220 /// static std::pair<unsigned, unsigned>
221 /// ReadKeyDataLength(const unsigned char *&Buffer);
222 /// /// Read the key from Buffer, given the KeyLen as reported from
223 /// /// ReadKeyDataLength.
224 /// const internal_key_type &ReadKey(const unsigned char *Buffer,
225 /// unsigned KeyLen);
226 /// /// Read the data for Key from Buffer, given the DataLen as reported from
227 /// /// ReadKeyDataLength.
228 /// data_type ReadData(StringRef Key, const unsigned char *Buffer,
229 /// unsigned DataLen);
232 template <typename Info> class OnDiskChainedHashTable {
233 const unsigned NumBuckets;
234 const unsigned NumEntries;
235 const unsigned char *const Buckets;
236 const unsigned char *const Base;
240 typedef typename Info::internal_key_type internal_key_type;
241 typedef typename Info::external_key_type external_key_type;
242 typedef typename Info::data_type data_type;
244 OnDiskChainedHashTable(unsigned NumBuckets, unsigned NumEntries,
245 const unsigned char *Buckets,
246 const unsigned char *Base,
247 const Info &InfoObj = Info())
248 : NumBuckets(NumBuckets), NumEntries(NumEntries), Buckets(Buckets),
249 Base(Base), InfoObj(InfoObj) {
250 assert((reinterpret_cast<uintptr_t>(Buckets) & 0x3) == 0 &&
251 "'buckets' must have a 4-byte alignment");
254 unsigned getNumBuckets() const { return NumBuckets; }
255 unsigned getNumEntries() const { return NumEntries; }
256 const unsigned char *getBase() const { return Base; }
257 const unsigned char *getBuckets() const { return Buckets; }
259 bool isEmpty() const { return NumEntries == 0; }
262 internal_key_type Key;
263 const unsigned char *const Data;
268 iterator() : Data(0), Len(0) {}
269 iterator(const internal_key_type K, const unsigned char *D, unsigned L,
271 : Key(K), Data(D), Len(L), InfoObj(InfoObj) {}
273 data_type operator*() const { return InfoObj->ReadData(Key, Data, Len); }
274 bool operator==(const iterator &X) const { return X.Data == Data; }
275 bool operator!=(const iterator &X) const { return X.Data != Data; }
278 /// \brief Look up the stored data for a particular key.
279 iterator find(const external_key_type &EKey, Info *InfoPtr = 0) {
283 using namespace llvm::support;
284 const internal_key_type &IKey = InfoObj.GetInternalKey(EKey);
285 unsigned KeyHash = InfoObj.ComputeHash(IKey);
287 // Each bucket is just a 32-bit offset into the hash table file.
288 unsigned Idx = KeyHash & (NumBuckets - 1);
289 const unsigned char *Bucket = Buckets + sizeof(uint32_t) * Idx;
291 unsigned Offset = endian::readNext<uint32_t, little, aligned>(Bucket);
293 return iterator(); // Empty bucket.
294 const unsigned char *Items = Base + Offset;
296 // 'Items' starts with a 16-bit unsigned integer representing the
297 // number of items in this bucket.
298 unsigned Len = endian::readNext<uint16_t, little, unaligned>(Items);
300 for (unsigned i = 0; i < Len; ++i) {
302 uint32_t ItemHash = endian::readNext<uint32_t, little, unaligned>(Items);
304 // Determine the length of the key and the data.
305 const std::pair<unsigned, unsigned> &L = Info::ReadKeyDataLength(Items);
306 unsigned ItemLen = L.first + L.second;
308 // Compare the hashes. If they are not the same, skip the entry entirely.
309 if (ItemHash != KeyHash) {
315 const internal_key_type &X =
316 InfoPtr->ReadKey((const unsigned char *const)Items, L.first);
318 // If the key doesn't match just skip reading the value.
319 if (!InfoPtr->EqualKey(X, IKey)) {
325 return iterator(X, Items + L.first, L.second, InfoPtr);
331 iterator end() const { return iterator(); }
333 Info &getInfoObj() { return InfoObj; }
335 /// \brief Create the hash table.
337 /// \param Buckets is the beginning of the hash table itself, which follows
338 /// the payload of entire structure. This is the value returned by
339 /// OnDiskHashTableGenerator::Emit.
341 /// \param Base is the point from which all offsets into the structure are
342 /// based. This is offset 0 in the stream that was used when Emitting the
344 static OnDiskChainedHashTable *Create(const unsigned char *Buckets,
345 const unsigned char *const Base,
346 const Info &InfoObj = Info()) {
347 using namespace llvm::support;
348 assert(Buckets > Base);
349 assert((reinterpret_cast<uintptr_t>(Buckets) & 0x3) == 0 &&
350 "buckets should be 4-byte aligned.");
352 unsigned NumBuckets = endian::readNext<uint32_t, little, aligned>(Buckets);
353 unsigned NumEntries = endian::readNext<uint32_t, little, aligned>(Buckets);
354 return new OnDiskChainedHashTable<Info>(NumBuckets, NumEntries, Buckets,
359 /// \brief Provides lookup and iteration over an on disk hash table.
361 /// \copydetails llvm::OnDiskChainedHashTable
362 template <typename Info>
363 class OnDiskIterableChainedHashTable : public OnDiskChainedHashTable<Info> {
364 const unsigned char *Payload;
367 typedef OnDiskChainedHashTable<Info> base_type;
368 typedef typename base_type::internal_key_type internal_key_type;
369 typedef typename base_type::external_key_type external_key_type;
370 typedef typename base_type::data_type data_type;
372 OnDiskIterableChainedHashTable(unsigned NumBuckets, unsigned NumEntries,
373 const unsigned char *Buckets,
374 const unsigned char *Payload,
375 const unsigned char *Base,
376 const Info &InfoObj = Info())
377 : base_type(NumBuckets, NumEntries, Buckets, Base, InfoObj),
380 /// \brief Iterates over all of the keys in the table.
382 const unsigned char *Ptr;
383 unsigned NumItemsInBucketLeft;
384 unsigned NumEntriesLeft;
388 typedef external_key_type value_type;
390 key_iterator(const unsigned char *const Ptr, unsigned NumEntries,
392 : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries),
395 : Ptr(0), NumItemsInBucketLeft(0), NumEntriesLeft(0), InfoObj(0) {}
397 friend bool operator==(const key_iterator &X, const key_iterator &Y) {
398 return X.NumEntriesLeft == Y.NumEntriesLeft;
400 friend bool operator!=(const key_iterator &X, const key_iterator &Y) {
401 return X.NumEntriesLeft != Y.NumEntriesLeft;
404 key_iterator &operator++() { // Preincrement
405 using namespace llvm::support;
406 if (!NumItemsInBucketLeft) {
407 // 'Items' starts with a 16-bit unsigned integer representing the
408 // number of items in this bucket.
409 NumItemsInBucketLeft =
410 endian::readNext<uint16_t, little, unaligned>(Ptr);
412 Ptr += 4; // Skip the hash.
413 // Determine the length of the key and the data.
414 const std::pair<unsigned, unsigned> &L = Info::ReadKeyDataLength(Ptr);
415 Ptr += L.first + L.second;
416 assert(NumItemsInBucketLeft);
417 --NumItemsInBucketLeft;
418 assert(NumEntriesLeft);
422 key_iterator operator++(int) { // Postincrement
423 key_iterator tmp = *this; ++*this; return tmp;
426 value_type operator*() const {
427 const unsigned char *LocalPtr = Ptr;
428 if (!NumItemsInBucketLeft)
429 LocalPtr += 2; // number of items in bucket
430 LocalPtr += 4; // Skip the hash.
432 // Determine the length of the key and the data.
433 const std::pair<unsigned, unsigned> &L =
434 Info::ReadKeyDataLength(LocalPtr);
437 const internal_key_type &Key = InfoObj->ReadKey(LocalPtr, L.first);
438 return InfoObj->GetExternalKey(Key);
442 key_iterator key_begin() {
443 return key_iterator(Payload, this->getNumEntries(), &this->getInfoObj());
445 key_iterator key_end() { return key_iterator(); }
447 iterator_range<key_iterator> keys() {
448 return make_range(key_begin(), key_end());
451 /// \brief Iterates over all the entries in the table, returning the data.
452 class data_iterator {
453 const unsigned char *Ptr;
454 unsigned NumItemsInBucketLeft;
455 unsigned NumEntriesLeft;
459 typedef data_type value_type;
461 data_iterator(const unsigned char *const Ptr, unsigned NumEntries,
463 : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries),
466 : Ptr(0), NumItemsInBucketLeft(0), NumEntriesLeft(0), InfoObj(0) {}
468 bool operator==(const data_iterator &X) const {
469 return X.NumEntriesLeft == NumEntriesLeft;
471 bool operator!=(const data_iterator &X) const {
472 return X.NumEntriesLeft != NumEntriesLeft;
475 data_iterator &operator++() { // Preincrement
476 using namespace llvm::support;
477 if (!NumItemsInBucketLeft) {
478 // 'Items' starts with a 16-bit unsigned integer representing the
479 // number of items in this bucket.
480 NumItemsInBucketLeft =
481 endian::readNext<uint16_t, little, unaligned>(Ptr);
483 Ptr += 4; // Skip the hash.
484 // Determine the length of the key and the data.
485 const std::pair<unsigned, unsigned> &L = Info::ReadKeyDataLength(Ptr);
486 Ptr += L.first + L.second;
487 assert(NumItemsInBucketLeft);
488 --NumItemsInBucketLeft;
489 assert(NumEntriesLeft);
493 data_iterator operator++(int) { // Postincrement
494 data_iterator tmp = *this; ++*this; return tmp;
497 value_type operator*() const {
498 const unsigned char *LocalPtr = Ptr;
499 if (!NumItemsInBucketLeft)
500 LocalPtr += 2; // number of items in bucket
501 LocalPtr += 4; // Skip the hash.
503 // Determine the length of the key and the data.
504 const std::pair<unsigned, unsigned> &L =
505 Info::ReadKeyDataLength(LocalPtr);
508 const internal_key_type &Key = InfoObj->ReadKey(LocalPtr, L.first);
509 return InfoObj->ReadData(Key, LocalPtr + L.first, L.second);
513 data_iterator data_begin() {
514 return data_iterator(Payload, this->getNumEntries(), &this->getInfoObj());
516 data_iterator data_end() { return data_iterator(); }
518 iterator_range<data_iterator> data() {
519 return make_range(data_begin(), data_end());
522 /// \brief Create the hash table.
524 /// \param Buckets is the beginning of the hash table itself, which follows
525 /// the payload of entire structure. This is the value returned by
526 /// OnDiskHashTableGenerator::Emit.
528 /// \param Payload is the beginning of the data contained in the table. This
529 /// is Base plus any padding or header data that was stored, ie, the offset
530 /// that the stream was at when calling Emit.
532 /// \param Base is the point from which all offsets into the structure are
533 /// based. This is offset 0 in the stream that was used when Emitting the
535 static OnDiskIterableChainedHashTable *
536 Create(const unsigned char *Buckets, const unsigned char *const Payload,
537 const unsigned char *const Base, const Info &InfoObj = Info()) {
538 using namespace llvm::support;
539 assert(Buckets > Base);
540 assert((reinterpret_cast<uintptr_t>(Buckets) & 0x3) == 0 &&
541 "buckets should be 4-byte aligned.");
543 unsigned NumBuckets = endian::readNext<uint32_t, little, aligned>(Buckets);
544 unsigned NumEntries = endian::readNext<uint32_t, little, aligned>(Buckets);
545 return new OnDiskIterableChainedHashTable<Info>(
546 NumBuckets, NumEntries, Buckets, Payload, Base, InfoObj);
550 } // end namespace llvm
552 #endif // LLVM_SUPPORT_ON_DISK_HASH_TABLE_H