1 /*------------------------------------------------------------------------
2 Junction: Concurrent data structures in C++
3 Copyright (c) 2016 Jeff Preshing
5 Distributed under the Simplified BSD License.
6 Original location: https://github.com/preshing/junction
8 This software is distributed WITHOUT ANY WARRANTY; without even the
9 implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
10 See the LICENSE file for more information.
11 ------------------------------------------------------------------------*/
13 #ifndef JUNCTION_DETAILS_GRAMPA_H
14 #define JUNCTION_DETAILS_GRAMPA_H
16 #include <junction/Core.h>
17 #include <turf/Atomic.h>
18 #include <junction/striped/Mutex.h>
19 #include <junction/striped/ManualResetEvent.h>
20 #include <turf/Util.h>
21 #include <junction/MapTraits.h>
22 #include <turf/Trace.h>
23 #include <turf/Heap.h>
24 #include <junction/SimpleJobCoordinator.h>
25 #include <junction/QSBR.h>
// Optional allocation statistics, compiled in only when JUNCTION_TRACK_GRAMPA_STATS is nonzero.
31 #if JUNCTION_TRACK_GRAMPA_STATS
// A pair of atomic counters. In the code shown, "total" is only ever added to, while
// "current" is both raised and lowered, so it follows the number of live objects.
32 struct GrampaCounter {
33 turf::Atomic<ureg> total;
34 turf::Atomic<sreg> current;
// Counting up (presumably the body of increment()) touches both fields. Relaxed ordering
// is used, so these updates do not order any other memory accesses.
37 total.fetchAdd(1, turf::Relaxed);
38 current.fetchAdd(1, turf::Relaxed);
// Counting down (presumably the body of decrement()) lowers "current" only.
42 current.fetchSub(1, turf::Relaxed);
// One counter per tracked object kind. The create/destroy paths of tables, flattrees and
// their migrations call increment()/decrement() on these through GrampaStats::Instance.
47 GrampaCounter numTables;
48 GrampaCounter numTableMigrations;
49 GrampaCounter numFlatTrees;
50 GrampaCounter numFlatTreeMigrations;
52 static GrampaStats Instance; // Zero-initialized
// Declares the Grampa trace group. The TURF_TRACE calls in this file use ids 0 through 36.
56 TURF_TRACE_DECLARE(Grampa, 37)
// Shorthand for the hash/value types and traits supplied by the Map template parameter.
60 typedef typename Map::Hash Hash;
61 typedef typename Map::Value Value;
62 typedef typename Map::KeyTraits KeyTraits;
63 typedef typename Map::ValueTraits ValueTraits;
// Sentinel that FlatTreeMigration::run swaps (cast to Table*) into each source flattree slot it has moved.
65 static const ureg RedirectFlatTree = 1;
66 static const ureg InitialSize = 8;
// Number of source cells a worker claims per step of TableMigration::run.
67 static const ureg TableMigrationUnitSize = 32;
// Number of source flattree slots a worker claims per step of FlatTreeMigration::run.
68 static const ureg FlatTreeMigrationUnitSize = 32;
// Cap on the linear probes insertOrFind makes past the end of a link chain before it reports overflow.
69 static const ureg LinearSearchLimit = 128;
// Number of cells preceding the overflow index that beginTableMigration samples to estimate occupancy.
70 static const ureg CellsInUseSample = LinearSearchLimit;
71 TURF_STATIC_ASSERT(LinearSearchLimit > 0 && LinearSearchLimit < 256); // Must fit in a u8 link (CellGroup::deltas)
72 TURF_STATIC_ASSERT(CellsInUseSample > 0 && CellsInUseSample <= LinearSearchLimit); // Limit sample to failed search chain
74 static const ureg MinTableSize = 8;
// LeafSize is 1 << LeafSizeBits cells. beginTableMigration splits the destination into
// multiple tables while the requested size exceeds LeafSize.
75 static const ureg LeafSizeBits = 10;
76 static const ureg LeafSize = (ureg(1) << LeafSizeBits);
// Cell: one hash/value slot of a table. Both fields are atomics because cells are read and
// claimed concurrently (see find, insertOrFind and migrateRange).
79 // If value == Redirect, threads participate in the jobCoordinator.
80 turf::Atomic<Hash> hash;
81 turf::Atomic<Value> value;
// CellGroup: cells are addressed in groups of four (group = idx >> 2, slot = idx & 3), and
// each group carries the link bytes for its four cells.
85 // Every cell in the table actually represents a bucket of cells, all linked together in a probe chain.
86 // Each cell in the probe chain is located within the table itself.
87 // "deltas" determines the index of the next cell in the probe chain.
88 // The first cell in the chain is the one that was hashed. It may or may not actually belong in the bucket.
89 // The "second" cell in the chain is given by deltas 0 - 3. It's guaranteed to belong in the bucket.
90 // All subsequent cells in the chain are given by deltas 4 - 7. Also guaranteed to belong in the bucket.
// Deltas start out as 0 (see Table::create); insertOrFind asserts that the link at the end of
// a chain is still 0 before it stores a new delta there.
91 turf::Atomic<u8> deltas[8];
// Table: a single hash table covering a contiguous range of hashes. The Table header is
// followed, in the same heap allocation, by its CellGroups (see create() and getCellGroups()).
96 // unsafeRangeShift determines how many slots are occupied by this Table in the flattree.
97 // The range of hashes stored in this table is given by (1 << unsafeRangeShift).
98 // eg. If the entire map is stored in a single table, then unsafeRangeShift == HASH_BITS.
99 // If the entire map is stored in two tables, then unsafeRangeShift == (HASH_BITS - 1) for each table.
100 // FlatTree::safeShift is always <= Table::unsafeRangeShift for all the tables it contains.
// It is "unsafe" because it may equal the full width of Hash (create() accepts
// unsafeShift == sizeof(Hash) * 8), so callers check it before using it as a shift count.
101 const ureg sizeMask; // a power of two minus one
103 const ureg unsafeRangeShift;
104 junction::striped::ManualResetEvent
105 isPublished; // To prevent publishing a subtree before its parent is published (happened in testing)
106 junction::striped::Mutex mutex; // to DCLI the TableMigration (stored in the jobCoordinator)
107 SimpleJobCoordinator jobCoordinator; // makes all blocked threads participate in the migration
// The constructor only records the three immutable parameters; cells are initialized by create().
109 Table(ureg sizeMask, Hash baseHash, ureg unsafeRangeShift)
110 : sizeMask(sizeMask), baseHash(baseHash), unsafeRangeShift(unsafeRangeShift) {
// Allocates a Table plus tableSize / 4 CellGroups in one heap block and marks every cell empty.
// tableSize must be a power of two and at least 4; unsafeShift must lie in (0, sizeof(Hash) * 8].
113 static Table* create(ureg tableSize, Hash baseHash, ureg unsafeShift) {
114 TURF_ASSERT(turf::util::isPowerOf2(tableSize));
115 TURF_ASSERT(unsafeShift > 0 && unsafeShift <= sizeof(Hash) * 8);
116 TURF_ASSERT(tableSize >= 4);
117 ureg numGroups = tableSize >> 2;
118 Table* table = (Table*) TURF_HEAP.alloc(sizeof(Table) + sizeof(CellGroup) * numGroups);
119 new (table) Table(tableSize - 1, baseHash, (u8) unsafeShift);
// Every link is cleared to 0 and every cell set to NullHash / NullValue using non-atomic
// stores (presumably safe because the table has not been shared with other threads yet).
120 for (ureg i = 0; i < numGroups; i++) {
121 CellGroup* group = table->getCellGroups() + i;
122 for (ureg j = 0; j < 4; j++) {
123 group->deltas[j].storeNonatomic(0);
124 group->deltas[j + 4].storeNonatomic(0);
125 group->cells[j].hash.storeNonatomic(KeyTraits::NullHash);
126 group->cells[j].value.storeNonatomic(Value(ValueTraits::NullValue));
129 #if JUNCTION_TRACK_GRAMPA_STATS
130 GrampaStats::Instance.numTables.increment();
// Teardown (presumably the body of destroy(), which is what gets queued as &Table::destroy):
// runs the destructor explicitly and returns the block to TURF_HEAP, mirroring the
// placement-new in create().
136 #if JUNCTION_TRACK_GRAMPA_STATS
137 GrampaStats::Instance.numTables.decrement();
139 this->Table::~Table();
140 TURF_HEAP.free(this);
// The cell groups live immediately after the Table object in the same allocation.
143 CellGroup* getCellGroups() const {
144 return (CellGroup*) (this + 1);
// Number of TableMigrationUnitSize-cell units needed to cover this table
// (sizeMask is the cell count minus one).
147 ureg getNumMigrationUnits() const {
148 return sizeMask / TableMigrationUnitSize + 1;
// TableMigration: a SimpleJobCoordinator job that moves the cells of one or more source tables
// into a set of destination tables. The object is allocated with two trailing arrays:
// m_numSources Source records followed by m_numDestinations Table* entries (see create(),
// getSources() and getDestinations()).
152 class TableMigration : public SimpleJobCoordinator::Job {
// Per-source cursor: workers claim the next unit of cells with fetchAdd on sourceIndex (see run()).
156 turf::Atomic<ureg> sourceIndex;
160 Hash m_baseHash; // The lowest possible hash value in this subtree; determines index in flattree.
161 // If m_numDestinations == 1, m_safeShift == 0.
162 // Otherwise, m_safeShift tells (indirectly) the size of the flattree in which our subtree would exactly fit: 1 << (HASH_BITS - m_safeShift).
164 // This ensures that m_safeShift is always less than sizeof(Hash) * 8, so that shifting by m_safeShift is not undefined behavior.
165 // To determine the subtree index for a hash during migration, we use: (hash >> m_safeShift) & (m_numDestinations - 1)
166 // A mask is used since we are only migrating a subtree -- not necessarily the entire map.
// Bit 0 of m_workerStatus is the end flag; each registered worker adds 2 (see run()).
168 turf::Atomic<ureg> m_workerStatus; // number of workers + end flag
// Index of a destination table that overflowed, or -1 while no overflow has been recorded.
169 turf::Atomic<sreg> m_overflowTableIndex;
// Migration units still to be completed across all sources; the migration succeeds when it reaches zero.
170 turf::Atomic<sreg> m_unitsRemaining;
172 ureg m_numDestinations; // The size of the subtree being created. Some table pointers may be repeated.
174 TableMigration(Map& map) : m_map(map) {
// Allocates the migration together with its trailing Source and Table* arrays and resets the
// shared counters. m_baseHash, m_safeShift, m_unitsRemaining and the array contents are left
// for the caller to fill in.
177 static TableMigration* create(Map& map, ureg numSources, ureg numDestinations) {
178 TableMigration* migration = (TableMigration*) TURF_HEAP.alloc(
179 sizeof(TableMigration) + sizeof(TableMigration::Source) * numSources + sizeof(Table*) * numDestinations);
180 new (migration) TableMigration(map);
181 migration->m_workerStatus.storeNonatomic(0);
182 migration->m_overflowTableIndex.storeNonatomic(-1);
183 migration->m_unitsRemaining.storeNonatomic(0);
184 migration->m_numSources = numSources;
185 migration->m_numDestinations = numDestinations;
186 #if JUNCTION_TRACK_GRAMPA_STATS
187 GrampaStats::Instance.numTableMigrations.increment();
189 // Caller is responsible for filling in source & destination pointers
193 virtual ~TableMigration() TURF_OVERRIDE {
// Teardown (presumably the body of destroy(), which run() queues as &TableMigration::destroy).
// Source entries whose table pointer was set to NULL are skipped; run() does that when it hands
// its sources over to a replacement migration.
197 #if JUNCTION_TRACK_GRAMPA_STATS
198 GrampaStats::Instance.numTableMigrations.decrement();
200 // Destroy all source tables.
201 for (ureg i = 0; i < m_numSources; i++)
202 if (getSources()[i].table)
203 getSources()[i].table->destroy();
204 // Delete the migration object itself.
205 this->TableMigration::~TableMigration();
206 TURF_HEAP.free(this);
// Converts the stored shift back to its "unsafe" form: a stored 0 stands for the full hash width.
209 ureg getUnsafeShift() const {
210 return m_safeShift ? m_safeShift : (sizeof(Hash) * 8);
// The Source array starts immediately after the TableMigration object.
213 Source* getSources() const {
214 return (Source*) (this + 1);
// The destination pointer array follows the m_numSources Source records.
217 Table** getDestinations() const {
218 return (Table**) (getSources() + m_numSources);
// Both are defined out of line further down in this file.
221 sreg migrateRange(Table* srcTable, ureg startIdx);
222 virtual void run() TURF_OVERRIDE;
225 class FlatTreeMigration;
// FlatTree: an array of atomic Table pointers indexed by the top bits of a hash. The FlatTree
// header is followed, in the same heap allocation, by the pointer array (see create() and getTables()).
228 // The size of the flattree is 1 << (HASH_BITS - shift).
229 // Or, stated another way, (Hash(-1) >> shift) + 1.
230 // To determine the flattree index for a given hash, we simply use: (hash >> shift)
231 // Smaller shift == more significant bits used as an index == bigger flattree.
232 // For example, the simplest flattree has only two entries, and only the most significant
233 // bit of each hash is used as the flattree index. In that case, shift == HASH_BITS - 1.
234 // Each time the flattree doubles in size, shift decreases by 1.
235 const ureg safeShift;
236 junction::striped::Mutex mutex;
237 FlatTreeMigration* migration; // Protected by mutex
239 FlatTree(ureg safeShift) : safeShift(safeShift), migration(NULL) {
240 // A FlatTree always has at least two tables, so the shift is always safe.
241 TURF_ASSERT(safeShift < sizeof(Hash) * 8);
// Allocates a FlatTree with (Hash(-1) >> safeShift) + 1 trailing table slots. The slots are
// left uninitialized here.
244 static FlatTree* create(ureg safeShift) {
245 // A flattree always has at least two tables, so the shift is always safe.
246 TURF_ASSERT(safeShift < sizeof(Hash) * 8);
247 ureg numLeaves = (Hash(-1) >> safeShift) + 1;
248 FlatTree* flatTree = (FlatTree*) TURF_HEAP.alloc(sizeof(FlatTree) + sizeof(turf::Atomic<Table*>) * numLeaves);
249 new (flatTree) FlatTree(safeShift);
250 #if JUNCTION_TRACK_GRAMPA_STATS
251 GrampaStats::Instance.numFlatTrees.increment();
253 // Caller will initialize flatTree->getTables()
// Teardown (presumably the body of destroy(), which is what gets queued as &FlatTree::destroy):
// explicit destructor call followed by returning the block to TURF_HEAP.
258 #if JUNCTION_TRACK_GRAMPA_STATS
259 GrampaStats::Instance.numFlatTrees.decrement();
261 this->FlatTree::~FlatTree();
262 TURF_HEAP.free(this);
// The table pointer array starts immediately after the FlatTree object.
265 turf::Atomic<Table*>* getTables() const {
266 return (turf::Atomic<Table*>*) (this + 1);
// Number of slots in the table pointer array.
269 ureg getSize() const {
270 return (Hash(-1) >> safeShift) + 1;
// Number of FlatTreeMigrationUnitSize-slot units needed to cover the whole array.
273 ureg getNumMigrationUnits() const {
274 ureg sizeMask = Hash(-1) >> safeShift;
275 return sizeMask / FlatTreeMigrationUnitSize + 1;
// FlatTreeMigration: a SimpleJobCoordinator job that copies the table pointers of a source
// flattree into a newly created destination flattree (see run() further down).
279 class FlatTreeMigration : public SimpleJobCoordinator::Job {
283 FlatTree* m_destination;
// Same encoding as in TableMigration::run: bit 0 is the end flag, each registered worker adds 2.
284 turf::Atomic<ureg> m_workerStatus;
// Cursor into the source flattree; workers claim FlatTreeMigrationUnitSize slots at a time.
285 turf::Atomic<ureg> m_sourceIndex;
// Units still to be migrated; initialized from the source flattree's unit count.
286 turf::Atomic<sreg> m_unitsRemaining;
// Signaled by the last worker in run() after the new flattree has been published.
287 junction::striped::ManualResetEvent m_completed;
// Creates the destination flattree with the requested shift and resets the shared counters.
289 FlatTreeMigration(Map& map, FlatTree* flatTree, ureg shift) : m_map(map) {
291 m_destination = FlatTree::create(shift);
292 m_workerStatus.storeNonatomic(0);
293 m_sourceIndex.storeNonatomic(0);
294 m_unitsRemaining.storeNonatomic(flatTree->getNumMigrationUnits());
295 #if JUNCTION_TRACK_GRAMPA_STATS
296 GrampaStats::Instance.numFlatTreeMigrations.increment();
300 virtual ~FlatTreeMigration() TURF_OVERRIDE {
301 #if JUNCTION_TRACK_GRAMPA_STATS
302 GrampaStats::Instance.numFlatTreeMigrations.decrement();
304 // Delete source flattree.
// Defined out of line further down in this file.
312 virtual void run() TURF_OVERRIDE;
// Hands the table to DefaultQSBR so that Table::destroy is invoked on it later instead of
// freeing it on the spot.
315 static void garbageCollectTable(Table* table) {
317 DefaultQSBR.enqueue(&Table::destroy, table);
// Same for a flattree; the pointer must be non-null.
320 static void garbageCollectFlatTree(FlatTree* flatTree) {
321 TURF_ASSERT(flatTree);
322 DefaultQSBR.enqueue(&FlatTree::destroy, flatTree);
// Looks up the cell holding `hash` in `table` without modifying the table.
//   hash     - hash to search for; must not be KeyTraits::NullHash.
//   table    - table to search.
//   sizeMask - the table's size mask (cell count minus one).
// The home cell (hash & sizeMask) is checked first; after that the bucket's chain is followed
// through the group's delta links. All loads are relaxed.
// NOTE(review): the return statements are not visible here; presumably the matching Cell* is
// returned on a hit and NULL otherwise -- confirm against the full source.
325 static Cell* find(Hash hash, Table* table, ureg sizeMask) {
326 TURF_TRACE(Grampa, 0, "[find] called", uptr(table), hash);
328 TURF_ASSERT(hash != KeyTraits::NullHash);
329 // Optimistically check hashed cell even though it might belong to another bucket
330 ureg idx = hash & sizeMask;
331 CellGroup* group = table->getCellGroups() + (idx >> 2);
332 Cell* cell = group->cells + (idx & 3);
333 Hash probeHash = cell->hash.load(turf::Relaxed);
334 if (probeHash == hash) {
335 TURF_TRACE(Grampa, 1, "[find] found existing cell optimistically", uptr(table), idx);
337 } else if (probeHash == KeyTraits::NullHash) {
340 // Follow probe chain for our bucket
// The first hop uses deltas[0..3] (indexed by the home cell's slot within its group).
341 u8 delta = group->deltas[idx & 3].load(turf::Relaxed);
343 idx = (idx + delta) & sizeMask;
344 group = table->getCellGroups() + (idx >> 2);
345 cell = group->cells + (idx & 3);
346 Hash probeHash = cell->hash.load(turf::Relaxed);
347 // Note: probeHash might actually be NULL due to memory reordering of a concurrent insert,
348 // but we don't check for it. We just follow the probe chain.
349 if (probeHash == hash) {
350 TURF_TRACE(Grampa, 2, "[find] found existing cell", uptr(table), idx);
// Every later hop uses deltas[4..7] of the group containing the current cell.
353 delta = group->deltas[(idx & 3) + 4].load(turf::Relaxed);
355 // End of probe chain, not found
359 // FIXME: Possible optimization: Dedicated insert for migration? It wouldn't check for InsertResult_AlreadyFound.
360 enum InsertResult { InsertResult_AlreadyFound, InsertResult_InsertedNew, InsertResult_Overflow };
// Finds the cell for `hash` in `table`, or reserves a new one for it.
//   hash        - hash to insert; must not be KeyTraits::NullHash.
//   table       - table to operate on.
//   sizeMask    - the table's size mask (cell count minus one).
//   cell        - out: the last cell examined; on AlreadyFound / InsertedNew it is the cell holding `hash`.
//   overflowIdx - out: written only on InsertResult_Overflow, one past the last probed index.
// Returns InsertResult_AlreadyFound if the hash was already present, InsertResult_InsertedNew if
// a cell was reserved by this call, or InsertResult_Overflow if no cell could be reserved within
// the probe limits. Only the hash field is written here; the cell's value is left to the caller.
361 static InsertResult insertOrFind(Hash hash, Table* table, ureg sizeMask, Cell*& cell, ureg& overflowIdx) {
362 TURF_TRACE(Grampa, 3, "[insertOrFind] called", uptr(table), hash);
364 TURF_ASSERT(hash != KeyTraits::NullHash);
// idx is kept unmasked and is masked with sizeMask at each access, so it can be compared
// against maxIdx to bound probing.
365 ureg idx = ureg(hash);
367 // Check hashed cell first, though it may not even belong to the bucket.
368 CellGroup* group = table->getCellGroups() + ((idx & sizeMask) >> 2);
369 cell = group->cells + (idx & 3);
370 Hash probeHash = cell->hash.load(turf::Relaxed);
371 if (probeHash == KeyTraits::NullHash) {
// On failure the compare-exchange leaves the hash that won the race in probeHash.
372 if (cell->hash.compareExchangeStrong(probeHash, hash, turf::Relaxed)) {
373 TURF_TRACE(Grampa, 4, "[insertOrFind] reserved first cell", uptr(table), idx);
374 // There are no links to set. We're done.
375 return InsertResult_InsertedNew;
377 TURF_TRACE(Grampa, 5, "[insertOrFind] race to reserve first cell", uptr(table), idx);
378 // Fall through to check if it was the same hash...
381 if (probeHash == hash) {
382 TURF_TRACE(Grampa, 6, "[insertOrFind] found in first cell", uptr(table), idx);
383 return InsertResult_AlreadyFound;
386 // Follow the link chain for this bucket.
// Probing never goes further than sizeMask cells past the home index.
387 ureg maxIdx = idx + sizeMask;
389 turf::Atomic<u8>* prevLink;
// prevLink points at the delta that leads out of the current cell
// (NOTE(review): linkLevel is declared on a line not visible here).
392 prevLink = group->deltas + ((idx & 3) + linkLevel);
394 u8 probeDelta = prevLink->load(turf::Relaxed);
397 // Check the hash for this cell.
398 group = table->getCellGroups() + ((idx & sizeMask) >> 2);
399 cell = group->cells + (idx & 3);
400 probeHash = cell->hash.load(turf::Relaxed);
401 if (probeHash == KeyTraits::NullHash) {
402 // Cell was linked, but hash is not visible yet.
403 // We could avoid this case (and guarantee it's visible) using acquire & release, but instead,
404 // just poll until it becomes visible.
405 TURF_TRACE(Grampa, 7, "[insertOrFind] race to read hash", uptr(table), idx);
407 probeHash = cell->hash.load(turf::Acquire);
408 } while (probeHash == KeyTraits::NullHash);
410 TURF_ASSERT(((probeHash ^ hash) & sizeMask) == 0); // Only hashes in same bucket can be linked
411 if (probeHash == hash) {
412 TURF_TRACE(Grampa, 8, "[insertOrFind] found in probe chain", uptr(table), idx);
413 return InsertResult_AlreadyFound;
416 // Reached the end of the link chain for this bucket.
417 // Switch to linear probing until we reserve a new cell or find a late-arriving cell in the same bucket.
418 ureg prevLinkIdx = idx;
419 TURF_ASSERT(sreg(maxIdx - idx) >= 0); // Nobody would have linked an idx that's out of range.
// The linear search is limited both by the distance left to maxIdx and by LinearSearchLimit.
420 ureg linearProbesRemaining = turf::util::min(maxIdx - idx, LinearSearchLimit);
421 while (linearProbesRemaining-- > 0) {
423 group = table->getCellGroups() + ((idx & sizeMask) >> 2);
424 cell = group->cells + (idx & 3);
425 probeHash = cell->hash.load(turf::Relaxed);
426 if (probeHash == KeyTraits::NullHash) {
427 // It's an empty cell. Try to reserve it.
428 if (cell->hash.compareExchangeStrong(probeHash, hash, turf::Relaxed)) {
429 // Success. We've reserved the cell. Link it to previous cell in same bucket.
430 TURF_TRACE(Grampa, 9, "[insertOrFind] reserved cell", uptr(table), idx);
431 TURF_ASSERT(probeDelta == 0);
// The distance fits in a u8 because the linear search is capped below 256 probes.
432 u8 desiredDelta = idx - prevLinkIdx;
433 #if TURF_WITH_ASSERTS
434 // Note: another thread could actually set the link on our behalf (see below).
435 probeDelta = prevLink->exchange(desiredDelta, turf::Relaxed);
436 TURF_ASSERT(probeDelta == 0 || probeDelta == desiredDelta);
438 prevLink->store(desiredDelta, turf::Relaxed);
440 return InsertResult_InsertedNew;
442 TURF_TRACE(Grampa, 10, "[insertOrFind] race to reserve cell", uptr(table), idx);
443 // Fall through to check if it's the same hash...
// x is zero for an identical hash; zero in its low (sizeMask) bits means the same bucket.
446 Hash x = (probeHash ^ hash);
447 // Check for same hash.
449 TURF_TRACE(Grampa, 11, "[insertOrFind] found outside probe chain", uptr(table), idx);
450 return InsertResult_AlreadyFound;
452 // Check for same bucket.
453 if ((x & sizeMask) == 0) {
454 TURF_TRACE(Grampa, 12, "[insertOrFind] found late-arriving cell in same bucket", uptr(table), idx);
455 // Attempt to set the link on behalf of the late-arriving cell.
456 // This is usually redundant, but if we don't attempt to set the late-arriving cell's link here,
457 // there's no guarantee that our own link chain will be well-formed by the time this function returns.
458 // (Indeed, subsequent lookups sometimes failed during testing, for this exact reason.)
459 u8 desiredDelta = idx - prevLinkIdx;
460 #if TURF_WITH_ASSERTS
461 probeDelta = prevLink->exchange(desiredDelta, turf::Relaxed);
462 TURF_ASSERT(probeDelta == 0 || probeDelta == desiredDelta);
464 TURF_TRACE(Grampa, 13, "[insertOrFind] set link on behalf of late-arriving cell", uptr(table), idx);
466 prevLink->store(desiredDelta, turf::Relaxed);
468 goto followLink; // Try to follow link chain for the bucket again.
470 // Continue linear search...
472 // Table is too full to insert.
// beginTableMigration uses overflowIdx to locate the cells it samples.
473 overflowIdx = idx + 1;
474 TURF_TRACE(Grampa, 14, "[insertOrFind] overflow", uptr(table), overflowIdx);
475 return InsertResult_Overflow;
// Starts a migration of `table` into (1 << splitShift) new tables of nextTableSize cells each,
// unless the table's jobCoordinator already holds a job. The job is checked once without the
// lock and again under table->mutex before a new migration is created and published.
//   map           - map that owns the table; passed through to the TableMigration.
//   table         - source table to migrate.
//   nextTableSize - cell count of each destination table.
//   splitShift    - log2 of the number of destination tables.
480 static void beginTableMigrationToSize(Map& map, Table* table, ureg nextTableSize, ureg splitShift) {
481 // Create new migration by DCLI.
482 TURF_TRACE(Grampa, 15, "[beginTableMigrationToSize] called", 0, 0);
483 SimpleJobCoordinator::Job* job = table->jobCoordinator.loadConsume();
485 TURF_TRACE(Grampa, 16, "[beginTableMigrationToSize] new migration already exists", 0, 0);
487 turf::LockGuard<junction::striped::Mutex> guard(table->mutex);
488 job = table->jobCoordinator.loadConsume(); // Non-atomic would be sufficient, but that's OK.
490 TURF_TRACE(Grampa, 17, "[beginTableMigrationToSize] new migration already exists (double-checked)", 0, 0);
492 // Create new migration.
493 ureg numDestinations = ureg(1) << splitShift;
494 TableMigration* migration = TableMigration::create(map, 1, numDestinations);
495 migration->m_baseHash = table->baseHash;
// A shift equal to the full hash width is stored as 0 (see TableMigration::getUnsafeShift).
496 ureg migrationShift = table->unsafeRangeShift - splitShift;
497 migration->m_safeShift = (migrationShift < sizeof(Hash) * 8) ? migrationShift : 0;
498 migration->m_unitsRemaining.storeNonatomic(table->getNumMigrationUnits());
499 migration->getSources()[0].table = table;
500 migration->getSources()[0].sourceIndex.storeNonatomic(0);
502 table->unsafeRangeShift - splitShift; // subRangeShift is also "unsafe" (possibly represents entire range)
// Each destination covers (1 << subRangeShift) hashes; the guard avoids shifting by the full
// hash width, which only occurs when there is a single destination.
503 ureg hashOffsetDelta = subRangeShift < (sizeof(Hash) * 8) ? (ureg(1) << subRangeShift) : 0;
504 for (ureg i = 0; i < numDestinations; i++) {
505 migration->getDestinations()[i] =
506 Table::create(nextTableSize, table->baseHash + hashOffsetDelta * i, subRangeShift);
508 // Publish the new migration.
509 table->jobCoordinator.storeRelease(migration);
// Chooses a destination size for `table` after an insert overflowed at overflowIdx, then starts
// the migration through beginTableMigrationToSize.
//   map         - map that owns the table.
//   table       - table that overflowed.
//   overflowIdx - index reported by insertOrFind on InsertResult_Overflow.
// Occupancy is estimated from the CellsInUseSample cells that precede overflowIdx. The new
// size is twice the estimated number of cells in use, rounded up to a power of two, and never
// smaller than the current table.
514 static void beginTableMigration(Map& map, Table* table, ureg overflowIdx) {
515 // Estimate number of cells in use based on a small sample.
516 ureg sizeMask = table->sizeMask;
517 ureg idx = overflowIdx - CellsInUseSample;
519 for (ureg linearProbesRemaining = CellsInUseSample; linearProbesRemaining > 0; linearProbesRemaining--) {
520 CellGroup* group = table->getCellGroups() + ((idx & sizeMask) >> 2);
521 Cell* cell = group->cells + (idx & 3);
522 Value value = cell->value.load(turf::Relaxed);
523 if (value == Value(ValueTraits::Redirect)) {
524 // Another thread kicked off the jobCoordinator. The caller will participate upon return.
525 TURF_TRACE(Grampa, 18, "[beginTableMigration] redirected while determining table size", 0, 0);
// A cell counts as in use when its value is not NullValue.
528 if (value != Value(ValueTraits::NullValue))
// Scale the sampled ratio up to the whole table.
532 float inUseRatio = float(inUseCells) / CellsInUseSample;
533 float estimatedInUse = (sizeMask + 1) * inUseRatio;
534 ureg nextTableSize = turf::util::roundUpPowerOf2(ureg(estimatedInUse * 2));
535 // FIXME: Support migrating to smaller tables.
536 nextTableSize = turf::util::max(nextTableSize, sizeMask + 1);
537 // Split into multiple tables if necessary.
539 while (nextTableSize > LeafSize) {
543 beginTableMigrationToSize(map, table, nextTableSize, splitShift);
// Returns the FlatTreeMigration attached to flatTree, creating it under flatTree->mutex if none
// exists yet. If one already exists it is returned as-is and `shift` is not used.
546 static FlatTreeMigration* createFlatTreeMigration(Map& map, FlatTree* flatTree, ureg shift) {
547 turf::LockGuard<junction::striped::Mutex> guard(flatTree->mutex);
548 if (!flatTree->migration) {
549 flatTree->migration = new FlatTreeMigration(map, flatTree, shift);
551 return flatTree->migration;
// Returns the migration already attached to flatTree, read under flatTree->mutex. Calling this
// before a migration has been created is a programming error (asserted).
554 static FlatTreeMigration* getExistingFlatTreeMigration(FlatTree* flatTree) {
555 turf::LockGuard<junction::striped::Mutex> guard(flatTree->mutex);
556 TURF_ASSERT(flatTree->migration); // Must already exist!
557 return flatTree->migration;
// Migrates one unit of cells from srcTable, starting at startIdx and covering at most
// TableMigrationUnitSize cells (clamped to the end of the table).
// Every source cell ends up with a Redirect marker in its value; cells that held a real value
// are first copied into the destination table selected by the hash's subtree index.
561 // Return index of the destination table that overflowed, or -1 if none
563 sreg Grampa<Map>::TableMigration::migrateRange(Table* srcTable, ureg startIdx) {
564 ureg srcSizeMask = srcTable->sizeMask;
565 ureg safeShift = m_safeShift;
566 Table** dstLeafs = getDestinations();
567 ureg dstLeafMask = m_numDestinations - 1;
568 ureg endIdx = turf::util::min(startIdx + TableMigrationUnitSize, srcSizeMask + 1);
569 // Iterate over source range.
570 for (ureg srcIdx = startIdx; srcIdx < endIdx; srcIdx++) {
571 CellGroup* srcGroup = srcTable->getCellGroups() + ((srcIdx & srcSizeMask) >> 2);
572 Cell* srcCell = srcGroup->cells + (srcIdx & 3);
575 // Fetch the srcHash and srcValue.
577 srcHash = srcCell->hash.load(turf::Relaxed);
578 if (srcHash == KeyTraits::NullHash) {
579 // An unused cell. Try to put a Redirect marker in its value.
581 srcCell->value.compareExchange(Value(ValueTraits::NullValue), Value(ValueTraits::Redirect), turf::Relaxed);
582 if (srcValue == Value(ValueTraits::Redirect)) {
583 // srcValue is already marked Redirect due to previous incomplete migration.
584 TURF_TRACE(Grampa, 19, "[migrateRange] empty cell already redirected", uptr(srcTable), srcIdx);
587 if (srcValue == Value(ValueTraits::NullValue))
588 break; // Redirect has been placed. Break inner loop, continue outer loop.
589 TURF_TRACE(Grampa, 20, "[migrateRange] race to insert key", uptr(srcTable), srcIdx);
590 // Otherwise, somebody just claimed the cell. Read srcHash again...
592 // Check for deleted/uninitialized value.
593 srcValue = srcCell->value.load(turf::Relaxed);
594 if (srcValue == Value(ValueTraits::NullValue)) {
595 // Try to put a Redirect marker.
596 if (srcCell->value.compareExchangeStrong(srcValue, Value(ValueTraits::Redirect), turf::Relaxed))
597 break; // Redirect has been placed. Break inner loop, continue outer loop.
598 TURF_TRACE(Grampa, 21, "[migrateRange] race to insert value", uptr(srcTable), srcIdx);
599 if (srcValue == Value(ValueTraits::Redirect)) {
600 // FIXME: I don't think this will happen. Investigate & change to assert
601 TURF_TRACE(Grampa, 22, "[migrateRange] race inserted Redirect", uptr(srcTable), srcIdx);
604 } else if (srcValue == Value(ValueTraits::Redirect)) {
605 // srcValue is already marked Redirect due to previous incomplete migration.
606 TURF_TRACE(Grampa, 23, "[migrateRange] in-use cell already redirected", uptr(srcTable), srcIdx);
610 // We've got a key/value pair to migrate.
611 // Reserve a destination cell in dstTable.
612 TURF_ASSERT(srcHash != KeyTraits::NullHash);
613 TURF_ASSERT(srcValue != Value(ValueTraits::NullValue));
614 TURF_ASSERT(srcValue != Value(ValueTraits::Redirect));
// Subtree index of the hash: (hash >> m_safeShift) masked to the number of destinations.
615 ureg destLeafIndex = (srcHash >> safeShift) & dstLeafMask;
616 Table* dstLeaf = dstLeafs[destLeafIndex];
619 InsertResult result = insertOrFind(srcHash, dstLeaf, dstLeaf->sizeMask, dstCell, overflowIdx);
620 // During migration, a hash can only exist in one place among all the source tables,
621 // and it is only migrated by one thread. Therefore, the hash will never already exist
622 // in the destination table:
623 TURF_ASSERT(result != InsertResult_AlreadyFound);
624 if (result == InsertResult_Overflow) {
625 // Destination overflow.
626 // This can happen for several reasons. For example, the source table could have
627 // consisted entirely of deleted cells when it overflowed, resulting in a small destination
628 // table size, but then another thread could re-insert all the same hashes
629 // before the migration completed.
630 // Caller will cancel the current migration and begin a new one.
631 return destLeafIndex;
633 // Migrate the old value to the new cell.
635 // Copy srcValue to the destination.
636 dstCell->value.store(srcValue, turf::Relaxed);
637 // Try to place a Redirect marker in srcValue.
638 Value doubleCheckedSrcValue =
639 srcCell->value.compareExchange(srcValue, Value(ValueTraits::Redirect), turf::Relaxed);
640 TURF_ASSERT(doubleCheckedSrcValue !=
641 Value(ValueTraits::Redirect)); // Only one thread can redirect a cell at a time.
642 if (doubleCheckedSrcValue == srcValue) {
643 // No racing writes to the src. We've successfully placed the Redirect marker.
644 // srcValue was non-NULL when we decided to migrate it, but it may have changed to NULL
645 // by a late-arriving erase.
646 if (srcValue == Value(ValueTraits::NullValue))
647 TURF_TRACE(Grampa, 24, "[migrateRange] racing update was erase", uptr(srcTable), srcIdx);
650 // There was a late-arriving write (or erase) to the src. Migrate the new value and try again.
651 TURF_TRACE(Grampa, 25, "[migrateRange] race to update migrated value", uptr(srcTable), srcIdx);
652 srcValue = doubleCheckedSrcValue;
654 // Cell successfully migrated. Proceed to next source cell.
659 // Range has been migrated successfully.
// Worker entry point for a table migration.
// m_workerStatus encodes the worker count in steps of 2 with bit 0 as the end flag. A caller
// first registers itself (unless the end flag is already set), then repeatedly claims units of
// TableMigrationUnitSize cells from each source table and migrates them with migrateRange.
// The end flag is set either when the last unit completes or when a destination table
// overflows. The last worker to deregister (it observes a previous status of exactly 3) then
// finishes the job: on success it publishes the new subtree through the map; on overflow it
// builds and publishes a replacement migration with a larger or split destination. In both
// cases it finally queues this object for destruction on DefaultQSBR.
664 void Grampa<Map>::TableMigration::run() {
665 // Conditionally increment the shared # of workers.
666 ureg probeStatus = m_workerStatus.load(turf::Relaxed);
668 if (probeStatus & 1) {
669 // End flag is already set, so do nothing.
670 TURF_TRACE(Grampa, 26, "[TableMigration::run] already ended", uptr(this), 0);
673 } while (!m_workerStatus.compareExchangeWeak(probeStatus, probeStatus + 2, turf::Relaxed, turf::Relaxed));
674 // # of workers has been incremented, and the end flag is clear.
675 TURF_ASSERT((probeStatus & 1) == 0);
677 // Iterate over all source tables.
678 Source* sources = getSources();
679 for (ureg s = 0; s < m_numSources; s++) {
680 Source& source = sources[s];
681 // Loop over all migration units in this source table.
683 if (m_workerStatus.load(turf::Relaxed) & 1) {
684 TURF_TRACE(Grampa, 27, "[TableMigration::run] detected end flag set", uptr(this), 0);
// Claim the next unit; the fetchAdd hands each worker a distinct start index.
687 ureg startIdx = source.sourceIndex.fetchAdd(TableMigrationUnitSize, turf::Relaxed);
688 if (startIdx >= source.table->sizeMask + 1)
689 break; // No more migration units in this table. Try next source table.
690 sreg overflowTableIndex = migrateRange(source.table, startIdx);
691 if (overflowTableIndex >= 0) {
692 // *** FAILED MIGRATION ***
693 // TableMigration failed due to destination table overflow.
694 // No other thread can declare the migration successful at this point, because *this* unit will never complete,
695 // hence m_unitsRemaining won't reach zero.
696 // However, multiple threads can independently detect a failed migration at the same time.
697 TURF_TRACE(Grampa, 28, "[TableMigration::run] destination overflow", uptr(source.table), uptr(startIdx));
698 // The reason we store overflowTableIndex in a shared variable is because we must flush all the worker threads
699 // before we can safely deal with the overflow. Therefore, the thread that detects the failure is often
700 // different from the thread that deals with it.
701 // Store overflowTableIndex unconditionally; racing writes should be rare, and it doesn't matter which one wins.
702 sreg oldIndex = m_overflowTableIndex.exchange(overflowTableIndex, turf::Relaxed);
704 TURF_TRACE(Grampa, 29, "[TableMigration::run] race to set m_overflowTableIndex", uptr(overflowTableIndex),
706 m_workerStatus.fetchOr(1, turf::Relaxed);
709 sreg prevRemaining = m_unitsRemaining.fetchSub(1, turf::Relaxed);
710 TURF_ASSERT(prevRemaining > 0);
711 if (prevRemaining == 1) {
712 // *** SUCCESSFUL MIGRATION ***
713 // That was the last chunk to migrate.
714 m_workerStatus.fetchOr(1, turf::Relaxed);
719 TURF_TRACE(Grampa, 30, "[TableMigration::run] out of migration units", uptr(this), 0);
722 // Decrement the shared # of workers.
724 m_workerStatus.fetchSub(2, turf::AcquireRelease); // Ensure all modifications are visible to the thread that will publish
// A previous value of 4 or more means at least one other worker is still registered.
725 if (probeStatus >= 4) {
726 // There are other workers remaining. Return here so that only the very last worker will proceed.
727 TURF_TRACE(Grampa, 31, "[TableMigration::run] not the last worker", uptr(this), uptr(probeStatus));
731 // We're the very last worker thread.
732 // Perform the appropriate post-migration step depending on whether the migration succeeded or failed.
// 3 == one worker (2) plus the end flag (1).
733 TURF_ASSERT(probeStatus == 3);
734 sreg overflowTableIndex = m_overflowTableIndex.loadNonatomic(); // No racing writes at this point
735 if (overflowTableIndex < 0) {
736 // The migration succeeded. This is the most likely outcome. Publish the new subtree.
737 m_map.publishTableMigration(this);
738 // End the jobCoordinator.
739 sources[0].table->jobCoordinator.end();
741 // The migration failed due to the overflow of a destination table.
742 Table* origTable = sources[0].table;
// count is the number of consecutive destination slots derived from the two shifts, and lo is
// the first slot of the count-aligned group containing the overflowed table.
743 ureg count = ureg(1) << (origTable->unsafeRangeShift - getUnsafeShift());
744 ureg lo = overflowTableIndex & ~(count - 1);
745 TURF_ASSERT(lo + count <= m_numDestinations);
746 turf::LockGuard<junction::striped::Mutex> guard(origTable->mutex);
747 SimpleJobCoordinator::Job* checkedJob = origTable->jobCoordinator.loadConsume();
748 if (checkedJob != this) {
749 TURF_TRACE(Grampa, 32, "[TableMigration::run] a new TableMigration was already started", uptr(origTable),
752 TableMigration* migration;
753 Table* overflowedTable = getDestinations()[overflowTableIndex];
754 if (overflowedTable->sizeMask + 1 < LeafSize) {
755 // The entire map is contained in a small table.
756 TURF_TRACE(Grampa, 33, "[TableMigration::run] overflow occured in a small map", uptr(origTable),
758 TURF_ASSERT(overflowedTable->unsafeRangeShift == sizeof(Hash) * 8);
759 TURF_ASSERT(overflowedTable->baseHash == 0);
760 TURF_ASSERT(m_numDestinations == 1);
761 TURF_ASSERT(m_baseHash == 0);
// The replacement migration has one extra source slot, used below for the overflowed table.
762 migration = TableMigration::create(m_map, m_numSources + 1, 1);
763 migration->m_baseHash = 0;
764 migration->m_safeShift = 0;
765 // Double the destination table size.
766 migration->getDestinations()[0] = Table::create((overflowedTable->sizeMask + 1) * 2, overflowedTable->baseHash,
767 overflowedTable->unsafeRangeShift);
769 // The overflowed table is already the size of a leaf. Split it into two ranges.
// NOTE(review): the condition choosing between the two variants below is on a line not
// visible here. One variant doubles the destination array (each old pointer is copied into
// two adjacent slots and the shift drops by one); the other keeps the array size and copies
// the pointers unchanged.
771 TURF_TRACE(Grampa, 34, "[TableMigration::run] doubling subtree size after failure", uptr(origTable),
773 migration = TableMigration::create(m_map, m_numSources + 1, m_numDestinations * 2);
774 migration->m_baseHash = m_baseHash;
775 migration->m_safeShift = getUnsafeShift() - 1;
776 for (ureg i = 0; i < m_numDestinations; i++) {
777 migration->getDestinations()[i * 2] = getDestinations()[i];
778 migration->getDestinations()[i * 2 + 1] = getDestinations()[i];
782 TURF_TRACE(Grampa, 35, "[TableMigration::run] keeping same subtree size after failure", uptr(origTable),
784 migration = TableMigration::create(m_map, m_numSources + 1, m_numDestinations);
785 migration->m_baseHash = m_baseHash;
786 migration->m_safeShift = m_safeShift;
787 memcpy(migration->getDestinations(), getDestinations(), m_numDestinations * sizeof(Table*));
// Two LeafSize tables replace the overflowed range: the first fills the lower half of the
// slots [lo, lo + count), the second fills the upper half.
789 Table* splitTable1 = Table::create(LeafSize, origTable->baseHash, origTable->unsafeRangeShift - 1);
791 for (; i < count / 2; i++) {
792 migration->getDestinations()[lo + i] = splitTable1;
794 ureg halfNumHashes = ureg(1) << (origTable->unsafeRangeShift - 1);
796 Table::create(LeafSize, origTable->baseHash + halfNumHashes, origTable->unsafeRangeShift - 1);
797 for (; i < count; i++) {
798 migration->getDestinations()[lo + i] = splitTable2;
801 // Transfer source tables to the new migration.
// Clearing our own pointer keeps destroy() on this migration from destroying a table that
// the replacement migration now refers to.
802 for (ureg i = 0; i < m_numSources; i++) {
803 migration->getSources()[i].table = getSources()[i].table;
804 migration->getSources()[i].sourceIndex.storeNonatomic(0);
805 getSources()[i].table = NULL;
// The overflowed destination becomes an additional source, so the cells already migrated
// into it are migrated again.
807 migration->getSources()[m_numSources].table = overflowedTable;
808 migration->getSources()[m_numSources].sourceIndex.storeNonatomic(0);
809 // Calculate total number of migration units to move.
810 ureg unitsRemaining = 0;
811 for (ureg s = 0; s < migration->m_numSources; s++)
812 unitsRemaining += migration->getSources()[s].table->getNumMigrationUnits();
813 migration->m_unitsRemaining.storeNonatomic(unitsRemaining);
814 // Publish the new migration.
815 origTable->jobCoordinator.storeRelease(migration);
819 // We're done with this TableMigration. Queue it for GC.
820 DefaultQSBR.enqueue(&TableMigration::destroy, this);
// Worker entry point for a flattree migration.
// Uses the same m_workerStatus protocol as TableMigration::run (worker count in steps of 2,
// bit 0 is the end flag). Each worker claims units of FlatTreeMigrationUnitSize source slots,
// replaces every claimed source pointer with the RedirectFlatTree marker and writes the old
// pointer into the destination flattree. The last worker to deregister publishes the
// destination through the map, signals m_completed and queues this object for destruction.
824 void Grampa<Map>::FlatTreeMigration::run() {
825 // Conditionally increment the shared # of workers.
826 ureg probeStatus = m_workerStatus.load(turf::Relaxed);
828 if (probeStatus & 1) {
829 // End flag is already set, so do nothing.
830 TURF_TRACE(Grampa, 36, "[FlatTreeMigration::run] already ended", uptr(this), 0);
833 } while (!m_workerStatus.compareExchangeWeak(probeStatus, probeStatus + 2, turf::Relaxed, turf::Relaxed));
834 // # of workers has been incremented, and the end flag is clear.
835 TURF_ASSERT((probeStatus & 1) == 0);
837 // Loop over all migration units
838 ureg srcSize = (Hash(-1) >> m_source->safeShift) + 1;
839 // FIXME: Support migration to smaller flattrees
840 TURF_ASSERT(m_destination->safeShift < m_source->safeShift);
// The destination is larger by a power of two, so each source slot is written to `repeat`
// destination slots.
841 ureg repeat = ureg(1) << (m_source->safeShift - m_destination->safeShift);
843 ureg srcStart = m_sourceIndex.fetchAdd(FlatTreeMigrationUnitSize, turf::Relaxed);
844 if (srcStart >= srcSize)
845 break; // No more migration units in this flattree.
846 // Migrate this range
847 ureg srcEnd = turf::util::min(srcSize, srcStart + FlatTreeMigrationUnitSize);
848 ureg dst = srcStart * repeat;
849 for (ureg src = srcStart; src < srcEnd; src++) {
850 // Pointers in the source table can be changed at any time due to concurrent subtree publishing,
851 // so we need to exchange them with Redirect markers.
852 Table* t = m_source->getTables()[src].exchange((Table*) RedirectFlatTree, turf::Relaxed);
853 TURF_ASSERT(uptr(t) != RedirectFlatTree);
854 for (ureg r = repeat; r > 0; r--) {
855 m_destination->getTables()[dst].storeNonatomic(t);
859 // Decrement m_unitsRemaining
860 sreg prevRemaining = m_unitsRemaining.fetchSub(1, turf::Relaxed);
861 if (prevRemaining == 1) {
862 // *** SUCCESSFUL MIGRATION ***
863 // That was the last chunk to migrate.
864 m_workerStatus.fetchOr(1, turf::Relaxed);
869 // Decrement the shared # of workers.
870 probeStatus = m_workerStatus.fetchSub(
871 2, turf::AcquireRelease); // AcquireRelease makes all previous writes visible to the last worker thread.
// A previous value of 4 or more means at least one other worker is still registered.
872 if (probeStatus >= 4) {
873 // There are other workers remaining. Return here so that only the very last worker will proceed.
877 // We're the very last worker thread.
878 // Publish the new flattree.
879 TURF_ASSERT(probeStatus == 3); // End flag must be set
880 m_map.publishFlatTreeMigration(this);
881 m_completed.signal();
883 // We're done with this FlatTreeMigration. Queue it for GC.
884 DefaultQSBR.enqueue(&FlatTreeMigration::destroy, this);
887 } // namespace details
888 } // namespace junction
890 #endif // JUNCTION_DETAILS_GRAMPA_H