Robust/src/Runtime/bamboo/multicoregarbage.c

   1 // TODO: DO NOT support tag!!!
   2 #ifdef MULTICORE_GC
   3 #include "runtime.h"
   4 #include "multicoregarbage.h"
   5 #include "multicoregcmark.h"
   6 #include "gcqueue.h"
   7 #include "multicoregccompact.h"
   8 #include "multicoregcflush.h"
   9 #include "multicoreruntime.h"
  10 #include "multicoregcprofile.h"
  11 #include "gcqueue.h"
  12
  13 #ifdef SMEMM
  14 extern unsigned int gcmem_mixed_threshold;
  15 extern unsigned int gcmem_mixed_usedmem;
  16 #endif // SMEMM
  17
  18 #ifdef GC_DEBUG
  19 // dump whole mem in blocks
  20 void dumpSMem() {
  21   int block = 0;
  22   int sblock = 0;
  23   unsigned int j = 0;
  24   void * i = 0;
  25   int coren = 0;
  26   int x = 0;
  27   int y = 0;
  28   printf("(%x,%x) Dump shared mem: \n",udn_tile_coord_x(),udn_tile_coord_y());
  29   // reserved blocks for sblocktbl
  30   printf("(%x,%x) ++++ reserved sblocks ++++ \n", udn_tile_coord_x(),
  31          udn_tile_coord_y());
  32   for(i=BAMBOO_BASE_VA; (unsinged int)i<(unsigned int)gcbaseva; i+= 4*16) {
  33     printf("(%x,%x) 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x \n",
  34            udn_tile_coord_x(), udn_tile_coord_y(),
  35            *((int *)(i)), *((int *)(i + 4)),
  36            *((int *)(i + 4*2)), *((int *)(i + 4*3)),
  37            *((int *)(i + 4*4)), *((int *)(i + 4*5)),
  38            *((int *)(i + 4*6)), *((int *)(i + 4*7)),
  39            *((int *)(i + 4*8)), *((int *)(i + 4*9)),
  40            *((int *)(i + 4*10)), *((int *)(i + 4*11)),
  41            *((int *)(i + 4*12)), *((int *)(i + 4*13)),
  42            *((int *)(i + 4*14)), *((int *)(i + 4*15)));
  43   }
  44   sblock = gcreservedsb;
  45   bool advanceblock = false;
  46   // remaining memory
  47   for(i=gcbaseva; (unsigned int)i<(unsigned int)(gcbaseva+BAMBOO_SHARED_MEM_SIZE); i+=4*16) {
  48     advanceblock = false;
  49     // computing sblock # and block #, core coordinate (x,y) also
  50     if(j%((BAMBOO_SMEM_SIZE)/(4*16)) == 0) {
  51       // finished a sblock
  52       if(j < ((BAMBOO_LARGE_SMEM_BOUND)/(4*16))) {
  53         if((j > 0) && (j%((BAMBOO_SMEM_SIZE_L)/(4*16)) == 0)) {
  54           // finished a block
  55           block++;
  56           advanceblock = true;
  57         }
  58       } else {
  59         // finished a block
  60         block++;
  61         advanceblock = true;
  62       }
  63       // compute core #
  64       if(advanceblock) {
  65         coren = gc_block2core[block%(NUMCORES4GC*2)];
  66       }
  67       // compute core coordinate
  68       x = BAMBOO_COORDS_X(coren);
  69       y = BAMBOO_COORDS_Y(coren);
  70       printf("(%x,%x) ==== %d, %d : core (%d,%d), saddr %x====\n",
  71              udn_tile_coord_x(), udn_tile_coord_y(),
  72              block, sblock++, x, y,
  73              (sblock-1)*(BAMBOO_SMEM_SIZE)+gcbaseva);
  74     }
  75     j++;
  76     printf("(%x,%x) 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x \n",
  77            udn_tile_coord_x(), udn_tile_coord_y(),
  78            *((int *)(i)), *((int *)(i + 4)),
  79            *((int *)(i + 4*2)), *((int *)(i + 4*3)),
  80            *((int *)(i + 4*4)), *((int *)(i + 4*5)),
  81            *((int *)(i + 4*6)), *((int *)(i + 4*7)),
  82            *((int *)(i + 4*8)), *((int *)(i + 4*9)),
  83            *((int *)(i + 4*10)), *((int *)(i + 4*11)),
  84            *((int *)(i + 4*12)), *((int *)(i + 4*13)),
  85            *((int *)(i + 4*14)), *((int *)(i + 4*15)));
  86   }
  87   printf("(%x,%x) \n", udn_tile_coord_x(), udn_tile_coord_y());
  88 }
  89 #endif
  90
  91 void initmulticoregcdata() {
  92   if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
  93     // startup core to initialize corestatus[]
  94     for(int i = 0; i < NUMCORESACTIVE; i++) {
  95       gccorestatus[i] = 1;
  96       gcnumsendobjs[0][i] = gcnumsendobjs[1][i] = 0;
  97       gcnumreceiveobjs[0][i] = gcnumreceiveobjs[1][i] = 0;
  98     }
  99     for(int i = 0; i < NUMCORES4GC; i++) {
 100       gcloads[i] = 0;
 101       gcrequiredmems[i] = 0;
 102       gcstopblock[i] = 0;
 103       gcfilledblocks[i] = 0;
 104     }
 105   }
 106
 107   bamboo_smem_zero_top = NULL;
 108   gcflag = false;
 109   gcprocessing = false;
 110   gcphase = FINISHPHASE;
 111   gcprecheck = true;
 112   gccurr_heaptop = 0;
 113   gcself_numsendobjs = 0;
 114   gcself_numreceiveobjs = 0;
 115   gcmarkedptrbound = 0;
 116   gcforwardobjtbl = allocateMGCHash_I(20, 3);
 117   gcnumlobjs = 0;
 118   gcheaptop = 0;
 119   gctopcore = 0;
 120   gctopblock = 0;
 121   gcmovestartaddr = 0;
 122   gctomove = false;
 123   gcmovepending = 0;
 124   gcblock2fill = 0;
 125 #ifdef SMEMM
 126   gcmem_mixed_threshold = (unsigned int)((BAMBOO_SHARED_MEM_SIZE
 127                 -bamboo_reserved_smem*BAMBOO_SMEM_SIZE)*0.8);
 128   gcmem_mixed_usedmem = 0;
 129 #endif
 130 #ifdef MGC_SPEC
 131   gc_profile_flag = false;
 132 #endif
 133 #ifdef GC_FLUSH_DTLB
 134   gc_num_flush_dtlb = 0;
 135 #endif
 136   gc_localheap_s = false;
 137 #ifdef GC_CACHE_ADAPT
 138   gccachestage = false;
 139 #endif
 140
 141   INIT_MULTICORE_GCPROFILE_DATA();
 142 }
 143
 144 void dismulticoregcdata() {
 145   freeMGCHash(gcforwardobjtbl);
 146 }
 147
 148 void initGC() {
 149   if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
 150     for(int i = 0; i < NUMCORES4GC; i++) {
 151       gccorestatus[i] = 1;
 152       gcnumsendobjs[0][i] = gcnumsendobjs[1][i] = 0;
 153       gcnumreceiveobjs[0][i] = gcnumreceiveobjs[1][i] = 0;
 154       gcloads[i] = 0;
 155       gcrequiredmems[i] = 0;
 156       gcfilledblocks[i] = 0;
 157       gcstopblock[i] = 0;
 158     }
 159     for(int i = NUMCORES4GC; i < NUMCORESACTIVE; i++) {
 160       gccorestatus[i] = 1;
 161       gcnumsendobjs[0][i] = gcnumsendobjs[1][i] = 0;
 162       gcnumreceiveobjs[0][i] = gcnumreceiveobjs[1][i] = 0;
 163     }
 164     gcheaptop = 0;
 165     gctopcore = 0;
 166     gctopblock = 0;
 167     gcnumsrobjs_index = 0;
 168   }
 169   gcself_numsendobjs = 0;
 170   gcself_numreceiveobjs = 0;
 171   gcmarkedptrbound = 0;
 172   gcnumlobjs = 0;
 173   gcmovestartaddr = 0;
 174   gctomove = false;
 175   gcblock2fill = 0;
 176   gcmovepending = 0;
 177   gccurr_heaptop = 0;
 178   gcdstcore = 0;
 179
 180   gc_queueinit();
 181
 182   freeMGCHash(gcforwardobjtbl);
 183   gcforwardobjtbl = allocateMGCHash(20, 3);
 184
 185   GCPROFILE_INIT();
 186 }
 187
 188 bool gc_checkAllCoreStatus() {
 189   BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
 190   for(int i = 0; i < NUMCORESACTIVE; i++) {
 191     if(gccorestatus[i] != 0) {
 192       BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
 193       return false;
 194     }
 195   }
 196   BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
 197   return true;
 198 }
 199
 200 // NOTE: should be invoked with interrupts turned off
 201 bool gc_checkAllCoreStatus_I() {
 202   for(int i = 0; i < NUMCORESACTIVE; i++) {
 203     if(gccorestatus[i] != 0) {
 204       return false;
 205     }
 206   }
 207   return true;
 208 }
 209
 210 INLINE void checkMarkStatus() {
 211   if((!waitconfirm) ||
 212       (waitconfirm && (numconfirm == 0))) {
 213     unsigned int entry_index = 0;
 214     if(waitconfirm) {
 215       // phase 2
 216       entry_index = (gcnumsrobjs_index == 0) ? 1 : 0;
 217     } else {
 218       // phase 1
 219       entry_index = gcnumsrobjs_index;
 220     }
 221     BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
 222     gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
 223     gcnumsendobjs[entry_index][BAMBOO_NUM_OF_CORE] = gcself_numsendobjs;
 224     gcnumreceiveobjs[entry_index][BAMBOO_NUM_OF_CORE] = gcself_numreceiveobjs;
 225     // check the status of all cores
 226     if (gc_checkAllCoreStatus_I()) {
 227       // ask for confirm
 228       if(!waitconfirm) {
 229         // the first time found all cores stall
 230         // send out status confirm msg to all other cores
 231         // reset the corestatus array too
 232         waitconfirm = true;
 233         numconfirm = NUMCORESACTIVE - 1;
 234         BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
 235         GC_SEND_MSG_1_TO_CLIENT(GCMARKCONFIRM);
 236       } else {
 237         // Phase 2
 238         // check if the sum of send objs and receive obj are the same
 239         // yes->check if the info is the latest; no->go on executing
 240         unsigned int sumsendobj = 0;
 241         for(int i = 0; i < NUMCORESACTIVE; i++) {
 242           sumsendobj += gcnumsendobjs[gcnumsrobjs_index][i];
 243         }
 244         for(int i = 0; i < NUMCORESACTIVE; i++) {
 245           sumsendobj -= gcnumreceiveobjs[gcnumsrobjs_index][i];
 246         }
 247         if(0 == sumsendobj) {
 248           // Check if there are changes of the numsendobjs or numreceiveobjs on
 249           // each core
 250           bool ischanged = false;
 251           for(int i = 0; i < NUMCORESACTIVE; i++) {
 252             if((gcnumsendobjs[0][i] != gcnumsendobjs[1][i]) ||
 253                 (gcnumreceiveobjs[0][i] != gcnumreceiveobjs[1][i]) ) {
 254               ischanged = true;
 255               break;
 256             }
 257           }
 258           if(!ischanged) {
 259             // all the core status info are the latest,stop mark phase
 260             gcphase = COMPACTPHASE;
 261             // restore the gcstatus for all cores
 262             for(int i = 0; i < NUMCORESACTIVE; i++) {
 263               gccorestatus[i] = 1;
 264             }
 265           } else {
 266             // There were changes between phase 1 and phase 2, can not decide
 267             // whether the mark phase has been finished
 268             waitconfirm = false;
 269             // As it fails in phase 2, flip the entries
 270             gcnumsrobjs_index = (gcnumsrobjs_index == 0) ? 1 : 0;
 271           }
 272         } else {
 273           // There were changes between phase 1 and phase 2, can not decide
 274           // whether the mark phase has been finished
 275           waitconfirm = false;
 276           // As it fails in phase 2, flip the entries
 277           gcnumsrobjs_index = (gcnumsrobjs_index == 0) ? 1 : 0;
 278         }
 279         BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
 280       }
 281     } else {
 282       BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
 283     }
 284   }
 285 }
 286
 287 // compute load balance for all cores
 288 INLINE int loadbalance(unsigned int * heaptop) {
 289   // compute load balance
 290
 291   // get the total loads
 292   unsigned int tloads = gcloads[STARTUPCORE];
 293   for(int i = 1; i < NUMCORES4GC; i++) {
 294     tloads += gcloads[i];
 295   }
 296   *heaptop = gcbaseva + tloads;
 297
 298   unsigned int b = 0;
 299   BLOCKINDEX(*heaptop, &b);
 300   // num of blocks per core
 301   unsigned int numbpc = (unsigned int)b/(unsigned int)(NUMCORES4GC);
 302   gctopblock = b;
 303   RESIDECORE(heaptop, &gctopcore);
 304   return numbpc;
 305 }
 306
 307 // compute total mem size required and sort the lobjs in ascending order
 308 INLINE unsigned int sortLObjs() {
 309   unsigned int tmp_lobj = 0;
 310   unsigned int tmp_len = 0;
 311   unsigned int tmp_host = 0;
 312   unsigned int sumsize = 0;
 313
 314   gclobjtail2 = gclobjtail;
 315   gclobjtailindex2 = gclobjtailindex;
 316   // TODO USE QUICK SORT INSTEAD?
 317   while(gc_lobjmoreItems2_I()) {
 318     gc_lobjdequeue2_I();
 319     tmp_lobj = gclobjtail2->lobjs[gclobjtailindex2-1];
 320     tmp_host = gclobjtail2->hosts[gclobjtailindex2-1];
 321     tmp_len = gclobjtail2->lengths[gclobjtailindex2 - 1];
 322     sumsize += tmp_len;
 323     GCPROFILE_RECORD_LOBJ();
 324     unsigned int i = gclobjtailindex2-1;
 325     struct lobjpointerblock * tmp_block = gclobjtail2;
 326     // find the place to insert
 327     while(true) {
 328       if(i == 0) {
 329         if(tmp_block->prev == NULL) {
 330           break;
 331         }
 332         if(tmp_block->prev->lobjs[NUMLOBJPTRS-1] > tmp_lobj) {
 333           tmp_block->lobjs[i] = tmp_block->prev->lobjs[NUMLOBJPTRS-1];
 334           tmp_block->lengths[i] = tmp_block->prev->lengths[NUMLOBJPTRS-1];
 335           tmp_block->hosts[i] = tmp_block->prev->hosts[NUMLOBJPTRS-1];
 336           tmp_block = tmp_block->prev;
 337           i = NUMLOBJPTRS-1;
 338         } else {
 339           break;
 340         }  // if(tmp_block->prev->lobjs[NUMLOBJPTRS-1] < tmp_lobj)
 341       } else {
 342         if(tmp_block->lobjs[i-1] > tmp_lobj) {
 343           tmp_block->lobjs[i] = tmp_block->lobjs[i-1];
 344           tmp_block->lengths[i] = tmp_block->lengths[i-1];
 345           tmp_block->hosts[i] = tmp_block->hosts[i-1];
 346           i--;
 347         } else {
 348           break;
 349         }
 350       }
 351     }
 352     // insert it
 353     if(i != gclobjtailindex2 - 1) {
 354       tmp_block->lobjs[i] = tmp_lobj;
 355       tmp_block->lengths[i] = tmp_len;
 356       tmp_block->hosts[i] = tmp_host;
 357     }
 358   }
 359   return sumsize;
 360 }
 361
 362 INLINE bool cacheLObjs() {
 363   // check the total mem size need for large objs
 364   unsigned long long sumsize = 0;
 365   unsigned int size = 0;
 366
 367   sumsize = sortLObjs();
 368
 369   GCPROFILE_RECORD_LOBJSPACE();
 370
 371   // check if there are enough space to cache these large objs
 372   unsigned int dst = gcbaseva + (BAMBOO_SHARED_MEM_SIZE) -sumsize;
 373   if((unsigned long long)gcheaptop > (unsigned long long)dst) {
 374     // do not have enough room to cache large objs
 375     return false;
 376   }
 377
 378   gcheaptop = dst; // Note: record the start of cached lobjs with gcheaptop
 379   // cache the largeObjs to the top of the shared heap
 380   dst = gcbaseva + (BAMBOO_SHARED_MEM_SIZE);
 381   while(gc_lobjmoreItems3_I()) {
 382     gc_lobjdequeue3_I();
 383     size = gclobjtail2->lengths[gclobjtailindex2];
 384     // set the mark field to , indicating that this obj has been moved
 385     // and need to be flushed
 386     ((struct ___Object___ *)(gclobjtail2->lobjs[gclobjtailindex2]))->marked =
 387       COMPACTED;
 388     dst -= size;
 389     if((unsigned int)dst <
 390         (unsigned int)(gclobjtail2->lobjs[gclobjtailindex2]+size)) {
 391       memmove(dst, gclobjtail2->lobjs[gclobjtailindex2], size);
 392     } else {
 393       memcpy(dst, gclobjtail2->lobjs[gclobjtailindex2], size);
 394     }
 395   }
 396   return true;
 397 }
 398
 399 // update the bmmboo_smemtbl to record current shared mem usage
 400 void updateSmemTbl(unsigned int coren, unsigned int localtop) {
 401   unsigned int ltopcore = 0;
 402   unsigned int bound = BAMBOO_SMEM_SIZE_L;
 403   BLOCKINDEX(localtop, &ltopcore);
 404   if((unsigned int)localtop>=(unsigned int)(gcbaseva+BAMBOO_LARGE_SMEM_BOUND)){
 405     bound = BAMBOO_SMEM_SIZE;
 406   }
 407   unsigned int load = (unsigned int)(localtop-gcbaseva)%(unsigned int)bound;
 408   unsigned int toset = 0;
 409   for(int j=0; 1; j++) {
 410     for(int i=0; i<2; i++) {
 411       toset = gc_core2block[2*coren+i]+(unsigned int)(NUMCORES4GC*2)*j;
 412       if(toset < ltopcore) {
 413         bamboo_smemtbl[toset]=(toset<NUMCORES4GC) ? BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
 414 #ifdef SMEMM
 415         gcmem_mixed_usedmem += bamboo_smemtbl[toset];
 416 #endif
 417       } else if(toset == ltopcore) {
 418         bamboo_smemtbl[toset] = load;
 419 #ifdef SMEMM
 420         gcmem_mixed_usedmem += bamboo_smemtbl[toset];
 421 #endif
 422         return;
 423       } else {
 424         return;
 425       }
 426     }
 427   }
 428 }
 429
 430 INLINE unsigned int checkCurrHeapTop() {
 431   // update the smemtbl
 432   BAMBOO_MEMSET_WH(bamboo_smemtbl, 0, sizeof(int)*gcnumblock);
 433   // flush all gcloads to indicate the real heap top on one core
 434   // previous it represents the next available ptr on a core
 435   if(((unsigned int)gcloads[0] > (unsigned int)(gcbaseva+BAMBOO_SMEM_SIZE_L))
 436      && (((unsigned int)gcloads[0]%(BAMBOO_SMEM_SIZE)) == 0)) {
 437     // edge of a block, check if this is exactly the heaptop
 438     BASEPTR(0, gcfilledblocks[0]-1, &(gcloads[0]));
 439     gcloads[0]+=(gcfilledblocks[0]>1?(BAMBOO_SMEM_SIZE):(BAMBOO_SMEM_SIZE_L));
 440   }
 441   updateSmemTbl(0, gcloads[0]);
 442   for(int i = 1; i < NUMCORES4GC; i++) {
 443     unsigned int tmptop = 0;
 444     if((gcfilledblocks[i] > 0)
 445        && (((unsigned int)gcloads[i] % (BAMBOO_SMEM_SIZE)) == 0)) {
 446       // edge of a block, check if this is exactly the heaptop
 447       BASEPTR(i, gcfilledblocks[i]-1, &gcloads[i]);
 448       gcloads[i] +=
 449         (gcfilledblocks[i]>1?(BAMBOO_SMEM_SIZE):(BAMBOO_SMEM_SIZE_L));
 450       tmptop = gcloads[i];
 451     }
 452     updateSmemTbl(i, gcloads[i]);
 453   }
 454
 455   // find current heap top
 456   // TODO
 457   // a bug here: when using local allocation, directly move large objects
 458   // to the highest free chunk might not be memory efficient
 459   unsigned int tmpheaptop = 0;
 460   for(int i = gcnumblock-1; i >= 0; i--) {
 461     if(bamboo_smemtbl[i] > 0) {
 462       return gcbaseva+bamboo_smemtbl[i]+((i<NUMCORES4GC) ?
 463         (BAMBOO_SMEM_SIZE_L*i) :
 464         (BAMBOO_SMEM_SIZE*(i-NUMCORES4GC)+BAMBOO_LARGE_SMEM_BOUND));
 465     }
 466   }
 467   return gcbaseva;
 468 }
 469
 470 INLINE void moveLObjs() {
 471 #ifdef SMEMM
 472   // update the gcmem_mixed_usedmem
 473   gcmem_mixed_usedmem = 0;
 474 #endif
 475   unsigned int size = 0;
 476   unsigned int bound = 0;
 477   unsigned int tmpheaptop = checkCurrHeapTop();
 478
 479   // move large objs from gcheaptop to tmpheaptop
 480   // write the header first
 481   unsigned int tomove = gcbaseva+(BAMBOO_SHARED_MEM_SIZE)-gcheaptop;
 482 #ifdef SMEMM
 483   gcmem_mixed_usedmem += tomove;
 484 #endif
 485   // flush the sbstartbl
 486   BAMBOO_MEMSET_WH(&(gcsbstarttbl[gcreservedsb]), '\0',
 487     (BAMBOO_SHARED_MEM_SIZE/BAMBOO_SMEM_SIZE-(unsigned int)gcreservedsb)
 488     *sizeof(unsigned int));
 489   if(tomove == 0) {
 490     gcheaptop = tmpheaptop;
 491   } else {
 492     // check how many blocks it acrosses
 493     unsigned int remain = tmpheaptop-gcbaseva;
 494     //number of the sblock
 495     unsigned int sb = remain/BAMBOO_SMEM_SIZE+(unsigned int)gcreservedsb;
 496     unsigned int b = 0;  // number of the block
 497     BLOCKINDEX(tmpheaptop, &b);
 498     // check the remaining space in this block
 499     bound = (BAMBOO_SMEM_SIZE);
 500     if(remain < (BAMBOO_LARGE_SMEM_BOUND)) {
 501       bound = (BAMBOO_SMEM_SIZE_L);
 502     }
 503     remain = bound - remain%bound;
 504
 505     size = 0;
 506     unsigned int isize = 0;
 507     unsigned int host = 0;
 508     unsigned int ptr = 0;
 509     unsigned int base = tmpheaptop;
 510     unsigned int cpysize = 0;
 511     remain -= BAMBOO_CACHE_LINE_SIZE;
 512     tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
 513     gc_lobjqueueinit4_I();
 514     while(gc_lobjmoreItems4_I()) {
 515       ptr = (unsigned int)(gc_lobjdequeue4_I(&size, &host));
 516       ALIGNSIZE(size, &isize);
 517       if(remain >= isize) {
 518         remain -= isize;
 519         // move the large obj
 520         if((unsigned int)gcheaptop < (unsigned int)(tmpheaptop+size)) {
 521           memmove(tmpheaptop, gcheaptop, size);
 522         } else {
 523           memcpy(tmpheaptop, gcheaptop, size);
 524         }
 525         // fill the remaining space with -2 padding
 526         BAMBOO_MEMSET_WH(tmpheaptop+size, -2, isize-size);
 527
 528         gcheaptop += size;
 529         cpysize += isize;
 530         // cache the mapping info
 531         gcmappingtbl[OBJMAPPINGINDEX((unsigned int)ptr)]=(unsigned int)tmpheaptop;
 532         tmpheaptop += isize;
 533
 534         // update bamboo_smemtbl
 535         bamboo_smemtbl[b] += isize;
 536       } else {
 537         // this object acrosses blocks
 538         if(cpysize > 0) {
 539           CLOSEBLOCK(base, cpysize+BAMBOO_CACHE_LINE_SIZE);
 540           bamboo_smemtbl[b] += BAMBOO_CACHE_LINE_SIZE;
 541           cpysize = 0;
 542           base = tmpheaptop;
 543           if(remain == 0) {
 544             remain = ((tmpheaptop-gcbaseva)<(BAMBOO_LARGE_SMEM_BOUND)) ?
 545               BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
 546           }
 547           remain -= BAMBOO_CACHE_LINE_SIZE;
 548           tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
 549           BLOCKINDEX(tmpheaptop, &b);
 550           sb = (unsigned int)(tmpheaptop-gcbaseva)/(BAMBOO_SMEM_SIZE)+gcreservedsb;
 551         }
 552
 553         // move the large obj
 554         if((unsigned int)gcheaptop < (unsigned int)(tmpheaptop+size)) {
 555           memmove(tmpheaptop, gcheaptop, size);
 556         } else {
 557           memcpy(tmpheaptop, gcheaptop, size);
 558         }
 559         // fill the remaining space with -2 padding
 560         BAMBOO_MEMSET_WH(tmpheaptop+size, -2, isize-size);
 561         gcheaptop += size;
 562         // cache the mapping info
 563         gcmappingtbl[OBJMAPPINGINDEX((unsigned int)ptr)]=(unsigned int)tmpheaptop;
 564         tmpheaptop += isize;
 565
 566         // set the gcsbstarttbl and bamboo_smemtbl
 567         unsigned int tmpsbs=1+(unsigned int)(isize-remain-1)/BAMBOO_SMEM_SIZE;
 568         for(int k = 1; k < tmpsbs; k++) {
 569           gcsbstarttbl[sb+k] = -1;
 570         }
 571         sb += tmpsbs;
 572         bound = (b<NUMCORES4GC) ? BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
 573         BLOCKINDEX(tmpheaptop-1, &tmpsbs);
 574         for(; b < tmpsbs; b++) {
 575           bamboo_smemtbl[b] = bound;
 576           if(b==NUMCORES4GC-1) {
 577             bound = BAMBOO_SMEM_SIZE;
 578           }
 579         }
 580         if(((unsigned int)(isize-remain)%(BAMBOO_SMEM_SIZE)) == 0) {
 581           gcsbstarttbl[sb] = -1;
 582           remain = ((tmpheaptop-gcbaseva)<(BAMBOO_LARGE_SMEM_BOUND)) ?
 583             BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
 584           bamboo_smemtbl[b] = bound;
 585         } else {
 586           gcsbstarttbl[sb] = (int)tmpheaptop;
 587           remain = tmpheaptop-gcbaseva;
 588           bamboo_smemtbl[b] = remain%bound;
 589           remain = bound - bamboo_smemtbl[b];
 590         }
 591
 592         CLOSEBLOCK(base, isize+BAMBOO_CACHE_LINE_SIZE);
 593         cpysize = 0;
 594         base = tmpheaptop;
 595         if(remain == BAMBOO_CACHE_LINE_SIZE) {
 596           // fill with 0 in case
 597           BAMBOO_MEMSET_WH(tmpheaptop, '\0', remain);
 598         }
 599         remain -= BAMBOO_CACHE_LINE_SIZE;
 600         tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
 601       }
 602     }
 603
 604     if(cpysize > 0) {
 605       CLOSEBLOCK(base, cpysize+BAMBOO_CACHE_LINE_SIZE);
 606       bamboo_smemtbl[b] += BAMBOO_CACHE_LINE_SIZE;
 607     } else {
 608       tmpheaptop -= BAMBOO_CACHE_LINE_SIZE;
 609     }
 610     gcheaptop = tmpheaptop;
 611   }
 612
 613   bamboo_free_block = 0;
 614   unsigned int tbound = 0;
 615   do {
 616     tbound=(bamboo_free_block<NUMCORES4GC)?BAMBOO_SMEM_SIZE_L:BAMBOO_SMEM_SIZE;
 617     if(bamboo_smemtbl[bamboo_free_block] == tbound) {
 618       bamboo_free_block++;
 619     } else {
 620       // the first non-full partition
 621       break;
 622     }
 623   } while(true);
 624
 625   GCPROFILE_RECORD_SPACE();
 626 }
 627
 628 void gc_collect(struct garbagelist * stackptr) {
 629   gcprocessing = true;
 630   // inform the master that this core is at a gc safe point and is ready to
 631   // do gc
 632   send_msg_4(STARTUPCORE,GCFINISHPRE,BAMBOO_NUM_OF_CORE,self_numsendobjs,self_numreceiveobjs);
 633
 634   // core collector routine
 635   //wait for init phase
 636   WAITFORGCPHASE(INITPHASE);
 637
 638   GC_PRINTF("Do initGC\n");
 639   initGC();
 640   CACHEADAPT_GC(true);
 641   //send init finish msg to core coordinator
 642   send_msg_2(STARTUPCORE,GCFINISHINIT,BAMBOO_NUM_OF_CORE);
 643
 644   //wait for mark phase
 645   WAITFORGCPHASE(MARKPHASE);
 646
 647   GC_PRINTF("Start mark phase\n");
 648   mark(true, stackptr);
 649   GC_PRINTF("Finish mark phase, start compact phase\n");
 650   compact();
 651   GC_PRINTF("Finish compact phase\n");
 652
 653   WAITFORGCPHASE(FLUSHPHASE);
 654
 655   GC_PRINTF("Start flush phase\n");
 656   GCPROFILE_INFO_2_MASTER();
 657   flush(stackptr);
 658   GC_PRINTF("Finish flush phase\n");
 659
 660   CACHEADAPT_PHASE_CLIENT();
 661
 662   // invalidate all shared mem pointers
 663   bamboo_cur_msp = NULL;
 664   bamboo_smem_size = 0;
 665   bamboo_smem_zero_top = NULL;
 666   gcflag = false;
 667
 668   WAITFORGCPHASE(FINISHPHASE);
 669
 670   GC_PRINTF("Finish gc! \n");
 671 }
 672
 673 void gc_nocollect(struct garbagelist * stackptr) {
 674   gcprocessing = true;
 675   // inform the master that this core is at a gc safe point and is ready to
 676   // do gc
 677   send_msg_4(STARTUPCORE,GCFINISHPRE,BAMBOO_NUM_OF_CORE,self_numsendobjs,self_numreceiveobjs);
 678
 679   WAITFORGCPHASE(INITPHASE);
 680
 681   GC_PRINTF("Do initGC\n");
 682   initGC();
 683   CACHEADAPT_GC(true);
 684   //send init finish msg to core coordinator
 685   send_msg_2(STARTUPCORE,GCFINISHINIT,BAMBOO_NUM_OF_CORE);
 686
 687   WAITFORGCPHASE(MARKPHASE);
 688
 689   GC_PRINTF("Start mark phase\n");
 690   mark(true, stackptr);
 691   GC_PRINTF("Finish mark phase, wait for flush\n");
 692
 693   // non-gc core collector routine
 694   WAITFORGCPHASE(FLUSHPHASE);
 695
 696   GC_PRINTF("Start flush phase\n");
 697   GCPROFILE_INFO_2_MASTER();
 698   flush(stackptr);
 699   GC_PRINTF("Finish flush phase\n");
 700
 701   CACHEADAPT_PHASE_CLIENT();
 702
 703   // invalidate all shared mem pointers
 704   bamboo_cur_msp = NULL;
 705   bamboo_smem_size = 0;
 706   bamboo_smem_zero_top = NULL;
 707
 708   gcflag = false;
 709   WAITFORGCPHASE(FINISHPHASE);
 710
 711   GC_PRINTF("Finish gc! \n");
 712 }
 713
 714 void master_mark(struct garbagelist *stackptr) {
 715   bool isfirst = true;
 716
 717   GC_PRINTF("Start mark phase \n");
 718   GC_SEND_MSG_1_TO_CLIENT(GCSTART);
 719   gcphase = MARKPHASE;
 720   // mark phase
 721
 722   while(MARKPHASE == gcphase) {
 723     mark(isfirst, stackptr);
 724     isfirst=false;
 725     // check gcstatus
 726     checkMarkStatus();
 727   }
 728 }
 729
 730 void master_getlargeobjs() {
 731   // send msgs to all cores requiring large objs info
 732   // Note: only need to ask gc cores, non-gc cores do not host any objs
 733   numconfirm = NUMCORES4GC - 1;
 734   for(int i = 1; i < NUMCORES4GC; i++) {
 735     send_msg_1(i,GCLOBJREQUEST);
 736   }
 737   gcloads[BAMBOO_NUM_OF_CORE] = gccurr_heaptop;
 738   //spin until we have all responses
 739   while(numconfirm!=0) ;
 740
 741   // check the heaptop
 742   if(gcheaptop < gcmarkedptrbound) {
 743     gcheaptop = gcmarkedptrbound;
 744   }
 745   GCPROFILE_ITEM();
 746   GC_PRINTF("prepare to cache large objs \n");
 747
 748   // cache all large objs
 749   BAMBOO_ASSERTMSG(cacheLObjs(), "Not enough space to cache large objects\n");
 750 }
 751
 752 void master_compact() {
 753   // predict number of blocks to fill for each core
 754   unsigned int tmpheaptop = 0;
 755   int numpbc = loadbalance(&tmpheaptop);
 756
 757   numpbc = (BAMBOO_SHARED_MEM_SIZE)/(BAMBOO_SMEM_SIZE);
 758   GC_PRINTF("mark phase finished \n");
 759
 760   tmpheaptop = gcbaseva + (BAMBOO_SHARED_MEM_SIZE);
 761   for(int i = 0; i < NUMCORES4GC; i++) {
 762     unsigned int tmpcoreptr = 0;
 763     BASEPTR(i, numpbc, &tmpcoreptr);
 764     // init some data strutures for compact phase
 765     gcloads[i] = 0;
 766     gcfilledblocks[i] = 0;
 767     gcrequiredmems[i] = 0;
 768     gccorestatus[i] = 1;
 769     //send start compact messages to all cores
 770     //TODO bug here, do not know if the direction is positive or negtive?
 771     if (tmpcoreptr < tmpheaptop) {
 772       gcstopblock[i] = numpbc + 1;
 773       if(i != STARTUPCORE) {
 774         send_msg_2(i, GCSTARTCOMPACT, numpbc+1);
 775       } else {
 776         gcblock2fill = numpbc+1;
 777       }
 778     } else {
 779       gcstopblock[i] = numpbc;
 780       if(i != STARTUPCORE) {
 781         send_msg_2(i, GCSTARTCOMPACT, numpbc);
 782       } else {
 783         gcblock2fill = numpbc;
 784       }
 785     }
 786   }
 787   BAMBOO_CACHE_MF();
 788   GCPROFILE_ITEM();
 789   // compact phase
 790   struct moveHelper * orig = (struct moveHelper *)RUNMALLOC(sizeof(struct moveHelper));
 791   struct moveHelper * to = (struct moveHelper *)RUNMALLOC(sizeof(struct moveHelper));
 792   compact_master(orig, to);
 793   GCPROFILE_ITEM();
 794   GC_PRINTF("prepare to move large objs \n");
 795   // move largeObjs
 796   moveLObjs();
 797   GC_PRINTF("compact phase finished \n");
 798   RUNFREE(orig);
 799   RUNFREE(to);
 800 }
 801
 802 void master_updaterefs(struct garbagelist * stackptr) {
 803   gcphase = FLUSHPHASE;
 804   GC_SEND_MSG_1_TO_CLIENT(GCSTARTFLUSH);
 805   GCPROFILE_ITEM();
 806   GC_PRINTF("Start flush phase \n");
 807   // flush phase
 808   flush(stackptr);
 809   // now the master core need to decide the new cache strategy
 810   CACHEADAPT_MASTER();
 811   GC_CHECK_ALL_CORE_STATUS(FLUSHPHASE==gcphase);
 812   GC_PRINTF("Finish flush phase \n");
 813 }
 814
 815 void master_finish() {
 816   gcphase = FINISHPHASE;
 817
 818   // invalidate all shared mem pointers
 819   // put it here as it takes time to inform all the other cores to
 820   // finish gc and it might cause problem when some core resumes
 821   // mutator earlier than the other cores
 822   bamboo_cur_msp = NULL;
 823   bamboo_smem_size = 0;
 824   bamboo_smem_zero_top = NULL;
 825
 826   GCPROFILE_END();
 827   gcflag = false;
 828   GC_SEND_MSG_1_TO_CLIENT(GCFINISH);
 829
 830   gcprocessing = false;
 831   if(gcflag) {
 832     // inform other cores to stop and wait for gc
 833     gcprecheck = true;
 834     for(int i = 0; i < NUMCORESACTIVE; i++) {
 835       // reuse the gcnumsendobjs & gcnumreceiveobjs
 836       gcnumsendobjs[0][i] = 0;
 837       gcnumreceiveobjs[0][i] = 0;
 838     }
 839     GC_SEND_MSG_1_TO_CLIENT(GCSTARTPRE);
 840   }
 841 }
 842
 843 void gc_master(struct garbagelist * stackptr) {
 844   tprintf("start GC !!!!!!!!!!!!! \n");
 845   gcprocessing = true;
 846   gcphase = INITPHASE;
 847
 848   waitconfirm = false;
 849   numconfirm = 0;
 850   initGC();
 851   GC_SEND_MSG_1_TO_CLIENT(GCSTARTINIT);
 852   CACHEADAPT_GC(true);
 853   GC_PRINTF("Check core status \n");
 854   GC_CHECK_ALL_CORE_STATUS(true);
 855   GCPROFILE_ITEM();
 856   CACHEADAPT_OUTPUT_CACHE_SAMPLING();
 857
 858   // do mark phase
 859   master_mark(stackptr);
 860
 861   // get large objects from all cores
 862   master_getlargeobjs();
 863
 864   // compact the heap
 865   master_compact();
 866
 867   // update the references
 868   master_updaterefs(stackptr);
 869
 870   // do cache adaptation
 871   CACHEADAPT_PHASE_MASTER();
 872
 873   // do finish up stuff
 874   master_finish();
 875
 876   GC_PRINTF("gc finished   \n");
 877   tprintf("finish GC ! %d \n", gcflag);
 878 }
 879
 880 void pregccheck() {
 881   while(true) {
 882     BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
 883     gcnumsendobjs[0][BAMBOO_NUM_OF_CORE] = self_numsendobjs;
 884     gcnumreceiveobjs[0][BAMBOO_NUM_OF_CORE] = self_numreceiveobjs;
 885     int sumsendobj = 0;
 886     for(int i = 0; i < NUMCORESACTIVE; i++) {
 887       sumsendobj += gcnumsendobjs[0][i];
 888     }
 889     for(int i = 0; i < NUMCORESACTIVE; i++) {
 890       sumsendobj -= gcnumreceiveobjs[0][i];
 891     }
 892     if(0 != sumsendobj) {
 893       // there were still some msgs on the fly, wait until there
 894       // are some update pregc information coming and check it again
 895       gcprecheck = false;
 896       BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
 897
 898       while(!gcprecheck) ;
 899     } else {
 900       BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
 901       return;
 902     }
 903   }
 904 }
 905
 906 void pregcprocessing() {
 907 #if defined(GC_CACHE_ADAPT)&&defined(GC_CACHE_SAMPLING)
 908   // disable the timer interrupt
 909   bamboo_mask_timer_intr();
 910 #endif
 911   // Zero out the remaining memory here because for the GC_CACHE_ADAPT version,
 912   // we need to make sure during the gcinit phase the shared heap is not
 913   // touched. Otherwise, there would be problem when adapt the cache strategy.
 914   BAMBOO_CLOSE_CUR_MSP();
 915 #ifdef GC_FLUSH_DTLB
 916   if(gc_num_flush_dtlb < GC_NUM_FLUSH_DTLB) {
 917     BAMBOO_CLEAN_DTLB();
 918     gc_num_flush_dtlb++;
 919   }
 920 #endif
 921 #if defined(GC_CACHE_ADAPT)&&defined(GC_CACHE_SAMPLING)
 922   // get the sampling data
 923   bamboo_output_dtlb_sampling();
 924 #endif
 925 }
 926
 927 void postgcprocessing() {
 928 #if defined(GC_CACHE_ADAPT)&&defined(GC_CACHE_SAMPLING)
 929   // enable the timer interrupt
 930   bamboo_tile_timer_set_next_event(GC_TILE_TIMER_EVENT_SETTING);
 931   bamboo_unmask_timer_intr();
 932 #endif
 933 }
 934
 935 bool gc(struct garbagelist * stackptr) {
 936   // check if do gc
 937   if(!gcflag) {
 938     gcprocessing = false;
 939     return false;
 940   }
 941
 942   // core coordinator routine
 943   if(0 == BAMBOO_NUM_OF_CORE) {
 944     GC_PRINTF("Check if we can do gc or not\n");
 945     gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
 946     if(!gc_checkAllCoreStatus()) {
 947       // some of the cores are still executing the mutator and did not reach
 948       // some gc safe point, therefore it is not ready to do gc
 949       gcflag = true;
 950       return false;
 951     } else {
 952       GCPROFILE_START();
 953       pregccheck();
 954     }
 955     GC_PRINTF("start gc! \n");
 956     pregcprocessing();
 957     gc_master(stackptr);
 958   } else if(BAMBOO_NUM_OF_CORE < NUMCORES4GC) {
 959     pregcprocessing();
 960     gc_collect(stackptr);
 961   } else {
 962     pregcprocessing();
 963     gc_nocollect(stackptr);
 964   }
 965   postgcprocessing();
 966
 967   return true;
 968 }
 969
 970 #endif