Robust/src/Runtime/bamboo/multicoregarbage.c

   1 // TODO: DO NOT support tag!!!
   2 #ifdef MULTICORE_GC
   3 #include "runtime.h"
   4 #include "multicoregarbage.h"
   5 #include "multicoregcmark.h"
   6 #include "gcqueue.h"
   7 #include "multicoregccompact.h"
   8 #include "multicoregcflush.h"
   9 #include "multicoreruntime.h"
  10 #include "multicoregcprofile.h"
  11 #include "gcqueue.h"
  12 #ifdef MGC
  13 #include "runtime_arch.h"
  14
  15 extern int corenum;
  16 #endif
  17
  18 #ifdef SMEMM
  19 extern unsigned int gcmem_mixed_threshold;
  20 extern unsigned int gcmem_mixed_usedmem;
  21 #endif // SMEMM
  22
  23 #ifdef GC_DEBUG
  24 // dump whole mem in blocks
  25 void dumpSMem() {
  26   int block = 0;
  27   int sblock = 0;
  28   unsigned int j = 0;
  29   void * i = 0;
  30   int coren = 0;
  31   int x = 0;
  32   int y = 0;
  33   printf("(%x,%x) Dump shared mem: \n",udn_tile_coord_x(),udn_tile_coord_y());
  34   // reserved blocks for sblocktbl
  35   printf("(%x,%x) ++++ reserved sblocks ++++ \n", udn_tile_coord_x(),
  36          udn_tile_coord_y());
  37   for(i=BAMBOO_BASE_VA; (unsinged int)i<(unsigned int)gcbaseva; i+= 4*16) {
  38     printf("(%x,%x) 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x \n",
  39            udn_tile_coord_x(), udn_tile_coord_y(),
  40            *((int *)(i)), *((int *)(i + 4)),
  41            *((int *)(i + 4*2)), *((int *)(i + 4*3)),
  42            *((int *)(i + 4*4)), *((int *)(i + 4*5)),
  43            *((int *)(i + 4*6)), *((int *)(i + 4*7)),
  44            *((int *)(i + 4*8)), *((int *)(i + 4*9)),
  45            *((int *)(i + 4*10)), *((int *)(i + 4*11)),
  46            *((int *)(i + 4*12)), *((int *)(i + 4*13)),
  47            *((int *)(i + 4*14)), *((int *)(i + 4*15)));
  48   }
  49   sblock = gcreservedsb;
  50   bool advanceblock = false;
  51   // remaining memory
  52   for(i=gcbaseva; (unsigned int)i<(unsigned int)(gcbaseva+BAMBOO_SHARED_MEM_SIZE); i+=4*16) {
  53     advanceblock = false;
  54     // computing sblock # and block #, core coordinate (x,y) also
  55     if(j%((BAMBOO_SMEM_SIZE)/(4*16)) == 0) {
  56       // finished a sblock
  57       if(j < ((BAMBOO_LARGE_SMEM_BOUND)/(4*16))) {
  58         if((j > 0) && (j%((BAMBOO_SMEM_SIZE_L)/(4*16)) == 0)) {
  59           // finished a block
  60           block++;
  61           advanceblock = true;
  62         }
  63       } else {
  64         // finished a block
  65         block++;
  66         advanceblock = true;
  67       }
  68       // compute core #
  69       if(advanceblock) {
  70         coren = gc_block2core[block%(NUMCORES4GC*2)];
  71       }
  72       // compute core coordinate
  73       x = BAMBOO_COORDS_X(coren);
  74       y = BAMBOO_COORDS_Y(coren);
  75       printf("(%x,%x) ==== %d, %d : core (%d,%d), saddr %x====\n",
  76              udn_tile_coord_x(), udn_tile_coord_y(),
  77              block, sblock++, x, y,
  78              (sblock-1)*(BAMBOO_SMEM_SIZE)+gcbaseva);
  79     }
  80     j++;
  81     printf("(%x,%x) 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x \n",
  82            udn_tile_coord_x(), udn_tile_coord_y(),
  83            *((int *)(i)), *((int *)(i + 4)),
  84            *((int *)(i + 4*2)), *((int *)(i + 4*3)),
  85            *((int *)(i + 4*4)), *((int *)(i + 4*5)),
  86            *((int *)(i + 4*6)), *((int *)(i + 4*7)),
  87            *((int *)(i + 4*8)), *((int *)(i + 4*9)),
  88            *((int *)(i + 4*10)), *((int *)(i + 4*11)),
  89            *((int *)(i + 4*12)), *((int *)(i + 4*13)),
  90            *((int *)(i + 4*14)), *((int *)(i + 4*15)));
  91   }
  92   printf("(%x,%x) \n", udn_tile_coord_x(), udn_tile_coord_y());
  93 }
  94 #endif
  95
  96 void initmulticoregcdata() {
  97   if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
  98     // startup core to initialize corestatus[]
  99     for(int i = 0; i < NUMCORESACTIVE; i++) {
 100       gccorestatus[i] = 1;
 101       gcnumsendobjs[0][i] = gcnumsendobjs[1][i] = 0;
 102       gcnumreceiveobjs[0][i] = gcnumreceiveobjs[1][i] = 0;
 103     }
 104     for(int i = 0; i < NUMCORES4GC; i++) {
 105       gcloads[i] = 0;
 106       gcrequiredmems[i] = 0;
 107       gcstopblock[i] = 0;
 108       gcfilledblocks[i] = 0;
 109     }
 110   }
 111
 112   bamboo_smem_zero_top = NULL;
 113   gcflag = false;
 114   gcprocessing = false;
 115   gcphase = FINISHPHASE;
 116   gcprecheck = true;
 117   gccurr_heaptop = 0;
 118   gcself_numsendobjs = 0;
 119   gcself_numreceiveobjs = 0;
 120   gcmarkedptrbound = 0;
 121   gcforwardobjtbl = allocateMGCHash_I(20, 3);
 122   gcnumlobjs = 0;
 123   gcheaptop = 0;
 124   gctopcore = 0;
 125   gctopblock = 0;
 126   gcmovestartaddr = 0;
 127   gctomove = false;
 128   gcmovepending = 0;
 129   gcblock2fill = 0;
 130 #ifdef SMEMM
 131   gcmem_mixed_threshold = (unsigned int)((BAMBOO_SHARED_MEM_SIZE
 132                 -bamboo_reserved_smem*BAMBOO_SMEM_SIZE)*0.8);
 133   gcmem_mixed_usedmem = 0;
 134 #endif
 135 #ifdef MGC_SPEC
 136   gc_profile_flag = false;
 137 #endif
 138 #ifdef GC_FLUSH_DTLB
 139   gc_num_flush_dtlb = 0;
 140 #endif
 141   gc_localheap_s = false;
 142 #ifdef GC_CACHE_ADAPT
 143   gccachestage = false;
 144 #endif
 145
 146   INIT_MULTICORE_GCPROFILE_DATA();
 147 }
 148
 149 void dismulticoregcdata() {
 150   freeMGCHash(gcforwardobjtbl);
 151 }
 152
 153 void initGC() {
 154   if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
 155     for(int i = 0; i < NUMCORES4GC; i++) {
 156       gccorestatus[i] = 1;
 157       gcnumsendobjs[0][i] = gcnumsendobjs[1][i] = 0;
 158       gcnumreceiveobjs[0][i] = gcnumreceiveobjs[1][i] = 0;
 159       gcloads[i] = 0;
 160       gcrequiredmems[i] = 0;
 161       gcfilledblocks[i] = 0;
 162       gcstopblock[i] = 0;
 163     }
 164     for(int i = NUMCORES4GC; i < NUMCORESACTIVE; i++) {
 165       gccorestatus[i] = 1;
 166       gcnumsendobjs[0][i] = gcnumsendobjs[1][i] = 0;
 167       gcnumreceiveobjs[0][i] = gcnumreceiveobjs[1][i] = 0;
 168     }
 169     gcheaptop = 0;
 170     gctopcore = 0;
 171     gctopblock = 0;
 172     gcnumsrobjs_index = 0;
 173   }
 174   gcself_numsendobjs = 0;
 175   gcself_numreceiveobjs = 0;
 176   gcmarkedptrbound = 0;
 177   gcnumlobjs = 0;
 178   gcmovestartaddr = 0;
 179   gctomove = false;
 180   gcblock2fill = 0;
 181   gcmovepending = 0;
 182   gccurr_heaptop = 0;
 183   gcdstcore = 0;
 184
 185   gc_queueinit();
 186
 187   freeMGCHash(gcforwardobjtbl);
 188   gcforwardobjtbl = allocateMGCHash(20, 3);
 189
 190   GCPROFILE_INIT();
 191 }
 192
 193 bool gc_checkAllCoreStatus() {
 194   BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
 195   for(int i = 0; i < NUMCORESACTIVE; i++) {
 196     if(gccorestatus[i] != 0) {
 197       BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
 198       return false;
 199     }
 200   }
 201   BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
 202   return true;
 203 }
 204
 205 // NOTE: should be invoked with interrupts turned off
 206 bool gc_checkAllCoreStatus_I() {
 207   for(int i = 0; i < NUMCORESACTIVE; i++) {
 208     if(gccorestatus[i] != 0) {
 209       return false;
 210     }
 211   }
 212   return true;
 213 }
 214
 215 INLINE void checkMarkStatus() {
 216   if((!waitconfirm) ||
 217       (waitconfirm && (numconfirm == 0))) {
 218     unsigned int entry_index = 0;
 219     if(waitconfirm) {
 220       // phase 2
 221       entry_index = (gcnumsrobjs_index == 0) ? 1 : 0;
 222     } else {
 223       // phase 1
 224       entry_index = gcnumsrobjs_index;
 225     }
 226     BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
 227     gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
 228     gcnumsendobjs[entry_index][BAMBOO_NUM_OF_CORE] = gcself_numsendobjs;
 229     gcnumreceiveobjs[entry_index][BAMBOO_NUM_OF_CORE] = gcself_numreceiveobjs;
 230     // check the status of all cores
 231     if (gc_checkAllCoreStatus_I()) {
 232       // ask for confirm
 233       if(!waitconfirm) {
 234         // the first time found all cores stall
 235         // send out status confirm msg to all other cores
 236         // reset the corestatus array too
 237         waitconfirm = true;
 238         numconfirm = NUMCORESACTIVE - 1;
 239         BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
 240         GC_SEND_MSG_1_TO_CLIENT(GCMARKCONFIRM);
 241       } else {
 242         // Phase 2
 243         // check if the sum of send objs and receive obj are the same
 244         // yes->check if the info is the latest; no->go on executing
 245         unsigned int sumsendobj = 0;
 246         for(int i = 0; i < NUMCORESACTIVE; i++) {
 247           sumsendobj += gcnumsendobjs[gcnumsrobjs_index][i];
 248         }
 249         for(int i = 0; i < NUMCORESACTIVE; i++) {
 250           sumsendobj -= gcnumreceiveobjs[gcnumsrobjs_index][i];
 251         }
 252         if(0 == sumsendobj) {
 253           // Check if there are changes of the numsendobjs or numreceiveobjs on
 254           // each core
 255           bool ischanged = false;
 256           for(int i = 0; i < NUMCORESACTIVE; i++) {
 257             if((gcnumsendobjs[0][i] != gcnumsendobjs[1][i]) ||
 258                 (gcnumreceiveobjs[0][i] != gcnumreceiveobjs[1][i]) ) {
 259               ischanged = true;
 260               break;
 261             }
 262           }
 263           if(!ischanged) {
 264             // all the core status info are the latest,stop mark phase
 265             gcphase = COMPACTPHASE;
 266             // restore the gcstatus for all cores
 267             for(int i = 0; i < NUMCORESACTIVE; i++) {
 268               gccorestatus[i] = 1;
 269             }
 270           } else {
 271             // There were changes between phase 1 and phase 2, can not decide
 272             // whether the mark phase has been finished
 273             waitconfirm = false;
 274             // As it fails in phase 2, flip the entries
 275             gcnumsrobjs_index = (gcnumsrobjs_index == 0) ? 1 : 0;
 276           }
 277         } else {
 278           // There were changes between phase 1 and phase 2, can not decide
 279           // whether the mark phase has been finished
 280           waitconfirm = false;
 281           // As it fails in phase 2, flip the entries
 282           gcnumsrobjs_index = (gcnumsrobjs_index == 0) ? 1 : 0;
 283         }
 284         BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
 285       }
 286     } else {
 287       BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
 288     }
 289   }
 290 }
 291
 292 // compute load balance for all cores
 293 INLINE int loadbalance(unsigned int * heaptop) {
 294   // compute load balance
 295
 296   // get the total loads
 297   unsigned int tloads = gcloads[STARTUPCORE];
 298   for(int i = 1; i < NUMCORES4GC; i++) {
 299     tloads += gcloads[i];
 300   }
 301   *heaptop = gcbaseva + tloads;
 302
 303   unsigned int b = 0;
 304   BLOCKINDEX(*heaptop, &b);
 305   // num of blocks per core
 306   unsigned int numbpc = (unsigned int)b/(unsigned int)(NUMCORES4GC);
 307   gctopblock = b;
 308   RESIDECORE(heaptop, &gctopcore);
 309   return numbpc;
 310 }
 311
 312 // compute total mem size required and sort the lobjs in ascending order
 313 INLINE unsigned int sortLObjs() {
 314   unsigned int tmp_lobj = 0;
 315   unsigned int tmp_len = 0;
 316   unsigned int tmp_host = 0;
 317   unsigned int sumsize = 0;
 318
 319   gclobjtail2 = gclobjtail;
 320   gclobjtailindex2 = gclobjtailindex;
 321   // TODO USE QUICK SORT INSTEAD?
 322   while(gc_lobjmoreItems2_I()) {
 323     gc_lobjdequeue2_I();
 324     tmp_lobj = gclobjtail2->lobjs[gclobjtailindex2-1];
 325     tmp_host = gclobjtail2->hosts[gclobjtailindex2-1];
 326     tmp_len = gclobjtail2->lengths[gclobjtailindex2 - 1];
 327     sumsize += tmp_len;
 328     GCPROFILE_RECORD_LOBJ();
 329     unsigned int i = gclobjtailindex2-1;
 330     struct lobjpointerblock * tmp_block = gclobjtail2;
 331     // find the place to insert
 332     while(true) {
 333       if(i == 0) {
 334         if(tmp_block->prev == NULL) {
 335           break;
 336         }
 337         if(tmp_block->prev->lobjs[NUMLOBJPTRS-1] > tmp_lobj) {
 338           tmp_block->lobjs[i] = tmp_block->prev->lobjs[NUMLOBJPTRS-1];
 339           tmp_block->lengths[i] = tmp_block->prev->lengths[NUMLOBJPTRS-1];
 340           tmp_block->hosts[i] = tmp_block->prev->hosts[NUMLOBJPTRS-1];
 341           tmp_block = tmp_block->prev;
 342           i = NUMLOBJPTRS-1;
 343         } else {
 344           break;
 345         }  // if(tmp_block->prev->lobjs[NUMLOBJPTRS-1] < tmp_lobj)
 346       } else {
 347         if(tmp_block->lobjs[i-1] > tmp_lobj) {
 348           tmp_block->lobjs[i] = tmp_block->lobjs[i-1];
 349           tmp_block->lengths[i] = tmp_block->lengths[i-1];
 350           tmp_block->hosts[i] = tmp_block->hosts[i-1];
 351           i--;
 352         } else {
 353           break;
 354         }
 355       }
 356     }
 357     // insert it
 358     if(i != gclobjtailindex2 - 1) {
 359       tmp_block->lobjs[i] = tmp_lobj;
 360       tmp_block->lengths[i] = tmp_len;
 361       tmp_block->hosts[i] = tmp_host;
 362     }
 363   }
 364   return sumsize;
 365 }
 366
 367 INLINE bool cacheLObjs() {
 368   // check the total mem size need for large objs
 369   unsigned long long sumsize = 0;
 370   unsigned int size = 0;
 371
 372   sumsize = sortLObjs();
 373
 374   GCPROFILE_RECORD_LOBJSPACE();
 375
 376   // check if there are enough space to cache these large objs
 377   unsigned int dst = gcbaseva + (BAMBOO_SHARED_MEM_SIZE) -sumsize;
 378   if((unsigned long long)gcheaptop > (unsigned long long)dst) {
 379     // do not have enough room to cache large objs
 380     return false;
 381   }
 382
 383   gcheaptop = dst; // Note: record the start of cached lobjs with gcheaptop
 384   // cache the largeObjs to the top of the shared heap
 385   dst = gcbaseva + (BAMBOO_SHARED_MEM_SIZE);
 386   while(gc_lobjmoreItems3_I()) {
 387     gc_lobjdequeue3_I();
 388     size = gclobjtail2->lengths[gclobjtailindex2];
 389     // set the mark field to , indicating that this obj has been moved
 390     // and need to be flushed
 391     ((struct ___Object___ *)(gclobjtail2->lobjs[gclobjtailindex2]))->marked =
 392       COMPACTED;
 393     dst -= size;
 394     if((unsigned int)dst <
 395         (unsigned int)(gclobjtail2->lobjs[gclobjtailindex2]+size)) {
 396       memmove(dst, gclobjtail2->lobjs[gclobjtailindex2], size);
 397     } else {
 398       memcpy(dst, gclobjtail2->lobjs[gclobjtailindex2], size);
 399     }
 400   }
 401   return true;
 402 }
 403
 404 // update the bmmboo_smemtbl to record current shared mem usage
 405 void updateSmemTbl(unsigned int coren, unsigned int localtop) {
 406   unsigned int ltopcore = 0;
 407   unsigned int bound = BAMBOO_SMEM_SIZE_L;
 408   BLOCKINDEX(localtop, &ltopcore);
 409   if((unsigned int)localtop>=(unsigned int)(gcbaseva+BAMBOO_LARGE_SMEM_BOUND)){
 410     bound = BAMBOO_SMEM_SIZE;
 411   }
 412   unsigned int load = (unsigned int)(localtop-gcbaseva)%(unsigned int)bound;
 413   unsigned int toset = 0;
 414   for(int j=0; 1; j++) {
 415     for(int i=0; i<2; i++) {
 416       toset = gc_core2block[2*coren+i]+(unsigned int)(NUMCORES4GC*2)*j;
 417       if(toset < ltopcore) {
 418         bamboo_smemtbl[toset]=(toset<NUMCORES4GC) ? BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
 419 #ifdef SMEMM
 420         gcmem_mixed_usedmem += bamboo_smemtbl[toset];
 421 #endif
 422       } else if(toset == ltopcore) {
 423         bamboo_smemtbl[toset] = load;
 424 #ifdef SMEMM
 425         gcmem_mixed_usedmem += bamboo_smemtbl[toset];
 426 #endif
 427         return;
 428       } else {
 429         return;
 430       }
 431     }
 432   }
 433 }
 434
 435 INLINE unsigned int checkCurrHeapTop() {
 436   // update the smemtbl
 437   BAMBOO_MEMSET_WH(bamboo_smemtbl, 0, sizeof(int)*gcnumblock);
 438   // flush all gcloads to indicate the real heap top on one core
 439   // previous it represents the next available ptr on a core
 440   if(((unsigned int)gcloads[0] > (unsigned int)(gcbaseva+BAMBOO_SMEM_SIZE_L))
 441      && (((unsigned int)gcloads[0]%(BAMBOO_SMEM_SIZE)) == 0)) {
 442     // edge of a block, check if this is exactly the heaptop
 443     BASEPTR(0, gcfilledblocks[0]-1, &(gcloads[0]));
 444     gcloads[0]+=(gcfilledblocks[0]>1?(BAMBOO_SMEM_SIZE):(BAMBOO_SMEM_SIZE_L));
 445   }
 446   updateSmemTbl(0, gcloads[0]);
 447   for(int i = 1; i < NUMCORES4GC; i++) {
 448     unsigned int tmptop = 0;
 449     if((gcfilledblocks[i] > 0)
 450        && (((unsigned int)gcloads[i] % (BAMBOO_SMEM_SIZE)) == 0)) {
 451       // edge of a block, check if this is exactly the heaptop
 452       BASEPTR(i, gcfilledblocks[i]-1, &gcloads[i]);
 453       gcloads[i] +=
 454         (gcfilledblocks[i]>1?(BAMBOO_SMEM_SIZE):(BAMBOO_SMEM_SIZE_L));
 455       tmptop = gcloads[i];
 456     }
 457     updateSmemTbl(i, gcloads[i]);
 458   }
 459
 460   // find current heap top
 461   // TODO
 462   // a bug here: when using local allocation, directly move large objects
 463   // to the highest free chunk might not be memory efficient
 464   unsigned int tmpheaptop = 0;
 465   for(int i = gcnumblock-1; i >= 0; i--) {
 466     if(bamboo_smemtbl[i] > 0) {
 467       return gcbaseva+bamboo_smemtbl[i]+((i<NUMCORES4GC) ?
 468         (BAMBOO_SMEM_SIZE_L*i) :
 469         (BAMBOO_SMEM_SIZE*(i-NUMCORES4GC)+BAMBOO_LARGE_SMEM_BOUND));
 470     }
 471   }
 472   return gcbaseva;
 473 }
 474
 475 INLINE void moveLObjs() {
 476 #ifdef SMEMM
 477   // update the gcmem_mixed_usedmem
 478   gcmem_mixed_usedmem = 0;
 479 #endif
 480   unsigned int size = 0;
 481   unsigned int bound = 0;
 482   unsigned int tmpheaptop = checkCurrHeapTop();
 483
 484   // move large objs from gcheaptop to tmpheaptop
 485   // write the header first
 486   unsigned int tomove = gcbaseva+(BAMBOO_SHARED_MEM_SIZE)-gcheaptop;
 487 #ifdef SMEMM
 488   gcmem_mixed_usedmem += tomove;
 489 #endif
 490   // flush the sbstartbl
 491   BAMBOO_MEMSET_WH(&(gcsbstarttbl[gcreservedsb]), '\0',
 492     (BAMBOO_SHARED_MEM_SIZE/BAMBOO_SMEM_SIZE-(unsigned int)gcreservedsb)
 493     *sizeof(unsigned int));
 494   if(tomove == 0) {
 495     gcheaptop = tmpheaptop;
 496   } else {
 497     // check how many blocks it acrosses
 498     unsigned int remain = tmpheaptop-gcbaseva;
 499     //number of the sblock
 500     unsigned int sb = remain/BAMBOO_SMEM_SIZE+(unsigned int)gcreservedsb;
 501     unsigned int b = 0;  // number of the block
 502     BLOCKINDEX(tmpheaptop, &b);
 503     // check the remaining space in this block
 504     bound = (BAMBOO_SMEM_SIZE);
 505     if(remain < (BAMBOO_LARGE_SMEM_BOUND)) {
 506       bound = (BAMBOO_SMEM_SIZE_L);
 507     }
 508     remain = bound - remain%bound;
 509
 510     size = 0;
 511     unsigned int isize = 0;
 512     unsigned int host = 0;
 513     unsigned int ptr = 0;
 514     unsigned int base = tmpheaptop;
 515     unsigned int cpysize = 0;
 516     remain -= BAMBOO_CACHE_LINE_SIZE;
 517     tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
 518     gc_lobjqueueinit4_I();
 519     while(gc_lobjmoreItems4_I()) {
 520       ptr = (unsigned int)(gc_lobjdequeue4_I(&size, &host));
 521       ALIGNSIZE(size, &isize);
 522       if(remain >= isize) {
 523         remain -= isize;
 524         // move the large obj
 525         if((unsigned int)gcheaptop < (unsigned int)(tmpheaptop+size)) {
 526           memmove(tmpheaptop, gcheaptop, size);
 527         } else {
 528           memcpy(tmpheaptop, gcheaptop, size);
 529         }
 530         // fill the remaining space with -2 padding
 531         BAMBOO_MEMSET_WH(tmpheaptop+size, -2, isize-size);
 532
 533         gcheaptop += size;
 534         cpysize += isize;
 535         // cache the mapping info
 536         gcmappingtbl[OBJMAPPINGINDEX((unsigned int)ptr)]=(unsigned int)tmpheaptop;
 537         tmpheaptop += isize;
 538
 539         // update bamboo_smemtbl
 540         bamboo_smemtbl[b] += isize;
 541       } else {
 542         // this object acrosses blocks
 543         if(cpysize > 0) {
 544           CLOSEBLOCK(base, cpysize+BAMBOO_CACHE_LINE_SIZE);
 545           bamboo_smemtbl[b] += BAMBOO_CACHE_LINE_SIZE;
 546           cpysize = 0;
 547           base = tmpheaptop;
 548           if(remain == 0) {
 549             remain = ((tmpheaptop-gcbaseva)<(BAMBOO_LARGE_SMEM_BOUND)) ?
 550               BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
 551           }
 552           remain -= BAMBOO_CACHE_LINE_SIZE;
 553           tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
 554           BLOCKINDEX(tmpheaptop, &b);
 555           sb = (unsigned int)(tmpheaptop-gcbaseva)/(BAMBOO_SMEM_SIZE)+gcreservedsb;
 556         }
 557
 558         // move the large obj
 559         if((unsigned int)gcheaptop < (unsigned int)(tmpheaptop+size)) {
 560           memmove(tmpheaptop, gcheaptop, size);
 561         } else {
 562           memcpy(tmpheaptop, gcheaptop, size);
 563         }
 564         // fill the remaining space with -2 padding
 565         BAMBOO_MEMSET_WH(tmpheaptop+size, -2, isize-size);
 566         gcheaptop += size;
 567         // cache the mapping info
 568         gcmappingtbl[OBJMAPPINGINDEX((unsigned int)ptr)]=(unsigned int)tmpheaptop;
 569         tmpheaptop += isize;
 570
 571         // set the gcsbstarttbl and bamboo_smemtbl
 572         unsigned int tmpsbs=1+(unsigned int)(isize-remain-1)/BAMBOO_SMEM_SIZE;
 573         for(int k = 1; k < tmpsbs; k++) {
 574           gcsbstarttbl[sb+k] = -1;
 575         }
 576         sb += tmpsbs;
 577         bound = (b<NUMCORES4GC) ? BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
 578         BLOCKINDEX(tmpheaptop-1, &tmpsbs);
 579         for(; b < tmpsbs; b++) {
 580           bamboo_smemtbl[b] = bound;
 581           if(b==NUMCORES4GC-1) {
 582             bound = BAMBOO_SMEM_SIZE;
 583           }
 584         }
 585         if(((unsigned int)(isize-remain)%(BAMBOO_SMEM_SIZE)) == 0) {
 586           gcsbstarttbl[sb] = -1;
 587           remain = ((tmpheaptop-gcbaseva)<(BAMBOO_LARGE_SMEM_BOUND)) ?
 588             BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
 589           bamboo_smemtbl[b] = bound;
 590         } else {
 591           gcsbstarttbl[sb] = (int)tmpheaptop;
 592           remain = tmpheaptop-gcbaseva;
 593           bamboo_smemtbl[b] = remain%bound;
 594           remain = bound - bamboo_smemtbl[b];
 595         }
 596
 597         CLOSEBLOCK(base, isize+BAMBOO_CACHE_LINE_SIZE);
 598         cpysize = 0;
 599         base = tmpheaptop;
 600         if(remain == BAMBOO_CACHE_LINE_SIZE) {
 601           // fill with 0 in case
 602           BAMBOO_MEMSET_WH(tmpheaptop, '\0', remain);
 603         }
 604         remain -= BAMBOO_CACHE_LINE_SIZE;
 605         tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
 606       }
 607     }
 608
 609     if(cpysize > 0) {
 610       CLOSEBLOCK(base, cpysize+BAMBOO_CACHE_LINE_SIZE);
 611       bamboo_smemtbl[b] += BAMBOO_CACHE_LINE_SIZE;
 612     } else {
 613       tmpheaptop -= BAMBOO_CACHE_LINE_SIZE;
 614     }
 615     gcheaptop = tmpheaptop;
 616   }
 617
 618   bamboo_free_block = 0;
 619   unsigned int tbound = 0;
 620   do {
 621     tbound=(bamboo_free_block<NUMCORES4GC)?BAMBOO_SMEM_SIZE_L:BAMBOO_SMEM_SIZE;
 622     if(bamboo_smemtbl[bamboo_free_block] == tbound) {
 623       bamboo_free_block++;
 624     } else {
 625       // the first non-full partition
 626       break;
 627     }
 628   } while(true);
 629
 630   GCPROFILE_RECORD_SPACE();
 631 }
 632
 633 void gc_collect(struct garbagelist * stackptr) {
 634   gcprocessing = true;
 635   // inform the master that this core is at a gc safe point and is ready to
 636   // do gc
 637   send_msg_4(STARTUPCORE,GCFINISHPRE,BAMBOO_NUM_OF_CORE,self_numsendobjs,self_numreceiveobjs);
 638
 639   // core collector routine
 640   //wait for init phase
 641   WAITFORGCPHASE(INITPHASE);
 642
 643   GC_PRINTF("Do initGC\n");
 644   initGC();
 645   CACHEADAPT_GC(true);
 646   //send init finish msg to core coordinator
 647   send_msg_2(STARTUPCORE,GCFINISHINIT,BAMBOO_NUM_OF_CORE);
 648
 649   //wait for mark phase
 650   WAITFORGCPHASE(MARKPHASE);
 651
 652   GC_PRINTF("Start mark phase\n");
 653   mark(true, stackptr);
 654   GC_PRINTF("Finish mark phase, start compact phase\n");
 655   compact();
 656   GC_PRINTF("Finish compact phase\n");
 657
 658   WAITFORGCPHASE(FLUSHPHASE);
 659
 660   GC_PRINTF("Start flush phase\n");
 661   GCPROFILE_INFO_2_MASTER();
 662   flush(stackptr);
 663   GC_PRINTF("Finish flush phase\n");
 664
 665   CACHEADAPT_PHASE_CLIENT();
 666
 667   // invalidate all shared mem pointers
 668   bamboo_cur_msp = NULL;
 669   bamboo_smem_size = 0;
 670   bamboo_smem_zero_top = NULL;
 671   gcflag = false;
 672
 673   WAITFORGCPHASE(FINISHPHASE);
 674
 675   GC_PRINTF("Finish gc! \n");
 676 }
 677
 678 void gc_nocollect(struct garbagelist * stackptr) {
 679   gcprocessing = true;
 680   // inform the master that this core is at a gc safe point and is ready to
 681   // do gc
 682   send_msg_4(STARTUPCORE,GCFINISHPRE,BAMBOO_NUM_OF_CORE,self_numsendobjs,self_numreceiveobjs);
 683
 684   WAITFORGCPHASE(INITPHASE);
 685
 686   GC_PRINTF("Do initGC\n");
 687   initGC();
 688   CACHEADAPT_GC(true);
 689   //send init finish msg to core coordinator
 690   send_msg_2(STARTUPCORE,GCFINISHINIT,BAMBOO_NUM_OF_CORE);
 691
 692   WAITFORGCPHASE(MARKPHASE);
 693
 694   GC_PRINTF("Start mark phase\n");
 695   mark(true, stackptr);
 696   GC_PRINTF("Finish mark phase, wait for flush\n");
 697
 698   // non-gc core collector routine
 699   WAITFORGCPHASE(FLUSHPHASE);
 700
 701   GC_PRINTF("Start flush phase\n");
 702   GCPROFILE_INFO_2_MASTER();
 703   flush(stackptr);
 704   GC_PRINTF("Finish flush phase\n");
 705
 706   CACHEADAPT_PHASE_CLIENT();
 707
 708   // invalidate all shared mem pointers
 709   bamboo_cur_msp = NULL;
 710   bamboo_smem_size = 0;
 711   bamboo_smem_zero_top = NULL;
 712
 713   gcflag = false;
 714   WAITFORGCPHASE(FINISHPHASE);
 715
 716   GC_PRINTF("Finish gc! \n");
 717 }
 718
 719 void master_mark(struct garbagelist *stackptr) {
 720   bool isfirst = true;
 721
 722   GC_PRINTF("Start mark phase \n");
 723   GC_SEND_MSG_1_TO_CLIENT(GCSTART);
 724   gcphase = MARKPHASE;
 725   // mark phase
 726
 727   while(MARKPHASE == gcphase) {
 728     mark(isfirst, stackptr);
 729     isfirst=false;
 730     // check gcstatus
 731     checkMarkStatus();
 732   }
 733 }
 734
 735 void master_getlargeobjs() {
 736   // send msgs to all cores requiring large objs info
 737   // Note: only need to ask gc cores, non-gc cores do not host any objs
 738   numconfirm = NUMCORES4GC - 1;
 739   for(int i = 1; i < NUMCORES4GC; i++) {
 740     send_msg_1(i,GCLOBJREQUEST);
 741   }
 742   gcloads[BAMBOO_NUM_OF_CORE] = gccurr_heaptop;
 743   //spin until we have all responses
 744   while(numconfirm!=0) ;
 745
 746   // check the heaptop
 747   if(gcheaptop < gcmarkedptrbound) {
 748     gcheaptop = gcmarkedptrbound;
 749   }
 750   GCPROFILE_ITEM();
 751   GC_PRINTF("prepare to cache large objs \n");
 752
 753   // cache all large objs
 754   BAMBOO_ASSERTMSG(cacheLObjs(), "Not enough space to cache large objects\n");
 755 }
 756
 757 void master_compact() {
 758   // predict number of blocks to fill for each core
 759   unsigned int tmpheaptop = 0;
 760   int numpbc = loadbalance(&tmpheaptop);
 761
 762   numpbc = (BAMBOO_SHARED_MEM_SIZE)/(BAMBOO_SMEM_SIZE);
 763   GC_PRINTF("mark phase finished \n");
 764
 765   tmpheaptop = gcbaseva + (BAMBOO_SHARED_MEM_SIZE);
 766   for(int i = 0; i < NUMCORES4GC; i++) {
 767     unsigned int tmpcoreptr = 0;
 768     BASEPTR(i, numpbc, &tmpcoreptr);
 769     // init some data strutures for compact phase
 770     gcloads[i] = 0;
 771     gcfilledblocks[i] = 0;
 772     gcrequiredmems[i] = 0;
 773     gccorestatus[i] = 1;
 774     //send start compact messages to all cores
 775     //TODO bug here, do not know if the direction is positive or negtive?
 776     if (tmpcoreptr < tmpheaptop) {
 777       gcstopblock[i] = numpbc + 1;
 778       if(i != STARTUPCORE) {
 779         send_msg_2(i, GCSTARTCOMPACT, numpbc+1);
 780       } else {
 781         gcblock2fill = numpbc+1;
 782       }
 783     } else {
 784       gcstopblock[i] = numpbc;
 785       if(i != STARTUPCORE) {
 786         send_msg_2(i, GCSTARTCOMPACT, numpbc);
 787       } else {
 788         gcblock2fill = numpbc;
 789       }
 790     }
 791   }
 792   BAMBOO_CACHE_MF();
 793   GCPROFILE_ITEM();
 794   // compact phase
 795   struct moveHelper * orig = (struct moveHelper *)RUNMALLOC(sizeof(struct moveHelper));
 796   struct moveHelper * to = (struct moveHelper *)RUNMALLOC(sizeof(struct moveHelper));
 797   compact_master(orig, to);
 798   GCPROFILE_ITEM();
 799   GC_PRINTF("prepare to move large objs \n");
 800   // move largeObjs
 801   moveLObjs();
 802   GC_PRINTF("compact phase finished \n");
 803   RUNFREE(orig);
 804   RUNFREE(to);
 805 }
 806
 807 void master_updaterefs(struct garbagelist * stackptr) {
 808   gcphase = FLUSHPHASE;
 809   GC_SEND_MSG_1_TO_CLIENT(GCSTARTFLUSH);
 810   GCPROFILE_ITEM();
 811   GC_PRINTF("Start flush phase \n");
 812   // flush phase
 813   flush(stackptr);
 814   // now the master core need to decide the new cache strategy
 815   CACHEADAPT_MASTER();
 816   GC_CHECK_ALL_CORE_STATUS(FLUSHPHASE==gcphase);
 817   GC_PRINTF("Finish flush phase \n");
 818 }
 819
 820 void master_finish() {
 821   gcphase = FINISHPHASE;
 822
 823   // invalidate all shared mem pointers
 824   // put it here as it takes time to inform all the other cores to
 825   // finish gc and it might cause problem when some core resumes
 826   // mutator earlier than the other cores
 827   bamboo_cur_msp = NULL;
 828   bamboo_smem_size = 0;
 829   bamboo_smem_zero_top = NULL;
 830
 831   GCPROFILE_END();
 832   gcflag = false;
 833   GC_SEND_MSG_1_TO_CLIENT(GCFINISH);
 834
 835   gcprocessing = false;
 836   if(gcflag) {
 837     // inform other cores to stop and wait for gc
 838     gcprecheck = true;
 839     for(int i = 0; i < NUMCORESACTIVE; i++) {
 840       // reuse the gcnumsendobjs & gcnumreceiveobjs
 841       gcnumsendobjs[0][i] = 0;
 842       gcnumreceiveobjs[0][i] = 0;
 843     }
 844     GC_SEND_MSG_1_TO_CLIENT(GCSTARTPRE);
 845   }
 846 }
 847
 848 void gc_master(struct garbagelist * stackptr) {
 849   tprintf("start GC !!!!!!!!!!!!! \n");
 850   gcprocessing = true;
 851   gcphase = INITPHASE;
 852
 853   waitconfirm = false;
 854   numconfirm = 0;
 855   initGC();
 856   GC_SEND_MSG_1_TO_CLIENT(GCSTARTINIT);
 857   CACHEADAPT_GC(true);
 858   GC_PRINTF("Check core status \n");
 859   GC_CHECK_ALL_CORE_STATUS(true);
 860   GCPROFILE_ITEM();
 861   CACHEADAPT_OUTPUT_CACHE_SAMPLING();
 862
 863   // do mark phase
 864   master_mark(stackptr);
 865
 866   // get large objects from all cores
 867   master_getlargeobjs();
 868
 869   // compact the heap
 870   master_compact();
 871
 872   // update the references
 873   master_updaterefs(stackptr);
 874
 875   // do cache adaptation
 876   CACHEADAPT_PHASE_MASTER();
 877
 878   // do finish up stuff
 879   master_finish();
 880
 881   GC_PRINTF("gc finished   \n");
 882   tprintf("finish GC ! %d \n", gcflag);
 883 }
 884
 885 void pregccheck() {
 886   while(true) {
 887     BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
 888     gcnumsendobjs[0][BAMBOO_NUM_OF_CORE] = self_numsendobjs;
 889     gcnumreceiveobjs[0][BAMBOO_NUM_OF_CORE] = self_numreceiveobjs;
 890     int sumsendobj = 0;
 891     for(int i = 0; i < NUMCORESACTIVE; i++) {
 892       sumsendobj += gcnumsendobjs[0][i];
 893     }
 894     for(int i = 0; i < NUMCORESACTIVE; i++) {
 895       sumsendobj -= gcnumreceiveobjs[0][i];
 896     }
 897     if(0 != sumsendobj) {
 898       // there were still some msgs on the fly, wait until there
 899       // are some update pregc information coming and check it again
 900       gcprecheck = false;
 901       BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
 902
 903       while(!gcprecheck) ;
 904     } else {
 905       BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
 906       return;
 907     }
 908   }
 909 }
 910
 911 void pregcprocessing() {
 912 #if defined(GC_CACHE_ADAPT)&&defined(GC_CACHE_SAMPLING)
 913   // disable the timer interrupt
 914   bamboo_mask_timer_intr();
 915 #endif
 916   // Zero out the remaining memory here because for the GC_CACHE_ADAPT version,
 917   // we need to make sure during the gcinit phase the shared heap is not
 918   // touched. Otherwise, there would be problem when adapt the cache strategy.
 919   BAMBOO_CLOSE_CUR_MSP();
 920 #ifdef GC_FLUSH_DTLB
 921   if(gc_num_flush_dtlb < GC_NUM_FLUSH_DTLB) {
 922     BAMBOO_CLEAN_DTLB();
 923     gc_num_flush_dtlb++;
 924   }
 925 #endif
 926 #if defined(GC_CACHE_ADAPT)&&defined(GC_CACHE_SAMPLING)
 927   // get the sampling data
 928   bamboo_output_dtlb_sampling();
 929 #endif
 930 }
 931
 932 void postgcprocessing() {
 933 #if defined(GC_CACHE_ADAPT)&&defined(GC_CACHE_SAMPLING)
 934   // enable the timer interrupt
 935   bamboo_tile_timer_set_next_event(GC_TILE_TIMER_EVENT_SETTING);
 936   bamboo_unmask_timer_intr();
 937 #endif
 938 }
 939
 940 bool gc(struct garbagelist * stackptr) {
 941   // check if do gc
 942   if(!gcflag) {
 943     gcprocessing = false;
 944     return false;
 945   }
 946
 947   // core coordinator routine
 948   if(0 == BAMBOO_NUM_OF_CORE) {
 949     GC_PRINTF("Check if we can do gc or not\n");
 950     gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
 951     if(!gc_checkAllCoreStatus()) {
 952       // some of the cores are still executing the mutator and did not reach
 953       // some gc safe point, therefore it is not ready to do gc
 954       gcflag = true;
 955       return false;
 956     } else {
 957       GCPROFILE_START();
 958       pregccheck();
 959     }
 960     GC_PRINTF("start gc! \n");
 961     pregcprocessing();
 962     gc_master(stackptr);
 963   } else if(BAMBOO_NUM_OF_CORE < NUMCORES4GC) {
 964     pregcprocessing();
 965     gc_collect(stackptr);
 966   } else {
 967     pregcprocessing();
 968     gc_nocollect(stackptr);
 969   }
 970   postgcprocessing();
 971
 972   return true;
 973 }
 974
 975 #endif