// TODO: tags are NOT supported!
#include "multicoregarbage.h"
#include "multicoregcmark.h"
#include "multicoregccompact.h"
#include "multicoregcflush.h"
#include "multicoreruntime.h"
#include "multicoregcprofile.h"

extern unsigned int gcmem_mixed_threshold;
extern unsigned int gcmem_mixed_usedmem;
// dump the whole shared mem in blocks
  printf("(%x,%x) Dump shared mem: \n",udn_tile_coord_x(),udn_tile_coord_y());
  // reserved blocks for the sblock table
  printf("(%x,%x) ++++ reserved sblocks ++++ \n", udn_tile_coord_x(),
  for(i=BAMBOO_BASE_VA; (unsigned int)i<(unsigned int)gcbaseva; i+= 4*16) {
    printf("(%x,%x) 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x \n",
           udn_tile_coord_x(), udn_tile_coord_y(),
           *((int *)(i)), *((int *)(i + 4)),
           *((int *)(i + 4*2)), *((int *)(i + 4*3)),
           *((int *)(i + 4*4)), *((int *)(i + 4*5)),
           *((int *)(i + 4*6)), *((int *)(i + 4*7)),
           *((int *)(i + 4*8)), *((int *)(i + 4*9)),
           *((int *)(i + 4*10)), *((int *)(i + 4*11)),
           *((int *)(i + 4*12)), *((int *)(i + 4*13)),
           *((int *)(i + 4*14)), *((int *)(i + 4*15)));
  sblock = gcreservedsb;
  bool advanceblock = false;

  for(i=gcbaseva; (unsigned int)i<(unsigned int)(gcbaseva+BAMBOO_SHARED_MEM_SIZE); i+=4*16) {
    // compute the sblock #, block #, and core coordinates (x,y)
    if(j%((BAMBOO_SMEM_SIZE)/(4*16)) == 0) {
      if(j < ((BAMBOO_LARGE_SMEM_BOUND)/(4*16))) {
        if((j > 0) && (j%((BAMBOO_SMEM_SIZE_L)/(4*16)) == 0)) {
          coren = gc_block2core[block%(NUMCORES4GC*2)];
          // compute core coordinate
          x = BAMBOO_COORDS_X(coren);
          y = BAMBOO_COORDS_Y(coren);
          printf("(%x,%x) ==== %d, %d : core (%d,%d), saddr %x====\n",
                 udn_tile_coord_x(), udn_tile_coord_y(),block, sblock++, x, y,
                 (sblock-1)*(BAMBOO_SMEM_SIZE)+gcbaseva);
    printf("(%x,%x) 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x \n",
           udn_tile_coord_x(), udn_tile_coord_y(),
           *((int *)(i)), *((int *)(i + 4)),
           *((int *)(i + 4*2)), *((int *)(i + 4*3)),
           *((int *)(i + 4*4)), *((int *)(i + 4*5)),
           *((int *)(i + 4*6)), *((int *)(i + 4*7)),
           *((int *)(i + 4*8)), *((int *)(i + 4*9)),
           *((int *)(i + 4*10)), *((int *)(i + 4*11)),
           *((int *)(i + 4*12)), *((int *)(i + 4*13)),
           *((int *)(i + 4*14)), *((int *)(i + 4*15)));
  printf("(%x,%x) \n", udn_tile_coord_x(), udn_tile_coord_y());
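
// Shared-heap block geometry assumed by the dump above and by the
// BLOCKSIZE()/BLOCKINDEX() macros: the blocks below BAMBOO_LARGE_SMEM_BOUND
// (indices < NUMCORES4GC) have size BAMBOO_SMEM_SIZE_L, and all remaining
// blocks have size BAMBOO_SMEM_SIZE.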
void initmulticoregcdata() {
  if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
    // startup core to initialize corestatus[]
    for(int i = 0; i < NUMCORESACTIVE; i++) {
      gcnumsendobjs[0][i] = gcnumsendobjs[1][i] = 0;
      gcnumreceiveobjs[0][i] = gcnumreceiveobjs[1][i] = 0;
    for(int i = 0; i < NUMCORES4GC; i++) {
      gcrequiredmems[i] = 0;
      gcfilledblocks[i] = 0;
  bamboo_smem_zero_top = NULL;
  gcprocessing = false;
  gcphase = FINISHPHASE;
  gcself_numsendobjs = 0;
  gcself_numreceiveobjs = 0;
  gcmarkedptrbound = 0;
  gcforwardobjtbl = allocateMGCHash_I(20, 3);
  gcmem_mixed_threshold = (unsigned int)((BAMBOO_SHARED_MEM_SIZE
        -bamboo_reserved_smem*BAMBOO_SMEM_SIZE)*0.8);
  gcmem_mixed_usedmem = 0;
  gc_profile_flag = false;
  gc_num_flush_dtlb = 0;
  gc_localheap_s = false;
#ifdef GC_CACHE_ADAPT
  gccachestage = false;
  INIT_MULTICORE_GCPROFILE_DATA();
void dismulticoregcdata() {
  freeMGCHash(gcforwardobjtbl);

  if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
    for(int i = 0; i < NUMCORES4GC; i++) {
      gcnumsendobjs[0][i] = gcnumsendobjs[1][i] = 0;
      gcnumreceiveobjs[0][i] = gcnumreceiveobjs[1][i] = 0;
      gcrequiredmems[i] = 0;
      gcfilledblocks[i] = 0;
    for(int i = NUMCORES4GC; i < NUMCORESACTIVE; i++) {
      gcnumsendobjs[0][i] = gcnumsendobjs[1][i] = 0;
      gcnumreceiveobjs[0][i] = gcnumreceiveobjs[1][i] = 0;
  gcnumsrobjs_index = 0;
  gcself_numsendobjs = 0;
  gcself_numreceiveobjs = 0;
  gcmarkedptrbound = 0;
  freeMGCHash(gcforwardobjtbl);
  gcforwardobjtbl = allocateMGCHash(20, 3);
bool gc_checkAllCoreStatus() {
  BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
  for(int i = 0; i < NUMCORESACTIVE; i++) {
    if(gccorestatus[i] != 0) {
      BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
  BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();

// NOTE: should be invoked with interrupts turned off
bool gc_checkAllCoreStatus_I() {
  for(int i = 0; i < NUMCORESACTIVE; i++) {
    if(gccorestatus[i] != 0) {
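
// Outline of the distributed termination detection used for the mark phase:
// gcnumsendobjs[][] / gcnumreceiveobjs[][] hold TWO samples of the per-core
// send/receive counters, selected by gcnumsrobjs_index. When every core
// appears stalled, the master asks all cores to confirm their counters
// (GCMARKCONFIRM) and then runs checkMarkStatus_p2(): if the global sum of
// sent objs minus received objs is zero AND the two samples agree for every
// core, no mark work or messages can still be in flight, so the mark phase
// is complete; otherwise the sample index is flipped and the check retries.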
INLINE void checkMarkStatus_p2() {
  // check if the sum of sent objs and received objs are the same;
  // if yes, check whether the info is up to date; if no, keep executing
  unsigned int sumsendobj = 0;
  for(int i = 0; i < NUMCORESACTIVE; i++) {
    sumsendobj += gcnumsendobjs[gcnumsrobjs_index][i];
  for(int i = 0; i < NUMCORESACTIVE; i++) {
    sumsendobj -= gcnumreceiveobjs[gcnumsrobjs_index][i];
  if(0 == sumsendobj) {
    // check whether numsendobjs or numreceiveobjs changed between the two samples
    for(i = 0; i < NUMCORESACTIVE; i++) {
      if((gcnumsendobjs[0][i]!=gcnumsendobjs[1][i])||(gcnumreceiveobjs[0][i]!=gcnumreceiveobjs[1][i])) {
    if(i == NUMCORESACTIVE) {
      // all the core status info is up to date; stop the mark phase
      gcphase = COMPACTPHASE;
      // restore the gcstatus for all cores
      for(int i = 0; i < NUMCORESACTIVE; i++) {
      // There were changes between phase 1 and phase 2, so we cannot decide
      // whether the mark phase has finished
      // As it fails in phase 2, flip the entries
      gcnumsrobjs_index = (gcnumsrobjs_index == 0) ? 1 : 0;
    // There were changes between phase 1 and phase 2, so we cannot decide
    // whether the mark phase has finished
    // As it fails in phase 2, flip the entries
    gcnumsrobjs_index = (gcnumsrobjs_index == 0) ? 1 : 0;
INLINE void checkMarkStatus() {
  if((!waitconfirm)||(waitconfirm && (numconfirm == 0))) {
    unsigned int entry_index = 0;
      entry_index = (gcnumsrobjs_index == 0) ? 1 : 0;
      entry_index = gcnumsrobjs_index;
    BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
    gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
    gcnumsendobjs[entry_index][BAMBOO_NUM_OF_CORE] = gcself_numsendobjs;
    gcnumreceiveobjs[entry_index][BAMBOO_NUM_OF_CORE] = gcself_numreceiveobjs;
    // check the status of all cores
    if (gc_checkAllCoreStatus_I()) {
      // all cores have been found stalled for the first time:
      // send status-confirm msgs to all other cores
      // and reset the corestatus array
      numconfirm = NUMCORESACTIVE - 1;
      BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
      GC_SEND_MSG_1_TO_CLIENT(GCMARKCONFIRM);
    checkMarkStatus_p2();
    BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
  BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
// compute load balance for all cores
INLINE int loadbalance(unsigned int * heaptop) {
  // compute load balance
  // get the total loads
  unsigned int tloads = gcloads[STARTUPCORE];
  for(int i = 1; i < NUMCORES4GC; i++) {
    tloads += gcloads[i];
  *heaptop = gcbaseva + tloads;

  BLOCKINDEX(*heaptop, &b);
  // num of blocks per core
  unsigned int numbpc = (unsigned int)b/(unsigned int)(NUMCORES4GC);
  RESIDECORE(heaptop, &gctopcore);
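
/* A minimal sketch (hypothetical helper, simplified names) of the balancing
   arithmetic in loadbalance() above: the target number of blocks per core is
   the block index of the global heap top divided by the number of GC cores,
   and gctopcore is the core whose region contains the overall heap top. */
static inline unsigned int blocks_per_core_sketch(unsigned int topblockidx,
                                                  unsigned int numcores4gc) {
  // e.g., topblockidx = 13 and numcores4gc = 4  ->  13/4 = 3 blocks per core
  return topblockidx / numcores4gc;
}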
// compute the total mem size required and sort the lobjs in ascending order
INLINE unsigned int sortLObjs() {
  unsigned int tmp_lobj = 0;
  unsigned int tmp_len = 0;
  unsigned int tmp_host = 0;
  unsigned int sumsize = 0;

  gclobjtail2 = gclobjtail;
  gclobjtailindex2 = gclobjtailindex;
  // TODO: use quicksort instead?
  while(gc_lobjmoreItems2_I()) {
    tmp_lobj = gclobjtail2->lobjs[gclobjtailindex2-1];
    tmp_host = gclobjtail2->hosts[gclobjtailindex2-1];
    tmp_len = gclobjtail2->lengths[gclobjtailindex2-1];

    GCPROFILE_RECORD_LOBJ();
    unsigned int i = gclobjtailindex2-1;
    struct lobjpointerblock * tmp_block = gclobjtail2;
    // find the place to insert
      if(tmp_block->prev == NULL) {
      if(tmp_block->prev->lobjs[NUMLOBJPTRS-1] > tmp_lobj) {
        tmp_block->lobjs[i] = tmp_block->prev->lobjs[NUMLOBJPTRS-1];
        tmp_block->lengths[i] = tmp_block->prev->lengths[NUMLOBJPTRS-1];
        tmp_block->hosts[i] = tmp_block->prev->hosts[NUMLOBJPTRS-1];
        tmp_block = tmp_block->prev;
      } // if(tmp_block->prev->lobjs[NUMLOBJPTRS-1] > tmp_lobj)
    if(tmp_block->lobjs[i-1] > tmp_lobj) {
      tmp_block->lobjs[i] = tmp_block->lobjs[i-1];
      tmp_block->lengths[i] = tmp_block->lengths[i-1];
      tmp_block->hosts[i] = tmp_block->hosts[i-1];
    if(i != gclobjtailindex2 - 1) {
      tmp_block->lobjs[i] = tmp_lobj;
      tmp_block->lengths[i] = tmp_len;
      tmp_block->hosts[i] = tmp_host;
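
// Large objects are first evacuated to a contiguous region at the very top
// of the shared heap (growing down from gcbaseva+BAMBOO_SHARED_MEM_SIZE), so
// that compaction can move the remaining data freely; moveLObjs() later
// relocates them to their final places just above the compacted heap.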
INLINE bool cacheLObjs() {
  // check the total mem size needed for the large objs
  unsigned long long sumsize = 0;
  unsigned int size = 0;

  sumsize = sortLObjs();

  GCPROFILE_RECORD_LOBJSPACE();

  // check if there is enough space to cache these large objs
  unsigned int dst = gcbaseva + (BAMBOO_SHARED_MEM_SIZE) - sumsize;
  if((unsigned long long)gcheaptop > (unsigned long long)dst) {
    // not enough room to cache the large objs

  gcheaptop = dst; // Note: record the start of the cached lobjs with gcheaptop
  // cache the large objs at the top of the shared heap
  dst = gcbaseva + (BAMBOO_SHARED_MEM_SIZE);
  while(gc_lobjmoreItems3_I()) {
    size = gclobjtail2->lengths[gclobjtailindex2];
    // set the mark field to COMPACTED, indicating that this obj has been
    // moved and needs to be flushed
    ((struct ___Object___ *)(gclobjtail2->lobjs[gclobjtailindex2]))->marked=COMPACTED;
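    // The destination may overlap the source object (dst can fall below the
    // source's end), in which case memmove is required for a safe overlapping
    // copy; otherwise a plain memcpy suffices.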
    if((unsigned int)dst<(unsigned int)(gclobjtail2->lobjs[gclobjtailindex2]+size)) {
      memmove(dst, gclobjtail2->lobjs[gclobjtailindex2], size);
      memcpy(dst, gclobjtail2->lobjs[gclobjtailindex2], size);
// update the bamboo_smemtbl to record the current shared mem usage
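// Each GC core owns two block slots per round: gc_core2block[2*coren] and
// gc_core2block[2*coren+1], advanced by NUMCORES4GC*2 per round j, so the
// shared-heap blocks are interleaved across cores. Every owned block below
// the core's local top counts as fully used, the block holding the top
// counts its partial load, and the remaining blocks stay empty.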
void updateSmemTbl(unsigned int coren, unsigned int localtop) {
  unsigned int ltopcore = 0;
  unsigned int bound = BAMBOO_SMEM_SIZE_L;
  BLOCKINDEX(localtop, &ltopcore);
  if((unsigned int)localtop>=(unsigned int)(gcbaseva+BAMBOO_LARGE_SMEM_BOUND)){
    bound = BAMBOO_SMEM_SIZE;
  unsigned int load = (unsigned int)(localtop-gcbaseva)%(unsigned int)bound;
  unsigned int toset = 0;
  for(int j=0; 1; j++) {
    for(int i=0; i<2; i++) {
      toset = gc_core2block[2*coren+i]+(unsigned int)(NUMCORES4GC*2)*j;
      if(toset < ltopcore) {
        bamboo_smemtbl[toset]=BLOCKSIZE(toset<NUMCORES4GC);
        gcmem_mixed_usedmem += bamboo_smemtbl[toset];
      } else if(toset == ltopcore) {
        bamboo_smemtbl[toset] = load;
        gcmem_mixed_usedmem += bamboo_smemtbl[toset];
INLINE unsigned int checkCurrHeapTop() {
  // update the smemtbl
  BAMBOO_MEMSET_WH(bamboo_smemtbl, 0, sizeof(int)*gcnumblock);
  // adjust all gcloads to indicate the real heap top on each core;
  // previously gcloads[i] held the next available ptr on that core
  if(((unsigned int)gcloads[0]>(unsigned int)(gcbaseva+BAMBOO_SMEM_SIZE_L))&&(((unsigned int)gcloads[0]%(BAMBOO_SMEM_SIZE)) == 0)) {
    // edge of a block, check if this is exactly the heaptop
    BASEPTR(0, gcfilledblocks[0]-1, &(gcloads[0]));
    gcloads[0]+=BLOCKSIZE(gcfilledblocks[0]<=1);
  updateSmemTbl(0, gcloads[0]);
  for(int i = 1; i < NUMCORES4GC; i++) {
    unsigned int tmptop = 0;
    if((gcfilledblocks[i] > 0)&&(((unsigned int)gcloads[i]%(BAMBOO_SMEM_SIZE)) == 0)) {
      // edge of a block, check if this is exactly the heaptop
      BASEPTR(i, gcfilledblocks[i]-1, &gcloads[i]);
      gcloads[i]+=BLOCKSIZE(gcfilledblocks[i]<=1);
    updateSmemTbl(i, gcloads[i]);

  // find the current heap top
  // BUG: when using local allocation, directly moving large objects
  // to the highest free chunk might not be memory-efficient
  unsigned int tmpheaptop = 0;
  for(int i = gcnumblock-1; i >= 0; i--) {
    if(bamboo_smemtbl[i] > 0) {
      return gcbaseva+bamboo_smemtbl[i]+OFFSET2BASEVA(i);
INLINE void movelobj(unsigned int tmpheaptop,unsigned int ptr,int size,int isize) {
  // move the large obj
  if((unsigned int)gcheaptop < (unsigned int)(tmpheaptop+size)) {
    memmove(tmpheaptop, gcheaptop, size);
    memcpy(tmpheaptop, gcheaptop, size);
  // fill the remaining space with -2 padding
  BAMBOO_MEMSET_WH(tmpheaptop+size, -2, isize-size);

  // cache the mapping info
  gcmappingtbl[OBJMAPPINGINDEX((unsigned int)ptr)]=(unsigned int)tmpheaptop;
INLINE void moveLObjs() {
  // update the gcmem_mixed_usedmem
  gcmem_mixed_usedmem = 0;
  unsigned int size = 0;
  unsigned int bound = 0;
  unsigned int tmpheaptop = checkCurrHeapTop();

  // move the large objs from gcheaptop to tmpheaptop
  // write the header first
  unsigned int tomove = gcbaseva+(BAMBOO_SHARED_MEM_SIZE)-gcheaptop;
  gcmem_mixed_usedmem += tomove;
  // clear the gcsbstarttbl
  BAMBOO_MEMSET_WH(&(gcsbstarttbl[gcreservedsb]),'\0',(BAMBOO_SHARED_MEM_SIZE/BAMBOO_SMEM_SIZE-(unsigned int)gcreservedsb)*sizeof(unsigned int));
  gcheaptop = tmpheaptop;

  // check how many blocks it spans
  unsigned int remain = tmpheaptop-gcbaseva;
  // number of the sblock
  unsigned int sb = remain/BAMBOO_SMEM_SIZE+(unsigned int)gcreservedsb;
  unsigned int b = 0; // number of the block
  BLOCKINDEX(tmpheaptop, &b);
  // check the remaining space in this block
  bound = (BAMBOO_SMEM_SIZE);
  if(remain < (BAMBOO_LARGE_SMEM_BOUND)) {
    bound = (BAMBOO_SMEM_SIZE_L);
  remain = bound - remain%bound;

  unsigned int isize = 0;
  unsigned int host = 0;
  unsigned int ptr = 0;
  unsigned int base = tmpheaptop;
  unsigned int cpysize = 0;
  remain -= BAMBOO_CACHE_LINE_SIZE;
  tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
  gc_lobjqueueinit4_I();
  while(gc_lobjmoreItems4_I()) {
    ptr = (unsigned int)(gc_lobjdequeue4_I(&size, &host));
    ALIGNSIZE(size, &isize);
    if(remain >= isize) {
      // move the large obj
      movelobj(tmpheaptop,ptr,size,isize);
      // update bamboo_smemtbl
      bamboo_smemtbl[b] += isize;
      // this object spans blocks
      CLOSEBLOCK(base, cpysize+BAMBOO_CACHE_LINE_SIZE);
      bamboo_smemtbl[b] += BAMBOO_CACHE_LINE_SIZE;
      remain = BLOCKSIZE((tmpheaptop-gcbaseva)<(BAMBOO_LARGE_SMEM_BOUND));
      remain -= BAMBOO_CACHE_LINE_SIZE;
      tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
      BLOCKINDEX(tmpheaptop, &b);
      sb = (unsigned int)(tmpheaptop-gcbaseva)/(BAMBOO_SMEM_SIZE)+gcreservedsb;
      movelobj(tmpheaptop,ptr,size,isize);
      // set the gcsbstarttbl and bamboo_smemtbl
      unsigned int tmpsbs=1+(unsigned int)(isize-remain-1)/BAMBOO_SMEM_SIZE;
      for(int k = 1; k < tmpsbs; k++) {
        gcsbstarttbl[sb+k] = -1;
      bound = BLOCKSIZE(b<NUMCORES4GC);
      BLOCKINDEX(tmpheaptop-1, &tmpsbs);
      for(; b < tmpsbs; b++) {
        bamboo_smemtbl[b] = bound;
        if(b==NUMCORES4GC-1) {
          bound = BAMBOO_SMEM_SIZE;
      if(((unsigned int)(isize-remain)%(BAMBOO_SMEM_SIZE)) == 0) {
        gcsbstarttbl[sb] = -1;
        remain = BLOCKSIZE((tmpheaptop-gcbaseva)<(BAMBOO_LARGE_SMEM_BOUND));
        bamboo_smemtbl[b] = bound;
        gcsbstarttbl[sb] = (int)tmpheaptop;
        remain = tmpheaptop-gcbaseva;
        bamboo_smemtbl[b] = remain%bound;
        remain = bound - bamboo_smemtbl[b];
      CLOSEBLOCK(base, isize+BAMBOO_CACHE_LINE_SIZE);

      if(remain == BAMBOO_CACHE_LINE_SIZE) {
        // zero-fill it, just in case
        BAMBOO_MEMSET_WH(tmpheaptop, '\0', remain);
      remain -= BAMBOO_CACHE_LINE_SIZE;
      tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
    CLOSEBLOCK(base, cpysize+BAMBOO_CACHE_LINE_SIZE);
    bamboo_smemtbl[b] += BAMBOO_CACHE_LINE_SIZE;
    tmpheaptop -= BAMBOO_CACHE_LINE_SIZE;
  gcheaptop = tmpheaptop;

  bamboo_free_block = 0;
  unsigned int tbound = 0;
    tbound=BLOCKSIZE(bamboo_free_block<NUMCORES4GC);
    if(bamboo_smemtbl[bamboo_free_block] == tbound) {
  // the first non-full partition
  GCPROFILE_RECORD_SPACE();
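
// Client-side collector routines. A gc core waits for each phase signal from
// the master (STARTUPCORE), runs the phase locally, and reports back; the
// phase order is INIT -> MARK -> COMPACT -> FLUSH -> FINISH. Non-gc cores
// skip the compact phase (see gc_nocollect below).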
void gc_collect(struct garbagelist * stackptr) {
  // inform the master that this core is at a gc safe point and is ready to
  // do gc
  send_msg_4(STARTUPCORE,GCFINISHPRE,BAMBOO_NUM_OF_CORE,self_numsendobjs,self_numreceiveobjs);

  // core collector routine
  // wait for the init phase
  WAITFORGCPHASE(INITPHASE);
  GC_PRINTF("Do initGC\n");
  // send init finish msg to the core coordinator
  send_msg_2(STARTUPCORE,GCFINISHINIT,BAMBOO_NUM_OF_CORE);

  // wait for the mark phase
  WAITFORGCPHASE(MARKPHASE);
  GC_PRINTF("Start mark phase\n");
  mark(true, stackptr);
  GC_PRINTF("Finish mark phase, start compact phase\n");
  GC_PRINTF("Finish compact phase\n");

  WAITFORGCPHASE(FLUSHPHASE);
  GC_PRINTF("Start flush phase\n");
  GCPROFILE_INFO_2_MASTER();
  GC_PRINTF("Finish flush phase\n");

  CACHEADAPT_PHASE_CLIENT();

  // invalidate all shared mem pointers
  bamboo_cur_msp = NULL;
  bamboo_smem_size = 0;
  bamboo_smem_zero_top = NULL;

  WAITFORGCPHASE(FINISHPHASE);
  GC_PRINTF("Finish gc! \n");
void gc_nocollect(struct garbagelist * stackptr) {
  // inform the master that this core is at a gc safe point and is ready to
  // do gc
  send_msg_4(STARTUPCORE,GCFINISHPRE,BAMBOO_NUM_OF_CORE,self_numsendobjs,self_numreceiveobjs);

  WAITFORGCPHASE(INITPHASE);
  GC_PRINTF("Do initGC\n");
  // send init finish msg to the core coordinator
  send_msg_2(STARTUPCORE,GCFINISHINIT,BAMBOO_NUM_OF_CORE);

  WAITFORGCPHASE(MARKPHASE);
  GC_PRINTF("Start mark phase\n");
  mark(true, stackptr);
  GC_PRINTF("Finish mark phase, wait for flush\n");

  // non-gc core collector routine
  WAITFORGCPHASE(FLUSHPHASE);
  GC_PRINTF("Start flush phase\n");
  GCPROFILE_INFO_2_MASTER();
  GC_PRINTF("Finish flush phase\n");

  CACHEADAPT_PHASE_CLIENT();

  // invalidate all shared mem pointers
  bamboo_cur_msp = NULL;
  bamboo_smem_size = 0;
  bamboo_smem_zero_top = NULL;

  WAITFORGCPHASE(FINISHPHASE);
  GC_PRINTF("Finish gc! \n");
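
// Master-side (coordinator) routines: the master runs on STARTUPCORE, kicks
// off each phase by broadcasting the phase-start msg to the clients,
// participates in the phase itself, and waits until every core reports
// completion before advancing to the next phase.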
void master_mark(struct garbagelist *stackptr) {
  GC_PRINTF("Start mark phase \n");
  GC_SEND_MSG_1_TO_CLIENT(GCSTART);
  while(MARKPHASE == gcphase) {
    mark(isfirst, stackptr);

void master_getlargeobjs() {
  // send msgs to all cores requesting their large objs info
  // Note: only need to ask the gc cores; non-gc cores do not host any objs
  numconfirm = NUMCORES4GC - 1;
  for(int i = 1; i < NUMCORES4GC; i++) {
    send_msg_1(i,GCLOBJREQUEST);
  gcloads[BAMBOO_NUM_OF_CORE] = gccurr_heaptop;
  // spin until we have all responses
  while(numconfirm!=0) ;

  if(gcheaptop < gcmarkedptrbound) {
    gcheaptop = gcmarkedptrbound;
  GC_PRINTF("prepare to cache large objs \n");
  // cache all large objs
  BAMBOO_ASSERTMSG(cacheLObjs(), "Not enough space to cache large objects\n");
void master_compact() {
  // predict the number of blocks to fill for each core
  unsigned int tmpheaptop = 0;
  int numpbc = loadbalance(&tmpheaptop);
  numpbc = (BAMBOO_SHARED_MEM_SIZE)/(BAMBOO_SMEM_SIZE);
  GC_PRINTF("mark phase finished \n");

  tmpheaptop = gcbaseva + (BAMBOO_SHARED_MEM_SIZE);
  for(int i = 0; i < NUMCORES4GC; i++) {
    unsigned int tmpcoreptr = 0;
    BASEPTR(i, numpbc, &tmpcoreptr);
    // init some data structures for the compact phase
    gcfilledblocks[i] = 0;
    gcrequiredmems[i] = 0;

    // send start compact messages to all cores
    // TODO: bug here, we do not know whether the direction is positive or negative
    if (tmpcoreptr < tmpheaptop) {
      gcstopblock[i] = numpbc + 1;
      if(i != STARTUPCORE) {
        send_msg_2(i, GCSTARTCOMPACT, numpbc+1);
        gcblock2fill = numpbc+1;
      gcstopblock[i] = numpbc;
      if(i != STARTUPCORE) {
        send_msg_2(i, GCSTARTCOMPACT, numpbc);
        gcblock2fill = numpbc;

  struct moveHelper * orig = (struct moveHelper *)RUNMALLOC(sizeof(struct moveHelper));
  struct moveHelper * to = (struct moveHelper *)RUNMALLOC(sizeof(struct moveHelper));
  compact_master(orig, to);
  GC_PRINTF("prepare to move large objs \n");
  GC_PRINTF("compact phase finished \n");

void master_updaterefs(struct garbagelist * stackptr) {
  gcphase = FLUSHPHASE;
  GC_SEND_MSG_1_TO_CLIENT(GCSTARTFLUSH);
  GC_PRINTF("Start flush phase \n");
  // now the master core needs to decide the new cache strategy
  GC_CHECK_ALL_CORE_STATUS(FLUSHPHASE==gcphase);
  GC_PRINTF("Finish flush phase \n");
void master_finish() {
  gcphase = FINISHPHASE;

  // invalidate all shared mem pointers
  // put it here because it takes time to inform all the other cores to
  // finish gc, and it might cause problems if some core resumes the
  // mutator earlier than the others
  bamboo_cur_msp = NULL;
  bamboo_smem_size = 0;
  bamboo_smem_zero_top = NULL;

  GC_SEND_MSG_1_TO_CLIENT(GCFINISH);
  gcprocessing = false;

  // inform the other cores to stop and wait for gc
  for(int i = 0; i < NUMCORESACTIVE; i++) {
    // reuse the gcnumsendobjs & gcnumreceiveobjs arrays
    gcnumsendobjs[0][i] = 0;
    gcnumreceiveobjs[0][i] = 0;
  GC_SEND_MSG_1_TO_CLIENT(GCSTARTPRE);
void gc_master(struct garbagelist * stackptr) {
  tprintf("start GC !!!!!!!!!!!!! \n");
  GC_SEND_MSG_1_TO_CLIENT(GCSTARTINIT);
  GC_PRINTF("Check core status \n");
  GC_CHECK_ALL_CORE_STATUS(true);
  CACHEADAPT_OUTPUT_CACHE_SAMPLING();

  master_mark(stackptr);

  // get the large objects from all cores
  master_getlargeobjs();

  // update the references
  master_updaterefs(stackptr);

  // do cache adaptation
  CACHEADAPT_PHASE_MASTER();

  // do the finish-up work
  GC_PRINTF("gc finished \n");
  tprintf("finish GC ! %d \n", gcflag);

  BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
  gcnumsendobjs[0][BAMBOO_NUM_OF_CORE] = self_numsendobjs;
  gcnumreceiveobjs[0][BAMBOO_NUM_OF_CORE] = self_numreceiveobjs;
  for(int i = 0; i < NUMCORESACTIVE; i++) {
    sumsendobj += gcnumsendobjs[0][i];
  for(int i = 0; i < NUMCORESACTIVE; i++) {
    sumsendobj -= gcnumreceiveobjs[0][i];
  if(0 != sumsendobj) {
    // there are still msgs in flight; wait until updated pre-gc
    // information arrives, then check again
    BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
  BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
void pregcprocessing() {
#if defined(GC_CACHE_ADAPT)&&defined(GC_CACHE_SAMPLING)
  // disable the timer interrupt
  bamboo_mask_timer_intr();

  // Zero out the remaining memory here, because in the GC_CACHE_ADAPT version
  // we need to make sure the shared heap is not touched during the gcinit
  // phase; otherwise there would be problems when adapting the cache strategy.
  BAMBOO_CLOSE_CUR_MSP();
  if(gc_num_flush_dtlb < GC_NUM_FLUSH_DTLB) {

#if defined(GC_CACHE_ADAPT)&&defined(GC_CACHE_SAMPLING)
  // get the sampling data
  bamboo_output_dtlb_sampling();

void postgcprocessing() {
#if defined(GC_CACHE_ADAPT)&&defined(GC_CACHE_SAMPLING)
  // enable the timer interrupt
  bamboo_tile_timer_set_next_event(GC_TILE_TIMER_EVENT_SETTING);
  bamboo_unmask_timer_intr();
bool gc(struct garbagelist * stackptr) {
  gcprocessing = false;

  // core coordinator routine
  if(0 == BAMBOO_NUM_OF_CORE) {
    GC_PRINTF("Check if we can do gc or not\n");
    gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
    if(!gc_checkAllCoreStatus()) {
      // some cores are still executing the mutator and have not reached
      // a gc safe point, so we are not ready to do gc yet
    GC_PRINTF("start gc! \n");
  } else if(BAMBOO_NUM_OF_CORE < NUMCORES4GC) {
    gc_collect(stackptr);
    gc_nocollect(stackptr);