Robust/src/Runtime/bamboo/multicoregarbage.c

   1 // TODO: DO NOT support tag!!!
   2 #ifdef MULTICORE_GC
   3 #include "runtime.h"
   4 #include "multicoreruntime.h"
   5 #include "multicoregarbage.h"
   6 #include "multicoregcmark.h"
   7 #include "gcqueue.h"
   8 #include "multicoregccompact.h"
   9 #include "multicoregcflush.h"
  10 #include "multicoregcprofile.h"
  11 #include "gcqueue.h"
  12
  13 #ifdef SMEMM
  14 extern unsigned int gcmem_mixed_threshold;
  15 extern unsigned int gcmem_mixed_usedmem;
  16 #endif // SMEMM
  17
  18 volatile bool gcflag;
  19 gc_status_t gc_status_info;
  20
  21 unsigned long long gc_output_cache_policy_time=0;
  22
  23 #ifdef GC_DEBUG
  24 // dump whole mem in blocks
  25 void dumpSMem() {
  26   int block = 0;
  27   int sblock = 0;
  28   unsigned int j = 0;
  29   void * i = 0;
  30   int coren = 0;
  31   int x = 0;
  32   int y = 0;
  33   printf("(%x,%x) Dump shared mem: \n",udn_tile_coord_x(),udn_tile_coord_y());
  34   // reserved blocks for sblocktbl
  35   printf("(%x,%x) ++++ reserved sblocks ++++ \n", udn_tile_coord_x(),
  36          udn_tile_coord_y());
  37   for(i=BAMBOO_BASE_VA; (unsinged int)i<(unsigned int)gcbaseva; i+= 4*16) {
  38     printf("(%x,%x) 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x \n",
  39         udn_tile_coord_x(), udn_tile_coord_y(),
  40         *((int *)(i)), *((int *)(i + 4)),
  41         *((int *)(i + 4*2)), *((int *)(i + 4*3)),
  42         *((int *)(i + 4*4)), *((int *)(i + 4*5)),
  43         *((int *)(i + 4*6)), *((int *)(i + 4*7)),
  44         *((int *)(i + 4*8)), *((int *)(i + 4*9)),
  45         *((int *)(i + 4*10)), *((int *)(i + 4*11)),
  46         *((int *)(i + 4*12)), *((int *)(i + 4*13)),
  47         *((int *)(i + 4*14)), *((int *)(i + 4*15)));
  48   }
  49   sblock = gcreservedsb;
  50   bool advanceblock = false;
  51   // remaining memory
  52   for(i=gcbaseva; (unsigned int)i<(unsigned int)(gcbaseva+BAMBOO_SHARED_MEM_SIZE); i+=4*16) {
  53     advanceblock = false;
  54     // computing sblock # and block #, core coordinate (x,y) also
  55     if(j%((BAMBOO_SMEM_SIZE)/(4*16)) == 0) {
  56       // finished a sblock
  57       if(j < ((BAMBOO_LARGE_SMEM_BOUND)/(4*16))) {
  58         if((j > 0) && (j%((BAMBOO_SMEM_SIZE_L)/(4*16)) == 0)) {
  59           // finished a block
  60           block++;
  61           advanceblock = true;
  62         }
  63       } else {
  64         // finished a block
  65         block++;
  66         advanceblock = true;
  67       }
  68       // compute core #
  69       if(advanceblock) {
  70         coren = gc_block2core[block%(NUMCORES4GC*2)];
  71       }
  72       // compute core coordinate
  73       x = BAMBOO_COORDS_X(coren);
  74       y = BAMBOO_COORDS_Y(coren);
  75       printf("(%x,%x) ==== %d, %d : core (%d,%d), saddr %x====\n",
  76           udn_tile_coord_x(), udn_tile_coord_y(),block, sblock++, x, y,
  77           (sblock-1)*(BAMBOO_SMEM_SIZE)+gcbaseva);
  78     }
  79     j++;
  80     printf("(%x,%x) 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x \n",
  81         udn_tile_coord_x(), udn_tile_coord_y(),
  82         *((int *)(i)), *((int *)(i + 4)),
  83         *((int *)(i + 4*2)), *((int *)(i + 4*3)),
  84         *((int *)(i + 4*4)), *((int *)(i + 4*5)),
  85         *((int *)(i + 4*6)), *((int *)(i + 4*7)),
  86         *((int *)(i + 4*8)), *((int *)(i + 4*9)),
  87         *((int *)(i + 4*10)), *((int *)(i + 4*11)),
  88         *((int *)(i + 4*12)), *((int *)(i + 4*13)),
  89         *((int *)(i + 4*14)), *((int *)(i + 4*15)));
  90   }
  91   printf("(%x,%x) \n", udn_tile_coord_x(), udn_tile_coord_y());
  92 }
  93 #endif
  94
  95 void initmulticoregcdata() {
  96   if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
  97     // startup core to initialize corestatus[]
  98     for(int i = 0; i < NUMCORESACTIVE; i++) {
  99       gccorestatus[i] = 1;
 100       gcnumsendobjs[0][i] = gcnumsendobjs[1][i] = 0;
 101       gcnumreceiveobjs[0][i] = gcnumreceiveobjs[1][i] = 0;
 102     }
 103     for(int i = 0; i < NUMCORES4GC; i++) {
 104       gcloads[i] = 0;
 105       gcrequiredmems[i] = 0;
 106       gcstopblock[i] = 0;
 107       gcfilledblocks[i] = 0;
 108     }
 109   }
 110
 111   bamboo_smem_zero_top = NULL;
 112   gcflag = false;
 113   gc_status_info.gcprocessing = false;
 114   gc_status_info.gcphase = FINISHPHASE;
 115
 116   gcprecheck = true;
 117   gccurr_heaptop = 0;
 118   gcself_numsendobjs = 0;
 119   gcself_numreceiveobjs = 0;
 120   gcmarkedptrbound = 0;
 121   gcforwardobjtbl = allocateMGCHash_I(20, 3);
 122   gcnumlobjs = 0;
 123   gcheaptop = 0;
 124   gctopcore = 0;
 125   gctopblock = 0;
 126   gcmovestartaddr = 0;
 127   gctomove = false;
 128   gcmovepending = 0;
 129   gcblock2fill = 0;
 130 #ifdef SMEMM
 131   gcmem_mixed_threshold=(unsigned int)((BAMBOO_SHARED_MEM_SIZE-bamboo_reserved_smem*BAMBOO_SMEM_SIZE)*0.8);
 132   gcmem_mixed_usedmem = 0;
 133 #endif
 134 #ifdef MGC_SPEC
 135   gc_profile_flag = false;
 136 #endif
 137   gc_localheap_s = false;
 138 #ifdef GC_CACHE_ADAPT
 139   gccachestage = false;
 140 #endif
 141
 142   INIT_MULTICORE_GCPROFILE_DATA();
 143 }
 144
 145 void dismulticoregcdata() {
 146   freeMGCHash(gcforwardobjtbl);
 147 }
 148
 149 void initGC() {
 150   if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
 151     for(int i = 0; i < NUMCORES4GC; i++) {
 152       gccorestatus[i] = 1;
 153       gcnumsendobjs[0][i] = gcnumsendobjs[1][i] = 0;
 154       gcnumreceiveobjs[0][i] = gcnumreceiveobjs[1][i] = 0;
 155       gcloads[i] = 0;
 156       gcrequiredmems[i] = 0;
 157       gcfilledblocks[i] = 0;
 158       gcstopblock[i] = 0;
 159     }
 160     for(int i = NUMCORES4GC; i < NUMCORESACTIVE; i++) {
 161       gccorestatus[i] = 1;
 162       gcnumsendobjs[0][i] = gcnumsendobjs[1][i] = 0;
 163       gcnumreceiveobjs[0][i] = gcnumreceiveobjs[1][i] = 0;
 164     }
 165     gcheaptop = 0;
 166     gctopcore = 0;
 167     gctopblock = 0;
 168     gcnumsrobjs_index = 0;
 169   }
 170   gcself_numsendobjs = 0;
 171   gcself_numreceiveobjs = 0;
 172   gcmarkedptrbound = 0;
 173   gcnumlobjs = 0;
 174   gcmovestartaddr = 0;
 175   gctomove = false;
 176   gcblock2fill = 0;
 177   gcmovepending = 0;
 178   gccurr_heaptop = 0;
 179   gcdstcore = 0;
 180
 181   gc_queueinit();
 182
 183   freeMGCHash(gcforwardobjtbl);
 184   gcforwardobjtbl = allocateMGCHash(20, 3);
 185
 186   GCPROFILE_INIT();
 187   gc_output_cache_policy_time=0;
 188 }
 189
 190 bool gc_checkAllCoreStatus() {
 191   BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
 192   for(int i = 0; i < NUMCORESACTIVE; i++) {
 193     if(gccorestatus[i] != 0) {
 194       BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
 195       return false;
 196     }
 197   }
 198   BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
 199   return true;
 200 }
 201
 202 // NOTE: should be invoked with interrupts turned off
 203 bool gc_checkAllCoreStatus_I() {
 204   for(int i = 0; i < NUMCORESACTIVE; i++) {
 205     if(gccorestatus[i] != 0) {
 206       return false;
 207     }
 208   }
 209   return true;
 210 }
 211
 212 INLINE void checkMarkStatus_p2() {
 213   // check if the sum of send objs and receive obj are the same
 214   // yes->check if the info is the latest; no->go on executing
 215   unsigned int sumsendobj = 0;
 216   for(int i = 0; i < NUMCORESACTIVE; i++) {
 217     sumsendobj += gcnumsendobjs[gcnumsrobjs_index][i];
 218   }
 219   for(int i = 0; i < NUMCORESACTIVE; i++) {
 220     sumsendobj -= gcnumreceiveobjs[gcnumsrobjs_index][i];
 221   }
 222   if(0 == sumsendobj) {
 223     // Check if there are changes of the numsendobjs or numreceiveobjs
 224     // on each core
 225     int i = 0;
 226     for(i = 0; i < NUMCORESACTIVE; i++) {
 227       if((gcnumsendobjs[0][i]!=gcnumsendobjs[1][i])||(gcnumreceiveobjs[0][i]!=gcnumreceiveobjs[1][i]) ) {
 228         break;
 229       }
 230     }
 231     if(i == NUMCORESACTIVE) {
 232       // all the core status info are the latest,stop mark phase
 233       gc_status_info.gcphase = COMPACTPHASE;
 234       // restore the gcstatus for all cores
 235       for(int i = 0; i < NUMCORESACTIVE; i++) {
 236         gccorestatus[i] = 1;
 237       }
 238     } else {
 239       // There were changes between phase 1 and phase 2, can not decide
 240       // whether the mark phase has been finished
 241       waitconfirm = false;
 242       // As it fails in phase 2, flip the entries
 243       gcnumsrobjs_index = (gcnumsrobjs_index == 0) ? 1 : 0;
 244     }
 245   } else {
 246     // There were changes between phase 1 and phase 2, can not decide
 247     // whether the mark phase has been finished
 248     waitconfirm = false;
 249     // As it fails in phase 2, flip the entries
 250     gcnumsrobjs_index = (gcnumsrobjs_index == 0) ? 1 : 0;
 251   }
 252 }
 253
 254 INLINE void checkMarkStatus() {
 255   if((!waitconfirm)||(waitconfirm && (numconfirm == 0))) {
 256     unsigned int entry_index = 0;
 257     if(waitconfirm) {
 258       // phase 2
 259       entry_index = (gcnumsrobjs_index == 0) ? 1 : 0;
 260     } else {
 261       // phase 1
 262       entry_index = gcnumsrobjs_index;
 263     }
 264     BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
 265     gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
 266     gcnumsendobjs[entry_index][BAMBOO_NUM_OF_CORE] = gcself_numsendobjs;
 267     gcnumreceiveobjs[entry_index][BAMBOO_NUM_OF_CORE] = gcself_numreceiveobjs;
 268     // check the status of all cores
 269     if (gc_checkAllCoreStatus_I()) {
 270       // ask for confirm
 271       if(!waitconfirm) {
 272         // the first time found all cores stall
 273         // send out status confirm msg to all other cores
 274         // reset the corestatus array too
 275         waitconfirm = true;
 276         numconfirm = NUMCORESACTIVE - 1;
 277         BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
 278         GC_SEND_MSG_1_TO_CLIENT(GCMARKCONFIRM);
 279       } else {
 280         // Phase 2
 281         checkMarkStatus_p2();
 282         BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
 283       }
 284     } else {
 285       BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
 286     }
 287   }
 288 }
 289
 290 // compute load balance for all cores
 291 INLINE int loadbalance(unsigned int * heaptop) {
 292   // compute load balance
 293   // get the total loads
 294   unsigned int tloads = gcloads[STARTUPCORE];
 295   for(int i = 1; i < NUMCORES4GC; i++) {
 296     tloads += gcloads[i];
 297   }
 298   *heaptop = gcbaseva + tloads;
 299
 300   unsigned int b = 0;
 301   BLOCKINDEX(*heaptop, &b);
 302   // num of blocks per core
 303   unsigned int numbpc = (unsigned int)b/(unsigned int)(NUMCORES4GC);
 304   gctopblock = b;
 305   RESIDECORE(heaptop, &gctopcore);
 306   return numbpc;
 307 }
 308
 309 // compute total mem size required and sort the lobjs in ascending order
 310 INLINE unsigned int sortLObjs() {
 311   unsigned int tmp_lobj = 0;
 312   unsigned int tmp_len = 0;
 313   unsigned int tmp_host = 0;
 314   unsigned int sumsize = 0;
 315
 316   gclobjtail2 = gclobjtail;
 317   gclobjtailindex2 = gclobjtailindex;
 318   // TODO USE QUICK SORT INSTEAD?
 319   while(gc_lobjmoreItems2_I()) {
 320     gc_lobjdequeue2_I();
 321     tmp_lobj = gclobjtail2->lobjs[gclobjtailindex2-1];
 322     tmp_host = gclobjtail2->hosts[gclobjtailindex2-1];
 323     tmp_len = gclobjtail2->lengths[gclobjtailindex2 - 1];
 324     sumsize += tmp_len;
 325     GCPROFILE_RECORD_LOBJ();
 326     unsigned int i = gclobjtailindex2-1;
 327     struct lobjpointerblock * tmp_block = gclobjtail2;
 328     // find the place to insert
 329     while(true) {
 330       if(i == 0) {
 331         if(tmp_block->prev == NULL) {
 332           break;
 333         }
 334         if(tmp_block->prev->lobjs[NUMLOBJPTRS-1] > tmp_lobj) {
 335           tmp_block->lobjs[i] = tmp_block->prev->lobjs[NUMLOBJPTRS-1];
 336           tmp_block->lengths[i] = tmp_block->prev->lengths[NUMLOBJPTRS-1];
 337           tmp_block->hosts[i] = tmp_block->prev->hosts[NUMLOBJPTRS-1];
 338           tmp_block = tmp_block->prev;
 339           i = NUMLOBJPTRS-1;
 340         } else {
 341           break;
 342         }  // if(tmp_block->prev->lobjs[NUMLOBJPTRS-1] < tmp_lobj)
 343       } else {
 344         if(tmp_block->lobjs[i-1] > tmp_lobj) {
 345           tmp_block->lobjs[i] = tmp_block->lobjs[i-1];
 346           tmp_block->lengths[i] = tmp_block->lengths[i-1];
 347           tmp_block->hosts[i] = tmp_block->hosts[i-1];
 348           i--;
 349         } else {
 350           break;
 351         }
 352       }
 353     }
 354     // insert it
 355     if(i != gclobjtailindex2 - 1) {
 356       tmp_block->lobjs[i] = tmp_lobj;
 357       tmp_block->lengths[i] = tmp_len;
 358       tmp_block->hosts[i] = tmp_host;
 359     }
 360   }
 361   return sumsize;
 362 }
 363
 364 INLINE bool cacheLObjs() {
 365   // check the total mem size need for large objs
 366   unsigned long long sumsize = 0;
 367   unsigned int size = 0;
 368
 369   sumsize = sortLObjs();
 370
 371   GCPROFILE_RECORD_LOBJSPACE();
 372
 373   // check if there are enough space to cache these large objs
 374   unsigned int dst = gcbaseva + (BAMBOO_SHARED_MEM_SIZE) -sumsize;
 375   if((unsigned long long)gcheaptop > (unsigned long long)dst) {
 376     // do not have enough room to cache large objs
 377     return false;
 378   }
 379
 380   gcheaptop = dst; // Note: record the start of cached lobjs with gcheaptop
 381   // cache the largeObjs to the top of the shared heap
 382   dst = gcbaseva + (BAMBOO_SHARED_MEM_SIZE);
 383   while(gc_lobjmoreItems3_I()) {
 384     gc_lobjdequeue3_I();
 385     size = gclobjtail2->lengths[gclobjtailindex2];
 386     // set the mark field to , indicating that this obj has been moved
 387     // and need to be flushed
 388     ((struct ___Object___ *)(gclobjtail2->lobjs[gclobjtailindex2]))->marked=COMPACTED;
 389     BAMBOO_CACHE_FLUSH_LINE(gclobjtail2->lobjs[gclobjtailindex2]);
 390     dst -= size;
 391     if((unsigned int)dst<(unsigned int)(gclobjtail2->lobjs[gclobjtailindex2]+size)) {
 392       memmove(dst, gclobjtail2->lobjs[gclobjtailindex2], size);
 393     } else {
 394       memcpy(dst, gclobjtail2->lobjs[gclobjtailindex2], size);
 395     }
 396   }
 397   return true;
 398 }
 399
 400 // update the bmmboo_smemtbl to record current shared mem usage
 401 void updateSmemTbl(unsigned int coren, unsigned int localtop) {
 402   unsigned int ltopcore = 0;
 403   unsigned int bound = BAMBOO_SMEM_SIZE_L;
 404   BLOCKINDEX(localtop, &ltopcore);
 405   if((unsigned int)localtop>=(unsigned int)(gcbaseva+BAMBOO_LARGE_SMEM_BOUND)){
 406     bound = BAMBOO_SMEM_SIZE;
 407   }
 408   unsigned int load = (unsigned int)(localtop-gcbaseva)%(unsigned int)bound;
 409   unsigned int toset = 0;
 410   for(int j=0; 1; j++) {
 411     for(int i=0; i<2; i++) {
 412       toset = gc_core2block[2*coren+i]+(unsigned int)(NUMCORES4GC*2)*j;
 413       if(toset < ltopcore) {
 414         bamboo_smemtbl[toset]=BLOCKSIZE(toset<NUMCORES4GC);
 415 #ifdef SMEMM
 416         gcmem_mixed_usedmem += bamboo_smemtbl[toset];
 417 #endif
 418       } else if(toset == ltopcore) {
 419         bamboo_smemtbl[toset] = load;
 420 #ifdef SMEMM
 421         gcmem_mixed_usedmem += bamboo_smemtbl[toset];
 422 #endif
 423         return;
 424       } else {
 425         return;
 426       }
 427     }
 428   }
 429 }
 430
 431 INLINE unsigned int checkCurrHeapTop() {
 432   // update the smemtbl
 433   BAMBOO_MEMSET_WH(bamboo_smemtbl, 0, sizeof(int)*gcnumblock);
 434   // flush all gcloads to indicate the real heap top on one core
 435   // previous it represents the next available ptr on a core
 436   if(((unsigned int)gcloads[0]>(unsigned int)(gcbaseva+BAMBOO_SMEM_SIZE_L))&&(((unsigned int)gcloads[0]%(BAMBOO_SMEM_SIZE)) == 0)) {
 437     // edge of a block, check if this is exactly the heaptop
 438     BASEPTR(0, gcfilledblocks[0]-1, &(gcloads[0]));
 439     gcloads[0]+=BLOCKSIZE(gcfilledblocks[0]<=1);
 440   }
 441   updateSmemTbl(0, gcloads[0]);
 442   for(int i = 1; i < NUMCORES4GC; i++) {
 443     unsigned int tmptop = 0;
 444     if((gcfilledblocks[i] > 0)&&(((unsigned int)gcloads[i]%(BAMBOO_SMEM_SIZE)) == 0)) {
 445       // edge of a block, check if this is exactly the heaptop
 446       BASEPTR(i, gcfilledblocks[i]-1, &gcloads[i]);
 447       gcloads[i]+=BLOCKSIZE(gcfilledblocks[i]<=1);
 448       tmptop = gcloads[i];
 449     }
 450     updateSmemTbl(i, gcloads[i]);
 451   }
 452
 453   // find current heap top
 454   // TODO
 455   // a bug here: when using local allocation, directly move large objects
 456   // to the highest free chunk might not be memory efficient
 457   unsigned int tmpheaptop = 0;
 458   for(int i = gcnumblock-1; i >= 0; i--) {
 459     if(bamboo_smemtbl[i] > 0) {
 460       return gcbaseva+bamboo_smemtbl[i]+OFFSET2BASEVA(i);
 461     }
 462   }
 463   return gcbaseva;
 464 }
 465
 466 INLINE void movelobj(unsigned int tmpheaptop,unsigned int ptr,int size,int isize) {
 467   // move the large obj
 468   if((unsigned int)gcheaptop < (unsigned int)(tmpheaptop+size)) {
 469     memmove(tmpheaptop, gcheaptop, size);
 470   } else {
 471     memcpy(tmpheaptop, gcheaptop, size);
 472   }
 473   // fill the remaining space with -2 padding
 474   BAMBOO_MEMSET_WH(tmpheaptop+size, -2, isize-size);
 475   gcheaptop += size;
 476   // cache the mapping info
 477   gcmappingtbl[OBJMAPPINGINDEX((unsigned int)ptr)]=(unsigned int)tmpheaptop;
 478   tmpheaptop += isize;
 479 }
 480
 481 INLINE void moveLObjs() {
 482 #ifdef SMEMM
 483   // update the gcmem_mixed_usedmem
 484   gcmem_mixed_usedmem = 0;
 485 #endif
 486   unsigned int size = 0;
 487   unsigned int bound = 0;
 488   unsigned int tmpheaptop = checkCurrHeapTop();
 489
 490   // move large objs from gcheaptop to tmpheaptop
 491   // write the header first
 492   unsigned int tomove = gcbaseva+(BAMBOO_SHARED_MEM_SIZE)-gcheaptop;
 493 #ifdef SMEMM
 494   gcmem_mixed_usedmem += tomove;
 495 #endif
 496   // flush the sbstartbl
 497   BAMBOO_MEMSET_WH(&(gcsbstarttbl[gcreservedsb]),'\0',(BAMBOO_SHARED_MEM_SIZE/BAMBOO_SMEM_SIZE-(unsigned int)gcreservedsb)*sizeof(unsigned int));
 498   if(tomove == 0) {
 499     gcheaptop = tmpheaptop;
 500   } else {
 501     // check how many blocks it acrosses
 502     unsigned int remain = tmpheaptop-gcbaseva;
 503     //number of the sblock
 504     unsigned int sb = remain/BAMBOO_SMEM_SIZE+(unsigned int)gcreservedsb;
 505     unsigned int b = 0;  // number of the block
 506     BLOCKINDEX(tmpheaptop, &b);
 507     // check the remaining space in this block
 508     bound = (BAMBOO_SMEM_SIZE);
 509     if(remain < (BAMBOO_LARGE_SMEM_BOUND)) {
 510       bound = (BAMBOO_SMEM_SIZE_L);
 511     }
 512     remain = bound - remain%bound;
 513
 514     size = 0;
 515     unsigned int isize = 0;
 516     unsigned int host = 0;
 517     unsigned int ptr = 0;
 518     unsigned int base = tmpheaptop;
 519     unsigned int cpysize = 0;
 520     remain -= BAMBOO_CACHE_LINE_SIZE;
 521     tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
 522     gc_lobjqueueinit4_I();
 523     while(gc_lobjmoreItems4_I()) {
 524       ptr = (unsigned int)(gc_lobjdequeue4_I(&size, &host));
 525       ALIGNSIZE(size, &isize);
 526       if(remain >= isize) {
 527         remain -= isize;
 528         // move the large obj
 529         movelobj(tmpheaptop,ptr,size,isize);
 530         cpysize += isize;
 531         // update bamboo_smemtbl
 532         bamboo_smemtbl[b] += isize;
 533       } else {
 534         // this object acrosses blocks
 535         if(cpysize > 0) {
 536           CLOSEBLOCK(base, cpysize+BAMBOO_CACHE_LINE_SIZE);
 537           bamboo_smemtbl[b] += BAMBOO_CACHE_LINE_SIZE;
 538           cpysize = 0;
 539           base = tmpheaptop;
 540           if(remain == 0) {
 541             remain = BLOCKSIZE((tmpheaptop-gcbaseva)<(BAMBOO_LARGE_SMEM_BOUND));
 542           }
 543           remain -= BAMBOO_CACHE_LINE_SIZE;
 544           tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
 545           BLOCKINDEX(tmpheaptop, &b);
 546           sb = (unsigned int)(tmpheaptop-gcbaseva)/(BAMBOO_SMEM_SIZE)+gcreservedsb;
 547         }
 548         // move the obj
 549         movelobj(tmpheaptop,ptr,size,isize);
 550
 551         // set the gcsbstarttbl and bamboo_smemtbl
 552         unsigned int tmpsbs=1+(unsigned int)(isize-remain-1)/BAMBOO_SMEM_SIZE;
 553         for(int k = 1; k < tmpsbs; k++) {
 554           gcsbstarttbl[sb+k] = -1;
 555         }
 556         sb += tmpsbs;
 557         bound = BLOCKSIZE(b<NUMCORES4GC);
 558         BLOCKINDEX(tmpheaptop-1, &tmpsbs);
 559         for(; b < tmpsbs; b++) {
 560           bamboo_smemtbl[b] = bound;
 561           if(b==NUMCORES4GC-1) {
 562             bound = BAMBOO_SMEM_SIZE;
 563           }
 564         }
 565         if(((unsigned int)(isize-remain)%(BAMBOO_SMEM_SIZE)) == 0) {
 566           gcsbstarttbl[sb] = -1;
 567           remain = BLOCKSIZE((tmpheaptop-gcbaseva)<(BAMBOO_LARGE_SMEM_BOUND));
 568           bamboo_smemtbl[b] = bound;
 569         } else {
 570           gcsbstarttbl[sb] = (int)tmpheaptop;
 571           remain = tmpheaptop-gcbaseva;
 572           bamboo_smemtbl[b] = remain%bound;
 573           remain = bound - bamboo_smemtbl[b];
 574         }
 575
 576         CLOSEBLOCK(base, isize+BAMBOO_CACHE_LINE_SIZE);
 577         cpysize = 0;
 578         base = tmpheaptop;
 579         if(remain == BAMBOO_CACHE_LINE_SIZE) {
 580           // fill with 0 in case
 581           BAMBOO_MEMSET_WH(tmpheaptop, '\0', remain);
 582         }
 583         remain -= BAMBOO_CACHE_LINE_SIZE;
 584         tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
 585       }
 586     }
 587
 588     if(cpysize > 0) {
 589       CLOSEBLOCK(base, cpysize+BAMBOO_CACHE_LINE_SIZE);
 590       bamboo_smemtbl[b] += BAMBOO_CACHE_LINE_SIZE;
 591     } else {
 592       tmpheaptop -= BAMBOO_CACHE_LINE_SIZE;
 593     }
 594     gcheaptop = tmpheaptop;
 595   }
 596
 597   bamboo_free_block = 0;
 598   unsigned int tbound = 0;
 599   do {
 600     tbound=BLOCKSIZE(bamboo_free_block<NUMCORES4GC);
 601     if(bamboo_smemtbl[bamboo_free_block] == tbound) {
 602       bamboo_free_block++;
 603     } else {
 604       // the first non-full partition
 605       break;
 606     }
 607   } while(true);
 608
 609   GCPROFILE_RECORD_SPACE();
 610 }
 611
 612 void gc_collect(struct garbagelist * stackptr) {
 613   gc_status_info.gcprocessing = true;
 614   // inform the master that this core is at a gc safe point and is ready to
 615   // do gc
 616   send_msg_4(STARTUPCORE,GCFINISHPRE,BAMBOO_NUM_OF_CORE,self_numsendobjs,self_numreceiveobjs);
 617
 618   // core collector routine
 619   //wait for init phase
 620   WAITFORGCPHASE(INITPHASE);
 621
 622   GC_PRINTF("Do initGC\n");
 623   initGC();
 624   CACHEADAPT_GC(true);
 625   //send init finish msg to core coordinator
 626   send_msg_2(STARTUPCORE,GCFINISHINIT,BAMBOO_NUM_OF_CORE);
 627
 628   //wait for mark phase
 629   WAITFORGCPHASE(MARKPHASE);
 630
 631   GC_PRINTF("Start mark phase\n");
 632   mark(true, stackptr);
 633   GC_PRINTF("Finish mark phase, start compact phase\n");
 634   compact();
 635   GC_PRINTF("Finish compact phase\n");
 636
 637   WAITFORGCPHASE(FLUSHPHASE);
 638
 639   GC_PRINTF("Start flush phase\n");
 640   GCPROFILE_INFO_2_MASTER();
 641   flush(stackptr);
 642   GC_PRINTF("Finish flush phase\n");
 643
 644   CACHEADAPT_PHASE_CLIENT();
 645
 646   // invalidate all shared mem pointers
 647   bamboo_cur_msp = NULL;
 648   bamboo_smem_size = 0;
 649   bamboo_smem_zero_top = NULL;
 650   gcflag = false;
 651
 652   WAITFORGCPHASE(FINISHPHASE);
 653
 654   GC_PRINTF("Finish gc! \n");
 655 }
 656
 657 void gc_nocollect(struct garbagelist * stackptr) {
 658   gc_status_info.gcprocessing = true;
 659   // inform the master that this core is at a gc safe point and is ready to
 660   // do gc
 661   send_msg_4(STARTUPCORE,GCFINISHPRE,BAMBOO_NUM_OF_CORE,self_numsendobjs,self_numreceiveobjs);
 662
 663   WAITFORGCPHASE(INITPHASE);
 664
 665   GC_PRINTF("Do initGC\n");
 666   initGC();
 667   CACHEADAPT_GC(true);
 668   //send init finish msg to core coordinator
 669   send_msg_2(STARTUPCORE,GCFINISHINIT,BAMBOO_NUM_OF_CORE);
 670
 671   WAITFORGCPHASE(MARKPHASE);
 672
 673   GC_PRINTF("Start mark phase\n");
 674   mark(true, stackptr);
 675   GC_PRINTF("Finish mark phase, wait for flush\n");
 676
 677   // non-gc core collector routine
 678   WAITFORGCPHASE(FLUSHPHASE);
 679
 680   GC_PRINTF("Start flush phase\n");
 681   GCPROFILE_INFO_2_MASTER();
 682   flush(stackptr);
 683   GC_PRINTF("Finish flush phase\n");
 684
 685   CACHEADAPT_PHASE_CLIENT();
 686
 687   // invalidate all shared mem pointers
 688   bamboo_cur_msp = NULL;
 689   bamboo_smem_size = 0;
 690   bamboo_smem_zero_top = NULL;
 691
 692   gcflag = false;
 693   WAITFORGCPHASE(FINISHPHASE);
 694
 695   GC_PRINTF("Finish gc! \n");
 696 }
 697
 698 void master_mark(struct garbagelist *stackptr) {
 699   bool isfirst = true;
 700
 701   GC_PRINTF("Start mark phase \n");
 702   GC_SEND_MSG_1_TO_CLIENT(GCSTART);
 703   gc_status_info.gcphase = MARKPHASE;
 704   // mark phase
 705
 706   while(MARKPHASE == gc_status_info.gcphase) {
 707     mark(isfirst, stackptr);
 708     isfirst=false;
 709     // check gcstatus
 710     checkMarkStatus();
 711   }
 712 }
 713
 714 void master_getlargeobjs() {
 715   // send msgs to all cores requiring large objs info
 716   // Note: only need to ask gc cores, non-gc cores do not host any objs
 717   numconfirm = NUMCORES4GC - 1;
 718   for(int i = 1; i < NUMCORES4GC; i++) {
 719     send_msg_1(i,GCLOBJREQUEST);
 720   }
 721   gcloads[BAMBOO_NUM_OF_CORE] = gccurr_heaptop;
 722   //spin until we have all responses
 723   while(numconfirm!=0) ;
 724
 725   // check the heaptop
 726   if(gcheaptop < gcmarkedptrbound) {
 727     gcheaptop = gcmarkedptrbound;
 728   }
 729   GCPROFILE_ITEM();
 730   GC_PRINTF("prepare to cache large objs \n");
 731
 732   // cache all large objs
 733   BAMBOO_ASSERTMSG(cacheLObjs(), "Not enough space to cache large objects\n");
 734 }
 735
 736 void master_compact() {
 737   // predict number of blocks to fill for each core
 738   unsigned int tmpheaptop = 0;
 739   int numpbc = loadbalance(&tmpheaptop);
 740   //tprintf("numpbc: %d \n", numpbc);
 741
 742   numpbc = (BAMBOO_SHARED_MEM_SIZE)/(BAMBOO_SMEM_SIZE);
 743   GC_PRINTF("mark phase finished \n");
 744
 745   tmpheaptop = gcbaseva + (BAMBOO_SHARED_MEM_SIZE);
 746   for(int i = 0; i < NUMCORES4GC; i++) {
 747     unsigned int tmpcoreptr = 0;
 748     BASEPTR(i, numpbc, &tmpcoreptr);
 749     // init some data strutures for compact phase
 750     gcloads[i] = 0;
 751     gcfilledblocks[i] = 0;
 752     gcrequiredmems[i] = 0;
 753     gccorestatus[i] = 1;
 754     //send start compact messages to all cores
 755     //TODO bug here, do not know if the direction is positive or negtive?
 756     //if (tmpcoreptr < tmpheaptop) {
 757       gcstopblock[i] = numpbc+1;
 758       if(i != STARTUPCORE) {
 759         send_msg_2(i, GCSTARTCOMPACT, numpbc+1);
 760       } else {
 761         gcblock2fill = numpbc+1;
 762       }
 763     /*} else {
 764       gcstopblock[i] = numpbc;
 765       if(i != STARTUPCORE) {
 766         send_msg_2(i, GCSTARTCOMPACT, numpbc);
 767       } else {
 768         gcblock2fill = numpbc;
 769       }
 770     }*/
 771   }
 772   BAMBOO_CACHE_MF();
 773   GCPROFILE_ITEM();
 774   // compact phase
 775   struct moveHelper * orig = (struct moveHelper *)RUNMALLOC(sizeof(struct moveHelper));
 776   struct moveHelper * to = (struct moveHelper *)RUNMALLOC(sizeof(struct moveHelper));
 777   compact_master(orig, to);
 778   GCPROFILE_ITEM();
 779   GC_PRINTF("prepare to move large objs \n");
 780   // move largeObjs
 781   moveLObjs();
 782   GC_PRINTF("compact phase finished \n");
 783   RUNFREE(orig);
 784   RUNFREE(to);
 785 }
 786
 787 void master_updaterefs(struct garbagelist * stackptr) {
 788   gc_status_info.gcphase = FLUSHPHASE;
 789   GC_SEND_MSG_1_TO_CLIENT(GCSTARTFLUSH);
 790   GCPROFILE_ITEM();
 791   GC_PRINTF("Start flush phase \n");
 792   // flush phase
 793   flush(stackptr);
 794   GC_CHECK_ALL_CORE_STATUS(FLUSHPHASE==gc_status_info.gcphase);
 795   GC_PRINTF("Finish flush phase \n");
 796 }
 797
 798 void master_finish() {
 799   gc_status_info.gcphase = FINISHPHASE;
 800
 801   // invalidate all shared mem pointers
 802   // put it here as it takes time to inform all the other cores to
 803   // finish gc and it might cause problem when some core resumes
 804   // mutator earlier than the other cores
 805   bamboo_cur_msp = NULL;
 806   bamboo_smem_size = 0;
 807   bamboo_smem_zero_top = NULL;
 808
 809   GCPROFILE_END();
 810   unsigned long long tmpt = BAMBOO_GET_EXE_TIME();
 811   CACHEADAPT_OUTPUT_CACHE_POLICY();
 812   gc_output_cache_policy_time += (BAMBOO_GET_EXE_TIME()-tmpt);
 813   gcflag = false;
 814   GC_SEND_MSG_1_TO_CLIENT(GCFINISH);
 815
 816   gc_status_info.gcprocessing = false;
 817   if(gcflag) {
 818     // inform other cores to stop and wait for gc
 819     gcprecheck = true;
 820     for(int i = 0; i < NUMCORESACTIVE; i++) {
 821       // reuse the gcnumsendobjs & gcnumreceiveobjs
 822       gcnumsendobjs[0][i] = 0;
 823       gcnumreceiveobjs[0][i] = 0;
 824     }
 825     GC_SEND_MSG_1_TO_CLIENT(GCSTARTPRE);
 826   }
 827 }
 828
 829 void gc_master(struct garbagelist * stackptr) {
 830   tprintf("start GC !!!!!!!!!!!!! \n");
 831   gc_status_info.gcprocessing = true;
 832   gc_status_info.gcphase = INITPHASE;
 833
 834   waitconfirm = false;
 835   numconfirm = 0;
 836   initGC();
 837   GC_SEND_MSG_1_TO_CLIENT(GCSTARTINIT);
 838   CACHEADAPT_GC(true);
 839   GC_PRINTF("Check core status \n");
 840   GC_CHECK_ALL_CORE_STATUS(true);
 841   GCPROFILE_ITEM();
 842   unsigned long long tmpt = BAMBOO_GET_EXE_TIME();
 843   CACHEADAPT_OUTPUT_CACHE_SAMPLING();
 844   gc_output_cache_policy_time += (BAMBOO_GET_EXE_TIME()-tmpt);
 845
 846   // do mark phase
 847   master_mark(stackptr);
 848
 849   // get large objects from all cores
 850   master_getlargeobjs();
 851
 852   // compact the heap
 853   master_compact();
 854
 855   // update the references
 856   master_updaterefs(stackptr);
 857
 858   // do cache adaptation
 859   CACHEADAPT_PHASE_MASTER();
 860
 861   // do finish up stuff
 862   master_finish();
 863
 864   GC_PRINTF("gc finished   \n");
 865   tprintf("finish GC ! %d \n",gcflag);
 866 }
 867
 868 void pregccheck() {
 869   while(true) {
 870     BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
 871     gcnumsendobjs[0][BAMBOO_NUM_OF_CORE] = self_numsendobjs;
 872     gcnumreceiveobjs[0][BAMBOO_NUM_OF_CORE] = self_numreceiveobjs;
 873     int sumsendobj = 0;
 874     for(int i = 0; i < NUMCORESACTIVE; i++) {
 875       sumsendobj += gcnumsendobjs[0][i];
 876     }
 877     for(int i = 0; i < NUMCORESACTIVE; i++) {
 878       sumsendobj -= gcnumreceiveobjs[0][i];
 879     }
 880     if(0 != sumsendobj) {
 881       // there were still some msgs on the fly, wait until there
 882       // are some update pregc information coming and check it again
 883       gcprecheck = false;
 884       BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
 885
 886       while(!gcprecheck) ;
 887     } else {
 888       BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
 889       return;
 890     }
 891   }
 892 }
 893
 894 void pregcprocessing() {
 895 #if defined(GC_CACHE_ADAPT)&&defined(GC_CACHE_SAMPLING)
 896   // disable the timer interrupt
 897   bamboo_mask_timer_intr();
 898 #endif
 899   // Zero out the remaining memory here because for the GC_CACHE_ADAPT version,
 900   // we need to make sure during the gcinit phase the shared heap is not
 901   // touched. Otherwise, there would be problem when adapt the cache strategy.
 902   BAMBOO_CLOSE_CUR_MSP();
 903 #if defined(GC_CACHE_ADAPT)&&defined(GC_CACHE_SAMPLING)
 904   // get the sampling data
 905   bamboo_output_dtlb_sampling();
 906 #endif
 907 }
 908
 909 void postgcprocessing() {
 910 #if defined(GC_CACHE_ADAPT)&&defined(GC_CACHE_SAMPLING)
 911   // enable the timer interrupt
 912   bamboo_tile_timer_set_next_event(GC_TILE_TIMER_EVENT_SETTING);
 913   bamboo_unmask_timer_intr();
 914 #endif
 915 }
 916
 917 bool gc(struct garbagelist * stackptr) {
 918   // check if do gc
 919   if(!gcflag) {
 920     gc_status_info.gcprocessing = false;
 921     return false;
 922   }
 923
 924   // core coordinator routine
 925   if(0 == BAMBOO_NUM_OF_CORE) {
 926     GC_PRINTF("Check if we can do gc or not\n");
 927     gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
 928     if(!gc_checkAllCoreStatus()) {
 929       // some of the cores are still executing the mutator and did not reach
 930       // some gc safe point, therefore it is not ready to do gc
 931       gcflag = true;
 932       return false;
 933     } else {
 934       GCPROFILE_START();
 935       pregccheck();
 936     }
 937     GC_PRINTF("start gc! \n");
 938     pregcprocessing();
 939     gc_master(stackptr);
 940   } else if(BAMBOO_NUM_OF_CORE < NUMCORES4GC) {
 941     pregcprocessing();
 942     gc_collect(stackptr);
 943   } else {
 944     pregcprocessing();
 945     gc_nocollect(stackptr);
 946   }
 947   postgcprocessing();
 948
 949   return true;
 950 }
 951
 952 #endif