Robust/src/Runtime/bamboo/multicoregarbage.c

   1 // BAMBOO_EXIT(0xb000);
   2 // TODO: DO NOT support tag!!!
   3 #ifdef MULTICORE_GC
   4 #include "runtime.h"
   5 #include "multicoregarbage.h"
   6 #include "multicoregcmark.h"
   7 #include "multicoregccompact.h"
   8 #include "multicoregcflush.h"
   9 #include "multicoreruntime.h"
  10 #include "multicoregcprofile.h"
  11
  12 struct pointerblock *gchead=NULL;
  13 int gcheadindex=0;
  14 struct pointerblock *gctail=NULL;
  15 int gctailindex=0;
  16 struct pointerblock *gctail2=NULL;
  17 int gctailindex2=0;
  18 struct pointerblock *gcspare=NULL;
  19
  20 struct lobjpointerblock *gclobjhead=NULL;
  21 int gclobjheadindex=0;
  22 struct lobjpointerblock *gclobjtail=NULL;
  23 int gclobjtailindex=0;
  24 struct lobjpointerblock *gclobjtail2=NULL;
  25 int gclobjtailindex2=0;
  26 struct lobjpointerblock *gclobjspare=NULL;
  27
  28 #ifdef MULTICORE_GC
  29 #ifdef SMEMM
  30 extern unsigned int gcmem_mixed_threshold;
  31 extern unsigned int gcmem_mixed_usedmem;
  32 #endif // SMEMM
  33 #endif // MULTICORE_GC
  34
  35 #ifdef GC_DEBUG
  36 // dump whole mem in blocks
  37 INLINE void dumpSMem() {
  38   int block = 0;
  39   int sblock = 0;
  40   unsigned int j = 0;
  41   void * i = 0;
  42   int coren = 0;
  43   int x = 0;
  44   int y = 0;
  45   printf("(%x,%x) Dump shared mem: \n",udn_tile_coord_x(),udn_tile_coord_y());
  46   // reserved blocks for sblocktbl
  47   printf("(%x,%x) ++++ reserved sblocks ++++ \n", udn_tile_coord_x(),
  48       udn_tile_coord_y());
  49   for(i=BAMBOO_BASE_VA; (unsinged int)i<(unsigned int)gcbaseva; i+= 4*16) {
  50     printf("(%x,%x) 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x \n",
  51         udn_tile_coord_x(), udn_tile_coord_y(),
  52         *((int *)(i)), *((int *)(i + 4)),
  53         *((int *)(i + 4*2)), *((int *)(i + 4*3)),
  54         *((int *)(i + 4*4)), *((int *)(i + 4*5)),
  55         *((int *)(i + 4*6)), *((int *)(i + 4*7)),
  56         *((int *)(i + 4*8)), *((int *)(i + 4*9)),
  57         *((int *)(i + 4*10)), *((int *)(i + 4*11)),
  58         *((int *)(i + 4*12)), *((int *)(i + 4*13)),
  59         *((int *)(i + 4*14)), *((int *)(i + 4*15)));
  60   }
  61   sblock = gcreservedsb;
  62   bool advanceblock = false;
  63   // remaining memory
  64   for(i=gcbaseva;
  65       (unsigned int)i<(unsigned int)(gcbaseva+BAMBOO_SHARED_MEM_SIZE);
  66       i+=4*16) {
  67     advanceblock = false;
  68     // computing sblock # and block #, core coordinate (x,y) also
  69     if(j%((BAMBOO_SMEM_SIZE)/(4*16)) == 0) {
  70       // finished a sblock
  71       if(j < ((BAMBOO_LARGE_SMEM_BOUND)/(4*16))) {
  72     if((j > 0) && (j%((BAMBOO_SMEM_SIZE_L)/(4*16)) == 0)) {
  73       // finished a block
  74       block++;
  75       advanceblock = true;
  76     }
  77       } else {
  78     // finished a block
  79     block++;
  80     advanceblock = true;
  81       }
  82       // compute core #
  83       if(advanceblock) {
  84     coren = gc_block2core[block%(NUMCORES4GC*2)];
  85       }
  86       // compute core coordinate
  87       x = BAMBOO_COORDS_X(coren);
  88       y = BAMBOO_COORDS_Y(coren);
  89       printf("(%x,%x) ==== %d, %d : core (%d,%d), saddr %x====\n",
  90          udn_tile_coord_x(), udn_tile_coord_y(),
  91              block, sblock++, x, y,
  92              (sblock-1)*(BAMBOO_SMEM_SIZE)+gcbaseva);
  93     }
  94     j++;
  95     printf("(%x,%x) 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x \n",
  96         udn_tile_coord_x(), udn_tile_coord_y(),
  97         *((int *)(i)), *((int *)(i + 4)),
  98         *((int *)(i + 4*2)), *((int *)(i + 4*3)),
  99         *((int *)(i + 4*4)), *((int *)(i + 4*5)),
 100         *((int *)(i + 4*6)), *((int *)(i + 4*7)),
 101         *((int *)(i + 4*8)), *((int *)(i + 4*9)),
 102         *((int *)(i + 4*10)), *((int *)(i + 4*11)),
 103         *((int *)(i + 4*12)), *((int *)(i + 4*13)),
 104         *((int *)(i + 4*14)), *((int *)(i + 4*15)));
 105   }
 106   printf("(%x,%x) \n", udn_tile_coord_x(), udn_tile_coord_y());
 107 }
 108 #endif
 109
 110 INLINE void initmulticoregcdata() {
 111   int i = 0;
 112   if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
 113     // startup core to initialize corestatus[]
 114     for(i = 0; i < NUMCORESACTIVE; ++i) {
 115       gccorestatus[i] = 1;
 116       gcnumsendobjs[0][i] = gcnumsendobjs[1][i] = 0;
 117       gcnumreceiveobjs[0][i] = gcnumreceiveobjs[1][i] = 0;
 118     }
 119     for(i = 0; i < NUMCORES4GC; ++i) {
 120       gcloads[i] = 0;
 121       gcrequiredmems[i] = 0;
 122       gcstopblock[i] = 0;
 123       gcfilledblocks[i] = 0;
 124     }
 125   }
 126
 127   bamboo_smem_zero_top = NULL;
 128   gcflag = false;
 129   gcprocessing = false;
 130   gcphase = FINISHPHASE;
 131   gcprecheck = true;
 132   gccurr_heaptop = 0;
 133   gcself_numsendobjs = 0;
 134   gcself_numreceiveobjs = 0;
 135   gcmarkedptrbound = 0;
 136   gcforwardobjtbl = allocateMGCHash_I(20, 3);
 137   gcnumlobjs = 0;
 138   gcheaptop = 0;
 139   gctopcore = 0;
 140   gctopblock = 0;
 141   gcmovestartaddr = 0;
 142   gctomove = false;
 143   gcmovepending = 0;
 144   gcblock2fill = 0;
 145 #ifdef SMEMM
 146   gcmem_mixed_threshold = (unsigned int)((BAMBOO_SHARED_MEM_SIZE
 147                 -bamboo_reserved_smem*BAMBOO_SMEM_SIZE)*0.8);
 148   gcmem_mixed_usedmem = 0;
 149 #endif
 150 #ifdef MGC_SPEC
 151   gc_profile_flag = false;
 152 #endif
 153 #ifdef GC_FLUSH_DTLB
 154   gc_num_flush_dtlb = 0;
 155 #endif
 156   gc_localheap_s = false;
 157 #ifdef GC_CACHE_ADAPT
 158   gccachestage = false;
 159 #endif
 160
 161   INIT_MULTICORE_GCPROFILE_DATA();
 162 }
 163
 164 INLINE void dismulticoregcdata() {
 165   freeMGCHash(gcforwardobjtbl);
 166 }
 167
 168 INLINE void initGC() {
 169   int i;
 170   if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
 171     for(i = 0; i < NUMCORES4GC; ++i) {
 172       gccorestatus[i] = 1;
 173       gcnumsendobjs[0][i] = gcnumsendobjs[1][i] = 0;
 174       gcnumreceiveobjs[0][i] = gcnumreceiveobjs[1][i] = 0;
 175       gcloads[i] = 0;
 176       gcrequiredmems[i] = 0;
 177       gcfilledblocks[i] = 0;
 178       gcstopblock[i] = 0;
 179     }
 180     for(i = NUMCORES4GC; i < NUMCORESACTIVE; ++i) {
 181       gccorestatus[i] = 1;
 182       gcnumsendobjs[0][i] = gcnumsendobjs[1][i] = 0;
 183       gcnumreceiveobjs[0][i] = gcnumreceiveobjs[1][i] = 0;
 184     }
 185     gcheaptop = 0;
 186     gctopcore = 0;
 187     gctopblock = 0;
 188   gcnumsrobjs_index = 0;
 189   }
 190   gcself_numsendobjs = 0;
 191   gcself_numreceiveobjs = 0;
 192   gcmarkedptrbound = 0;
 193   gcnumlobjs = 0;
 194   gcmovestartaddr = 0;
 195   gctomove = false;
 196   gcblock2fill = 0;
 197   gcmovepending = 0;
 198   gccurr_heaptop = 0;
 199   gcdstcore = 0;
 200
 201   // initialize queue
 202   if (gchead==NULL) {
 203     gcheadindex=gctailindex=gctailindex2 = 0;
 204     gchead=gctail=gctail2=RUNMALLOC(sizeof(struct pointerblock));
 205   } else {
 206     gctailindex = gctailindex2 = gcheadindex = 0;
 207     gctail = gctail2 = gchead;
 208   }
 209
 210   // initialize the large obj queues
 211   if (gclobjhead==NULL) {
 212     gclobjheadindex=0;
 213     gclobjtailindex=0;
 214     gclobjtailindex2 = 0;
 215     gclobjhead=gclobjtail=gclobjtail2=
 216     RUNMALLOC(sizeof(struct lobjpointerblock));
 217   } else {
 218     gclobjtailindex = gclobjtailindex2 = gclobjheadindex = 0;
 219     gclobjtail = gclobjtail2 = gclobjhead;
 220   }
 221   gclobjhead->next = gclobjhead->prev = NULL;
 222
 223   freeMGCHash(gcforwardobjtbl);
 224   gcforwardobjtbl = allocateMGCHash(20, 3);
 225
 226   GCPROFILE_INIT();
 227 }
 228
 229 INLINE bool gc_checkAllCoreStatus_I() {
 230   int i = 0;
 231   for(i = 0; i < NUMCORESACTIVE; ++i) {
 232     if(gccorestatus[i] != 0) {
 233       break;
 234     }
 235   }
 236   return (i == NUMCORESACTIVE);
 237 }
 238
 239 INLINE void checkMarkStatue() {
 240   int i;
 241   if((!waitconfirm) ||
 242       (waitconfirm && (numconfirm == 0))) {
 243     unsigned int entry_index = 0;
 244     if(waitconfirm) {
 245       // phase 2
 246       entry_index = (gcnumsrobjs_index == 0) ? 1 : 0;
 247     } else {
 248       // phase 1
 249       entry_index = gcnumsrobjs_index;
 250     }
 251     BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
 252     gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
 253     gcnumsendobjs[entry_index][BAMBOO_NUM_OF_CORE] = gcself_numsendobjs;
 254     gcnumreceiveobjs[entry_index][BAMBOO_NUM_OF_CORE] = gcself_numreceiveobjs;
 255     // check the status of all cores
 256     bool allStall = gc_checkAllCoreStatus_I();
 257     if(allStall) {
 258       // ask for confirm
 259       if(!waitconfirm) {
 260         // the first time found all cores stall
 261         // send out status confirm msg to all other cores
 262         // reset the corestatus array too
 263         gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
 264         waitconfirm = true;
 265         numconfirm = NUMCORESACTIVE - 1;
 266         BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
 267         for(i = 1; i < NUMCORESACTIVE; ++i) {
 268           gccorestatus[i] = 1;
 269           // send mark phase finish confirm request msg to core i
 270           send_msg_1(i, GCMARKCONFIRM, false);
 271         }
 272       } else {
 273         // Phase 2
 274         // check if the sum of send objs and receive obj are the same
 275         // yes->check if the info is the latest; no->go on executing
 276         unsigned int sumsendobj = 0;
 277         for(i = 0; i < NUMCORESACTIVE; ++i) {
 278           sumsendobj += gcnumsendobjs[gcnumsrobjs_index][i];
 279         }
 280         for(i = 0; i < NUMCORESACTIVE; ++i) {
 281           sumsendobj -= gcnumreceiveobjs[gcnumsrobjs_index][i];
 282         }
 283         if(0 == sumsendobj) {
 284           // Check if there are changes of the numsendobjs or numreceiveobjs on
 285           // each core
 286           bool ischanged = false;
 287           for(i = 0; i < NUMCORESACTIVE; ++i) {
 288             if((gcnumsendobjs[0][i] != gcnumsendobjs[1][i]) ||
 289                 (gcnumreceiveobjs[0][i] != gcnumreceiveobjs[1][i]) ) {
 290               ischanged = true;
 291               break;
 292             }
 293           }
 294           if(!ischanged) {
 295             // all the core status info are the latest,stop mark phase
 296             gcphase = COMPACTPHASE;
 297             // restore the gcstatus for all cores
 298             for(i = 0; i < NUMCORESACTIVE; ++i) {
 299               gccorestatus[i] = 1;
 300             }
 301           } else {
 302             // There were changes between phase 1 and phase 2, can not decide
 303             // whether the mark phase has been finished
 304             waitconfirm = false;
 305             // As it fails in phase 2, flip the entries
 306             gcnumsrobjs_index = (gcnumsrobjs_index == 0) ? 1 : 0;
 307           }
 308         } else {
 309           // There were changes between phase 1 and phase 2, can not decide
 310           // whether the mark phase has been finished
 311           waitconfirm = false;
 312           // As it fails in phase 2, flip the entries
 313           gcnumsrobjs_index = (gcnumsrobjs_index == 0) ? 1 : 0;
 314         }
 315         BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
 316       }
 317     } else {
 318       BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
 319     }
 320   }
 321 }
 322
 323 // compute load balance for all cores
 324 INLINE int loadbalance(unsigned int * heaptop) {
 325   // compute load balance
 326   int i;
 327
 328   // get the total loads
 329   unsigned int tloads = gcloads[STARTUPCORE];
 330   for(i = 1; i < NUMCORES4GC; i++) {
 331     tloads += gcloads[i];
 332   }
 333   *heaptop = gcbaseva + tloads;
 334
 335   unsigned int b = 0;
 336   BLOCKINDEX(*heaptop, &b);
 337   // num of blocks per core
 338   unsigned int numbpc = (unsigned int)b/(unsigned int)(NUMCORES4GC);
 339   gctopblock = b;
 340   RESIDECORE(heaptop, &gctopcore);
 341   return numbpc;
 342 }
 343
 344 // compute total mem size required and sort the lobjs in ascending order
 345 INLINE unsigned int sortLObjs() {
 346   unsigned int tmp_lobj = 0;
 347   unsigned int tmp_len = 0;
 348   unsigned int tmp_host = 0;
 349   unsigned int sumsize = 0;
 350
 351   gclobjtail2 = gclobjtail;
 352   gclobjtailindex2 = gclobjtailindex;
 353   // TODO USE QUICK SORT INSTEAD?
 354   while(gc_lobjmoreItems2_I()) {
 355     gc_lobjdequeue2_I();
 356     tmp_lobj = gclobjtail2->lobjs[gclobjtailindex2-1];
 357     tmp_host = gclobjtail2->hosts[gclobjtailindex2-1];
 358     tmp_len = gclobjtail2->lengths[gclobjtailindex2 - 1];
 359     sumsize += tmp_len;
 360     GCPROFILE_RECORD_LOBJ();
 361     unsigned int i = gclobjtailindex2-1;
 362     struct lobjpointerblock * tmp_block = gclobjtail2;
 363     // find the place to insert
 364     while(true) {
 365       if(i == 0) {
 366     if(tmp_block->prev == NULL) {
 367       break;
 368     }
 369     if(tmp_block->prev->lobjs[NUMLOBJPTRS-1] > tmp_lobj) {
 370       tmp_block->lobjs[i] = tmp_block->prev->lobjs[NUMLOBJPTRS-1];
 371       tmp_block->lengths[i] = tmp_block->prev->lengths[NUMLOBJPTRS-1];
 372       tmp_block->hosts[i] = tmp_block->prev->hosts[NUMLOBJPTRS-1];
 373       tmp_block = tmp_block->prev;
 374       i = NUMLOBJPTRS-1;
 375     } else {
 376       break;
 377     }  // if(tmp_block->prev->lobjs[NUMLOBJPTRS-1] < tmp_lobj)
 378       } else {
 379     if(tmp_block->lobjs[i-1] > tmp_lobj) {
 380       tmp_block->lobjs[i] = tmp_block->lobjs[i-1];
 381       tmp_block->lengths[i] = tmp_block->lengths[i-1];
 382       tmp_block->hosts[i] = tmp_block->hosts[i-1];
 383       i--;
 384     } else {
 385       break;
 386     }
 387       }
 388     }
 389     // insert it
 390     if(i != gclobjtailindex2 - 1) {
 391       tmp_block->lobjs[i] = tmp_lobj;
 392       tmp_block->lengths[i] = tmp_len;
 393       tmp_block->hosts[i] = tmp_host;
 394     }
 395   }
 396   return sumsize;
 397 }
 398
 399 INLINE bool cacheLObjs() {
 400   // check the total mem size need for large objs
 401   unsigned long long sumsize = 0;
 402   unsigned int size = 0;
 403
 404   sumsize = sortLObjs();
 405
 406   GCPROFILE_RECORD_LOBJSPACE();
 407
 408   // check if there are enough space to cache these large objs
 409   unsigned int dst = gcbaseva + (BAMBOO_SHARED_MEM_SIZE) -sumsize;
 410   if((unsigned long long)gcheaptop > (unsigned long long)dst) {
 411     // do not have enough room to cache large objs
 412     return false;
 413   }
 414
 415   gcheaptop = dst; // Note: record the start of cached lobjs with gcheaptop
 416   // cache the largeObjs to the top of the shared heap
 417   dst = gcbaseva + (BAMBOO_SHARED_MEM_SIZE);
 418   while(gc_lobjmoreItems3_I()) {
 419     gc_lobjdequeue3_I();
 420     size = gclobjtail2->lengths[gclobjtailindex2];
 421     // set the mark field to , indicating that this obj has been moved
 422     // and need to be flushed
 423     ((struct ___Object___ *)(gclobjtail2->lobjs[gclobjtailindex2]))->marked =
 424       COMPACTED;
 425     dst -= size;
 426     if((unsigned int)dst <
 427         (unsigned int)(gclobjtail2->lobjs[gclobjtailindex2]+size)) {
 428       memmove(dst, gclobjtail2->lobjs[gclobjtailindex2], size);
 429     } else {
 430       memcpy(dst, gclobjtail2->lobjs[gclobjtailindex2], size);
 431     }
 432   }
 433   return true;
 434 }
 435
 436 // update the bmmboo_smemtbl to record current shared mem usage
 437 void updateSmemTbl(unsigned int coren,
 438                    unsigned int localtop) {
 439   unsigned int ltopcore = 0;
 440   unsigned int bound = BAMBOO_SMEM_SIZE_L;
 441   BLOCKINDEX(localtop, &ltopcore);
 442   if((unsigned int)localtop>=(unsigned int)(gcbaseva+BAMBOO_LARGE_SMEM_BOUND)){
 443     bound = BAMBOO_SMEM_SIZE;
 444   }
 445   unsigned int load = (unsigned int)(localtop-gcbaseva)%(unsigned int)bound;
 446   unsigned int i = 0;
 447   unsigned int j = 0;
 448   unsigned int toset = 0;
 449   do {
 450     toset = gc_core2block[2*coren+i]+(unsigned int)(NUMCORES4GC*2)*j;
 451     if(toset < ltopcore) {
 452       bamboo_smemtbl[toset]=
 453         (toset<NUMCORES4GC) ? BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
 454 #ifdef SMEMM
 455       gcmem_mixed_usedmem += bamboo_smemtbl[toset];
 456 #endif
 457     } else if(toset == ltopcore) {
 458       bamboo_smemtbl[toset] = load;
 459 #ifdef SMEMM
 460       gcmem_mixed_usedmem += bamboo_smemtbl[toset];
 461 #endif
 462       break;
 463     } else {
 464       break;
 465     }
 466     i++;
 467     if(i == 2) {
 468       i = 0;
 469       j++;
 470     }
 471   } while(true);
 472 }
 473
 474 INLINE unsigned int checkCurrHeapTop() {
 475   // update the smemtbl
 476   BAMBOO_MEMSET_WH(bamboo_smemtbl, 0, sizeof(int)*gcnumblock);
 477   // flush all gcloads to indicate the real heap top on one core
 478   // previous it represents the next available ptr on a core
 479   if(((unsigned int)gcloads[0] > (unsigned int)(gcbaseva+BAMBOO_SMEM_SIZE_L))
 480      && (((unsigned int)gcloads[0]%(BAMBOO_SMEM_SIZE)) == 0)) {
 481     // edge of a block, check if this is exactly the heaptop
 482     BASEPTR(0, gcfilledblocks[0]-1, &(gcloads[0]));
 483     gcloads[0]+=(gcfilledblocks[0]>1?(BAMBOO_SMEM_SIZE):(BAMBOO_SMEM_SIZE_L));
 484   }
 485   updateSmemTbl(0, gcloads[0]);
 486   for(int i = 1; i < NUMCORES4GC; i++) {
 487     unsigned int tmptop = 0;
 488     if((gcfilledblocks[i] > 0)
 489        && (((unsigned int)gcloads[i] % (BAMBOO_SMEM_SIZE)) == 0)) {
 490       // edge of a block, check if this is exactly the heaptop
 491       BASEPTR(i, gcfilledblocks[i]-1, &gcloads[i]);
 492       gcloads[i] +=
 493         (gcfilledblocks[i]>1?(BAMBOO_SMEM_SIZE):(BAMBOO_SMEM_SIZE_L));
 494       tmptop = gcloads[i];
 495     }
 496     updateSmemTbl(i, gcloads[i]);
 497   }
 498
 499   // find current heap top
 500   // TODO
 501   // a bug here: when using local allocation, directly move large objects
 502   // to the highest free chunk might not be memory efficient
 503   unsigned int tmpheaptop = 0;
 504   int i = 0;
 505   for(i = gcnumblock-1; i >= 0; i--) {
 506     if(bamboo_smemtbl[i] > 0) {
 507       break;
 508     }
 509   }
 510   if(i == -1) {
 511     tmpheaptop = gcbaseva;
 512   } else {
 513     tmpheaptop = gcbaseva+bamboo_smemtbl[i]+((i<NUMCORES4GC) ?
 514         (BAMBOO_SMEM_SIZE_L*i) :
 515         (BAMBOO_SMEM_SIZE*(i-NUMCORES4GC)+BAMBOO_LARGE_SMEM_BOUND));
 516   }
 517   return tmpheaptop;
 518 }
 519
 520 INLINE void moveLObjs() {
 521 #ifdef SMEMM
 522   // update the gcmem_mixed_usedmem
 523   gcmem_mixed_usedmem = 0;
 524 #endif
 525   unsigned int size = 0;
 526   unsigned int bound = 0;
 527   unsigned int tmpheaptop = checkCurrHeapTop();
 528
 529   // move large objs from gcheaptop to tmpheaptop
 530   // write the header first
 531   unsigned int tomove = gcbaseva+(BAMBOO_SHARED_MEM_SIZE)-gcheaptop;
 532 #ifdef SMEMM
 533   gcmem_mixed_usedmem += tomove;
 534 #endif
 535   // flush the sbstartbl
 536   BAMBOO_MEMSET_WH(&(gcsbstarttbl[gcreservedsb]), '\0',
 537     (BAMBOO_SHARED_MEM_SIZE/BAMBOO_SMEM_SIZE-(unsigned int)gcreservedsb)
 538     *sizeof(unsigned int));
 539   if(tomove == 0) {
 540     gcheaptop = tmpheaptop;
 541   } else {
 542     // check how many blocks it acrosses
 543     unsigned int remain = tmpheaptop-gcbaseva;
 544     //number of the sblock
 545     unsigned int sb = remain/BAMBOO_SMEM_SIZE+(unsigned int)gcreservedsb;
 546     unsigned int b = 0;  // number of the block
 547     BLOCKINDEX(tmpheaptop, &b);
 548     // check the remaining space in this block
 549     bound = (BAMBOO_SMEM_SIZE);
 550     if(remain < (BAMBOO_LARGE_SMEM_BOUND)) {
 551       bound = (BAMBOO_SMEM_SIZE_L);
 552     }
 553     remain = bound - remain%bound;
 554
 555     size = 0;
 556     unsigned int isize = 0;
 557     unsigned int host = 0;
 558     unsigned int ptr = 0;
 559     unsigned int base = tmpheaptop;
 560     unsigned int cpysize = 0;
 561     remain -= BAMBOO_CACHE_LINE_SIZE;
 562     tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
 563     gc_lobjqueueinit4_I();
 564     while(gc_lobjmoreItems4_I()) {
 565       ptr = (unsigned int)(gc_lobjdequeue4_I(&size, &host));
 566       ALIGNSIZE(size, &isize);
 567       if(remain >= isize) {
 568     remain -= isize;
 569     // move the large obj
 570     if((unsigned int)gcheaptop < (unsigned int)(tmpheaptop+size)) {
 571       memmove(tmpheaptop, gcheaptop, size);
 572     } else {
 573       memcpy(tmpheaptop, gcheaptop, size);
 574     }
 575     // fill the remaining space with -2 padding
 576     BAMBOO_MEMSET_WH(tmpheaptop+size, -2, isize-size);
 577
 578     gcheaptop += size;
 579     cpysize += isize;
 580     // cache the mapping info
 581     gcmappingtbl[OBJMAPPINGINDEX((unsigned int)ptr)]=(unsigned int)tmpheaptop;
 582     tmpheaptop += isize;
 583
 584     // update bamboo_smemtbl
 585     bamboo_smemtbl[b] += isize;
 586       } else {
 587     // this object acrosses blocks
 588     if(cpysize > 0) {
 589       CLOSEBLOCK(base, cpysize+BAMBOO_CACHE_LINE_SIZE);
 590       bamboo_smemtbl[b] += BAMBOO_CACHE_LINE_SIZE;
 591       cpysize = 0;
 592       base = tmpheaptop;
 593       if(remain == 0) {
 594         remain = ((tmpheaptop-gcbaseva)<(BAMBOO_LARGE_SMEM_BOUND)) ?
 595           BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
 596       }
 597       remain -= BAMBOO_CACHE_LINE_SIZE;
 598       tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
 599       BLOCKINDEX(tmpheaptop, &b);
 600       sb = (unsigned int)(tmpheaptop-gcbaseva)/(BAMBOO_SMEM_SIZE)+gcreservedsb;
 601     }
 602
 603     // move the large obj
 604     if((unsigned int)gcheaptop < (unsigned int)(tmpheaptop+size)) {
 605       memmove(tmpheaptop, gcheaptop, size);
 606     } else {
 607       memcpy(tmpheaptop, gcheaptop, size);
 608     }
 609     // fill the remaining space with -2 padding
 610     BAMBOO_MEMSET_WH(tmpheaptop+size, -2, isize-size);
 611     gcheaptop += size;
 612     // cache the mapping info
 613     gcmappingtbl[OBJMAPPINGINDEX((unsigned int)ptr)]=(unsigned int)tmpheaptop;
 614     tmpheaptop += isize;
 615
 616     // set the gcsbstarttbl and bamboo_smemtbl
 617     unsigned int tmpsbs=1+(unsigned int)(isize-remain-1)/BAMBOO_SMEM_SIZE;
 618     for(int k = 1; k < tmpsbs; k++) {
 619       gcsbstarttbl[sb+k] = -1;
 620     }
 621     sb += tmpsbs;
 622     bound = (b<NUMCORES4GC) ? BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
 623     BLOCKINDEX(tmpheaptop-1, &tmpsbs);
 624     for(; b < tmpsbs; b++) {
 625       bamboo_smemtbl[b] = bound;
 626       if(b==NUMCORES4GC-1) {
 627         bound = BAMBOO_SMEM_SIZE;
 628       }
 629     }
 630     if(((unsigned int)(isize-remain)%(BAMBOO_SMEM_SIZE)) == 0) {
 631       gcsbstarttbl[sb] = -1;
 632       remain = ((tmpheaptop-gcbaseva)<(BAMBOO_LARGE_SMEM_BOUND)) ?
 633            BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
 634       bamboo_smemtbl[b] = bound;
 635     } else {
 636       gcsbstarttbl[sb] = (int)tmpheaptop;
 637       remain = tmpheaptop-gcbaseva;
 638       bamboo_smemtbl[b] = remain%bound;
 639       remain = bound - bamboo_smemtbl[b];
 640     }
 641
 642     CLOSEBLOCK(base, isize+BAMBOO_CACHE_LINE_SIZE);
 643     cpysize = 0;
 644     base = tmpheaptop;
 645     if(remain == BAMBOO_CACHE_LINE_SIZE) {
 646       // fill with 0 in case
 647       BAMBOO_MEMSET_WH(tmpheaptop, '\0', remain);
 648     }
 649     remain -= BAMBOO_CACHE_LINE_SIZE;
 650     tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
 651       }
 652     }
 653
 654     if(cpysize > 0) {
 655       CLOSEBLOCK(base, cpysize+BAMBOO_CACHE_LINE_SIZE);
 656       bamboo_smemtbl[b] += BAMBOO_CACHE_LINE_SIZE;
 657     } else {
 658       tmpheaptop -= BAMBOO_CACHE_LINE_SIZE;
 659     }
 660     gcheaptop = tmpheaptop;
 661   }
 662
 663   bamboo_free_block = 0;
 664   unsigned int tbound = 0;
 665   do {
 666     tbound=(bamboo_free_block<NUMCORES4GC)?BAMBOO_SMEM_SIZE_L:BAMBOO_SMEM_SIZE;
 667     if(bamboo_smemtbl[bamboo_free_block] == tbound) {
 668       bamboo_free_block++;
 669     } else {
 670       // the first non-full partition
 671       break;
 672     }
 673   } while(true);
 674
 675   GCPROFILE_RECORD_SPACE();
 676 }
 677
 678 INLINE void gc_collect(struct garbagelist * stackptr) {
 679   gcprocessing = true;
 680   tprintf("gc \n");
 681   // inform the master that this core is at a gc safe point and is ready to
 682   // do gc
 683   send_msg_4(STARTUPCORE, GCFINISHPRE, BAMBOO_NUM_OF_CORE, self_numsendobjs,
 684     self_numreceiveobjs, false);
 685
 686   // core collector routine
 687   while(true) {
 688     if(INITPHASE == gcphase) {
 689       break;
 690     }
 691   }
 692   GC_PRINTF("Do initGC\n");
 693   initGC();
 694   CACHEADAPT_GC(true);
 695   //send init finish msg to core coordinator
 696   send_msg_2(STARTUPCORE, GCFINISHINIT, BAMBOO_NUM_OF_CORE, false);
 697
 698   while(true) {
 699     if(MARKPHASE == gcphase) {
 700       break;
 701     }
 702   }
 703   GC_PRINTF("Start mark phase\n");
 704   mark(true, stackptr);
 705   GC_PRINTF("Finish mark phase, start compact phase\n");
 706   compact();
 707   GC_PRINTF("Finish compact phase\n");
 708
 709   while(true) {
 710     if(FLUSHPHASE == gcphase) {
 711       break;
 712     }
 713   }
 714   GC_PRINTF("Start flush phase\n");
 715   GCPROFILE_INFO_2_MASTER();
 716   flush(stackptr);
 717   GC_PRINTF("Finish flush phase\n");
 718
 719   CACHEADAPT_PHASE_CLIENT();
 720
 721   // invalidate all shared mem pointers
 722   bamboo_cur_msp = NULL;
 723   bamboo_smem_size = 0;
 724   bamboo_smem_zero_top = NULL;
 725
 726   gcflag = false;
 727   while(true) {
 728     if(FINISHPHASE == gcphase) {
 729       break;
 730     }
 731   }
 732
 733   GC_PRINTF("Finish gc! \n");
 734 }
 735
 736 INLINE void gc_nocollect(struct garbagelist * stackptr) {
 737   gcprocessing = true;
 738   tprintf("gc \n");
 739   // inform the master that this core is at a gc safe point and is ready to
 740   // do gc
 741   send_msg_4(STARTUPCORE, GCFINISHPRE, BAMBOO_NUM_OF_CORE, self_numsendobjs,
 742     self_numreceiveobjs, false);
 743
 744   while(true) {
 745     if(INITPHASE == gcphase) {
 746       break;
 747     }
 748   }
 749   GC_PRINTF("Do initGC\n");
 750   initGC();
 751   CACHEADAPT_GC(true);
 752   //send init finish msg to core coordinator
 753   send_msg_2(STARTUPCORE, GCFINISHINIT, BAMBOO_NUM_OF_CORE, false);
 754
 755   while(true) {
 756     if(MARKPHASE == gcphase) {
 757       break;
 758     }
 759   }
 760   GC_PRINTF("Start mark phase\n");
 761   mark(true, stackptr);
 762   GC_PRINTF("Finish mark phase, wait for flush\n");
 763
 764   // non-gc core collector routine
 765   while(true) {
 766     if(FLUSHPHASE == gcphase) {
 767       break;
 768     }
 769   }
 770   GC_PRINTF("Start flush phase\n");
 771   GCPROFILE_INFO_2_MASTER();
 772   flush(stackptr);
 773   GC_PRINTF("Finish flush phase\n");
 774
 775   CACHEADAPT_PHASE_CLIENT();
 776
 777   // invalidate all shared mem pointers
 778   bamboo_cur_msp = NULL;
 779   bamboo_smem_size = 0;
 780   bamboo_smem_zero_top = NULL;
 781
 782   gcflag = false;
 783   while(true) {
 784     if(FINISHPHASE == gcphase) {
 785       break;
 786     }
 787   }
 788   GC_PRINTF("Finish gc! \n");
 789 }
 790
 791 INLINE void gc_master(struct garbagelist * stackptr) {
 792   gcprocessing = true;
 793   tprintf("start GC !!!!!!!!!!!!! \n");
 794
 795   gcphase = INITPHASE;
 796   int i = 0;
 797   waitconfirm = false;
 798   numconfirm = 0;
 799   initGC();
 800   GC_SEND_MSG_1_TO_CLIENT(GCSTARTINIT);
 801   CACHEADAPT_GC(true);
 802   GC_PRINTF("Check core status \n");
 803   GC_CHECK_ALL_CORE_STATUS(true);
 804   GCPROFILE_ITEM();
 805   CACHEADAPT_OUTPUT_CACHE_SAMPLING();
 806
 807   GC_PRINTF("(%x,%x) Start mark phase \n");
 808   GC_SEND_MSG_1_TO_CLIENT(GCSTART);
 809   gcphase = MARKPHASE;
 810   // mark phase
 811   bool isfirst = true;
 812   while(MARKPHASE == gcphase) {
 813     mark(isfirst, stackptr);
 814     if(isfirst) {
 815       isfirst = false;
 816     }
 817
 818     // check gcstatus
 819     checkMarkStatue();
 820   }
 821
 822   // send msgs to all cores requiring large objs info
 823   // Note: only need to ask gc cores, non-gc cores do not host any objs
 824   numconfirm = NUMCORES4GC - 1;
 825   for(i = 1; i < NUMCORES4GC; ++i) {
 826     send_msg_1(i, GCLOBJREQUEST, false);
 827   }
 828   gcloads[BAMBOO_NUM_OF_CORE] = gccurr_heaptop;
 829   while(true) {
 830     if(numconfirm==0) {
 831       break;
 832     }
 833   }   // wait for responses
 834   // check the heaptop
 835   if(gcheaptop < gcmarkedptrbound) {
 836     gcheaptop = gcmarkedptrbound;
 837   }
 838   GCPROFILE_ITEM();
 839   GC_PRINTF("prepare to cache large objs \n");
 840   // cache all large objs
 841   if(!cacheLObjs()) {
 842     // no enough space to cache large objs
 843     BAMBOO_EXIT(0xb02e);
 844   }
 845   // predict number of blocks to fill for each core
 846   unsigned int tmpheaptop = 0;
 847   int numpbc = loadbalance(&tmpheaptop);
 848   // TODO
 849   numpbc = (BAMBOO_SHARED_MEM_SIZE)/(BAMBOO_SMEM_SIZE);
 850   GC_PRINTF("mark phase finished \n");
 851
 852   //int tmptopptr = 0;
 853   //BASEPTR(gctopcore, 0, &tmptopptr);
 854   // TODO
 855   //tmptopptr = gcbaseva + (BAMBOO_SHARED_MEM_SIZE);
 856   tmpheaptop = gcbaseva + (BAMBOO_SHARED_MEM_SIZE);
 857   for(i = 0; i < NUMCORES4GC; ++i) {
 858     unsigned int tmpcoreptr = 0;
 859     BASEPTR(i, numpbc, &tmpcoreptr);
 860     // init some data strutures for compact phase
 861     gcloads[i] = 0;
 862     gcfilledblocks[i] = 0;
 863     gcrequiredmems[i] = 0;
 864     gccorestatus[i] = 1;
 865     //send start compact messages to all cores
 866     //TODO bug here, do not know if the direction is positive or negtive?
 867     if (tmpcoreptr < tmpheaptop) {
 868       gcstopblock[i] = numpbc + 1;
 869       if(i != STARTUPCORE) {
 870         send_msg_2(i, GCSTARTCOMPACT, numpbc+1, false);
 871       } else {
 872         gcblock2fill = numpbc+1;
 873       }
 874     } else {
 875       gcstopblock[i] = numpbc;
 876       if(i != STARTUPCORE) {
 877         send_msg_2(i, GCSTARTCOMPACT, numpbc, false);
 878       } else {
 879         gcblock2fill = numpbc;
 880       }
 881     }
 882   }
 883   BAMBOO_CACHE_MF();
 884   GCPROFILE_ITEM();
 885   // compact phase
 886   struct moveHelper * orig =
 887     (struct moveHelper *)RUNMALLOC(sizeof(struct moveHelper));
 888   struct moveHelper * to =
 889     (struct moveHelper *)RUNMALLOC(sizeof(struct moveHelper));
 890   compact_master(orig, to);
 891   GCPROFILE_ITEM();
 892   GC_PRINTF("prepare to move large objs \n");
 893   // move largeObjs
 894   moveLObjs();
 895   GC_PRINTF("compact phase finished \n");
 896   RUNFREE(orig);
 897   RUNFREE(to);
 898   orig = to = NULL;
 899
 900   gcphase = FLUSHPHASE;
 901   GC_SEND_MSG_1_TO_CLIENT(GCSTARTFLUSH);
 902   GCPROFILE_ITEM();
 903   GC_PRINTF("Start flush phase \n");
 904   // flush phase
 905   flush(stackptr);
 906   // now the master core need to decide the new cache strategy
 907   CACHEADAPT_MASTER();
 908   GC_CHECK_ALL_CORE_STATUS(FLUSHPHASE==gcphase);
 909   GC_PRINTF("Finish flush phase \n");
 910
 911   CACHEADAPT_PHASE_MASTER();
 912
 913   gcphase = FINISHPHASE;
 914
 915   // invalidate all shared mem pointers
 916   // put it here as it takes time to inform all the other cores to
 917   // finish gc and it might cause problem when some core resumes
 918   // mutator earlier than the other cores
 919   bamboo_cur_msp = NULL;
 920   bamboo_smem_size = 0;
 921   bamboo_smem_zero_top = NULL;
 922
 923   GCPROFILE_END();
 924   gcflag = false;
 925   GC_SEND_MSG_1_TO_CLIENT(GCFINISH);
 926
 927   gcprocessing = false;
 928   if(gcflag) {
 929     // inform other cores to stop and wait for gc
 930     gcprecheck = true;
 931     for(int i = 0; i < NUMCORESACTIVE; i++) {
 932       // reuse the gcnumsendobjs & gcnumreceiveobjs
 933       gcnumsendobjs[0][i] = 0;
 934       gcnumreceiveobjs[0][i] = 0;
 935     }
 936     GC_SEND_MSG_1_TO_CLIENT(GCSTARTPRE);
 937   }
 938   GC_PRINTF("gc finished   \n");
 939   tprintf("finish GC ! %d \n", gcflag);
 940 }
 941
 942 INLINE void pregccheck_I() {
 943   while(true) {
 944     gcnumsendobjs[0][BAMBOO_NUM_OF_CORE] = self_numsendobjs;
 945     gcnumreceiveobjs[0][BAMBOO_NUM_OF_CORE] = self_numreceiveobjs;
 946     int sumsendobj = 0;
 947     int i = 0;
 948     for(i = 0; i < NUMCORESACTIVE; ++i) {
 949       sumsendobj += gcnumsendobjs[0][i];
 950     }
 951     for(i = 0; i < NUMCORESACTIVE; ++i) {
 952       sumsendobj -= gcnumreceiveobjs[0][i];
 953     }
 954     if(0 != sumsendobj) {
 955       // there were still some msgs on the fly, wait until there
 956       // are some update pregc information coming and check it again
 957       gcprecheck = false;
 958       BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
 959       while(true) {
 960         if(gcprecheck) {
 961           break;
 962         }
 963       }
 964       BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
 965     } else {
 966       return;
 967     }
 968   }
 969 }
 970
 971 INLINE void pregcprocessing() {
 972 #ifdef GC_CACHE_ADAPT
 973 #ifdef GC_CACHE_SAMPLING
 974   // disable the timer interrupt
 975   bamboo_mask_timer_intr();
 976 #endif
 977 #endif
 978   // Zero out the remaining memory here because for the GC_CACHE_ADAPT version,
 979   // we need to make sure during the gcinit phase the shared heap is not
 980   // touched. Otherwise, there would be problem when adapt the cache strategy.
 981   BAMBOO_CLOSE_CUR_MSP();
 982 #ifdef GC_FLUSH_DTLB
 983   if(gc_num_flush_dtlb < GC_NUM_FLUSH_DTLB) {
 984     BAMBOO_CLEAN_DTLB();
 985     gc_num_flush_dtlb++;
 986   }
 987 #endif
 988 #ifdef GC_CACHE_ADAPT
 989 #ifdef GC_CACHE_SAMPLING
 990   // get the sampling data
 991   bamboo_output_dtlb_sampling();
 992 #endif
 993 #endif
 994 }
 995
 996 INLINE void postgcprocessing() {
 997 #ifdef GC_CACHE_ADAPT
 998 #ifdef GC_CACHE_SAMPLING
 999   // enable the timer interrupt
1000   bamboo_tile_timer_set_next_event(GC_TILE_TIMER_EVENT_SETTING);
1001   bamboo_unmask_timer_intr();
1002 #endif
1003 #endif
1004 }
1005
1006 INLINE bool gc(struct garbagelist * stackptr) {
1007   // check if do gc
1008   if(!gcflag) {
1009     gcprocessing = false;
1010     return false;
1011   }
1012
1013   // core coordinator routine
1014   if(0 == BAMBOO_NUM_OF_CORE) {
1015     GC_PRINTF("Check if we can do gc or not\n");
1016     gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
1017     BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
1018     if(!gc_checkAllCoreStatus_I()) {
1019       BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
1020       // some of the cores are still executing the mutator and did not reach
1021       // some gc safe point, therefore it is not ready to do gc
1022       gcflag = true;
1023       return false;
1024     } else {
1025       GCPROFILE_START();
1026       pregccheck_I();
1027       BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
1028     }
1029     GC_PRINTF("start gc! \n");
1030     pregcprocessing();
1031     gc_master(stackptr);
1032   } else if(BAMBOO_NUM_OF_CORE < NUMCORES4GC) {
1033     pregcprocessing();
1034     gc_collect(stackptr);
1035   } else {
1036     pregcprocessing();
1037     gc_nocollect(stackptr);
1038   }
1039   postgcprocessing();
1040
1041   return true;
1042 }
1043
1044 #endif