Robust/src/Runtime/bamboo/multicoregarbage.c

   1 // BAMBOO_EXIT(0xb000);
   2 // TODO: DO NOT support tag!!!
   3 #ifdef MULTICORE_GC
   4 #include "runtime.h"
   5 #include "multicoregarbage.h"
   6 #include "multicoregcmark.h"
   7 #include "multicoregccompact.h"
   8 #include "multicoregcflush.h"
   9 #include "multicoreruntime.h"
  10 #include "multicoregcprofile.h"
  11
  12 struct pointerblock *gchead=NULL;
  13 int gcheadindex=0;
  14 struct pointerblock *gctail=NULL;
  15 int gctailindex=0;
  16 struct pointerblock *gctail2=NULL;
  17 int gctailindex2=0;
  18 struct pointerblock *gcspare=NULL;
  19
  20 struct lobjpointerblock *gclobjhead=NULL;
  21 int gclobjheadindex=0;
  22 struct lobjpointerblock *gclobjtail=NULL;
  23 int gclobjtailindex=0;
  24 struct lobjpointerblock *gclobjtail2=NULL;
  25 int gclobjtailindex2=0;
  26 struct lobjpointerblock *gclobjspare=NULL;
  27
  28 #ifdef MULTICORE_GC
  29 #ifdef SMEMM
  30 extern unsigned int gcmem_mixed_threshold;
  31 extern unsigned int gcmem_mixed_usedmem;
  32 #endif // SMEMM
  33 #endif // MULTICORE_GC
  34
  35 #ifdef GC_DEBUG
  36 // dump whole mem in blocks
  37 void dumpSMem() {
  38   int block = 0;
  39   int sblock = 0;
  40   unsigned int j = 0;
  41   void * i = 0;
  42   int coren = 0;
  43   int x = 0;
  44   int y = 0;
  45   printf("(%x,%x) Dump shared mem: \n",udn_tile_coord_x(),udn_tile_coord_y());
  46   // reserved blocks for sblocktbl
  47   printf("(%x,%x) ++++ reserved sblocks ++++ \n", udn_tile_coord_x(),
  48          udn_tile_coord_y());
  49   for(i=BAMBOO_BASE_VA; (unsinged int)i<(unsigned int)gcbaseva; i+= 4*16) {
  50     printf("(%x,%x) 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x \n",
  51            udn_tile_coord_x(), udn_tile_coord_y(),
  52            *((int *)(i)), *((int *)(i + 4)),
  53            *((int *)(i + 4*2)), *((int *)(i + 4*3)),
  54            *((int *)(i + 4*4)), *((int *)(i + 4*5)),
  55            *((int *)(i + 4*6)), *((int *)(i + 4*7)),
  56            *((int *)(i + 4*8)), *((int *)(i + 4*9)),
  57            *((int *)(i + 4*10)), *((int *)(i + 4*11)),
  58            *((int *)(i + 4*12)), *((int *)(i + 4*13)),
  59            *((int *)(i + 4*14)), *((int *)(i + 4*15)));
  60   }
  61   sblock = gcreservedsb;
  62   bool advanceblock = false;
  63   // remaining memory
  64   for(i=gcbaseva; (unsigned int)i<(unsigned int)(gcbaseva+BAMBOO_SHARED_MEM_SIZE); i+=4*16) {
  65     advanceblock = false;
  66     // computing sblock # and block #, core coordinate (x,y) also
  67     if(j%((BAMBOO_SMEM_SIZE)/(4*16)) == 0) {
  68       // finished a sblock
  69       if(j < ((BAMBOO_LARGE_SMEM_BOUND)/(4*16))) {
  70         if((j > 0) && (j%((BAMBOO_SMEM_SIZE_L)/(4*16)) == 0)) {
  71           // finished a block
  72           block++;
  73           advanceblock = true;
  74         }
  75       } else {
  76         // finished a block
  77         block++;
  78         advanceblock = true;
  79       }
  80       // compute core #
  81       if(advanceblock) {
  82         coren = gc_block2core[block%(NUMCORES4GC*2)];
  83       }
  84       // compute core coordinate
  85       x = BAMBOO_COORDS_X(coren);
  86       y = BAMBOO_COORDS_Y(coren);
  87       printf("(%x,%x) ==== %d, %d : core (%d,%d), saddr %x====\n",
  88              udn_tile_coord_x(), udn_tile_coord_y(),
  89              block, sblock++, x, y,
  90              (sblock-1)*(BAMBOO_SMEM_SIZE)+gcbaseva);
  91     }
  92     j++;
  93     printf("(%x,%x) 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x \n",
  94            udn_tile_coord_x(), udn_tile_coord_y(),
  95            *((int *)(i)), *((int *)(i + 4)),
  96            *((int *)(i + 4*2)), *((int *)(i + 4*3)),
  97            *((int *)(i + 4*4)), *((int *)(i + 4*5)),
  98            *((int *)(i + 4*6)), *((int *)(i + 4*7)),
  99            *((int *)(i + 4*8)), *((int *)(i + 4*9)),
 100            *((int *)(i + 4*10)), *((int *)(i + 4*11)),
 101            *((int *)(i + 4*12)), *((int *)(i + 4*13)),
 102            *((int *)(i + 4*14)), *((int *)(i + 4*15)));
 103   }
 104   printf("(%x,%x) \n", udn_tile_coord_x(), udn_tile_coord_y());
 105 }
 106 #endif
 107
 108 void initmulticoregcdata() {
 109   if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
 110     // startup core to initialize corestatus[]
 111     int i;
 112     for(i = 0; i < NUMCORESACTIVE; ++i) {
 113       gccorestatus[i] = 1;
 114       gcnumsendobjs[0][i] = gcnumsendobjs[1][i] = 0;
 115       gcnumreceiveobjs[0][i] = gcnumreceiveobjs[1][i] = 0;
 116     }
 117     for(i = 0; i < NUMCORES4GC; ++i) {
 118       gcloads[i] = 0;
 119       gcrequiredmems[i] = 0;
 120       gcstopblock[i] = 0;
 121       gcfilledblocks[i] = 0;
 122     }
 123   }
 124
 125   bamboo_smem_zero_top = NULL;
 126   gcflag = false;
 127   gcprocessing = false;
 128   gcphase = FINISHPHASE;
 129   gcprecheck = true;
 130   gccurr_heaptop = 0;
 131   gcself_numsendobjs = 0;
 132   gcself_numreceiveobjs = 0;
 133   gcmarkedptrbound = 0;
 134   gcforwardobjtbl = allocateMGCHash_I(20, 3);
 135   gcnumlobjs = 0;
 136   gcheaptop = 0;
 137   gctopcore = 0;
 138   gctopblock = 0;
 139   gcmovestartaddr = 0;
 140   gctomove = false;
 141   gcmovepending = 0;
 142   gcblock2fill = 0;
 143 #ifdef SMEMM
 144   gcmem_mixed_threshold = (unsigned int)((BAMBOO_SHARED_MEM_SIZE
 145                 -bamboo_reserved_smem*BAMBOO_SMEM_SIZE)*0.8);
 146   gcmem_mixed_usedmem = 0;
 147 #endif
 148 #ifdef MGC_SPEC
 149   gc_profile_flag = false;
 150 #endif
 151 #ifdef GC_FLUSH_DTLB
 152   gc_num_flush_dtlb = 0;
 153 #endif
 154   gc_localheap_s = false;
 155 #ifdef GC_CACHE_ADAPT
 156   gccachestage = false;
 157 #endif
 158
 159   INIT_MULTICORE_GCPROFILE_DATA();
 160 }
 161
 162 void dismulticoregcdata() {
 163   freeMGCHash(gcforwardobjtbl);
 164 }
 165
 166 void initGC() {
 167   if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
 168     int i;
 169     for(i = 0; i < NUMCORES4GC; ++i) {
 170       gccorestatus[i] = 1;
 171       gcnumsendobjs[0][i] = gcnumsendobjs[1][i] = 0;
 172       gcnumreceiveobjs[0][i] = gcnumreceiveobjs[1][i] = 0;
 173       gcloads[i] = 0;
 174       gcrequiredmems[i] = 0;
 175       gcfilledblocks[i] = 0;
 176       gcstopblock[i] = 0;
 177     }
 178     for(i = NUMCORES4GC; i < NUMCORESACTIVE; ++i) {
 179       gccorestatus[i] = 1;
 180       gcnumsendobjs[0][i] = gcnumsendobjs[1][i] = 0;
 181       gcnumreceiveobjs[0][i] = gcnumreceiveobjs[1][i] = 0;
 182     }
 183     gcheaptop = 0;
 184     gctopcore = 0;
 185     gctopblock = 0;
 186   gcnumsrobjs_index = 0;
 187   }
 188   gcself_numsendobjs = 0;
 189   gcself_numreceiveobjs = 0;
 190   gcmarkedptrbound = 0;
 191   gcnumlobjs = 0;
 192   gcmovestartaddr = 0;
 193   gctomove = false;
 194   gcblock2fill = 0;
 195   gcmovepending = 0;
 196   gccurr_heaptop = 0;
 197   gcdstcore = 0;
 198
 199   // initialize queue
 200   if (gchead==NULL) {
 201     gcheadindex=gctailindex=gctailindex2 = 0;
 202     gchead=gctail=gctail2=RUNMALLOC(sizeof(struct pointerblock));
 203   } else {
 204     gctailindex=gctailindex2=gcheadindex=0;
 205     gctail=gctail2=gchead;
 206   }
 207   gchead->next=NULL;
 208
 209   // initialize the large obj queues
 210   if (gclobjhead==NULL) {
 211     gclobjheadindex=0;
 212     gclobjtailindex=0;
 213     gclobjtailindex2=0;
 214     gclobjhead=gclobjtail=gclobjtail2=
 215       RUNMALLOC(sizeof(struct lobjpointerblock));
 216   } else {
 217     gclobjtailindex=gclobjtailindex2=gclobjheadindex=0;
 218     gclobjtail=gclobjtail2=gclobjhead;
 219   }
 220   gclobjhead->next=gclobjhead->prev=NULL;
 221
 222   freeMGCHash(gcforwardobjtbl);
 223   gcforwardobjtbl = allocateMGCHash(20, 3);
 224
 225   GCPROFILE_INIT();
 226 }
 227
 228 bool gc_checkAllCoreStatus_I() {
 229   int i;
 230   for(i = 0; i < NUMCORESACTIVE; ++i) {
 231     if(gccorestatus[i] != 0) {
 232       return false;
 233     }
 234   }
 235   return true;
 236 }
 237
 238 INLINE void checkMarkStatus() {
 239   int i;
 240   if((!waitconfirm) ||
 241       (waitconfirm && (numconfirm == 0))) {
 242     unsigned int entry_index = 0;
 243     if(waitconfirm) {
 244       // phase 2
 245       entry_index = (gcnumsrobjs_index == 0) ? 1 : 0;
 246     } else {
 247       // phase 1
 248       entry_index = gcnumsrobjs_index;
 249     }
 250     BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
 251     gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
 252     gcnumsendobjs[entry_index][BAMBOO_NUM_OF_CORE] = gcself_numsendobjs;
 253     gcnumreceiveobjs[entry_index][BAMBOO_NUM_OF_CORE] = gcself_numreceiveobjs;
 254     // check the status of all cores
 255     if (gc_checkAllCoreStatus_I()) {
 256       // ask for confirm
 257       if(!waitconfirm) {
 258         // the first time found all cores stall
 259         // send out status confirm msg to all other cores
 260         // reset the corestatus array too
 261         gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
 262         waitconfirm = true;
 263         numconfirm = NUMCORESACTIVE - 1;
 264         BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
 265         for(i = 1; i < NUMCORESACTIVE; ++i) {
 266           gccorestatus[i] = 1;
 267           // send mark phase finish confirm request msg to core i
 268           send_msg_1(i, GCMARKCONFIRM, false);
 269         }
 270       } else {
 271         // Phase 2
 272         // check if the sum of send objs and receive obj are the same
 273         // yes->check if the info is the latest; no->go on executing
 274         unsigned int sumsendobj = 0;
 275         for(i = 0; i < NUMCORESACTIVE; ++i) {
 276           sumsendobj += gcnumsendobjs[gcnumsrobjs_index][i];
 277         }
 278         for(i = 0; i < NUMCORESACTIVE; ++i) {
 279           sumsendobj -= gcnumreceiveobjs[gcnumsrobjs_index][i];
 280         }
 281         if(0 == sumsendobj) {
 282           // Check if there are changes of the numsendobjs or numreceiveobjs on
 283           // each core
 284           bool ischanged = false;
 285           for(i = 0; i < NUMCORESACTIVE; ++i) {
 286             if((gcnumsendobjs[0][i] != gcnumsendobjs[1][i]) ||
 287                 (gcnumreceiveobjs[0][i] != gcnumreceiveobjs[1][i]) ) {
 288               ischanged = true;
 289               break;
 290             }
 291           }
 292           if(!ischanged) {
 293             // all the core status info are the latest,stop mark phase
 294             gcphase = COMPACTPHASE;
 295             // restore the gcstatus for all cores
 296             for(i = 0; i < NUMCORESACTIVE; ++i) {
 297               gccorestatus[i] = 1;
 298             }
 299           } else {
 300             // There were changes between phase 1 and phase 2, can not decide
 301             // whether the mark phase has been finished
 302             waitconfirm = false;
 303             // As it fails in phase 2, flip the entries
 304             gcnumsrobjs_index = (gcnumsrobjs_index == 0) ? 1 : 0;
 305           }
 306         } else {
 307           // There were changes between phase 1 and phase 2, can not decide
 308           // whether the mark phase has been finished
 309           waitconfirm = false;
 310           // As it fails in phase 2, flip the entries
 311           gcnumsrobjs_index = (gcnumsrobjs_index == 0) ? 1 : 0;
 312         }
 313         BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
 314       }
 315     } else {
 316       BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
 317     }
 318   }
 319 }
 320
 321 // compute load balance for all cores
 322 INLINE int loadbalance(unsigned int * heaptop) {
 323   // compute load balance
 324   int i;
 325
 326   // get the total loads
 327   unsigned int tloads = gcloads[STARTUPCORE];
 328   for(i = 1; i < NUMCORES4GC; i++) {
 329     tloads += gcloads[i];
 330   }
 331   *heaptop = gcbaseva + tloads;
 332
 333   unsigned int b = 0;
 334   BLOCKINDEX(*heaptop, &b);
 335   // num of blocks per core
 336   unsigned int numbpc = (unsigned int)b/(unsigned int)(NUMCORES4GC);
 337   gctopblock = b;
 338   RESIDECORE(heaptop, &gctopcore);
 339   return numbpc;
 340 }
 341
 342 // compute total mem size required and sort the lobjs in ascending order
 343 INLINE unsigned int sortLObjs() {
 344   unsigned int tmp_lobj = 0;
 345   unsigned int tmp_len = 0;
 346   unsigned int tmp_host = 0;
 347   unsigned int sumsize = 0;
 348
 349   gclobjtail2 = gclobjtail;
 350   gclobjtailindex2 = gclobjtailindex;
 351   // TODO USE QUICK SORT INSTEAD?
 352   while(gc_lobjmoreItems2_I()) {
 353     gc_lobjdequeue2_I();
 354     tmp_lobj = gclobjtail2->lobjs[gclobjtailindex2-1];
 355     tmp_host = gclobjtail2->hosts[gclobjtailindex2-1];
 356     tmp_len = gclobjtail2->lengths[gclobjtailindex2 - 1];
 357     sumsize += tmp_len;
 358     GCPROFILE_RECORD_LOBJ();
 359     unsigned int i = gclobjtailindex2-1;
 360     struct lobjpointerblock * tmp_block = gclobjtail2;
 361     // find the place to insert
 362     while(true) {
 363       if(i == 0) {
 364         if(tmp_block->prev == NULL) {
 365           break;
 366         }
 367         if(tmp_block->prev->lobjs[NUMLOBJPTRS-1] > tmp_lobj) {
 368           tmp_block->lobjs[i] = tmp_block->prev->lobjs[NUMLOBJPTRS-1];
 369           tmp_block->lengths[i] = tmp_block->prev->lengths[NUMLOBJPTRS-1];
 370           tmp_block->hosts[i] = tmp_block->prev->hosts[NUMLOBJPTRS-1];
 371           tmp_block = tmp_block->prev;
 372           i = NUMLOBJPTRS-1;
 373         } else {
 374           break;
 375         }  // if(tmp_block->prev->lobjs[NUMLOBJPTRS-1] < tmp_lobj)
 376       } else {
 377         if(tmp_block->lobjs[i-1] > tmp_lobj) {
 378           tmp_block->lobjs[i] = tmp_block->lobjs[i-1];
 379           tmp_block->lengths[i] = tmp_block->lengths[i-1];
 380           tmp_block->hosts[i] = tmp_block->hosts[i-1];
 381           i--;
 382         } else {
 383           break;
 384         }
 385       }
 386     }
 387     // insert it
 388     if(i != gclobjtailindex2 - 1) {
 389       tmp_block->lobjs[i] = tmp_lobj;
 390       tmp_block->lengths[i] = tmp_len;
 391       tmp_block->hosts[i] = tmp_host;
 392     }
 393   }
 394   return sumsize;
 395 }
 396
 397 INLINE bool cacheLObjs() {
 398   // check the total mem size need for large objs
 399   unsigned long long sumsize = 0;
 400   unsigned int size = 0;
 401
 402   sumsize = sortLObjs();
 403
 404   GCPROFILE_RECORD_LOBJSPACE();
 405
 406   // check if there are enough space to cache these large objs
 407   unsigned int dst = gcbaseva + (BAMBOO_SHARED_MEM_SIZE) -sumsize;
 408   if((unsigned long long)gcheaptop > (unsigned long long)dst) {
 409     // do not have enough room to cache large objs
 410     return false;
 411   }
 412
 413   gcheaptop = dst; // Note: record the start of cached lobjs with gcheaptop
 414   // cache the largeObjs to the top of the shared heap
 415   dst = gcbaseva + (BAMBOO_SHARED_MEM_SIZE);
 416   while(gc_lobjmoreItems3_I()) {
 417     gc_lobjdequeue3_I();
 418     size = gclobjtail2->lengths[gclobjtailindex2];
 419     // set the mark field to , indicating that this obj has been moved
 420     // and need to be flushed
 421     ((struct ___Object___ *)(gclobjtail2->lobjs[gclobjtailindex2]))->marked =
 422       COMPACTED;
 423     dst -= size;
 424     if((unsigned int)dst <
 425         (unsigned int)(gclobjtail2->lobjs[gclobjtailindex2]+size)) {
 426       memmove(dst, gclobjtail2->lobjs[gclobjtailindex2], size);
 427     } else {
 428       memcpy(dst, gclobjtail2->lobjs[gclobjtailindex2], size);
 429     }
 430   }
 431   return true;
 432 }
 433
 434 // update the bmmboo_smemtbl to record current shared mem usage
 435 void updateSmemTbl(unsigned int coren,
 436                    unsigned int localtop) {
 437   unsigned int ltopcore = 0;
 438   unsigned int bound = BAMBOO_SMEM_SIZE_L;
 439   BLOCKINDEX(localtop, &ltopcore);
 440   if((unsigned int)localtop>=(unsigned int)(gcbaseva+BAMBOO_LARGE_SMEM_BOUND)){
 441     bound = BAMBOO_SMEM_SIZE;
 442   }
 443   unsigned int load = (unsigned int)(localtop-gcbaseva)%(unsigned int)bound;
 444   unsigned int i = 0;
 445   unsigned int j = 0;
 446   unsigned int toset = 0;
 447   do {
 448     toset = gc_core2block[2*coren+i]+(unsigned int)(NUMCORES4GC*2)*j;
 449     if(toset < ltopcore) {
 450       bamboo_smemtbl[toset]=(toset<NUMCORES4GC) ? BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
 451 #ifdef SMEMM
 452       gcmem_mixed_usedmem += bamboo_smemtbl[toset];
 453 #endif
 454     } else if(toset == ltopcore) {
 455       bamboo_smemtbl[toset] = load;
 456 #ifdef SMEMM
 457       gcmem_mixed_usedmem += bamboo_smemtbl[toset];
 458 #endif
 459       break;
 460     } else {
 461       break;
 462     }
 463     i++;
 464     if(i == 2) {
 465       i = 0;
 466       j++;
 467     }
 468   } while(true);
 469 }
 470
 471 INLINE unsigned int checkCurrHeapTop() {
 472   // update the smemtbl
 473   BAMBOO_MEMSET_WH(bamboo_smemtbl, 0, sizeof(int)*gcnumblock);
 474   // flush all gcloads to indicate the real heap top on one core
 475   // previous it represents the next available ptr on a core
 476   if(((unsigned int)gcloads[0] > (unsigned int)(gcbaseva+BAMBOO_SMEM_SIZE_L))
 477      && (((unsigned int)gcloads[0]%(BAMBOO_SMEM_SIZE)) == 0)) {
 478     // edge of a block, check if this is exactly the heaptop
 479     BASEPTR(0, gcfilledblocks[0]-1, &(gcloads[0]));
 480     gcloads[0]+=(gcfilledblocks[0]>1?(BAMBOO_SMEM_SIZE):(BAMBOO_SMEM_SIZE_L));
 481   }
 482   updateSmemTbl(0, gcloads[0]);
 483   for(int i = 1; i < NUMCORES4GC; i++) {
 484     unsigned int tmptop = 0;
 485     if((gcfilledblocks[i] > 0)
 486        && (((unsigned int)gcloads[i] % (BAMBOO_SMEM_SIZE)) == 0)) {
 487       // edge of a block, check if this is exactly the heaptop
 488       BASEPTR(i, gcfilledblocks[i]-1, &gcloads[i]);
 489       gcloads[i] +=
 490         (gcfilledblocks[i]>1?(BAMBOO_SMEM_SIZE):(BAMBOO_SMEM_SIZE_L));
 491       tmptop = gcloads[i];
 492     }
 493     updateSmemTbl(i, gcloads[i]);
 494   }
 495
 496   // find current heap top
 497   // TODO
 498   // a bug here: when using local allocation, directly move large objects
 499   // to the highest free chunk might not be memory efficient
 500   unsigned int tmpheaptop = 0;
 501   int i = 0;
 502   for(i = gcnumblock-1; i >= 0; i--) {
 503     if(bamboo_smemtbl[i] > 0) {
 504       break;
 505     }
 506   }
 507   if(i == -1) {
 508     tmpheaptop = gcbaseva;
 509   } else {
 510     tmpheaptop = gcbaseva+bamboo_smemtbl[i]+((i<NUMCORES4GC) ?
 511         (BAMBOO_SMEM_SIZE_L*i) :
 512         (BAMBOO_SMEM_SIZE*(i-NUMCORES4GC)+BAMBOO_LARGE_SMEM_BOUND));
 513   }
 514   return tmpheaptop;
 515 }
 516
 517 INLINE void moveLObjs() {
 518 #ifdef SMEMM
 519   // update the gcmem_mixed_usedmem
 520   gcmem_mixed_usedmem = 0;
 521 #endif
 522   unsigned int size = 0;
 523   unsigned int bound = 0;
 524   unsigned int tmpheaptop = checkCurrHeapTop();
 525
 526   // move large objs from gcheaptop to tmpheaptop
 527   // write the header first
 528   unsigned int tomove = gcbaseva+(BAMBOO_SHARED_MEM_SIZE)-gcheaptop;
 529 #ifdef SMEMM
 530   gcmem_mixed_usedmem += tomove;
 531 #endif
 532   // flush the sbstartbl
 533   BAMBOO_MEMSET_WH(&(gcsbstarttbl[gcreservedsb]), '\0',
 534     (BAMBOO_SHARED_MEM_SIZE/BAMBOO_SMEM_SIZE-(unsigned int)gcreservedsb)
 535     *sizeof(unsigned int));
 536   if(tomove == 0) {
 537     gcheaptop = tmpheaptop;
 538   } else {
 539     // check how many blocks it acrosses
 540     unsigned int remain = tmpheaptop-gcbaseva;
 541     //number of the sblock
 542     unsigned int sb = remain/BAMBOO_SMEM_SIZE+(unsigned int)gcreservedsb;
 543     unsigned int b = 0;  // number of the block
 544     BLOCKINDEX(tmpheaptop, &b);
 545     // check the remaining space in this block
 546     bound = (BAMBOO_SMEM_SIZE);
 547     if(remain < (BAMBOO_LARGE_SMEM_BOUND)) {
 548       bound = (BAMBOO_SMEM_SIZE_L);
 549     }
 550     remain = bound - remain%bound;
 551
 552     size = 0;
 553     unsigned int isize = 0;
 554     unsigned int host = 0;
 555     unsigned int ptr = 0;
 556     unsigned int base = tmpheaptop;
 557     unsigned int cpysize = 0;
 558     remain -= BAMBOO_CACHE_LINE_SIZE;
 559     tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
 560     gc_lobjqueueinit4_I();
 561     while(gc_lobjmoreItems4_I()) {
 562       ptr = (unsigned int)(gc_lobjdequeue4_I(&size, &host));
 563       ALIGNSIZE(size, &isize);
 564       if(remain >= isize) {
 565         remain -= isize;
 566         // move the large obj
 567         if((unsigned int)gcheaptop < (unsigned int)(tmpheaptop+size)) {
 568           memmove(tmpheaptop, gcheaptop, size);
 569         } else {
 570           memcpy(tmpheaptop, gcheaptop, size);
 571         }
 572         // fill the remaining space with -2 padding
 573         BAMBOO_MEMSET_WH(tmpheaptop+size, -2, isize-size);
 574
 575         gcheaptop += size;
 576         cpysize += isize;
 577         // cache the mapping info
 578         gcmappingtbl[OBJMAPPINGINDEX((unsigned int)ptr)]=(unsigned int)tmpheaptop;
 579         tmpheaptop += isize;
 580
 581         // update bamboo_smemtbl
 582         bamboo_smemtbl[b] += isize;
 583       } else {
 584         // this object acrosses blocks
 585         if(cpysize > 0) {
 586           CLOSEBLOCK(base, cpysize+BAMBOO_CACHE_LINE_SIZE);
 587           bamboo_smemtbl[b] += BAMBOO_CACHE_LINE_SIZE;
 588           cpysize = 0;
 589           base = tmpheaptop;
 590           if(remain == 0) {
 591             remain = ((tmpheaptop-gcbaseva)<(BAMBOO_LARGE_SMEM_BOUND)) ?
 592               BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
 593           }
 594           remain -= BAMBOO_CACHE_LINE_SIZE;
 595           tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
 596           BLOCKINDEX(tmpheaptop, &b);
 597           sb = (unsigned int)(tmpheaptop-gcbaseva)/(BAMBOO_SMEM_SIZE)+gcreservedsb;
 598         }
 599
 600         // move the large obj
 601         if((unsigned int)gcheaptop < (unsigned int)(tmpheaptop+size)) {
 602           memmove(tmpheaptop, gcheaptop, size);
 603         } else {
 604           memcpy(tmpheaptop, gcheaptop, size);
 605         }
 606         // fill the remaining space with -2 padding
 607         BAMBOO_MEMSET_WH(tmpheaptop+size, -2, isize-size);
 608         gcheaptop += size;
 609         // cache the mapping info
 610         gcmappingtbl[OBJMAPPINGINDEX((unsigned int)ptr)]=(unsigned int)tmpheaptop;
 611         tmpheaptop += isize;
 612
 613         // set the gcsbstarttbl and bamboo_smemtbl
 614         unsigned int tmpsbs=1+(unsigned int)(isize-remain-1)/BAMBOO_SMEM_SIZE;
 615         for(int k = 1; k < tmpsbs; k++) {
 616           gcsbstarttbl[sb+k] = -1;
 617         }
 618         sb += tmpsbs;
 619         bound = (b<NUMCORES4GC) ? BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
 620         BLOCKINDEX(tmpheaptop-1, &tmpsbs);
 621         for(; b < tmpsbs; b++) {
 622           bamboo_smemtbl[b] = bound;
 623           if(b==NUMCORES4GC-1) {
 624             bound = BAMBOO_SMEM_SIZE;
 625           }
 626         }
 627         if(((unsigned int)(isize-remain)%(BAMBOO_SMEM_SIZE)) == 0) {
 628           gcsbstarttbl[sb] = -1;
 629           remain = ((tmpheaptop-gcbaseva)<(BAMBOO_LARGE_SMEM_BOUND)) ?
 630             BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
 631           bamboo_smemtbl[b] = bound;
 632         } else {
 633           gcsbstarttbl[sb] = (int)tmpheaptop;
 634           remain = tmpheaptop-gcbaseva;
 635           bamboo_smemtbl[b] = remain%bound;
 636           remain = bound - bamboo_smemtbl[b];
 637         }
 638
 639         CLOSEBLOCK(base, isize+BAMBOO_CACHE_LINE_SIZE);
 640         cpysize = 0;
 641         base = tmpheaptop;
 642         if(remain == BAMBOO_CACHE_LINE_SIZE) {
 643           // fill with 0 in case
 644           BAMBOO_MEMSET_WH(tmpheaptop, '\0', remain);
 645         }
 646         remain -= BAMBOO_CACHE_LINE_SIZE;
 647         tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
 648       }
 649     }
 650
 651     if(cpysize > 0) {
 652       CLOSEBLOCK(base, cpysize+BAMBOO_CACHE_LINE_SIZE);
 653       bamboo_smemtbl[b] += BAMBOO_CACHE_LINE_SIZE;
 654     } else {
 655       tmpheaptop -= BAMBOO_CACHE_LINE_SIZE;
 656     }
 657     gcheaptop = tmpheaptop;
 658   }
 659
 660   bamboo_free_block = 0;
 661   unsigned int tbound = 0;
 662   do {
 663     tbound=(bamboo_free_block<NUMCORES4GC)?BAMBOO_SMEM_SIZE_L:BAMBOO_SMEM_SIZE;
 664     if(bamboo_smemtbl[bamboo_free_block] == tbound) {
 665       bamboo_free_block++;
 666     } else {
 667       // the first non-full partition
 668       break;
 669     }
 670   } while(true);
 671
 672   GCPROFILE_RECORD_SPACE();
 673 }
 674
 675 void gc_collect(struct garbagelist * stackptr) {
 676   gcprocessing = true;
 677   // inform the master that this core is at a gc safe point and is ready to
 678   // do gc
 679   send_msg_4(STARTUPCORE, GCFINISHPRE, BAMBOO_NUM_OF_CORE, self_numsendobjs,
 680              self_numreceiveobjs, false);
 681
 682   // core collector routine
 683   //wait for init phase
 684   WAITFORGCPHASE(INITPHASE);
 685
 686   GC_PRINTF("Do initGC\n");
 687   initGC();
 688   CACHEADAPT_GC(true);
 689   //send init finish msg to core coordinator
 690   send_msg_2(STARTUPCORE, GCFINISHINIT, BAMBOO_NUM_OF_CORE, false);
 691
 692   //wait for mark phase
 693   WAITFORGCPHASE(MARKPHASE);
 694
 695   GC_PRINTF("Start mark phase\n");
 696   mark(true, stackptr);
 697   GC_PRINTF("Finish mark phase, start compact phase\n");
 698   compact();
 699   GC_PRINTF("Finish compact phase\n");
 700
 701   WAITFORGCPHASE(FLUSHPHASE);
 702
 703   GC_PRINTF("Start flush phase\n");
 704   GCPROFILE_INFO_2_MASTER();
 705   flush(stackptr);
 706   GC_PRINTF("Finish flush phase\n");
 707
 708   CACHEADAPT_PHASE_CLIENT();
 709
 710   // invalidate all shared mem pointers
 711   bamboo_cur_msp = NULL;
 712   bamboo_smem_size = 0;
 713   bamboo_smem_zero_top = NULL;
 714   gcflag = false;
 715
 716   WAITFORGCPHASE(FINISHPHASE);
 717
 718   GC_PRINTF("Finish gc! \n");
 719 }
 720
 721 void gc_nocollect(struct garbagelist * stackptr) {
 722   gcprocessing = true;
 723   // inform the master that this core is at a gc safe point and is ready to
 724   // do gc
 725   send_msg_4(STARTUPCORE, GCFINISHPRE, BAMBOO_NUM_OF_CORE, self_numsendobjs,
 726     self_numreceiveobjs, false);
 727
 728   WAITFORGCPHASE(INITPHASE);
 729
 730   GC_PRINTF("Do initGC\n");
 731   initGC();
 732   CACHEADAPT_GC(true);
 733   //send init finish msg to core coordinator
 734   send_msg_2(STARTUPCORE, GCFINISHINIT, BAMBOO_NUM_OF_CORE, false);
 735
 736   WAITFORGCPHASE(MARKPHASE);
 737
 738   GC_PRINTF("Start mark phase\n");
 739   mark(true, stackptr);
 740   GC_PRINTF("Finish mark phase, wait for flush\n");
 741
 742   // non-gc core collector routine
 743   WAITFORGCPHASE(FLUSHPHASE);
 744
 745   GC_PRINTF("Start flush phase\n");
 746   GCPROFILE_INFO_2_MASTER();
 747   flush(stackptr);
 748   GC_PRINTF("Finish flush phase\n");
 749
 750   CACHEADAPT_PHASE_CLIENT();
 751
 752   // invalidate all shared mem pointers
 753   bamboo_cur_msp = NULL;
 754   bamboo_smem_size = 0;
 755   bamboo_smem_zero_top = NULL;
 756
 757   gcflag = false;
 758   WAITFORGCPHASE(FINISHPHASE);
 759
 760   GC_PRINTF("Finish gc! \n");
 761 }
 762
 763 void master_mark(struct garbagelist *stackptr) {
 764   bool isfirst = true;
 765
 766   GC_PRINTF("Start mark phase \n");
 767   GC_SEND_MSG_1_TO_CLIENT(GCSTART);
 768   gcphase = MARKPHASE;
 769   // mark phase
 770
 771   while(MARKPHASE == gcphase) {
 772     mark(isfirst, stackptr);
 773     isfirst=false;
 774     // check gcstatus
 775     checkMarkStatus();
 776   }
 777 }
 778
 779 void master_getlargeobjs() {
 780   // send msgs to all cores requiring large objs info
 781   // Note: only need to ask gc cores, non-gc cores do not host any objs
 782   numconfirm = NUMCORES4GC - 1;
 783   for(i = 1; i < NUMCORES4GC; ++i) {
 784     send_msg_1(i, GCLOBJREQUEST, false);
 785   }
 786   gcloads[BAMBOO_NUM_OF_CORE] = gccurr_heaptop;
 787   //spin until we have all responses
 788   while(numconfirm!=0)
 789     ;
 790
 791   // check the heaptop
 792   if(gcheaptop < gcmarkedptrbound) {
 793     gcheaptop = gcmarkedptrbound;
 794   }
 795   GCPROFILE_ITEM();
 796   GC_PRINTF("prepare to cache large objs \n");
 797
 798   // cache all large objs
 799   if(!cacheLObjs()) {
 800     // no enough space to cache large objs
 801     GC_PRINTF("Not enough space to cache large objects\n");
 802     BAMBOO_EXIT(0xb02e);
 803   }
 804 }
 805
 806 void master_compact() {
 807   // predict number of blocks to fill for each core
 808   unsigned int tmpheaptop = 0;
 809   int numpbc = loadbalance(&tmpheaptop);
 810   int i;
 811
 812   numpbc = (BAMBOO_SHARED_MEM_SIZE)/(BAMBOO_SMEM_SIZE);
 813   GC_PRINTF("mark phase finished \n");
 814
 815   tmpheaptop = gcbaseva + (BAMBOO_SHARED_MEM_SIZE);
 816   for(i = 0; i < NUMCORES4GC; ++i) {
 817     unsigned int tmpcoreptr = 0;
 818     BASEPTR(i, numpbc, &tmpcoreptr);
 819     // init some data strutures for compact phase
 820     gcloads[i] = 0;
 821     gcfilledblocks[i] = 0;
 822     gcrequiredmems[i] = 0;
 823     gccorestatus[i] = 1;
 824     //send start compact messages to all cores
 825     //TODO bug here, do not know if the direction is positive or negtive?
 826     if (tmpcoreptr < tmpheaptop) {
 827       gcstopblock[i] = numpbc + 1;
 828       if(i != STARTUPCORE) {
 829         send_msg_2(i, GCSTARTCOMPACT, numpbc+1, false);
 830       } else {
 831         gcblock2fill = numpbc+1;
 832       }
 833     } else {
 834       gcstopblock[i] = numpbc;
 835       if(i != STARTUPCORE) {
 836         send_msg_2(i, GCSTARTCOMPACT, numpbc, false);
 837       } else {
 838         gcblock2fill = numpbc;
 839       }
 840     }
 841   }
 842   BAMBOO_CACHE_MF();
 843   GCPROFILE_ITEM();
 844   // compact phase
 845   struct moveHelper * orig = (struct moveHelper *)RUNMALLOC(sizeof(struct moveHelper));
 846   struct moveHelper * to = (struct moveHelper *)RUNMALLOC(sizeof(struct moveHelper));
 847   compact_master(orig, to);
 848   GCPROFILE_ITEM();
 849   GC_PRINTF("prepare to move large objs \n");
 850   // move largeObjs
 851   moveLObjs();
 852   GC_PRINTF("compact phase finished \n");
 853   RUNFREE(orig);
 854   RUNFREE(to);
 855 }
 856
 857 void master_updaterefs() {
 858   gcphase = FLUSHPHASE;
 859   GC_SEND_MSG_1_TO_CLIENT(GCSTARTFLUSH);
 860   GCPROFILE_ITEM();
 861   GC_PRINTF("Start flush phase \n");
 862   // flush phase
 863   flush(stackptr);
 864   // now the master core need to decide the new cache strategy
 865   CACHEADAPT_MASTER();
 866   GC_CHECK_ALL_CORE_STATUS(FLUSHPHASE==gcphase);
 867   GC_PRINTF("Finish flush phase \n");
 868 }
 869
 870 void master_finish() {
 871   gcphase = FINISHPHASE;
 872
 873   // invalidate all shared mem pointers
 874   // put it here as it takes time to inform all the other cores to
 875   // finish gc and it might cause problem when some core resumes
 876   // mutator earlier than the other cores
 877   bamboo_cur_msp = NULL;
 878   bamboo_smem_size = 0;
 879   bamboo_smem_zero_top = NULL;
 880
 881   GCPROFILE_END();
 882   gcflag = false;
 883   GC_SEND_MSG_1_TO_CLIENT(GCFINISH);
 884
 885   gcprocessing = false;
 886   if(gcflag) {
 887     // inform other cores to stop and wait for gc
 888     gcprecheck = true;
 889     for(int i = 0; i < NUMCORESACTIVE; i++) {
 890       // reuse the gcnumsendobjs & gcnumreceiveobjs
 891       gcnumsendobjs[0][i] = 0;
 892       gcnumreceiveobjs[0][i] = 0;
 893     }
 894     GC_SEND_MSG_1_TO_CLIENT(GCSTARTPRE);
 895   }
 896 }
 897
 898 void gc_master(struct garbagelist * stackptr) {
 899   tprintf("start GC !!!!!!!!!!!!! \n");
 900   gcprocessing = true;
 901   gcphase = INITPHASE;
 902
 903   waitconfirm = false;
 904   numconfirm = 0;
 905   initGC();
 906   GC_SEND_MSG_1_TO_CLIENT(GCSTARTINIT);
 907   CACHEADAPT_GC(true);
 908   GC_PRINTF("Check core status \n");
 909   GC_CHECK_ALL_CORE_STATUS(true);
 910   GCPROFILE_ITEM();
 911   CACHEADAPT_OUTPUT_CACHE_SAMPLING();
 912
 913   // do mark phase
 914   master_mark(stackptr);
 915
 916   // get large objects from all cores
 917   master_getlargeobjs();
 918
 919   // compact the heap
 920   master_compact();
 921
 922   // update the references
 923   master_updaterefs();
 924
 925   // do cache adaptation
 926   CACHEADAPT_PHASE_MASTER();
 927
 928   // do finish up stuff
 929   master_finish();
 930
 931   GC_PRINTF("gc finished   \n");
 932   tprintf("finish GC ! %d \n", gcflag);
 933 }
 934
 935 void pregccheck_I() {
 936   while(true) {
 937     gcnumsendobjs[0][BAMBOO_NUM_OF_CORE] = self_numsendobjs;
 938     gcnumreceiveobjs[0][BAMBOO_NUM_OF_CORE] = self_numreceiveobjs;
 939     int sumsendobj = 0;
 940     int i;
 941     for(i = 0; i < NUMCORESACTIVE; ++i) {
 942       sumsendobj += gcnumsendobjs[0][i];
 943     }
 944     for(i = 0; i < NUMCORESACTIVE; ++i) {
 945       sumsendobj -= gcnumreceiveobjs[0][i];
 946     }
 947     if(0 != sumsendobj) {
 948       // there were still some msgs on the fly, wait until there
 949       // are some update pregc information coming and check it again
 950       gcprecheck = false;
 951       BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
 952
 953       while(!gcprecheck) ;
 954
 955       BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
 956     } else {
 957       return;
 958     }
 959   }
 960 }
 961
 962 void pregcprocessing() {
 963 #if defined(GC_CACHE_ADAPT)&&defined(GC_CACHE_SAMPLING)
 964   // disable the timer interrupt
 965   bamboo_mask_timer_intr();
 966 #endif
 967   // Zero out the remaining memory here because for the GC_CACHE_ADAPT version,
 968   // we need to make sure during the gcinit phase the shared heap is not
 969   // touched. Otherwise, there would be problem when adapt the cache strategy.
 970   BAMBOO_CLOSE_CUR_MSP();
 971 #ifdef GC_FLUSH_DTLB
 972   if(gc_num_flush_dtlb < GC_NUM_FLUSH_DTLB) {
 973     BAMBOO_CLEAN_DTLB();
 974     gc_num_flush_dtlb++;
 975   }
 976 #endif
 977 #if defined(GC_CACHE_ADAPT)&&defined(GC_CACHE_SAMPLING)
 978   // get the sampling data
 979   bamboo_output_dtlb_sampling();
 980 #endif
 981 }
 982
 983 void postgcprocessing() {
 984 #if defined(GC_CACHE_ADAPT)&&defined(GC_CACHE_SAMPLING)
 985   // enable the timer interrupt
 986   bamboo_tile_timer_set_next_event(GC_TILE_TIMER_EVENT_SETTING);
 987   bamboo_unmask_timer_intr();
 988 #endif
 989 }
 990
 991 bool gc(struct garbagelist * stackptr) {
 992   // check if do gc
 993   if(!gcflag) {
 994     gcprocessing = false;
 995     return false;
 996   }
 997
 998   // core coordinator routine
 999   if(0 == BAMBOO_NUM_OF_CORE) {
1000     GC_PRINTF("Check if we can do gc or not\n");
1001     gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
1002     BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
1003     if(!gc_checkAllCoreStatus_I()) {
1004       BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
1005       // some of the cores are still executing the mutator and did not reach
1006       // some gc safe point, therefore it is not ready to do gc
1007       gcflag = true;
1008       return false;
1009     } else {
1010       GCPROFILE_START();
1011       pregccheck_I();
1012       BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
1013     }
1014     GC_PRINTF("start gc! \n");
1015     pregcprocessing();
1016     gc_master(stackptr);
1017   } else if(BAMBOO_NUM_OF_CORE < NUMCORES4GC) {
1018     pregcprocessing();
1019     gc_collect(stackptr);
1020   } else {
1021     pregcprocessing();
1022     gc_nocollect(stackptr);
1023   }
1024   postgcprocessing();
1025
1026   return true;
1027 }
1028
1029 #endif