Clean up DTLB sampling code, mostly fixing the multisample bug

[IRC.git] / Robust / src / Runtime / bamboo / multicoregarbage.c
diff --git a/Robust/src/Runtime/bamboo/multicoregarbage.c b/Robust/src/Runtime/bamboo/multicoregarbage.c

index 2468a98bdcda880662da1e52c5c55651064f3e9b..5e015351c7875a4658f1e1d52aec1c34115550a6 100644 (file)
--- a/Robust/src/Runtime/bamboo/multicoregarbage.c
+++ b/Robust/src/Runtime/bamboo/multicoregarbage.c
@@ -1,19 +1,17 @@
-// TODO: DO NOT support tag!!!
  #ifdef MULTICORE_GC
  #include "runtime.h"
  #include "multicoreruntime.h"
  #include "multicoregarbage.h"
  #include "multicoregcmark.h"
-#include "gcqueue.h"
  #include "multicoregccompact.h"
  #include "multicoregcflush.h"
  #include "multicoregcprofile.h"
  #include "gcqueue.h"
-
-#ifdef SMEMM
-extern unsigned int gcmem_mixed_threshold;
-extern unsigned int gcmem_mixed_usedmem;
-#endif // SMEMM
+#include "multicoremem_helper.h"
+#include "bambooalign.h"
+#ifdef PERFCOUNT
+#include "bme_perf_counter.h"
+#endif
  
  volatile bool gcflag;
  gc_status_t gc_status_info;
@@ -26,7 +24,7 @@ void dumpSMem() {
    int block = 0;
    int sblock = 0;
    unsigned int j = 0;
-  void * i = 0;
+  unsigned int i = 0;
    int coren = 0;
    int x = 0;
    int y = 0;
@@ -34,7 +32,7 @@ void dumpSMem() {
    // reserved blocks for sblocktbl
    printf("(%x,%x) ++++ reserved sblocks ++++ \n", udn_tile_coord_x(),
          udn_tile_coord_y());
-  for(i=BAMBOO_BASE_VA; (unsinged int)i<(unsigned int)gcbaseva; i+= 4*16) {
+  for(i=BAMBOO_BASE_VA; i<gcbaseva; i+= 4*16) {
      printf("(%x,%x) 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x \n",
          udn_tile_coord_x(), udn_tile_coord_y(),
          *((int *)(i)), *((int *)(i + 4)),
@@ -92,53 +90,57 @@ void dumpSMem() {
  }
  #endif
  
-void initmulticoregcdata() {
-  if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
-    // startup core to initialize corestatus[]
-    for(int i = 0; i < NUMCORESACTIVE; i++) {
-      gccorestatus[i] = 1;
-      gcnumsendobjs[0][i] = gcnumsendobjs[1][i] = 0;
-      gcnumreceiveobjs[0][i] = gcnumreceiveobjs[1][i] = 0;
-    } 
-    for(int i = 0; i < NUMCORES4GC; i++) {
-      gcloads[i] = 0;
-      gcrequiredmems[i] = 0;
-      gcstopblock[i] = 0;
-      gcfilledblocks[i] = 0;
+bool gc_checkCoreStatus() {
+  for(int i = 0; i < NUMCORES4GC; i++) {
+    if(gccorestatus[i]) {
+      return false;
      }
    }
+  return true;
+}
+
+void gc_resetCoreStatus() {
+  for(int i = 0; i < NUMCORES4GC; i++) {
+    gccorestatus[i] = 1;
+  }
+}
  
+
+void initmulticoregcdata() {
    bamboo_smem_zero_top = NULL;
    gcflag = false;
    gc_status_info.gcprocessing = false;
    gc_status_info.gcphase = FINISHPHASE;
  
    gcprecheck = true;
-  gccurr_heaptop = 0;
-  gcself_numsendobjs = 0;
-  gcself_numreceiveobjs = 0;
-  gcmarkedptrbound = 0;
    gcforwardobjtbl = allocateMGCHash_I(128);
-  gcnumlobjs = 0;
-  gcheaptop = 0;
-  gctopcore = 0;
-  gctopblock = 0;
-  gcmovestartaddr = 0;
-  gctomove = false;
-  gcmovepending = 0;
-  gcblock2fill = 0;
-#ifdef SMEMM
-  gcmem_mixed_threshold=(unsigned int)((BAMBOO_SHARED_MEM_SIZE-bamboo_reserved_smem*BAMBOO_SMEM_SIZE)*0.8);
-  gcmem_mixed_usedmem = 0;
-#endif
  #ifdef MGC_SPEC
    gc_profile_flag = false;
  #endif
-  gc_localheap_s = false;
  #ifdef GC_CACHE_ADAPT
    gccachestage = false;
  #endif 
  
+  if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
+    allocationinfo.blocktable=RUNMALLOC(sizeof(struct blockrecord)*GCNUMBLOCK);
+    for(int i=0; i<GCNUMBLOCK;i++) {
+      if (1==NUMCORES4GC)
+       allocationinfo.blocktable[i].corenum=0;
+      else
+       allocationinfo.blocktable[i].corenum=gc_block2core[(i%(NUMCORES4GC*2))];
+      allocationinfo.blocktable[i].status=BS_FREE;
+      allocationinfo.blocktable[i].usedspace=0;
+      allocationinfo.blocktable[i].freespace=GLOBALBLOCKSIZE(i);
+    }
+    buildCore2Test();
+  }
+
+  //initialize update structures
+  origarraycount=0;
+  for(int i=0;i<NUMCORES4GC;i++) {
+    origblockarray[i]=NULL;
+  }
+
    INIT_MULTICORE_GCPROFILE_DATA();
  }
  
@@ -154,29 +156,22 @@ void initGC() {
        gcnumreceiveobjs[0][i] = gcnumreceiveobjs[1][i] = 0;
        gcloads[i] = 0;
        gcrequiredmems[i] = 0;
-      gcfilledblocks[i] = 0;
-      gcstopblock[i] = 0;
      } 
      for(int i = NUMCORES4GC; i < NUMCORESACTIVE; i++) {
        gccorestatus[i] = 1;
        gcnumsendobjs[0][i] = gcnumsendobjs[1][i] = 0;
        gcnumreceiveobjs[0][i] = gcnumreceiveobjs[1][i] = 0;
      }
-    gcheaptop = 0;
-    gctopcore = 0;
-    gctopblock = 0;
      gcnumsrobjs_index = 0;
    } 
    gcself_numsendobjs = 0;
    gcself_numreceiveobjs = 0;
-  gcmarkedptrbound = 0;
-  gcnumlobjs = 0;
    gcmovestartaddr = 0;
    gctomove = false;
    gcblock2fill = 0;
    gcmovepending = 0;
    gccurr_heaptop = 0;
-
+  update_origblockptr=NULL;
    gc_queueinit();
  
    MGCHashreset(gcforwardobjtbl);
@@ -185,29 +180,8 @@ void initGC() {
    gc_output_cache_policy_time=0;
  } 
  
-bool gc_checkAllCoreStatus() {
-  BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
-  for(int i = 0; i < NUMCORESACTIVE; i++) {
-    if(gccorestatus[i] != 0) {
-      BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-      return false;
-    }  
-  }  
-  BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-  return true;
-}
-
-// NOTE: should be invoked with interrupts turned off
-bool gc_checkAllCoreStatus_I() {
-  for(int i = 0; i < NUMCORESACTIVE; i++) {
-    if(gccorestatus[i] != 0) {
-      return false;
-    }  
-  }  
-  return true;
-}
-
  void checkMarkStatus_p2() {
+  //  tprintf("Check mark status 2\n");
    // check if the sum of send objs and receive obj are the same
    // yes->check if the info is the latest; no->go on executing
    unsigned int sumsendobj = 0;
@@ -227,6 +201,7 @@ void checkMarkStatus_p2() {
        }
      }  
      if(i == NUMCORESACTIVE) {    
+      //tprintf("Mark terminated\n");
        // all the core status info are the latest,stop mark phase
        gc_status_info.gcphase = COMPACTPHASE;
        // restore the gcstatus for all cores
@@ -250,6 +225,7 @@ void checkMarkStatus_p2() {
  }
  
  void checkMarkStatus() {
+  //  tprintf("Check mark status\n");
    if((!waitconfirm)||(waitconfirm && (numconfirm == 0))) {
      unsigned int entry_index = 0;
      if(waitconfirm) {
@@ -260,11 +236,8 @@ void checkMarkStatus() {
        entry_index = gcnumsrobjs_index;
      }
      BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
-    gccorestatus[BAMBOO_NUM_OF_CORE] = 0;  
-    gcnumsendobjs[entry_index][BAMBOO_NUM_OF_CORE] = gcself_numsendobjs;
-    gcnumreceiveobjs[entry_index][BAMBOO_NUM_OF_CORE] = gcself_numreceiveobjs;
      // check the status of all cores
-    if (gc_checkAllCoreStatus_I()) {
+    if (gc_checkCoreStatus()) {
        // ask for confirm
        if(!waitconfirm) {
          // the first time found all cores stall
@@ -286,146 +259,26 @@ void checkMarkStatus() {
  } 
  
  // compute load balance for all cores
-int loadbalance(void ** heaptop, unsigned int * topblock, unsigned int * topcore) {
+int loadbalance() {
    // compute load balance
    // get the total loads
+  void * heaptop;
    unsigned int tloads = 0;
    for(int i = 0; i < NUMCORES4GC; i++) {
      tloads += gcloads[i];
+    //tprintf("load: %d %d \n", gcloads[i], i);
    }
-  *heaptop = gcbaseva + tloads;
+  heaptop = gcbaseva + tloads;
  
    unsigned int topblockindex;
    
-  BLOCKINDEX(topblockindex, *heaptop);
+  BLOCKINDEX(topblockindex, heaptop);
    // num of blocks per core
    unsigned int numbpc = (topblockindex+NUMCORES4GC-1)/NUMCORES4GC;
    
-  *topblock = topblockindex;
-  RESIDECORE(*heaptop, *topcore);
    return numbpc;
  }
  
-// compute total mem size required and sort the lobjs in ascending order
-unsigned int sortLObjs() {
-  unsigned int tmp_lobj = 0;
-  unsigned int tmp_len = 0;
-  unsigned int tmp_host = 0;
-  unsigned int sumsize = 0;
-
-  gclobjtail2 = gclobjtail;
-  gclobjtailindex2 = gclobjtailindex;
-  // TODO USE QUICK SORT INSTEAD?
-  while(gc_lobjmoreItems2_I()) {
-    gc_lobjdequeue2_I();
-    tmp_lobj = gclobjtail2->lobjs[gclobjtailindex2-1];
-    tmp_host = gclobjtail2->hosts[gclobjtailindex2-1];
-    tmp_len = gclobjtail2->lengths[gclobjtailindex2 - 1];
-    sumsize += tmp_len;
-    GCPROFILE_RECORD_LOBJ();
-    unsigned int i = gclobjtailindex2-1;
-    struct lobjpointerblock * tmp_block = gclobjtail2;
-    // find the place to insert
-    while(true) {
-      if(i == 0) {
-        if(tmp_block->prev == NULL) {
-          break;
-        }
-        if(tmp_block->prev->lobjs[NUMLOBJPTRS-1] > tmp_lobj) {
-          tmp_block->lobjs[i] = tmp_block->prev->lobjs[NUMLOBJPTRS-1];
-          tmp_block->lengths[i] = tmp_block->prev->lengths[NUMLOBJPTRS-1];
-          tmp_block->hosts[i] = tmp_block->prev->hosts[NUMLOBJPTRS-1];
-          tmp_block = tmp_block->prev;
-          i = NUMLOBJPTRS-1;
-        } else {
-          break;
-        }  // if(tmp_block->prev->lobjs[NUMLOBJPTRS-1] < tmp_lobj)
-      } else {
-        if(tmp_block->lobjs[i-1] > tmp_lobj) {
-          tmp_block->lobjs[i] = tmp_block->lobjs[i-1];
-          tmp_block->lengths[i] = tmp_block->lengths[i-1];
-          tmp_block->hosts[i] = tmp_block->hosts[i-1];
-          i--;
-        } else {
-          break;
-        }  
-      } 
-    }  
-    // insert it
-    if(i != gclobjtailindex2 - 1) {
-      tmp_block->lobjs[i] = tmp_lobj;
-      tmp_block->lengths[i] = tmp_len;
-      tmp_block->hosts[i] = tmp_host;
-    }
-  }
-  return sumsize;
-}
-
-bool cacheLObjs() {
-  // check the total mem size need for large objs
-  unsigned long long sumsize = 0;
-  unsigned int size = 0;
-  
-  sumsize = sortLObjs();
-
-  GCPROFILE_RECORD_LOBJSPACE();
-
-  // check if there are enough space to cache these large objs
-  unsigned int dst = gcbaseva + (BAMBOO_SHARED_MEM_SIZE) -sumsize;
-  if((unsigned long long)gcheaptop > (unsigned long long)dst) {
-    // do not have enough room to cache large objs
-    return false;
-  }
-
-  gcheaptop = dst; // Note: record the start of cached lobjs with gcheaptop
-  // cache the largeObjs to the top of the shared heap
-  dst = gcbaseva + (BAMBOO_SHARED_MEM_SIZE);
-  while(gc_lobjmoreItems3_I()) {
-    gc_lobjdequeue3_I();
-    size = gclobjtail2->lengths[gclobjtailindex2];
-    // set the mark field to , indicating that this obj has been moved
-    // and need to be flushed
-    dst -= size;
-    if((unsigned int)dst<(unsigned int)(gclobjtail2->lobjs[gclobjtailindex2]+size)) {
-      memmove(dst, gclobjtail2->lobjs[gclobjtailindex2], size);
-    } else {
-      memcpy(dst, gclobjtail2->lobjs[gclobjtailindex2], size);
-    }
-  }
-  return true;
-} 
-
-// update the bmmboo_smemtbl to record current shared mem usage
-void updateSmemTbl(unsigned int coren, void * localtop) {
-  unsigned int ltopcore = 0;
-  unsigned int bound = BAMBOO_SMEM_SIZE_L;
-  BLOCKINDEX(ltopcore, localtop);
-  if((unsigned int)localtop>=(unsigned int)(gcbaseva+BAMBOO_LARGE_SMEM_BOUND)){
-    bound = BAMBOO_SMEM_SIZE;
-  }
-  unsigned int load = (unsigned INTPTR)(localtop-gcbaseva)%(unsigned int)bound;
-  unsigned int toset = 0;
-  for(int j=0; 1; j++) {
-    for(int i=0; i<2; i++) {
-      toset = gc_core2block[2*coren+i]+(unsigned int)(NUMCORES4GC*2)*j;
-      if(toset < ltopcore) {
-        bamboo_smemtbl[toset]=BLOCKSIZE(toset<NUMCORES4GC);
-#ifdef SMEMM
-        gcmem_mixed_usedmem += bamboo_smemtbl[toset];
-#endif
-      } else if(toset == ltopcore) {
-        bamboo_smemtbl[toset] = load;
-#ifdef SMEMM
-        gcmem_mixed_usedmem += bamboo_smemtbl[toset];
-#endif
-        return;
-      } else {
-        return;
-      }
-    }
-  }
-}
-
  void gc_collect(struct garbagelist * stackptr) {
    gc_status_info.gcprocessing = true;
    // inform the master that this core is at a gc safe point and is ready to 
@@ -446,17 +299,17 @@ void gc_collect(struct garbagelist * stackptr) {
    WAITFORGCPHASE(MARKPHASE);
  
    GC_PRINTF("Start mark phase\n");
-  mark(true, stackptr);
+  mark(stackptr);
    GC_PRINTF("Finish mark phase, start compact phase\n");
    compact();
    GC_PRINTF("Finish compact phase\n");
  
-  WAITFORGCPHASE(FLUSHPHASE);
+  WAITFORGCPHASE(UPDATEPHASE);
  
-  GC_PRINTF("Start flush phase\n");
+  GC_PRINTF("Start update phase\n");
    GCPROFILE_INFO_2_MASTER();
-  flush(stackptr);
-  GC_PRINTF("Finish flush phase\n");
+  update(stackptr);
+  GC_PRINTF("Finish update phase\n");
  
    CACHEADAPT_PHASE_CLIENT();
  
@@ -488,16 +341,16 @@ void gc_nocollect(struct garbagelist * stackptr) {
    WAITFORGCPHASE(MARKPHASE);
  
    GC_PRINTF("Start mark phase\n"); 
-  mark(true, stackptr);
-  GC_PRINTF("Finish mark phase, wait for flush\n");
+  mark(stackptr);
+  GC_PRINTF("Finish mark phase, wait for update\n");
  
    // non-gc core collector routine
-  WAITFORGCPHASE(FLUSHPHASE);
+  WAITFORGCPHASE(UPDATEPHASE);
  
-  GC_PRINTF("Start flush phase\n");
+  GC_PRINTF("Start update phase\n");
    GCPROFILE_INFO_2_MASTER();
-  flush(stackptr);
-  GC_PRINTF("Finish flush phase\n"); 
+  update(stackptr);
+  GC_PRINTF("Finish update phase\n"); 
  
    CACHEADAPT_PHASE_CLIENT();
  
@@ -513,19 +366,13 @@ void gc_nocollect(struct garbagelist * stackptr) {
  }
  
  void master_mark(struct garbagelist *stackptr) {
-  bool isfirst = true;
  
    GC_PRINTF("Start mark phase \n");
-  GC_SEND_MSG_1_TO_CLIENT(GCSTART);
    gc_status_info.gcphase = MARKPHASE;
+  GC_SEND_MSG_1_TO_CLIENT(GCSTART);
    // mark phase
  
-  while(MARKPHASE == gc_status_info.gcphase) {
-    mark(isfirst, stackptr);
-    isfirst=false;
-    // check gcstatus
-    checkMarkStatus();
-  }
+  mark(stackptr);
  }
  
  void master_getlargeobjs() {
@@ -539,27 +386,21 @@ void master_getlargeobjs() {
    //spin until we have all responses
    while(numconfirm!=0) ;
  
-  // check the heaptop
-  if(gcheaptop < gcmarkedptrbound) {
-    gcheaptop = gcmarkedptrbound;
-  }
    GCPROFILE_ITEM();
    GC_PRINTF("prepare to cache large objs \n");
  
-  // cache all large objs
-  BAMBOO_ASSERTMSG(cacheLObjs(), "Not enough space to cache large objects\n");
  }
  
  
  void master_updaterefs(struct garbagelist * stackptr) {
-  gc_status_info.gcphase = FLUSHPHASE;
-  GC_SEND_MSG_1_TO_CLIENT(GCSTARTFLUSH);
+  gc_status_info.gcphase = UPDATEPHASE;
+  GC_SEND_MSG_1_TO_CLIENT(GCSTARTUPDATE);
    GCPROFILE_ITEM();
-  GC_PRINTF("Start flush phase \n");
-  // flush phase
-  flush(stackptr);
-  GC_CHECK_ALL_CORE_STATUS(FLUSHPHASE==gc_status_info.gcphase);
-  GC_PRINTF("Finish flush phase \n");
+  GC_PRINTF("Start update phase \n");
+  // update phase
+  update(stackptr);
+  GC_CHECK_ALL_CORE_STATUS();
+  GC_PRINTF("Finish update phase \n");
  }
  
  void master_finish() {
@@ -578,11 +419,13 @@ void master_finish() {
    CACHEADAPT_OUTPUT_CACHE_POLICY();
    gc_output_cache_policy_time += (BAMBOO_GET_EXE_TIME()-tmpt);
    gcflag = false;
+
    GC_SEND_MSG_1_TO_CLIENT(GCFINISH);
-  
-  gc_status_info.gcprocessing = false;
+  gc_status_info.gcprocessing = false;  
+
    if(gcflag) {
      // inform other cores to stop and wait for gc
+    GC_PRINTF("Back to Back gc case\n");
      gcprecheck = true;
      for(int i = 0; i < NUMCORESACTIVE; i++) {
        // reuse the gcnumsendobjs & gcnumreceiveobjs
@@ -594,7 +437,7 @@ void master_finish() {
  }
  
  void gc_master(struct garbagelist * stackptr) {
-  tprintf("start GC !!!!!!!!!!!!! \n");
+  tprintf("start GC!\n");
    gc_status_info.gcprocessing = true;
    gc_status_info.gcphase = INITPHASE;
  
@@ -603,33 +446,39 @@ void gc_master(struct garbagelist * stackptr) {
    initGC();
    GC_SEND_MSG_1_TO_CLIENT(GCSTARTINIT);
    CACHEADAPT_GC(true);
-  GC_PRINTF("Check core status \n");
-  GC_CHECK_ALL_CORE_STATUS(true);
+  //tprintf("Check core status \n");
+  GC_CHECK_ALL_CORE_STATUS();
    GCPROFILE_ITEM();
    unsigned long long tmpt = BAMBOO_GET_EXE_TIME();
    CACHEADAPT_OUTPUT_CACHE_SAMPLING();
    gc_output_cache_policy_time += (BAMBOO_GET_EXE_TIME()-tmpt);
-
+  //tprintf("start mark phase\n");
    // do mark phase
    master_mark(stackptr);
-
+  //tprintf("finish mark phase\n");
    // get large objects from all cores
    master_getlargeobjs();
-
+  //tprintf("start compact phase\n");
    // compact the heap
    master_compact();
-  
+  //tprintf("start update phase\n");
    // update the references
    master_updaterefs(stackptr);
-
+  //tprintf("gc master finished update   \n");
    // do cache adaptation
    CACHEADAPT_PHASE_MASTER();
-
+  //tprintf("finish cachdapt phase\n");
    // do finish up stuff
+#ifdef GC_DEBUG
+  for(int i=0;i<GCNUMBLOCK;i++) {
+    struct blockrecord *record=&allocationinfo.blocktable[i];
+    tprintf("%u. used=%u free=%u corenum=%u status=%u, base=%x, ptr=%x\n", i, record->usedspace, record->freespace, record->corenum, record->status, gcbaseva+OFFSET2BASEVA(i), (gcbaseva+OFFSET2BASEVA(i)+record->usedspace));
+  }
+#endif
+
    master_finish();
  
-  GC_PRINTF("gc finished   \n");
-  tprintf("finish GC ! %d \n",gcflag);
+  //tprintf("finish GC ! %d \n",gcflag);
  } 
  
  void pregccheck() {
@@ -662,12 +511,6 @@ void pregcprocessing() {
  #if defined(GC_CACHE_ADAPT)&&defined(GC_CACHE_SAMPLING)
    // disable the timer interrupt
    bamboo_mask_timer_intr();
-#endif
-  // Zero out the remaining memory here because for the GC_CACHE_ADAPT version,
-  // we need to make sure during the gcinit phase the shared heap is not 
-  // touched. Otherwise, there would be problem when adapt the cache strategy.
-  BAMBOO_CLOSE_CUR_MSP();
-#if defined(GC_CACHE_ADAPT)&&defined(GC_CACHE_SAMPLING)
    // get the sampling data 
    bamboo_output_dtlb_sampling();
  #endif
@@ -678,6 +521,8 @@ void postgcprocessing() {
    // enable the timer interrupt
    bamboo_tile_timer_set_next_event(GC_TILE_TIMER_EVENT_SETTING); 
    bamboo_unmask_timer_intr();
+  //turn on sampling again
+  bamboo_dtlb_sampling_init();
  #endif
  }
  
@@ -687,24 +532,26 @@ bool gc(struct garbagelist * stackptr) {
      gc_status_info.gcprocessing = false;
      return false;
    }
+#ifdef PERFCOUNT
+  profile_start(GC_REGION);
+#endif
  
    // core coordinator routine
    if(0 == BAMBOO_NUM_OF_CORE) {
      GC_PRINTF("Check if we can do gc or not\n");
      gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
-    if(!gc_checkAllCoreStatus()) {
-      // some of the cores are still executing the mutator and did not reach
-      // some gc safe point, therefore it is not ready to do gc
-      gcflag = true;
-      return false;
-    } else {
-      GCPROFILE_START();
-      pregccheck();
-    }
+
+    //wait for other cores to catch up
+    while(!gc_checkCoreStatus())
+      ;
+
+    GCPROFILE_START();
+    pregccheck();
      GC_PRINTF("start gc! \n");
      pregcprocessing();
      gc_master(stackptr);
    } else if(BAMBOO_NUM_OF_CORE < NUMCORES4GC) {
+    GC_PRINTF("Core reporting for gc.\n");
      pregcprocessing();
      gc_collect(stackptr);
    } else {
@@ -712,7 +559,9 @@ bool gc(struct garbagelist * stackptr) {
      gc_nocollect(stackptr);
    }
    postgcprocessing();
-
+#ifdef PERFCOUNT
+  profile_start(APP_REGION);
+#endif
    return true;
  }