#ifdef MULTICORE_GC
#include "runtime.h"
-#include "multicoregarbage.h"
#include "multicoreruntime.h"
-#include "runtime_arch.h"
-#include "SimpleHash.h"
-#include "GenericHashtable.h"
-#include "ObjectHash.h"
-#include "GCSharedHash.h"
-
-extern int corenum;
-extern struct parameterwrapper ** objectqueues[][NUMCLASSES];
-extern int numqueues[][NUMCLASSES];
-
-extern struct genhashtable * activetasks;
-extern struct parameterwrapper ** objectqueues[][NUMCLASSES];
-extern struct taskparamdescriptor *currtpd;
-
-extern struct LockValue runtime_locks[MAXTASKPARAMS];
-extern int runtime_locklen;
-
-#ifdef SMEMM
-extern unsigned int gcmem_mixed_threshold;
-extern unsigned int gcmem_mixed_usedmem;
+#include "multicoregarbage.h"
+#include "multicoregcmark.h"
+#include "multicoregccompact.h"
+#include "multicoregcflush.h"
+#include "multicoregcprofile.h"
+#include "gcqueue.h"
+#include "multicoremem_helper.h"
+#include "bambooalign.h"
+#ifdef PERFCOUNT
+#include "bme_perf_counter.h"
#endif
-struct pointerblock {
- void * ptrs[NUMPTRS];
- struct pointerblock *next;
-};
-
-struct pointerblock *gchead=NULL;
-int gcheadindex=0;
-struct pointerblock *gctail=NULL;
-int gctailindex=0;
-struct pointerblock *gctail2=NULL;
-int gctailindex2=0;
-struct pointerblock *gcspare=NULL;
-
-#define NUMLOBJPTRS 20
+volatile bool gcflag;
+gc_status_t gc_status_info;
-struct lobjpointerblock {
- void * lobjs[NUMLOBJPTRS];
- //void * dsts[NUMLOBJPTRS];
- int lengths[NUMLOBJPTRS];
- //void * origs[NUMLOBJPTRS];
- int hosts[NUMLOBJPTRS];
- struct lobjpointerblock *next;
- struct lobjpointerblock *prev;
-};
-
-struct lobjpointerblock *gclobjhead=NULL;
-int gclobjheadindex=0;
-struct lobjpointerblock *gclobjtail=NULL;
-int gclobjtailindex=0;
-struct lobjpointerblock *gclobjtail2=NULL;
-int gclobjtailindex2=0;
-struct lobjpointerblock *gclobjspare=NULL;
-
-#ifdef GC_CACHE_ADAPT
-typedef struct gc_cache_revise_info {
- int orig_page_start_va;
- int orig_page_end_va;
- int orig_page_index;
- int to_page_start_va;
- int to_page_end_va;
- int to_page_index;
- int revised_sampling[NUMCORESACTIVE];
-} gc_cache_revise_info_t;
-gc_cache_revise_info_t gc_cache_revise_infomation;
-#endif// GC_CACHE_ADAPT
+unsigned long long gc_output_cache_policy_time=0;
#ifdef GC_DEBUG
// dump whole mem in blocks
-inline void dumpSMem() {
+void dumpSMem() {
int block = 0;
int sblock = 0;
- int j = 0;
- int i = 0;
+ unsigned int j = 0;
+ unsigned int i = 0;
int coren = 0;
int x = 0;
int y = 0;
- printf("(%x,%x) Dump shared mem: \n", udn_tile_coord_x(),
- udn_tile_coord_y());
+ printf("(%x,%x) Dump shared mem: \n",udn_tile_coord_x(),udn_tile_coord_y());
// reserved blocks for sblocktbl
- printf("(%x,%x) ++++ reserved sblocks ++++ \n", udn_tile_coord_x(),
- udn_tile_coord_y());
+ printf("(%x,%x) ++++ reserved sblocks ++++ \n", udn_tile_coord_x(),
+ udn_tile_coord_y());
for(i=BAMBOO_BASE_VA; i<gcbaseva; i+= 4*16) {
printf("(%x,%x) 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x \n",
- udn_tile_coord_x(), udn_tile_coord_y(),
- *((int *)(i)), *((int *)(i + 4)),
- *((int *)(i + 4*2)), *((int *)(i + 4*3)),
- *((int *)(i + 4*4)), *((int *)(i + 4*5)),
- *((int *)(i + 4*6)), *((int *)(i + 4*7)),
- *((int *)(i + 4*8)), *((int *)(i + 4*9)),
- *((int *)(i + 4*10)), *((int *)(i + 4*11)),
- *((int *)(i + 4*12)), *((int *)(i + 4*13)),
- *((int *)(i + 4*14)), *((int *)(i + 4*15)));
- }
- sblock = gcreservedsb;
+ udn_tile_coord_x(), udn_tile_coord_y(),
+ *((int *)(i)), *((int *)(i + 4)),
+ *((int *)(i + 4*2)), *((int *)(i + 4*3)),
+ *((int *)(i + 4*4)), *((int *)(i + 4*5)),
+ *((int *)(i + 4*6)), *((int *)(i + 4*7)),
+ *((int *)(i + 4*8)), *((int *)(i + 4*9)),
+ *((int *)(i + 4*10)), *((int *)(i + 4*11)),
+ *((int *)(i + 4*12)), *((int *)(i + 4*13)),
+ *((int *)(i + 4*14)), *((int *)(i + 4*15)));
+ }
+ sblock = 0;
bool advanceblock = false;
// remaining memory
- for(i=gcbaseva; i<gcbaseva+BAMBOO_SHARED_MEM_SIZE; i+=4*16) {
+ for(i=gcbaseva; (unsigned int)i<(unsigned int)(gcbaseva+BAMBOO_SHARED_MEM_SIZE); i+=4*16) {
advanceblock = false;
// computing sblock # and block #, core coordinate (x,y) also
if(j%((BAMBOO_SMEM_SIZE)/(4*16)) == 0) {
// finished a sblock
if(j < ((BAMBOO_LARGE_SMEM_BOUND)/(4*16))) {
- if((j > 0) && (j%((BAMBOO_SMEM_SIZE_L)/(4*16)) == 0)) {
- // finished a block
- block++;
- advanceblock = true;
- }
+ if((j > 0) && (j%((BAMBOO_SMEM_SIZE_L)/(4*16)) == 0)) {
+ // finished a block
+ block++;
+ advanceblock = true;
+ }
} else {
- // finished a block
- block++;
- advanceblock = true;
+ // finished a block
+ block++;
+ advanceblock = true;
}
// compute core #
if(advanceblock) {
- coren = gc_block2core[block%(NUMCORES4GC*2)];
+ coren = gc_block2core[block%(NUMCORES4GC*2)];
}
// compute core coordinate
- BAMBOO_COORDS(coren, &x, &y);
+ x = BAMBOO_COORDS_X(coren);
+ y = BAMBOO_COORDS_Y(coren);
printf("(%x,%x) ==== %d, %d : core (%d,%d), saddr %x====\n",
- udn_tile_coord_x(), udn_tile_coord_y(),
- block, sblock++, x, y,
- (sblock-1)*(BAMBOO_SMEM_SIZE)+gcbaseva);
+ udn_tile_coord_x(), udn_tile_coord_y(),block, sblock++, x, y,
+ (sblock-1)*(BAMBOO_SMEM_SIZE)+gcbaseva);
}
j++;
printf("(%x,%x) 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x \n",
- udn_tile_coord_x(), udn_tile_coord_y(),
- *((int *)(i)), *((int *)(i + 4)),
- *((int *)(i + 4*2)), *((int *)(i + 4*3)),
- *((int *)(i + 4*4)), *((int *)(i + 4*5)),
- *((int *)(i + 4*6)), *((int *)(i + 4*7)),
- *((int *)(i + 4*8)), *((int *)(i + 4*9)),
- *((int *)(i + 4*10)), *((int *)(i + 4*11)),
- *((int *)(i + 4*12)), *((int *)(i + 4*13)),
- *((int *)(i + 4*14)), *((int *)(i + 4*15)));
+ udn_tile_coord_x(), udn_tile_coord_y(),
+ *((int *)(i)), *((int *)(i + 4)),
+ *((int *)(i + 4*2)), *((int *)(i + 4*3)),
+ *((int *)(i + 4*4)), *((int *)(i + 4*5)),
+ *((int *)(i + 4*6)), *((int *)(i + 4*7)),
+ *((int *)(i + 4*8)), *((int *)(i + 4*9)),
+ *((int *)(i + 4*10)), *((int *)(i + 4*11)),
+ *((int *)(i + 4*12)), *((int *)(i + 4*13)),
+ *((int *)(i + 4*14)), *((int *)(i + 4*15)));
}
printf("(%x,%x) \n", udn_tile_coord_x(), udn_tile_coord_y());
}
#endif
-// should be invoked with interruption closed
-inline void gc_enqueue_I(void *ptr) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe601);
- BAMBOO_DEBUGPRINT_REG(ptr);
-#endif
- if (gcheadindex==NUMPTRS) {
- struct pointerblock * tmp;
- if (gcspare!=NULL) {
- tmp=gcspare;
- gcspare=NULL;
- } else {
- tmp=RUNMALLOC_I(sizeof(struct pointerblock));
- } // if (gcspare!=NULL)
- gchead->next=tmp;
- gchead=tmp;
- gcheadindex=0;
- } // if (gcheadindex==NUMPTRS)
- gchead->ptrs[gcheadindex++]=ptr;
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe602);
-#endif
-} // void gc_enqueue_I(void *ptr)
-
-// dequeue and destroy the queue
-inline void * gc_dequeue_I() {
- if (gctailindex==NUMPTRS) {
- struct pointerblock *tmp=gctail;
- gctail=gctail->next;
- gctailindex=0;
- if (gcspare!=NULL) {
- RUNFREE(tmp);
- } else {
- gcspare=tmp;
- } // if (gcspare!=NULL)
- } // if (gctailindex==NUMPTRS)
- return gctail->ptrs[gctailindex++];
-} // void * gc_dequeue()
-
-// dequeue and do not destroy the queue
-inline void * gc_dequeue2_I() {
- if (gctailindex2==NUMPTRS) {
- struct pointerblock *tmp=gctail2;
- gctail2=gctail2->next;
- gctailindex2=0;
- } // if (gctailindex2==NUMPTRS)
- return gctail2->ptrs[gctailindex2++];
-} // void * gc_dequeue2()
-
-inline int gc_moreItems_I() {
- if ((gchead==gctail)&&(gctailindex==gcheadindex))
- return 0;
- return 1;
-} // int gc_moreItems()
-
-inline int gc_moreItems2_I() {
- if ((gchead==gctail2)&&(gctailindex2==gcheadindex))
- return 0;
- return 1;
-} // int gc_moreItems2()
-
-// should be invoked with interruption closed
-// enqueue a large obj: start addr & length
-inline void gc_lobjenqueue_I(void *ptr,
- int length,
- int host) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe901);
-#endif
- if (gclobjheadindex==NUMLOBJPTRS) {
- struct lobjpointerblock * tmp;
- if (gclobjspare!=NULL) {
- tmp=gclobjspare;
- gclobjspare=NULL;
- } else {
- tmp=RUNMALLOC_I(sizeof(struct lobjpointerblock));
- } // if (gclobjspare!=NULL)
- gclobjhead->next=tmp;
- tmp->prev = gclobjhead;
- gclobjhead=tmp;
- gclobjheadindex=0;
- } // if (gclobjheadindex==NUMLOBJPTRS)
- gclobjhead->lobjs[gclobjheadindex]=ptr;
- gclobjhead->lengths[gclobjheadindex]=length;
- gclobjhead->hosts[gclobjheadindex++]=host;
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT_REG(gclobjhead->lobjs[gclobjheadindex-1]);
- BAMBOO_DEBUGPRINT_REG(gclobjhead->lengths[gclobjheadindex-1]);
- BAMBOO_DEBUGPRINT_REG(gclobjhead->hosts[gclobjheadindex-1]);
-#endif
-} // void gc_lobjenqueue_I(void *ptr...)
-
-// dequeue and destroy the queue
-inline void * gc_lobjdequeue_I(int * length,
- int * host) {
- if (gclobjtailindex==NUMLOBJPTRS) {
- struct lobjpointerblock *tmp=gclobjtail;
- gclobjtail=gclobjtail->next;
- gclobjtailindex=0;
- gclobjtail->prev = NULL;
- if (gclobjspare!=NULL) {
- RUNFREE(tmp);
- } else {
- gclobjspare=tmp;
- tmp->next = NULL;
- tmp->prev = NULL;
- } // if (gclobjspare!=NULL)
- } // if (gclobjtailindex==NUMLOBJPTRS)
- if(length != NULL) {
- *length = gclobjtail->lengths[gclobjtailindex];
- }
- if(host != NULL) {
- *host = (int)(gclobjtail->hosts[gclobjtailindex]);
+bool gc_checkCoreStatus() {
+ for(int i = 0; i < NUMCORES4GC; i++) {
+ if(gccorestatus[i]) {
+ return false;
+ }
}
- return gclobjtail->lobjs[gclobjtailindex++];
-} // void * gc_lobjdequeue()
-
-inline int gc_lobjmoreItems_I() {
- if ((gclobjhead==gclobjtail)&&(gclobjtailindex==gclobjheadindex))
- return 0;
- return 1;
-} // int gc_lobjmoreItems()
-
-// dequeue and don't destroy the queue
-inline void gc_lobjdequeue2_I() {
- if (gclobjtailindex2==NUMLOBJPTRS) {
- gclobjtail2=gclobjtail2->next;
- gclobjtailindex2=1;
- } else {
- gclobjtailindex2++;
- } // if (gclobjtailindex2==NUMLOBJPTRS)
-} // void * gc_lobjdequeue2()
-
-inline int gc_lobjmoreItems2_I() {
- if ((gclobjhead==gclobjtail2)&&(gclobjtailindex2==gclobjheadindex))
- return 0;
- return 1;
-} // int gc_lobjmoreItems2()
-
-// 'reversly' dequeue and don't destroy the queue
-inline void gc_lobjdequeue3_I() {
- if (gclobjtailindex2==0) {
- gclobjtail2=gclobjtail2->prev;
- gclobjtailindex2=NUMLOBJPTRS-1;
- } else {
- gclobjtailindex2--;
- } // if (gclobjtailindex2==NUMLOBJPTRS)
-} // void * gc_lobjdequeue3()
-
-inline int gc_lobjmoreItems3_I() {
- if ((gclobjtail==gclobjtail2)&&(gclobjtailindex2==gclobjtailindex))
- return 0;
- return 1;
-} // int gc_lobjmoreItems3()
-
-inline void gc_lobjqueueinit4_I() {
- gclobjtail2 = gclobjtail;
- gclobjtailindex2 = gclobjtailindex;
-} // void gc_lobjqueueinit2()
+ return true;
+}
-inline void * gc_lobjdequeue4_I(int * length,
- int * host) {
- if (gclobjtailindex2==NUMLOBJPTRS) {
- gclobjtail2=gclobjtail2->next;
- gclobjtailindex2=0;
- } // if (gclobjtailindex==NUMLOBJPTRS)
- if(length != NULL) {
- *length = gclobjtail2->lengths[gclobjtailindex2];
- }
- if(host != NULL) {
- *host = (int)(gclobjtail2->hosts[gclobjtailindex2]);
+void gc_resetCoreStatus() {
+ for(int i = 0; i < NUMCORES4GC; i++) {
+ gccorestatus[i] = 1;
}
- return gclobjtail2->lobjs[gclobjtailindex2++];
-} // void * gc_lobjdequeue()
-
-inline int gc_lobjmoreItems4_I() {
- if ((gclobjhead==gclobjtail2)&&(gclobjtailindex2==gclobjheadindex))
- return 0;
- return 1;
-} // int gc_lobjmoreItems(
-
-INTPTR gccurr_heapbound = 0;
-
-inline void gettype_size(void * ptr,
- int * ttype,
- int * tsize) {
- int type = ((int *)ptr)[0];
- int size = 0;
- if(type < NUMCLASSES) {
- // a normal object
- size = classsize[type];
- } else {
- // an array
- struct ArrayObject *ao=(struct ArrayObject *)ptr;
- int elementsize=classsize[type];
- int length=ao->___length___;
- size=sizeof(struct ArrayObject)+length*elementsize;
- } // if(type < NUMCLASSES)
- *ttype = type;
- *tsize = size;
}
-inline bool isLarge(void * ptr,
- int * ttype,
- int * tsize) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe701);
- BAMBOO_DEBUGPRINT_REG(ptr);
-#endif
- // check if a pointer is referring to a large object
- gettype_size(ptr, ttype, tsize);
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(*tsize);
-#endif
- int bound = (BAMBOO_SMEM_SIZE);
- if(((int)ptr-gcbaseva) < (BAMBOO_LARGE_SMEM_BOUND)) {
- bound = (BAMBOO_SMEM_SIZE_L);
- }
- if((((int)ptr-gcbaseva)%(bound))==0) {
- // ptr is a start of a block
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe702);
- BAMBOO_DEBUGPRINT(1);
-#endif
- return true;
- }
- if((bound-(((int)ptr-gcbaseva)%bound)) < (*tsize)) {
- // it acrosses the boundary of current block
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe703);
- BAMBOO_DEBUGPRINT(1);
-#endif
- return true;
- }
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0);
-#endif
- return false;
-} // bool isLarge(void * ptr, int * ttype, int * tsize)
-inline int hostcore(void * ptr) {
- // check the host core of ptr
- int host = 0;
- RESIDECORE(ptr, &host);
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xedd0);
- BAMBOO_DEBUGPRINT_REG(ptr);
- BAMBOO_DEBUGPRINT_REG(host);
+void initmulticoregcdata() {
+ bamboo_smem_zero_top = NULL;
+ gcflag = false;
+ gc_status_info.gcprocessing = false;
+ gc_status_info.gcphase = FINISHPHASE;
+
+ gcprecheck = true;
+ gcforwardobjtbl = allocateMGCHash_I(128);
+#ifdef MGC_SPEC
+ gc_profile_flag = false;
#endif
- return host;
-} // int hostcore(void * ptr)
-inline void cpu2coords(int coren,
- int * x,
- int * y) {
- *x = bamboo_cpu2coords[2*coren];
- *y = bamboo_cpu2coords[2*coren+1];
-} // void cpu2coords(...)
+ if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
+ allocationinfo.blocktable=RUNMALLOC(sizeof(struct blockrecord)*GCNUMBLOCK);
+ for(int i=0; i<GCNUMBLOCK;i++) {
+ if (1==NUMCORES4GC)
+ allocationinfo.blocktable[i].corenum=0;
+ else
+ allocationinfo.blocktable[i].corenum=gc_block2core[(i%(NUMCORES4GC*2))];
+ allocationinfo.blocktable[i].status=BS_FREE;
+ allocationinfo.blocktable[i].usedspace=0;
+ allocationinfo.blocktable[i].freespace=GLOBALBLOCKSIZE(i);
+ }
+ buildCore2Test();
+ }
-inline bool isLocal(void * ptr) {
- // check if a pointer is in shared heap on this core
- return hostcore(ptr) == BAMBOO_NUM_OF_CORE;
-} // bool isLocal(void * ptr)
+ //initialize update structures
+ origarraycount=0;
+ for(int i=0;i<NUMCORES4GC;i++) {
+ origblockarray[i]=NULL;
+ }
-inline bool gc_checkCoreStatus_I() {
- bool allStall = true;
- for(int i = 0; i < NUMCORES4GC; ++i) {
- if(gccorestatus[i] != 0) {
- allStall = false;
- break;
- } // if(gccorestatus[i] != 0)
- } // for(i = 0; i < NUMCORES4GC; ++i)
- return allStall;
+ INIT_MULTICORE_GCPROFILE_DATA();
}
-inline bool gc_checkAllCoreStatus_I() {
- bool allStall = true;
- for(int i = 0; i < NUMCORESACTIVE; ++i) {
- if(gccorestatus[i] != 0) {
- allStall = false;
- break;
- } // if(gccorestatus[i] != 0)
- } // for(i = 0; i < NUMCORESACTIVE; ++i)
- return allStall;
+void dismulticoregcdata() {
+ freeMGCHash(gcforwardobjtbl);
}
-inline void checkMarkStatue() {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xee01);
-#endif
- int i;
- if((!waitconfirm) ||
- (waitconfirm && (numconfirm == 0))) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xee02);
-#endif
- int entry_index = 0;
- if(waitconfirm) {
- // phase 2
- entry_index = (gcnumsrobjs_index == 0) ? 1 : 0;
- } else {
- // phase 1
- entry_index = gcnumsrobjs_index;
- }
- BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
- gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
- gcnumsendobjs[entry_index][BAMBOO_NUM_OF_CORE] = gcself_numsendobjs;
- gcnumreceiveobjs[entry_index][BAMBOO_NUM_OF_CORE] = gcself_numreceiveobjs;
- // check the status of all cores
- bool allStall = gc_checkAllCoreStatus_I();
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xee03);
-#endif
- if(allStall) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xee04);
-#endif
- // ask for confirm
- if(!waitconfirm) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xee05);
-#endif
- // the first time found all cores stall
- // send out status confirm msg to all other cores
- // reset the corestatus array too
- gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
- waitconfirm = true;
- numconfirm = NUMCORESACTIVE - 1;
- BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
- for(i = 1; i < NUMCORESACTIVE; ++i) {
- gccorestatus[i] = 1;
- // send mark phase finish confirm request msg to core i
- send_msg_1(i, GCMARKCONFIRM, false);
- } // for(i = 1; i < NUMCORESACTIVE; ++i)
- } else {
- // Phase 2
- // check if the sum of send objs and receive obj are the same
- // yes->check if the info is the latest; no->go on executing
- int sumsendobj = 0;
- for(i = 0; i < NUMCORESACTIVE; ++i) {
- sumsendobj += gcnumsendobjs[gcnumsrobjs_index][i];
- } // for(i = 0; i < NUMCORESACTIVE; ++i)
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xee06);
- BAMBOO_DEBUGPRINT_REG(sumsendobj);
-#endif
- for(i = 0; i < NUMCORESACTIVE; ++i) {
- sumsendobj -= gcnumreceiveobjs[gcnumsrobjs_index][i];
- } // for(i = 0; i < NUMCORESACTIVE; ++i)
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xee07);
- BAMBOO_DEBUGPRINT_REG(sumsendobj);
-#endif
- if(0 == sumsendobj) {
- // Check if there are changes of the numsendobjs or numreceiveobjs on
- // each core
- bool ischanged = false;
- for(i = 0; i < NUMCORESACTIVE; ++i) {
- if((gcnumsendobjs[0][i] != gcnumsendobjs[1][i]) ||
- (gcnumreceiveobjs[0][i] != gcnumreceiveobjs[1][i]) ) {
- ischanged = true;
- break;
- }
- } // for(i = 0; i < NUMCORESACTIVE; ++i)
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xee08);
- BAMBOO_DEBUGPRINT_REG(ischanged);
-#endif
- if(!ischanged) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xee09);
-#endif
- // all the core status info are the latest
- // stop mark phase
- gcphase = COMPACTPHASE;
- // restore the gcstatus for all cores
- for(i = 0; i < NUMCORESACTIVE; ++i) {
- gccorestatus[i] = 1;
- } // for(i = 0; i < NUMCORESACTIVE; ++i)
- } else {
- waitconfirm = false;
- gcnumsrobjs_index = (gcnumsrobjs_index == 0) ? 1 : 0;
- } // if(!ischanged)
- } else {
- // There were changes between phase 1 and phase 2, can not decide
- // whether the mark phase has been finished
- waitconfirm = false;
- // As it fails in phase 2, flip the entries
- gcnumsrobjs_index = (gcnumsrobjs_index == 0) ? 1 : 0;
- } // if(0 == sumsendobj) else ...
- BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
- } // if(!gcwaitconfirm) else()
- } else {
- BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
- } // if(allStall)
- } // if((!waitconfirm)...
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xee0a);
-#endif
-} // void checkMarkStatue()
-/*
-inline bool preGC() {
- // preparation for gc
- // make sure to clear all incoming msgs espacially transfer obj msgs
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xec01);
-#endif
- int i;
- if((!waitconfirm) ||
- (waitconfirm && (numconfirm == 0))) {
- // send out status confirm msgs to all cores to check if there are
- // transfer obj msgs on-the-fly
- waitconfirm = true;
- numconfirm = NUMCORESACTIVE - 1;
- for(i = 1; i < NUMCORESACTIVE; ++i) {
- corestatus[i] = 1;
- // send status confirm msg to core i
- send_msg_1(i, STATUSCONFIRM, false);
- } // for(i = 1; i < NUMCORESACTIVE; ++i)
-
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xec02);
-#endif
- while(true) {
- if(numconfirm == 0) {
- break;
- }
- } // wait for confirmations
- waitconfirm = false;
- numconfirm = 0;
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xec03);
-#endif
- numsendobjs[BAMBOO_NUM_OF_CORE] = self_numsendobjs;
- numreceiveobjs[BAMBOO_NUM_OF_CORE] = self_numreceiveobjs;
- int sumsendobj = 0;
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xec04);
-#endif
- for(i = 0; i < NUMCORESACTIVE; ++i) {
- sumsendobj += numsendobjs[i];
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xf000 + numsendobjs[i]);
-#endif
- } // for(i = 1; i < NUMCORESACTIVE; ++i)
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xec05);
- BAMBOO_DEBUGPRINT_REG(sumsendobj);
-#endif
- for(i = 0; i < NUMCORESACTIVE; ++i) {
- sumsendobj -= numreceiveobjs[i];
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xf000 + numreceiveobjs[i]);
-#endif
- } // for(i = 1; i < NUMCORESACTIVE; ++i)
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xec06);
- BAMBOO_DEBUGPRINT_REG(sumsendobj);
-#endif
- if(0 == sumsendobj) {
- return true;
- } else {
- // still have some transfer obj msgs on-the-fly, can not start gc
- return false;
- } // if(0 == sumsendobj)
- } else {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xec07);
-#endif
- // previously asked for status confirmation and do not have all the
- // confirmations yet, can not start gc
- return false;
- } // if((!waitconfirm) ||
-} // bool preGC()*/
-
-inline void initGC() {
- int i;
+void initGC() {
if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
- for(i = 0; i < NUMCORES4GC; ++i) {
+ for(int i = 0; i < NUMCORES4GC; i++) {
gccorestatus[i] = 1;
gcnumsendobjs[0][i] = gcnumsendobjs[1][i] = 0;
gcnumreceiveobjs[0][i] = gcnumreceiveobjs[1][i] = 0;
gcloads[i] = 0;
gcrequiredmems[i] = 0;
- gcfilledblocks[i] = 0;
- gcstopblock[i] = 0;
- } // for(i = 0; i < NUMCORES4GC; ++i)
- for(i = NUMCORES4GC; i < NUMCORESACTIVE; ++i) {
+ }
+ for(int i = NUMCORES4GC; i < NUMCORESACTIVE; i++) {
gccorestatus[i] = 1;
gcnumsendobjs[0][i] = gcnumsendobjs[1][i] = 0;
gcnumreceiveobjs[0][i] = gcnumreceiveobjs[1][i] = 0;
}
- gcheaptop = 0;
- gctopcore = 0;
- gctopblock = 0;
- } // if(STARTUPCORE == BAMBOO_NUM_OF_CORE)
+ gcnumsrobjs_index = 0;
+ }
gcself_numsendobjs = 0;
gcself_numreceiveobjs = 0;
- gcmarkedptrbound = 0;
- gcobj2map = 0;
- gcmappedobj = 0;
- //gcismapped = false;
- gcnumlobjs = 0;
gcmovestartaddr = 0;
gctomove = false;
gcblock2fill = 0;
gcmovepending = 0;
gccurr_heaptop = 0;
- gcdstcore = 0;
+ update_origblockptr=NULL;
+ gc_queueinit();
- // initialize queue
- if (gchead==NULL) {
- gcheadindex=gctailindex=gctailindex2 = 0;
- gchead=gctail=gctail2=RUNMALLOC(sizeof(struct pointerblock));
- } else {
- gctailindex = gctailindex2 = gcheadindex;
- gctail = gctail2 = gchead;
- }
+ MGCHashreset(gcforwardobjtbl);
+
+ GCPROFILE_INIT();
+ gc_output_cache_policy_time=0;
+}
- // initialize the large obj queues
- if (gclobjhead==NULL) {
- gclobjheadindex=0;
- gclobjtailindex=0;
- gclobjtailindex2 = 0;
- gclobjhead=gclobjtail=gclobjtail2=
- RUNMALLOC(sizeof(struct lobjpointerblock));
+void checkMarkStatus_p2() {
+ // tprintf("Check mark status 2\n");
+ // check if the sum of send objs and receive obj are the same
+ // yes->check if the info is the latest; no->go on executing
+ unsigned int sumsendobj = 0;
+ for(int i = 0; i < NUMCORESACTIVE; i++) {
+ sumsendobj += gcnumsendobjs[gcnumsrobjs_index][i];
+ }
+ for(int i = 0; i < NUMCORESACTIVE; i++) {
+ sumsendobj -= gcnumreceiveobjs[gcnumsrobjs_index][i];
+ }
+ if(0 == sumsendobj) {
+ // Check if there are changes of the numsendobjs or numreceiveobjs
+ // on each core
+ int i = 0;
+ for(i = 0; i < NUMCORESACTIVE; i++) {
+ if((gcnumsendobjs[0][i]!=gcnumsendobjs[1][i])||(gcnumreceiveobjs[0][i]!=gcnumreceiveobjs[1][i]) ) {
+ break;
+ }
+ }
+ if(i == NUMCORESACTIVE) {
+ //tprintf("Mark terminated\n");
+ // all the core status info are the latest,stop mark phase
+ gc_status_info.gcphase = COMPACTPHASE;
+ // restore the gcstatus for all cores
+ for(int i = 0; i < NUMCORESACTIVE; i++) {
+ gccorestatus[i] = 1;
+ }
+ } else {
+ // There were changes between phase 1 and phase 2, can not decide
+ // whether the mark phase has been finished
+ waitconfirm = false;
+ // As it fails in phase 2, flip the entries
+ gcnumsrobjs_index = (gcnumsrobjs_index == 0) ? 1 : 0;
+ }
} else {
- gclobjtailindex = gclobjtailindex2 = gclobjheadindex = 0;
- gclobjtail = gclobjtail2 = gclobjhead;
+ // There were changes between phase 1 and phase 2, can not decide
+ // whether the mark phase has been finished
+ waitconfirm = false;
+ // As it fails in phase 2, flip the entries
+ gcnumsrobjs_index = (gcnumsrobjs_index == 0) ? 1 : 0;
}
- gclobjhead->next = gclobjhead->prev = NULL;
-
-#ifdef LOCALHASHTBL_TEST
- freeRuntimeHash(gcpointertbl);
- gcpointertbl = allocateRuntimeHash(20);
-#else
- mgchashreset(gcpointertbl);
-#endif
- //gcpointertbl = allocateMGCHash(20);
-
- freeMGCHash(gcforwardobjtbl);
- gcforwardobjtbl = allocateMGCHash(20, 3);
+}
- // initialize the mapping info related structures
- if((BAMBOO_NUM_OF_CORE < NUMCORES4GC) && (gcsharedptbl != NULL)) {
- // Never free the shared hash table, just reset it
- /*freeGCSharedHash(gcsharedptbl);
- gcsharedptbl = allocateGCSharedHash(20);*/
- mgcsharedhashReset(gcsharedptbl);
- }
- // Zero out the remaining bamboo_cur_msp
- // Only zero out the first 4 bytes of the remaining memory
- /*if((bamboo_cur_msp != 0)
- && (bamboo_smem_zero_top == bamboo_cur_msp)
- && (bamboo_smem_size > 0)) {
- *((int *)bamboo_cur_msp) = 0;
- }*/
-#ifdef GC_PROFILE
- gc_num_livespace = 0;
- gc_num_freespace = 0;
- gc_num_lobj = 0;
- gc_num_lobjspace = 0;
- gc_num_liveobj = 0;
- gc_num_forwardobj = 0;
- gc_num_profiles = NUMCORESACTIVE - 1;
-#endif
-} // void initGC()
+void checkMarkStatus() {
+ // tprintf("Check mark status\n");
+ if((!waitconfirm)||(waitconfirm && (numconfirm == 0))) {
+ unsigned int entry_index = 0;
+ if(waitconfirm) {
+ // phase 2
+ entry_index = (gcnumsrobjs_index == 0) ? 1 : 0;
+ } else {
+ // phase 1
+ entry_index = gcnumsrobjs_index;
+ }
+ BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
+ // check the status of all cores
+ if (gc_checkCoreStatus()) {
+ // ask for confirm
+ if(!waitconfirm) {
+ // the first time found all cores stall
+ // send out status confirm msg to all other cores
+ // reset the corestatus array too
+ waitconfirm = true;
+ numconfirm = NUMCORESACTIVE - 1;
+ BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+ GC_SEND_MSG_1_TO_CLIENT(GCMARKCONFIRM);
+ } else {
+ // Phase 2
+ checkMarkStatus_p2();
+ BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+ }
+ } else {
+ BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+ }
+ }
+}
// compute load balance for all cores
-inline int loadbalance(int * heaptop) {
+int loadbalance() {
// compute load balance
- int i;
-
// get the total loads
- int tloads = gcloads[STARTUPCORE];
- for(i = 1; i < NUMCORES4GC; i++) {
+ void * heaptop;
+ unsigned int tloads = 0;
+ for(int i = 0; i < NUMCORES4GC; i++) {
tloads += gcloads[i];
+ //tprintf("load: %d %d \n", gcloads[i], i);
}
- *heaptop = gcbaseva + tloads;
+ heaptop = gcbaseva + tloads;
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xdddd);
- BAMBOO_DEBUGPRINT_REG(tloads);
- BAMBOO_DEBUGPRINT_REG(*heaptop);
-#endif
- int b = 0;
- BLOCKINDEX(*heaptop, &b);
- int numbpc = b / NUMCORES4GC; // num of blocks per core
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT_REG(b);
- BAMBOO_DEBUGPRINT_REG(numbpc);
-#endif
- gctopblock = b;
- RESIDECORE(heaptop, &gctopcore);
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT_REG(gctopcore);
-#endif
+ unsigned int topblockindex;
+
+ BLOCKINDEX(topblockindex, heaptop);
+ // num of blocks per core
+ unsigned int numbpc = (topblockindex+NUMCORES4GC-1)/NUMCORES4GC;
+
return numbpc;
-} // void loadbalance(int * heaptop)
+}
-inline bool cacheLObjs() {
- // check the total mem size need for large objs
- unsigned long long sumsize = 0;
- int size = 0;
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe801);
-#endif
- gclobjtail2 = gclobjtail;
- gclobjtailindex2 = gclobjtailindex;
- int tmp_lobj = 0;
- int tmp_len = 0;
- int tmp_host = 0;
- // compute total mem size required and sort the lobjs in ascending order
- while(gc_lobjmoreItems2_I()) {
- gc_lobjdequeue2_I();
- tmp_lobj = gclobjtail2->lobjs[gclobjtailindex2-1];
- tmp_host = gclobjtail2->hosts[gclobjtailindex2-1];
- tmp_len = gclobjtail2->lengths[gclobjtailindex2 - 1];
- sumsize += tmp_len;
-#ifdef GC_PROFILE
- gc_num_lobj++;
-#endif
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT_REG(gclobjtail2->lobjs[gclobjtailindex2-1]);
- BAMBOO_DEBUGPRINT_REG(tmp_len);
- BAMBOO_DEBUGPRINT_REG(sumsize);
-#endif
- int i = gclobjtailindex2-1;
- struct lobjpointerblock * tmp_block = gclobjtail2;
- // find the place to insert
- while(true) {
- if(i == 0) {
- if(tmp_block->prev == NULL) {
- break;
- }
- if(tmp_block->prev->lobjs[NUMLOBJPTRS-1] > tmp_lobj) {
- tmp_block->lobjs[i] = tmp_block->prev->lobjs[NUMLOBJPTRS-1];
- tmp_block->lengths[i] = tmp_block->prev->lengths[NUMLOBJPTRS-1];
- tmp_block->hosts[i] = tmp_block->prev->hosts[NUMLOBJPTRS-1];
- tmp_block = tmp_block->prev;
- i = NUMLOBJPTRS-1;
- } else {
- break;
- } // if(tmp_block->prev->lobjs[NUMLOBJPTRS-1] < tmp_lobj)
- } else {
- if(tmp_block->lobjs[i-1] > tmp_lobj) {
- tmp_block->lobjs[i] = tmp_block->lobjs[i-1];
- tmp_block->lengths[i] = tmp_block->lengths[i-1];
- tmp_block->hosts[i] = tmp_block->hosts[i-1];
- i--;
- } else {
- break;
- } // if(tmp_block->lobjs[i-1] < tmp_lobj)
- } // if(i ==0 ) else {}
- } // while(true)
- // insert it
- if(i != gclobjtailindex2 - 1) {
- tmp_block->lobjs[i] = tmp_lobj;
- tmp_block->lengths[i] = tmp_len;
- tmp_block->hosts[i] = tmp_host;
- }
- } // while(gc_lobjmoreItems2())
+void gc_collect(struct garbagelist * stackptr) {
+ gc_status_info.gcprocessing = true;
+ // inform the master that this core is at a gc safe point and is ready to
+ // do gc
+ send_msg_4(STARTUPCORE,GCFINISHPRE,BAMBOO_NUM_OF_CORE,self_numsendobjs,self_numreceiveobjs);
-#ifdef GC_PROFILE
- gc_num_lobjspace = sumsize;
-#endif
- // check if there are enough space to cache these large objs
- INTPTR dst = gcbaseva + (BAMBOO_SHARED_MEM_SIZE) -sumsize;
- if((unsigned long long)gcheaptop > (unsigned long long)dst) {
- // do not have enough room to cache large objs
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe802);
- BAMBOO_DEBUGPRINT_REG(dst);
- BAMBOO_DEBUGPRINT_REG(gcheaptop);
- BAMBOO_DEBUGPRINT_REG(sumsize);
-#endif
- return false;
- }
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe803);
- BAMBOO_DEBUGPRINT_REG(dst);
- BAMBOO_DEBUGPRINT_REG(gcheaptop);
-#endif
+ // core collector routine
+ //wait for init phase
+ WAITFORGCPHASE(INITPHASE);
- gcheaptop = dst; // Note: record the start of cached lobjs with gcheaptop
- // cache the largeObjs to the top of the shared heap
- //gclobjtail2 = gclobjtail;
- //gclobjtailindex2 = gclobjtailindex;
- dst = gcbaseva + (BAMBOO_SHARED_MEM_SIZE);
- while(gc_lobjmoreItems3_I()) {
- gc_lobjdequeue3_I();
- size = gclobjtail2->lengths[gclobjtailindex2];
- // set the mark field to , indicating that this obj has been moved
- // and need to be flushed
- ((int *)(gclobjtail2->lobjs[gclobjtailindex2]))[6] = COMPACTED;
- dst -= size;
- if((int)dst < (int)(gclobjtail2->lobjs[gclobjtailindex2])+size) {
- memmove(dst, gclobjtail2->lobjs[gclobjtailindex2], size);
- } else {
- //BAMBOO_WRITE_HINT_CACHE(dst, size);
- memcpy(dst, gclobjtail2->lobjs[gclobjtailindex2], size);
- }
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0x804);
- BAMBOO_DEBUGPRINT_REG(gclobjtail2->lobjs[gclobjtailindex2]);
- BAMBOO_DEBUGPRINT(dst);
- BAMBOO_DEBUGPRINT_REG(size);
- BAMBOO_DEBUGPRINT_REG(*((int*)gclobjtail2->lobjs[gclobjtailindex2]));
- BAMBOO_DEBUGPRINT_REG(*((int*)(dst)));
-#endif
- }
- return true;
-} // void cacheLObjs()
+ GC_PRINTF("Do initGC\n");
+ initGC();
+ CACHEADAPT_GC(true);
+ //send init finish msg to core coordinator
+ send_msg_2(STARTUPCORE,GCFINISHINIT,BAMBOO_NUM_OF_CORE);
-// update the bmmboo_smemtbl to record current shared mem usage
-void updateSmemTbl(int coren,
- int localtop) {
- int ltopcore = 0;
- int bound = BAMBOO_SMEM_SIZE_L;
- BLOCKINDEX(localtop, &ltopcore);
- if(localtop >= (gcbaseva+(BAMBOO_LARGE_SMEM_BOUND))) {
- bound = BAMBOO_SMEM_SIZE;
- }
- int load = (localtop-gcbaseva)%bound;
- int i = 0;
- int j = 0;
- int toset = 0;
- do {
- toset = gc_core2block[2*coren+i]+(NUMCORES4GC*2)*j;
- if(toset < ltopcore) {
- bamboo_smemtbl[toset]=
- (toset<NUMCORES4GC) ? BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
-#ifdef SMEMM
- gcmem_mixed_usedmem += bamboo_smemtbl[toset];
-#endif
- } else if(toset == ltopcore) {
- bamboo_smemtbl[toset] = load;
-#ifdef SMEMM
- gcmem_mixed_usedmem += bamboo_smemtbl[toset];
-#endif
- break;
- } else {
- break;
- }
- i++;
- if(i == 2) {
- i = 0;
- j++;
- }
- } while(true);
-} // void updateSmemTbl(int, int)
+ //wait for mark phase
+ WAITFORGCPHASE(MARKPHASE);
-inline void moveLObjs() {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xea01);
-#endif
-#ifdef SMEMM
- // update the gcmem_mixed_usedmem
- gcmem_mixed_usedmem = 0;
-#endif
- // zero out the smemtbl
- BAMBOO_MEMSET_WH(bamboo_smemtbl, 0, sizeof(int)*gcnumblock);
- // find current heap top
- // flush all gcloads to indicate the real heap top on one core
- // previous it represents the next available ptr on a core
- if((gcloads[0] > (gcbaseva+(BAMBOO_SMEM_SIZE_L)))
- && ((gcloads[0]%(BAMBOO_SMEM_SIZE)) == 0)) {
- // edge of a block, check if this is exactly the heaptop
- BASEPTR(0, gcfilledblocks[0]-1, &(gcloads[0]));
- gcloads[0]+=(gcfilledblocks[0]>1 ?
- (BAMBOO_SMEM_SIZE) : (BAMBOO_SMEM_SIZE_L));
- }
- updateSmemTbl(0, gcloads[0]);
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xea02);
- BAMBOO_DEBUGPRINT_REG(gcloads[0]);
- BAMBOO_DEBUGPRINT_REG(bamboo_smemtbl[0]);
-#endif
- for(int i = 1; i < NUMCORES4GC; i++) {
- int tmptop = 0;
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xf000+i);
- BAMBOO_DEBUGPRINT_REG(gcloads[i]);
- BAMBOO_DEBUGPRINT_REG(gcfilledblocks[i]);
-#endif
- if((gcfilledblocks[i] > 0)
- && ((gcloads[i] % (BAMBOO_SMEM_SIZE)) == 0)) {
- // edge of a block, check if this is exactly the heaptop
- BASEPTR(i, gcfilledblocks[i]-1, &gcloads[i]);
- gcloads[i] +=
- (gcfilledblocks[i]>1 ? (BAMBOO_SMEM_SIZE) : (BAMBOO_SMEM_SIZE_L));
- tmptop = gcloads[i];
- }
- updateSmemTbl(i, gcloads[i]);
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT_REG(gcloads[i]);
-#endif
- } // for(int i = 1; i < NUMCORES4GC; i++) {
+ GC_PRINTF("Start mark phase\n");
+ mark(stackptr);
+ GC_PRINTF("Finish mark phase, start compact phase\n");
+ compact();
+ GC_PRINTF("Finish compact phase\n");
- // find current heap top
- // TODO
- // a bug here: when using local allocation, directly move large objects
- // to the highest free chunk might not be memory efficient
- int tmpheaptop = 0;
- int size = 0;
- int bound = 0;
- int i = 0;
- for(i = gcnumblock-1; i >= 0; i--) {
- if(bamboo_smemtbl[i] > 0) {
- break;
- }
- }
- if(i == -1) {
- tmpheaptop = gcbaseva;
- } else {
- tmpheaptop = gcbaseva+bamboo_smemtbl[i]+((i<NUMCORES4GC) ?
- (BAMBOO_SMEM_SIZE_L*i) :
- (BAMBOO_SMEM_SIZE*(i-NUMCORES4GC)+BAMBOO_LARGE_SMEM_BOUND));
- }
+ WAITFORGCPHASE(UPDATEPHASE);
- // move large objs from gcheaptop to tmpheaptop
- // write the header first
- unsigned int tomove = gcbaseva + (BAMBOO_SHARED_MEM_SIZE) -gcheaptop;
-#ifdef SMEMM
- gcmem_mixed_usedmem += tomove;
-#endif
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xea03);
- BAMBOO_DEBUGPRINT_REG(tomove);
- BAMBOO_DEBUGPRINT_REG(tmpheaptop);
- BAMBOO_DEBUGPRINT_REG(gcheaptop);
-#endif
- // flush the sbstartbl
- BAMBOO_MEMSET_WH(&(gcsbstarttbl[gcreservedsb]), '\0',
- (BAMBOO_SHARED_MEM_SIZE/BAMBOO_SMEM_SIZE-gcreservedsb)*sizeof(INTPTR));
- if(tomove == 0) {
- gcheaptop = tmpheaptop;
- } else {
- // check how many blocks it acrosses
- int remain = tmpheaptop-gcbaseva;
- int sb = remain/(BAMBOO_SMEM_SIZE) + gcreservedsb;//number of the sblock
- int b = 0; // number of the block
- BLOCKINDEX(tmpheaptop, &b);
- // check the remaining space in this block
- bound = (BAMBOO_SMEM_SIZE);
- if(remain < (BAMBOO_LARGE_SMEM_BOUND)) {
- bound = (BAMBOO_SMEM_SIZE_L);
- }
- remain = bound - remain%bound;
+ GC_PRINTF("Start update phase\n");
+ GCPROFILE_INFO_2_MASTER();
+ update(stackptr);
+ GC_PRINTF("Finish update phase\n");
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xea04);
-#endif
- size = 0;
- int isize = 0;
- int host = 0;
- int ptr = 0;
- int base = tmpheaptop;
- int cpysize = 0;
- remain -= BAMBOO_CACHE_LINE_SIZE;
- tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
- gc_lobjqueueinit4_I();
- while(gc_lobjmoreItems4_I()) {
- ptr = (int)(gc_lobjdequeue4_I(&size, &host));
- ALIGNSIZE(size, &isize);
- if(remain < isize) {
- // this object acrosses blocks
- if(cpysize > 0) {
- // close current block, fill its header
- BAMBOO_MEMSET_WH(base, '\0', BAMBOO_CACHE_LINE_SIZE);
- *((int*)base) = cpysize + BAMBOO_CACHE_LINE_SIZE;
- bamboo_smemtbl[b]+=BAMBOO_CACHE_LINE_SIZE;//add the size of header
- cpysize = 0;
- base = tmpheaptop;
- if(remain == 0) {
- remain = ((tmpheaptop-gcbaseva)<(BAMBOO_LARGE_SMEM_BOUND)) ?
- BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
- }
- remain -= BAMBOO_CACHE_LINE_SIZE;
- tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
- BLOCKINDEX(tmpheaptop, &b);
- sb = (tmpheaptop-gcbaseva)/(BAMBOO_SMEM_SIZE) + gcreservedsb;
- } // if(cpysize > 0)
+ CACHEADAPT_PHASE_CLIENT();
- // move the large obj
- if((int)gcheaptop < (int)(tmpheaptop)+size) {
- memmove(tmpheaptop, gcheaptop, size);
- } else {
- //BAMBOO_WRITE_HINT_CACHE(tmpheaptop, size);
- memcpy(tmpheaptop, gcheaptop, size);
- }
- // fill the remaining space with -2 padding
- BAMBOO_MEMSET_WH(tmpheaptop+size, -2, isize-size);
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xea05);
- BAMBOO_DEBUGPRINT_REG(gcheaptop);
- BAMBOO_DEBUGPRINT_REG(tmpheaptop);
- BAMBOO_DEBUGPRINT_REG(size);
- BAMBOO_DEBUGPRINT_REG(isize);
- BAMBOO_DEBUGPRINT_REG(base);
-#endif
- gcheaptop += size;
- // cache the mapping info anyway
- //if(ptr != tmpheaptop) {
- BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
-#ifdef LOCALHASHTBL_TEST
- RuntimeHashadd_I(gcpointertbl, ptr, tmpheaptop);
-#else
- mgchashInsert_I(gcpointertbl, ptr, tmpheaptop);
-#endif
- //MGCHashadd_I(gcpointertbl, ptr, tmpheaptop);
- BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
- //}
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xcdca);
- BAMBOO_DEBUGPRINT_REG(ptr);
- BAMBOO_DEBUGPRINT_REG(tmpheaptop);
-#endif
- if(host != BAMBOO_NUM_OF_CORE) {
- // send the original host core with the mapping info
- send_msg_3(host, GCLOBJMAPPING, ptr, tmpheaptop, false);
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xcdcb);
- BAMBOO_DEBUGPRINT_REG(ptr);
- BAMBOO_DEBUGPRINT_REG(tmpheaptop);
-#endif
- } // if(host != BAMBOO_NUM_OF_CORE)
- tmpheaptop += isize;
+ // invalidate all shared mem pointers
+ bamboo_cur_msp = NULL;
+ bamboo_smem_size = 0;
+ bamboo_smem_zero_top = NULL;
+ gcflag = false;
- // set the gcsbstarttbl and bamboo_smemtbl
- int tmpsbs = 1+(isize-remain-1)/BAMBOO_SMEM_SIZE;
- for(int k = 1; k < tmpsbs; k++) {
- gcsbstarttbl[sb+k] = (INTPTR)(-1);
- }
- sb += tmpsbs;
- bound = (b<NUMCORES4GC) ? BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
- BLOCKINDEX(tmpheaptop-1, &tmpsbs);
- for(; b < tmpsbs; b++) {
- bamboo_smemtbl[b] = bound;
- if(b==NUMCORES4GC-1) {
- bound = BAMBOO_SMEM_SIZE;
- }
- }
- if(((isize-remain)%(BAMBOO_SMEM_SIZE)) == 0) {
- gcsbstarttbl[sb] = (INTPTR)(-1);
- remain = ((tmpheaptop-gcbaseva)<(BAMBOO_LARGE_SMEM_BOUND)) ?
- BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
- bamboo_smemtbl[b] = bound;
- } else {
- gcsbstarttbl[sb] = (INTPTR)(tmpheaptop);
- remain = tmpheaptop-gcbaseva;
- bamboo_smemtbl[b] = remain%bound;
- remain = bound - bamboo_smemtbl[b];
- } // if(((isize-remain)%(BAMBOO_SMEM_SIZE)) == 0) else ...
+ WAITFORGCPHASE(FINISHPHASE);
- // close current block and fill the header
- BAMBOO_MEMSET_WH(base, '\0', BAMBOO_CACHE_LINE_SIZE);
- *((int*)base) = isize + BAMBOO_CACHE_LINE_SIZE;
- cpysize = 0;
- base = tmpheaptop;
- if(remain == BAMBOO_CACHE_LINE_SIZE) {
- // fill with 0 in case
- BAMBOO_MEMSET_WH(tmpheaptop, '\0', remain);
- }
- remain -= BAMBOO_CACHE_LINE_SIZE;
- tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
- } else {
- remain -= isize;
- // move the large obj
- if((int)gcheaptop < (int)(tmpheaptop)+size) {
- memmove(tmpheaptop, gcheaptop, size);
- } else {
- //BAMBOO_WRITE_HINT_CACHE(tmpheaptop, size);
- memcpy(tmpheaptop, gcheaptop, size);
- }
- // fill the remaining space with -2 padding
- BAMBOO_MEMSET_WH(tmpheaptop+size, -2, isize-size);
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xea06);
- BAMBOO_DEBUGPRINT_REG(gcheaptop);
- BAMBOO_DEBUGPRINT_REG(tmpheaptop);
- BAMBOO_DEBUGPRINT_REG(size);
- BAMBOO_DEBUGPRINT_REG(isize);
-#endif
+ GC_PRINTF("Finish gc! \n");
+}
- gcheaptop += size;
- cpysize += isize;
- // cache the mapping info anyway
- //if(ptr != tmpheaptop) {
- BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
-#ifdef LOCALHASHTBL_TEST
- RuntimeHashadd_I(gcpointertbl, ptr, tmpheaptop);
-#else
- mgchashInsert_I(gcpointertbl, ptr, tmpheaptop);
-#endif
- //MGCHashadd_I(gcpointertbl, ptr, tmpheaptop);
- BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
- //}
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xcdcc);
- BAMBOO_DEBUGPRINT_REG(ptr);
- BAMBOO_DEBUGPRINT_REG(tmpheaptop);
- BAMBOO_DEBUGPRINT_REG(*((int*)tmpheaptop));
-#endif
- if(host != BAMBOO_NUM_OF_CORE) {
- // send the original host core with the mapping info
- send_msg_3(host, GCLOBJMAPPING, ptr, tmpheaptop, false);
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xcdcd);
- BAMBOO_DEBUGPRINT_REG(ptr);
- BAMBOO_DEBUGPRINT_REG(tmpheaptop);
-#endif
- } // if(host != BAMBOO_NUM_OF_CORE)
- tmpheaptop += isize;
+void gc_nocollect(struct garbagelist * stackptr) {
+ gc_status_info.gcprocessing = true;
+ // inform the master that this core is at a gc safe point and is ready to
+ // do gc
+ send_msg_4(STARTUPCORE,GCFINISHPRE,BAMBOO_NUM_OF_CORE,self_numsendobjs,self_numreceiveobjs);
+
+ WAITFORGCPHASE(INITPHASE);
- // update bamboo_smemtbl
- bamboo_smemtbl[b] += isize;
- } // if(remain < isize) else ...
- } // while(gc_lobjmoreItems())
- if(cpysize > 0) {
- // close current block, fill the header
- BAMBOO_MEMSET_WH(base, '\0', BAMBOO_CACHE_LINE_SIZE);
- *((int*)base) = cpysize + BAMBOO_CACHE_LINE_SIZE;
- bamboo_smemtbl[b] += BAMBOO_CACHE_LINE_SIZE;// add the size of the header
- } else {
- tmpheaptop -= BAMBOO_CACHE_LINE_SIZE;
- }
- gcheaptop = tmpheaptop;
+ GC_PRINTF("Do initGC\n");
+ initGC();
+ CACHEADAPT_GC(true);
- } // if(tomove == 0)
+ //send init finish msg to core coordinator
+ send_msg_2(STARTUPCORE,GCFINISHINIT,BAMBOO_NUM_OF_CORE);
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xea07);
- BAMBOO_DEBUGPRINT_REG(gcheaptop);
-#endif
- bamboo_free_block = 0;
- int tbound = 0;
- do {
- tbound = (bamboo_free_block<NUMCORES4GC) ?
- BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
- if(bamboo_smemtbl[bamboo_free_block] == tbound) {
- bamboo_free_block++;
- } else {
- // the first non-full partition
- break;
- }
- } while(true);
+ WAITFORGCPHASE(MARKPHASE);
-#ifdef GC_PROFILE
- // check how many live space there are
- gc_num_livespace = 0;
- for(int tmpi = 0; tmpi < gcnumblock; tmpi++) {
- gc_num_livespace += bamboo_smemtbl[tmpi];
- }
- gc_num_freespace = (BAMBOO_SHARED_MEM_SIZE) - gc_num_livespace;
-#endif
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xea08);
- BAMBOO_DEBUGPRINT_REG(gcheaptop);
-#endif
-} // void moveLObjs()
+ GC_PRINTF("Start mark phase\n");
+ mark(stackptr);
+ GC_PRINTF("Finish mark phase, wait for update\n");
-inline void markObj(void * objptr) {
- if(objptr == NULL) {
- return;
- }
- if(ISSHAREDOBJ(objptr)) {
- int host = hostcore(objptr);
- if(BAMBOO_NUM_OF_CORE == host) {
- // on this core
- BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
- if(((int *)objptr)[6] == INIT) {
- // this is the first time that this object is discovered,
- // set the flag as DISCOVERED
- ((int *)objptr)[6] |= DISCOVERED;
- BAMBOO_CACHE_FLUSH_LINE(objptr);
- gc_enqueue_I(objptr);
- }
- BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
- } else {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xbbbb);
- BAMBOO_DEBUGPRINT_REG(host);
- BAMBOO_DEBUGPRINT_REG(objptr);
-#endif
- // check if this obj has been forwarded
- if(!MGCHashcontains(gcforwardobjtbl, (int)objptr)) {
- // send a msg to host informing that objptr is active
- send_msg_2(host, GCMARKEDOBJ, objptr, /*BAMBOO_NUM_OF_CORE,*/ false);
-#ifdef GC_PROFILE
- gc_num_forwardobj++;
-#endif // GC_PROFILE
- gcself_numsendobjs++;
- MGCHashadd(gcforwardobjtbl, (int)objptr);
- }
- }
- } else {
- BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
- gc_enqueue_I(objptr);
- BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
- } // if(ISSHAREDOBJ(objptr))
-} // void markObj(void * objptr)
+ // non-gc core collector routine
+ WAITFORGCPHASE(UPDATEPHASE);
-// enqueue root objs
-inline void tomark(struct garbagelist * stackptr) {
- if(MARKPHASE != gcphase) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT_REG(gcphase);
-#endif
- BAMBOO_EXIT(0xb101);
- }
- gcbusystatus = true;
- gcnumlobjs = 0;
+ GC_PRINTF("Start update phase\n");
+ GCPROFILE_INFO_2_MASTER();
+ update(stackptr);
+ GC_PRINTF("Finish update phase\n");
- int i,j;
- // enqueue current stack
- while(stackptr!=NULL) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe501);
- BAMBOO_DEBUGPRINT_REG(stackptr->size);
- BAMBOO_DEBUGPRINT_REG(stackptr->next);
- BAMBOO_DEBUGPRINT_REG(stackptr->array[0]);
-#endif
- for(i=0; i<stackptr->size; i++) {
- if(stackptr->array[i] != NULL) {
- markObj(stackptr->array[i]);
- }
- }
- stackptr=stackptr->next;
- }
+ CACHEADAPT_PHASE_CLIENT();
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe503);
-#endif
- // enqueue objectsets
- if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) {
- for(i=0; i<NUMCLASSES; i++) {
- struct parameterwrapper ** queues =
- objectqueues[BAMBOO_NUM_OF_CORE][i];
- int length = numqueues[BAMBOO_NUM_OF_CORE][i];
- for(j = 0; j < length; ++j) {
- struct parameterwrapper * parameter = queues[j];
- struct ObjectHash * set=parameter->objectset;
- struct ObjectNode * ptr=set->listhead;
- while(ptr!=NULL) {
- markObj((void *)ptr->key);
- ptr=ptr->lnext;
- }
- }
- }
- }
+ // invalidate all shared mem pointers
+ bamboo_cur_msp = NULL;
+ bamboo_smem_size = 0;
+ bamboo_smem_zero_top = NULL;
- // euqueue current task descriptor
- if(currtpd != NULL) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe504);
-#endif
- for(i=0; i<currtpd->numParameters; i++) {
- markObj(currtpd->parameterArray[i]);
- }
- }
+ gcflag = false;
+ WAITFORGCPHASE(FINISHPHASE);
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe505);
-#endif
- // euqueue active tasks
- if(activetasks != NULL) {
- struct genpointerlist * ptr=activetasks->list;
- while(ptr!=NULL) {
- struct taskparamdescriptor *tpd=ptr->src;
- int i;
- for(i=0; i<tpd->numParameters; i++) {
- markObj(tpd->parameterArray[i]);
- }
- ptr=ptr->inext;
- }
- }
+ GC_PRINTF("Finish gc! \n");
+}
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe506);
-#endif
- // enqueue cached transferred obj
- struct QueueItem * tmpobjptr = getHead(&objqueue);
- while(tmpobjptr != NULL) {
- struct transObjInfo * objInfo =
- (struct transObjInfo *)(tmpobjptr->objectptr);
- markObj(objInfo->objptr);
- tmpobjptr = getNextQueueItem(tmpobjptr);
- }
+void master_mark(struct garbagelist *stackptr) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe507);
-#endif
- // enqueue cached objs to be transferred
- struct QueueItem * item = getHead(totransobjqueue);
- while(item != NULL) {
- struct transObjInfo * totransobj =
- (struct transObjInfo *)(item->objectptr);
- markObj(totransobj->objptr);
- item = getNextQueueItem(item);
- } // while(item != NULL)
+ GC_PRINTF("Start mark phase \n");
+ gc_status_info.gcphase = MARKPHASE;
+ GC_SEND_MSG_1_TO_CLIENT(GCSTART);
+ // mark phase
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe508);
-#endif
- // enqueue lock related info
- for(i = 0; i < runtime_locklen; ++i) {
- markObj((void *)(runtime_locks[i].redirectlock));
- if(runtime_locks[i].value != NULL) {
- markObj((void *)(runtime_locks[i].value));
- }
+ mark(stackptr);
+}
+
+void master_getlargeobjs() {
+ // send msgs to all cores requiring large objs info
+ // Note: only need to ask gc cores, non-gc cores do not host any objs
+ numconfirm = NUMCORES4GC - 1;
+ for(int i = 1; i < NUMCORES4GC; i++) {
+ send_msg_1(i,GCLOBJREQUEST);
}
+ gcloads[BAMBOO_NUM_OF_CORE] = gccurr_heaptop;
+ //spin until we have all responses
+ while(numconfirm!=0) ;
-} // void tomark(struct garbagelist * stackptr)
+ GCPROFILE_ITEM_MASTER();
+ GC_PRINTF("prepare to cache large objs \n");
-inline void mark(bool isfirst,
- struct garbagelist * stackptr) {
-#ifdef DEBUG
- if(BAMBOO_NUM_OF_CORE == 0) BAMBOO_DEBUGPRINT(0xed01);
-#endif
- if(isfirst) {
-#ifdef DEBUG
- if(BAMBOO_NUM_OF_CORE == 0) BAMBOO_DEBUGPRINT(0xed02);
-#endif
- // enqueue root objs
- tomark(stackptr);
- gccurr_heaptop = 0; // record the size of all active objs in this core
- // aligned but does not consider block boundaries
- gcmarkedptrbound = 0;
- }
-#ifdef DEBUG
- if(BAMBOO_NUM_OF_CORE == 0) BAMBOO_DEBUGPRINT(0xed03);
-#endif
- int isize = 0;
- bool checkfield = true;
- bool sendStall = false;
- // mark phase
- while(MARKPHASE == gcphase) {
-#ifdef DEBUG
- if(BAMBOO_NUM_OF_CORE == 0) BAMBOO_DEBUGPRINT(0xed04);
-#endif
- while(true) {
- BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
- bool hasItems = gc_moreItems2_I();
- BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xed05);
-#endif
- if(!hasItems) {
- break;
- }
- sendStall = false;
- gcbusystatus = true;
- checkfield = true;
- void * ptr = gc_dequeue2_I();
+}
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT_REG(ptr);
-#endif
- int size = 0;
- int isize = 0;
- int type = 0;
- // check if it is a shared obj
- if(ISSHAREDOBJ(ptr)) {
- // a shared obj, check if it is a local obj on this core
- int host = hostcore(ptr);
- bool islocal = (host == BAMBOO_NUM_OF_CORE);
- if(islocal) {
- bool isnotmarked = ((((int *)ptr)[6] & DISCOVERED) != 0);
- if(isLarge(ptr, &type, &size) && isnotmarked) {
- // ptr is a large object and not marked or enqueued
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xecec);
- BAMBOO_DEBUGPRINT_REG(ptr);
- BAMBOO_DEBUGPRINT_REG(*((int*)ptr));
-#endif
- BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
- gc_lobjenqueue_I(ptr, size, BAMBOO_NUM_OF_CORE);
- gcnumlobjs++;
- BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
- // mark this obj
- ((int *)ptr)[6] = ((int *)ptr)[6] & (~DISCOVERED) | MARKED;
- BAMBOO_CACHE_FLUSH_LINE(ptr);
- } else if(isnotmarked) {
- // ptr is an unmarked active object on this core
- ALIGNSIZE(size, &isize);
- gccurr_heaptop += isize;
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xaaaa);
- BAMBOO_DEBUGPRINT_REG(ptr);
- BAMBOO_DEBUGPRINT_REG(isize);
- BAMBOO_DEBUGPRINT(((int *)(ptr))[0]);
-#endif
- // mark this obj
- ((int *)ptr)[6] = ((int *)ptr)[6] & (~DISCOVERED) | MARKED;
- BAMBOO_CACHE_FLUSH_LINE(ptr);
-
- if(ptr + size > gcmarkedptrbound) {
- gcmarkedptrbound = ptr + size;
- } // if(ptr + size > gcmarkedptrbound)
- } else {
- // ptr is not an active obj or has been marked
- checkfield = false;
- } // if(isLarge(ptr, &type, &size)) else ...
- } /* can never reach here
- else {
-#ifdef DEBUG
- if(BAMBOO_NUM_OF_CORE == 0) {
- BAMBOO_DEBUGPRINT(0xbbbb);
- BAMBOO_DEBUGPRINT_REG(host);
- BAMBOO_DEBUGPRINT_REG(ptr);
- }
-#endif
- // check if this obj has been forwarded
- if(!MGCHashcontains(gcforwardobjtbl, (int)ptr)) {
- // send a msg to host informing that ptr is active
- send_msg_2(host, GCMARKEDOBJ, ptr, false);
- gcself_numsendobjs++;
- MGCHashadd(gcforwardobjtbl, (int)ptr);
- }
- checkfield = false;
- }// if(isLocal(ptr)) else ...*/
- } // if(ISSHAREDOBJ(ptr))
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xed06);
-#endif
-
- if(checkfield) {
- // scan all pointers in ptr
- unsigned INTPTR * pointer;
- pointer=pointerarray[type];
- if (pointer==0) {
- /* Array of primitives */
- /* Do nothing */
- } else if (((INTPTR)pointer)==1) {
- /* Array of pointers */
- struct ArrayObject *ao=(struct ArrayObject *) ptr;
- int length=ao->___length___;
- int j;
- for(j=0; j<length; j++) {
- void *objptr =
- ((void **)(((char *)&ao->___length___)+sizeof(int)))[j];
- markObj(objptr);
- }
- } else {
- INTPTR size=pointer[0];
- int i;
- for(i=1; i<=size; i++) {
- unsigned int offset=pointer[i];
- void * objptr=*((void **)(((char *)ptr)+offset));
- markObj(objptr);
- }
- } // if (pointer==0) else if ... else ...
- } // if(checkfield)
- } // while(gc_moreItems2())
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xed07);
-#endif
- gcbusystatus = false;
- // send mark finish msg to core coordinator
- if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xed08);
-#endif
- gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
- gcnumsendobjs[gcnumsrobjs_index][BAMBOO_NUM_OF_CORE]=gcself_numsendobjs;
- gcnumreceiveobjs[gcnumsrobjs_index][BAMBOO_NUM_OF_CORE]=
- gcself_numreceiveobjs;
- gcloads[BAMBOO_NUM_OF_CORE] = gccurr_heaptop;
- } else {
- if(!sendStall) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xed09);
-#endif
- send_msg_4(STARTUPCORE, GCFINISHMARK, BAMBOO_NUM_OF_CORE,
- gcself_numsendobjs, gcself_numreceiveobjs, false);
- sendStall = true;
- }
- } // if(STARTUPCORE == BAMBOO_NUM_OF_CORE) ...
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xed0a);
-#endif
-
- if(BAMBOO_NUM_OF_CORE == STARTUPCORE) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xed0b);
-#endif
- return;
- }
- } // while(MARKPHASE == gcphase)
-
- BAMBOO_CACHE_MF();
-} // mark()
-
-inline void compact2Heaptophelper_I(int coren,
- int* p,
- int* numblocks,
- int* remain) {
- int b;
- int memneed = gcrequiredmems[coren] + BAMBOO_CACHE_LINE_SIZE;
- if(STARTUPCORE == coren) {
- gctomove = true;
- gcmovestartaddr = *p;
- gcdstcore = gctopcore;
- gcblock2fill = *numblocks + 1;
- } else {
- send_msg_4(coren, GCMOVESTART, gctopcore, *p, (*numblocks) + 1, false);
- }
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT_REG(coren);
- BAMBOO_DEBUGPRINT_REG(gctopcore);
- BAMBOO_DEBUGPRINT_REG(*p);
- BAMBOO_DEBUGPRINT_REG(*numblocks+1);
-#endif
- if(memneed < *remain) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xd104);
-#endif
- *p = *p + memneed;
- gcrequiredmems[coren] = 0;
- gcloads[gctopcore] += memneed;
- *remain = *remain - memneed;
- } else {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xd105);
-#endif
- // next available block
- *p = *p + *remain;
- gcfilledblocks[gctopcore] += 1;
- int newbase = 0;
- BASEPTR(gctopcore, gcfilledblocks[gctopcore], &newbase);
- gcloads[gctopcore] = newbase;
- gcrequiredmems[coren] -= *remain - BAMBOO_CACHE_LINE_SIZE;
- gcstopblock[gctopcore]++;
- gctopcore = NEXTTOPCORE(gctopblock);
- gctopblock++;
- *numblocks = gcstopblock[gctopcore];
- *p = gcloads[gctopcore];
- BLOCKINDEX(*p, &b);
- *remain=(b<NUMCORES4GC) ?
- ((BAMBOO_SMEM_SIZE_L)-((*p)%(BAMBOO_SMEM_SIZE_L)))
- : ((BAMBOO_SMEM_SIZE)-((*p)%(BAMBOO_SMEM_SIZE)));
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xd106);
- BAMBOO_DEBUGPRINT_REG(gctopcore);
- BAMBOO_DEBUGPRINT_REG(*p);
- BAMBOO_DEBUGPRINT_REG(b);
- BAMBOO_DEBUGPRINT_REG(*remain);
-#endif
- } // if(memneed < remain)
- gcmovepending--;
-} // void compact2Heaptophelper_I(int, int*, int*, int*)
-
-inline void compact2Heaptop() {
- // no cores with spare mem and some cores are blocked with pending move
- // find the current heap top and make them move to the heap top
- int p;
- int numblocks = gcfilledblocks[gctopcore];
- //BASEPTR(gctopcore, numblocks, &p);
- p = gcloads[gctopcore];
- int b;
- BLOCKINDEX(p, &b);
- int remain = (b<NUMCORES4GC) ?
- ((BAMBOO_SMEM_SIZE_L)-(p%(BAMBOO_SMEM_SIZE_L)))
- : ((BAMBOO_SMEM_SIZE)-(p%(BAMBOO_SMEM_SIZE)));
- // check if the top core finishes
- BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
- if(gccorestatus[gctopcore] != 0) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xd101);
- BAMBOO_DEBUGPRINT_REG(gctopcore);
-#endif
- // let the top core finishes its own work first
- compact2Heaptophelper_I(gctopcore, &p, &numblocks, &remain);
- BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
- return;
- }
- BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xd102);
- BAMBOO_DEBUGPRINT_REG(gctopcore);
- BAMBOO_DEBUGPRINT_REG(p);
- BAMBOO_DEBUGPRINT_REG(b);
- BAMBOO_DEBUGPRINT_REG(remain);
-#endif
- for(int i = 0; i < NUMCORES4GC; i++) {
- BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
- if((gccorestatus[i] != 0) && (gcrequiredmems[i] > 0)) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xd103);
-#endif
- compact2Heaptophelper_I(i, &p, &numblocks, &remain);
- if(gccorestatus[gctopcore] != 0) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xd101);
- BAMBOO_DEBUGPRINT_REG(gctopcore);
-#endif
- BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
- // the top core is not free now
- return;
- }
- } // if((gccorestatus[i] != 0) && (gcrequiredmems[i] > 0))
- BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
- } // for(i = 0; i < NUMCORES4GC; i++)
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xd106);
-#endif
-} // void compact2Heaptop()
-
-inline void resolvePendingMoveRequest() {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xeb01);
-#endif
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xeeee);
- for(int k = 0; k < NUMCORES4GC; k++) {
- BAMBOO_DEBUGPRINT(0xf000+k);
- BAMBOO_DEBUGPRINT_REG(gccorestatus[k]);
- BAMBOO_DEBUGPRINT_REG(gcloads[k]);
- BAMBOO_DEBUGPRINT_REG(gcfilledblocks[k]);
- BAMBOO_DEBUGPRINT_REG(gcstopblock[k]);
- }
- BAMBOO_DEBUGPRINT(0xffff);
-#endif
- int i;
- int j;
- bool nosparemem = true;
- bool haspending = false;
- bool hasrunning = false;
- bool noblock = false;
- int dstcore = 0; // the core who need spare mem
- int sourcecore = 0; // the core who has spare mem
- for(i = j = 0; (i < NUMCORES4GC) && (j < NUMCORES4GC); ) {
- if(nosparemem) {
- // check if there are cores with spare mem
- if(gccorestatus[i] == 0) {
- // finished working, check if it still have spare mem
- if(gcfilledblocks[i] < gcstopblock[i]) {
- // still have spare mem
- nosparemem = false;
- sourcecore = i;
- } // if(gcfilledblocks[i] < gcstopblock[i]) else ...
- }
- i++;
- } // if(nosparemem)
- if(!haspending) {
- if(gccorestatus[j] != 0) {
- // not finished, check if it has pending move requests
- if((gcfilledblocks[j]==gcstopblock[j])&&(gcrequiredmems[j]>0)) {
- dstcore = j;
- haspending = true;
- } else {
- hasrunning = true;
- } // if((gcfilledblocks[i] == gcstopblock[i])...) else ...
- } // if(gccorestatus[i] == 0) else ...
- j++;
- } // if(!haspending)
- if(!nosparemem && haspending) {
- // find match
- int tomove = 0;
- int startaddr = 0;
- BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
- gcrequiredmems[dstcore] = assignSpareMem_I(sourcecore,
- gcrequiredmems[dstcore],
- &tomove,
- &startaddr);
- BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xeb02);
- BAMBOO_DEBUGPRINT_REG(sourcecore);
- BAMBOO_DEBUGPRINT_REG(dstcore);
- BAMBOO_DEBUGPRINT_REG(startaddr);
- BAMBOO_DEBUGPRINT_REG(tomove);
-#endif
- if(STARTUPCORE == dstcore) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xeb03);
-#endif
- gcdstcore = sourcecore;
- gctomove = true;
- gcmovestartaddr = startaddr;
- gcblock2fill = tomove;
- } else {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xeb04);
-#endif
- send_msg_4(dstcore, GCMOVESTART, sourcecore,
- startaddr, tomove, false);
- }
- gcmovepending--;
- nosparemem = true;
- haspending = false;
- noblock = true;
- }
- } // for(i = 0; i < NUMCORES4GC; i++)
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xcccc);
- BAMBOO_DEBUGPRINT_REG(hasrunning);
- BAMBOO_DEBUGPRINT_REG(haspending);
- BAMBOO_DEBUGPRINT_REG(noblock);
-#endif
-
- if(!hasrunning && !noblock) {
- gcphase = SUBTLECOMPACTPHASE;
- compact2Heaptop();
- }
-
-} // void resovePendingMoveRequest()
-
-struct moveHelper {
- int numblocks; // block num for heap
- INTPTR base; // base virtual address of current heap block
- INTPTR ptr; // virtual address of current heap top
- int offset; // offset in current heap block
- int blockbase; // virtual address of current small block to check
- int blockbound; // bound virtual address of current small blcok
- int sblockindex; // index of the small blocks
- int top; // real size of current heap block to check
- int bound; // bound size of current heap block to check
-}; // struct moveHelper
-
-// If out of boundary of valid shared memory, return false, else return true
-inline bool nextSBlock(struct moveHelper * orig) {
- orig->blockbase = orig->blockbound;
- bool sbchanged = false;
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xecc0);
- BAMBOO_DEBUGPRINT_REG(orig->blockbase);
- BAMBOO_DEBUGPRINT_REG(orig->blockbound);
- BAMBOO_DEBUGPRINT_REG(orig->bound);
- BAMBOO_DEBUGPRINT_REG(orig->ptr);
-#endif
-outernextSBlock:
- // check if across a big block
- // TODO now do not zero out the whole memory, maybe the last two conditions
- // are useless now
- if((orig->blockbase >= orig->bound) || (orig->ptr >= orig->bound)
- || ((orig->ptr != NULL) && (*((int*)orig->ptr))==0)
- || ((*((int*)orig->blockbase))==0)) {
-innernextSBlock:
- // end of current heap block, jump to next one
- orig->numblocks++;
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xecc1);
- BAMBOO_DEBUGPRINT_REG(orig->numblocks);
-#endif
- BASEPTR(BAMBOO_NUM_OF_CORE, orig->numblocks, &(orig->base));
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(orig->base);
-#endif
- if(orig->base >= gcbaseva + BAMBOO_SHARED_MEM_SIZE) {
- // out of boundary
- orig->ptr = orig->base; // set current ptr to out of boundary too
- return false;
- }
- //orig->bound = orig->base + BAMBOO_SMEM_SIZE;
- orig->blockbase = orig->base;
- orig->sblockindex = (orig->blockbase-gcbaseva)/BAMBOO_SMEM_SIZE;
- sbchanged = true;
- int blocknum = 0;
- BLOCKINDEX(orig->base, &blocknum);
- if(bamboo_smemtbl[blocknum] == 0) {
- // goto next block
- goto innernextSBlock;
- }
- // check the bamboo_smemtbl to decide the real bound
- orig->bound = orig->base + bamboo_smemtbl[blocknum];
- } else if(0 == (orig->blockbase%BAMBOO_SMEM_SIZE)) {
- orig->sblockindex += 1;
- sbchanged = true;
- } // if((orig->blockbase >= orig->bound) || (orig->ptr >= orig->bound)...
-
- // check if this sblock should be skipped or have special start point
- if(gcsbstarttbl[orig->sblockindex] == -1) {
- // goto next sblock
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xecc2);
-#endif
- orig->sblockindex += 1;
- orig->blockbase += BAMBOO_SMEM_SIZE;
- goto outernextSBlock;
- } else if((gcsbstarttbl[orig->sblockindex] != 0)
- && (sbchanged)) {
- // the first time to access this SBlock
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xecc3);
-#endif
- // not start from the very beginning
- orig->blockbase = gcsbstarttbl[orig->sblockindex];
- } // if(gcsbstarttbl[orig->sblockindex] == -1) else ...
-
- // setup information for this sblock
- orig->blockbound = orig->blockbase + *((int*)(orig->blockbase));
- orig->offset = BAMBOO_CACHE_LINE_SIZE;
- orig->ptr = orig->blockbase + orig->offset;
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xecc4);
- BAMBOO_DEBUGPRINT_REG(orig->base);
- BAMBOO_DEBUGPRINT_REG(orig->bound);
- BAMBOO_DEBUGPRINT_REG(orig->ptr);
- BAMBOO_DEBUGPRINT_REG(orig->blockbound);
- BAMBOO_DEBUGPRINT_REG(orig->blockbase);
- BAMBOO_DEBUGPRINT_REG(orig->offset);
-#endif
- if(orig->ptr >= orig->bound) {
- // met a lobj, move to next block
- goto innernextSBlock;
- }
-
- return true;
-} // bool nextSBlock(struct moveHelper * orig)
-
-// return false if there are no available data to compact
-inline bool initOrig_Dst(struct moveHelper * orig,
- struct moveHelper * to) {
- // init the dst ptr
- to->numblocks = 0;
- to->top = to->offset = BAMBOO_CACHE_LINE_SIZE;
- to->bound = BAMBOO_SMEM_SIZE_L;
- BASEPTR(BAMBOO_NUM_OF_CORE, to->numblocks, &(to->base));
-
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xef01);
- BAMBOO_DEBUGPRINT_REG(to->base);
-#endif
- to->ptr = to->base + to->offset;
-#ifdef GC_CACHE_ADAPT
- // initialize the gc_cache_revise_information
- gc_cache_revise_infomation.to_page_start_va = to->ptr;
- gc_cache_revise_infomation.to_page_end_va = (BAMBOO_PAGE_SIZE)*
- ((to->base-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
- gc_cache_revise_infomation.to_page_index =
- (to->base-gcbaseva)/(BAMBOO_PAGE_SIZE);
- gc_cache_revise_infomation.orig_page_start_va = -1;
-#endif // GC_CACHE_ADAPT
-
- // init the orig ptr
- orig->numblocks = 0;
- orig->base = to->base;
- int blocknum = 0;
- BLOCKINDEX(orig->base, &blocknum);
- // check the bamboo_smemtbl to decide the real bound
- orig->bound = orig->base + bamboo_smemtbl[blocknum];
- orig->blockbase = orig->base;
- orig->sblockindex = (orig->base - gcbaseva) / BAMBOO_SMEM_SIZE;
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xef02);
- BAMBOO_DEBUGPRINT_REG(orig->base);
- BAMBOO_DEBUGPRINT_REG(orig->sblockindex);
- BAMBOO_DEBUGPRINT_REG(gcsbstarttbl);
- BAMBOO_DEBUGPRINT_REG(gcsbstarttbl[orig->sblockindex]);
-#endif
-
- if(gcsbstarttbl[orig->sblockindex] == -1) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xef03);
-#endif
- // goto next sblock
- orig->blockbound =
- gcbaseva+BAMBOO_SMEM_SIZE*(orig->sblockindex+1);
- return nextSBlock(orig);
- } else if(gcsbstarttbl[orig->sblockindex] != 0) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xef04);
-#endif
- orig->blockbase = gcsbstarttbl[orig->sblockindex];
- }
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xef05);
-#endif
- orig->blockbound = orig->blockbase + *((int*)(orig->blockbase));
- orig->offset = BAMBOO_CACHE_LINE_SIZE;
- orig->ptr = orig->blockbase + orig->offset;
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xef06);
- BAMBOO_DEBUGPRINT_REG(orig->base);
-#endif
-
- return true;
-} // bool initOrig_Dst(struct moveHelper * orig, struct moveHelper * to)
-
-inline void nextBlock(struct moveHelper * to) {
- to->top = to->bound + BAMBOO_CACHE_LINE_SIZE; // header!
- to->bound += BAMBOO_SMEM_SIZE;
- to->numblocks++;
- BASEPTR(BAMBOO_NUM_OF_CORE, to->numblocks, &(to->base));
- to->offset = BAMBOO_CACHE_LINE_SIZE;
- to->ptr = to->base + to->offset;
-} // void nextBlock(struct moveHelper * to)
-
-// endaddr does not contain spaces for headers
-inline bool moveobj(struct moveHelper * orig,
- struct moveHelper * to,
- int stopblock) {
- if(stopblock == 0) {
- return true;
- }
-
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe201);
- BAMBOO_DEBUGPRINT_REG(orig->ptr);
- BAMBOO_DEBUGPRINT_REG(to->ptr);
-#endif
-
- int type = 0;
- int size = 0;
- int mark = 0;
- int isize = 0;
-innermoveobj:
- while((char)(*((int*)(orig->ptr))) == (char)(-2)) {
- orig->ptr = (int*)(orig->ptr) + 1;
- }
-#ifdef GC_CACHE_ADAPT
- if(orig->ptr >= gc_cache_revise_infomation.orig_page_end_va) {
- // end of an orig page
- // compute the impact of this page for the new page
- int tmp_factor = to->ptr-gc_cache_revise_infomation.to_page_start_va;
- int topage=gc_cache_revise_infomation.to_page_index;
- int oldpage = gc_cache_revise_infomation.orig_page_index;
- int * newtable=&gccachesamplingtbl_r[topage];
- int * oldtable=&gccachesamplingtbl[oldpage];
-
- for(int tt = 0; tt < NUMCORESACTIVE; tt++) {
- (*newtable) += (*oldtable)*tmp_factor;
- newtable=(int*)(((char *)newtable)+size_cachesamplingtbl_local_r);
- oldtable=(int*)(((char *)oldtable)+size_cachesamplingtbl_local);
- }
- // prepare for an new orig page
- int tmp_index = (orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
- gc_cache_revise_infomation.orig_page_start_va = orig->ptr;
- gc_cache_revise_infomation.orig_page_end_va = gcbaseva +
- (BAMBOO_PAGE_SIZE)*(tmp_index+1);
- gc_cache_revise_infomation.orig_page_index = tmp_index;
- gc_cache_revise_infomation.to_page_start_va = to->ptr;
- }
-#endif
- if((orig->ptr >= orig->bound) || (orig->ptr == orig->blockbound)) {
- if(!nextSBlock(orig)) {
- // finished, no more data
- return true;
- }
- goto innermoveobj;
- }
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe202);
- BAMBOO_DEBUGPRINT_REG(orig->ptr);
- BAMBOO_DEBUGPRINT(((int *)(orig->ptr))[0]);
-#endif
- // check the obj's type, size and mark flag
- type = ((int *)(orig->ptr))[0];
- size = 0;
- if(type == 0) {
- // end of this block, go to next one
- if(!nextSBlock(orig)) {
- // finished, no more data
- return true;
- }
- goto innermoveobj;
- } else if(type < NUMCLASSES) {
- // a normal object
- size = classsize[type];
- } else {
- // an array
- struct ArrayObject *ao=(struct ArrayObject *)(orig->ptr);
- int elementsize=classsize[type];
- int length=ao->___length___;
- size=sizeof(struct ArrayObject)+length*elementsize;
- }
- mark = ((int *)(orig->ptr))[6];
- bool isremote = ((((int *)(orig->ptr))[6] & REMOTEM) != 0);
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe203);
- BAMBOO_DEBUGPRINT_REG(orig->ptr);
- BAMBOO_DEBUGPRINT_REG(size);
-#endif
- ALIGNSIZE(size, &isize); // no matter is the obj marked or not
- // should be able to across it
- if((mark & MARKED) != 0) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe204);
-#endif
-#ifdef GC_PROFILE
- gc_num_liveobj++;
-#endif
- // marked obj, copy it to current heap top
- // check to see if remaining space is enough
- if(to->top + isize > to->bound) {
- // fill 0 indicating the end of this block
- BAMBOO_MEMSET_WH(to->ptr, '\0', to->bound - to->top);
- // fill the header of this block and then go to next block
- to->offset += to->bound - to->top;
- BAMBOO_MEMSET_WH(to->base, '\0', BAMBOO_CACHE_LINE_SIZE);
- (*((int*)(to->base))) = to->offset;
-#ifdef GC_CACHE_ADAPT
- int tmp_ptr = to->ptr;
-#endif // GC_CACHE_ADAPT
- nextBlock(to);
-#ifdef GC_CACHE_ADAPT
- if((to->base+to->bound) >= gc_cache_revise_infomation.to_page_end_va) {
- // end of an to page, wrap up its information
- int tmp_factor = tmp_ptr-gc_cache_revise_infomation.to_page_start_va;
- int topage=gc_cache_revise_infomation.to_page_index;
- int oldpage = gc_cache_revise_infomation.orig_page_index;
- int * newtable=&gccachesamplingtbl_r[topage];
- int * oldtable=&gccachesamplingtbl[oldpage];
-
- for(int tt = 0; tt < NUMCORESACTIVE; tt++) {
- (*newtable)=((*newtable)+(*oldtable)*tmp_factor);
- newtable=(int*) (((char *)newtable)+size_cachesamplingtbl_local_r);
- oldtable=(int*) (((char *)oldtable)+size_cachesamplingtbl_local);
- }
- // prepare for an new to page
- int tmp_index = (orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
- gc_cache_revise_infomation.orig_page_start_va = orig->ptr;
- gc_cache_revise_infomation.orig_page_end_va = gcbaseva +
- (BAMBOO_PAGE_SIZE)*(tmp_index+1);
- gc_cache_revise_infomation.orig_page_index = tmp_index;
- gc_cache_revise_infomation.to_page_start_va = to->ptr;
- gc_cache_revise_infomation.to_page_end_va = gcbaseva +
- (BAMBOO_PAGE_SIZE)*((to->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
- gc_cache_revise_infomation.to_page_index =
- (to->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
- }
-#endif // GC_CACHE_ADAPT
- if(stopblock == to->numblocks) {
- // already fulfilled the block
- return true;
- } // if(stopblock == to->numblocks)
- } // if(to->top + isize > to->bound)
- // set the mark field to 2, indicating that this obj has been moved
- // and need to be flushed
- ((int *)(orig->ptr))[6] = COMPACTED;
- if(to->ptr != orig->ptr) {
- if((int)(orig->ptr) < (int)(to->ptr)+size) {
- memmove(to->ptr, orig->ptr, size);
- } else {
- //BAMBOO_WRITE_HINT_CACHE(to->ptr, size);
- memcpy(to->ptr, orig->ptr, size);
- }
- // fill the remaining space with -2
- BAMBOO_MEMSET_WH(to->ptr+size, -2, isize-size);
- }
- // store mapping info
- BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
-#ifdef LOCALHASHTBL_TEST
- RuntimeHashadd_I(gcpointertbl, orig->ptr, to->ptr);
-#else
- mgchashInsert_I(gcpointertbl, orig->ptr, to->ptr);
-#endif
- //MGCHashadd_I(gcpointertbl, orig->ptr, to->ptr);
- if(isremote) {
- // add to the sharedptbl
- if(gcsharedptbl != NULL) {
- //GCSharedHashadd_I(gcsharedptbl, orig->ptr, to->ptr);
- mgcsharedhashInsert_I(gcsharedptbl, orig->ptr, to->ptr);
- }
- }
- BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
- //}
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xcdce);
- BAMBOO_DEBUGPRINT_REG(orig->ptr);
- BAMBOO_DEBUGPRINT_REG(to->ptr);
- BAMBOO_DEBUGPRINT_REG(isize);
-#endif
- gccurr_heaptop -= isize;
- to->ptr += isize;
- to->offset += isize;
- to->top += isize;
-#if 0
-#ifdef GC_CACHE_ADAPT
- int tmp_ptr = to->ptr;
-#endif // GC_CACHE_ADAPT
- if(to->top == to->bound) {
- // fill the header of this block and then go to next block
- BAMBOO_MEMSET_WH(to->base, '\0', BAMBOO_CACHE_LINE_SIZE);
- (*((int*)(to->base))) = to->offset;
- nextBlock(to);
-#ifdef GC_CACHE_ADAPT
- if((to->base+to->bound) >= gc_cache_revise_infomation.to_page_end_va) {
- // end of an to page, wrap up its information
- int tmp_factor = tmp_ptr-gc_cache_revise_infomation.to_page_start_va;
- int topage=gc_cache_revise_infomation.to_page_index;
- int oldpage = gc_cache_revise_infomation.orig_page_index;
- int * newtable=&gccachesamplingtbl_r[topage];
- int * oldtable=&gccachesamplingtbl[oldpage];
-
- for(int tt = 0; tt < NUMCORESACTIVE; tt++) {
- (*newtable)=((*newtable)+(*oldtable)*tmp_factor);
- newtable=(int*) (((char *)newtable)+size_cachesamplingtbl_local_r);
- oldtable=(int*) (((char *)oldtable)+size_cachesamplingtbl_local);
- }
- // prepare for an new to page
- int tmp_index = (orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
- gc_cache_revise_infomation.orig_page_start_va = orig->ptr;
- gc_cache_revise_infomation.orig_page_end_va = gcbaseva +
- (BAMBOO_PAGE_SIZE)*((orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
- gc_cache_revise_infomation.orig_page_index =
- (orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
- gc_cache_revise_infomation.to_page_start_va = to->ptr;
- gc_cache_revise_infomation.to_page_end_va = gcbaseva +
- (BAMBOO_PAGE_SIZE)*((to->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
- gc_cache_revise_infomation.to_page_index =
- (to->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
- }
-#endif // GC_CACHE_ADAPT
- }
-#endif
- } // if(mark == 1)
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe205);
-#endif
- // move to next obj
- orig->ptr += size;
-
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT_REG(isize);
- BAMBOO_DEBUGPRINT_REG(size);
- BAMBOO_DEBUGPRINT_REG(orig->ptr);
- BAMBOO_DEBUGPRINT_REG(orig->bound);
-#endif
- if((orig->ptr > orig->bound) || (orig->ptr == orig->blockbound)) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe206);
-#endif
- if(!nextSBlock(orig)) {
- // finished, no more data
- return true;
- }
- }
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe207);
- BAMBOO_DEBUGPRINT_REG(orig->ptr);
-#endif
- return false;
-} //bool moveobj(struct moveHelper* orig,struct moveHelper* to,int* endaddr)
-
-// should be invoked with interrupt closed
-inline int assignSpareMem_I(int sourcecore,
- int * requiredmem,
- int * tomove,
- int * startaddr) {
- int b = 0;
- BLOCKINDEX(gcloads[sourcecore], &b);
- int boundptr = (b<NUMCORES4GC) ? ((b+1)*BAMBOO_SMEM_SIZE_L)
- : (BAMBOO_LARGE_SMEM_BOUND+(b-NUMCORES4GC+1)*BAMBOO_SMEM_SIZE);
- int remain = boundptr - gcloads[sourcecore];
- int memneed = requiredmem + BAMBOO_CACHE_LINE_SIZE;
- *startaddr = gcloads[sourcecore];
- *tomove = gcfilledblocks[sourcecore] + 1;
- if(memneed < remain) {
- gcloads[sourcecore] += memneed;
- return 0;
- } else {
- // next available block
- gcfilledblocks[sourcecore] += 1;
- int newbase = 0;
- BASEPTR(sourcecore, gcfilledblocks[sourcecore], &newbase);
- gcloads[sourcecore] = newbase;
- return requiredmem-remain;
- }
-} // int assignSpareMem_I(int ,int * , int * , int * )
-
-// should be invoked with interrupt closed
-inline bool gcfindSpareMem_I(int * startaddr,
- int * tomove,
- int * dstcore,
- int requiredmem,
- int requiredcore) {
- for(int k = 0; k < NUMCORES4GC; k++) {
- if((gccorestatus[k] == 0) && (gcfilledblocks[k] < gcstopblock[k])) {
- // check if this stopped core has enough mem
- assignSpareMem_I(k, requiredmem, tomove, startaddr);
- *dstcore = k;
- return true;
- }
- }
- // if can not find spare mem right now, hold the request
- gcrequiredmems[requiredcore] = requiredmem;
- gcmovepending++;
- return false;
-} //bool gcfindSpareMem_I(int* startaddr,int* tomove,int mem,int core)
-
-inline bool compacthelper(struct moveHelper * orig,
- struct moveHelper * to,
- int * filledblocks,
- int * heaptopptr,
- bool * localcompact) {
- // scan over all objs in this block, compact the marked objs
- // loop stop when finishing either scanning all active objs or
- // fulfilled the gcstopblock
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe101);
- BAMBOO_DEBUGPRINT_REG(gcblock2fill);
- BAMBOO_DEBUGPRINT_REG(gcmarkedptrbound);
-#endif
-innercompact:
- while(orig->ptr < gcmarkedptrbound) {
- bool stop = moveobj(orig, to, gcblock2fill);
- if(stop) {
- break;
- }
- }
-#ifdef GC_CACHE_ADAPT
- // end of an to page, wrap up its information
- int tmp_factor = to->ptr-gc_cache_revise_infomation.to_page_start_va;
- int topage=gc_cache_revise_infomation.to_page_index;
- int oldpage = gc_cache_revise_infomation.orig_page_index;
- int * newtable=&gccachesamplingtbl_r[topage];
- int * oldtable=&gccachesamplingtbl[oldpage];
-
- for(int tt = 0; tt < NUMCORESACTIVE; tt++) {
- (*newtable) = ((*newtable)+(*oldtable)*tmp_factor);
- newtable=(int*) (((char *)newtable)+size_cachesamplingtbl_local_r);
- oldtable=(int*) (((char *)oldtable)+size_cachesamplingtbl_local);
- }
-#endif // GC_CACHE_ADAPT
- // if no objs have been compact, do nothing,
- // otherwise, fill the header of this block
- if(to->offset > BAMBOO_CACHE_LINE_SIZE) {
- BAMBOO_MEMSET_WH(to->base, '\0', BAMBOO_CACHE_LINE_SIZE);
- (*((int*)(to->base))) = to->offset;
- } else {
- to->offset = 0;
- to->ptr = to->base;
- to->top -= BAMBOO_CACHE_LINE_SIZE;
- } // if(to->offset > BAMBOO_CACHE_LINE_SIZE) else ...
- if(*localcompact) {
- *heaptopptr = to->ptr;
- *filledblocks = to->numblocks;
- }
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe102);
- BAMBOO_DEBUGPRINT_REG(orig->ptr);
- BAMBOO_DEBUGPRINT_REG(gcmarkedptrbound);
- BAMBOO_DEBUGPRINT_REG(*heaptopptr);
- BAMBOO_DEBUGPRINT_REG(*filledblocks);
- BAMBOO_DEBUGPRINT_REG(gccurr_heaptop);
-#endif
-
- // send msgs to core coordinator indicating that the compact is finishing
- // send compact finish message to core coordinator
- if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
- gcfilledblocks[BAMBOO_NUM_OF_CORE] = *filledblocks;
- gcloads[BAMBOO_NUM_OF_CORE] = *heaptopptr;
- if(orig->ptr < gcmarkedptrbound) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe103);
-#endif
- // ask for more mem
- gctomove = false;
- BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
- if(gcfindSpareMem_I(&gcmovestartaddr, &gcblock2fill, &gcdstcore,
- gccurr_heaptop, BAMBOO_NUM_OF_CORE)) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe104);
-#endif
- gctomove = true;
- } else {
- BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe105);
-#endif
- return false;
- }
- BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
- } else {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe106);
-#endif
- gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
- gctomove = false;
- return true;
- }
- } else {
- if(orig->ptr < gcmarkedptrbound) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe107);
-#endif
- // ask for more mem
- gctomove = false;
- send_msg_5(STARTUPCORE, GCFINISHCOMPACT, BAMBOO_NUM_OF_CORE,
- *filledblocks, *heaptopptr, gccurr_heaptop, false);
- } else {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe108);
- BAMBOO_DEBUGPRINT_REG(*heaptopptr);
-#endif
- // finish compacting
- send_msg_5(STARTUPCORE, GCFINISHCOMPACT, BAMBOO_NUM_OF_CORE,
- *filledblocks, *heaptopptr, 0, false);
- }
- } // if(STARTUPCORE == BAMBOO_NUM_OF_CORE)
-
- if(orig->ptr < gcmarkedptrbound) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe109);
-#endif
- // still have unpacked obj
- while(true) {
- if(gctomove) {
- break;
- }
- }
- ;
- gctomove = false;
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe10a);
-#endif
-
- to->ptr = gcmovestartaddr;
- to->numblocks = gcblock2fill - 1;
- to->bound = (to->numblocks==0) ?
- BAMBOO_SMEM_SIZE_L :
- BAMBOO_SMEM_SIZE_L+BAMBOO_SMEM_SIZE*to->numblocks;
- BASEPTR(gcdstcore, to->numblocks, &(to->base));
- to->offset = to->ptr - to->base;
- to->top = (to->numblocks==0) ?
- (to->offset) : (to->bound-BAMBOO_SMEM_SIZE+to->offset);
- to->base = to->ptr;
- to->offset = BAMBOO_CACHE_LINE_SIZE;
- to->ptr += to->offset; // for header
- to->top += to->offset;
- if(gcdstcore == BAMBOO_NUM_OF_CORE) {
- *localcompact = true;
- } else {
- *localcompact = false;
- }
-#ifdef GC_CACHE_ADAPT
- // initialize the gc_cache_revise_information
- gc_cache_revise_infomation.to_page_start_va = to->ptr;
- gc_cache_revise_infomation.to_page_end_va = gcbaseva +
- (BAMBOO_PAGE_SIZE)*((to->base-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
- gc_cache_revise_infomation.to_page_index =
- (to->base-gcbaseva)/(BAMBOO_PAGE_SIZE);
- gc_cache_revise_infomation.orig_page_start_va = orig->ptr;
- gc_cache_revise_infomation.orig_page_end_va = gcbaseva +
- (BAMBOO_PAGE_SIZE)*((orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
- gc_cache_revise_infomation.orig_page_index =
- (orig->blockbase-gcbaseva)/(BAMBOO_PAGE_SIZE);
-#endif // GC_CACHE_ADAPT
- goto innercompact;
- }
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe10b);
-#endif
- return true;
-} // void compacthelper()
-
-inline void compact() {
- if(COMPACTPHASE != gcphase) {
- BAMBOO_EXIT(0xb102);
- }
-
- // initialize pointers for comapcting
- struct moveHelper * orig =
- (struct moveHelper *)RUNMALLOC(sizeof(struct moveHelper));
- struct moveHelper * to =
- (struct moveHelper *)RUNMALLOC(sizeof(struct moveHelper));
-
- if(!initOrig_Dst(orig, to)) {
- // no available data to compact
- // send compact finish msg to STARTUP core
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe001);
- BAMBOO_DEBUGPRINT_REG(to->base);
-#endif
- send_msg_5(STARTUPCORE, GCFINISHCOMPACT, BAMBOO_NUM_OF_CORE,
- 0, to->base, 0, false);
- RUNFREE(orig);
- RUNFREE(to);
- return;
- }
-#ifdef GC_CACHE_ADAPT
- gc_cache_revise_infomation.orig_page_start_va = orig->ptr;
- gc_cache_revise_infomation.orig_page_end_va = gcbaseva +
- (BAMBOO_PAGE_SIZE)*((orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
- gc_cache_revise_infomation.orig_page_index =
- (orig->blockbase-gcbaseva)/(BAMBOO_PAGE_SIZE);
-#endif // GC_CACHE_ADAPT
-
- int filledblocks = 0;
- INTPTR heaptopptr = 0;
- bool localcompact = true;
- compacthelper(orig, to, &filledblocks, &heaptopptr, &localcompact);
-
- RUNFREE(orig);
- RUNFREE(to);
-} // compact()
-
-// if return NULL, means
-// 1. objptr is NULL
-// 2. objptr is not a shared obj
-// in these cases, remain the original value is OK
-inline void * flushObj(void * objptr) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe401);
-#endif
- if(objptr == NULL) {
- return NULL;
- }
- void * dstptr = NULL;
- if(ISSHAREDOBJ(objptr)) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe402);
- BAMBOO_DEBUGPRINT_REG(objptr);
-#endif
- // a shared obj ptr, change to new address
- BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
-#ifdef GC_PROFILE
- //unsigned long long ttime = BAMBOO_GET_EXE_TIME();
-#endif
-#ifdef LOCALHASHTBL_TEST
- RuntimeHashget(gcpointertbl, objptr, &dstptr);
-#else
- dstptr = mgchashSearch(gcpointertbl, objptr);
-#endif
- //MGCHashget(gcpointertbl, objptr, &dstptr);
-#ifdef GC_PROFILE
- //flushstalltime += BAMBOO_GET_EXE_TIME()-ttime;
-#endif
- BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT_REG(dstptr);
-#endif
-
- if(NULL == dstptr) {
- // no mapping info
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe403);
- BAMBOO_DEBUGPRINT_REG(objptr);
- BAMBOO_DEBUGPRINT_REG(hostcore(objptr));
-#endif
- if(hostcore(objptr) == BAMBOO_NUM_OF_CORE) {
- // error! the obj is right on this core, but cannot find it
- //BAMBOO_DEBUGPRINT(0xecec);
- BAMBOO_DEBUGPRINT_REG(objptr);
- BAMBOO_EXIT(0xb103);
- // assume that the obj has not been moved, use the original address
- //dstptr = objptr;
- } else {
- int hostc = hostcore(objptr);
-#ifdef GC_PROFILE
- //unsigned long long ttimet = BAMBOO_GET_EXE_TIME();
-#endif
- // check the corresponsing sharedptbl
- BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
- //struct GCSharedHash * sptbl = gcrpointertbls[hostcore(objptr)];
- mgcsharedhashtbl_t * sptbl = gcrpointertbls[hostc];
- if(sptbl != NULL) {
- //GCSharedHashget(sptbl, (int)objptr, &dstptr);
- dstptr = mgcsharedhashSearch(sptbl, (int)objptr);
- if(dstptr != NULL) {
-#ifdef LOCALHASHTBL_TEST
- RuntimeHashadd_I(gcpointertbl, (int)objptr, (int)dstptr);
-#else
- mgchashInsert_I(gcpointertbl, (int)objptr, (int)dstptr);
-#endif
- }
- }
- BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-#ifdef GC_PROFILE
- //flushstalltime_i += BAMBOO_GET_EXE_TIME()-ttimet;
-#endif
-
- if(dstptr == NULL) {
- // still can not get the mapping info,
- // send msg to host core for the mapping info
- gcobj2map = (int)objptr;
- gcismapped = false;
- gcmappedobj = NULL;
- // the first time require the mapping, send msg to the hostcore
- // for the mapping info
- send_msg_3(hostc, GCMAPREQUEST, (int)objptr,
- BAMBOO_NUM_OF_CORE, false);
- while(true) {
- if(gcismapped) {
- break;
- }
- }
-#ifdef GC_PROFILE
- //flushstalltime_i += BAMBOO_GET_EXE_TIME()-ttimet;
-#endif
- BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
-#ifdef LOCALHASHTBL_TEST
- RuntimeHashget(gcpointertbl, objptr, &dstptr);
-#else
- dstptr = mgchashSearch(gcpointertbl, objptr);
-#endif
- //MGCHashget(gcpointertbl, objptr, &dstptr);
- BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
- } // if(dstptr == NULL)
- } // if(hostcore(objptr) == BAMBOO_NUM_OF_CORE) else ...
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT_REG(dstptr);
-#endif
- } // if(NULL == dstptr)
- } // if(ISSHAREDOBJ(objptr))
- // if not a shared obj, return NULL to indicate no need to flush
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe404);
-#endif
- return dstptr;
-} // void flushObj(void * objptr)
-
-inline void flushRuntimeObj(struct garbagelist * stackptr) {
- int i,j;
- // flush current stack
- while(stackptr!=NULL) {
- for(i=0; i<stackptr->size; i++) {
- if(stackptr->array[i] != NULL) {
- void * dst = flushObj(stackptr->array[i]);
- if(dst != NULL) {
- stackptr->array[i] = dst;
- }
- }
- }
- stackptr=stackptr->next;
- }
-
- // flush objectsets
- if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) {
- for(i=0; i<NUMCLASSES; i++) {
- struct parameterwrapper ** queues =
- objectqueues[BAMBOO_NUM_OF_CORE][i];
- int length = numqueues[BAMBOO_NUM_OF_CORE][i];
- for(j = 0; j < length; ++j) {
- struct parameterwrapper * parameter = queues[j];
- struct ObjectHash * set=parameter->objectset;
- struct ObjectNode * ptr=set->listhead;
- while(ptr!=NULL) {
- void * dst = flushObj((void *)ptr->key);
- if(dst != NULL) {
- ptr->key = dst;
- }
- ptr=ptr->lnext;
- }
- ObjectHashrehash(set);
- }
- }
- }
- // flush current task descriptor
- if(currtpd != NULL) {
- for(i=0; i<currtpd->numParameters; i++) {
- void * dst = flushObj(currtpd->parameterArray[i]);
- if(dst != NULL) {
- currtpd->parameterArray[i] = dst;
- }
- }
- }
-
- // flush active tasks
- if(activetasks != NULL) {
- struct genpointerlist * ptr=activetasks->list;
- while(ptr!=NULL) {
- struct taskparamdescriptor *tpd=ptr->src;
- int i;
- for(i=0; i<tpd->numParameters; i++) {
- void * dst = flushObj(tpd->parameterArray[i]);
- if(dst != NULL) {
- tpd->parameterArray[i] = dst;
- }
- }
- ptr=ptr->inext;
- }
- genrehash(activetasks);
- }
-
- // flush cached transferred obj
- struct QueueItem * tmpobjptr = getHead(&objqueue);
- while(tmpobjptr != NULL) {
- struct transObjInfo * objInfo =
- (struct transObjInfo *)(tmpobjptr->objectptr);
- void * dst = flushObj(objInfo->objptr);
- if(dst != NULL) {
- objInfo->objptr = dst;
- }
- tmpobjptr = getNextQueueItem(tmpobjptr);
- }
-
- // flush cached objs to be transferred
- struct QueueItem * item = getHead(totransobjqueue);
- while(item != NULL) {
- struct transObjInfo * totransobj =
- (struct transObjInfo *)(item->objectptr);
- void * dst = flushObj(totransobj->objptr);
- if(dst != NULL) {
- totransobj->objptr = dst;
- }
- item = getNextQueueItem(item);
- } // while(item != NULL)
-
- // enqueue lock related info
- for(i = 0; i < runtime_locklen; ++i) {
- void * dst = flushObj(runtime_locks[i].redirectlock);
- if(dst != NULL) {
- runtime_locks[i].redirectlock = (int)dst;
- }
- if(runtime_locks[i].value != NULL) {
- void * dst=flushObj(runtime_locks[i].value);
- if(dst != NULL) {
- runtime_locks[i].value = (int)dst;
- }
- }
- }
-
-} // void flushRuntimeObj(struct garbagelist * stackptr)
-
-inline void transmappinginfo() {
- // broadcast the sharedptbl pointer
- for(int i = 0; i < NUMCORESACTIVE; i++) {
- if(i != BAMBOO_NUM_OF_CORE) {
- send_msg_3(i, GCMAPTBL, gcsharedptbl, BAMBOO_NUM_OF_CORE, false);
- }
- }
-
- if(STARTUPCORE != BAMBOO_NUM_OF_CORE) {
- send_msg_2(STARTUPCORE, GCFINISHMAPINFO, BAMBOO_NUM_OF_CORE, false);
- }
+void master_updaterefs(struct garbagelist * stackptr) {
+ gc_status_info.gcphase = UPDATEPHASE;
+ GC_SEND_MSG_1_TO_CLIENT(GCSTARTUPDATE);
+ GC_PRINTF("Start update phase \n");
+ // update phase
+ update(stackptr);
+ GC_CHECK_ALL_CORE_STATUS();
+ GC_PRINTF("Finish update phase \n");
}
-inline void flush(struct garbagelist * stackptr) {
-
- flushRuntimeObj(stackptr);
-
- while(true) {
- BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
- bool hasItems = gc_moreItems_I();
- BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
- if(!hasItems) {
- break;
- }
-
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe301);
-#endif
- BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
- void * ptr = gc_dequeue_I();
- BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
- if(ISSHAREDOBJ(ptr)) {
- // should be a local shared obj and should have mapping info
- ptr = flushObj(ptr);
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe302);
- BAMBOO_DEBUGPRINT_REG(ptr);
- BAMBOO_DEBUGPRINT_REG(tptr);
- BAMBOO_DEBUGPRINT_REG(((int *)(tptr))[0]);
-#endif
- if(ptr == NULL) {
- BAMBOO_EXIT(0xb105);
- }
- } // if(ISSHAREDOBJ(ptr))
- if((!ISSHAREDOBJ(ptr)) || (((int *)(ptr))[6] == COMPACTED)) {
- int type = ((int *)(ptr))[0];
- // scan all pointers in ptr
- unsigned INTPTR * pointer;
- pointer=pointerarray[type];
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe303);
- BAMBOO_DEBUGPRINT_REG(pointer);
-#endif
- if (pointer==0) {
- /* Array of primitives */
- /* Do nothing */
- } else if (((INTPTR)pointer)==1) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe304);
-#endif
- /* Array of pointers */
- struct ArrayObject *ao=(struct ArrayObject *) ptr;
- int length=ao->___length___;
- int j;
- for(j=0; j<length; j++) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe305);
-#endif
- void *objptr=
- ((void **)(((char *)&ao->___length___)+sizeof(int)))[j];
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT_REG(objptr);
-#endif
- if(objptr != NULL) {
- void * dst = flushObj(objptr);
- if(dst != NULL) {
- ((void **)(((char *)&ao->___length___)+sizeof(int)))[j] = dst;
- }
- }
- }
- } else {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe306);
-#endif
- INTPTR size=pointer[0];
- int i;
- for(i=1; i<=size; i++) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe307);
-#endif
- unsigned int offset=pointer[i];
- void * objptr=*((void **)(((char *)ptr)+offset));
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT_REG(objptr);
-#endif
- if(objptr != NULL) {
- void * dst = flushObj(objptr);
- if(dst != NULL) {
- *((void **)(((char *)ptr)+offset)) = dst;
- }
- }
- } // for(i=1; i<=size; i++)
- } // if (pointer==0) else if (((INTPTR)pointer)==1) else ()
- // restore the mark field, indicating that this obj has been flushed
- if(ISSHAREDOBJ(ptr)) {
- ((int *)(ptr))[6] = INIT;
- }
- } // if((!ISSHAREDOBJ(ptr)) || (((int *)(ptr))[6] == COMPACTED))
- } // while(gc_moreItems())
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe308);
-#endif
-
- // TODO bug here: the startup core contains all lobjs' info, thus all the
- // lobjs are flushed in sequence.
- // flush lobjs
- while(gc_lobjmoreItems_I()) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe309);
-#endif
- void * ptr = gc_lobjdequeue_I(NULL, NULL);
- ptr = flushObj(ptr);
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe30a);
- BAMBOO_DEBUGPRINT_REG(ptr);
- BAMBOO_DEBUGPRINT_REG(tptr);
- BAMBOO_DEBUGPRINT_REG(((int *)(tptr))[0]);
-#endif
- if(ptr == NULL) {
- BAMBOO_EXIT(0xb106);
- }
- if(((int *)(ptr))[6] == COMPACTED) {
- int type = ((int *)(ptr))[0];
- // scan all pointers in ptr
- unsigned INTPTR * pointer;
- pointer=pointerarray[type];
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe30b);
- BAMBOO_DEBUGPRINT_REG(pointer);
-#endif
- if (pointer==0) {
- /* Array of primitives */
- /* Do nothing */
- } else if (((INTPTR)pointer)==1) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe30c);
-#endif
- /* Array of pointers */
- struct ArrayObject *ao=(struct ArrayObject *) ptr;
- int length=ao->___length___;
- int j;
- for(j=0; j<length; j++) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe30d);
-#endif
- void *objptr=
- ((void **)(((char *)&ao->___length___)+sizeof(int)))[j];
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT_REG(objptr);
-#endif
- if(objptr != NULL) {
- void * dst = flushObj(objptr);
- if(dst != NULL) {
- ((void **)(((char *)&ao->___length___)+sizeof(int)))[j] = dst;
- }
- }
- }
- } else {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe30e);
-#endif
- INTPTR size=pointer[0];
- int i;
- for(i=1; i<=size; i++) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe30f);
-#endif
- unsigned int offset=pointer[i];
- void * objptr=*((void **)(((char *)ptr)+offset));
-
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT_REG(objptr);
-#endif
- if(objptr != NULL) {
- void * dst = flushObj(objptr);
- if(dst != NULL) {
- *((void **)(((char *)ptr)+offset)) = dst;
- }
- }
- } // for(i=1; i<=size; i++)
- } // if (pointer==0) else if (((INTPTR)pointer)==1) else ()
- // restore the mark field, indicating that this obj has been flushed
- ((int *)(ptr))[6] = INIT;
- } // if(((int *)(ptr))[6] == COMPACTED)
- } // while(gc_lobjmoreItems())
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe310);
-#endif
-
- // send flush finish message to core coordinator
- if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
- gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
- } else {
- send_msg_2(STARTUPCORE, GCFINISHFLUSH, BAMBOO_NUM_OF_CORE, false);
- }
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe311);
-#endif
-} // flush()
-
-#ifdef GC_CACHE_ADAPT
-// prepare for cache adaption:
-// -- flush the shared heap
-// -- clean dtlb entries
-// -- change cache strategy
-void cacheAdapt_gc(bool isgccachestage) {
- // flush the shared heap
- BAMBOO_CACHE_FLUSH_L2();
-
- // clean the dtlb entries
- BAMBOO_CLEAN_DTLB();
-
- // change the cache strategy
- gccachestage = isgccachestage;
-} // cacheAdapt_gc(bool isgccachestage)
-
-// the master core decides how to adapt cache strategy for the mutator
-// according to collected statistic data
-
-// make all pages hfh
-int cacheAdapt_policy_h4h(){
- unsigned int page_index = 0;
- VA page_sva = 0;
- unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
- int numchanged = 0;
- int * tmp_p = gccachepolicytbl+1;
- for(page_index = 0; page_index < page_num; page_index++) {
- page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
- bamboo_cache_policy_t policy = {0};
- policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
- *tmp_p = page_index;
- tmp_p++;
- *tmp_p = policy.word;
- tmp_p++;
- numchanged++;
- }
-
- return numchanged;
-} // int cacheAdapt_policy_hfh()
-
-// make all pages local as non-cache-adaptable gc local mode
-int cacheAdapt_policy_local(){
- unsigned int page_index = 0;
- VA page_sva = 0;
- unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
- int numchanged = 0;
- int * tmp_p = gccachepolicytbl+1;
- for(page_index = 0; page_index < page_num; page_index++) {
- page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
- bamboo_cache_policy_t policy = {0};
- int block = 0;
- BLOCKINDEX(page_sva, &block);
- int coren = gc_block2core[block%(NUMCORES4GC*2)];
- // locally cache the page in the hotest core
- // NOTE: (x,y) should be changed to (x+1, y+1)!!!
- policy.cache_mode = BAMBOO_CACHE_MODE_COORDS;
- policy.lotar_x = bamboo_cpu2coords[2*coren]+1;
- policy.lotar_y = bamboo_cpu2coords[2*coren+1]+1;
- *tmp_p = page_index;
- tmp_p++;
- *tmp_p = policy.word;
- tmp_p++;
- numchanged++;
- }
-
- return numchanged;
-} // int cacheAdapt_policy_local()
-
-int cacheAdapt_policy_hotest(){
- unsigned int page_index = 0;
- VA page_sva = 0;
- unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
- int numchanged = 0;
- int * tmp_p = gccachepolicytbl+1;
- for(page_index = 0; page_index < page_num; page_index++) {
- page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
- bamboo_cache_policy_t policy = {0};
- int hotestcore = 0;
- int hotfreq = 0;
-
- int *local_tbl=&gccachesamplingtbl_r[page_index];
- for(int i = 0; i < NUMCORESACTIVE; i++) {
- int freq = *local_tbl;
- local_tbl=(int *)(((char *)local_tbl)+size_cachesamplingtbl_local_r);
-
- // check the freqency, decide if this page is hot for the core
- if(hotfreq < freq) {
- hotfreq = freq;
- hotestcore = i;
- }
- }
- // TODO
- // Decide the cache strategy for this page
- // If decide to adapt a new cache strategy, write into the shared block of
- // the gcsharedsamplingtbl. The mem recording information that has been
- // written is enough to hold the information.
- // Format: page start va + cache strategy(hfh/(host core+[x,y]))
- if(hotfreq == 0) {
- // this page has not been accessed, do not change its cache policy
- continue;
- } else {
- // locally cache the page in the hotest core
- // NOTE: (x,y) should be changed to (x+1, y+1)!!!
- policy.cache_mode = BAMBOO_CACHE_MODE_COORDS;
- policy.lotar_x = bamboo_cpu2coords[2*hotestcore]+1;
- policy.lotar_y = bamboo_cpu2coords[2*hotestcore+1]+1;
- *tmp_p = page_index;
- tmp_p++;
- *tmp_p = policy.word;
- tmp_p++;
- numchanged++;
- }
- }
-
- return numchanged;
-} // int cacheAdapt_policy_hotest()
-
-#define GC_CACHE_ADAPT_DOMINATE_THRESHOLD 50
-// cache the page on the core that accesses it the most if that core accesses
-// it more than (GC_CACHE_ADAPT_DOMINATE_THRESHOLD)% of the total. Otherwise,
-// h4h the page.
-int cacheAdapt_policy_dominate(){
- unsigned int page_index = 0;
- VA page_sva = 0;
- unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
- int numchanged = 0;
- int * tmp_p = gccachepolicytbl+1;
- for(page_index = 0; page_index < page_num; page_index++) {
- page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
- bamboo_cache_policy_t policy = {0};
- int hotestcore = 0;
- int totalfreq = 0;
- int hotfreq = 0;
-
- int *local_tbl=&gccachesamplingtbl_r[page_index];
- for(int i = 0; i < NUMCORESACTIVE; i++) {
- int freq = *local_tbl;
- local_tbl=(int *)(((char *)local_tbl)+size_cachesamplingtbl_local_r);
- totalfreq += freq;
- // TODO
- // check the freqency, decide if this page is hot for the core
- if(hotfreq < freq) {
- hotfreq = freq;
- hotestcore = i;
- }
- }
-
- // Decide the cache strategy for this page
- // If decide to adapt a new cache strategy, write into the shared block of
- // the gcpolicytbl
- // Format: page start va + cache policy
- if(hotfreq == 0) {
- // this page has not been accessed, do not change its cache policy
- continue;
- }
- totalfreq = (totalfreq*GC_CACHE_ADAPT_DOMINATE_THRESHOLD)/100/BAMBOO_PAGE_SIZE;
- if(hotfreq < totalfreq) {
- // use hfh
- policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
- } else {
- // locally cache the page in the hotest core
- // NOTE: (x,y) should be changed to (x+1, y+1)!!!
- policy.cache_mode = BAMBOO_CACHE_MODE_COORDS;
- policy.lotar_x = bamboo_cpu2coords[2*hotestcore]+1;
- policy.lotar_y = bamboo_cpu2coords[2*hotestcore+1]+1;
- }
- *tmp_p = page_index;
- tmp_p++;
- *tmp_p = policy.word;
- tmp_p++;
- numchanged++;
- }
-
- return numchanged;
-} // int cacheAdapt_policy_dominate()
-
-#define GC_CACHE_ADAPT_OVERLOAD_THRESHOLD 20000
-
-void gc_quicksort(int *array,
- int left,
- int right,
- int offset) {
- int pivot = 0;;
- int leftIdx = left;
- int rightIdx = right;
- if((right-left+1) >= 1) {
- pivot = (left+right)/2;
- while((leftIdx <= pivot) && (rightIdx >= pivot)) {
- int pivotValue = array[pivot*3-offset];
- while((array[leftIdx*3-offset] > pivotValue) && (leftIdx <= pivot)) {
- leftIdx++;
- }
- while((array[rightIdx*3-offset] < pivotValue) && (rightIdx >= pivot)) {
- rightIdx--;
- }
- // swap [leftIdx] & [rightIdx]
- for(int k = 0; k < 3; k++) {
- int tmp = array[3*rightIdx-k];
- array[3*rightIdx-k] = array[3*leftIdx-k];
- array[3*leftIdx-k] = tmp;
- }
- leftIdx++;
- rightIdx--;
- if((leftIdx-1) == pivot) {
- pivot = rightIdx = rightIdx + 1;
- } else if((leftIdx+1) == pivot) {
- pivot = leftIdx = leftIdx-1;
- }
- }
- gc_quicksort(array, left, pivot-1, offset);
- gc_quicksort(array, pivot+1, right, offset);
- }
- return;
-} // void gc_quicksort(...)
-
-// Every page cached on the core that accesses it the most.
-// Check to see if any core's pages total more accesses than threshold
-// GC_CACHE_ADAPT_OVERLOAD_THRESHOLD. If so, find the pages with the
-// most remote accesses and hash for home them until we get below
-// GC_CACHE_ADAPT_OVERLOAD_THRESHOLD
-int cacheAdapt_policy_overload(){
- unsigned int page_index = 0;
- VA page_sva = 0;
- unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
- int numchanged = 0;
- int * tmp_p = gccachepolicytbl+1;
- unsigned long long workload[NUMCORESACTIVE];
- memset(workload, 0, NUMCORESACTIVE*sizeof(unsigned long long));
- unsigned long long total_workload = 0;
- int core2heavypages[NUMCORESACTIVE][page_num*3+1];
- memset(core2heavypages, 0, sizeof(int)*(page_num*3+1)*NUMCORESACTIVE);
- for(page_index = 0; page_index < page_num; page_index++) {
- page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
- bamboo_cache_policy_t policy = {0};
- int hotestcore = 0;
- int totalfreq = 0;
- int hotfreq = 0;
-
- int *local_tbl=&gccachesamplingtbl_r[page_index];
- for(int i = 0; i < NUMCORESACTIVE; i++) {
- int freq = *local_tbl;
- local_tbl=(int *)(((char *)local_tbl)+size_cachesamplingtbl_local_r);
- totalfreq += freq;
- // TODO
- // check the freqency, decide if this page is hot for the core
- if(hotfreq < freq) {
- hotfreq = freq;
- hotestcore = i;
- }
- // TODO
- /*if(page_sva == 0x10e90000) {
- if(freq != 0) tprintf("0x10e90000 core %d, %d\n", i, freq);
- }*/
- }
- // TODO
- // Decide the cache strategy for this page
- // If decide to adapt a new cache strategy, write into the shared block of
- // the gcsharedsamplingtbl. The mem recording information that has been
- // written is enough to hold the information.
- // Format: page start va + cache strategy(hfh/(host core+[x,y]))
- if(hotfreq == 0) {
- // this page has not been accessed, do not change its cache policy
- continue;
- }
-
- totalfreq/=BAMBOO_PAGE_SIZE;
- // locally cache the page in the hotest core
- // NOTE: (x,y) should be changed to (x+1, y+1)!!!
- policy.cache_mode = BAMBOO_CACHE_MODE_COORDS;
- policy.lotar_x = bamboo_cpu2coords[2*hotestcore]+1;
- policy.lotar_y = bamboo_cpu2coords[2*hotestcore+1]+1;
- *tmp_p = page_index;
- tmp_p++;
- *tmp_p = policy.word;
- tmp_p++;
- numchanged++;
- workload[hotestcore] += totalfreq;
- total_workload += totalfreq;
- // insert into core2heavypages using quicksort
- int remoteaccess = totalfreq - hotfreq;
- int index = core2heavypages[hotestcore][0];
- core2heavypages[hotestcore][3*index+3] = remoteaccess;
- core2heavypages[hotestcore][3*index+2] = totalfreq;
- core2heavypages[hotestcore][3*index+1] = tmp_p-1;
- core2heavypages[hotestcore][0]++;
- // TODO
- /*if(page_sva == 0x10f10000) {
- int block = 0;
- BLOCKINDEX(page_sva, &block);
- int coren = gc_block2core[block%(NUMCORES4GC*2)];
- int coord_x = bamboo_cpu2coords[2*coren]+1;
- int coord_y = bamboo_cpu2coords[2*coren+1]+1;
- tprintf("+++ %x(%d-%d,%d) hotcore %d, total %d, hot %d, remote %d, index %d p %x\n", (int)page_sva, coren, coord_x, coord_y, hotestcore, totalfreq, hotfreq, remoteaccess, index, (int)(tmp_p-1));
- }*/
- }
-
- int workload_threshold = total_workload / 10;
- // Check the workload of each core
- for(int i = 0; i < NUMCORESACTIVE; i++) {
- int j = 1;
- int index = core2heavypages[i][0];
- if(workload[i] > workload_threshold/*GC_CACHE_ADAPT_OVERLOAD_THRESHOLD*/) {
- // sort according to the remoteaccess
- gc_quicksort(&core2heavypages[i][0], 1, index, 0);
- while((workload[i] > workload_threshold/*GC_CACHE_ADAPT_OVERLOAD_THRESHOLD*/) && (j<index*3)) {
- // hfh those pages with more remote accesses
- bamboo_cache_policy_t policy = {0};
- policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
- *((int*)core2heavypages[i][j]) = policy.word;
- workload[i] -= core2heavypages[i][j+1];
- j += 3;
- }
- }
- }
-
- return numchanged;
-} // int cacheAdapt_policy_overload()
-
-#define GC_CACHE_ADAPT_ACCESS_THRESHOLD 70
-#define GC_CACHE_ADAPT_CROWD_THRESHOLD 20
-// Every page cached on the core that accesses it the most.
-// Check to see if any core's pages total more accesses than threshold
-// GC_CACHE_ADAPT_OVERLOAD_THRESHOLD. If so, find the pages with the
-// most remote accesses and hash for home them until we get below
-// GC_CACHE_ADAPT_OVERLOAD_THRESHOLD.
-// Sort pages based on activity....
-// If more then GC_CACHE_ADAPT_ACCESS_THRESHOLD% of the accesses for a
-// core's pages are from more than GC_CACHE_ADAPT_CROWD_THRESHOLD pages,
-// then start hfh these pages(selecting the ones with the most remote
-// accesses first or fewest local accesses) until we get below
-// GC_CACHE_ADAPT_CROWD_THRESHOLD pages.
-int cacheAdapt_policy_crowd(){
- unsigned int page_index = 0;
- VA page_sva = 0;
- unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
- int numchanged = 0;
- int * tmp_p = gccachepolicytbl+1;
- unsigned long long workload[NUMCORESACTIVE];
- memset(workload, 0, NUMCORESACTIVE*sizeof(unsigned long long));
- unsigned long long total_workload = 0;
- int core2heavypages[NUMCORESACTIVE][page_num*3+1];
- memset(core2heavypages, 0, sizeof(int)*(page_num*3+1)*NUMCORESACTIVE);
- for(page_index = 0; page_index < page_num; page_index++) {
- page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
- bamboo_cache_policy_t policy = {0};
- int hotestcore = 0;
- int totalfreq = 0;
- int hotfreq = 0;
-
- int *local_tbl=&gccachesamplingtbl_r[page_index];
- for(int i = 0; i < NUMCORESACTIVE; i++) {
- int freq = *local_tbl;
- local_tbl=(int *)(((char *)local_tbl)+size_cachesamplingtbl_local_r);
- totalfreq += freq;
- // TODO
- // check the freqency, decide if this page is hot for the core
- if(hotfreq < freq) {
- hotfreq = freq;
- hotestcore = i;
- }
- // TODO
- /*if(page_sva == 0x10e90000) {
- if(freq != 0) tprintf("0x10e90000 core %d, %d\n", i, freq);
- }*/
- }
- // TODO
- // Decide the cache strategy for this page
- // If decide to adapt a new cache strategy, write into the shared block of
- // the gcsharedsamplingtbl. The mem recording information that has been
- // written is enough to hold the information.
- // Format: page start va + cache strategy(hfh/(host core+[x,y]))
- if(hotfreq == 0) {
- // this page has not been accessed, do not change its cache policy
- continue;
- }
- totalfreq/=BAMBOO_PAGE_SIZE;
- // locally cache the page in the hotest core
- // NOTE: (x,y) should be changed to (x+1, y+1)!!!
- policy.cache_mode = BAMBOO_CACHE_MODE_COORDS;
- policy.lotar_x = bamboo_cpu2coords[2*hotestcore]+1;
- policy.lotar_y = bamboo_cpu2coords[2*hotestcore+1]+1;
- *tmp_p = page_index;
- tmp_p++;
- *tmp_p = policy.word;
- tmp_p++;
- numchanged++;
- workload[hotestcore] += totalfreq;
- total_workload += totalfreq;
- // insert into core2heavypages using quicksort
- int remoteaccess = totalfreq - hotfreq;
- int index = core2heavypages[hotestcore][0];
- core2heavypages[hotestcore][3*index+3] = remoteaccess;
- core2heavypages[hotestcore][3*index+2] = totalfreq;
- core2heavypages[hotestcore][3*index+1] = tmp_p-1;
- core2heavypages[hotestcore][0]++;
- // TODO
- /*if(page_sva == 0x10f10000) {
- int block = 0;
- BLOCKINDEX(page_sva, &block);
- int coren = gc_block2core[block%(NUMCORES4GC*2)];
- int coord_x = bamboo_cpu2coords[2*coren]+1;
- int coord_y = bamboo_cpu2coords[2*coren+1]+1;
- tprintf("+++ %x(%d-%d,%d) hotcore %d, total %d, hot %d, remote %d, index %d p %x\n", (int)page_sva, coren, coord_x, coord_y, hotestcore, totalfreq, hotfreq, remoteaccess, index, (int)(tmp_p-1));
- }*/
- }
-
- int workload_threshold = total_workload / 10;
- // Check the workload of each core
- for(int i = 0; i < NUMCORESACTIVE; i++) {
- int j = 1;
- int index = core2heavypages[i][0];
- if(workload[i] > workload_threshold/*GC_CACHE_ADAPT_OVERLOAD_THRESHOLD*/) {
- // sort according to the remoteaccess
- gc_quicksort(&core2heavypages[i][0], 1, index, 0);
- while((workload[i] > workload_threshold/*GC_CACHE_ADAPT_OVERLOAD_THRESHOLD*/) && (j<index*3)) {
- // hfh those pages with more remote accesses
- bamboo_cache_policy_t policy = {0};
- policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
- *((int*)core2heavypages[i][j]) = policy.word;
- workload[i] -= core2heavypages[i][j+1];
- j += 3;
- }
- }
-
- // Check if the accesses are crowded on few pages
- // sort according to the total access
-inner_crowd:
- gc_quicksort(&core2heavypages[i][0], j/3+1, index, 1);
- int threshold = GC_CACHE_ADAPT_ACCESS_THRESHOLD*workload[i]/100;
- int num_crowded = 0;
- int t_workload = 0;
- do {
- t_workload += core2heavypages[i][j+num_crowded*3+1];
- num_crowded++;
- } while(t_workload < threshold);
- // num_crowded <= GC_CACHE_ADAPT_CROWD_THRESHOLD and if there are enough
- // items, it is always == GC_CACHE_ADAPT_CROWD_THRESHOLD
- if(num_crowded > GC_CACHE_ADAPT_CROWD_THRESHOLD) {
-//inner_crowd:
- // need to hfh these pages
- // sort the pages according to remote access
- gc_quicksort(&core2heavypages[i][0], j/3+1, j/3+num_crowded, 0);
- //while((num_crowded--) && (j < index*3)) {
- // h4h those pages with more remote accesses
- bamboo_cache_policy_t policy = {0};
- policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
- *((int*)core2heavypages[i][j]) = policy.word;
- workload[i] -= core2heavypages[i][j+1];
- t_workload -= core2heavypages[i][j+1];
- /*if((j/3+GC_CACHE_ADAPT_CROWD_THRESHOLD) < index) {
- t_workload +=
- core2heavypages[i][j+GC_CACHE_ADAPT_CROWD_THRESHOLD*3+1];
- }*/
- j += 3;
- threshold = GC_CACHE_ADAPT_ACCESS_THRESHOLD*workload[i]/100;
- /*if(t_workload <= threshold) {
- break;
- }
- }
- if((j < index*3) && (t_workload > threshold)) {
- num_crowded = ((index-j/3) > GC_CACHE_ADAPT_CROWD_THRESHOLD) ?
- (GC_CACHE_ADAPT_CROWD_THRESHOLD) : (index-j/3);*/
- goto inner_crowd;
-// }
- }
- }
-
- return numchanged;
-} // int cacheAdapt_policy_overload()
-
-void cacheAdapt_master() {
-#ifdef GC_CACHE_ADAPT
- //gc_output_cache_sampling_r();
-#endif // GC_CACHE_ADAPT
- int numchanged = 0;
- // check the statistic data
- // for each page, decide the new cache strategy
- numchanged = cacheAdapt_policy_h4h();
- //numchanged = cacheAdapt_policy_local();
- //numchanged = cacheAdapt_policy_hotest();
- //numchanged = cacheAdapt_policy_dominate();
- //numchanged = cacheAdapt_policy_overload();
- //numchanged = cacheAdapt_policy_crowd();
- *gccachepolicytbl = numchanged;
- // TODO
- //if(numchanged > 0) tprintf("=================\n");
-}
-
-// adapt the cache strategy for the mutator
-void cacheAdapt_mutator() {
- int numchanged = *gccachepolicytbl;
- // check the changes and adapt them
- int * tmp_p = gccachepolicytbl+1;
- while(numchanged--) {
- // read out the policy
- int page_index = *tmp_p;
- bamboo_cache_policy_t policy = (bamboo_cache_policy_t)(*(tmp_p+1));
- // TODO
- /*if(BAMBOO_NUM_OF_CORE == 0) {
- tprintf("va: %x, policy: %d (%d,%d) \n",
- (int)(page_index*(BAMBOO_PAGE_SIZE)+gcbaseva), policy.cache_mode,
- policy.lotar_x, policy.lotar_y);
- }*/
- // adapt the policy
- bamboo_adapt_cache_policy(page_index*(BAMBOO_PAGE_SIZE)+gcbaseva,
- policy, BAMBOO_PAGE_SIZE);
-
- tmp_p += 2;
- }
- //if(BAMBOO_NUM_OF_CORE == 0) tprintf("=================\n"); // TODO
-}
-
-void gc_output_cache_sampling() {
- unsigned int page_index = 0;
- VA page_sva = 0;
- unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
- for(page_index = 0; page_index < page_num; page_index++) {
- page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
- int block = 0;
- BLOCKINDEX(page_sva, &block);
- int coren = gc_block2core[block%(NUMCORES4GC*2)];
- tprintf("va: %x page_index: %d host: %d\n",
- (int)page_sva, page_index, coren);
- for(int i = 0; i < NUMCORESACTIVE; i++) {
- int * local_tbl = (int *)((void *)gccachesamplingtbl
- +size_cachesamplingtbl_local*i);
- int freq = local_tbl[page_index]/BAMBOO_PAGE_SIZE;
- printf("%8d ",freq);
- }
- printf("\n");
- }
- printf("=================\n");
-} // gc_output_cache_sampling
-
-void gc_output_cache_sampling_r() {
- unsigned int page_index = 0;
- VA page_sva = 0;
- unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
- for(page_index = 0; page_index < page_num; page_index++) {
- page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
- int block = 0;
- BLOCKINDEX(page_sva, &block);
- int coren = gc_block2core[block%(NUMCORES4GC*2)];
- tprintf("va: %x page_index: %d host: %d\n",
- (int)page_sva, page_index, coren);
- for(int i = 0; i < NUMCORESACTIVE; i++) {
- int * local_tbl = (int *)((void *)gccachesamplingtbl_r
- +size_cachesamplingtbl_local_r*i);
- int freq = local_tbl[page_index]/BAMBOO_PAGE_SIZE;
- printf("%8d ",freq);
- }
- printf("\n");
- }
- printf("=================\n");
-} // gc_output_cache_sampling
-#endif // GC_CACHE_ADAPT
-
-inline void gc_collect(struct garbagelist * stackptr) {
- // inform the master that this core is at a gc safe point and is ready to
- // do gc
- send_msg_4(STARTUPCORE, GCFINISHPRE, BAMBOO_NUM_OF_CORE, self_numsendobjs,
- self_numreceiveobjs, false);
-
- // core collector routine
- while(true) {
- if(INITPHASE == gcphase) {
- break;
- }
- }
-#ifdef RAWPATH // TODO GC_DEBUG
- printf("(%X,%X) Do initGC\n", udn_tile_coord_x(), udn_tile_coord_y());
-#endif
- initGC();
-#ifdef GC_CACHE_ADAPT
- // prepare for cache adaption:
- cacheAdapt_gc(true);
-#endif // GC_CACHE_ADAPT
- //send init finish msg to core coordinator
- send_msg_2(STARTUPCORE, GCFINISHINIT, BAMBOO_NUM_OF_CORE, false);
-
- while(true) {
- if(MARKPHASE == gcphase) {
- break;
- }
- }
-#ifdef RAWPATH // TODO GC_DEBUG
- printf("(%x,%x) Start mark phase\n", udn_tile_coord_x(),
- udn_tile_coord_y());
-#endif
- mark(true, stackptr);
-#ifdef RAWPATH // TODO GC_DEBUG
- printf("(%x,%x) Finish mark phase, start compact phase\n",
- udn_tile_coord_x(), udn_tile_coord_y());
-#endif
- compact();
-#ifdef RAWPATH // TODO GC_DEBUG
- printf("(%x,%x) Finish compact phase\n", udn_tile_coord_x(),
- udn_tile_coord_y());
-#endif
-
- while(true) {
- if(MAPPHASE == gcphase) {
- break;
- }
- }
-#ifdef RAWPATH // TODO GC_DEBUG
- printf("(%x,%x) Start map phase\n", udn_tile_coord_x(),
- udn_tile_coord_y());
-#endif
- transmappinginfo();
-#ifdef RAWPATH // TODO GC_DEBUG
- printf("(%x,%x) Finish map phase\n", udn_tile_coord_x(),
- udn_tile_coord_y());
-#endif
-
- while(true) {
- if(FLUSHPHASE == gcphase) {
- break;
- }
- }
-#ifdef RAWPATH // TODO GC_DEBUG
- printf("(%x,%x) Start flush phase\n", udn_tile_coord_x(),
- udn_tile_coord_y());
-#endif
-#ifdef GC_PROFILE
- // send the num of obj/liveobj/forwardobj to the startupcore
- if(STARTUPCORE != BAMBOO_NUM_OF_CORE) {
- send_msg_4(STARTUPCORE, GCPROFILES, gc_num_obj,
- gc_num_liveobj, gc_num_forwardobj, false);
- }
- gc_num_obj = 0;
-#endif // GC_PROFLIE
- flush(stackptr);
-#ifdef RAWPATH // TODO GC_DEBUG
- printf("(%x,%x) Finish flush phase\n", udn_tile_coord_x(),
- udn_tile_coord_y());
-#endif
-
-#ifdef GC_CACHE_ADAPT
- while(true) {
- if(PREFINISHPHASE == gcphase) {
- break;
- }
- }
-#ifdef RAWPATH // TODO GC_DEBUG
- printf("(%x,%x) Start prefinish phase\n", udn_tile_coord_x(),
- udn_tile_coord_y());
-#endif
- // cache adapt phase
- cacheAdapt_mutator();
- cacheAdapt_gc(false);
- //send init finish msg to core coordinator
- send_msg_2(STARTUPCORE, GCFINISHPREF, BAMBOO_NUM_OF_CORE, false);
-#ifdef RAWPATH // TODO GC_DEBUG
- printf("(%x,%x) Finish prefinish phase\n", udn_tile_coord_x(),
- udn_tile_coord_y());
-#endif
-#endif // GC_CACHE_ADAPT
-
- while(true) {
- if(FINISHPHASE == gcphase) {
- break;
- }
- }
-#ifdef RAWPATH // TODO GC_DEBUG
- printf("(%x,%x) Finish gc!\n", udn_tile_coord_x(), udn_tile_coord_y());
-#endif
-} // void gc_collect(struct garbagelist * stackptr)
-
-inline void gc_nocollect(struct garbagelist * stackptr) {
- // inform the master that this core is at a gc safe point and is ready to
- // do gc
- send_msg_4(STARTUPCORE, GCFINISHPRE, BAMBOO_NUM_OF_CORE, self_numsendobjs,
- self_numreceiveobjs, false);
+void master_finish() {
+ gc_status_info.gcphase = FINISHPHASE;
- while(true) {
- if(INITPHASE == gcphase) {
- break;
- }
- }
-#ifdef RAWPATH // TODO GC_DEBUG
- printf("(%x,%x) Do initGC\n", udn_tile_coord_x(), udn_tile_coord_y());
-#endif
- initGC();
-#ifdef GC_CACHE_ADAPT
- // prepare for cache adaption:
- cacheAdapt_gc(true);
-#endif // GC_CACHE_ADAPT
- //send init finish msg to core coordinator
- send_msg_2(STARTUPCORE, GCFINISHINIT, BAMBOO_NUM_OF_CORE, false);
-
- while(true) {
- if(MARKPHASE == gcphase) {
- break;
- }
- }
-#ifdef RAWPATH // TODO GC_DEBUG
- printf("(%x,%x) Start mark phase\n", udn_tile_coord_x(),
- udn_tile_coord_y());
-#endif
- mark(true, stackptr);
-#ifdef RAWPATH // TODO GC_DEBUG
- printf("(%x,%x) Finish mark phase, wait for flush\n",
- udn_tile_coord_x(), udn_tile_coord_y());
-#endif
-
- // non-gc core collector routine
- while(true) {
- if(FLUSHPHASE == gcphase) {
- break;
- }
- }
-#ifdef RAWPATH // TODO GC_DEBUG
- printf("(%x,%x) Start flush phase\n", udn_tile_coord_x(),
- udn_tile_coord_y());
-#endif
-#ifdef GC_PROFILE
- if(STARTUPCORE != BAMBOO_NUM_OF_CORE) {
- send_msg_4(STARTUPCORE, GCPROFILES, gc_num_obj,
- gc_num_liveobj, gc_num_forwardobj, false);
- }
- gc_num_obj = 0;
-#endif // GC_PROFLIE
- flush(stackptr);
-#ifdef RAWPATH // TODO GC_DEBUG
- printf("(%x,%x) Finish flush phase\n", udn_tile_coord_x(),
- udn_tile_coord_y());
-#endif
+ // invalidate all shared mem pointers
+ // put it here as it takes time to inform all the other cores to
+ // finish gc and it might cause problems when some core resumes the
+ // mutator earlier than the other cores
+ bamboo_cur_msp = NULL;
+ bamboo_smem_size = 0;
+ bamboo_smem_zero_top = NULL;
+
+ GCPROFILE_END_MASTER();
+ unsigned long long tmpt = BAMBOO_GET_EXE_TIME();
+ CACHEADAPT_OUTPUT_CACHE_POLICY();
+ gc_output_cache_policy_time += (BAMBOO_GET_EXE_TIME()-tmpt);
+ gcflag = false;
-#ifdef GC_CACHE_ADAPT
- while(true) {
- if(PREFINISHPHASE == gcphase) {
- break;
- }
- }
-#ifdef RAWPATH // TODO GC_DEBUG
- printf("(%x,%x) Start prefinish phase\n", udn_tile_coord_x(),
- udn_tile_coord_y());
-#endif
- // cache adapt phase
- cacheAdapt_mutator();
- cacheAdapt_gc(false);
- //send init finish msg to core coordinator
- send_msg_2(STARTUPCORE, GCFINISHPREF, BAMBOO_NUM_OF_CORE, false);
-#ifdef RAWPATH // TODO GC_DEBUG
- printf("(%x,%x) Finish prefinish phase\n", udn_tile_coord_x(),
- udn_tile_coord_y());
-#endif
-#endif // GC_CACHE_ADAPT
+ GC_SEND_MSG_1_TO_CLIENT(GCFINISH);
+ gc_status_info.gcprocessing = false;
- while(true) {
- if(FINISHPHASE == gcphase) {
- break;
+ if(gcflag) {
+ // inform other cores to stop and wait for gc
+ GC_PRINTF("Back to Back gc case\n");
+ gcprecheck = true;
+ for(int i = 0; i < NUMCORESACTIVE; i++) {
+ // reuse the gcnumsendobjs & gcnumreceiveobjs
+ gcnumsendobjs[0][i] = 0;
+ gcnumreceiveobjs[0][i] = 0;
}
+ GC_SEND_MSG_1_TO_CLIENT(GCSTARTPRE);
}
-#ifdef RAWPATH // TODO GC_DEBUG
- printf("(%x,%x) Finish gc!\n", udn_tile_coord_x(), udn_tile_coord_y());
-#endif
-} // void gc_collect(struct garbagelist * stackptr)
+}
-inline void gc_master(struct garbagelist * stackptr) {
+void gc_master(struct garbagelist * stackptr) {
+ tprintf("start GC!\n");
+ gc_status_info.gcprocessing = true;
+ gc_status_info.gcphase = INITPHASE;
- gcphase = INITPHASE;
- int i = 0;
waitconfirm = false;
numconfirm = 0;
initGC();
-
- // Note: all cores need to init gc including non-gc cores
- for(i = 1; i < NUMCORESACTIVE /*NUMCORES4GC*/; i++) {
- // send GC init messages to all cores
- send_msg_1(i, GCSTARTINIT, false);
+ GC_SEND_MSG_1_TO_CLIENT(GCSTARTINIT);
+ CACHEADAPT_GC(true);
+ //tprintf("Check core status \n");
+ GC_CHECK_ALL_CORE_STATUS();
+ GCPROFILE_ITEM_MASTER();
+ unsigned long long tmpt = BAMBOO_GET_EXE_TIME();
+ CACHEADAPT_OUTPUT_CACHE_SAMPLING();
+ gc_output_cache_policy_time += (BAMBOO_GET_EXE_TIME()-tmpt);
+ //tprintf("start mark phase\n");
+ // do mark phase
+ master_mark(stackptr);
+ GCPROFILE_ITEM_MASTER();
+ //tprintf("finish mark phase\n");
+ // get large objects from all cores
+ master_getlargeobjs();
+ //tprintf("start compact phase\n");
+ // compact the heap
+ master_compact();
+ //tprintf("start update phase\n");
+ // update the references
+ master_updaterefs(stackptr);
+ //tprintf("gc master finished update \n");
+ // do cache adaptation
+ CACHEADAPT_PHASE_MASTER();
+ //tprintf("finish cacheadapt phase\n");
+ // do finish up stuff
+#ifdef GC_DEBUG
+ for(int i=0;i<GCNUMBLOCK;i++) {
+ struct blockrecord *record=&allocationinfo.blocktable[i];
+ tprintf("%u. used=%u free=%u corenum=%u status=%u, base=%x, ptr=%x\n", i, record->usedspace, record->freespace, record->corenum, record->status, gcbaseva+OFFSET2BASEVA(i), (gcbaseva+OFFSET2BASEVA(i)+record->usedspace));
}
- bool isfirst = true;
- bool allStall = false;
-
-#ifdef GC_CACHE_ADAPT
- // prepare for cache adaption:
- cacheAdapt_gc(true);
-#endif // GC_CACHE_ADAPT
-
-#ifdef RAWPATH // TODO GC_DEBUG
- printf("(%x,%x) Check core status \n", udn_tile_coord_x(),
- udn_tile_coord_y());
#endif
- gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
- while(true) {
- BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
- if(gc_checkAllCoreStatus_I()) {
- BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
- break;
- }
- BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
- }
-#ifdef GC_PROFILE
- gc_profileItem();
-#endif
-#ifdef GC_CACHE_ADAPT
- //gc_output_cache_sampling();
-#endif // GC_CACHE_ADAPT
-#ifdef RAWPATH // TODO GC_DEBUG
- printf("(%x,%x) Start mark phase \n", udn_tile_coord_x(),
- udn_tile_coord_y());
-#endif
- // all cores have finished compacting
- // restore the gcstatus of all cores
- // Note: all cores have to do mark including non-gc cores
- gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
- for(i = 1; i < NUMCORESACTIVE /*NUMCORES4GC*/; ++i) {
- gccorestatus[i] = 1;
- // send GC start messages to all cores
- send_msg_1(i, GCSTART, false);
- }
+ master_finish();
- gcphase = MARKPHASE;
- // mark phase
- while(MARKPHASE == gcphase) {
- mark(isfirst, stackptr);
- if(isfirst) {
- isfirst = false;
- }
+ //tprintf("finish GC ! %d \n",gcflag);
+}
- // check gcstatus
- checkMarkStatue();
- } // while(MARKPHASE == gcphase)
- // send msgs to all cores requiring large objs info
- // Note: only need to ask gc cores, non-gc cores do not host any objs
- numconfirm = NUMCORES4GC - 1;
- for(i = 1; i < NUMCORES4GC; ++i) {
- send_msg_1(i, GCLOBJREQUEST, false);
- }
- gcloads[BAMBOO_NUM_OF_CORE] = gccurr_heaptop;
+void pregccheck() {
while(true) {
- if(numconfirm==0) {
- break;
- }
- } // wait for responses
- // check the heaptop
- if(gcheaptop < gcmarkedptrbound) {
- gcheaptop = gcmarkedptrbound;
- }
-#ifdef GC_PROFILE
- gc_profileItem();
-#endif
-#ifdef RAWPATH // TODO GC_DEBUG
- printf("(%x,%x) prepare to cache large objs \n", udn_tile_coord_x(),
- udn_tile_coord_y());
- //dumpSMem();
-#endif
- // cache all large objs
- if(!cacheLObjs()) {
- // no enough space to cache large objs
- BAMBOO_EXIT(0xb107);
- }
- // predict number of blocks to fill for each core
- int tmpheaptop = 0;
- int numpbc = loadbalance(&tmpheaptop);
- // TODO
- numpbc = (BAMBOO_SHARED_MEM_SIZE)/(BAMBOO_SMEM_SIZE);
-#ifdef RAWPATH // TODO GC_DEBUG
- printf("(%x,%x) mark phase finished \n", udn_tile_coord_x(),
- udn_tile_coord_y());
- //dumpSMem();
-#endif
- //int tmptopptr = 0;
- //BASEPTR(gctopcore, 0, &tmptopptr);
- // TODO
- //tmptopptr = gcbaseva + (BAMBOO_SHARED_MEM_SIZE);
- tmpheaptop = gcbaseva + (BAMBOO_SHARED_MEM_SIZE);
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xabab);
- BAMBOO_DEBUGPRINT_REG(tmptopptr);
-#endif
- for(i = 0; i < NUMCORES4GC; ++i) {
- int tmpcoreptr = 0;
- BASEPTR(i, numpbc, &tmpcoreptr);
- //send start compact messages to all cores
- //TODO bug here, do not know if the direction is positive or negtive?
- if (tmpcoreptr < tmpheaptop /*tmptopptr*/) {
- gcstopblock[i] = numpbc + 1;
- if(i != STARTUPCORE) {
- send_msg_2(i, GCSTARTCOMPACT, numpbc+1, false);
- } else {
- gcblock2fill = numpbc+1;
- } // if(i != STARTUPCORE)
- } else {
- gcstopblock[i] = numpbc;
- if(i != STARTUPCORE) {
- send_msg_2(i, GCSTARTCOMPACT, numpbc, false);
- } else {
- gcblock2fill = numpbc;
- } // if(i != STARTUPCORE)
- }
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xf000+i);
- BAMBOO_DEBUGPRINT_REG(tmpcoreptr);
- BAMBOO_DEBUGPRINT_REG(gcstopblock[i]);
-#endif
- // init some data strutures for compact phase
- gcloads[i] = 0;
- gcfilledblocks[i] = 0;
- gcrequiredmems[i] = 0;
- }
-
- BAMBOO_CACHE_MF();
-
-#ifdef GC_PROFILE
- gc_profileItem();
-#endif
-
- // compact phase
- bool finalcompact = false;
- // initialize pointers for comapcting
- struct moveHelper * orig =
- (struct moveHelper *)RUNMALLOC(sizeof(struct moveHelper));
- struct moveHelper * to =
- (struct moveHelper *)RUNMALLOC(sizeof(struct moveHelper));
- initOrig_Dst(orig, to);
- int filledblocks = 0;
- INTPTR heaptopptr = 0;
- bool finishcompact = false;
- bool iscontinue = true;
- bool localcompact = true;
- while((COMPACTPHASE == gcphase) || (SUBTLECOMPACTPHASE == gcphase)) {
- if((!finishcompact) && iscontinue) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe001);
- BAMBOO_DEBUGPRINT_REG(numpbc);
- BAMBOO_DEBUGPRINT_REG(gcblock2fill);
-#endif
- finishcompact = compacthelper(orig, to, &filledblocks,
- &heaptopptr, &localcompact);
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe002);
- BAMBOO_DEBUGPRINT_REG(finishcompact);
- BAMBOO_DEBUGPRINT_REG(gctomove);
- BAMBOO_DEBUGPRINT_REG(gcrequiredmems[0]);
- BAMBOO_DEBUGPRINT_REG(gcfilledblocks[0]);
- BAMBOO_DEBUGPRINT_REG(gcstopblock[0]);
-#endif
- }
-
- BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
- if(gc_checkCoreStatus_I()) {
- // all cores have finished compacting
- // restore the gcstatus of all cores
- for(i = 0; i < NUMCORES4GC; ++i) {
- gccorestatus[i] = 1;
- }
- BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
- break;
- } else {
- BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
- // check if there are spare mem for pending move requires
- if(COMPACTPHASE == gcphase) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe003);
-#endif
- resolvePendingMoveRequest();
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT_REG(gctomove);
-#endif
- } else {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe004);
-#endif
- compact2Heaptop();
- }
- } // if(gc_checkCoreStatus_I()) else ...
-
- if(gctomove) {
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xe005);
- BAMBOO_DEBUGPRINT_REG(gcmovestartaddr);
- BAMBOO_DEBUGPRINT_REG(gcblock2fill);
- BAMBOO_DEBUGPRINT_REG(gctomove);
-#endif
- to->ptr = gcmovestartaddr;
- to->numblocks = gcblock2fill - 1;
- to->bound = (to->numblocks==0) ?
- BAMBOO_SMEM_SIZE_L :
- BAMBOO_SMEM_SIZE_L+BAMBOO_SMEM_SIZE*to->numblocks;
- BASEPTR(gcdstcore, to->numblocks, &(to->base));
- to->offset = to->ptr - to->base;
- to->top = (to->numblocks==0) ?
- (to->offset) : (to->bound-BAMBOO_SMEM_SIZE+to->offset);
- to->base = to->ptr;
- to->offset = BAMBOO_CACHE_LINE_SIZE;
- to->ptr += to->offset; // for header
- to->top += to->offset;
- if(gcdstcore == BAMBOO_NUM_OF_CORE) {
- localcompact = true;
- } else {
- localcompact = false;
- }
- gctomove = false;
- iscontinue = true;
- } else if(!finishcompact) {
- // still pending
- iscontinue = false;
- } // if(gctomove)
- } // while(COMPACTPHASE == gcphase)
-#ifdef GC_PROFILE
- gc_profileItem();
-#endif
-#ifdef RAWPATH // TODO GC_DEBUG
- printf("(%x,%x) prepare to move large objs \n", udn_tile_coord_x(),
- udn_tile_coord_y());
- //dumpSMem();
-#endif
- // move largeObjs
- moveLObjs();
-#ifdef RAWPATH // TODO GC_DEBUG
- printf("(%x,%x) compact phase finished \n", udn_tile_coord_x(),
- udn_tile_coord_y());
- //dumpSMem();
-#endif
- RUNFREE(orig);
- RUNFREE(to);
- orig = to = NULL;
-
- gcphase = MAPPHASE;
- gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
- // Note: all cores should flush their runtime data including non-gc
- // cores
- for(i = 1; i < NUMCORES4GC; ++i) {
- // send start flush messages to all cores
- gccorestatus[i] = 1;
- send_msg_1(i, GCSTARTMAPINFO, false);
- }
-#ifdef GC_PROFILE
- gc_profileItem();
-#endif
-#ifdef RAWPATH // TODO GC_DEBUG
- printf("(%x,%x) Start map phase \n", udn_tile_coord_x(),
- udn_tile_coord_y());
-#endif
- // mapinto phase
- transmappinginfo();
-#ifdef RAWPATH // TODO GC_DEBUG
- printf("(%x,%x) Finish map phase \n", udn_tile_coord_x(),
- udn_tile_coord_y());
-#endif
- gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
- while(MAPPHASE == gcphase) {
- // check the status of all cores
- BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
- if(gc_checkCoreStatus_I()) {
- // all cores have finished sending mapping info
- BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
- break;
- }
- BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
- } // while(MAPPHASE == gcphase)
+ BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
+ gcnumsendobjs[0][BAMBOO_NUM_OF_CORE] = self_numsendobjs;
+ gcnumreceiveobjs[0][BAMBOO_NUM_OF_CORE] = self_numreceiveobjs;
+ int sumsendobj = 0;
+ for(int i = 0; i < NUMCORESACTIVE; i++) {
+ sumsendobj += gcnumsendobjs[0][i];
+ }
+ for(int i = 0; i < NUMCORESACTIVE; i++) {
+ sumsendobj -= gcnumreceiveobjs[0][i];
+ }
+ if(0 != sumsendobj) {
+ // there are still some messages in flight; wait until updated
+ // pre-GC information arrives, then check again
+ gcprecheck = false;
+ BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
- gcphase = FLUSHPHASE;
- gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
- // Note: all cores should flush their runtime data including non-gc
- // cores
- for(i = 1; i < NUMCORESACTIVE /*NUMCORES4GC*/; ++i) {
- // send start flush messages to all cores
- gccorestatus[i] = 1;
- send_msg_1(i, GCSTARTFLUSH, false);
+ while(!gcprecheck) ;
+ } else {
+ BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+ return;
+ }
}
-#ifdef GC_PROFILE
- gc_profileItem();
-#endif
-#ifdef RAWPATH // TODO GC_DEBUG
- printf("(%x,%x) Start flush phase \n", udn_tile_coord_x(),
- udn_tile_coord_y());
-#endif
- // flush phase
- flush(stackptr);
- gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
- while(FLUSHPHASE == gcphase) {
- // check the status of all cores
- BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
- if(gc_checkAllCoreStatus_I()) {
- BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
- break;
- }
- BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
- } // while(FLUSHPHASE == gcphase)
-#ifdef RAWPATH // TODO GC_DEBUG
- printf("(%x,%x) Finish flush phase \n", udn_tile_coord_x(),
- udn_tile_coord_y());
-#endif
-
-#ifdef GC_CACHE_ADAPT
- // now the master core need to decide the new cache strategy
- cacheAdapt_master();
+}
- gcphase = PREFINISHPHASE;
- gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
- // Note: all cores should flush their runtime data including non-gc
- // cores
- for(i = 1; i < NUMCORESACTIVE /*NUMCORES4GC*/; ++i) {
- // send start flush messages to all cores
- gccorestatus[i] = 1;
- send_msg_1(i, GCSTARTPREF, false);
- }
-#ifdef GC_PROFILE
- gc_profileItem();
-#endif
-#ifdef RAWPATH // TODO GC_DEBUG
- printf("(%x,%x) Start prefinish phase \n", udn_tile_coord_x(),
- udn_tile_coord_y());
+void pregcprocessing() {
+#if defined(GC_CACHE_ADAPT)&&defined(GC_CACHE_SAMPLING)
+ // disable the timer interrupt
+ bamboo_mask_timer_intr();
+ // get the sampling data
+ bamboo_output_dtlb_sampling();
#endif
- // cache adapt phase
- cacheAdapt_mutator();
-#ifdef GC_CACHE_ADAPT_OUTPUT
- bamboo_output_cache_policy();
-#endif
- cacheAdapt_gc(false);
- gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
- while(PREFINISHPHASE == gcphase) {
- // check the status of all cores
- BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
- if(gc_checkAllCoreStatus_I()) {
- BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
- break;
- }
- BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
- } // while(PREFINISHPHASE == gcphase)
-#endif // GC_CACHE_ADAPT
-
- gcphase = FINISHPHASE;
-
- // invalidate all shared mem pointers
- // put it here as it takes time to inform all the other cores to
- // finish gc and it might cause problem when some core resumes
- // mutator earlier than the other cores
- bamboo_cur_msp = NULL;
- bamboo_smem_size = 0;
- bamboo_smem_zero_top = NULL;
- gcflag = false;
- gcprocessing = false;
+}
-#ifdef GC_PROFILE
- gc_profileEnd();
-#endif
- gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
- for(i = 1; i < NUMCORESACTIVE /*NUMCORES4GC*/; ++i) {
- // send gc finish messages to all cores
- send_msg_1(i, GCFINISH, false);
- gccorestatus[i] = 1;
- }
-#ifdef RAWPATH // TODO GC_DEBUG
- printf("(%x,%x) gc finished \n", udn_tile_coord_x(),
- udn_tile_coord_y());
- //dumpSMem();
+void postgcprocessing() {
+#if defined(GC_CACHE_ADAPT)&&defined(GC_CACHE_SAMPLING)
+ // enable the timer interrupt
+ bamboo_tile_timer_set_next_event(GC_TILE_TIMER_EVENT_SETTING);
+ bamboo_unmask_timer_intr();
+ // turn on sampling again
+ bamboo_dtlb_sampling_init();
#endif
-} // void gc_master(struct garbagelist * stackptr)
+}
-inline bool gc(struct garbagelist * stackptr) {
+bool gc(struct garbagelist * stackptr) {
// check if do gc
if(!gcflag) {
- gcprocessing = false;
+ gc_status_info.gcprocessing = false;
return false;
}
+#ifdef PERFCOUNT
+ profile_start(GC_REGION);
+#endif
// core coordinator routine
if(0 == BAMBOO_NUM_OF_CORE) {
-#ifdef GC_DEBUG
- printf("(%x,%X) Check if can do gc or not\n", udn_tile_coord_x(),
- udn_tile_coord_y());
-#endif
- bool isallstall = true;
- gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
- BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
- int ti = 0;
- for(ti = 0; ti < NUMCORESACTIVE; ++ti) {
- if(gccorestatus[ti] != 0) {
- isallstall = false;
- break;
- }
- }
- if(!isallstall) {
- BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
- // some of the cores are still executing the mutator and did not reach
- // some gc safe point, therefore it is not ready to do gc
- // in case that there are some pregc information msg lost, send a confirm
- // msg to the 'busy' core
- send_msg_1(ti, GCSTARTPRE, false);
- gcflag = true;
- return false;
- } else {
-#ifdef GC_PROFILE
- gc_profileStart();
-#endif
-pregccheck:
- //BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
- gcnumsendobjs[0][BAMBOO_NUM_OF_CORE] = self_numsendobjs;
- gcnumreceiveobjs[0][BAMBOO_NUM_OF_CORE] = self_numreceiveobjs;
- int sumsendobj = 0;
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xec04);
-#endif
- for(int i = 0; i < NUMCORESACTIVE; ++i) {
- sumsendobj += gcnumsendobjs[0][i];
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xf000 + gcnumsendobjs[0][i]);
-#endif
- } // for(i = 1; i < NUMCORESACTIVE; ++i)
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xec05);
- BAMBOO_DEBUGPRINT_REG(sumsendobj);
-#endif
- for(int i = 0; i < NUMCORESACTIVE; ++i) {
- sumsendobj -= gcnumreceiveobjs[0][i];
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xf000 + gcnumreceiveobjs[i]);
-#endif
- } // for(i = 1; i < NUMCORESACTIVE; ++i)
-#ifdef DEBUG
- BAMBOO_DEBUGPRINT(0xec06);
- BAMBOO_DEBUGPRINT_REG(sumsendobj);
-#endif
- if(0 != sumsendobj) {
- // there were still some msgs on the fly, wait until there
- // are some update pregc information coming and check it again
- gcprecheck = false;
- BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
- while(true) {
- if(gcprecheck) {
- break;
- }
- }
- goto pregccheck;
- } else {
- BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
- }
- }
-#ifdef RAWPATH // TODO GC_DEBUG
- printf("(%x,%x) start gc! \n", udn_tile_coord_x(), udn_tile_coord_y());
- //dumpSMem();
-#endif
- // Zero out the remaining bamboo_cur_msp
- // Only zero out the first 4 bytes of the remaining memory
- // Move the operation here because for the GC_CACHE_ADAPT version,
- // we need to make sure during the gcinit phase the shared heap is not
- // touched. Otherwise, there would be problem when adapt the cache
- // strategy.
- if((bamboo_cur_msp != 0)
- && (bamboo_smem_zero_top == bamboo_cur_msp)
- && (bamboo_smem_size > 0)) {
- *((int *)bamboo_cur_msp) = 0;
- }
-#ifdef GC_FLUSH_DTLB
- if(gc_num_flush_dtlb < GC_NUM_FLUSH_DTLB) {
- BAMBOO_CLEAN_DTLB();
- gc_num_flush_dtlb++;
- }
-#endif
-#ifdef GC_CACHE_ADAPT
-#ifdef GC_CACHE_SAMPLING
- // disable the timer interrupt
- bamboo_mask_timer_intr();
- // get the sampling data
- bamboo_output_dtlb_sampling();
-#endif // GC_CACHE_SAMPLING
-#endif // GC_CACHE_ADAPT
- gcprocessing = true;
- gc_master(stackptr);
+ GC_PRINTF("Check if we can do gc or not\n");
+ gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
+
+ //wait for other cores to catch up
+ while(!gc_checkCoreStatus())
+ ;
+
+ pregccheck();
+ GCPROFILE_START_MASTER();
+ GC_PRINTF("start gc! \n");
+ pregcprocessing();
+ gc_master(stackptr);
} else if(BAMBOO_NUM_OF_CORE < NUMCORES4GC) {
- // Zero out the remaining bamboo_cur_msp
- // Only zero out the first 4 bytes of the remaining memory
- // Move the operation here because for the GC_CACHE_ADAPT version,
- // we need to make sure during the gcinit phase the shared heap is not
- // touched. Otherwise, there would be problem when adapt the cache
- // strategy.
- if((bamboo_cur_msp != 0)
- && (bamboo_smem_zero_top == bamboo_cur_msp)
- && (bamboo_smem_size > 0)) {
- *((int *)bamboo_cur_msp) = 0;
- }
-#ifdef GC_FLUSH_DTLB
- if(gc_num_flush_dtlb < GC_NUM_FLUSH_DTLB) {
- BAMBOO_CLEAN_DTLB();
- gc_num_flush_dtlb++;
- }
-#endif
-#ifdef GC_CACHE_ADAPT
-#ifdef GC_CACHE_SAMPLING
- // disable the timer interrupt
- bamboo_mask_timer_intr();
- if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) {
- // get the sampling data
- bamboo_output_dtlb_sampling();
- }
-#endif // GC_CACHE_SAMPLING
-#endif // GC_CACHE_ADAPT
- gcprocessing = true;
+ GC_PRINTF("Core reporting for gc.\n");
+ pregcprocessing();
gc_collect(stackptr);
-
- // invalidate all shared mem pointers
- bamboo_cur_msp = NULL;
- bamboo_smem_size = 0;
- bamboo_smem_zero_top = NULL;
- gcflag = false;
- gcprocessing = false;
} else {
- // Zero out the remaining bamboo_cur_msp
- // Only zero out the first 4 bytes of the remaining memory
- // Move the operation here because for the GC_CACHE_ADAPT version,
- // we need to make sure during the gcinit phase the shared heap is not
- // touched. Otherwise, there would be problem when adapt the cache
- // strategy.
- if((bamboo_cur_msp != 0)
- && (bamboo_smem_zero_top == bamboo_cur_msp)
- && (bamboo_smem_size > 0)) {
- *((int *)bamboo_cur_msp) = 0;
- }
-#ifdef GC_FLUSH_DTLB
- if(gc_num_flush_dtlb < GC_NUM_FLUSH_DTLB) {
- BAMBOO_CLEAN_DTLB();
- gc_num_flush_dtlb++;
- }
-#endif
-#ifdef GC_CACHE_ADAPT
-#ifdef GC_CACHE_SAMPLING
- // disable the timer interrupt
- bamboo_mask_timer_intr();
- if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) {
- // get the sampling data
- bamboo_output_dtlb_sampling();
- }
-#endif // GC_CACHE_SAMPLING
-#endif // GC_CACHE_ADAPT
- // not a gc core, should wait for gcfinish msg
- gcprocessing = true;
+ pregcprocessing();
gc_nocollect(stackptr);
-
- // invalidate all shared mem pointers
- bamboo_cur_msp = NULL;
- bamboo_smem_size = 0;
- bamboo_smem_zero_top = NULL;
- gcflag = false;
- gcprocessing = false;
}
-#ifdef GC_CACHE_ADAPT
-#ifdef GC_CACHE_SAMPLING
- // reset the sampling arrays
- bamboo_dtlb_sampling_reset();
-#endif // GC_CACHE_SAMPLING
- if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) {
- // zero out the gccachesamplingtbl
- BAMBOO_MEMSET_WH(gccachesamplingtbl_local,0,size_cachesamplingtbl_local);
- BAMBOO_MEMSET_WH(gccachesamplingtbl_local_r,0,
- size_cachesamplingtbl_local_r);
- if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
- BAMBOO_MEMSET_WH(gccachepolicytbl,0,size_cachepolicytbl);
- }
- }
-#ifdef GC_CACHE_SAMPLING
- // enable the timer interrupt
- bamboo_tile_timer_set_next_event(GC_TILE_TIMER_EVENT_SETTING);
- bamboo_unmask_timer_intr();
-#endif // GC_CACHE_SAMPLING
-#endif // GC_CACHE_ADAPT
- return true;
-} // void gc(struct garbagelist * stackptr)
-
-#ifdef GC_PROFILE
-inline void gc_profileStart(void) {
- if(!gc_infoOverflow) {
- GCInfo* gcInfo = RUNMALLOC(sizeof(struct gc_info));
- gc_infoArray[gc_infoIndex] = gcInfo;
- gcInfo->index = 1;
- gcInfo->time[0] = BAMBOO_GET_EXE_TIME();
- }
-}
-
-inline void gc_profileItem(void) {
- if(!gc_infoOverflow) {
- GCInfo* gcInfo = gc_infoArray[gc_infoIndex];
- gcInfo->time[gcInfo->index++] = BAMBOO_GET_EXE_TIME();
- }
-}
-
-inline void gc_profileEnd(void) {
- if(!gc_infoOverflow) {
- GCInfo* gcInfo = gc_infoArray[gc_infoIndex];
- gcInfo->time[gcInfo->index++] = BAMBOO_GET_EXE_TIME();
- gcInfo->time[gcInfo->index++] = gc_num_livespace;
- gcInfo->time[gcInfo->index++] = gc_num_freespace;
- gcInfo->time[gcInfo->index++] = gc_num_lobj;
- gcInfo->time[gcInfo->index++] = gc_num_lobjspace;
- gcInfo->time[gcInfo->index++] = gc_num_obj;
- gcInfo->time[gcInfo->index++] = gc_num_liveobj;
- gcInfo->time[gcInfo->index++] = gc_num_forwardobj;
- gc_infoIndex++;
- if(gc_infoIndex == GCINFOLENGTH) {
- gc_infoOverflow = true;
- //taskInfoIndex = 0;
- }
- }
-}
-
-// output the profiling data
-void gc_outputProfileData() {
-/*#ifdef USEIO
- int i,j;
- unsigned long long totalgc = 0;
-
- //printf("Start Time, End Time, Duration\n");
- // output task related info
- for(i = 0; i < gc_infoIndex; i++) {
- GCInfo * gcInfo = gc_infoArray[i];
- unsigned long long tmp = 0;
- for(j = 0; j < gcInfo->index; j++) {
- printf("%lld(%lld), ", gcInfo->time[j], (gcInfo->time[j]-tmp));
- tmp = gcInfo->time[j];
- }
- tmp = (tmp-gcInfo->time[0]);
- printf(" ++ %lld \n", tmp);
- totalgc += tmp;
- }
-
- if(gc_infoOverflow) {
- printf("Caution: gc info overflow!\n");
- }
-
- printf("\n\n total gc time: %lld \n", totalgc);
-#else*/
- int i = 0;
- int j = 0;
- unsigned long long totalgc = 0;
-
-#ifndef BAMBOO_MEMPROF
- BAMBOO_DEBUGPRINT(0xdddd);
-#endif
- // output task related info
- for(i= 0; i < gc_infoIndex; i++) {
- GCInfo * gcInfo = gc_infoArray[i];
-#ifdef BAMBOO_MEMPROF
- unsigned long long tmp=gcInfo->time[gcInfo->index-8]-gcInfo->time[0]; //0;
-#else
- unsigned long long tmp = 0;
- BAMBOO_DEBUGPRINT(0xddda);
- for(j = 0; j < gcInfo->index - 7; j++) {
- BAMBOO_DEBUGPRINT(gcInfo->time[j]);
- BAMBOO_DEBUGPRINT(gcInfo->time[j]-tmp);
- BAMBOO_DEBUGPRINT(0xdddb);
- tmp = gcInfo->time[j];
- }
- tmp = (tmp-gcInfo->time[0]);
- BAMBOO_DEBUGPRINT_REG(tmp);
- BAMBOO_DEBUGPRINT(0xdddc);
- BAMBOO_DEBUGPRINT(gcInfo->time[gcInfo->index - 7]);
- BAMBOO_DEBUGPRINT(gcInfo->time[gcInfo->index - 6]);
- BAMBOO_DEBUGPRINT(gcInfo->time[gcInfo->index - 5]);
- BAMBOO_DEBUGPRINT(gcInfo->time[gcInfo->index - 4]);
- BAMBOO_DEBUGPRINT(gcInfo->time[gcInfo->index - 3]);
- BAMBOO_DEBUGPRINT(gcInfo->time[gcInfo->index - 2]);
- BAMBOO_DEBUGPRINT(gcInfo->time[gcInfo->index - 1]);
- BAMBOO_DEBUGPRINT(0xddde);
+ postgcprocessing();
+#ifdef PERFCOUNT
+ profile_start(APP_REGION);
#endif
- totalgc += tmp;
- }
-#ifndef BAMBOO_MEMPROF
- BAMBOO_DEBUGPRINT(0xdddf);
-#endif
- BAMBOO_DEBUGPRINT_REG(totalgc);
-
- if(gc_infoOverflow) {
- BAMBOO_DEBUGPRINT(0xefee);
- }
-
-#ifndef BAMBOO_MEMPROF
- BAMBOO_DEBUGPRINT(0xeeee);
-#endif
-//#endif
-}
-#endif // #ifdef GC_PROFILE
+ return true;
+}
#endif