From 1ef83e1ffefc9614dd40b1a670562d6291760df4 Mon Sep 17 00:00:00 2001
From: jzhou <jzhou>
Date: Wed, 1 Jun 2011 22:30:01 +0000
Subject: [PATCH] Parallelize the cache adaptation

---
 Robust/src/Runtime/bamboo/multicorecache.c   | 317 +++++++++++--------
 Robust/src/Runtime/bamboo/multicorecache.h   |  16 +-
 Robust/src/Runtime/bamboo/multicoregarbage.c |  10 +-
 Robust/src/Runtime/bamboo/multicoregarbage.h |  12 +-
 Robust/src/Runtime/bamboo/multicoregcflush.c |   3 +
 Robust/src/Runtime/bamboo/multicoregcmark.c  |  18 +-
 Robust/src/Runtime/bamboo/multicoremem.h     |   6 +-
 Robust/src/Runtime/bamboo/multicoremsg.c     |  31 +-
 Robust/src/Runtime/bamboo/multicoremsg.h     |   6 +-
 9 files changed, 259 insertions(+), 160 deletions(-)
diff --git a/Robust/src/Runtime/bamboo/multicorecache.c b/Robust/src/Runtime/bamboo/multicorecache.c
index 9f050024..14b2fc79 100644
--- a/Robust/src/Runtime/bamboo/multicorecache.c
+++ b/Robust/src/Runtime/bamboo/multicorecache.c
@@ -23,9 +23,6 @@ void cacheAdapt_gc(bool isgccachestage) {
 // the master core decides how to adapt cache strategy for the mutator 
 // according to collected statistic data
 
-// compute the start address of page #page_index
-#define CACHEADAPT_PAGE_START_ADDRESS(page_index) \
-  (gcbaseva + (BAMBOO_PAGE_SIZE) * (page_index))
 // find the core that accesses the page #page_index most
 #define CACHEADAPT_FIND_HOTEST_CORE(page_index,hotestcore,hotfreq) \
   { \
@@ -33,9 +30,9 @@ void cacheAdapt_gc(bool isgccachestage) {
     for(int i = 0; i < NUMCORESACTIVE; i++) { \
       int freq = *local_tbl; \
       local_tbl=(int *)(((char *)local_tbl)+size_cachesamplingtbl_local_r); \
-      if(*((unsigned int *)(hotfreq)) < freq) { \
-        *((unsigned int *)(hotfreq)) = freq; \
-        *((unsigned int *)(hotestcore)) = i; \
+      if(hotfreq < freq) { \
+        hotfreq = freq; \
+        hotestcore = i; \
       } \
     } \
   }
@@ -47,10 +44,10 @@ void cacheAdapt_gc(bool isgccachestage) {
     for(int i = 0; i < NUMCORESACTIVE; i++) { \
       int freq = *local_tbl; \
       local_tbl=(int *)(((char *)local_tbl)+size_cachesamplingtbl_local_r); \
-      *((unsigned int *)(totalfreq)) = *((unsigned int *)(totalfreq)) + freq; \
-      if(*((unsigned int *)(hotfreq)) < freq) { \
-        *((unsigned int *)(hotfreq)) = freq; \
-        *((unsigned int *)(hotestcore)) = i; \
+      totalfreq += freq; \
+      if(hotfreq < freq) { \
+        hotfreq = freq; \
+        hotestcore = i; \
       } \
     } \
   }
@@ -63,123 +60,110 @@ void cacheAdapt_gc(bool isgccachestage) {
     (policy).lotar_y = bamboo_cpu2coords[2*(coren)+1]+1; \
   }
 // store the new policy information at tmp_p in gccachepolicytbl
-#define CACHEADAPT_CHANGE_POLICY_4_PAGE(tmp_p,page_index,policy,numchanged) \
+#define CACHEADAPT_CHANGE_POLICY_4_PAGE(tmp_p,page_index,policy) \
   { \
-    *((int*)(tmp_p)) = (page_index); \
-    (tmp_p)++; \
-    *((int*)(tmp_p)) = (policy).word; \
-    (tmp_p)++; \
-    (numchanged)++; \
+    ((int*)(tmp_p))[page_index] = (policy).word; \
   }
 
 // make all pages hfh
-int cacheAdapt_policy_h4h(){
-  unsigned int page_index = 0;
-  VA page_sva = 0;
-  unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
-  unsigned int numchanged = 0;
-  int * tmp_p = gccachepolicytbl+1;
-  for(page_index = 0; page_index < page_num; page_index++) {
-    page_sva = CACHEADAPT_PAGE_START_ADDRESS(page_index);
+void cacheAdapt_policy_h4h(int coren){
+  unsigned int page_num=(BAMBOO_SHARED_MEM_SIZE)/(BAMBOO_PAGE_SIZE);
+  unsigned int page_gap=page_num/NUMCORESACTIVE;
+  unsigned int page_index=page_gap*coren;
+  unsigned int page_index_end=(coren==NUMCORESACTIVE-1)?page_num:(page_index+page_gap);
+  VA page_sva = gcbaseva+(BAMBOO_PAGE_SIZE)*page_index;
+  int * tmp_p = gccachepolicytbl;
+  for(; page_index < page_index_end; page_index++) {
     bamboo_cache_policy_t policy = {0};
     policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
-    CACHEADAPT_CHANGE_POLICY_4_PAGE(tmp_p,page_index,policy,numchanged);
+    CACHEADAPT_CHANGE_POLICY_4_PAGE(tmp_p,page_index,policy);
+    page_sva += BAMBOO_PAGE_SIZE;
   }
-
-  return numchanged;
 } 
 
 // make all pages local as non-cache-adaptable gc local mode
-int cacheAdapt_policy_local(){
-  unsigned int page_index = 0;
-  VA page_sva = 0;
-  unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
-  unsigned int numchanged = 0;
-  int * tmp_p = gccachepolicytbl+1;
-  for(page_index = 0; page_index < page_num; page_index++) {
-    page_sva = CACHEADAPT_PAGE_START_ADDRESS(page_index);
+void cacheAdapt_policy_local(int coren){
+  unsigned int page_num=(BAMBOO_SHARED_MEM_SIZE)/(BAMBOO_PAGE_SIZE);
+  unsigned int page_gap=page_num/NUMCORESACTIVE;
+  unsigned int page_index=page_gap*coren;
+  unsigned int page_index_end=(coren==NUMCORESACTIVE-1)?page_num:(page_index+page_gap);
+  VA page_sva = gcbaseva+(BAMBOO_PAGE_SIZE)*page_index;
+  int * tmp_p = gccachepolicytbl;
+  for(; page_index < page_index_end; page_index++) {
     bamboo_cache_policy_t policy = {0};
     unsigned int block = 0;
     BLOCKINDEX(page_sva, &block);
     unsigned int coren = gc_block2core[block%(NUMCORES4GC*2)];
     CACHEADAPT_POLICY_SET_HOST_CORE(policy, coren);
-    CACHEADAPT_CHANGE_POLICY_4_PAGE(tmp_p,page_index,policy,numchanged);
+    CACHEADAPT_CHANGE_POLICY_4_PAGE(tmp_p,page_index,policy);
+    page_sva += BAMBOO_PAGE_SIZE;
   }
-
-  return numchanged;
 } 
 
-int cacheAdapt_policy_hotest(){
-  unsigned int page_index = 0;
-  VA page_sva = 0;
-  unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
-  unsigned int numchanged = 0;
-  int * tmp_p = gccachepolicytbl+1;
-  for(page_index = 0; page_index < page_num; page_index++) {
-    page_sva = CACHEADAPT_PAGE_START_ADDRESS(page_index);
+void cacheAdapt_policy_hotest(int coren){
+  unsigned int page_num=(BAMBOO_SHARED_MEM_SIZE)/(BAMBOO_PAGE_SIZE);
+  unsigned int page_gap=page_num/NUMCORESACTIVE;
+  unsigned int page_index=page_gap*coren;
+  unsigned int page_index_end=(coren==NUMCORESACTIVE-1)?page_num:(page_index+page_gap);
+  VA page_sva = gcbaseva+(BAMBOO_PAGE_SIZE)*page_index;
+  int * tmp_p = gccachepolicytbl;
+  for(; page_index < page_index_end; page_index++) {
     bamboo_cache_policy_t policy = {0};
     unsigned int hotestcore = 0;
     unsigned int hotfreq = 0;
-    CACHEADAPT_FIND_HOTEST_CORE(page_index,&hotestcore,&hotfreq);
+    CACHEADAPT_FIND_HOTEST_CORE(page_index,hotestcore,hotfreq);
     // TODO
     // Decide the cache strategy for this page
     // If decide to adapt a new cache strategy, write into the shared block of
     // the gcsharedsamplingtbl. The mem recording information that has been 
     // written is enough to hold the information.
     // Format: page start va + cache strategy(hfh/(host core+[x,y]))
-    if(hotfreq == 0) {
-      // this page has not been accessed, do not change its cache policy
-      continue;
-    } else {
+    if(hotfreq != 0) {
       // locally cache the page in the hotest core
       CACHEADAPT_POLICY_SET_HOST_CORE(policy, hotestcore);
-      CACHEADAPT_CHANGE_POLICY_4_PAGE(tmp_p,page_index,policy,numchanged);
     }
+    CACHEADAPT_CHANGE_POLICY_4_PAGE(tmp_p,page_index,policy);
+    page_sva += BAMBOO_PAGE_SIZE;
   }
-
-  return numchanged;
 } 
 
-#define GC_CACHE_ADAPT_DOMINATE_THRESHOLD  50
+#define GC_CACHE_ADAPT_DOMINATE_THRESHOLD  64
 // cache the page on the core that accesses it the most if that core accesses 
 // it more than (GC_CACHE_ADAPT_DOMINATE_THRESHOLD)% of the total.  Otherwise,
 // h4h the page.
-int cacheAdapt_policy_dominate(){
-  unsigned int page_index = 0;
-  VA page_sva = 0;
-  unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
-  unsigned int numchanged = 0;
-  int * tmp_p = gccachepolicytbl+1;
-  for(page_index = 0; page_index < page_num; page_index++) {
-    page_sva = CACHEADAPT_PAGE_START_ADDRESS(page_index);
+void cacheAdapt_policy_dominate(int coren){
+  unsigned int page_num=(BAMBOO_SHARED_MEM_SIZE)/(BAMBOO_PAGE_SIZE);
+  unsigned int page_gap=page_num/NUMCORESACTIVE;
+  unsigned int page_index=page_gap*coren;
+  unsigned int page_index_end=(coren==NUMCORESACTIVE-1)?page_num:(page_index+page_gap);
+  VA page_sva = gcbaseva+(BAMBOO_PAGE_SIZE)*page_index;
+  int * tmp_p = gccachepolicytbl;
+  for(; page_index < page_index_end; page_index++) {
     bamboo_cache_policy_t policy = {0};
     unsigned int hotestcore = 0;
     unsigned long long totalfreq = 0;
     unsigned int hotfreq = 0;
-    CACHEADAPT_FIND_HOTEST_CORE_W_TOTALFREQ(page_index,&hotestcore,&hotfreq,&totalfreq);
+    CACHEADAPT_FIND_HOTEST_CORE_W_TOTALFREQ(page_index,hotestcore,hotfreq,totalfreq);
     // Decide the cache strategy for this page
     // If decide to adapt a new cache strategy, write into the shared block of
     // the gcpolicytbl 
     // Format: page start va + cache policy
-    if(hotfreq == 0) {
-      // this page has not been accessed, do not change its cache policy
-      continue;
+    if(hotfreq != 0) {
+      totalfreq=(totalfreq*GC_CACHE_ADAPT_DOMINATE_THRESHOLD)>>7;
+      if(hotfreq < totalfreq) {
+        // use hfh
+        policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
+      } else {
+        // locally cache the page in the hotest core
+        CACHEADAPT_POLICY_SET_HOST_CORE(policy, hotestcore);
+      }     
     }
-    totalfreq=(totalfreq*GC_CACHE_ADAPT_DOMINATE_THRESHOLD)/100/BAMBOO_PAGE_SIZE;
-    hotfreq/=BAMBOO_PAGE_SIZE;
-    if(hotfreq < totalfreq) {
-      // use hfh
-      policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
-    } else {
-      // locally cache the page in the hotest core
-      CACHEADAPT_POLICY_SET_HOST_CORE(policy, hotestcore);
-    }
-    CACHEADAPT_CHANGE_POLICY_4_PAGE(tmp_p,page_index,policy,numchanged);
+    CACHEADAPT_CHANGE_POLICY_4_PAGE(tmp_p,page_index,policy);
+    page_sva += BAMBOO_PAGE_SIZE;
   }
-
-  return numchanged;
 }
 
+#if 0
 #define GC_CACHE_ADAPT_OVERLOAD_THRESHOLD 10
 // record the worklocad of the hotestcore into core2heavypages
 #define CACHEADAPT_RECORD_PAGE_WORKLOAD(hotestcore,totalfreq,hotfreq,remoteaccess,tmp_p) \
@@ -251,10 +235,10 @@ INLINE int cacheAdapt_h4h_remote_accesses(unsigned long long workload_threshold,
 // GC_CACHE_ADAPT_OVERLOAD_THRESHOLD.  If so, find the pages with the 
 // most remote accesses and hash for home them until we get below 
 // GC_CACHE_ADAPT_OVERLOAD_THRESHOLD
-int cacheAdapt_policy_overload(){
+int cacheAdapt_policy_overload(int coren){
   unsigned int page_index = 0;
-  VA page_sva = 0;
-  unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
+  VA page_sva = gcbaseva;
+  unsigned int page_num = BAMBOO_SHARED_MEM_SIZE/BAMBOO_PAGE_SIZE;
   unsigned int numchanged = 0;
   int * tmp_p = gccachepolicytbl+1;
   unsigned long long workload[NUMCORESACTIVE];
@@ -262,29 +246,26 @@ int cacheAdapt_policy_overload(){
   unsigned long long total_workload = 0;
   unsigned long long core2heavypages[NUMCORESACTIVE][page_num*3+1];
   memset(core2heavypages,0,sizeof(unsigned long long)*(page_num*3+1)*NUMCORESACTIVE);
-  for(page_index = 0; page_index < page_num; page_index++) {
-    page_sva = CACHEADAPT_PAGE_START_ADDRESS(page_index);
+  for(page_index = 0; page_sva < gctopva; page_index++) {
     bamboo_cache_policy_t policy = {0};
     unsigned int hotestcore = 0;
     unsigned long long totalfreq = 0;
     unsigned int hotfreq = 0;
-    CACHEADAPT_FIND_HOTEST_CORE_W_TOTALFREQ(page_index,&hotestcore,&hotfreq,&totalfreq);
+    CACHEADAPT_FIND_HOTEST_CORE_W_TOTALFREQ(page_index,hotestcore,hotfreq,totalfreq);
     // Decide the cache strategy for this page
     // If decide to adapt a new cache strategy, write into the shared block of
     // the gcsharedsamplingtbl. The mem recording information that has been 
     // written is enough to hold the information.
     // Format: page start va + cache strategy(hfh/(host core+[x,y]))
-    if(hotfreq == 0) {
-      // this page has not been accessed, do not change its cache policy
-      continue;
+    if(hotfreq != 0) {
+      totalfreq/=BAMBOO_PAGE_SIZE;
+      hotfreq/=BAMBOO_PAGE_SIZE;
+      // locally cache the page in the hotest core
+      CACHEADAPT_POLICY_SET_HOST_CORE(policy, hotestcore);
+      CACHEADAPT_CHANGE_POLICY_4_PAGE(tmp_p,page_index,policy,numchanged);
+      CACHEADAPT_RECORD_PAGE_WORKLOAD(hotestcore,totalfreq,hotfreq,remoteaccess,tmp_p);    
     }
-
-    totalfreq/=BAMBOO_PAGE_SIZE;
-    hotfreq/=BAMBOO_PAGE_SIZE;
-    // locally cache the page in the hotest core
-    CACHEADAPT_POLICY_SET_HOST_CORE(policy, hotestcore);
-    CACHEADAPT_CHANGE_POLICY_4_PAGE(tmp_p,page_index,policy,numchanged);
-    CACHEADAPT_RECORD_PAGE_WORKLOAD(hotestcore,totalfreq,hotfreq,remoteaccess,tmp_p);    
+    page_sva += BAMBOO_PAGE_SIZE;
   }
 
   unsigned long long workload_threshold=total_workload/GC_CACHE_ADAPT_OVERLOAD_THRESHOLD;
@@ -309,10 +290,10 @@ int cacheAdapt_policy_overload(){
 // then start hfh these pages(selecting the ones with the most remote 
 // accesses first or fewest local accesses) until we get below 
 // GC_CACHE_ADAPT_CROWD_THRESHOLD pages.
-int cacheAdapt_policy_crowd(){
+int cacheAdapt_policy_crowd(int coren){
   unsigned int page_index = 0;
-  VA page_sva = 0;
-  unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
+  VA page_sva = gcbaseva;
+  unsigned int page_num = BAMBOO_SHARED_MEM_SIZE/BAMBOO_PAGE_SIZE;
   unsigned int numchanged = 0;
   int * tmp_p = gccachepolicytbl+1;
   unsigned long long workload[NUMCORESACTIVE];
@@ -320,28 +301,26 @@ int cacheAdapt_policy_crowd(){
   unsigned long long total_workload = 0;
   unsigned long long core2heavypages[NUMCORESACTIVE][page_num*3+1];
   memset(core2heavypages,0,sizeof(unsigned long long)*(page_num*3+1)*NUMCORESACTIVE);
-  for(page_index = 0; page_index < page_num; page_index++) {
-    page_sva = CACHEADAPT_PAGE_START_ADDRESS(page_index);
+  for(page_index = 0; page_sva < gctopva; page_index++) {
     bamboo_cache_policy_t policy = {0};
     unsigned int hotestcore = 0;
     unsigned long long totalfreq = 0;
     unsigned int hotfreq = 0;
-    CACHEADAPT_FIND_HOTEST_CORE_W_TOTALFREQ(page_index,&hotestcore,&hotfreq,&totalfreq);
+    CACHEADAPT_FIND_HOTEST_CORE_W_TOTALFREQ(page_index,hotestcore,hotfreq,totalfreq);
     // Decide the cache strategy for this page
     // If decide to adapt a new cache strategy, write into the shared block of
     // the gcsharedsamplingtbl. The mem recording information that has been 
     // written is enough to hold the information.
     // Format: page start va + cache strategy(hfh/(host core+[x,y]))
-    if(hotfreq == 0) {
-      // this page has not been accessed, do not change its cache policy
-      continue;
+    if(hotfreq != 0) {
+      totalfreq/=BAMBOO_PAGE_SIZE;
+      hotfreq/=BAMBOO_PAGE_SIZE;
+      // locally cache the page in the hotest core
+      CACHEADAPT_POLICY_SET_HOST_CORE(policy, hotestcore);
+      CACHEADAPT_CHANGE_POLICY_4_PAGE(tmp_p,page_index,policy,numchanged);
+      CACHEADAPT_RECORD_PAGE_WORKLOAD(hotestcore,totalfreq,hotfreq,remoteaccess,tmp_p);
     }
-    totalfreq/=BAMBOO_PAGE_SIZE;
-    hotfreq/=BAMBOO_PAGE_SIZE;
-    // locally cache the page in the hotest core
-    CACHEADAPT_POLICY_SET_HOST_CORE(policy, hotestcore);
-    CACHEADAPT_CHANGE_POLICY_4_PAGE(tmp_p,page_index,policy,numchanged);
-    CACHEADAPT_RECORD_PAGE_WORKLOAD(hotestcore,totalfreq,hotfreq,remoteaccess,tmp_p);
+    page_sva += BAMBOO_PAGE_SIZE;
   }
 
   unsigned long long workload_threshold=total_workload/GC_CACHE_ADAPT_OVERLOAD_THRESHOLD;
@@ -380,46 +359,53 @@ inner_crowd:
 
   return numchanged;
 } 
+#endif
 
-void cacheAdapt_master() {
-  CACHEADAPT_OUTPUT_CACHE_SAMPLING_R();
-  unsigned int numchanged = 0;
+unsigned int cacheAdapt_decision(int coren) {
+  BAMBOO_CACHE_MF();
   // check the statistic data
   // for each page, decide the new cache strategy
 #ifdef GC_CACHE_ADAPT_POLICY1
-  numchanged = cacheAdapt_policy_h4h();
+  cacheAdapt_policy_h4h(coren);
 #elif defined GC_CACHE_ADAPT_POLICY2
-  numchanged = cacheAdapt_policy_local();
+  cacheAdapt_policy_local(coren);
 #elif defined GC_CACHE_ADAPT_POLICY3
-  numchanged = cacheAdapt_policy_hotest();
+  cacheAdapt_policy_hotest(coren);
 #elif defined GC_CACHE_ADAPT_POLICY4
-  numchanged = cacheAdapt_policy_dominate();
-#elif defined GC_CACHE_ADAPT_POLICY5
-  numchanged = cacheAdapt_policy_overload();
-#elif defined GC_CACHE_ADAPT_POLICY6
-  numchanged = cacheAdapt_policy_crowd();
+  cacheAdapt_policy_dominate(coren);
+//#elif defined GC_CACHE_ADAPT_POLICY5
+//  cacheAdapt_policy_overload(coren);
+//#elif defined GC_CACHE_ADAPT_POLICY6
+//  cacheAdapt_policy_crowd(coren);
 #endif
-  *gccachepolicytbl = numchanged;
 }
 
 // adapt the cache strategy for the mutator
 void cacheAdapt_mutator() {
-  int numchanged = *gccachepolicytbl;
+  BAMBOO_CACHE_MF();
   // check the changes and adapt them
-  int * tmp_p = gccachepolicytbl+1;
-  while(numchanged--) {
+  int * tmp_p = gccachepolicytbl;
+  unsigned int page_sva = gcbaseva;
+  for(; page_sva<gctopva; page_sva+=BAMBOO_PAGE_SIZE) {
     // read out the policy
-    int page_index = *tmp_p;
-    bamboo_cache_policy_t policy = (bamboo_cache_policy_t)(*(tmp_p+1));
+    bamboo_cache_policy_t policy = (bamboo_cache_policy_t)(*(tmp_p));
     // adapt the policy
-    bamboo_adapt_cache_policy(page_index*(BAMBOO_PAGE_SIZE)+gcbaseva,policy,BAMBOO_PAGE_SIZE);
-    tmp_p += 2;
+    if(policy.word != 0) {
+      bamboo_adapt_cache_policy(page_sva,policy,BAMBOO_PAGE_SIZE);
+    }
+    tmp_p += 1;
   }
 }
 
 void cacheAdapt_phase_client() {
-  WAITFORGCPHASE(PREFINISHPHASE);
+  WAITFORGCPHASE(CACHEPOLICYPHASE);
+  GC_PRINTF("Start cachepolicy phase\n");
+  cacheAdapt_decision(BAMBOO_NUM_OF_CORE);
+  // send cache-policy finish msg to the core coordinator
+  send_msg_2(STARTUPCORE, GCFINISHCACHEPOLICY, BAMBOO_NUM_OF_CORE);
+  GC_PRINTF("Finish cachepolicy phase\n");
 
+  WAITFORGCPHASE(PREFINISHPHASE);
   GC_PRINTF("Start prefinish phase\n");
   // cache adapt phase
   cacheAdapt_mutator();
@@ -435,18 +421,32 @@ void cacheAdapt_phase_client() {
   }
 }
 
+extern unsigned long long gc_output_cache_policy_time;
+
 void cacheAdapt_phase_master() {
   GCPROFILE_ITEM();
+  unsigned long long tmpt = BAMBOO_GET_EXE_TIME();
+  CACHEADAPT_OUTPUT_CACHE_SAMPLING_R();
+  gc_output_cache_policy_time += (BAMBOO_GET_EXE_TIME()-tmpt);
+  // let all cores process the revised profile data in parallel and decide
+  // the cache policy for each page
+  gc_status_info.gcphase = CACHEPOLICYPHASE;
+  GC_SEND_MSG_1_TO_CLIENT(GCSTARTCACHEPOLICY);
+  GC_PRINTF("Start cachepolicy phase \n");
+  // cache adapt phase
+  cacheAdapt_decision(BAMBOO_NUM_OF_CORE);
+  GC_CHECK_ALL_CORE_STATUS(CACHEPOLICYPHASE==gc_status_info.gcphase);
+  BAMBOO_CACHE_MF();
+
+  // let all cores adopt the new policies
   gc_status_info.gcphase = PREFINISHPHASE;
   // Note: all cores should flush their runtime data including non-gc cores
   GC_SEND_MSG_1_TO_CLIENT(GCSTARTPREF);
   GC_PRINTF("Start prefinish phase \n");
   // cache adapt phase
   cacheAdapt_mutator();
-  CACHEADAPT_OUTPUT_CACHE_POLICY();
   cacheAdapt_gc(false);
-
-  GC_CHECK_ALL_CORE_STATUS(PREFINISHPHASE == gc_status_info.gcphase);
+  GC_CHECK_ALL_CORE_STATUS(PREFINISHPHASE==gc_status_info.gcphase);
 
   CACHEADAPT_SAMPING_RESET();
   if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) {
@@ -466,11 +466,13 @@ void gc_output_cache_sampling() {
     unsigned int block = 0;
     BLOCKINDEX(page_sva, &block);
     unsigned int coren = gc_block2core[block%(NUMCORES4GC*2)];
-    tprintf("va: %x page_index: %d host: %d\n",(int)page_sva,page_index,coren);
+    tprintf("va: %x page_index: %d host: %d\t",(int)page_sva,page_index,coren);
     for(int i = 0; i < NUMCORESACTIVE; i++) {
       int * local_tbl = (int *)((void *)gccachesamplingtbl+size_cachesamplingtbl_local*i);
       int freq = local_tbl[page_index];
-      printf("%8d ",freq);
+      if(freq != 0) {
+        printf("(%d) %d,  ", i, freq);
+      }
     }
     printf("\n");
   }
@@ -478,6 +480,13 @@ void gc_output_cache_sampling() {
 } 
 
 void gc_output_cache_sampling_r() {
+  // TODO summary data
+  unsigned int sumdata[4][NUMCORESACTIVE]; // 0 -- single core accessed
+                                           // 1 -- all cores accessed
+                                           // 2 -- less than 5 cores accessed
+                                           // 3 -- multiple cores(5<=n<all) accessed
+  memset(sumdata, 0, sizeof(sumdata)); // zero the counters (0, not the character '0')
+  tprintf("cache sampling_r \n");
   unsigned int page_index = 0;
   VA page_sva = 0;
   unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
@@ -486,15 +495,49 @@ void gc_output_cache_sampling_r() {
     unsigned int block = 0;
     BLOCKINDEX(page_sva, &block);
     unsigned int coren = gc_block2core[block%(NUMCORES4GC*2)];
-    tprintf("va: %x page_index: %d host: %d\n",(int)page_sva,page_index,coren);
+    tprintf("va: %x page_index: %d host: %d\t",(int)page_sva,page_index,coren);
+    int accesscore = 0; // TODO
     for(int i = 0; i < NUMCORESACTIVE; i++) {
       int * local_tbl = (int *)((void *)gccachesamplingtbl_r+size_cachesamplingtbl_local_r*i);
       int freq = local_tbl[page_index]/BAMBOO_PAGE_SIZE;
-      printf("%8d ",freq);
+      if(freq != 0) {
+        printf("(%d) %d,  ", i, freq);
+        accesscore++;// TODO
+      }
+    }
+    if(accesscore==0) {
+    } else if(accesscore==1) {
+      for(int i = 0; i < NUMCORESACTIVE; i++) {
+        int * local_tbl = (int *)((void *)gccachesamplingtbl_r+size_cachesamplingtbl_local_r*i);
+        int freq = local_tbl[page_index]/BAMBOO_PAGE_SIZE;
+        sumdata[0][i]+=freq;
+      }
+    } else if(accesscore<5) {
+      for(int i = 0; i < NUMCORESACTIVE; i++) {
+        int * local_tbl = (int *)((void *)gccachesamplingtbl_r+size_cachesamplingtbl_local_r*i);
+        int freq = local_tbl[page_index]/BAMBOO_PAGE_SIZE;
+        sumdata[2][i]+=freq;
+      }
+    } else if(accesscore<NUMCORESACTIVE) {
+      for(int i = 0; i < NUMCORESACTIVE; i++) {
+        int * local_tbl = (int *)((void *)gccachesamplingtbl_r+size_cachesamplingtbl_local_r*i);
+        int freq = local_tbl[page_index]/BAMBOO_PAGE_SIZE;
+        sumdata[3][i]+=freq;
+      }
+    } else {
+      for(int i = 0; i < NUMCORESACTIVE; i++) {
+        int * local_tbl = (int *)((void *)gccachesamplingtbl_r+size_cachesamplingtbl_local_r*i);
+        int freq = local_tbl[page_index]/BAMBOO_PAGE_SIZE;
+        sumdata[1][i]+=freq;
+      }
     }
   
     printf("\n");
   }
+  // TODO printout the summary data
+  for(int i = 0; i < NUMCORESACTIVE; i++) {
+    tprintf("core %d:  %d,  %d,  %d,  %d \n", i, sumdata[0][i], sumdata[2][i], sumdata[3][i], sumdata[1][i]);
+  }
   printf("=================\n");
 } 
 #endif // GC_CACHE_ADAPT
diff --git a/Robust/src/Runtime/bamboo/multicorecache.h b/Robust/src/Runtime/bamboo/multicorecache.h
index e9aeda38..f4f66d8d 100644
--- a/Robust/src/Runtime/bamboo/multicorecache.h
+++ b/Robust/src/Runtime/bamboo/multicorecache.h
@@ -17,10 +17,10 @@ typedef union
   struct
   {
     // policy type
-    unsigned int cache_mode   : 2;
-	// Reserved.
-    unsigned int __reserved_0 : 6;
-	// Location Override Target Y
+    unsigned int cache_mode   : 3;
+    // Reserved.
+    unsigned int __reserved_0 : 5;
+    // Location Override Target Y
     unsigned int lotar_y      : 4;
     // Reserved.
     unsigned int __reserved_1 : 4;
@@ -31,10 +31,10 @@ typedef union
   };
 } bamboo_cache_policy_t;
 
-#define BAMBOO_CACHE_MODE_LOCAL 0
-#define BAMBOO_CACHE_MODE_HASH 1
-#define BAMBOO_CACHE_MODE_NONE 2
-#define BAMBOO_CACHE_MODE_COORDS 3
+#define BAMBOO_CACHE_MODE_LOCAL 1
+#define BAMBOO_CACHE_MODE_HASH 2
+#define BAMBOO_CACHE_MODE_NONE 3
+#define BAMBOO_CACHE_MODE_COORDS 4
 
 typedef struct gc_cache_revise_info {
   unsigned int orig_page_start_va;
diff --git a/Robust/src/Runtime/bamboo/multicoregarbage.c b/Robust/src/Runtime/bamboo/multicoregarbage.c
index bf3c14f3..c8edc66b 100644
--- a/Robust/src/Runtime/bamboo/multicoregarbage.c
+++ b/Robust/src/Runtime/bamboo/multicoregarbage.c
@@ -18,6 +18,8 @@ extern unsigned int gcmem_mixed_usedmem;
 volatile bool gcflag;
 gc_status_t gc_status_info;
 
+unsigned long long gc_output_cache_policy_time=0;
+
 #ifdef GC_DEBUG
 // dump whole mem in blocks
 void dumpSMem() {
@@ -182,6 +184,7 @@ void initGC() {
   gcforwardobjtbl = allocateMGCHash(20, 3);
 
   GCPROFILE_INIT();
+  gc_output_cache_policy_time=0;
 } 
 
 bool gc_checkAllCoreStatus() {
@@ -786,8 +789,6 @@ void master_updaterefs(struct garbagelist * stackptr) {
   GC_PRINTF("Start flush phase \n");
   // flush phase
   flush(stackptr);
-  // now the master core need to decide the new cache strategy
-  CACHEADAPT_MASTER();
   GC_CHECK_ALL_CORE_STATUS(FLUSHPHASE==gc_status_info.gcphase);
   GC_PRINTF("Finish flush phase \n");
 }
@@ -804,6 +805,9 @@ void master_finish() {
   bamboo_smem_zero_top = NULL;
   
   GCPROFILE_END();
+  unsigned long long tmpt = BAMBOO_GET_EXE_TIME();
+  CACHEADAPT_OUTPUT_CACHE_POLICY();
+  gc_output_cache_policy_time += (BAMBOO_GET_EXE_TIME()-tmpt);
   gcflag = false;
   GC_SEND_MSG_1_TO_CLIENT(GCFINISH);
   
@@ -833,7 +837,9 @@ void gc_master(struct garbagelist * stackptr) {
   GC_PRINTF("Check core status \n");
   GC_CHECK_ALL_CORE_STATUS(true);
   GCPROFILE_ITEM();
+  unsigned long long tmpt = BAMBOO_GET_EXE_TIME();
   CACHEADAPT_OUTPUT_CACHE_SAMPLING();
+  gc_output_cache_policy_time += (BAMBOO_GET_EXE_TIME()-tmpt);
 
   // do mark phase
   master_mark(stackptr);
diff --git a/Robust/src/Runtime/bamboo/multicoregarbage.h b/Robust/src/Runtime/bamboo/multicoregarbage.h
index 4ba7aae8..6684d833 100644
--- a/Robust/src/Runtime/bamboo/multicoregarbage.h
+++ b/Robust/src/Runtime/bamboo/multicoregarbage.h
@@ -35,9 +35,10 @@ typedef enum {
   MAPPHASE,                // 0x4
   FLUSHPHASE,              // 0x5
 #ifdef GC_CACHE_ADAPT
-  PREFINISHPHASE,          // 0x6
+  CACHEPOLICYPHASE,        // 0x6
+  PREFINISHPHASE,          // 0x7
 #endif 
-  FINISHPHASE              // 0x6/0x7
+  FINISHPHASE              // 0x6/0x8
 } GCPHASETYPE;
 
 typedef struct gc_status {
@@ -157,6 +158,13 @@ unsigned int size_cachepolicytbl;
     } \
   }
 
+INLINE static unsigned int hostcore(void * ptr) {
+  // check the host core of ptr
+  unsigned int host = 0;
+  RESIDECORE(ptr, &host);
+  return host;
+}
+
 // NOTE: n starts from 0
 // mapping of heaptop (how many bytes there are in the local heap) to
 // the number of the block
diff --git a/Robust/src/Runtime/bamboo/multicoregcflush.c b/Robust/src/Runtime/bamboo/multicoregcflush.c
index c42dfb67..411f5148 100644
--- a/Robust/src/Runtime/bamboo/multicoregcflush.c
+++ b/Robust/src/Runtime/bamboo/multicoregcflush.c
@@ -175,6 +175,7 @@ INLINE void flushPtrsInObj(void * ptr) {
 }
 
 void flush(struct garbagelist * stackptr) {
+  //unsigned long long tmpt = BAMBOO_GET_EXE_TIME(); // TODO
   BAMBOO_CACHE_MF();
 
   flushRuntimeObj(stackptr);
@@ -212,6 +213,8 @@ void flush(struct garbagelist * stackptr) {
   } else {
     send_msg_2(STARTUPCORE,GCFINISHFLUSH,BAMBOO_NUM_OF_CORE);
   }
+
+  //tprintf("flush: %lld \n", BAMBOO_GET_EXE_TIME()-tmpt); // TODO
 } 
 
 #endif // MULTICORE_GC
diff --git a/Robust/src/Runtime/bamboo/multicoregcmark.c b/Robust/src/Runtime/bamboo/multicoregcmark.c
index 95596dca..889e4506 100644
--- a/Robust/src/Runtime/bamboo/multicoregcmark.c
+++ b/Robust/src/Runtime/bamboo/multicoregcmark.c
@@ -53,15 +53,25 @@ INLINE bool isLarge(void * ptr, int * ttype, unsigned int * tsize) {
   // ptr is a start of a block  OR it acrosses the boundary of current block
   return (((((unsigned int)ptr-gcbaseva)%(bound))==0)||
 	  ((bound-(((unsigned int)ptr-gcbaseva)%bound)) < (*tsize)));
-} 
+}
 
 INLINE unsigned int hostcore(void * ptr) {
   // check the host core of ptr
   unsigned int host = 0;
-  RESIDECORE(ptr, &host);
+  if(1 == (NUMCORES4GC)) { 
+    host = 0; 
+  } else { 
+    unsigned int b;
+    unsigned int t = (unsigned int)ptr - (unsigned int)gcbaseva; 
+    if(t < (BAMBOO_LARGE_SMEM_BOUND)) { 
+      b = t / (BAMBOO_SMEM_SIZE_L); 
+    } else { 
+      b = NUMCORES4GC+((t-(BAMBOO_LARGE_SMEM_BOUND))/(BAMBOO_SMEM_SIZE)); 
+    } 
+    host = gc_block2core[(b%(NUMCORES4GC*2))];
+  }
   return host;
-} 
-
+}
 //push the null check into the mark macro
 //#define MARKOBJ(objptr, ii) {void * marktmpptr=objptr; if (marktmpptr!=NULL) markObj(marktmpptr, __LINE__, ii);}
 
diff --git a/Robust/src/Runtime/bamboo/multicoremem.h b/Robust/src/Runtime/bamboo/multicoremem.h
index 2473bdde..83cb5a6f 100644
--- a/Robust/src/Runtime/bamboo/multicoremem.h
+++ b/Robust/src/Runtime/bamboo/multicoremem.h
@@ -67,13 +67,11 @@
 
 #else // GC_DEBUG
 #ifdef GC_LARGESHAREDHEAP
-#define BAMBOO_NUM_BLOCKS ((unsigned int)((GC_BAMBOO_NUMCORES)*(2+2)))
-#elif defined GC_LARGESHAREDHEAP2
-#define BAMBOO_NUM_BLOCKS ((unsigned int)((GC_BAMBOO_NUMCORES)*(2+2)))
+#define BAMBOO_NUM_BLOCKS ((unsigned int)((GC_BAMBOO_NUMCORES)*(2+5)))
 #elif defined MGC
 #define BAMBOO_NUM_BLOCKS ((unsigned int)((GC_BAMBOO_NUMCORES)*72)) // 72M per core
 #else
-#define BAMBOO_NUM_BLOCKS ((unsigned int)((GC_BAMBOO_NUMCORES)*(2+3))) //(15 * 1024) //(64 * 4 * 0.75) //(1024 * 1024 * 3.5)  3G
+#define BAMBOO_NUM_BLOCKS ((unsigned int)((GC_BAMBOO_NUMCORES)*(2+2))) //(15 * 1024) //(64 * 4 * 0.75) //(1024 * 1024 * 3.5)  3G
 #endif
 #ifdef GC_LARGEPAGESIZE
 #define BAMBOO_PAGE_SIZE ((unsigned int)(4 * 1024 * 1024))  // (4096)
diff --git a/Robust/src/Runtime/bamboo/multicoremsg.c b/Robust/src/Runtime/bamboo/multicoremsg.c
index 2ba1532f..1d60f4fa 100644
--- a/Robust/src/Runtime/bamboo/multicoremsg.c
+++ b/Robust/src/Runtime/bamboo/multicoremsg.c
@@ -47,7 +47,9 @@ int msgsizearray[] = {
   4, //GCPROFILES,            // 0xF3
 #endif // GC_PROFILE
 #ifdef GC_CACHE_ADAPT
-  1, //GCSTARTPREF,           // 0xF5
+  1, //GCSTARTCACHEPOLICY     // 0xF4
+  2, //GCFINISHCACHEPOLICY    // 0xF5
+  1, //GCSTARTPREF,           // 0xF6
   2, //GCFINISHPREF,          // 0xF7
 #endif // GC_CACHE_ADAPT
 #endif // MULTICORE_GC
@@ -661,6 +663,21 @@ INLINE void processmsg_gcprofiles_I() {
 #endif // GC_PROFILE
 
 #ifdef GC_CACHE_ADAPT
+INLINE void processmsg_gcstartcachepolicy_I() {
+  gc_status_info.gcphase = CACHEPOLICYPHASE;
+}
+
+INLINE void processmsg_gcfinishcachepolicy_I() {
+  int data1 = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  BAMBOO_ASSERT(BAMBOO_NUM_OF_CORE == STARTUPCORE);
+
+  // mark the sending core as done with the cache policy phase
+  if(data1 < NUMCORESACTIVE) {
+    gccorestatus[data1] = 0;
+  }
+}
+
 INLINE void processmsg_gcstartpref_I() {
   gc_status_info.gcphase = PREFINISHPHASE;
 }
@@ -944,6 +961,18 @@ processmsg:
 #endif // GC_PROFILE
 
 #ifdef GC_CACHE_ADAPT
+    case GCSTARTCACHEPOLICY: {
+      // received a gcstartcachepolicy msg
+      processmsg_gcstartcachepolicy_I();
+      break;
+    }
+
+    case GCFINISHCACHEPOLICY: {
+      // received a gcfinishcachepolicy msg
+      processmsg_gcfinishcachepolicy_I();
+      break;
+    }
+
     case GCSTARTPREF: {
       // received a gcstartpref msg
       processmsg_gcstartpref_I();
diff --git a/Robust/src/Runtime/bamboo/multicoremsg.h b/Robust/src/Runtime/bamboo/multicoremsg.h
index 401a6956..fb66bf7d 100644
--- a/Robust/src/Runtime/bamboo/multicoremsg.h
+++ b/Robust/src/Runtime/bamboo/multicoremsg.h
@@ -192,8 +192,10 @@ typedef enum {
   GCPROFILES,            // 0xF3
 #endif // GC_PROFILE
 #ifdef GC_CACHE_ADAPT
-  GCSTARTPREF,           // 0xF4
-  GCFINISHPREF,          // 0xF5
+  GCSTARTCACHEPOLICY,    // 0xF4
+  GCFINISHCACHEPOLICY,   // 0xF5
+  GCSTARTPREF,           // 0xF6
+  GCFINISHPREF,          // 0xF7
 #endif // GC_CACHE_ADAPT
 #endif // MULTICORE_GC
   MSGEND
-- 
2.34.1