Bug fix for cache adaption
[IRC.git] / Robust / src / Runtime / bamboo / multicorecache.c
index 970ca5833c6bd2b13402c54f5c47c6a1fe794536..f25d4813d13e13c80e7b77dd04b28394f8b868b6 100644 (file)
@@ -1,78 +1,9 @@
 #ifdef GC_CACHE_ADAPT
 #include "multicorecache.h"
+#include "multicoremsg.h"
+#include "multicoregcprofile.h"
 
-typedef struct gc_cache_revise_info {
-  unsigned int orig_page_start_va;
-  unsigned int orig_page_end_va;
-  unsigned int orig_page_index;
-  unsigned int to_page_start_va;
-  unsigned int to_page_end_va;
-  unsigned int to_page_index;
-  unsigned int revised_sampling[NUMCORESACTIVE];
-} gc_cache_revise_info_t;
-gc_cache_revise_info_t gc_cache_revise_infomation;
-
-INLINE void samplingDataInit() {
-  gc_cache_revise_infomation.to_page_start_va = (unsigned int)to->ptr;
-  unsigned int toindex = (unsigned int)(tobase-gcbaseva)/(BAMBOO_PAGE_SIZE);
-  gc_cache_revise_infomation.to_page_end_va = gcbaseva + 
-    (BAMBOO_PAGE_SIZE)*(toindex+1);
-  gc_cache_revise_infomation.to_page_index = toindex;
-  gc_cache_revise_infomation.orig_page_start_va = (unsigned int)orig->ptr;
-  gc_cache_revise_infomation.orig_page_end_va = gcbaseva+(BAMBOO_PAGE_SIZE)
-  *(((unsigned int)(orig->ptr)-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
-  gc_cache_revise_infomation.orig_page_index = 
-    ((unsigned int)(orig->blockbase)-gcbaseva)/(BAMBOO_PAGE_SIZE);
-}
-
-INLINE void samplingDataConvert(unsigned int current_ptr) {
-  unsigned int tmp_factor = 
-  current_ptr-gc_cache_revise_infomation.to_page_start_va;
-  unsigned int topage=gc_cache_revise_infomation.to_page_index;
-  unsigned int oldpage = gc_cache_revise_infomation.orig_page_index;
-  int * newtable=&gccachesamplingtbl_r[topage];
-  int * oldtable=&gccachesamplingtbl[oldpage];
-  
-  for(int tt = 0; tt < NUMCORESACTIVE; tt++) {
-    (*newtable) = ((*newtable)+(*oldtable)*tmp_factor);
-    newtable=(int*)(((char *)newtable)+size_cachesamplingtbl_local_r);
-    oldtable=(int*) (((char *)oldtable)+size_cachesamplingtbl_local);
-  }
-} 
-
-INLINE void completePageConvert(struct moveHelper * orig,
-                                struct moveHelper * to,
-                                unsigned int current_ptr,
-                                bool closeToPage) {
-  unsigned int ptr = 0;
-  unsigned int tocompare = 0;
-  if(closeToPage) {
-    ptr = to->ptr;
-    tocompare = gc_cache_revise_infomation.to_page_end_va;
-  } else {
-    ptr = orig->ptr;
-    tocompare = gc_cache_revise_infomation.orig_page_end_va;
-  }
-  if((unsigned int)ptr >= (unsigned int)tocompare) {
-    // end of an orig/to page
-    // compute the impact of this page for the new page
-    samplingDataConvert(current_ptr);
-    // prepare for an new orig page
-    unsigned int tmp_index = 
-      (unsigned int)((unsigned int)orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
-    gc_cache_revise_infomation.orig_page_start_va = orig->ptr;
-    gc_cache_revise_infomation.orig_page_end_va = gcbaseva + 
-      (BAMBOO_PAGE_SIZE)*(unsigned int)(tmp_index+1);
-    gc_cache_revise_infomation.orig_page_index = tmp_index;
-    gc_cache_revise_infomation.to_page_start_va = to->ptr;
-    if(closeToPage) {
-      gc_cache_revise_infomation.to_page_end_va = gcbaseva+(BAMBOO_PAGE_SIZE)
-        *(((unsigned int)(to->ptr)-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
-      gc_cache_revise_infomation.to_page_index = 
-        ((unsigned int)(to->ptr)-gcbaseva)/(BAMBOO_PAGE_SIZE);
-    }
-  }
-} 
+gc_cache_revise_info_t gc_cache_revise_information;
 
 // prepare for cache adaption:
 //   -- flush the shared heap
@@ -92,170 +23,166 @@ void cacheAdapt_gc(bool isgccachestage) {
 // the master core decides how to adapt cache strategy for the mutator 
 // according to collected statistic data
 
+// find the core that accesses the page #page_index most
+#define CACHEADAPT_FIND_HOTTEST_CORE(page_index,hottestcore,hotfreq) \
+  { \
+    int *local_tbl=&gccachesamplingtbl_r[page_index]; \
+    for(int i = 0; i < NUMCORESACTIVE; i++) { \
+      int freq = *local_tbl; \
+      local_tbl=(int *)(((void *)local_tbl)+size_cachesamplingtbl_local_r); \
+      if(hotfreq < freq) { \
+        hotfreq = freq; \
+        hottestcore = i; \
+      } \
+    } \
+  }
+// find the core that accesses the page #page_index most and compute the total
+// access frequency of the page at the same time
+#define CACHEADAPT_FIND_HOTTEST_CORE_W_TOTALFREQ(page_index,hottestcore,hotfreq,totalfreq) \
+  { \
+    int *local_tbl=&gccachesamplingtbl_r[page_index]; \
+    for(int i = 0; i < NUMCORESACTIVE; i++) { \
+      int freq = *local_tbl; \
+      local_tbl=(int *)(((void *)local_tbl)+size_cachesamplingtbl_local_r); \
+      totalfreq += freq; \
+      if(hotfreq < freq) { \
+        hotfreq = freq; \
+        hottestcore = i; \
+      } \
+    } \
+  }
+// Set the policy as hosted by coren
+// NOTE: (x,y) should be changed to (x+1, y+1)!!!
+#define CACHEADAPT_POLICY_SET_HOST_CORE(policy, coren) \
+  { \
+    (policy).cache_mode = BAMBOO_CACHE_MODE_COORDS; \    
+    (policy).lotar_x = bamboo_cpu2coords[2*(coren)]+1; \
+    (policy).lotar_y = bamboo_cpu2coords[2*(coren)+1]+1; \
+  }
+// store the new policy information at tmp_p in gccachepolicytbl
+#define CACHEADAPT_CHANGE_POLICY_4_PAGE(tmp_p,page_index,policy) \
+  { \
+    ((int*)(tmp_p))[page_index] = (policy).word; \
+  }
+
 // make all pages hfh
-int cacheAdapt_policy_h4h(){
-  unsigned int page_index = 0;
-  VA page_sva = 0;
-  unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
-  unsigned int numchanged = 0;
-  int * tmp_p = gccachepolicytbl+1;
-  for(page_index = 0; page_index < page_num; page_index++) {
-    page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
+void cacheAdapt_policy_h4h(int coren){
+  unsigned int page_num=(BAMBOO_SHARED_MEM_SIZE)/(BAMBOO_PAGE_SIZE);
+  unsigned int page_gap=page_num/NUMCORESACTIVE;
+  unsigned int page_index=page_gap*coren;
+  unsigned int page_index_end=(coren==NUMCORESACTIVE-1)?page_num:(page_index+page_gap);
+  VA page_sva = gcbaseva+(BAMBOO_PAGE_SIZE)*page_index;
+  int * tmp_p = gccachepolicytbl;
+  for(; page_index < page_index_end; page_index++) {
     bamboo_cache_policy_t policy = {0};
     policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
-    *tmp_p = page_index;
-    tmp_p++;
-    *tmp_p = policy.word;
-    tmp_p++;
-    numchanged++;
+    CACHEADAPT_CHANGE_POLICY_4_PAGE(tmp_p,page_index,policy);
+    page_sva += BAMBOO_PAGE_SIZE;
   }
-
-  return numchanged;
 } 
 
 // make all pages local as non-cache-adaptable gc local mode
-int cacheAdapt_policy_local(){
-  unsigned int page_index = 0;
-  VA page_sva = 0;
-  unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
-  unsigned int numchanged = 0;
-  int * tmp_p = gccachepolicytbl+1;
-  for(page_index = 0; page_index < page_num; page_index++) {
-    page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
+void cacheAdapt_policy_local(int coren){
+  unsigned int page_num=(BAMBOO_SHARED_MEM_SIZE)/(BAMBOO_PAGE_SIZE);
+  unsigned int page_gap=page_num/NUMCORESACTIVE;
+  unsigned int page_index=page_gap*coren;
+  unsigned int page_index_end=(coren==NUMCORESACTIVE-1)?page_num:(page_index+page_gap);
+  VA page_sva = gcbaseva+(BAMBOO_PAGE_SIZE)*page_index;
+  int * tmp_p = gccachepolicytbl;
+  for(; page_index < page_index_end; page_index++) {
     bamboo_cache_policy_t policy = {0};
     unsigned int block = 0;
-    BLOCKINDEX(page_sva, &block);
+    BLOCKINDEX(block, (void *) page_sva);
     unsigned int coren = gc_block2core[block%(NUMCORES4GC*2)];
-    // locally cache the page in the hotest core
-    // NOTE: (x,y) should be changed to (x+1, y+1)!!!
-    policy.cache_mode = BAMBOO_CACHE_MODE_COORDS;
-    policy.lotar_x = bamboo_cpu2coords[2*coren]+1;
-    policy.lotar_y = bamboo_cpu2coords[2*coren+1]+1;
-    *tmp_p = page_index;
-    tmp_p++;
-    *tmp_p = policy.word;
-    tmp_p++;
-    numchanged++;
+    CACHEADAPT_POLICY_SET_HOST_CORE(policy, coren);
+    CACHEADAPT_CHANGE_POLICY_4_PAGE(tmp_p,page_index,policy);
+    page_sva += BAMBOO_PAGE_SIZE;
   }
-
-  return numchanged;
 } 
 
-int cacheAdapt_policy_hotest(){
-  unsigned int page_index = 0;
-  VA page_sva = 0;
-  unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
-  unsigned int numchanged = 0;
-  int * tmp_p = gccachepolicytbl+1;
-  for(page_index = 0; page_index < page_num; page_index++) {
-    page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
+void cacheAdapt_policy_hottest(int coren){
+  unsigned int page_num=(BAMBOO_SHARED_MEM_SIZE)/(BAMBOO_PAGE_SIZE);
+  unsigned int page_gap=page_num/NUMCORESACTIVE;
+  unsigned int page_index=page_gap*coren;
+  unsigned int page_index_end=(coren==NUMCORESACTIVE-1)?page_num:(page_index+page_gap);
+  VA page_sva = gcbaseva+(BAMBOO_PAGE_SIZE)*page_index;
+  int * tmp_p = gccachepolicytbl;
+  for(; page_index < page_index_end; page_index++) {
     bamboo_cache_policy_t policy = {0};
-    unsigned int hotestcore = 0;
+    unsigned int hottestcore = 0;
     unsigned int hotfreq = 0;
-
-    int *local_tbl=&gccachesamplingtbl_r[page_index];
-    for(int i = 0; i < NUMCORESACTIVE; i++) {
-      int freq = *local_tbl;
-      local_tbl=(int *)(((char *)local_tbl)+size_cachesamplingtbl_local_r);
-
-      // check the freqency, decide if this page is hot for the core
-      if(hotfreq < freq) {
-        hotfreq = freq;
-        hotestcore = i;
-      }
-    }
+    CACHEADAPT_FIND_HOTTEST_CORE(page_index,hottestcore,hotfreq);
     // TODO
     // Decide the cache strategy for this page
     // If decide to adapt a new cache strategy, write into the shared block of
     // the gcsharedsamplingtbl. The mem recording information that has been 
     // written is enough to hold the information.
     // Format: page start va + cache strategy(hfh/(host core+[x,y]))
-    if(hotfreq == 0) {
-      // this page has not been accessed, do not change its cache policy
-      continue;
-    } else {
-      // locally cache the page in the hotest core
-      // NOTE: (x,y) should be changed to (x+1, y+1)!!!
-      policy.cache_mode = BAMBOO_CACHE_MODE_COORDS;
-      policy.lotar_x = bamboo_cpu2coords[2*hotestcore]+1;
-      policy.lotar_y = bamboo_cpu2coords[2*hotestcore+1]+1;
-      *tmp_p = page_index;
-      tmp_p++;
-      *tmp_p = policy.word;
-      tmp_p++;
-      numchanged++;
+    if(hotfreq != 0) {
+      // locally cache the page in the hottest core
+      CACHEADAPT_POLICY_SET_HOST_CORE(policy, hottestcore);
     }
+    CACHEADAPT_CHANGE_POLICY_4_PAGE(tmp_p,page_index,policy);
+    page_sva += BAMBOO_PAGE_SIZE;
   }
-
-  return numchanged;
 } 
 
-#define GC_CACHE_ADAPT_DOMINATE_THRESHOLD  50
+#define GC_CACHE_ADAPT_DOMINATE_THRESHOLD  1
 // cache the page on the core that accesses it the most if that core accesses 
 // it more than (GC_CACHE_ADAPT_DOMINATE_THRESHOLD)% of the total.  Otherwise,
 // h4h the page.
-int cacheAdapt_policy_dominate(){
-  unsigned int page_index = 0;
-  VA page_sva = 0;
-  unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
-  unsigned int numchanged = 0;
-  int * tmp_p = gccachepolicytbl+1;
-  for(page_index = 0; page_index < page_num; page_index++) {
-    page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
+void cacheAdapt_policy_dominate(int coren){
+  unsigned int page_num=(BAMBOO_SHARED_MEM_SIZE)/(BAMBOO_PAGE_SIZE);
+  unsigned int page_gap=page_num/NUMCORESACTIVE;
+  unsigned int page_index=page_gap*coren;
+  unsigned int page_index_end=(coren==NUMCORESACTIVE-1)?page_num:(page_index+page_gap);
+  VA page_sva = gcbaseva+(BAMBOO_PAGE_SIZE)*page_index;
+  int * tmp_p = gccachepolicytbl;
+  for(; page_index < page_index_end; page_index++) {
     bamboo_cache_policy_t policy = {0};
-    unsigned int hotestcore = 0;
-    unsigned long long totalfreq = 0;
+    unsigned int hottestcore = 0;
+    unsigned int totalfreq = 0;
     unsigned int hotfreq = 0;
-  
-    int *local_tbl=&gccachesamplingtbl_r[page_index];
-    for(int i = 0; i < NUMCORESACTIVE; i++) {
-      int freq = *local_tbl;
-      local_tbl=(int *)(((char *)local_tbl)+size_cachesamplingtbl_local_r);
-      totalfreq += freq;
-      // check the freqency, decide if this page is hot for the core
-      if(hotfreq < freq) {
-        hotfreq = freq;
-        hotestcore = i;
-      }
-    }
-
+    CACHEADAPT_FIND_HOTTEST_CORE_W_TOTALFREQ(page_index,hottestcore,hotfreq,totalfreq);
     // Decide the cache strategy for this page
     // If decide to adapt a new cache strategy, write into the shared block of
     // the gcpolicytbl 
     // Format: page start va + cache policy
-    if(hotfreq == 0) {
-      // this page has not been accessed, do not change its cache policy
-      continue;
+    if(hotfreq != 0) {
+      totalfreq=totalfreq>>GC_CACHE_ADAPT_DOMINATE_THRESHOLD;
+      if((unsigned int)hotfreq < (unsigned int)totalfreq) {
+        // use hfh
+        //policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
+        unsigned int block = 0;
+        BLOCKINDEX(page_sva, &block);
+        unsigned int coren = gc_block2core[block%(NUMCORES4GC*2)];
+        CACHEADAPT_POLICY_SET_HOST_CORE(policy, coren);
+      } else {
+        // locally cache the page in the hottest core
+        CACHEADAPT_POLICY_SET_HOST_CORE(policy, hottestcore);
+      }     
     }
-    totalfreq = 
-      (totalfreq*GC_CACHE_ADAPT_DOMINATE_THRESHOLD)/100/BAMBOO_PAGE_SIZE;
-    hotfreq/=BAMBOO_PAGE_SIZE;
-    if(hotfreq < totalfreq) {
-      // use hfh
-      policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
-    } else {
-      // locally cache the page in the hotest core
-      // NOTE: (x,y) should be changed to (x+1, y+1)!!!
-      policy.cache_mode = BAMBOO_CACHE_MODE_COORDS;
-      policy.lotar_x = bamboo_cpu2coords[2*hotestcore]+1;
-      policy.lotar_y = bamboo_cpu2coords[2*hotestcore+1]+1;
-    }
-    *tmp_p = page_index;
-    tmp_p++;
-    *tmp_p = policy.word;    
-    tmp_p++;
-    numchanged++;
+    CACHEADAPT_CHANGE_POLICY_4_PAGE(tmp_p,page_index,policy);
+    page_sva += BAMBOO_PAGE_SIZE;
   }
-
-  return numchanged;
 }
 
+#if 0
 #define GC_CACHE_ADAPT_OVERLOAD_THRESHOLD 10
+// record the workload of the hottestcore into core2heavypages
+#define CACHEADAPT_RECORD_PAGE_WORKLOAD(hottestcore,totalfreq,hotfreq,remoteaccess,tmp_p) \
+  { \
+    workload[hottestcore] += (totalfreq); \
+    total_workload += (totalfreq); \
+    unsigned long long remoteaccess = (totalfreq) - (hotfreq); \
+    unsigned int index = (unsigned int)core2heavypages[hottestcore][0]; \
+    core2heavypages[hottestcore][3*index+3] = (remoteaccess); \
+    core2heavypages[hottestcore][3*index+2] = (totalfreq); \
+    core2heavypages[hottestcore][3*index+1] = (unsigned long long)((tmp_p)-1); \
+    core2heavypages[hottestcore][0]++; \
+  }
 
-void gc_quicksort(unsigned long long *array,
-                  unsigned int left,
-                  unsigned int right,
-                  unsigned int offset) {
+void gc_quicksort(unsigned long long *array,unsigned int left,unsigned int right,unsigned int offset) {
   unsigned int pivot = 0;;
   unsigned int leftIdx = left;
   unsigned int rightIdx = right;
@@ -289,92 +216,66 @@ void gc_quicksort(unsigned long long *array,
   return;
 }
 
+INLINE int cacheAdapt_h4h_remote_accesses(unsigned long long workload_threshold,unsigned long long ** core2heavypages, unsigned long long * workload,int i) {
+  int j = 1;
+  unsigned int index = (unsigned int)core2heavypages[i][0];
+  if(workload[i] > workload_threshold) {
+    // sort according to the remoteaccess
+    gc_quicksort(&core2heavypages[i][0], 1, index, 0);
+    while((workload[i] > workload_threshold) && (j<index*3)) {
+      // hfh those pages with more remote accesses 
+      bamboo_cache_policy_t policy = {0};
+      policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
+      *((unsigned int*)core2heavypages[i][j]) = policy.word;
+      workload[i] -= core2heavypages[i][j+1];
+      j += 3;
+    }
+  }
+  return j;
+}
+
 // Every page cached on the core that accesses it the most. 
 // Check to see if any core's pages total more accesses than threshold 
 // GC_CACHE_ADAPT_OVERLOAD_THRESHOLD.  If so, find the pages with the 
 // most remote accesses and hash for home them until we get below 
 // GC_CACHE_ADAPT_OVERLOAD_THRESHOLD
-int cacheAdapt_policy_overload(){
+int cacheAdapt_policy_overload(int coren){
   unsigned int page_index = 0;
-  VA page_sva = 0;
-  unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
+  VA page_sva = gcbaseva;
+  unsigned int page_num = BAMBOO_SHARED_MEM_SIZE/BAMBOO_PAGE_SIZE;
   unsigned int numchanged = 0;
   int * tmp_p = gccachepolicytbl+1;
   unsigned long long workload[NUMCORESACTIVE];
   memset(workload, 0, NUMCORESACTIVE*sizeof(unsigned long long));
   unsigned long long total_workload = 0;
   unsigned long long core2heavypages[NUMCORESACTIVE][page_num*3+1];
-  memset(core2heavypages,0,
-      sizeof(unsigned long long)*(page_num*3+1)*NUMCORESACTIVE);
-  for(page_index = 0; page_index < page_num; page_index++) {
-    page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
+  memset(core2heavypages,0,sizeof(unsigned long long)*(page_num*3+1)*NUMCORESACTIVE);
+  for(page_index = 0; page_sva < gctopva; page_index++) {
     bamboo_cache_policy_t policy = {0};
-    unsigned int hotestcore = 0;
+    unsigned int hottestcore = 0;
     unsigned long long totalfreq = 0;
     unsigned int hotfreq = 0;
-  
-    int *local_tbl=&gccachesamplingtbl_r[page_index];
-    for(int i = 0; i < NUMCORESACTIVE; i++) {
-      int freq = *local_tbl;
-      local_tbl=(int *)(((char *)local_tbl)+size_cachesamplingtbl_local_r);
-      totalfreq += freq;
-      // check the freqency, decide if this page is hot for the core
-      if(hotfreq < freq) {
-        hotfreq = freq;
-        hotestcore = i;
-      }
-    }
+    CACHEADAPT_FIND_HOTTEST_CORE_W_TOTALFREQ(page_index,hottestcore,hotfreq,totalfreq);
     // Decide the cache strategy for this page
     // If decide to adapt a new cache strategy, write into the shared block of
     // the gcsharedsamplingtbl. The mem recording information that has been 
     // written is enough to hold the information.
     // Format: page start va + cache strategy(hfh/(host core+[x,y]))
-    if(hotfreq == 0) {
-      // this page has not been accessed, do not change its cache policy
-      continue;
+    if(hotfreq != 0) {
+      totalfreq/=BAMBOO_PAGE_SIZE;
+      hotfreq/=BAMBOO_PAGE_SIZE;
+      // locally cache the page in the hottest core
+      CACHEADAPT_POLICY_SET_HOST_CORE(policy, hottestcore);
+      CACHEADAPT_CHANGE_POLICY_4_PAGE(tmp_p,page_index,policy,numchanged);
+      CACHEADAPT_RECORD_PAGE_WORKLOAD(hottestcore,totalfreq,hotfreq,remoteaccess,tmp_p);    
     }
-
-    totalfreq/=BAMBOO_PAGE_SIZE;
-    hotfreq/=BAMBOO_PAGE_SIZE;
-    // locally cache the page in the hotest core
-    // NOTE: (x,y) should be changed to (x+1, y+1)!!!
-    policy.cache_mode = BAMBOO_CACHE_MODE_COORDS;
-    policy.lotar_x = bamboo_cpu2coords[2*hotestcore]+1;
-    policy.lotar_y = bamboo_cpu2coords[2*hotestcore+1]+1;
-    *tmp_p = page_index;
-    tmp_p++;
-    *tmp_p = policy.word;
-    tmp_p++;
-    numchanged++;
-    workload[hotestcore] += totalfreq;
-    total_workload += totalfreq;
-    // insert into core2heavypages using quicksort
-    unsigned long long remoteaccess = totalfreq - hotfreq;
-    unsigned int index = (unsigned int)core2heavypages[hotestcore][0];
-    core2heavypages[hotestcore][3*index+3] = remoteaccess;
-    core2heavypages[hotestcore][3*index+2] = totalfreq;
-    core2heavypages[hotestcore][3*index+1] = (unsigned long long)(tmp_p-1);
-    core2heavypages[hotestcore][0]++;
+    page_sva += BAMBOO_PAGE_SIZE;
   }
 
-  unsigned long long workload_threshold = 
-  total_workload/GC_CACHE_ADAPT_OVERLOAD_THRESHOLD;
+  unsigned long long workload_threshold=total_workload/GC_CACHE_ADAPT_OVERLOAD_THRESHOLD;
   // Check the workload of each core
   for(int i = 0; i < NUMCORESACTIVE; i++) {
-    int j = 1;
-    unsigned int index = (unsigned int)core2heavypages[i][0];
-    if(workload[i] > workload_threshold) {
-      // sort according to the remoteaccess
-      gc_quicksort(&core2heavypages[i][0], 1, index, 0);
-      while((workload[i] > workload_threshold) && (j<index*3)) {
-        // hfh those pages with more remote accesses 
-        bamboo_cache_policy_t policy = {0};
-        policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
-        *((unsigned int*)core2heavypages[i][j]) = policy.word;
-        workload[i] -= core2heavypages[i][j+1];
-        j += 3;
-      }
-    }
+    cacheAdapt_h4h_remote_accesses(workload_threshold,core2heavypages,workload,i);
   }
 
   return numchanged;
@@ -393,93 +294,49 @@ int cacheAdapt_policy_overload(){
 // then start hfh these pages(selecting the ones with the most remote 
 // accesses first or fewest local accesses) until we get below 
 // GC_CACHE_ADAPT_CROWD_THRESHOLD pages.
-int cacheAdapt_policy_crowd(){
+int cacheAdapt_policy_crowd(int coren){
   unsigned int page_index = 0;
-  VA page_sva = 0;
-  unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
+  VA page_sva = gcbaseva;
+  unsigned int page_num = BAMBOO_SHARED_MEM_SIZE/BAMBOO_PAGE_SIZE;
   unsigned int numchanged = 0;
   int * tmp_p = gccachepolicytbl+1;
   unsigned long long workload[NUMCORESACTIVE];
   memset(workload, 0, NUMCORESACTIVE*sizeof(unsigned long long));
   unsigned long long total_workload = 0;
   unsigned long long core2heavypages[NUMCORESACTIVE][page_num*3+1];
-  memset(core2heavypages,0,
-    sizeof(unsigned long long)*(page_num*3+1)*NUMCORESACTIVE);
-  for(page_index = 0; page_index < page_num; page_index++) {
-    page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
+  memset(core2heavypages,0,sizeof(unsigned long long)*(page_num*3+1)*NUMCORESACTIVE);
+  for(page_index = 0; page_sva < gctopva; page_index++) {
     bamboo_cache_policy_t policy = {0};
-    unsigned int hotestcore = 0;
+    unsigned int hottestcore = 0;
     unsigned long long totalfreq = 0;
     unsigned int hotfreq = 0;
-  
-    int *local_tbl=&gccachesamplingtbl_r[page_index];
-    for(int i = 0; i < NUMCORESACTIVE; i++) {
-      int freq = *local_tbl;
-      local_tbl=(int *)(((char *)local_tbl)+size_cachesamplingtbl_local_r);
-      totalfreq += freq;
-      // check the freqency, decide if this page is hot for the core
-      if(hotfreq < freq) {
-        hotfreq = freq;
-        hotestcore = i;
-      }
-    }
+    CACHEADAPT_FIND_HOTTEST_CORE_W_TOTALFREQ(page_index,hottestcore,hotfreq,totalfreq);
     // Decide the cache strategy for this page
     // If decide to adapt a new cache strategy, write into the shared block of
     // the gcsharedsamplingtbl. The mem recording information that has been 
     // written is enough to hold the information.
     // Format: page start va + cache strategy(hfh/(host core+[x,y]))
-    if(hotfreq == 0) {
-      // this page has not been accessed, do not change its cache policy
-      continue;
+    if(hotfreq != 0) {
+      totalfreq/=BAMBOO_PAGE_SIZE;
+      hotfreq/=BAMBOO_PAGE_SIZE;
+      // locally cache the page in the hottest core
+      CACHEADAPT_POLICY_SET_HOST_CORE(policy, hottestcore);
+      CACHEADAPT_CHANGE_POLICY_4_PAGE(tmp_p,page_index,policy,numchanged);
+      CACHEADAPT_RECORD_PAGE_WORKLOAD(hottestcore,totalfreq,hotfreq,remoteaccess,tmp_p);
     }
-    totalfreq/=BAMBOO_PAGE_SIZE;
-    hotfreq/=BAMBOO_PAGE_SIZE;
-    // locally cache the page in the hotest core
-    // NOTE: (x,y) should be changed to (x+1, y+1)!!!
-    policy.cache_mode = BAMBOO_CACHE_MODE_COORDS;
-    policy.lotar_x = bamboo_cpu2coords[2*hotestcore]+1;
-    policy.lotar_y = bamboo_cpu2coords[2*hotestcore+1]+1;
-    *tmp_p = page_index;
-    tmp_p++;
-    *tmp_p = policy.word;
-    tmp_p++;
-    numchanged++;
-    workload[hotestcore] += totalfreq;
-    total_workload += totalfreq;
-    // insert into core2heavypages using quicksort
-    unsigned long long remoteaccess = totalfreq - hotfreq;
-    unsigned int index = (unsigned int)core2heavypages[hotestcore][0];
-    core2heavypages[hotestcore][3*index+3] = remoteaccess;
-    core2heavypages[hotestcore][3*index+2] = totalfreq;
-    core2heavypages[hotestcore][3*index+1] = (unsigned long long)(tmp_p-1);
-    core2heavypages[hotestcore][0]++;
+    page_sva += BAMBOO_PAGE_SIZE;
   }
 
-  unsigned long long workload_threshold = 
-  total_workload / GC_CACHE_ADAPT_OVERLOAD_THRESHOLD;
+  unsigned long long workload_threshold=total_workload/GC_CACHE_ADAPT_OVERLOAD_THRESHOLD;
   // Check the workload of each core
   for(int i = 0; i < NUMCORESACTIVE; i++) {
-    int j = 1;
-    unsigned int index = (unsigned int)core2heavypages[i][0];  
-    if(workload[i] > workload_threshold) {
-      // sort according to the remoteaccess
-      gc_quicksort(&core2heavypages[i][0], 1, index, 0);
-      while((workload[i] > workload_threshold) && (j<index*3)) {
-        // hfh those pages with more remote accesses 
-        bamboo_cache_policy_t policy = {0};
-        policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
-        *((unsigned int*)core2heavypages[i][j]) = policy.word;
-        workload[i] -= core2heavypages[i][j+1];
-        j += 3;
-      }
-    }
-
+    unsigned int index=(unsigned int)core2heavypages[i][0];
+    int j=cacheAdapt_h4h_remote_accesses(workload_threshold,core2heavypages,workload,i);
     // Check if the accesses are crowded on few pages
     // sort according to the total access
 inner_crowd:
     gc_quicksort(&core2heavypages[i][0], j/3+1, index, 1);
-    unsigned long long threshold = 
-      GC_CACHE_ADAPT_ACCESS_THRESHOLD*workload[i]/100;
+    unsigned long long threshold=GC_CACHE_ADAPT_ACCESS_THRESHOLD*workload[i]/100;
     int num_crowded = 0;
     unsigned long long t_workload = 0;
     do {
@@ -506,117 +363,122 @@ inner_crowd:
 
   return numchanged;
 } 
+#endif
 
-void cacheAdapt_master() {
-  CACHEADAPT_OUTPUT_CACHE_SAMPLING_R();
-  unsigned int numchanged = 0;
+unsigned int cacheAdapt_decision(int coren) {
+  BAMBOO_CACHE_MF();
   // check the statistic data
   // for each page, decide the new cache strategy
 #ifdef GC_CACHE_ADAPT_POLICY1
-  numchanged = cacheAdapt_policy_h4h();
+  cacheAdapt_policy_h4h(coren);
 #elif defined GC_CACHE_ADAPT_POLICY2
-  numchanged = cacheAdapt_policy_local();
+  cacheAdapt_policy_local(coren);
 #elif defined GC_CACHE_ADAPT_POLICY3
-  numchanged = cacheAdapt_policy_hotest();
+  cacheAdapt_policy_hottest(coren);
 #elif defined GC_CACHE_ADAPT_POLICY4
-  numchanged = cacheAdapt_policy_dominate();
-#elif defined GC_CACHE_ADAPT_POLICY5
-  numchanged = cacheAdapt_policy_overload();
-#elif defined GC_CACHE_ADAPT_POLICY6
-  numchanged = cacheAdapt_policy_crowd();
+  cacheAdapt_policy_dominate(coren);
+//#elif defined GC_CACHE_ADAPT_POLICY5
+//  cacheAdapt_policy_overload(coren);
+//#elif defined GC_CACHE_ADAPT_POLICY6
+//  cacheAdapt_policy_crowd(coren);
 #endif
-  *gccachepolicytbl = numchanged;
 }
 
 // adapt the cache strategy for the mutator
 void cacheAdapt_mutator() {
-  int numchanged = *gccachepolicytbl;
+  BAMBOO_CACHE_MF();
   // check the changes and adapt them
-  int * tmp_p = gccachepolicytbl+1;
-  while(numchanged--) {
+  int * tmp_p = gccachepolicytbl;
+  unsigned int page_sva = gcbaseva;
+  for(; page_sva<gctopva; page_sva+=BAMBOO_PAGE_SIZE) {
     // read out the policy
-    int page_index = *tmp_p;
-    bamboo_cache_policy_t policy = (bamboo_cache_policy_t)(*(tmp_p+1));
+    bamboo_cache_policy_t policy = (bamboo_cache_policy_t)(*(tmp_p));
     // adapt the policy
-    bamboo_adapt_cache_policy(page_index*(BAMBOO_PAGE_SIZE)+gcbaseva, 
-        policy, BAMBOO_PAGE_SIZE);
-
-    tmp_p += 2;
+    if(policy.word != 0) {
+      bamboo_adapt_cache_policy(page_sva,policy,BAMBOO_PAGE_SIZE);
+    }
+    tmp_p += 1;
   }
 }
 
 void cacheAdapt_phase_client() {
-  WAITFORGCPHASE(PREFINISHPHASE);
+  WAITFORGCPHASE(CACHEPOLICYPHASE);
+  GC_PRINTF("Start cachepolicy phase\n");
+  cacheAdapt_decision(BAMBOO_NUM_OF_CORE);
+  // send cache-policy finish msg to core coordinator
+  send_msg_2(STARTUPCORE, GCFINISHCACHEPOLICY, BAMBOO_NUM_OF_CORE);
+  GC_PRINTF("Finish cachepolicy phase\n");
 
+  WAITFORGCPHASE(PREFINISHPHASE);
   GC_PRINTF("Start prefinish phase\n");
   // cache adapt phase
   cacheAdapt_mutator();
   cacheAdapt_gc(false);
   //send init finish msg to core coordinator
-  send_msg_2(STARTUPCORE, GCFINISHPREF, BAMBOO_NUM_OF_CORE, false);
+  send_msg_2(STARTUPCORE, GCFINISHPREF, BAMBOO_NUM_OF_CORE);
   GC_PRINTF("Finish prefinish phase\n");
   CACHEADAPT_SAMPING_RESET();
   if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) {
     // zero out the gccachesamplingtbl
     BAMBOO_MEMSET_WH(gccachesamplingtbl_local,0,size_cachesamplingtbl_local);  
-    BAMBOO_MEMSET_WH(gccachesamplingtbl_local_r,0,
-        size_cachesamplingtbl_local_r);
+    BAMBOO_MEMSET_WH(gccachesamplingtbl_local_r,0,size_cachesamplingtbl_local_r);
   }
 }
 
+extern unsigned long long gc_output_cache_policy_time;
+
 void cacheAdapt_phase_master() {
-  GCPROFILEITEM();
-  gcphase = PREFINISHPHASE;
-  gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
-  // Note: all cores should flush their runtime data including non-gc
-  //       cores
-  for(i = 1; i < NUMCORESACTIVE; ++i) {
-    // send start flush messages to all cores
-    gccorestatus[i] = 1;
-    send_msg_1(i, GCSTARTPREF, false);
-  }
+  GCPROFILE_ITEM();
+  unsigned long long tmpt = BAMBOO_GET_EXE_TIME();
+  CACHEADAPT_OUTPUT_CACHE_SAMPLING_R();
+  gc_output_cache_policy_time += (BAMBOO_GET_EXE_TIME()-tmpt);
+  // let all cores process the revised profile data in parallel and decide
+  // the cache policy for each page
+  gc_status_info.gcphase = CACHEPOLICYPHASE;
+  GC_SEND_MSG_1_TO_CLIENT(GCSTARTCACHEPOLICY);
+  GC_PRINTF("Start cachepolicy phase \n");
+  // cache adapt phase
+  cacheAdapt_decision(BAMBOO_NUM_OF_CORE);
+  GC_CHECK_ALL_CORE_STATUS(CACHEPOLICYPHASE==gc_status_info.gcphase);
+  BAMBOO_CACHE_MF();
+
+  // let all cores adopt the new policies
+  gc_status_info.gcphase = PREFINISHPHASE;
+  // Note: all cores should flush their runtime data including non-gc cores
+  GC_SEND_MSG_1_TO_CLIENT(GCSTARTPREF);
   GC_PRINTF("Start prefinish phase \n");
   // cache adapt phase
   cacheAdapt_mutator();
-  CACHEADPAT_OUTPUT_CACHE_POLICY();
   cacheAdapt_gc(false);
-
-  gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
-  while(PREFINISHPHASE == gcphase) {
-    // check the status of all cores
-    BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
-    if(gc_checkAllCoreStatus_I()) {
-      BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-      break;
-    }
-    BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-  }
+  GC_CHECK_ALL_CORE_STATUS(PREFINISHPHASE==gc_status_info.gcphase);
 
   CACHEADAPT_SAMPING_RESET();
   if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) {
     // zero out the gccachesamplingtbl
     BAMBOO_MEMSET_WH(gccachesamplingtbl_local,0,size_cachesamplingtbl_local);
-    BAMBOO_MEMSET_WH(gccachesamplingtbl_local_r,0,
-        size_cachesamplingtbl_local_r);
+    BAMBOO_MEMSET_WH(gccachesamplingtbl_local_r,0,size_cachesamplingtbl_local_r);
     BAMBOO_MEMSET_WH(gccachepolicytbl,0,size_cachepolicytbl);
   }
 }
 
 void gc_output_cache_sampling() {
+  //extern volatile bool gc_profile_flag;
+  //if(!gc_profile_flag) return;
   unsigned int page_index = 0;
   VA page_sva = 0;
   unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
   for(page_index = 0; page_index < page_num; page_index++) {
     page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
     unsigned int block = 0;
-    BLOCKINDEX(page_sva, &block);
+    BLOCKINDEX(block, (void *) page_sva);
     unsigned int coren = gc_block2core[block%(NUMCORES4GC*2)];
-    tprintf("va: %x page_index: %d host: %d\n",(int)page_sva,page_index,coren);
+    printf("%x,  %d,  %d,  ",(int)page_sva,page_index,coren);
     for(int i = 0; i < NUMCORESACTIVE; i++) {
-      int * local_tbl = (int *)((void *)gccachesamplingtbl
-          +size_cachesamplingtbl_local*i);
+      int * local_tbl = (int *)((void *)gccachesamplingtbl+size_cachesamplingtbl_local*i);
       int freq = local_tbl[page_index];
-      printf("%8d ",freq);
+      //if(freq != 0) {
+        printf("%d,  ", freq);
+      //}
     }
     printf("\n");
   }
@@ -624,24 +486,52 @@ void gc_output_cache_sampling() {
 } 
 
 void gc_output_cache_sampling_r() {
+  //extern volatile bool gc_profile_flag;
+  //if(!gc_profile_flag) return;
+  // TODO summary data
+  unsigned int sumdata[NUMCORESACTIVE][NUMCORESACTIVE];
+  for(int i = 0; i < NUMCORESACTIVE; i++) {
+    for(int j = 0; j < NUMCORESACTIVE; j++) {
+      sumdata[i][j] = 0;
+    }
+  }
+  tprintf("cache sampling_r \n");
   unsigned int page_index = 0;
   VA page_sva = 0;
   unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
   for(page_index = 0; page_index < page_num; page_index++) {
     page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
     unsigned int block = 0;
-    BLOCKINDEX(page_sva, &block);
+    BLOCKINDEX(block, (void *)page_sva);
     unsigned int coren = gc_block2core[block%(NUMCORES4GC*2)];
-    tprintf("va: %x page_index: %d host: %d\n",(int)page_sva,page_index,coren);
+    printf(" %x,  %d,  %d,  ",(int)page_sva,page_index,coren);
+    int accesscore = 0; // TODO
     for(int i = 0; i < NUMCORESACTIVE; i++) {
-      int * local_tbl = (int *)((void *)gccachesamplingtbl_r
-          +size_cachesamplingtbl_local_r*i);
+      int * local_tbl = (int *)((void *)gccachesamplingtbl_r+size_cachesamplingtbl_local_r*i);
       int freq = local_tbl[page_index]/BAMBOO_PAGE_SIZE;
-      printf("%8d ",freq);
+      printf("%d,  ", freq);
+      if(freq != 0) {
+        accesscore++;// TODO
+      }
+    }
+    if(accesscore!=0) {
+      for(int i = 0; i < NUMCORESACTIVE; i++) {
+        int * local_tbl = (int *)((void *)gccachesamplingtbl_r+size_cachesamplingtbl_local_r*i);
+        int freq = local_tbl[page_index]/BAMBOO_PAGE_SIZE;
+        sumdata[accesscore-1][i]+=freq;
+      }
     }
   
     printf("\n");
   }
+  // TODO print out the summary data
+  for(int i = 0; i < NUMCORESACTIVE; i++) {
+    printf("%d  ", i);
+    for(int j = 0; j < NUMCORESACTIVE; j++) {
+      printf(" %d  ", sumdata[j][i]);
+    }
+    printf("\n");
+  }
   printf("=================\n");
 } 
 #endif // GC_CACHE_ADAPT