From: jzhou Date: Wed, 10 Feb 2010 17:22:25 +0000 (+0000) Subject: bug fixing in multicore gc and add profiling code for gc X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=8581fa3cc30cf51e595c8e3c67a7208c3725bf48;p=IRC.git bug fixing in multicore gc and add profiling code for gc --- diff --git a/Robust/src/Analysis/Scheduling/ScheduleAnalysis.java b/Robust/src/Analysis/Scheduling/ScheduleAnalysis.java index 4334cb2d..0da216fe 100644 --- a/Robust/src/Analysis/Scheduling/ScheduleAnalysis.java +++ b/Robust/src/Analysis/Scheduling/ScheduleAnalysis.java @@ -419,7 +419,8 @@ public class ScheduleAnalysis { (cdname.equals("KMeans")) || (cdname.equals("ZTransform")) || (cdname.equals("TestRunner")) || - (cdname.equals("LinkList"))) { + (cdname.equals("LinkList")) || + (cdname.equals("BHRunner"))) { newRate = this.coreNum; } else if(cdname.equals("SentenceParser")) { newRate = 4; diff --git a/Robust/src/Runtime/MGCHash.c b/Robust/src/Runtime/MGCHash.c index 2155f673..97d8c890 100755 --- a/Robust/src/Runtime/MGCHash.c +++ b/Robust/src/Runtime/MGCHash.c @@ -66,7 +66,7 @@ void mgchashreset() { tmpptr=next; } } else {*/ - memset(mgc_table, '\0', sizeof(mgchashlistnode_t)*mgc_size); + BAMBOO_MEMSET_WH(mgc_table, '\0', sizeof(mgchashlistnode_t)*mgc_size); //} while(mgc_structs->next!=NULL) { mgcliststruct_t *next=mgc_structs->next; @@ -328,7 +328,7 @@ struct MGCHash * allocateMGCHash(int size, thisvar->bucket = (struct MGCNode *) RUNMALLOC(sizeof(struct MGCNode)*size); // zero out all the buckets - memset(thisvar->bucket, '\0', sizeof(struct MGCNode)*size); + BAMBOO_MEMSET_WH(thisvar->bucket, '\0', sizeof(struct MGCNode)*size); //Set data counts thisvar->num4conflicts = conflicts; return thisvar; diff --git a/Robust/src/Runtime/mem.c b/Robust/src/Runtime/mem.c index 3ae21fbe..9cc3da92 100644 --- a/Robust/src/Runtime/mem.c +++ b/Robust/src/Runtime/mem.c @@ -51,8 +51,8 @@ memalloc: BAMBOO_CLOSE_CRITICAL_SECTION_MEM(); void * alignedp = (void *)(BAMBOO_CACHE_LINE_SIZE+((int)p-1)&(~BAMBOO_CACHE_LINE_MASK)); - memset(p, -2, (alignedp - p)); - memset(alignedp + size, -2, p + isize - alignedp - size); + BAMBOO_MEMSET_WH(p, -2, (alignedp - p)); + BAMBOO_MEMSET_WH(alignedp + size, -2, p + isize - alignedp - size); return alignedp; } #else diff --git a/Robust/src/Runtime/multicoregarbage.c b/Robust/src/Runtime/multicoregarbage.c index ce6f6537..20a86eb5 100644 --- a/Robust/src/Runtime/multicoregarbage.c +++ b/Robust/src/Runtime/multicoregarbage.c @@ -99,12 +99,6 @@ inline void dumpSMem() { coren = gc_block2core[block%(NUMCORES4GC*2)]; } // compute core coordinate - /*int tmpcore = coren; - if((NUMCORES4GC==62) && (tmpcore > 5)) { - tmpcore+=2; - } - x = tmpcore/bamboo_width; - y = tmpcore%bamboo_width;*/ x = bamboo_cpu2coords[coren*2]; y = bamboo_cpu2coords[coren*2+1]; tprintf("==== %d, %d : core (%d,%d), saddr %x====\n", @@ -394,6 +388,17 @@ inline bool gc_checkCoreStatus() { return allStall; } +inline bool gc_checkAllCoreStatus() { + bool allStall = true; + for(int i = 0; i < NUMCORESACTIVE; ++i) { + if(gccorestatus[i] != 0) { + allStall = false; + break; + } // if(gccorestatus[i] != 0) + } // for(i = 0; i < NUMCORESACTIVE; ++i) + return allStall; +} + inline void checkMarkStatue() { #ifdef DEBUG BAMBOO_DEBUGPRINT(0xee01); @@ -409,7 +414,7 @@ inline void checkMarkStatue() { gcnumsendobjs[BAMBOO_NUM_OF_CORE] = gcself_numsendobjs; gcnumreceiveobjs[BAMBOO_NUM_OF_CORE] = gcself_numreceiveobjs; // check the status of all cores - bool allStall = gc_checkCoreStatus(); + bool allStall = 
gc_checkAllCoreStatus(); #ifdef DEBUG BAMBOO_DEBUGPRINT(0xee03); #endif @@ -427,26 +432,26 @@ inline void checkMarkStatue() { // reset the corestatus array too gccorestatus[BAMBOO_NUM_OF_CORE] = 1; waitconfirm = true; - numconfirm = NUMCORES4GC - 1; - for(i = 1; i < NUMCORES4GC; ++i) { + numconfirm = NUMCORESACTIVE - 1; + for(i = 1; i < NUMCORESACTIVE; ++i) { gccorestatus[i] = 1; // send mark phase finish confirm request msg to core i send_msg_1(i, GCMARKCONFIRM, false); - } // for(i = 1; i < NUMCORES4GC; ++i) + } // for(i = 1; i < NUMCORESACTIVE; ++i) } else { // check if the sum of send objs and receive obj are the same // yes->check if the info is the latest; no->go on executing int sumsendobj = 0; - for(i = 0; i < NUMCORES4GC; ++i) { + for(i = 0; i < NUMCORESACTIVE; ++i) { sumsendobj += gcnumsendobjs[i]; - } // for(i = 0; i < NUMCORES4GC; ++i) + } // for(i = 0; i < NUMCORESACTIVE; ++i) #ifdef DEBUG BAMBOO_DEBUGPRINT(0xee06); BAMBOO_DEBUGPRINT_REG(sumsendobj); #endif - for(i = 0; i < NUMCORES4GC; ++i) { + for(i = 0; i < NUMCORESACTIVE; ++i) { sumsendobj -= gcnumreceiveobjs[i]; - } // for(i = 0; i < NUMCORES4GC; ++i) + } // for(i = 0; i < NUMCORESACTIVE; ++i) #ifdef DEBUG BAMBOO_DEBUGPRINT(0xee07); BAMBOO_DEBUGPRINT_REG(sumsendobj); @@ -459,9 +464,9 @@ inline void checkMarkStatue() { // stop mark phase gcphase = COMPACTPHASE; // restore the gcstatus for all cores - for(i = 0; i < NUMCORES4GC; ++i) { + for(i = 0; i < NUMCORESACTIVE; ++i) { gccorestatus[i] = 1; - } // for(i = 0; i < NUMCORES4GC; ++i) + } // for(i = 0; i < NUMCORESACTIVE; ++i) } // if(0 == sumsendobj) } // if(!gcwaitconfirm) else() } // if(allStall) @@ -558,6 +563,11 @@ inline void initGC() { gcfilledblocks[i] = 0; gcstopblock[i] = 0; } // for(i = 0; i < NUMCORES4GC; ++i) + for(i = NUMCORES4GC; i < NUMCORESACTIVE; ++i) { + gccorestatus[i] = 1; + gcnumsendobjs[i] = 0; + gcnumreceiveobjs[i] = 0; + } gcheaptop = 0; gctopcore = 0; gctopblock = 0; @@ -606,8 +616,6 @@ inline void initGC() { freeMGCHash(gcforwardobjtbl); gcforwardobjtbl = allocateMGCHash(20, 3); - - memset(gcsmemtbl, '\0', sizeof(int)*gcnumblock); } // void initGC() // compute load balance for all cores @@ -733,6 +741,7 @@ inline bool cacheLObjs() { if((int)dst < (int)(gclobjtail2->lobjs[gclobjtailindex2])+size) { memmove(dst, gclobjtail2->lobjs[gclobjtailindex2], size); } else { + //BAMBOO_WRITE_HINT_CACHE(dst, size); memcpy(dst, gclobjtail2->lobjs[gclobjtailindex2], size); } #ifdef DEBUG @@ -750,7 +759,7 @@ inline bool cacheLObjs() { // NOTE: the free mem chunks should be maintained in an ordered linklist // the listtop param always specify current list tail -// update the gcsmemtbl to record current shared mem usage +// update the bmmboo_smemtbl to record current shared mem usage void updateSmemTbl(int coren, int localtop) { int ltopcore = 0; @@ -766,10 +775,10 @@ void updateSmemTbl(int coren, do{ toset = gc_core2block[2*coren+i]+(NUMCORES4GC*2)*j; if(toset < ltopcore) { - gcsmemtbl[toset]= + bamboo_smemtbl[toset]= (tosetnext == NULL) { - if(bamboo_free_mem_list->backuplist != NULL) { - tochange->next = bamboo_free_mem_list->backuplist; - bamboo_free_mem_list->backuplist = NULL; - } else { - tochange->next = - (struct freeMemItem *)RUNMALLOC(sizeof(struct freeMemItem)); - } - } // if(tochange->next == NULL) - tochange = tochange->next; - } else { - *sethead = true; - } // if(sethead) - tochange->ptr = ptr; - tochange->size = size; - BLOCKINDEX(ptr, &(tochange->startblock)); - BLOCKINDEX(ptr+size-1, &(tochange->endblock)); - // zero out all these spare memory - 
// note that, leave the mem starting from heaptop, as it caches large objs - // zero out these cache later when moving large obj - { - INTPTR tmp = tochange->ptr; - unsigned long long int size = tochange->size; - while(size > 0) { - int tsize = size>1024*1024*1024?1024*1024*1024:size; - memset(tmp, '\0', tsize); - size -= tsize; - tmp += tsize; - } - } - return tochange; -} // struct freeMemItem * addFreeMemItem(int,int,struct freeMemItem*,bool*, int) - inline void moveLObjs() { #ifdef DEBUG BAMBOO_DEBUGPRINT(0xea01); #endif + // zero out the smemtbl + BAMBOO_MEMSET_WH(bamboo_smemtbl, 0, sizeof(int)*gcnumblock); // find current heap top // flush all gcloads to indicate the real heap top on one core // previous it represents the next available ptr on a core @@ -839,7 +811,7 @@ inline void moveLObjs() { #ifdef DEBUG BAMBOO_DEBUGPRINT(0xea02); BAMBOO_DEBUGPRINT_REG(gcloads[0]); - BAMBOO_DEBUGPRINT_REG(gcsmemtbl[0]); + BAMBOO_DEBUGPRINT_REG(bamboo_smemtbl[0]); #endif for(int i = 1; i < NUMCORES4GC; i++) { int tmptop = 0; @@ -871,14 +843,14 @@ inline void moveLObjs() { int bound = 0; int i = 0; for(i = gcnumblock-1; i >= 0; i--) { - if(gcsmemtbl[i] > 0) { + if(bamboo_smemtbl[i] > 0) { break; } } if(i == -1) { tmpheaptop = gcbaseva; } else { - tmpheaptop = gcbaseva+gcsmemtbl[i]+((i 0) { // close current block, fill its header - memset(base, '\0', BAMBOO_CACHE_LINE_SIZE); + BAMBOO_MEMSET_WH(base, '\0', BAMBOO_CACHE_LINE_SIZE); *((int*)base) = cpysize + BAMBOO_CACHE_LINE_SIZE; - gcsmemtbl[b]+=BAMBOO_CACHE_LINE_SIZE; // add the size of the header + bamboo_smemtbl[b]+=BAMBOO_CACHE_LINE_SIZE; // add the size of the header cpysize = 0; base = tmpheaptop; if(remain == 0) { @@ -948,12 +920,13 @@ inline void moveLObjs() { if((int)gcheaptop < (int)(tmpheaptop)+size) { memmove(tmpheaptop, gcheaptop, size); } else { + //BAMBOO_WRITE_HINT_CACHE(tmpheaptop, size); memcpy(tmpheaptop, gcheaptop, size); } // fill the remaining space with -2 padding - memset(tmpheaptop+size, -2, isize-size); + BAMBOO_MEMSET_WH(tmpheaptop+size, -2, isize-size); // zero out original mem caching the lobj - memset(gcheaptop, '\0', size); + BAMBOO_MEMSET_WH(gcheaptop, '\0', size); #ifdef DEBUG BAMBOO_DEBUGPRINT(0xea05); BAMBOO_DEBUGPRINT_REG(gcheaptop); @@ -987,7 +960,7 @@ inline void moveLObjs() { } // if(host == BAMBOO_NUM_OF_CORE) else ... 
tmpheaptop += isize; - // set the gcsbstarttbl and gcsmemtbl + // set the gcsbstarttbl and bamboo_smemtbl int tmpsbs = 1+(isize-remain-1)/BAMBOO_SMEM_SIZE; for(int k = 1; k < tmpsbs; k++) { gcsbstarttbl[sb+k] = (INTPTR)(-1); @@ -996,7 +969,7 @@ inline void moveLObjs() { bound = (b 0) { // close current block, fill the header - memset(base, '\0', BAMBOO_CACHE_LINE_SIZE); + BAMBOO_MEMSET_WH(base, '\0', BAMBOO_CACHE_LINE_SIZE); *((int*)base) = cpysize + BAMBOO_CACHE_LINE_SIZE; - gcsmemtbl[b] += BAMBOO_CACHE_LINE_SIZE; // add the size of the header + bamboo_smemtbl[b] += BAMBOO_CACHE_LINE_SIZE; // add the size of the header } else { tmpheaptop -= BAMBOO_CACHE_LINE_SIZE; } @@ -1091,82 +1065,19 @@ inline void moveLObjs() { BAMBOO_DEBUGPRINT(0xea07); BAMBOO_DEBUGPRINT_REG(gcheaptop); #endif - - // update the free mem list - // create new free mem list according to gcsmemtbl - bool sethead = false; - if(bamboo_free_mem_list->head == NULL) { - bamboo_free_mem_list->head = bamboo_free_mem_list->backuplist; - bamboo_free_mem_list->backuplist = NULL; - } - struct freeMemItem * tochange = bamboo_free_mem_list->head; - if(tochange == NULL) { - bamboo_free_mem_list->head = tochange = - (struct freeMemItem *)RUNMALLOC(sizeof(struct freeMemItem)); - tochange->next = NULL; - } - int startptr = 0; - size = 0; - bound = BAMBOO_SMEM_SIZE_L; - for(i = 0; i < gcnumblock-bamboo_reserved_smem; i++) { - if(gcsmemtbl[i] < bound) { - if(gcsmemtbl[i] == 0) { - // blank one - if(startptr == 0) { - // a start of a new free mem chunk - startptr = gcbaseva+((inext != NULL) { - struct freeMemItem * blist = NULL; - if(bamboo_free_mem_list->backuplist != NULL) { - blist = tochange->next; + + bamboo_free_block = 0; + int tbound = 0; + do { + tbound = (bamboo_free_blockbackuplist = tochange->next; - blist = bamboo_free_mem_list->backuplist->next; - bamboo_free_mem_list->backuplist->next = NULL; + // the first non-full partition + break; } - tochange->next = NULL; - while(blist != NULL) { - struct freeMemItem * tmp = blist; - blist = blist->next; - RUNFREE(tmp); - } // if(blist != NULL) - } - + } while(true); #ifdef DEBUG BAMBOO_DEBUGPRINT(0xea08); BAMBOO_DEBUGPRINT_REG(gcheaptop); @@ -1232,9 +1143,6 @@ inline void tomark(struct garbagelist * stackptr) { #endif for(i=0; isize; i++) { if(stackptr->array[i] != NULL) { - //BAMBOO_START_CRITICAL_SECTION(); - //gc_enqueue_I(stackptr->array[i]); - //BAMBOO_CLOSE_CRITICAL_SECTION(); markObj(stackptr->array[i]); } } @@ -1255,9 +1163,6 @@ inline void tomark(struct garbagelist * stackptr) { struct ObjectHash * set=parameter->objectset; struct ObjectNode * ptr=set->listhead; while(ptr!=NULL) { - //BAMBOO_START_CRITICAL_SECTION(); - //gc_enqueue_I((void *)ptr->key); - //BAMBOO_CLOSE_CRITICAL_SECTION(); markObj((void *)ptr->key); ptr=ptr->lnext; } @@ -1271,9 +1176,6 @@ inline void tomark(struct garbagelist * stackptr) { BAMBOO_DEBUGPRINT(0xe504); #endif for(i=0; inumParameters; i++) { - //BAMBOO_START_CRITICAL_SECTION(); - //gc_enqueue_I(currtpd->parameterArray[i]); - //BAMBOO_CLOSE_CRITICAL_SECTION(); markObj(currtpd->parameterArray[i]); } } @@ -1288,9 +1190,6 @@ inline void tomark(struct garbagelist * stackptr) { struct taskparamdescriptor *tpd=ptr->src; int i; for(i=0; inumParameters; i++) { - //BAMBOO_START_CRITICAL_SECTION(); - //gc_enqueue_I(tpd->parameterArray[i]); - //BAMBOO_CLOSE_CRITICAL_SECTION(); markObj(tpd->parameterArray[i]); } ptr=ptr->inext; @@ -1305,9 +1204,6 @@ inline void tomark(struct garbagelist * stackptr) { while(tmpobjptr != NULL) { struct transObjInfo * objInfo = 
(struct transObjInfo *)(tmpobjptr->objectptr); - //BAMBOO_START_CRITICAL_SECTION(); - //gc_enqueue_I(objInfo->objptr); - //BAMBOO_CLOSE_CRITICAL_SECTION(); markObj(objInfo->objptr); tmpobjptr = getNextQueueItem(tmpobjptr); } @@ -1320,9 +1216,6 @@ inline void tomark(struct garbagelist * stackptr) { while(item != NULL) { struct transObjInfo * totransobj = (struct transObjInfo *)(item->objectptr); - //BAMBOO_START_CRITICAL_SECTION(); - //gc_enqueue_I(totransobj->objptr); - //BAMBOO_CLOSE_CRITICAL_SECTION(); markObj(totransobj->objptr); item = getNextQueueItem(item); } // while(item != NULL) @@ -1332,10 +1225,8 @@ inline void tomark(struct garbagelist * stackptr) { #endif // enqueue lock related info for(i = 0; i < runtime_locklen; ++i) { - //gc_enqueue_I((void *)(runtime_locks[i].redirectlock)); markObj((void *)(runtime_locks[i].redirectlock)); if(runtime_locks[i].value != NULL) { - //gc_enqueue_I((void *)(runtime_locks[i].value)); markObj((void *)(runtime_locks[i].value)); } } @@ -1596,10 +1487,6 @@ inline void compact2Heaptop() { BAMBOO_DEBUGPRINT_REG(b); BAMBOO_DEBUGPRINT_REG(remain); #endif - /*if((gctopcore == STARTUPCORE) && (b == 0)) { - remain -= gcreservedsb*BAMBOO_SMEM_SIZE; - p += gcreservedsb*BAMBOO_SMEM_SIZE; - }*/ for(int i = 0; i < NUMCORES4GC; i++) { BAMBOO_START_CRITICAL_SECTION(); if((gccorestatus[i] != 0) && (gcrequiredmems[i] > 0)) { @@ -1771,12 +1658,18 @@ innernextSBlock: orig->blockbase = orig->base; orig->sblockindex = (orig->blockbase-BAMBOO_BASE_VA)/BAMBOO_SMEM_SIZE; sbchanged = true; + int blocknum = 0; + BLOCKINDEX(orig->base, &blocknum); + if(bamboo_smemtbl[blocknum] == 0) { + // goto next block + goto innernextSBlock; + } } else if(0 == (orig->blockbase%BAMBOO_SMEM_SIZE)) { orig->sblockindex += 1; sbchanged = true; } // if((orig->blockbase >= orig->bound) || (orig->ptr >= orig->bound)... 
- // check if this sblock should be omitted or have special start point + // check if this sblock should be skipped or have special start point if(gcsbstarttbl[orig->sblockindex] == -1) { // goto next sblock #ifdef DEBUG @@ -1951,10 +1844,10 @@ innermoveobj: // check to see if remaining space is enough if(to->top + isize > to->bound) { // fill 0 indicating the end of this block - memset(to->ptr, '\0', to->bound - to->top); + BAMBOO_MEMSET_WH(to->ptr, '\0', to->bound - to->top); // fill the header of this block and then go to next block to->offset += to->bound - to->top; - memset(to->base, '\0', BAMBOO_CACHE_LINE_SIZE); + BAMBOO_MEMSET_WH(to->base, '\0', BAMBOO_CACHE_LINE_SIZE); (*((int*)(to->base))) = to->offset; nextBlock(to); if(stopblock == to->numblocks) { @@ -1969,10 +1862,11 @@ innermoveobj: if((int)(orig->ptr) < (int)(to->ptr)+size) { memmove(to->ptr, orig->ptr, size); } else { + //BAMBOO_WRITE_HINT_CACHE(to->ptr, size); memcpy(to->ptr, orig->ptr, size); } // fill the remaining space with -2 - memset(to->ptr+size, -2, isize-size); + BAMBOO_MEMSET_WH(to->ptr+size, -2, isize-size); } // store mapping info BAMBOO_START_CRITICAL_SECTION(); @@ -1981,7 +1875,6 @@ innermoveobj: //MGCHashadd_I(gcpointertbl, orig->ptr, to->ptr); BAMBOO_CLOSE_CRITICAL_SECTION(); //} - #ifdef DEBUG BAMBOO_DEBUGPRINT(0xcdce); BAMBOO_DEBUGPRINT_REG(orig->ptr); @@ -1993,7 +1886,7 @@ innermoveobj: to->top += isize; if(to->top == to->bound) { // fill the header of this block and then go to next block - memset(to->base, '\0', BAMBOO_CACHE_LINE_SIZE); + BAMBOO_MEMSET_WH(to->base, '\0', BAMBOO_CACHE_LINE_SIZE); (*((int*)(to->base))) = to->offset; nextBlock(to); } @@ -2095,7 +1988,7 @@ innercompact: // if no objs have been compact, do nothing, // otherwise, fill the header of this block if(to->offset > BAMBOO_CACHE_LINE_SIZE) { - memset(to->base, '\0', BAMBOO_CACHE_LINE_SIZE); + BAMBOO_MEMSET_WH(to->base, '\0', BAMBOO_CACHE_LINE_SIZE); (*((int*)(to->base))) = to->offset; } else { to->offset = 0; @@ -2248,6 +2141,9 @@ inline void * flushObj(void * objptr) { #ifdef DEBUG BAMBOO_DEBUGPRINT(0xe401); #endif + if(objptr == NULL) { + return NULL; + } void * dstptr = NULL; if(ISSHAREDOBJ(objptr)) { #ifdef DEBUG @@ -2437,8 +2333,10 @@ inline void flush(struct garbagelist * stackptr) { #ifdef DEBUG BAMBOO_DEBUGPRINT_REG(objptr); #endif - ((void **)(((char *)&ao->___length___)+sizeof(int)))[j] = - flushObj(objptr); + if(objptr != NULL) { + ((void **)(((char *)&ao->___length___)+sizeof(int)))[j] = + flushObj(objptr); + } } } else { #ifdef DEBUG @@ -2452,11 +2350,12 @@ inline void flush(struct garbagelist * stackptr) { #endif unsigned int offset=pointer[i]; void * objptr=*((void **)(((char *)ptr)+offset)); - #ifdef DEBUG BAMBOO_DEBUGPRINT_REG(objptr); #endif - *((void **)(((char *)ptr)+offset)) = flushObj(objptr); + if(objptr != NULL) { + *((void **)(((char *)ptr)+offset)) = flushObj(objptr); + } } // for(i=1; i<=size; i++) } // if (pointer==0) else if (((INTPTR)pointer)==1) else () // restore the mark field, indicating that this obj has been flushed @@ -2477,7 +2376,6 @@ inline void flush(struct garbagelist * stackptr) { BAMBOO_DEBUGPRINT(0xe309); #endif void * ptr = gc_lobjdequeue(NULL, NULL); - //if(ISSHAREDOBJ(ptr)) { void * tptr = flushObj(ptr); #ifdef DEBUG BAMBOO_DEBUGPRINT(0xe30a); @@ -2488,8 +2386,7 @@ inline void flush(struct garbagelist * stackptr) { if(tptr != NULL) { ptr = tptr; } - //} - if(/*(!ISSHAREDOBJ(ptr)) || */(((int *)(ptr))[6] == COMPACTED)) { + if(((int *)(ptr))[6] == COMPACTED) { int type = ((int 
*)(ptr))[0]; // scan all pointers in ptr unsigned INTPTR * pointer; @@ -2518,8 +2415,10 @@ inline void flush(struct garbagelist * stackptr) { #ifdef DEBUG BAMBOO_DEBUGPRINT_REG(objptr); #endif - ((void **)(((char *)&ao->___length___)+sizeof(int)))[j] = - flushObj(objptr); + if(objptr != NULL) { + ((void **)(((char *)&ao->___length___)+sizeof(int)))[j] = + flushObj(objptr); + } } } else { #ifdef DEBUG @@ -2537,13 +2436,13 @@ inline void flush(struct garbagelist * stackptr) { #ifdef DEBUG BAMBOO_DEBUGPRINT_REG(objptr); #endif - *((void **)(((char *)ptr)+offset)) = flushObj(objptr); + if(objptr != NULL) { + *((void **)(((char *)ptr)+offset)) = flushObj(objptr); + } } // for(i=1; i<=size; i++) } // if (pointer==0) else if (((INTPTR)pointer)==1) else () // restore the mark field, indicating that this obj has been flushed - //if(ISSHAREDOBJ(ptr)) { - ((int *)(ptr))[6] = INIT; - //} + ((int *)(ptr))[6] = INIT; } // if(((int *)(ptr))[6] == COMPACTED) } // while(gc_lobjmoreItems()) #ifdef DEBUG @@ -2564,12 +2463,9 @@ inline void flush(struct garbagelist * stackptr) { inline void gc_collect(struct garbagelist * stackptr) { // core collector routine while(true) { - //BAMBOO_START_CRITICAL_SECTION(); if(INITPHASE == gcphase) { - //BAMBOO_CLOSE_CRITICAL_SECTION(); break; } - //BAMBOO_CLOSE_CRITICAL_SECTION(); } #ifdef RAWPATH // TODO GC_DEBUG tprintf("Do initGC\n"); @@ -2578,12 +2474,9 @@ inline void gc_collect(struct garbagelist * stackptr) { //send init finish msg to core coordinator send_msg_2(STARTUPCORE, GCFINISHINIT, BAMBOO_NUM_OF_CORE, false); while(true) { - //BAMBOO_START_CRITICAL_SECTION(); if(MARKPHASE == gcphase) { - //BAMBOO_CLOSE_CRITICAL_SECTION(); break; } - //BAMBOO_CLOSE_CRITICAL_SECTION(); } #ifdef RAWPATH // TODO GC_DEBUG tprintf("Start mark phase\n"); @@ -2597,12 +2490,9 @@ inline void gc_collect(struct garbagelist * stackptr) { tprintf("Finish compact phase\n"); #endif while(true) { - //BAMBOO_START_CRITICAL_SECTION(); if(FLUSHPHASE == gcphase) { - //BAMBOO_CLOSE_CRITICAL_SECTION(); break; } - //BAMBOO_CLOSE_CRITICAL_SECTION(); } #ifdef RAWPATH // TODO GC_DEBUG tprintf("Start flush phase\n"); @@ -2613,12 +2503,57 @@ inline void gc_collect(struct garbagelist * stackptr) { #endif while(true) { - //BAMBOO_START_CRITICAL_SECTION(); if(FINISHPHASE == gcphase) { - //BAMBOO_CLOSE_CRITICAL_SECTION(); break; } - //BAMBOO_CLOSE_CRITICAL_SECTION(); + } +#ifdef RAWPATH // TODO GC_DEBUG + tprintf("Finish gc!\n"); +#endif +} // void gc_collect(struct garbagelist * stackptr) + +inline void gc_nocollect(struct garbagelist * stackptr) { + while(true) { + if(INITPHASE == gcphase) { + break; + } + } +#ifdef RAWPATH // TODO GC_DEBUG + tprintf("Do initGC\n"); +#endif + initGC(); + //send init finish msg to core coordinator + send_msg_2(STARTUPCORE, GCFINISHINIT, BAMBOO_NUM_OF_CORE, false); + while(true) { + if(MARKPHASE == gcphase) { + break; + } + } +#ifdef RAWPATH // TODO GC_DEBUG + tprintf("Start mark phase\n"); +#endif + mark(true, stackptr); +#ifdef RAWPATH // TODO GC_DEBUG + tprintf("Finish mark phase, wait for flush\n"); +#endif + // non-gc core collector routine + while(true) { + if(FLUSHPHASE == gcphase) { + break; + } + } +#ifdef RAWPATH // TODO GC_DEBUG + tprintf("Start flush phase\n"); +#endif + flush(stackptr); +#ifdef RAWPATH // TODO GC_DEBUG + tprintf("Finish flush phase\n"); +#endif + + while(true) { + if(FINISHPHASE == gcphase) { + break; + } } #ifdef RAWPATH // TODO GC_DEBUG tprintf("Finish gc!\n"); @@ -2643,6 +2578,10 @@ inline void gc(struct garbagelist * stackptr) { return; } 
+#ifdef GC_PROFILE + gc_profileStart(); +#endif + #ifdef RAWPATH // TODO GC_DEBUG tprintf("start gc! \n"); //dumpSMem(); @@ -2652,7 +2591,8 @@ inline void gc(struct garbagelist * stackptr) { waitconfirm = false; waitconfirm = 0; gcphase = INITPHASE; - for(i = 1; i < NUMCORES4GC; i++) { + // Note: all cores need to init gc including non-gc cores + for(i = 1; i < NUMCORESACTIVE/*NUMCORES4GC*/; i++) { // send GC init messages to all cores send_msg_1(i, GCSTARTINIT, false); } @@ -2666,20 +2606,21 @@ inline void gc(struct garbagelist * stackptr) { gccorestatus[BAMBOO_NUM_OF_CORE] = 0; while(true) { - BAMBOO_START_CRITICAL_SECTION(); - if(gc_checkCoreStatus()) { - BAMBOO_CLOSE_CRITICAL_SECTION(); + if(gc_checkAllCoreStatus()) { break; } - BAMBOO_CLOSE_CRITICAL_SECTION(); } +#ifdef GC_PROFILE + gc_profileItem(); +#endif #ifdef RAWPATH // TODO GC_DEBUG tprintf("Start mark phase \n"); #endif // all cores have finished compacting // restore the gcstatus of all cores + // Note: all cores have to do mark including non-gc cores gccorestatus[BAMBOO_NUM_OF_CORE] = 1; - for(i = 1; i < NUMCORES4GC; ++i) { + for(i = 1; i < NUMCORESACTIVE/*NUMCORES4GC*/; ++i) { gccorestatus[i] = 1; // send GC start messages to all cores send_msg_1(i, GCSTART, false); @@ -2697,6 +2638,7 @@ inline void gc(struct garbagelist * stackptr) { checkMarkStatue(); } // while(MARKPHASE == gcphase) // send msgs to all cores requiring large objs info + // Note: only need to ask gc cores, non-gc cores do not host any objs numconfirm = NUMCORES4GC - 1; for(i = 1; i < NUMCORES4GC; ++i) { send_msg_1(i, GCLOBJREQUEST, false); @@ -2711,6 +2653,9 @@ inline void gc(struct garbagelist * stackptr) { if(gcheaptop < gcmarkedptrbound) { gcheaptop = gcmarkedptrbound; } +#ifdef GC_PROFILE + gc_profileItem(); +#endif #ifdef RAWPATH // TODO GC_DEBUG tprintf("prepare to cache large objs \n"); //dumpSMem(); @@ -2769,6 +2714,10 @@ inline void gc(struct garbagelist * stackptr) { gcrequiredmems[i] = 0; } +#ifdef GC_PROFILE + gc_profileItem(); +#endif + // compact phase bool finalcompact = false; // initialize pointers for comapcting @@ -2859,7 +2808,9 @@ inline void gc(struct garbagelist * stackptr) { } // if(gctomove) } // while(COMPACTPHASE == gcphase) - +#ifdef GC_PROFILE + gc_profileItem(); +#endif #ifdef RAWPATH // TODO GC_DEBUG tprintf("prepare to move large objs \n"); //dumpSMem(); @@ -2876,12 +2827,16 @@ inline void gc(struct garbagelist * stackptr) { gcphase = FLUSHPHASE; gccorestatus[BAMBOO_NUM_OF_CORE] = 1; - for(i = 1; i < NUMCORES4GC; ++i) { + // Note: all cores should flush their runtime data including non-gc + // cores + for(i = 1; i < NUMCORESACTIVE/*NUMCORES4GC*/; ++i) { // send start flush messages to all cores gccorestatus[i] = 1; send_msg_1(i, GCSTARTFLUSH, false); } - +#ifdef GC_PROFILE + gc_profileItem(); +#endif #ifdef RAWPATH // TODO GC_DEBUG tprintf("Start flush phase \n"); #endif @@ -2890,14 +2845,26 @@ inline void gc(struct garbagelist * stackptr) { gccorestatus[BAMBOO_NUM_OF_CORE] = 0; while(FLUSHPHASE == gcphase) { // check the status of all cores - if(gc_checkCoreStatus()) { + if(gc_checkAllCoreStatus()) { break; } } // while(FLUSHPHASE == gcphase) gcphase = FINISHPHASE; + // invalidate all shared mem pointers + // put it here as it takes time to inform all the other cores to + // finish gc and it might cause problem when some core resumes + // mutator earlier than the other cores + bamboo_cur_msp = NULL; + bamboo_smem_size = 0; + gcflag = false; + gcprocessing = false; + +#ifdef GC_PROFILE + gc_profileEnd(); +#endif 
gccorestatus[BAMBOO_NUM_OF_CORE] = 1; - for(i = 1; i < NUMCORES4GC; ++i) { + for(i = 1; i < NUMCORESACTIVE/*NUMCORES4GC*/; ++i) { // send gc finish messages to all cores send_msg_1(i, GCFINISH, false); gccorestatus[i] = 1; @@ -2906,18 +2873,116 @@ inline void gc(struct garbagelist * stackptr) { tprintf("gc finished \n"); //dumpSMem(); #endif - } else { + } else if(BAMBOO_NUM_OF_CORE < NUMCORES4GC) { gcprocessing = true; gc_collect(stackptr); - } - // invalidate all shared mem pointers - bamboo_cur_msp = NULL; - bamboo_smem_size = 0; + // invalidate all shared mem pointers + bamboo_cur_msp = NULL; + bamboo_smem_size = 0; - gcflag = false; - gcprocessing = false; + gcflag = false; + gcprocessing = false; + } else { + // not a gc core, should wait for gcfinish msg + gcprocessing = true; + gc_nocollect(stackptr); + // invalidate all shared mem pointers + bamboo_cur_msp = NULL; + bamboo_smem_size = 0; + + gcflag = false; + gcprocessing = false; + } } // void gc(struct garbagelist * stackptr) +#ifdef GC_PROFILE +inline void gc_profileStart(void) { + if(!gc_infoOverflow) { + GCInfo* gcInfo = RUNMALLOC(sizeof(struct gc_info)); + gc_infoArray[gc_infoIndex] = gcInfo; + gcInfo->index = 1; + gcInfo->time[0] = BAMBOO_GET_EXE_TIME(); + } +} + +inline void gc_profileItem(void) { + if(!gc_infoOverflow) { + GCInfo* gcInfo = gc_infoArray[gc_infoIndex]; + gcInfo->time[gcInfo->index++] = BAMBOO_GET_EXE_TIME(); + } +} + +inline void gc_profileEnd(void) { + if(!gc_infoOverflow) { + GCInfo* gcInfo = gc_infoArray[gc_infoIndex]; + gcInfo->time[gcInfo->index++] = BAMBOO_GET_EXE_TIME(); + gc_infoIndex++; + if(gc_infoIndex == GCINFOLENGTH) { + gc_infoOverflow = true; + //taskInfoIndex = 0; + } + } +} + +// output the profiling data +void gc_outputProfileData() { +#ifdef USEIO + int i,j; + unsigned long long totalgc = 0; + + //printf("Start Time, End Time, Duration\n"); + // output task related info + for(i = 0; i < gc_infoIndex; i++) { + GCInfo * gcInfo = gc_infoArray[i]; + unsigned long long tmp = 0; + for(j = 0; j < gcInfo->index; j++) { + printf("%lld(%lld), ", gcInfo->time[j], (gcInfo->time[j]-tmp)); + tmp = gcInfo->time[j]; + } + tmp = (tmp-gcInfo->time[0]); + printf(" ++ %lld \n", tmp); + totalgc += tmp; + } + + if(gc_infoOverflow) { + printf("Caution: gc info overflow!\n"); + } + + printf("\n\n total gc time: %lld \n", totalgc); +#else + int i = 0; + int j = 0; + unsigned long long totalgc = 0; + + BAMBOO_DEBUGPRINT(0xdddd); + // output task related info + for(i= 0; i < gc_infoIndex; i++) { + GCInfo * gcInfo = gc_infoArray[i]; + unsigned long long tmp = 0; + BAMBOO_DEBUGPRINT(0xddda); + for(j = 0; j < gcInfo->index; j++) { + BAMBOO_DEBUGPRINT(gcInfo->time[j]); + BAMBOO_DEBUGPRINT(gcInfo->time[j]-tmp); + BAMBOO_DEBUGPRINT(0xdddb); + tmp = gcInfo->time[j]; + } + tmp = (tmp-gcInfo->time[0]); + BAMBOO_DEBUGPRINT_REG(tmp); + BAMBOO_DEBUGPRINT(0xdddc); + totalgc += tmp; + } + BAMBOO_DEBUGPRINT(0xdddd); + BAMBOO_DEBUGPRINT_REG(totalgc); + + if(gc_infoOverflow) { + BAMBOO_DEBUGPRINT(0xefee); + } + + BAMBOO_DEBUGPRINT(0xeeee); +#endif +} +#endif // #ifdef GC_PROFILE + #endif diff --git a/Robust/src/Runtime/multicoregarbage.h b/Robust/src/Runtime/multicoregarbage.h index 95329471..08ba12bc 100644 --- a/Robust/src/Runtime/multicoregarbage.h +++ b/Robust/src/Runtime/multicoregarbage.h @@ -13,12 +13,28 @@ #ifdef GC_DEBUG #define BAMBOO_SMEM_SIZE_L (BAMBOO_SMEM_SIZE * 2) #else -#define BAMBOO_SMEM_SIZE_L (32 * BAMBOO_SMEM_SIZE) +#define BAMBOO_SMEM_SIZE_L (2 * BAMBOO_SMEM_SIZE) #endif -#define BAMBOO_LARGE_SMEM_BOUND 
(BAMBOO_SMEM_SIZE_L*NUMCORES4GC) // NUMCORES=62 +#define BAMBOO_LARGE_SMEM_BOUND (BAMBOO_SMEM_SIZE_L*NUMCORES4GC) + // let each gc core to have one big block, this is very important + // for the computation of NUMBLOCKS(s, n), DO NOT change this! #define NUMPTRS 100 +// for GC profile +#ifdef GC_PROFILE +#define GCINFOLENGTH 100 + +typedef struct gc_info { + unsigned long long time[7]; + int index; +} GCInfo; + +GCInfo * gc_infoArray[GCINFOLENGTH]; +int gc_infoIndex; +bool gc_infoOverflow; +#endif + typedef enum { INIT = 0, // 0 DISCOVERED, // 1 @@ -44,11 +60,11 @@ volatile GCPHASETYPE gcphase; // indicating GC phase int gccurr_heaptop; struct MGCHash * gcforwardobjtbl; // cache forwarded objs in mark phase // for mark phase termination -int gccorestatus[NUMCORES4GC]; // records status of each core - // 1: running gc - // 0: stall -int gcnumsendobjs[NUMCORES4GC]; // records how many objects sent out -int gcnumreceiveobjs[NUMCORES4GC]; // records how many objects received +int gccorestatus[NUMCORESACTIVE]; // records status of each core + // 1: running gc + // 0: stall +int gcnumsendobjs[NUMCORESACTIVE]; // records how many objects sent out +int gcnumreceiveobjs[NUMCORESACTIVE]; // records how many objects received bool gcbusystatus; int gcself_numsendobjs; int gcself_numreceiveobjs; @@ -90,10 +106,6 @@ int gcreservedsb; // number of reserved sblock for sbstarttbl int gcnumblock; // number of total blocks in the shared mem int gcbaseva; // base va for shared memory without reserved sblocks -// table recording the number of used bytes in each block -// Note: this table resides on master core's local heap -int * gcsmemtbl; - #define ISSHAREDOBJ(p) \ ((((int)p)>gcbaseva)&&(((int)p)<(gcbaseva+(BAMBOO_SHARED_MEM_SIZE)))) @@ -163,6 +175,7 @@ int * gcsmemtbl; inline void gc(struct garbagelist * stackptr); // core coordinator routine inline void gc_collect(struct garbagelist* stackptr);//core collector routine +inline void gc_nocollect(struct garbagelist* stackptr);//non-gc core collector routine inline void transferMarkResults_I(); inline void gc_enqueue_I(void *ptr); inline void gc_lobjenqueue_I(void *ptr, int length, int host); @@ -176,5 +189,12 @@ inline void * gc_lobjdequeue4(int * length, int * host); inline int gc_lobjmoreItems4(); inline void gc_lobjqueueinit4(); +#ifdef GC_PROFILE +INLINE void gc_profileStart(void); +INLINE void gc_profileItem(void); +INLINE void gc_profileEnd(void); +void gc_outputProfileData(); +#endif + #endif diff --git a/Robust/src/Runtime/multicoreruntime.h b/Robust/src/Runtime/multicoreruntime.h index 14364342..bdf4ab85 100644 --- a/Robust/src/Runtime/multicoreruntime.h +++ b/Robust/src/Runtime/multicoreruntime.h @@ -236,10 +236,10 @@ struct Queue * totransobjqueue; // queue to hold objs to be transferred #define BAMBOO_SMEM_SIZE (64 * 64) // (BAMBOO_PAGE_SIZE) #define BAMBOO_SHARED_MEM_SIZE ((BAMBOO_PAGE_SIZE) * (BAMBOO_NUM_PAGES)) #else -#define BAMBOO_NUM_PAGES (64 * 1024) //(64 * 4 * 0.75) //(1024 * 1024 * 3.5) 3G +#define BAMBOO_NUM_PAGES (15 * 1024) //(64 * 4 * 0.75) //(1024 * 1024 * 3.5) 3G #define BAMBOO_PAGE_SIZE (16 * 1024)// * 1024) // (4096) #define BAMBOO_SMEM_SIZE (16 * 1024) -#define BAMBOO_SHARED_MEM_SIZE (1024 * 1024 * 1024) +#define BAMBOO_SHARED_MEM_SIZE (1024 * 1024 * 240) //(1024 * 1024 * 1024) //(3.0 * 1024 * 1024 * 1024) // 3G// ((BAMBOO_PAGE_SIZE) * (BAMBOO_NUM_PAGES)) #endif @@ -272,7 +272,13 @@ struct freeMemList { // only maintain 1 fremmMemItem }; -struct freeMemList * bamboo_free_mem_list; +// table recording the number of allocated 
bytes on each block +// Note: this table resides on the bottom of the shared heap for all cores +// to access +int * bamboo_smemtbl; +int bamboo_free_block; +//bool bamboo_smem_flushed; +//struct freeMemList * bamboo_free_mem_list; int bamboo_reserved_smem; // reserved blocks on the top of the shared heap // e.g. 20% of the heap and should not be allocated // otherwise gc is invoked @@ -398,6 +404,8 @@ INLINE void send_msg_6(int targetcore, unsigned long n4, unsigned long n5, bool isinterrupton); +INLINE void cache_msg_1(int targetcore, + unsigned long n0); INLINE void cache_msg_2(int targetcore, unsigned long n0, unsigned long n1); @@ -478,6 +486,11 @@ void outputProfileData(); // BAMBOO_MSG_AVAIL(): checking if there are msgs coming in // // BAMBOO_GCMSG_AVAIL(): checking if there are gcmsgs coming in // // BAMBOO_GET_EXE_TIME(): rountine to get current clock cycle number // +// BAMBOO_MEMSET_WH(x, y, z): memset the specified region of memory (start // +// address x, size z) to value y with write // +// hint, the processor will not fetch the // +// current content of the memory and directly // +// write // // // // runtime_arch.h should also define following global parameters: // // bamboo_cpu2coords: map the cpu # to (x,y) coordinates // diff --git a/Robust/src/Runtime/multicoretask.c b/Robust/src/Runtime/multicoretask.c index 0acb6cab..6d1ee763 100644 --- a/Robust/src/Runtime/multicoretask.c +++ b/Robust/src/Runtime/multicoretask.c @@ -53,17 +53,23 @@ void initruntimedata() { // initialize the profile data arrays profilestatus[i] = 1; #endif - } // for(i = 0; i < NUMCORESACTIVE; ++i) #ifdef MULTICORE_GC - for(i = 0; i < NUMCORES4GC; ++i) { gccorestatus[i] = 1; gcnumsendobjs[i] = 0; gcnumreceiveobjs[i] = 0; +#endif + } // for(i = 0; i < NUMCORESACTIVE; ++i) +#ifdef MULTICORE_GC + for(i = 0; i < NUMCORES4GC; ++i) { gcloads[i] = 0; gcrequiredmems[i] = 0; gcstopblock[i] = 0; gcfilledblocks[i] = 0; } // for(i = 0; i < NUMCORES4GC; ++i) +#ifdef GC_PROFILE + gc_infoIndex = 0; + gc_infoOverflow = false; +#endif #endif numconfirm = 0; waitconfirm = false; @@ -119,7 +125,8 @@ void initruntimedata() { gcmovepending = 0; gcblock2fill = 0; gcsbstarttbl = BAMBOO_BASE_VA; - gcsmemtbl = RUNMALLOC_I(sizeof(int)*gcnumblock); + bamboo_smemtbl = (void *)gcsbstarttbl + + (BAMBOO_SHARED_MEM_SIZE/BAMBOO_SMEM_SIZE)*sizeof(INTPTR); #else // create the lock table, lockresult table and obj queue locktable.size = 20; @@ -169,9 +176,6 @@ void disruntimedata() { freeRuntimeHash(gcpointertbl); //freeMGCHash(gcpointertbl); freeMGCHash(gcforwardobjtbl); - if(gcsmemtbl != NULL) { - RUNFREE(gcsmemtbl); - } #else freeRuntimeHash(lockRedirectTbl); freeRuntimeHash(objRedirectLockTbl); @@ -446,6 +450,13 @@ void checkCoreStatus() { } // if(!allStall) } // while(true) #endif + + // gc_profile mode, ourput gc prfiling data +#ifdef MULTICORE_GC +#ifdef GC_PROFILE + gc_outputProfileData(); +#endif // #ifdef GC_PROFILE +#endif // #ifdef MULTICORE_GC disruntimedata(); terminate(); // All done. 
} // if(!waitconfirm) @@ -1203,181 +1214,160 @@ inline void addNewObjInfo(void * nobj) { #endif #ifdef MULTICORE_GC -struct freeMemItem * findFreeMemChunk_I(int coren, - int isize, - int * tofindb) { - struct freeMemItem * freemem = bamboo_free_mem_list->head; - struct freeMemItem * prev = NULL; +void * localmalloc_I(int coren, + int isize, + int * allocsize) { + void * mem = NULL; int i = 0; int j = 0; - *tofindb = gc_core2block[2*coren+i]+(NUMCORES4GC*2)*j; - // check available shared mem chunks + int tofindb = gc_core2block[2*coren+i]+(NUMCORES4GC*2)*j; + int totest = tofindb; + int bound = BAMBOO_SMEM_SIZE_L; + int foundsmem = 0; + int size = 0; do { - int foundsmem = 0; - switch(bamboo_smem_mode) { - case SMEMLOCAL: { - int startb = freemem->startblock; - int endb = freemem->endblock; - while(startb > *tofindb) { - i++; - if(2==i) { - i = 0; - j++; - } - *tofindb = gc_core2block[2*coren+i]+(NUMCORES4GC*2)*j; - } // while(startb > tofindb) - if(startb <= *tofindb) { - if((endb >= *tofindb) && (freemem->size >= isize)) { - foundsmem = 1; - } else if(*tofindb > gcnumblock-1) { - // no more local mem - foundsmem = 2; - } // if(endb >= tofindb) - } // if(startb <= tofindb) - break; - } - - case SMEMFIXED: { - int startb = freemem->startblock; - int endb = freemem->endblock; - if(startb <= *tofindb) { - if((endb >= *tofindb) && (freemem->size >= isize)) { - foundsmem = 1; - } - } else { - // use the global mem - if(((startb > NUMCORES4GC-1) && (freemem->size >= isize)) || - ((endb > NUMCORES4GC-1) && ((freemem->size- - (gcbaseva+BAMBOO_LARGE_SMEM_BOUND-freemem->ptr))>=isize))) { - foundsmem = 1; - } - } - break; - } - - case SMEMMIXED: { - // TODO not supported yet - BAMBOO_EXIT(0xe001); - break; - } - - case SMEMGLOBAL: { - foundsmem = (freemem->size >= isize); - break; - } - default: - break; - } - - if(1 == foundsmem) { - // found one - break; - } else if (2 == foundsmem) { - // terminate, no more mem - freemem = NULL; - break; - } - if(freemem->size == 0) { - // an empty item, remove it - struct freeMemItem * toremove = freemem; - freemem = freemem->next; - if(prev == NULL ){ - // the head - bamboo_free_mem_list->head = freemem; + bound = (totest < NUMCORES4GC) ? BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE; + int nsize = bamboo_smemtbl[totest]; + bool islocal = true; + if(nsize < bound) { + bool tocheck = true; + // have some space in the block + if(totest == tofindb) { + // the first partition + size = bound - nsize; + } else if(nsize == 0) { + // an empty partition, can be appended + size += bound; } else { - prev->next = freemem; - } - // put it to the tail of the list for reuse - if(bamboo_free_mem_list->backuplist == NULL) { - //toremove->next = bamboo_free_mem_list->backuplist; - bamboo_free_mem_list->backuplist = toremove; - bamboo_free_mem_list->backuplist->next = NULL; - } else { - // free it - RUNFREE(toremove); + // not an empty partition, can not be appended + // the last continuous block is not big enough, go to check the next + // local block + islocal = true; + tocheck = false; + } // if(totest == tofindb) else if(nsize == 0) else ... + if(tocheck) { + if(size >= isize) { + // have enough space in the block, malloc + foundsmem = 1; + break; + } else { + // no enough space yet, try to append next continuous block + islocal = false; + } // if(size > isize) else ... 
+ } // if(tocheck) + } // if(nsize < bound) + if(islocal) { + // no space in the block, go to check the next block + i++; + if(2==i) { + i = 0; + j++; } + tofindb = totest = gc_core2block[2*coren+i]+(NUMCORES4GC*2)*j; } else { - prev = freemem; - freemem = freemem->next; + totest += 1; + } // if(islocal) else ... + if(totest > gcnumblock-1-bamboo_reserved_smem) { + // no more local mem, do not find suitable block + foundsmem = 2; + break; + } // if(totest > gcnumblock-1-bamboo_reserved_smem) ... + } while(true); + + if(foundsmem == 1) { + // find suitable block + mem = gcbaseva+bamboo_smemtbl[tofindb]+((tofindbstartblock; - int endb = freemem->endblock; - int tmpptr = gcbaseva+((tofindbsize+freemem->ptr-tmpptr)>=isize) { - mem = (tmpptr>freemem->ptr)?((void *)tmpptr):(freemem->ptr); - } else { - mem = (void *)(freemem->size+freemem->ptr-isize); - } - // check the remaining space in this block - int remain = (int)(mem-gcbaseva); - int bound = (BAMBOO_SMEM_SIZE); - if(remain < BAMBOO_LARGE_SMEM_BOUND) { - bound = (BAMBOO_SMEM_SIZE_L); - } - remain = bound - remain%bound; - if(remain < isize) { - // this object acrosses blocks - *allocsize = isize; - } else { - // round the asigned block to the end of the current block - *allocsize = remain; - } - if(freemem->ptr == (int)mem) { - freemem->ptr = ((void*)freemem->ptr) + (*allocsize); - freemem->size -= *allocsize; - BLOCKINDEX(freemem->ptr, &(freemem->startblock)); - } else if((freemem->ptr+freemem->size) == ((int)mem+(*allocsize))) { - freemem->size -= *allocsize; - BLOCKINDEX(((int)mem)-1, &(freemem->endblock)); - } else { - struct freeMemItem * tmp = - (struct freeMemItem *)RUNMALLOC_I(sizeof(struct freeMemItem)); - tmp->ptr = (int)mem+*allocsize; - tmp->size = freemem->ptr+freemem->size-(int)mem-*allocsize; - BLOCKINDEX(tmp->ptr, &(tmp->startblock)); - tmp->endblock = freemem->endblock; - tmp->next = freemem->next; - freemem->next = tmp; - freemem->size = (int)mem - freemem->ptr; - BLOCKINDEX(((int)mem-1), &(freemem->endblock)); + } else if(foundsmem == 2) { + // no suitable block + *allocsize = 0; } + return mem; -} // void * localmalloc_I(int, int, struct freeMemItem *, int *) +} // void * localmalloc_I(int, int, int *) -void * globalmalloc_I(int isize, - struct freeMemItem * freemem, +void * globalmalloc_I(int coren, + int isize, int * allocsize) { - void * mem = (void *)(freemem->ptr); - // check the remaining space in this block - int remain = (int)(mem-gcbaseva); - int bound = (BAMBOO_SMEM_SIZE); - if(remain < BAMBOO_LARGE_SMEM_BOUND) { - bound = (BAMBOO_SMEM_SIZE_L); + void * mem = NULL; + int tofindb = bamboo_free_block; //0; + int totest = tofindb; + int bound = BAMBOO_SMEM_SIZE_L; + int foundsmem = 0; + int size = 0; + if(tofindb > gcnumblock-1-bamboo_reserved_smem) { + *allocsize = 0; + return NULL; } - remain = bound - remain%bound; - if(remain < isize) { - // this object acrosses blocks - *allocsize = isize; - } else { - // round the asigned block to the end of the current block - *allocsize = remain; + do { + bound = (totest < NUMCORES4GC) ? 
BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE; + int nsize = bamboo_smemtbl[totest]; + bool isnext = false; + if(nsize < bound) { + bool tocheck = true; + // have some space in the block + if(totest == tofindb) { + // the first partition + size = bound - nsize; + } else if(nsize == 0) { + // an empty partition, can be appended + size += bound; + } else { + // not an empty partition, can not be appended + // the last continuous block is not big enough, start another block + isnext = true; + tocheck = false; + } // if(totest == tofindb) else if(nsize == 0) else ... + if(tocheck) { + if(size >= isize) { + // have enough space in the block, malloc + foundsmem = 1; + break; + } // if(size > isize) + } // if(tocheck) + } else { + isnext = true; + }// if(nsize < bound) else ... + totest += 1; + if(totest > gcnumblock-1-bamboo_reserved_smem) { + // no more local mem, do not find suitable block + foundsmem = 2; + break; + } // if(totest > gcnumblock-1-bamboo_reserved_smem) ... + if(isnext) { + // start another block + tofindb = totest; + } // if(islocal) + } while(true); + + if(foundsmem == 1) { + // find suitable block + mem = gcbaseva+bamboo_smemtbl[tofindb]+((tofindbptr = ((void*)freemem->ptr) + (*allocsize); - freemem->size -= *allocsize; + return mem; -} // void * globalmalloc_I(int, struct freeMemItem *, int *) -#endif +} // void * globalmalloc_I(int, int, int *) +#endif // #ifdef MULTICORE_GC // malloc from the shared memory void * smemalloc_I(int coren, @@ -1386,47 +1376,36 @@ void * smemalloc_I(int coren, void * mem = NULL; #ifdef MULTICORE_GC int isize = size+(BAMBOO_CACHE_LINE_SIZE); - int toallocate = (isize>(BAMBOO_SMEM_SIZE)) ? (isize):(BAMBOO_SMEM_SIZE); - // go through free mem list for suitable chunks - int tofindb = 0; - struct freeMemItem * freemem = findFreeMemChunk_I(coren, isize, &tofindb); - - // allocate shared mem if available - if(freemem != NULL) { - switch(bamboo_smem_mode) { - case SMEMLOCAL: { - mem = localmalloc_I(tofindb, isize, freemem, allocsize); - break; - } - case SMEMFIXED: { - int startb = freemem->startblock; - int endb = freemem->endblock; - if(startb > tofindb) { - // malloc on global mem - mem = globalmalloc_I(isize, freemem, allocsize); - } else { - // malloc on local mem - mem = localmalloc_I(tofindb, isize, freemem, allocsize); - } - break; - } + // go through the bamboo_smemtbl for suitable partitions + switch(bamboo_smem_mode) { + case SMEMLOCAL: { + mem = localmalloc_I(coren, isize, allocsize); + break; + } - case SMEMMIXED: { - // TODO not supported yet - BAMBOO_EXIT(0xe002); - break; - } + case SMEMFIXED: { + // TODO not supported yet + BAMBOO_EXIT(0xe001); + break; + } - case SMEMGLOBAL: { - mem = globalmalloc_I(isize,freemem, allocsize); - break; - } + case SMEMMIXED: { + // TODO not supported yet + BAMBOO_EXIT(0xe002); + break; + } - default: - break; + case SMEMGLOBAL: { + mem = globalmalloc_I(coren, isize, allocsize); + break; } - } else { + + default: + break; + } + + if(mem == NULL) { #else int toallocate = (size>(BAMBOO_SMEM_SIZE)) ? 
(size):(BAMBOO_SMEM_SIZE); mem = mspace_calloc(bamboo_free_msp, 1, toallocate); @@ -1862,15 +1841,28 @@ msg: BAMBOO_DEBUGPRINT(0xe88a); #endif #endif + int allocsize = 0; + void * mem = NULL; #ifdef MULTICORE_GC if(gcprocessing) { // is currently doing gc, dump this msg + if(INITPHASE == gcphase) { + // if still in the initphase of gc, send a startinit msg again + if(isMsgSending) { + cache_msg_1(msgdata[2], GCSTARTINIT); + } else { + send_msg_1(msgdata[2], GCSTARTINIT, true); + } + } break; - } + } #endif - int allocsize = 0; - void * mem = smemalloc_I(msgdata[2], msgdata[1], &allocsize); + mem = smemalloc_I(msgdata[2], msgdata[1], &allocsize); if(mem == NULL) { + // in this case, the gcflag of the startup core has been set + // and the gc should be started later, then a GCSTARTINIT msg + // will be sent to the requesting core to notice it to start gc + // and try malloc again break; } // send the start_va to request core @@ -1902,6 +1894,7 @@ msg: } else { #ifdef MULTICORE_GC // fill header to store the size of this mem block + memset(msgdata[1], 0, BAMBOO_CACHE_LINE_SIZE); (*((int*)msgdata[1])) = msgdata[2]; bamboo_smem_size = msgdata[2] - BAMBOO_CACHE_LINE_SIZE; bamboo_cur_msp = msgdata[1] + BAMBOO_CACHE_LINE_SIZE; @@ -1967,7 +1960,8 @@ msg: BAMBOO_DEBUGPRINT(0xe88c); BAMBOO_DEBUGPRINT_REG(msgdata[1]); #endif - if(msgdata[1] < NUMCORES4GC) { + // All cores should do init GC + if(msgdata[1] < NUMCORESACTIVE) { gccorestatus[msgdata[1]] = 0; } } @@ -1981,7 +1975,8 @@ msg: #endif BAMBOO_EXIT(0xb002); } - if(msgdata[1] < NUMCORES4GC) { + // all cores should do mark + if(msgdata[1] < NUMCORESACTIVE) { gccorestatus[msgdata[1]] = 0; gcnumsendobjs[msgdata[1]] = msgdata[2]; gcnumreceiveobjs[msgdata[1]] = msgdata[3]; @@ -2003,6 +1998,7 @@ msg: int filledblocks = msgdata[2]; int heaptop = msgdata[3]; int data4 = msgdata[4]; + // only gc cores need to do compact if(cnum < NUMCORES4GC) { if(COMPACTPHASE == gcphase) { gcfilledblocks[cnum] = filledblocks; @@ -2022,39 +2018,6 @@ msg: } } else { gccorestatus[cnum] = 0; - // check if there is pending move request - /*if(gcmovepending > 0) { - int j; - for(j = 0; j < NUMCORES4GC; j++) { - if(gcrequiredmems[j]>0) { - break; - } - } - if(j < NUMCORES4GC) { - // find match - int tomove = 0; - int startaddr = 0; - gcrequiredmems[j] = assignSpareMem_I(cnum, - gcrequiredmems[j], - &tomove, - &startaddr); - if(STARTUPCORE == j) { - gcdstcore = cnum; - gctomove = true; - gcmovestartaddr = startaddr; - gcblock2fill = tomove; - } else { - if(isMsgSending) { - cache_msg_4(j, GCMOVESTART, cnum, startaddr, tomove); - } else { - send_msg_4(j, GCMOVESTART, cnum, startaddr, tomove, true); - } - } // if(STARTUPCORE == j) - if(gcrequiredmems[j] == 0) { - gcmovepending--; - } - } // if(j < NUMCORES4GC) - } // if(gcmovepending > 0) */ } // if(data4>0) } // if(cnum < NUMCORES4GC) break; @@ -2070,7 +2033,8 @@ msg: #endif BAMBOO_EXIT(0xb004); } - if(msgdata[1] < NUMCORES4GC) { + // all cores should do flush + if(msgdata[1] < NUMCORESACTIVE) { gccorestatus[msgdata[1]] = 0; } break; @@ -2084,8 +2048,9 @@ msg: case GCMARKCONFIRM: { // received a marked phase finish confirm request msg + // all cores should do mark if((BAMBOO_NUM_OF_CORE == STARTUPCORE) - || (BAMBOO_NUM_OF_CORE > NUMCORES4GC - 1)) { + || (BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1)) { // wrong core to receive such msg BAMBOO_EXIT(0xb005); } else { @@ -2238,9 +2203,6 @@ msg: default: break; } - /*for(; msgdataindex > 0; --msgdataindex) { - msgdata[msgdataindex-1] = -1; - }*/ memset(msgdata, '\0', sizeof(int) * 
msgdataindex); msgdataindex = 0; msglength = BAMBOO_MSG_BUF_LENGTH; @@ -2604,22 +2566,6 @@ newtask: //clock2 = BAMBOO_GET_EXE_TIME(); for(i = 0; i < runtime_locklen; i++) { - /*for(i = 0; i < numparams; i++) { - void * param = currtpd->parameterArray[i]; - int * lock = 0; - bool insert = true; - if(((struct ___Object___ *)param)->type == STARTUPTYPE) { - islock = false; - taskpointerarray[i+OFFSET]=param; - goto execute; - } - if(((struct ___Object___ *)param)->lock == NULL) { - lock = (int *)param; - } else { - lock = (int *)(((struct ___Object___ *)param)->lock); - } - */ - int * lock = (int *)(runtime_locks[i].redirectlock); islock = true; // require locks for this parameter if it is not a startup object @@ -2667,18 +2613,9 @@ newtask: BAMBOO_DEBUGPRINT_REG(lock); #endif // check if has the lock already - /*bool giveup = true; - for(j = 0; j < runtime_locklen; j++) { - if(runtime_locks[j].value == lock) { - giveup = false; - break; - } - } - if(giveup) {*/ // can not get the lock, try later // release all grabbed locks for previous parameters for(j = 0; j < i; ++j) { - //for(j = 0; j < runtime_locklen; ++j) { lock = (int*)(runtime_locks[j].redirectlock); releasewritelock(lock); } @@ -2697,12 +2634,7 @@ newtask: #endif goto newtask; //} - }/* else { // line 2794: if(grount == 0) - // TODO - runtime_locks[runtime_locklen].value = (int)lock; - runtime_locks[runtime_locklen].redirectlock = (int)param; - runtime_locklen++; - }*/ + } } // line 2752: for(i = 0; i < runtime_locklen; i++) /*long clock3; @@ -3249,4 +3181,184 @@ void toiNext(struct tagobjectiterator *it, Objnext(&it->it); } } + +#ifdef PROFILE +inline void profileTaskStart(char * taskname) { + if(!taskInfoOverflow) { + TaskInfo* taskInfo = RUNMALLOC(sizeof(struct task_info)); + taskInfoArray[taskInfoIndex] = taskInfo; + taskInfo->taskName = taskname; + taskInfo->startTime = BAMBOO_GET_EXE_TIME(); + taskInfo->endTime = -1; + taskInfo->exitIndex = -1; + taskInfo->newObjs = NULL; + } +} + +inline void profileTaskEnd() { + if(!taskInfoOverflow) { + taskInfoArray[taskInfoIndex]->endTime = BAMBOO_GET_EXE_TIME(); + taskInfoIndex++; + if(taskInfoIndex == TASKINFOLENGTH) { + taskInfoOverflow = true; + //taskInfoIndex = 0; + } + } +} + +// output the profiling data +void outputProfileData() { +#ifdef USEIO + int i; + unsigned long long totaltasktime = 0; + unsigned long long preprocessingtime = 0; + unsigned long long objqueuecheckingtime = 0; + unsigned long long postprocessingtime = 0; + //int interruptiontime = 0; + unsigned long long other = 0; + unsigned long long averagetasktime = 0; + int tasknum = 0; + + printf("Task Name, Start Time, End Time, Duration, Exit Index(, NewObj Name, Num)+\n"); + // output task related info + for(i = 0; i < taskInfoIndex; i++) { + TaskInfo* tmpTInfo = taskInfoArray[i]; + unsigned long long duration = tmpTInfo->endTime - tmpTInfo->startTime; + printf("%s, %lld, %lld, %lld, %lld", + tmpTInfo->taskName, tmpTInfo->startTime, tmpTInfo->endTime, + duration, tmpTInfo->exitIndex); + // summarize new obj info + if(tmpTInfo->newObjs != NULL) { + struct RuntimeHash * nobjtbl = allocateRuntimeHash(5); + struct RuntimeIterator * iter = NULL; + while(0 == isEmpty(tmpTInfo->newObjs)) { + char * objtype = (char *)(getItem(tmpTInfo->newObjs)); + if(RuntimeHashcontainskey(nobjtbl, (int)(objtype))) { + int num = 0; + RuntimeHashget(nobjtbl, (int)objtype, &num); + RuntimeHashremovekey(nobjtbl, (int)objtype); + num++; + RuntimeHashadd(nobjtbl, (int)objtype, num); + } else { + RuntimeHashadd(nobjtbl, (int)objtype, 1); + } + 
//printf(stderr, "new obj!\n"); + } + + // output all new obj info + iter = RuntimeHashcreateiterator(nobjtbl); + while(RunhasNext(iter)) { + char * objtype = (char *)Runkey(iter); + int num = Runnext(iter); + printf(", %s, %d", objtype, num); + } + } + printf("\n"); + if(strcmp(tmpTInfo->taskName, "tpd checking") == 0) { + preprocessingtime += duration; + } else if(strcmp(tmpTInfo->taskName, "post task execution") == 0) { + postprocessingtime += duration; + } else if(strcmp(tmpTInfo->taskName, "objqueue checking") == 0) { + objqueuecheckingtime += duration; + } else { + totaltasktime += duration; + averagetasktime += duration; + tasknum++; + } + } + + if(taskInfoOverflow) { + printf("Caution: task info overflow!\n"); + } + + other = totalexetime-totaltasktime-preprocessingtime-postprocessingtime; + averagetasktime /= tasknum; + + printf("\nTotal time: %lld\n", totalexetime); + printf("Total task execution time: %lld (%d%%)\n", totaltasktime, + (int)(((double)totaltasktime/(double)totalexetime)*100)); + printf("Total objqueue checking time: %lld (%d%%)\n", + objqueuecheckingtime, + (int)(((double)objqueuecheckingtime/(double)totalexetime)*100)); + printf("Total pre-processing time: %lld (%d%%)\n", preprocessingtime, + (int)(((double)preprocessingtime/(double)totalexetime)*100)); + printf("Total post-processing time: %lld (%d%%)\n", postprocessingtime, + (int)(((double)postprocessingtime/(double)totalexetime)*100)); + printf("Other time: %lld (%d%%)\n", other, + (int)(((double)other/(double)totalexetime)*100)); + + printf("\nAverage task execution time: %lld\n", averagetasktime); +#else + int i = 0; + int j = 0; + + BAMBOO_DEBUGPRINT(0xdddd); + // output task related info + for(i= 0; i < taskInfoIndex; i++) { + TaskInfo* tmpTInfo = taskInfoArray[i]; + char* tmpName = tmpTInfo->taskName; + int nameLen = strlen(tmpName); + BAMBOO_DEBUGPRINT(0xddda); + for(j = 0; j < nameLen; j++) { + BAMBOO_DEBUGPRINT_REG(tmpName[j]); + } + BAMBOO_DEBUGPRINT(0xdddb); + BAMBOO_DEBUGPRINT_REG(tmpTInfo->startTime); + BAMBOO_DEBUGPRINT_REG(tmpTInfo->endTime); + BAMBOO_DEBUGPRINT_REG(tmpTInfo->exitIndex); + if(tmpTInfo->newObjs != NULL) { + struct RuntimeHash * nobjtbl = allocateRuntimeHash(5); + struct RuntimeIterator * iter = NULL; + while(0 == isEmpty(tmpTInfo->newObjs)) { + char * objtype = (char *)(getItem(tmpTInfo->newObjs)); + if(RuntimeHashcontainskey(nobjtbl, (int)(objtype))) { + int num = 0; + RuntimeHashget(nobjtbl, (int)objtype, &num); + RuntimeHashremovekey(nobjtbl, (int)objtype); + num++; + RuntimeHashadd(nobjtbl, (int)objtype, num); + } else { + RuntimeHashadd(nobjtbl, (int)objtype, 1); + } + } + + // ouput all new obj info + iter = RuntimeHashcreateiterator(nobjtbl); + while(RunhasNext(iter)) { + char * objtype = (char *)Runkey(iter); + int num = Runnext(iter); + int nameLen = strlen(objtype); + BAMBOO_DEBUGPRINT(0xddda); + for(j = 0; j < nameLen; j++) { + BAMBOO_DEBUGPRINT_REG(objtype[j]); + } + BAMBOO_DEBUGPRINT(0xdddb); + BAMBOO_DEBUGPRINT_REG(num); + } + } + BAMBOO_DEBUGPRINT(0xdddc); + } + + if(taskInfoOverflow) { + BAMBOO_DEBUGPRINT(0xefee); + } + + // output interrupt related info + /*for(i = 0; i < interruptInfoIndex; i++) { + InterruptInfo* tmpIInfo = interruptInfoArray[i]; + BAMBOO_DEBUGPRINT(0xddde); + BAMBOO_DEBUGPRINT_REG(tmpIInfo->startTime); + BAMBOO_DEBUGPRINT_REG(tmpIInfo->endTime); + BAMBOO_DEBUGPRINT(0xdddf); + } + + if(interruptInfoOverflow) { + BAMBOO_DEBUGPRINT(0xefef); + }*/ + + BAMBOO_DEBUGPRINT(0xeeee); +#endif +} +#endif // #ifdef PROFILE + #endif diff --git 
a/Robust/src/buildscript b/Robust/src/buildscript index 931382bd..04195913 100755 --- a/Robust/src/buildscript +++ b/Robust/src/buildscript @@ -72,6 +72,7 @@ echo -o binary echo -nojava do not run bristlecone compiler echo -instructionfailures inject code for instructionfailures echo -profile build with profile options +echo -gcprofile build with gcprofile options echo -accurateprofile build with accurate profile information including pre/post task processing info echo "-useio use standard io to output profiling data (should be used together with -raw and -profile), it only works with single core version" echo "-enable-assertions execute assert statements during compilation" @@ -117,6 +118,7 @@ RAWCONFIG='' DEBUGFLAG=false RAWPATHFLAG=false PROFILEFLAG=false +GCPROFILEFLAG=false ACCURATEPROFILEFLAG=false USEIOFLAG=false INTERRUPTFLAG=false @@ -277,6 +279,9 @@ elif [[ $1 = '-profile' ]] then PROFILEFLAG=true EXTRAOPTIONS="$EXTRAOPTIONS -pg" +elif [[ $1 = '-gcprofile' ]] +then +GCPROFILEFLAG=true elif [[ $1 = '-accurateprofile' ]] then ACCURATEPROFILEFLAG=true @@ -299,11 +304,11 @@ JAVAOPTS="$JAVAOPTS -multicore" elif [[ $1 = '-numcore' ]] then JAVAOPTS="$JAVAOPTS -numcore $2" -GCCORES="GC_$2" shift elif [[ $1 = '-numcore4gc' ]] then JAVAOPTS="$JAVAOPTS -numcore4gc $2" +GCCORES="GC_$2" shift elif [[ $1 = '-raw' ]] then @@ -689,6 +694,11 @@ then #MULTICOREGC version TILERACFLAGS="${TILERACFLAGS} -DMULTICORE_GC -D${GCCORES}" fi +if $GCPROFILEFLAG +then # GC_PROFILE version +TILERACFLAGS="${TILERACFLAGS} -DGC_PROFILE" +fi + cp $ROBUSTROOT/Tilera/Runtime/$MAKEFILE ./Makefile cp $ROBUSTROOT/Tilera/Runtime/$SIMHVC ./sim.hvc cp $ROBUSTROOT/Tilera/Runtime/$PCIHVC ./pci.hvc