From e542673fc665c67c596b1826da046c56e692fb65 Mon Sep 17 00:00:00 2001
From: jzhou <jzhou>
Date: Tue, 27 Oct 2009 20:24:04 +0000
Subject: [PATCH] Restructure the shared memory allocation and fix multiple
 bugs in the multicore gc. Now the strategy of shared memory allocation can be
 configured as 1) each core can only use its local memory, 2) use local
 memory for the lower address space and global memory for the higher address
 space, or 3) all cores allocate globally. The simplest test case can now go
 through several rounds of gc correctly. Still need more tests. Also organized
 the code a little bit.

---
 Robust/src/Runtime/multicoregarbage.c | 686 ++++++++++++--------------
 Robust/src/Runtime/multicoregarbage.h |  98 +---
 Robust/src/Runtime/multicoreruntime.h | 101 ++--
 Robust/src/Runtime/multicoretask.c    | 320 +++++++++---
 Robust/src/Runtime/runtime.h          |   7 +-
 Robust/src/buildscript                |   2 +-
 6 files changed, 658 insertions(+), 556 deletions(-)
diff --git a/Robust/src/Runtime/multicoregarbage.c b/Robust/src/Runtime/multicoregarbage.c
index 58491b62..78b3cf1d 100644
--- a/Robust/src/Runtime/multicoregarbage.c
+++ b/Robust/src/Runtime/multicoregarbage.c
@@ -62,10 +62,14 @@ inline void dumpSMem() {
 	tprintf("++++ reserved sblocks ++++ \n");
 	for(i=BAMBOO_BASE_VA; i<gcbaseva; i+= 4*16) {
 		tprintf("0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x \n",
-            *((int *)(i)), *((int *)(i + 4)), *((int *)(i + 4*2)), *((int *)(i + 4*3)), 
-						*((int *)(i + 4*4)), *((int *)(i + 4*5)), *((int *)(i + 4*6)), *((int *)(i + 4*7)), 
-						*((int *)(i + 4*8)), *((int *)(i + 4*9)), *((int *)(i + 4*10)), *((int *)(i + 4*11)),
-						*((int *)(i + 4*12)), *((int *)(i + 4*13)), *((int *)(i + 4*14)), *((int *)(i + 4*15)));
+            *((int *)(i)), *((int *)(i + 4)), 
+						*((int *)(i + 4*2)), *((int *)(i + 4*3)), 
+						*((int *)(i + 4*4)), *((int *)(i + 4*5)), 
+						*((int *)(i + 4*6)), *((int *)(i + 4*7)), 
+						*((int *)(i + 4*8)), *((int *)(i + 4*9)), 
+						*((int *)(i + 4*10)), *((int *)(i + 4*11)),
+						*((int *)(i + 4*12)), *((int *)(i + 4*13)), 
+						*((int *)(i + 4*14)), *((int *)(i + 4*15)));
 	}
 	sblock = gcreservedsb;
 	bool advanceblock = false;
@@ -97,15 +101,20 @@ inline void dumpSMem() {
 			}
 			x = tmpcore/bamboo_width;
 			y = tmpcore%bamboo_width;
-			tprintf("==== %d, %d : core (%d,%d), saddr %x====\n", block, sblock++, 
-					    x, y, (sblock-1)*(BAMBOO_SMEM_SIZE)+BAMBOO_BASE_VA);
+			tprintf("==== %d, %d : core (%d,%d), saddr %x====\n", 
+					    block, sblock++, x, y, 
+							(sblock-1)*(BAMBOO_SMEM_SIZE)+BAMBOO_BASE_VA);
 		}
 		j++;
     tprintf("0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x \n",
-            *((int *)(i)), *((int *)(i + 4)), *((int *)(i + 4*2)), *((int *)(i + 4*3)), 
-						*((int *)(i + 4*4)), *((int *)(i + 4*5)), *((int *)(i + 4*6)), *((int *)(i + 4*7)), 
-						*((int *)(i + 4*8)), *((int *)(i + 4*9)), *((int *)(i + 4*10)), *((int *)(i + 4*11)),
-						*((int *)(i + 4*12)), *((int *)(i + 4*13)), *((int *)(i + 4*14)), *((int *)(i + 4*15)));
+            *((int *)(i)), *((int *)(i + 4)), 
+						*((int *)(i + 4*2)), *((int *)(i + 4*3)), 
+						*((int *)(i + 4*4)), *((int *)(i + 4*5)), 
+						*((int *)(i + 4*6)), *((int *)(i + 4*7)), 
+						*((int *)(i + 4*8)), *((int *)(i + 4*9)), 
+						*((int *)(i + 4*10)), *((int *)(i + 4*11)),
+						*((int *)(i + 4*12)), *((int *)(i + 4*13)), 
+						*((int *)(i + 4*14)), *((int *)(i + 4*15)));
 	}
 	tprintf("\n");
 }
@@ -540,6 +549,8 @@ inline void initGC() {
 
 	freeRuntimeHash(gcpointertbl);
 	gcpointertbl = allocateRuntimeHash(20);
+
+	memset(gcsmemtbl, '\0', sizeof(int)*gcnumblock);
 } // void initGC()
 
 // compute load balance for all cores
@@ -611,7 +622,8 @@ inline bool cacheLObjs() {
 	while(gc_lobjmoreItems2()) {
 		gc_lobjdequeue2();
 		size = gclobjtail2->lengths[gclobjtailindex2 - 1];
-		// set the mark field to 2, indicating that this obj has been moved and need to be flushed
+		// set the mark field to 2, indicating that this obj has been moved and 
+		// need to be flushed
 		((int *)(gclobjtail2->lobjs[gclobjtailindex2-1]))[6] = 2;
 		memcpy(dst, gclobjtail2->lobjs[gclobjtailindex2 - 1], size);
 		dst += size;
@@ -624,92 +636,69 @@ inline bool cacheLObjs() {
 	return true;
 } // void cacheLObjs()
 
-inline struct freeMemItem * updateFreeMemList(int localtop,
-		                                          int localsize,
-                															struct freeMemItem * listtop,
-																							int * returntop) {
-	struct freeMemItem * tochange = listtop;
-	struct freeMemItem * tmp = bamboo_free_mem_list->head;
-	bool extendflag = false;
-	struct freeMemItem * ex_tmp = NULL;
-	// check if there is a hole in the block below it
-	while(true) {
-		if(tmp->ptr<localtop) {
-			if((tmp->ptr+tmp->size) == localtop) {
-				// extend the hole up to includ this block
-				tmp->size += localsize;
-				extendflag = true;
-				*returntop = tmp->ptr;
-				break;
-			} // if((tmp->ptr+tmp->size) == localtop)
-		} else {
-			break;
-		} // if(tmp->ptr<gcloads[i]) else ...
-		if(tmp == tochange) {
+// NOTE: the free mem chunks should be maintained in an ordered linked list;
+// the listtail param always specifies the current list tail
+
+// update the gcsmemtbl to record current shared mem usage
+void updateSmemTbl(int coren,
+		               int localtop) {
+	int ltopcore = 0;
+	int bound = BAMBOO_SMEM_SIZE_L;
+	BLOCKINDEX(localtop, &ltopcore);
+	if(localtop >= (gcbaseva+(BAMBOO_LARGE_SMEM_BOUND))) {
+		bound = BAMBOO_SMEM_SIZE;
+	}
+	int load = (localtop-gcbaseva)%bound;
+	int i = 0;
+	int j = 0;
+	int toset = 0;
+	do{
+		toset = gc_core2block[2*coren+i]+124*j;
+		if(toset < ltopcore) {
+			gcsmemtbl[toset] = (toset<NUMCORES)?BAMBOO_SMEM_SIZE_L:BAMBOO_SMEM_SIZE;
+		} else if(toset == ltopcore) {
+			gcsmemtbl[toset] = load;
 			break;
 		} else {
-			tmp = tmp->next;
-		} // if(tmp == tochange)
-	} // while(true)
-	if((extendflag) && (tmp != tochange)) {
-		ex_tmp = tmp;
-		tmp = tmp->next;
-	} // if(tmp->ptr<gcloads[i])
-	if(tmp != tochange) {
-		while(true) {
-			if((localtop+localsize) == tmp->ptr) {
-				// extend the hole below to include this block
-				extendflag = true;
-				if(ex_tmp == NULL) {
-					tmp->ptr = localtop;
-					tmp->size += localsize;
-					*returntop = localtop;
-				} else {
-					ex_tmp->size += tmp->size;
-					tmp->ptr = tmp->next->ptr;
-					tmp->size = tmp->next->size;
-					if(tmp->next == tochange) {
-						tochange = tmp;
-					}
-					ex_tmp = tmp->next;
-					tmp->next = tmp->next->next;
-					RUNFREE(ex_tmp);
-					ex_tmp = NULL;
-				}
-				break;
-			}
-			if(tmp == tochange) {
-				break;
-			} else {
-				tmp = tmp->next;
-			} // if(tmp == tochange)
-		} // while(true)
-	} // if(tmp != tochange)
-	if((!extendflag) && (tmp == tochange)) {
-		// add a new item for this block hole
+			break;
+		}
+		i++;
+		if(i == 2) {
+			i = 0;
+			j++;
+		}
+	}while(true);
+} // void updateSmemTbl(int, int)
+
+inline struct freeMemItem * addFreeMemItem(int ptr,
+		                                       int size,
+																					 struct freeMemItem * listtail,
+																					 bool* sethead) {
+	struct freeMemItem * tochange = listtail;
+	if(*sethead) {
 		if(tochange->next == NULL) {
 			tochange->next = 
 				(struct freeMemItem *)RUNMALLOC(sizeof(struct freeMemItem));
-		}
+		} // if(tochange->next == NULL)
 		tochange = tochange->next;
-		tochange->ptr = localtop;
-		tochange->size = localsize;
-		*returntop = localtop;
-	} // if((!extendflag) && (tmp == tochange))
+	} else {
+		*sethead = true;
+	} // if(sethead)
+	tochange->ptr = ptr;
+	tochange->size = size;
+	BLOCKINDEX(ptr, &(tochange->startblock));
+	BLOCKINDEX(ptr+size-1, &(tochange->endblock));
+	// zero out all these spare memory
+	// note that, leave the mem starting from heaptop, as it caches large objs
+	// zero out these cache later when moving large obj
+	memset(tochange->ptr, '\0', tochange->size);
 	return tochange;
-} // void updateFreeMemList(int, int, struct freeMemItem *, int *)
+} // struct freeMemItem * addFreeMemItem(int,int,struct freeMemItem*,bool*, int)
 
 inline void moveLObjs() {
 #ifdef DEBUG
 	BAMBOO_DEBUGPRINT(0xea01);
 #endif
-	int remain = 0;
-	int bound = BAMBOO_SMEM_SIZE_L;
-	struct freeMemItem * tochange = bamboo_free_mem_list->head;
-	if(tochange == NULL) {
-		bamboo_free_mem_list->head = tochange = 
-			(struct freeMemItem *)RUNMALLOC(sizeof(struct freeMemItem));
-	}
 	// find current heap top
 	// flush all gcloads to indicate the real heap top on one core
 	// previous it represents the next available ptr on a core
@@ -717,22 +706,14 @@ inline void moveLObjs() {
 			&& ((gcloads[0]%(BAMBOO_SMEM_SIZE)) == 0)) {
 		// edge of a block, check if this is exactly the heaptop
 		BASEPTR(0, gcfilledblocks[0]-1, &(gcloads[0]));
-		gcloads[0]+=(gcfilledblocks[0]>1?(BAMBOO_SMEM_SIZE):(BAMBOO_SMEM_SIZE_L));
-	} else {
-		// in the middle of a block, flush the remaining space in this block
-		// and update it into the free mem list
-		if(gcloads[0] > (gcbaseva+(BAMBOO_SMEM_SIZE_L))) {
-			bound = BAMBOO_SMEM_SIZE;
-		}
-		remain = bound - gcloads[0]%bound;
-		tochange->ptr = gcloads[0];
-		tochange->size = remain;
-		// zero out all these spare memory
-		memset(tochange->ptr, '\0', tochange->size);
-	}
-	int tmpheaptop = gcloads[0];
+		gcloads[0]+=(gcfilledblocks[0]>1?
+				(BAMBOO_SMEM_SIZE):(BAMBOO_SMEM_SIZE_L));
+	} 
+	updateSmemTbl(0, gcloads[0]);
 #ifdef DEBUG
-	BAMBOO_DEBUGPRINT_REG(tmpheaptop);
+  BAMBOO_DEBUGPRINT(0xea02);
+	BAMBOO_DEBUGPRINT_REG(gcloads[0]);
+	BAMBOO_DEBUGPRINT_REG(gcsmemtbl[0]);
 #endif
 	for(int i = 1; i < NUMCORES; i++) {
 		int tmptop = 0;
@@ -748,46 +729,37 @@ inline void moveLObjs() {
 			gcloads[i]
 				+=(gcfilledblocks[i]>1?(BAMBOO_SMEM_SIZE):(BAMBOO_SMEM_SIZE_L));
 			tmptop = gcloads[i];
-		} else {
-			// in the middle of a block, flush the remaining space in this block
-			// and update it into the free mem list
-			if(gcfilledblocks[i] > 0) {
-				bound = BAMBOO_SMEM_SIZE;
-			} else {
-				bound = BAMBOO_SMEM_SIZE_L;
-			}
-			remain = bound - gcloads[i]%bound;
-			// zero out all these spare memory
-			memset(gcloads[i], '\0', remain);
-			// update free mem list
-			tochange = updateFreeMemList(gcloads[i], remain, tochange, &tmptop);
-		} // if((gcfilledblocks[i] > 0)
-
-		if(tmpheaptop < tmptop) {
-			tmpheaptop = tmptop;
-		}
+		} 
+		updateSmemTbl(i, gcloads[i]);
 #ifdef DEBUG
 		BAMBOO_DEBUGPRINT_REG(gcloads[i]);
-		BAMBOO_DEBUGPRINT_REG(tmpheaptop);
 #endif
+	} // for(int i = 1; i < NUMCORES; i++) {
+
+	// find current heap top
+	// TODO
+	// a bug here: when using local allocation, directly move large objects
+	// to the highest free chunk might not be memory efficient
+	int tmpheaptop = 0;
+	int size = 0;
+	int bound = 0;
+	int i = 0;
+	for(i = gcnumblock-1; i >= 0; i--) {
+		if(gcsmemtbl[i] > 0) {
+			break;
+		}
 	}
-	tochange->ptr = tmpheaptop;
-	tochange->size = gcheaptop - tmpheaptop;
-	// zero out all these spare memory
-	memset(tochange->ptr, '\0', tochange->size);
-	if(bamboo_free_mem_list->tail != tochange) {
-		bamboo_free_mem_list->tail = tochange;
-	}
-	while(tochange->next != NULL) {
-		struct freeMemItem * toremove = tochange->next;
-		tochange->next = toremove->next;
-		RUNFREE(toremove);
+	if(i == -1) {
+		tmpheaptop = gcbaseva;
+	} else {
+		tmpheaptop = gcbaseva+gcsmemtbl[i]+((i<NUMCORES)?(BAMBOO_SMEM_SIZE_L*i):
+				(BAMBOO_SMEM_SIZE*(i-NUMCORES)+BAMBOO_LARGE_SMEM_BOUND));
 	}
 	// move large objs from gcheaptop to tmpheaptop
 	// write the header first
 	int tomove = (BAMBOO_BASE_VA) + (BAMBOO_SHARED_MEM_SIZE) - gcheaptop;
 #ifdef DEBUG
-	BAMBOO_DEBUGPRINT(0xea02);
+	BAMBOO_DEBUGPRINT(0xea03);
 	BAMBOO_DEBUGPRINT_REG(tomove);
 	BAMBOO_DEBUGPRINT_REG(tmpheaptop);
 	BAMBOO_DEBUGPRINT_REG(gcheaptop);
@@ -797,180 +769,238 @@ inline void moveLObjs() {
 			   BAMBOO_SHARED_MEM_SIZE/BAMBOO_SMEM_SIZE*sizeof(INTPTR));
 	if(tomove == 0) {
 		gcheaptop = tmpheaptop;
-		return;
-	}
-	// check how many blocks it acrosses
-	remain = tmpheaptop-gcbaseva;
-	int b = remain/(BAMBOO_SMEM_SIZE) + gcreservedsb;
-	// check the remaining space in this block
-	bound = (BAMBOO_SMEM_SIZE);
-	if(remain < (BAMBOO_LARGE_SMEM_BOUND)) {
-		bound = (BAMBOO_SMEM_SIZE_L);
-	}
-	remain = bound - remain%bound;
-
+	} else {
+		// check how many blocks it acrosses
+		int remain = tmpheaptop-gcbaseva;
+		int sb = remain/(BAMBOO_SMEM_SIZE) + gcreservedsb; // number of the sblock
+		int b = 0; // number of the block
+		BLOCKINDEX(tmpheaptop, &b);
+		// check the remaining space in this block
+		bound = (BAMBOO_SMEM_SIZE);
+		if(remain < (BAMBOO_LARGE_SMEM_BOUND)) {
+			bound = (BAMBOO_SMEM_SIZE_L);
+		}
+		remain = bound - remain%bound;
+
+#ifdef DEBUG
+		BAMBOO_DEBUGPRINT(0xea04);
+#endif
+		size = 0;
+		int isize = 0;
+		int host = 0;
+		int ptr = 0;
+		int base = tmpheaptop;
+		int cpysize = 0;
+		remain -= BAMBOO_CACHE_LINE_SIZE;
+		tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
+		while(gc_lobjmoreItems()) {
+			ptr = (int)(gc_lobjdequeue(&size, &host));
+			ALIGNSIZE(size, &isize);
+			if(remain < isize) {
+				// this object acrosses blocks
+				if(cpysize > 0) {
+					// close current block, fill its header
+					memset(base, '\0', BAMBOO_CACHE_LINE_SIZE);
+					*((int*)base) = cpysize + BAMBOO_CACHE_LINE_SIZE;
+					gcsmemtbl[b] = cpysize + BAMBOO_CACHE_LINE_SIZE;
+					cpysize = 0;
+					base = tmpheaptop;
+					if(remain == 0) {
+						remain = ((tmpheaptop-gcbaseva)<(BAMBOO_LARGE_SMEM_BOUND)) ? 
+										 BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
+					} 
+					remain -= BAMBOO_CACHE_LINE_SIZE;
+					tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
+					BLOCKINDEX(tmpheaptop, &b);
+					sb = (tmpheaptop-gcbaseva)/(BAMBOO_SMEM_SIZE) + gcreservedsb;
+				} // if(cpysize > 0)
+
+				// move the large obj
+				memcpy(tmpheaptop, gcheaptop, size);
+				// fill the remaining space with -2 padding
+				memset(tmpheaptop+size, -2, isize-size);
+				// zero out original mem caching the lobj
+				memset(gcheaptop, '\0', size);
+#ifdef DEBUG
+				BAMBOO_DEBUGPRINT(0xea05);
+				BAMBOO_DEBUGPRINT_REG(gcheaptop);
+				BAMBOO_DEBUGPRINT_REG(tmpheaptop);
+				BAMBOO_DEBUGPRINT_REG(size);
+				BAMBOO_DEBUGPRINT_REG(isize);
+				BAMBOO_DEBUGPRINT_REG(base);
+#endif
+				gcheaptop += size;
+				if(host == BAMBOO_NUM_OF_CORE) {
+					BAMBOO_START_CRITICAL_SECTION();
+					RuntimeHashadd_I(gcpointertbl, ptr, tmpheaptop);
+					BAMBOO_CLOSE_CRITICAL_SECTION();
 #ifdef DEBUG
-	BAMBOO_DEBUGPRINT(0xea03);
+					BAMBOO_DEBUGPRINT(0xcdca);
+					BAMBOO_DEBUGPRINT_REG(ptr);
+					BAMBOO_DEBUGPRINT_REG(tmpheaptop);
 #endif
-	int size = 0;
-	int isize = 0;
-	int host = 0;
-	int ptr = 0;
-	int base = tmpheaptop;
-	int cpysize = 0;
-	remain -= BAMBOO_CACHE_LINE_SIZE;
-	tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
-	while(gc_lobjmoreItems()) {
-		ptr = (int)(gc_lobjdequeue(&size, &host));
-		ALIGNSIZE(size, &isize);
-		if(remain < isize) {
-			// this object acrosses blocks
-			if(cpysize > 0) {
-				// close current block, fill its header
+				} else {
+					// send the original host core with the mapping info
+					send_msg_3(host, GCLOBJMAPPING, ptr, tmpheaptop);
+#ifdef DEBUG
+					BAMBOO_DEBUGPRINT(0xcdcb);
+					BAMBOO_DEBUGPRINT_REG(ptr);
+					BAMBOO_DEBUGPRINT_REG(tmpheaptop);
+#endif
+				} // if(host == BAMBOO_NUM_OF_CORE) else ...
+				tmpheaptop += isize;
+
+				// set the gcsbstarttbl and gcsmemtbl
+				int tmpsbs = 1+(isize-remain-1)/BAMBOO_SMEM_SIZE;
+				for(int k = 1; k < tmpsbs; k++) {
+					gcsbstarttbl[sb+k] = (INTPTR)(-1);
+				}
+				sb += tmpsbs;
+				bound = (b<NUMCORES)?BAMBOO_SMEM_SIZE_L:BAMBOO_SMEM_SIZE;
+				BLOCKINDEX(tmpheaptop-1, &tmpsbs);
+				for(; b < tmpsbs; b++) {
+					gcsmemtbl[b] = bound;
+					if(b==NUMCORES-1) {
+						bound = BAMBOO_SMEM_SIZE;
+					}
+				}
+				if(((isize-remain)%(BAMBOO_SMEM_SIZE)) == 0) {
+					gcsbstarttbl[sb] = (INTPTR)(-1);
+					remain = ((tmpheaptop-gcbaseva)<(BAMBOO_LARGE_SMEM_BOUND)) ? 
+									 BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
+					gcsmemtbl[b] = bound;
+				} else {
+					gcsbstarttbl[sb] = (INTPTR)(tmpheaptop);
+					remain = tmpheaptop-gcbaseva;
+					gcsmemtbl[b] = remain%bound;
+					remain = bound - gcsmemtbl[b];
+				} // if(((isize-remain)%(BAMBOO_SMEM_SIZE)) == 0) else ...
+
+				// close current block and fill the header
 				memset(base, '\0', BAMBOO_CACHE_LINE_SIZE);
-				*((int*)base) = cpysize + BAMBOO_CACHE_LINE_SIZE;
+				*((int*)base) = isize + BAMBOO_CACHE_LINE_SIZE;
 				cpysize = 0;
 				base = tmpheaptop;
-				if(remain == 0) {
-					remain = ((tmpheaptop-gcbaseva)<(BAMBOO_LARGE_SMEM_BOUND)) ? 
-						       BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
-				} 
 				remain -= BAMBOO_CACHE_LINE_SIZE;
 				tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
-			} // if(cpysize > 0)
-
-			// move the large obj
-			memcpy(tmpheaptop, gcheaptop, size);
-			// fill the remaining space with -2 padding
-			memset(tmpheaptop+size, -2, isize-size);
-			// zero out original mem caching the lobj
-			memset(gcheaptop, '\0', size);
-#ifdef DEBUG
-			BAMBOO_DEBUGPRINT(0xea04);
-			BAMBOO_DEBUGPRINT_REG(gcheaptop);
-			BAMBOO_DEBUGPRINT_REG(tmpheaptop);
-			BAMBOO_DEBUGPRINT_REG(size);
-			BAMBOO_DEBUGPRINT_REG(isize);
-#endif
-			gcheaptop += size;
-			if(host == BAMBOO_NUM_OF_CORE) {
-				BAMBOO_START_CRITICAL_SECTION();
-				RuntimeHashadd_I(gcpointertbl, ptr, tmpheaptop);
-				BAMBOO_CLOSE_CRITICAL_SECTION();
-#ifdef DEBUG
-				BAMBOO_DEBUGPRINT(0xcdca);
-				BAMBOO_DEBUGPRINT_REG(ptr);
-				BAMBOO_DEBUGPRINT_REG(tmpheaptop);
-#endif
 			} else {
-				// send the original host core with the mapping info
-				send_msg_3(host, GCLOBJMAPPING, ptr, tmpheaptop);
-#ifdef DEBUG
-				BAMBOO_DEBUGPRINT(0xcdcb);
-				BAMBOO_DEBUGPRINT_REG(ptr);
+				remain -= isize;
+				// move the large obj
+				memcpy(tmpheaptop, gcheaptop, size);
+				// fill the remaining space with -2 padding
+				memset(tmpheaptop+size, -2, isize-size);
+				// zero out original mem caching the lobj
+				memset(gcheaptop, '\0', size);
+#ifdef DEBUG
+				BAMBOO_DEBUGPRINT(0xea06);
+				BAMBOO_DEBUGPRINT_REG(gcheaptop);
 				BAMBOO_DEBUGPRINT_REG(tmpheaptop);
+				BAMBOO_DEBUGPRINT_REG(size);
+				BAMBOO_DEBUGPRINT_REG(isize);
 #endif
-			} // if(host == BAMBOO_NUM_OF_CORE) else ...
-			tmpheaptop += isize;
-
-			// set the gcsbstarttbl
-			int tmpsbs = 1+(isize-remain-1)/BAMBOO_SMEM_SIZE;
-			for(int k = 1; k < tmpsbs; k++) {
-				gcsbstarttbl[b+k] = (INTPTR)(-1);
-			}
-			b += tmpsbs;
-			if(((isize-remain)%(BAMBOO_SMEM_SIZE)) == 0) {
-				gcsbstarttbl[b] = (INTPTR)(-1);
-				remain = ((tmpheaptop-gcbaseva)<(BAMBOO_LARGE_SMEM_BOUND)) ? 
-						     BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
-			} else {
-				gcsbstarttbl[b] = (INTPTR)(tmpheaptop);
-				remain = tmpheaptop-gcbaseva;
-				int bound = remain<(BAMBOO_LARGE_SMEM_BOUND)?(BAMBOO_SMEM_SIZE_L):(BAMBOO_SMEM_SIZE);
-				remain = bound - remain%bound;
-			} // if(((isize-remain)%(BAMBOO_SMEM_SIZE)) == 0) else ...
-
-			// close current block and fill the header
-			memset(base, '\0', BAMBOO_CACHE_LINE_SIZE);
-			*((int*)base) = isize + BAMBOO_CACHE_LINE_SIZE;
-			cpysize = 0;
-			base = tmpheaptop;
-			remain -= BAMBOO_CACHE_LINE_SIZE;
-			tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
-		} else {
-			remain -= isize;
-			// move the large obj
-			memcpy(tmpheaptop, gcheaptop, size);
-			// fill the remaining space with -2 padding
-			memset(tmpheaptop+size, -2, isize-size);
-			// zero out original mem caching the lobj
-			memset(gcheaptop, '\0', size);
-#ifdef DEBUG
-			BAMBOO_DEBUGPRINT(0xea05);
-			BAMBOO_DEBUGPRINT_REG(gcheaptop);
-			BAMBOO_DEBUGPRINT_REG(tmpheaptop);
-			BAMBOO_DEBUGPRINT_REG(size);
-			BAMBOO_DEBUGPRINT_REG(isize);
-#endif
-			gcheaptop += size;
-			cpysize += isize;
-			if(host == BAMBOO_NUM_OF_CORE) {
-				BAMBOO_START_CRITICAL_SECTION();
-				RuntimeHashadd_I(gcpointertbl, ptr, tmpheaptop);
-				BAMBOO_CLOSE_CRITICAL_SECTION();
+				gcheaptop += size;
+				cpysize += isize;
+				if(host == BAMBOO_NUM_OF_CORE) {
+					BAMBOO_START_CRITICAL_SECTION();
+					RuntimeHashadd_I(gcpointertbl, ptr, tmpheaptop);
+					BAMBOO_CLOSE_CRITICAL_SECTION();
 #ifdef DEBUG
-				BAMBOO_DEBUGPRINT(0xcdcc);
-				BAMBOO_DEBUGPRINT_REG(ptr);
-				BAMBOO_DEBUGPRINT_REG(tmpheaptop);
+					BAMBOO_DEBUGPRINT(0xcdcc);
+					BAMBOO_DEBUGPRINT_REG(ptr);
+					BAMBOO_DEBUGPRINT_REG(tmpheaptop);
 #endif
-			} else {
-				// send the original host core with the mapping info
-				send_msg_3(host, GCLOBJMAPPING, ptr, tmpheaptop);
+				} else {
+					// send the original host core with the mapping info
+					send_msg_3(host, GCLOBJMAPPING, ptr, tmpheaptop);
 #ifdef DEBUG
-				BAMBOO_DEBUGPRINT(0xcdcd);
-				BAMBOO_DEBUGPRINT_REG(ptr);
-				BAMBOO_DEBUGPRINT_REG(tmpheaptop);
+					BAMBOO_DEBUGPRINT(0xcdcd);
+					BAMBOO_DEBUGPRINT_REG(ptr);
+					BAMBOO_DEBUGPRINT_REG(tmpheaptop);
 #endif
-			} // if(host == BAMBOO_NUM_OF_CORE) else ...
-			tmpheaptop += isize;
-		} // if(remain < isize) else ...
-	} // while(gc_lobjmoreItems())
-	if(cpysize > 0) {
-		// close current block, fill the header
-		memset(base, '\0', BAMBOO_CACHE_LINE_SIZE);
-		*((int*)base) = cpysize + BAMBOO_CACHE_LINE_SIZE;
-	} else {
-		tmpheaptop -= BAMBOO_CACHE_LINE_SIZE;
-	}
-	gcheaptop = tmpheaptop;
-	// update the free mem list
-	tochange->size = (BAMBOO_BASE_VA)+(BAMBOO_SHARED_MEM_SIZE)-gcheaptop;
-	tochange->ptr = gcheaptop;
+				} // if(host == BAMBOO_NUM_OF_CORE) else ...
+				tmpheaptop += isize;
+
+				// update gcsmemtbl
+				if(gcsmemtbl[b] == 0) {
+					// add the header's size
+					gcsmemtbl[b] = BAMBOO_CACHE_LINE_SIZE;
+				}
+				gcsmemtbl[b] += isize;
+			} // if(remain < isize) else ...
+		} // while(gc_lobjmoreItems())
+		if(cpysize > 0) {
+			// close current block, fill the header
+			memset(base, '\0', BAMBOO_CACHE_LINE_SIZE);
+			*((int*)base) = cpysize + BAMBOO_CACHE_LINE_SIZE;
+			gcsmemtbl[b] = cpysize + BAMBOO_CACHE_LINE_SIZE;
+		} else {
+			tmpheaptop -= BAMBOO_CACHE_LINE_SIZE;
+		}
+		gcheaptop = tmpheaptop;
+	} // if(tomove == 0)
+
 #ifdef DEBUG
-	BAMBOO_DEBUGPRINT(0xea06);
+	BAMBOO_DEBUGPRINT(0xea07);
 	BAMBOO_DEBUGPRINT_REG(gcheaptop);
 #endif
-} // void moveLObjs()
 
-/*inline void updateFreeMemList() {
+	// update the free mem list
+	// create new free mem list according to gcsmemtbl
+	bool sethead = false;
 	struct freeMemItem * tochange = bamboo_free_mem_list->head;
 	if(tochange == NULL) {
 		bamboo_free_mem_list->head = tochange = 
 			(struct freeMemItem *)RUNMALLOC(sizeof(struct freeMemItem));
+		tochange->next = NULL;
 	}
-	// handle the top of the heap
-	tochange->ptr = gcheaptop;
-	tochange->size = BAMBOO_SHARED_MEM_SIZE + BAMBOO_BASE_VA - gcheaptop;
-	// zero out all these spare memory
-	memset(tochange->ptr, '\0', tochange->size);
-	if(bamboo_free_mem_list->tail != tochange) {
-		bamboo_free_mem_list->tail = tochange;
-		if(bamboo_free_mem_list->tail != NULL) {
-			RUNFREE(bamboo_free_mem_list->tail);
+	int startptr = 0;
+	size = 0;
+	bound = BAMBOO_SMEM_SIZE_L;
+	for(i = 0; i < gcnumblock; i++) {
+		if(gcsmemtbl[i] < bound) {
+			if(gcsmemtbl[i] == 0) {
+				// blank one
+				if(startptr == 0) {
+					// a start of a new free mem chunk
+					startptr = gcbaseva+((i<NUMCORES)?(i*BAMBOO_SMEM_SIZE_L)
+							:(BAMBOO_LARGE_SMEM_BOUND+(i-NUMCORES)*BAMBOO_SMEM_SIZE));
+				} // if(startptr == 0) 
+				size += bound;
+			} else {
+				if(startptr != 0) {
+					// the end of previous free mem chunk
+					tochange = addFreeMemItem(startptr,size,tochange,&sethead);
+					//startptr = 0;
+					//size = 0;
+				}
+				// start of a new free mem chunk
+				startptr = gcbaseva+((i<NUMCORES)?(i*BAMBOO_SMEM_SIZE_L)
+						:(BAMBOO_LARGE_SMEM_BOUND+(i-NUMCORES)*BAMBOO_SMEM_SIZE))+gcsmemtbl[i];
+				size = bound-gcsmemtbl[i];
+			} // if(gcsmemtbl[i] == 0) else
+		} else {
+			if(startptr != 0) {
+				// the end of previous free mem chunk
+				tochange = addFreeMemItem(startptr,size,tochange,&sethead);
+				startptr = 0;
+				size = 0;
+			} // if(startptr != 0) {
+		} // if(gcsmemtbl[i] < bound) else
+		if(i == NUMCORES-1) {
+			bound = BAMBOO_SMEM_SIZE;
 		}
+	} // for(i = 0; i < gcnumblock; i++) {
+	if(startptr != 0) {
+		tochange = addFreeMemItem(startptr, size, tochange, &sethead);
+		startptr = 0;
+		size = 0;
 	}
-} // void updateFreeMemList()
-*/
+
+#ifdef DEBUG
+	BAMBOO_DEBUGPRINT(0xea08);
+	BAMBOO_DEBUGPRINT_REG(gcheaptop);
+#endif
+} // void moveLObjs()
 
 // enqueue root objs
 inline void tomark(struct garbagelist * stackptr) {
@@ -1309,8 +1339,8 @@ inline void compact2Heaptophelper(int coren,
 		*numblocks = gcstopblock[gctopcore];
 		*p = gcloads[gctopcore];
 		BLOCKINDEX(*p, &b);
-		*remain = (b<NUMCORES)?((BAMBOO_SMEM_SIZE_L)-((*p)%(BAMBOO_SMEM_SIZE_L)))
-												  :((BAMBOO_SMEM_SIZE)-((*p)%(BAMBOO_SMEM_SIZE)));
+		*remain=(b<NUMCORES)?((BAMBOO_SMEM_SIZE_L)-((*p)%(BAMBOO_SMEM_SIZE_L)))
+											  :((BAMBOO_SMEM_SIZE)-((*p)%(BAMBOO_SMEM_SIZE)));
 #ifdef DEBUG
 		BAMBOO_DEBUGPRINT(0xd106);
 		BAMBOO_DEBUGPRINT_REG(gctopcore);
@@ -1378,53 +1408,6 @@ inline void compact2Heaptop() {
 #endif
 } // void compact2Heaptop()
 
-#if 0
-inline int nextTopcore(int topcore, bool direction) {
-	int nextopcore = topcore;
-	if((NUMCORES == 62) && (nextopcore>5)) {
-		nextopcore += 2;
-	}
-	int x = nextopcore / bamboo_height;
-	int y = nextopcore % bamboo_height;
-	if((direction && (y%2 == 0)) || ((!direction) && (y%2))) {
-		// increase
-		if(x == 7) {
-			if(direction) {
-				y++;
-			} else {
-				y--;
-			}
-		} else {
-			x++;
-		}
-	} else {
-		// decrease
-		if((x == 0) || ((x==1) &&(y==6))) {
-			if(direction) {
-				y++;
-				if(y==6) {
-					x = 1;
-				}
-			} else {
-				y--;
-				if(y==5) {
-					x = 0;
-				}
-			}
-		} else {
-			x--;
-		}
-	}
-	nextopcore = x*bamboo_height+y;
-	if(NUMCORES==62) {
-		if(x>0) {
-			nextopcore -= 2;
-		}
-	}
-	return nextopcore;
-} // int nextTopcore(int topcore, bool direction)
-#endif
-
 inline void resolvePendingMoveRequest() {
 #ifdef DEBUG
 	BAMBOO_DEBUGPRINT(0xeb01);
@@ -1571,7 +1554,7 @@ innernextSBlock:
 		orig->sblockindex = (orig->blockbase-BAMBOO_BASE_VA)/BAMBOO_SMEM_SIZE;
 	} else if(0 == (orig->blockbase%BAMBOO_SMEM_SIZE)) {
 		orig->sblockindex += 1;
-	} // if((orig->blockbase >= orig->bound) || (orig->ptr >= orig->bound) ...
+	} // if((orig->blockbase >= orig->bound) || (orig->ptr >= orig->bound)...
 
 	// check if this sblock should be omitted or have special start point
 	if(gcsbstarttbl[orig->sblockindex] == -1) {
@@ -1624,10 +1607,6 @@ inline bool initOrig_Dst(struct moveHelper * orig,
 	BAMBOO_DEBUGPRINT(0xef01);
 	BAMBOO_DEBUGPRINT_REG(to->base);
 #endif
-	/*if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
-		to->base += gcreservedsb * BAMBOO_SMEM_SIZE;
-		to->top += gcreservedsb * BAMBOO_SMEM_SIZE;
-	}*/
 	to->ptr = to->base + to->offset;
 
 	// init the orig ptr
@@ -1635,11 +1614,7 @@ inline bool initOrig_Dst(struct moveHelper * orig,
 	orig->base = to->base;
 	orig->bound = to->base + BAMBOO_SMEM_SIZE_L;
 	orig->blockbase = orig->base;
-	/*if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
-		orig->sblockindex = gcreservedsb;
-	} else {*/
-		orig->sblockindex = (orig->base - BAMBOO_BASE_VA) / BAMBOO_SMEM_SIZE;
-	//}
+	orig->sblockindex = (orig->base - BAMBOO_BASE_VA) / BAMBOO_SMEM_SIZE;
 #ifdef DEBUG
 	BAMBOO_DEBUGPRINT(0xef02);
 	BAMBOO_DEBUGPRINT_REG(orig->base);
@@ -1740,14 +1715,16 @@ innermoveobj:
 #ifdef DEBUG
 	BAMBOO_DEBUGPRINT(0xe203);
 	BAMBOO_DEBUGPRINT_REG(orig->ptr);
+	BAMBOO_DEBUGPRINT_REG(size);
 #endif
+	ALIGNSIZE(size, &isize); // no matter is the obj marked or not
+	                         // should be able to across it
 	if(mark == 1) {
 #ifdef DEBUG
 		BAMBOO_DEBUGPRINT(0xe204);
 #endif
 		// marked obj, copy it to current heap top
 		// check to see if remaining space is enough
-		ALIGNSIZE(size, &isize);
 		if(to->top + isize > to->bound) {
 			// fill -1 indicating the end of this block
 			/*if(to->top != to->bound) {
@@ -1764,7 +1741,8 @@ innermoveobj:
 				return true;
 			} // if(stopblock == to->numblocks)
 		} // if(to->top + isize > to->bound)
-		// set the mark field to 2, indicating that this obj has been moved and need to be flushed
+		// set the mark field to 2, indicating that this obj has been moved 
+		// and need to be flushed
 		((int *)(orig->ptr))[6] = 2;
 		if(to->ptr != orig->ptr) {
 			memcpy(to->ptr, orig->ptr, size);
@@ -1823,8 +1801,8 @@ inline int assignSpareMem_I(int sourcecore,
 													  int * startaddr) {
 	int b = 0;
 	BLOCKINDEX(gcloads[sourcecore], &b);
-	int boundptr = b<NUMCORES?(b+1)*BAMBOO_SMEM_SIZE_L
-		:BAMBOO_LARGE_SMEM_BOUND+(b-NUMCORES+1)*BAMBOO_SMEM_SIZE;
+	int boundptr = (b<NUMCORES)?((b+1)*BAMBOO_SMEM_SIZE_L)
+		:(BAMBOO_LARGE_SMEM_BOUND+(b-NUMCORES+1)*BAMBOO_SMEM_SIZE);
 	int remain = boundptr - gcloads[sourcecore];
 	int memneed = requiredmem + BAMBOO_CACHE_LINE_SIZE;
 	*startaddr = gcloads[sourcecore];
@@ -2141,7 +2119,7 @@ inline void flushRuntimeObj(struct garbagelist * stackptr) {
 
 inline void flush(struct garbagelist * stackptr) {
 	flushRuntimeObj(stackptr);
-	
+
 	while(gc_moreItems()) {
 #ifdef DEBUG
 		BAMBOO_DEBUGPRINT(0xe301);
@@ -2209,7 +2187,7 @@ inline void flush(struct garbagelist * stackptr) {
 			// restore the mark field, indicating that this obj has been flushed
 			((int *)(ptr))[6] = 0;
 		} // if(((int *)(ptr))[6] == 2)
-	} // while(moi != NULL)
+	} // while(gc_moreItems())
 #ifdef DEBUG
 	BAMBOO_DEBUGPRINT(0xe308);
 #endif
@@ -2350,7 +2328,7 @@ inline void gc(struct garbagelist * stackptr) {
 		}
 
 		gcphase = MARKPHASE;
-		// mark phase
+    // mark phase
 		while(MARKPHASE == gcphase) {
 			mark(isfirst, stackptr);
 			if(isfirst) {
@@ -2441,13 +2419,14 @@ inline void gc(struct garbagelist * stackptr) {
 		bool localcompact = true;
 		while((COMPACTPHASE == gcphase) || (SUBTLECOMPACTPHASE == gcphase)) {
 			if((!finishcompact) && iscontinue) {
-#ifdef DEBUG
+#ifdef GC_DEBUG
 				BAMBOO_DEBUGPRINT(0xe001);
+				BAMBOO_DEBUGPRINT_REG(numpbc);
 				BAMBOO_DEBUGPRINT_REG(gcblock2fill);
 #endif
 				finishcompact = compacthelper(orig, to, &filledblocks, 
 						                          &heaptopptr, &localcompact);
-#ifdef DEBUG
+#ifdef GC_DEBUG
 				BAMBOO_DEBUGPRINT(0xe002);
 				BAMBOO_DEBUGPRINT_REG(finishcompact);
 				BAMBOO_DEBUGPRINT_REG(gctomove);
@@ -2517,7 +2496,7 @@ inline void gc(struct garbagelist * stackptr) {
 		} // while(COMPACTPHASE == gcphase) 
 #ifdef GC_DEBUG
 		tprintf("prepare to move large objs \n");
-		dumpSMem();
+		//dumpSMem();
 #endif
 		// move largeObjs
 		moveLObjs();
@@ -2548,15 +2527,6 @@ inline void gc(struct garbagelist * stackptr) {
 		} // while(FLUSHPHASE == gcphase)
 		gcphase = FINISHPHASE;
 
-/*
-		// need to create free memory list  
-		updateFreeMemList();
-#ifdef GC_DEBUG
-		tprintf("flush phase finished \n");
-		//dumpSMem();
-#endif
-*/
-
 		gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
 		for(i = 1; i < NUMCORES; ++i) {
 			// send gc finish messages to all cores
diff --git a/Robust/src/Runtime/multicoregarbage.h b/Robust/src/Runtime/multicoregarbage.h
index 005baece..ff0e8575 100644
--- a/Robust/src/Runtime/multicoregarbage.h
+++ b/Robust/src/Runtime/multicoregarbage.h
@@ -10,7 +10,7 @@
 
 // data structures for GC
 #ifdef GC_DEBUG
-#define BAMBOO_SMEM_SIZE_L (BAMBOO_SMEM_SIZE)
+#define BAMBOO_SMEM_SIZE_L (BAMBOO_SMEM_SIZE * 2)
 #else
 #define BAMBOO_SMEM_SIZE_L (32 * BAMBOO_SMEM_SIZE)
 #endif
@@ -75,16 +75,21 @@ volatile bool gcismapped;
 //          moved or garbage collected.
 INTPTR * gcsbstarttbl;
 int gcreservedsb;  // number of reserved sblock for sbstarttbl
+int gcnumblock; // number of total blocks in the shared mem
 int gcbaseva; // base va for shared memory without reserved sblocks
 
+// table recording the number of used bytes in each block
+// Note: this table resides on master core's local heap
+int * gcsmemtbl;
+
 #define ISSHAREDOBJ(p) \
 	(((p)>gcbaseva)&&((p)<(gcbaseva+(BAMBOO_SHARED_MEM_SIZE))))
 
 #define ALIGNSIZE(s, as) \
 	(*((int*)as)) = (((s) & (~(BAMBOO_CACHE_LINE_MASK))) + (BAMBOO_CACHE_LINE_SIZE))
 
-// mapping of pointer to block # (start from 0), here the block # is the global
-// index
+// mapping of pointer to block # (start from 0), here the block # is 
+// the global index
 #define BLOCKINDEX(p, b) \
   { \
 		int t = (p) - gcbaseva; \
@@ -107,48 +112,11 @@ int gcbaseva; // base va for shared memory without reserved sblocks
 	}\
 }
 
-#if 0
-// mapping of pointer to host core (x,y)
-#define RESIDECORE(p, x, y) \
-  { \
-		if(1 == (NUMCORES)) { \
-			(*((int*)x)) = 0; \
-			(*((int*)y)) = 0; \
-		} else { \
-			int b; \
-			BLOCKINDEX((p), &b); \
-			bool reverse = (b / (NUMCORES)) % 2; \
-			int l = b % (NUMCORES); \
-			if(reverse) { \
-				if(62 == (NUMCORES)) { \
-					if(l < 14) { \
-						l += 1; \
-					} else { \
-						l += 2; \
-					} \
-				} \
-				(*((int*)y)) = bamboo_height - 1 - (l / bamboo_width); \
-			} else { \
-				if(62 == (NUMCORES)) {\
-					if (l > 47) {\
-						l += 1; \
-					} \
-				} \
-				(*((int*)y)) = l / bamboo_width; \
-			} \
-			if(((!reverse)&&(*((int*)y))%2) || ((reverse)&&((*((int*)y))%2==0))){ \
-				(*((int*)x)) = bamboo_width - 1 - (l % bamboo_width); \
-			} else { \
-				(*((int*)x)) = (l % bamboo_width); \
-			} \
-		} \
-	}
-#endif
-
 // NOTE: n starts from 0
-// mapping of heaptop (how many bytes there are in the local heap) to the number of
-// the block
-// the number of the block indicates that the block is the xth block on the local heap
+// mapping of heaptop (how many bytes there are in the local heap) to 
+// the number of the block
+// the number of the block indicates that the block is the xth block on 
+// the local heap
 #define NUMBLOCKS(s, n) \
 	if(s < (BAMBOO_SMEM_SIZE_L)) { \
 		(*((int*)(n))) = 0; \
@@ -165,45 +133,6 @@ int gcbaseva; // base va for shared memory without reserved sblocks
 
 // mapping of (core #, index of the block) to the global block index
 #define BLOCKINDEX2(c, n) (gc_core2block[(2*(c))+((n)%2)]+(124*((n)/2))) 
-#if 0
-#define BLOCKINDEX2(c, n, b) \
-  { \
-		int x; \
-		int y; \
-		int t; \
-		int cc = c; \
-		if((62 == (NUMCORES)) && (cc > 5)) cc += 2; \
-		x = cc / bamboo_height; \
-		y = cc % bamboo_height; \
-		if((n) % 2) { \
-			if(y % 2) { \
-				t = x + (bamboo_width - 1 - y) * bamboo_width; \
-			} else { \
-				t = bamboo_width - 1 - x + (bamboo_width - 1 - y) * bamboo_width; \
-			} \
-			if(62 == (NUMCORES)) {\
-				if(y>5) { \
-					t--; \
-				} else { \
-					t -= 2; \
-				} \
-			} \
-		} else { \
-			if(y % 2) { \
-				t = bamboo_width - 1 - x + y * bamboo_width; \
-			} else { \
-				t = x + y * bamboo_width; \
-			} \
-			if(62 == (NUMCORES)) { \
-				if(y > 5) { \
-					t--; \
-				} \
-			} \
-		} \
-		t += (NUMCORES) * (n); \
-		(*((int*)b)) = t; \
-	}
-#endif
 
 // mapping of (core #, number of the block) to the base pointer of the block
 #define BASEPTR(c, n, p) \
@@ -212,7 +141,8 @@ int gcbaseva; // base va for shared memory without reserved sblocks
 		if(b < (NUMCORES)) { \
 			(*((int*)p)) = gcbaseva + b * (BAMBOO_SMEM_SIZE_L); \
 		} else { \
-			(*((int*)p)) = gcbaseva+(BAMBOO_LARGE_SMEM_BOUND)+(b-(NUMCORES))*(BAMBOO_SMEM_SIZE); \
+			(*((int*)p)) = gcbaseva+(BAMBOO_LARGE_SMEM_BOUND)+ \
+			               (b-(NUMCORES))*(BAMBOO_SMEM_SIZE); \
 		} \
 	}
 
diff --git a/Robust/src/Runtime/multicoreruntime.h b/Robust/src/Runtime/multicoreruntime.h
index 5a7f861e..14ed7921 100644
--- a/Robust/src/Runtime/multicoreruntime.h
+++ b/Robust/src/Runtime/multicoreruntime.h
@@ -1,6 +1,10 @@
 #ifndef MULTICORE_RUNTIME
 #define MULTICORE_RUNTIME
 
+#ifndef INLINE // keep any definition supplied earlier
+#define INLINE    inline __attribute__((always_inline)) // GCC attribute: inline regardless of the usual heuristics
+#endif // INLINE
+
 ////////////////////////////////////////////////////////////////
 // global variables                                          //
 ///////////////////////////////////////////////////////////////
@@ -213,7 +217,7 @@ struct Queue * totransobjqueue; // queue to hold objs to be transferred
 #define BAMBOO_BASE_VA 0xd000000
 #ifdef GC_DEBUG
 #include "structdefs.h"
-#define BAMBOO_NUM_PAGES (NUMCORES*(1+1))
+#define BAMBOO_NUM_PAGES (NUMCORES*(2+3)+5)
 #define BAMBOO_PAGE_SIZE (16 * 16)
 #define BAMBOO_SMEM_SIZE (BAMBOO_PAGE_SIZE)
 #else
@@ -226,9 +230,23 @@ struct Queue * totransobjqueue; // queue to hold objs to be transferred
 #ifdef MULTICORE_GC
 #include "multicoregarbage.h"
 
+typedef enum {
+	SMEMLOCAL = 0x0, // 0x0, each core allocates from its own local blocks only
+	SMEMFIXED,       // 0x1, use local mem in lower address space(1 block only)
+	                 //      and global mem in higher address space
+	SMEMMIXED,        // 0x2, like FIXED mode but controlled by a threshold; not supported yet
+	SMEMGLOBAL,       // 0x3, any core may allocate from any free chunk
+	SMEMEND           // last enumerator; presumably an end-of-modes sentinel (TODO confirm)
+} SMEMSTRATEGY;
+
+SMEMSTRATEGY bamboo_smem_mode; //-DSMEML: LOCAL; -DSMEMF: FIXED; 
+                              //-DSMEMM: MIXED; -DSMEMG: GLOBAL;
+
 struct freeMemItem {
 	INTPTR ptr;
 	int size;
+	int startblock;  // global index of the block containing ptr
+	int endblock;    // global index of the block containing the chunk's last byte
 	struct freeMemItem * next;
 };
 
@@ -237,16 +255,13 @@ struct freeMemList {
 	struct freeMemItem * tail;
 };
 
-volatile bool smemflag;
 struct freeMemList * bamboo_free_mem_list;
-volatile INTPTR bamboo_cur_msp;
-volatile int bamboo_smem_size;
 #else
-volatile bool smemflag;
 volatile mspace bamboo_free_msp;
+#endif
+volatile bool smemflag;
 volatile INTPTR bamboo_cur_msp;
 volatile int bamboo_smem_size;
-#endif
 
 // for test TODO
 int total_num_t6;
@@ -296,11 +311,11 @@ bool reside;
 ////////////////////////////////////////////////////////////
 #ifdef TASK
 #ifdef MULTICORE
-inline void initialization(void) __attribute__((always_inline));
-inline void initCommunication(void) __attribute__((always_inline));
-inline void fakeExecution(void) __attribute__((always_inline));
-inline void terminate(void) __attribute__((always_inline));
-inline void initlock(struct ___Object___ * v) __attribute__((always_inline));
+INLINE void initialization(void);
+INLINE void initCommunication(void);
+INLINE void fakeExecution(void);
+INLINE void terminate(void);
+INLINE void initlock(struct ___Object___ * v);
 
 // lock related functions
 bool getreadlock(void* ptr);
@@ -317,81 +332,81 @@ void releasewritelock_r(void * lock, void * redirectlock);
 // if return -1: the lock request is redirected
 //            0: the lock request is approved
 //            1: the lock request is denied
-inline int processlockrequest(int locktype, 
+INLINE int processlockrequest(int locktype, 
 		                          int lock, 
 															int obj, 
 															int requestcore, 
 															int rootrequestcore, 
-															bool cache) __attribute__((always_inline));
-inline void processlockrelease(int locktype, 
+															bool cache);
+INLINE void processlockrelease(int locktype, 
 		                           int lock, 
 															 int redirectlock, 
-															 bool redirect)__attribute__((always_inline));
+															 bool redirect);
 
 // msg related functions
-inline void send_hanging_msg() __attribute__((always_inline));
-inline void send_msg_1(int targetcore, 
-		                   unsigned long n0) __attribute__((always_inline));
-inline void send_msg_2(int targetcore, 
+INLINE void send_hanging_msg();
+INLINE void send_msg_1(int targetcore, 
+		                   unsigned long n0);
+INLINE void send_msg_2(int targetcore, 
 		                   unsigned long n0, 
-											 unsigned long n1) __attribute__((always_inline));
-inline void send_msg_3(int targetcore, 
+											 unsigned long n1);
+INLINE void send_msg_3(int targetcore, 
 		                   unsigned long n0, 
 											 unsigned long n1, 
-											 unsigned long n2) __attribute__((always_inline));
-inline void send_msg_4(int targetcore, 
+											 unsigned long n2);
+INLINE void send_msg_4(int targetcore, 
 		                   unsigned long n0, 
 											 unsigned long n1, 
 											 unsigned long n2, 
-											 unsigned long n3) __attribute__((always_inline));
-inline void send_msg_5(int targetcore, 
+											 unsigned long n3);
+INLINE void send_msg_5(int targetcore, 
 		                   unsigned long n0, 
 											 unsigned long n1, 
 											 unsigned long n2, 
 											 unsigned long n3, 
-											 unsigned long n4) __attribute__((always_inline));
-inline void send_msg_6(int targetcore, 
+											 unsigned long n4);
+INLINE void send_msg_6(int targetcore, 
 		                   unsigned long n0, 
 											 unsigned long n1, 
 											 unsigned long n2, 
 											 unsigned long n3, 
 											 unsigned long n4, 
-											 unsigned long n5) __attribute__((always_inline));
-inline void cache_msg_2(int targetcore, 
+											 unsigned long n5);
+INLINE void cache_msg_2(int targetcore, 
 		                    unsigned long n0, 
-												unsigned long n1) __attribute__((always_inline));
-inline void cache_msg_3(int targetcore, 
+												unsigned long n1);
+INLINE void cache_msg_3(int targetcore, 
 		                    unsigned long n0, 
 												unsigned long n1, 
-												unsigned long n2) __attribute__((always_inline));
-inline void cache_msg_4(int targetcore, 
+												unsigned long n2);
+INLINE void cache_msg_4(int targetcore, 
 		                    unsigned long n0, 
 												unsigned long n1, 
 												unsigned long n2, 
-												unsigned long n3) __attribute__((always_inline));
-inline void cache_msg_5(int targetcore, 
+												unsigned long n3);
+INLINE void cache_msg_5(int targetcore, 
 		                    unsigned long n0, 
 												unsigned long n1, 
 												unsigned long n2, 
 												unsigned long n3, 
-												unsigned long n4) __attribute__((always_inline));
-inline void cache_msg_6(int targetcore, 
+												unsigned long n4);
+INLINE void cache_msg_6(int targetcore, 
 		                    unsigned long n0, 
 												unsigned long n1, 
 												unsigned long n2, 
 												unsigned long n3, 
 												unsigned long n4, 
-												unsigned long n5) __attribute__((always_inline));
-inline void transferObject(struct transObjInfo * transObj);
-inline int receiveMsg(void) __attribute__((always_inline));
+												unsigned long n5);
+INLINE void transferObject(struct transObjInfo * transObj);
+INLINE int receiveMsg(void);
 
 #ifdef MULTICORE_GC
-inline void transferMarkResults() __attribute__((always_inline));
+INLINE void transferMarkResults();
 #endif
 
 #ifdef PROFILE
-inline void profileTaskStart(char * taskname) __attribute__((always_inline));
-inline void profileTaskEnd(void) __attribute__((always_inline));
+INLINE void profileTaskStart(char * taskname);
+INLINE void profileTaskEnd(void);
 void outputProfileData();
 #endif  // #ifdef PROFILE
 ///////////////////////////////////////////////////////////
diff --git a/Robust/src/Runtime/multicoretask.c b/Robust/src/Runtime/multicoretask.c
index 697599e0..d3d62bc3 100644
--- a/Robust/src/Runtime/multicoretask.c
+++ b/Robust/src/Runtime/multicoretask.c
@@ -18,6 +18,24 @@ int enqueuetasks_I(struct parameterwrapper *parameter,
 									 int * enterflags, 
 									 int numenterflags);
 
+#ifdef MULTICORE_GC
+// Choose the shared memory allocation strategy from the build flags
+// -DSMEML / -DSMEMF / -DSMEMM / -DSMEMG (checked in that order).
+inline __attribute__((always_inline)) 
+void setupsmemmode(void) {
+#if defined(SMEMF) && !defined(SMEML)
+	bamboo_smem_mode = SMEMFIXED;
+#elif defined(SMEMM) && !defined(SMEML)
+	bamboo_smem_mode = SMEMMIXED;
+#elif defined(SMEMG) && !defined(SMEML)
+	bamboo_smem_mode = SMEMGLOBAL;
+#else
+	// SMEML was given, or no strategy flag at all: use local mode
+	bamboo_smem_mode = SMEMLOCAL;
+#endif
+} // void setupsmemmode(void)
+#endif
+
 inline __attribute__((always_inline)) 
 void initruntimedata() {
 	int i;
@@ -94,6 +112,7 @@ void initruntimedata() {
 	gcmovepending = 0;
 	gcblock2fill = 0;
 	gcsbstarttbl = BAMBOO_BASE_VA;
+	gcsmemtbl = RUNMALLOC_I(sizeof(int)*gcnumblock);
 #else
 	// create the lock table, lockresult table and obj queue
   locktable.size = 20;
@@ -1162,42 +1181,208 @@ inline void addNewObjInfo(void * nobj) {
 }
 #endif
 
-void * smemalloc(int size, 
-		             int * allocsize) {
-	void * mem = NULL;
-	int isize = size+(BAMBOO_CACHE_LINE_SIZE);
-	int toallocate = ((size+(BAMBOO_CACHE_LINE_SIZE))>(BAMBOO_SMEM_SIZE)) ? 
-			             (size+(BAMBOO_CACHE_LINE_SIZE)):(BAMBOO_SMEM_SIZE);
-#ifdef MULTICORE_GC
-	// go through free mem list for suitable blocks
+struct freeMemItem * findFreeMemChunk(int coren, // core requesting memory
+		                                  int isize, // bytes needed
+		                                  int * tofindb) { // out: global index of the core's block being searched for
 	struct freeMemItem * freemem = bamboo_free_mem_list->head;
 	struct freeMemItem * prev = NULL;
+	int i = 0; // which of the core's two gc_core2block entries is in use
+	int j = 0; // round counter; each round lies 124 blocks further on
+	*tofindb = gc_core2block[2*coren+i]+124*j;
+	// return the first free chunk usable under bamboo_smem_mode, or NULL if none
 	do {
-		if(freemem->size >= isize) {
+		int foundsmem = 0; // 0: keep looking, 1: chunk found, 2: give up
+		switch(bamboo_smem_mode) {
+			case SMEMLOCAL: {
+				int startb = freemem->startblock;
+				int endb = freemem->endblock;
+				while(startb > *tofindb) { // chunk starts past the wanted block: step to the core's next block
+					i++;
+					if(2==i) {
+						i = 0;
+						j++;
+					}
+					*tofindb = gc_core2block[2*coren+i]+124*j;
+				} // while(startb > tofindb)
+				if(startb <= *tofindb) {
+					if((endb >= *tofindb) && (freemem->size >= isize)) { // chunk covers the block and is big enough
+						foundsmem = 1;
+					} else if(*tofindb > gcnumblock-1) {
+						// no more local mem
+						foundsmem = 2;
+					} // if(endb >= tofindb) 
+				} // if(startb <= tofindb)
+				break;
+			}
+
+			case SMEMFIXED: {
+				int startb = freemem->startblock;
+				int endb = freemem->endblock;
+				if(startb <= *tofindb) { // tofindb never advances in this mode: it stays the core's first block
+					if((endb >= *tofindb)  && (freemem->size >= isize)) {
+						foundsmem = 1;
+					} 
+				} else {
+					// use the global mem
+					if(((startb > NUMCORES-1) && (freemem->size >= isize)) || // chunk starts above block NUMCORES-1
+							((endb > NUMCORES-1) && ((freemem->size-
+								(gcbaseva+BAMBOO_LARGE_SMEM_BOUND-freemem->ptr))>=isize))) { // or its part above the large-block bound is big enough
+						foundsmem = 1;
+					}
+				}
+				break;
+			}
+
+			case SMEMMIXED: {
+				// TODO not supported yet
+				BAMBOO_EXIT(0xe001);
+				break;
+			}
+
+			case SMEMGLOBAL: {
+		    foundsmem = (freemem->size >= isize); // any chunk that is big enough will do
+				break;
+			}
+			default:
+				break;
+		}
+
+		if(1 == foundsmem) {
 			// found one
 			break;
+		} else if (2 == foundsmem) {
+			// terminate, no more mem
+			freemem = NULL;
+			break;
 		}
 		prev = freemem;
 		freemem = freemem->next;
 	} while(freemem != NULL);
-	if(freemem != NULL) {
-		mem = (void *)(freemem->ptr);
-		// check the remaining space in this block
-		int remain = (int)(mem-(BAMBOO_BASE_VA));
-		int bound = (BAMBOO_SMEM_SIZE);
-		if(remain < BAMBOO_LARGE_SMEM_BOUND) {
-			bound = (BAMBOO_SMEM_SIZE_L);
-		}
-		remain = bound - remain%bound;
-		if(remain < isize) {
-			// this object acrosses blocks
-			*allocsize = isize;
-		} else {
-			// round the asigned block to the end of the current block
-			*allocsize = remain;
-		}
+
+	return freemem; // NULL when the list ran out or local mem is exhausted
+} // struct freeMemItem * findFreeMemChunk(int, int, int *)
+
+// Allocate isize bytes for the core owning block tofindb, preferring the
+// part of freemem at or above that block's base address and falling back
+// to the tail of the chunk. Shrinks or splits freemem accordingly and
+// returns the start address; *allocsize receives the bytes taken.
+void * localmalloc(int tofindb,
+		               int isize,
+		               struct freeMemItem * freemem,
+		               int * allocsize) {
+	void * mem = NULL;
+	// base address of block tofindb (same layout as BASEPTR)
+	int tmpptr = gcbaseva+((tofindb<NUMCORES)?tofindb*BAMBOO_SMEM_SIZE_L
+		:BAMBOO_LARGE_SMEM_BOUND+(tofindb-NUMCORES)*BAMBOO_SMEM_SIZE);
+	if((freemem->size+freemem->ptr-tmpptr)>=isize) {
+		mem = (tmpptr>freemem->ptr)?((void *)tmpptr):((void *)freemem->ptr);
+	} else {
+		mem = (void *)(freemem->size+freemem->ptr-isize);
+	}
+	// check the remaining space in this block
+	int remain = (int)mem-gcbaseva;
+	int bound = (remain < BAMBOO_LARGE_SMEM_BOUND) ?
+		(BAMBOO_SMEM_SIZE_L) : (BAMBOO_SMEM_SIZE);
+	remain = bound - remain%bound;
+	// an object crossing blocks gets exactly isize; otherwise round the
+	// assigned space up to the end of the current block
+	*allocsize = (remain < isize) ? isize : remain;
+	if(freemem->ptr == (int)mem) {
 		freemem->ptr = ((void*)freemem->ptr) + (*allocsize);
 		freemem->size -= *allocsize;
+		BLOCKINDEX(freemem->ptr, &(freemem->startblock));
+	} else if((freemem->ptr+freemem->size) == ((int)mem+(*allocsize))) {
+		freemem->size -= *allocsize;
+		BLOCKINDEX(((int)mem)-1, &(freemem->endblock));
+	} else {
+		struct freeMemItem * tmp = 
+			(struct freeMemItem *)RUNMALLOC(sizeof(struct freeMemItem));
+		tmp->ptr = (int)mem+*allocsize;
+		tmp->size = freemem->ptr+freemem->size-(int)mem-*allocsize;
+		BLOCKINDEX(tmp->ptr, &(tmp->startblock));
+		tmp->endblock = freemem->endblock;
+		tmp->next = freemem->next;
+		freemem->next = tmp;
+		if(bamboo_free_mem_list->tail == freemem) {
+			bamboo_free_mem_list->tail = tmp; // keep the list tail valid
+		}
+		freemem->size = (int)mem - freemem->ptr;
+		BLOCKINDEX(((int)mem-1), &(freemem->endblock));
+	}
+	return mem;
+} // void * localmalloc(int, int, struct freeMemItem *, int *)
+
+// Allocate from the start of freemem regardless of which core owns the
+// block. Returns the chunk's old start address; *allocsize receives the
+// number of bytes taken: isize when the request crosses a block boundary,
+// otherwise the space left up to the end of the current block.
+void * globalmalloc(int isize,
+		                struct freeMemItem * freemem,
+		                int * allocsize) {
+	void * mem = (void *)(freemem->ptr);
+	// blocks are laid out from gcbaseva (see BLOCKINDEX/BASEPTR), so the
+	// in-block offset must not include the reserved sblocks
+	int remain = (int)(freemem->ptr-gcbaseva);
+	int bound = (remain < BAMBOO_LARGE_SMEM_BOUND) ?
+		(BAMBOO_SMEM_SIZE_L) : (BAMBOO_SMEM_SIZE);
+	remain = bound - remain%bound;
+	// an object crossing blocks gets exactly isize; otherwise round the
+	// assigned space up to the end of the current block
+	*allocsize = (remain < isize) ? isize : remain;
+	freemem->ptr += *allocsize;
+	freemem->size -= *allocsize;
+	BLOCKINDEX(freemem->ptr, &(freemem->startblock));
+	return mem;
+} // void * globalmalloc(int, struct freeMemItem *, int *)
+
+// malloc from the shared memory
+void * smemalloc(int coren,
+		             int size, 
+		             int * allocsize) {
+	void * mem = NULL;
+	int isize = size+(BAMBOO_CACHE_LINE_SIZE);
+	int toallocate = ((size+(BAMBOO_CACHE_LINE_SIZE))>(BAMBOO_SMEM_SIZE)) ? 
+			             (size+(BAMBOO_CACHE_LINE_SIZE)):(BAMBOO_SMEM_SIZE);
+#ifdef MULTICORE_GC
+	// go through free mem list for suitable chunks
+	int tofindb = 0;
+	struct freeMemItem * freemem = findFreeMemChunk(coren, isize, &tofindb);
+
+	// allocate shared mem if available
+	if(freemem != NULL) {
+		switch(bamboo_smem_mode) {
+			case SMEMLOCAL: {
+				mem = localmalloc(tofindb, isize, freemem, allocsize);
+				break;
+			}
+
+			case SMEMFIXED: {
+				int startb = freemem->startblock;
+				int endb = freemem->endblock;
+				if(startb > tofindb) {
+					// malloc on global mem
+					mem = globalmalloc(isize, freemem, allocsize);
+				} else {
+					// malloc on local mem
+					mem = localmalloc(tofindb, isize, freemem, allocsize);
+				}
+				break;
+			}
+
+			case SMEMMIXED: {
+				// TODO not supported yet
+				BAMBOO_EXIT(0xe002);
+				break;
+			}
+
+			case SMEMGLOBAL: {
+				mem = globalmalloc(isize,freemem, allocsize);
+				break;
+			}
+
+			default:
+				break;
+		}
 	} else {
 #else
 	mem = mspace_calloc(bamboo_free_msp, 1, isize);
@@ -1250,12 +1435,12 @@ msg:
 				RUNMALLOC_I(sizeof(struct transObjInfo));
       int k = 0;
 #ifdef DEBUG
-#ifndef TILERA
+#ifndef CLOSE_PRINT
 			BAMBOO_DEBUGPRINT(0xe880);
 #endif
 #endif
       if(BAMBOO_NUM_OF_CORE > NUMCORES - 1) {
-#ifndef TILERA
+#ifndef CLOSE_PRINT
 				BAMBOO_DEBUGPRINT_REG(msgdata[2]);
 #endif
 				BAMBOO_EXIT(0xa002);
@@ -1267,13 +1452,13 @@ msg:
       for(k = 0; k < transObj->length; ++k) {
 				transObj->queues[2*k] = msgdata[3+2*k];
 #ifdef DEBUG
-#ifndef TILERA
+#ifndef CLOSE_PRINT
 				BAMBOO_DEBUGPRINT_REG(transObj->queues[2*k]);
 #endif
 #endif
 				transObj->queues[2*k+1] = msgdata[3+2*k+1];
 #ifdef DEBUG
-#ifndef TILERA
+#ifndef CLOSE_PRINT
 				BAMBOO_DEBUGPRINT_REG(transObj->queues[2*k+1]);
 #endif
 #endif
@@ -1308,14 +1493,14 @@ msg:
       // receive a stall msg
       if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
 		  // non startup core can not receive stall msg
-#ifndef TILERA
+#ifndef CLOSE_PRINT
 				BAMBOO_DEBUGPRINT_REG(msgdata[1]);
 #endif
 				BAMBOO_EXIT(0xa003);
       } 
       if(msgdata[1] < NUMCORES) {
 #ifdef DEBUG
-#ifndef TILERA
+#ifndef CLOSE_PRINT
 				BAMBOO_DEBUGPRINT(0xe881);
 #endif
 #endif
@@ -1357,14 +1542,14 @@ msg:
     case LOCKGROUNT: {
       // receive lock grount msg
       if(BAMBOO_NUM_OF_CORE > NUMCORES - 1) {
-#ifndef TILERA
+#ifndef CLOSE_PRINT
 				BAMBOO_DEBUGPRINT_REG(msgdata[2]);
 #endif
 				BAMBOO_EXIT(0xa004);
       } 
       if((lockobj == msgdata[2]) && (lock2require == msgdata[3])) {
 #ifdef DEBUG
-#ifndef TILERA
+#ifndef CLOSE_PRINT
 				BAMBOO_DEBUGPRINT(0xe882);
 #endif
 #endif
@@ -1375,7 +1560,7 @@ msg:
 #endif
 			} else {
 				// conflicts on lockresults
-#ifndef TILERA
+#ifndef CLOSE_PRINT
 				BAMBOO_DEBUGPRINT_REG(msgdata[2]);
 #endif
 				BAMBOO_EXIT(0xa005);
@@ -1386,14 +1571,14 @@ msg:
     case LOCKDENY: {
       // receive lock deny msg
       if(BAMBOO_NUM_OF_CORE > NUMCORES - 1) {
-#ifndef TILERA
+#ifndef CLOSE_PRINT
 				BAMBOO_DEBUGPRINT_REG(msgdata[2]);
 #endif
 				BAMBOO_EXIT(0xa006);
       } 
       if((lockobj == msgdata[2]) && (lock2require == msgdata[3])) {
 #ifdef DEBUG
-#ifndef TILERA
+#ifndef CLOSE_PRINT
 				BAMBOO_DEBUGPRINT(0xe883);
 #endif
 #endif
@@ -1404,7 +1589,7 @@ msg:
 #endif
 				} else {
 				// conflicts on lockresults
-#ifndef TILERA
+#ifndef CLOSE_PRINT
 				BAMBOO_DEBUGPRINT_REG(msgdata[2]);
 #endif
 				BAMBOO_EXIT(0xa007);
@@ -1427,7 +1612,7 @@ msg:
 				BAMBOO_EXIT(0xa008);
       }
 #ifdef DEBUG
-#ifndef TILEAR
+#ifndef CLOSE_PRINT
 			BAMBOO_DEBUGPRINT(0xe885);
 #endif
 #endif
@@ -1446,13 +1631,13 @@ msg:
       // receive a profile output finish msg
       if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
 				// non startup core can not receive profile output finish msg
-#ifndef TILERA
+#ifndef CLOSE_PRINT
 				BAMBOO_DEBUGPRINT_REG(msgdata[1]);
 #endif
 				BAMBOO_EXIT(0xa009);
       }
 #ifdef DEBUG
-#ifndef TILERA
+#ifndef CLOSE_PRINT
 			BAMBOO_DEBUGPRINT(0xe886);
 #endif
 #endif
@@ -1492,14 +1677,14 @@ msg:
 	case REDIRECTGROUNT: {
 		// receive a lock grant msg with redirect info
 		if(BAMBOO_NUM_OF_CORE > NUMCORES - 1) {
-#ifndef TILERA
+#ifndef CLOSE_PRINT
 			BAMBOO_DEBUGPRINT_REG(msgdata[2]);
 #endif
 			BAMBOO_EXIT(0xa00a);
 		}
 		if(lockobj == msgdata[2]) {
 #ifdef DEBUG
-#ifndef TILERA
+#ifndef CLOSE_PRINT
 		  BAMBOO_DEBUGPRINT(0xe891);
 #endif
 #endif
@@ -1511,7 +1696,7 @@ msg:
 #endif
 		} else {
 		  // conflicts on lockresults
-#ifndef TILERA
+#ifndef CLOSE_PRINT
 		  BAMBOO_DEBUGPRINT_REG(msgdata[2]);
 #endif
 		  BAMBOO_EXIT(0xa00b);
@@ -1522,14 +1707,14 @@ msg:
 	case REDIRECTDENY: {
 	  // receive a lock deny msg with redirect info
 	  if(BAMBOO_NUM_OF_CORE > NUMCORES - 1) {
-#ifndef TILERA
+#ifndef CLOSE_PRINT
 		  BAMBOO_DEBUGPRINT_REG(msgdata[2]);
 #endif
 		  BAMBOO_EXIT(0xa00c);
 	  }
 		if(lockobj == msgdata[2]) {
 #ifdef DEBUG
-#ifndef TILERA
+#ifndef CLOSE_PRINT
 		  BAMBOO_DEBUGPRINT(0xe892);
 #endif
 #endif
@@ -1540,7 +1725,7 @@ msg:
 #endif
 		} else {
 		  // conflicts on lockresults
-#ifndef TILERA
+#ifndef CLOSE_PRINT
 		  BAMBOO_DEBUGPRINT_REG(msgdata[2]);
 #endif
 		  BAMBOO_EXIT(0xa00d);
@@ -1564,7 +1749,7 @@ msg:
 		} else {
 		  // send response msg
 #ifdef DEBUG
-#ifndef TILERA
+#ifndef CLOSE_PRINT
 		  BAMBOO_DEBUGPRINT(0xe887);
 #endif
 #endif
@@ -1585,13 +1770,13 @@ msg:
 	  // receive a status confirm info
 	  if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
 		  // wrong core to receive such msg
-#ifndef TILERA
+#ifndef CLOSE_PRINT
 		  BAMBOO_DEBUGPRINT_REG(msgdata[2]);
 #endif
 		  BAMBOO_EXIT(0xa00f);
 		} else {
 #ifdef DEBUG
-#ifndef TILERA
+#ifndef CLOSE_PRINT
 		  BAMBOO_DEBUGPRINT(0xe888);
 #endif
 #endif
@@ -1608,7 +1793,7 @@ msg:
 	case TERMINATE: {
 	  // receive a terminate msg
 #ifdef DEBUG
-#ifndef TILERA
+#ifndef CLOSE_PRINT
 		BAMBOO_DEBUGPRINT(0xe889);
 #endif
 #endif
@@ -1621,13 +1806,13 @@ msg:
 	  // receive a shared memory request msg
 	  if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
 		  // wrong core to receive such msg
-#ifndef TILERA
+#ifndef CLOSE_PRINT
 		  BAMBOO_DEBUGPRINT_REG(msgdata[2]);
 #endif
 		  BAMBOO_EXIT(0xa010);
 		} else {
 #ifdef DEBUG
-#ifndef TILERA
+#ifndef CLOSE_PRINT
 		  BAMBOO_DEBUGPRINT(0xe88a);
 #endif
 #endif
@@ -1638,7 +1823,7 @@ msg:
 			}
 #endif
 			int allocsize = 0;
-		  void * mem = smemalloc(msgdata[1], &allocsize);
+		  void * mem = smemalloc(msgdata[2], msgdata[1], &allocsize);
 			if(mem == NULL) {
 				break;
 			}
@@ -1655,7 +1840,7 @@ msg:
 	case MEMRESPONSE: {
 		// receive a shared memory response msg
 #ifdef DEBUG
-#ifndef TILERA
+#ifndef CLOSE_PRINT
 	  BAMBOO_DEBUGPRINT(0xe88b);
 #endif
 #endif
@@ -1703,7 +1888,7 @@ msg:
 	case GCSTART: {
 		// receive a start GC msg
 #ifdef DEBUG
-#ifndef TILERA
+#ifndef CLOSE_PRINT
 	  BAMBOO_DEBUGPRINT(0xe88c);
 #endif
 #endif
@@ -1729,7 +1914,7 @@ msg:
 		// received a init phase finish msg
 		if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
 		  // non startup core can not receive this msg
-#ifndef TILERA
+#ifndef CLOSE_PRINT
 		  BAMBOO_DEBUGPRINT_REG(msgdata[1]);
 #endif
 		  BAMBOO_EXIT(0xb001);
@@ -1747,7 +1932,7 @@ msg:
 		// received a mark phase finish msg
 		if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
 		  // non startup core can not receive this msg
-#ifndef TILERA
+#ifndef CLOSE_PRINT
 		  BAMBOO_DEBUGPRINT_REG(msgdata[1]);
 #endif
 		  BAMBOO_EXIT(0xb002);
@@ -1765,7 +1950,7 @@ msg:
 		if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
 		  // non startup core can not receive this msg
 		  // return -1
-#ifndef TILERA
+#ifndef CLOSE_PRINT
 		  BAMBOO_DEBUGPRINT_REG(msgdata[1]);
 #endif
 		  BAMBOO_EXIT(0xb003);
@@ -1836,7 +2021,7 @@ msg:
 		if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
 		  // non startup core can not receive this msg
 		  // return -1
-#ifndef TILERA
+#ifndef CLOSE_PRINT
 		  BAMBOO_DEBUGPRINT_REG(msgdata[1]);
 #endif
 		  BAMBOO_EXIT(0xb004);
@@ -1877,7 +2062,7 @@ msg:
 		// received a marked phase finish confirm response msg
 		if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
 		  // wrong core to receive such msg
-#ifndef TILERA
+#ifndef CLOSE_PRINT
 		  BAMBOO_DEBUGPRINT_REG(msgdata[2]);
 #endif
 		  BAMBOO_EXIT(0xb006);
@@ -1959,7 +2144,7 @@ msg:
 		numconfirm--;
 
 		if(BAMBOO_NUM_OF_CORE > NUMCORES - 1) {
-#ifndef TILERA
+#ifndef CLOSE_PRINT
 			BAMBOO_DEBUGPRINT_REG(msgdata[2]);
 #endif
 			BAMBOO_EXIT(0xb009);
@@ -1997,7 +2182,7 @@ msg:
 	msgtype = -1;
 	msglength = 30;
 #ifdef DEBUG
-#ifndef TILERA
+#ifndef CLOSE_PRINT
 	BAMBOO_DEBUGPRINT(0xe88d);
 #endif
 #endif
@@ -2014,7 +2199,7 @@ msg:
 } else {
 	// not a whole msg
 #ifdef DEBUG
-#ifndef TILERA
+#ifndef CLOSE_PRINT
 	BAMBOO_DEBUGPRINT(0xe88e);
 #endif
 #endif
@@ -2277,7 +2462,6 @@ void executetasks() {
   int andmask=0;
   int checkmask=0;
 
-
 newtask:
   while(hashsize(activetasks)>0) {
 #ifdef MULTICORE_GC
@@ -2296,11 +2480,11 @@ newtask:
 #endif
 #endif
 	  busystatus = true;
-      currtpd=(struct taskparamdescriptor *) getfirstkey(activetasks);
-      genfreekey(activetasks, currtpd);
+		currtpd=(struct taskparamdescriptor *) getfirstkey(activetasks);
+		genfreekey(activetasks, currtpd);
 
-      numparams=currtpd->task->numParameters;
-      numtotal=currtpd->task->numTotal;
+		numparams=currtpd->task->numParameters;
+		numtotal=currtpd->task->numTotal;
 
 	  // clear the lockRedirectTbl 
 		// (TODO, this table should be empty after all locks are released)
diff --git a/Robust/src/Runtime/runtime.h b/Robust/src/Runtime/runtime.h
index b34e9a54..fca662c4 100644
--- a/Robust/src/Runtime/runtime.h
+++ b/Robust/src/Runtime/runtime.h
@@ -138,17 +138,20 @@ extern struct ___Object___ * ___fcrevert___;
 
 #ifdef MULTICORE
 inline void run(void * arg);
+#ifdef MULTICORE_GC
+inline void setupsmemmode(void);
+#endif
 int receiveObject(void);
 void flagorand(void * ptr, int ormask, int andmask, struct parameterwrapper ** queues, int length);
 void flagorandinit(void * ptr, int ormask, int andmask);
-void enqueueObject(void * ptr, struct parameterwrapper ** queues, int length);
+void enqueueObject(void * ptr, struct parameterwrapper ** queues,int length);
 #ifdef PROFILE
 inline void setTaskExitIndex(int index);
 inline void addNewObjInfo(void * nobj);
 #endif
 int * getAliasLock(void ** ptrs, int length, struct RuntimeHash * tbl);
 void addAliasLock(void * ptr, int lock);
-void * smemalloc(int size, int * allocsize);
+void * smemalloc(int coren, int size, int * allocsize);
 #else
 void flagorand(void * ptr, int ormask, int andmask);
 void flagorandinit(void * ptr, int ormask, int andmask);
diff --git a/Robust/src/buildscript b/Robust/src/buildscript
index 98b24f6a..b6a8b3a5 100755
--- a/Robust/src/buildscript
+++ b/Robust/src/buildscript
@@ -602,7 +602,7 @@ cd $TILERADIR
 make clean
 rm ./*
 
-export TILERACFLAGS="-DTASK -DMULTICORE"
+export TILERACFLAGS="-DTASK -DMULTICORE -DCLOSE_PRINT"
 
 if $CACHEFLUSHFLAG
 then # print path
-- 
2.34.1