From: jzhou <jzhou>
Date: Thu, 2 Sep 2010 17:04:11 +0000 (+0000)
Subject: Clean multicore version code
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=a259b8eeddbcf764b17dd4a12700d0f7578f943a;p=IRC.git

Clean multicore version code
---

diff --git a/Robust/src/Runtime/GCSharedHash.c b/Robust/src/Runtime/GCSharedHash.c
deleted file mode 100755
index 04e92ffb..00000000
--- a/Robust/src/Runtime/GCSharedHash.c
+++ /dev/null
@@ -1,472 +0,0 @@
-#ifdef MULTICORE_GC
-
-#include "GCSharedHash.h"
-#ifdef MULTICORE
-#include "runtime_arch.h"
-#else
-#include <stdio.h>
-#endif
-
-#ifndef INTPTR
-#ifdef BIT64
-#define INTPTR long
-#define INTPTRSHIFT 3
-#else
-#define INTPTR int
-#define INTPTRSHIFT 2
-#endif
-#endif
-
-#ifndef INLINE
-#define INLINE    inline __attribute__((always_inline))
-#endif // #ifndef INLINE
-
-#define GC_SHIFT_BITS  4
-
-/* GCSHARED HASH ********************************************************/
-
-// params: startaddr -- the start addr of the shared memory
-//         rsize -- remaining size of the available shared memory
-struct GCSharedHash * noargallocateGCSharedHash() {
-  return allocateGCSharedHash(100);
-}
-
-struct GCSharedHash * allocateGCSharedHash(int size) {
-  struct GCSharedHash *thisvar; 
-  if (size <= 0) {
-#ifdef MULTICORE
-    BAMBOO_EXIT(0xf201);
-#else
-    printf("Negative Hashtable size Exception\n");
-    exit(-1);
-#endif
-  } 
-  thisvar=(struct GCSharedHash *)FREEMALLOC_NGC(sizeof(struct GCSharedHash));
-  if(thisvar == NULL) {
-	return NULL;
-  }
-  thisvar->size = size;
-  thisvar->bucket = 
-	(struct GCSharedNode **)FREEMALLOC_NGC(sizeof(struct GCSharedNode *)*size);
-  if(thisvar->bucket == NULL) {
-	FREE_NGC(thisvar);
-	return NULL;
-  }
-  /* Set allocation blocks*/
-  thisvar->listhead=NULL;
-  thisvar->listtail=NULL;
-  /*Set data counts*/
-  thisvar->numelements = 0;
-  return thisvar;
-}
-
-void freeGCSharedHash(struct GCSharedHash *thisvar) {
-  struct GCSharedNode *ptr=thisvar->listhead;
-  FREE_NGC(thisvar->bucket);
-  while(ptr) {
-    struct GCSharedNode *next=ptr->lnext;
-    FREE_NGC(ptr);
-    ptr=next;
-  }
-  FREE_NGC(thisvar);
-}
-
-bool GCSharedHashrehash(struct GCSharedHash * thisvar) {
-  int newsize=thisvar->size;
-  struct GCSharedNode ** newbucket = (struct GCSharedNode **)
-	FREEMALLOC_NGC(sizeof(struct GCSharedNode *)*newsize);
-  if(newbucket == NULL) {
-	return false;
-  }
-  int i;
-  for(i=thisvar->size-1; i>=0; i--) {
-    struct GCSharedNode *ptr;
-    for(ptr=thisvar->bucket[i]; ptr!=NULL;) {
-      struct GCSharedNode * nextptr=ptr->next;
-      unsigned int newhashkey=(unsigned int)ptr->key % newsize;
-      ptr->next=newbucket[newhashkey];
-      newbucket[newhashkey]=ptr;
-      ptr=nextptr;
-    }
-  }
-  thisvar->size=newsize;
-  FREE_NGC(thisvar->bucket);
-  thisvar->bucket=newbucket;
-  return true;
-}
-
-int GCSharedHashadd(struct GCSharedHash * thisvar,int key, int data) {
-  /* Rehash code */
-  unsigned int hashkey;
-  struct GCSharedNode **ptr;
-
-  if (thisvar->numelements>=thisvar->size) {
-    int newsize=2*thisvar->size+1;
-    struct GCSharedNode ** newbucket = 
-	  (struct GCSharedNode **)FREEMALLOC_NGC(
-		  sizeof(struct GCSharedNode *)*newsize);
-	if(newbucket == NULL) {
-	  return -1;
-	}
-    int i;
-    for(i=thisvar->size-1; i>=0; i--) {
-      struct GCSharedNode *ptr;
-      for(ptr=thisvar->bucket[i]; ptr!=NULL;) {
-	struct GCSharedNode * nextptr=ptr->next;
-	unsigned int newhashkey=(unsigned int)ptr->key % newsize;
-	ptr->next=newbucket[newhashkey];
-	newbucket[newhashkey]=ptr;
-	ptr=nextptr;
-      }
-    }
-    thisvar->size=newsize;
-    FREE_NGC(thisvar->bucket);
-    thisvar->bucket=newbucket;
-  }
-
-  hashkey = (unsigned int)key % thisvar->size;
-  ptr = &thisvar->bucket[hashkey];
-
-  /* check that thisvar key/object pair isn't already here */
-  /* TBD can be optimized for set v. relation */
-
-  while (*ptr) {
-    if ((*ptr)->key == key && (*ptr)->data == data) {
-      return 0;
-    }
-    ptr = &((*ptr)->next);
-  }
-
-  {
-    struct GCSharedNode *node=FREEMALLOC_NGC(sizeof(struct GCSharedNode));
-	if(node == NULL) {
-	  return -1;
-	}
-    node->data=data;
-    node->key=key;
-    node->next=(*ptr);
-    *ptr=node;
-    if (thisvar->listhead==NULL) {
-      thisvar->listhead=node;
-      thisvar->listtail=node;
-      node->lnext=NULL;
-      node->lprev=NULL;
-    } else {
-      node->lprev=NULL;
-      node->lnext=thisvar->listhead;
-      thisvar->listhead->lprev=node;
-      thisvar->listhead=node;
-    }
-  }
-
-  thisvar->numelements++;
-  return 1;
-}
-
-#ifdef MULTICORE 
-struct GCSharedHash * allocateGCSharedHash_I(int size) {
-  struct GCSharedHash *thisvar;
-  if (size <= 0) {
-#ifdef MULTICORE
-    BAMBOO_EXIT(0xf202);
-#else
-    printf("Negative Hashtable size Exception\n");
-    exit(-1);
-#endif
-  }
-  thisvar=(struct GCSharedHash *)FREEMALLOC_NGC_I(sizeof(struct GCSharedHash));
-  if(thisvar == NULL) {
-	return NULL;
-  }
-  thisvar->size = size;
-  thisvar->bucket = 
-	(struct GCSharedNode **)FREEMALLOC_NGC_I(
-		sizeof(struct GCSharedNode *)*size);
-  if(thisvar->bucket == NULL) {
-	FREE_NGC_I(thisvar);
-	return NULL;
-  }
-  /* Set allocation blocks*/
-  thisvar->listhead=NULL;
-  thisvar->listtail=NULL;
-  /*Set data counts*/
-  thisvar->numelements = 0;
-  return thisvar;
-}
-
-int GCSharedHashadd_I(struct GCSharedHash * thisvar,int key, int data) {
-  /* Rehash code */
-  unsigned int hashkey;
-  struct GCSharedNode **ptr;
-
-  if (thisvar->numelements>=thisvar->size) {
-    int newsize=2*thisvar->size+1;
-    struct GCSharedNode ** newbucket = 
-	  (struct GCSharedNode **)FREEMALLOC_NGC_I(
-		  sizeof(struct GCSharedNode *)*newsize);
-	if(newbucket == NULL) {
-	  return -1;
-	}
-    int i;
-    for(i=thisvar->size-1; i>=0; i--) {
-      struct GCSharedNode *ptr;
-      for(ptr=thisvar->bucket[i]; ptr!=NULL;) {
-	struct GCSharedNode * nextptr=ptr->next;
-	unsigned int newhashkey=(unsigned int)ptr->key % newsize;
-	ptr->next=newbucket[newhashkey];
-	newbucket[newhashkey]=ptr;
-	ptr=nextptr;
-      }
-    }
-    thisvar->size=newsize;
-    FREE_NGC_I(thisvar->bucket);
-    thisvar->bucket=newbucket;
-  }
-
-  hashkey = (unsigned int)key % thisvar->size;
-  ptr = &thisvar->bucket[hashkey];
-
-  /* check that thisvar key/object pair isn't already here */
-  /* TBD can be optimized for set v. relation */
-
-  while (*ptr) {
-    if ((*ptr)->key == key && (*ptr)->data == data) {
-      return 0;
-    }
-    ptr = &((*ptr)->next);
-  }
-
-  {
-    struct GCSharedNode *node=FREEMALLOC_NGC_I(sizeof(struct GCSharedNode));
-	if(node == NULL) {
-	  return -1;
-	}
-    node->data=data;
-    node->key=key;
-    node->next=(*ptr);
-    *ptr=node;
-    if (thisvar->listhead==NULL) {
-      thisvar->listhead=node;
-      thisvar->listtail=node;
-      node->lnext=NULL;
-      node->lprev=NULL;
-    } else {
-      node->lprev=NULL;
-      node->lnext=thisvar->listhead;
-      thisvar->listhead->lprev=node;
-      thisvar->listhead=node;
-    }
-  }
-
-  thisvar->numelements++;
-  return 1;
-}
-#endif
-
-int GCSharedHashget(struct GCSharedHash *thisvar, int key, int *data) {
-  unsigned int hashkey = (unsigned int)key % thisvar->size;
-
-  struct GCSharedNode *ptr = thisvar->bucket[hashkey];
-  while (ptr) {
-    if (ptr->key == key) {
-      *data = ptr->data;
-      return 1;       /* success */
-    }
-    ptr = ptr->next;
-  }
-
-  return 0;   /* failure */
-}
-
-/* MGCSHAREDHASH ********************************************************/
-
-mgcsharedhashtbl_t * mgcsharedhashCreate(unsigned int size, 
-                                         double loadfactor) {
-  mgcsharedhashtbl_t * ctable;
-  mgcsharedhashlistnode_t * nodes;
-  int i;
-
-  ctable = (mgcsharedhashtbl_t *)FREEMALLOC_NGC(sizeof(mgcsharedhashtbl_t));
-  if(ctable == NULL) {
-	// TODO
-	BAMBOO_EXIT(0xf203);
-	return NULL;
-  }
-  // Allocate space for the hash table
-  ctable->table = (mgcsharedhashlistnode_t *)FREEMALLOC_NGC(
-	  size*sizeof(mgcsharedhashlistnode_t));
-  if(ctable->table == NULL) {
-	BAMBOO_EXIT(0xf204); // TODO
-	return NULL;
-  }
-  ctable->size = size;
-  ctable->loadfactor = loadfactor;
-  ctable->threshold = size*loadfactor;
-
-  ctable->mask = (size << (GC_SHIFT_BITS))-1;
-
-  ctable->structs = NULL ; //FREEMALLOC_NGC(1*sizeof(mgcliststruct_t));
-  ctable->numelements = 0; // Initial number of elements in the hash
-  ctable->list = NULL;
-
-  return ctable;
-}
-
-mgcsharedhashtbl_t * mgcsharedhashCreate_I(unsigned int size, 
-                                           double loadfactor) {
-  mgcsharedhashtbl_t * ctable;
-  mgcsharedhashlistnode_t * nodes;
-  int i;
-
-  ctable = (mgcsharedhashtbl_t *)FREEMALLOC_NGC_I(sizeof(mgcsharedhashtbl_t));
-  if(ctable == NULL) {
-	// TODO
-	BAMBOO_EXIT(0xf205);
-	return NULL;
-  }
-  // Allocate space for the hash table
-  ctable->table = (mgcsharedhashlistnode_t *)FREEMALLOC_NGC_I(
-	  size*sizeof(mgcsharedhashlistnode_t));
-  if(ctable->table == NULL) {
-	BAMBOO_EXIT(0xf206); // TODO
-	return NULL;
-  }
-  ctable->size = size;
-  ctable->loadfactor = loadfactor;
-  ctable->threshold = size*loadfactor;
-
-  ctable->mask = (size << (GC_SHIFT_BITS))-1;
-
-  ctable->structs = NULL ; //FREEMALLOC_NGC(1*sizeof(mgcliststruct_t));
-  ctable->numelements = 0; // Initial number of elements in the hash
-  ctable->list = NULL;
-
-  return ctable;
-}
-
-void mgcsharedhashReset(mgcsharedhashtbl_t * tbl) {
-  mgcsharedhashlistnode_t * ptr = tbl->table;
-
-  if ((tbl->numelements) < (tbl->size>>6)) {
-	mgcsharedhashlistnode_t *top = &ptr[tbl->size];
-	mgcsharedhashlistnode_t * list = tbl->list;
-	while(list != NULL) {  
-      mgcsharedhashlistnode_t * next = list->next;
-      if ((list >= ptr) && (list < top)) {
-		//zero in list
-        list->key=NULL;
-        list->next=NULL;
-      }
-      list = next;
-	}
-  } else {
-	BAMBOO_MEMSET_WH(tbl->table, '\0', 
-		sizeof(mgcsharedhashlistnode_t)*tbl->size);
-  }
-
-  mgcsharedliststruct_t * structs = tbl->structs;
-  while(structs != NULL) {
-    mgcsharedliststruct_t * next = structs->next;
-	BAMBOO_MEMSET_WH(structs->array, '\0', 
-		structs->num * sizeof(mgcsharedhashlistnode_t));
-	structs->num = 0;
-    structs = next;
-  }
-  tbl->numelements = 0;
-}
-
-//Store objects and their pointers into hash
-//Using open addressing
-int mgcsharedhashInsert(mgcsharedhashtbl_t * tbl, void * key, void * val) {
-  mgcsharedhashlistnode_t * ptr;
-
-  if(tbl->numelements > (tbl->threshold)) {
-    //Never resize, simply don't insert any more
-    return -1;
-  }
-
-  //int keyto = ((unsigned INTPTR)key) % (tbl->size);
-  //ptr=&tbl->table[keyto];
-  ptr=&tbl->table[(((unsigned INTPTR)key)&tbl->mask)>>(GC_SHIFT_BITS)];
-
-  if(ptr->key==0) {
-    // the first time insert a value for the key
-    ptr->key=key;
-    ptr->val=val;
-  } else { // Insert to the next empty place
-	mgcsharedhashlistnode_t *top = &tbl->table[tbl->size];
-    do {
-	  ptr++;
-	} while((ptr < top) && (ptr->key != NULL));
-	if(ptr >= top) {
-	  return -1;
-	} else {
-	  ptr->key = key;
-	  ptr->val = val;
-	}
-  }
-  ptr->next = tbl->list;
-  tbl->list = ptr;
-  tbl->numelements++;
-  return 1;
-}
-
-int mgcsharedhashInsert_I(mgcsharedhashtbl_t * tbl, void * key, void * val) {
-  mgcsharedhashlistnode_t * ptr;
-
-  if(tbl->numelements > (tbl->threshold)) {
-    //Never resize, simply don't insert any more
-    return -1;
-  }
-
-  //int keyto = ((unsigned INTPTR)key) % (tbl->size);
-  //ptr=&tbl->table[keyto];
-  ptr=&tbl->table[(((unsigned INTPTR)key)&tbl->mask)>>(GC_SHIFT_BITS)];
-
-  if(ptr->key==0) {
-    // the first time insert a value for the key
-    ptr->key=key;
-    ptr->val=val;
-  } else { // Insert to the next empty place
-	mgcsharedhashlistnode_t * top = &tbl->table[tbl->size];
-	mgcsharedhashlistnode_t * start = ptr;
-    do {
-	  ptr++;
-	  if(ptr->key == 0) {
-		break;
-	  }
-	} while(ptr < top);
-	if(ptr >= top) {
-	  return -1;
-	} else {
-	  ptr->key = key;
-	  ptr->val = val;
-	}
-  }
-  ptr->next = tbl->list;
-  tbl->list = ptr;
-  tbl->numelements++;
-  return 1;
-}
-
-// Search for an address for a given oid
-INLINE void * mgcsharedhashSearch(mgcsharedhashtbl_t * tbl, void * key) {
-  //REMOVE HASH FUNCTION CALL TO MAKE SURE IT IS INLINED HERE]
-  //int keyto = ((unsigned INTPTR)key) % (tbl->size);
-  //mgcsharedhashlistnode_t * node=&tbl->table[keyto];
-  mgcsharedhashlistnode_t * node = 
-	&tbl->table[(((unsigned INTPTR)key)&tbl->mask)>>(GC_SHIFT_BITS)];
-  mgcsharedhashlistnode_t *top = &tbl->table[tbl->size];
-
-  do {
-	//i++;
-    if(node->key == key) {
-      return node->val;
-    }
-    node++;
-  } while(node < top);
-
-  return NULL;
-}
-
-#endif
diff --git a/Robust/src/Runtime/GCSharedHash.h b/Robust/src/Runtime/GCSharedHash.h
deleted file mode 100755
index 94725fbf..00000000
--- a/Robust/src/Runtime/GCSharedHash.h
+++ /dev/null
@@ -1,94 +0,0 @@
-#ifdef MULTICORE_GC
-
-#ifndef GCSHAREDHASH_H
-#define GCSHAREDHASH_H
-
-#ifndef bool
-#define bool int
-#endif
-
-#ifndef true
-#define true 1
-#endif
-
-#ifndef false
-#define false 0
-#endif
-
-#include "mem.h"
-
-/* GCSharedHash *********************************************************/
-
-struct GCSharedHash * noargallocateGCSharedHash();
-struct GCSharedHash * allocateGCSharedHash(int size);
-void freeGCSharedHash(struct GCSharedHash *);
-
-bool GCSharedHashrehash(struct GCSharedHash * thisvar);
-int GCSharedHashadd(struct GCSharedHash *, int key, int data);
-#ifdef MULTICORE
-struct GCSharedHash * allocateGCSharedHash_I(int size);
-int GCSharedHashadd_I(struct GCSharedHash *, int key, int data);
-#endif
-int GCSharedHashget(struct GCSharedHash *,int key, int* data);
-
-struct GCSharedHash {
-  int numelements;
-  int size;
-  struct GCSharedNode **bucket;
-  struct GCSharedNode *listhead;
-  struct GCSharedNode *listtail;
-};
-
-inline int GCSharedHashcountset(struct GCSharedHash * thisvar);
-
-/* RuntimeHashException  *************************************************/
-
-
-/* RuntimeIterator *****************************************************/
-struct GCSharedNode {
-  struct GCSharedNode *next;
-  struct GCSharedNode *lnext;
-  struct GCSharedNode *lprev;
-  int data;
-  int key;
-};
-
-/* MGCSharedHash *********************************************************/
-typedef struct mgcsharedhashlistnode {
-  void * key;
-  void * val; //this can be cast to another type or used to point to a
-              //larger structure
-  struct mgcsharedhashlistnode * next;
-} mgcsharedhashlistnode_t;
-
-#define NUMMGCSHAREDLIST 250
-typedef struct mgcsharedlist {
-  struct mgcsharedhashlistnode array[NUMMGCSHAREDLIST];
-  int num;
-  struct mgcsharedlist *next;
-} mgcsharedliststruct_t;
-
-typedef struct mgcsharedhashtbl {
-  mgcsharedhashlistnode_t * table;       // points to beginning of hash table
-  mgcsharedhashlistnode_t * list;
-  mgcsharedliststruct_t * structs;
-  unsigned int size;
-  unsigned int mask;
-  unsigned int numelements;
-  unsigned int threshold;
-  double loadfactor;
-} mgcsharedhashtbl_t;
-
-mgcsharedhashtbl_t * mgcsharedhashCreate(unsigned int size, double loadfactor);
-mgcsharedhashtbl_t * mgcsharedhashCreate_I(unsigned int size,double loadfactor);
-int mgcsharedhashInsert(mgcsharedhashtbl_t * tbl, void * key, void * val);
-void * mgcsharedhashSearch(mgcsharedhashtbl_t * tbl, void * key);
-//unsigned int mgchashResize(unsigned int newsize);
-int mgcsharedhashInsert_I(mgcsharedhashtbl_t * tbl, void * key, void * val);
-//unsigned int mgchashResize_I(unsigned int newsize);
-//void mgcsharedhashDelete(mgcsharedhashtbl_t * tbl);
-void mgcsharedhashReset(mgcsharedhashtbl_t * tbl);
-
-#endif
-
-#endif
diff --git a/Robust/src/Runtime/MGCHash.c b/Robust/src/Runtime/MGCHash.c
deleted file mode 100755
index dabe7e29..00000000
--- a/Robust/src/Runtime/MGCHash.c
+++ /dev/null
@@ -1,533 +0,0 @@
-#include "MGCHash.h"
-#ifdef MULTICORE
-#include "runtime_arch.h"
-#else
-#include <stdio.h>
-#endif
-#ifdef DMALLOC
-#include "dmalloc.h"
-#endif
-
-#ifndef INTPTR
-#ifdef BIT64
-#define INTPTR long
-#define INTPTRSHIFT 3
-#else
-#define INTPTR int
-#define INTPTRSHIFT 2
-#endif
-#endif
-
-#define GC_SHIFT_BITS 4
-
-/* mgchash ********************************************************/
-mgchashtable_t * mgchashCreate(unsigned int size, double loadfactor) {
-  mgchashtable_t *ctable;
-  mgchashlistnode_t *nodes;
-  int i;
-
-  if (size <= 0) {
-#ifdef MULTICORE
-    BAMBOO_EXIT(0xf101);
-#else
-    printf("Negative Hashtable size Exception\n");
-    exit(-1);
-#endif
-  }
-
-  // Allocate space for the hash table
-  ctable = (mgchashtable_t *)RUNMALLOC(sizeof(mgchashtable_t));
-  if(ctable == NULL) {
-	// Run out of local memory
-	BAMBOO_EXIT(0xf102);
-  }
-  ctable->table = (mgchashlistnode_t*)RUNMALLOC(size*sizeof(mgchashlistnode_t));
-  if(ctable->table == NULL) {
-	// Run out of local memory
-	BAMBOO_EXIT(0xf103);
-  }
-  ctable->loadfactor = loadfactor;
-  ctable->size = size;
-  ctable->threshold=size*loadfactor;
-
-  ctable->mask = (size << (GC_SHIFT_BITS))-1;
-  //ctable->list = NULL;
-  ctable->structs = (mgcliststruct_t*)RUNMALLOC(1*sizeof(mgcliststruct_t));
-  ctable->numelements = 0; // Initial number of elements in the hash
-
-  return ctable;
-}
-
-void mgchashreset(mgchashtable_t * tbl) {
-  mgchashlistnode_t *ptr = tbl->table;
-  int i;
-
-  /*if (tbl->numelements<(tbl->size>>6)) {
-	mgchashlistnode_t *top=&ptr[tbl->size];
-	mgchashlistnode_t * list = tbl->list;
-	while(list != NULL) {
-      mgchashlistnode_t * next = list->lnext;
-      if ((list >= ptr) && (list < top)) {
-		//zero in list
-        list->key=NULL;
-        list->next=NULL;
-      }
-      list = next;
-	}
-  } else {*/
-	BAMBOO_MEMSET_WH(tbl->table, '\0', sizeof(mgchashlistnode_t)*tbl->size);
-  //}
-  // TODO now never release any allocated memory, may need to be changed
-  //mgcliststruct_t * next = tbl->structs;
-  while(tbl->structs->next!=NULL) {
-    mgcliststruct_t * next = tbl->structs->next;
-    RUNFREE(tbl->structs);
-    tbl->structs=next;
-	/*next->num = 0;
-	next = next->next;*/
-  }
-  tbl->structs->num = 0;
-  tbl->numelements = 0;
-}
-
-//Store objects and their pointers into hash
-void mgchashInsert(mgchashtable_t * tbl, void * key, void *val) {
-  mgchashlistnode_t *ptr;
-
-  if(tbl->numelements > (tbl->threshold)) {
-    //Resize
-    unsigned int newsize = tbl->size << 1 + 1;
-    mgchashResize(tbl, newsize);
-  }
-
-  ptr=&tbl->table[(((unsigned INTPTR)key)&tbl->mask)>>(GC_SHIFT_BITS)]; 
-  tbl->numelements++;
-
-  if(ptr->key==0) {
-    // the first time insert a value for the key
-    ptr->key=key;
-    ptr->val=val;
-	/*ptr->lnext = tbl->list;
-	tbl->list = ptr;*/
-  } else { // Insert in the beginning of linked list
-    mgchashlistnode_t * node;
-    if (tbl->structs->num<NUMMGCLIST) {
-      node=&tbl->structs->array[tbl->structs->num];
-      tbl->structs->num++;
-    } else {
-      //get new list
-      mgcliststruct_t *tcl=RUNMALLOC(1*sizeof(mgcliststruct_t));
-      tcl->next=tbl->structs;
-      tbl->structs=tcl;
-      node=&tcl->array[0];
-      tcl->num=1;
-    }
-    node->key = key;
-    node->val = val;
-    node->next = ptr->next;
-    ptr->next = node;
-	/*node->lnext = tbl->list;
-	tbl->list = node;*/
-  }
-}
-
-#ifdef MULTICORE_GC
-mgchashtable_t * mgchashCreate_I(unsigned int size, double loadfactor) {
-  mgchashtable_t *ctable;
-  mgchashlistnode_t *nodes;
-  int i;
-
-  if (size <= 0) {
-#ifdef MULTICORE
-    BAMBOO_EXIT(0xf101);
-#else
-    printf("Negative Hashtable size Exception\n");
-    exit(-1);
-#endif
-  }
-
-  // Allocate space for the hash table
-  ctable = (mgchashtable_t*)RUNMALLOC_I(sizeof(mgchashtable_t));
-  if(ctable == NULL) {
-	// Run out of local memory
-	BAMBOO_EXIT(0xf102);
-  }
-  ctable->table=(mgchashlistnode_t*)RUNMALLOC_I(size*sizeof(mgchashlistnode_t));
-  if(ctable->table == NULL) {
-	// Run out of local memory
-	BAMBOO_EXIT(0xf103);
-  }
-  ctable->loadfactor = loadfactor;
-  ctable->size = size;
-  ctable->threshold=size*loadfactor;
-
-  ctable->mask = (size << (GC_SHIFT_BITS))-1;
-  //ctable->list = NULL;
-  ctable->structs = (mgcliststruct_t*)RUNMALLOC_I(1*sizeof(mgcliststruct_t));
-  ctable->numelements = 0; // Initial number of elements in the hash
-
-  return ctable;
-}
-
-void mgchashInsert_I(mgchashtable_t * tbl, void * key, void *val) {
-  mgchashlistnode_t *ptr;
-
-  if(tbl->numelements > (tbl->threshold)) {
-    //Resize
-    unsigned int newsize = tbl->size << 1 + 1;
-    mgchashResize_I(tbl, newsize);
-  }
-
-  ptr = &tbl->table[(((unsigned INTPTR)key)&tbl->mask)>>(GC_SHIFT_BITS)];
-  tbl->numelements++;
-
-  if(ptr->key==0) {
-    ptr->key=key;
-    ptr->val=val;
-	/*ptr->lnext = tbl->list;
-	tbl->list = ptr;*/
-    return;
-  } else { // Insert in the beginning of linked list
-    mgchashlistnode_t * node;
-    if (tbl->structs->num<NUMMGCLIST) {
-      node=&tbl->structs->array[tbl->structs->num];
-      tbl->structs->num++;
-    } else {
-      //get new list
-      mgcliststruct_t *tcl=RUNMALLOC_I(1*sizeof(mgcliststruct_t));
-      tcl->next=tbl->structs;
-      tbl->structs=tcl;
-      node=&tcl->array[0];
-      tcl->num=1;
-    }
-    node->key = key;
-    node->val = val;
-    node->next = ptr->next;
-    ptr->next = node;
-	/*node->lnext = tbl->list;
-	tbl->list = node;*/
-  }
-}
-#endif
-
-// Search for an address for a given oid
-INLINE void * mgchashSearch(mgchashtable_t * tbl, void * key) {
-  //REMOVE HASH FUNCTION CALL TO MAKE SURE IT IS INLINED HERE]
-  mgchashlistnode_t *node = 
-	&tbl->table[(((unsigned INTPTR)key)&tbl->mask)>>(GC_SHIFT_BITS)];
-
-  do {
-    if(node->key == key) {
-      return node->val;
-    }
-    node = node->next;
-  } while(node != NULL);
-
-  return NULL;
-}
-
-unsigned int mgchashResize(mgchashtable_t * tbl, unsigned int newsize) {
-  mgchashlistnode_t *node, *ptr, *curr;  // curr and next keep track of the 
-                                         // current and the next 
-										 // mgchashlistnodes in a linked list
-  unsigned int oldsize;
-  int isfirst;    // Keeps track of the first element in the 
-                  // chashlistnode_t for each bin in hashtable
-  unsigned int i,index;
-  unsigned int mask;
-
-  ptr = tbl->table;
-  oldsize = tbl->size;
-
-  if((node = RUNMALLOC(newsize*sizeof(mgchashlistnode_t))) == NULL) {
-    printf("Calloc error %s %d\n", __FILE__, __LINE__);
-    return 1;
-  }
-
-  tbl->table = node; //Update the global hashtable upon resize()
-  tbl->size = newsize;
-  tbl->threshold = newsize * tbl->loadfactor;
-  mask = tbl->mask = (newsize << (GC_SHIFT_BITS)) - 1;
-  //tbl->list = NULL;
-
-  for(i = 0; i < oldsize; i++) {   //Outer loop for each bin in hash table
-    curr = &ptr[i];
-    isfirst = 1;
-    do {  //Inner loop to go through linked lists
-      void * key;
-      mgchashlistnode_t *tmp,*next;
-
-      if ((key=curr->key) == 0) { 
-		//Exit inner loop if there the first element is 0
-		break;
-		//key = val =0 for element if not present within the hash table
-	  }
-      index = (((unsigned INTPTR)key) & mask) >> (GC_SHIFT_BITS);
-      tmp=&node[index];
-      next = curr->next;
-      // Insert into the new table
-      if(tmp->key == 0) {
-		tmp->key = key;
-		tmp->val = curr->val;
-		/*tmp->lnext = tbl->list;
-		tbl->list = tmp;*/
-      } /*
-	   NOTE:  Add this case if you change this...
-	   This case currently never happens because of the way things rehash....*/
-	   else if (isfirst) {
-		 mgchashlistnode_t *newnode= RUNMALLOC(1*sizeof(mgchashlistnode_t));
-		 newnode->key = curr->key;
-		 newnode->val = curr->val;
-		 newnode->next = tmp->next;
-		 tmp->next=newnode;
-		 /*newnode->lnext = tbl->list;
-		 tbl->list = newnode;*/
-	   } 
-      else {
-		curr->next=tmp->next;
-		tmp->next=curr;
-		/*curr->lnext = tbl->list;
-		tbl->list = curr;*/
-      }
-
-      isfirst = 0;
-      curr = next;
-    } while(curr!=NULL);
-  }
-
-  RUNFREE(ptr);            //Free the memory of the old hash table
-  return 0;
-}
-
-#ifdef MULTICORE_GC
-unsigned int mgchashResize_I(mgchashtable_t * tbl, unsigned int newsize) {
-  mgchashlistnode_t *node, *ptr, *curr; // curr and next keep track of the 
-                                        // current and the next 
-										// mgchashlistnodes in a linked list
-  unsigned int oldsize;
-  int isfirst; // Keeps track of the first element in the chashlistnode_t 
-               // for each bin in hashtable
-  unsigned int i,index;
-  unsigned int mask;
-
-  ptr = tbl->table;
-  oldsize = tbl->size;
-
-  if((node = RUNMALLOC_I(newsize*sizeof(mgchashlistnode_t))) == NULL) {
-    BAMBOO_EXIT(0xf104);
-    printf("Calloc error %s %d\n", __FILE__, __LINE__);
-    return 1;
-  }
-
-  tbl->table = node;  //Update the global hashtable upon resize()
-  tbl->size = newsize;
-  tbl->threshold = newsize * tbl->loadfactor;
-  mask = tbl->mask = (newsize << (GC_SHIFT_BITS))-1;
-  //tbl->list = NULL;
-
-  for(i = 0; i < oldsize; i++) {  //Outer loop for each bin in hash table
-    curr = &ptr[i];
-    isfirst = 1;
-    do { //Inner loop to go through linked lists
-      void * key;
-      mgchashlistnode_t *tmp,*next;
-
-      if ((key=curr->key) == 0) {
-		//Exit inner loop if there the first element is 0
-		break;
-		//key = val =0 for element if not present within the hash table
-      }
-      index = (((unsigned INTPTR)key) & mask) >> (GC_SHIFT_BITS);
-      tmp=&node[index];
-      next = curr->next;
-      // Insert into the new table
-      if(tmp->key == 0) {
-		tmp->key = key;
-		tmp->val = curr->val;
-		/*tmp->lnext = tbl->list;
-		tbl->list = tmp;*/
-      } /*
-	   NOTE:  Add this case if you change this...
-	   This case currently never happens because of the way things rehash....*/
-      else if (isfirst) {
-		mgchashlistnode_t *newnode=RUNMALLOC_I(1*sizeof(mgchashlistnode_t)); 
-		newnode->key = curr->key;
-		newnode->val = curr->val;
-		newnode->next = tmp->next;
-		tmp->next=newnode;
-		/*newnode->lnext = tbl->list;
-		tbl->list = newnode;*/
-      } else {
-		curr->next=tmp->next;
-		tmp->next=curr;
-		/*curr->lnext = tbl->list;
-		tbl->list = curr;*/
-      }
-
-      isfirst = 0;
-      curr = next;
-    } while(curr!=NULL);
-  }
-  RUNFREE(ptr); //Free the memory of the old hash table
-  return 0;
-}
-#endif
-
-//Delete the entire hash table
-void mgchashDelete(mgchashtable_t * tbl) {
-  int i;
-  mgcliststruct_t *ptr=tbl->structs;
-  while(ptr!=NULL) {
-    mgcliststruct_t *next=ptr->next;
-    RUNFREE(ptr);
-    ptr=next;
-  }
-  RUNFREE(tbl->table);
-  tbl->table=NULL;
-  tbl->structs=NULL;
-}
-
-/* MGCHASH ********************************************************/
-
-struct MGCHash * allocateMGCHash(int size,
-                                 int conflicts) {
-  struct MGCHash *thisvar;
-  if (size <= 0) {
-#ifdef MULTICORE
-    BAMBOO_EXIT(0xf105);
-#else
-    printf("Negative Hashtable size Exception\n");
-    exit(-1);
-#endif
-  }
-  thisvar=(struct MGCHash *)RUNMALLOC(sizeof(struct MGCHash));
-  thisvar->size = size;
-  thisvar->bucket =
-    (struct MGCNode *) RUNMALLOC(sizeof(struct MGCNode)*size);
-  //Set data counts
-  thisvar->num4conflicts = conflicts;
-  return thisvar;
-}
-
-void freeMGCHash(struct MGCHash *thisvar) {
-  int i = 0;
-  for(i=thisvar->size-1; i>=0; i--) {
-    struct MGCNode *ptr;
-    for(ptr=thisvar->bucket[i].next; ptr!=NULL; ) {
-      struct MGCNode * nextptr=ptr->next;
-      RUNFREE(ptr);
-      ptr=nextptr;
-    }
-  }
-  RUNFREE(thisvar->bucket);
-  RUNFREE(thisvar);
-}
-
-int MGCHashadd(struct MGCHash * thisvar, int data) {
-  // Rehash code
-  unsigned int hashkey;
-  struct MGCNode *ptr;
-
-  int mask = (thisvar->size << (GC_SHIFT_BITS))-1;
-  hashkey = (((unsigned INTPTR)data)&mask)>>(GC_SHIFT_BITS); 
-  //hashkey = (unsigned int)data % thisvar->size;
-  ptr = &thisvar->bucket[hashkey];
-
-  struct MGCNode * prev = NULL;
-  if(ptr->data < thisvar->num4conflicts) {
-    struct MGCNode *node=RUNMALLOC(sizeof(struct MGCNode));
-    node->data=data;
-    node->next=(ptr->next);
-    ptr->next=node;
-    ptr->data++;
-  } else {
-    while (ptr->next!=NULL) {
-      prev = ptr;
-      ptr = ptr->next;
-    }
-    ptr->data = data;
-    ptr->next = thisvar->bucket[hashkey].next;
-    thisvar->bucket[hashkey].next = ptr;
-    prev->next = NULL;
-  }
-
-  return 1;
-}
-
-#ifdef MULTICORE
-struct MGCHash * allocateMGCHash_I(int size,
-                                   int conflicts) {
-  struct MGCHash *thisvar;
-  if (size <= 0) {
-#ifdef MULTICORE
-    BAMBOO_EXIT(0xf106);
-#else
-    printf("Negative Hashtable size Exception\n");
-    exit(-1);
-#endif
-  }
-  thisvar=(struct MGCHash *)RUNMALLOC_I(sizeof(struct MGCHash));
-  thisvar->size = size;
-  thisvar->bucket =
-    (struct MGCNode *) RUNMALLOC_I(sizeof(struct MGCNode)*size);
-  //Set data counts
-  thisvar->num4conflicts = conflicts;
-  return thisvar;
-}
-
-int MGCHashadd_I(struct MGCHash * thisvar, int data) {
-  // Rehash code
-  unsigned int hashkey;
-  struct MGCNode *ptr;
-
-  int mask = (thisvar->size << (GC_SHIFT_BITS))-1;
-  hashkey = (((unsigned INTPTR)data)&mask)>>(GC_SHIFT_BITS);
-  //hashkey = (unsigned int)data % thisvar->size;
-  ptr = &thisvar->bucket[hashkey];
-
-  struct MGCNode * prev = NULL;
-  if(ptr->data < thisvar->num4conflicts) {
-    struct MGCNode *node=RUNMALLOC_I(sizeof(struct MGCNode));
-    node->data=data;
-    node->next=(ptr->next);
-    ptr->next=node;
-    ptr->data++;
-  } else {
-    while (ptr->next!=NULL) {
-      prev = ptr;
-      ptr = ptr->next;
-    }
-    ptr->data = data;
-    ptr->next = thisvar->bucket[hashkey].next;
-    thisvar->bucket[hashkey].next = ptr;
-    prev->next = NULL;
-  }
-
-  return 1;
-}
-#endif
-
-int MGCHashcontains(struct MGCHash *thisvar, int data) {
-  int mask = (thisvar->size << (GC_SHIFT_BITS))-1;
-  unsigned int hashkey = (((unsigned INTPTR)data)&mask)>>(GC_SHIFT_BITS);
-  //unsigned int hashkey = (unsigned int)data % thisvar->size;
-
-  struct MGCNode *ptr = thisvar->bucket[hashkey].next;
-  struct MGCNode *prev = NULL;
-  while (ptr!=NULL) {
-    if (ptr->data == data) {
-      if(prev != NULL) {
-	prev->next = NULL;
-	ptr->next = thisvar->bucket[hashkey].next;
-	thisvar->bucket[hashkey].next = ptr;
-      }
-
-      return 1;       // success
-    }
-    prev = ptr;
-    ptr = ptr->next;
-  }
-
-  return 0;   // failure
-}
-
diff --git a/Robust/src/Runtime/MGCHash.h b/Robust/src/Runtime/MGCHash.h
deleted file mode 100755
index 03844ebc..00000000
--- a/Robust/src/Runtime/MGCHash.h
+++ /dev/null
@@ -1,87 +0,0 @@
-#ifndef MGCHASH_H
-#define MGCHASH_H
-
-#ifndef bool
-#define bool int
-#endif
-
-#ifndef true
-#define true 1
-#endif
-
-#ifndef false
-#define false 0
-#endif
-
-#ifndef INLINE
-#define INLINE    inline __attribute__((always_inline))
-#endif
-
-#include "mem.h"
-
-/* mgchash *********************************************************/
-typedef struct mgchashlistnode {
-  void * key;
-  void * val; //this can be cast to another type or used to point to a
-              //larger structure
-  struct mgchashlistnode *next;
-  //struct mgchashlistnode *lnext;
-} mgchashlistnode_t;
-
-#define NUMMGCLIST 250
-typedef struct mgclist {
-  struct mgchashlistnode array[NUMMGCLIST];
-  int num;
-  struct mgclist *next;
-} mgcliststruct_t;
-
-typedef struct mgchashtable {
-  mgchashlistnode_t * table;       // points to beginning of hash table
-  //mgchashlistnode_t * list;
-  mgcliststruct_t * structs;
-  unsigned int size;
-  unsigned int mask;
-  unsigned int numelements;
-  unsigned int threshold;
-  double loadfactor;
-} mgchashtable_t;
-
-mgchashtable_t * mgchashCreate(unsigned int size, double loadfactor);
-void mgchashInsert(mgchashtable_t * tbl, void * key, void *val);
-void * mgchashSearch(mgchashtable_t * tbl, void * key);
-unsigned int mgchashResize(mgchashtable_t * tbl, unsigned int newsize);
-#ifdef MULTICORE_GC
-mgchashtable_t * mgchashCreate_I(unsigned int size, double loadfactor);
-void mgchashInsert_I(mgchashtable_t * tbl, void * key, void *val);
-unsigned int mgchashResize_I(mgchashtable_t * tbl, unsigned int newsize);
-#endif
-void mgchashDelete(mgchashtable_t * tbl);
-void mgchashreset(mgchashtable_t * tbl);
-
-
-/** MGCHash *******************************************************************/
-struct MGCHash * allocateMGCHash(int size, int conflicts);
-void freeMGCHash(struct MGCHash *);
-
-//void MGCHashrehash(struct MGCHash * thisvar);
-int MGCHashadd(struct MGCHash *, int data);
-#ifdef MULTICORE
-struct MGCHash * allocateMGCHash_I(int size, int conflicts);
-int MGCHashadd_I(struct MGCHash *, int data);
-#endif
-int MGCHashcontains(struct MGCHash *,int data);
-
-struct MGCHash {
-  int num4conflicts;
-  int size;
-  struct MGCNode *bucket;
-};
-
-/* MGCHashException  *************************************************/
-
-struct MGCNode {
-  struct MGCNode * next;
-  int data;
-};
-
-#endif
diff --git a/Robust/src/Runtime/RAW/Makefile.raw.1 b/Robust/src/Runtime/RAW/Makefile.raw.1
deleted file mode 100644
index 7b98f54b..00000000
--- a/Robust/src/Runtime/RAW/Makefile.raw.1
+++ /dev/null
@@ -1,21 +0,0 @@
-TOPDIR=/home/jzhou/starsearch
-include $(TOPDIR)/Makefile.include
-
-RGCCFLAGS += -O2 
-RGCCFLAGS += ${RAWRGCCFLAGS}
-
-USE_SLGCC=1
-
-SIM-CYCLES = 10000
-
-ATTRIBUTES += HWIC
-
-TILE_PATTERN = 4x1
-
-OBJECT_FILES_COMMON = multicoretask.o multicoreruntime.o Queue.o file.o math.o object.o \
-					  GenericHashtable.o SimpleHash.o ObjectHash.o socket.o taskdefs.o \
-					  methods.o mem.o task_arch.o raw_dataCache.o raw_interrupt.o
-					  
-# this is for a multi-tile test
-include $(COMMONDIR)/Makefile.all
-
diff --git a/Robust/src/Runtime/RAW/Makefile.raw.2 b/Robust/src/Runtime/RAW/Makefile.raw.2
deleted file mode 100644
index e0554e13..00000000
--- a/Robust/src/Runtime/RAW/Makefile.raw.2
+++ /dev/null
@@ -1,21 +0,0 @@
-TOPDIR=/home/jzhou/starsearch
-include $(TOPDIR)/Makefile.include
-
-RGCCFLAGS += -O2 
-RGCCFLAGS += ${RAWRGCCFLAGS}
-
-USE_SLGCC=1
-
-SIM-CYCLES = 10000
-
-ATTRIBUTES += HWIC
-
-TILE_PATTERN = 4x2
-
-OBJECT_FILES_COMMON = multicoretask.o multicoreruntime.o Queue.o file.o math.o object.o \
-					  GenericHashtable.o SimpleHash.o ObjectHash.o socket.o taskdefs.o \
-					  methods.o mem.o task_arch.o raw_dataCache.o raw_interrupt2.o
-					  
-# this is for a multi-tile test
-include $(COMMONDIR)/Makefile.all
-
diff --git a/Robust/src/Runtime/RAW/Makefile.raw.4 b/Robust/src/Runtime/RAW/Makefile.raw.4
deleted file mode 100644
index 3b43ee90..00000000
--- a/Robust/src/Runtime/RAW/Makefile.raw.4
+++ /dev/null
@@ -1,21 +0,0 @@
-TOPDIR=/home/jzhou/starsearch
-include $(TOPDIR)/Makefile.include
-
-RGCCFLAGS += -O2 
-RGCCFLAGS += ${RAWRGCCFLAGS}
-
-USE_SLGCC=1
-
-SIM-CYCLES = 10000
-
-ATTRIBUTES += HWIC
-
-TILE_PATTERN = 4x4
-
-OBJECT_FILES_COMMON = multicoretask.o multicoreruntime.o Queue.o file.o math.o object.o \
-					  GenericHashtable.o SimpleHash.o ObjectHash.o socket.o taskdefs.o \
-					  methods.o mem.o task_arch.o raw_dataCache.o raw_interrupt4.o
-					  
-# this is for a multi-tile test
-include $(COMMONDIR)/Makefile.all
-
diff --git a/Robust/src/Runtime/RAW/Makefile.raw.io b/Robust/src/Runtime/RAW/Makefile.raw.io
deleted file mode 100644
index 5b87e3dc..00000000
--- a/Robust/src/Runtime/RAW/Makefile.raw.io
+++ /dev/null
@@ -1,58 +0,0 @@
-
-USEBOOTLOADER=no
-
-ifeq ($(USEBOOTLOADER),yes)
-ATTRIBUTES      += LARGE_STATIC_DATA
-endif
-
-# We need to define the host OS to get access
-# to the host specific OS defines!   - VS 
-DEFS	+= -D$(shell uname -s) -D__raw__
-
-TOPDIR=/home/jzhou/starsearch
-include $(TOPDIR)/Makefile.include
-
-RGCCFLAGS += -O2 
-RGCCFLAGS += ${RAWRGCCFLAGS} 
-
-USE_SLGCC=1
-
-SIM-CYCLES = 10000
-
-ATTRIBUTES += HWIC
-
-TILES = 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-
-#TILE_PATTERN = 4x1
-
-OBJECT_FILES_COMMON = multicoretask.o multicoreruntime.o Queue.o file.o math.o object.o \
-					  GenericHashtable.o SimpleHash.o ObjectHash.o socket.o taskdefs.o \
-					  methods.o mem.o task_arch.o raw_dataCache.o raw_interrupt.o
-
-OBJECT_FILES_00 = $(OBJECT_FILES_COMMON)
-OBJECT_FILES_01 = $(OBJECT_FILES_COMMON)
-OBJECT_FILES_02 = $(OBJECT_FILES_COMMON)
-OBJECT_FILES_03 = $(OBJECT_FILES_COMMON)
-OBJECT_FILES_04 = $(OBJECT_FILES_COMMON)
-OBJECT_FILES_05 = $(OBJECT_FILES_COMMON)
-OBJECT_FILES_06 = $(OBJECT_FILES_COMMON)
-OBJECT_FILES_07 = $(OBJECT_FILES_COMMON)
-OBJECT_FILES_08 = $(OBJECT_FILES_COMMON)
-OBJECT_FILES_09 = $(OBJECT_FILES_COMMON)
-OBJECT_FILES_10 = $(OBJECT_FILES_COMMON)
-OBJECT_FILES_11 = $(OBJECT_FILES_COMMON)
-OBJECT_FILES_12 = $(OBJECT_FILES_COMMON)
-OBJECT_FILES_13 = $(OBJECT_FILES_COMMON)
-OBJECT_FILES_14 = $(OBJECT_FILES_COMMON)
-OBJECT_FILES_15 = $(OBJECT_FILES_COMMON)
-
-# this is for a multi-tile test
-include $(COMMONDIR)/Makefile.all
-
-ifneq ($(USEBOOTLOADER),yes)
-# Load the host interface and host OS simulator into btl
-BTL-ARGS += -host # -imem_size 65536
-endif
-
-BTL-ARGS += -host_stop_time
-
diff --git a/Robust/src/Runtime/RAW/raw_dataCache.s b/Robust/src/Runtime/RAW/raw_dataCache.s
deleted file mode 100644
index 566bf221..00000000
--- a/Robust/src/Runtime/RAW/raw_dataCache.s
+++ /dev/null
@@ -1,40 +0,0 @@
-.text
-.global flushAddr
-.global invalidateAddr
-.global flushCacheline
-.global invalidateCacheline
-	
-flushAddr:
-# arguments come in on $4 and $5
-# $4 has the address
-# $5 has the length, eventually
-
-	afl $4, 0
-	jr    $31
-	
-
-invalidateAddr:
-# arguments come in on $4 and $5
-# $4 has the address
-# $5 has the length, eventually
-
-	ainv $4, 0
-	jr $31
-
-
-flushCacheline:
-# arguments come in on $4
-# $4 has the base tag address
-
-	tagfl $4
-	jr $31
-
-invalidateCacheline:
-# arguments come in on $4
-# $4 has the base tag address
-
-	tagsw $0, $4
-#	mtsri PASS, 0x1111
-#	mtsr PASS, $8
-#	ainv $8, 0
-	jr $31
diff --git a/Robust/src/Runtime/RAW/raw_interrupt.s b/Robust/src/Runtime/RAW/raw_interrupt.s
deleted file mode 100644
index cfad9b58..00000000
--- a/Robust/src/Runtime/RAW/raw_interrupt.s
+++ /dev/null
@@ -1,131 +0,0 @@
-#include <raw_asm.h>
-
-.text
-	.align	2
-	.globl	setup_ints
-	.ent	setup_ints
-setup_ints:	
-	# set up dynamic network
-	uintoff
-	intoff
-
-	# set gdn_cfg
-	xor $8,$8,$8
-	aui $8,$8,(3<<11)|(0 <<6)|(0 <<1)
-	ori $8, (0 <<12)|(2<<9)
-	mtsr	GDN_CFG,$8
-#	mtsr	PASS,$8
-
-	# set exception vector
-    la $3, interrupt_table
-#	mtsri PASS, 0xaaa
-#	mtsr PASS, $3
-    mtsr EX_BASE_ADDR, $3
-
-	# set EX_MASK
-	mfsr	$8,EX_MASK
-	ori	$8,$8,0x20          # 1 << kVEC_GDN_AVAIL
-	mtsr	EX_MASK,$8
-
-	inton
-	uinton
-	jr $31
-	.end	setup_ints
-
-.macro empty_vec fail_code
-        mtsri FAIL, \fail_code
-1:      b 1b
-        nop
-        nop
-.endm
-
-interrupt_table:
-
-vec_gdn_refill:
-        empty_vec 0x2300
-vec_gdn_complete:
-        empty_vec 0x2301
-vec_trace:
-        empty_vec 0x2302
-vec_extern:
-        empty_vec 0x2303
-vec_timer:
-        empty_vec 0x2304
-vec_gdn_avail:
-#	mtsri PASS, 0xef00
-	uintoff
-
-	addiu   $sp,$sp,-112
-	sw      $31,0x64($sp)
-	sw      $30,0x60($sp)
-	sw      $23,0x5c($sp)
-	sw      $22,0x58($sp)
-	sw      $21,0x54($sp)
-	sw      $20,0x50($sp)
-	sw      $19,0x4c($sp)
-	sw      $18,0x48($sp)
-	sw      $17,0x44($sp)
-	sw      $16,0x40($sp)
-	sw      $15,0x3c($sp)
-	sw      $14,0x38($sp)
-	sw      $13,0x34($sp)
-	sw      $12,0x30($sp)
-	sw      $11,0x2c($sp)
-	sw      $10,0x28($sp)
-	sw      $9,0x24($sp)
-	sw      $8,0x20($sp)
-	sw      $7,0x1c($sp)
-	sw      $6,0x18($sp)
-	sw      $5,0x14($sp)
-	sw      $4,0x10($sp)
-	sw      $3,0xc($sp)
-	sw      $2,0x8($sp)
-	.set noat
-	sw      $1,0x4($sp)
-	.set at
-	mfhi    $8
-	mflo    $9
-	sw      $8,0x68($sp)
-	sw      $9,0x6c($sp)
-	lw      $8,0x20($sp)
-	lw      $9,0x24($sp)
-
-	jal receiveObject
-
-	lw      $8,0x68($sp)
-	lw      $9,0x6c($sp)
-	mthi    $8
-	mtlo    $9
-	lw      $31,0x64($sp)
-	lw      $30,0x60($sp)
-	lw      $23,0x5c($sp)
-	lw      $22,0x58($sp)
-	lw      $21,0x54($sp)
-	lw      $20,0x50($sp)
-	lw      $19,0x4c($sp)
-	lw      $18,0x48($sp)
-	lw      $17,0x44($sp)
-	lw      $16,0x40($sp)
-	lw      $15,0x3c($sp)
-	lw      $14,0x38($sp)
-	lw      $13,0x34($sp)
-	lw      $12,0x30($sp)
-	lw      $11,0x2c($sp)
-	lw      $10,0x28($sp)
-	lw      $9,0x24($sp)
-	lw      $8,0x20($sp)
-	lw      $7,0x1c($sp)
-	lw      $6,0x18($sp)
-	lw      $5,0x14($sp)
-	lw      $4,0x10($sp)
-	lw      $3,0xc($sp)
-	lw      $2,0x8($sp)
-	.set noat
-	lw      $1,0x4($sp)
-	.set at
-	addiu   $sp,$sp,112
-
-#	mtsri PASS, 0xefff
-	dret
-vec_event_counters:
-        empty_vec 0x2306
diff --git a/Robust/src/Runtime/RAW/raw_interrupt2.s b/Robust/src/Runtime/RAW/raw_interrupt2.s
deleted file mode 100644
index 1f23cfb1..00000000
--- a/Robust/src/Runtime/RAW/raw_interrupt2.s
+++ /dev/null
@@ -1,132 +0,0 @@
-#include <raw_asm.h>
-
-	.text
-	.align	2
-	.globl	setup_ints
-	.ent	setup_ints
-setup_ints:	
-	# set up dynamic network
-	uintoff
-	intoff
-
-	# set gdn_cfg
-	xor $8,$8,$8
-	aui $8,$8,(3<<11)|(1 <<6)|(0 <<1)
-	ori $8, (0 <<12)|(2<<9)
-	mtsr	GDN_CFG,$8
-#	mtsr	PASS,$8
-
-	# set exception vector
-    la $3, interrupt_table
-#	mtsri PASS, 0xaaa
-#	mtsr PASS, $3
-    mtsr EX_BASE_ADDR, $3
-
-	# set EX_MASK
-	mfsr	$8,EX_MASK
-	ori	$8,$8,0x20          # 1 << kVEC_GDN_AVAIL
-	mtsr	EX_MASK,$8
-
-	inton
-	uinton
-	jr $31
-	.end	setup_ints
-
-.macro empty_vec fail_code
-        mtsri FAIL, \fail_code
-1:      b 1b
-        nop
-        nop
-.endm
-
-interrupt_table:
-
-vec_gdn_refill:
-        empty_vec 0x2300
-vec_gdn_complete:
-        empty_vec 0x2301
-vec_trace:
-        empty_vec 0x2302
-vec_extern:
-        empty_vec 0x2303
-vec_timer:
-        empty_vec 0x2304
-vec_gdn_avail:
-#	mtsri PASS, 0xef00
-	uintoff
-
-	addiu   $sp,$sp,-112
-	sw      $31,0x64($sp)
-	sw      $30,0x60($sp)
-	sw      $23,0x5c($sp)
-	sw      $22,0x58($sp)
-	sw      $21,0x54($sp)
-	sw      $20,0x50($sp)
-	sw      $19,0x4c($sp)
-	sw      $18,0x48($sp)
-	sw      $17,0x44($sp)
-	sw      $16,0x40($sp)
-	sw      $15,0x3c($sp)
-	sw      $14,0x38($sp)
-	sw      $13,0x34($sp)
-	sw      $12,0x30($sp)
-	sw      $11,0x2c($sp)
-	sw      $10,0x28($sp)
-	sw      $9,0x24($sp)
-	sw      $8,0x20($sp)
-	sw      $7,0x1c($sp)
-	sw      $6,0x18($sp)
-	sw      $5,0x14($sp)
-	sw      $4,0x10($sp)
-	sw      $3,0xc($sp)
-	sw      $2,0x8($sp)
-	.set noat
-	sw      $1,0x4($sp)
-	.set at
-	mfhi    $8
-	mflo    $9
-	sw      $8,0x68($sp)
-	sw      $9,0x6c($sp)
-	lw      $8,0x20($sp)
-	lw      $9,0x24($sp)
-
-	jal receiveObject
-
-	lw      $8,0x68($sp)
-	lw      $9,0x6c($sp)
-	mthi    $8
-	mtlo    $9
-	lw      $31,0x64($sp)
-	lw      $30,0x60($sp)
-	lw      $23,0x5c($sp)
-	lw      $22,0x58($sp)
-	lw      $21,0x54($sp)
-	lw      $20,0x50($sp)
-	lw      $19,0x4c($sp)
-	lw      $18,0x48($sp)
-	lw      $17,0x44($sp)
-	lw      $16,0x40($sp)
-	lw      $15,0x3c($sp)
-	lw      $14,0x38($sp)
-	lw      $13,0x34($sp)
-	lw      $12,0x30($sp)
-	lw      $11,0x2c($sp)
-	lw      $10,0x28($sp)
-	lw      $9,0x24($sp)
-	lw      $8,0x20($sp)
-	lw      $7,0x1c($sp)
-	lw      $6,0x18($sp)
-	lw      $5,0x14($sp)
-	lw      $4,0x10($sp)
-	lw      $3,0xc($sp)
-	lw      $2,0x8($sp)
-	.set noat
-	lw      $1,0x4($sp)
-	.set at
-	addiu   $sp,$sp,112
-
-#	mtsri PASS, 0xefff
-	dret
-vec_event_counters:
-        empty_vec 0x2306
-
diff --git a/Robust/src/Runtime/RAW/raw_interrupt4.s b/Robust/src/Runtime/RAW/raw_interrupt4.s
deleted file mode 100644
index a505e303..00000000
--- a/Robust/src/Runtime/RAW/raw_interrupt4.s
+++ /dev/null
@@ -1,132 +0,0 @@
-#include <raw_asm.h>
-
-	.text
-	.align	2
-	.globl	setup_ints
-	.ent	setup_ints
-setup_ints:	
-	# set up dynamic network
-	uintoff
-	intoff
-
-	# set gdn_cfg
-	xor $8,$8,$8
-	aui $8,$8,(3<<11)|(3 <<6)|(0 <<1)
-	ori $8, (0 <<12)|(2<<9)
-	mtsr	GDN_CFG,$8
-#	mtsr	PASS,$8
-
-	# set exception vector
-    la $3, interrupt_table
-#	mtsri PASS, 0xaaa
-#	mtsr PASS, $3
-    mtsr EX_BASE_ADDR, $3
-
-	# set EX_MASK
-	mfsr	$8,EX_MASK
-	ori	$8,$8,0x20          # 1 << kVEC_GDN_AVAIL
-	mtsr	EX_MASK,$8
-
-	inton
-	uinton
-	jr $31
-	.end	setup_ints
-
-.macro empty_vec fail_code
-        mtsri FAIL, \fail_code
-1:      b 1b
-        nop
-        nop
-.endm
-
-interrupt_table:
-
-vec_gdn_refill:
-        empty_vec 0x2300
-vec_gdn_complete:
-        empty_vec 0x2301
-vec_trace:
-        empty_vec 0x2302
-vec_extern:
-        empty_vec 0x2303
-vec_timer:
-        empty_vec 0x2304
-vec_gdn_avail:
-#	mtsri PASS, 0xef00
-	uintoff
-
-	addiu   $sp,$sp,-112
-	sw      $31,0x64($sp)
-	sw      $30,0x60($sp)
-	sw      $23,0x5c($sp)
-	sw      $22,0x58($sp)
-	sw      $21,0x54($sp)
-	sw      $20,0x50($sp)
-	sw      $19,0x4c($sp)
-	sw      $18,0x48($sp)
-	sw      $17,0x44($sp)
-	sw      $16,0x40($sp)
-	sw      $15,0x3c($sp)
-	sw      $14,0x38($sp)
-	sw      $13,0x34($sp)
-	sw      $12,0x30($sp)
-	sw      $11,0x2c($sp)
-	sw      $10,0x28($sp)
-	sw      $9,0x24($sp)
-	sw      $8,0x20($sp)
-	sw      $7,0x1c($sp)
-	sw      $6,0x18($sp)
-	sw      $5,0x14($sp)
-	sw      $4,0x10($sp)
-	sw      $3,0xc($sp)
-	sw      $2,0x8($sp)
-	.set noat
-	sw      $1,0x4($sp)
-	.set at
-	mfhi    $8
-	mflo    $9
-	sw      $8,0x68($sp)
-	sw      $9,0x6c($sp)
-	lw      $8,0x20($sp)
-	lw      $9,0x24($sp)
-
-	jal receiveObject
-
-	lw      $8,0x68($sp)
-	lw      $9,0x6c($sp)
-	mthi    $8
-	mtlo    $9
-	lw      $31,0x64($sp)
-	lw      $30,0x60($sp)
-	lw      $23,0x5c($sp)
-	lw      $22,0x58($sp)
-	lw      $21,0x54($sp)
-	lw      $20,0x50($sp)
-	lw      $19,0x4c($sp)
-	lw      $18,0x48($sp)
-	lw      $17,0x44($sp)
-	lw      $16,0x40($sp)
-	lw      $15,0x3c($sp)
-	lw      $14,0x38($sp)
-	lw      $13,0x34($sp)
-	lw      $12,0x30($sp)
-	lw      $11,0x2c($sp)
-	lw      $10,0x28($sp)
-	lw      $9,0x24($sp)
-	lw      $8,0x20($sp)
-	lw      $7,0x1c($sp)
-	lw      $6,0x18($sp)
-	lw      $5,0x14($sp)
-	lw      $4,0x10($sp)
-	lw      $3,0xc($sp)
-	lw      $2,0x8($sp)
-	.set noat
-	lw      $1,0x4($sp)
-	.set at
-	addiu   $sp,$sp,112
-
-#	mtsri PASS, 0xefff
-	dret
-vec_event_counters:
-        empty_vec 0x2306
-
diff --git a/Robust/src/Runtime/RAW/runtime_arch.h b/Robust/src/Runtime/RAW/runtime_arch.h
deleted file mode 100644
index 19210e6e..00000000
--- a/Robust/src/Runtime/RAW/runtime_arch.h
+++ /dev/null
@@ -1,70 +0,0 @@
-#ifndef RUNTIME_ARCH
-#define RUNTIME_ARCH
-
-#ifdef PROFILE
-#ifdef RAWUSEIO
-#include "stdio.h"
-#include "string.h"
-#endif
-#endif
-#include <raw.h>
-#include <raw_compiler_defs.h>
-
-#define BAMBOO_CACHE_LINE_SIZE (kCacheLineSize)
-#define BAMBOO_CACHE_LINE_MASK (kCacheLineMask)
-
-#define BAMBOO_NUM_OF_CORE corenum   // the # of current residing core
-#define BAMBOO_GET_NUM_OF_CORE() (raw_get_abs_pos_x() + raw_get_array_size_x() * raw_get_abs_pos_y())  // compute the # of current residing core
-#define BAMBOO_DEBUGPRINT(x) (raw_test_pass((x)))
-#define BAMBOO_DEBUGPRINT_REG(x) (raw_test_pass_reg((x)))
-
-#define BAMBOO_SHARE_MEM_CALLOC(x, y) (calloc((x), (y)))  // allocate an array of x elements each of whose size in bytes is y on shared memory 
-
-#ifdef INTERRUPT
-// locks for global data structures related to obj queue
-#define BAMBOO_START_CRITICAL_SECTION_OBJ_QUEUE() raw_user_interrupts_off()
-#define BAMBOO_CLOSE_CRITICAL_SECTION_OBJ_QUEUE() raw_user_interrupts_on()
-// locks for global data structures related to status data
-#define BAMBOO_START_CRITICAL_SECTION_STATUS() raw_user_interrupts_off()
-#define BAMBOO_CLOSE_CRITICAL_SECTION_STATUS() raw_user_interrupts_on()
-// locks for global data structures related to msg data
-#define BAMBOO_START_CRITICAL_SECTION_MSG() raw_user_interrupts_off()
-#define BAMBOO_CLOSE_CRITICAL_SECTION_MSG() raw_user_interrupts_on()
-// locks for global data structures related to lock table
-#define BAMBOO_START_CRITICAL_SECTION_LOCK() raw_user_interrupts_off()
-#define BAMBOO_CLOSE_CRITICAL_SECTION_LOCK() raw_user_interrupts_on()
-// locks for allocating memory
-#define BAMBOO_START_CRITICAL_SECTION_MEM() raw_user_interrupts_off()
-#define BAMBOO_CLOSE_CRITICAL_SECTION_MEM() raw_user_interrupts_on()
-// locks for all global data structures
-#define BAMBOO_START_CRITICAL_SECTION() raw_user_interrupts_off()
-#define BAMBOO_CLOSE_CRITICAL_SECTION() raw_user_interrupts_on()
-#else
-// locks for global data structures related to obj queue
-#define BAMBOO_START_CRITICAL_SECTION_OBJ_QUEUE()  
-#define BAMBOO_CLOSE_CRITICAL_SECTION_OBJ_QUEUE()  
-// locks for global data structures related to status data
-#define BAMBOO_START_CRITICAL_SECTION_STATUS()  
-#define BAMBOO_CLOSE_CRITICAL_SECTION_STATUS()  
-// locks for global data structures related to msg data
-#define BAMBOO_START_CRITICAL_SECTION_MSG()  
-#define BAMBOO_CLOSE_CRITICAL_SECTION_MSG()  
-// locks for global data structures related to lock table
-#define BAMBOO_START_CRITICAL_SECTION_LOCK()  
-#define BAMBOO_CLOSE_CRITICAL_SECTION_LOCK()  
-// locks for allocating memory
-#define BAMBOO_START_CRITICAL_SECTION_MEM()  
-#define BAMBOO_CLOSE_CRITICAL_SECTION_MEM()  
-// locks for all global data structures
-#define BAMBOO_START_CRITICAL_SECTION()  
-#define BAMBOO_CLOSE_CRITICAL_SECTION()  
-#endif
-
-#define BAMBOO_WAITING_FOR_LOCK() (receiveObject())
-#define BAMBOO_CACHE_FLUSH_RANGE(x, y)  (raw_invalidate_cache_range((x), (y)))
-#define BAMBOO_CACHE_FLUSH_ALL() (raw_flush_entire_cache())
-#define BAMBOO_EXIT(x) (raw_test_done((x)))
-#define BAMBOO_MSG_AVAIL() (gdn_input_avail())
-#define BAMBOO_GET_EXE_TIME() (raw_get_cycle())
-
-#endif // #ifndef RUNTIME_ARCH
diff --git a/Robust/src/Runtime/RAW/task_arch.c b/Robust/src/Runtime/RAW/task_arch.c
deleted file mode 100644
index 4a247a3b..00000000
--- a/Robust/src/Runtime/RAW/task_arch.c
+++ /dev/null
@@ -1,1592 +0,0 @@
-#ifdef TASK
-#include "runtime.h"
-#include "multicoreruntime.h"
-#include "runtime_arch.h"
-
-__attribute__((always_inline)) inline void initialization() {
-} // initialization()
-
-__attribute__((always_inline)) inline void initCommunication() {
-#ifdef INTERRUPT
-  if (corenum < NUMCORES) {
-    // set up interrupts
-    setup_ints();
-    raw_user_interrupts_on();
-  }
-#endif
-}
-
-__attribute__((always_inline)) inline void fakeExecution()  {
-  // handle communications
-  while(true) {
-	  receiveObject();
-  }
-}
-
-#ifdef USEIO
-int main(void) {
-#else
-void begin() {
-#endif // #ifdef USEIO
-  run(NULL);
-}
-
-__attribute__((always_inline)) inline void terminate()  {
-	raw_test_done(1);
-}
-
-// helper function to compute the coordinates of a core from the core number
-#define calCoords(core_num, coordX, coordY) \
-  *(coordX) = (core_num) % raw_get_array_size_x();\
-  *(coordY) = core_num / raw_get_array_size_x();
-
-// transfer an object to targetcore
-// format: object
-inline void transferObject(struct transObjInfo * transObj) {//  __attribute__((always_inline)){
-  void * obj = transObj->objptr;
-  int type=((int *)obj)[0];
-  int targetcore = transObj->targetcore;  
-
-  unsigned msgHdr;
-  int self_y, self_x, target_y, target_x;
-  // for 32 bit machine, the size of fixed part is always 3 words
-  int msgsize = 3 + transObj->length * 2;
-  int i = 0;
-
-  struct ___Object___ * newobj = (struct ___Object___ *)obj;
-
-  calCoords(corenum, &self_x, &self_y);
-  calCoords(targetcore, &target_x, &target_y);
-  isMsgSending = true;
-  // Build the message header
-  msgHdr = construct_dyn_hdr(0, msgsize, 0,             // msgsize word sent.
-                             self_y, self_x,
-                             target_y, target_x);
-  // start sending msg, set sand msg flag
-  gdn_send(msgHdr);                     
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xbbbb);
-  BAMBOO_DEBUGPRINT(0xb000 + targetcore);       // targetcore
-#endif
-  gdn_send(0);
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0);
-#endif
-  gdn_send(msgsize);
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT_REG(msgsize);
-#endif
-  gdn_send((int)obj);
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT_REG(obj);
-#endif
-  for(i = 0; i < transObj->length; ++i) {
-    int taskindex = transObj->queues[2*i];
-    int paramindex = transObj->queues[2*i+1];
-    gdn_send(taskindex);
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT_REG(taskindex);
-#endif
-    gdn_send(paramindex);
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT_REG(paramindex);
-#endif
-  }
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xffff);
-#endif
-  // end of sending this msg, set sand msg flag false
-  isMsgSending = false;
-  ++(self_numsendobjs);
-  // check if there are pending msgs
-  while(isMsgHanging) {
-	  // get the msg from outmsgdata[]
-	  // length + target + msg
-	  outmsgleft = outmsgdata[outmsgindex++];
-	  int target = outmsgdata[outmsgindex++];
-	  calCoords(target, &target_x, &target_y);
-	  // mark to start sending the msg
-	  isMsgSending = true;
-	  // Build the message header
-	  msgHdr = construct_dyn_hdr(0, outmsgleft, 0,                        // msgsize word sent.
-                                 self_y, self_x,
-                                 target_y, target_x);
-	  gdn_send(msgHdr);                           
-#ifdef DEBUG
-	  BAMBOO_DEBUGPRINT(0xbbbb);
-	  BAMBOO_DEBUGPRINT(0xb000 + target);             // targetcore
-#endif
-	  while(outmsgleft-- > 0) {
-		  gdn_send(outmsgdata[outmsgindex++]);
-#ifdef DEBUG
-		  BAMBOO_DEBUGPRINT_REG(outmsgdata[outmsgindex - 1]);
-#endif
-	  }
-#ifdef DEBUG
-	  BAMBOO_DEBUGPRINT(0xffff);
-#endif
-	  // mark to end sending the msg
-	  isMsgSending = false;
-	  BAMBOO_START_CRITICAL_SECTION_MSG();
-	  // check if there are still msg hanging
-	  if(outmsgindex == outmsglast) {
-		  // no more msgs
-		  outmsgindex = outmsglast = 0;
-		  isMsgHanging = false;
-	  }
-	  BAMBOO_CLOSE_CRITICAL_SECTION_MSG();
-  }
-}
-
-__attribute__((always_inline)) inline void send_msg_1 (int targetcore, 
-		                                                   int n0) {
-  // send this msg
-  unsigned msgHdr;
-  int self_y, self_x, target_y, target_x;
-  msglength = 1;
-
-  // get the x-coord and y-coord of the target core
-  calCoords(corenum, &self_x, &self_y);
-  calCoords(targetcore, &target_x, &target_y);
-
-  // mark to start sending the msg
-  isMsgSending = true;
-  // Build the message header
-  msgHdr = construct_dyn_hdr(0, msglength, 0,             // msgsize word sent.
-                             self_y, self_x,
-                             target_y, target_x);
-  gdn_send(msgHdr);                     // Send the message header
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xbbbb);
-  BAMBOO_DEBUGPRINT(0xb000 + targetcore);       // targetcore
-#endif
-  gdn_send(n0);
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(n0);
-  BAMBOO_DEBUGPRINT(0xffff);
-#endif
-  // mark to end sending the msg
-  isMsgSending = false;
-  // check if there are pending msgs
-  while(isMsgHanging) {
-	  // get the msg from outmsgdata[]
-	  // length + target + msg
-	  outmsgleft = outmsgdata[outmsgindex++];
-	  int target = outmsgdata[outmsgindex++];
-	  calCoords(target, &target_x, &target_y);
-	  // mark to start sending the msg
-	  isMsgSending = true;
-	  // Build the message header
-	  msgHdr = construct_dyn_hdr(0, outmsgleft, 0,                        // msgsize word sent.
-                                 self_y, self_x,
-                                 target_y, target_x);
-	  gdn_send(msgHdr);                           
-#ifdef DEBUG
-	  BAMBOO_DEBUGPRINT(0xbbbb);
-	  BAMBOO_DEBUGPRINT(0xb000 + target);             // targetcore
-#endif
-	  while(outmsgleft-- > 0) {
-		  gdn_send(outmsgdata[outmsgindex++]);
-#ifdef DEBUG
-		  BAMBOO_DEBUGPRINT_REG(outmsgdata[outmsgindex - 1]);
-#endif
-	  }
-#ifdef DEBUG
-	  BAMBOO_DEBUGPRINT(0xffff);
-#endif
-	  // mark to end sending the msg
-	  isMsgSending = false;
-	  BAMBOO_START_CRITICAL_SECTION_MSG();
-	  // check if there are still msg hanging
-	  if(outmsgindex == outmsglast) {
-		  // no more msgs
-		  outmsgindex = outmsglast = 0;
-		  isMsgHanging = false;
-	  }
-	  BAMBOO_CLOSE_CRITICAL_SECTION_MSG();
-  }
-}
-
-__attribute__((always_inline)) inline void send_msg_2 (int targetcore, 
-		                                                   int n0, 
-																											 int n1) {
-  // send this msg
-  unsigned msgHdr;
-  int self_y, self_x, target_y, target_x;
-  msglength = 2;
-
-  // get the x-coord and y-coord of the target core
-  calCoords(corenum, &self_x, &self_y);
-  calCoords(targetcore, &target_x, &target_y);
-
-  // mark to start sending the msg
-  isMsgSending = true;
-  // Build the message header
-  msgHdr = construct_dyn_hdr(0, msglength, 0,             // msgsize word sent.
-                             self_y, self_x,
-                             target_y, target_x);
-  gdn_send(msgHdr);                     // Send the message header
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xbbbb);
-  BAMBOO_DEBUGPRINT(0xb000 + targetcore);       // targetcore
-#endif
-  gdn_send(n0);
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(n0);
-#endif
-  gdn_send(n1);
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(n1);
-  BAMBOO_DEBUGPRINT(0xffff);
-#endif
-  // mark to end sending the msg
-  isMsgSending = false;
-  // check if there are pending msgs
-  while(isMsgHanging) {
-	  // get the msg from outmsgdata[]
-	  // length + target + msg
-	  outmsgleft = outmsgdata[outmsgindex++];
-	  int target = outmsgdata[outmsgindex++];
-	  calCoords(target, &target_x, &target_y);
-	  // mark to start sending the msg
-	  isMsgSending = true;
-	  // Build the message header
-	  msgHdr = construct_dyn_hdr(0, outmsgleft, 0,                        // msgsize word sent.
-                                 self_y, self_x,
-                                 target_y, target_x);
-	  gdn_send(msgHdr);                           
-#ifdef DEBUG
-	  BAMBOO_DEBUGPRINT(0xbbbb);
-	  BAMBOO_DEBUGPRINT(0xb000 + target);             // targetcore
-#endif
-	  while(outmsgleft-- > 0) {
-		  gdn_send(outmsgdata[outmsgindex++]);
-#ifdef DEBUG
-		  BAMBOO_DEBUGPRINT_REG(outmsgdata[outmsgindex - 1]);
-#endif
-	  }
-#ifdef DEBUG
-	  BAMBOO_DEBUGPRINT(0xffff);
-#endif
-	  // mark to end sending the msg
-	  isMsgSending = false;
-	  BAMBOO_START_CRITICAL_SECTION_MSG();
-	  // check if there are still msg hanging
-	  if(outmsgindex == outmsglast) {
-		  // no more msgs
-		  outmsgindex = outmsglast = 0;
-		  isMsgHanging = false;
-	  }
-	  BAMBOO_CLOSE_CRITICAL_SECTION_MSG();
-  }
-}
-
-__attribute__((always_inline)) inline void send_msg_3 (int targetcore, 
-		                                                   int n0, 
-																											 int n1, 
-																											 int n2) {
-  // send this msg
-  unsigned msgHdr;
-  int self_y, self_x, target_y, target_x;
-  msglength = 3;
-
-  // get the x-coord and y-coord of the target core
-  calCoords(corenum, &self_x, &self_y);
-  calCoords(targetcore, &target_x, &target_y);
-
-  // mark to start sending the msg
-  isMsgSending = true;
-  // Build the message header
-  msgHdr = construct_dyn_hdr(0, msglength, 0,             // msgsize word sent.
-                             self_y, self_x,
-                             target_y, target_x);
-  gdn_send(msgHdr);                     // Send the message header
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xbbbb);
-  BAMBOO_DEBUGPRINT(0xb000 + targetcore);       // targetcore
-#endif
-  gdn_send(n0);
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(n0);
-#endif
-  gdn_send(n1);
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(n1);
-#endif
-  gdn_send(n2);
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(n2);
-  BAMBOO_DEBUGPRINT(0xffff);
-#endif
-  // mark to end sending the msg
-  isMsgSending = false;
-  // check if there are pending msgs
-  while(isMsgHanging) {
-	  // get the msg from outmsgdata[]
-	  // length + target + msg
-	  outmsgleft = outmsgdata[outmsgindex++];
-	  int target = outmsgdata[outmsgindex++];
-	  calCoords(target, &target_x, &target_y);
-	  // mark to start sending the msg
-	  isMsgSending = true;
-	  // Build the message header
-	  msgHdr = construct_dyn_hdr(0, outmsgleft, 0,                        // msgsize word sent.
-                                 self_y, self_x,
-                                 target_y, target_x);
-	  gdn_send(msgHdr);                           
-#ifdef DEBUG
-	  BAMBOO_DEBUGPRINT(0xbbbb);
-	  BAMBOO_DEBUGPRINT(0xb000 + target);             // targetcore
-#endif
-	  while(outmsgleft-- > 0) {
-		  gdn_send(outmsgdata[outmsgindex++]);
-#ifdef DEBUG
-		  BAMBOO_DEBUGPRINT_REG(outmsgdata[outmsgindex - 1]);
-#endif
-	  }
-#ifdef DEBUG
-	  BAMBOO_DEBUGPRINT(0xffff);
-#endif
-	  // mark to end sending the msg
-	  isMsgSending = false;
-	  BAMBOO_START_CRITICAL_SECTION_MSG();
-	  // check if there are still msg hanging
-	  if(outmsgindex == outmsglast) {
-		  // no more msgs
-		  outmsgindex = outmsglast = 0;
-		  isMsgHanging = false;
-	  }
-	  BAMBOO_CLOSE_CRITICAL_SECTION_MSG();
-  }
-}
-
-__attribute__((always_inline)) inline void send_msg_4 (int targetcore, 
-		                                                   int n0, 
-																											 int n1, 
-																											 int n2, 
-																											 int n3) {
-  // send this msg
-  unsigned msgHdr;
-  int self_y, self_x, target_y, target_x;
-  msglength = 4;
-
-  // get the x-coord and y-coord of the target core
-  calCoords(corenum, &self_x, &self_y);
-  calCoords(targetcore, &target_x, &target_y);
-
-  // mark to start sending the msg
-  isMsgSending = true;
-  // Build the message header
-  msgHdr = construct_dyn_hdr(0, msglength, 0,             // msgsize word sent.
-                             self_y, self_x,
-                             target_y, target_x);
-  gdn_send(msgHdr);                     // Send the message header
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xbbbb);
-  BAMBOO_DEBUGPRINT(0xb000 + targetcore);       // targetcore
-#endif
-  gdn_send(n0);
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(n0);
-#endif
-  gdn_send(n1);
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(n1);
-#endif
-  gdn_send(n2);
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(n2);
-#endif
-  gdn_send(n3);
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(n3);
-  BAMBOO_DEBUGPRINT(0xffff);
-#endif
-  // mark to end sending the msg
-  isMsgSending = false;
-  // check if there are pending msgs
-  while(isMsgHanging) {
-	  // get the msg from outmsgdata[]
-	  // length + target + msg
-	  outmsgleft = outmsgdata[outmsgindex++];
-	  int target = outmsgdata[outmsgindex++];
-	  calCoords(target, &target_x, &target_y);
-	  // mark to start sending the msg
-	  isMsgSending = true;
-	  // Build the message header
-	  msgHdr = construct_dyn_hdr(0, outmsgleft, 0,                        // msgsize word sent.
-                                 self_y, self_x,
-                                 target_y, target_x);
-	  gdn_send(msgHdr);                           
-#ifdef DEBUG
-	  BAMBOO_DEBUGPRINT(0xbbbb);
-	  BAMBOO_DEBUGPRINT(0xb000 + target);             // targetcore
-#endif
-	  while(outmsgleft-- > 0) {
-		  gdn_send(outmsgdata[outmsgindex++]);
-#ifdef DEBUG
-		  BAMBOO_DEBUGPRINT_REG(outmsgdata[outmsgindex - 1]);
-#endif
-	  }
-#ifdef DEBUG
-	  BAMBOO_DEBUGPRINT(0xffff);
-#endif
-	  // mark to end sending the msg
-	  isMsgSending = false;
-	  BAMBOO_START_CRITICAL_SECTION_MSG();
-	  // check if there are still msg hanging
-	  if(outmsgindex == outmsglast) {
-		  // no more msgs
-		  outmsgindex = outmsglast = 0;
-		  isMsgHanging = false;
-	  }
-	  BAMBOO_CLOSE_CRITICAL_SECTION_MSG();
-  }
-}
-
-__attribute__((always_inline)) inline void send_msg_5 (int targetcore, 
-		                                                   int n0, 
-																											 int n1, 
-																											 int n2, 
-																											 int n3, 
-																											 int n4) {
-  // send this msg
-  unsigned msgHdr;
-  int self_y, self_x, target_y, target_x;
-  msglength = 5;
-
-  // get the x-coord and y-coord of the target core
-  calCoords(corenum, &self_x, &self_y);
-  calCoords(targetcore, &target_x, &target_y);
-
-  // mark to start sending the msg
-  isMsgSending = true;
-  // Build the message header
-  msgHdr = construct_dyn_hdr(0, msglength, 0,             // msgsize word sent.
-                             self_y, self_x,
-                             target_y, target_x);
-  gdn_send(msgHdr);                     // Send the message header
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xbbbb);
-  BAMBOO_DEBUGPRINT(0xb000 + targetcore);       // targetcore
-#endif
-  gdn_send(n0);
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(n0);
-#endif
-  gdn_send(n1);
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(n1);
-#endif
-  gdn_send(n2);
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(n2);
-#endif
-  gdn_send(n3);
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(n3);
-#endif
-  gdn_send(n4);
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(n4);
-  BAMBOO_DEBUGPRINT(0xffff);
-#endif
-  // mark to end sending the msg
-  isMsgSending = false;
-  // check if there are pending msgs
-  while(isMsgHanging) {
-	  // get the msg from outmsgdata[]
-	  // length + target + msg
-	  outmsgleft = outmsgdata[outmsgindex++];
-	  int target = outmsgdata[outmsgindex++];
-	  calCoords(target, &target_x, &target_y);
-	  // mark to start sending the msg
-	  isMsgSending = true;
-	  // Build the message header
-	  msgHdr = construct_dyn_hdr(0, outmsgleft, 0,                        // msgsize word sent.
-                                 self_y, self_x,
-                                 target_y, target_x);
-	  gdn_send(msgHdr);                           
-#ifdef DEBUG
-	  BAMBOO_DEBUGPRINT(0xbbbb);
-	  BAMBOO_DEBUGPRINT(0xb000 + target);             // targetcore
-#endif
-	  while(outmsgleft-- > 0) {
-		  gdn_send(outmsgdata[outmsgindex++]);
-#ifdef DEBUG
-		  BAMBOO_DEBUGPRINT_REG(outmsgdata[outmsgindex - 1]);
-#endif
-	  }
-#ifdef DEBUG
-	  BAMBOO_DEBUGPRINT(0xffff);
-#endif
-	  // mark to end sending the msg
-	  isMsgSending = false;
-	  BAMBOO_START_CRITICAL_SECTION_MSG();
-	  // check if there are still msg hanging
-	  if(outmsgindex == outmsglast) {
-		  // no more msgs
-		  outmsgindex = outmsglast = 0;
-		  isMsgHanging = false;
-	  }
-	  BAMBOO_CLOSE_CRITICAL_SECTION_MSG();
-  }
-}
-
-__attribute__((always_inline)) inline void send_msg_6 (int targetcore, 
-		                                                   int n0, 
-																											 int n1, 
-																											 int n2, 
-																											 int n3, 
-																											 int n4, 
-																											 int n5) {
-  // send this msg
-  unsigned msgHdr;
-  int self_y, self_x, target_y, target_x;
-  msglength = 6;
-
-  // get the x-coord and y-coord of the target core
-  calCoords(corenum, &self_x, &self_y);
-  calCoords(targetcore, &target_x, &target_y);
-
-  // mark to start sending the msg
-  isMsgSending = true;
-  // Build the message header
-  msgHdr = construct_dyn_hdr(0, msglength, 0,             // msgsize word sent.
-                             self_y, self_x,
-                             target_y, target_x);
-  gdn_send(msgHdr);                     // Send the message header
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xbbbb);
-  BAMBOO_DEBUGPRINT(0xb000 + targetcore);       // targetcore
-#endif
-  gdn_send(n0);
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(n0);
-#endif
-  gdn_send(n1);
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(n1);
-#endif
-  gdn_send(n2);
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(n2);
-#endif
-  gdn_send(n3);
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(n3);
-#endif
-  gdn_send(n4);
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(n4);
-#endif
-  gdn_send(n5);
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(n5);
-  BAMBOO_DEBUGPRINT(0xffff);
-#endif
-  // mark to end sending the msg
-  isMsgSending = false;
-  // check if there are pending msgs
-  while(isMsgHanging) {
-	  // get the msg from outmsgdata[]
-	  // length + target + msg
-	  outmsgleft = outmsgdata[outmsgindex++];
-	  int target = outmsgdata[outmsgindex++];
-	  calCoords(target, &target_x, &target_y);
-	  // mark to start sending the msg
-	  isMsgSending = true;
-	  // Build the message header
-	  msgHdr = construct_dyn_hdr(0, outmsgleft, 0,                        // msgsize word sent.
-                                 self_y, self_x,
-                                 target_y, target_x);
-	  gdn_send(msgHdr);                           
-#ifdef DEBUG
-	  BAMBOO_DEBUGPRINT(0xbbbb);
-	  BAMBOO_DEBUGPRINT(0xb000 + target);             // targetcore
-#endif
-	  while(outmsgleft-- > 0) {
-		  gdn_send(outmsgdata[outmsgindex++]);
-#ifdef DEBUG
-		  BAMBOO_DEBUGPRINT_REG(outmsgdata[outmsgindex - 1]);
-#endif
-	  }
-#ifdef DEBUG
-	  BAMBOO_DEBUGPRINT(0xffff);
-#endif
-	  // mark to end sending the msg
-	  isMsgSending = false;
-	  BAMBOO_START_CRITICAL_SECTION_MSG();
-	  // check if there are still msg hanging
-	  if(outmsgindex == outmsglast) {
-		  // no more msgs
-		  outmsgindex = outmsglast = 0;
-		  isMsgHanging = false;
-	  }
-	  BAMBOO_CLOSE_CRITICAL_SECTION_MSG();
-  }
-}
-
-__attribute__((always_inline)) inline void cache_msg_2 (int targetcore, 
-		                                                    int n0, 
-																												int n1) {
-  // cache this msg
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xdede);
-#endif
-  isMsgHanging = true;
-  // cache the msg in outmsgdata and send it later
-  // msglength + target core + msg
-  outmsgdata[outmsglast++] = 2;
-  outmsgdata[outmsglast++] = targetcore;
-  outmsgdata[outmsglast++] = n0;
-  outmsgdata[outmsglast++] = n1;
-}
-
-__attribute__((always_inline)) inline void cache_msg_3 (int targetcore, 
-		                                                    int n0, 
-																												int n1, 
-																												int n2) {
-  // cache this msg
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xdede);
-#endif
-  isMsgHanging = true;
-  // cache the msg in outmsgdata and send it later
-  // msglength + target core + msg
-  outmsgdata[outmsglast++] = 3;
-  outmsgdata[outmsglast++] = targetcore;
-  outmsgdata[outmsglast++] = n0;
-  outmsgdata[outmsglast++] = n1;
-  outmsgdata[outmsglast++] = n2;
-}
-
-__attribute__((always_inline)) inline void cache_msg_4 (int targetcore, 
-		                                                    int n0, 
-																												int n1, 
-																												int n2, 
-																												int n3) {
-  // cache this msg
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xdede);
-#endif
-  isMsgHanging = true;
-  // cache the msg in outmsgdata and send it later
-  // msglength + target core + msg
-  outmsgdata[outmsglast++] = 4;
-  outmsgdata[outmsglast++] = targetcore;
-  outmsgdata[outmsglast++] = n0;
-  outmsgdata[outmsglast++] = n1;
-  outmsgdata[outmsglast++] = n2;
-  outmsgdata[outmsglast++] = n3;
-}
-
-__attribute__((always_inline)) inline void cache_msg_5 (int targetcore, 
-		                                                    int n0, 
-																												int n1, 
-																												int n2, 
-																												int n3, 
-																												int n4) {
-  // cache this msg
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xdede);
-#endif
-  isMsgHanging = true;
-  // cache the msg in outmsgdata and send it later
-  // msglength + target core + msg
-  outmsgdata[outmsglast++] = 5;
-  outmsgdata[outmsglast++] = targetcore;
-  outmsgdata[outmsglast++] = n0;
-  outmsgdata[outmsglast++] = n1;
-  outmsgdata[outmsglast++] = n2;
-  outmsgdata[outmsglast++] = n3;
-  outmsgdata[outmsglast++] = n4;
-}
-
-
-__attribute__((always_inline)) inline void cache_msg_6 (int targetcore, 
-		                                                    int n0, 
-																												int n1, 
-																												int n2, 
-																												int n3, 
-																												int n4, 
-																												int n5) {
-  // cache this msg
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xdede);
-#endif
-  isMsgHanging = true;
-  // cache the msg in outmsgdata and send it later
-  // msglength + target core + msg
-  outmsgdata[outmsglast++] = 6;
-  outmsgdata[outmsglast++] = targetcore;
-  outmsgdata[outmsglast++] = n0;
-  outmsgdata[outmsglast++] = n1;
-  outmsgdata[outmsglast++] = n2;
-  outmsgdata[outmsglast++] = n3;
-  outmsgdata[outmsglast++] = n4;
-  outmsgdata[outmsglast++] = n5;
-}
-
-__attribute__((always_inline)) inline int receiveMsg() {
-  if(gdn_input_avail() == 0) {
-#ifdef DEBUG
-    if(corenum < NUMCORES) {
-      BAMBOO_DEBUGPRINT(0xd001);
-    }
-#endif
-    return -1;
-  }
-#ifdef PROFILE
-  /*if(isInterrupt && (!interruptInfoOverflow)) {
-     // BAMBOO_DEBUGPRINT(0xffff);
-     interruptInfoArray[interruptInfoIndex] = RUNMALLOC_I(sizeof(struct interrupt_info));
-     interruptInfoArray[interruptInfoIndex]->startTime = raw_get_cycle();
-     interruptInfoArray[interruptInfoIndex]->endTime = -1;
-     }*/
-#endif
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xcccc);
-#endif
-  while((gdn_input_avail() != 0) && (msgdataindex < msglength)) {
-    msgdata[msgdataindex] = gdn_receive();
-    if(msgdataindex == 0) {
-		if(msgdata[0] > 0xc) {
-			msglength = 3;
-		} else if (msgdata[0] == 0xc) {
-			msglength = 1;
-		} else if(msgdata[0] > 8) {
-			msglength = 4;
-		} else if(msgdata[0] == 8) {
-			msglength = 6;
-		} else if(msgdata[0] > 5) {
-			msglength = 2;
-		} else if (msgdata[0] > 2) {
-			msglength = 4;
-		} else if (msgdata[0] == 2) {
-			msglength = 5;
-		} else if (msgdata[0] > 0) {
-			msglength = 4;
-		}
-    } else if((msgdataindex == 1) && (msgdata[0] == 0)) {
-      msglength = msgdata[msgdataindex];
-    }
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT_REG(msgdata[msgdataindex]);
-#endif
-    msgdataindex++;
-  }
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xffff);
-#endif
-  return msgdataindex;
-}
-
-bool getreadlock(void * ptr) {
-  int targetcore = 0;
-  lockobj = (int)ptr;
-  if(((struct ___Object___ *)ptr)->lock == NULL) {
-	lock2require = lockobj;
-  } else {
-	lock2require = (int)(((struct ___Object___ *)ptr)->lock);
-  }
-  targetcore = (lock2require >> 5) % NUMCORES;
-  lockflag = false;
-#ifndef INTERRUPT
-  reside = false;
-#endif
-  lockresult = 0;
-
-  if(targetcore == BAMBOO_NUM_OF_CORE) {
-    // reside on this core
-    int deny = 0;
-	BAMBOO_START_CRITICAL_SECTION_LOCK();
-#ifdef DEBUG
-	BAMBOO_DEBUGPRINT(0xf001);
-#endif
-	deny = processlockrequest(0, lock2require, (int)ptr, BAMBOO_NUM_OF_CORE, BAMBOO_NUM_OF_CORE, false);
-	BAMBOO_CLOSE_CRITICAL_SECTION_LOCK();
-#ifdef DEBUG
-	BAMBOO_DEBUGPRINT(0xf000);
-#endif
-    if(deny == -1) {
-		// redirected
-		return true;
-	} else {
-		if(lockobj == (int)ptr) {
-			if(deny) {
-				lockresult = 0;
-			} else {
-				lockresult = 1;
-			}
-			lockflag = true;
-#ifndef INTERRUPT
-			reside = true;
-#endif
-		} else {
-			// conflicts on lockresults
-			BAMBOO_EXIT(0xa018);
-		}
-	}
-    return true;
-  } else {
-	  // send lock request msg
-	  // for 32 bit machine, the size is always 5 words
-	  send_msg_5(targetcore, LOCKREQUEST, 0, (int)ptr, 
-				       lock2require, BAMBOO_NUM_OF_CORE);
-  }
-  return true;
-}
-
-bool getreadlock_I_r(void * ptr, void * redirectlock, int core, bool cache);
-bool getwritelock_I_r(void* lock, void* redirectlock, int core, bool cache);
-
-void releasereadlock(void * ptr) {
-  int targetcore = 0;
-  int reallock = 0;
-  if(((struct ___Object___ *)ptr)->lock == NULL) {
-	reallock = (int)ptr;
-  } else {
-	reallock = (int)(((struct ___Object___ *)ptr)->lock);
-  }
-  targetcore = (reallock >> 5) % NUMCORES;
-
-  if(targetcore == BAMBOO_NUM_OF_CORE) {
-	BAMBOO_START_CRITICAL_SECTION_LOCK();
-#ifdef DEBUG
-	BAMBOO_DEBUGPRINT(0xf001);
-#endif
-    // reside on this core
-    if(!RuntimeHashcontainskey(locktbl, reallock)) {
-      // no locks for this object, something is wrong
-      BAMBOO_EXIT(0xa019);
-    } else {
-      int rwlock_obj = 0;
-	  struct LockValue * lockvalue = NULL;
-      RuntimeHashget(locktbl, reallock, &rwlock_obj);
-	  lockvalue = (struct LockValue *)rwlock_obj;
-      lockvalue->value--;
-    }
-	BAMBOO_CLOSE_CRITICAL_SECTION_LOCK();
-#ifdef DEBUG
-	BAMBOO_DEBUGPRINT(0xf000);
-#endif
-    return;
-  } else {
-	// send lock release msg
-	// for 32 bit machine, the size is always 4 words
-	send_msg_4(targetcore, LOCKRELEASE, 0, (int)ptr, reallock);
-  }
-}
-
-// redirected lock request
-bool getreadlock_I_r(void * ptr, void * redirectlock, int core, bool cache) {
-  int targetcore = 0;
-  
-  if(core == BAMBOO_NUM_OF_CORE) {
-	  lockobj = (int)ptr;
-	  lock2require = (int)redirectlock;
-	  lockflag = false;
-#ifndef INTERRUPT
-	  reside = false;
-#endif
-	  lockresult = 0;
-  }  
-  targetcore = ((int)redirectlock >> 5) % NUMCORES;
-  
-  if(targetcore == BAMBOO_NUM_OF_CORE) {
-    // reside on this core
-    int deny = processlockrequest(0, (int)redirectlock, (int)ptr, BAMBOO_NUM_OF_CORE, core, cache);
-	if(deny == -1) {
-		// redirected
-		return true;
-	} else {
-		if(core == BAMBOO_NUM_OF_CORE) {
-			if(lockobj == (int)ptr) {
-				if(deny) {
-					lockresult = 0;
-				} else {
-					lockresult = 1;
-					RuntimeHashadd_I(objRedirectLockTbl, (int)ptr, (int)redirectlock);
-				}
-				lockflag = true;
-#ifndef INTERRUPT
-				reside = true;
-#endif
-			} else {
-				// conflicts on lockresults
-				BAMBOO_EXIT(0xa01a);
-			}
-			return true;
-		} else {
-			// send lock grant/deny request to the root requiring core
-			// check if there is still some msg on sending
-			if((!cache) || (cache && !isMsgSending)) {
-				send_msg_4(core, deny==1?REDIRECTDENY:REDIRECTGROUNT, 0, 
-						       (int)ptr, (int)redirectlock);
-			} else {
-				cache_msg_4(core, deny==1?REDIRECTDENY:REDIRECTGROUNT, 0, 
-						        (int)ptr, (int)redirectlock);
-			}
-		}
-	}
-  } else {
-	// redirect the lock request
-	// for 32 bit machine, the size is always 6 words
-	if((!cache) || (cache && !isMsgSending)) {
-		send_msg_6(targetcore, REDIRECTLOCK, 0, (int)ptr, lock2require, 
-				       core, BAMBOO_NUM_OF_CORE);
-	} else {
-		cache_msg_6(targetcore, REDIRECTLOCK, 0, (int)ptr, lock2require, 
-				        core, BAMBOO_NUM_OF_CORE);
-	}
-  }
-  return true;
-}
-
-// not reentrant
-bool getwritelock(void * ptr) {
-  int targetcore = 0;
-
-  // for 32 bit machine, the size is always 5 words
-  //int msgsize = 5;
-
-  lockobj = (int)ptr;
-  if(((struct ___Object___ *)ptr)->lock == NULL) {
-	lock2require = lockobj;
-  } else {
-	lock2require = (int)(((struct ___Object___ *)ptr)->lock);
-  }
-  targetcore = (lock2require >> 5) % NUMCORES;
-  lockflag = false;
-#ifndef INTERRUPT
-  reside = false;
-#endif
-  lockresult = 0;
-
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xe551);
-  BAMBOO_DEBUGPRINT_REG(lockobj);
-  BAMBOO_DEBUGPRINT_REG(lock2require);
-  BAMBOO_DEBUGPRINT_REG(targetcore);
-#endif
-
-  if(targetcore == BAMBOO_NUM_OF_CORE) {
-    // reside on this core
-    int deny = 0;
-	BAMBOO_START_CRITICAL_SECTION_LOCK();
-#ifdef DEBUG
-	BAMBOO_DEBUGPRINT(0xf001);
-#endif
-	deny = processlockrequest(1, lock2require, (int)ptr, BAMBOO_NUM_OF_CORE, BAMBOO_NUM_OF_CORE, false);
-	BAMBOO_CLOSE_CRITICAL_SECTION_LOCK();
-#ifdef DEBUG
-	BAMBOO_DEBUGPRINT(0xf000);
-#endif
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xe555);
-    BAMBOO_DEBUGPRINT_REG(lockresult);
-#endif
-    if(deny == -1) {
-		// redirected
-		return true;
-	} else {
-		if(lockobj == (int)ptr) {
-			if(deny) {
-				lockresult = 0;
-			} else {
-				lockresult = 1;
-			}
-			lockflag = true;
-#ifndef INTERRUPT
-			reside = true;
-#endif
-		} else {
-			// conflicts on lockresults
-			BAMBOO_EXIT(0xa01b);
-		}
-	}
-    return true;
-  } else {
-	  // send lock request msg
-	  // for 32 bit machine, the size is always 5 words
-	  send_msg_5(targetcore, LOCKREQUEST, 1, (int)ptr, lock2require, 
-				       BAMBOO_NUM_OF_CORE);
-  }
-  return true;
-}
-
-void releasewritelock(void * ptr) {
-  int targetcore = 0;
-  int reallock = 0;
-  if(((struct ___Object___ *)ptr)->lock == NULL) {
-	reallock = (int)ptr;
-  } else {
-	reallock = (int)(((struct ___Object___ *)ptr)->lock);
-  }
-  targetcore = (reallock >> 5) % NUMCORES;
-
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xe661);
-  BAMBOO_DEBUGPRINT_REG((int)ptr);
-  BAMBOO_DEBUGPRINT_REG(reallock);
-  BAMBOO_DEBUGPRINT_REG(targetcore);
-#endif
-
-  if(targetcore == BAMBOO_NUM_OF_CORE) {
-	BAMBOO_START_CRITICAL_SECTION_LOCK();
-#ifdef DEBUG
-	BAMBOO_DEBUGPRINT(0xf001);
-#endif
-    // reside on this core
-    if(!RuntimeHashcontainskey(locktbl, reallock)) {
-      // no locks for this object, something is wrong
-      BAMBOO_EXIT(0xa01c);
-    } else {
-      int rwlock_obj = 0;
-	  struct LockValue * lockvalue = NULL;
-      RuntimeHashget(locktbl, reallock, &rwlock_obj);
-	  lockvalue = (struct LockValue *)rwlock_obj;
-      lockvalue->value++;
-    }
-	BAMBOO_CLOSE_CRITICAL_SECTION_LOCK();
-#ifdef DEBUG
-	BAMBOO_DEBUGPRINT(0xf000);
-#endif
-    return;
-  } else {
-	// send lock release msg
-	// for 32 bit machine, the size is always 4 words
-	send_msg_4(targetcore, LOCKRELEASE, 1, (int)ptr, reallock);
-  }
-}
-
-bool getwritelock_I(void * ptr) {
-  int targetcore = 0;
-  lockobj = (int)ptr;
-  if(((struct ___Object___ *)ptr)->lock == NULL) {
-	lock2require = lockobj;
-  } else {
-	lock2require = (int)(((struct ___Object___ *)ptr)->lock);
-  }
-  targetcore = (lock2require >> 5) % NUMCORES;
-  lockflag = false;
-#ifndef INTERRUPT
-  reside = false;
-#endif
-  lockresult = 0;
-
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xe561);
-  BAMBOO_DEBUGPRINT_REG(lockobj);
-  BAMBOO_DEBUGPRINT_REG(lock2require);
-  BAMBOO_DEBUGPRINT_REG(targetcore);
-#endif
-
-  if(targetcore == BAMBOO_NUM_OF_CORE) {
-    // reside on this core
-	int deny = processlockrequest(1, (int)lock2require, (int)ptr, BAMBOO_NUM_OF_CORE, BAMBOO_NUM_OF_CORE, false);
-	if(deny == -1) {
-		// redirected
-		return true;
-	} else {
-		if(lockobj == (int)ptr) {
-			if(deny) {
-				lockresult = 0;
-#ifdef DEBUG
-				BAMBOO_DEBUGPRINT(0);
-#endif
-			} else {
-				lockresult = 1;
-#ifdef DEBUG
-				BAMBOO_DEBUGPRINT(1);
-#endif
-			}
-			lockflag = true;
-#ifndef INTERRUPT
-			reside = true;
-#endif
-		} else {
-			// conflicts on lockresults
-			BAMBOO_EXIT(0xa01e);
-		}
-		return true;
-	}
-  } else {
-	  // send lock request msg
-	  // for 32 bit machine, the size is always 5 words
-	  send_msg_5(targetcore, LOCKREQUEST, 1, (int)ptr, lock2require, 
-				       BAMBOO_NUM_OF_CORE);
-  }
-  return true;
-}
-
-// redirected lock request
-bool getwritelock_I_r(void * ptr, void * redirectlock, int core, bool cache) {
-  int targetcore = 0;
-
-  if(core == BAMBOO_NUM_OF_CORE) {
-	  lockobj = (int)ptr;
-	  lock2require = (int)redirectlock;
-	  lockflag = false;
-#ifndef INTERRUPT
-	  reside = false;
-#endif
-	  lockresult = 0;
-  }
-  targetcore = ((int)redirectlock >> 5) % NUMCORES;
-
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xe571);
-  BAMBOO_DEBUGPRINT_REG((int)ptr);
-  BAMBOO_DEBUGPRINT_REG((int)redirectlock);
-  BAMBOO_DEBUGPRINT_REG(core);
-  BAMBOO_DEBUGPRINT_REG((int)cache);
-  BAMBOO_DEBUGPRINT_REG(targetcore);
-#endif
-
-
-  if(targetcore == BAMBOO_NUM_OF_CORE) {
-    // reside on this core
-	int deny = processlockrequest(1, (int)redirectlock, (int)ptr, BAMBOO_NUM_OF_CORE, core, cache);
-	if(deny == -1) {
-		// redirected
-		return true;
-	} else {
-		if(core == BAMBOO_NUM_OF_CORE) {
-			if(lockobj == (int)ptr) {
-				if(deny) {
-					lockresult = 0;
-				} else {
-					lockresult = 1;
-					RuntimeHashadd_I(objRedirectLockTbl, (int)ptr, (int)redirectlock);
-				}
-				lockflag = true;
-#ifndef INTERRUPT
-				reside = true;
-#endif
-			} else {
-				// conflicts on lockresults
-				BAMBOO_EXIT(0xa01f);
-			}
-			return true;
-		} else {
-			// send lock grant/deny request to the root requiring core
-			// check if there is still some msg on sending
-			if((!cache) || (cache && !isMsgSending)) {
-				send_msg_4(core, deny==1?REDIRECTDENY:REDIRECTGROUNT, 1, 
-							     (int)ptr, (int)redirectlock);
-			} else {
-				cache_msg_4(core, deny==1?REDIRECTDENY:REDIRECTGROUNT, 1, 
-						        (int)ptr, (int)redirectlock);
-			}
-		}
-	}
-  } else {
-	// redirect the lock request
-	// for 32 bit machine, the size is always 6 words
-	if((!cache) || (cache && !isMsgSending)) {
-		send_msg_6(targetcore, REDIRECTLOCK, 1, (int)ptr, (int)redirectlock, 
-				       core, BAMBOO_NUM_OF_CORE);
-	} else {
-		cache_msg_6(targetcore, REDIRECTLOCK, 1, (int)ptr, (int)redirectlock, 
-				        core, BAMBOO_NUM_OF_CORE);
-	}
-  }
-  return true;
-}
-
-void releasewritelock_I(void * ptr) {
-  int targetcore = 0;
-  int reallock = 0;
-  if(((struct ___Object___ *)ptr)->lock == NULL) {
-	reallock = (int)ptr;
-  } else {
-	reallock = (int)(((struct ___Object___ *)ptr)->lock);
-  }
-  targetcore = (reallock >> 5) % NUMCORES;
-
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xe681);
-  BAMBOO_DEBUGPRINT_REG((int)ptr);
-  BAMBOO_DEBUGPRINT_REG(reallock);
-  BAMBOO_DEBUGPRINT_REG(targetcore);
-#endif
-
-  if(targetcore == BAMBOO_NUM_OF_CORE) {
-    // reside on this core
-    if(!RuntimeHashcontainskey(locktbl, reallock)) {
-      // no locks for this object, something is wrong
-      BAMBOO_EXIT(0xa020);
-    } else {
-      int rwlock_obj = 0;
-	  struct LockValue * lockvalue = NULL;
-      RuntimeHashget(locktbl, reallock, &rwlock_obj);
-	  lockvalue = (struct LockValue *)rwlock_obj;
-      lockvalue->value++;
-    }
-    return;
-  } else {
-	// send lock release msg
-	// for 32 bit machine, the size is always 4 words
-	send_msg_4(targetcore, LOCKRELEASE, 1, (int)ptr, reallock);
-  }
-}
-
-void releasewritelock_I_r(void * lock, void * redirectlock) {
-  int targetcore = 0;
-  int reallock = (int)lock;
-  targetcore = (reallock >> 5) % NUMCORES;
-
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xe691);
-  BAMBOO_DEBUGPRINT_REG((int)lock);
-  BAMBOO_DEBUGPRINT_REG(reallock);
-  BAMBOO_DEBUGPRINT_REG(targetcore);
-#endif
-
-  if(targetcore == BAMBOO_NUM_OF_CORE) {
-    // reside on this core
-    if(!RuntimeHashcontainskey(locktbl, reallock)) {
-      // no locks for this object, something is wrong
-      BAMBOO_EXIT(0xa021);
-    } else {
-      int rwlock_obj = 0;
-	  struct LockValue * lockvalue = NULL;
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT(0xe692);
-#endif
-      RuntimeHashget(locktbl, reallock, &rwlock_obj);
-	  lockvalue = (struct LockValue *)rwlock_obj;
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT_REG(lockvalue->value);
-#endif
-      lockvalue->value++;
-	  lockvalue->redirectlock = (int)redirectlock;
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT_REG(lockvalue->value);
-#endif
-    }
-    return;
-  } else {
-	// send lock release msg
-	// for 32 bit machine, the size is always 4 words
-	send_msg_4(targetcore, REDIRECTRELEASE, 1, (int)lock, (int)redirectlock);
-  }
-}
-
-/* this function is to process lock requests. 
- * can only be invoked in receiveObject() */
-// if return -1: the lock request is redirected
-//            0: the lock request is approved
-//            1: the lock request is denied
-__attribute__((always_inline)) int processlockrequest(int locktype, int lock, int obj, int requestcore, int rootrequestcore, bool cache) {
-  int deny = 0;
-  if( ((lock >> 5) % NUMCORES) != BAMBOO_NUM_OF_CORE ) {
-	  // the lock should not be on this core
-#ifndef TILERA
-	  BAMBOO_DEBUGPRINT_REG(requestcore);
-	  BAMBOO_DEBUGPRINT_REG(lock);
-	  BAMBOO_DEBUGPRINT_REG(BAMBOO_NUM_OF_CORE);
-#endif
-	  BAMBOO_EXIT(0xa017);
-  }
-  if(!RuntimeHashcontainskey(locktbl, lock)) {
-	  // no locks for this object
-	  // first time to operate on this shared object
-	  // create a lock for it
-	  // the lock is an integer: 0 -- stall, >0 -- read lock, -1 -- write lock
-	  struct LockValue * lockvalue = (struct LockValue *)(RUNMALLOC_I(sizeof(struct LockValue)));
-	  lockvalue->redirectlock = 0;
-#ifdef DEBUG
-#ifndef TILERA
-	  BAMBOO_DEBUGPRINT(0xe110);
-#endif
-#endif
-	  if(locktype == 0) {
-		  lockvalue->value = 1;
-	  } else {
-		  lockvalue->value = -1;
-	  }
-	  RuntimeHashadd_I(locktbl, lock, (int)lockvalue);
-  } else {
-	  int rwlock_obj = 0;
-	  struct LockValue * lockvalue = NULL;
-#ifdef DEBUG
-#ifndef TILERA
-	  BAMBOO_DEBUGPRINT(0xe111);
-#endif
-#endif
-	  RuntimeHashget(locktbl, lock, &rwlock_obj);
-	  lockvalue = (struct LockValue *)(rwlock_obj);
-#ifdef DEBUG
-#ifndef TILERA
-	  BAMBOO_DEBUGPRINT_REG(lockvalue->redirectlock);
-#endif
-#endif
-	  if(lockvalue->redirectlock != 0) {
-		  // this lock is redirected
-#ifdef DEBUG
-#ifndef TILERA
-		  BAMBOO_DEBUGPRINT(0xe112);
-#endif
-#endif
-		  if(locktype == 0) {
-			  getreadlock_I_r((void *)obj, (void *)lockvalue->redirectlock, rootrequestcore, cache);
-		  } else {
-			  getwritelock_I_r((void *)obj, (void *)lockvalue->redirectlock, rootrequestcore, cache);
-		  }
-		  return -1;  // redirected
-	  } else {
-#ifdef DEBUG
-#ifndef TILERA
-		  BAMBOO_DEBUGPRINT_REG(lockvalue->value);
-#endif
-#endif
-		  if(0 == lockvalue->value) {
-			  if(locktype == 0) {
-				  lockvalue->value = 1;
-			  } else {
-				  lockvalue->value = -1;
-			  }
-		  } else if((lockvalue->value > 0) && (locktype == 0)) {
-			  // read lock request and there are only read locks
-			  lockvalue->value++;
-		  } else {
-			  deny = 1;
-		  }
-#ifdef DEBUG
-#ifndef TILERA
-		  BAMBOO_DEBUGPRINT_REG(lockvalue->value);
-#endif
-#endif
-	  }
-  }
-  return deny;
-}
-
-__attribute__((always_inline)) void processlockrelease(int locktype, int lock, int redirectlock, bool isredirect) {
-	if(!RuntimeHashcontainskey(locktbl, lock)) {
-    // no locks for this object, something is wrong
-#ifdef DEBUG
-		BAMBOO_DEBUGPRINT_REG(lock);
-#endif
-		BAMBOO_EXIT(0xa00b);
-	} else {
-		int rwlock_obj = 0;
-		struct LockValue * lockvalue = NULL;
-		RuntimeHashget(locktbl, lock, &rwlock_obj);
-		lockvalue = (struct LockValue*)(rwlock_obj);
-#ifdef DEBUG
-		BAMBOO_DEBUGPRINT(0xe884);
-		BAMBOO_DEBUGPRINT_REG(lockvalue->value);
-#endif
-		if(locktype == 0) {
-			lockvalue->value--;
-		} else {
-			lockvalue->value++;
-		}
-#ifdef DEBUG
-		BAMBOO_DEBUGPRINT_REG(lockvalue->value);
-#endif
-		if(isredirect) {
-			lockvalue->redirectlock = redirectlock;
-		}
-	}
-}
-
-#ifdef PROFILE
-__attribute__((always_inline)) inline void profileTaskStart(char * taskname) {
-  if(!taskInfoOverflow) {
-	  TaskInfo* taskInfo = RUNMALLOC(sizeof(struct task_info));
-	  taskInfoArray[taskInfoIndex] = taskInfo;
-	  taskInfo->taskName = taskname;
-	  taskInfo->startTime = raw_get_cycle();
-	  taskInfo->endTime = -1;
-	  taskInfo->exitIndex = -1;
-	  taskInfo->newObjs = NULL;
-  }
-}
-
-__attribute__((always_inline)) inline void profileTaskEnd() {
-  if(!taskInfoOverflow) {
-	  taskInfoArray[taskInfoIndex]->endTime = raw_get_cycle();
-	  taskInfoIndex++;
-	  if(taskInfoIndex == TASKINFOLENGTH) {
-		  taskInfoOverflow = true;
-	  }
-  }
-}
-
-// output the profiling data
-void outputProfileData() {
-#ifdef USEIO
-  FILE * fp;
-  char fn[50];
-  int self_y, self_x;
-  char c_y, c_x;
-  int i;
-  int totaltasktime = 0;
-  int preprocessingtime = 0;
-  int objqueuecheckingtime = 0;
-  int postprocessingtime = 0;
-  //int interruptiontime = 0;
-  int other = 0;
-  int averagetasktime = 0;
-  int tasknum = 0;
-
-  for(i = 0; i < 50; i++) {
-    fn[i] = 0;
-  }
-
-  calCoords(corenum, &self_y, &self_x);
-  c_y = (char)self_y + '0';
-  c_x = (char)self_x + '0';
-  strcat(fn, "profile_");
-  strcat(fn, &c_x);
-  strcat(fn, "_");
-  strcat(fn, &c_y);
-  strcat(fn, ".rst");
-
-  if((fp = fopen(fn, "w+")) == NULL) {
-    fprintf(stderr, "fopen error\n");
-    return;
-  }
-
-  fprintf(fp, "Task Name, Start Time, End Time, Duration, Exit Index(, NewObj Name, Num)+\n");
-  // output task related info
-  for(i = 0; i < taskInfoIndex; i++) {
-    TaskInfo* tmpTInfo = taskInfoArray[i];
-    int duration = tmpTInfo->endTime - tmpTInfo->startTime;
-    fprintf(fp, "%s, %d, %d, %d, %d", tmpTInfo->taskName, tmpTInfo->startTime, tmpTInfo->endTime, duration, tmpTInfo->exitIndex);
-	// summarize new obj info
-	if(tmpTInfo->newObjs != NULL) {
-		struct RuntimeHash * nobjtbl = allocateRuntimeHash(5);
-		struct RuntimeIterator * iter = NULL;
-		while(0 == isEmpty(tmpTInfo->newObjs)) {
-			char * objtype = (char *)(getItem(tmpTInfo->newObjs));
-			if(RuntimeHashcontainskey(nobjtbl, (int)(objtype))) {
-				int num = 0;
-				RuntimeHashget(nobjtbl, (int)objtype, &num);
-				RuntimeHashremovekey(nobjtbl, (int)objtype);
-				num++;
-				RuntimeHashadd(nobjtbl, (int)objtype, num);
-			} else {
-				RuntimeHashadd(nobjtbl, (int)objtype, 1);
-			}
-			//fprintf(stderr, "new obj!\n");
-		}
-
-		// output all new obj info
-		iter = RuntimeHashcreateiterator(nobjtbl);
-		while(RunhasNext(iter)) {
-			char * objtype = (char *)Runkey(iter);
-			int num = Runnext(iter);
-			fprintf(fp, ", %s, %d", objtype, num);
-		}
-	}
-	fprintf(fp, "\n");
-    if(strcmp(tmpTInfo->taskName, "tpd checking") == 0) {
-      preprocessingtime += duration;
-    } else if(strcmp(tmpTInfo->taskName, "post task execution") == 0) {
-      postprocessingtime += duration;
-    } else if(strcmp(tmpTInfo->taskName, "objqueue checking") == 0) {
-      objqueuecheckingtime += duration;
-    } else {
-      totaltasktime += duration;
-      averagetasktime += duration;
-      tasknum++;
-    }
-  }
-
-  if(taskInfoOverflow) {
-    fprintf(stderr, "Caution: task info overflow!\n");
-  }
-
-  other = totalexetime - totaltasktime - preprocessingtime - postprocessingtime;
-  averagetasktime /= tasknum;
-
-  fprintf(fp, "\nTotal time: %d\n", totalexetime);
-  fprintf(fp, "Total task execution time: %d (%f%%)\n", totaltasktime, ((double)totaltasktime/(double)totalexetime)*100);
-  fprintf(fp, "Total objqueue checking time: %d (%f%%)\n", objqueuecheckingtime, ((double)objqueuecheckingtime/(double)totalexetime)*100);
-  fprintf(fp, "Total pre-processing time: %d (%f%%)\n", preprocessingtime, ((double)preprocessingtime/(double)totalexetime)*100);
-  fprintf(fp, "Total post-processing time: %d (%f%%)\n", postprocessingtime, ((double)postprocessingtime/(double)totalexetime)*100);
-  fprintf(fp, "Other time: %d (%f%%)\n", other, ((double)other/(double)totalexetime)*100);
-
-  fprintf(fp, "\nAverage task execution time: %d\n", averagetasktime);
-
-  fclose(fp);
-#else
-  int i = 0;
-  int j = 0;
-
-  BAMBOO_DEBUGPRINT(0xdddd);
-  // output task related info
-  for(i= 0; i < taskInfoIndex; i++) {
-    TaskInfo* tmpTInfo = taskInfoArray[i];
-    char* tmpName = tmpTInfo->taskName;
-    int nameLen = strlen(tmpName);
-    BAMBOO_DEBUGPRINT(0xddda);
-    for(j = 0; j < nameLen; j++) {
-      BAMBOO_DEBUGPRINT_REG(tmpName[j]);
-    }
-    BAMBOO_DEBUGPRINT(0xdddb);
-    BAMBOO_DEBUGPRINT_REG(tmpTInfo->startTime);
-    BAMBOO_DEBUGPRINT_REG(tmpTInfo->endTime);
-	BAMBOO_DEBUGPRINT_REG(tmpTInfo->exitIndex);
-	if(tmpTInfo->newObjs != NULL) {
-		struct RuntimeHash * nobjtbl = allocateRuntimeHash(5);
-		struct RuntimeIterator * iter = NULL;
-		while(0 == isEmpty(tmpTInfo->newObjs)) {
-			char * objtype = (char *)(getItem(tmpTInfo->newObjs));
-			if(RuntimeHashcontainskey(nobjtbl, (int)(objtype))) {
-				int num = 0;
-				RuntimeHashget(nobjtbl, (int)objtype, &num);
-				RuntimeHashremovekey(nobjtbl, (int)objtype);
-				num++;
-				RuntimeHashadd(nobjtbl, (int)objtype, num);
-			} else {
-				RuntimeHashadd(nobjtbl, (int)objtype, 1);
-			}
-		}
-
-		// ouput all new obj info
-		iter = RuntimeHashcreateiterator(nobjtbl);
-		while(RunhasNext(iter)) {
-			char * objtype = (char *)Runkey(iter);
-			int num = Runnext(iter);
-			int nameLen = strlen(objtype);
-			BAMBOO_DEBUGPRINT(0xddda);
-			for(j = 0; j < nameLen; j++) {
-				BAMBOO_DEBUGPRINT_REG(objtype[j]);
-			}
-			BAMBOO_DEBUGPRINT(0xdddb);
-			BAMBOO_DEBUGPRINT_REG(num);
-		}
-	}
-    BAMBOO_DEBUGPRINT(0xdddc);
-  }
-
-  if(taskInfoOverflow) {
-    BAMBOO_DEBUGPRINT(0xefee);
-  }
-
-  // output interrupt related info
-  /*for(i = 0; i < interruptInfoIndex; i++) {
-       InterruptInfo* tmpIInfo = interruptInfoArray[i];
-       BAMBOO_DEBUGPRINT(0xddde);
-       BAMBOO_DEBUGPRINT_REG(tmpIInfo->startTime);
-       BAMBOO_DEBUGPRINT_REG(tmpIInfo->endTime);
-       BAMBOO_DEBUGPRINT(0xdddf);
-     }
-
-     if(interruptInfoOverflow) {
-       BAMBOO_DEBUGPRINT(0xefef);
-     }*/
-
-  BAMBOO_DEBUGPRINT(0xeeee);
-#endif
-}
-#endif  // #ifdef PROFILE
-
-#endif // #ifdef TASK
diff --git a/Robust/src/Runtime/bamboo/GCSharedHash.c b/Robust/src/Runtime/bamboo/GCSharedHash.c
new file mode 100755
index 00000000..04e92ffb
--- /dev/null
+++ b/Robust/src/Runtime/bamboo/GCSharedHash.c
@@ -0,0 +1,472 @@
+#ifdef MULTICORE_GC
+
+#include "GCSharedHash.h"
+#ifdef MULTICORE
+#include "runtime_arch.h"
+#else
+#include <stdio.h>
+#endif
+
+#ifndef INTPTR
+#ifdef BIT64
+#define INTPTR long
+#define INTPTRSHIFT 3
+#else
+#define INTPTR int
+#define INTPTRSHIFT 2
+#endif
+#endif
+
+#ifndef INLINE
+#define INLINE    inline __attribute__((always_inline))
+#endif // #ifndef INLINE
+
+#define GC_SHIFT_BITS  4
+
+/* GCSHARED HASH ********************************************************/
+
+// Convenience constructor: takes no arguments and creates a table with
+// 100 initial buckets; returns whatever allocateGCSharedHash(100) returns.
+struct GCSharedHash * noargallocateGCSharedHash() {
+  return allocateGCSharedHash(100);
+}
+// Create a table with `size` buckets; exits if size <= 0, NULL if out of memory.
+struct GCSharedHash * allocateGCSharedHash(int size) {
+  struct GCSharedHash *thisvar; 
+  if (size <= 0) {
+#ifdef MULTICORE
+    BAMBOO_EXIT(0xf201);
+#else
+    printf("Negative Hashtable size Exception\n");
+    exit(-1);
+#endif
+  } 
+  thisvar=(struct GCSharedHash *)FREEMALLOC_NGC(sizeof(struct GCSharedHash));
+  if(thisvar == NULL) {
+	return NULL;
+  }
+  thisvar->size = size;
+  thisvar->bucket = 
+	(struct GCSharedNode **)FREEMALLOC_NGC(sizeof(struct GCSharedNode *)*size);
+  if(thisvar->bucket == NULL) {
+	FREE_NGC(thisvar);
+	return NULL;
+  }
+  /* No nodes yet: the list that links all nodes starts out empty */
+  thisvar->listhead=NULL;
+  thisvar->listtail=NULL;
+  /* No elements stored yet */
+  thisvar->numelements = 0;
+  return thisvar;
+}
+
+void freeGCSharedHash(struct GCSharedHash *thisvar) {
+  struct GCSharedNode *ptr=thisvar->listhead;
+  FREE_NGC(thisvar->bucket);
+  while(ptr) {
+    struct GCSharedNode *next=ptr->lnext;
+    FREE_NGC(ptr);
+    ptr=next;
+  }
+  FREE_NGC(thisvar);
+}
+
+bool GCSharedHashrehash(struct GCSharedHash * thisvar) {
+  int newsize=thisvar->size;
+  struct GCSharedNode ** newbucket = (struct GCSharedNode **)
+	FREEMALLOC_NGC(sizeof(struct GCSharedNode *)*newsize);
+  if(newbucket == NULL) {
+	return false;
+  }
+  int i;
+  for(i=thisvar->size-1; i>=0; i--) {
+    struct GCSharedNode *ptr;
+    for(ptr=thisvar->bucket[i]; ptr!=NULL;) {
+      struct GCSharedNode * nextptr=ptr->next;
+      unsigned int newhashkey=(unsigned int)ptr->key % newsize;
+      ptr->next=newbucket[newhashkey];
+      newbucket[newhashkey]=ptr;
+      ptr=nextptr;
+    }
+  }
+  thisvar->size=newsize;
+  FREE_NGC(thisvar->bucket);
+  thisvar->bucket=newbucket;
+  return true;
+}
+
+int GCSharedHashadd(struct GCSharedHash * thisvar,int key, int data) {
+  /* Rehash code */
+  unsigned int hashkey;
+  struct GCSharedNode **ptr;
+
+  if (thisvar->numelements>=thisvar->size) {
+    int newsize=2*thisvar->size+1;
+    struct GCSharedNode ** newbucket = 
+	  (struct GCSharedNode **)FREEMALLOC_NGC(
+		  sizeof(struct GCSharedNode *)*newsize);
+	if(newbucket == NULL) {
+	  return -1;
+	}
+    int i;
+    for(i=thisvar->size-1; i>=0; i--) {
+      struct GCSharedNode *ptr;
+      for(ptr=thisvar->bucket[i]; ptr!=NULL;) {
+	struct GCSharedNode * nextptr=ptr->next;
+	unsigned int newhashkey=(unsigned int)ptr->key % newsize;
+	ptr->next=newbucket[newhashkey];
+	newbucket[newhashkey]=ptr;
+	ptr=nextptr;
+      }
+    }
+    thisvar->size=newsize;
+    FREE_NGC(thisvar->bucket);
+    thisvar->bucket=newbucket;
+  }
+
+  hashkey = (unsigned int)key % thisvar->size;
+  ptr = &thisvar->bucket[hashkey];
+
+  /* check that thisvar key/object pair isn't already here */
+  /* TBD can be optimized for set v. relation */
+
+  while (*ptr) {
+    if ((*ptr)->key == key && (*ptr)->data == data) {
+      return 0;
+    }
+    ptr = &((*ptr)->next);
+  }
+
+  {
+    struct GCSharedNode *node=FREEMALLOC_NGC(sizeof(struct GCSharedNode));
+	if(node == NULL) {
+	  return -1;
+	}
+    node->data=data;
+    node->key=key;
+    node->next=(*ptr);
+    *ptr=node;
+    if (thisvar->listhead==NULL) {
+      thisvar->listhead=node;
+      thisvar->listtail=node;
+      node->lnext=NULL;
+      node->lprev=NULL;
+    } else {
+      node->lprev=NULL;
+      node->lnext=thisvar->listhead;
+      thisvar->listhead->lprev=node;
+      thisvar->listhead=node;
+    }
+  }
+
+  thisvar->numelements++;
+  return 1;
+}
+
+#ifdef MULTICORE 
+struct GCSharedHash * allocateGCSharedHash_I(int size) {
+  struct GCSharedHash *thisvar;
+  if (size <= 0) {
+#ifdef MULTICORE
+    BAMBOO_EXIT(0xf202);
+#else
+    printf("Negative Hashtable size Exception\n");
+    exit(-1);
+#endif
+  }
+  thisvar=(struct GCSharedHash *)FREEMALLOC_NGC_I(sizeof(struct GCSharedHash));
+  if(thisvar == NULL) {
+	return NULL;
+  }
+  thisvar->size = size;
+  thisvar->bucket = 
+	(struct GCSharedNode **)FREEMALLOC_NGC_I(
+		sizeof(struct GCSharedNode *)*size);
+  if(thisvar->bucket == NULL) {
+	FREE_NGC_I(thisvar);
+	return NULL;
+  }
+  /* Set allocation blocks*/
+  thisvar->listhead=NULL;
+  thisvar->listtail=NULL;
+  /*Set data counts*/
+  thisvar->numelements = 0;
+  return thisvar;
+}
+
+int GCSharedHashadd_I(struct GCSharedHash * thisvar,int key, int data) {
+  /* Rehash code */
+  unsigned int hashkey;
+  struct GCSharedNode **ptr;
+
+  if (thisvar->numelements>=thisvar->size) {
+    int newsize=2*thisvar->size+1;
+    struct GCSharedNode ** newbucket = 
+	  (struct GCSharedNode **)FREEMALLOC_NGC_I(
+		  sizeof(struct GCSharedNode *)*newsize);
+	if(newbucket == NULL) {
+	  return -1;
+	}
+    int i;
+    for(i=thisvar->size-1; i>=0; i--) {
+      struct GCSharedNode *ptr;
+      for(ptr=thisvar->bucket[i]; ptr!=NULL;) {
+	struct GCSharedNode * nextptr=ptr->next;
+	unsigned int newhashkey=(unsigned int)ptr->key % newsize;
+	ptr->next=newbucket[newhashkey];
+	newbucket[newhashkey]=ptr;
+	ptr=nextptr;
+      }
+    }
+    thisvar->size=newsize;
+    FREE_NGC_I(thisvar->bucket);
+    thisvar->bucket=newbucket;
+  }
+
+  hashkey = (unsigned int)key % thisvar->size;
+  ptr = &thisvar->bucket[hashkey];
+
+  /* check that thisvar key/object pair isn't already here */
+  /* TBD can be optimized for set v. relation */
+
+  while (*ptr) {
+    if ((*ptr)->key == key && (*ptr)->data == data) {
+      return 0;
+    }
+    ptr = &((*ptr)->next);
+  }
+
+  {
+    struct GCSharedNode *node=FREEMALLOC_NGC_I(sizeof(struct GCSharedNode));
+	if(node == NULL) {
+	  return -1;
+	}
+    node->data=data;
+    node->key=key;
+    node->next=(*ptr);
+    *ptr=node;
+    if (thisvar->listhead==NULL) {
+      thisvar->listhead=node;
+      thisvar->listtail=node;
+      node->lnext=NULL;
+      node->lprev=NULL;
+    } else {
+      node->lprev=NULL;
+      node->lnext=thisvar->listhead;
+      thisvar->listhead->lprev=node;
+      thisvar->listhead=node;
+    }
+  }
+
+  thisvar->numelements++;
+  return 1;
+}
+#endif
+
+int GCSharedHashget(struct GCSharedHash *thisvar, int key, int *data) {
+  unsigned int hashkey = (unsigned int)key % thisvar->size;
+
+  struct GCSharedNode *ptr = thisvar->bucket[hashkey];
+  while (ptr) {
+    if (ptr->key == key) {
+      *data = ptr->data;
+      return 1;       /* success */
+    }
+    ptr = ptr->next;
+  }
+
+  return 0;   /* failure */
+}
+
+/* MGCSHAREDHASH ********************************************************/
+
+mgcsharedhashtbl_t * mgcsharedhashCreate(unsigned int size, 
+                                         double loadfactor) {
+  mgcsharedhashtbl_t * ctable;
+  mgcsharedhashlistnode_t * nodes;
+  int i;
+
+  ctable = (mgcsharedhashtbl_t *)FREEMALLOC_NGC(sizeof(mgcsharedhashtbl_t));
+  if(ctable == NULL) {
+	// TODO
+	BAMBOO_EXIT(0xf203);
+	return NULL;
+  }
+  // Allocate space for the hash table
+  ctable->table = (mgcsharedhashlistnode_t *)FREEMALLOC_NGC(
+	  size*sizeof(mgcsharedhashlistnode_t));
+  if(ctable->table == NULL) {
+	BAMBOO_EXIT(0xf204); // TODO
+	return NULL;
+  }
+  ctable->size = size;
+  ctable->loadfactor = loadfactor;
+  ctable->threshold = size*loadfactor;
+
+  ctable->mask = (size << (GC_SHIFT_BITS))-1;
+
+  ctable->structs = NULL ; //FREEMALLOC_NGC(1*sizeof(mgcliststruct_t));
+  ctable->numelements = 0; // Initial number of elements in the hash
+  ctable->list = NULL;
+
+  return ctable;
+}
+
+mgcsharedhashtbl_t * mgcsharedhashCreate_I(unsigned int size, 
+                                           double loadfactor) {
+  mgcsharedhashtbl_t * ctable;
+  mgcsharedhashlistnode_t * nodes;
+  int i;
+
+  ctable = (mgcsharedhashtbl_t *)FREEMALLOC_NGC_I(sizeof(mgcsharedhashtbl_t));
+  if(ctable == NULL) {
+	// TODO
+	BAMBOO_EXIT(0xf205);
+	return NULL;
+  }
+  // Allocate space for the hash table
+  ctable->table = (mgcsharedhashlistnode_t *)FREEMALLOC_NGC_I(
+	  size*sizeof(mgcsharedhashlistnode_t));
+  if(ctable->table == NULL) {
+	BAMBOO_EXIT(0xf206); // TODO
+	return NULL;
+  }
+  ctable->size = size;
+  ctable->loadfactor = loadfactor;
+  ctable->threshold = size*loadfactor;
+
+  ctable->mask = (size << (GC_SHIFT_BITS))-1;
+
+  ctable->structs = NULL ; //FREEMALLOC_NGC(1*sizeof(mgcliststruct_t));
+  ctable->numelements = 0; // Initial number of elements in the hash
+  ctable->list = NULL;
+
+  return ctable;
+}
+// Empty the table in place for reuse; no memory is released.
+void mgcsharedhashReset(mgcsharedhashtbl_t * tbl) {
+  mgcsharedhashlistnode_t * ptr = tbl->table;
+  // Sparse (< size/64 entries): clear listed slots only; else wipe all slots.
+  if ((tbl->numelements) < (tbl->size>>6)) {
+    mgcsharedhashlistnode_t *top = &ptr[tbl->size];
+    mgcsharedhashlistnode_t * list = tbl->list;
+    while(list != NULL) {
+      mgcsharedhashlistnode_t * next = list->next;
+      if ((list >= ptr) && (list < top)) {
+        list->key=NULL;
+        list->next=NULL;
+      }
+      list = next;
+    }
+  } else {
+    BAMBOO_MEMSET_WH(tbl->table, '\0',
+        sizeof(mgcsharedhashlistnode_t)*tbl->size);
+  }
+  // Drop the stale list head: its slots were just cleared (avoids list cycles).
+  tbl->list = NULL;
+  mgcsharedliststruct_t * structs = tbl->structs;
+  while(structs != NULL) {
+    mgcsharedliststruct_t * next = structs->next;
+    BAMBOO_MEMSET_WH(structs->array, '\0',
+        structs->num * sizeof(mgcsharedhashlistnode_t));
+    structs->num = 0;
+    structs = next;
+  }
+  tbl->numelements = 0;
+}
+
+//Store objects and their pointers into hash
+//Using open addressing
+int mgcsharedhashInsert(mgcsharedhashtbl_t * tbl, void * key, void * val) {
+  mgcsharedhashlistnode_t * ptr;
+
+  if(tbl->numelements > (tbl->threshold)) {
+    //Never resize, simply don't insert any more
+    return -1;
+  }
+
+  //int keyto = ((unsigned INTPTR)key) % (tbl->size);
+  //ptr=&tbl->table[keyto];
+  ptr=&tbl->table[(((unsigned INTPTR)key)&tbl->mask)>>(GC_SHIFT_BITS)];
+
+  if(ptr->key==0) {
+    // the first time insert a value for the key
+    ptr->key=key;
+    ptr->val=val;
+  } else { // Insert to the next empty place
+	mgcsharedhashlistnode_t *top = &tbl->table[tbl->size];
+    do {
+	  ptr++;
+	} while((ptr < top) && (ptr->key != NULL));
+	if(ptr >= top) {
+	  return -1;
+	} else {
+	  ptr->key = key;
+	  ptr->val = val;
+	}
+  }
+  ptr->next = tbl->list;
+  tbl->list = ptr;
+  tbl->numelements++;
+  return 1;
+}
+// _I variant of mgcsharedhashInsert: store (key,val) by open addressing.
+// Returns 1 on success, -1 if the table is past its threshold or no free
+// slot exists between the home slot and the end of the table.
+int mgcsharedhashInsert_I(mgcsharedhashtbl_t * tbl, void * key, void * val) {
+  mgcsharedhashlistnode_t * ptr;
+
+  if(tbl->numelements > (tbl->threshold)) {
+    //Never resize, simply don't insert any more
+    return -1;
+  }
+
+  // home slot: key bits above GC_SHIFT_BITS, limited by the table mask
+  ptr=&tbl->table[(((unsigned INTPTR)key)&tbl->mask)>>(GC_SHIFT_BITS)];
+
+  if(ptr->key==0) {
+    // home slot is free
+    ptr->key=key;
+    ptr->val=val;
+  } else { // probe forward (no wrap-around) for the next empty slot
+    mgcsharedhashlistnode_t * top = &tbl->table[tbl->size];
+    // Test the bound BEFORE reading ptr->key: ptr may equal top, which is
+    // one past the last slot and must not be dereferenced.
+    do {
+      ptr++;
+    } while((ptr < top) && (ptr->key != NULL));
+    if(ptr >= top) {
+      return -1;
+    } else {
+      ptr->key = key;
+      ptr->val = val;
+    }
+  }
+  // thread the slot onto the list of occupied slots walked by the reset code
+  ptr->next = tbl->list;
+  tbl->list = ptr;
+  tbl->numelements++;
+  return 1;
+}
+
+// Search for an address for a given oid
+INLINE void * mgcsharedhashSearch(mgcsharedhashtbl_t * tbl, void * key) {
+  //REMOVE HASH FUNCTION CALL TO MAKE SURE IT IS INLINED HERE]
+  //int keyto = ((unsigned INTPTR)key) % (tbl->size);
+  //mgcsharedhashlistnode_t * node=&tbl->table[keyto];
+  mgcsharedhashlistnode_t * node = 
+	&tbl->table[(((unsigned INTPTR)key)&tbl->mask)>>(GC_SHIFT_BITS)];
+  mgcsharedhashlistnode_t *top = &tbl->table[tbl->size];
+
+  do {
+	//i++;
+    if(node->key == key) {
+      return node->val;
+    }
+    node++;
+  } while(node < top);
+
+  return NULL;
+}
+
+#endif
diff --git a/Robust/src/Runtime/bamboo/GCSharedHash.h b/Robust/src/Runtime/bamboo/GCSharedHash.h
new file mode 100755
index 00000000..94725fbf
--- /dev/null
+++ b/Robust/src/Runtime/bamboo/GCSharedHash.h
@@ -0,0 +1,94 @@
+#ifdef MULTICORE_GC
+
+#ifndef GCSHAREDHASH_H
+#define GCSHAREDHASH_H
+
+#ifndef bool
+#define bool int
+#endif
+
+#ifndef true
+#define true 1
+#endif
+
+#ifndef false
+#define false 0
+#endif
+
+#include "mem.h"
+
+/* GCSharedHash *********************************************************/
+
+struct GCSharedHash * noargallocateGCSharedHash();
+struct GCSharedHash * allocateGCSharedHash(int size);
+void freeGCSharedHash(struct GCSharedHash *);
+
+bool GCSharedHashrehash(struct GCSharedHash * thisvar);
+int GCSharedHashadd(struct GCSharedHash *, int key, int data);
+#ifdef MULTICORE
+struct GCSharedHash * allocateGCSharedHash_I(int size);
+int GCSharedHashadd_I(struct GCSharedHash *, int key, int data);
+#endif
+int GCSharedHashget(struct GCSharedHash *,int key, int* data);
+
+struct GCSharedHash {
+  int numelements;
+  int size;
+  struct GCSharedNode **bucket;
+  struct GCSharedNode *listhead;
+  struct GCSharedNode *listtail;
+};
+
+inline int GCSharedHashcountset(struct GCSharedHash * thisvar);
+
+/* RuntimeHashException  *************************************************/
+
+
+/* GCSharedNode: one entry (bucket chain link plus lnext/lprev list links) */
+struct GCSharedNode {
+  struct GCSharedNode *next;
+  struct GCSharedNode *lnext;
+  struct GCSharedNode *lprev;
+  int data;
+  int key;
+};
+
+/* MGCSharedHash *********************************************************/
+typedef struct mgcsharedhashlistnode {
+  void * key;
+  void * val; //this can be cast to another type or used to point to a
+              //larger structure
+  struct mgcsharedhashlistnode * next;
+} mgcsharedhashlistnode_t;
+
+#define NUMMGCSHAREDLIST 250
+typedef struct mgcsharedlist {
+  struct mgcsharedhashlistnode array[NUMMGCSHAREDLIST];
+  int num;
+  struct mgcsharedlist *next;
+} mgcsharedliststruct_t;
+
+typedef struct mgcsharedhashtbl {
+  mgcsharedhashlistnode_t * table;       // points to beginning of hash table
+  mgcsharedhashlistnode_t * list;
+  mgcsharedliststruct_t * structs;
+  unsigned int size;
+  unsigned int mask;
+  unsigned int numelements;
+  unsigned int threshold;
+  double loadfactor;
+} mgcsharedhashtbl_t;
+
+mgcsharedhashtbl_t * mgcsharedhashCreate(unsigned int size, double loadfactor);
+mgcsharedhashtbl_t * mgcsharedhashCreate_I(unsigned int size,double loadfactor);
+int mgcsharedhashInsert(mgcsharedhashtbl_t * tbl, void * key, void * val);
+void * mgcsharedhashSearch(mgcsharedhashtbl_t * tbl, void * key);
+//unsigned int mgchashResize(unsigned int newsize);
+int mgcsharedhashInsert_I(mgcsharedhashtbl_t * tbl, void * key, void * val);
+//unsigned int mgchashResize_I(unsigned int newsize);
+//void mgcsharedhashDelete(mgcsharedhashtbl_t * tbl);
+void mgcsharedhashReset(mgcsharedhashtbl_t * tbl);
+
+#endif
+
+#endif
diff --git a/Robust/src/Runtime/bamboo/MGCHash.c b/Robust/src/Runtime/bamboo/MGCHash.c
new file mode 100644
index 00000000..dabe7e29
--- /dev/null
+++ b/Robust/src/Runtime/bamboo/MGCHash.c
@@ -0,0 +1,533 @@
+#include "MGCHash.h"
+#ifdef MULTICORE
+#include "runtime_arch.h"
+#else
+#include <stdio.h>
+#endif
+#ifdef DMALLOC
+#include "dmalloc.h"
+#endif
+
+#ifndef INTPTR
+#ifdef BIT64
+#define INTPTR long
+#define INTPTRSHIFT 3
+#else
+#define INTPTR int
+#define INTPTRSHIFT 2
+#endif
+#endif
+
+#define GC_SHIFT_BITS 4
+
+/* mgchash ********************************************************/
+mgchashtable_t * mgchashCreate(unsigned int size, double loadfactor) {
+  mgchashtable_t *ctable;
+  mgchashlistnode_t *nodes;
+  int i;
+
+  if (size <= 0) {
+#ifdef MULTICORE
+    BAMBOO_EXIT(0xf101);
+#else
+    printf("Negative Hashtable size Exception\n");
+    exit(-1);
+#endif
+  }
+
+  // Allocate space for the hash table
+  ctable = (mgchashtable_t *)RUNMALLOC(sizeof(mgchashtable_t));
+  if(ctable == NULL) {
+	// Run out of local memory
+	BAMBOO_EXIT(0xf102);
+  }
+  ctable->table = (mgchashlistnode_t*)RUNMALLOC(size*sizeof(mgchashlistnode_t));
+  if(ctable->table == NULL) {
+	// Run out of local memory
+	BAMBOO_EXIT(0xf103);
+  }
+  ctable->loadfactor = loadfactor;
+  ctable->size = size;
+  ctable->threshold=size*loadfactor;
+
+  ctable->mask = (size << (GC_SHIFT_BITS))-1;
+  //ctable->list = NULL;
+  ctable->structs = (mgcliststruct_t*)RUNMALLOC(1*sizeof(mgcliststruct_t));
+  ctable->numelements = 0; // Initial number of elements in the hash
+
+  return ctable;
+}
+
+void mgchashreset(mgchashtable_t * tbl) {
+  mgchashlistnode_t *ptr = tbl->table;
+  int i;
+
+  /*if (tbl->numelements<(tbl->size>>6)) {
+	mgchashlistnode_t *top=&ptr[tbl->size];
+	mgchashlistnode_t * list = tbl->list;
+	while(list != NULL) {
+      mgchashlistnode_t * next = list->lnext;
+      if ((list >= ptr) && (list < top)) {
+		//zero in list
+        list->key=NULL;
+        list->next=NULL;
+      }
+      list = next;
+	}
+  } else {*/
+	BAMBOO_MEMSET_WH(tbl->table, '\0', sizeof(mgchashlistnode_t)*tbl->size);
+  //}
+  // TODO now never release any allocated memory, may need to be changed
+  //mgcliststruct_t * next = tbl->structs;
+  while(tbl->structs->next!=NULL) {
+    mgcliststruct_t * next = tbl->structs->next;
+    RUNFREE(tbl->structs);
+    tbl->structs=next;
+	/*next->num = 0;
+	next = next->next;*/
+  }
+  tbl->structs->num = 0;
+  tbl->numelements = 0;
+}
+
+// Store key -> val. Existing keys are not looked up, so inserting a key
+// twice stores two entries. The table is grown first once numelements
+// exceeds the threshold.
+void mgchashInsert(mgchashtable_t * tbl, void * key, void *val) {
+  mgchashlistnode_t *ptr;
+
+  if(tbl->numelements > (tbl->threshold)) {
+    // Grow x4. This used to read `size << 1 + 1`, which C parses as
+    // size << (1 + 1); the shift is now spelled out, the value is unchanged.
+    unsigned int newsize = tbl->size << 2;
+    mgchashResize(tbl, newsize);
+  }
+
+  ptr=&tbl->table[(((unsigned INTPTR)key)&tbl->mask)>>(GC_SHIFT_BITS)];
+  tbl->numelements++;
+
+  if(ptr->key==0) {
+    // bucket head is free: store the pair in the table slot itself
+    ptr->key=key;
+    ptr->val=val;
+  } else { // collision: chain a node right behind the bucket head
+    mgchashlistnode_t * node;
+    // take the next unused node from the current node block
+    if (tbl->structs->num<NUMMGCLIST) {
+      node=&tbl->structs->array[tbl->structs->num];
+      tbl->structs->num++;
+    } else {
+      // current node block is full: start a new one
+      mgcliststruct_t *tcl=RUNMALLOC(1*sizeof(mgcliststruct_t));
+      tcl->next=tbl->structs;
+      tbl->structs=tcl;
+      node=&tcl->array[0];
+      tcl->num=1;
+    }
+    node->key = key;
+    node->val = val;
+    node->next = ptr->next;
+    ptr->next = node;
+  }
+}
+
+#ifdef MULTICORE_GC
+mgchashtable_t * mgchashCreate_I(unsigned int size, double loadfactor) {
+  mgchashtable_t *ctable;
+  mgchashlistnode_t *nodes;
+  int i;
+
+  if (size <= 0) {
+#ifdef MULTICORE
+    BAMBOO_EXIT(0xf101);
+#else
+    printf("Negative Hashtable size Exception\n");
+    exit(-1);
+#endif
+  }
+
+  // Allocate space for the hash table
+  ctable = (mgchashtable_t*)RUNMALLOC_I(sizeof(mgchashtable_t));
+  if(ctable == NULL) {
+	// Run out of local memory
+	BAMBOO_EXIT(0xf102);
+  }
+  ctable->table=(mgchashlistnode_t*)RUNMALLOC_I(size*sizeof(mgchashlistnode_t));
+  if(ctable->table == NULL) {
+	// Run out of local memory
+	BAMBOO_EXIT(0xf103);
+  }
+  ctable->loadfactor = loadfactor;
+  ctable->size = size;
+  ctable->threshold=size*loadfactor;
+
+  ctable->mask = (size << (GC_SHIFT_BITS))-1;
+  //ctable->list = NULL;
+  ctable->structs = (mgcliststruct_t*)RUNMALLOC_I(1*sizeof(mgcliststruct_t));
+  ctable->numelements = 0; // Initial number of elements in the hash
+
+  return ctable;
+}
+
+// _I variant of mgchashInsert (allocates with RUNMALLOC_I, grows with
+// mgchashResize_I). Existing keys are not looked up before storing.
+void mgchashInsert_I(mgchashtable_t * tbl, void * key, void *val) {
+  mgchashlistnode_t *ptr;
+
+  if(tbl->numelements > (tbl->threshold)) {
+    // Grow x4. This used to read `size << 1 + 1`, which C parses as
+    // size << (1 + 1); the shift is now spelled out, the value is unchanged.
+    unsigned int newsize = tbl->size << 2;
+    mgchashResize_I(tbl, newsize);
+  }
+
+  ptr = &tbl->table[(((unsigned INTPTR)key)&tbl->mask)>>(GC_SHIFT_BITS)];
+  tbl->numelements++;
+
+  if(ptr->key==0) {
+    // bucket head is free: store the pair in the table slot itself
+    ptr->key=key;
+    ptr->val=val;
+  } else { // collision: chain a node right behind the bucket head
+    mgchashlistnode_t * node;
+    // take the next unused node from the current node block
+    if (tbl->structs->num<NUMMGCLIST) {
+      node=&tbl->structs->array[tbl->structs->num];
+      tbl->structs->num++;
+    } else {
+      // current node block is full: start a new one
+      mgcliststruct_t *tcl=RUNMALLOC_I(1*sizeof(mgcliststruct_t));
+      tcl->next=tbl->structs;
+      tbl->structs=tcl;
+      node=&tcl->array[0];
+      tcl->num=1;
+    }
+    node->key = key;
+    node->val = val;
+    node->next = ptr->next;
+    ptr->next = node;
+  }
+}
+#endif
+
+// Search for an address for a given oid
+INLINE void * mgchashSearch(mgchashtable_t * tbl, void * key) {
+  //REMOVE HASH FUNCTION CALL TO MAKE SURE IT IS INLINED HERE]
+  mgchashlistnode_t *node = 
+	&tbl->table[(((unsigned INTPTR)key)&tbl->mask)>>(GC_SHIFT_BITS)];
+
+  do {
+    if(node->key == key) {
+      return node->val;
+    }
+    node = node->next;
+  } while(node != NULL);
+
+  return NULL;
+}
+
+unsigned int mgchashResize(mgchashtable_t * tbl, unsigned int newsize) {
+  mgchashlistnode_t *node, *ptr, *curr;  // curr and next keep track of the 
+                                         // current and the next 
+										 // mgchashlistnodes in a linked list
+  unsigned int oldsize;
+  int isfirst;    // Keeps track of the first element in the 
+                  // chashlistnode_t for each bin in hashtable
+  unsigned int i,index;
+  unsigned int mask;
+
+  ptr = tbl->table;
+  oldsize = tbl->size;
+
+  if((node = RUNMALLOC(newsize*sizeof(mgchashlistnode_t))) == NULL) {
+    printf("Calloc error %s %d\n", __FILE__, __LINE__);
+    return 1;
+  }
+
+  tbl->table = node; //Update the global hashtable upon resize()
+  tbl->size = newsize;
+  tbl->threshold = newsize * tbl->loadfactor;
+  mask = tbl->mask = (newsize << (GC_SHIFT_BITS)) - 1;
+  //tbl->list = NULL;
+
+  for(i = 0; i < oldsize; i++) {   //Outer loop for each bin in hash table
+    curr = &ptr[i];
+    isfirst = 1;
+    do {  //Inner loop to go through linked lists
+      void * key;
+      mgchashlistnode_t *tmp,*next;
+
+      if ((key=curr->key) == 0) { 
+		//Exit inner loop if there the first element is 0
+		break;
+		//key = val =0 for element if not present within the hash table
+	  }
+      index = (((unsigned INTPTR)key) & mask) >> (GC_SHIFT_BITS);
+      tmp=&node[index];
+      next = curr->next;
+      // Insert into the new table
+      if(tmp->key == 0) {
+		tmp->key = key;
+		tmp->val = curr->val;
+		/*tmp->lnext = tbl->list;
+		tbl->list = tmp;*/
+      } /*
+	   NOTE:  Add this case if you change this...
+	   This case currently never happens because of the way things rehash....*/
+	   else if (isfirst) {
+		 mgchashlistnode_t *newnode= RUNMALLOC(1*sizeof(mgchashlistnode_t));
+		 newnode->key = curr->key;
+		 newnode->val = curr->val;
+		 newnode->next = tmp->next;
+		 tmp->next=newnode;
+		 /*newnode->lnext = tbl->list;
+		 tbl->list = newnode;*/
+	   } 
+      else {
+		curr->next=tmp->next;
+		tmp->next=curr;
+		/*curr->lnext = tbl->list;
+		tbl->list = curr;*/
+      }
+
+      isfirst = 0;
+      curr = next;
+    } while(curr!=NULL);
+  }
+
+  RUNFREE(ptr);            //Free the memory of the old hash table
+  return 0;
+}
+
+#ifdef MULTICORE_GC
+unsigned int mgchashResize_I(mgchashtable_t * tbl, unsigned int newsize) {
+  mgchashlistnode_t *node, *ptr, *curr; // curr and next keep track of the 
+                                        // current and the next 
+										// mgchashlistnodes in a linked list
+  unsigned int oldsize;
+  int isfirst; // Keeps track of the first element in the chashlistnode_t 
+               // for each bin in hashtable
+  unsigned int i,index;
+  unsigned int mask;
+
+  ptr = tbl->table;
+  oldsize = tbl->size;
+
+  if((node = RUNMALLOC_I(newsize*sizeof(mgchashlistnode_t))) == NULL) {
+    BAMBOO_EXIT(0xf104);
+    printf("Calloc error %s %d\n", __FILE__, __LINE__);
+    return 1;
+  }
+
+  tbl->table = node;  //Update the global hashtable upon resize()
+  tbl->size = newsize;
+  tbl->threshold = newsize * tbl->loadfactor;
+  mask = tbl->mask = (newsize << (GC_SHIFT_BITS))-1;
+  //tbl->list = NULL;
+
+  for(i = 0; i < oldsize; i++) {  //Outer loop for each bin in hash table
+    curr = &ptr[i];
+    isfirst = 1;
+    do { //Inner loop to go through linked lists
+      void * key;
+      mgchashlistnode_t *tmp,*next;
+
+      if ((key=curr->key) == 0) {
+		//Exit inner loop if there the first element is 0
+		break;
+		//key = val =0 for element if not present within the hash table
+      }
+      index = (((unsigned INTPTR)key) & mask) >> (GC_SHIFT_BITS);
+      tmp=&node[index];
+      next = curr->next;
+      // Insert into the new table
+      if(tmp->key == 0) {
+		tmp->key = key;
+		tmp->val = curr->val;
+		/*tmp->lnext = tbl->list;
+		tbl->list = tmp;*/
+      } /*
+	   NOTE:  Add this case if you change this...
+	   This case currently never happens because of the way things rehash....*/
+      else if (isfirst) {
+		mgchashlistnode_t *newnode=RUNMALLOC_I(1*sizeof(mgchashlistnode_t)); 
+		newnode->key = curr->key;
+		newnode->val = curr->val;
+		newnode->next = tmp->next;
+		tmp->next=newnode;
+		/*newnode->lnext = tbl->list;
+		tbl->list = newnode;*/
+      } else {
+		curr->next=tmp->next;
+		tmp->next=curr;
+		/*curr->lnext = tbl->list;
+		tbl->list = curr;*/
+      }
+
+      isfirst = 0;
+      curr = next;
+    } while(curr!=NULL);
+  }
+  RUNFREE(ptr); //Free the memory of the old hash table
+  return 0;
+}
+#endif
+
+//Delete the entire hash table
+void mgchashDelete(mgchashtable_t * tbl) {
+  int i;
+  mgcliststruct_t *ptr=tbl->structs;
+  while(ptr!=NULL) {
+    mgcliststruct_t *next=ptr->next;
+    RUNFREE(ptr);
+    ptr=next;
+  }
+  RUNFREE(tbl->table);
+  tbl->table=NULL;
+  tbl->structs=NULL;
+}
+
+/* MGCHASH ********************************************************/
+
+struct MGCHash * allocateMGCHash(int size,
+                                 int conflicts) {
+  struct MGCHash *thisvar;
+  if (size <= 0) {
+#ifdef MULTICORE
+    BAMBOO_EXIT(0xf105);
+#else
+    printf("Negative Hashtable size Exception\n");
+    exit(-1);
+#endif
+  }
+  thisvar=(struct MGCHash *)RUNMALLOC(sizeof(struct MGCHash));
+  thisvar->size = size;
+  thisvar->bucket =
+    (struct MGCNode *) RUNMALLOC(sizeof(struct MGCNode)*size);
+  //Set data counts
+  thisvar->num4conflicts = conflicts;
+  return thisvar;
+}
+
+void freeMGCHash(struct MGCHash *thisvar) {
+  int i = 0;
+  for(i=thisvar->size-1; i>=0; i--) {
+    struct MGCNode *ptr;
+    for(ptr=thisvar->bucket[i].next; ptr!=NULL; ) {
+      struct MGCNode * nextptr=ptr->next;
+      RUNFREE(ptr);
+      ptr=nextptr;
+    }
+  }
+  RUNFREE(thisvar->bucket);
+  RUNFREE(thisvar);
+}
+// Add data to its bucket. A bucket holds at most num4conflicts nodes; once
+// full, its last node is reused. Returns 1, or 0 if nothing could be stored.
+int MGCHashadd(struct MGCHash * thisvar, int data) {
+  struct MGCNode *prev = NULL;
+  int mask = (thisvar->size << (GC_SHIFT_BITS))-1;
+  unsigned int hashkey = (((unsigned INTPTR)data)&mask)>>(GC_SHIFT_BITS);
+  struct MGCNode *ptr = &thisvar->bucket[hashkey]; // head; its data = node count
+
+  if(ptr->data < thisvar->num4conflicts) {
+    struct MGCNode *node=RUNMALLOC(sizeof(struct MGCNode));
+    node->data=data;
+    node->next=(ptr->next);
+    ptr->next=node;
+    ptr->data++;
+    return 1;
+  }
+  while (ptr->next!=NULL) { // bucket full: walk to its last node
+    prev = ptr;
+    ptr = ptr->next;
+  }
+  if(prev == NULL) { // empty chain (num4conflicts <= 0): no node to reuse
+    return 0;
+  }
+  ptr->data = data;
+  if(prev != &thisvar->bucket[hashkey]) { // a lone node is already in front
+    prev->next = NULL; // detach from the tail, then relink behind the head
+    ptr->next = thisvar->bucket[hashkey].next;
+    thisvar->bucket[hashkey].next = ptr;
+  }
+  return 1;
+}
+
+#ifdef MULTICORE
+struct MGCHash * allocateMGCHash_I(int size,
+                                   int conflicts) {
+  struct MGCHash *thisvar;
+  if (size <= 0) {
+#ifdef MULTICORE
+    BAMBOO_EXIT(0xf106);
+#else
+    printf("Negative Hashtable size Exception\n");
+    exit(-1);
+#endif
+  }
+  thisvar=(struct MGCHash *)RUNMALLOC_I(sizeof(struct MGCHash));
+  thisvar->size = size;
+  thisvar->bucket =
+    (struct MGCNode *) RUNMALLOC_I(sizeof(struct MGCNode)*size);
+  //Set data counts
+  thisvar->num4conflicts = conflicts;
+  return thisvar;
+}
+// _I variant of MGCHashadd (allocates with RUNMALLOC_I). Once a bucket holds
+// num4conflicts nodes its last node is reused. Returns 1, or 0 if none exists.
+int MGCHashadd_I(struct MGCHash * thisvar, int data) {
+  struct MGCNode *prev = NULL;
+  int mask = (thisvar->size << (GC_SHIFT_BITS))-1;
+  unsigned int hashkey = (((unsigned INTPTR)data)&mask)>>(GC_SHIFT_BITS);
+  struct MGCNode *ptr = &thisvar->bucket[hashkey]; // head; its data = node count
+
+  if(ptr->data < thisvar->num4conflicts) {
+    struct MGCNode *node=RUNMALLOC_I(sizeof(struct MGCNode));
+    node->data=data;
+    node->next=(ptr->next);
+    ptr->next=node;
+    ptr->data++;
+    return 1;
+  }
+  while (ptr->next!=NULL) { // bucket full: walk to its last node
+    prev = ptr;
+    ptr = ptr->next;
+  }
+  if(prev == NULL) { // empty chain (num4conflicts <= 0): no node to reuse
+    return 0;
+  }
+  ptr->data = data;
+  if(prev != &thisvar->bucket[hashkey]) { // a lone node is already in front
+    prev->next = NULL; // detach from the tail, then relink behind the head
+    ptr->next = thisvar->bucket[hashkey].next;
+    thisvar->bucket[hashkey].next = ptr;
+  }
+  return 1;
+}
+#endif
+// Return 1 if data is stored in its bucket, 0 otherwise. A hit that is not
+// already first in the chain is moved to the front.
+int MGCHashcontains(struct MGCHash *thisvar, int data) {
+  int mask = (thisvar->size << (GC_SHIFT_BITS))-1;
+  unsigned int hashkey = (((unsigned INTPTR)data)&mask)>>(GC_SHIFT_BITS);
+
+  struct MGCNode *ptr = thisvar->bucket[hashkey].next;
+  struct MGCNode *prev = NULL;
+  while (ptr!=NULL) {
+    if (ptr->data == data) {
+      if(prev != NULL) {
+        // Splice the hit out by linking its predecessor to its successor;
+        // setting prev->next to NULL here would drop every node behind it.
+        prev->next = ptr->next;
+        ptr->next = thisvar->bucket[hashkey].next;
+        thisvar->bucket[hashkey].next = ptr;
+      }
+      return 1;       // success
+    }
+    prev = ptr;
+    ptr = ptr->next;
+  }
+  return 0;   // failure
+}
+
diff --git a/Robust/src/Runtime/bamboo/MGCHash.h b/Robust/src/Runtime/bamboo/MGCHash.h
new file mode 100644
index 00000000..03844ebc
--- /dev/null
+++ b/Robust/src/Runtime/bamboo/MGCHash.h
@@ -0,0 +1,87 @@
+#ifndef MGCHASH_H
+#define MGCHASH_H
+
+#ifndef bool
+#define bool int
+#endif
+
+#ifndef true
+#define true 1
+#endif
+
+#ifndef false
+#define false 0
+#endif
+
+#ifndef INLINE
+#define INLINE    inline __attribute__((always_inline))
+#endif
+
+#include "mem.h"
+
+/* mgchash *********************************************************/
+typedef struct mgchashlistnode {
+  void * key;
+  void * val; //this can be cast to another type or used to point to a
+              //larger structure
+  struct mgchashlistnode *next;
+  //struct mgchashlistnode *lnext;
+} mgchashlistnode_t;
+
+#define NUMMGCLIST 250
+typedef struct mgclist {
+  struct mgchashlistnode array[NUMMGCLIST];
+  int num;
+  struct mgclist *next;
+} mgcliststruct_t;
+
+typedef struct mgchashtable {
+  mgchashlistnode_t * table;       // points to beginning of hash table
+  //mgchashlistnode_t * list;
+  mgcliststruct_t * structs;
+  unsigned int size;
+  unsigned int mask;
+  unsigned int numelements;
+  unsigned int threshold;
+  double loadfactor;
+} mgchashtable_t;
+
+mgchashtable_t * mgchashCreate(unsigned int size, double loadfactor);
+void mgchashInsert(mgchashtable_t * tbl, void * key, void *val);
+void * mgchashSearch(mgchashtable_t * tbl, void * key);
+unsigned int mgchashResize(mgchashtable_t * tbl, unsigned int newsize);
+#ifdef MULTICORE_GC
+mgchashtable_t * mgchashCreate_I(unsigned int size, double loadfactor);
+void mgchashInsert_I(mgchashtable_t * tbl, void * key, void *val);
+unsigned int mgchashResize_I(mgchashtable_t * tbl, unsigned int newsize);
+#endif
+void mgchashDelete(mgchashtable_t * tbl);
+void mgchashreset(mgchashtable_t * tbl);
+
+
+/** MGCHash *******************************************************************/
+struct MGCHash * allocateMGCHash(int size, int conflicts);
+void freeMGCHash(struct MGCHash *);
+
+//void MGCHashrehash(struct MGCHash * thisvar);
+int MGCHashadd(struct MGCHash *, int data);
+#ifdef MULTICORE
+struct MGCHash * allocateMGCHash_I(int size, int conflicts);
+int MGCHashadd_I(struct MGCHash *, int data);
+#endif
+int MGCHashcontains(struct MGCHash *,int data);
+
+struct MGCHash {
+  int num4conflicts;
+  int size;
+  struct MGCNode *bucket;
+};
+
+/* MGCNode: chain node; in a bucket head, data counts the chained nodes */
+
+struct MGCNode {
+  struct MGCNode * next;
+  int data;
+};
+
+#endif
diff --git a/Robust/src/Runtime/bamboo/RAW/Makefile.raw.1 b/Robust/src/Runtime/bamboo/RAW/Makefile.raw.1
new file mode 100644
index 00000000..7b98f54b
--- /dev/null
+++ b/Robust/src/Runtime/bamboo/RAW/Makefile.raw.1
@@ -0,0 +1,21 @@
+TOPDIR=/home/jzhou/starsearch
+include $(TOPDIR)/Makefile.include
+
+RGCCFLAGS += -O2 
+RGCCFLAGS += ${RAWRGCCFLAGS}
+
+USE_SLGCC=1
+
+SIM-CYCLES = 10000
+
+ATTRIBUTES += HWIC
+
+TILE_PATTERN = 4x1
+
+OBJECT_FILES_COMMON = multicoretask.o multicoreruntime.o Queue.o file.o math.o object.o \
+					  GenericHashtable.o SimpleHash.o ObjectHash.o socket.o taskdefs.o \
+					  methods.o mem.o task_arch.o raw_dataCache.o raw_interrupt.o
+					  
+# this is for a multi-tile test
+include $(COMMONDIR)/Makefile.all
+
diff --git a/Robust/src/Runtime/bamboo/RAW/Makefile.raw.2 b/Robust/src/Runtime/bamboo/RAW/Makefile.raw.2
new file mode 100644
index 00000000..e0554e13
--- /dev/null
+++ b/Robust/src/Runtime/bamboo/RAW/Makefile.raw.2
@@ -0,0 +1,21 @@
+TOPDIR=/home/jzhou/starsearch
+include $(TOPDIR)/Makefile.include
+
+RGCCFLAGS += -O2 
+RGCCFLAGS += ${RAWRGCCFLAGS}
+
+USE_SLGCC=1
+
+SIM-CYCLES = 10000
+
+ATTRIBUTES += HWIC
+
+TILE_PATTERN = 4x2
+
+OBJECT_FILES_COMMON = multicoretask.o multicoreruntime.o Queue.o file.o math.o object.o \
+					  GenericHashtable.o SimpleHash.o ObjectHash.o socket.o taskdefs.o \
+					  methods.o mem.o task_arch.o raw_dataCache.o raw_interrupt2.o
+					  
+# this is for a multi-tile test
+include $(COMMONDIR)/Makefile.all
+
diff --git a/Robust/src/Runtime/bamboo/RAW/Makefile.raw.4 b/Robust/src/Runtime/bamboo/RAW/Makefile.raw.4
new file mode 100644
index 00000000..3b43ee90
--- /dev/null
+++ b/Robust/src/Runtime/bamboo/RAW/Makefile.raw.4
@@ -0,0 +1,21 @@
+TOPDIR=/home/jzhou/starsearch
+include $(TOPDIR)/Makefile.include
+
+RGCCFLAGS += -O2 
+RGCCFLAGS += ${RAWRGCCFLAGS}
+
+USE_SLGCC=1
+
+SIM-CYCLES = 10000
+
+ATTRIBUTES += HWIC
+
+TILE_PATTERN = 4x4
+
+OBJECT_FILES_COMMON = multicoretask.o multicoreruntime.o Queue.o file.o math.o object.o \
+					  GenericHashtable.o SimpleHash.o ObjectHash.o socket.o taskdefs.o \
+					  methods.o mem.o task_arch.o raw_dataCache.o raw_interrupt4.o
+					  
+# this is for a multi-tile test
+include $(COMMONDIR)/Makefile.all
+
diff --git a/Robust/src/Runtime/bamboo/RAW/Makefile.raw.io b/Robust/src/Runtime/bamboo/RAW/Makefile.raw.io
new file mode 100644
index 00000000..5b87e3dc
--- /dev/null
+++ b/Robust/src/Runtime/bamboo/RAW/Makefile.raw.io
@@ -0,0 +1,58 @@
+
+USEBOOTLOADER=no
+
+ifeq ($(USEBOOTLOADER),yes)
+ATTRIBUTES      += LARGE_STATIC_DATA
+endif
+
+# We need to define the host OS to get access
+# to the host specific OS defines!   - VS 
+DEFS	+= -D$(shell uname -s) -D__raw__
+
+TOPDIR=/home/jzhou/starsearch
+include $(TOPDIR)/Makefile.include
+
+RGCCFLAGS += -O2 
+RGCCFLAGS += ${RAWRGCCFLAGS} 
+
+USE_SLGCC=1
+
+SIM-CYCLES = 10000
+
+ATTRIBUTES += HWIC
+
+TILES = 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+
+#TILE_PATTERN = 4x1
+
+OBJECT_FILES_COMMON = multicoretask.o multicoreruntime.o Queue.o file.o math.o object.o \
+					  GenericHashtable.o SimpleHash.o ObjectHash.o socket.o taskdefs.o \
+					  methods.o mem.o task_arch.o raw_dataCache.o raw_interrupt.o
+
+OBJECT_FILES_00 = $(OBJECT_FILES_COMMON)
+OBJECT_FILES_01 = $(OBJECT_FILES_COMMON)
+OBJECT_FILES_02 = $(OBJECT_FILES_COMMON)
+OBJECT_FILES_03 = $(OBJECT_FILES_COMMON)
+OBJECT_FILES_04 = $(OBJECT_FILES_COMMON)
+OBJECT_FILES_05 = $(OBJECT_FILES_COMMON)
+OBJECT_FILES_06 = $(OBJECT_FILES_COMMON)
+OBJECT_FILES_07 = $(OBJECT_FILES_COMMON)
+OBJECT_FILES_08 = $(OBJECT_FILES_COMMON)
+OBJECT_FILES_09 = $(OBJECT_FILES_COMMON)
+OBJECT_FILES_10 = $(OBJECT_FILES_COMMON)
+OBJECT_FILES_11 = $(OBJECT_FILES_COMMON)
+OBJECT_FILES_12 = $(OBJECT_FILES_COMMON)
+OBJECT_FILES_13 = $(OBJECT_FILES_COMMON)
+OBJECT_FILES_14 = $(OBJECT_FILES_COMMON)
+OBJECT_FILES_15 = $(OBJECT_FILES_COMMON)
+
+# this is for a multi-tile test
+include $(COMMONDIR)/Makefile.all
+
+ifneq ($(USEBOOTLOADER),yes)
+# Load the host interface and host OS simulator into btl
+BTL-ARGS += -host # -imem_size 65536
+endif
+
+BTL-ARGS += -host_stop_time
+
diff --git a/Robust/src/Runtime/bamboo/RAW/raw_dataCache.s b/Robust/src/Runtime/bamboo/RAW/raw_dataCache.s
new file mode 100644
index 00000000..566bf221
--- /dev/null
+++ b/Robust/src/Runtime/bamboo/RAW/raw_dataCache.s
@@ -0,0 +1,40 @@
+.text
+.global flushAddr
+.global invalidateAddr
+.global flushCacheline
+.global invalidateCacheline
+	
+flushAddr:
+# Arguments arrive in $4 and $5.
+# $4: the address handed to the afl instruction below.
+# $5: meant to carry a length eventually; this routine never reads it.
+
+	afl $4, 0	# presumably flushes the cache line containing address $4+0 -- confirm against the Raw ISA
+	jr    $31	# return to the address held in $31
+	
+
+invalidateAddr:
+# Arguments arrive in $4 and $5.
+# $4: the address handed to the ainv instruction below.
+# $5: meant to carry a length eventually; this routine never reads it.
+
+	ainv $4, 0	# presumably invalidates the cache line containing address $4+0 -- confirm against the Raw ISA
+	jr $31	# return to the address held in $31
+
+
+flushCacheline:
+# Single argument, in $4.
+# $4: the base tag address handed to tagfl below.
+
+	tagfl $4	# presumably flushes the line selected by the tag address in $4 -- confirm against the Raw ISA
+	jr $31	# return to the address held in $31
+
+invalidateCacheline:
+# Single argument, in $4.
+# $4: the base tag address handed to tagsw below.
+
+	tagsw $0, $4	# writes register $0 through tagsw at tag address $4; presumably clears that tag -- confirm
+#	mtsri PASS, 0x1111
+#	mtsr PASS, $8
+#	ainv $8, 0
+	jr $31	# return to the address held in $31
diff --git a/Robust/src/Runtime/bamboo/RAW/raw_interrupt.s b/Robust/src/Runtime/bamboo/RAW/raw_interrupt.s
new file mode 100644
index 00000000..cfad9b58
--- /dev/null
+++ b/Robust/src/Runtime/bamboo/RAW/raw_interrupt.s
@@ -0,0 +1,131 @@
+#include <raw_asm.h>
+
+.text
+	.align	2
+	.globl	setup_ints
+	.ent	setup_ints
+setup_ints:	# writes GDN_CFG, EX_BASE_ADDR and EX_MASK, then re-enables interrupts and returns
+	# switch user and system interrupts off while the registers below are written
+	uintoff
+	intoff
+
+	# build the GDN_CFG value in $8: upper half from aui, lower half from ori
+	xor $8,$8,$8	# $8 = 0
+	aui $8,$8,(3<<11)|(0 <<6)|(0 <<1)	# the <<6 field is 0 here, 1 in raw_interrupt2.s, 3 in raw_interrupt4.s; presumably the last tile row -- confirm
+	ori $8, (0 <<12)|(2<<9)
+	mtsr	GDN_CFG,$8
+#	mtsr	PASS,$8
+
+	# point EX_BASE_ADDR at interrupt_table (defined later in this file)
+    la $3, interrupt_table
+#	mtsri PASS, 0xaaa
+#	mtsr PASS, $3
+    mtsr EX_BASE_ADDR, $3
+
+	# read-modify-write EX_MASK to set bit 5
+	mfsr	$8,EX_MASK
+	ori	$8,$8,0x20          # 1 << kVEC_GDN_AVAIL
+	mtsr	EX_MASK,$8
+
+	inton
+	uinton
+	jr $31	# return to the caller
+	.end	setup_ints
+
+.macro empty_vec fail_code	# placeholder vector body: report fail_code, then spin
+        mtsri FAIL, \fail_code	# write the immediate fail_code to the FAIL status register
+1:      b 1b	# branch back to local label 1, i.e. loop forever
+        nop
+        nop
+.endm
+
+interrupt_table:	# setup_ints loads this address into EX_BASE_ADDR
+
+vec_gdn_refill:
+        empty_vec 0x2300
+vec_gdn_complete:
+        empty_vec 0x2301
+vec_trace:
+        empty_vec 0x2302
+vec_extern:
+        empty_vec 0x2303
+vec_timer:
+        empty_vec 0x2304
+vec_gdn_avail:	# the only vector with a real handler: save registers, call receiveObject, restore, dret
+#	mtsri PASS, 0xef00
+	uintoff
+
+	addiu   $sp,$sp,-112	# open a 112-byte (0x70) save frame; register $n is stored at offset 4*n
+	sw      $31,0x64($sp)
+	sw      $30,0x60($sp)
+	sw      $23,0x5c($sp)
+	sw      $22,0x58($sp)
+	sw      $21,0x54($sp)
+	sw      $20,0x50($sp)
+	sw      $19,0x4c($sp)
+	sw      $18,0x48($sp)
+	sw      $17,0x44($sp)
+	sw      $16,0x40($sp)
+	sw      $15,0x3c($sp)
+	sw      $14,0x38($sp)
+	sw      $13,0x34($sp)
+	sw      $12,0x30($sp)
+	sw      $11,0x2c($sp)
+	sw      $10,0x28($sp)
+	sw      $9,0x24($sp)
+	sw      $8,0x20($sp)
+	sw      $7,0x1c($sp)
+	sw      $6,0x18($sp)
+	sw      $5,0x14($sp)
+	sw      $4,0x10($sp)
+	sw      $3,0xc($sp)
+	sw      $2,0x8($sp)
+	.set noat
+	sw      $1,0x4($sp)	# NOTE(review): $24-$29 are never saved; presumably reserved or network-mapped on Raw -- confirm
+	.set at
+	mfhi    $8	# HI and LO travel through $8/$9 into offsets 0x68/0x6c
+	mflo    $9
+	sw      $8,0x68($sp)
+	sw      $9,0x6c($sp)
+	lw      $8,0x20($sp)	# reload $8/$9, which were just used as scratch
+	lw      $9,0x24($sp)
+
+	jal receiveObject	# call the message receiver (defined outside this file)
+
+	lw      $8,0x68($sp)	# restore HI/LO first, again via $8/$9
+	lw      $9,0x6c($sp)
+	mthi    $8
+	mtlo    $9
+	lw      $31,0x64($sp)
+	lw      $30,0x60($sp)
+	lw      $23,0x5c($sp)
+	lw      $22,0x58($sp)
+	lw      $21,0x54($sp)
+	lw      $20,0x50($sp)
+	lw      $19,0x4c($sp)
+	lw      $18,0x48($sp)
+	lw      $17,0x44($sp)
+	lw      $16,0x40($sp)
+	lw      $15,0x3c($sp)
+	lw      $14,0x38($sp)
+	lw      $13,0x34($sp)
+	lw      $12,0x30($sp)
+	lw      $11,0x2c($sp)
+	lw      $10,0x28($sp)
+	lw      $9,0x24($sp)
+	lw      $8,0x20($sp)
+	lw      $7,0x1c($sp)
+	lw      $6,0x18($sp)
+	lw      $5,0x14($sp)
+	lw      $4,0x10($sp)
+	lw      $3,0xc($sp)
+	lw      $2,0x8($sp)
+	.set noat
+	lw      $1,0x4($sp)
+	.set at
+	addiu   $sp,$sp,112	# close the save frame
+
+#	mtsri PASS, 0xefff
+	dret	# presumably returns from the interrupt and re-enables user interrupts -- confirm against the Raw ISA
+vec_event_counters:
+        empty_vec 0x2306
diff --git a/Robust/src/Runtime/bamboo/RAW/raw_interrupt2.s b/Robust/src/Runtime/bamboo/RAW/raw_interrupt2.s
new file mode 100644
index 00000000..1f23cfb1
--- /dev/null
+++ b/Robust/src/Runtime/bamboo/RAW/raw_interrupt2.s
@@ -0,0 +1,132 @@
+#include <raw_asm.h>
+
+	.text
+	.align	2
+	.globl	setup_ints
+	.ent	setup_ints
+setup_ints:	# writes GDN_CFG, EX_BASE_ADDR and EX_MASK, then re-enables interrupts and returns
+	# switch user and system interrupts off while the registers below are written
+	uintoff
+	intoff
+
+	# build the GDN_CFG value in $8: upper half from aui, lower half from ori
+	xor $8,$8,$8	# $8 = 0
+	aui $8,$8,(3<<11)|(1 <<6)|(0 <<1)	# the <<6 field is 1 here, 0 in raw_interrupt.s, 3 in raw_interrupt4.s; presumably the last tile row -- confirm
+	ori $8, (0 <<12)|(2<<9)
+	mtsr	GDN_CFG,$8
+#	mtsr	PASS,$8
+
+	# point EX_BASE_ADDR at interrupt_table (defined later in this file)
+    la $3, interrupt_table
+#	mtsri PASS, 0xaaa
+#	mtsr PASS, $3
+    mtsr EX_BASE_ADDR, $3
+
+	# read-modify-write EX_MASK to set bit 5
+	mfsr	$8,EX_MASK
+	ori	$8,$8,0x20          # 1 << kVEC_GDN_AVAIL
+	mtsr	EX_MASK,$8
+
+	inton
+	uinton
+	jr $31	# return to the caller
+	.end	setup_ints
+
+.macro empty_vec fail_code	# placeholder vector body: report fail_code, then spin
+        mtsri FAIL, \fail_code	# write the immediate fail_code to the FAIL status register
+1:      b 1b	# branch back to local label 1, i.e. loop forever
+        nop
+        nop
+.endm
+
+interrupt_table:	# setup_ints loads this address into EX_BASE_ADDR
+
+vec_gdn_refill:
+        empty_vec 0x2300
+vec_gdn_complete:
+        empty_vec 0x2301
+vec_trace:
+        empty_vec 0x2302
+vec_extern:
+        empty_vec 0x2303
+vec_timer:
+        empty_vec 0x2304
+vec_gdn_avail:	# the only vector with a real handler: save registers, call receiveObject, restore, dret
+#	mtsri PASS, 0xef00
+	uintoff
+
+	addiu   $sp,$sp,-112	# open a 112-byte (0x70) save frame; register $n is stored at offset 4*n
+	sw      $31,0x64($sp)
+	sw      $30,0x60($sp)
+	sw      $23,0x5c($sp)
+	sw      $22,0x58($sp)
+	sw      $21,0x54($sp)
+	sw      $20,0x50($sp)
+	sw      $19,0x4c($sp)
+	sw      $18,0x48($sp)
+	sw      $17,0x44($sp)
+	sw      $16,0x40($sp)
+	sw      $15,0x3c($sp)
+	sw      $14,0x38($sp)
+	sw      $13,0x34($sp)
+	sw      $12,0x30($sp)
+	sw      $11,0x2c($sp)
+	sw      $10,0x28($sp)
+	sw      $9,0x24($sp)
+	sw      $8,0x20($sp)
+	sw      $7,0x1c($sp)
+	sw      $6,0x18($sp)
+	sw      $5,0x14($sp)
+	sw      $4,0x10($sp)
+	sw      $3,0xc($sp)
+	sw      $2,0x8($sp)
+	.set noat
+	sw      $1,0x4($sp)	# NOTE(review): $24-$29 are never saved; presumably reserved or network-mapped on Raw -- confirm
+	.set at
+	mfhi    $8	# HI and LO travel through $8/$9 into offsets 0x68/0x6c
+	mflo    $9
+	sw      $8,0x68($sp)
+	sw      $9,0x6c($sp)
+	lw      $8,0x20($sp)	# reload $8/$9, which were just used as scratch
+	lw      $9,0x24($sp)
+
+	jal receiveObject	# call the message receiver (defined outside this file)
+
+	lw      $8,0x68($sp)	# restore HI/LO first, again via $8/$9
+	lw      $9,0x6c($sp)
+	mthi    $8
+	mtlo    $9
+	lw      $31,0x64($sp)
+	lw      $30,0x60($sp)
+	lw      $23,0x5c($sp)
+	lw      $22,0x58($sp)
+	lw      $21,0x54($sp)
+	lw      $20,0x50($sp)
+	lw      $19,0x4c($sp)
+	lw      $18,0x48($sp)
+	lw      $17,0x44($sp)
+	lw      $16,0x40($sp)
+	lw      $15,0x3c($sp)
+	lw      $14,0x38($sp)
+	lw      $13,0x34($sp)
+	lw      $12,0x30($sp)
+	lw      $11,0x2c($sp)
+	lw      $10,0x28($sp)
+	lw      $9,0x24($sp)
+	lw      $8,0x20($sp)
+	lw      $7,0x1c($sp)
+	lw      $6,0x18($sp)
+	lw      $5,0x14($sp)
+	lw      $4,0x10($sp)
+	lw      $3,0xc($sp)
+	lw      $2,0x8($sp)
+	.set noat
+	lw      $1,0x4($sp)
+	.set at
+	addiu   $sp,$sp,112	# close the save frame
+
+#	mtsri PASS, 0xefff
+	dret	# presumably returns from the interrupt and re-enables user interrupts -- confirm against the Raw ISA
+vec_event_counters:
+        empty_vec 0x2306
+
diff --git a/Robust/src/Runtime/bamboo/RAW/raw_interrupt4.s b/Robust/src/Runtime/bamboo/RAW/raw_interrupt4.s
new file mode 100644
index 00000000..a505e303
--- /dev/null
+++ b/Robust/src/Runtime/bamboo/RAW/raw_interrupt4.s
@@ -0,0 +1,132 @@
+#include <raw_asm.h>
+
+	.text
+	.align	2
+	.globl	setup_ints
+	.ent	setup_ints
+setup_ints:	# writes GDN_CFG, EX_BASE_ADDR and EX_MASK, then re-enables interrupts and returns
+	# switch user and system interrupts off while the registers below are written
+	uintoff
+	intoff
+
+	# build the GDN_CFG value in $8: upper half from aui, lower half from ori
+	xor $8,$8,$8	# $8 = 0
+	aui $8,$8,(3<<11)|(3 <<6)|(0 <<1)	# the <<6 field is 3 here, 0 in raw_interrupt.s, 1 in raw_interrupt2.s; presumably the last tile row -- confirm
+	ori $8, (0 <<12)|(2<<9)
+	mtsr	GDN_CFG,$8
+#	mtsr	PASS,$8
+
+	# point EX_BASE_ADDR at interrupt_table (defined later in this file)
+    la $3, interrupt_table
+#	mtsri PASS, 0xaaa
+#	mtsr PASS, $3
+    mtsr EX_BASE_ADDR, $3
+
+	# read-modify-write EX_MASK to set bit 5
+	mfsr	$8,EX_MASK
+	ori	$8,$8,0x20          # 1 << kVEC_GDN_AVAIL
+	mtsr	EX_MASK,$8
+
+	inton
+	uinton
+	jr $31	# return to the caller
+	.end	setup_ints
+
+.macro empty_vec fail_code	# placeholder vector body: report fail_code, then spin
+        mtsri FAIL, \fail_code	# write the immediate fail_code to the FAIL status register
+1:      b 1b	# branch back to local label 1, i.e. loop forever
+        nop
+        nop
+.endm
+
+interrupt_table:	# setup_ints loads this address into EX_BASE_ADDR
+
+vec_gdn_refill:
+        empty_vec 0x2300
+vec_gdn_complete:
+        empty_vec 0x2301
+vec_trace:
+        empty_vec 0x2302
+vec_extern:
+        empty_vec 0x2303
+vec_timer:
+        empty_vec 0x2304
+vec_gdn_avail:	# the only vector with a real handler: save registers, call receiveObject, restore, dret
+#	mtsri PASS, 0xef00
+	uintoff
+
+	addiu   $sp,$sp,-112	# open a 112-byte (0x70) save frame; register $n is stored at offset 4*n
+	sw      $31,0x64($sp)
+	sw      $30,0x60($sp)
+	sw      $23,0x5c($sp)
+	sw      $22,0x58($sp)
+	sw      $21,0x54($sp)
+	sw      $20,0x50($sp)
+	sw      $19,0x4c($sp)
+	sw      $18,0x48($sp)
+	sw      $17,0x44($sp)
+	sw      $16,0x40($sp)
+	sw      $15,0x3c($sp)
+	sw      $14,0x38($sp)
+	sw      $13,0x34($sp)
+	sw      $12,0x30($sp)
+	sw      $11,0x2c($sp)
+	sw      $10,0x28($sp)
+	sw      $9,0x24($sp)
+	sw      $8,0x20($sp)
+	sw      $7,0x1c($sp)
+	sw      $6,0x18($sp)
+	sw      $5,0x14($sp)
+	sw      $4,0x10($sp)
+	sw      $3,0xc($sp)
+	sw      $2,0x8($sp)
+	.set noat
+	sw      $1,0x4($sp)	# NOTE(review): $24-$29 are never saved; presumably reserved or network-mapped on Raw -- confirm
+	.set at
+	mfhi    $8	# HI and LO travel through $8/$9 into offsets 0x68/0x6c
+	mflo    $9
+	sw      $8,0x68($sp)
+	sw      $9,0x6c($sp)
+	lw      $8,0x20($sp)	# reload $8/$9, which were just used as scratch
+	lw      $9,0x24($sp)
+
+	jal receiveObject	# call the message receiver (defined outside this file)
+
+	lw      $8,0x68($sp)	# restore HI/LO first, again via $8/$9
+	lw      $9,0x6c($sp)
+	mthi    $8
+	mtlo    $9
+	lw      $31,0x64($sp)
+	lw      $30,0x60($sp)
+	lw      $23,0x5c($sp)
+	lw      $22,0x58($sp)
+	lw      $21,0x54($sp)
+	lw      $20,0x50($sp)
+	lw      $19,0x4c($sp)
+	lw      $18,0x48($sp)
+	lw      $17,0x44($sp)
+	lw      $16,0x40($sp)
+	lw      $15,0x3c($sp)
+	lw      $14,0x38($sp)
+	lw      $13,0x34($sp)
+	lw      $12,0x30($sp)
+	lw      $11,0x2c($sp)
+	lw      $10,0x28($sp)
+	lw      $9,0x24($sp)
+	lw      $8,0x20($sp)
+	lw      $7,0x1c($sp)
+	lw      $6,0x18($sp)
+	lw      $5,0x14($sp)
+	lw      $4,0x10($sp)
+	lw      $3,0xc($sp)
+	lw      $2,0x8($sp)
+	.set noat
+	lw      $1,0x4($sp)
+	.set at
+	addiu   $sp,$sp,112	# close the save frame
+
+#	mtsri PASS, 0xefff
+	dret	# presumably returns from the interrupt and re-enables user interrupts -- confirm against the Raw ISA
+vec_event_counters:
+        empty_vec 0x2306
+
diff --git a/Robust/src/Runtime/bamboo/RAW/runtime_arch.h b/Robust/src/Runtime/bamboo/RAW/runtime_arch.h
new file mode 100644
index 00000000..19210e6e
--- /dev/null
+++ b/Robust/src/Runtime/bamboo/RAW/runtime_arch.h
@@ -0,0 +1,70 @@
+#ifndef RUNTIME_ARCH
+#define RUNTIME_ARCH
+
+#ifdef PROFILE
+#ifdef RAWUSEIO
+#include "stdio.h"
+#include "string.h"
+#endif
+#endif
+#include <raw.h>
+#include <raw_compiler_defs.h>
+
+#define BAMBOO_CACHE_LINE_SIZE (kCacheLineSize)  // alias for kCacheLineSize, which is defined outside this header
+#define BAMBOO_CACHE_LINE_MASK (kCacheLineMask)  // alias for kCacheLineMask, which is defined outside this header
+
+#define BAMBOO_NUM_OF_CORE corenum   // number of the core this code runs on, read from the variable corenum
+#define BAMBOO_GET_NUM_OF_CORE() (raw_get_abs_pos_x() + raw_get_array_size_x() * raw_get_abs_pos_y())  // row-major core number: x + width * y
+#define BAMBOO_DEBUGPRINT(x) (raw_test_pass((x)))  // debug output of x through raw_test_pass
+#define BAMBOO_DEBUGPRINT_REG(x) (raw_test_pass_reg((x)))  // debug output of x through raw_test_pass_reg
+
+#define BAMBOO_SHARE_MEM_CALLOC(x, y) (calloc((x), (y)))  // allocate an array of x elements of y bytes each; on this platform it is a plain calloc call
+
+#ifdef INTERRUPT
+// With INTERRUPT defined every START/CLOSE pair below turns user interrupts off/on; first pair: obj queue globals
+#define BAMBOO_START_CRITICAL_SECTION_OBJ_QUEUE() raw_user_interrupts_off()
+#define BAMBOO_CLOSE_CRITICAL_SECTION_OBJ_QUEUE() raw_user_interrupts_on()
+// guard for global data structures related to status data
+#define BAMBOO_START_CRITICAL_SECTION_STATUS() raw_user_interrupts_off()
+#define BAMBOO_CLOSE_CRITICAL_SECTION_STATUS() raw_user_interrupts_on()
+// guard for global data structures related to msg data
+#define BAMBOO_START_CRITICAL_SECTION_MSG() raw_user_interrupts_off()
+#define BAMBOO_CLOSE_CRITICAL_SECTION_MSG() raw_user_interrupts_on()
+// guard for global data structures related to the lock table
+#define BAMBOO_START_CRITICAL_SECTION_LOCK() raw_user_interrupts_off()
+#define BAMBOO_CLOSE_CRITICAL_SECTION_LOCK() raw_user_interrupts_on()
+// guard for memory allocation
+#define BAMBOO_START_CRITICAL_SECTION_MEM() raw_user_interrupts_off()
+#define BAMBOO_CLOSE_CRITICAL_SECTION_MEM() raw_user_interrupts_on()
+// guard for all global data structures
+#define BAMBOO_START_CRITICAL_SECTION() raw_user_interrupts_off()
+#define BAMBOO_CLOSE_CRITICAL_SECTION() raw_user_interrupts_on()
+#else
+// Without INTERRUPT every START/CLOSE macro expands to nothing; first pair: obj queue globals
+#define BAMBOO_START_CRITICAL_SECTION_OBJ_QUEUE()  
+#define BAMBOO_CLOSE_CRITICAL_SECTION_OBJ_QUEUE()  
+// guard for global data structures related to status data (no-op)
+#define BAMBOO_START_CRITICAL_SECTION_STATUS()  
+#define BAMBOO_CLOSE_CRITICAL_SECTION_STATUS()  
+// guard for global data structures related to msg data (no-op)
+#define BAMBOO_START_CRITICAL_SECTION_MSG()  
+#define BAMBOO_CLOSE_CRITICAL_SECTION_MSG()  
+// guard for global data structures related to the lock table (no-op)
+#define BAMBOO_START_CRITICAL_SECTION_LOCK()  
+#define BAMBOO_CLOSE_CRITICAL_SECTION_LOCK()  
+// guard for memory allocation (no-op)
+#define BAMBOO_START_CRITICAL_SECTION_MEM()  
+#define BAMBOO_CLOSE_CRITICAL_SECTION_MEM()  
+// guard for all global data structures (no-op)
+#define BAMBOO_START_CRITICAL_SECTION()  
+#define BAMBOO_CLOSE_CRITICAL_SECTION()  
+#endif
+
+#define BAMBOO_WAITING_FOR_LOCK() (receiveObject())  // while waiting for a lock, call receiveObject()
+#define BAMBOO_CACHE_FLUSH_RANGE(x, y)  (raw_invalidate_cache_range((x), (y)))  // NOTE(review): named FLUSH but calls the invalidate routine -- confirm this is intended
+#define BAMBOO_CACHE_FLUSH_ALL() (raw_flush_entire_cache())  // flush the whole cache through raw_flush_entire_cache
+#define BAMBOO_EXIT(x) (raw_test_done((x)))  // end the run through raw_test_done with code x
+#define BAMBOO_MSG_AVAIL() (gdn_input_avail())  // result of gdn_input_avail(); presumably nonzero when input is waiting -- confirm
+#define BAMBOO_GET_EXE_TIME() (raw_get_cycle())  // execution time taken from raw_get_cycle()
+
+#endif // #ifndef RUNTIME_ARCH
diff --git a/Robust/src/Runtime/bamboo/RAW/task_arch.c b/Robust/src/Runtime/bamboo/RAW/task_arch.c
new file mode 100644
index 00000000..4a247a3b
--- /dev/null
+++ b/Robust/src/Runtime/bamboo/RAW/task_arch.c
@@ -0,0 +1,1592 @@
+#ifdef TASK
+#include "runtime.h"
+#include "multicoreruntime.h"
+#include "runtime_arch.h"
+
+__attribute__((always_inline)) inline void initialization() {  // platform initialization hook
+} // initialization(): the body is empty, no work is done here
+
+__attribute__((always_inline)) inline void initCommunication() {  // does nothing unless INTERRUPT is defined
+#ifdef INTERRUPT
+  if (corenum < NUMCORES) {  // only cores numbered below NUMCORES install the handlers
+    // setup_ints is the assembly routine that programs GDN_CFG, EX_BASE_ADDR and EX_MASK
+    setup_ints();
+    raw_user_interrupts_on();  // setup_ints already ends with uinton; this turns user interrupts on again
+  }
+#endif
+}
+
+__attribute__((always_inline)) inline void fakeExecution()  {  // never returns
+  // keep servicing incoming messages forever
+  while(true) {
+	  receiveObject();
+  }
+}
+
+#ifdef USEIO
+int main(void) {  // entry point when USEIO is defined; NOTE(review): no explicit return value
+#else
+void begin() {  // entry point otherwise
+#endif // #ifdef USEIO
+  run(NULL);  // hand control to run(), defined outside this file, with a NULL argument
+}
+
+__attribute__((always_inline)) inline void terminate()  {  // end the run
+	raw_test_done(1);  // same routine BAMBOO_EXIT wraps, called here with the value 1
+}
+
+// helper function to compute the coordinates of a core from the core number
+#define calCoords(core_num, coordX, coordY) \
+  *(coordX) = (core_num) % raw_get_array_size_x();\
+  *(coordY) = core_num / raw_get_array_size_x();
+
+// Send the object described by transObj to transObj->targetcore with gdn_send, then drain queued outgoing messages.
+// Words sent after the header: 0, msgsize, the object pointer, then one (taskindex, paramindex) pair per queue entry.
+inline void transferObject(struct transObjInfo * transObj) {//  __attribute__((always_inline)){
+  void * obj = transObj->objptr;
+  int type=((int *)obj)[0];  // first word of the object; not used again in this function
+  int targetcore = transObj->targetcore;  
+
+  unsigned msgHdr;
+  int self_y, self_x, target_y, target_x;
+  // three fixed words (0, msgsize, obj) plus two words per queue entry
+  int msgsize = 3 + transObj->length * 2;
+  int i = 0;
+
+  struct ___Object___ * newobj = (struct ___Object___ *)obj;  // not used again in this function
+
+  calCoords(corenum, &self_x, &self_y);
+  calCoords(targetcore, &target_x, &target_y);
+  isMsgSending = true;  // flag stays set until the whole message has been pushed out
+  // Build the message header
+  msgHdr = construct_dyn_hdr(0, msgsize, 0,             // msgsize = number of words that follow the header
+                             self_y, self_x,
+                             target_y, target_x);
+  // start sending the msg (the send flag was set above)
+  gdn_send(msgHdr);                     
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xbbbb);
+  BAMBOO_DEBUGPRINT(0xb000 + targetcore);       // targetcore
+#endif
+  gdn_send(0);  // first payload word; presumably the message type code for an object transfer -- confirm
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0);
+#endif
+  gdn_send(msgsize);
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT_REG(msgsize);
+#endif
+  gdn_send((int)obj);  // the pointer is sent as an int (the comment above msgsize assumes a 32 bit machine)
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT_REG(obj);
+#endif
+  for(i = 0; i < transObj->length; ++i) {
+    int taskindex = transObj->queues[2*i];  // queues[] holds (taskindex, paramindex) pairs back to back
+    int paramindex = transObj->queues[2*i+1];
+    gdn_send(taskindex);
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT_REG(taskindex);
+#endif
+    gdn_send(paramindex);
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT_REG(paramindex);
+#endif
+  }
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xffff);
+#endif
+  // end of sending this msg, clear the send flag
+  isMsgSending = false;
+  ++(self_numsendobjs);  // count of objects this core has sent
+  // drain messages that were queued in outmsgdata[] in the meantime
+  while(isMsgHanging) {
+	  // each queued entry is laid out as:
+	  // length, target core, then length payload words
+	  outmsgleft = outmsgdata[outmsgindex++];
+	  int target = outmsgdata[outmsgindex++];
+	  calCoords(target, &target_x, &target_y);
+	  // mark to start sending the msg
+	  isMsgSending = true;
+	  // Build the message header
+	  msgHdr = construct_dyn_hdr(0, outmsgleft, 0,                        // outmsgleft = payload words of this queued msg
+                                 self_y, self_x,
+                                 target_y, target_x);
+	  gdn_send(msgHdr);                           
+#ifdef DEBUG
+	  BAMBOO_DEBUGPRINT(0xbbbb);
+	  BAMBOO_DEBUGPRINT(0xb000 + target);             // targetcore
+#endif
+	  while(outmsgleft-- > 0) {
+		  gdn_send(outmsgdata[outmsgindex++]);
+#ifdef DEBUG
+		  BAMBOO_DEBUGPRINT_REG(outmsgdata[outmsgindex - 1]);
+#endif
+	  }
+#ifdef DEBUG
+	  BAMBOO_DEBUGPRINT(0xffff);
+#endif
+	  // mark to end sending the msg
+	  isMsgSending = false;
+	  BAMBOO_START_CRITICAL_SECTION_MSG();
+	  // inside the guard: is the queue now empty?
+	  if(outmsgindex == outmsglast) {
+		  // no more msgs: rewind both indices and leave the loop
+		  outmsgindex = outmsglast = 0;
+		  isMsgHanging = false;
+	  }
+	  BAMBOO_CLOSE_CRITICAL_SECTION_MSG();
+  }
+}
+
+__attribute__((always_inline)) inline void send_msg_1 (int targetcore, 
+		                                                   int n0) {
+  // Send the one-word message n0 to targetcore, then drain queued outgoing messages.
+  unsigned msgHdr;
+  int self_y, self_x, target_y, target_x;
+  msglength = 1;  // msglength is not declared in this function
+
+  // compute the mesh coordinates of this core and of the target core
+  calCoords(corenum, &self_x, &self_y);
+  calCoords(targetcore, &target_x, &target_y);
+
+  // mark to start sending the msg
+  isMsgSending = true;
+  // Build the message header
+  msgHdr = construct_dyn_hdr(0, msglength, 0,             // msglength = payload words that follow the header
+                             self_y, self_x,
+                             target_y, target_x);
+  gdn_send(msgHdr);                     // Send the message header
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xbbbb);
+  BAMBOO_DEBUGPRINT(0xb000 + targetcore);       // targetcore
+#endif
+  gdn_send(n0);
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(n0);
+  BAMBOO_DEBUGPRINT(0xffff);
+#endif
+  // mark to end sending the msg
+  isMsgSending = false;
+  // drain messages that were queued in outmsgdata[] in the meantime
+  while(isMsgHanging) {
+	  // each queued entry is laid out as:
+	  // length, target core, then length payload words
+	  outmsgleft = outmsgdata[outmsgindex++];
+	  int target = outmsgdata[outmsgindex++];
+	  calCoords(target, &target_x, &target_y);
+	  // mark to start sending the msg
+	  isMsgSending = true;
+	  // Build the message header
+	  msgHdr = construct_dyn_hdr(0, outmsgleft, 0,                        // outmsgleft = payload words of this queued msg
+                                 self_y, self_x,
+                                 target_y, target_x);
+	  gdn_send(msgHdr);                           
+#ifdef DEBUG
+	  BAMBOO_DEBUGPRINT(0xbbbb);
+	  BAMBOO_DEBUGPRINT(0xb000 + target);             // targetcore
+#endif
+	  while(outmsgleft-- > 0) {
+		  gdn_send(outmsgdata[outmsgindex++]);
+#ifdef DEBUG
+		  BAMBOO_DEBUGPRINT_REG(outmsgdata[outmsgindex - 1]);
+#endif
+	  }
+#ifdef DEBUG
+	  BAMBOO_DEBUGPRINT(0xffff);
+#endif
+	  // mark to end sending the msg
+	  isMsgSending = false;
+	  BAMBOO_START_CRITICAL_SECTION_MSG();
+	  // inside the guard: is the queue now empty?
+	  if(outmsgindex == outmsglast) {
+		  // no more msgs: rewind both indices and leave the loop
+		  outmsgindex = outmsglast = 0;
+		  isMsgHanging = false;
+	  }
+	  BAMBOO_CLOSE_CRITICAL_SECTION_MSG();
+  }
+}
+
+__attribute__((always_inline)) inline void send_msg_2 (int targetcore, 
+		                                                   int n0, 
+																											 int n1) {
+  // Send the two-word message (n0, n1) to targetcore, then drain queued outgoing messages.
+  unsigned msgHdr;
+  int self_y, self_x, target_y, target_x;
+  msglength = 2;  // msglength is not declared in this function
+
+  // compute the mesh coordinates of this core and of the target core
+  calCoords(corenum, &self_x, &self_y);
+  calCoords(targetcore, &target_x, &target_y);
+
+  // mark to start sending the msg
+  isMsgSending = true;
+  // Build the message header
+  msgHdr = construct_dyn_hdr(0, msglength, 0,             // msglength = payload words that follow the header
+                             self_y, self_x,
+                             target_y, target_x);
+  gdn_send(msgHdr);                     // Send the message header
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xbbbb);
+  BAMBOO_DEBUGPRINT(0xb000 + targetcore);       // targetcore
+#endif
+  gdn_send(n0);
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(n0);
+#endif
+  gdn_send(n1);
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(n1);
+  BAMBOO_DEBUGPRINT(0xffff);
+#endif
+  // mark to end sending the msg
+  isMsgSending = false;
+  // drain messages that were queued in outmsgdata[] in the meantime
+  while(isMsgHanging) {
+	  // each queued entry is laid out as:
+	  // length, target core, then length payload words
+	  outmsgleft = outmsgdata[outmsgindex++];
+	  int target = outmsgdata[outmsgindex++];
+	  calCoords(target, &target_x, &target_y);
+	  // mark to start sending the msg
+	  isMsgSending = true;
+	  // Build the message header
+	  msgHdr = construct_dyn_hdr(0, outmsgleft, 0,                        // outmsgleft = payload words of this queued msg
+                                 self_y, self_x,
+                                 target_y, target_x);
+	  gdn_send(msgHdr);                           
+#ifdef DEBUG
+	  BAMBOO_DEBUGPRINT(0xbbbb);
+	  BAMBOO_DEBUGPRINT(0xb000 + target);             // targetcore
+#endif
+	  while(outmsgleft-- > 0) {
+		  gdn_send(outmsgdata[outmsgindex++]);
+#ifdef DEBUG
+		  BAMBOO_DEBUGPRINT_REG(outmsgdata[outmsgindex - 1]);
+#endif
+	  }
+#ifdef DEBUG
+	  BAMBOO_DEBUGPRINT(0xffff);
+#endif
+	  // mark to end sending the msg
+	  isMsgSending = false;
+	  BAMBOO_START_CRITICAL_SECTION_MSG();
+	  // inside the guard: is the queue now empty?
+	  if(outmsgindex == outmsglast) {
+		  // no more msgs: rewind both indices and leave the loop
+		  outmsgindex = outmsglast = 0;
+		  isMsgHanging = false;
+	  }
+	  BAMBOO_CLOSE_CRITICAL_SECTION_MSG();
+  }
+}
+
+__attribute__((always_inline)) inline void send_msg_3 (int targetcore, 
+		                                                   int n0, 
+																											 int n1, 
+																											 int n2) {
+  // Send the three-word message (n0, n1, n2) to targetcore, then drain queued outgoing messages.
+  unsigned msgHdr;
+  int self_y, self_x, target_y, target_x;
+  msglength = 3;  // msglength is not declared in this function
+
+  // compute the mesh coordinates of this core and of the target core
+  calCoords(corenum, &self_x, &self_y);
+  calCoords(targetcore, &target_x, &target_y);
+
+  // mark to start sending the msg
+  isMsgSending = true;
+  // Build the message header
+  msgHdr = construct_dyn_hdr(0, msglength, 0,             // msglength = payload words that follow the header
+                             self_y, self_x,
+                             target_y, target_x);
+  gdn_send(msgHdr);                     // Send the message header
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xbbbb);
+  BAMBOO_DEBUGPRINT(0xb000 + targetcore);       // targetcore
+#endif
+  gdn_send(n0);
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(n0);
+#endif
+  gdn_send(n1);
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(n1);
+#endif
+  gdn_send(n2);
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(n2);
+  BAMBOO_DEBUGPRINT(0xffff);
+#endif
+  // mark to end sending the msg
+  isMsgSending = false;
+  // drain messages that were queued in outmsgdata[] in the meantime
+  while(isMsgHanging) {
+	  // each queued entry is laid out as:
+	  // length, target core, then length payload words
+	  outmsgleft = outmsgdata[outmsgindex++];
+	  int target = outmsgdata[outmsgindex++];
+	  calCoords(target, &target_x, &target_y);
+	  // mark to start sending the msg
+	  isMsgSending = true;
+	  // Build the message header
+	  msgHdr = construct_dyn_hdr(0, outmsgleft, 0,                        // outmsgleft = payload words of this queued msg
+                                 self_y, self_x,
+                                 target_y, target_x);
+	  gdn_send(msgHdr);                           
+#ifdef DEBUG
+	  BAMBOO_DEBUGPRINT(0xbbbb);
+	  BAMBOO_DEBUGPRINT(0xb000 + target);             // targetcore
+#endif
+	  while(outmsgleft-- > 0) {
+		  gdn_send(outmsgdata[outmsgindex++]);
+#ifdef DEBUG
+		  BAMBOO_DEBUGPRINT_REG(outmsgdata[outmsgindex - 1]);
+#endif
+	  }
+#ifdef DEBUG
+	  BAMBOO_DEBUGPRINT(0xffff);
+#endif
+	  // mark to end sending the msg
+	  isMsgSending = false;
+	  BAMBOO_START_CRITICAL_SECTION_MSG();
+	  // inside the guard: is the queue now empty?
+	  if(outmsgindex == outmsglast) {
+		  // no more msgs: rewind both indices and leave the loop
+		  outmsgindex = outmsglast = 0;
+		  isMsgHanging = false;
+	  }
+	  BAMBOO_CLOSE_CRITICAL_SECTION_MSG();
+  }
+}
+
+__attribute__((always_inline)) inline void send_msg_4 (int targetcore, 
+		                                                   int n0, 
+																											 int n1, 
+																											 int n2, 
+																											 int n3) {
+  // Send the four-word message (n0..n3) to targetcore, then drain queued outgoing messages.
+  unsigned msgHdr;
+  int self_y, self_x, target_y, target_x;
+  msglength = 4;  // msglength is not declared in this function
+
+  // compute the mesh coordinates of this core and of the target core
+  calCoords(corenum, &self_x, &self_y);
+  calCoords(targetcore, &target_x, &target_y);
+
+  // mark to start sending the msg
+  isMsgSending = true;
+  // Build the message header
+  msgHdr = construct_dyn_hdr(0, msglength, 0,             // msglength = payload words that follow the header
+                             self_y, self_x,
+                             target_y, target_x);
+  gdn_send(msgHdr);                     // Send the message header
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xbbbb);
+  BAMBOO_DEBUGPRINT(0xb000 + targetcore);       // targetcore
+#endif
+  gdn_send(n0);
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(n0);
+#endif
+  gdn_send(n1);
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(n1);
+#endif
+  gdn_send(n2);
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(n2);
+#endif
+  gdn_send(n3);
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(n3);
+  BAMBOO_DEBUGPRINT(0xffff);
+#endif
+  // mark to end sending the msg
+  isMsgSending = false;
+  // drain messages that were queued in outmsgdata[] in the meantime
+  while(isMsgHanging) {
+	  // each queued entry is laid out as:
+	  // length, target core, then length payload words
+	  outmsgleft = outmsgdata[outmsgindex++];
+	  int target = outmsgdata[outmsgindex++];
+	  calCoords(target, &target_x, &target_y);
+	  // mark to start sending the msg
+	  isMsgSending = true;
+	  // Build the message header
+	  msgHdr = construct_dyn_hdr(0, outmsgleft, 0,                        // outmsgleft = payload words of this queued msg
+                                 self_y, self_x,
+                                 target_y, target_x);
+	  gdn_send(msgHdr);                           
+#ifdef DEBUG
+	  BAMBOO_DEBUGPRINT(0xbbbb);
+	  BAMBOO_DEBUGPRINT(0xb000 + target);             // targetcore
+#endif
+	  while(outmsgleft-- > 0) {
+		  gdn_send(outmsgdata[outmsgindex++]);
+#ifdef DEBUG
+		  BAMBOO_DEBUGPRINT_REG(outmsgdata[outmsgindex - 1]);
+#endif
+	  }
+#ifdef DEBUG
+	  BAMBOO_DEBUGPRINT(0xffff);
+#endif
+	  // mark to end sending the msg
+	  isMsgSending = false;
+	  BAMBOO_START_CRITICAL_SECTION_MSG();
+	  // inside the guard: is the queue now empty?
+	  if(outmsgindex == outmsglast) {
+		  // no more msgs: rewind both indices and leave the loop
+		  outmsgindex = outmsglast = 0;
+		  isMsgHanging = false;
+	  }
+	  BAMBOO_CLOSE_CRITICAL_SECTION_MSG();
+  }
+}
+
+__attribute__((always_inline)) inline void send_msg_5 (int targetcore, 
+		                                                   int n0, 
+																											 int n1, 
+																											 int n2, 
+																											 int n3, 
+																											 int n4) {
+  // Send the five-word message (n0..n4) to targetcore, then drain queued outgoing messages.
+  unsigned msgHdr;
+  int self_y, self_x, target_y, target_x;
+  msglength = 5;  // msglength is not declared in this function
+
+  // compute the mesh coordinates of this core and of the target core
+  calCoords(corenum, &self_x, &self_y);
+  calCoords(targetcore, &target_x, &target_y);
+
+  // mark to start sending the msg
+  isMsgSending = true;
+  // Build the message header
+  msgHdr = construct_dyn_hdr(0, msglength, 0,             // msglength = payload words that follow the header
+                             self_y, self_x,
+                             target_y, target_x);
+  gdn_send(msgHdr);                     // Send the message header
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xbbbb);
+  BAMBOO_DEBUGPRINT(0xb000 + targetcore);       // targetcore
+#endif
+  gdn_send(n0);
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(n0);
+#endif
+  gdn_send(n1);
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(n1);
+#endif
+  gdn_send(n2);
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(n2);
+#endif
+  gdn_send(n3);
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(n3);
+#endif
+  gdn_send(n4);
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(n4);
+  BAMBOO_DEBUGPRINT(0xffff);
+#endif
+  // mark to end sending the msg
+  isMsgSending = false;
+  // drain messages that were queued in outmsgdata[] in the meantime
+  while(isMsgHanging) {
+	  // each queued entry is laid out as:
+	  // length, target core, then length payload words
+	  outmsgleft = outmsgdata[outmsgindex++];
+	  int target = outmsgdata[outmsgindex++];
+	  calCoords(target, &target_x, &target_y);
+	  // mark to start sending the msg
+	  isMsgSending = true;
+	  // Build the message header
+	  msgHdr = construct_dyn_hdr(0, outmsgleft, 0,                        // outmsgleft = payload words of this queued msg
+                                 self_y, self_x,
+                                 target_y, target_x);
+	  gdn_send(msgHdr);                           
+#ifdef DEBUG
+	  BAMBOO_DEBUGPRINT(0xbbbb);
+	  BAMBOO_DEBUGPRINT(0xb000 + target);             // targetcore
+#endif
+	  while(outmsgleft-- > 0) {
+		  gdn_send(outmsgdata[outmsgindex++]);
+#ifdef DEBUG
+		  BAMBOO_DEBUGPRINT_REG(outmsgdata[outmsgindex - 1]);
+#endif
+	  }
+#ifdef DEBUG
+	  BAMBOO_DEBUGPRINT(0xffff);
+#endif
+	  // mark to end sending the msg
+	  isMsgSending = false;
+	  BAMBOO_START_CRITICAL_SECTION_MSG();
+	  // inside the guard: is the queue now empty?
+	  if(outmsgindex == outmsglast) {
+		  // no more msgs: rewind both indices and leave the loop
+		  outmsgindex = outmsglast = 0;
+		  isMsgHanging = false;
+	  }
+	  BAMBOO_CLOSE_CRITICAL_SECTION_MSG();
+  }
+}
+
+__attribute__((always_inline)) inline void send_msg_6 (int targetcore, 
+		                                                   int n0, 
+																											 int n1, 
+																											 int n2, 
+																											 int n3, 
+																											 int n4, 
+																											 int n5) {
+  // Send the 6-word message n0..n5 to targetcore with gdn_send(), then flush any msgs queued in outmsgdata[]
+  unsigned msgHdr;
+  int self_y, self_x, target_y, target_x;
+  msglength = 6;  // NOTE(review): receiveMsg() also reads/writes msglength -- confirm sharing it with the send path is intended
+
+  // get the x-coord and y-coord of this core (corenum) and of the target core
+  calCoords(corenum, &self_x, &self_y);
+  calCoords(targetcore, &target_x, &target_y);
+
+  // mark to start sending the msg
+  isMsgSending = true;
+  // Build the message header
+  msgHdr = construct_dyn_hdr(0, msglength, 0,             // msgsize word sent.
+                             self_y, self_x,
+                             target_y, target_x);
+  gdn_send(msgHdr);                     // Send the message header
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xbbbb);
+  BAMBOO_DEBUGPRINT(0xb000 + targetcore);       // targetcore
+#endif
+  gdn_send(n0);
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(n0);
+#endif
+  gdn_send(n1);
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(n1);
+#endif
+  gdn_send(n2);
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(n2);
+#endif
+  gdn_send(n3);
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(n3);
+#endif
+  gdn_send(n4);
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(n4);
+#endif
+  gdn_send(n5);
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(n5);
+  BAMBOO_DEBUGPRINT(0xffff);
+#endif
+  // mark to end sending the msg
+  isMsgSending = false;
+  // drain the msgs queued in outmsgdata[]; isMsgHanging is set by the cache_msg_N() functions
+  while(isMsgHanging) {
+	  // get the next queued msg from outmsgdata[]
+	  // entry layout: length, target core, then `length` payload words
+	  outmsgleft = outmsgdata[outmsgindex++];
+	  int target = outmsgdata[outmsgindex++];
+	  calCoords(target, &target_x, &target_y);
+	  // mark to start sending the msg
+	  isMsgSending = true;
+	  // Build the message header
+	  msgHdr = construct_dyn_hdr(0, outmsgleft, 0,                        // msgsize word sent.
+                                 self_y, self_x,
+                                 target_y, target_x);
+	  gdn_send(msgHdr);                           
+#ifdef DEBUG
+	  BAMBOO_DEBUGPRINT(0xbbbb);
+	  BAMBOO_DEBUGPRINT(0xb000 + target);             // targetcore
+#endif
+	  while(outmsgleft-- > 0) {
+		  gdn_send(outmsgdata[outmsgindex++]);
+#ifdef DEBUG
+		  BAMBOO_DEBUGPRINT_REG(outmsgdata[outmsgindex - 1]);
+#endif
+	  }
+#ifdef DEBUG
+	  BAMBOO_DEBUGPRINT(0xffff);
+#endif
+	  // mark to end sending the msg
+	  isMsgSending = false;
+	  BAMBOO_START_CRITICAL_SECTION_MSG();
+	  // inside the msg critical section: if the read index caught up with the write index, reset the queue
+	  if(outmsgindex == outmsglast) {
+		  // no more msgs
+		  outmsgindex = outmsglast = 0;
+		  isMsgHanging = false;
+	  }
+	  BAMBOO_CLOSE_CRITICAL_SECTION_MSG();
+  }
+}
+
+__attribute__((always_inline)) inline void cache_msg_2 (int targetcore, 
+		                                                    int n0, 
+																												int n1) {
+  // Queue the 2-word msg (n0, n1) for targetcore in outmsgdata[] instead of sending it now
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xdede);
+#endif
+  isMsgHanging = true;  // makes the drain loop in send_msg_N() pick the entry up
+  // append to outmsgdata[]; NOTE(review): outmsglast is not bounds-checked here
+  // entry layout: msglength + target core + msg
+  outmsgdata[outmsglast++] = 2;
+  outmsgdata[outmsglast++] = targetcore;
+  outmsgdata[outmsglast++] = n0;
+  outmsgdata[outmsglast++] = n1;
+}
+
+__attribute__((always_inline)) inline void cache_msg_3 (int targetcore, 
+		                                                    int n0, 
+																												int n1, 
+																												int n2) {
+  // Queue the 3-word msg (n0..n2) for targetcore in outmsgdata[] instead of sending it now
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xdede);
+#endif
+  isMsgHanging = true;  // makes the drain loop in send_msg_N() pick the entry up
+  // append to outmsgdata[]; NOTE(review): outmsglast is not bounds-checked here
+  // entry layout: msglength + target core + msg
+  outmsgdata[outmsglast++] = 3;
+  outmsgdata[outmsglast++] = targetcore;
+  outmsgdata[outmsglast++] = n0;
+  outmsgdata[outmsglast++] = n1;
+  outmsgdata[outmsglast++] = n2;
+}
+
+__attribute__((always_inline)) inline void cache_msg_4 (int targetcore, 
+		                                                    int n0, 
+																												int n1, 
+																												int n2, 
+																												int n3) {
+  // Queue the 4-word msg (n0..n3) for targetcore in outmsgdata[] instead of sending it now
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xdede);
+#endif
+  isMsgHanging = true;  // makes the drain loop in send_msg_N() pick the entry up
+  // append to outmsgdata[]; NOTE(review): outmsglast is not bounds-checked here
+  // entry layout: msglength + target core + msg
+  outmsgdata[outmsglast++] = 4;
+  outmsgdata[outmsglast++] = targetcore;
+  outmsgdata[outmsglast++] = n0;
+  outmsgdata[outmsglast++] = n1;
+  outmsgdata[outmsglast++] = n2;
+  outmsgdata[outmsglast++] = n3;
+}
+
+__attribute__((always_inline)) inline void cache_msg_5 (int targetcore, 
+		                                                    int n0, 
+																												int n1, 
+																												int n2, 
+																												int n3, 
+																												int n4) {
+  // Queue the 5-word msg (n0..n4) for targetcore in outmsgdata[] instead of sending it now
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xdede);
+#endif
+  isMsgHanging = true;  // makes the drain loop in send_msg_N() pick the entry up
+  // append to outmsgdata[]; NOTE(review): outmsglast is not bounds-checked here
+  // entry layout: msglength + target core + msg
+  outmsgdata[outmsglast++] = 5;
+  outmsgdata[outmsglast++] = targetcore;
+  outmsgdata[outmsglast++] = n0;
+  outmsgdata[outmsglast++] = n1;
+  outmsgdata[outmsglast++] = n2;
+  outmsgdata[outmsglast++] = n3;
+  outmsgdata[outmsglast++] = n4;
+}
+
+
+__attribute__((always_inline)) inline void cache_msg_6 (int targetcore, 
+		                                                    int n0, 
+																												int n1, 
+																												int n2, 
+																												int n3, 
+																												int n4, 
+																												int n5) {
+  // Queue the 6-word msg (n0..n5) for targetcore in outmsgdata[] instead of sending it now
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xdede);
+#endif
+  isMsgHanging = true;  // makes the drain loop in send_msg_N() pick the entry up
+  // append to outmsgdata[]; NOTE(review): outmsglast is not bounds-checked here
+  // entry layout: msglength + target core + msg
+  outmsgdata[outmsglast++] = 6;
+  outmsgdata[outmsglast++] = targetcore;
+  outmsgdata[outmsglast++] = n0;
+  outmsgdata[outmsglast++] = n1;
+  outmsgdata[outmsglast++] = n2;
+  outmsgdata[outmsglast++] = n3;
+  outmsgdata[outmsglast++] = n4;
+  outmsgdata[outmsglast++] = n5;
+}
+
+__attribute__((always_inline)) inline int receiveMsg() {  // pull available words into msgdata[]; returns -1 if no input, else msgdataindex
+  if(gdn_input_avail() == 0) {  // nothing waiting on the network
+#ifdef DEBUG
+    if(corenum < NUMCORES) {
+      BAMBOO_DEBUGPRINT(0xd001);
+    }
+#endif
+    return -1;
+  }
+#ifdef PROFILE
+  /*if(isInterrupt && (!interruptInfoOverflow)) {
+     // BAMBOO_DEBUGPRINT(0xffff);
+     interruptInfoArray[interruptInfoIndex] = RUNMALLOC_I(sizeof(struct interrupt_info));
+     interruptInfoArray[interruptInfoIndex]->startTime = raw_get_cycle();
+     interruptInfoArray[interruptInfoIndex]->endTime = -1;
+     }*/
+#endif
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xcccc);
+#endif
+  while((gdn_input_avail() != 0) && (msgdataindex < msglength)) {  // stop when input runs dry or msglength words are stored
+    msgdata[msgdataindex] = gdn_receive();
+    if(msgdataindex == 0) {  // the first word selects the expected msg length
+		if(msgdata[0] > 0xc) {
+			msglength = 3;
+		} else if (msgdata[0] == 0xc) {
+			msglength = 1;
+		} else if(msgdata[0] > 8) {
+			msglength = 4;
+		} else if(msgdata[0] == 8) {
+			msglength = 6;
+		} else if(msgdata[0] > 5) {
+			msglength = 2;
+		} else if (msgdata[0] > 2) {
+			msglength = 4;
+		} else if (msgdata[0] == 2) {
+			msglength = 5;
+		} else if (msgdata[0] > 0) {
+			msglength = 4;
+		}  // first word 0 (or negative): msglength is left as it was on entry
+    } else if((msgdataindex == 1) && (msgdata[0] == 0)) {  // first word 0: the second word carries the length
+      msglength = msgdata[msgdataindex];
+    }
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT_REG(msgdata[msgdataindex]);
+#endif
+    msgdataindex++;
+  }
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xffff);
+#endif
+  return msgdataindex;  // count of words stored so far; msgdataindex is not reset in this function
+}
+
+bool getreadlock(void * ptr) {  // request a read lock for object ptr; every path returns true, the outcome is in lockflag/lockresult
+  int targetcore = 0;
+  lockobj = (int)ptr;
+  if(((struct ___Object___ *)ptr)->lock == NULL) {
+	lock2require = lockobj;  // no lock field set: the object's own address is used as the lock id
+  } else {
+	lock2require = (int)(((struct ___Object___ *)ptr)->lock);
+  }
+  targetcore = (lock2require >> 5) % NUMCORES;  // core on which this lock resides
+  lockflag = false;
+#ifndef INTERRUPT
+  reside = false;
+#endif
+  lockresult = 0;
+
+  if(targetcore == BAMBOO_NUM_OF_CORE) {
+    // the lock resides on this core: handle the request locally inside the lock critical section
+    int deny = 0;
+	BAMBOO_START_CRITICAL_SECTION_LOCK();
+#ifdef DEBUG
+	BAMBOO_DEBUGPRINT(0xf001);
+#endif
+	deny = processlockrequest(0, lock2require, (int)ptr, BAMBOO_NUM_OF_CORE, BAMBOO_NUM_OF_CORE, false);
+	BAMBOO_CLOSE_CRITICAL_SECTION_LOCK();
+#ifdef DEBUG
+	BAMBOO_DEBUGPRINT(0xf000);
+#endif
+    if(deny == -1) {
+		// redirected: processlockrequest() already passed the request on via getreadlock_I_r()
+		return true;
+	} else {
+		if(lockobj == (int)ptr) {
+			if(deny) {
+				lockresult = 0;  // denied
+			} else {
+				lockresult = 1;  // granted
+			}
+			lockflag = true;  // a result is now available
+#ifndef INTERRUPT
+			reside = true;
+#endif
+		} else {
+			// lockobj changed while the request was processed: conflicts on lockresults
+			BAMBOO_EXIT(0xa018);
+		}
+	}
+    return true;
+  } else {
+	  // the lock resides on another core: send a lock request msg (lockflag stays false here)
+	  // for 32 bit machine, the size is always 5 words
+	  send_msg_5(targetcore, LOCKREQUEST, 0, (int)ptr, 
+				       lock2require, BAMBOO_NUM_OF_CORE);
+  }
+  return true;
+}
+
+bool getreadlock_I_r(void * ptr, void * redirectlock, int core, bool cache);  // forward declaration: redirected read-lock request
+bool getwritelock_I_r(void* lock, void* redirectlock, int core, bool cache);  // forward declaration: redirected write-lock request
+
+void releasereadlock(void * ptr) {  // release one read lock held on object ptr
+  int targetcore = 0;
+  int reallock = 0;
+  if(((struct ___Object___ *)ptr)->lock == NULL) {
+	reallock = (int)ptr;  // no lock field set: the object's own address is the lock id
+  } else {
+	reallock = (int)(((struct ___Object___ *)ptr)->lock);
+  }
+  targetcore = (reallock >> 5) % NUMCORES;  // core on which this lock resides
+
+  if(targetcore == BAMBOO_NUM_OF_CORE) {
+	BAMBOO_START_CRITICAL_SECTION_LOCK();
+#ifdef DEBUG
+	BAMBOO_DEBUGPRINT(0xf001);
+#endif
+    // the lock resides on this core: update locktbl directly
+    if(!RuntimeHashcontainskey(locktbl, reallock)) {
+      // no locks for this object, something is wrong
+      BAMBOO_EXIT(0xa019);
+    } else {
+      int rwlock_obj = 0;
+	  struct LockValue * lockvalue = NULL;
+      RuntimeHashget(locktbl, reallock, &rwlock_obj);
+	  lockvalue = (struct LockValue *)rwlock_obj;  // the table stores the LockValue pointer as an int
+      lockvalue->value--;  // read locks are counted upwards, so a release decrements
+    }
+	BAMBOO_CLOSE_CRITICAL_SECTION_LOCK();
+#ifdef DEBUG
+	BAMBOO_DEBUGPRINT(0xf000);
+#endif
+    return;
+  } else {
+	// the lock resides on another core: send a lock release msg (second word 0 = read lock)
+	// for 32 bit machine, the size is always 4 words
+	send_msg_4(targetcore, LOCKRELEASE, 0, (int)ptr, reallock);
+  }
+}
+
+// redirected read-lock request for object ptr on lock redirectlock; core is the root requesting core, cache selects cache_msg_N() while a send is in progress
+bool getreadlock_I_r(void * ptr, void * redirectlock, int core, bool cache) {
+  int targetcore = 0;
+  
+  if(core == BAMBOO_NUM_OF_CORE) {
+	  // this core is the root requester: (re)initialise its pending-request state
+	  lockobj = (int)ptr;
+	  lock2require = (int)redirectlock;
+	  lockflag = false;
+#ifndef INTERRUPT
+	  reside = false;
+#endif
+	  lockresult = 0;
+  }  
+  targetcore = ((int)redirectlock >> 5) % NUMCORES;  // core on which the redirect lock resides
+  
+  if(targetcore == BAMBOO_NUM_OF_CORE) {
+    // reside on this core
+    int deny = processlockrequest(0, (int)redirectlock, (int)ptr, BAMBOO_NUM_OF_CORE, core, cache);
+	if(deny == -1) {
+		// redirected again; processlockrequest() already passed it on
+		return true;
+	} else {
+		if(core == BAMBOO_NUM_OF_CORE) {
+			if(lockobj == (int)ptr) {
+				if(deny) {
+					lockresult = 0;  // denied
+				} else {
+					lockresult = 1;  // granted: remember which lock this object was redirected to
+					RuntimeHashadd_I(objRedirectLockTbl, (int)ptr, (int)redirectlock);
+				}
+				lockflag = true;
+#ifndef INTERRUPT
+				reside = true;
+#endif
+			} else {
+				// conflicts on lockresults
+				BAMBOO_EXIT(0xa01a);
+			}
+			return true;
+		} else {
+			// send lock grant/deny request to the root requiring core
+			// send directly unless caching was requested and a send is currently in progress
+			if((!cache) || (cache && !isMsgSending)) {
+				send_msg_4(core, deny==1?REDIRECTDENY:REDIRECTGROUNT, 0, 
+						       (int)ptr, (int)redirectlock);
+			} else {
+				cache_msg_4(core, deny==1?REDIRECTDENY:REDIRECTGROUNT, 0, 
+						        (int)ptr, (int)redirectlock);
+			}
+		}
+	}
+  } else {
+	// redirect the lock request; forward redirectlock itself (as getwritelock_I_r does): lock2require only matches it when core is this core
+	// for 32 bit machine, the size is always 6 words
+	if((!cache) || (cache && !isMsgSending)) {
+		send_msg_6(targetcore, REDIRECTLOCK, 0, (int)ptr, (int)redirectlock, 
+				       core, BAMBOO_NUM_OF_CORE);
+	} else {
+		cache_msg_6(targetcore, REDIRECTLOCK, 0, (int)ptr, (int)redirectlock, 
+				        core, BAMBOO_NUM_OF_CORE);
+	}
+  }
+  return true;
+}
+
+// not reentrant: request a write lock for object ptr; every path returns true, the outcome is in lockflag/lockresult
+bool getwritelock(void * ptr) {
+  int targetcore = 0;
+
+  // for 32 bit machine, the size is always 5 words
+  //int msgsize = 5;
+
+  lockobj = (int)ptr;
+  if(((struct ___Object___ *)ptr)->lock == NULL) {
+	lock2require = lockobj;  // no lock field set: the object's own address is used as the lock id
+  } else {
+	lock2require = (int)(((struct ___Object___ *)ptr)->lock);
+  }
+  targetcore = (lock2require >> 5) % NUMCORES;  // core on which this lock resides
+  lockflag = false;
+#ifndef INTERRUPT
+  reside = false;
+#endif
+  lockresult = 0;
+
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xe551);
+  BAMBOO_DEBUGPRINT_REG(lockobj);
+  BAMBOO_DEBUGPRINT_REG(lock2require);
+  BAMBOO_DEBUGPRINT_REG(targetcore);
+#endif
+
+  if(targetcore == BAMBOO_NUM_OF_CORE) {
+    // the lock resides on this core: handle the request locally inside the lock critical section
+    int deny = 0;
+	BAMBOO_START_CRITICAL_SECTION_LOCK();
+#ifdef DEBUG
+	BAMBOO_DEBUGPRINT(0xf001);
+#endif
+	deny = processlockrequest(1, lock2require, (int)ptr, BAMBOO_NUM_OF_CORE, BAMBOO_NUM_OF_CORE, false);
+	BAMBOO_CLOSE_CRITICAL_SECTION_LOCK();
+#ifdef DEBUG
+	BAMBOO_DEBUGPRINT(0xf000);
+#endif
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xe555);
+    BAMBOO_DEBUGPRINT_REG(lockresult);
+#endif
+    if(deny == -1) {
+		// redirected: processlockrequest() already passed the request on via getwritelock_I_r()
+		return true;
+	} else {
+		if(lockobj == (int)ptr) {
+			if(deny) {
+				lockresult = 0;  // denied
+			} else {
+				lockresult = 1;  // granted
+			}
+			lockflag = true;  // a result is now available
+#ifndef INTERRUPT
+			reside = true;
+#endif
+		} else {
+			// lockobj changed while the request was processed: conflicts on lockresults
+			BAMBOO_EXIT(0xa01b);
+		}
+	}
+    return true;
+  } else {
+	  // the lock resides on another core: send a lock request msg (second word 1 = write lock)
+	  // for 32 bit machine, the size is always 5 words
+	  send_msg_5(targetcore, LOCKREQUEST, 1, (int)ptr, lock2require, 
+				       BAMBOO_NUM_OF_CORE);
+  }
+  return true;
+}
+
+void releasewritelock(void * ptr) {  // release the write lock held on object ptr
+  int targetcore = 0;
+  int reallock = 0;
+  if(((struct ___Object___ *)ptr)->lock == NULL) {
+	reallock = (int)ptr;  // no lock field set: the object's own address is the lock id
+  } else {
+	reallock = (int)(((struct ___Object___ *)ptr)->lock);
+  }
+  targetcore = (reallock >> 5) % NUMCORES;  // core on which this lock resides
+
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xe661);
+  BAMBOO_DEBUGPRINT_REG((int)ptr);
+  BAMBOO_DEBUGPRINT_REG(reallock);
+  BAMBOO_DEBUGPRINT_REG(targetcore);
+#endif
+
+  if(targetcore == BAMBOO_NUM_OF_CORE) {
+	BAMBOO_START_CRITICAL_SECTION_LOCK();
+#ifdef DEBUG
+	BAMBOO_DEBUGPRINT(0xf001);
+#endif
+    // the lock resides on this core: update locktbl directly
+    if(!RuntimeHashcontainskey(locktbl, reallock)) {
+      // no locks for this object, something is wrong
+      BAMBOO_EXIT(0xa01c);
+    } else {
+      int rwlock_obj = 0;
+	  struct LockValue * lockvalue = NULL;
+      RuntimeHashget(locktbl, reallock, &rwlock_obj);
+	  lockvalue = (struct LockValue *)rwlock_obj;  // the table stores the LockValue pointer as an int
+      lockvalue->value++;  // a held write lock is stored as -1, so the release increments
+    }
+	BAMBOO_CLOSE_CRITICAL_SECTION_LOCK();
+#ifdef DEBUG
+	BAMBOO_DEBUGPRINT(0xf000);
+#endif
+    return;
+  } else {
+	// the lock resides on another core: send a lock release msg (second word 1 = write lock)
+	// for 32 bit machine, the size is always 4 words
+	send_msg_4(targetcore, LOCKRELEASE, 1, (int)ptr, reallock);
+  }
+}
+
+bool getwritelock_I(void * ptr) {  // like getwritelock() but without BAMBOO_*_CRITICAL_SECTION_LOCK; presumably for callers already in a critical section -- TODO confirm
+  int targetcore = 0;
+  lockobj = (int)ptr;
+  if(((struct ___Object___ *)ptr)->lock == NULL) {
+	lock2require = lockobj;  // no lock field set: the object's own address is used as the lock id
+  } else {
+	lock2require = (int)(((struct ___Object___ *)ptr)->lock);
+  }
+  targetcore = (lock2require >> 5) % NUMCORES;  // core on which this lock resides
+  lockflag = false;
+#ifndef INTERRUPT
+  reside = false;
+#endif
+  lockresult = 0;
+
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xe561);
+  BAMBOO_DEBUGPRINT_REG(lockobj);
+  BAMBOO_DEBUGPRINT_REG(lock2require);
+  BAMBOO_DEBUGPRINT_REG(targetcore);
+#endif
+
+  if(targetcore == BAMBOO_NUM_OF_CORE) {
+    // the lock resides on this core: handle the request locally
+	int deny = processlockrequest(1, (int)lock2require, (int)ptr, BAMBOO_NUM_OF_CORE, BAMBOO_NUM_OF_CORE, false);
+	if(deny == -1) {
+		// redirected: processlockrequest() already passed the request on via getwritelock_I_r()
+		return true;
+	} else {
+		if(lockobj == (int)ptr) {
+			if(deny) {
+				lockresult = 0;  // denied
+#ifdef DEBUG
+				BAMBOO_DEBUGPRINT(0);
+#endif
+			} else {
+				lockresult = 1;  // granted
+#ifdef DEBUG
+				BAMBOO_DEBUGPRINT(1);
+#endif
+			}
+			lockflag = true;  // a result is now available
+#ifndef INTERRUPT
+			reside = true;
+#endif
+		} else {
+			// lockobj changed while the request was processed: conflicts on lockresults
+			BAMBOO_EXIT(0xa01e);
+		}
+		return true;
+	}
+  } else {
+	  // the lock resides on another core: send a lock request msg (second word 1 = write lock)
+	  // for 32 bit machine, the size is always 5 words
+	  send_msg_5(targetcore, LOCKREQUEST, 1, (int)ptr, lock2require, 
+				       BAMBOO_NUM_OF_CORE);
+  }
+  return true;
+}
+
+// redirected write-lock request for object ptr on lock redirectlock; core is the root requesting core, cache selects cache_msg_N() while a send is in progress
+bool getwritelock_I_r(void * ptr, void * redirectlock, int core, bool cache) {
+  int targetcore = 0;
+
+  if(core == BAMBOO_NUM_OF_CORE) {
+	  // this core is the root requester: (re)initialise its pending-request state
+	  lockobj = (int)ptr;
+	  lock2require = (int)redirectlock;
+	  lockflag = false;
+#ifndef INTERRUPT
+	  reside = false;
+#endif
+	  lockresult = 0;
+  }
+  targetcore = ((int)redirectlock >> 5) % NUMCORES;  // core on which the redirect lock resides
+
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xe571);
+  BAMBOO_DEBUGPRINT_REG((int)ptr);
+  BAMBOO_DEBUGPRINT_REG((int)redirectlock);
+  BAMBOO_DEBUGPRINT_REG(core);
+  BAMBOO_DEBUGPRINT_REG((int)cache);
+  BAMBOO_DEBUGPRINT_REG(targetcore);
+#endif
+
+
+  if(targetcore == BAMBOO_NUM_OF_CORE) {
+    // reside on this core
+	int deny = processlockrequest(1, (int)redirectlock, (int)ptr, BAMBOO_NUM_OF_CORE, core, cache);
+	if(deny == -1) {
+		// redirected again; processlockrequest() already passed it on
+		return true;
+	} else {
+		if(core == BAMBOO_NUM_OF_CORE) {
+			if(lockobj == (int)ptr) {
+				if(deny) {
+					lockresult = 0;  // denied
+				} else {
+					lockresult = 1;  // granted: remember which lock this object was redirected to
+					RuntimeHashadd_I(objRedirectLockTbl, (int)ptr, (int)redirectlock);
+				}
+				lockflag = true;
+#ifndef INTERRUPT
+				reside = true;
+#endif
+			} else {
+				// conflicts on lockresults
+				BAMBOO_EXIT(0xa01f);
+			}
+			return true;
+		} else {
+			// send lock grant/deny request to the root requiring core
+			// send directly unless caching was requested and a send is currently in progress
+			if((!cache) || (cache && !isMsgSending)) {
+				send_msg_4(core, deny==1?REDIRECTDENY:REDIRECTGROUNT, 1, 
+							     (int)ptr, (int)redirectlock);
+			} else {
+				cache_msg_4(core, deny==1?REDIRECTDENY:REDIRECTGROUNT, 1, 
+						        (int)ptr, (int)redirectlock);
+			}
+		}
+	}
+  } else {
+	// the redirect lock resides on another core: forward the request there
+	// for 32 bit machine, the size is always 6 words
+	if((!cache) || (cache && !isMsgSending)) {
+		send_msg_6(targetcore, REDIRECTLOCK, 1, (int)ptr, (int)redirectlock, 
+				       core, BAMBOO_NUM_OF_CORE);
+	} else {
+		cache_msg_6(targetcore, REDIRECTLOCK, 1, (int)ptr, (int)redirectlock, 
+				        core, BAMBOO_NUM_OF_CORE);
+	}
+  }
+  return true;
+}
+
+void releasewritelock_I(void * ptr) {  // like releasewritelock() but without BAMBOO_*_CRITICAL_SECTION_LOCK; presumably for callers already in a critical section -- TODO confirm
+  int targetcore = 0;
+  int reallock = 0;
+  if(((struct ___Object___ *)ptr)->lock == NULL) {
+	reallock = (int)ptr;  // no lock field set: the object's own address is the lock id
+  } else {
+	reallock = (int)(((struct ___Object___ *)ptr)->lock);
+  }
+  targetcore = (reallock >> 5) % NUMCORES;  // core on which this lock resides
+
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xe681);
+  BAMBOO_DEBUGPRINT_REG((int)ptr);
+  BAMBOO_DEBUGPRINT_REG(reallock);
+  BAMBOO_DEBUGPRINT_REG(targetcore);
+#endif
+
+  if(targetcore == BAMBOO_NUM_OF_CORE) {
+    // the lock resides on this core: update locktbl directly
+    if(!RuntimeHashcontainskey(locktbl, reallock)) {
+      // no locks for this object, something is wrong
+      BAMBOO_EXIT(0xa020);
+    } else {
+      int rwlock_obj = 0;
+	  struct LockValue * lockvalue = NULL;
+      RuntimeHashget(locktbl, reallock, &rwlock_obj);
+	  lockvalue = (struct LockValue *)rwlock_obj;  // the table stores the LockValue pointer as an int
+      lockvalue->value++;  // a held write lock is stored as -1, so the release increments
+    }
+    return;
+  } else {
+	// the lock resides on another core: send a lock release msg (second word 1 = write lock)
+	// for 32 bit machine, the size is always 4 words
+	send_msg_4(targetcore, LOCKRELEASE, 1, (int)ptr, reallock);
+  }
+}
+
+void releasewritelock_I_r(void * lock, void * redirectlock) {  // release write lock `lock` and record redirectlock as its redirect target
+  int targetcore = 0;
+  int reallock = (int)lock;
+  targetcore = (reallock >> 5) % NUMCORES;  // core on which this lock resides
+
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xe691);
+  BAMBOO_DEBUGPRINT_REG((int)lock);
+  BAMBOO_DEBUGPRINT_REG(reallock);
+  BAMBOO_DEBUGPRINT_REG(targetcore);
+#endif
+
+  if(targetcore == BAMBOO_NUM_OF_CORE) {
+    // the lock resides on this core: update locktbl directly
+    if(!RuntimeHashcontainskey(locktbl, reallock)) {
+      // no locks for this object, something is wrong
+      BAMBOO_EXIT(0xa021);
+    } else {
+      int rwlock_obj = 0;
+	  struct LockValue * lockvalue = NULL;
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT(0xe692);
+#endif
+      RuntimeHashget(locktbl, reallock, &rwlock_obj);
+	  lockvalue = (struct LockValue *)rwlock_obj;  // the table stores the LockValue pointer as an int
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT_REG(lockvalue->value);
+#endif
+      lockvalue->value++;  // a held write lock is stored as -1, so the release increments
+	  lockvalue->redirectlock = (int)redirectlock;  // later requests for this lock get redirected (see processlockrequest)
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT_REG(lockvalue->value);
+#endif
+    }
+    return;
+  } else {
+	// the lock resides on another core: send a redirect-release msg carrying both lock ids
+	// for 32 bit machine, the size is always 4 words
+	send_msg_4(targetcore, REDIRECTRELEASE, 1, (int)lock, (int)redirectlock);
+  }
+}
+
+/* Process a read (locktype 0) or write (locktype != 0) request for `lock` on behalf of object `obj`.
+ * NOTE(review): the get*lock functions in this file call it directly, not only receiveObject() */
+// if return -1: the lock request is redirected
+//            0: the lock request is approved
+//            1: the lock request is denied
+__attribute__((always_inline)) int processlockrequest(int locktype, int lock, int obj, int requestcore, int rootrequestcore, bool cache) {
+  int deny = 0;
+  if( ((lock >> 5) % NUMCORES) != BAMBOO_NUM_OF_CORE ) {
+	  // the lock does not reside on this core: print diagnostics and exit
+#ifndef TILERA
+	  BAMBOO_DEBUGPRINT_REG(requestcore);
+	  BAMBOO_DEBUGPRINT_REG(lock);
+	  BAMBOO_DEBUGPRINT_REG(BAMBOO_NUM_OF_CORE);
+#endif
+	  BAMBOO_EXIT(0xa017);
+  }
+  if(!RuntimeHashcontainskey(locktbl, lock)) {
+	  // no locks for this object
+	  // first time to operate on this shared object
+	  // create a lock entry for it and grant the request right away
+	  // the lock is an integer: 0 -- stall, >0 -- read lock, -1 -- write lock
+	  struct LockValue * lockvalue = (struct LockValue *)(RUNMALLOC_I(sizeof(struct LockValue)));
+	  lockvalue->redirectlock = 0;
+#ifdef DEBUG
+#ifndef TILERA
+	  BAMBOO_DEBUGPRINT(0xe110);
+#endif
+#endif
+	  if(locktype == 0) {
+		  lockvalue->value = 1;
+	  } else {
+		  lockvalue->value = -1;
+	  }
+	  RuntimeHashadd_I(locktbl, lock, (int)lockvalue);  // the LockValue pointer is stored as an int
+  } else {
+	  int rwlock_obj = 0;
+	  struct LockValue * lockvalue = NULL;
+#ifdef DEBUG
+#ifndef TILERA
+	  BAMBOO_DEBUGPRINT(0xe111);
+#endif
+#endif
+	  RuntimeHashget(locktbl, lock, &rwlock_obj);
+	  lockvalue = (struct LockValue *)(rwlock_obj);
+#ifdef DEBUG
+#ifndef TILERA
+	  BAMBOO_DEBUGPRINT_REG(lockvalue->redirectlock);
+#endif
+#endif
+	  if(lockvalue->redirectlock != 0) {
+		  // this lock is redirected: pass the request on for the redirect lock instead
+#ifdef DEBUG
+#ifndef TILERA
+		  BAMBOO_DEBUGPRINT(0xe112);
+#endif
+#endif
+		  if(locktype == 0) {
+			  getreadlock_I_r((void *)obj, (void *)lockvalue->redirectlock, rootrequestcore, cache);
+		  } else {
+			  getwritelock_I_r((void *)obj, (void *)lockvalue->redirectlock, rootrequestcore, cache);
+		  }
+		  return -1;  // redirected
+	  } else {
+#ifdef DEBUG
+#ifndef TILERA
+		  BAMBOO_DEBUGPRINT_REG(lockvalue->value);
+#endif
+#endif
+		  if(0 == lockvalue->value) {  // value 0: nobody holds the lock, grant it
+			  if(locktype == 0) {
+				  lockvalue->value = 1;
+			  } else {
+				  lockvalue->value = -1;
+			  }
+		  } else if((lockvalue->value > 0) && (locktype == 0)) {
+			  // read lock request and there are only read locks
+			  lockvalue->value++;
+		  } else {
+			  deny = 1;  // a write is requested while the lock is held, or a read while it is write-locked
+		  }
+#ifdef DEBUG
+#ifndef TILERA
+		  BAMBOO_DEBUGPRINT_REG(lockvalue->value);
+#endif
+#endif
+	  }
+  }
+  return deny;
+}
+
+__attribute__((always_inline)) void processlockrelease(int locktype, int lock, int redirectlock, bool isredirect) {  // release `lock` in locktbl; locktype 0 = read, otherwise write
+	if(!RuntimeHashcontainskey(locktbl, lock)) {
+    // no locks for this object, something is wrong: exit
+#ifdef DEBUG
+		BAMBOO_DEBUGPRINT_REG(lock);
+#endif
+		BAMBOO_EXIT(0xa00b);
+	} else {
+		int rwlock_obj = 0;
+		struct LockValue * lockvalue = NULL;
+		RuntimeHashget(locktbl, lock, &rwlock_obj);
+		lockvalue = (struct LockValue*)(rwlock_obj);  // the table stores the LockValue pointer as an int
+#ifdef DEBUG
+		BAMBOO_DEBUGPRINT(0xe884);
+		BAMBOO_DEBUGPRINT_REG(lockvalue->value);
+#endif
+		if(locktype == 0) {
+			lockvalue->value--;  // read release: one reader fewer
+		} else {
+			lockvalue->value++;  // write release: a held write lock is stored as -1
+		}
+#ifdef DEBUG
+		BAMBOO_DEBUGPRINT_REG(lockvalue->value);
+#endif
+		if(isredirect) {
+			lockvalue->redirectlock = redirectlock;  // later requests for this lock get redirected (see processlockrequest)
+		}
+	}
+}
+
+#ifdef PROFILE
+__attribute__((always_inline)) inline void profileTaskStart(char * taskname) {  // open a profiling record for taskname at taskInfoArray[taskInfoIndex]
+  if(!taskInfoOverflow) {  // once the array has overflowed, no further records are created
+	  TaskInfo* taskInfo = RUNMALLOC(sizeof(struct task_info));  // NOTE(review): result is used unchecked -- confirm RUNMALLOC cannot return NULL
+	  taskInfoArray[taskInfoIndex] = taskInfo;
+	  taskInfo->taskName = taskname;  // the pointer is stored, the string is not copied
+	  taskInfo->startTime = raw_get_cycle();
+	  taskInfo->endTime = -1;  // filled in by profileTaskEnd()
+	  taskInfo->exitIndex = -1;
+	  taskInfo->newObjs = NULL;
+  }
+}
+
+__attribute__((always_inline)) inline void profileTaskEnd() {  // close the record opened by profileTaskStart() and advance taskInfoIndex
+  if(!taskInfoOverflow) {
+	  taskInfoArray[taskInfoIndex]->endTime = raw_get_cycle();  // expects profileTaskStart() to have filled this slot
+	  taskInfoIndex++;
+	  if(taskInfoIndex == TASKINFOLENGTH) {  // array is full: stop recording from now on
+		  taskInfoOverflow = true;
+	  }
+  }
+}
+
+// output the profiling data: a per-core file when USEIO is defined, BAMBOO_DEBUGPRINT codes otherwise
+void outputProfileData() {
+#ifdef USEIO
+  FILE * fp;
+  char fn[50];
+  int self_y, self_x;
+  char c_y, c_x;
+  int i;
+  int totaltasktime = 0;
+  int preprocessingtime = 0;
+  int objqueuecheckingtime = 0;
+  int postprocessingtime = 0;
+  //int interruptiontime = 0;
+  int other = 0;
+  int averagetasktime = 0;
+  int tasknum = 0;
+
+  for(i = 0; i < 50; i++) {
+    fn[i] = 0;
+  }
+
+  calCoords(corenum, &self_y, &self_x);  // NOTE(review): other callers pass (&x, &y) in that order -- confirm the swap is intended
+  c_y = (char)self_y + '0';  // only yields a digit for coordinates 0..9
+  c_x = (char)self_x + '0';
+  // Build "profile_<x>_<y>.rst" in one bounded call. The previous code passed
+  // the addresses of the single chars c_x / c_y to strcat(), which expects
+  // NUL-terminated strings, so it could read past those variables.
+  // fn (50 bytes) is ample for the 15-character name plus terminator.
+  snprintf(fn, sizeof(fn), "profile_%c_%c.rst", c_x, c_y);
+
+  if((fp = fopen(fn, "w+")) == NULL) {
+    fprintf(stderr, "fopen error\n");
+    return;
+  }
+
+  fprintf(fp, "Task Name, Start Time, End Time, Duration, Exit Index(, NewObj Name, Num)+\n");
+  // output task related info, one line per recorded task
+  for(i = 0; i < taskInfoIndex; i++) {
+    TaskInfo* tmpTInfo = taskInfoArray[i];
+    int duration = tmpTInfo->endTime - tmpTInfo->startTime;
+    fprintf(fp, "%s, %d, %d, %d, %d", tmpTInfo->taskName, tmpTInfo->startTime, tmpTInfo->endTime, duration, tmpTInfo->exitIndex);
+	// summarize new obj info: count how often each object type name was recorded
+	if(tmpTInfo->newObjs != NULL) {
+		struct RuntimeHash * nobjtbl = allocateRuntimeHash(5);  // NOTE(review): never freed in this function
+		struct RuntimeIterator * iter = NULL;
+		while(0 == isEmpty(tmpTInfo->newObjs)) {
+			char * objtype = (char *)(getItem(tmpTInfo->newObjs));
+			if(RuntimeHashcontainskey(nobjtbl, (int)(objtype))) {
+				int num = 0;
+				RuntimeHashget(nobjtbl, (int)objtype, &num);
+				RuntimeHashremovekey(nobjtbl, (int)objtype);
+				num++;
+				RuntimeHashadd(nobjtbl, (int)objtype, num);
+			} else {
+				RuntimeHashadd(nobjtbl, (int)objtype, 1);
+			}
+			//fprintf(stderr, "new obj!\n");
+		}
+
+		// output all new obj info
+		iter = RuntimeHashcreateiterator(nobjtbl);
+		while(RunhasNext(iter)) {
+			char * objtype = (char *)Runkey(iter);
+			int num = Runnext(iter);
+			fprintf(fp, ", %s, %d", objtype, num);
+		}
+	}
+	fprintf(fp, "\n");
+    if(strcmp(tmpTInfo->taskName, "tpd checking") == 0) {
+      preprocessingtime += duration;
+    } else if(strcmp(tmpTInfo->taskName, "post task execution") == 0) {
+      postprocessingtime += duration;
+    } else if(strcmp(tmpTInfo->taskName, "objqueue checking") == 0) {
+      objqueuecheckingtime += duration;
+    } else {
+      totaltasktime += duration;
+      averagetasktime += duration;
+      tasknum++;
+    }
+  }
+
+  if(taskInfoOverflow) {
+    fprintf(stderr, "Caution: task info overflow!\n");
+  }
+
+  other = totalexetime - totaltasktime - preprocessingtime - postprocessingtime;
+  if(tasknum > 0) { averagetasktime /= tasknum; }  // guard: dividing by zero when no ordinary task was recorded
+
+  fprintf(fp, "\nTotal time: %d\n", totalexetime);
+  fprintf(fp, "Total task execution time: %d (%f%%)\n", totaltasktime, ((double)totaltasktime/(double)totalexetime)*100);
+  fprintf(fp, "Total objqueue checking time: %d (%f%%)\n", objqueuecheckingtime, ((double)objqueuecheckingtime/(double)totalexetime)*100);
+  fprintf(fp, "Total pre-processing time: %d (%f%%)\n", preprocessingtime, ((double)preprocessingtime/(double)totalexetime)*100);
+  fprintf(fp, "Total post-processing time: %d (%f%%)\n", postprocessingtime, ((double)postprocessingtime/(double)totalexetime)*100);
+  fprintf(fp, "Other time: %d (%f%%)\n", other, ((double)other/(double)totalexetime)*100);
+
+  fprintf(fp, "\nAverage task execution time: %d\n", averagetasktime);
+
+  fclose(fp);
+#else
+  int i = 0;
+  int j = 0;
+
+  BAMBOO_DEBUGPRINT(0xdddd);
+  // output task related info: 0xddda, name chars, 0xdddb, start, end, exit index, new-object counts, 0xdddc
+  for(i= 0; i < taskInfoIndex; i++) {
+    TaskInfo* tmpTInfo = taskInfoArray[i];
+    char* tmpName = tmpTInfo->taskName;
+    int nameLen = strlen(tmpName);
+    BAMBOO_DEBUGPRINT(0xddda);
+    for(j = 0; j < nameLen; j++) {
+      BAMBOO_DEBUGPRINT_REG(tmpName[j]);
+    }
+    BAMBOO_DEBUGPRINT(0xdddb);
+    BAMBOO_DEBUGPRINT_REG(tmpTInfo->startTime);
+    BAMBOO_DEBUGPRINT_REG(tmpTInfo->endTime);
+	BAMBOO_DEBUGPRINT_REG(tmpTInfo->exitIndex);
+	if(tmpTInfo->newObjs != NULL) {
+		struct RuntimeHash * nobjtbl = allocateRuntimeHash(5);  // NOTE(review): never freed in this function
+		struct RuntimeIterator * iter = NULL;
+		while(0 == isEmpty(tmpTInfo->newObjs)) {
+			char * objtype = (char *)(getItem(tmpTInfo->newObjs));
+			if(RuntimeHashcontainskey(nobjtbl, (int)(objtype))) {
+				int num = 0;
+				RuntimeHashget(nobjtbl, (int)objtype, &num);
+				RuntimeHashremovekey(nobjtbl, (int)objtype);
+				num++;
+				RuntimeHashadd(nobjtbl, (int)objtype, num);
+			} else {
+				RuntimeHashadd(nobjtbl, (int)objtype, 1);
+			}
+		}
+
+		// output all new obj info
+		iter = RuntimeHashcreateiterator(nobjtbl);
+		while(RunhasNext(iter)) {
+			char * objtype = (char *)Runkey(iter);
+			int num = Runnext(iter);
+			int nameLen = strlen(objtype);
+			BAMBOO_DEBUGPRINT(0xddda);
+			for(j = 0; j < nameLen; j++) {
+				BAMBOO_DEBUGPRINT_REG(objtype[j]);
+			}
+			BAMBOO_DEBUGPRINT(0xdddb);
+			BAMBOO_DEBUGPRINT_REG(num);
+		}
+	}
+    BAMBOO_DEBUGPRINT(0xdddc);
+  }
+
+  if(taskInfoOverflow) {
+    BAMBOO_DEBUGPRINT(0xefee);
+  }
+
+  // output interrupt related info
+  /*for(i = 0; i < interruptInfoIndex; i++) {
+       InterruptInfo* tmpIInfo = interruptInfoArray[i];
+       BAMBOO_DEBUGPRINT(0xddde);
+       BAMBOO_DEBUGPRINT_REG(tmpIInfo->startTime);
+       BAMBOO_DEBUGPRINT_REG(tmpIInfo->endTime);
+       BAMBOO_DEBUGPRINT(0xdddf);
+     }
+
+     if(interruptInfoOverflow) {
+       BAMBOO_DEBUGPRINT(0xefef);
+     }*/
+
+  BAMBOO_DEBUGPRINT(0xeeee);
+#endif
+}
+#endif  // #ifdef PROFILE
+
+#endif // #ifdef TASK
diff --git a/Robust/src/Runtime/bamboo/multicoregarbage.c b/Robust/src/Runtime/bamboo/multicoregarbage.c
new file mode 100644
index 00000000..8ecfb6ee
--- /dev/null
+++ b/Robust/src/Runtime/bamboo/multicoregarbage.c
@@ -0,0 +1,3735 @@
+#ifdef MULTICORE_GC
+#include "runtime.h"
+#include "multicoregarbage.h"
+#include "multicoreruntime.h"
+#include "runtime_arch.h"
+#include "SimpleHash.h"
+#include "GenericHashtable.h"
+#include "ObjectHash.h"
+#include "GCSharedHash.h"
+
+// TODO for profiling the flush phase
+#ifdef GC_PROFILE
+/*int num_mapinforequest;
+int num_markrequest;
+unsigned long long marktime;*/
+#endif
+
+extern int corenum;
+extern struct parameterwrapper ** objectqueues[][NUMCLASSES];
+extern int numqueues[][NUMCLASSES];
+
+extern struct genhashtable * activetasks;
+extern struct parameterwrapper ** objectqueues[][NUMCLASSES];
+extern struct taskparamdescriptor *currtpd;
+
+extern struct LockValue runtime_locks[MAXTASKPARAMS];
+extern int runtime_locklen;
+
+#ifdef SMEMM
+extern unsigned int gcmem_mixed_threshold;
+extern unsigned int gcmem_mixed_usedmem;
+#endif
+
+struct pointerblock {         // one chunk of the linked pointer queue used below
+  void * ptrs[NUMPTRS];       // queued pointers, filled from index 0 upwards
+  struct pointerblock *next;  // chunk that follows this one in the queue
+};
+
+struct pointerblock *gchead=NULL;   // chunk gc_enqueue_I() writes into
+int gcheadindex=0;                  // next free slot in gchead->ptrs
+struct pointerblock *gctail=NULL;   // chunk gc_dequeue_I() reads from
+int gctailindex=0;                  // next unread slot in gctail->ptrs
+struct pointerblock *gctail2=NULL;  // read cursor of the non-freeing gc_dequeue2_I()
+int gctailindex2=0;                 // next unread slot in gctail2->ptrs
+struct pointerblock *gcspare=NULL;  // one retired chunk kept for reuse by gc_enqueue_I()
+
+#define NUMLOBJPTRS 20  // records per chunk of the large-object queue
+
+struct lobjpointerblock {        // one chunk of the doubly linked large-object queue
+  void * lobjs[NUMLOBJPTRS];     // start address of each queued large object
+  //void * dsts[NUMLOBJPTRS];
+  int lengths[NUMLOBJPTRS];      // length recorded for the object in the same slot
+  //void * origs[NUMLOBJPTRS];
+  int hosts[NUMLOBJPTRS];        // host core recorded for the object in the same slot
+  struct lobjpointerblock *next; // following chunk (towards the head)
+  struct lobjpointerblock *prev; // preceding chunk (towards the tail)
+};
+
+struct lobjpointerblock *gclobjhead=NULL;   // chunk gc_lobjenqueue_I() writes into
+int gclobjheadindex=0;                      // next free slot in gclobjhead
+struct lobjpointerblock *gclobjtail=NULL;   // chunk gc_lobjdequeue_I() reads from
+int gclobjtailindex=0;                      // next unread slot in gclobjtail
+struct lobjpointerblock *gclobjtail2=NULL;  // cursor of the non-freeing dequeue2/3/4 variants
+int gclobjtailindex2=0;                     // slot position of that cursor
+struct lobjpointerblock *gclobjspare=NULL;  // one retired chunk kept for reuse
+
+#ifdef GC_DEBUG
+// dump whole shared mem, 16 words per printed line, labelled by block/sblock
+inline void dumpSMem() {
+  int block = 0;   // index of the block currently being printed
+  int sblock = 0;  // index of the sblock currently being printed
+  int j = 0;       // number of 16-word rows printed after the reserved area
+  int i = 0;       // address of the row being printed
+  int coren = 0;
+  int x = 0;
+  int y = 0;
+  printf("(%x,%x) Dump shared mem: \n", udn_tile_coord_x(), 
+	     udn_tile_coord_y());
+  // reserved blocks for sblocktbl
+  printf("(%x,%x) ++++ reserved sblocks ++++ \n", udn_tile_coord_x(), 
+	     udn_tile_coord_y());
+  for(i=BAMBOO_BASE_VA; i<gcbaseva; i+= 4*16) {
+    printf("(%x,%x) 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x \n",
+		   udn_tile_coord_x(), udn_tile_coord_y(),
+           *((int *)(i)), *((int *)(i + 4)),
+           *((int *)(i + 4*2)), *((int *)(i + 4*3)),
+           *((int *)(i + 4*4)), *((int *)(i + 4*5)),
+           *((int *)(i + 4*6)), *((int *)(i + 4*7)),
+           *((int *)(i + 4*8)), *((int *)(i + 4*9)),
+           *((int *)(i + 4*10)), *((int *)(i + 4*11)),
+           *((int *)(i + 4*12)), *((int *)(i + 4*13)),
+           *((int *)(i + 4*14)), *((int *)(i + 4*15)));
+  }
+  sblock = gcreservedsb;
+  bool advanceblock = false;
+  // remaining memory
+  for(i=gcbaseva; i<gcbaseva+BAMBOO_SHARED_MEM_SIZE; i+=4*16) {
+    advanceblock = false;
+    // computing sblock # and block #, core coordinate (x,y) also
+    if(j%((BAMBOO_SMEM_SIZE)/(4*16)) == 0) {
+      // finished a sblock
+      if(j < ((BAMBOO_LARGE_SMEM_BOUND)/(4*16))) {
+		if((j > 0) && (j%((BAMBOO_SMEM_SIZE_L)/(4*16)) == 0)) {
+		  // finished a block
+		  block++;
+		  advanceblock = true;
+		}
+      } else {
+		// finished a block
+		block++;
+		advanceblock = true;
+      }
+      // compute core #
+      if(advanceblock) {
+		coren = gc_block2core[block%(NUMCORES4GC*2)];
+      }
+      // compute core coordinate
+      BAMBOO_COORDS(coren, &x, &y);
+      printf("(%x,%x) ==== %d, %d : core (%d,%d), saddr %x====\n",
+             udn_tile_coord_x(), udn_tile_coord_y(), block, sblock, x, y,
+             sblock*(BAMBOO_SMEM_SIZE)+gcbaseva);
+      sblock++;  // bumped after the call: no unsequenced read/modify of sblock
+    }
+    j++;
+    printf("(%x,%x) 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x \n",
+		   udn_tile_coord_x(), udn_tile_coord_y(),
+           *((int *)(i)), *((int *)(i + 4)),
+           *((int *)(i + 4*2)), *((int *)(i + 4*3)),
+           *((int *)(i + 4*4)), *((int *)(i + 4*5)),
+           *((int *)(i + 4*6)), *((int *)(i + 4*7)),
+           *((int *)(i + 4*8)), *((int *)(i + 4*9)),
+           *((int *)(i + 4*10)), *((int *)(i + 4*11)),
+           *((int *)(i + 4*12)), *((int *)(i + 4*13)),
+           *((int *)(i + 4*14)), *((int *)(i + 4*15)));
+  }
+  printf("(%x,%x) \n", udn_tile_coord_x(), udn_tile_coord_y());
+}
+#endif
+
+// Append ptr to the pointer queue; must be invoked with interruption closed.
+inline void gc_enqueue_I(void *ptr) {
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xe601);
+  BAMBOO_DEBUGPRINT_REG(ptr);
+#endif
+  if (NUMPTRS == gcheadindex) {  // head chunk is full: chain another one
+    struct pointerblock * fresh = gcspare;
+    if (fresh == NULL) {
+      fresh = RUNMALLOC_I(sizeof(struct pointerblock));
+    } else {
+      // the cached spare chunk is consumed by this enqueue
+      gcspare = NULL;
+    }
+    gchead->next = fresh;
+    gchead = fresh;
+    gcheadindex = 0;
+  }
+  gchead->ptrs[gcheadindex++] = ptr;
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xe602);
+#endif
+} // void gc_enqueue_I(void *ptr)
+
+// Destructive dequeue: return the next pointer and retire drained chunks.
+inline void * gc_dequeue_I() {
+  if (NUMPTRS == gctailindex) {
+    struct pointerblock *drained = gctail;
+    gctail = drained->next;
+    gctailindex = 0;
+    if (gcspare == NULL) {
+      gcspare = drained;  // keep one chunk around for gc_enqueue_I()
+    } else {
+      RUNFREE(drained);
+    }
+  }
+  return gctail->ptrs[gctailindex++];
+} // void * gc_dequeue_I()
+
+// Non-destructive dequeue: advances the gctail2 cursor and frees nothing.
+inline void * gc_dequeue2_I() {
+  if (gctailindex2==NUMPTRS) {
+    // chunk exhausted: step to its successor; the chunk left behind is kept
+    gctail2=gctail2->next;
+    gctailindex2=0;
+  } // if (gctailindex2==NUMPTRS)
+  return gctail2->ptrs[gctailindex2++];
+} // void * gc_dequeue2_I()
+
+inline int gc_moreItems_I() {
+  // 0 once the destructive read cursor has caught up with the write cursor,
+  // 1 while there is still something left to dequeue
+  return !((gctail==gchead) && (gcheadindex==gctailindex));
+} // int gc_moreItems_I()
+
+inline int gc_moreItems2_I() {
+  // same test as gc_moreItems_I(), but for the non-destructive cursor
+  // (gctail2, gctailindex2)
+  return !((gctail2==gchead) && (gcheadindex==gctailindex2));
+} // int gc_moreItems2_I()
+
+// Append one large-object record (start addr, length, host core) to the
+// large-object queue.  Should be invoked with interruption closed.
+inline void gc_lobjenqueue_I(void *ptr,
+                             int length,
+                             int host) {
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xe901);
+#endif
+  if (NUMLOBJPTRS == gclobjheadindex) {
+    // head chunk is full: link a new chunk behind it, reusing the spare if any
+    struct lobjpointerblock * fresh = gclobjspare;
+    if (fresh == NULL) {
+      fresh = RUNMALLOC_I(sizeof(struct lobjpointerblock));
+    } else {
+      gclobjspare = NULL;
+    }
+    fresh->prev = gclobjhead;
+    gclobjhead->next = fresh;
+    gclobjhead = fresh;
+    gclobjheadindex = 0;
+  }
+  gclobjhead->hosts[gclobjheadindex] = host;
+  gclobjhead->lengths[gclobjheadindex] = length;
+  gclobjhead->lobjs[gclobjheadindex++] = ptr;
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT_REG(gclobjhead->lobjs[gclobjheadindex-1]);
+  BAMBOO_DEBUGPRINT_REG(gclobjhead->lengths[gclobjheadindex-1]);
+  BAMBOO_DEBUGPRINT_REG(gclobjhead->hosts[gclobjheadindex-1]);
+#endif
+} // void gc_lobjenqueue_I(void *ptr...)
+
+// Destructive dequeue of the oldest large-object record; length/host may be NULL.
+inline void * gc_lobjdequeue_I(int * length,
+                               int * host) {
+  if (NUMLOBJPTRS == gclobjtailindex) {
+    struct lobjpointerblock *drained = gclobjtail;
+    gclobjtail = drained->next;
+    gclobjtailindex = 0;
+    gclobjtail->prev = NULL;   // the new tail chunk has no predecessor any more
+    if (gclobjspare == NULL) {
+      gclobjspare = drained;   // cache the drained chunk, fully unlinked
+      drained->next = NULL;
+      drained->prev = NULL;
+    } else {
+      RUNFREE(drained);
+    }
+  }
+  if(NULL != length) {
+    *length = gclobjtail->lengths[gclobjtailindex];
+  }
+  if(NULL != host) {
+    *host = (int)(gclobjtail->hosts[gclobjtailindex]);
+  }
+  return gclobjtail->lobjs[gclobjtailindex++];
+} // void * gc_lobjdequeue_I()
+
+inline int gc_lobjmoreItems_I() {
+  // 0 once the destructive tail cursor has reached the head cursor,
+  // 1 while records remain
+  return !((gclobjtail==gclobjhead) && (gclobjheadindex==gclobjtailindex));
+} // int gc_lobjmoreItems_I()
+
+// Step the gclobjtail2 cursor past one record; nothing is returned or freed.
+inline void gc_lobjdequeue2_I() {
+  if (gclobjtailindex2 != NUMLOBJPTRS) {
+    gclobjtailindex2++;
+  } else {
+    gclobjtail2 = gclobjtail2->next;
+    gclobjtailindex2 = 1;  // slot 0 of the next chunk is the record just passed
+  }
+} // void gc_lobjdequeue2_I()
+
+inline int gc_lobjmoreItems2_I() {
+  // 0 once the gclobjtail2 cursor has reached the head cursor,
+  // 1 while records remain ahead of it
+  return !((gclobjtail2==gclobjhead) && (gclobjheadindex==gclobjtailindex2));
+} // int gc_lobjmoreItems2_I()
+
+// Step the gclobjtail2 cursor one record backwards; nothing is freed.
+inline void gc_lobjdequeue3_I() {
+  if (gclobjtailindex2 != 0) {
+    gclobjtailindex2--;
+  } else {
+    gclobjtail2 = gclobjtail2->prev;
+    gclobjtailindex2 = NUMLOBJPTRS-1;  // last slot of the previous chunk
+  }
+} // void gc_lobjdequeue3_I()
+
+inline int gc_lobjmoreItems3_I() {
+  // the backward walk is over when the gclobjtail2 cursor is back at the
+  // position of the destructive tail cursor
+  return !((gclobjtail2==gclobjtail) && (gclobjtailindex==gclobjtailindex2));
+} // int gc_lobjmoreItems3_I()
+
+inline void gc_lobjqueueinit4_I() {  // rewind cursor 2 to the destructive tail
+  gclobjtailindex2 = gclobjtailindex;
+  gclobjtail2 = gclobjtail;
+} // void gc_lobjqueueinit4_I()
+
+inline void * gc_lobjdequeue4_I(int * length,
+                                int * host) {
+  // forward, non-freeing dequeue through the gclobjtail2 cursor
+  if (NUMLOBJPTRS == gclobjtailindex2) {
+    gclobjtail2 = gclobjtail2->next;
+    gclobjtailindex2 = 0;
+  }
+  if(NULL != length) {  // both out-parameters are optional
+    *length = gclobjtail2->lengths[gclobjtailindex2];
+  }
+  if(NULL != host) {
+    *host = (int)(gclobjtail2->hosts[gclobjtailindex2]);
+  }
+  return gclobjtail2->lobjs[gclobjtailindex2++];
+} // void * gc_lobjdequeue4_I()
+
+inline int gc_lobjmoreItems4_I() {
+  // 0 once the gclobjtail2 cursor has reached the head cursor,
+  // 1 while gc_lobjdequeue4_I() still has records to hand out
+  return !((gclobjtail2==gclobjhead) && (gclobjheadindex==gclobjtailindex2));
+} // int gc_lobjmoreItems4_I()
+
+INTPTR gccurr_heapbound = 0;
+
+// Look up the type id stored in the first int of the object at ptr and
+// derive the object's size from classsize[]; results go to *ttype / *tsize.
+inline void gettype_size(void * ptr,
+                         int * ttype,
+                         int * tsize) {
+  int tid = *((int *)ptr);
+  int bytes = classsize[tid];
+  if(tid >= NUMCLASSES) {
+    // ids from NUMCLASSES upward are arrays: classsize[] is then the size
+    // of one element, and the array header has to be added on top
+    struct ArrayObject *arr = (struct ArrayObject *)ptr;
+    int elementsize = bytes;
+    int length = arr->___length___;
+    bytes = sizeof(struct ArrayObject)+length*elementsize;
+  }
+  *ttype = tid;
+  *tsize = bytes;
+}
+
+inline bool isLarge(void * ptr,    // address of the object to examine
+                    int * ttype,   // out: type id, filled by gettype_size()
+                    int * tsize) { // out: size, filled by gettype_size()
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xe701);
+  BAMBOO_DEBUGPRINT_REG(ptr);
+#endif
+  // large = starts on a block boundary, or does not fit in the rest of its block
+  gettype_size(ptr, ttype, tsize);
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(*tsize);
+#endif
+  int bound = (BAMBOO_SMEM_SIZE);  // block size used for the boundary checks
+  if(((int)ptr-gcbaseva) < (BAMBOO_LARGE_SMEM_BOUND)) {
+    bound = (BAMBOO_SMEM_SIZE_L);  // offsets below the bound use the _L size
+  }
+  if((((int)ptr-gcbaseva)%(bound))==0) {
+    // ptr is the start of a block
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xe702);
+    BAMBOO_DEBUGPRINT(1);
+#endif
+    return true;
+  }
+  if((bound-(((int)ptr-gcbaseva)%bound)) < (*tsize)) {
+    // the object crosses the boundary of the current block
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xe703);
+    BAMBOO_DEBUGPRINT(1);
+#endif
+    return true;
+  }
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0);
+#endif
+  return false;
+} // bool isLarge(void * ptr, int * ttype, int * tsize)
+
+inline int hostcore(void * ptr) {
+  // return the core number that RESIDECORE reports as the host of ptr
+  int host = 0;  // stays 0 unless RESIDECORE stores a value into it
+  RESIDECORE(ptr, &host);
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xedd0);
+  BAMBOO_DEBUGPRINT_REG(ptr);
+  BAMBOO_DEBUGPRINT_REG(host);
+#endif
+  return host;
+} // int hostcore(void * ptr)
+
+inline bool isLocal(void * ptr) {
+  int owner = hostcore(ptr);  // host core of ptr
+  return (BAMBOO_NUM_OF_CORE == owner);  // true iff this core hosts ptr
+} // bool isLocal(void * ptr)
+
+inline bool gc_checkCoreStatus_I() {
+  // true iff gccorestatus[] is 0 for every one of the first NUMCORES4GC
+  // cores; the scan stops at the first non-zero entry
+  int core = 0;
+  while(core < NUMCORES4GC) {
+    if(0 != gccorestatus[core]) return false;
+    ++core;
+  }
+  return true;
+}
+
+inline bool gc_checkAllCoreStatus_I() {
+  // true iff gccorestatus[] is 0 for every one of the first NUMCORESACTIVE
+  // cores; the scan stops at the first non-zero entry
+  int core = 0;
+  while(core < NUMCORESACTIVE) {
+    if(0 != gccorestatus[core]) return false;
+    ++core;
+  }
+  return true;
+}
+
+inline void checkMarkStatue() {  // check whether the mark phase can be ended
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xee01);
+#endif
+  int i;
+  if((!waitconfirm) ||
+     (waitconfirm && (numconfirm == 0))) {  // skip while confirmations are outstanding
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xee02);
+#endif
+	int entry_index = 0;  // row of gcnumsendobjs/gcnumreceiveobjs written below
+	if(waitconfirm) {
+	  // phase 2: record into the row that gcnumsrobjs_index does not select
+	  entry_index = (gcnumsrobjs_index == 0) ? 1 : 0;
+	} else {
+	  // phase 1: record into the row selected by gcnumsrobjs_index
+	  entry_index = gcnumsrobjs_index;
+	}
+    BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
+    gccorestatus[BAMBOO_NUM_OF_CORE] = 0;  // this core reports itself as stalled
+    gcnumsendobjs[entry_index][BAMBOO_NUM_OF_CORE] = gcself_numsendobjs;
+    gcnumreceiveobjs[entry_index][BAMBOO_NUM_OF_CORE] = gcself_numreceiveobjs;
+    // check the status of all cores
+    bool allStall = gc_checkAllCoreStatus_I();
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xee03);
+#endif
+    if(allStall) {
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT(0xee04);
+#endif
+      // ask for confirm
+      if(!waitconfirm) {
+#ifdef DEBUG
+		BAMBOO_DEBUGPRINT(0xee05);
+#endif
+		// the first time found all cores stall
+		// send out status confirm msg to all other cores
+		// reset the corestatus array too
+		gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
+		waitconfirm = true;
+		numconfirm = NUMCORESACTIVE - 1;  // one confirmation expected per other core
+		BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+		for(i = 1; i < NUMCORESACTIVE; ++i) {
+		  gccorestatus[i] = 1;
+		  // send mark phase finish confirm request msg to core i
+		  send_msg_1(i, GCMARKCONFIRM, false);
+		}  // for(i = 1; i < NUMCORESACTIVE; ++i)
+      } else {
+		// Phase 2
+		// check if the sum of send objs and receive obj are the same
+		// yes->check if the info is the latest; no->go on executing
+		int sumsendobj = 0;  // sends minus receives over the gcnumsrobjs_index row
+		for(i = 0; i < NUMCORESACTIVE; ++i) {
+		  sumsendobj += gcnumsendobjs[gcnumsrobjs_index][i];
+		}  // for(i = 0; i < NUMCORESACTIVE; ++i)
+#ifdef DEBUG
+		BAMBOO_DEBUGPRINT(0xee06);
+		BAMBOO_DEBUGPRINT_REG(sumsendobj);
+#endif
+		for(i = 0; i < NUMCORESACTIVE; ++i) {
+		  sumsendobj -= gcnumreceiveobjs[gcnumsrobjs_index][i];
+		}  // for(i = 0; i < NUMCORESACTIVE; ++i)
+#ifdef DEBUG
+		BAMBOO_DEBUGPRINT(0xee07);
+		BAMBOO_DEBUGPRINT_REG(sumsendobj);
+#endif
+		if(0 == sumsendobj) {
+		  // Check if there are changes of the numsendobjs or numreceiveobjs on
+		  // each core, i.e. whether the two recorded rows differ anywhere
+		  bool ischanged = false;
+		  for(i = 0; i < NUMCORESACTIVE; ++i) {
+			if((gcnumsendobjs[0][i] != gcnumsendobjs[1][i]) || 
+				(gcnumreceiveobjs[0][i] != gcnumreceiveobjs[1][i]) ) {
+			  ischanged = true;
+			  break;
+			}
+		  }  // for(i = 0; i < NUMCORESACTIVE; ++i)
+#ifdef DEBUG
+		  BAMBOO_DEBUGPRINT(0xee08);
+		  BAMBOO_DEBUGPRINT_REG(ischanged);
+#endif
+		  if(!ischanged) {
+#ifdef DEBUG
+			BAMBOO_DEBUGPRINT(0xee09);
+#endif
+			// all the core status info are the latest
+			// stop mark phase
+			gcphase = COMPACTPHASE;
+			// restore the gcstatus for all cores
+			for(i = 0; i < NUMCORESACTIVE; ++i) {
+			  gccorestatus[i] = 1;
+			}  // for(i = 0; i < NUMCORESACTIVE; ++i)
+		  } else {
+			waitconfirm = false;  // counters moved between the two rows: retry from phase 1
+			gcnumsrobjs_index = (gcnumsrobjs_index == 0) ? 1 : 0;
+		  } // if(!ischanged)
+		} else {
+		  // Sends and receives do not balance, so it can not be decided
+		  // whether the mark phase has been finished
+		  waitconfirm = false;
+		  // As it fails in phase 2, flip the entries
+		  gcnumsrobjs_index = (gcnumsrobjs_index == 0) ? 1 : 0;
+		} // if(0 == sumsendobj) else ...
+		BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+      } // if(!waitconfirm) else()
+    } else {
+      BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+    } // if(allStall)
+  }  // if((!waitconfirm)...
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xee0a);
+#endif
+} // void checkMarkStatue()
+/*
+inline bool preGC() {
+  // preparation for gc
+  // make sure to clear all incoming msgs espacially transfer obj msgs
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xec01);
+#endif
+  int i;
+  if((!waitconfirm) ||
+     (waitconfirm && (numconfirm == 0))) {
+    // send out status confirm msgs to all cores to check if there are
+    // transfer obj msgs on-the-fly
+    waitconfirm = true;
+    numconfirm = NUMCORESACTIVE - 1;
+    for(i = 1; i < NUMCORESACTIVE; ++i) {
+      corestatus[i] = 1;
+      // send status confirm msg to core i
+      send_msg_1(i, STATUSCONFIRM, false);
+    }   // for(i = 1; i < NUMCORESACTIVE; ++i)
+
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xec02);
+#endif
+    while(true) {
+      if(numconfirm == 0) {
+		break;
+      }
+    }   // wait for confirmations
+    waitconfirm = false;
+    numconfirm = 0;
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xec03);
+#endif
+    numsendobjs[BAMBOO_NUM_OF_CORE] = self_numsendobjs;
+    numreceiveobjs[BAMBOO_NUM_OF_CORE] = self_numreceiveobjs;
+    int sumsendobj = 0;
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xec04);
+#endif
+    for(i = 0; i < NUMCORESACTIVE; ++i) {
+      sumsendobj += numsendobjs[i];
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT(0xf000 + numsendobjs[i]);
+#endif
+    }             // for(i = 1; i < NUMCORESACTIVE; ++i)
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xec05);
+    BAMBOO_DEBUGPRINT_REG(sumsendobj);
+#endif
+    for(i = 0; i < NUMCORESACTIVE; ++i) {
+      sumsendobj -= numreceiveobjs[i];
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT(0xf000 + numreceiveobjs[i]);
+#endif
+    }             // for(i = 1; i < NUMCORESACTIVE; ++i)
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xec06);
+    BAMBOO_DEBUGPRINT_REG(sumsendobj);
+#endif
+    if(0 == sumsendobj) {
+      return true;
+    } else {
+      // still have some transfer obj msgs on-the-fly, can not start gc
+      return false;
+    }  // if(0 == sumsendobj)
+  } else {
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xec07);
+#endif
+    // previously asked for status confirmation and do not have all the
+    // confirmations yet, can not start gc
+    return false;
+  }       // if((!waitconfirm) ||
+} // bool preGC()*/
+
+inline void initGC() {  // reset the GC bookkeeping of this core before a collection
+  int i;
+  if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {  // per-core tables are reset by the startup core only
+    for(i = 0; i < NUMCORES4GC; ++i) {
+      gccorestatus[i] = 1;
+      gcnumsendobjs[0][i] = gcnumsendobjs[1][i] = 0;
+      gcnumreceiveobjs[0][i] = gcnumreceiveobjs[1][i] = 0;
+      gcloads[i] = 0;
+      gcrequiredmems[i] = 0;
+      gcfilledblocks[i] = 0;
+      gcstopblock[i] = 0;
+    } // for(i = 0; i < NUMCORES4GC; ++i)
+    for(i = NUMCORES4GC; i < NUMCORESACTIVE; ++i) {  // remaining cores: status and counters only
+      gccorestatus[i] = 1;
+      gcnumsendobjs[0][i] = gcnumsendobjs[1][i] = 0;
+      gcnumreceiveobjs[0][i] = gcnumreceiveobjs[1][i] = 0;
+    }
+    gcheaptop = 0;
+    gctopcore = 0;
+    gctopblock = 0;
+  } // if(STARTUPCORE == BAMBOO_NUM_OF_CORE)
+  gcself_numsendobjs = 0;
+  gcself_numreceiveobjs = 0;
+  gcmarkedptrbound = 0;
+  gcobj2map = 0;
+  gcmappedobj = 0;
+  //gcismapped = false;
+  gcnumlobjs = 0;
+  gcmovestartaddr = 0;
+  gctomove = false;
+  gcblock2fill = 0;
+  gcmovepending = 0;
+  gccurr_heaptop = 0;
+  gcdstcore = 0;
+
+  // initialize queue: allocate the first chunk, or empty the existing queue
+  if (gchead==NULL) {
+    gcheadindex=gctailindex=gctailindex2 = 0;
+    gchead=gctail=gctail2=RUNMALLOC(sizeof(struct pointerblock));
+  } else {
+    gctailindex = gctailindex2 = gcheadindex;  // both read cursors jump to the write cursor
+    gctail = gctail2 = gchead;
+  }
+
+  // initialize the large obj queues
+  if (gclobjhead==NULL) {
+    gclobjheadindex=0;
+    gclobjtailindex=0;
+    gclobjtailindex2 = 0;
+    gclobjhead=gclobjtail=gclobjtail2=
+	  RUNMALLOC(sizeof(struct lobjpointerblock));
+  } else {
+    gclobjtailindex = gclobjtailindex2 = gclobjheadindex = 0;  // restart at slot 0 of the head chunk
+    gclobjtail = gclobjtail2 = gclobjhead;
+  }
+  gclobjhead->next = gclobjhead->prev = NULL;  // the head chunk is left with no neighbours
+
+#ifdef LOCALHASHTBL_TEST
+  freeRuntimeHash(gcpointertbl);
+  gcpointertbl = allocateRuntimeHash(20);
+#else
+  mgchashreset(gcpointertbl);
+#endif
+  //gcpointertbl = allocateMGCHash(20);
+
+  freeMGCHash(gcforwardobjtbl);  // this table is rebuilt from scratch each time
+  gcforwardobjtbl = allocateMGCHash(20, 3);
+
+  // initialize the mapping info related structures
+  if((BAMBOO_NUM_OF_CORE < NUMCORES4GC) && (gcsharedptbl != NULL)) {
+	// Never free the shared hash table, just reset it
+	/*freeGCSharedHash(gcsharedptbl);
+	gcsharedptbl = allocateGCSharedHash(20);*/
+	mgcsharedhashReset(gcsharedptbl);
+  }
+  // Zero out the remaining bamboo_cur_msp 
+  // Only zero out the first 4 bytes of the remaining memory
+  /*if((bamboo_cur_msp != 0) 
+	  && (bamboo_smem_zero_top == bamboo_cur_msp) 
+	  && (bamboo_smem_size > 0)) {
+	*((int *)bamboo_cur_msp) = 0;
+  }*/
+#ifdef GC_PROFILE
+  // TODO
+  /*num_mapinforequest = 0;
+  num_mapinforequest_i = 0;
+  flushstalltime = 0;
+  flushstalltime_i = 0;
+  num_markrequest = 0;
+  marktime = 0;*/
+  gc_num_livespace = 0;
+  gc_num_freespace = 0;
+  gc_num_lobj = 0;
+  gc_num_lobjspace = 0;
+  gc_num_liveobj = 0;
+  gc_num_forwardobj = 0;
+  gc_num_profiles = NUMCORESACTIVE - 1;
+#endif
+} // void initGC()
+
+// compute load balance for all cores
+inline int loadbalance(int * heaptop) {
+  // heaptop: out, set to gcbaseva plus the sum of all gcloads[] entries
+  int i;
+  // returns b / NUMCORES4GC, where b is the block index of *heaptop
+  // get the total loads
+  int tloads = gcloads[STARTUPCORE];
+  for(i = 1; i < NUMCORES4GC; i++) {
+    tloads += gcloads[i];
+  }
+  *heaptop = gcbaseva + tloads;
+  // also records the top block and its host core in gctopblock / gctopcore
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xdddd);
+  BAMBOO_DEBUGPRINT_REG(tloads);
+  BAMBOO_DEBUGPRINT_REG(*heaptop);
+#endif
+  int b = 0;
+  BLOCKINDEX(*heaptop, &b);
+  int numbpc = b / NUMCORES4GC;       // num of blocks per core
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT_REG(b);
+  BAMBOO_DEBUGPRINT_REG(numbpc);
+#endif
+  gctopblock = b;
+  RESIDECORE(*heaptop, &gctopcore);  // the heap-top address, not the address of the out-param
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT_REG(gctopcore);
+#endif
+  return numbpc;
+} // int loadbalance(int * heaptop)
+
+inline bool cacheLObjs() {
+  // Sort the queued large objs by address and copy them to the top of the
+  // shared heap; returns false when they do not fit above gcheaptop.
+  unsigned long long sumsize = 0;  // total length of all queued large objs
+  int size = 0;
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xe801);
+#endif
+  gclobjtail2 = gclobjtail;
+  gclobjtailindex2 = gclobjtailindex;
+  int tmp_lobj = 0;
+  int tmp_len = 0;
+  int tmp_host = 0;
+  // compute total mem size required and sort the lobjs in ascending order
+  while(gc_lobjmoreItems2_I()) {
+    gc_lobjdequeue2_I();
+    tmp_lobj = gclobjtail2->lobjs[gclobjtailindex2-1];
+    tmp_host = gclobjtail2->hosts[gclobjtailindex2-1];
+    tmp_len = gclobjtail2->lengths[gclobjtailindex2 - 1];
+    sumsize += tmp_len;
+#ifdef GC_PROFILE
+	gc_num_lobj++;
+#endif
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT_REG(gclobjtail2->lobjs[gclobjtailindex2-1]);
+    BAMBOO_DEBUGPRINT_REG(tmp_len);
+    BAMBOO_DEBUGPRINT_REG(sumsize);
+#endif
+    int i = gclobjtailindex2-1;
+    struct lobjpointerblock * tmp_block = gclobjtail2;
+    // find the place to insert: shift larger predecessors one slot forward
+    while(true) {
+      if(i == 0) {
+		if(tmp_block->prev == NULL) {
+		  break;
+		}
+		if(tmp_block->prev->lobjs[NUMLOBJPTRS-1] > tmp_lobj) {
+		  tmp_block->lobjs[i] = tmp_block->prev->lobjs[NUMLOBJPTRS-1];
+		  tmp_block->lengths[i] = tmp_block->prev->lengths[NUMLOBJPTRS-1];
+		  tmp_block->hosts[i] = tmp_block->prev->hosts[NUMLOBJPTRS-1];
+		  tmp_block = tmp_block->prev;
+		  i = NUMLOBJPTRS-1;
+		} else {
+		  break;
+		}  // if(tmp_block->prev->lobjs[NUMLOBJPTRS-1] > tmp_lobj)
+	  } else {
+		if(tmp_block->lobjs[i-1] > tmp_lobj) {
+		  tmp_block->lobjs[i] = tmp_block->lobjs[i-1];
+		  tmp_block->lengths[i] = tmp_block->lengths[i-1];
+		  tmp_block->hosts[i] = tmp_block->hosts[i-1];
+		  i--;
+		} else {
+		  break;
+		}  // if(tmp_block->lobjs[i-1] > tmp_lobj)
+      }  // if(i ==0 ) else {}
+    }   // while(true)
+    // insert it; the hole may have the same index but sit in an earlier chunk
+    if((tmp_block != gclobjtail2) || (i != gclobjtailindex2 - 1)) {
+      tmp_block->lobjs[i] = tmp_lobj;
+      tmp_block->lengths[i] = tmp_len;
+      tmp_block->hosts[i] = tmp_host;
+    }
+  }  // while(gc_lobjmoreItems2())
+
+#ifdef GC_PROFILE
+  gc_num_lobjspace = sumsize;
+#endif
+  // check if there are enough space to cache these large objs
+  INTPTR dst = gcbaseva + (BAMBOO_SHARED_MEM_SIZE) -sumsize;
+  if((unsigned long long)gcheaptop > (unsigned long long)dst) {
+    // do not have enough room to cache large objs
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xe802);
+    BAMBOO_DEBUGPRINT_REG(dst);
+    BAMBOO_DEBUGPRINT_REG(gcheaptop);
+	BAMBOO_DEBUGPRINT_REG(sumsize);
+#endif
+    return false;
+  }
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xe803);
+  BAMBOO_DEBUGPRINT_REG(dst);
+  BAMBOO_DEBUGPRINT_REG(gcheaptop);
+#endif
+
+  gcheaptop = dst; // Note: record the start of cached lobjs with gcheaptop
+  // cache the largeObjs to the top of the shared heap
+  //gclobjtail2 = gclobjtail;
+  //gclobjtailindex2 = gclobjtailindex;
+  dst = gcbaseva + (BAMBOO_SHARED_MEM_SIZE);
+  while(gc_lobjmoreItems3_I()) {  // walk the sorted queue backwards, highest address first
+    gc_lobjdequeue3_I();
+    size = gclobjtail2->lengths[gclobjtailindex2];
+    // set the mark field to COMPACTED, indicating that this obj has been
+    // moved and need to be flushed
+    ((int *)(gclobjtail2->lobjs[gclobjtailindex2]))[6] = COMPACTED;
+    dst -= size;
+    if((int)dst < (int)(gclobjtail2->lobjs[gclobjtailindex2])+size) {
+      memmove(dst, gclobjtail2->lobjs[gclobjtailindex2], size);
+    } else {
+      //BAMBOO_WRITE_HINT_CACHE(dst, size);
+      memcpy(dst, gclobjtail2->lobjs[gclobjtailindex2], size);
+    }
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0x804);
+    BAMBOO_DEBUGPRINT_REG(gclobjtail2->lobjs[gclobjtailindex2]);
+    BAMBOO_DEBUGPRINT(dst);
+    BAMBOO_DEBUGPRINT_REG(size);
+    BAMBOO_DEBUGPRINT_REG(*((int*)gclobjtail2->lobjs[gclobjtailindex2]));
+    BAMBOO_DEBUGPRINT_REG(*((int*)(dst)));
+#endif
+  }
+  return true;
+} // bool cacheLObjs()
+
+// update the bamboo_smemtbl to record current shared mem usage of core coren
+void updateSmemTbl(int coren,
+                   int localtop) {
+  int topblk = 0;  // index of the block that contains localtop
+  BLOCKINDEX(localtop, &topblk);
+  int bound;
+  if(localtop >= (gcbaseva+(BAMBOO_LARGE_SMEM_BOUND))) {
+    bound = BAMBOO_SMEM_SIZE;
+  } else {
+    bound = BAMBOO_SMEM_SIZE_L;
+  }
+  int load = (localtop-gcbaseva)%bound;  // offset of localtop inside its block
+  int slot = 0;   // selects one of the two gc_core2block entries of coren
+  int round = 0;  // advances once both entries have been visited
+  for(;;) {
+    int blk = gc_core2block[2*coren+slot]+(NUMCORES4GC*2)*round;
+    if(blk > topblk) {
+      break;  // beyond the block holding localtop: nothing more to record
+    }
+    if(blk == topblk) {
+      bamboo_smemtbl[blk] = load;
+    } else {
+      bamboo_smemtbl[blk]=
+        (blk<NUMCORES4GC) ? BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
+    }
+#ifdef SMEMM
+    gcmem_mixed_usedmem += bamboo_smemtbl[blk];
+#endif
+    if(blk == topblk) {
+      break;  // the block holding localtop is the last one to record
+    }
+    if(++slot == 2) {
+      slot = 0;
+      round++;
+    }
+  }
+} // void updateSmemTbl(int, int)
+
+inline void moveLObjs() {
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xea01);
+#endif
+#ifdef SMEMM
+  // update the gcmem_mixed_usedmem
+  gcmem_mixed_usedmem = 0;
+#endif
+  // zero out the smemtbl
+  BAMBOO_MEMSET_WH(bamboo_smemtbl, 0, sizeof(int)*gcnumblock);
+  // find current heap top
+  // flush all gcloads to indicate the real heap top on one core
+  // previous it represents the next available ptr on a core
+  if((gcloads[0] > (gcbaseva+(BAMBOO_SMEM_SIZE_L)))
+     && ((gcloads[0]%(BAMBOO_SMEM_SIZE)) == 0)) {
+    // edge of a block, check if this is exactly the heaptop
+    BASEPTR(0, gcfilledblocks[0]-1, &(gcloads[0]));
+    gcloads[0]+=(gcfilledblocks[0]>1 ?
+                 (BAMBOO_SMEM_SIZE) : (BAMBOO_SMEM_SIZE_L));
+  }
+  updateSmemTbl(0, gcloads[0]);
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xea02);
+  BAMBOO_DEBUGPRINT_REG(gcloads[0]);
+  BAMBOO_DEBUGPRINT_REG(bamboo_smemtbl[0]);
+#endif
+  for(int i = 1; i < NUMCORES4GC; i++) {
+    int tmptop = 0;
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xf000+i);
+    BAMBOO_DEBUGPRINT_REG(gcloads[i]);
+    BAMBOO_DEBUGPRINT_REG(gcfilledblocks[i]);
+#endif
+    if((gcfilledblocks[i] > 0)
+       && ((gcloads[i] % (BAMBOO_SMEM_SIZE)) == 0)) {
+      // edge of a block, check if this is exactly the heaptop
+      BASEPTR(i, gcfilledblocks[i]-1, &gcloads[i]);
+      gcloads[i] += 
+		(gcfilledblocks[i]>1 ? (BAMBOO_SMEM_SIZE) : (BAMBOO_SMEM_SIZE_L));
+      tmptop = gcloads[i];
+    }
+    updateSmemTbl(i, gcloads[i]);
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT_REG(gcloads[i]);
+#endif
+  } // for(int i = 1; i < NUMCORES4GC; i++) {
+
+  // find current heap top
+  // TODO
+  // a bug here: when using local allocation, directly move large objects
+  // to the highest free chunk might not be memory efficient
+  int tmpheaptop = 0;
+  int size = 0;
+  int bound = 0;
+  int i = 0;
+  for(i = gcnumblock-1; i >= 0; i--) {
+    if(bamboo_smemtbl[i] > 0) {
+      break;
+    }
+  }
+  if(i == -1) {
+    tmpheaptop = gcbaseva;
+  } else {
+    tmpheaptop = gcbaseva+bamboo_smemtbl[i]+((i<NUMCORES4GC) ?
+		(BAMBOO_SMEM_SIZE_L*i) :
+        (BAMBOO_SMEM_SIZE*(i-NUMCORES4GC)+BAMBOO_LARGE_SMEM_BOUND));
+  }
+
+  // move large objs from gcheaptop to tmpheaptop
+  // write the header first
+  unsigned int tomove = gcbaseva + (BAMBOO_SHARED_MEM_SIZE) -gcheaptop;
+#ifdef SMEMM
+  gcmem_mixed_usedmem += tomove;
+#endif
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xea03);
+  BAMBOO_DEBUGPRINT_REG(tomove);
+  BAMBOO_DEBUGPRINT_REG(tmpheaptop);
+  BAMBOO_DEBUGPRINT_REG(gcheaptop);
+#endif
+  // flush the sbstartbl
+  BAMBOO_MEMSET_WH(&(gcsbstarttbl[gcreservedsb]), '\0',
+	  (BAMBOO_SHARED_MEM_SIZE/BAMBOO_SMEM_SIZE-gcreservedsb)*sizeof(INTPTR));
+  if(tomove == 0) {
+    gcheaptop = tmpheaptop;
+  } else {
+    // check how many blocks it acrosses
+    int remain = tmpheaptop-gcbaseva;
+    int sb = remain/(BAMBOO_SMEM_SIZE) + gcreservedsb;//number of the sblock
+    int b = 0;  // number of the block
+    BLOCKINDEX(tmpheaptop, &b);
+    // check the remaining space in this block
+    bound = (BAMBOO_SMEM_SIZE);
+    if(remain < (BAMBOO_LARGE_SMEM_BOUND)) {
+      bound = (BAMBOO_SMEM_SIZE_L);
+    }
+    remain = bound - remain%bound;
+
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xea04);
+#endif
+    size = 0;
+    int isize = 0;
+    int host = 0;
+    int ptr = 0;
+    int base = tmpheaptop;
+    int cpysize = 0;
+    remain -= BAMBOO_CACHE_LINE_SIZE;
+    tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
+    gc_lobjqueueinit4_I();
+    while(gc_lobjmoreItems4_I()) {
+      ptr = (int)(gc_lobjdequeue4_I(&size, &host));
+      ALIGNSIZE(size, &isize);
+      if(remain < isize) {
+		// this object acrosses blocks
+		if(cpysize > 0) {
+		  // close current block, fill its header
+		  BAMBOO_MEMSET_WH(base, '\0', BAMBOO_CACHE_LINE_SIZE);
+		  *((int*)base) = cpysize + BAMBOO_CACHE_LINE_SIZE;
+		  bamboo_smemtbl[b]+=BAMBOO_CACHE_LINE_SIZE;//add the size of header
+		  cpysize = 0;
+		  base = tmpheaptop;
+		  if(remain == 0) {
+			remain = ((tmpheaptop-gcbaseva)<(BAMBOO_LARGE_SMEM_BOUND)) ?
+					 BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
+		  }
+		  remain -= BAMBOO_CACHE_LINE_SIZE;
+		  tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
+		  BLOCKINDEX(tmpheaptop, &b);
+		  sb = (tmpheaptop-gcbaseva)/(BAMBOO_SMEM_SIZE) + gcreservedsb;
+		}  // if(cpysize > 0)
+
+		// move the large obj
+		if((int)gcheaptop < (int)(tmpheaptop)+size) {
+		  memmove(tmpheaptop, gcheaptop, size);
+		} else {
+		  //BAMBOO_WRITE_HINT_CACHE(tmpheaptop, size);
+		  memcpy(tmpheaptop, gcheaptop, size);
+		}
+		// fill the remaining space with -2 padding
+		BAMBOO_MEMSET_WH(tmpheaptop+size, -2, isize-size);
+#ifdef DEBUG
+		BAMBOO_DEBUGPRINT(0xea05);
+		BAMBOO_DEBUGPRINT_REG(gcheaptop);
+		BAMBOO_DEBUGPRINT_REG(tmpheaptop);
+		BAMBOO_DEBUGPRINT_REG(size);
+		BAMBOO_DEBUGPRINT_REG(isize);
+		BAMBOO_DEBUGPRINT_REG(base);
+#endif
+		gcheaptop += size;
+		// cache the mapping info anyway
+		//if(ptr != tmpheaptop) {
+		BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
+#ifdef LOCALHASHTBL_TEST
+		RuntimeHashadd_I(gcpointertbl, ptr, tmpheaptop);
+#else
+		mgchashInsert_I(gcpointertbl, ptr, tmpheaptop);
+#endif
+		//MGCHashadd_I(gcpointertbl, ptr, tmpheaptop);
+		BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+		//}
+#ifdef DEBUG
+		BAMBOO_DEBUGPRINT(0xcdca);
+		BAMBOO_DEBUGPRINT_REG(ptr);
+		BAMBOO_DEBUGPRINT_REG(tmpheaptop);
+#endif
+		if(host != BAMBOO_NUM_OF_CORE) {
+		  // send the original host core with the mapping info
+		  send_msg_3(host, GCLOBJMAPPING, ptr, tmpheaptop, false);
+#ifdef DEBUG
+		  BAMBOO_DEBUGPRINT(0xcdcb);
+		  BAMBOO_DEBUGPRINT_REG(ptr);
+		  BAMBOO_DEBUGPRINT_REG(tmpheaptop);
+#endif
+		} // if(host != BAMBOO_NUM_OF_CORE)
+		tmpheaptop += isize;
+
+		// set the gcsbstarttbl and bamboo_smemtbl
+		int tmpsbs = 1+(isize-remain-1)/BAMBOO_SMEM_SIZE;
+		for(int k = 1; k < tmpsbs; k++) {
+		  gcsbstarttbl[sb+k] = (INTPTR)(-1);
+		}
+		sb += tmpsbs;
+		bound = (b<NUMCORES4GC) ? BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
+		BLOCKINDEX(tmpheaptop-1, &tmpsbs);
+		for(; b < tmpsbs; b++) {
+		  bamboo_smemtbl[b] = bound;
+		  if(b==NUMCORES4GC-1) {
+			bound = BAMBOO_SMEM_SIZE;
+		  }
+		}
+		if(((isize-remain)%(BAMBOO_SMEM_SIZE)) == 0) {
+		  gcsbstarttbl[sb] = (INTPTR)(-1);
+		  remain = ((tmpheaptop-gcbaseva)<(BAMBOO_LARGE_SMEM_BOUND)) ?
+				   BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
+		  bamboo_smemtbl[b] = bound;
+		} else {
+		  gcsbstarttbl[sb] = (INTPTR)(tmpheaptop);
+		  remain = tmpheaptop-gcbaseva;
+		  bamboo_smemtbl[b] = remain%bound;
+		  remain = bound - bamboo_smemtbl[b];
+		} // if(((isize-remain)%(BAMBOO_SMEM_SIZE)) == 0) else ...
+
+		// close current block and fill the header
+		BAMBOO_MEMSET_WH(base, '\0', BAMBOO_CACHE_LINE_SIZE);
+		*((int*)base) = isize + BAMBOO_CACHE_LINE_SIZE;
+		cpysize = 0;
+		base = tmpheaptop;
+		if(remain == BAMBOO_CACHE_LINE_SIZE) {
+		  // fill with 0 in case
+		  BAMBOO_MEMSET_WH(tmpheaptop, '\0', remain);
+		}
+		remain -= BAMBOO_CACHE_LINE_SIZE;
+		tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
+      } else {
+		remain -= isize;
+		// move the large obj
+		if((int)gcheaptop < (int)(tmpheaptop)+size) {
+		  memmove(tmpheaptop, gcheaptop, size);
+		} else {
+		  //BAMBOO_WRITE_HINT_CACHE(tmpheaptop, size);
+		  memcpy(tmpheaptop, gcheaptop, size);
+		}
+		// fill the remaining space with -2 padding
+		BAMBOO_MEMSET_WH(tmpheaptop+size, -2, isize-size);
+		// zero out original mem caching the lobj
+		//BAMBOO_MEMSET_WH(gcheaptop, '\0', size); // TODO ??
+#ifdef DEBUG
+		BAMBOO_DEBUGPRINT(0xea06);
+		BAMBOO_DEBUGPRINT_REG(gcheaptop);
+		BAMBOO_DEBUGPRINT_REG(tmpheaptop);
+		BAMBOO_DEBUGPRINT_REG(size);
+		BAMBOO_DEBUGPRINT_REG(isize);
+#endif
+
+		gcheaptop += size;
+		cpysize += isize;
+		// cache the mapping info anyway
+		//if(ptr != tmpheaptop) {
+		BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
+#ifdef LOCALHASHTBL_TEST
+		RuntimeHashadd_I(gcpointertbl, ptr, tmpheaptop);
+#else
+		mgchashInsert_I(gcpointertbl, ptr, tmpheaptop);
+#endif
+		//MGCHashadd_I(gcpointertbl, ptr, tmpheaptop);
+		BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+		//}
+#ifdef DEBUG
+		BAMBOO_DEBUGPRINT(0xcdcc);
+		BAMBOO_DEBUGPRINT_REG(ptr);
+		BAMBOO_DEBUGPRINT_REG(tmpheaptop);
+		BAMBOO_DEBUGPRINT_REG(*((int*)tmpheaptop));
+#endif
+		if(host != BAMBOO_NUM_OF_CORE) {
+		  // send the original host core with the mapping info
+		  send_msg_3(host, GCLOBJMAPPING, ptr, tmpheaptop, false);
+#ifdef DEBUG
+		  BAMBOO_DEBUGPRINT(0xcdcd);
+		  BAMBOO_DEBUGPRINT_REG(ptr);
+		  BAMBOO_DEBUGPRINT_REG(tmpheaptop);
+#endif
+		}                         // if(host != BAMBOO_NUM_OF_CORE)
+		tmpheaptop += isize;
+
+		// update bamboo_smemtbl
+		bamboo_smemtbl[b] += isize;
+	  }  // if(remain < isize) else ...
+    }  // while(gc_lobjmoreItems())
+    if(cpysize > 0) {
+      // close current block, fill the header
+      BAMBOO_MEMSET_WH(base, '\0', BAMBOO_CACHE_LINE_SIZE);
+      *((int*)base) = cpysize + BAMBOO_CACHE_LINE_SIZE;
+      bamboo_smemtbl[b] += BAMBOO_CACHE_LINE_SIZE;// add the size of the header
+    } else {
+      tmpheaptop -= BAMBOO_CACHE_LINE_SIZE;
+    }
+    gcheaptop = tmpheaptop;
+
+  } // if(tomove == 0)
+
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xea07);
+  BAMBOO_DEBUGPRINT_REG(gcheaptop);
+#endif
+
+  bamboo_free_block = 0;
+  int tbound = 0;
+  do {
+    tbound = (bamboo_free_block<NUMCORES4GC) ?
+             BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
+    if(bamboo_smemtbl[bamboo_free_block] == tbound) {
+      bamboo_free_block++;
+    } else {
+      // the first non-full partition
+      break;
+    }
+  } while(true);
+
+  // TODO
+  /*unsigned long long gc_num_livespace = 0;
+  for(int tmpi = 0; tmpi < gcnumblock; tmpi++) {
+	gc_num_livespace += bamboo_smemtbl[tmpi];
+  }
+  BAMBOO_DEBUGPRINT_REG(gc_num_livespace);
+  BAMBOO_DEBUGPRINT_REG(bamboo_free_block);*/
+
+#ifdef GC_PROFILE
+  // check how many live space there are
+  gc_num_livespace = 0;
+  for(int tmpi = 0; tmpi < gcnumblock; tmpi++) {
+	gc_num_livespace += bamboo_smemtbl[tmpi];
+  }
+  gc_num_freespace = (BAMBOO_SHARED_MEM_SIZE) - gc_num_livespace;
+#endif
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xea08);
+  BAMBOO_DEBUGPRINT_REG(gcheaptop);
+#endif
+} // void moveLObjs()
+
+inline void markObj(void * objptr) { // mark one reference found during tracing; NULL is ignored
+  if(objptr == NULL) {
+    return;
+  }
+  if(ISSHAREDOBJ(objptr)) {
+    int host = hostcore(objptr);
+    if(BAMBOO_NUM_OF_CORE == host) {
+      // hosted on this core: enqueue it locally, but only the first time it is seen
+      BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
+      if(((int *)objptr)[6] == INIT) {
+		// this is the first time that this object is discovered,
+		// set the flag (kept in word 6 of the object) as DISCOVERED
+		((int *)objptr)[6] |= DISCOVERED;
+		gc_enqueue_I(objptr);
+	  }
+      BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+    } else {
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT(0xbbbb);
+      BAMBOO_DEBUGPRINT_REG(host);
+      BAMBOO_DEBUGPRINT_REG(objptr);
+#endif
+      // hosted on another core: notify it once; gcforwardobjtbl records what was already forwarded
+      if(!MGCHashcontains(gcforwardobjtbl, (int)objptr)) {
+#ifdef GC_PROFILE
+		// TODO unsigned long long ttime = BAMBOO_GET_EXE_TIME();
+#endif
+		// send a msg to host informing that objptr is active
+		send_msg_2(host, GCMARKEDOBJ, objptr, /*BAMBOO_NUM_OF_CORE,*/ false);
+#ifdef GC_PROFILE
+		// TODO
+		/*
+		marktime += BAMBOO_GET_EXE_TIME() - ttime;
+		num_markrequest++;*/
+		gc_num_forwardobj++;
+#endif // GC_PROFILE
+		gcself_numsendobjs++; // this count is reported with GCFINISHMARK in mark()
+		MGCHashadd(gcforwardobjtbl, (int)objptr);
+      }
+    }
+  } else { // not a shared obj: enqueue it unconditionally (no flag check on this path)
+    BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
+    gc_enqueue_I(objptr);
+    BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+  }       // if(ISSHAREDOBJ(objptr))
+} // void markObj(void * objptr)
+
+// enqueue root objs: stack frames, object sets, task descriptors, transfer queues and locks
+inline void tomark(struct garbagelist * stackptr) {
+  if(MARKPHASE != gcphase) {
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT_REG(gcphase);
+#endif
+    BAMBOO_EXIT(0xb101); // only legal while the collector is in the mark phase
+  }
+  gcbusystatus = true;
+  gcnumlobjs = 0; // mark() increments this for every large obj it enqueues
+
+  int i,j;
+  // enqueue current stack: walk the linked list of frames, mark every non-NULL slot
+  while(stackptr!=NULL) {
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xe501);
+    BAMBOO_DEBUGPRINT_REG(stackptr->size);
+    BAMBOO_DEBUGPRINT_REG(stackptr->next);
+    BAMBOO_DEBUGPRINT_REG(stackptr->array[0]);
+#endif
+    for(i=0; i<stackptr->size; i++) {
+      if(stackptr->array[i] != NULL) {
+		markObj(stackptr->array[i]);
+      }
+    }
+    stackptr=stackptr->next;
+  }
+
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xe503);
+#endif
+  // enqueue objectsets: every key in every parameter queue of this core
+  if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) {
+    for(i=0; i<NUMCLASSES; i++) {
+      struct parameterwrapper ** queues =
+        objectqueues[BAMBOO_NUM_OF_CORE][i];
+      int length = numqueues[BAMBOO_NUM_OF_CORE][i];
+      for(j = 0; j < length; ++j) {
+		struct parameterwrapper * parameter = queues[j];
+		struct ObjectHash * set=parameter->objectset;
+		struct ObjectNode * ptr=set->listhead;
+		while(ptr!=NULL) {
+		  markObj((void *)ptr->key);
+		  ptr=ptr->lnext;
+		}
+      }
+    }
+  }
+
+  // enqueue current task descriptor
+  if(currtpd != NULL) {
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xe504);
+#endif
+    for(i=0; i<currtpd->numParameters; i++) {
+      markObj(currtpd->parameterArray[i]);
+    }
+  }
+
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xe505);
+#endif
+  // enqueue active tasks: the parameters of every descriptor on activetasks->list
+  if(activetasks != NULL) {
+    struct genpointerlist * ptr=activetasks->list;
+    while(ptr!=NULL) {
+      struct taskparamdescriptor *tpd=ptr->src;
+      int i; // shadows the function-level i for this loop only
+      for(i=0; i<tpd->numParameters; i++) {
+		markObj(tpd->parameterArray[i]);
+      }
+      ptr=ptr->inext;
+    }
+  }
+
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xe506);
+#endif
+  // enqueue cached transferred obj (items of objqueue)
+  struct QueueItem * tmpobjptr =  getHead(&objqueue);
+  while(tmpobjptr != NULL) {
+    struct transObjInfo * objInfo =
+      (struct transObjInfo *)(tmpobjptr->objectptr);
+    markObj(objInfo->objptr);
+    tmpobjptr = getNextQueueItem(tmpobjptr);
+  }
+
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xe507);
+#endif
+  // enqueue cached objs to be transferred (items of totransobjqueue)
+  struct QueueItem * item = getHead(totransobjqueue);
+  while(item != NULL) {
+    struct transObjInfo * totransobj =
+      (struct transObjInfo *)(item->objectptr);
+    markObj(totransobj->objptr);
+    item = getNextQueueItem(item);
+  }       // while(item != NULL)
+
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xe508);
+#endif
+  // enqueue lock related info: redirectlock always, value only when non-NULL
+  for(i = 0; i < runtime_locklen; ++i) {
+    markObj((void *)(runtime_locks[i].redirectlock));
+    if(runtime_locks[i].value != NULL) {
+      markObj((void *)(runtime_locks[i].value));
+    }
+  }
+
+} // void tomark(struct garbagelist * stackptr)
+
+inline void mark(bool isfirst, // true: enqueue the roots and reset the counters before tracing
+                 struct garbagelist * stackptr) {
+#ifdef DEBUG
+  if(BAMBOO_NUM_OF_CORE == 0) BAMBOO_DEBUGPRINT(0xed01);
+#endif
+  if(isfirst) {
+#ifdef DEBUG
+    if(BAMBOO_NUM_OF_CORE == 0) BAMBOO_DEBUGPRINT(0xed02);
+#endif
+    // enqueue root objs
+    tomark(stackptr);
+    gccurr_heaptop = 0; // record the size of all active objs in this core
+                        // aligned but does not consider block boundaries
+    gcmarkedptrbound = 0;
+  }
+#ifdef DEBUG
+  if(BAMBOO_NUM_OF_CORE == 0) BAMBOO_DEBUGPRINT(0xed03);
+#endif
+  int isize = 0; // NOTE(review): shadowed by the isize declared inside the loop; unused at this scope
+  bool checkfield = true;
+  bool sendStall = false; // set once GCFINISHMARK was sent; cleared when new work shows up
+  // mark phase: non-startup cores keep looping until gcphase leaves MARKPHASE
+  while(MARKPHASE == gcphase) {
+#ifdef DEBUG
+    if(BAMBOO_NUM_OF_CORE == 0) BAMBOO_DEBUGPRINT(0xed04);
+#endif
+    while(true) { // drain the local mark queue
+      BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
+      bool hasItems = gc_moreItems2_I();
+      BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT(0xed05);
+#endif
+      if(!hasItems) {
+		break;
+      }
+      sendStall = false;
+      gcbusystatus = true;
+      checkfield = true;
+      void * ptr = gc_dequeue2_I(); // NOTE(review): not inside the runtime-mode bracket used above -- confirm
+
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT_REG(ptr);
+#endif
+      int size = 0;
+      int isize = 0;
+      int type = 0;
+      // check if it is a shared obj
+      if(ISSHAREDOBJ(ptr)) {
+		// a shared obj, check if it is a local obj on this core
+		int host = hostcore(ptr);
+		bool islocal = (host == BAMBOO_NUM_OF_CORE);
+		if(islocal) {
+		  bool isnotmarked = ((((int *)ptr)[6] & DISCOVERED) != 0); // DISCOVERED set: enqueued, not yet MARKED
+		  if(isLarge(ptr, &type, &size) && isnotmarked) {
+			// ptr is a large object and not marked or enqueued
+#ifdef DEBUG
+			BAMBOO_DEBUGPRINT(0xecec);
+			BAMBOO_DEBUGPRINT_REG(ptr);
+			BAMBOO_DEBUGPRINT_REG(*((int*)ptr));
+#endif
+			BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
+			gc_lobjenqueue_I(ptr, size, BAMBOO_NUM_OF_CORE);
+			gcnumlobjs++;
+			BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+			// mark this obj: clear DISCOVERED, set MARKED
+			((int *)ptr)[6] = ((int *)ptr)[6] & (~DISCOVERED) | MARKED;
+		  } else if(isnotmarked) {
+			// ptr is an unmarked active object on this core
+			ALIGNSIZE(size, &isize);
+			gccurr_heaptop += isize;
+#ifdef DEBUG
+			BAMBOO_DEBUGPRINT(0xaaaa);
+			BAMBOO_DEBUGPRINT_REG(ptr);
+			BAMBOO_DEBUGPRINT_REG(isize);
+			BAMBOO_DEBUGPRINT(((int *)(ptr))[0]);
+#endif
+			// mark this obj: clear DISCOVERED, set MARKED
+			((int *)ptr)[6] = ((int *)ptr)[6] & (~DISCOVERED) | MARKED;
+		  
+			if(ptr + size > gcmarkedptrbound) { // track the highest end address of any marked obj
+			  gcmarkedptrbound = ptr + size;
+			} // if(ptr + size > gcmarkedptrbound)
+		  } else {
+			// ptr is not an active obj or has been marked
+			checkfield = false;
+		  } // if(isLarge(ptr, &type, &size)) else ...
+		}  /* can never reach here
+		else {
+#ifdef DEBUG
+		  if(BAMBOO_NUM_OF_CORE == 0) {
+			BAMBOO_DEBUGPRINT(0xbbbb);
+			BAMBOO_DEBUGPRINT_REG(host);
+			BAMBOO_DEBUGPRINT_REG(ptr);
+		  }
+#endif
+		  // check if this obj has been forwarded
+		  if(!MGCHashcontains(gcforwardobjtbl, (int)ptr)) {
+			// send a msg to host informing that ptr is active
+			send_msg_2(host, GCMARKEDOBJ, ptr, false);
+			gcself_numsendobjs++;
+			MGCHashadd(gcforwardobjtbl, (int)ptr);
+		  }
+			checkfield = false;
+		}// if(isLocal(ptr)) else ...*/
+	  }   // if(ISSHAREDOBJ(ptr))
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT(0xed06);
+#endif
+
+      if(checkfield) {
+		// scan all pointers in ptr; NOTE(review): type is still 0 for a non-shared ptr -- confirm
+		unsigned INTPTR * pointer;
+		pointer=pointerarray[type];
+		if (pointer==0) {
+		  /* Array of primitives */
+		  /* Do nothing */
+		} else if (((INTPTR)pointer)==1) {
+		  /* Array of pointers */
+		  struct ArrayObject *ao=(struct ArrayObject *) ptr;
+		  int length=ao->___length___;
+		  int j;
+		  for(j=0; j<length; j++) {
+			void *objptr =
+			  ((void **)(((char *)&ao->___length___)+sizeof(int)))[j];
+			markObj(objptr);
+		  }
+		} else {
+		  INTPTR size=pointer[0]; // pointer[0] = number of entries, pointer[1..size] = byte offsets
+		  int i;
+		  for(i=1; i<=size; i++) {
+			unsigned int offset=pointer[i];
+			void * objptr=*((void **)(((char *)ptr)+offset));
+			markObj(objptr);
+		  }
+		}     // if (pointer==0) else if ... else ...
+      }   // if(checkfield)
+    }     // while(gc_moreItems2())
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xed07);
+#endif
+    gcbusystatus = false;
+    // send mark finish msg to core coordinator (the startup core records its own status directly)
+    if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT(0xed08);
+#endif
+      gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
+      gcnumsendobjs[gcnumsrobjs_index][BAMBOO_NUM_OF_CORE]=gcself_numsendobjs;
+      gcnumreceiveobjs[gcnumsrobjs_index][BAMBOO_NUM_OF_CORE]=
+		gcself_numreceiveobjs;
+      gcloads[BAMBOO_NUM_OF_CORE] = gccurr_heaptop;
+    } else {
+      if(!sendStall) { // report only once per idle period
+#ifdef DEBUG
+		BAMBOO_DEBUGPRINT(0xed09);
+#endif
+		send_msg_4(STARTUPCORE, GCFINISHMARK, BAMBOO_NUM_OF_CORE,
+				   gcself_numsendobjs, gcself_numreceiveobjs, false);
+		sendStall = true;
+      }
+    }             // if(STARTUPCORE == BAMBOO_NUM_OF_CORE) ...
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xed0a);
+#endif
+
+    if(BAMBOO_NUM_OF_CORE == STARTUPCORE) { // the startup core returns after a single drain
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT(0xed0b);
+#endif
+      return;
+    }
+  }       // while(MARKPHASE == gcphase)
+} // mark()
+
+inline void compact2Heaptophelper_I(int coren, // core whose pending move request is served
+                                    int* p, // in/out: current heap top address on gctopcore
+                                    int* numblocks, // in/out: block count reported to coren (+1)
+                                    int* remain) { // in/out: bytes left in the current top block
+  int b;
+  int memneed = gcrequiredmems[coren] + BAMBOO_CACHE_LINE_SIZE; // request plus one cache line
+  if(STARTUPCORE == coren) { // the startup core sets its own move variables instead of messaging itself
+    gctomove = true;
+    gcmovestartaddr = *p;
+    gcdstcore = gctopcore;
+    gcblock2fill = *numblocks + 1;
+  } else {
+    send_msg_4(coren, GCMOVESTART, gctopcore, *p, (*numblocks) + 1, false);
+  }
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT_REG(coren);
+  BAMBOO_DEBUGPRINT_REG(gctopcore);
+  BAMBOO_DEBUGPRINT_REG(*p);
+  BAMBOO_DEBUGPRINT_REG(*numblocks+1);
+#endif
+  if(memneed < *remain) { // the request fits into what is left of the current top block
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xd104);
+#endif
+    *p = *p + memneed;
+    gcrequiredmems[coren] = 0;
+    gcloads[gctopcore] += memneed;
+    *remain = *remain - memneed;
+  } else {
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xd105);
+#endif
+    // does not fit: give away the rest of this block, then move the heap top to the next available block
+    *p = *p + *remain;
+    gcfilledblocks[gctopcore] += 1;
+    int newbase = 0;
+    BASEPTR(gctopcore, gcfilledblocks[gctopcore], &newbase);
+    gcloads[gctopcore] = newbase;
+    gcrequiredmems[coren] -= *remain - BAMBOO_CACHE_LINE_SIZE; // deduct the rest of the block minus one cache line
+    gcstopblock[gctopcore]++;
+    gctopcore = NEXTTOPCORE(gctopblock);
+    gctopblock++;
+    *numblocks = gcstopblock[gctopcore]; // cursors now describe the new top core
+    *p = gcloads[gctopcore];
+    BLOCKINDEX(*p, &b);
+    *remain=(b<NUMCORES4GC) ?
+             ((BAMBOO_SMEM_SIZE_L)-((*p)%(BAMBOO_SMEM_SIZE_L)))
+	     : ((BAMBOO_SMEM_SIZE)-((*p)%(BAMBOO_SMEM_SIZE)));
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xd106);
+    BAMBOO_DEBUGPRINT_REG(gctopcore);
+    BAMBOO_DEBUGPRINT_REG(*p);
+    BAMBOO_DEBUGPRINT_REG(b);
+    BAMBOO_DEBUGPRINT_REG(*remain);
+#endif
+  }       // if(memneed < remain)
+  gcmovepending--; // one pending move request has been answered
+} // void compact2Heaptophelper_I(int, int*, int*, int*)
+
+inline void compact2Heaptop() {
+  // no cores with spare mem and some cores are blocked with pending move
+  // find the current heap top and make them move to the heap top
+  int p;
+  int numblocks = gcfilledblocks[gctopcore];
+  //BASEPTR(gctopcore, numblocks, &p);
+  p = gcloads[gctopcore]; // heap top address on the current top core
+  int b;
+  BLOCKINDEX(p, &b);
+  int remain = (b<NUMCORES4GC) ?
+               ((BAMBOO_SMEM_SIZE_L)-(p%(BAMBOO_SMEM_SIZE_L)))
+	       : ((BAMBOO_SMEM_SIZE)-(p%(BAMBOO_SMEM_SIZE)));
+  // check if the top core finishes (a non-zero gccorestatus entry means still working)
+  BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
+  if(gccorestatus[gctopcore] != 0) {
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xd101);
+    BAMBOO_DEBUGPRINT_REG(gctopcore);
+#endif
+    // let the top core finishes its own work first
+    compact2Heaptophelper_I(gctopcore, &p, &numblocks, &remain);
+    BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+    return;
+  }
+  BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xd102);
+  BAMBOO_DEBUGPRINT_REG(gctopcore);
+  BAMBOO_DEBUGPRINT_REG(p);
+  BAMBOO_DEBUGPRINT_REG(b);
+  BAMBOO_DEBUGPRINT_REG(remain);
+#endif
+  for(int i = 0; i < NUMCORES4GC; i++) { // serve every unfinished core that still requires memory
+    BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
+    if((gccorestatus[i] != 0) && (gcrequiredmems[i] > 0)) {
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT(0xd103);
+#endif
+      compact2Heaptophelper_I(i, &p, &numblocks, &remain);
+      if(gccorestatus[gctopcore] != 0) { // the helper may have changed gctopcore
+#ifdef DEBUG
+		BAMBOO_DEBUGPRINT(0xd101);
+		BAMBOO_DEBUGPRINT_REG(gctopcore);
+#endif
+		BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+		// the top core is not free now
+		return;
+      }
+    }             // if((gccorestatus[i] != 0) && (gcrequiredmems[i] > 0))
+    BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+  }       // for(i = 0; i < NUMCORES4GC; i++)
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xd106);
+#endif
+} // void compact2Heaptop()
+
+inline void resolvePendingMoveRequest() { // pair cores that have spare mem with cores waiting for mem
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xeb01);
+#endif
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xeeee);
+  for(int k = 0; k < NUMCORES4GC; k++) {
+    BAMBOO_DEBUGPRINT(0xf000+k);
+    BAMBOO_DEBUGPRINT_REG(gccorestatus[k]);
+    BAMBOO_DEBUGPRINT_REG(gcloads[k]);
+    BAMBOO_DEBUGPRINT_REG(gcfilledblocks[k]);
+    BAMBOO_DEBUGPRINT_REG(gcstopblock[k]);
+  }
+  BAMBOO_DEBUGPRINT(0xffff);
+#endif
+  int i; // scans for a source core (finished, with unfilled blocks)
+  int j; // scans for a destination core (unfinished, with a pending request)
+  bool nosparemem = true;
+  bool haspending = false;
+  bool hasrunning = false; // some unfinished core is not blocked on a request
+  bool noblock = false; // at least one request was matched in this call
+  int dstcore = 0;       // the core who need spare mem
+  int sourcecore = 0;       // the core who has spare mem
+  for(i = j = 0; (i < NUMCORES4GC) && (j < NUMCORES4GC); ) { // each scan advances only while unmatched
+    if(nosparemem) {
+      // check if there are cores with spare mem
+      if(gccorestatus[i] == 0) {
+		// finished working, check if it still have spare mem
+		if(gcfilledblocks[i] < gcstopblock[i]) {
+		  // still have spare mem
+		  nosparemem = false;
+		  sourcecore = i;
+		}  // if(gcfilledblocks[i] < gcstopblock[i]) else ...
+      }
+      i++;
+    }             // if(nosparemem)
+    if(!haspending) {
+      if(gccorestatus[j] != 0) {
+		// not finished, check if it has pending move requests
+		if((gcfilledblocks[j]==gcstopblock[j])&&(gcrequiredmems[j]>0)) {
+		  dstcore = j;
+		  haspending = true;
+		} else {
+		  hasrunning = true;
+		}  // if((gcfilledblocks[i] == gcstopblock[i])...) else ...
+      }  // if(gccorestatus[i] == 0) else ...
+      j++;
+    }  // if(!haspending)
+    if(!nosparemem && haspending) {
+      // find match: give sourcecore's spare mem to dstcore and tell dstcore where to move
+      int tomove = 0;
+      int startaddr = 0;
+      BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
+      gcrequiredmems[dstcore] = assignSpareMem_I(sourcecore,
+                                                 gcrequiredmems[dstcore],
+                                                 &tomove,
+                                                 &startaddr);
+      BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT(0xeb02);
+      BAMBOO_DEBUGPRINT_REG(sourcecore);
+      BAMBOO_DEBUGPRINT_REG(dstcore);
+      BAMBOO_DEBUGPRINT_REG(startaddr);
+      BAMBOO_DEBUGPRINT_REG(tomove);
+#endif
+      if(STARTUPCORE == dstcore) { // this core is the requester: set its move variables directly
+#ifdef DEBUG
+		BAMBOO_DEBUGPRINT(0xeb03);
+#endif
+		gcdstcore = sourcecore;
+		gctomove = true;
+		gcmovestartaddr = startaddr;
+		gcblock2fill = tomove;
+      } else {
+#ifdef DEBUG
+		BAMBOO_DEBUGPRINT(0xeb04);
+#endif
+		send_msg_4(dstcore, GCMOVESTART, sourcecore,
+				   startaddr, tomove, false);
+      }
+      gcmovepending--;
+      nosparemem = true; // resume both scans to look for the next pair
+      haspending = false;
+      noblock = true;
+    }
+  }       // for(i = 0; i < NUMCORES4GC; i++)
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xcccc);
+  BAMBOO_DEBUGPRINT_REG(hasrunning);
+  BAMBOO_DEBUGPRINT_REG(haspending);
+  BAMBOO_DEBUGPRINT_REG(noblock);
+#endif
+
+  if(!hasrunning && !noblock) { // nothing is running and nothing was matched: compact to the heap top
+    gcphase = SUBTLECOMPACTPHASE;
+    compact2Heaptop();
+  }
+
+} // void resolvePendingMoveRequest()
+
+struct moveHelper { // cursor over one core's heap blocks; used for the source (orig) and the destination (to)
+  int numblocks;       // block num for heap
+  INTPTR base;       // base virtual address of current heap block
+  INTPTR ptr;       // virtual address of current heap top
+  int offset;       // offset in current heap block
+  int blockbase;       // virtual address of current small block to check
+  int blockbound;       // bound virtual address of current small block
+  int sblockindex;       // index of the small blocks
+  int top;       // real size of current heap block to check
+  int bound;       // bound of current heap block; initOrig_Dst stores a size for `to`, an address for `orig`
+}; // struct moveHelper
+
+// Advance orig to the next small block to scan. If out of boundary of valid shared memory, return false, else return true
+inline bool nextSBlock(struct moveHelper * orig) {
+  orig->blockbase = orig->blockbound; // start from the end of the small block just finished
+  bool sbchanged = false; // set when sblockindex moves to a small block not visited yet
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xecc0);
+  BAMBOO_DEBUGPRINT_REG(orig->blockbase);
+  BAMBOO_DEBUGPRINT_REG(orig->blockbound);
+  BAMBOO_DEBUGPRINT_REG(orig->bound);
+  BAMBOO_DEBUGPRINT_REG(orig->ptr);
+#endif
+outernextSBlock:
+  // check if across a big block
+  // TODO now do not zero out the whole memory, maybe the last two conditions
+  // are useless now
+  if((orig->blockbase >= orig->bound) || (orig->ptr >= orig->bound)
+     || ((orig->ptr != NULL) && (*((int*)orig->ptr))==0)
+     || ((*((int*)orig->blockbase))==0)) {
+innernextSBlock:
+    // end of current heap block, jump to next one
+    orig->numblocks++;
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xecc1);
+    BAMBOO_DEBUGPRINT_REG(orig->numblocks);
+#endif
+    BASEPTR(BAMBOO_NUM_OF_CORE, orig->numblocks, &(orig->base));
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(orig->base);
+#endif
+    if(orig->base >= gcbaseva + BAMBOO_SHARED_MEM_SIZE) {
+      // out of boundary
+      orig->ptr = orig->base; // set current ptr to out of boundary too
+      return false;
+    }
+    //orig->bound = orig->base + BAMBOO_SMEM_SIZE;
+    orig->blockbase = orig->base;
+    orig->sblockindex = (orig->blockbase-gcbaseva)/BAMBOO_SMEM_SIZE;
+    sbchanged = true;
+    int blocknum = 0;
+    BLOCKINDEX(orig->base, &blocknum);
+    if(bamboo_smemtbl[blocknum] == 0) {
+      // goto next block (bamboo_smemtbl records 0 used bytes for this one)
+      goto innernextSBlock;
+    }
+	// check the bamboo_smemtbl to decide the real bound
+	orig->bound = orig->base + bamboo_smemtbl[blocknum];
+  } else if(0 == (orig->blockbase%BAMBOO_SMEM_SIZE)) { // stepped exactly onto the next small block
+    orig->sblockindex += 1;
+    sbchanged = true;
+  }  // if((orig->blockbase >= orig->bound) || (orig->ptr >= orig->bound)...
+
+  // check if this sblock should be skipped (-1) or have special start point (non-zero)
+  if(gcsbstarttbl[orig->sblockindex] == -1) {
+    // goto next sblock
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xecc2);
+#endif
+    orig->sblockindex += 1;
+    orig->blockbase += BAMBOO_SMEM_SIZE;
+    goto outernextSBlock;
+  } else if((gcsbstarttbl[orig->sblockindex] != 0)
+            && (sbchanged)) {
+    // the first time to access this SBlock
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xecc3);
+#endif
+    // not start from the very beginning
+    orig->blockbase = gcsbstarttbl[orig->sblockindex];
+  }       // if(gcsbstarttbl[orig->sblockindex] == -1) else ...
+
+  // setup information for this sblock: its header's first int is added to blockbase to get the bound
+  orig->blockbound = orig->blockbase + *((int*)(orig->blockbase));
+  orig->offset = BAMBOO_CACHE_LINE_SIZE;
+  orig->ptr = orig->blockbase + orig->offset;
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xecc4);
+  BAMBOO_DEBUGPRINT_REG(orig->base);
+  BAMBOO_DEBUGPRINT_REG(orig->bound);
+  BAMBOO_DEBUGPRINT_REG(orig->ptr);
+  BAMBOO_DEBUGPRINT_REG(orig->blockbound);
+  BAMBOO_DEBUGPRINT_REG(orig->blockbase);
+  BAMBOO_DEBUGPRINT_REG(orig->offset);
+#endif
+  if(orig->ptr >= orig->bound) {
+    // met a lobj, move to next block
+    goto innernextSBlock;
+  }
+
+  return true;
+} // bool nextSBlock(struct moveHelper * orig)
+
+// set both cursors to this core's block 0; return false if there are no available data to compact
+inline bool initOrig_Dst(struct moveHelper * orig,
+                         struct moveHelper * to) {
+  // init the dst ptr: block 0, positioned just past the header
+  to->numblocks = 0;
+  to->top = to->offset = BAMBOO_CACHE_LINE_SIZE;
+  to->bound = BAMBOO_SMEM_SIZE_L;
+  BASEPTR(BAMBOO_NUM_OF_CORE, to->numblocks, &(to->base));
+
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xef01);
+  BAMBOO_DEBUGPRINT_REG(to->base);
+#endif
+  to->ptr = to->base + to->offset;
+
+  // init the orig ptr: it starts at the same block as the destination
+  orig->numblocks = 0;
+  orig->base = to->base;
+  int blocknum = 0;
+  BLOCKINDEX(orig->base, &blocknum);
+  // check the bamboo_smemtbl to decide the real bound
+  orig->bound = orig->base + bamboo_smemtbl[blocknum];
+  orig->blockbase = orig->base;
+  orig->sblockindex = (orig->base - gcbaseva) / BAMBOO_SMEM_SIZE;
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xef02);
+  BAMBOO_DEBUGPRINT_REG(orig->base);
+  BAMBOO_DEBUGPRINT_REG(orig->sblockindex);
+  BAMBOO_DEBUGPRINT_REG(gcsbstarttbl);
+  BAMBOO_DEBUGPRINT_REG(gcsbstarttbl[orig->sblockindex]);
+#endif
+
+  if(gcsbstarttbl[orig->sblockindex] == -1) { // -1: this small block is to be skipped
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xef03);
+#endif
+    // goto next sblock; nextSBlock() continues from blockbound, so point it at the end of this one
+    orig->blockbound =
+      gcbaseva+BAMBOO_SMEM_SIZE*(orig->sblockindex+1);
+    return nextSBlock(orig);
+  } else if(gcsbstarttbl[orig->sblockindex] != 0) { // non-zero: scanning starts at the recorded address
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xef04);
+#endif
+    orig->blockbase = gcsbstarttbl[orig->sblockindex];
+  }
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xef05);
+#endif
+  orig->blockbound = orig->blockbase + *((int*)(orig->blockbase)); // first int of the header added to blockbase
+  orig->offset = BAMBOO_CACHE_LINE_SIZE;
+  orig->ptr = orig->blockbase + orig->offset;
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xef06);
+  BAMBOO_DEBUGPRINT_REG(orig->base);
+#endif
+  return true;
+} // bool initOrig_Dst(struct moveHelper * orig, struct moveHelper * to)
+
+inline void nextBlock(struct moveHelper * to) { // advance the destination cursor `to` by one heap block
+  to->top = BAMBOO_CACHE_LINE_SIZE + to->bound; // previous bound plus the new block's header
+  to->bound = to->bound + BAMBOO_SMEM_SIZE;
+  to->numblocks += 1;
+  to->offset = BAMBOO_CACHE_LINE_SIZE; // first usable byte follows the header
+  BASEPTR(BAMBOO_NUM_OF_CORE, to->numblocks, &(to->base));
+  to->ptr = to->offset + to->base;
+} // void nextBlock(struct moveHelper * to)
+
+// handle the obj at orig->ptr; returns true when there is no more data or stopblock is reached, else false
+inline bool moveobj(struct moveHelper * orig,
+                    struct moveHelper * to,
+                    int stopblock) {
+  if(stopblock == 0) {
+    return true;
+  }
+
+#ifdef DEBUG
+  //if((int)orig->ptr > 0x10767a00) {
+  BAMBOO_DEBUGPRINT(0xe201);
+  BAMBOO_DEBUGPRINT_REG(orig->ptr);
+  BAMBOO_DEBUGPRINT_REG(to->ptr);
+  //}
+#endif
+
+  int type = 0;
+  int size = 0;
+  int mark = 0;
+  int isize = 0;
+innermoveobj:
+  while((char)(*((int*)(orig->ptr))) == (char)(-2)) { // skip -2 padding, one int at a time
+    orig->ptr = (int*)(orig->ptr) + 1;
+  }
+  if((orig->ptr >= orig->bound) || (orig->ptr == orig->blockbound)) {
+    if(!nextSBlock(orig)) {
+      // finished, no more data
+      return true;
+    }
+    goto innermoveobj;
+  }
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xe202);
+  BAMBOO_DEBUGPRINT_REG(orig->ptr);
+  BAMBOO_DEBUGPRINT(((int *)(orig->ptr))[0]);
+#endif
+  // check the obj's type, size and mark flag
+  type = ((int *)(orig->ptr))[0];
+  size = 0;
+  if(type == 0) {
+    // end of this block, go to next one
+    if(!nextSBlock(orig)) {
+      // finished, no more data
+      return true;
+    }
+    goto innermoveobj;
+  } else if(type < NUMCLASSES) {
+    // a normal object
+    size = classsize[type];
+  } else {
+    // an array: header plus length elements of classsize[type] bytes each
+    struct ArrayObject *ao=(struct ArrayObject *)(orig->ptr);
+    int elementsize=classsize[type];
+    int length=ao->___length___;
+    size=sizeof(struct ArrayObject)+length*elementsize;
+  }
+  mark = ((int *)(orig->ptr))[6];
+  bool isremote = ((((int *)(orig->ptr))[6] & REMOTEM) != 0);
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xe203);
+  BAMBOO_DEBUGPRINT_REG(orig->ptr);
+  BAMBOO_DEBUGPRINT_REG(size);
+#endif
+  ALIGNSIZE(size, &isize);       // no matter is the obj marked or not
+                                 // should be able to across it
+  if((mark & MARKED) != 0) {
+#ifdef DEBUG
+//if((int)orig->ptr > 0x10760f00) {
+    BAMBOO_DEBUGPRINT(0xe204);
+//}
+#endif
+#ifdef GC_PROFILE
+	gc_num_liveobj++;
+#endif
+    // marked obj, copy it to current heap top
+    // check to see if remaining space is enough
+    if(to->top + isize > to->bound) {
+      // fill 0 indicating the end of this block
+      BAMBOO_MEMSET_WH(to->ptr,  '\0', to->bound - to->top);
+      // fill the header of this block and then go to next block
+      to->offset += to->bound - to->top;
+      BAMBOO_MEMSET_WH(to->base, '\0', BAMBOO_CACHE_LINE_SIZE);
+      (*((int*)(to->base))) = to->offset;
+      nextBlock(to);
+      if(stopblock == to->numblocks) {
+		// already fulfilled the block
+		return true;
+      }   // if(stopblock == to->numblocks)
+    }   // if(to->top + isize > to->bound)
+    // set the mark field to COMPACTED, indicating that this obj has been moved
+    // and need to be flushed
+    ((int *)(orig->ptr))[6] = COMPACTED;
+    if(to->ptr != orig->ptr) {
+      if((int)(orig->ptr) < (int)(to->ptr)+size) { // memmove on this branch, memcpy on the other
+		memmove(to->ptr, orig->ptr, size);
+      } else {
+		//BAMBOO_WRITE_HINT_CACHE(to->ptr, size);
+		memcpy(to->ptr, orig->ptr, size);
+      }
+      // fill the remaining space with -2
+      BAMBOO_MEMSET_WH(to->ptr+size, -2, isize-size);
+    }
+    // store mapping info (old address -> new address)
+    BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
+#ifdef LOCALHASHTBL_TEST
+    RuntimeHashadd_I(gcpointertbl, orig->ptr, to->ptr);
+#else
+	mgchashInsert_I(gcpointertbl, orig->ptr, to->ptr);
+#endif
+	//MGCHashadd_I(gcpointertbl, orig->ptr, to->ptr);
+	if(isremote) {
+#ifdef GC_PROFILE
+	//unsigned long long ttimet = BAMBOO_GET_EXE_TIME();
+#endif
+	  // add to the sharedptbl
+	  if(gcsharedptbl != NULL) {
+		//GCSharedHashadd_I(gcsharedptbl, orig->ptr, to->ptr);
+		mgcsharedhashInsert_I(gcsharedptbl, orig->ptr, to->ptr);
+		//num_mapinforequest++; // TODO
+	  }
+#ifdef GC_PROFILE
+	//flushstalltime_i += BAMBOO_GET_EXE_TIME()-ttimet;
+#endif
+	}
+    BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+    //}
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xcdce);
+    BAMBOO_DEBUGPRINT_REG(orig->ptr);
+    BAMBOO_DEBUGPRINT_REG(to->ptr);
+	BAMBOO_DEBUGPRINT_REG(isize);
+#endif
+    gccurr_heaptop -= isize;
+    to->ptr += isize;
+    to->offset += isize;
+    to->top += isize;
+    if(to->top == to->bound) {
+      // fill the header of this block and then go to next block
+      BAMBOO_MEMSET_WH(to->base, '\0', BAMBOO_CACHE_LINE_SIZE);
+      (*((int*)(to->base))) = to->offset;
+      nextBlock(to);
+    }
+  }       // if((mark & MARKED) != 0)
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xe205);
+#endif
+  // move to next obj; advances by size (not isize), -2 padding is skipped by the loop at innermoveobj
+  orig->ptr += size;
+
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT_REG(isize);
+  BAMBOO_DEBUGPRINT_REG(size);
+  BAMBOO_DEBUGPRINT_REG(orig->ptr);
+  BAMBOO_DEBUGPRINT_REG(orig->bound);
+#endif
+  if((orig->ptr > orig->bound) || (orig->ptr == orig->blockbound)) {
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xe206);
+#endif
+    if(!nextSBlock(orig)) {
+      // finished, no more data
+      return true;
+    }
+  }
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xe207);
+  BAMBOO_DEBUGPRINT_REG(orig->ptr);
+#endif
+  return false;
+} //bool moveobj(struct moveHelper* orig,struct moveHelper* to,int stopblock)
+
+// Reserve room for a pending move request on core `sourcecore`.
+// Should be invoked with interrupts closed.
+//   sourcecore  -- core whose spare memory is handed out
+//   requiredmem -- number of bytes requested. The parameter is declared as
+//                  a pointer, but the caller in this file (gcfindSpareMem_I)
+//                  passes the byte count by value, so it is converted back
+//                  to an integer before any arithmetic is done on it.
+//   tomove      -- out: gcfilledblocks[sourcecore] + 1 at the time of the call
+//   startaddr   -- out: gcloads[sourcecore] at the time of the call
+// Returns 0 if the request (plus one cache line for the block header) fits in
+// the rest of the current block; otherwise advances the core to its next
+// block and returns requiredmem minus the space that was left.
+inline int assignSpareMem_I(int sourcecore,
+                            int * requiredmem,
+                            int * tomove,
+                            int * startaddr) {
+  // Treat the argument as the byte count it really is. Doing arithmetic on
+  // the int* directly would scale every operand by sizeof(int).
+  int reqbytes = (int)(INTPTR)requiredmem;
+  int b = 0;
+  BLOCKINDEX(gcloads[sourcecore], &b);
+  // NOTE(review): boundptr is built only from block sizes while gcloads[] is
+  // subtracted from it directly -- confirm both use the same address base.
+  int boundptr = (b<NUMCORES4GC) ? ((b+1)*BAMBOO_SMEM_SIZE_L)
+                 : (BAMBOO_LARGE_SMEM_BOUND+(b-NUMCORES4GC+1)*BAMBOO_SMEM_SIZE);
+  int remain = boundptr - gcloads[sourcecore];
+  int memneed = reqbytes + BAMBOO_CACHE_LINE_SIZE;
+  *startaddr = gcloads[sourcecore];
+  *tomove = gcfilledblocks[sourcecore] + 1;
+  if(memneed < remain) {
+    // the request fits in the current block
+    gcloads[sourcecore] += memneed;
+    return 0;
+  } else {
+    // next available block
+    gcfilledblocks[sourcecore] += 1;
+    int newbase = 0;
+    BASEPTR(sourcecore, gcfilledblocks[sourcecore], &newbase);
+    gcloads[sourcecore] = newbase;
+    return reqbytes-remain;
+  }
+} // int assignSpareMem_I(int ,int * , int * , int * )
+
+// Try to satisfy a request for `requiredmem` bytes issued by `requiredcore`.
+// Should be invoked with interrupts closed.
+// On success fills *startaddr, *tomove and *dstcore and returns true.
+// Otherwise the request is recorded in gcrequiredmems[] and counted in
+// gcmovepending, and false is returned.
+inline bool gcfindSpareMem_I(int * startaddr,
+                             int * tomove,
+                             int * dstcore,
+                             int requiredmem,
+                             int requiredcore) {
+  int core;
+  for(core = 0; core < NUMCORES4GC; ++core) {
+    if(gccorestatus[core] != 0) {
+      // status is non-zero: this core is not a candidate
+      continue;
+    }
+    if(gcfilledblocks[core] >= gcstopblock[core]) {
+      // this core has already filled every block up to its stop block
+      continue;
+    }
+    // take the memory from the first candidate found
+    assignSpareMem_I(core, requiredmem, tomove, startaddr);
+    *dstcore = core;
+    return true;
+  }
+
+  // no spare mem available right now: hold the request
+  gcrequiredmems[requiredcore] = requiredmem;
+  gcmovepending++;
+  return false;
+} //bool gcfindSpareMem_I(int*,int*,int*,int,int)
+
+// Compact the marked objects of this core and report the outcome.
+//   orig         -- cursor over the source objs that still have to be scanned
+//   to           -- cursor over the destination block currently being filled
+//   filledblocks -- out: receives to->numblocks while *localcompact is true
+//   heaptopptr   -- out: receives to->ptr while *localcompact is true
+//   localcompact -- in/out: true while the destination belongs to this core;
+//                   rewritten each time a new destination is assigned
+// Returns false only on the startup core, when objs are left over and
+// gcfindSpareMem_I() could not find room for them; returns true otherwise.
+inline bool compacthelper(struct moveHelper * orig,
+                          struct moveHelper * to,
+                          int * filledblocks,
+                          int * heaptopptr,
+                          bool * localcompact) {
+  // scan over all objs in this block, compact the marked objs
+  // loop stop when finishing either scanning all active objs or
+  // fulfilled the gcstopblock
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xe101);
+  BAMBOO_DEBUGPRINT_REG(gcblock2fill);
+  BAMBOO_DEBUGPRINT_REG(gcmarkedptrbound);
+#endif
+  // re-entry point: jumped to again after a new destination has been set up
+innercompact:
+  while(orig->ptr < gcmarkedptrbound) {
+    bool stop = moveobj(orig, to, gcblock2fill);
+    if(stop) {
+      break;
+    }
+  }
+  // if no objs have been compact, do nothing,
+  // otherwise, fill the header of this block
+  if(to->offset > BAMBOO_CACHE_LINE_SIZE) {
+    // header = one zeroed cache line whose first int holds the used size
+    BAMBOO_MEMSET_WH(to->base, '\0', BAMBOO_CACHE_LINE_SIZE);
+    (*((int*)(to->base))) = to->offset;
+  } else {
+    // nothing was written: give back the cache line reserved for the header
+    to->offset = 0;
+    to->ptr = to->base;
+    to->top -= BAMBOO_CACHE_LINE_SIZE;
+  }       // if(to->offset > BAMBOO_CACHE_LINE_SIZE) else ...
+  // the out parameters are only refreshed while filling this core's own mem
+  if(*localcompact) {
+    *heaptopptr = to->ptr;
+    *filledblocks = to->numblocks;
+  }
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xe102);
+  BAMBOO_DEBUGPRINT_REG(orig->ptr);
+  BAMBOO_DEBUGPRINT_REG(gcmarkedptrbound);
+  BAMBOO_DEBUGPRINT_REG(*heaptopptr);
+  BAMBOO_DEBUGPRINT_REG(*filledblocks);
+  BAMBOO_DEBUGPRINT_REG(gccurr_heaptop);
+#endif
+
+  // send msgs to core coordinator indicating that the compact is finishing
+  // send compact finish message to core coordinator
+  if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
+    // this core is the coordinator: update the bookkeeping directly
+    gcfilledblocks[BAMBOO_NUM_OF_CORE] = *filledblocks;
+    gcloads[BAMBOO_NUM_OF_CORE] = *heaptopptr;
+    if(orig->ptr < gcmarkedptrbound) {
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT(0xe103);
+#endif
+      // ask for more mem
+      gctomove = false;
+      BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
+      if(gcfindSpareMem_I(&gcmovestartaddr, &gcblock2fill, &gcdstcore,
+                          gccurr_heaptop, BAMBOO_NUM_OF_CORE)) {
+#ifdef DEBUG
+		BAMBOO_DEBUGPRINT(0xe104);
+#endif
+		gctomove = true;
+      } else {
+		// request was queued by gcfindSpareMem_I; let the caller retry later
+		BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+#ifdef DEBUG
+		BAMBOO_DEBUGPRINT(0xe105);
+#endif
+		return false;
+      }
+      BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+    } else {
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT(0xe106);
+#endif
+      // every obj has been moved: mark this core as finished
+      gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
+      gctomove = false;
+      return true;
+    }
+  } else {
+    if(orig->ptr < gcmarkedptrbound) {
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT(0xe107);
+#endif
+      // ask for more mem
+      gctomove = false;
+      send_msg_5(STARTUPCORE, GCFINISHCOMPACT, BAMBOO_NUM_OF_CORE,
+                 *filledblocks, *heaptopptr, gccurr_heaptop, false);
+    } else {
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT(0xe108);
+      BAMBOO_DEBUGPRINT_REG(*heaptopptr);
+#endif
+      // finish compacting
+      send_msg_5(STARTUPCORE, GCFINISHCOMPACT, BAMBOO_NUM_OF_CORE,
+                 *filledblocks, *heaptopptr, 0, false);
+    }
+  }       // if(STARTUPCORE == BAMBOO_NUM_OF_CORE)
+
+  if(orig->ptr < gcmarkedptrbound) {
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xe109);
+#endif
+    // still have unpacked obj
+    // spin until gctomove is raised (it is not written inside this loop)
+    while(true) {
+      if(gctomove) {
+		break;
+      }
+    }
+    ;
+	gctomove = false;
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xe10a);
+#endif
+
+    // point `to` at the newly assigned destination: gcmovestartaddr inside
+    // block (gcblock2fill - 1) of core gcdstcore
+    to->ptr = gcmovestartaddr;
+    to->numblocks = gcblock2fill - 1;
+    to->bound = (to->numblocks==0) ?
+                BAMBOO_SMEM_SIZE_L :
+                BAMBOO_SMEM_SIZE_L+BAMBOO_SMEM_SIZE*to->numblocks;
+    BASEPTR(gcdstcore, to->numblocks, &(to->base));
+    to->offset = to->ptr - to->base;
+    to->top = (to->numblocks==0) ?
+              (to->offset) : (to->bound-BAMBOO_SMEM_SIZE+to->offset);
+    // from here on the region starts at the assigned address and begins
+    // with a cache-line sized header
+    to->base = to->ptr;
+    to->offset = BAMBOO_CACHE_LINE_SIZE;
+    to->ptr += to->offset;             // for header
+    to->top += to->offset;
+    if(gcdstcore == BAMBOO_NUM_OF_CORE) {
+      *localcompact = true;
+    } else {
+      *localcompact = false;
+    }
+    goto innercompact;
+  }
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xe10b);
+#endif
+  return true;
+} // bool compacthelper(moveHelper*, moveHelper*, int*, int*, bool*)
+
+// Compact phase entry point for a collector core.
+// Exits with code 0xb102 if the collector is not in COMPACTPHASE.
+inline void compact() {
+  if(gcphase != COMPACTPHASE) {
+    BAMBOO_EXIT(0xb102);
+  }
+
+  // source / destination cursors used while compacting
+  struct moveHelper * src =
+    (struct moveHelper *)RUNMALLOC(sizeof(struct moveHelper));
+  struct moveHelper * dst =
+    (struct moveHelper *)RUNMALLOC(sizeof(struct moveHelper));
+
+  if(initOrig_Dst(src, dst)) {
+    // there is data to compact
+    int filledblocks = 0;
+    INTPTR heaptopptr = 0;
+    bool localcompact = true;
+    compacthelper(src, dst, &filledblocks, &heaptopptr, &localcompact);
+  } else {
+    // no available data to compact:
+    // send compact finish msg to STARTUP core right away
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xe001);
+    BAMBOO_DEBUGPRINT_REG(dst->base);
+#endif
+    send_msg_5(STARTUPCORE, GCFINISHCOMPACT, BAMBOO_NUM_OF_CORE,
+               0, dst->base, 0, false);
+  }
+
+  // both paths release the cursors
+  RUNFREE(src);
+  RUNFREE(dst);
+} // compact()
+
+// Look up the new address recorded for objptr in the mapping tables.
+// if return NULL, means
+//   1. objptr is NULL
+//   2. objptr is not a shared obj
+// in these cases, keeping the original value is OK
+// Lookup order for a shared obj: the local gcpointertbl, then the table
+// gcrpointertbls[] published by the obj's host core, and finally a
+// GCMAPREQUEST message to the host core followed by a second lookup in
+// gcpointertbl.
+inline void * flushObj(void * objptr) {
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xe401);
+#endif
+  if(objptr == NULL) {
+    return NULL;
+  }
+  void * dstptr = NULL;
+  if(ISSHAREDOBJ(objptr)) {
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xe402);
+    BAMBOO_DEBUGPRINT_REG(objptr);
+#endif
+    // a shared obj ptr, change to new address
+    BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
+#ifdef GC_PROFILE
+    //unsigned long long ttime = BAMBOO_GET_EXE_TIME();
+#endif
+    // step 1: local mapping table
+#ifdef LOCALHASHTBL_TEST
+    RuntimeHashget(gcpointertbl, objptr, &dstptr);
+#else
+	dstptr = mgchashSearch(gcpointertbl, objptr);
+#endif
+	//MGCHashget(gcpointertbl, objptr, &dstptr);
+#ifdef GC_PROFILE
+    //flushstalltime += BAMBOO_GET_EXE_TIME()-ttime;
+#endif
+    BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT_REG(dstptr);
+#endif
+
+    if(NULL == dstptr) {
+      // no mapping info
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT(0xe403);
+      BAMBOO_DEBUGPRINT_REG(objptr);
+      BAMBOO_DEBUGPRINT_REG(hostcore(objptr));
+#endif
+      if(hostcore(objptr) == BAMBOO_NUM_OF_CORE) {
+		// error! the obj is right on this core, but cannot find it
+		//BAMBOO_DEBUGPRINT(0xecec);
+		BAMBOO_DEBUGPRINT_REG(objptr);
+		BAMBOO_EXIT(0xb103);
+		// assume that the obj has not been moved, use the original address
+		//dstptr = objptr;
+      } else {
+		int hostc = hostcore(objptr);
+#ifdef GC_PROFILE
+		//unsigned long long ttimet = BAMBOO_GET_EXE_TIME();
+#endif
+		// step 2: check the corresponding sharedptbl of the host core
+		BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
+		//struct GCSharedHash * sptbl = gcrpointertbls[hostcore(objptr)];
+		mgcsharedhashtbl_t * sptbl = gcrpointertbls[hostc];
+		if(sptbl != NULL) {
+		  //GCSharedHashget(sptbl, (int)objptr, &dstptr);
+		  dstptr = mgcsharedhashSearch(sptbl, (int)objptr);
+		  if(dstptr != NULL) {
+			// remember the answer in the local table for later lookups
+#ifdef LOCALHASHTBL_TEST
+			RuntimeHashadd_I(gcpointertbl, (int)objptr, (int)dstptr);
+#else
+			mgchashInsert_I(gcpointertbl, (int)objptr, (int)dstptr);
+#endif
+		  }
+		}
+		BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+#ifdef GC_PROFILE
+		//flushstalltime_i += BAMBOO_GET_EXE_TIME()-ttimet;
+#endif
+
+		if(dstptr == NULL) {
+		  // step 3: still can not get the mapping info,
+		  // send msg to host core for the mapping info
+		  gcobj2map = (int)objptr;
+		  gcismapped = false;
+		  gcmappedobj = NULL;
+#ifdef GC_PROFILE
+		  // TODO
+		  //num_mapinforequest++;
+		  //unsigned long long ttime = BAMBOO_GET_EXE_TIME();
+#endif
+#ifdef GC_PROFILE
+		  //unsigned long long ttimet = BAMBOO_GET_EXE_TIME();
+#endif
+		  // the first time require the mapping, send msg to the hostcore
+		  // for the mapping info
+		  send_msg_3(hostc, GCMAPREQUEST, (int)objptr,
+			  BAMBOO_NUM_OF_CORE, false);
+		  // spin until gcismapped is raised (it is not written in this loop)
+		  while(true) {
+			if(gcismapped) {
+			  break;
+			}
+		  }
+#ifdef GC_PROFILE
+		  //flushstalltime_i += BAMBOO_GET_EXE_TIME()-ttimet;
+#endif
+#ifdef GC_PROFILE
+		  // TODO
+		  //flushstalltime += BAMBOO_GET_EXE_TIME() - ttime;
+#endif
+		  // the result is read back from gcpointertbl, not from gcmappedobj
+		  // NOTE(review): presumably the reply handler inserts the mapping
+		  // into gcpointertbl before raising gcismapped -- confirm.
+		  BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
+#ifdef LOCALHASHTBL_TEST
+		  RuntimeHashget(gcpointertbl, objptr, &dstptr);
+#else
+		  dstptr = mgchashSearch(gcpointertbl, objptr);
+#endif
+		  //MGCHashget(gcpointertbl, objptr, &dstptr);
+		  BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+		} // if(dstptr == NULL)
+	  }    // if(hostcore(objptr) == BAMBOO_NUM_OF_CORE) else ...
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT_REG(dstptr);
+#endif
+    }     // if(NULL == dstptr)
+  }      // if(ISSHAREDOBJ(objptr))
+         // if not a shared obj, return NULL to indicate no need to flush
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xe404);
+#endif
+  return dstptr;
+} // void * flushObj(void * objptr)
+
+// Update every obj reference held by the runtime structures of this core:
+// the stack chain starting at stackptr, the parameter object sets, the
+// current and the active task descriptors, both transfer queues and the
+// runtime lock table. A reference is overwritten only when flushObj()
+// returns a non-NULL address for it.
+inline void flushRuntimeObj(struct garbagelist * stackptr) {
+  int idx, q;
+  void * moved = NULL;
+
+  // references on the current stack
+  for(; stackptr != NULL; stackptr = stackptr->next) {
+    for(idx = 0; idx < stackptr->size; idx++) {
+      if(NULL == stackptr->array[idx]) {
+        continue;
+      }
+      moved = flushObj(stackptr->array[idx]);
+      if(moved != NULL) {
+        stackptr->array[idx] = moved;
+      }
+    }
+  }
+
+  // object sets of the parameter queues; each set is rehashed after its
+  // keys have been updated
+  if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) {
+    for(idx = 0; idx < NUMCLASSES; idx++) {
+      struct parameterwrapper ** queues =
+        objectqueues[BAMBOO_NUM_OF_CORE][idx];
+      int nqueues = numqueues[BAMBOO_NUM_OF_CORE][idx];
+      for(q = 0; q < nqueues; ++q) {
+        struct ObjectHash * set = queues[q]->objectset;
+        struct ObjectNode * node;
+        for(node = set->listhead; node != NULL; node = node->lnext) {
+          moved = flushObj((void *)node->key);
+          if(moved != NULL) {
+            node->key = moved;
+          }
+        }
+        ObjectHashrehash(set);
+      }
+    }
+  }
+
+  // parameters of the current task descriptor
+  if(currtpd != NULL) {
+    for(idx = 0; idx < currtpd->numParameters; idx++) {
+      moved = flushObj(currtpd->parameterArray[idx]);
+      if(moved != NULL) {
+        currtpd->parameterArray[idx] = moved;
+      }
+    }
+  }
+
+  // parameters of all active tasks; the table is rehashed afterwards
+  if(activetasks != NULL) {
+    struct genpointerlist * entry;
+    for(entry = activetasks->list; entry != NULL; entry = entry->inext) {
+      struct taskparamdescriptor * tpd = entry->src;
+      for(idx = 0; idx < tpd->numParameters; idx++) {
+        moved = flushObj(tpd->parameterArray[idx]);
+        if(moved != NULL) {
+          tpd->parameterArray[idx] = moved;
+        }
+      }
+    }
+    genrehash(activetasks);
+  }
+
+  // cached transferred objs
+  struct QueueItem * received;
+  for(received = getHead(&objqueue); received != NULL;
+      received = getNextQueueItem(received)) {
+    struct transObjInfo * info =
+      (struct transObjInfo *)(received->objectptr);
+    moved = flushObj(info->objptr);
+    if(moved != NULL) {
+      info->objptr = moved;
+    }
+  }
+
+  // cached objs that are still to be transferred
+  struct QueueItem * pending;
+  for(pending = getHead(totransobjqueue); pending != NULL;
+      pending = getNextQueueItem(pending)) {
+    struct transObjInfo * info =
+      (struct transObjInfo *)(pending->objectptr);
+    moved = flushObj(info->objptr);
+    if(moved != NULL) {
+      info->objptr = moved;
+    }
+  }
+
+  // lock related info
+  for(idx = 0; idx < runtime_locklen; ++idx) {
+    moved = flushObj(runtime_locks[idx].redirectlock);
+    if(moved != NULL) {
+      runtime_locks[idx].redirectlock = (int)moved;
+    }
+    if(runtime_locks[idx].value != NULL) {
+      moved = flushObj(runtime_locks[idx].value);
+      if(moved != NULL) {
+        runtime_locks[idx].value = (int)moved;
+      }
+    }
+  }
+
+} // void flushRuntimeObj(struct garbagelist * stackptr)
+
+// Map phase: publish this core's gcsharedptbl to every other active core,
+// then (unless this is the startup core) report GCFINISHMAPINFO.
+inline void transmappinginfo() {
+  int core;
+  // broadcast the sharedptbl pointer
+  for(core = 0; core < NUMCORESACTIVE; ++core) {
+    if(BAMBOO_NUM_OF_CORE == core) {
+      // no message to ourselves
+      continue;
+    }
+    send_msg_3(core, GCMAPTBL, gcsharedptbl, BAMBOO_NUM_OF_CORE, false);
+  }
+
+  if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
+    send_msg_2(STARTUPCORE, GCFINISHMAPINFO, BAMBOO_NUM_OF_CORE, false);
+  }
+}
+
+// Flush phase: make every reference owned by this core point at the address
+// that flushObj() reports for its target.
+//   1. flushRuntimeObj() updates the references held by the runtime.
+//   2. Each obj taken from the gc queue gets its pointer fields updated; a
+//      shared obj is first translated to its own new address and is only
+//      scanned while its mark word ([6]) is COMPACTED.
+//   3. The same is done for every obj in the large-object queue.
+//   4. The startup core clears its own gccorestatus entry; every other core
+//      sends GCFINISHFLUSH to the startup core.
+// Exits with 0xb105 / 0xb106 if a queued shared / large obj has no mapping.
+inline void flush(struct garbagelist * stackptr) {
+#ifdef GC_PROFILE
+  /* TODO if(BAMBOO_NUM_OF_CORE == 0) {
+    BAMBOO_DEBUGPRINT(0xcccc);
+    BAMBOO_DEBUGPRINT_REG(BAMBOO_GET_EXE_TIME());
+  }*/
+#endif
+
+  flushRuntimeObj(stackptr);
+#ifdef GC_PROFILE
+  // TODO if(BAMBOO_NUM_OF_CORE == 0) BAMBOO_DEBUGPRINT_REG(BAMBOO_GET_EXE_TIME());
+#endif
+
+  while(true) {
+    // the queue is only touched in runtime mode
+    BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
+    bool hasItems = gc_moreItems_I();
+    BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+    if(!hasItems) {
+      break;
+    }
+
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xe301);
+#endif
+    BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
+    void * ptr = gc_dequeue_I();
+    BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+    if(ISSHAREDOBJ(ptr)) {
+      // should be a local shared obj and should have mapping info
+      ptr = flushObj(ptr);
+#ifdef DEBUG
+      // only ptr is printed here: it may still be NULL at this point, so it
+      // must not be dereferenced before the check below
+      BAMBOO_DEBUGPRINT(0xe302);
+      BAMBOO_DEBUGPRINT_REG(ptr);
+#endif
+      if(ptr == NULL) {
+		BAMBOO_EXIT(0xb105);
+      }
+    } // if(ISSHAREDOBJ(ptr))
+    if((!ISSHAREDOBJ(ptr)) || (((int *)(ptr))[6] == COMPACTED)) {
+      int type = ((int *)(ptr))[0];
+      // scan all pointers in ptr
+      unsigned INTPTR * pointer;
+      pointer=pointerarray[type];
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT(0xe303);
+      BAMBOO_DEBUGPRINT_REG(pointer);
+#endif
+      if (pointer==0) {
+		/* Array of primitives */
+		/* Do nothing */
+      } else if (((INTPTR)pointer)==1) {
+#ifdef DEBUG
+		BAMBOO_DEBUGPRINT(0xe304);
+#endif
+		/* Array of pointers */
+		struct ArrayObject *ao=(struct ArrayObject *) ptr;
+		int length=ao->___length___;
+		int j;
+		for(j=0; j<length; j++) {
+#ifdef DEBUG
+		  BAMBOO_DEBUGPRINT(0xe305);
+#endif
+		  // the elements are read from right behind the length field
+		  void *objptr=
+			((void **)(((char *)&ao->___length___)+sizeof(int)))[j];
+#ifdef DEBUG
+		  BAMBOO_DEBUGPRINT_REG(objptr);
+#endif
+		  if(objptr != NULL) {
+			void * dst = flushObj(objptr);
+			if(dst != NULL) {
+			  ((void **)(((char *)&ao->___length___)+sizeof(int)))[j] = dst;
+			}
+		  }
+		}
+      } else {
+#ifdef DEBUG
+		BAMBOO_DEBUGPRINT(0xe306);
+#endif
+		// pointer[0] = number of pointer fields, pointer[1..] = their offsets
+		INTPTR size=pointer[0];
+		int i;
+		for(i=1; i<=size; i++) {
+#ifdef DEBUG
+		  BAMBOO_DEBUGPRINT(0xe307);
+#endif
+		  unsigned int offset=pointer[i];
+		  void * objptr=*((void **)(((char *)ptr)+offset));
+#ifdef DEBUG
+		  BAMBOO_DEBUGPRINT_REG(objptr);
+#endif
+		  if(objptr != NULL) {
+			void * dst = flushObj(objptr);
+			if(dst != NULL) {
+			  *((void **)(((char *)ptr)+offset)) = dst;
+			}
+		  }
+		} // for(i=1; i<=size; i++)
+      }  // if (pointer==0) else if (((INTPTR)pointer)==1) else ()
+         // restore the mark field, indicating that this obj has been flushed
+      if(ISSHAREDOBJ(ptr)) {
+		((int *)(ptr))[6] = INIT;
+      }
+    }  // if((!ISSHAREDOBJ(ptr)) || (((int *)(ptr))[6] == COMPACTED))
+  }   // while(gc_moreItems())
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xe308);
+#endif
+#ifdef GC_PROFILE
+  // TODO if(BAMBOO_NUM_OF_CORE == 0) BAMBOO_DEBUGPRINT_REG(BAMBOO_GET_EXE_TIME());
+#endif
+
+  // TODO bug here: the startup core contains all lobjs' info, thus all the
+  // lobjs are flushed in sequence.
+  // flush lobjs
+  // NOTE(review): unlike the loop above, the *_I queue calls below are made
+  // without entering runtime mode first -- confirm this is intended.
+  while(gc_lobjmoreItems_I()) {
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xe309);
+#endif
+    void * ptr = gc_lobjdequeue_I(NULL, NULL);
+    ptr = flushObj(ptr);
+#ifdef DEBUG
+    // only ptr is printed here: it may still be NULL at this point
+    BAMBOO_DEBUGPRINT(0xe30a);
+    BAMBOO_DEBUGPRINT_REG(ptr);
+#endif
+    if(ptr == NULL) {
+      BAMBOO_EXIT(0xb106);
+    }
+    if(((int *)(ptr))[6] == COMPACTED) {
+      int type = ((int *)(ptr))[0];
+      // scan all pointers in ptr
+      unsigned INTPTR * pointer;
+      pointer=pointerarray[type];
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT(0xe30b);
+      BAMBOO_DEBUGPRINT_REG(pointer);
+#endif
+      if (pointer==0) {
+		/* Array of primitives */
+		/* Do nothing */
+      } else if (((INTPTR)pointer)==1) {
+#ifdef DEBUG
+		BAMBOO_DEBUGPRINT(0xe30c);
+#endif
+		/* Array of pointers */
+		struct ArrayObject *ao=(struct ArrayObject *) ptr;
+		int length=ao->___length___;
+		int j;
+		for(j=0; j<length; j++) {
+#ifdef DEBUG
+		  BAMBOO_DEBUGPRINT(0xe30d);
+#endif
+		  void *objptr=
+			((void **)(((char *)&ao->___length___)+sizeof(int)))[j];
+#ifdef DEBUG
+		  BAMBOO_DEBUGPRINT_REG(objptr);
+#endif
+		  if(objptr != NULL) {
+			void * dst = flushObj(objptr);
+			if(dst != NULL) {
+			  ((void **)(((char *)&ao->___length___)+sizeof(int)))[j] = dst;
+			}
+		  }
+		}
+      } else {
+#ifdef DEBUG
+		BAMBOO_DEBUGPRINT(0xe30e);
+#endif
+		INTPTR size=pointer[0];
+		int i;
+		for(i=1; i<=size; i++) {
+#ifdef DEBUG
+		  BAMBOO_DEBUGPRINT(0xe30f);
+#endif
+		  unsigned int offset=pointer[i];
+		  void * objptr=*((void **)(((char *)ptr)+offset));
+
+#ifdef DEBUG
+		  BAMBOO_DEBUGPRINT_REG(objptr);
+#endif
+		  if(objptr != NULL) {
+			void * dst = flushObj(objptr);
+			if(dst != NULL) {
+			  *((void **)(((char *)ptr)+offset)) = dst;
+			}
+		  }
+		}  // for(i=1; i<=size; i++)
+      }  // if (pointer==0) else if (((INTPTR)pointer)==1) else ()
+         // restore the mark field, indicating that this obj has been flushed
+      ((int *)(ptr))[6] = INIT;
+    }     // if(((int *)(ptr))[6] == COMPACTED)
+  }     // while(gc_lobjmoreItems())
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xe310);
+#endif
+#ifdef GC_PROFILE
+  // TODO if(BAMBOO_NUM_OF_CORE == 0) BAMBOO_DEBUGPRINT_REG(BAMBOO_GET_EXE_TIME());
+#endif
+
+  // send flush finish message to core coordinator
+  if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
+    gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
+  } else {
+    send_msg_2(STARTUPCORE, GCFINISHFLUSH, BAMBOO_NUM_OF_CORE, false);
+  }
+#ifdef GC_PROFILE
+  // TODO 
+  //if(BAMBOO_NUM_OF_CORE == 0) {
+    //BAMBOO_DEBUGPRINT(0xffff);
+    //BAMBOO_DEBUGPRINT_REG(num_mapinforequest);
+    //BAMBOO_DEBUGPRINT_REG(flushstalltime);
+    //BAMBOO_DEBUGPRINT_REG(num_mapinforequest_i);
+    //BAMBOO_DEBUGPRINT_REG(flushstalltime_i);
+  //}
+  //BAMBOO_DEBUGPRINT_REG(flushstalltime);
+#endif
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xe311);
+#endif
+} // flush()
+
+#ifdef GC_CACHE_ADAPT
+// prepare for cache adaptation:
+//   -- flush the shared heap
+//   -- clean dtlb entries
+//   -- change cache strategy
+// isgccachestage -- value stored into gccachestage; the callers in this
+//                   file pass true right after initGC() and false in the
+//                   prefinish phase
+void cacheAdapt(bool isgccachestage) {
+  // flush the shared heap
+  BAMBOO_CACHE_FLUSH_L2();
+
+  // clean the dtlb entries
+  BAMBOO_CLEAN_DTLB();
+
+  // change the cache strategy
+  gccachestage = isgccachestage;
+}
+#endif // GC_CACHE_ADAPT
+
+// Collector routine run by a gc core other than the coordinator.
+// The core announces that it is ready (GCFINISHPRE) and then follows the
+// value of gcphase: before each step it spins until gcphase shows the
+// matching phase (gcphase is not written inside this function).
+//   INITPHASE -> initGC, MARKPHASE -> mark + compact, MAPPHASE ->
+//   transmappinginfo, FLUSHPHASE -> flush, [PREFINISHPHASE -> cacheAdapt,
+//   only with GC_CACHE_ADAPT], FINISHPHASE -> return.
+// stackptr -- passed on to mark() and flush()
+inline void gc_collect(struct garbagelist * stackptr) {
+  //BAMBOO_DEBUGPRINT(0xcccc); // TODO 
+  // inform the master that this core is at a gc safe point and is ready to 
+  // do gc
+  send_msg_4(STARTUPCORE, GCFINISHPRE, BAMBOO_NUM_OF_CORE, self_numsendobjs, 
+	  self_numreceiveobjs, false);
+
+  // core collector routine
+  // wait for the init phase
+  while(true) {
+    if(INITPHASE == gcphase) {
+      break;
+    }
+  }
+#ifdef RAWPATH // TODO GC_DEBUG
+  printf("(%X,%X) Do initGC\n", udn_tile_coord_x(), udn_tile_coord_y());
+#endif
+  initGC();
+#ifdef GC_CACHE_ADAPT
+  // prepare for cache adaption:
+  cacheAdapt(true);
+#endif // GC_CACHE_ADAPT
+  //send init finish msg to core coordinator
+  send_msg_2(STARTUPCORE, GCFINISHINIT, BAMBOO_NUM_OF_CORE, false);
+
+  // wait for the mark phase
+  while(true) {
+    if(MARKPHASE == gcphase) {
+      break;
+    }
+  }
+#ifdef RAWPATH // TODO GC_DEBUG
+  printf("(%x,%x) Start mark phase\n", udn_tile_coord_x(), 
+	     udn_tile_coord_y());
+#endif
+  mark(true, stackptr);
+#ifdef RAWPATH // TODO GC_DEBUG
+  printf("(%x,%x) Finish mark phase, start compact phase\n", 
+	     udn_tile_coord_x(), udn_tile_coord_y());
+#endif
+  // compact() itself exits if gcphase is not COMPACTPHASE at this point
+  compact();
+#ifdef RAWPATH // TODO GC_DEBUG
+  printf("(%x,%x) Finish compact phase\n", udn_tile_coord_x(),
+	     udn_tile_coord_y());
+#endif
+
+  // wait for the map phase
+  while(true) {
+	if(MAPPHASE == gcphase) {
+	  break;
+	}
+  }
+#ifdef RAWPATH // TODO GC_DEBUG
+  printf("(%x,%x) Start map phase\n", udn_tile_coord_x(), 
+	     udn_tile_coord_y());
+#endif
+  transmappinginfo();
+#ifdef RAWPATH // TODO GC_DEBUG
+  printf("(%x,%x) Finish map phase\n", udn_tile_coord_x(),
+	     udn_tile_coord_y());
+#endif
+
+  // wait for the flush phase
+  while(true) {
+    if(FLUSHPHASE == gcphase) {
+      break;
+    }
+  }
+#ifdef RAWPATH // TODO GC_DEBUG
+  printf("(%x,%x) Start flush phase\n", udn_tile_coord_x(), 
+	     udn_tile_coord_y());
+#endif
+#ifdef GC_PROFILE
+  /*BAMBOO_DEBUGPRINT(0xaaaa);
+  BAMBOO_DEBUGPRINT_REG(gc_num_obj);
+  BAMBOO_DEBUGPRINT_REG(gc_num_liveobj);
+  BAMBOO_DEBUGPRINT_REG(gc_num_forwardobj);
+  BAMBOO_DEBUGPRINT(0xaaab);*/
+  // send the num of obj/liveobj/forwardobj to the startupcore
+  if(STARTUPCORE != BAMBOO_NUM_OF_CORE) {
+	send_msg_4(STARTUPCORE, GCPROFILES, gc_num_obj, 
+		gc_num_liveobj, gc_num_forwardobj, false);
+  }
+  gc_num_obj = 0;
+#endif // GC_PROFILE
+  flush(stackptr);
+#ifdef RAWPATH // TODO GC_DEBUG
+  printf("(%x,%x) Finish flush phase\n", udn_tile_coord_x(),
+	     udn_tile_coord_y());
+#endif
+
+#ifdef GC_CACHE_ADAPT
+  // wait for the prefinish phase
+  while(true) {
+    if(PREFINISHPHASE == gcphase) {
+      break;
+    }
+  }
+#ifdef RAWPATH // TODO GC_DEBUG
+  printf("(%x,%x) Start prefinish phase\n", udn_tile_coord_x(), 
+	     udn_tile_coord_y());
+#endif
+  cacheAdapt(false);
+  //send prefinish finish msg to core coordinator
+  send_msg_2(STARTUPCORE, GCFINISHPREF, BAMBOO_NUM_OF_CORE, false);
+#ifdef RAWPATH // TODO GC_DEBUG
+  printf("(%x,%x) Finish prefinish phase\n", udn_tile_coord_x(),
+	     udn_tile_coord_y());
+#endif
+#endif // GC_CACHE_ADAPT
+
+  // wait until the whole gc is finished
+  while(true) {
+    if(FINISHPHASE == gcphase) {
+      break;
+    }
+  }
+#ifdef RAWPATH // TODO GC_DEBUG
+  printf("(%x,%x) Finish gc!\n", udn_tile_coord_x(), udn_tile_coord_y());
+#endif
+} // void gc_collect(struct garbagelist * stackptr)
+
+// Collector routine run by a non-gc core.
+// Same handshake as gc_collect() but without the compact and map steps:
+// the core announces that it is ready (GCFINISHPRE), then spins on gcphase
+// before each step (gcphase is not written inside this function).
+//   INITPHASE -> initGC, MARKPHASE -> mark, FLUSHPHASE -> flush,
+//   [PREFINISHPHASE -> cacheAdapt, only with GC_CACHE_ADAPT],
+//   FINISHPHASE -> return.
+// stackptr -- passed on to mark() and flush()
+inline void gc_nocollect(struct garbagelist * stackptr) {
+  //BAMBOO_DEBUGPRINT(0xcccc); // TODO
+  // inform the master that this core is at a gc safe point and is ready to 
+  // do gc
+  send_msg_4(STARTUPCORE, GCFINISHPRE, BAMBOO_NUM_OF_CORE, self_numsendobjs, 
+	  self_numreceiveobjs, false);
+  
+  // wait for the init phase
+  while(true) {
+    if(INITPHASE == gcphase) {
+      break;
+    }
+  }
+#ifdef RAWPATH // TODO GC_DEBUG
+  printf("(%x,%x) Do initGC\n", udn_tile_coord_x(), udn_tile_coord_y());
+#endif
+  initGC();
+#ifdef GC_CACHE_ADAPT
+  // prepare for cache adaption:
+  cacheAdapt(true);
+#endif // GC_CACHE_ADAPT
+  //send init finish msg to core coordinator
+  send_msg_2(STARTUPCORE, GCFINISHINIT, BAMBOO_NUM_OF_CORE, false);
+
+  // wait for the mark phase
+  while(true) {
+    if(MARKPHASE == gcphase) {
+      break;
+    }
+  }
+#ifdef RAWPATH // TODO GC_DEBUG
+  printf("(%x,%x) Start mark phase\n", udn_tile_coord_x(), 
+	     udn_tile_coord_y());
+#endif
+  mark(true, stackptr);
+#ifdef RAWPATH // TODO GC_DEBUG
+  printf("(%x,%x) Finish mark phase, wait for flush\n", 
+	     udn_tile_coord_x(), udn_tile_coord_y());
+#endif
+
+  // non-gc core collector routine
+  // no compact / map step here: wait straight for the flush phase
+  while(true) {
+    if(FLUSHPHASE == gcphase) {
+      break;
+    }
+  }
+#ifdef RAWPATH // TODO GC_DEBUG
+  printf("(%x,%x) Start flush phase\n", udn_tile_coord_x(), 
+	     udn_tile_coord_y());
+#endif
+#ifdef GC_PROFILE
+  /*BAMBOO_DEBUGPRINT(0xaaaa);
+  BAMBOO_DEBUGPRINT_REG(gc_num_obj);
+  BAMBOO_DEBUGPRINT_REG(gc_num_liveobj);
+  BAMBOO_DEBUGPRINT_REG(gc_num_forwardobj);
+  BAMBOO_DEBUGPRINT(0xaaab);*/
+  // send the num of obj/liveobj/forwardobj to the startupcore
+  if(STARTUPCORE != BAMBOO_NUM_OF_CORE) {
+	send_msg_4(STARTUPCORE, GCPROFILES, gc_num_obj, 
+		gc_num_liveobj, gc_num_forwardobj, false);
+  }
+  gc_num_obj = 0;
+#endif // GC_PROFILE
+  flush(stackptr);
+#ifdef RAWPATH // TODO GC_DEBUG
+  printf("(%x,%x) Finish flush phase\n", udn_tile_coord_x(), 
+	     udn_tile_coord_y());
+#endif
+
+#ifdef GC_CACHE_ADAPT
+  // wait for the prefinish phase
+  while(true) {
+    if(PREFINISHPHASE == gcphase) {
+      break;
+    }
+  }
+#ifdef RAWPATH // TODO GC_DEBUG
+  printf("(%x,%x) Start prefinish phase\n", udn_tile_coord_x(), 
+	     udn_tile_coord_y());
+#endif
+  cacheAdapt(false);
+  //send prefinish finish msg to core coordinator
+  send_msg_2(STARTUPCORE, GCFINISHPREF, BAMBOO_NUM_OF_CORE, false);
+#ifdef RAWPATH // TODO GC_DEBUG
+  printf("(%x,%x) Finish prefinish phase\n", udn_tile_coord_x(),
+	     udn_tile_coord_y());
+#endif
+#endif // GC_CACHE_ADAPT
+
+  // wait until the whole gc is finished
+  while(true) {
+    if(FINISHPHASE == gcphase) {
+      break;
+    }
+  }
+#ifdef RAWPATH // TODO GC_DEBUG
+  printf("(%x,%x) Finish gc!\n", udn_tile_coord_x(), udn_tile_coord_y());
+#endif
+} // void gc_nocollect(struct garbagelist * stackptr)
+
+inline void gc_master(struct garbagelist * stackptr) {
+
+  gcphase = INITPHASE;
+  int i = 0;
+  waitconfirm = false;
+  numconfirm = 0;
+  initGC();
+
+  // Note: all cores need to init gc including non-gc cores
+  for(i = 1; i < NUMCORESACTIVE /*NUMCORES4GC*/; i++) {
+	// send GC init messages to all cores
+	send_msg_1(i, GCSTARTINIT, false);
+  }
+  bool isfirst = true;
+  bool allStall = false;
+
+#ifdef GC_CACHE_ADAPT
+  // prepare for cache adaption:
+  cacheAdapt(true);
+#endif // GC_CACHE_ADAPT
+
+#ifdef RAWPATH // TODO GC_DEBUG
+  printf("(%x,%x) Check core status \n", udn_tile_coord_x(), 
+		 udn_tile_coord_y());
+#endif
+
+  gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
+  while(true) {
+	BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
+	if(gc_checkAllCoreStatus_I()) {
+	  BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+	  break;
+	}
+	BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+  }
+#ifdef GC_PROFILE
+  gc_profileItem();
+#endif
+#ifdef RAWPATH // TODO GC_DEBUG
+  printf("(%x,%x) Start mark phase \n", udn_tile_coord_x(), 
+		 udn_tile_coord_y());
+#endif
+  // all cores have finished compacting
+  // restore the gcstatus of all cores
+  // Note: all cores have to do mark including non-gc cores
+  gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
+  for(i = 1; i < NUMCORESACTIVE /*NUMCORES4GC*/; ++i) {
+	gccorestatus[i] = 1;
+	// send GC start messages to all cores
+	send_msg_1(i, GCSTART, false);
+  }
+
+  gcphase = MARKPHASE;
+  // mark phase
+  while(MARKPHASE == gcphase) {
+	mark(isfirst, stackptr);
+	if(isfirst) {
+	  isfirst = false;
+	}
+
+	// check gcstatus
+	checkMarkStatue();
+  }   // while(MARKPHASE == gcphase)
+  // send msgs to all cores requiring large objs info
+  // Note: only need to ask gc cores, non-gc cores do not host any objs
+  numconfirm = NUMCORES4GC - 1;
+  for(i = 1; i < NUMCORES4GC; ++i) {
+	send_msg_1(i, GCLOBJREQUEST, false);
+  }
+  gcloads[BAMBOO_NUM_OF_CORE] = gccurr_heaptop;
+  while(true) {
+	if(numconfirm==0) {
+	  break;
+	}
+  }   // wait for responses
+  // check the heaptop
+  if(gcheaptop < gcmarkedptrbound) {
+	gcheaptop = gcmarkedptrbound;
+  }
+#ifdef GC_PROFILE
+  gc_profileItem();
+  // TODO
+  /*if(BAMBOO_NUM_OF_CORE == 0) {
+	BAMBOO_DEBUGPRINT(0xeeee);
+	BAMBOO_DEBUGPRINT_REG(num_markrequest);
+	BAMBOO_DEBUGPRINT_REG(marktime);
+  }*/
+#endif
+#ifdef RAWPATH // TODO GC_DEBUG
+  printf("(%x,%x) prepare to cache large objs \n", udn_tile_coord_x(),
+		 udn_tile_coord_y());
+  //dumpSMem();
+#endif
+  // cache all large objs
+  if(!cacheLObjs()) {
+	// no enough space to cache large objs
+	BAMBOO_EXIT(0xb107);
+  }
+  // predict number of blocks to fill for each core
+  int tmpheaptop = 0;
+  int numpbc = loadbalance(&tmpheaptop);
+  // TODO
+  numpbc = (BAMBOO_SHARED_MEM_SIZE)/(BAMBOO_SMEM_SIZE);
+#ifdef RAWPATH // TODO GC_DEBUG
+  printf("(%x,%x) mark phase finished \n", udn_tile_coord_x(), 
+		 udn_tile_coord_y());
+  //dumpSMem();
+#endif
+  //int tmptopptr = 0;
+  //BASEPTR(gctopcore, 0, &tmptopptr);
+  // TODO
+  //tmptopptr = gcbaseva + (BAMBOO_SHARED_MEM_SIZE);
+  tmpheaptop = gcbaseva + (BAMBOO_SHARED_MEM_SIZE);
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xabab);
+  BAMBOO_DEBUGPRINT_REG(tmptopptr);
+#endif
+  for(i = 0; i < NUMCORES4GC; ++i) {
+	int tmpcoreptr = 0;
+	BASEPTR(i, numpbc, &tmpcoreptr);
+	//send start compact messages to all cores
+	//TODO bug here, do not know if the direction is positive or negtive?
+	if (tmpcoreptr < tmpheaptop /*tmptopptr*/) {
+	  gcstopblock[i] = numpbc + 1;
+	  if(i != STARTUPCORE) {
+		send_msg_2(i, GCSTARTCOMPACT, numpbc+1, false);
+	  } else {
+		gcblock2fill = numpbc+1;
+	  }                         // if(i != STARTUPCORE)
+	} else {
+	  gcstopblock[i] = numpbc;
+	  if(i != STARTUPCORE) {
+		send_msg_2(i, GCSTARTCOMPACT, numpbc, false);
+	  } else {
+		gcblock2fill = numpbc;
+	  }    // if(i != STARTUPCORE)
+	}
+#ifdef DEBUG
+	BAMBOO_DEBUGPRINT(0xf000+i);
+	BAMBOO_DEBUGPRINT_REG(tmpcoreptr);
+	BAMBOO_DEBUGPRINT_REG(gcstopblock[i]);
+#endif
+	// init some data strutures for compact phase
+	gcloads[i] = 0;
+	gcfilledblocks[i] = 0;
+	gcrequiredmems[i] = 0;
+  }
+
+#ifdef GC_PROFILE
+  gc_profileItem();
+#endif
+
+  // compact phase
+  bool finalcompact = false;
+  // initialize pointers for comapcting
+  struct moveHelper * orig =
+	(struct moveHelper *)RUNMALLOC(sizeof(struct moveHelper));
+  struct moveHelper * to =
+	(struct moveHelper *)RUNMALLOC(sizeof(struct moveHelper));
+  initOrig_Dst(orig, to);
+  int filledblocks = 0;
+  INTPTR heaptopptr = 0;
+  bool finishcompact = false;
+  bool iscontinue = true;
+  bool localcompact = true;
+  while((COMPACTPHASE == gcphase) || (SUBTLECOMPACTPHASE == gcphase)) {
+	if((!finishcompact) && iscontinue) {
+#ifdef DEBUG
+	  BAMBOO_DEBUGPRINT(0xe001);
+	  BAMBOO_DEBUGPRINT_REG(numpbc);
+	  BAMBOO_DEBUGPRINT_REG(gcblock2fill);
+#endif
+	  finishcompact = compacthelper(orig, to, &filledblocks,
+									&heaptopptr, &localcompact);
+#ifdef DEBUG
+	  BAMBOO_DEBUGPRINT(0xe002);
+	  BAMBOO_DEBUGPRINT_REG(finishcompact);
+	  BAMBOO_DEBUGPRINT_REG(gctomove);
+	  BAMBOO_DEBUGPRINT_REG(gcrequiredmems[0]);
+	  BAMBOO_DEBUGPRINT_REG(gcfilledblocks[0]);
+	  BAMBOO_DEBUGPRINT_REG(gcstopblock[0]);
+#endif
+	}
+
+	BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
+	if(gc_checkCoreStatus_I()) {
+	  // all cores have finished compacting
+	  // restore the gcstatus of all cores
+	  for(i = 0; i < NUMCORES4GC; ++i) {
+		gccorestatus[i] = 1;
+	  }
+	  BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+	  break;
+	} else {
+	  BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+	  // check if there are spare mem for pending move requires
+	  if(COMPACTPHASE == gcphase) {
+#ifdef DEBUG
+		BAMBOO_DEBUGPRINT(0xe003);
+#endif
+		resolvePendingMoveRequest();
+#ifdef DEBUG
+		BAMBOO_DEBUGPRINT_REG(gctomove);
+#endif
+	  } else {
+#ifdef DEBUG
+		BAMBOO_DEBUGPRINT(0xe004);
+#endif
+		compact2Heaptop();
+	  }
+	}   // if(gc_checkCoreStatus_I()) else ...
+
+	if(gctomove) {
+#ifdef DEBUG
+	  BAMBOO_DEBUGPRINT(0xe005);
+	  BAMBOO_DEBUGPRINT_REG(gcmovestartaddr);
+	  BAMBOO_DEBUGPRINT_REG(gcblock2fill);
+	  BAMBOO_DEBUGPRINT_REG(gctomove);
+#endif
+	  to->ptr = gcmovestartaddr;
+	  to->numblocks = gcblock2fill - 1;
+	  to->bound = (to->numblocks==0) ?
+				  BAMBOO_SMEM_SIZE_L :
+				  BAMBOO_SMEM_SIZE_L+BAMBOO_SMEM_SIZE*to->numblocks;
+	  BASEPTR(gcdstcore, to->numblocks, &(to->base));
+	  to->offset = to->ptr - to->base;
+	  to->top = (to->numblocks==0) ?
+				(to->offset) : (to->bound-BAMBOO_SMEM_SIZE+to->offset);
+	  to->base = to->ptr;
+	  to->offset = BAMBOO_CACHE_LINE_SIZE;
+	  to->ptr += to->offset;                         // for header
+	  to->top += to->offset;
+	  if(gcdstcore == BAMBOO_NUM_OF_CORE) {
+		localcompact = true;
+	  } else {
+		localcompact = false;
+	  }
+	  gctomove = false;
+	  iscontinue = true;
+	} else if(!finishcompact) {
+	  // still pending
+	  iscontinue = false;
+	}  // if(gctomove)
+  }  // while(COMPACTPHASE == gcphase)
+#ifdef GC_PROFILE
+  gc_profileItem();
+#endif
+#ifdef RAWPATH // TODO GC_DEBUG
+  printf("(%x,%x) prepare to move large objs \n", udn_tile_coord_x(),
+		 udn_tile_coord_y());
+  //dumpSMem();
+#endif
+  // move largeObjs
+  moveLObjs();
+#ifdef RAWPATH // TODO GC_DEBUG
+  printf("(%x,%x) compact phase finished \n", udn_tile_coord_x(), 
+		 udn_tile_coord_y());
+  //dumpSMem();
+#endif
+  RUNFREE(orig);
+  RUNFREE(to);
+  orig = to = NULL;
+
+  gcphase = MAPPHASE;
+  gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
+  // Note: only the gc cores (NUMCORES4GC) are asked to send their mapping
+  //       info
+  for(i = 1; i < NUMCORES4GC; ++i) {
+	// send start mapinfo messages to the other gc cores
+	gccorestatus[i] = 1;
+	send_msg_1(i, GCSTARTMAPINFO, false);
+  }
+#ifdef GC_PROFILE
+  gc_profileItem();
+#endif
+#ifdef RAWPATH // TODO GC_DEBUG
+  printf("(%x,%x) Start map phase \n", udn_tile_coord_x(), 
+		 udn_tile_coord_y());
+#endif
+  // mapinto phase
+  transmappinginfo();
+#ifdef RAWPATH // TODO GC_DEBUG
+  printf("(%x,%x) Finish map phase \n", udn_tile_coord_x(), 
+		 udn_tile_coord_y());
+#endif
+  gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
+  while(MAPPHASE == gcphase) {
+	// check the status of all cores
+	BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
+	if(gc_checkCoreStatus_I()) {
+	  // all cores have finished sending mapping info 
+	  BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+	  break;
+	}
+	BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+  }  // while(MAPPHASE == gcphase)
+
+  gcphase = FLUSHPHASE;
+  gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
+  // Note: all cores should flush their runtime data including non-gc
+  //       cores
+  for(i = 1; i < NUMCORESACTIVE /*NUMCORES4GC*/; ++i) {
+	// send start flush messages to all cores
+	gccorestatus[i] = 1;
+	send_msg_1(i, GCSTARTFLUSH, false);
+  }
+#ifdef GC_PROFILE
+  gc_profileItem();
+#endif
+#ifdef RAWPATH // TODO GC_DEBUG
+  printf("(%x,%x) Start flush phase \n", udn_tile_coord_x(), 
+		 udn_tile_coord_y());
+#endif
+  // flush phase
+  flush(stackptr);
+  gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
+  while(FLUSHPHASE == gcphase) {
+	// check the status of all cores
+	BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
+	if(gc_checkAllCoreStatus_I()) {
+	  BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+	  break;
+	}
+	BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+  }  // while(FLUSHPHASE == gcphase)
+#ifdef RAWPATH // TODO GC_DEBUG
+  printf("(%x,%x) Finish flush phase \n", udn_tile_coord_x(), 
+		 udn_tile_coord_y());
+#endif
+
+#ifdef GC_CACHE_ADAPT
+  gcphase = PREFINISHPHASE;
+  gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
+  // Note: all cores should flush their runtime data including non-gc
+  //       cores
+  for(i = 1; i < NUMCORESACTIVE /*NUMCORES4GC*/; ++i) {
+	// send start prefinish messages to all cores
+	gccorestatus[i] = 1;
+	send_msg_1(i, GCSTARTPREF, false);
+  }
+#ifdef GC_PROFILE
+  gc_profileItem();
+#endif
+#ifdef RAWPATH // TODO GC_DEBUG
+  printf("(%x,%x) Start prefinish phase \n", udn_tile_coord_x(), 
+		 udn_tile_coord_y());
+#endif
+  // prefinish phase
+  cacheAdapt(false);
+  gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
+  while(PREFINISHPHASE == gcphase) {
+	// check the status of all cores
+	BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
+	if(gc_checkAllCoreStatus_I()) {
+	  BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+	  break;
+	}
+	BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+  }  // while(PREFINISHPHASE == gcphase)
+#endif // GC_CACHE_ADAPT
+
+  gcphase = FINISHPHASE;
+
+  // invalidate all shared mem pointers
+  // put it here as it takes time to inform all the other cores to
+  // finish gc and it might cause problem when some core resumes
+  // mutator earlier than the other cores
+  bamboo_cur_msp = NULL;
+  bamboo_smem_size = 0;
+  bamboo_smem_zero_top = NULL;
+  gcflag = false;
+  gcprocessing = false;
+
+#ifdef GC_PROFILE
+  gc_profileEnd();
+#endif
+  gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
+  for(i = 1; i < NUMCORESACTIVE /*NUMCORES4GC*/; ++i) {
+	// send gc finish messages to all cores
+	send_msg_1(i, GCFINISH, false);
+	gccorestatus[i] = 1;
+  }
+#ifdef RAWPATH // TODO GC_DEBUG
+  printf("(%x,%x) gc finished \n", udn_tile_coord_x(), 
+		 udn_tile_coord_y());
+  //dumpSMem();
+#endif
+  //BAMBOO_DEBUGPRINT(0x1111); // TODO
+/*#ifdef GC_PROFILE_S
+  BAMBOO_DEBUGPRINT(0xaaaa);
+  BAMBOO_DEBUGPRINT_REG(gc_num_obj);
+  BAMBOO_DEBUGPRINT_REG(gc_num_liveobj);
+  BAMBOO_DEBUGPRINT_REG(gc_num_forwardobj);
+  BAMBOO_DEBUGPRINT_REG(gc_num_profiles);
+  BAMBOO_DEBUGPRINT(0xaaab);
+  if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
+	BAMBOO_DEBUGPRINT(0xaaac);
+	BAMBOO_DEBUGPRINT_REG(gc_num_livespace);
+	BAMBOO_DEBUGPRINT_REG(gc_num_freespace);
+	BAMBOO_DEBUGPRINT(0xaaad);
+  }
+  gc_num_obj = gc_num_liveobj;
+  gc_num_liveobj = 0;
+  gc_num_forwardobj = 0;
+#endif // GC_PROFLIE_S*/
+} // void gc_master(struct garbagelist * stackptr)
+
+// Garbage-collection entry point for the calling core.
+// Core 0 coordinates: if some active core has not stalled yet it sends that
+// core GCSTARTPRE and returns false; otherwise it waits until the object
+// send/receive counts balance and then runs gc_master().
+// Other cores below NUMCORES4GC run gc_collect(); the rest gc_nocollect().
+// Returns false when no gc is pending or the cores are not ready, else true.
+inline bool gc(struct garbagelist * stackptr) {
+  // check if do gc
+  if(!gcflag) {
+    gcprocessing = false;
+    return false;
+  }
+
+  // core coordinator routine
+  if(0 == BAMBOO_NUM_OF_CORE) {
+#ifdef GC_DEBUG
+    printf("(%x,%X) Check if can do gc or not\n", udn_tile_coord_x(),
+		   udn_tile_coord_y());
+#endif
+	bool isallstall = true;
+	gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
+	BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
+	int ti = 0;
+	for(ti = 0; ti < NUMCORESACTIVE; ++ti) {
+	  if(gccorestatus[ti] != 0) {
+		isallstall = false;
+		break;
+	  }
+	}
+	if(!isallstall) {
+	  BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+	  // some of the cores are still executing the mutator and did not reach
+	  // some gc safe point, therefore it is not ready to do gc
+	  // in case that there are some pregc information msg lost, send a confirm
+	  // msg to the 'busy' core
+	  send_msg_1(ti, GCSTARTPRE, false);
+	  gcflag = true;
+	  return false;
+	} else {
+#ifdef GC_PROFILE
+    gc_profileStart();
+#endif
+pregccheck:
+	  // NOTE(review): runtime mode was entered once above, but every pass
+	  // through this label ends in BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+	  // confirm that leaving runtime mode again after a retry is safe.
+	  gcnumsendobjs[0][BAMBOO_NUM_OF_CORE] = self_numsendobjs;
+	  gcnumreceiveobjs[0][BAMBOO_NUM_OF_CORE] = self_numreceiveobjs;
+	  int sumsendobj = 0;
+#ifdef DEBUG
+	  BAMBOO_DEBUGPRINT(0xec04);
+#endif
+	  for(int i = 0; i < NUMCORESACTIVE; ++i) {
+		sumsendobj += gcnumsendobjs[0][i];
+#ifdef DEBUG
+		BAMBOO_DEBUGPRINT(0xf000 + gcnumsendobjs[0][i]);
+#endif
+	  }  // for(i = 0; i < NUMCORESACTIVE; ++i)
+#ifdef DEBUG
+	  BAMBOO_DEBUGPRINT(0xec05);
+	  BAMBOO_DEBUGPRINT_REG(sumsendobj);
+#endif
+	  for(int i = 0; i < NUMCORESACTIVE; ++i) {
+		sumsendobj -= gcnumreceiveobjs[0][i];
+#ifdef DEBUG
+		BAMBOO_DEBUGPRINT(0xf000 + gcnumreceiveobjs[0][i]);
+#endif
+	  }  // for(i = 0; i < NUMCORESACTIVE; ++i)
+#ifdef DEBUG
+	  BAMBOO_DEBUGPRINT(0xec06);
+	  BAMBOO_DEBUGPRINT_REG(sumsendobj);
+#endif
+	  if(0 != sumsendobj) {
+		// there were still some msgs on the fly, wait until there 
+		// are some update pregc information coming and check it again
+		gcprecheck = false;
+		BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+		// spin until gcprecheck (cleared just above) is set again elsewhere
+		while(true) {
+		  if(gcprecheck) {
+			break;
+		  }
+		}
+		goto pregccheck;
+	  } else {
+		BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+	  }
+	}
+#ifdef RAWPATH // TODO GC_DEBUG
+    printf("(%x,%x) start gc! \n", udn_tile_coord_x(), udn_tile_coord_y());
+    //dumpSMem();
+#endif
+	// Zero out the remaining bamboo_cur_msp 
+	// Only zero out the first 4 bytes of the remaining memory
+	// Move the operation here because for the GC_CACHE_ADAPT version,
+	// we need to make sure during the gcinit phase the shared heap is not 
+	// touched. Otherwise, there would be problem when adapt the cache 
+	// strategy.
+	if((bamboo_cur_msp != 0) 
+		&& (bamboo_smem_zero_top == bamboo_cur_msp) 
+		&& (bamboo_smem_size > 0)) {
+	  *((int *)bamboo_cur_msp) = 0;
+	}
+#ifdef GC_FLUSH_DTLB
+	if(gc_num_flush_dtlb < GC_NUM_FLUSH_DTLB) {
+	  BAMBOO_CLEAN_DTLB();
+	  gc_num_flush_dtlb++;
+	}
+#endif
+#ifdef GC_CACHE_ADAPT
+    //BAMBOO_DEBUGPRINT(BAMBOO_GET_EXE_TIME());
+    // disable the timer interrupt
+    bamboo_mask_timer_intr();
+    // get the sampling data TODO
+    bamboo_output_dtlb_sampling();
+#endif // GC_CACHE_ADAPT
+	gcprocessing = true;
+	gc_master(stackptr);
+  } else if(BAMBOO_NUM_OF_CORE < NUMCORES4GC) {
+	// Zero out the remaining bamboo_cur_msp 
+	// Only zero out the first 4 bytes of the remaining memory
+	// Move the operation here because for the GC_CACHE_ADAPT version,
+	// we need to make sure during the gcinit phase the shared heap is not 
+	// touched. Otherwise, there would be problem when adapt the cache 
+	// strategy.
+	if((bamboo_cur_msp != 0) 
+		&& (bamboo_smem_zero_top == bamboo_cur_msp) 
+		&& (bamboo_smem_size > 0)) {
+	  *((int *)bamboo_cur_msp) = 0;
+	}
+#ifdef GC_FLUSH_DTLB
+	if(gc_num_flush_dtlb < GC_NUM_FLUSH_DTLB) {
+	  BAMBOO_CLEAN_DTLB();
+	  gc_num_flush_dtlb++;
+	}
+#endif
+#ifdef GC_CACHE_ADAPT
+	// disable the timer interrupt
+	bamboo_mask_timer_intr();
+	// get the sampling data TODO
+	bamboo_output_dtlb_sampling();
+#endif // GC_CACHE_ADAPT
+    gcprocessing = true;
+    gc_collect(stackptr);
+
+    // invalidate all shared mem pointers
+    bamboo_cur_msp = NULL;
+    bamboo_smem_size = 0;
+	bamboo_smem_zero_top = NULL;
+    gcflag = false;
+    gcprocessing = false;
+  } else {
+	// Zero out the remaining bamboo_cur_msp 
+	// Only zero out the first 4 bytes of the remaining memory
+	// Move the operation here because for the GC_CACHE_ADAPT version,
+	// we need to make sure during the gcinit phase the shared heap is not 
+	// touched. Otherwise, there would be problem when adapt the cache 
+	// strategy.
+	if((bamboo_cur_msp != 0) 
+		&& (bamboo_smem_zero_top == bamboo_cur_msp) 
+		&& (bamboo_smem_size > 0)) {
+	  *((int *)bamboo_cur_msp) = 0;
+	}
+#ifdef GC_FLUSH_DTLB
+	if(gc_num_flush_dtlb < GC_NUM_FLUSH_DTLB) {
+	  BAMBOO_CLEAN_DTLB();
+	  gc_num_flush_dtlb++;
+	}
+#endif
+#ifdef GC_CACHE_ADAPT
+	// disable the timer interrupt
+	bamboo_mask_timer_intr();
+	// get the sampling data TODO
+	bamboo_output_dtlb_sampling();
+#endif // GC_CACHE_ADAPT
+    // not a gc core, should wait for gcfinish msg
+    gcprocessing = true;
+    gc_nocollect(stackptr);
+
+    // invalidate all shared mem pointers
+    bamboo_cur_msp = NULL;
+    bamboo_smem_size = 0;
+    bamboo_smem_zero_top = NULL;
+	gcflag = false;
+    gcprocessing = false;
+  }
+#ifdef GC_CACHE_ADAPT
+  // reset the sampling arrays
+  bamboo_dtlb_sampling_reset();
+  // enable the timer interrupt
+  bamboo_tile_timer_set_next_event(500000000); // TODO
+  bamboo_unmask_timer_intr();
+#endif // GC_CACHE_ADAPT
+  return true;
+} // bool gc(struct garbagelist * stackptr)
+
+#ifdef GC_PROFILE
+// Open a new GC profile record; slot 0 receives the start timestamp.
+inline void gc_profileStart(void) {
+  if(gc_infoOverflow) return;  // recording has been stopped
+  GCInfo *rec = RUNMALLOC(sizeof(struct gc_info));
+  gc_infoArray[gc_infoIndex] = rec;
+  rec->index = 1;
+  rec->time[0] = BAMBOO_GET_EXE_TIME();
+}
+
+// Append the current execution time to the active GC profile record.
+inline void gc_profileItem(void) {
+  if(gc_infoOverflow) return;  // recording has been stopped
+  GCInfo *rec = gc_infoArray[gc_infoIndex];
+  rec->time[rec->index++] = BAMBOO_GET_EXE_TIME();
+}
+
+// Finish the active GC profile record: append the end timestamp followed by
+// seven heap/object counters, then move on to the next record slot.
+inline void gc_profileEnd(void) {
+  if(gc_infoOverflow) return;
+  GCInfo *rec = gc_infoArray[gc_infoIndex];
+  rec->time[rec->index++] = BAMBOO_GET_EXE_TIME();
+  const unsigned long long counters[7] = {
+    gc_num_livespace, gc_num_freespace, gc_num_lobj, gc_num_lobjspace,
+    gc_num_obj, gc_num_liveobj, gc_num_forwardobj
+  };
+  for(int k = 0; k < 7; ++k) {
+    rec->time[rec->index++] = counters[k];
+  }
+  gc_infoIndex++;
+  if(gc_infoIndex == GCINFOLENGTH) {
+    gc_infoOverflow = true;  // every slot is used: stop recording
+  }
+}
+
+// Emit all recorded GC profile entries and the summed gc time as debug prints.
+void gc_outputProfileData() {
+/*#ifdef USEIO
+  int i,j;
+  unsigned long long totalgc = 0;
+
+  //printf("Start Time, End Time, Duration\n");
+  // output task related info
+  for(i = 0; i < gc_infoIndex; i++) {
+    GCInfo * gcInfo = gc_infoArray[i];
+    unsigned long long tmp = 0;
+    for(j = 0; j < gcInfo->index; j++) {
+      printf("%lld(%lld), ", gcInfo->time[j], (gcInfo->time[j]-tmp));
+      tmp = gcInfo->time[j];
+    }
+    tmp = (tmp-gcInfo->time[0]);
+    printf(" ++ %lld \n", tmp);
+    totalgc += tmp;
+  }
+
+  if(gc_infoOverflow) {
+    printf("Caution: gc info overflow!\n");
+  }
+
+  printf("\n\n total gc time: %lld \n", totalgc);
+#else*/
+  int i = 0;
+  int j = 0;
+  unsigned long long totalgc = 0; // sum of the per-collection durations
+
+#ifndef BAMBOO_MEMPROF
+  BAMBOO_DEBUGPRINT(0xdddd); // marker printed before the first record
+#endif
+  // one pass per recorded collection
+  for(i= 0; i < gc_infoIndex; i++) {
+    GCInfo * gcInfo = gc_infoArray[i];
+#ifdef BAMBOO_MEMPROF
+    unsigned long long tmp=gcInfo->time[gcInfo->index-8]-gcInfo->time[0]; // end timestamp - start timestamp
+#else
+	unsigned long long tmp = 0;
+    BAMBOO_DEBUGPRINT(0xddda);
+    for(j = 0; j < gcInfo->index - 7; j++) { // timestamps only; the last 7 slots hold counters
+      BAMBOO_DEBUGPRINT(gcInfo->time[j]);
+      BAMBOO_DEBUGPRINT(gcInfo->time[j]-tmp); // delta to the previous timestamp
+      BAMBOO_DEBUGPRINT(0xdddb);
+      tmp = gcInfo->time[j];
+    }
+    tmp = (tmp-gcInfo->time[0]); // last timestamp - first timestamp
+    BAMBOO_DEBUGPRINT_REG(tmp);
+	BAMBOO_DEBUGPRINT(0xdddc);
+	BAMBOO_DEBUGPRINT(gcInfo->time[gcInfo->index - 7]); // the 7 counters appended by gc_profileEnd
+	BAMBOO_DEBUGPRINT(gcInfo->time[gcInfo->index - 6]);
+	BAMBOO_DEBUGPRINT(gcInfo->time[gcInfo->index - 5]);
+	BAMBOO_DEBUGPRINT(gcInfo->time[gcInfo->index - 4]);
+	BAMBOO_DEBUGPRINT(gcInfo->time[gcInfo->index - 3]);
+	BAMBOO_DEBUGPRINT(gcInfo->time[gcInfo->index - 2]);
+	BAMBOO_DEBUGPRINT(gcInfo->time[gcInfo->index - 1]);
+    BAMBOO_DEBUGPRINT(0xddde);
+#endif
+    totalgc += tmp;
+  }
+#ifndef BAMBOO_MEMPROF
+  BAMBOO_DEBUGPRINT(0xdddf);
+#endif
+  BAMBOO_DEBUGPRINT_REG(totalgc);
+
+  if(gc_infoOverflow) {
+    BAMBOO_DEBUGPRINT(0xefee); // printed only when recording overflowed
+  }
+
+#ifndef BAMBOO_MEMPROF
+  BAMBOO_DEBUGPRINT(0xeeee);
+#endif
+//#endif
+}
+#endif  // #ifdef GC_PROFILE
+
+#endif
diff --git a/Robust/src/Runtime/bamboo/multicoregarbage.h b/Robust/src/Runtime/bamboo/multicoregarbage.h
new file mode 100644
index 00000000..a824bf75
--- /dev/null
+++ b/Robust/src/Runtime/bamboo/multicoregarbage.h
@@ -0,0 +1,276 @@
+#ifndef MULTICORE_GARBAGE_H
+#define MULTICORE_GARBAGE_H
+#include "multicoregc.h"
+#include "multicorehelper.h"  // for mappings between core # and block #
+#include "structdefs.h"
+#include "MGCHash.h"
+#include "GCSharedHash.h"
+
+#ifndef bool
+#define bool int
+#endif
+
+// data structures for GC
+#ifdef GC_DEBUG
+#define BAMBOO_SMEM_SIZE_L (BAMBOO_SMEM_SIZE * 2)
+#else
+#define BAMBOO_SMEM_SIZE_L (BAMBOO_SMEM_SIZE * 2)
+#endif
+#define BAMBOO_LARGE_SMEM_BOUND (BAMBOO_SMEM_SIZE_L*NUMCORES4GC)
+// Each gc core gets exactly one big block; the computation of
+// NUMBLOCKS(s, n) depends on this, so DO NOT change it!
+
+#ifdef GC_FLUSH_DTLB
+#define GC_NUM_FLUSH_DTLB 1
+int gc_num_flush_dtlb;
+#endif
+
+#define NUMPTRS 100
+
+// for GC profile
+#ifdef GC_PROFILE
+#define GCINFOLENGTH 100
+
+#ifdef GC_CACHE_ADAPT
+#define GC_PROFILE_NUM_FIELD 16
+#else
+#define GC_PROFILE_NUM_FIELD 15
+#endif
+
+typedef struct gc_info {
+  unsigned long long time[GC_PROFILE_NUM_FIELD]; // timestamps, then the counters stored by gc_profileEnd
+  int index;                                     // next free slot in time[]
+} GCInfo;
+
+GCInfo * gc_infoArray[GCINFOLENGTH];
+int gc_infoIndex;
+bool gc_infoOverflow;
+unsigned long long gc_num_livespace;
+unsigned long long gc_num_freespace;
+unsigned long long gc_num_lobjspace;
+unsigned int gc_num_lobj;
+
+// TODO
+/*unsigned long long flushstalltime;
+unsigned long long flushstalltime_i;
+int num_mapinforequest_i;*/
+unsigned int gc_num_liveobj;
+unsigned int gc_num_obj;
+unsigned int gc_num_forwardobj;
+int gc_num_profiles;
+
+#endif // GC_PROFILE
+
+typedef enum {
+  INIT = 0,           // 0
+  DISCOVERED = 2,     // 2
+  REMOTEM = 4,        // 4
+  MARKED = 8,         // 8
+  COMPACTED = 16,     // 16
+  FLUSHED = 32,       // 32
+  END = 33            // 33
+} GCOBJFLAG;
+
+typedef enum {
+  INITPHASE = 0x0,         // 0x0
+  MARKPHASE,               // 0x1
+  COMPACTPHASE,            // 0x2
+  SUBTLECOMPACTPHASE,      // 0x3
+  MAPPHASE,                // 0x4, set by gc_master after the large objs are moved
+  FLUSHPHASE,              // 0x5, set by gc_master after the map phase
+#ifdef GC_CACHE_ADAPT
+  PREFINISHPHASE,          // 0x6, set by gc_master after the flush phase
+#endif // GC_CACHE_ADAPT
+  FINISHPHASE              // 0x6/0x7, set just before GCFINISH is sent
+} GCPHASETYPE;
+
+volatile bool gcflag;
+volatile bool gcprocessing;
+volatile GCPHASETYPE gcphase; // indicating GC phase
+
+volatile bool gcpreinform; // counter for stopped cores
+volatile bool gcprecheck; // indicates if there are updated pregc information
+
+int gccurr_heaptop;
+struct MGCHash * gcforwardobjtbl; // cache forwarded objs in mark phase
+// for mark phase termination
+volatile int gccorestatus[NUMCORESACTIVE]; // records status of each core
+                                           // 1: running gc
+                                           // 0: stall
+volatile int gcnumsendobjs[2][NUMCORESACTIVE]; // the # of objects sent out
+volatile int gcnumreceiveobjs[2][NUMCORESACTIVE]; // the # of objects received
+volatile int gcnumsrobjs_index;  // indicates which entry to record the info 
+		                         // received before phase 1 of the mark finish 
+						         // checking process
+								 // the info received in phase 2 must be 
+								 // recorded in the other entry
+volatile bool gcbusystatus;
+int gcself_numsendobjs;
+int gcself_numreceiveobjs;
+
+// for load balancing
+INTPTR gcheaptop;
+int gcloads[NUMCORES4GC];
+int gctopcore; // the core host the top of the heap
+int gctopblock; // the number of current top block
+
+int gcnumlobjs;
+
+// compact instruction
+INTPTR gcmarkedptrbound;
+int gcblock2fill;
+int gcstopblock[NUMCORES4GC]; // indicate when to stop compact phase
+int gcfilledblocks[NUMCORES4GC]; //indicate how many blocks have been fulfilled
+// move instruction;
+INTPTR gcmovestartaddr;
+int gcdstcore;
+volatile bool gctomove;
+int gcrequiredmems[NUMCORES4GC]; //record pending mem requests
+volatile int gcmovepending;
+
+// data structures to record remote cores that transferred the marked 
+// objs in the mark phase
+/*struct rcoreinfo{
+  int high;
+  int low;
+};
+struct RuntimeHash * gcrcoretbl;
+#define NUM_MAPPING 40
+void * gcmappingtbl[NUMCORESACTIVE][NUM_MAPPING];*/
+
+// shared memory pointer for shared pointer mapping tbls
+// In GC version, this block of memory is located at the bottom of the 
+// shared memory, right on the top of the smem tbl.
+// The bottom of the shared memory = sbstart tbl + smemtbl 
+//                                  + NUMCORES4GC bamboo_rmsp
+// These three types of tables always reside at the bottom of the shared 
+// memory and will never be moved or garbage collected
+#ifdef GC_SMALLPAGESIZE
+#define BAMBOO_RMSP_SIZE (1024 * 1024)
+#else
+#define BAMBOO_RMSP_SIZE (BAMBOO_SMEM_SIZE) // (45 * 16 * 1024)
+#endif
+mspace bamboo_rmsp;
+// shared pointer mapping tbl
+//volatile struct GCSharedHash * gcsharedptbl;
+mgcsharedhashtbl_t * gcsharedptbl;
+// remote shared pointer tbls
+//struct GCSharedHash * gcrpointertbls[NUMCORES4GC];
+mgcsharedhashtbl_t * gcrpointertbls[NUMCORES4GC];
+
+#ifdef LOCALHASHTBL_TEST
+struct RuntimeHash * gcpointertbl;
+#else
+mgchashtable_t * gcpointertbl;
+#endif
+//struct MGCHash * gcpointertbl;
+int gcobj2map;
+int gcmappedobj;
+volatile bool gcismapped;
+
+// table recording the starting address of each small block
+// (size is BAMBOO_SMEM_SIZE)
+// Note: 1. this table always resides on the very bottom of the shared memory
+//       2. the first two blocks are reserved for this table, would never be
+//          moved or garbage collected.
+INTPTR * gcsbstarttbl;
+int gcreservedsb;  // number of reserved sblock for sbstarttbl
+int gcnumblock; // number of total blocks in the shared mem
+int gcbaseva; // base va for shared memory without reserved sblocks
+#ifdef GC_CACHE_ADAPT
+int gctopva; // top va for shared memory without reserved sblocks
+volatile bool gccachestage;
+#endif // GC_CACHE_ADAPT
+// nonzero if p lies strictly between gcbaseva and gcbaseva+BAMBOO_SHARED_MEM_SIZE (p is evaluated twice)
+#define ISSHAREDOBJ(p) \
+  ((((int)(p))>gcbaseva)&&(((int)(p))<(gcbaseva+(BAMBOO_SHARED_MEM_SIZE))))
+// store through as (an int *): s with the BAMBOO_CACHE_LINE_MASK bits cleared, plus BAMBOO_CACHE_LINE_SIZE
+#define ALIGNSIZE(s, as) \
+  ((*((int*)(as))) = (((s) & (~(BAMBOO_CACHE_LINE_MASK))) + (BAMBOO_CACHE_LINE_SIZE)))
+
+// mapping of pointer to block # (start from 0), here the block # is
+// the global index; the result is stored through b (an int *)
+#define BLOCKINDEX(p, b) \
+  { \
+    int blkoff_ = (p) - gcbaseva; \
+    if(blkoff_ < (BAMBOO_LARGE_SMEM_BOUND)) { \
+      (*((int*)(b))) = blkoff_ / (BAMBOO_SMEM_SIZE_L); \
+    } else { \
+      (*((int*)(b))) = NUMCORES4GC+((blkoff_-(BAMBOO_LARGE_SMEM_BOUND))/(BAMBOO_SMEM_SIZE)); \
+    } \
+  }
+
+// mapping of pointer to core #; the result is stored through c (an int *)
+#define RESIDECORE(p, c) \
+  { \
+    if(1 == (NUMCORES4GC)) { \
+      (*((int*)(c))) = 0; \
+    } else { \
+      int rc_blk_; \
+      BLOCKINDEX((p), &rc_blk_); \
+      (*((int*)(c))) = gc_block2core[(rc_blk_%(NUMCORES4GC*2))]; \
+    } \
+  }
+
+// NOTE: n starts from 0
+// mapping of heaptop (how many bytes there are in the local heap) to
+// the number of the block, i.e. the block is the xth block on the local
+// heap; the result is stored through n (an int *)
+// Expands to a bare if/else, so use it only as a complete statement.
+#define NUMBLOCKS(s, n) \
+  if((s) < (BAMBOO_SMEM_SIZE_L)) { \
+    (*((int*)(n))) = 0; \
+  } else { \
+    (*((int*)(n))) = 1 + ((s) - (BAMBOO_SMEM_SIZE_L)) / (BAMBOO_SMEM_SIZE); \
+  }
+// offset of heaptop s inside its block, stored through o (an int *); bare if/else, use as a statement
+#define OFFSET(s, o) \
+  if((s) < (BAMBOO_SMEM_SIZE_L)) { \
+    (*((int*)(o))) = (s); \
+  } else { \
+    (*((int*)(o))) = ((s) - (BAMBOO_SMEM_SIZE_L)) % (BAMBOO_SMEM_SIZE); \
+  }
+
+// mapping of (core #, index of the block) to the global block index
+#define BLOCKINDEX2(c, n) (gc_core2block[(2*(c))+((n)%2)]+((NUMCORES4GC*2)*((n)/2)))
+
+// mapping of (core #, number of the block) to the base pointer of the block, stored through p (an int *)
+#define BASEPTR(c, n, p) \
+  { \
+    int bp_blk_ = BLOCKINDEX2((c), (n)); \
+    if(bp_blk_ < (NUMCORES4GC)) { \
+      (*((int*)(p))) = gcbaseva + bp_blk_ * (BAMBOO_SMEM_SIZE_L); \
+    } else { \
+      (*((int*)(p))) = gcbaseva+(BAMBOO_LARGE_SMEM_BOUND)+ \
+                     (bp_blk_-(NUMCORES4GC))*(BAMBOO_SMEM_SIZE); \
+    } \
+  }
+
+// the next core in the top of the heap
+#define NEXTTOPCORE(b) (gc_block2core[((b)+1)%(NUMCORES4GC*2)])
+
+inline bool gc(struct garbagelist * stackptr); // core coordinator routine
+inline void gc_collect(struct garbagelist* stackptr); //core collector routine
+inline void gc_nocollect(struct garbagelist* stackptr); //non-gc core collector routine
+inline void transferMarkResults_I();
+inline void gc_enqueue_I(void *ptr);
+inline void gc_lobjenqueue_I(void *ptr, int length, int host);
+inline bool gcfindSpareMem_I(int * startaddr,
+                             int * tomove,
+                             int * dstcore,
+                             int requiredmem,
+                             int requiredcore);
+
+inline void * gc_lobjdequeue4(int * length, int * host);
+inline int gc_lobjmoreItems4();
+inline void gc_lobjqueueinit4();
+
+#ifdef GC_PROFILE
+INLINE void gc_profileStart(void);
+INLINE void gc_profileItem(void);
+INLINE void gc_profileEnd(void);
+void gc_outputProfileData();
+#endif
+
+#endif
+
diff --git a/Robust/src/Runtime/bamboo/multicoregc.h b/Robust/src/Runtime/bamboo/multicoregc.h
new file mode 100644
index 00000000..0f7ddc4c
--- /dev/null
+++ b/Robust/src/Runtime/bamboo/multicoregc.h
@@ -0,0 +1,16 @@
+#ifndef MULTICORE_GC_H
+#define MULTICORE_GC_H
+
+struct garbagelist {
+  int size;                  // presumably the number of entries in array[] -- confirm
+  struct garbagelist *next;  // link to another garbagelist
+  void * array[];            // flexible array member holding the pointers
+};
+
+struct listitem {
+  struct listitem * prev;         // neighbouring node (doubly linked)
+  struct listitem * next;         // neighbouring node (doubly linked)
+  struct garbagelist * stackptr;  // garbagelist carried by this node
+};
+
+#endif // MULTICORE_GC_H
diff --git a/Robust/src/Runtime/bamboo/multicorehelper.h b/Robust/src/Runtime/bamboo/multicorehelper.h
new file mode 100644
index 00000000..3519f5a7
--- /dev/null
+++ b/Robust/src/Runtime/bamboo/multicorehelper.h
@@ -0,0 +1,56 @@
+#ifndef MULTICORE_HELPER_H
+#define MULTICORE_HELPER_H
+
+#ifdef GC_1
+// NUMCORES4GC = 1
+static int gc_core2block[2] = {0,1};
+
+static int gc_block2core[2] = { 0,  0};
+#elif defined GC_56
+// NUMCORES4GC = 56
+static int gc_core2block[112] = {
+  0,111,  15, 96,  16,95,  31,80,  32,79,  47,64,  48,63,
+  1,110,  14, 97,  17,94,  30,81,  33,78,  46,65,  49,62,
+  2,109,  13, 98,  18,93,  29,82,  34,77,  45,66,  50,61,
+  3,108,  12, 99,  19,92,  28,83,  35,76,  44,67,  51,60,
+  4,107,  11,100,  20,91,  27,84,  36,75,  43,68,  52,59,
+  5,106,  10,101,  21,90,  26,85,  37,74,  42,69,  53,58,
+  6,105,   9,102,  22,89,  25,86,  38,73,  41,70,  54,57,
+  7,104,   8,103,  23,88,  24,87,  39,72,  40,71,  55,56
+};
+
+static int gc_block2core[112] = {
+  0,  7, 14, 21, 28, 35, 42, 49, 50, 43, 36, 29, 22, 15,  8,  1,
+  2,  9, 16, 23, 30, 37, 44, 51, 52, 45, 38, 31, 24, 17, 10,  3,
+  4, 11, 18, 25, 32, 39, 46, 53, 54, 47, 40, 33, 26, 19, 12,  5,
+  6, 13, 20, 27, 34, 41, 48, 55, 55, 48, 41, 34, 27, 20, 13,  6,
+  5, 12, 19, 26, 33, 40, 47, 54, 53, 46, 39, 32, 25, 18, 11,  4,
+  3, 10, 17, 24, 31, 38, 45, 52, 51, 44, 37, 30, 23, 16,  9,  2,
+  1,  8, 15, 22, 29, 36, 43, 50, 49, 42, 35, 28, 21, 14,  7,  0
+};
+#elif defined GC_62
+// NUMCORES4GC = 62
+static int gc_core2block[124] = {
+  0,123,  15,108,  16,107,  31,92,  32,91,  47,76,
+  1,122,  14,109,  17,106,  30,93,  33,90,  46,77,  48,75,  61,62,
+  2,121,  13,110,  18,105,  29,94,  34,89,  45,78,  49,74,  60,63,
+  3,120,  12,111,  19,104,  28,95,  35,88,  44,79,  50,73,  59,64,
+  4,119,  11,112,  20,103,  27,96,  36,87,  43,80,  51,72,  58,65,
+  5,118,  10,113,  21,102,  26,97,  37,86,  42,81,  52,71,  57,66,
+  6,117,   9,114,  22,101,  25,98,  38,85,  41,82,  53,70,  56,67,
+  7,116,   8,115,  23,100,  24,99,  39,84,  40,83,  54,69,  55,68
+};
+
+static int gc_block2core[124] = {
+  0,  6, 14, 22, 30, 38, 46, 54, 55, 47, 39, 31, 23, 15,  7,  1,
+  2,  8, 16, 24, 32, 40, 48, 56, 57, 49, 41, 33, 25, 17,  9,  3,
+  4, 10, 18, 26, 34, 42, 50, 58, 59, 51, 43, 35, 27, 19, 11,  5,
+  12, 20, 28, 36, 44, 52, 60, 61, 53, 45, 37, 29, 21, 13,
+  13, 21, 29, 37, 45, 53, 61, 60, 52, 44, 36, 28, 20, 12,
+  5, 11, 19, 27, 35, 43, 51, 59, 58, 50, 42, 34, 26, 18, 10,  4,
+  3,  9, 17, 25, 33, 41, 49, 57, 56, 48, 40, 32, 24, 16,  8,  2,
+  1,  7, 15, 23, 31, 39, 47, 55, 54, 46, 38, 30, 22, 14,  6,  0
+};
+#endif
+
+#endif // MULTICORE_HELPER_H
diff --git a/Robust/src/Runtime/bamboo/multicoreruntime.c b/Robust/src/Runtime/bamboo/multicoreruntime.c
new file mode 100644
index 00000000..140d68f5
--- /dev/null
+++ b/Robust/src/Runtime/bamboo/multicoreruntime.c
@@ -0,0 +1,337 @@
+#include "runtime.h"
+#include "structdefs.h"
+#include "mem.h"
+#ifndef MULTICORE
+#include <fcntl.h>
+#include <errno.h>
+#include <signal.h>
+#endif
+#ifndef RAW
+#include <stdio.h>
+#endif
+#ifdef MULTICORE
+#include "runtime_arch.h"
+#endif
+//#include "option.h"
+
+extern int classsize[];
+extern int typearray[];
+extern int typearray2[];
+#ifndef MULTICORE
+jmp_buf error_handler;
+int instructioncount;
+
+char *options;
+int injectfailures=0;
+float failurechance=0;
+int errors=0;
+int injectinstructionfailures;
+int failurecount;
+float instfailurechance=0;
+int numfailures;
+int instaccum=0;
+#ifdef DMALLOC
+#include "dmalloc.h"
+#endif
+#endif
+
+int debugtask=0;
+
+/* Return 1 if `type` is ptr's type or is reached by following the typearray
+ * chain (for a type above NUMCLASSES also the typearray2 chain); else 0. */
+int instanceof(struct ___Object___ *ptr, int type) {
+  const int objtype=ptr->type;
+  int cur=objtype;
+  for (;;) {
+    if (cur==type) return 1;
+    if ((cur=typearray[cur])==-1) break;  /* -1 ends the chain */
+  }
+  if (objtype<=NUMCLASSES) return 0;
+  cur=objtype;
+  for (;;) {
+    if (cur==type) return 1;
+    if ((cur=typearray2[cur-NUMCLASSES])==-1) break;
+  }
+  return 0;
+}
+
+#ifdef MULTICORE
+void initializeexithandler() { // nothing to install in the MULTICORE build
+}
+#else
+void exithandler(int sig, siginfo_t *info, void * uap) { // signal handler: terminates with status 0
+#ifdef DEBUG
+  printf("exit in exithandler\n");
+#endif
+  exit(0);
+}
+
+void initializeexithandler() { // install exithandler as the SIGUSR2 handler
+  struct sigaction sig;
+  sig.sa_sigaction=&exithandler;
+  sig.sa_flags=SA_SIGINFO; // use the three-argument sa_sigaction form
+  sigemptyset(&sig.sa_mask);
+  sigaction(SIGUSR2, &sig, 0);
+}
+#endif
+
+/* Randomly injects a failure when injectinstructionfailures is set */
+
+void injectinstructionfailure() {
+#ifdef MULTICORE
+  // not supported in MULTICORE version
+  return;
+#else
+#ifdef TASK
+  if (injectinstructionfailures) {
+    if (numfailures==0) // no failures left to inject
+      return;
+    instructioncount=failurecount;
+    instaccum+=failurecount;
+    if ((((double)random())/RAND_MAX)<instfailurechance) {
+      if (numfailures>0) // a negative count is never decremented
+	numfailures--;
+      printf("FAILURE!!! %d\n",numfailures);
+      longjmp(error_handler,11); // TASK build: jump back to error_handler
+    }
+  }
+#else
+#ifdef THREADS
+  if (injectinstructionfailures) {
+    if (numfailures==0) // no failures left to inject
+      return;
+    instaccum+=failurecount;
+    if ((((double)random())/RAND_MAX)<instfailurechance) {
+      if (numfailures>0) // a negative count is never decremented
+	numfailures--;
+      printf("FAILURE!!! %d\n",numfailures);
+      threadexit(); // THREADS build: end the current thread instead
+    }
+  }
+#endif
+#endif
+#endif
+}
+
+#ifdef D___Double______nativeparsedouble____L___String___
+// Convert at most the first 60 characters of ___str___ to a double via atof().
+double CALL01(___Double______nativeparsedouble____L___String___,struct ___String___ * ___str___) {
+  const int count=VAR(___str___)->___count___;
+  const int n=(count<=60) ? count : 60;
+  char buf[n+1];
+  struct ArrayObject * chars=VAR(___str___)->___value___;
+  // the character data starts right after the ___length___ field
+  const short *src=((short *)(((char *)&chars->___length___)+sizeof(int)))+VAR(___str___)->___offset___;
+  for(int k=0; k<n; k++) {
+    buf[k]=src[k];
+  }
+  buf[n]=0;
+  return atof(buf);
+}
+#endif
+
+#ifdef D___String______convertdoubletochar____D__AR_C
+// Format ___val___ with "%f" into ___chararray___; returns the number of chars stored there.
+int CALL12(___String______convertdoubletochar____D__AR_C, double ___val___, double ___val___, struct ArrayObject ___chararray___) {
+  int length=VAR(___chararray___)->___length___;
+  char str[length];
+  int num=snprintf(str, length, "%f",___val___);
+  if (num>=length)
+    num=length-1;  // the text was truncated to fit the array
+  for(int i=0; i<num; i++) {  // copy only what snprintf wrote, never uninitialized bytes of str
+    ((short *)(((char *)&VAR(___chararray___)->___length___)+sizeof(int)))[i]=(short)str[i];
+  }
+  return num;
+}
+#else
+int CALL12(___String______convertdoubletochar____D__AR_C, double ___val___, double ___val___, struct ArrayObject ___chararray___) {
+  return 0;  // conversion not compiled in: report zero characters
+}
+#endif
+
+void CALL11(___System______exit____I,int ___status___, int ___status___) { // terminate with ___status___
+#ifdef MULTICORE
+  BAMBOO_EXIT(___status___); // MULTICORE build uses the runtime's own exit macro
+#else
+#ifdef DEBUG
+  printf("exit in CALL11\n");
+#endif
+  exit(___status___);
+#endif
+}
+
+// Drop element ___index___ by moving the (___size___-___index___-1) pointers after it down one slot.
+void CALL23(___Vector______removeElement_____AR_L___Object____I_I, int ___index___, int ___size___, struct ArrayObject * ___array___, int ___index___, int ___size___) {
+  char *first=((char *)(&VAR(___array___)->___length___))+sizeof(unsigned int);
+  char *dst=first+sizeof(void *)*___index___;
+  memmove(dst, dst+sizeof(void *),(___size___-___index___-1)*sizeof(void *));
+}
+
+void CALL11(___System______printI____I,int ___status___, int ___status___) { // print the integer ___status___
+#ifdef MULTICORE
+  BAMBOO_DEBUGPRINT(0x1111); // constant printed ahead of the value
+  BAMBOO_DEBUGPRINT_REG(___status___);
+#else
+#ifdef DEBUG
+  printf("printI in CALL11\n");
+#endif
+  printf("%d\n", ___status___);
+#endif
+}
+
+long CALL00(___System______currentTimeMillis____) {
+#ifndef MULTICORE
+  /* Time reported by gettimeofday(), in milliseconds; the sum is narrowed to `long` on return. */
+  struct timeval now;
+  gettimeofday(&now, NULL);
+  long long whole_ms = ((long long)now.tv_sec)*1000;
+  long long frac_ms = now.tv_usec/1000;
+  return whole_ms+frac_ms;
+#else
+  // not supported in MULTICORE version
+  return -1;
+#endif
+}
+
+void CALL01(___System______printString____L___String___,struct ___String___ * ___s___) {
+#ifndef MULTICORE
+  /* Host build only: pass each short element of the string to putchar(); MULTICORE builds do nothing. */
+  struct ArrayObject * chars=VAR(___s___)->___value___;
+  int first=VAR(___s___)->___offset___;
+  int k=0;
+  while(k<VAR(___s___)->___count___) {
+    putchar(((short *)(((char *)&chars->___length___)+sizeof(int)))[first+k]);
+    k++;
+  }
+#endif
+}
+
+/* Object allocation function */
+
+#ifdef MULTICORE_GC
+void * allocate_new(void * ptr, int type) {
+  struct ___Object___ * obj=(struct ___Object___ *)FREEMALLOC((struct garbagelist *) ptr, classsize[type]); /* classsize[type] bytes; ptr is passed on as the garbagelist */
+#ifdef DEBUG
+  printf("(%x,%x): new object: %x (%d, %x) \n", udn_tile_coord_x(),
+         udn_tile_coord_y(), (int)obj, type, classsize[type]);
+#endif
+  obj->lockcount = 0; /* header fields start cleared, then initlock() is run on the object */
+  obj->lock = NULL;
+  obj->version = 0;
+  obj->type=type;
+  initlock(obj);
+#ifdef GC_PROFILE
+  extern unsigned int gc_num_obj;
+  gc_num_obj++; /* one more allocation counted for GC profiling */
+#endif
+  return obj;
+}
+
+/* Array allocation function */
+
+struct ArrayObject * allocate_newarray(void * ptr, int type, int length) { /* new array of `length` elements of classsize[type] bytes each; NULL if length<0 */
+  if (length<0) { /* reject before allocating: the size below must not be computed from a negative count */
+    return NULL;
+  }
+  struct ArrayObject * v=(struct ArrayObject *)FREEMALLOC((struct garbagelist *) ptr, sizeof(struct ArrayObject)+length*classsize[type]);
+#ifdef DEBUG
+  printf("(%x,%x): new array object: %x (%d, %x)\n", udn_tile_coord_x(),
+         udn_tile_coord_y(), (int)v, type, 
+		 (unsigned int)(sizeof(struct ArrayObject)+length*classsize[type])); /* cast: %x expects unsigned int, the sum is a size_t */
+#endif
+  v->type=type;
+  v->version = 0;
+  v->lock = NULL;
+  v->___length___=length;
+  initlock(v);
+#ifdef GC_PROFILE
+  extern unsigned int gc_num_obj;
+  gc_num_obj++; /* one more allocation counted for GC profiling */
+#endif
+  return v;
+}
+
+#else
+void * allocate_new(int type) {
+  struct ___Object___ * obj=FREEMALLOC(classsize[type]);
+  /* classsize[type] bytes were requested; set the header fields, then run initlock() on the object */
+  obj->lock = NULL;
+  obj->version = 0;
+  obj->type=type;
+  initlock(obj);
+  return obj;
+}
+
+/* Array allocation function */
+
+struct ArrayObject * allocate_newarray(int type, int length) { /* new array of `length` elements of classsize[type] bytes each; NULL if length<0 */
+  if (length<0) return NULL; /* same result as the MULTICORE_GC variant; keeps the size below from being computed from a negative count */
+  struct ArrayObject * v=FREEMALLOC(sizeof(struct ArrayObject)+length*classsize[type]);
+  v->type=type;
+  v->version = 0;
+  v->lock = NULL;
+  v->___length___=length;
+  initlock(v);
+  return v;
+}
+#endif
+
+
+/* Builds a ___String___ object (offset 0, count = length) whose char array holds str[0..length-1], one short per char */
+#ifdef MULTICORE_GC
+struct ___String___ * NewString(void * ptr, const char *str,int length) {
+#else
+struct ___String___ * NewString(const char *str,int length) {
+#endif
+  int i;
+#ifdef MULTICORE_GC
+  struct ArrayObject * chararray=allocate_newarray((struct garbagelist *)ptr, CHARARRAYTYPE, length);
+  int ptrarray[]={1, (int) ptr, (int) chararray}; // handed to allocate_new() below as a garbagelist; NOTE(review): pointers are stored as int -- assumes they fit, confirm for 64-bit builds
+  struct ___String___ * strobj=allocate_new((struct garbagelist *) &ptrarray, STRINGTYPE);
+  chararray=(struct ArrayObject *) ptrarray[2]; // re-read from the list -- presumably the allocation above may relocate the array; TODO confirm
+#else
+  struct ArrayObject * chararray=allocate_newarray(CHARARRAYTYPE, length);
+  struct ___String___ * strobj=allocate_new(STRINGTYPE);
+#endif
+  strobj->___value___=chararray;
+  strobj->___count___=length;
+  strobj->___offset___=0;
+
+  for(i=0; i<length; i++) { // element data starts sizeof(int) bytes past the address of ___length___
+    ((short *)(((char *)&chararray->___length___)+sizeof(int)))[i]=(short)str[i];
+  }
+  return strobj;
+}
+
+/* Generated code calls this if we fail a bounds check */
+
+void failedboundschk() {
+  /* Reaction to a failed array bounds check, selected at compile time:
+   *   TASK and MULTICORE -> BAMBOO_EXIT(0xa001), nothing is printed;
+   *   any other build    -> print the message, then longjmp / threadexit / exit. */
+#if defined(TASK) && defined(MULTICORE)
+  BAMBOO_EXIT(0xa001);
+#else
+  printf("Array out of bounds\n");
+#if defined(TASK)
+  longjmp(error_handler,2);
+#elif defined(THREADS)
+  threadexit();
+#else
+  exit(-1);
+#endif
+#endif
+}
+
+/* Abort task call */
+void abort_task() {
+  /* TASK+MULTICORE builds compile to an empty body; others print, then longjmp (TASK) or exit. */
+#if !(defined(TASK) && defined(MULTICORE))
+  printf("Aborting\n");
+#ifdef TASK
+  longjmp(error_handler,4);
+#else
+  exit(-1);
+#endif
+#endif
+}
diff --git a/Robust/src/Runtime/bamboo/multicoreruntime.h b/Robust/src/Runtime/bamboo/multicoreruntime.h
new file mode 100644
index 00000000..c734d5cc
--- /dev/null
+++ b/Robust/src/Runtime/bamboo/multicoreruntime.h
@@ -0,0 +1,605 @@
+#ifndef MULTICORE_RUNTIME
+#define MULTICORE_RUNTIME
+
+#ifndef INLINE // default only: an earlier definition of INLINE is kept
+#define INLINE    inline __attribute__((always_inline))
+#endif
+
+#ifndef bool // fallback boolean, used only when `bool` is not already defined as a macro
+#define bool int
+#define true 1
+#define false 0
+#endif
+
+////////////////////////////////////////////////////////////////
+// global variables                                          //
+///////////////////////////////////////////////////////////////
+
+// record the starting time
+unsigned long long bamboo_start_time;
+
+// data structures for msgs; NOTE(review): these are definitions (no `extern`) placed in a header -- confirm that is intended
+#define BAMBOO_OUT_BUF_LENGTH 3000 // number of ints in outmsgdata
+#define BAMBOO_MSG_BUF_LENGTH 3000 // number of ints in msgdata
+int msgdata[BAMBOO_MSG_BUF_LENGTH]; // circular buffer of message words; its indices wrap modulo the length
+volatile int msgdataindex; // index advanced by MSG_INDEXINC_I()
+volatile int msgdatalast; // index at which MSG_CACHE_I() stores the next word
+int msglength;
+volatile bool msgdatafull; // tells full from empty when the two indices are equal (see MSG_REMAINSIZE_I)
+int outmsgdata[BAMBOO_OUT_BUF_LENGTH]; // circular buffer written by OUTMSG_CACHE()
+int outmsgindex; // index advanced by OUTMSG_INDEXINC()
+int outmsglast; // index at which OUTMSG_CACHE() stores the next word
+int outmsgleft;
+volatile bool isMsgHanging;
+//volatile bool isMsgSending;
+
+#define MSG_INDEXINC_I() /* step msgdataindex forward by one, wrapping at the buffer length */ \
+  msgdataindex = (msgdataindex + 1) % (BAMBOO_MSG_BUF_LENGTH)
+
+#define MSG_LASTINDEXINC_I() /* step msgdatalast forward by one, wrapping at the buffer length */ \
+  msgdatalast = (msgdatalast + 1) % (BAMBOO_MSG_BUF_LENGTH)
+
+#define MSG_CACHE_I(n) /* store n at msgdatalast and advance; NOTE(review): two statements, brace it under if/else */ \
+  msgdata[msgdatalast] = (n); \
+  MSG_LASTINDEXINC_I()
+
+// NOTE: msgdataindex == msgdatalast means the buffer is full when
+//       msgdatafull is set; otherwise 0 is stored through s. Callers are
+//       expected not to use this macro on an empty buffer.
+#define MSG_REMAINSIZE_I(s) /* writes the number of buffered words through s, which is used as an int pointer */ \
+  if(msgdataindex < msgdatalast) { \
+    (*(int*)s) = msgdatalast - msgdataindex; \
+  } else if((msgdataindex == msgdatalast) && (!msgdatafull)) { \
+    (*(int*)s) = 0; \
+  }       else { /* wrapped (or full): count to the end of the buffer plus the part from its start */ \
+    (*(int*)s) = (BAMBOO_MSG_BUF_LENGTH) -msgdataindex + msgdatalast; \
+  }
+
+#define OUTMSG_INDEXINC() /* step outmsgindex forward by one, wrapping at the buffer length */ \
+  outmsgindex = (outmsgindex + 1) % (BAMBOO_OUT_BUF_LENGTH)
+
+#define OUTMSG_LASTINDEXINC() /* step outmsglast forward; catching up with outmsgindex is treated as fatal */ \
+  outmsglast = (outmsglast + 1) % (BAMBOO_OUT_BUF_LENGTH); \
+  if(outmsglast == outmsgindex) { \
+    BAMBOO_EXIT(0xdd01); \
+  }
+
+#define OUTMSG_CACHE(n) /* store n at outmsglast and advance; NOTE(review): several statements, brace it under if/else */ \
+  outmsgdata[outmsglast] = (n); \
+  OUTMSG_LASTINDEXINC();
+
+#define MAX_PACKET_WORDS 5
+
+/* Message format:
+ *      type + Msgbody
+ * type: 1 -- transfer object
+ *       2 -- transfer stall msg
+ *       3 -- lock request
+ *       4 -- lock grant
+ *       5 -- lock deny
+ *       6 -- lock release
+ *       // add for profile info
+ *       7 -- transfer profile output msg
+ *       8 -- transfer profile output finish msg
+ *       // add for alias lock strategy
+ *       9 -- redirect lock request
+ *       a -- lock grant with redirect info
+ *       b -- lock deny with redirect info
+ *       c -- lock release with redirect info
+ *       d -- status confirm request
+ *       e -- status report msg
+ *       f -- terminate
+ *      10 -- requiring for new memory
+ *      11 -- response for new memory request
+ *      12 -- GC init phase start
+ *      13 -- GC start
+ *      14 -- compact phase start
+ *      15 -- flush phase start
+ *      16 -- init phase finish
+ *      17 -- mark phase finish
+ *      18 -- compact phase finish
+ *      19 -- flush phase finish
+ *      1a -- GC finish
+ *      1b -- marked phase finish confirm request
+ *      1c -- marked phase finish confirm response
+ *      1d -- markedObj msg
+ *      1e -- start moving objs msg
+ *      1f -- ask for mapping info of a markedObj
+ *      20 -- mapping info of a markedObj
+ *      21 -- large objs info request
+ *      22 -- large objs info response
+ *      23 -- large objs mapping info
+ *
+ * ObjMsg: 1 + size of msg + obj's address + (task index + param index)+
+ * StallMsg: 2 + corenum + sendobjs + receiveobjs
+ *             (size is always 4 * sizeof(int))
+ * LockMsg: 3 + lock type + obj pointer + lock + request core
+ *            (size is always 5 * sizeof(int))
+ *          4/5/6 + lock type + obj pointer + lock
+ *            (size is always 4 * sizeof(int))
+ *          9 + lock type + obj pointer +  redirect lock + root request core
+ *            + request core
+ *            (size is always 6 * sizeof(int))
+ *          a/b + lock type + obj pointer + redirect lock
+ *              (size is always 4 * sizeof(int))
+ *          c + lock type + lock + redirect lock
+ *            (size is always 4 * sizeof(int))
+ *          lock type: 0 -- read; 1 -- write
+ * ProfileMsg: 7 + totalexetime
+ *               (size is always 2 * sizeof(int))
+ *             8 + corenum
+ *               (size is always 2 * sizeof(int))
+ * StatusMsg: d (size is always 1 * sizeof(int))
+ *            e + status + corenum + sendobjs + receiveobjs
+ *              (size is always 5 * sizeof(int))
+ *            status: 0 -- stall; 1 -- busy
+ * TerminateMsg: f (size is always 1 * sizeof(int))
+ * MemoryMsg: 10 + size + corenum
+ *              (size is always 3 * sizeof(int))
+ *           11 + base_va + size
+ *              (size is always 3 * sizeof(int))
+ * GCMsg: 12/13 (size is always 1 * sizeof(int))
+ *        14 + size of msg + (num of objs to move + (start address
+ *           + end address + dst core + start dst)+)?
+ *           + (num of incoming objs + (start dst + orig core)+)?
+ *           + (num of large obj lists + (start address + length
+ *           + start dst)+)?
+ *        15 (size is always 1 * sizeof(int))
+ *        16 + corenum
+ *           (size is always 2 * sizeof(int))
+ *        17 + corenum + gcsendobjs + gcreceiveobjs
+ *           (size is always 4 * sizeof(int))
+ *        18 + corenum + fulfilled blocks num + (finish compact(1) + current
+ *           heap top)/(need mem(0) + mem need)
+ *           (size is always 5 * sizeof(int))
+ *        19 + corenum
+ *              (size is always 2 * sizeof(int))
+ *        1a (size is always 1 * sizeof(int))
+ *        1b (size is always 1 * sizeof(int))
+ *        1c + size of msg + corenum + gcsendobjs + gcreceiveobjs
+ *           (size is always 5 * sizeof(int))
+ *        1d + obj's address + request core
+ *           (size is always 3 * sizeof(int))
+ *        1e + corenum + start addr + end addr
+ *           (size is always 4 * sizeof(int))
+ *        1f + obj's address + corenum
+ *           (size is always 3 * sizeof(int))
+ *        20 + obj's address + dst address
+ *           (size is always 3 * sizeof(int))
+ *        21 (size is always 1 * sizeof(int))
+ *        22 + size of msg + corenum + current heap size
+ *           + (num of large obj lists + (start address + length)+)?
+ *        23 + orig large obj ptr + new large obj ptr
+ *            (size is always 3 * sizeof(int))
+ */
+typedef enum {
+  MSGSTART = 0xD0,       // 0xD0, the entries below count up from here
+  TRANSOBJ,              // 0xD1
+  TRANSTALL,             // 0xD2
+  LOCKREQUEST,           // 0xD3
+  LOCKGROUNT,            // 0xD4 (lock grant; identifier spelling kept)
+  LOCKDENY,              // 0xD5
+  LOCKRELEASE,           // 0xD6
+  PROFILEOUTPUT,         // 0xD7
+  PROFILEFINISH,         // 0xD8
+  REDIRECTLOCK,          // 0xD9
+  REDIRECTGROUNT,        // 0xDa (lock grant with redirect info)
+  REDIRECTDENY,          // 0xDb
+  REDIRECTRELEASE,       // 0xDc
+  STATUSCONFIRM,         // 0xDd
+  STATUSREPORT,          // 0xDe
+  TERMINATE,             // 0xDf
+  MEMREQUEST,            // 0xE0
+  MEMRESPONSE,           // 0xE1
+#ifdef MULTICORE_GC
+  GCSTARTPRE,            // 0xE2
+  GCSTARTINIT,           // 0xE3
+  GCSTART,               // 0xE4
+  GCSTARTCOMPACT,        // 0xE5
+  GCSTARTMAPINFO,        // 0xE6
+  GCSTARTFLUSH,          // 0xE7
+  GCFINISHPRE,           // 0xE8
+  GCFINISHINIT,          // 0xE9
+  GCFINISHMARK,          // 0xEa
+  GCFINISHCOMPACT,       // 0xEb
+  GCFINISHMAPINFO,       // 0xEc
+  GCFINISHFLUSH,         // 0xEd
+  GCFINISH,              // 0xEe
+  GCMARKCONFIRM,         // 0xEf
+  GCMARKREPORT,          // 0xF0
+  GCMARKEDOBJ,           // 0xF1
+  GCMOVESTART,           // 0xF2
+  GCMAPREQUEST,          // 0xF3
+  GCMAPINFO,             // 0xF4
+  GCMAPTBL,              // 0xF5
+  GCLOBJREQUEST,         // 0xF6
+  GCLOBJINFO,            // 0xF7
+  GCLOBJMAPPING,         // 0xF8
+#ifdef GC_PROFILE
+  GCPROFILES,            // 0xF9 (present only when GC_PROFILE is defined)
+#endif
+#ifdef GC_CACHE_ADAPT
+  GCSTARTPOSTINIT,       // 0xFa when GC_PROFILE is defined, otherwise 0xF9
+  GCSTARTPREF,           // 0xFb (one less without GC_PROFILE)
+  GCFINISHPOSTINIT,      // 0xFc (one less without GC_PROFILE)
+  GCFINISHPREF,          // 0xFd (one less without GC_PROFILE)
+#endif // GC_CACHE_ADAPT
+#endif
+  MSGEND                 // one past the last message type compiled in
+} MSGTYPE;
+
+/////////////////////////////////////////////////////////////////////////////////
+// NOTE: BAMBOO_TOTALCORE -- number of the available cores in the processor.
+//                           No greater than the number of all the cores in
+//                           the processor
+//       NUMCORES -- number of cores chosen to deploy the application. It can
+//                   be greater than that required to fully parallelize the
+//                   application. The same as NUMCORES.
+//       NUMCORESACTIVE -- number of cores that really execute the
+//                         application. No greater than NUMCORES
+//       NUMCORES4GC -- number of cores for gc. No greater than NUMCORES.
+//                      NOTE: currently only supports continuous cores as gc
+//                            cores, i.e. 0~NUMCORES4GC-1
+////////////////////////////////////////////////////////////////////////////////
+// data structures of status for termination
+// only check working cores
+volatile int corestatus[NUMCORESACTIVE]; // records status of each core
+                                         // 1: running tasks
+                                         // 0: stall
+volatile int numsendobjs[NUMCORESACTIVE]; // records how many objects a core
+                                          // has sent out
+volatile int numreceiveobjs[NUMCORESACTIVE]; // records how many objects a
+                                             // core has received
+volatile int numconfirm; // presumably counts outstanding status-confirm replies -- TODO confirm
+volatile bool waitconfirm; // presumably set while such replies are awaited -- TODO confirm
+bool busystatus;
+int self_numsendobjs; // presumably this core's own sent-object count -- TODO confirm
+int self_numreceiveobjs; // presumably this core's own received-object count -- TODO confirm
+
+// lock tables are compiled only for the non-GC version (the GC version gets rid of lock msgs)
+#ifndef MULTICORE_GC
+// data structures for locking
+struct RuntimeHash locktable;
+static struct RuntimeHash* locktbl = &locktable; // `static`: each including translation unit gets its own pointer
+struct RuntimeHash * lockRedirectTbl;
+struct RuntimeHash * objRedirectLockTbl;
+#endif
+struct LockValue { // element type of runtime_locks[] in multicoretask.c
+  int redirectlock;
+  int value;
+};
+int lockobj;
+int lock2require;
+int lockresult;
+bool lockflag;
+
+// data structures for waiting objs; NOTE(review): objqueue is a struct, totransobjqueue only a pointer
+struct Queue objqueue;
+struct Queue * totransobjqueue; // queue to hold objs to be transferred
+                                // should be cleared whenever enter a task
+
+// data structures for shared memory allocation
+#ifdef TILERA_BME
+#define BAMBOO_BASE_VA 0xd000000
+#elif defined TILERA_ZLINUX
+#ifdef MULTICORE_GC // on TILERA_ZLINUX the base address is defined only for GC builds
+#define BAMBOO_BASE_VA 0xd000000
+#endif // MULTICORE_GC
+#endif // TILERA_BME
+
+#ifdef BAMBOO_MEMPROF // core count used below to size BAMBOO_NUM_PAGES; smaller with BAMBOO_MEMPROF
+#define GC_BAMBOO_NUMCORES 56
+#else
+#define GC_BAMBOO_NUMCORES 62
+#endif
+
+#ifdef GC_DEBUG // debug sizing: small pages, page count derived from NUMCORES4GC
+#include "structdefs.h"
+#define BAMBOO_NUM_PAGES (NUMCORES4GC*(2+1)+3)
+#define BAMBOO_PAGE_SIZE (64 * 64)
+#define BAMBOO_SMEM_SIZE (64 * 64) // same value as BAMBOO_PAGE_SIZE
+#define BAMBOO_SHARED_MEM_SIZE ((BAMBOO_PAGE_SIZE) *(BAMBOO_NUM_PAGES))
+#else
+#ifdef GC_LARGESHAREDHEAP
+#define BAMBOO_NUM_PAGES ((GC_BAMBOO_NUMCORES)*(2+2))
+#elif defined GC_LARGESHAREDHEAP2 // NOTE(review): same value as the GC_LARGESHAREDHEAP branch -- confirm intended
+#define BAMBOO_NUM_PAGES ((GC_BAMBOO_NUMCORES)*(2+2))
+#else
+#define BAMBOO_NUM_PAGES ((GC_BAMBOO_NUMCORES)*(2+3)) // default: 5 pages per counted core
+#endif
+#ifdef GC_LARGEPAGESIZE
+#define BAMBOO_PAGE_SIZE (4 * 1024 * 1024)  // 4 MiB
+#define BAMBOO_SMEM_SIZE (4 * 1024 * 1024)
+#elif defined GC_SMALLPAGESIZE
+#define BAMBOO_PAGE_SIZE (256 * 1024)  // 256 KiB
+#define BAMBOO_SMEM_SIZE (256 * 1024)
+#elif defined GC_SMALLPAGESIZE2
+#define BAMBOO_PAGE_SIZE (64 * 1024)  // 64 KiB
+#define BAMBOO_SMEM_SIZE (64 * 1024)
+#else
+#define BAMBOO_PAGE_SIZE (1024 * 1024)  // 1 MiB (default)
+#define BAMBOO_SMEM_SIZE (1024 * 1024)
+#endif // GC_LARGEPAGESIZE
+#define BAMBOO_SHARED_MEM_SIZE ((BAMBOO_PAGE_SIZE) * (BAMBOO_NUM_PAGES)) // total bytes = page size * page count
+// (earlier hard-coded sizes such as 240 MB and 3 GB were replaced by the product above)
+#endif // GC_DEBUG
+
+#ifdef MULTICORE_GC // GC builds only
+volatile bool gc_localheap_s; // NOTE(review): purpose not visible in this header -- document at the point of use
+#endif
+
+#ifdef MULTICORE_GC
+#include "multicoregarbage.h"
+
+typedef enum { // shared-memory allocation strategies
+  SMEMLOCAL = 0x0,// 0x0, using local mem only
+  SMEMFIXED,      // 0x1, use local mem in lower address space(1 block only)
+                  //      and global mem in higher address space
+  SMEMMIXED,      // 0x2, like FIXED mode but use a threshold to control
+  SMEMGLOBAL,     // 0x3, using global mem only
+  SMEMEND         // 0x4, end marker, not a strategy
+} SMEMSTRATEGY;
+
+SMEMSTRATEGY bamboo_smem_mode; //-DSMEML: LOCAL; -DSMEMF: FIXED;
+                               //-DSMEMM: MIXED; -DSMEMG: GLOBAL;
+
+struct freeMemItem { // node of a singly linked list (see `next`)
+  INTPTR ptr; // presumably the start address of the free region -- TODO confirm
+  int size;
+  int startblock;
+  int endblock;
+  struct freeMemItem * next;
+};
+
+struct freeMemList {
+  struct freeMemItem * head;
+  struct freeMemItem * backuplist; // hold removed freeMemItem for reuse;
+                                   // only maintain 1 freeMemItem
+};
+
+// table recording the number of allocated bytes on each block
+// Note: this table resides on the bottom of the shared heap for all cores
+//       to access
+volatile int * bamboo_smemtbl;
+volatile int bamboo_free_block;
+//bool bamboo_smem_flushed;
+//struct freeMemList * bamboo_free_mem_list;
+int bamboo_reserved_smem; // reserved blocks on the top of the shared heap
+                          // e.g. 20% of the heap and should not be allocated
+                          // otherwise gc is invoked
+volatile INTPTR bamboo_smem_zero_top;
+#define BAMBOO_SMEM_ZERO_UNIT_SIZE (4 * 1024) // 4KB
+#else // non-GC build: only these two shared-memory variables are declared here
+//volatile mspace bamboo_free_msp;
+INTPTR bamboo_free_smemp;
+int bamboo_free_smem_size;
+#endif
+volatile bool smemflag; // the three variables below are declared in both GC and non-GC builds
+volatile INTPTR bamboo_cur_msp;
+volatile int bamboo_smem_size;
+
+// for test TODO
+int total_num_t6;
+
+// data structures for profile mode
+#ifdef PROFILE
+
+#define TASKINFOLENGTH 3000 // capacity of taskInfoArray
+#ifdef PROFILE_INTERRUPT
+#define INTERRUPTINFOLENGTH 50 // capacity of interruptInfoArray
+#endif // PROFILE_INTERRUPT
+
+bool stall;
+//bool isInterrupt;
+int totalexetime;
+//unsigned long long interrupttime;
+
+typedef struct task_info { // one profiling record per task
+  char* taskName;
+  unsigned long long startTime;
+  unsigned long long endTime;
+  unsigned long long exitIndex;
+  struct Queue * newObjs;
+} TaskInfo;
+
+TaskInfo * taskInfoArray[TASKINFOLENGTH];
+int taskInfoIndex; // presumably the next free slot in taskInfoArray -- TODO confirm
+bool taskInfoOverflow; // presumably set once taskInfoArray is exhausted -- TODO confirm
+#ifdef PROFILE_INTERRUPT
+typedef struct interrupt_info { // one profiling record per interrupt
+  unsigned long long startTime;
+  unsigned long long endTime;
+} InterruptInfo;
+
+InterruptInfo * interruptInfoArray[INTERRUPTINFOLENGTH];
+int interruptInfoIndex;
+bool interruptInfoOverflow;
+#endif // PROFILE_INTERRUPT
+volatile int profilestatus[NUMCORESACTIVE]; // records status of each core
+                                            // 1: running tasks
+                                            // 0: stall
+#endif // #ifdef PROFILE
+
+#ifndef INTERRUPT // declared only for builds without INTERRUPT
+bool reside; // NOTE(review): meaning not visible in this header -- document at the point of use
+#endif
+/////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////
+// these are functions should be implemented in           //
+// multicore runtime for any multicore processors         //
+////////////////////////////////////////////////////////////
+#ifdef TASK
+#ifdef MULTICORE
+INLINE void initialization(void);
+INLINE void initCommunication(void);
+INLINE void fakeExecution(void);
+INLINE void terminate(void);
+INLINE void initlock(struct ___Object___ * v); // called by allocate_new()/allocate_newarray() on each new object
+#ifdef BAMBOO_MEMPROF
+INLINE void terminatememprof(void); // declared only with BAMBOO_MEMPROF
+#endif
+
+// lock related functions; NOTE(review): the _I variants presumably run inside critical sections -- confirm
+bool getreadlock(void* ptr);
+void releasereadlock(void* ptr);
+bool getwritelock(void* ptr);
+void releasewritelock(void* ptr);
+bool getwritelock_I(void* ptr);
+void releasewritelock_I(void * ptr);
+#ifndef MULTICORE_GC
+void releasewritelock_r(void * lock, void * redirectlock); // non-GC builds only
+#endif
+/* processlockrequest() processes one lock request;
+ * it may only be invoked from receiveObject() */
+// return value: -1: the lock request is redirected
+//                0: the lock request is approved
+//                1: the lock request is denied
+INLINE int processlockrequest(int locktype,
+                              int lock,
+                              int obj,
+                              int requestcore,
+                              int rootrequestcore,
+                              bool cache);
+INLINE void processlockrelease(int locktype,
+                               int lock,
+                               int redirectlock,
+                               bool redirect);
+
+// msg related functions: send_msg_N takes N message words n0..n(N-1) for targetcore
+INLINE void send_hanging_msg(bool isInterrupt);
+INLINE void send_msg_1(int targetcore,
+                       unsigned long n0,
+					   bool isInterrupt);
+INLINE void send_msg_2(int targetcore,
+                       unsigned long n0,
+                       unsigned long n1,
+					   bool isInterrupt);
+INLINE void send_msg_3(int targetcore,
+                       unsigned long n0,
+                       unsigned long n1,
+                       unsigned long n2,
+					   bool isInterrupt);
+INLINE void send_msg_4(int targetcore,
+                       unsigned long n0,
+                       unsigned long n1,
+                       unsigned long n2,
+                       unsigned long n3,
+					   bool isInterrupt);
+INLINE void send_msg_5(int targetcore,
+                       unsigned long n0,
+                       unsigned long n1,
+                       unsigned long n2,
+                       unsigned long n3,
+                       unsigned long n4,
+					   bool isInterrupt);
+INLINE void send_msg_6(int targetcore,
+                       unsigned long n0,
+                       unsigned long n1,
+                       unsigned long n2,
+                       unsigned long n3,
+                       unsigned long n4,
+                       unsigned long n5,
+					   bool isInterrupt);
+INLINE void cache_msg_1(int targetcore, // cache_msg_N mirrors send_msg_N without the isInterrupt flag
+                        unsigned long n0);
+INLINE void cache_msg_2(int targetcore,
+                        unsigned long n0,
+                        unsigned long n1);
+INLINE void cache_msg_3(int targetcore,
+                        unsigned long n0,
+                        unsigned long n1,
+                        unsigned long n2);
+INLINE void cache_msg_4(int targetcore,
+                        unsigned long n0,
+                        unsigned long n1,
+                        unsigned long n2,
+                        unsigned long n3);
+INLINE void cache_msg_5(int targetcore,
+                        unsigned long n0,
+                        unsigned long n1,
+                        unsigned long n2,
+                        unsigned long n3,
+                        unsigned long n4);
+INLINE void cache_msg_6(int targetcore,
+                        unsigned long n0,
+                        unsigned long n1,
+                        unsigned long n2,
+                        unsigned long n3,
+                        unsigned long n4,
+                        unsigned long n5);
+INLINE void transferObject(struct transObjInfo * transObj);
+INLINE int receiveMsg(uint32_t send_port_pending); // NOTE(review): uint32_t needs <stdint.h> or an equivalent in scope before this header
+
+#ifdef MULTICORE_GC
+INLINE void transferMarkResults(); // GC builds only
+#endif
+
+#ifdef PROFILE // profiling hooks, declared only with PROFILE
+INLINE void profileTaskStart(char * taskname);
+INLINE void profileTaskEnd(void);
+void outputProfileData();
+#endif  // #ifdef PROFILE
+///////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////
+// For each version of BAMBOO runtime, there should be a header file named //
+// runtime_arch.h defining the following macros:                           //
+// BAMBOO_NUM_OF_CORE: the # of current residing core                      //
+// BAMBOO_GET_NUM_OF_CORE(): compute the # of current residing core        //
+// BAMBOO_COORDS(c, x, y): convert the cpu # to coords (*x, *y)            //
+// BAMBOO_DEBUGPRINT(x): print out integer x                               //
+// BAMBOO_DEBUGPRINT_REG(x): print out value of variable x                 //
+// BAMBOO_EXIT_APP(x): exit the whole application                          //
+// BAMBOO_EXIT(x): error exit routine with error #                         //
+// BAMBOO_DIE(x): error exit routine with error msg                        //
+// BAMBOO_GET_EXE_TIME(): routine to get current clock cycle number        //
+// BAMBOO_MSG_AVAIL(): checking if there are msgs coming in                //
+// BAMBOO_GCMSG_AVAIL(): checking if there are gcmsgs coming in            //
+// BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT(): change to runtime mode from    //
+//                                          client mode                    //
+// BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME(): change to client mode from     //
+//                                          runtime mode                   //
+// BAMBOO_ENTER_SEND_MODE_FROM_CLIENT(): change to send mode from          //
+//                                       client mode                       //
+// BAMBOO_ENTER_CLIENT_MODE_FROM_SEND(): change to client mode from        //
+//                                       send mode                         //
+// BAMBOO_ENTER_RUNTIME_MODE_FROM_SEND(): change to runtime mode from      //
+//                                        send mode                        //
+// BAMBOO_ENTER_SEND_MODE_FROM_RUNTIME(): change to send mode from         //
+//                                        runtime mode                     //
+// BAMBOO_WAITING_FOR_LOCK(): routine executed while waiting for lock      //
+//                            request response                             //
+// BAMBOO_LOCAL_MEM_CALLOC(x, y): allocate an array of x elements each of  //
+//                                whose size in bytes is y on local memory //
+//                                which is given by the hypervisor         //
+// BAMBOO_LOCAL_MEM_FREE(x): free space with ptr x on local memory         //
+// BAMBOO_LOCAL_MEM_CLOSE(): close the local heap                          //
+// BAMBOO_LOCAL_MEM_CALLOC_S(x, y): allocate an array of x elements each of//
+//                                  whose size in bytes is y on local      //
+//                                  memory which is not from the hypervisor//
+//                                  but is allocated from the free memory  //
+// BAMBOO_LOCAL_MEM_FREE_S(x): free space with ptr x on self-allocated     //
+//                             local memory                                //
+// BAMBOO_LOCAL_MEM_CLOSE_S(): close the self-allocated local heap        //
+// BAMBOO_SHARE_MEM_CALLOC_I(x, y): allocate an array of x elements each of//
+//                                whose size in bytes is y on shared memory//
+// BAMBOO_SHARE_MEM_CLOSE(): close the shared heap                         //
+// BAMBOO_CACHE_LINE_SIZE: the cache line size                             //
+// BAMBOO_CACHE_LINE_MASK: mask for a cache line                           //
+// BAMBOO_CACHE_FLUSH_RANGE(x, y): flush cache lines started at x with     //
+//                                 length y                                //
+// BAMBOO_CACHE_FLUSH_ALL(): flush the whole cache of a core if necessary  //
+// BAMBOO_MEMSET_WH(x, y, z): memset the specified region of memory (start //
+//                            address x, size z) to value y with write     //
+//                            hint, the processor will not fetch the       //
+//                            current content of the memory and directly   //
+//                            write                                        //
+// BAMBOO_CLEAN_DTLB(): zero-out all the dtlb entries                      //
+// BAMBOO_CACHE_FLUSH_L2(): Flush the contents of this tile's L2 back to   //
+//                          main memory                                    //
+/////////////////////////////////////////////////////////////////////////////
+
+#endif  // #ifdef MULTICORE
+#endif  // #ifdef TASK
+#endif  // #ifndef MULTICORE_RUNTIME
diff --git a/Robust/src/Runtime/bamboo/multicoretask.c b/Robust/src/Runtime/bamboo/multicoretask.c
new file mode 100644
index 00000000..cfcb41d8
--- /dev/null
+++ b/Robust/src/Runtime/bamboo/multicoretask.c
@@ -0,0 +1,4600 @@
+#ifdef TASK
+#include "runtime.h"
+#include "multicoreruntime.h"
+#include "runtime_arch.h"
+#include "GenericHashtable.h"
+
+#ifndef INLINE
+#define INLINE    inline __attribute__((always_inline))
+#endif // #ifndef INLINE
+
+//  data structures for task invocation
+struct genhashtable * activetasks; // built in run(); NULL on cores that only process communications
+struct taskparamdescriptor * currtpd; // reset to NULL by initruntimedata(); freed in disruntimedata()
+struct LockValue runtime_locks[MAXTASKPARAMS]; // every entry is zeroed by initruntimedata()
+int runtime_locklen; // zeroed by initruntimedata()
+
+// specific functions used inside critical sections
+void enqueueObject_I(void * ptr,
+                     struct parameterwrapper ** queues,
+                     int length);
+int enqueuetasks_I(struct parameterwrapper *parameter,
+                   struct parameterwrapper *prevptr,
+                   struct ___Object___ *ptr,
+                   int * enterflags,
+                   int numenterflags);
+
+#ifdef MULTICORE_GC
+#ifdef SMEMF
+#define NUM_CORES2TEST 5
+#ifdef GC_1
+int core2test[1][NUM_CORES2TEST] = { // single-core layout: one row for core 0
+  {0, -1, -1, -1, -1} // first slot is the core itself; every other slot is -1
+};
+#elif defined GC_56
+int core2test[56][NUM_CORES2TEST] = { // row c: {c, c-7, c+7, c-1, c+1}; -1 where a 7-wide, 8-row grid has no such core
+  { 0, -1,  7, -1,  1}, { 1, -1,  8,  0,  2}, { 2, -1,  9,  1,  3},
+  { 3, -1, 10,  2,  4}, { 4, -1, 11,  3,  5}, { 5, -1, 12,  4,  6},
+  { 6, -1, 13,  5, -1}, { 7,  0, 14, -1,  8}, { 8,  1, 15,  7,  9},
+  { 9,  2, 16,  8, 10}, {10,  3, 17,  9, 11}, {11,  4, 18, 10, 12},
+  {12,  5, 19, 11, 13}, {13,  6, 20, 12, -1}, {14,  7, 21, -1, 15},
+  {15,  8, 22, 14, 16}, {16,  9, 23, 15, 17}, {17, 10, 24, 16, 18},
+  {18, 11, 25, 17, 19}, {19, 12, 26, 18, 20}, {20, 13, 27, 19, -1},
+  {21, 14, 28, -1, 22}, {22, 15, 29, 21, 23}, {23, 16, 30, 22, 24},
+  {24, 17, 31, 23, 25}, {25, 18, 32, 24, 26}, {26, 19, 33, 25, 27},
+  {27, 20, 34, 26, -1}, {28, 21, 35, -1, 29}, {29, 22, 36, 28, 30},
+  {30, 23, 37, 29, 31}, {31, 24, 38, 30, 32}, {32, 25, 39, 31, 33},
+  {33, 26, 40, 32, 34}, {34, 27, 41, 33, -1}, {35, 28, 42, -1, 36},
+  {36, 29, 43, 35, 37}, {37, 30, 44, 36, 38}, {38, 31, 45, 37, 39},
+  {39, 32, 46, 38, 40}, {40, 33, 47, 39, 41}, {41, 34, 48, 40, -1},
+  {42, 35, 49, -1, 43}, {43, 36, 50, 42, 44}, {44, 37, 51, 43, 45},
+  {45, 38, 52, 44, 46}, {46, 39, 53, 45, 47}, {47, 40, 54, 46, 48},
+  {48, 41, 55, 47, -1}, {49, 42, -1, -1, 50}, {50, 43, -1, 49, 51},
+  {51, 44, -1, 50, 52}, {52, 45, -1, 51, 53}, {53, 46, -1, 52, 54},
+  {54, 47, -1, 53, 55}, {55, 48, -1, 54, -1}
+};
+#elif defined GC_62
+int core2test[62][NUM_CORES2TEST] = { // row c: {c, above, below, left, right}; grid rows are cores 0-5, 6-13, then 14-61 in rows of 8; -1 = no core
+  { 0, -1,  6, -1,  1}, { 1, -1,  7,  0,  2}, { 2, -1,  8,  1,  3},
+  { 3, -1,  9,  2,  4}, { 4, -1, 10,  3,  5}, { 5, -1, 11,  4, -1},
+  { 6,  0, 14, -1,  7}, { 7,  1, 15,  6,  8}, { 8,  2, 16,  7,  9},
+  { 9,  3, 17,  8, 10}, {10,  4, 18,  9, 11}, {11,  5, 19, 10, 12},
+  {12, -1, 20, 11, 13}, {13, -1, 21, 12, -1}, {14,  6, 22, -1, 15},
+  {15,  7, 23, 14, 16}, {16,  8, 24, 15, 17}, {17,  9, 25, 16, 18},
+  {18, 10, 26, 17, 19}, {19, 11, 27, 18, 20}, {20, 12, 28, 19, 21},
+  {21, 13, 29, 20, -1}, {22, 14, 30, -1, 23}, {23, 15, 31, 22, 24}, // core 21: left neighbour is 20 (was mistyped as 28)
+  {24, 16, 32, 23, 25}, {25, 17, 33, 24, 26}, {26, 18, 34, 25, 27},
+  {27, 19, 35, 26, 28}, {28, 20, 36, 27, 29}, {29, 21, 37, 28, -1},
+  {30, 22, 38, -1, 31}, {31, 23, 39, 30, 32}, {32, 24, 40, 31, 33},
+  {33, 25, 41, 32, 34}, {34, 26, 42, 33, 35}, {35, 27, 43, 34, 36},
+  {36, 28, 44, 35, 37}, {37, 29, 45, 36, -1}, {38, 30, 46, -1, 39},
+  {39, 31, 47, 38, 40}, {40, 32, 48, 39, 41}, {41, 33, 49, 40, 42},
+  {42, 34, 50, 41, 43}, {43, 35, 51, 42, 44}, {44, 36, 52, 43, 45},
+  {45, 37, 53, 44, -1}, {46, 38, 54, -1, 47}, {47, 39, 55, 46, 48},
+  {48, 40, 56, 47, 49}, {49, 41, 57, 48, 50}, {50, 42, 58, 49, 51},
+  {51, 43, 59, 50, 52}, {52, 44, 60, 51, 53}, {53, 45, 61, 52, -1},
+  {54, 46, -1, -1, 55}, {55, 47, -1, 54, 56}, {56, 48, -1, 55, 57},
+  {57, 49, -1, 56, 58}, {58, 50, -1, 57, 59}, {59, 51, -1, 58, 60}, // core 57: right neighbour is 58 (was mistyped as 59)
+  {60, 52, -1, 59, 61}, {61, 53, -1, 60, -1}
+};
+#endif // GC_1
+#elif defined SMEMM
+unsigned int gcmem_mixed_threshold = 0; // initruntimedata() sets this to 0.8 * (shared memory size minus the reserved blocks)
+unsigned int gcmem_mixed_usedmem = 0; // reset to 0 by initruntimedata()
+#define NUM_CORES2TEST 9
+#ifdef GC_1
+int core2test[1][NUM_CORES2TEST] = { // single-core layout: one row for core 0
+  {0, -1, -1, -1, -1, -1, -1, -1, -1} // first slot is the core itself; every other slot is -1
+};
+#elif defined GC_56
+int core2test[56][NUM_CORES2TEST] = { // row c: {c, c-7, c+7, c-1, c+1, c-14, c+14, c-2, c+2}; -1 where a 7-wide, 8-row grid has no such core
+  { 0, -1,  7, -1,  1, -1, 14, -1,  2}, 
+  { 1, -1,  8,  0,  2, -1, 15, -1,  3}, 
+  { 2, -1,  9,  1,  3, -1, 16,  0,  4}, 
+  { 3, -1, 10,  2,  4, -1, 17,  1,  5}, 
+  { 4, -1, 11,  3,  5, -1, 18,  2,  6}, 
+  { 5, -1, 12,  4,  6, -1, 19,  3, -1},
+  { 6, -1, 13,  5, -1, -1, 20,  4, -1}, 
+  { 7,  0, 14, -1,  8, -1, 21, -1,  9}, 
+  { 8,  1, 15,  7,  9, -1, 22, -1, 10}, 
+  { 9,  2, 16,  8, 10, -1, 23,  7, 11}, 
+  {10,  3, 17,  9, 11, -1, 24,  8, 12}, 
+  {11,  4, 18, 10, 12, -1, 25,  9, 13},
+  {12,  5, 19, 11, 13, -1, 26, 10, -1}, 
+  {13,  6, 20, 12, -1, -1, 27, 11, -1}, 
+  {14,  7, 21, -1, 15,  0, 28, -1, 16}, 
+  {15,  8, 22, 14, 16,  1, 29, -1, 17}, 
+  {16,  9, 23, 15, 17,  2, 30, 14, 18}, 
+  {17, 10, 24, 16, 18,  3, 31, 15, 19},
+  {18, 11, 25, 17, 19,  4, 32, 16, 20}, 
+  {19, 12, 26, 18, 20,  5, 33, 17, -1}, 
+  {20, 13, 27, 19, -1,  6, 34, 18, -1}, 
+  {21, 14, 28, -1, 22,  7, 35, -1, 23}, 
+  {22, 15, 29, 21, 23,  8, 36, -1, 24}, 
+  {23, 16, 30, 22, 24,  9, 37, 21, 25},
+  {24, 17, 31, 23, 25, 10, 38, 22, 26}, 
+  {25, 18, 32, 24, 26, 11, 39, 23, 27}, 
+  {26, 19, 33, 25, 27, 12, 40, 24, -1}, 
+  {27, 20, 34, 26, -1, 13, 41, 25, -1}, 
+  {28, 21, 35, -1, 29, 14, 42, -1, 30}, 
+  {29, 22, 36, 28, 30, 15, 43, -1, 31},
+  {30, 23, 37, 29, 31, 16, 44, 28, 32}, 
+  {31, 24, 38, 30, 32, 17, 45, 29, 33}, 
+  {32, 25, 39, 31, 33, 18, 46, 30, 34}, 
+  {33, 26, 40, 32, 34, 19, 47, 31, -1}, 
+  {34, 27, 41, 33, -1, 20, 48, 32, -1}, 
+  {35, 28, 42, -1, 36, 21, 49, -1, 37},
+  {36, 29, 43, 35, 37, 22, 50, -1, 38}, 
+  {37, 30, 44, 36, 38, 23, 51, 35, 39}, 
+  {38, 31, 45, 37, 39, 24, 52, 36, 40}, 
+  {39, 32, 46, 38, 40, 25, 53, 37, 41}, 
+  {40, 33, 47, 39, 41, 26, 54, 38, -1}, 
+  {41, 34, 48, 40, -1, 27, 55, 39, -1},
+  {42, 35, 49, -1, 43, 28, -1, -1, 44}, 
+  {43, 36, 50, 42, 44, 29, -1, -1, 45}, 
+  {44, 37, 51, 43, 45, 30, -1, 42, 46}, 
+  {45, 38, 52, 44, 46, 31, -1, 43, 47}, 
+  {46, 39, 53, 45, 47, 32, -1, 44, 48}, 
+  {47, 40, 54, 46, 48, 33, -1, 45, -1},
+  {48, 41, 55, 47, -1, 34, -1, 46, -1}, 
+  {49, 42, -1, -1, 50, 35, -1, -1, 51}, 
+  {50, 43, -1, 49, 51, 36, -1, -1, 52}, 
+  {51, 44, -1, 50, 52, 37, -1, 49, 53}, 
+  {52, 45, -1, 51, 53, 38, -1, 50, 54}, 
+  {53, 46, -1, 52, 54, 39, -1, 51, 55},
+  {54, 47, -1, 53, 55, 40, -1, 52, -1}, 
+  {55, 48, -1, 54, -1, 41, -1, 53, -1}
+};
+#elif defined GC_62
+int core2test[62][NUM_CORES2TEST] = { // row c: {c, above, below, left, right, 2 above, 2 below, 2 left, 2 right}; grid rows are cores 0-5, 6-13, then 14-61 in rows of 8; -1 = no core
+  { 0, -1,  6, -1,  1, -1, 14, -1,  2}, 
+  { 1, -1,  7,  0,  2, -1, 15, -1,  3}, 
+  { 2, -1,  8,  1,  3, -1, 16,  0,  4}, 
+  { 3, -1,  9,  2,  4, -1, 17,  1,  5}, 
+  { 4, -1, 10,  3,  5, -1, 18,  2, -1}, 
+  { 5, -1, 11,  4, -1, -1, 19,  3, -1},
+  { 6,  0, 14, -1,  7, -1, 22, -1,  8}, 
+  { 7,  1, 15,  6,  8, -1, 23, -1,  9}, 
+  { 8,  2, 16,  7,  9, -1, 24,  6, 10}, 
+  { 9,  3, 17,  8, 10, -1, 25,  7, 11}, 
+  {10,  4, 18,  9, 11, -1, 26,  8, 12}, 
+  {11,  5, 19, 10, 12, -1, 27,  9, 13},
+  {12, -1, 20, 11, 13, -1, 28, 10, -1}, 
+  {13, -1, 21, 12, -1, -1, 29, 11, -1}, 
+  {14,  6, 22, -1, 15,  0, 30, -1, 16}, 
+  {15,  7, 23, 14, 16,  1, 31, -1, 17}, 
+  {16,  8, 24, 15, 17,  2, 32, 14, 18}, 
+  {17,  9, 25, 16, 18,  3, 33, 15, 19},
+  {18, 10, 26, 17, 19,  4, 34, 16, 20}, 
+  {19, 11, 27, 18, 20,  5, 35, 17, 21}, 
+  {20, 12, 28, 19, 21, -1, 36, 18, -1}, 
+  {21, 13, 29, 20, -1, -1, 37, 19, -1}, // left neighbour is 20 (was mistyped as 28)
+  {22, 14, 30, -1, 23,  6, 38, -1, 24}, 
+  {23, 15, 31, 22, 24,  7, 39, -1, 25},
+  {24, 16, 32, 23, 25,  8, 40, 22, 26}, 
+  {25, 17, 33, 24, 26,  9, 41, 23, 27}, 
+  {26, 18, 34, 25, 27, 10, 42, 24, 28}, 
+  {27, 19, 35, 26, 28, 11, 43, 25, 29}, 
+  {28, 20, 36, 27, 29, 12, 44, 26, -1}, 
+  {29, 21, 37, 28, -1, 13, 45, 27, -1},
+  {30, 22, 38, -1, 31, 14, 46, -1, 32}, // two rows up is 14 (was mistyped as 22)
+  {31, 23, 39, 30, 32, 15, 47, -1, 33}, 
+  {32, 24, 40, 31, 33, 16, 48, 30, 34}, 
+  {33, 25, 41, 32, 34, 17, 49, 31, 35}, 
+  {34, 26, 42, 33, 35, 18, 50, 32, 36}, 
+  {35, 27, 43, 34, 36, 19, 51, 33, 37},
+  {36, 28, 44, 35, 37, 20, 52, 34, -1}, 
+  {37, 29, 45, 36, -1, 21, 53, 35, -1}, 
+  {38, 30, 46, -1, 39, 22, 54, -1, 40}, 
+  {39, 31, 47, 38, 40, 23, 55, -1, 41}, 
+  {40, 32, 48, 39, 41, 24, 56, 38, 42}, 
+  {41, 33, 49, 40, 42, 25, 57, 39, 43},
+  {42, 34, 50, 41, 43, 26, 58, 40, 44}, 
+  {43, 35, 51, 42, 44, 27, 59, 41, 45}, 
+  {44, 36, 52, 43, 45, 28, 60, 42, -1}, 
+  {45, 37, 53, 44, -1, 29, 61, 43, -1}, 
+  {46, 38, 54, -1, 47, 30, -1, -1, 48}, 
+  {47, 39, 55, 46, 48, 31, -1, -1, 49},
+  {48, 40, 56, 47, 49, 32, -1, 46, 50}, 
+  {49, 41, 57, 48, 50, 33, -1, 47, 51}, 
+  {50, 42, 58, 49, 51, 34, -1, 48, 52}, 
+  {51, 43, 59, 50, 52, 35, -1, 49, 53}, 
+  {52, 44, 60, 51, 53, 36, -1, 50, -1}, 
+  {53, 45, 61, 52, -1, 37, -1, 51, -1},
+  {54, 46, -1, -1, 55, 38, -1, -1, 56}, 
+  {55, 47, -1, 54, 56, 39, -1, -1, 57}, 
+  {56, 48, -1, 55, 57, 40, -1, 54, 58}, 
+  {57, 49, -1, 56, 58, 41, -1, 55, 59}, // right neighbour is 58 (was mistyped as 59)
+  {58, 50, -1, 57, 59, 42, -1, 56, 60}, 
+  {59, 51, -1, 58, 60, 43, -1, 57, 61},
+  {60, 52, -1, 59, 61, 44, -1, 58, -1}, 
+  {61, 53, -1, 60, -1, 45, -1, 59, -1}
+};
+#endif // GC_1
+#endif
+
+inline __attribute__((always_inline)) // Stores the shared-memory allocation policy chosen at compile time into bamboo_smem_mode.
+void setupsmemmode(void) { // exactly one branch below survives preprocessing (SMEML / SMEMF / SMEMM / SMEMG / default)
+#ifdef SMEML
+  // Only allocate local memory chunks to each core.
+  // If a core has used up its local shared memory, start gc.
+  bamboo_smem_mode = SMEMLOCAL;
+#elif defined SMEMF
+  // Allocate from a core's own local shared memory first; if that is
+  // used up, try the shared memory that belongs to its neighbours; if that
+  // also fails, start gc.
+  bamboo_smem_mode = SMEMFIXED;
+#elif defined SMEMM
+  // Allocate from a core's own local shared memory first; if that is
+  // used up, try the shared memory that belongs to its neighbours; if that
+  // fails, check the current memory allocation rate: if it has already
+  // reached the threshold, start gc, otherwise allocate shared memory
+  // globally.  If all the shared memory has been used up, start gc.
+  // (SMEMMIXED is the mode that uses gcmem_mixed_threshold.)
+  bamboo_smem_mode = SMEMMIXED;
+#elif defined SMEMG
+  // Allocate all the memory chunks globally, without regard to host cores.
+  // When all the shared memory is used up, start gc.
+  bamboo_smem_mode = SMEMGLOBAL;
+#else
+  // default to local mode when no SMEM* option is defined
+  bamboo_smem_mode = SMEMLOCAL;
+  //bamboo_smem_mode = SMEMGLOBAL;
+  //bamboo_smem_mode = SMEMFIXED;
+#endif
+} // void setupsmemmode(void)
+#endif
+
+inline __attribute__((always_inline)) // Resets this core's runtime state; the startup core additionally resets the per-core status arrays.
+void initruntimedata() { // takes no arguments and returns nothing; works purely on runtime globals
+  int i;
+  // initialize the arrays
+  if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
+    // startup core to initialize corestatus[]
+    for(i = 0; i < NUMCORESACTIVE; ++i) {
+      corestatus[i] = 1; // checkCoreStatus() treats 0 as "stalled", so every core starts as running
+      numsendobjs[i] = 0;
+      numreceiveobjs[i] = 0;
+#ifdef PROFILE
+      // initialize the profile data arrays
+      profilestatus[i] = 1;
+#endif
+#ifdef MULTICORE_GC
+      gccorestatus[i] = 1;
+      gcnumsendobjs[0][i] = gcnumsendobjs[1][i] = 0;
+      gcnumreceiveobjs[0][i] = gcnumreceiveobjs[1][i] = 0;
+#endif
+    } // for(i = 0; i < NUMCORESACTIVE; ++i)
+#ifdef MULTICORE_GC
+    for(i = 0; i < NUMCORES4GC; ++i) {
+      gcloads[i] = 0;
+      gcrequiredmems[i] = 0;
+      gcstopblock[i] = 0;
+      gcfilledblocks[i] = 0;
+    } // for(i = 0; i < NUMCORES4GC; ++i)
+#ifdef GC_PROFILE
+    gc_infoIndex = 0;
+    gc_infoOverflow = false;
+	gc_num_livespace = 0;
+	gc_num_freespace = 0;
+#endif
+#endif
+    numconfirm = 0;
+    waitconfirm = false;
+
+    // TODO for test
+    total_num_t6 = 0;
+  }
+
+  busystatus = true;
+  self_numsendobjs = 0;
+  self_numreceiveobjs = 0;
+
+  for(i = 0; i < BAMBOO_MSG_BUF_LENGTH; ++i) {
+    msgdata[i] = -1; // fill the incoming message buffer with -1
+  }
+  msgdataindex = 0;
+  msgdatalast = 0;
+  msglength = BAMBOO_MSG_BUF_LENGTH;
+  msgdatafull = false;
+  for(i = 0; i < BAMBOO_OUT_BUF_LENGTH; ++i) {
+    outmsgdata[i] = -1; // fill the outgoing message buffer with -1
+  }
+  outmsgindex = 0;
+  outmsglast = 0;
+  outmsgleft = 0;
+  isMsgHanging = false;
+  //isMsgSending = false;
+
+  smemflag = true;
+  bamboo_cur_msp = NULL;
+  bamboo_smem_size = 0;
+  totransobjqueue = createQueue_I();
+
+#ifdef MULTICORE_GC
+  bamboo_smem_zero_top = NULL;
+  gcflag = false;
+  gcprocessing = false;
+  gcphase = FINISHPHASE;
+  //gcnumpre = 0;
+  gcprecheck = true;
+  gccurr_heaptop = 0;
+  gcself_numsendobjs = 0;
+  gcself_numreceiveobjs = 0;
+  gcmarkedptrbound = 0;
+#ifdef LOCALHASHTBL_TEST
+  gcpointertbl = allocateRuntimeHash_I(20);
+#else
+  gcpointertbl = mgchashCreate_I(2000, 0.75);
+#endif
+  //gcpointertbl = allocateMGCHash_I(20);
+  gcforwardobjtbl = allocateMGCHash_I(20, 3);
+  gcobj2map = 0;
+  gcmappedobj = 0;
+  //gcismapped = false;
+  gcnumlobjs = 0;
+  gcheaptop = 0;
+  gctopcore = 0;
+  gctopblock = 0;
+  gcmovestartaddr = 0;
+  gctomove = false;
+  gcmovepending = 0;
+  gcblock2fill = 0;
+  gcsbstarttbl = BAMBOO_BASE_VA;
+  bamboo_smemtbl = (void *)gcsbstarttbl // placed right after one INTPTR slot per BAMBOO_SMEM_SIZE unit of shared memory
+               + (BAMBOO_SHARED_MEM_SIZE/BAMBOO_SMEM_SIZE)*sizeof(INTPTR);
+  if(BAMBOO_NUM_OF_CORE < NUMCORES4GC) {
+	int t_size = ((BAMBOO_RMSP_SIZE)-sizeof(mgcsharedhashtbl_t)*2
+		-128*sizeof(size_t))/sizeof(mgcsharedhashlistnode_t)-2;
+	int kk = 0; // number of left shifts needed to bring the highest set bit to the top
+	unsigned int tmp_k = 1U << (sizeof(int)*8 -1); // top bit; 1U avoids shifting a signed 1 into the sign bit
+	unsigned int t_bits = (unsigned int)t_size; // shift an unsigned copy so no signed overflow can occur
+	while(((t_bits & tmp_k) == 0) && (kk < (int)(sizeof(int)*8 - 1))) { // cap keeps the final shift count below the type width
+	  t_bits = t_bits << 1; kk++;
+	}
+	t_size = tmp_k >> kk; // largest power of two not exceeding the computed size (1 when the size was 0)
+	gcsharedptbl = mgcsharedhashCreate_I(t_size,0.30);//allocateGCSharedHash_I(20);
+  } else {
+	gcsharedptbl = NULL;
+  }
+  BAMBOO_MEMSET_WH(gcrpointertbls, 0, 
+	  sizeof(mgcsharedhashtbl_t *)*NUMCORES4GC);
+	  //sizeof(struct RuntimeHash *)*NUMCORES4GC);
+#ifdef SMEMM
+  gcmem_mixed_threshold = (unsigned int)((BAMBOO_SHARED_MEM_SIZE
+		-bamboo_reserved_smem*BAMBOO_SMEM_SIZE)*0.8);
+  gcmem_mixed_usedmem = 0;
+#endif
+#ifdef GC_PROFILE
+  gc_num_obj = 0;
+  gc_num_liveobj = 0;
+  gc_num_forwardobj = 0;
+  gc_num_profiles = NUMCORESACTIVE - 1;
+#endif
+#ifdef GC_FLUSH_DTLB
+  gc_num_flush_dtlb = 0;
+#endif
+  gc_localheap_s = false;
+#ifdef GC_CACHE_ADAPT
+  gccachestage = false;
+  // enable the timer interrupt
+  bamboo_tile_timer_set_next_event(500000000); // TODO
+  bamboo_unmask_timer_intr();
+  //BAMBOO_DEBUGPRINT(BAMBOO_GET_EXE_TIME());
+  bamboo_dtlb_sampling_process();
+#endif // GC_CACHE_ADAPT
+#else
+  // create the lock table, lockresult table and obj queue
+  locktable.size = 20;
+  locktable.bucket =
+    (struct RuntimeNode **) RUNMALLOC_I(sizeof(struct RuntimeNode *)*20);
+  /* Set allocation blocks*/
+  locktable.listhead=NULL;
+  locktable.listtail=NULL;
+  /*Set data counts*/
+  locktable.numelements = 0;
+  lockobj = 0;
+  lock2require = 0;
+  lockresult = 0;
+  lockflag = false;
+  lockRedirectTbl = allocateRuntimeHash_I(20);
+  objRedirectLockTbl = allocateRuntimeHash_I(20);
+#endif
+#ifndef INTERRUPT
+  reside = false;
+#endif
+  objqueue.head = NULL;
+  objqueue.tail = NULL;
+
+  currtpd = NULL;
+
+#ifdef PROFILE
+  stall = false;
+  //isInterrupt = true;
+  totalexetime = -1;
+  //interrupttime = 0;
+  taskInfoIndex = 0;
+  taskInfoOverflow = false;
+#ifdef PROFILE_INTERRUPT
+  interruptInfoIndex = 0;
+  interruptInfoOverflow = false;
+#endif // PROFILE_INTERRUPT
+#endif // PROFILE
+
+  for(i = 0; i < MAXTASKPARAMS; i++) {
+    runtime_locks[i].redirectlock = 0;
+    runtime_locks[i].value = 0;
+  }
+  runtime_locklen = 0;
+}
+
+inline __attribute__((always_inline)) // Tears down the runtime structures set up at start-up and closes both heaps.
+void disruntimedata() { // called from checkCoreStatus() just before terminate()
+#ifdef MULTICORE_GC
+#ifdef LOCALHASHTBL_TEST
+  freeRuntimeHash(gcpointertbl);
+#else
+  mgchashDelete(gcpointertbl);
+#endif
+  //freeMGCHash(gcpointertbl);
+  freeMGCHash(gcforwardobjtbl);
+  // for mapping info structures
+  //freeRuntimeHash(gcrcoretbl);
+#else
+  freeRuntimeHash(lockRedirectTbl);
+  freeRuntimeHash(objRedirectLockTbl);
+  RUNFREE(locktable.bucket);
+#endif
+  if(activetasks != NULL) { // NULL on cores that never created the active-task table
+    genfreehashtable(activetasks); // note: activetasks itself is not reset to NULL afterwards
+  }
+  if(currtpd != NULL) {
+    RUNFREE(currtpd->parameterArray); // free the parameter array before the descriptor that owns it
+    RUNFREE(currtpd);
+    currtpd = NULL;
+  }
+  BAMBOO_LOCAL_MEM_CLOSE();
+  BAMBOO_SHARE_MEM_CLOSE();
+}
+
+inline __attribute__((always_inline)) // Drains objqueue: locks each received object and enqueues it on its parameter queues.
+bool checkObjQueue() { // returns true if at least one item was taken off objqueue during this call
+  bool rflag = false;
+  struct transObjInfo * objInfo = NULL;
+  int grount = 0; // result of the write-lock request; 1 means the lock was granted
+
+#ifdef PROFILE
+#ifdef ACCURATEPROFILE
+  bool isChecking = false;
+  if(!isEmpty(&objqueue)) {
+    profileTaskStart("objqueue checking");
+    isChecking = true;
+  }       // if(!isEmpty(&objqueue))
+#endif
+#endif
+
+  while(!isEmpty(&objqueue)) {
+    void * obj = NULL;
+    BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xf001);
+#endif
+#ifdef PROFILE
+    //isInterrupt = false;
+#endif
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xeee1);
+#endif
+    rflag = true; // set as soon as an item is dequeued, whether or not its lock is obtained
+    objInfo = (struct transObjInfo *)getItem(&objqueue);
+    obj = objInfo->objptr;
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT_REG((int)obj);
+#endif
+    // grab lock and flush the obj
+    grount = 0;
+    getwritelock_I(obj);
+    while(!lockflag) { // wait until the lock request has been answered
+      BAMBOO_WAITING_FOR_LOCK(0);
+    }   // while(!lockflag)
+    grount = lockresult;
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT_REG(grount);
+#endif
+
+    lockresult = 0; // reset the lock-request globals for the next request
+    lockobj = 0;
+    lock2require = 0;
+    lockflag = false;
+#ifndef INTERRUPT
+    reside = false;
+#endif
+
+    if(grount == 1) {
+      int k = 0;
+      // flush the object
+#ifdef CACHEFLUSH
+      BAMBOO_CACHE_FLUSH_RANGE((int)obj,sizeof(int));
+      BAMBOO_CACHE_FLUSH_RANGE((int)obj,
+		  classsize[((struct ___Object___ *)obj)->type]);
+#endif
+      // enqueue the object
+      for(k = 0; k < objInfo->length; ++k) {
+		int taskindex = objInfo->queues[2 * k]; // queues[] holds (task index, parameter index) pairs
+		int paramindex = objInfo->queues[2 * k + 1];
+		struct parameterwrapper ** queues =
+		  &(paramqueues[BAMBOO_NUM_OF_CORE][taskindex][paramindex]);
+#ifdef DEBUG
+		BAMBOO_DEBUGPRINT_REG(taskindex);
+		BAMBOO_DEBUGPRINT_REG(paramindex);
+		struct ___Object___ * tmpptr = (struct ___Object___ *)obj;
+		tprintf("Process %x(%d): receive obj %x(%lld), ptrflag %x\n",
+				BAMBOO_NUM_OF_CORE, BAMBOO_NUM_OF_CORE, (int)obj,
+				(long long)(long)obj, tmpptr->flag); // %lld needs a long long argument
+#endif
+		enqueueObject_I(obj, queues, 1);
+#ifdef DEBUG
+		BAMBOO_DEBUGPRINT_REG(hashsize(activetasks));
+#endif
+      }  // for(k = 0; k < objInfo->length; ++k)
+      releasewritelock_I(obj);
+      RUNFREE(objInfo->queues);
+      RUNFREE(objInfo);
+    } else {
+      // can not get lock
+      // put it at the end of the queue if no updated version is in the queue
+      struct QueueItem * qitem = getHead(&objqueue);
+      struct QueueItem * prev = NULL;
+      while(qitem != NULL) {
+		struct transObjInfo * tmpinfo =
+			(struct transObjInfo *)(qitem->objectptr);
+		if(tmpinfo->objptr == obj) {
+		  // the same object is already in the queue, so it was enqueued
+		  // more recently. The current one is outdated; do not re-enqueue it
+		  RUNFREE(objInfo->queues);
+		  RUNFREE(objInfo);
+		  goto objqueuebreak;
+		} else {
+		  prev = qitem;
+		}  // if(tmpinfo->objptr == obj)
+		qitem = getNextQueueItem(prev);
+	  }  // while(qitem != NULL)
+      // try to execute active tasks already enqueued first
+      addNewItem_I(&objqueue, objInfo);
+#ifdef PROFILE
+      //isInterrupt = true;
+#endif
+objqueuebreak:
+      BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT(0xf000);
+#endif
+      break; // stop draining after the first object whose lock could not be obtained
+    }  // if(grount == 1)
+    BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xf000);
+#endif
+  }  // while(!isEmpty(&objqueue))
+
+#ifdef PROFILE
+#ifdef ACCURATEPROFILE
+  if(isChecking) {
+    profileTaskEnd();
+  }  // if(isChecking)
+#endif
+#endif
+
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xee02);
+#endif
+  return rflag;
+}
+
+inline __attribute__((always_inline)) // Termination check: run() calls this only on the startup core, when it has no pending objects.
+void checkCoreStatus() { // either returns to let execution continue, or tears the runtime down and calls terminate()
+  bool allStall = false;
+  int i = 0;
+  int sumsendobj = 0; // total objects sent minus total objects received, over all active cores
+  if((!waitconfirm) ||
+     (waitconfirm && (numconfirm == 0))) { // skip while status confirmations are still outstanding
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xee04);
+    BAMBOO_DEBUGPRINT_REG(waitconfirm);
+#endif
+    BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xf001);
+#endif
+    corestatus[BAMBOO_NUM_OF_CORE] = 0; // record this core as stalled, with its current counters
+    numsendobjs[BAMBOO_NUM_OF_CORE] = self_numsendobjs;
+    numreceiveobjs[BAMBOO_NUM_OF_CORE] = self_numreceiveobjs;
+    // check the status of all cores
+    allStall = true;
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT_REG(NUMCORESACTIVE);
+#endif
+    for(i = 0; i < NUMCORESACTIVE; ++i) {
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT(0xe000 + corestatus[i]);
+#endif
+      if(corestatus[i] != 0) {
+		allStall = false;
+		break;
+      }
+    }  // for(i = 0; i < NUMCORESACTIVE; ++i)
+    if(allStall) {
+      // check if the sum of send objs and receive obj are the same
+      // yes->check if the info is the latest; no->go on executing
+      sumsendobj = 0;
+      for(i = 0; i < NUMCORESACTIVE; ++i) {
+		sumsendobj += numsendobjs[i];
+#ifdef DEBUG
+		BAMBOO_DEBUGPRINT(0xf000 + numsendobjs[i]);
+#endif
+      }  // for(i = 0; i < NUMCORESACTIVE; ++i)
+      for(i = 0; i < NUMCORESACTIVE; ++i) {
+		sumsendobj -= numreceiveobjs[i];
+#ifdef DEBUG
+		BAMBOO_DEBUGPRINT(0xf000 + numreceiveobjs[i]);
+#endif
+      }  // for(i = 0; i < NUMCORESACTIVE; ++i)
+      if(0 == sumsendobj) {
+		if(!waitconfirm) {
+		  // the first time all cores are found stalled:
+		  // send out a status confirm msg to all other cores
+		  // and reset the corestatus array too
+#ifdef DEBUG
+		  BAMBOO_DEBUGPRINT(0xee05);
+#endif
+		  corestatus[BAMBOO_NUM_OF_CORE] = 1;
+		  waitconfirm = true;
+		  numconfirm = NUMCORESACTIVE - 1;
+		  BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+		  for(i = 1; i < NUMCORESACTIVE; ++i) { // NOTE(review): starts at 1, so presumably the startup core is core 0 - confirm
+			corestatus[i] = 1;
+			// send status confirm msg to core i
+			send_msg_1(i, STATUSCONFIRM, false);
+		  }   // for(i = 1; i < NUMCORESACTIVE; ++i)
+		  return;
+		} else {
+		  // all the core status info is the latest:
+		  // terminate; in profiling mode, first ask all
+		  // other cores to output their profiling data
+#ifdef DEBUG
+		  BAMBOO_DEBUGPRINT(0xee06);
+#endif
+
+#ifdef USEIO
+		  totalexetime = BAMBOO_GET_EXE_TIME() - bamboo_start_time;
+#else
+#ifdef PROFILE
+		  //BAMBOO_DEBUGPRINT_REG(interrupttime);
+#endif
+
+		  BAMBOO_DEBUGPRINT(BAMBOO_GET_EXE_TIME() - bamboo_start_time);
+		  //BAMBOO_DEBUGPRINT_REG(total_num_t6); // TODO for test
+#ifdef GC_FLUSH_DTLB
+		  BAMBOO_DEBUGPRINT_REG(gc_num_flush_dtlb);
+#endif
+#ifndef BAMBOO_MEMPROF
+		  BAMBOO_DEBUGPRINT(0xbbbbbbbb);
+#endif
+#endif
+		  // profile mode: send msgs to other cores to request that they
+		  // output their profiling data
+#ifdef PROFILE
+		  BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+#ifdef DEBUG
+		  BAMBOO_DEBUGPRINT(0xf000);
+#endif
+		  for(i = 1; i < NUMCORESACTIVE; ++i) {
+			// send profile request msg to core i
+			send_msg_2(i, PROFILEOUTPUT, totalexetime, false);
+		  } // for(i = 1; i < NUMCORESACTIVE; ++i)
+#ifndef RT_TEST
+		  // output profiling data on the startup core
+		  outputProfileData();
+#endif
+		  while(true) { // poll until every core's profilestatus entry reads 0
+			BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
+#ifdef DEBUG
+			BAMBOO_DEBUGPRINT(0xf001);
+#endif
+			profilestatus[BAMBOO_NUM_OF_CORE] = 0;
+			// check the status of all cores
+			allStall = true;
+#ifdef DEBUG
+			BAMBOO_DEBUGPRINT_REG(NUMCORESACTIVE);
+#endif
+			for(i = 0; i < NUMCORESACTIVE; ++i) {
+#ifdef DEBUG
+			  BAMBOO_DEBUGPRINT(0xe000 + profilestatus[i]);
+#endif
+			  if(profilestatus[i] != 0) {
+				allStall = false;
+				break;
+			  }
+			}  // for(i = 0; i < NUMCORESACTIVE; ++i)
+			if(!allStall) {
+			  int halt = 100; // short busy-wait before polling again
+			  BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+#ifdef DEBUG
+			  BAMBOO_DEBUGPRINT(0xf000);
+#endif
+			  while(halt--) {
+			  }
+			} else {
+			  BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+			  break;
+			}  // if(!allStall)
+		  }  // while(true)
+#endif
+
+		  // gc_profile mode: output gc profiling data
+#ifdef MULTICORE_GC
+#ifdef GC_CACHE_ADAPT
+		  bamboo_mask_timer_intr(); // disable the TILE_TIMER interrupt
+#endif // GC_CACHE_ADAPT
+#ifdef GC_PROFILE
+		  gc_outputProfileData();
+#endif // #ifdef GC_PROFILE
+#endif // #ifdef MULTICORE_GC
+		  disruntimedata();
+		  BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME(); // NOTE(review): with PROFILE defined, client mode was already entered above - confirm this is harmless
+		  terminate();  // All done.
+		}  // if(!waitconfirm)
+      } else {
+		// some objects are still in flight on the network:
+		// reset waitconfirm and numconfirm
+#ifdef DEBUG
+		BAMBOO_DEBUGPRINT(0xee07);
+#endif
+		waitconfirm = false;
+		numconfirm = 0;
+	  }  //  if(0 == sumsendobj)
+    } else {
+      // not all cores are stalled, keep on waiting
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT(0xee08);
+#endif
+      waitconfirm = false;
+      numconfirm = 0;
+    }  //  if(allStall)
+    BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xf000);
+#endif
+  }  // if((!waitconfirm) ||
+}
+
+// main function for each core: initialise the runtime, then loop executing tasks (arg is not used in this body)
+inline void run(void * arg) {
+  int i = 0; // not referenced anywhere in this function
+  int argc = 1; // with argv == NULL this makes createstartupobject() build an empty string array
+  char ** argv = NULL;
+  bool sendStall = false; // true once a TRANSTALL message has been sent and no work has run since
+  bool isfirst = true; // a non-startup core waits one idle round before reporting a stall
+  bool tocontinue = false; // result of the latest checkObjQueue() call
+
+  corenum = BAMBOO_GET_NUM_OF_CORE();
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xeeee);
+  BAMBOO_DEBUGPRINT_REG(corenum);
+  BAMBOO_DEBUGPRINT(STARTUPCORE);
+#endif
+  //BAMBOO_DEBUGPRINT(BAMBOO_GET_EXE_TIME()); // TODO
+
+  // initialize runtime data structures
+  initruntimedata();
+
+  // other architecture related initialization
+  initialization();
+  initCommunication();
+
+  initializeexithandler();
+
+  // main process of the execution module
+  if(BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1) {
+    // non-executing cores, only processing communications
+    activetasks = NULL;
+#ifdef PROFILE
+    //isInterrupt = false;
+#endif
+    fakeExecution();
+  } else {
+    /* Create queue of active tasks */
+    activetasks=
+      genallocatehashtable((unsigned int (*)(void *)) &hashCodetpd,
+                           (int (*)(void *,void *)) &comparetpd);
+
+    /* Process task information */
+    processtasks();
+
+    if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
+      /* Create startup object */
+      createstartupobject(argc, argv);
+    }
+
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xee00);
+#endif
+
+    while(true) { // no exit from this loop is visible here; checkCoreStatus() calls terminate()
+
+#ifdef MULTICORE_GC
+//#ifdef GC_CACHE_ADAPT
+	  // do dtlb sampling if necessary
+//	  bamboo_dtlb_sampling_process();
+//#endif // GC_CACHE_ADAPT
+      // check if need to do GC
+      if(gcflag) {
+		gc(NULL);
+	  }
+#endif // MULTICORE_GC
+
+      // check if there are new active tasks can be executed
+      executetasks();
+      if(busystatus) {
+		sendStall = false; // work was done, so a stall has to be reported again later
+      }
+
+#ifndef INTERRUPT
+      while(receiveObject() != -1) { // without interrupts, poll until receiveObject() returns -1
+      }
+#endif
+
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT(0xee01);
+#endif
+
+      // check if there are some pending objects,
+      // if yes, enqueue them and executetasks again
+      tocontinue = checkObjQueue();
+
+      if(!tocontinue) {
+		// check if stop
+		if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
+		  if(isfirst) {
+#ifdef DEBUG
+			BAMBOO_DEBUGPRINT(0xee03);
+#endif
+			isfirst = false;
+		  }
+		  checkCoreStatus();
+		} else {
+		  if(!sendStall) {
+#ifdef DEBUG
+			BAMBOO_DEBUGPRINT(0xee09);
+#endif
+#ifdef PROFILE
+			if(!stall) { // this brace is closed by the matching #ifdef PROFILE block below
+#endif
+			if(isfirst) {
+			  // wait for some time
+			  int halt = 10000;
+#ifdef DEBUG
+			  BAMBOO_DEBUGPRINT(0xee0a);
+#endif
+			  while(halt--) {
+			  }
+			  isfirst = false;
+			} else {
+			  // send StallMsg to startup core
+#ifdef DEBUG
+			  BAMBOO_DEBUGPRINT(0xee0b);
+#endif
+			  // send stall msg
+			  send_msg_4(STARTUPCORE, TRANSTALL, BAMBOO_NUM_OF_CORE,
+						 self_numsendobjs, self_numreceiveobjs, false);
+			  sendStall = true;
+			  isfirst = true;
+			  busystatus = false;
+			}
+#ifdef PROFILE
+		  }
+#endif
+		  } else {
+			isfirst = true;
+			busystatus = false;
+#ifdef DEBUG
+			BAMBOO_DEBUGPRINT(0xee0c);
+#endif
+		  }   // if(!sendStall)
+		}   // if(STARTUPCORE == BAMBOO_NUM_OF_CORE)
+      }  // if(!tocontinue)
+    }  // while(true)
+  } // if(BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1)
+
+} // run()
+
+struct ___createstartupobject____I_locals { // passed by address to the allocators in createstartupobject() under MULTICORE_GC
+  INTPTR size; // initialised to 2 there; presumably the number of pointer slots below - confirm
+  void * next; // initialised to NULL there
+  struct  ___StartupObject___ * ___startupobject___; // set right after the startup object is allocated
+  struct ArrayObject * ___stringarray___; // set right after the string array is allocated
+}; // struct ___createstartupobject____I_locals
+
+void createstartupobject(int argc, // argv[1..argc-1] become the startup object's string parameters
+                         char ** argv) { // only read for indices 1..argc-1, so it may be NULL when argc <= 1
+  int i;
+
+  /* Allocate startup object     */
+#ifdef MULTICORE_GC
+  struct ___createstartupobject____I_locals ___locals___ = 
+  {2, NULL, NULL, NULL};
+  struct ___StartupObject___ *startupobject=
+    (struct ___StartupObject___*) allocate_new(&___locals___, STARTUPTYPE);
+  ___locals___.___startupobject___ = startupobject;
+  struct ArrayObject * stringarray=
+    allocate_newarray(&___locals___, STRINGARRAYTYPE, argc-1);
+  ___locals___.___stringarray___ = stringarray; // NOTE(review): startupobject/stringarray are not reloaded from ___locals___ after later allocations; verify no moving GC can run here
+#else
+  struct ___StartupObject___ *startupobject=
+    (struct ___StartupObject___*) allocate_new(STARTUPTYPE);
+  struct ArrayObject * stringarray=
+    allocate_newarray(STRINGARRAYTYPE, argc-1);
+#endif
+  /* Build array of strings */
+  startupobject->___parameters___=stringarray;
+  for(i=1; i<argc; i++) { // argv[0] is skipped
+    int length=strlen(argv[i]);
+#ifdef MULTICORE_GC
+    struct ___String___ *newstring=NewString(&___locals___, argv[i],length);
+#else
+    struct ___String___ *newstring=NewString(argv[i],length);
+#endif
+    ((void **)(((char *)&stringarray->___length___)+sizeof(int)))[i-1]= // element slots start sizeof(int) bytes past the ___length___ field
+      newstring;
+  }
+
+  startupobject->version = 0;
+  startupobject->lock = NULL;
+
+  /* Set initialized flag for startup object */
+  flagorandinit(startupobject,1,0xFFFFFFFF);
+  enqueueObject(startupobject, NULL, 0);
+#ifdef CACHEFLUSH
+  BAMBOO_CACHE_FLUSH_ALL();
+#endif
+}
+
+int hashCodetpd(struct taskparamdescriptor *ftd) { // hash = task pointer XOR every parameter pointer
+  int idx=0;
+  int code=(int)ftd->task; // seed with the task pointer, truncated to int
+  while(idx<ftd->numParameters) {
+    code^=(int)ftd->parameterArray[idx++]; // fold in the next parameter pointer
+  }
+  return code;
+}
+
+int comparetpd(struct taskparamdescriptor *ftd1, // returns 1 when both descriptors name the same task
+               struct taskparamdescriptor *ftd2) { // and the same parameters, 0 otherwise
+  int idx=0;
+  if (ftd1->task!=ftd2->task)
+    return 0; // different tasks never match
+  while(idx<ftd1->numParameters &&
+        ftd1->parameterArray[idx]==ftd2->parameterArray[idx])
+    idx++; // advance past every matching parameter
+  return idx>=ftd1->numParameters; // 1 only if no mismatch stopped the scan
+}
+
+/* This function sets a tag.
+ *
+ * Links tag descriptor tagd and object obj in both directions:
+ *   - obj->___tags___ holds either NULL, a single tag descriptor, or an
+ *     array object of tag descriptors (___cachedCode___ = live entries);
+ *   - tagd->flagptr holds either NULL, a single object, or an array object
+ *     of objects (___cachedCode___ = live entries).
+ * If the tag is already present on obj the function returns without
+ * touching tagd->flagptr.
+ *
+ * In the MULTICORE_GC build, ptr is forwarded to allocate_newarray together
+ * with obj and tagd through ptrarray; obj and tagd (and pointers derived
+ * from them) are re-read after each allocation -- presumably because the
+ * allocation may move them. */
+#ifdef MULTICORE_GC
+void tagset(void *ptr,
+            struct ___Object___ * obj,
+            struct ___TagDescriptor___ * tagd) {
+#else
+void tagset(struct ___Object___ * obj,
+            struct ___TagDescriptor___ * tagd) {
+#endif
+  struct ArrayObject * ao=NULL;
+  struct ___Object___ * tagptr=obj->___tags___;
+  if (tagptr==NULL) {
+    /* First tag on this object: store the descriptor directly. */
+    obj->___tags___=(struct ___Object___ *)tagd;
+  } else {
+    /* Have to check if it is already set */
+    if (tagptr->type==TAGTYPE) {
+      /* Exactly one tag so far: promote to a two-entry tag array. */
+      struct ___TagDescriptor___ * td=(struct ___TagDescriptor___ *) tagptr;
+      if (td==tagd) {
+        return;
+      }
+#ifdef MULTICORE_GC
+      int ptrarray[]={2, (int) ptr, (int) obj, (int)tagd};
+      struct ArrayObject * ao=
+        allocate_newarray(&ptrarray,TAGARRAYTYPE,TAGARRAYINTERVAL);
+      obj=(struct ___Object___ *)ptrarray[2];
+      tagd=(struct ___TagDescriptor___ *)ptrarray[3];
+      td=(struct ___TagDescriptor___ *) obj->___tags___;
+#else
+      ao=allocate_newarray(TAGARRAYTYPE,TAGARRAYINTERVAL);
+#endif
+
+      ARRAYSET(ao, struct ___TagDescriptor___ *, 0, td);
+      ARRAYSET(ao, struct ___TagDescriptor___ *, 1, tagd);
+      obj->___tags___=(struct ___Object___ *) ao;
+      ao->___cachedCode___=2;
+    } else {
+      /* Array Case */
+      int i;
+      struct ArrayObject *ao=(struct ArrayObject *) tagptr;
+      for(i=0; i<ao->___cachedCode___; i++) {
+        struct ___TagDescriptor___ * td=
+          ARRAYGET(ao, struct ___TagDescriptor___*, i);
+        if (td==tagd) {
+          return;
+        }
+      }
+      if (ao->___cachedCode___<ao->___length___) {
+        /* Room left: append in place. */
+        ARRAYSET(ao, struct ___TagDescriptor___ *,ao->___cachedCode___,tagd);
+        ao->___cachedCode___++;
+      } else {
+        /* Array is full: allocate a larger one and copy the old entries. */
+#ifdef MULTICORE_GC
+        int ptrarray[]={2,(int) ptr, (int) obj, (int) tagd};
+        struct ArrayObject * aonew=
+          allocate_newarray(&ptrarray,TAGARRAYTYPE,
+                            TAGARRAYINTERVAL+ao->___length___);
+        obj=(struct ___Object___ *)ptrarray[2];
+        tagd=(struct ___TagDescriptor___ *) ptrarray[3];
+        ao=(struct ArrayObject *)obj->___tags___;
+#else
+        struct ArrayObject * aonew=
+          allocate_newarray(TAGARRAYTYPE,TAGARRAYINTERVAL+ao->___length___);
+#endif
+
+        aonew->___cachedCode___=ao->___length___+1;
+        for(i=0; i<ao->___length___; i++) {
+          ARRAYSET(aonew, struct ___TagDescriptor___*, i,
+                   ARRAYGET(ao, struct ___TagDescriptor___*, i));
+        }
+        ARRAYSET(aonew, struct ___TagDescriptor___ *, ao->___length___,tagd);
+        /* Install the enlarged array on the object.  Previously the copy
+           was built but never stored, so the new tag was silently lost. */
+        obj->___tags___=(struct ___Object___ *) aonew;
+      }
+    }
+  }
+
+  /* Second half: record obj in the set of objects bound to tagd. */
+  {
+    struct ___Object___ * tagset=tagd->flagptr;
+    if(tagset==NULL) {
+      tagd->flagptr=obj;
+    } else if (tagset->type!=OBJECTARRAYTYPE) {
+      /* One object so far: promote to a two-entry object array. */
+#ifdef MULTICORE_GC
+      int ptrarray[]={2, (int) ptr, (int) obj, (int)tagd};
+      struct ArrayObject * ao=
+        allocate_newarray(&ptrarray,OBJECTARRAYTYPE,OBJECTARRAYINTERVAL);
+      obj=(struct ___Object___ *)ptrarray[2];
+      tagd=(struct ___TagDescriptor___ *)ptrarray[3];
+#else
+      struct ArrayObject * ao=
+        allocate_newarray(OBJECTARRAYTYPE,OBJECTARRAYINTERVAL);
+#endif
+      ARRAYSET(ao, struct ___Object___ *, 0, tagd->flagptr);
+      ARRAYSET(ao, struct ___Object___ *, 1, obj);
+      ao->___cachedCode___=2;
+      tagd->flagptr=(struct ___Object___ *)ao;
+    } else {
+      struct ArrayObject *ao=(struct ArrayObject *) tagset;
+      if (ao->___cachedCode___<ao->___length___) {
+        ARRAYSET(ao, struct ___Object___*, ao->___cachedCode___++, obj);
+      } else {
+        /* Object array is full: grow, copy, append and install. */
+        int i;
+#ifdef MULTICORE_GC
+        int ptrarray[]={2, (int) ptr, (int) obj, (int)tagd};
+        struct ArrayObject * aonew=
+          allocate_newarray(&ptrarray,OBJECTARRAYTYPE,
+                            OBJECTARRAYINTERVAL+ao->___length___);
+        obj=(struct ___Object___ *)ptrarray[2];
+        tagd=(struct ___TagDescriptor___ *)ptrarray[3];
+        ao=(struct ArrayObject *)tagd->flagptr;
+#else
+        struct ArrayObject * aonew=allocate_newarray(OBJECTARRAYTYPE,
+            OBJECTARRAYINTERVAL+ao->___length___);
+#endif
+        aonew->___cachedCode___=ao->___cachedCode___+1;
+        for(i=0; i<ao->___length___; i++) {
+          ARRAYSET(aonew, struct ___Object___*, i,
+                   ARRAYGET(ao, struct ___Object___*, i));
+        }
+        ARRAYSET(aonew, struct ___Object___ *, ao->___cachedCode___, obj);
+        tagd->flagptr=(struct ___Object___ *) aonew;
+      }
+    }
+  }
+}
+
+/* This function clears a tag.
+ *
+ * Removes tagd from the tags recorded on obj, then removes obj from the
+ * objects recorded on tagd.  The tag is assumed to be present (this is
+ * meant to be guaranteed statically), so neither obj->___tags___ nor
+ * tagd->flagptr is checked for NULL.  Array entries are removed by moving
+ * the last live entry into the vacated slot and clearing the last slot. */
+#ifdef MULTICORE_GC
+void tagclear(void *ptr,
+              struct ___Object___ * obj,
+              struct ___TagDescriptor___ * tagd) {
+#else
+void tagclear(struct ___Object___ * obj,
+              struct ___TagDescriptor___ * tagd) {
+#endif
+  struct ___Object___ * objtags=obj->___tags___;
+  struct ___Object___ * holders;
+
+  /* Step 1: detach the tag from the object. */
+  if (objtags->type!=TAGTYPE) {
+    struct ArrayObject *tags=(struct ArrayObject *) objtags;
+    int k;
+    for(k=0; k<tags->___cachedCode___; k++) {
+      if (ARRAYGET(tags, struct ___TagDescriptor___ *, k)!=tagd) {
+        continue;
+      }
+      tags->___cachedCode___--;
+      if (k<tags->___cachedCode___) {
+        ARRAYSET(tags, struct ___TagDescriptor___ *, k,
+                 ARRAYGET(tags,struct ___TagDescriptor___*,
+                          tags->___cachedCode___));
+      }
+      ARRAYSET(tags,struct ___TagDescriptor___ *,tags->___cachedCode___, NULL);
+      if (tags->___cachedCode___==0) {
+        obj->___tags___=NULL;
+      }
+      break;   /* only the first match is removed */
+    }
+  } else if ((struct ___TagDescriptor___ *)objtags==tagd) {
+    obj->___tags___=NULL;
+  }
+
+  /* Step 2: detach the object from the tag. */
+  holders=tagd->flagptr;
+  if (holders->type==OBJECTARRAYTYPE) {
+    struct ArrayObject *objs=(struct ArrayObject *) holders;
+    int k;
+    for(k=0; k<objs->___cachedCode___; k++) {
+      if (ARRAYGET(objs, struct ___Object___ *, k)!=obj) {
+        continue;
+      }
+      objs->___cachedCode___--;
+      if (k<objs->___cachedCode___) {
+        ARRAYSET(objs, struct ___Object___ *, k,
+                 ARRAYGET(objs, struct ___Object___ *,
+                          objs->___cachedCode___));
+      }
+      ARRAYSET(objs, struct ___Object___ *, objs->___cachedCode___, NULL);
+      if (objs->___cachedCode___==0) {
+        tagd->flagptr=NULL;
+      }
+      return;
+    }
+  } else if (holders==obj) {
+    tagd->flagptr=NULL;
+  }
+}
+
+/* This function allocates a new tag.
+ *
+ * Allocates classsize[TAGTYPE] bytes with FREEMALLOC, stamps the result
+ * with type TAGTYPE and stores index in its flag field, then returns it.
+ * In the MULTICORE_GC build the extra ptr argument is passed to FREEMALLOC
+ * as a struct garbagelist *.
+ * NOTE(review): the FREEMALLOC result is used without a NULL check --
+ * confirm the allocator cannot return NULL here. */
+#ifdef MULTICORE_GC
+struct ___TagDescriptor___ * allocate_tag(void *ptr,
+                                          int index) {
+  struct ___TagDescriptor___ * v=
+    (struct ___TagDescriptor___ *) FREEMALLOC((struct garbagelist *) ptr,
+                                              classsize[TAGTYPE]);
+#else
+struct ___TagDescriptor___ * allocate_tag(int index) {
+  struct ___TagDescriptor___ * v=FREEMALLOC(classsize[TAGTYPE]);
+#endif
+  v->type=TAGTYPE;
+  v->flag=index;
+  return v;
+}
+
+
+
+/* Forward declaration of flagbody, the shared worker behind flagorand,
+   intflagorand and flagorandinit.  Those wrappers compute the new flag
+   value (OR the current flag with the or mask, then AND the result with
+   the and mask) and pass it here to be stored on the object. */
+
+void flagbody(struct ___Object___ *ptr,
+              int flag,
+              struct parameterwrapper ** queues,
+              int length,
+              bool isnew);
+
+/* Three-way comparison of two ints through pointers.
+ * Returns a negative value if *val1 < *val2, zero if they are equal and a
+ * positive value if *val1 > *val2.  The result is computed from two
+ * comparisons rather than a subtraction, because (*val1)-(*val2) overflows
+ * (undefined behaviour) when the operands are far apart, e.g. INT_MIN and 1. */
+int flagcomp(const int *val1, const int *val2) {
+  const int lhs = *val1;
+  const int rhs = *val2;
+  return (lhs > rhs) - (lhs < rhs);
+}
+
+/* Computes (flag | ormask) & andmask, where flag is the int stored at
+   index 1 of the object viewed as an int array, and hands the result to
+   flagbody together with the caller's queues/length (isnew = false). */
+void flagorand(void * ptr,
+               int ormask,
+               int andmask,
+               struct parameterwrapper ** queues,
+               int length) {
+  const int current=((int *)ptr)[1];
+  const int updated=(current|ormask)&andmask;
+  flagbody(ptr, updated, queues, length, false);
+}
+
+/* Same mask computation as flagorand, but only calls flagbody (with NULL
+   queues, length 0, isnew = false) when the flag value actually changes.
+   Returns true if flagbody was called, false if the flag was left alone. */
+bool intflagorand(void * ptr,
+                  int ormask,
+                  int andmask) {
+  const int before=((int *)ptr)[1];
+  const int after=(before|ormask)&andmask;
+  if (after!=before) {
+    flagbody(ptr, after, NULL, 0, false);
+    return true;
+  }
+  /* Don't do anything */
+  return false;
+}
+
+/* Variant of flagorand for newly created objects: applies the or/and masks
+   to the object's current flag and calls flagbody with no queues and
+   isnew = true. */
+void flagorandinit(void * ptr,
+                   int ormask,
+                   int andmask) {
+  const int current=((int *)ptr)[1];
+  flagbody(ptr, (current|ormask)&andmask, NULL, 0, true);
+}
+
+/* Stores flag on the object and removes the object from every parameter
+ * queue in the given list.
+ *
+ *   ptr      object whose flag is updated
+ *   flag     new flag value
+ *   vqueues  parameter queues to purge; may be NULL
+ *   vlength  number of entries in vqueues
+ *   isnew    true for a freshly created object
+ *
+ * When the object is not new and no queues are supplied, the queues
+ * registered for this core and the object's type are used.  If this core
+ * is not an active core (BAMBOO_NUM_OF_CORE >= NUMCORESACTIVE) the function
+ * returns immediately and the flag is NOT updated.
+ *
+ * For each queue, the enterflags pointer recorded for the object in the
+ * queue's object set is fetched, the entry is removed, and the enterflags
+ * buffer is released. */
+void flagbody(struct ___Object___ *ptr,
+              int flag,
+              struct parameterwrapper ** vqueues,
+              int vlength,
+              bool isnew) {
+  struct parameterwrapper * flagptr = NULL;
+  int i = 0;
+  struct parameterwrapper ** queues = vqueues;
+  int length = vlength;
+  int next;
+  int UNUSED, UNUSED2;
+  int * enterflags = NULL;
+  if((!isnew) && (queues == NULL)) {
+    if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) {
+      queues = objectqueues[BAMBOO_NUM_OF_CORE][ptr->type];
+      length = numqueues[BAMBOO_NUM_OF_CORE][ptr->type];
+    } else {
+      return;
+    }
+  }
+  ptr->flag=flag;
+
+  /*Remove object from all queues */
+  for(i = 0; i < length; ++i) {
+    flagptr = queues[i];
+    /* Start every lookup from NULL: if the object is not in this queue's
+       set and the lookup leaves the out-parameter untouched, the pointer
+       freed for the previous queue must not be freed a second time. */
+    enterflags = NULL;
+    ObjectHashget(flagptr->objectset, (int) ptr, (int *) &next,
+                  (int *) &enterflags, &UNUSED, &UNUSED2);
+    ObjectHashremove(flagptr->objectset, (int)ptr);
+    if (enterflags!=NULL)
+      RUNFREE(enterflags);
+  }
+}
+
+/* Offers an object to every parameter queue it qualifies for.
+ *
+ *   vptr     the object (a struct ___Object___ *)
+ *   vqueues  candidate parameter queues; NULL selects the queues
+ *            registered for this core and the object's type
+ *   vlength  number of entries in vqueues (ignored when vqueues is NULL)
+ *
+ * Does nothing on cores with BAMBOO_NUM_OF_CORE >= NUMCORESACTIVE.
+ * A queue accepts the object when (a) every tag id the parameter requires
+ * is found among the object's tags and (b) at least one (andmask,
+ * checkmask) term of the parameter matches the object's flag; the object
+ * is then passed to enqueuetasks for that parameter. */
+void enqueueObject(void * vptr,
+                   struct parameterwrapper ** vqueues,
+                   int vlength) {
+  struct ___Object___ *ptr = (struct ___Object___ *)vptr;
+
+  {
+    //struct QueueItem *tmpptr;
+    struct parameterwrapper * parameter=NULL;
+    int j;
+    int i;
+    struct parameterwrapper * prevptr=NULL;
+    struct ___Object___ *tagptr=NULL;
+    struct parameterwrapper ** queues = vqueues;
+    int length = vlength;
+    if(BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1) {
+      return;
+    }
+    if(queues == NULL) {
+      queues = objectqueues[BAMBOO_NUM_OF_CORE][ptr->type];
+      length = numqueues[BAMBOO_NUM_OF_CORE][ptr->type];
+    }
+    tagptr=ptr->___tags___;
+
+    /* Outer loop iterates through all parameter queues an object of
+       this type could be in.  */
+    for(j = 0; j < length; ++j) {
+      parameter = queues[j];
+      /* Check tags */
+      if (parameter->numbertags>0) {
+		if (tagptr==NULL)
+		  goto nextloop;  //that means the object has no tag
+		//but that param needs tag
+		else if(tagptr->type==TAGTYPE) {     //one tag
+		  //struct ___TagDescriptor___ * tag=
+		  //(struct ___TagDescriptor___*) tagptr;
+		  // every required tag id must equal the object's single tag id
+		  for(i=0; i<parameter->numbertags; i++) {
+			//slotid is parameter->tagarray[2*i];
+			int tagid=parameter->tagarray[2*i+1];
+			if (tagid!=tagptr->flag)
+			  goto nextloop;   /*We don't have this tag */
+		  }
+		} else {                         //multiple tags
+		  struct ArrayObject * ao=(struct ArrayObject *) tagptr;
+		  for(i=0; i<parameter->numbertags; i++) {
+			//slotid is parameter->tagarray[2*i];
+			int tagid=parameter->tagarray[2*i+1];
+			// this inner j shadows the queue index j of the outer loop;
+			// the outer j is unaffected
+			int j;
+			for(j=0; j<ao->___cachedCode___; j++) {
+			  if (tagid==ARRAYGET(ao, struct ___TagDescriptor___*, j)->flag)
+				goto foundtag;
+			}
+			goto nextloop;
+foundtag:
+			;
+		  }
+		}
+      }
+
+      /* Check flags */
+      // intarray holds (andmask, checkmask) pairs; the first matching
+      // term enqueues the object and ends the search for this parameter
+      for(i=0; i<parameter->numberofterms; i++) {
+		int andmask=parameter->intarray[i*2];
+		int checkmask=parameter->intarray[i*2+1];
+		if ((ptr->flag&andmask)==checkmask) {
+		  enqueuetasks(parameter, prevptr, ptr, NULL, 0);
+		  prevptr=parameter;
+		  break;
+		}
+      }
+nextloop:
+      ;
+    }
+  }
+}
+
+/* Same matching logic as enqueueObject, but hands accepted objects to
+ * enqueuetasks_I instead of enqueuetasks.
+ * NOTE(review): the _I suffix presumably marks the variant meant to be
+ * called with interrupts disabled -- confirm against the runtime headers.
+ *
+ *   vptr     the object (a struct ___Object___ *)
+ *   vqueues  candidate parameter queues; NULL selects the queues
+ *            registered for this core and the object's type
+ *   vlength  number of entries in vqueues (ignored when vqueues is NULL)
+ *
+ * Does nothing on cores with BAMBOO_NUM_OF_CORE >= NUMCORESACTIVE. */
+void enqueueObject_I(void * vptr,
+                     struct parameterwrapper ** vqueues,
+                     int vlength) {
+  struct ___Object___ *ptr = (struct ___Object___ *)vptr;
+
+  {
+    //struct QueueItem *tmpptr;
+    struct parameterwrapper * parameter=NULL;
+    int j;
+    int i;
+    struct parameterwrapper * prevptr=NULL;
+    struct ___Object___ *tagptr=NULL;
+    struct parameterwrapper ** queues = vqueues;
+    int length = vlength;
+    if(BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1) {
+      return;
+    }
+    if(queues == NULL) {
+      queues = objectqueues[BAMBOO_NUM_OF_CORE][ptr->type];
+      length = numqueues[BAMBOO_NUM_OF_CORE][ptr->type];
+    }
+    tagptr=ptr->___tags___;
+
+    /* Outer loop iterates through all parameter queues an object of
+       this type could be in.  */
+    for(j = 0; j < length; ++j) {
+      parameter = queues[j];
+      /* Check tags */
+      if (parameter->numbertags>0) {
+		if (tagptr==NULL)
+		  goto nextloop;      //that means the object has no tag
+		//but that param needs tag
+		else if(tagptr->type==TAGTYPE) {   //one tag
+		//struct ___TagDescriptor___*tag=(struct ___TagDescriptor___*)tagptr;
+		  // every required tag id must equal the object's single tag id
+		  for(i=0; i<parameter->numbertags; i++) {
+			//slotid is parameter->tagarray[2*i];
+			int tagid=parameter->tagarray[2*i+1];
+			if (tagid!=tagptr->flag)
+			  goto nextloop;            /*We don't have this tag */
+		  }
+		} else {    //multiple tags
+		  struct ArrayObject * ao=(struct ArrayObject *) tagptr;
+		  for(i=0; i<parameter->numbertags; i++) {
+			//slotid is parameter->tagarray[2*i];
+			int tagid=parameter->tagarray[2*i+1];
+			// this inner j shadows the queue index j of the outer loop;
+			// the outer j is unaffected
+			int j;
+			for(j=0; j<ao->___cachedCode___; j++) {
+			  if (tagid==ARRAYGET(ao, struct ___TagDescriptor___*, j)->flag)
+				goto foundtag;
+			}
+			goto nextloop;
+foundtag:
+			;
+		  }
+		}
+      }
+
+      /* Check flags */
+      // intarray holds (andmask, checkmask) pairs; the first matching
+      // term enqueues the object and ends the search for this parameter
+      for(i=0; i<parameter->numberofterms; i++) {
+		int andmask=parameter->intarray[i*2];
+		int checkmask=parameter->intarray[i*2+1];
+		if ((ptr->flag&andmask)==checkmask) {
+		  enqueuetasks_I(parameter, prevptr, ptr, NULL, 0);
+		  prevptr=parameter;
+		  break;
+		}
+      }
+nextloop:
+      ;
+    }
+  }
+}
+
+
+/* Chooses one lock value to represent a group of objects.
+ *
+ *   ptrs    array of object pointers (struct ___Object___ *)
+ *   length  number of entries in ptrs
+ *   tbl     redirection table mapping a lock value to the lock it has
+ *           been redirected to
+ *
+ * With length == 0 a fresh int is allocated with RUNMALLOC and returned as
+ * the lock.  Otherwise each object contributes a lock value: its ->lock
+ * field if set, else the object's own address.  Until a value that is
+ * already a key in tbl is met, values are collected, de-duplicated, in
+ * ascending order in locks[].  Once such a value is met, its redirect
+ * target becomes the result, and every other value seen before or after is
+ * added to tbl as redirected to that target.
+ *
+ * Returns the redirect target if one was found, else the smallest lock
+ * value collected.  Lock values are carried in ints and cast back to
+ * int *. */
+int * getAliasLock(void ** ptrs,
+                   int length,
+                   struct RuntimeHash * tbl) {
+  if(length == 0) {
+    return (int*)(RUNMALLOC(sizeof(int)));
+  } else {
+    int i = 0;
+    int locks[length];
+    int locklen = 0;
+    bool redirect = false;
+    int redirectlock = 0;
+    for(; i < length; i++) {
+      struct ___Object___ * ptr = (struct ___Object___ *)(ptrs[i]);
+      int lock = 0;
+      int j = 0;
+      // an object without an alias lock uses its own address as its lock
+      if(ptr->lock == NULL) {
+		lock = (int)(ptr);
+      } else {
+		lock = (int)(ptr->lock);
+      }
+      if(redirect) {
+		// a redirect target is already known: point this lock at it
+		if(lock != redirectlock) {
+		  RuntimeHashadd(tbl, lock, redirectlock);
+		}
+      } else {
+		if(RuntimeHashcontainskey(tbl, lock)) {
+		  // already redirected
+		  redirect = true;
+		  RuntimeHashget(tbl, lock, &redirectlock);
+		  // redirect every lock collected so far to the same target
+		  for(; j < locklen; j++) {
+			if(locks[j] != redirectlock) {
+			  RuntimeHashadd(tbl, locks[j], redirectlock);
+			}
+		  }
+		} else {
+		  // sorted insert without duplicates: find the first slot whose
+		  // value is >= lock
+		  bool insert = true;
+		  for(j = 0; j < locklen; j++) {
+			if(locks[j] == lock) {
+			  insert = false;
+			  break;
+			} else if(locks[j] > lock) {
+			  break;
+			}
+		  }
+		  if(insert) {
+			// shift the tail up by one to open slot j
+			int h = locklen;
+			for(; h > j; h--) {
+			  locks[h] = locks[h-1];
+			}
+			locks[j] = lock;
+			locklen++;
+		  }
+		}
+      }
+    }
+    if(redirect) {
+      return (int *)redirectlock;
+    } else {
+      // locks[] is ascending, so this is the smallest lock value
+      return (int *)(locks[0]);
+    }
+  }
+}
+
+/* Records lock as the alias lock of the object at ptr.  Nothing is written
+   when lock is the object's own address or is already the object's alias
+   lock; otherwise the previous alias lock (if any) is overwritten. */
+void addAliasLock(void * ptr,
+                  int lock) {
+  struct ___Object___ * target = (struct ___Object___ *)ptr;
+  if((int)ptr == lock) {
+    // the object is its own lock, no alias needed
+    return;
+  }
+  if(target->lock == (int*)lock) {
+    // alias already in place
+    return;
+  }
+  // originally no alias lock associated or have a different alias lock
+  // flush it as the new one
+  target->lock = (int *)lock;
+}
+
+#ifdef PROFILE
+// Stores index as the exitIndex of the task-info record currently selected
+// by taskInfoIndex (PROFILE builds only).
+inline void setTaskExitIndex(int index) {
+  taskInfoArray[taskInfoIndex]->exitIndex = index;
+}
+
+// Appends nobj to the newObjs queue of the task-info record currently
+// selected by taskInfoIndex, creating the queue on first use
+// (PROFILE builds only).
+inline void addNewObjInfo(void * nobj) {
+  if(taskInfoArray[taskInfoIndex]->newObjs == NULL) {
+    taskInfoArray[taskInfoIndex]->newObjs = createQueue();
+  }
+  addNewItem(taskInfoArray[taskInfoIndex]->newObjs, nobj);
+}
+#endif
+
+#ifdef MULTICORE_GC
+// Only allocate local mem chunks to each core.
+// If a core has used up its local shared memory, start gc.
+//
+// Params: coren     -- requesting core; reduced modulo NUMCORES4GC
+//         isize     -- minimum number of bytes required
+//         allocsize -- out: number of bytes actually handed out (all the
+//                      free space in the block run found, which may be
+//                      more than isize), or 0 on failure
+// Returns the start address of the allocated space, or NULL if no suitable
+// run of blocks was found.
+//
+// bamboo_smemtbl[b] is used as the number of bytes already consumed in
+// block b: it is compared against the block's capacity and added to the
+// block's base offset to form the returned address.  The core's blocks are
+// visited as gc_core2block[2*gccorenum+i] + (NUMCORES4GC*2)*j with i
+// alternating over 0 and 1 and j advancing after each pair.
+void * localmalloc_I(int coren,
+                     int isize,
+                     int * allocsize) {
+  void * mem = NULL;
+  int gccorenum = (coren < NUMCORES4GC) ? (coren) : (coren % NUMCORES4GC);
+  int i = 0;
+  int j = 0;
+  int tofindb = gc_core2block[2*gccorenum+i]+(NUMCORES4GC*2)*j;
+  int totest = tofindb;
+  int bound = BAMBOO_SMEM_SIZE_L;
+  int foundsmem = 0;
+  int size = 0;
+  // tofindb: first block of the candidate run; totest: block being examined;
+  // size: free bytes accumulated over tofindb..totest;
+  // foundsmem: 0 = searching, 1 = found, 2 = ran out of blocks
+  do {
+    // blocks below NUMCORES4GC have capacity BAMBOO_SMEM_SIZE_L, the rest
+    // BAMBOO_SMEM_SIZE
+    bound = (totest < NUMCORES4GC) ? BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
+    int nsize = bamboo_smemtbl[totest];
+    bool islocal = true;
+    if(nsize < bound) {
+      bool tocheck = true;
+      // have some space in the block
+      if(totest == tofindb) {
+		// the first partition
+		size = bound - nsize;
+      } else if(nsize == 0) {
+		// an empty partition, can be appended
+		size += bound;
+      } else {
+		// not an empty partition, can not be appended
+		// the last continuous block is not big enough, go to check the next
+		// local block
+		islocal = true;
+		tocheck = false;
+      } // if(totest == tofindb) else if(nsize == 0) else ...
+      if(tocheck) {
+		if(size >= isize) {
+		  // have enough space in the block, malloc
+		  foundsmem = 1;
+		  break;
+		} else {
+		  // no enough space yet, try to append next continuous block
+		  islocal = false;
+		}  // if(size > isize) else ...
+      }  // if(tocheck)
+    } // if(nsize < bound)
+    if(islocal) {
+      // no space in the block, go to check the next block
+      i++;
+      if(2==i) {
+		i = 0;
+		j++;
+      }
+      tofindb = totest = gc_core2block[2*gccorenum+i]+(NUMCORES4GC*2)*j;
+    } else {
+      totest += 1;
+    }  // if(islocal) else ...
+    if(totest > gcnumblock-1-bamboo_reserved_smem) {
+      // no more local mem, do not find suitable block
+      foundsmem = 2;
+      break;
+    }  // if(totest > gcnumblock-1-bamboo_reserved_smem) ...
+  } while(true);
+
+  if(foundsmem == 1) {
+    // find suitable block
+    // address = heap base + bytes already used in the first block + that
+    // block's base offset
+    mem = gcbaseva+bamboo_smemtbl[tofindb]+((tofindb<NUMCORES4GC) ?
+          (BAMBOO_SMEM_SIZE_L*tofindb) : (BAMBOO_LARGE_SMEM_BOUND+
+          (tofindb-NUMCORES4GC)*BAMBOO_SMEM_SIZE));
+    *allocsize = size;
+    // set bamboo_smemtbl
+    // every block in the run is marked completely used
+    for(i = tofindb; i <= totest; i++) {
+      bamboo_smemtbl[i]=(i<NUMCORES4GC)?BAMBOO_SMEM_SIZE_L:BAMBOO_SMEM_SIZE;
+    }
+  } else if(foundsmem == 2) {
+    // no suitable block
+    *allocsize = 0;
+  }
+
+  return mem;
+} // void * localmalloc_I(int, int, int *)
+
+#ifdef SMEMF
+// Allocate the local shared memory to each core with the highest priority,
+// if a core has used up its local shared memory, try to allocate the 
+// shared memory that belong to its neighbours, if also failed, start gc.
+//
+// Params: coren     -- requesting core; reduced modulo NUMCORES4GC
+//         isize     -- minimum number of bytes required
+//         allocsize -- out: number of bytes actually handed out (all the
+//                      free space in the block run found), or 0 on failure
+// Returns the start address of the allocated space, or NULL if neither the
+// core nor any core listed in core2test[gccorenum][] has a suitable run.
+//
+// core2test[gccorenum][k] lists the cores whose blocks may be used, tried
+// in order of k; entries equal to -1 are skipped.  bamboo_smemtbl[b] is
+// used as the number of bytes already consumed in block b.
+void * fixedmalloc_I(int coren,
+                     int isize,
+                     int * allocsize) {
+  void * mem = NULL;
+  int i = 0;
+  int j = 0;
+  int k = 0;
+  int gccorenum = (coren < NUMCORES4GC) ? (coren) : (coren % NUMCORES4GC);
+  int tofindb = gc_core2block[2*core2test[gccorenum][k]+i]+(NUMCORES4GC*2)*j;
+  int totest = tofindb;
+  int bound = BAMBOO_SMEM_SIZE_L;
+  int foundsmem = 0;
+  int size = 0;
+  // tofindb: first block of the candidate run; totest: block being examined;
+  // size: free bytes accumulated over tofindb..totest;
+  // foundsmem: 0 = searching, 1 = found, 2 = ran out of candidates
+  do {
+    bound = (totest < NUMCORES4GC) ? BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
+    int nsize = bamboo_smemtbl[totest];
+    bool islocal = true;
+    if(nsize < bound) {
+      bool tocheck = true;
+      // have some space in the block
+      if(totest == tofindb) {
+        // the first partition
+        size = bound - nsize;
+      } else if(nsize == 0) {
+        // an empty partition, can be appended
+        size += bound;
+      } else {
+        // not an empty partition, can not be appended
+        // the last continuous block is not big enough, go to check the next
+        // local block
+        islocal = true;
+        tocheck = false;
+      } // if(totest == tofindb) else if(nsize == 0) else ...
+      if(tocheck) {
+        if(size >= isize) {
+          // have enough space in the block, malloc
+          foundsmem = 1;
+          break;
+        } else {
+          // no enough space yet, try to append next continuous block
+          // TODO may consider to go to next local block?
+          islocal = false;
+        }  // if(size > isize) else ...
+      }  // if(tocheck)
+    } // if(nsize < bound)
+    if(islocal) {
+      // no space in the block, go to check the next block
+      i++;
+      if(2==i) {
+        i = 0;
+        j++;
+      }
+      tofindb=totest=
+        gc_core2block[2*core2test[gccorenum][k]+i]+(NUMCORES4GC*2)*j;
+    } else {
+      totest += 1;
+    }  // if(islocal) else ...
+    if(totest > gcnumblock-1-bamboo_reserved_smem) {
+      // no more local mem, do not find suitable block on local mem
+      // try to malloc shared memory assigned to the neighbour cores
+      do{
+        k++;
+        if(k >= NUM_CORES2TEST) {
+          // no more memory available on either coren or its neighbour cores
+          foundsmem = 2;
+          goto memsearchresult;
+        }
+      } while(core2test[gccorenum][k] == -1);
+      // restart the block walk on the next candidate core
+      i = 0;
+      j = 0;
+      tofindb=totest=
+        gc_core2block[2*core2test[gccorenum][k]+i]+(NUMCORES4GC*2)*j;
+    }  // if(totest > gcnumblock-1-bamboo_reserved_smem) ...
+  } while(true);
+
+memsearchresult:
+  if(foundsmem == 1) {
+    // find suitable block
+    // address = heap base + bytes already used in the first block + that
+    // block's base offset
+    mem = gcbaseva+bamboo_smemtbl[tofindb]+((tofindb<NUMCORES4GC) ?
+          (BAMBOO_SMEM_SIZE_L*tofindb) : (BAMBOO_LARGE_SMEM_BOUND+
+          (tofindb-NUMCORES4GC)*BAMBOO_SMEM_SIZE));
+    *allocsize = size;
+    // set bamboo_smemtbl
+    // every block in the run is marked completely used
+    for(i = tofindb; i <= totest; i++) {
+      bamboo_smemtbl[i]=(i<NUMCORES4GC)?BAMBOO_SMEM_SIZE_L:BAMBOO_SMEM_SIZE;
+    }
+  } else if(foundsmem == 2) {
+    // no suitable block
+    *allocsize = 0;
+  }
+
+  return mem;
+} // void * fixedmalloc_I(int, int, int *)
+#endif // #ifdef SMEMF
+
+#ifdef SMEMM
+// Allocate the local shared memory to each core with the highest priority,
+// if a core has used up its local shared memory, try to allocate the 
+// shared memory that belong to its neighbours first, if failed, check 
+// current memory allocation rate, if it has already reached the threshold,
+// start gc, otherwise, allocate the shared memory globally.  If all the 
+// shared memory has been used up, start gc.
+//
+// Params: coren     -- requesting core; reduced modulo NUMCORES4GC
+//         isize     -- minimum number of bytes required
+//         allocsize -- out: number of bytes actually handed out, or 0 on
+//                      failure
+// Returns the start address of the allocated space, or NULL on failure.
+//
+// When the local/neighbour search succeeds, gcmem_mixed_usedmem is
+// increased by the allocated size and bamboo_free_block is advanced if the
+// run started at it.  When the search is exhausted and
+// gcmem_mixed_usedmem is still below gcmem_mixed_threshold, the request is
+// delegated to globalmalloc_I and its result is returned unchanged.
+// NOTE(review): gcmem_mixed_usedmem is not updated on the globalmalloc_I
+// path -- confirm whether that is intended.
+void * mixedmalloc_I(int coren,
+                     int isize,
+                     int * allocsize) {
+  void * mem = NULL;
+  int i = 0;
+  int j = 0;
+  int k = 0;
+  int gccorenum = (coren < NUMCORES4GC) ? (coren) : (coren % NUMCORES4GC);
+  int tofindb = gc_core2block[2*core2test[gccorenum][k]+i]+(NUMCORES4GC*2)*j;
+  int totest = tofindb;
+  int bound = BAMBOO_SMEM_SIZE_L;
+  int foundsmem = 0;
+  int size = 0;
+  // tofindb: first block of the candidate run; totest: block being examined;
+  // size: free bytes accumulated over tofindb..totest;
+  // foundsmem: 0 = searching, 1 = found, 2 = give up
+  do {
+    bound = (totest < NUMCORES4GC) ? BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
+    int nsize = bamboo_smemtbl[totest];
+    bool islocal = true;
+    if(nsize < bound) {
+      bool tocheck = true;
+      // have some space in the block
+      if(totest == tofindb) {
+        // the first partition
+        size = bound - nsize;
+      } else if(nsize == 0) {
+        // an empty partition, can be appended
+        size += bound;
+      } else {
+        // not an empty partition, can not be appended
+        // the last continuous block is not big enough, go to check the next
+        // local block
+        islocal = true;
+        tocheck = false;
+      } // if(totest == tofindb) else if(nsize == 0) else ...
+      if(tocheck) {
+        if(size >= isize) {
+          // have enough space in the block, malloc
+          foundsmem = 1;
+          break;
+        } else {
+          // no enough space yet, try to append next continuous block
+          // TODO may consider to go to next local block?
+          islocal = false;
+        }  // if(size > isize) else ...
+      }  // if(tocheck)
+    } // if(nsize < bound)
+    if(islocal) {
+      // no space in the block, go to check the next block
+      i++;
+      if(2==i) {
+        i = 0;
+        j++;
+      }
+      tofindb=totest=
+        gc_core2block[2*core2test[gccorenum][k]+i]+(NUMCORES4GC*2)*j;
+    } else {
+      totest += 1;
+    }  // if(islocal) else ...
+    if(totest > gcnumblock-1-bamboo_reserved_smem) {
+      // no more local mem, do not find suitable block on local mem
+      // try to malloc shared memory assigned to the neighbour cores
+      do{
+        k++;
+        if(k >= NUM_CORES2TEST) {
+          if(gcmem_mixed_usedmem >= gcmem_mixed_threshold) {
+            // no more memory available on either coren or its neighbour cores
+            foundsmem = 2;
+            goto memmixedsearchresult;
+          } else {
+            // try allocate globally
+            mem = globalmalloc_I(coren, isize, allocsize);
+            return mem;
+          }
+        }
+      } while(core2test[gccorenum][k] == -1);
+      // restart the block walk on the next candidate core
+      i = 0;
+      j = 0;
+      tofindb=totest=
+        gc_core2block[2*core2test[gccorenum][k]+i]+(NUMCORES4GC*2)*j;
+    }  // if(totest > gcnumblock-1-bamboo_reserved_smem) ...
+  } while(true);
+
+memmixedsearchresult:
+  if(foundsmem == 1) {
+    // find suitable block
+    // address = heap base + bytes already used in the first block + that
+    // block's base offset
+    mem = gcbaseva+bamboo_smemtbl[tofindb]+((tofindb<NUMCORES4GC) ?
+          (BAMBOO_SMEM_SIZE_L*tofindb) : (BAMBOO_LARGE_SMEM_BOUND+
+          (tofindb-NUMCORES4GC)*BAMBOO_SMEM_SIZE));
+    *allocsize = size;
+    // set bamboo_smemtbl
+    // every block in the run is marked completely used
+    for(i = tofindb; i <= totest; i++) {
+      bamboo_smemtbl[i]=(i<NUMCORES4GC)?BAMBOO_SMEM_SIZE_L:BAMBOO_SMEM_SIZE;
+    }
+    gcmem_mixed_usedmem += size;
+    if(tofindb == bamboo_free_block) {
+      bamboo_free_block = totest+1;
+    }
+  } else if(foundsmem == 2) {
+    // no suitable block
+    *allocsize = 0;
+  }
+
+  return mem;
+} // void * mixedmalloc_I(int, int, int *)
+#endif // #ifdef SMEMM
+
+// Allocate all the memory chunks globally, do not consider the host cores
+// When all the shared memory are used up, start gc.
+//
+// Params: coren     -- requesting core (not used in the search)
+//         isize     -- minimum number of bytes required
+//         allocsize -- out: number of bytes actually handed out (all the
+//                      free space in the block run found), or 0 on failure
+// Returns the start address of the allocated space, or NULL on failure.
+//
+// The search starts at bamboo_free_block and scans blocks in index order,
+// looking for a run whose first block has free space and whose following
+// blocks are empty.  bamboo_smemtbl[b] is used as the number of bytes
+// already consumed in block b.
+void * globalmalloc_I(int coren,
+                      int isize,
+                      int * allocsize) {
+  void * mem = NULL;
+  int tofindb = bamboo_free_block;       //0;
+  int totest = tofindb;
+  int bound = BAMBOO_SMEM_SIZE_L;
+  int foundsmem = 0;
+  int size = 0;
+  if(tofindb > gcnumblock-1-bamboo_reserved_smem) {
+	// Out of shared memory
+    *allocsize = 0;
+    return NULL;
+  }
+  // tofindb: first block of the candidate run; totest: block being examined;
+  // size: free bytes accumulated over tofindb..totest;
+  // foundsmem: 0 = searching, 1 = found, 2 = ran out of blocks
+  do {
+    bound = (totest < NUMCORES4GC) ? BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
+    int nsize = bamboo_smemtbl[totest];
+    bool isnext = false;
+    if(nsize < bound) {
+      bool tocheck = true;
+      // have some space in the block
+      if(totest == tofindb) {
+		// the first partition
+		size = bound - nsize;
+      } else if(nsize == 0) {
+		// an empty partition, can be appended
+		size += bound;
+      } else {
+		// not an empty partition, can not be appended
+		// the last continuous block is not big enough, start another block
+		isnext = true;
+		tocheck = false;
+      }  // if(totest == tofindb) else if(nsize == 0) else ...
+      if(tocheck) {
+		if(size >= isize) {
+		  // have enough space in the block, malloc
+		  foundsmem = 1;
+		  break;
+		}  // if(size > isize)
+      }   // if(tocheck)
+    } else {
+      // block is completely used: the run cannot include it
+      isnext = true;
+    }  // if(nsize < bound) else ...
+    totest += 1;
+    if(totest > gcnumblock-1-bamboo_reserved_smem) {
+      // no more local mem, do not find suitable block
+      foundsmem = 2;
+      break;
+    }  // if(totest > gcnumblock-1-bamboo_reserved_smem) ...
+    if(isnext) {
+      // start another block
+      tofindb = totest;
+    } // if(islocal)
+  } while(true);
+
+  if(foundsmem == 1) {
+    // find suitable block
+    // address = heap base + bytes already used in the first block + that
+    // block's base offset
+    mem = gcbaseva+bamboo_smemtbl[tofindb]+((tofindb<NUMCORES4GC) ?
+          (BAMBOO_SMEM_SIZE_L*tofindb) : (BAMBOO_LARGE_SMEM_BOUND+
+          (tofindb-NUMCORES4GC)*BAMBOO_SMEM_SIZE));
+    *allocsize = size;
+    // set bamboo_smemtbl
+    // every block in the run is marked completely used
+    for(int i = tofindb; i <= totest; i++) {
+      bamboo_smemtbl[i]=(i<NUMCORES4GC)?BAMBOO_SMEM_SIZE_L:BAMBOO_SMEM_SIZE;
+    }
+    // advance the global search start only when the run began there
+    if(tofindb == bamboo_free_block) {
+      bamboo_free_block = totest+1;
+    }
+  } else if(foundsmem == 2) {
+    // no suitable block
+    *allocsize = 0;
+    mem = NULL;
+  }
+
+  return mem;
+} // void * globalmalloc_I(int, int, int *)
+#endif // #ifdef MULTICORE_GC
+
+// malloc from the shared memory
+//
+// Params: coren     -- requesting core
+//         size      -- number of bytes requested
+//         allocsize -- out: number of bytes actually handed out, 0 on
+//                      failure
+// Returns the start address of the allocated space.
+//
+// MULTICORE_GC build: the request is padded by BAMBOO_CACHE_LINE_SIZE and
+// dispatched on bamboo_smem_mode to one of the *malloc_I strategies.  On
+// failure, if no gc is flagged yet, gcflag and gcprecheck are set, the
+// per-core gc status/counters are reset, a GCSTARTPRE message is sent (or
+// cached) to every other active core, and NULL is returned.
+// Non-GC build: at least BAMBOO_SMEM_SIZE bytes are carved off
+// bamboo_free_smemp; on failure BAMBOO_EXIT(0xa001) is called.
+void * smemalloc_I(int coren,
+                   int size,
+                   int * allocsize) {
+  void * mem = NULL;
+#ifdef MULTICORE_GC
+  int isize = size+(BAMBOO_CACHE_LINE_SIZE);
+
+  // go through the bamboo_smemtbl for suitable partitions
+  switch(bamboo_smem_mode) {
+  case SMEMLOCAL: {
+    mem = localmalloc_I(coren, isize, allocsize);
+    break;
+  }
+
+  case SMEMFIXED: {
+#ifdef SMEMF
+	mem = fixedmalloc_I(coren, isize, allocsize);
+#else
+	// not supported yet
+	BAMBOO_EXIT(0xe001);
+#endif
+    break;
+  }
+
+  case SMEMMIXED: {
+#ifdef SMEMM
+	mem = mixedmalloc_I(coren, isize, allocsize);
+#else
+	// not supported yet
+    BAMBOO_EXIT(0xe002);
+#endif
+    break;
+  }
+
+  case SMEMGLOBAL: {
+    mem = globalmalloc_I(coren, isize, allocsize);
+    break;
+  }
+
+  default:
+    // unknown mode: mem stays NULL and is handled as an allocation failure
+    break;
+  }
+
+  // the block opened by this if is shared by both builds and closed after
+  // the failure handling below
+  if(mem == NULL) {
+#else 
+  int toallocate = (size>(BAMBOO_SMEM_SIZE)) ? (size) : (BAMBOO_SMEM_SIZE);
+  if(toallocate > bamboo_free_smem_size) {
+	// no enough mem
+	mem = NULL;
+  } else {
+	mem = (void *)bamboo_free_smemp;
+	bamboo_free_smemp = ((void*)bamboo_free_smemp) + toallocate;
+	bamboo_free_smem_size -= toallocate;
+  }
+  *allocsize = toallocate;
+  if(mem == NULL) {
+#endif // MULTICORE_GC
+    // no enough shared global memory
+    *allocsize = 0;
+#ifdef MULTICORE_GC
+	if(!gcflag) {
+	  gcflag = true;
+	  // inform other cores to stop and wait for gc
+	  gcprecheck = true;
+	  for(int i = 0; i < NUMCORESACTIVE; i++) {
+		// reuse the gcnumsendobjs & gcnumreceiveobjs
+		gccorestatus[i] = 1;
+		gcnumsendobjs[0][i] = 0;
+		gcnumreceiveobjs[0][i] = 0;
+	  }
+	  for(int i = 0; i < NUMCORESACTIVE; i++) {
+		if(i != BAMBOO_NUM_OF_CORE) {
+		  if(BAMBOO_CHECK_SEND_MODE()) {
+			cache_msg_1(i, GCSTARTPRE);
+		  } else {
+			send_msg_1(i, GCSTARTPRE, true);
+		  }
+		}
+	  }
+	}
+	return NULL;
+#else
+    BAMBOO_DEBUGPRINT(0xa001);
+    BAMBOO_EXIT(0xa001);
+#endif
+  }
+  return mem;
+}  // void * smemalloc_I(int, int, int *)
+
+// Determines the length, in words, of the message that starts at
+// msgdata[msgdataindex], based on its type word.
+//
+// Params: size -- number of message words currently available
+// Returns the message length, which is also stored in the global
+// msglength.  For the variable-length types (TRANSOBJ, GCLOBJINFO) the
+// length is read from the word following the type; if that word is not
+// available yet (size <= 1), -1 is returned and msglength is left
+// unchanged.  An unrecognised type dumps diagnostic state and calls
+// BAMBOO_EXIT(0xd005).
+INLINE int checkMsgLength_I(int size) {
+#ifdef DEBUG
+#ifndef TILERA
+  BAMBOO_DEBUGPRINT(0xcccc);
+#endif
+#endif
+  int type = msgdata[msgdataindex];
+  switch(type) {
+  // 1-word messages: type only
+  case STATUSCONFIRM:
+  case TERMINATE:
+#ifdef MULTICORE_GC
+  case GCSTARTPRE:
+  case GCSTARTINIT:
+  case GCSTART:
+  case GCSTARTMAPINFO:
+  case GCSTARTFLUSH:
+  case GCFINISH:
+  case GCMARKCONFIRM:
+  case GCLOBJREQUEST:
+#ifdef GC_CACHE_ADAPT
+  case GCSTARTPREF:
+#endif // GC_CACHE_ADAPT
+#endif // MULTICORE_GC
+  {
+	msglength = 1;
+	break;
+  }
+
+  // 2-word messages
+  case PROFILEOUTPUT:
+  case PROFILEFINISH:
+#ifdef MULTICORE_GC
+  case GCSTARTCOMPACT:
+  case GCMARKEDOBJ:
+  case GCFINISHINIT:
+  case GCFINISHMAPINFO:
+  case GCFINISHFLUSH:
+#ifdef GC_CACHE_ADAPT
+  case GCFINISHPREF:
+#endif // GC_CACHE_ADAPT
+#endif // MULTICORE_GC
+  {
+	msglength = 2;
+	break;
+  }
+
+  // 3-word messages
+  case MEMREQUEST:
+  case MEMRESPONSE:
+#ifdef MULTICORE_GC
+  case GCMAPREQUEST:
+  case GCMAPINFO:
+  case GCMAPTBL:
+  case GCLOBJMAPPING:
+#endif
+  {
+	msglength = 3;
+	break;
+  }
+
+  // 4-word messages
+  case TRANSTALL:
+  case LOCKGROUNT:
+  case LOCKDENY:
+  case LOCKRELEASE:
+  case REDIRECTGROUNT:
+  case REDIRECTDENY:
+  case REDIRECTRELEASE:
+#ifdef MULTICORE_GC
+  case GCFINISHPRE:
+  case GCFINISHMARK:
+  case GCMOVESTART:
+#ifdef GC_PROFILE
+  case GCPROFILES:
+#endif
+#endif
+  {
+	msglength = 4;
+	break;
+  }
+
+  // 5-word messages
+  case LOCKREQUEST:
+  case STATUSREPORT:
+#ifdef MULTICORE_GC
+  case GCFINISHCOMPACT:
+  case GCMARKREPORT:
+#endif
+  {
+	msglength = 5;
+	break;
+  }
+
+  // 6-word messages
+  case REDIRECTLOCK:
+  {
+    msglength = 6;
+    break;
+  }
+
+  case TRANSOBJ:   // nonfixed size
+#ifdef MULTICORE_GC
+  case GCLOBJINFO:
+#endif
+  {             // nonfixed size
+	// the length word directly follows the type word
+	if(size > 1) {
+	  msglength = msgdata[msgdataindex+1];
+	} else {
+	  return -1;
+	}
+	break;
+  }
+
+  default:
+  {
+    // unknown message type: dump the receive-buffer state and the next
+    // six words, then abort
+    BAMBOO_DEBUGPRINT_REG(type);
+	BAMBOO_DEBUGPRINT_REG(size);
+    BAMBOO_DEBUGPRINT_REG(msgdataindex);
+	BAMBOO_DEBUGPRINT_REG(msgdatalast);
+	BAMBOO_DEBUGPRINT_REG(msgdatafull);
+    int i = 6;
+    while(i-- > 0) {
+      BAMBOO_DEBUGPRINT(msgdata[msgdataindex+i]);
+    }
+    BAMBOO_EXIT(0xd005);
+    break;
+  }
+  }
+#ifdef DEBUG
+#ifndef TILERA
+  BAMBOO_DEBUGPRINT_REG(msgdata[msgdataindex]);
+#endif
+#endif
+#ifdef DEBUG
+#ifndef TILERA
+  BAMBOO_DEBUGPRINT(0xffff);
+#endif
+#endif
+  return msglength;
+}
+
+// Handles a TRANSOBJ message: an object transferred to this core together
+// with the queues it should be enqueued on.
+//
+// Reads the object pointer and (msglength - 3) / 2 pairs of queue words
+// from msgdata into a freshly allocated transObjInfo, removes any entry
+// for the same object already waiting in objqueue, appends the new entry,
+// and increments self_numreceiveobjs.  Cores with
+// BAMBOO_NUM_OF_CORE >= NUMCORESACTIVE abort with BAMBOO_EXIT(0xa002).
+// In MULTICORE_GC builds, while gcprocessing is set, the startup core
+// re-arms gcprecheck and other cores report their send/receive counters to
+// the startup core with a GCFINISHPRE message.
+INLINE void processmsg_transobj_I() {
+#ifdef PROFILE_INTERRUPT
+  /*if(!interruptInfoOverflow) {
+    InterruptInfo* intInfo = RUNMALLOC_I(sizeof(struct interrupt_info));
+    interruptInfoArray[interruptInfoIndex] = intInfo;
+    intInfo->startTime = BAMBOO_GET_EXE_TIME();
+    intInfo->endTime = -1;
+  }*/
+#endif
+  // NOTE(review): this first increment presumably skips the length word
+  // that follows the message type -- confirm against the dispatcher
+  MSG_INDEXINC_I();
+  struct transObjInfo * transObj=RUNMALLOC_I(sizeof(struct transObjInfo));
+  int k = 0;
+#ifdef DEBUG
+#ifndef CLOSE_PRINT
+  BAMBOO_DEBUGPRINT(0xe880);
+#endif
+#endif
+  if(BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1) {
+#ifndef CLOSE_PRINT
+    BAMBOO_DEBUGPRINT_REG(msgdata[msgdataindex] /*[2]*/);
+#endif
+    BAMBOO_EXIT(0xa002);
+  }
+  // store the object and its corresponding queue info, enqueue it later
+  transObj->objptr = (void *)msgdata[msgdataindex];  //[2]
+  MSG_INDEXINC_I();
+  // the remaining msglength - 3 words are stored as pairs in queues[]
+  transObj->length = (msglength - 3) / 2;
+  transObj->queues = RUNMALLOC_I(sizeof(int)*(msglength - 3));
+  for(k = 0; k < transObj->length; ++k) {
+    transObj->queues[2*k] = msgdata[msgdataindex];   //[3+2*k];
+    MSG_INDEXINC_I();
+#ifdef DEBUG
+#ifndef CLOSE_PRINT
+    //BAMBOO_DEBUGPRINT_REG(transObj->queues[2*k]);
+#endif
+#endif
+    transObj->queues[2*k+1] = msgdata[msgdataindex]; //[3+2*k+1];
+    MSG_INDEXINC_I();
+#ifdef DEBUG
+#ifndef CLOSE_PRINT
+    //BAMBOO_DEBUGPRINT_REG(transObj->queues[2*k+1]);
+#endif
+#endif
+  }
+  // check if there is an existing duplicate item
+  {
+    struct QueueItem * qitem = getHead(&objqueue);
+    struct QueueItem * prev = NULL;
+    // prev tracks the last item kept, so the walk can continue after an
+    // item has been removed from the queue
+    while(qitem != NULL) {
+      struct transObjInfo * tmpinfo =
+        (struct transObjInfo *)(qitem->objectptr);
+      if(tmpinfo->objptr == transObj->objptr) {
+		// the same object, remove outdate one
+		RUNFREE(tmpinfo->queues);
+		RUNFREE(tmpinfo);
+		removeItem(&objqueue, qitem);
+		//break;
+      } else {
+		prev = qitem;
+      }
+      if(prev == NULL) {
+		qitem = getHead(&objqueue);
+      } else {
+		qitem = getNextQueueItem(prev);
+      }
+    }
+    addNewItem_I(&objqueue, (void *)transObj);
+  }
+  ++(self_numreceiveobjs);
+#ifdef MULTICORE_GC
+  if(gcprocessing) {
+	if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
+	  // set the gcprecheck to enable checking again
+	  gcprecheck = true;
+	} else {
+	  // send a update pregc information msg to the master core
+	  if(BAMBOO_CHECK_SEND_MODE()) {
+		cache_msg_4(STARTUPCORE, GCFINISHPRE, BAMBOO_NUM_OF_CORE, 
+			self_numsendobjs, self_numreceiveobjs);
+	  } else {
+		send_msg_4(STARTUPCORE, GCFINISHPRE, BAMBOO_NUM_OF_CORE, 
+			self_numsendobjs, self_numreceiveobjs, true);
+	  }
+	}
+  }
+#endif 
+#ifdef PROFILE_INTERRUPT
+  /*if(!interruptInfoOverflow) {
+    interruptInfoArray[interruptInfoIndex]->endTime=BAMBOO_GET_EXE_TIME();
+    interruptInfoIndex++;
+    if(interruptInfoIndex == INTERRUPTINFOLENGTH) {
+      interruptInfoOverflow = true;
+    }
+  }*/
+#endif
+}
+
+INLINE void processmsg_transtall_I() {
+  // startup core only; payload: [1] core, [2] #sent objs, [3] #received objs
+  if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
+#ifndef CLOSE_PRINT
+    BAMBOO_DEBUGPRINT_REG(msgdata[msgdataindex] /*[1]*/);
+#endif
+    BAMBOO_EXIT(0xa003);
+  }
+  int num_core = msgdata[msgdataindex]; //[1]
+  MSG_INDEXINC_I();
+  int sendobjs = msgdata[msgdataindex]; //[2], consumed even for a bad core id
+  MSG_INDEXINC_I();
+  int receiveobjs = msgdata[msgdataindex]; //[3], so the next msg stays aligned
+  MSG_INDEXINC_I();
+  if((num_core >= 0) && (num_core < NUMCORESACTIVE)) {
+#if defined(DEBUG) && !defined(CLOSE_PRINT)
+    BAMBOO_DEBUGPRINT(0xe881);
+#endif
+    corestatus[num_core] = 0;
+    numsendobjs[num_core] = sendobjs;
+    numreceiveobjs[num_core] = receiveobjs;
+  }
+}
+
+#ifndef MULTICORE_GC
+INLINE void processmsg_lockrequest_I() {
+  // LOCKREQUEST payload: [1] lock type, [2] obj pointer, [3] lock,
+  // [4] requesting core
+  int locktype = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  int objptr = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  int lock = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  int reqcore = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  // the requester is passed as both the request core and the root core;
+  // the result is -1: redirected, 0: approved, 1: denied
+  int deny = processlockrequest(locktype, lock, objptr, reqcore, reqcore, true);
+  if(deny == -1) {
+    // this lock request is redirected, nothing is answered from here
+    return;
+  }
+  // answer the requester (for 32 bit machine, the size is always 4 words);
+  // cache the msg when BAMBOO_CHECK_SEND_MODE() is set, else send it now
+  int reply = (deny == 1) ? LOCKDENY : LOCKGROUNT;
+  if(BAMBOO_CHECK_SEND_MODE()) {
+    cache_msg_4(reqcore, reply, locktype, objptr, lock);
+  } else {
+    send_msg_4(reqcore, reply, locktype, objptr, lock, true);
+  }
+}
+
+INLINE void processmsg_lockgrount_I() {
+  // word [1] is skipped unread; [2] is the obj pointer, [3] the lock
+  MSG_INDEXINC_I();
+  if(BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1) {
+#ifndef CLOSE_PRINT
+    BAMBOO_DEBUGPRINT_REG(msgdata[msgdataindex] /*[2]*/);
+#endif
+    BAMBOO_EXIT(0xa004);
+  }
+  int objptr = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  int lock = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  if((lockobj != objptr) || (lock2require != lock)) {
+    // the reply does not match lockobj/lock2require: conflict on lockresults
+#ifndef CLOSE_PRINT
+    BAMBOO_DEBUGPRINT_REG(objptr);
+#endif
+    BAMBOO_EXIT(0xa005);
+  } else {
+#if defined(DEBUG) && !defined(CLOSE_PRINT)
+    BAMBOO_DEBUGPRINT(0xe882);
+#endif
+    // record the grant (lockresult 1) and set lockflag
+    lockresult = 1;
+    lockflag = true;
+#ifndef INTERRUPT
+    reside = false;
+#endif
+  }
+}
+
+INLINE void processmsg_lockdeny_I() {
+  // word [1] is skipped unread; [2] is the obj pointer, [3] the lock
+  MSG_INDEXINC_I();
+  int objptr = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  int lock = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  if(BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1) {
+#ifndef CLOSE_PRINT
+    BAMBOO_DEBUGPRINT_REG(objptr);
+#endif
+    BAMBOO_EXIT(0xa006);
+  }
+  if((lockobj != objptr) || (lock2require != lock)) {
+    // the reply does not match lockobj/lock2require: conflict on lockresults
+#ifndef CLOSE_PRINT
+    BAMBOO_DEBUGPRINT_REG(objptr);
+#endif
+    BAMBOO_EXIT(0xa007);
+  } else {
+#if defined(DEBUG) && !defined(CLOSE_PRINT)
+    BAMBOO_DEBUGPRINT(0xe883);
+#endif
+    // record the denial (lockresult 0) and set lockflag
+    lockresult = 0;
+    lockflag = true;
+#ifndef INTERRUPT
+    reside = false;
+#endif
+  }
+}
+
+INLINE void processmsg_lockrelease_I() {
+  // LOCKRELEASE: hand payload words [1] and [2] to processlockrelease()
+  int word1 = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  int word2 = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  processlockrelease(word1, word2, 0, false);
+}
+
+INLINE void processmsg_redirectlock_I() {
+  // REDIRECTLOCK payload: [1] lock type, [2] obj pointer, [3] redirect lock,
+  // [4] root request core, [5] request core
+  int locktype = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  int objptr = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  int redirectlock = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  int rootcore = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  int reqcore = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  // check to see if there is a lock exist for the required obj
+  int deny = processlockrequest(locktype, redirectlock, objptr,
+                                reqcore, rootcore, true);
+  if(deny == -1) {
+    return; // this lock request is redirected
+  }
+  // answer the root request core (for 32 bit machine, the size is always
+  // 4 words); cache the msg when BAMBOO_CHECK_SEND_MODE() is set
+  int reply = (deny == 1) ? REDIRECTDENY : REDIRECTGROUNT;
+  if(BAMBOO_CHECK_SEND_MODE()) {
+    cache_msg_4(rootcore, reply, locktype, objptr, redirectlock);
+  } else {
+    send_msg_4(rootcore, reply, locktype, objptr, redirectlock, true);
+  }
+}
+
+INLINE void processmsg_redirectgrount_I() {
+  // word [1] is skipped unread; [2] is the obj pointer
+  MSG_INDEXINC_I();
+  int objptr = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  if(BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1) {
+#ifndef CLOSE_PRINT
+    BAMBOO_DEBUGPRINT_REG(objptr);
+#endif
+    BAMBOO_EXIT(0xa00a);
+  }
+  if(lockobj != objptr) {
+    // the reply is for a different object: conflict on lockresults
+#ifndef CLOSE_PRINT
+    BAMBOO_DEBUGPRINT_REG(objptr);
+#endif
+    BAMBOO_EXIT(0xa00b);
+  } else {
+#if defined(DEBUG) && !defined(CLOSE_PRINT)
+    BAMBOO_DEBUGPRINT(0xe891);
+#endif
+    // word [3] is read only on a match; it is stored for lockobj below
+    int redirectlock = msgdata[msgdataindex];
+    MSG_INDEXINC_I();
+    lockresult = 1;
+    lockflag = true;
+    RuntimeHashadd_I(objRedirectLockTbl, lockobj, redirectlock);
+#ifndef INTERRUPT
+    reside = false;
+#endif
+  }
+}
+
+INLINE void processmsg_redirectdeny_I() {
+  // word [1] is skipped unread; [2] is the obj pointer
+  MSG_INDEXINC_I();
+  int objptr = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  // step over [3] (redirect lock) as well: the sender always includes it
+  MSG_INDEXINC_I();
+  if(BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1) {
+#ifndef CLOSE_PRINT
+    BAMBOO_DEBUGPRINT_REG(objptr);
+#endif
+    BAMBOO_EXIT(0xa00c);
+  }
+  if(lockobj != objptr) { // conflicts on lockresults
+#ifndef CLOSE_PRINT
+    BAMBOO_DEBUGPRINT_REG(objptr);
+#endif
+    BAMBOO_EXIT(0xa00d);
+  } else {
+#if defined(DEBUG) && !defined(CLOSE_PRINT)
+    BAMBOO_DEBUGPRINT(0xe892);
+#endif
+    lockresult = 0;
+    lockflag = true;
+#ifndef INTERRUPT
+    reside = false;
+#endif
+  }
+}
+
+INLINE void processmsg_redirectrelease_I() { // forwards payload [1]..[3]
+  int word1 = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  int word2 = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  int word3 = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  processlockrelease(word1, word2, word3, true);
+}
+#endif // #ifndef MULTICORE_GC
+
+#ifdef PROFILE
+INLINE void processmsg_profileoutput_I() {
+  // PROFILEOUTPUT: payload word [1] is stored in totalexetime
+  if(BAMBOO_NUM_OF_CORE == STARTUPCORE) {
+    // the startup core can not receive an output profile data request
+    BAMBOO_EXIT(0xa008);
+  }
+#if defined(DEBUG) && !defined(CLOSE_PRINT)
+  BAMBOO_DEBUGPRINT(0xe885);
+#endif
+  stall = true;
+  totalexetime = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+#ifndef RT_TEST
+  outputProfileData();
+#else
+  BAMBOO_DEBUGPRINT_REG(dot_num);
+#endif
+  // tell the startup core that this core is done (PROFILEFINISH); cache
+  // the msg when BAMBOO_CHECK_SEND_MODE() is set, else send it now
+  if(BAMBOO_CHECK_SEND_MODE()) {
+    cache_msg_2(STARTUPCORE, PROFILEFINISH, BAMBOO_NUM_OF_CORE);
+  } else {
+    send_msg_2(STARTUPCORE, PROFILEFINISH, BAMBOO_NUM_OF_CORE, true);
+  }
+}
+
+INLINE void processmsg_profilefinish_I() {
+  // PROFILEFINISH: payload word [1] is the core that finished its output
+  if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
+    // only the startup core may receive a profile output finish msg
+#ifndef CLOSE_PRINT
+    BAMBOO_DEBUGPRINT_REG(msgdata[msgdataindex /*1*/]);
+#endif
+    BAMBOO_EXIT(0xa009);
+  }
+#if defined(DEBUG) && !defined(CLOSE_PRINT)
+  BAMBOO_DEBUGPRINT(0xe886);
+#endif
+  int finishedcore = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  // clear that core's entry in profilestatus
+  profilestatus[finishedcore] = 0;
+}
+#endif // #ifdef PROFILE
+
+INLINE void processmsg_statusconfirm_I() {
+  // STATUSCONFIRM carries no payload. The reply is a STATUSREPORT with
+  // this core's busy flag (as 1/0), its core number and its
+  // self_numsendobjs/self_numreceiveobjs counters.
+  if((BAMBOO_NUM_OF_CORE == STARTUPCORE)
+     || (BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1)) {
+    // wrong core to receive such msg
+    BAMBOO_EXIT(0xa00e);
+    return;
+  }
+#if defined(DEBUG) && !defined(CLOSE_PRINT)
+  BAMBOO_DEBUGPRINT(0xe887);
+#endif
+  // cache the reply when BAMBOO_CHECK_SEND_MODE() is set, else send it now
+  if(BAMBOO_CHECK_SEND_MODE()) {
+    cache_msg_5(STARTUPCORE, STATUSREPORT,
+                busystatus ? 1 : 0, BAMBOO_NUM_OF_CORE,
+                self_numsendobjs, self_numreceiveobjs);
+  } else {
+    send_msg_5(STARTUPCORE, STATUSREPORT, busystatus ? 1 : 0,
+               BAMBOO_NUM_OF_CORE, self_numsendobjs,
+               self_numreceiveobjs, true);
+  }
+}
+
+INLINE void processmsg_statusreport_I() {
+  // payload: [1] busy status, [2] reporting core, [3] #sent, [4] #received
+  int status = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  int srccore = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  int sendobjs = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  int receiveobjs = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
+    // wrong core to receive such msg
+#ifndef CLOSE_PRINT
+    BAMBOO_DEBUGPRINT_REG(srccore);
+#endif
+    BAMBOO_EXIT(0xa00f);
+    return;
+  }
+#if defined(DEBUG) && !defined(CLOSE_PRINT)
+  BAMBOO_DEBUGPRINT(0xe888);
+#endif
+  if(waitconfirm) {
+    // a confirmation round is running: count this report
+    numconfirm--;
+  }
+  // store what the core reported about itself
+  corestatus[srccore] = status;
+  numsendobjs[srccore] = sendobjs;
+  numreceiveobjs[srccore] = receiveobjs;
+}
+
+INLINE void processmsg_terminate_I() {
+  // TERMINATE carries no payload: call disruntimedata(), mask the timer
+  // interrupt when both MULTICORE_GC and GC_CACHE_ADAPT are defined, then
+  // call BAMBOO_EXIT_APP(0)
+#if defined(DEBUG) && !defined(CLOSE_PRINT)
+  BAMBOO_DEBUGPRINT(0xe889);
+#endif
+  disruntimedata();
+#if defined(MULTICORE_GC) && defined(GC_CACHE_ADAPT)
+  // disable the TILE_TIMER interrupt
+  bamboo_mask_timer_intr();
+#endif // MULTICORE_GC && GC_CACHE_ADAPT
+  BAMBOO_EXIT_APP(0);
+}
+
+INLINE void processmsg_memrequest_I() {
+#ifdef PROFILE_INTERRUPT
+  // (interrupt profiling of this handler is disabled; kept for reference)
+  /*if(!interruptInfoOverflow) {
+    InterruptInfo* intInfo = RUNMALLOC_I(sizeof(struct interrupt_info));
+    interruptInfoArray[interruptInfoIndex] = intInfo;
+    intInfo->startTime = BAMBOO_GET_EXE_TIME();
+    intInfo->endTime = -1;
+  }*/
+#endif
+  // MEMREQUEST payload: [1] is passed to smemalloc_I() as its second
+  // argument (presumably the requested size -- confirm), [2] is the
+  // requesting core, which is also where any reply goes
+  int reqsize = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  int reqcore = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+
+  if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
+    // only the startup core may receive a shared memory request msg
+#ifndef CLOSE_PRINT
+    BAMBOO_DEBUGPRINT_REG(reqcore);
+#endif
+    BAMBOO_EXIT(0xa010);
+  } else {
+#if defined(DEBUG) && !defined(CLOSE_PRINT)
+    BAMBOO_DEBUGPRINT(0xe88a);
+#endif
+#ifdef MULTICORE_GC
+    if(!gcprocessing) {
+#endif
+      int allocsize = 0;
+      void * mem = smemalloc_I(reqcore, reqsize, &allocsize);
+      if(mem != NULL) {
+        // send the start_va and the allocated size to the request core;
+        // cache the msg when BAMBOO_CHECK_SEND_MODE() is set
+        if(BAMBOO_CHECK_SEND_MODE()) {
+          cache_msg_3(reqcore, MEMRESPONSE, mem, allocsize);
+        } else {
+          send_msg_3(reqcore, MEMRESPONSE, mem, allocsize, true);
+        }
+      }
+      // if mem == NULL, the gcflag of the startup core has been set
+      // and all the other cores have been informed to start gc
+#ifdef MULTICORE_GC
+    } else if(INITPHASE == gcphase) {
+      // currently doing gc, so the request is dumped; while still in the
+      // initphase of gc, send a startinit msg to the requester again
+      if(BAMBOO_CHECK_SEND_MODE()) {
+        cache_msg_1(reqcore, GCSTARTINIT);
+      } else {
+        send_msg_1(reqcore, GCSTARTINIT, true);
+      }
+    }
+#endif
+  }
+#ifdef PROFILE_INTERRUPT
+  // (interrupt profiling of this handler is disabled; kept for reference)
+  /*if(!interruptInfoOverflow) {
+    interruptInfoArray[interruptInfoIndex]->endTime=BAMBOO_GET_EXE_TIME();
+    interruptInfoIndex++;
+    if(interruptInfoIndex == INTERRUPTINFOLENGTH) {
+      interruptInfoOverflow = true;
+    }
+  }*/
+#endif
+}
+
+INLINE void processmsg_memresponse_I() {
+  // MEMRESPONSE payload: [1] start address of the shared memory block,
+  // [2] its size (0 resets the local allocation state)
+  int startaddr = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  int memsize = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+#if defined(DEBUG) && !defined(CLOSE_PRINT)
+  BAMBOO_DEBUGPRINT(0xe88b);
+#endif
+#ifdef MULTICORE_GC
+  // if is currently doing gc, dump this msg
+  if(!gcprocessing) {
+#endif
+  if(memsize != 0) {
+#ifndef MULTICORE_GC
+    bamboo_smem_size = memsize;
+    bamboo_cur_msp =(void*)(startaddr);
+#else
+    // clear the first BAMBOO_CACHE_LINE_SIZE bytes and use them as a header
+    // that stores the block size; the usable memory starts right after it
+    BAMBOO_MEMSET_WH(startaddr, '\0', BAMBOO_CACHE_LINE_SIZE);
+    (*((int*)startaddr)) = memsize;
+    bamboo_smem_size = memsize - BAMBOO_CACHE_LINE_SIZE;
+    bamboo_cur_msp = startaddr + BAMBOO_CACHE_LINE_SIZE;
+    bamboo_smem_zero_top = bamboo_cur_msp;
+#endif
+  } else {
+    bamboo_smem_size = 0;
+    bamboo_cur_msp = 0;
+#ifdef MULTICORE_GC
+    bamboo_smem_zero_top = 0;
+#endif
+  }
+  // flag that the response to the mem request has arrived
+  smemflag = true;
+#ifdef MULTICORE_GC
+  }
+#endif
+}
+
+#ifdef MULTICORE_GC
+INLINE void processmsg_gcstartpre_I() {
+  if(!gcprocessing) {
+    // the first time to be informed to start gc
+    gcflag = true;
+    if(!smemflag) {
+      // a mem request is still waiting for its response:
+      // let it return NULL and start gc
+      bamboo_smem_size = 0;
+      bamboo_cur_msp = NULL;
+      smemflag = true;
+      bamboo_smem_zero_top = NULL;
+    }
+  } else {
+    // already stalled for gc: send an updated pregc information msg
+    // (this core's send/receive object counts) to the master core
+    if(BAMBOO_CHECK_SEND_MODE()) {
+      cache_msg_4(STARTUPCORE, GCFINISHPRE, BAMBOO_NUM_OF_CORE,
+                  self_numsendobjs, self_numreceiveobjs);
+    } else {
+      send_msg_4(STARTUPCORE, GCFINISHPRE, BAMBOO_NUM_OF_CORE,
+                 self_numsendobjs, self_numreceiveobjs, true);
+    }
+  }
+}
+
+INLINE void processmsg_gcstartinit_I() {
+  gcphase = INITPHASE; // GCSTARTINIT has no payload: just enter the init phase
+}
+
+INLINE void processmsg_gcstart_I() {
+  // GCSTART carries no payload
+#if defined(DEBUG) && !defined(CLOSE_PRINT)
+  BAMBOO_DEBUGPRINT(0xe88c);
+#endif
+  // a start GC msg switches gcphase
+  // to the mark phase
+  gcphase = MARKPHASE;
+}
+
+INLINE void processmsg_gcstartcompact_I() {
+  gcblock2fill = msgdata[msgdataindex]; // payload word [1]
+  MSG_INDEXINC_I();  //msgdata[1];
+  gcphase = COMPACTPHASE; // enter the compact phase
+}
+
+INLINE void processmsg_gcstartmapinfo_I() {
+  gcphase = MAPPHASE; // GCSTARTMAPINFO has no payload: just enter the map phase
+}
+
+INLINE void processmsg_gcstartflush_I() {
+  gcphase = FLUSHPHASE; // GCSTARTFLUSH has no payload: just enter the flush phase
+}
+
+INLINE void processmsg_gcfinishpre_I() {
+  // GCFINISHPRE payload: [1] reporting core, [2] #sent objs, [3] #received
+  int srccore = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  int sendobjs = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  int receiveobjs = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
+    // non startup core can not receive this msg
+#ifndef CLOSE_PRINT
+    BAMBOO_DEBUGPRINT_REG(srccore);
+#endif
+    BAMBOO_EXIT(0xb000);
+  }
+  gcprecheck = true;
+  // bound the core id as processmsg_gcfinishinit_I/_gcfinishmark_I do
+  if((srccore >= 0) && (srccore < NUMCORESACTIVE)) {
+    gccorestatus[srccore] = 0;
+    gcnumsendobjs[0][srccore] = sendobjs;
+    gcnumreceiveobjs[0][srccore] = receiveobjs;
+  }
+}
+
+INLINE void processmsg_gcfinishinit_I() {
+  // GCFINISHINIT payload: [1] is the core that finished the init phase
+  int srccore = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
+    // only the startup core may receive this msg
+#ifndef CLOSE_PRINT
+    BAMBOO_DEBUGPRINT_REG(srccore);
+#endif
+    BAMBOO_EXIT(0xb001);
+  }
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xe88c);
+  BAMBOO_DEBUGPRINT_REG(srccore);
+#endif
+  // all cores do init GC; ids below NUMCORESACTIVE get their status cleared
+  if(srccore < NUMCORESACTIVE) {
+    gccorestatus[srccore] = 0;
+  }
+}
+
+INLINE void processmsg_gcfinishmark_I() {
+  // GCFINISHMARK payload: [1] reporting core, [2] its send count,
+  // [3] its receive count
+  int srccore = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  int sendobjs = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  int receiveobjs = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
+    // non startup core can not receive this msg
+#ifndef CLOSE_PRINT
+    BAMBOO_DEBUGPRINT_REG(srccore);
+#endif
+    BAMBOO_EXIT(0xb002);
+  }
+  // all cores should do mark
+  if(srccore < NUMCORESACTIVE) {
+    gccorestatus[srccore] = 0;
+    // phase 1 uses set gcnumsrobjs_index, phase 2 (waitconfirm) the other
+    int entry_index;
+    if(!waitconfirm) {
+      entry_index = gcnumsrobjs_index;
+    } else {
+      entry_index = (gcnumsrobjs_index == 0) ? 1 : 0;
+    }
+    gcnumsendobjs[entry_index][srccore] = sendobjs;
+    gcnumreceiveobjs[entry_index][srccore] = receiveobjs;
+  }
+}
+
+INLINE void processmsg_gcfinishcompact_I() {
+  // GCFINISHCOMPACT payload: [1] reporting core, [2] its filled blocks,
+  // [3] its heap top, [4] > 0 when the core asks for more mem
+  if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
+#ifndef CLOSE_PRINT
+    BAMBOO_DEBUGPRINT_REG(msgdata[msgdataindex] /*[1]*/);
+#endif
+    BAMBOO_EXIT(0xb003);
+  }
+  int cnum = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  int filledblocks = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  int heaptop = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  int memneeded = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  if(!(cnum < NUMCORES4GC)) {
+    // only gc cores need to do compact
+    return;
+  }
+  if(COMPACTPHASE == gcphase) {
+    gcfilledblocks[cnum] = filledblocks;
+    gcloads[cnum] = heaptop;
+  }
+  if(!(memneeded > 0)) {
+    gccorestatus[cnum] = 0;
+    return;
+  }
+  // ask for more mem; GCMOVESTART is only sent if gcfindSpareMem_I() succeeds
+  int startaddr = 0;
+  int tomove = 0;
+  int dstcore = 0;
+  if(gcfindSpareMem_I(&startaddr, &tomove, &dstcore, memneeded, cnum)) {
+    if(BAMBOO_CHECK_SEND_MODE()) {
+      cache_msg_4(cnum, GCMOVESTART, dstcore, startaddr, tomove);
+    } else {
+      send_msg_4(cnum, GCMOVESTART, dstcore, startaddr, tomove, true);
+    }
+  }
+}
+
+INLINE void processmsg_gcfinishmapinfo_I() {
+  // GCFINISHMAPINFO payload: [1] is the core that finished the map phase
+  int srccore = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
+    // non startup core can not receive this msg
+#ifndef CLOSE_PRINT
+    BAMBOO_DEBUGPRINT_REG(srccore);
+#endif
+    BAMBOO_EXIT(0xb004);
+  }
+  // only ids below NUMCORES4GC get their gccorestatus entry cleared here
+  if(srccore < NUMCORES4GC) {
+    gccorestatus[srccore] = 0;
+  }
+}
+
+
+INLINE void processmsg_gcfinishflush_I() {
+  // GCFINISHFLUSH payload: [1] is the core that finished the flush phase
+  int srccore = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
+    // non startup core can not receive this msg
+#ifndef CLOSE_PRINT
+    BAMBOO_DEBUGPRINT_REG(srccore);
+#endif
+    BAMBOO_EXIT(0xb005);
+  }
+  // all cores should do flush; ids below NUMCORESACTIVE are recorded
+  if(srccore < NUMCORESACTIVE) {
+    gccorestatus[srccore] = 0;
+  }
+}
+
+INLINE void processmsg_gcmarkconfirm_I() {
+  // GCMARKCONFIRM carries no payload; it is answered with a GCMARKREPORT
+  // holding this core's number, gc busy status and gc send/receive counts
+  if((BAMBOO_NUM_OF_CORE == STARTUPCORE)
+     || (BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1)) {
+    // wrong core to receive such msg
+    BAMBOO_EXIT(0xb006);
+    return;
+  }
+  // cache the reply when BAMBOO_CHECK_SEND_MODE() is set, else send it now
+  if(BAMBOO_CHECK_SEND_MODE()) {
+    cache_msg_5(STARTUPCORE, GCMARKREPORT, BAMBOO_NUM_OF_CORE,
+                gcbusystatus, gcself_numsendobjs, gcself_numreceiveobjs);
+  } else {
+    send_msg_5(STARTUPCORE, GCMARKREPORT, BAMBOO_NUM_OF_CORE,
+               gcbusystatus, gcself_numsendobjs, gcself_numreceiveobjs, true);
+  }
+}
+
+INLINE void processmsg_gcmarkreport_I() {
+  // GCMARKREPORT payload: [1] reporting core, [2] its gc busy status,
+  // [3] its gc send count, [4] its gc receive count
+  int srccore = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  int busy = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  int sendobjs = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  int receiveobjs = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
+    // wrong core to receive such msg
+#ifndef CLOSE_PRINT
+    BAMBOO_DEBUGPRINT_REG(busy);
+#endif
+    BAMBOO_EXIT(0xb007);
+    return;
+  }
+  int entry_index;
+  if(!waitconfirm) {
+    // phase 1 -- noted by the original author as "can never reach here"
+    entry_index = gcnumsrobjs_index;
+  } else {
+    // phase 2: count the confirmation and use the other counter set
+    numconfirm--;
+    entry_index = (gcnumsrobjs_index == 0) ? 1 : 0;
+  }
+  gccorestatus[srccore] = busy;
+  gcnumsendobjs[entry_index][srccore] = sendobjs;
+  gcnumreceiveobjs[entry_index][srccore] = receiveobjs;
+}
+
+INLINE void processmsg_gcmarkedobj_I() {
+  // GCMARKEDOBJ payload: [1] address of the marked object
+  int objaddr = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  // int-sized word 6 of the object is used as its mark state
+  int * markword = &(((int *)objaddr)[6]);
+  if(*markword == INIT) {
+    // first time this object is discovered: flag it DISCOVERED, queue it
+    *markword = DISCOVERED;
+    gc_enqueue_I(objaddr);
+  }
+  *markword |= REMOTEM;  // set the remote flag
+  gcself_numreceiveobjs++;
+  gcbusystatus = true;
+}
+
+INLINE void processmsg_gcmovestart_I() {
+  gctomove = true; // flag that a GCMOVESTART msg has arrived
+  gcdstcore = msgdata[msgdataindex]; // [1] destination core
+  MSG_INDEXINC_I();       //msgdata[1];
+  gcmovestartaddr = msgdata[msgdataindex]; // [2] start address
+  MSG_INDEXINC_I();       //msgdata[2];
+  gcblock2fill = msgdata[msgdataindex]; // [3] sent as "tomove" by the startup core
+  MSG_INDEXINC_I();       //msgdata[3];
+}
+
+INLINE void processmsg_gcmaprequest_I() { // payload: [1] obj address, [2] requesting core
+#ifdef GC_PROFILE
+  //unsigned long long ttime = BAMBOO_GET_EXE_TIME();
+#endif
+  void * dstptr = NULL; // result of looking data1 up in gcpointertbl
+  int data1 = msgdata[msgdataindex]; // [1] obj address to look up
+  MSG_INDEXINC_I();
+#ifdef GC_PROFILE
+  // TODO unsigned long long ttime = BAMBOO_GET_EXE_TIME();
+#endif
+#ifdef LOCALHASHTBL_TEST
+  RuntimeHashget(gcpointertbl, data1, &dstptr);
+#else
+  dstptr = mgchashSearch(gcpointertbl, data1);
+#endif
+  //MGCHashget(gcpointertbl, data1, &dstptr);
+#ifdef GC_PROFILE
+  // TODO flushstalltime += BAMBOO_GET_EXE_TIME() - ttime;
+#endif
+  int data2 = msgdata[msgdataindex]; // [2] core that receives the reply
+  MSG_INDEXINC_I();
+#ifdef GC_PROFILE
+  // TODO unsigned long long ttimei = BAMBOO_GET_EXE_TIME();
+#endif
+  if(NULL == dstptr) {
+    // no mapping for this pointer on this core, something is wrong
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT_REG(data1);
+    BAMBOO_DEBUGPRINT_REG(data2);
+#endif
+    BAMBOO_EXIT(0xb008);
+    // disabled fallback: assume the object was not moved, use the original address
+    /*if(isMsgSending) {
+            cache_msg_3(msgdata[2], GCMAPINFO, msgdata[1], msgdata[1]);
+       } else {
+            send_msg_3(msgdata[2], GCMAPINFO, msgdata[1], msgdata[1]);
+       }*/
+  } else {
+    // send back the mapping info; cache the msg when BAMBOO_CHECK_SEND_MODE() is set
+    if(BAMBOO_CHECK_SEND_MODE()) {
+	  cache_msg_3(data2, GCMAPINFO, data1, (int)dstptr);
+    } else {
+	  send_msg_3(data2, GCMAPINFO, data1, (int)dstptr, true);
+    }
+  }
+#ifdef GC_PROFILE
+  // TODO flushstalltime_i += BAMBOO_GET_EXE_TIME()-ttimei;
+  //num_mapinforequest_i++;
+#endif
+}
+
+INLINE void processmsg_gcmapinfo_I() {
+  // GCMAPINFO payload: [1] the object address that was asked about,
+  // [2] the address it maps to, which is kept in gcmappedobj
+  // TODO (GC_PROFILE): add the time spent in here to flushstalltime
+
+  int objaddr = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  gcmappedobj = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  // record the mapping in gcpointertbl
+#ifndef LOCALHASHTBL_TEST
+  mgchashInsert_I(gcpointertbl, objaddr, gcmappedobj);
+#else
+  RuntimeHashadd_I(gcpointertbl, objaddr, gcmappedobj);
+#endif
+
+  if(objaddr == gcobj2map) {
+    // the mapping of gcobj2map itself has arrived
+    gcismapped = true;
+  }
+}
+
+INLINE void processmsg_gcmaptbl_I() { // payload: [1] table address, [2] index
+  int tbladdr = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  int tblindex = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  gcrpointertbls[tblindex] = (mgcsharedhashtbl_t *)tbladdr;
+}
+
+INLINE void processmsg_gclobjinfo_I() {
+  // GCLOBJINFO payload: [1] total msg length in words, [2] reporting core,
+  // [3] value for gcloads of that core, [4] its heap top, then obj pairs
+  numconfirm--;
+  int msgwords = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  int cnum = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  if(BAMBOO_NUM_OF_CORE > NUMCORES4GC - 1) {
+#ifndef CLOSE_PRINT
+    BAMBOO_DEBUGPRINT_REG(cnum);
+#endif
+    BAMBOO_EXIT(0xb009);
+  }
+  // store the mark result info
+  gcloads[cnum] = msgdata[msgdataindex]; //[3]
+  MSG_INDEXINC_I();
+  int heaptop = msgdata[msgdataindex]; //[4]
+  MSG_INDEXINC_I();
+  if(gcheaptop < heaptop) {
+    gcheaptop = heaptop;
+  }
+  // large obj info: (obj, length) pairs from word [5] on; k steps per pair
+  for(int k = 5; k < msgwords; k += 2) {
+    int lobj = msgdata[msgdataindex];
+    MSG_INDEXINC_I();
+    int length = msgdata[msgdataindex];
+    MSG_INDEXINC_I();
+    gc_lobjenqueue_I(lobj, length, cnum);
+    gcnumlobjs++;
+  }
+}
+
+INLINE void processmsg_gclobjmapping_I() {
+  // payload: [1] key and [2] value, inserted into gcpointertbl + gcsharedptbl
+  int mapkey = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  int mapval = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+#ifndef LOCALHASHTBL_TEST
+  mgchashInsert_I(gcpointertbl, mapkey, mapval);
+#else
+  RuntimeHashadd_I(gcpointertbl, mapkey, mapval);
+#endif
+  mgcsharedhashInsert_I(gcsharedptbl, mapkey, mapval);
+}
+
+#ifdef GC_PROFILE
+INLINE void processmsg_gcprofiles_I() { // adds payload [1]..[3] to the totals
+  int numobj = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  int numliveobj = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  int numforwardobj = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  gc_num_obj += numobj;
+  gc_num_liveobj += numliveobj;
+  gc_num_forwardobj += numforwardobj;
+  gc_num_profiles--;
+}
+#endif // GC_PROFILE
+
+#ifdef GC_CACHE_ADAPT
+INLINE void processmsg_gcstartpref_I() {
+  gcphase = PREFINISHPHASE; // GCSTARTPREF has no payload: just enter the prefinish phase
+}
+
+INLINE void processmsg_gcfinishpref_I() {
+  // GCFINISHPREF payload: [1] is the core reporting that it finished
+  int srccore = msgdata[msgdataindex];
+  MSG_INDEXINC_I();
+  if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
+    // non startup core can not receive this msg
+#ifndef CLOSE_PRINT
+    BAMBOO_DEBUGPRINT_REG(srccore);
+#endif
+    BAMBOO_EXIT(0xb00a);
+  }
+  // ids below NUMCORESACTIVE get their gccorestatus entry cleared
+  if(srccore < NUMCORESACTIVE) {
+    gccorestatus[srccore] = 0;
+  }
+}
+#endif // GC_CACHE_ADAPT
+#endif // #ifdef MULTICORE_GC
+
+// Receive msgs from other cores and dispatch every complete msg in the
+// buffer to its processmsg_*_I() handler.
+// Should be invoked in critical sections!!
+// Loops back (goto) while complete msgs remain buffered or more input is
+// available.
+// return value: -1 -- received nothing, or only part of a msg so far
+//               -2 -- the buffered data is shorter than the msg length
+//               otherwise -- type of the last msg processed
+// NOTE(review): with PROFILE_INTERRUPT the interrupt record started on
+//               entry is only closed on the "msg type" return path; the
+//               -1/-2 returns leave it open and the next call allocates
+//               a new record into the same array slot -- confirm whether
+//               that is intended.
+// (The threadsimulate-only return codes 0..3 are not produced here.)
+int receiveObject(int send_port_pending) {
+#ifdef PROFILE_INTERRUPT
+  if(!interruptInfoOverflow) {
+    InterruptInfo* intInfo = RUNMALLOC_I(sizeof(struct interrupt_info));
+    interruptInfoArray[interruptInfoIndex] = intInfo;
+    intInfo->startTime = BAMBOO_GET_EXE_TIME();
+    intInfo->endTime = -1;
+  }
+#endif
+  int size = 0;  // declared ahead of the labels (a label needs a statement)
+msg:
+  if(receiveMsg(send_port_pending) == -1) {  // get the incoming msgs
+    return -1;
+  }
+processmsg:
+  // processing received msgs
+  size = 0;
+  MSG_REMAINSIZE_I(&size);
+  if((size == 0) || (checkMsgLength_I(size) == -1)) {
+    // not a whole msg
+    // have new coming msg
+    if((BAMBOO_MSG_AVAIL() != 0) && !msgdatafull) {
+      goto msg;
+    } else {
+      return -1;
+    }
+  }
+
+  if(msglength <= size) {
+    // have some whole msg
+    MSGTYPE type;
+    type = msgdata[msgdataindex]; //[0]
+    MSG_INDEXINC_I();
+    msgdatafull = false;
+    // TODO
+    //tprintf("msg type: %x\n", type);
+    switch(type) {
+    case TRANSOBJ: {
+      // receive a object transfer msg
+      processmsg_transobj_I();
+      break;
+    }   // case TRANSOBJ
+
+    case TRANSTALL: {
+      // receive a stall msg
+      processmsg_transtall_I();
+      break;
+    }   // case TRANSTALL
+
+// GC version have no lock msgs
+#ifndef MULTICORE_GC
+    case LOCKREQUEST: {
+      // receive lock request msg, handle it right now
+      processmsg_lockrequest_I();
+      break;
+    }   // case LOCKREQUEST
+
+    case LOCKGROUNT: {
+      // receive lock grount msg
+      processmsg_lockgrount_I();
+      break;
+    }   // case LOCKGROUNT
+
+    case LOCKDENY: {
+      // receive lock deny msg
+      processmsg_lockdeny_I();
+      break;
+    }   // case LOCKDENY
+
+    case LOCKRELEASE: {
+      processmsg_lockrelease_I();
+      break;
+    }   // case LOCKRELEASE
+#endif // #ifndef MULTICORE_GC
+
+#ifdef PROFILE
+    case PROFILEOUTPUT: {
+      // receive an output profile data request msg
+      processmsg_profileoutput_I();
+      break;
+    }   // case PROFILEOUTPUT
+
+    case PROFILEFINISH: {
+      // receive a profile output finish msg
+      processmsg_profilefinish_I();
+      break;
+    }   // case PROFILEFINISH
+#endif // #ifdef PROFILE
+
+// GC version has no lock msgs
+#ifndef MULTICORE_GC
+    case REDIRECTLOCK: {
+      // receive a redirect lock request msg, handle it right now
+      processmsg_redirectlock_I();
+      break;
+    }   // case REDIRECTLOCK
+
+    case REDIRECTGROUNT: {
+      // receive a lock grant msg with redirect info
+      processmsg_redirectgrount_I();
+      break;
+    }   // case REDIRECTGROUNT
+
+    case REDIRECTDENY: {
+      // receive a lock deny msg with redirect info
+      processmsg_redirectdeny_I();
+      break;
+    }   // case REDIRECTDENY
+
+    case REDIRECTRELEASE: {
+      // receive a lock release msg with redirect info
+      processmsg_redirectrelease_I();
+      break;
+    }   // case REDIRECTRELEASE
+#endif // #ifndef MULTICORE_GC
+
+    case STATUSCONFIRM: {
+      // receive a status confirm info
+      processmsg_statusconfirm_I();
+      break;
+    }   // case STATUSCONFIRM
+
+    case STATUSREPORT: {
+      processmsg_statusreport_I();
+      break;
+    }   // case STATUSREPORT
+
+    case TERMINATE: {
+      // receive a terminate msg
+      processmsg_terminate_I();
+      break;
+    }   // case TERMINATE
+
+    case MEMREQUEST: {
+      processmsg_memrequest_I();
+      break;
+    }   // case MEMREQUEST
+
+    case MEMRESPONSE: {
+      processmsg_memresponse_I();
+      break;
+    }   // case MEMRESPONSE
+
+#ifdef MULTICORE_GC
+    // GC msgs
+    case GCSTARTPRE: {
+      processmsg_gcstartpre_I();
+      break;
+    }   // case GCSTARTPRE
+
+    case GCSTARTINIT: {
+      processmsg_gcstartinit_I();
+      break;
+    }   // case GCSTARTINIT
+
+    case GCSTART: {
+      // receive a start GC msg
+      processmsg_gcstart_I();
+      break;
+    }   // case GCSTART
+
+    case GCSTARTCOMPACT: {
+      // a compact phase start msg
+      processmsg_gcstartcompact_I();
+      break;
+    }   // case GCSTARTCOMPACT
+
+    case GCSTARTMAPINFO: {
+      // received a map phase start msg
+      processmsg_gcstartmapinfo_I();
+      break;
+    }   // case GCSTARTMAPINFO
+
+    case GCSTARTFLUSH: {
+      // received a flush phase start msg
+      processmsg_gcstartflush_I();
+      break;
+    }   // case GCSTARTFLUSH
+
+    case GCFINISHPRE: {
+      processmsg_gcfinishpre_I();
+      break;
+    }   // case GCFINISHPRE
+
+    case GCFINISHINIT: {
+      processmsg_gcfinishinit_I();
+      break;
+    }   // case GCFINISHINIT
+
+    case GCFINISHMARK: {
+      processmsg_gcfinishmark_I();
+      break;
+    }   // case GCFINISHMARK
+
+    case GCFINISHCOMPACT: {
+      // received a compact phase finish msg
+      processmsg_gcfinishcompact_I();
+      break;
+    }   // case GCFINISHCOMPACT
+
+    case GCFINISHMAPINFO: {
+      processmsg_gcfinishmapinfo_I();
+      break;
+    }   // case GCFINISHMAPINFO
+
+    case GCFINISHFLUSH: {
+      processmsg_gcfinishflush_I();
+      break;
+    }   // case GCFINISHFLUSH
+
+    case GCFINISH: {
+      // received a GC finish msg
+      gcphase = FINISHPHASE;
+      break;
+    }   // case GCFINISH
+
+    case GCMARKCONFIRM: {
+      // received a marked phase finish confirm request msg
+      // all cores should do mark
+      processmsg_gcmarkconfirm_I();
+      break;
+    }   // case GCMARKCONFIRM
+
+    case GCMARKREPORT: {
+      processmsg_gcmarkreport_I();
+      break;
+    }   // case GCMARKREPORT
+
+    case GCMARKEDOBJ: {
+      processmsg_gcmarkedobj_I();
+      break;
+    }   // case GCMARKEDOBJ
+
+    case GCMOVESTART: {
+      // received a start moving objs msg
+      processmsg_gcmovestart_I();
+      break;
+    }   // case GCMOVESTART
+
+    case GCMAPREQUEST: {
+      // received a mapping info request msg
+      processmsg_gcmaprequest_I();
+      break;
+    }   // case GCMAPREQUEST
+
+    case GCMAPINFO: {
+      // received a mapping info response msg
+      processmsg_gcmapinfo_I();
+      break;
+    }   // case GCMAPINFO
+
+    case GCMAPTBL: {
+      // received a mapping tbl response msg
+      processmsg_gcmaptbl_I();
+      break;
+    }   // case GCMAPTBL
+
+    case GCLOBJREQUEST: {
+      // received a large objs info request msg
+      transferMarkResults_I();
+      break;
+    }   // case GCLOBJREQUEST
+
+    case GCLOBJINFO: {
+      // received a large objs info response msg
+      processmsg_gclobjinfo_I();
+      break;
+    }   // case GCLOBJINFO
+
+    case GCLOBJMAPPING: {
+      // received a large obj mapping info msg
+      processmsg_gclobjmapping_I();
+      break;
+    }  // case GCLOBJMAPPING
+
+#ifdef GC_PROFILE
+    case GCPROFILES: {
+      // received a gcprofiles msg
+      processmsg_gcprofiles_I();
+      break;
+    }
+#endif // GC_PROFILE
+
+#ifdef GC_CACHE_ADAPT
+    case GCSTARTPREF: {
+      // received a gcstartpref msg
+      processmsg_gcstartpref_I();
+      break;
+    }
+
+    case GCFINISHPREF: {
+      // received a gcfinishpref msg
+      processmsg_gcfinishpref_I();
+      break;
+    }
+#endif // GC_CACHE_ADAPT
+#endif // #ifdef MULTICORE_GC
+
+    default:
+      break;
+    }  // switch(type)
+    msglength = BAMBOO_MSG_BUF_LENGTH;
+    // TODO
+    //printf("++ msg: %x \n", type);
+
+    if(msgdataindex != msgdatalast) {
+      // still have available msg
+      goto processmsg;
+    }
+#ifdef DEBUG
+#ifndef CLOSE_PRINT
+    BAMBOO_DEBUGPRINT(0xe88d);
+#endif
+#endif
+
+    // have new coming msg
+    if(BAMBOO_MSG_AVAIL() != 0) {
+      goto msg;
+    } // TODO
+
+#ifdef PROFILE_INTERRUPT
+  if(!interruptInfoOverflow) {
+    interruptInfoArray[interruptInfoIndex]->endTime=BAMBOO_GET_EXE_TIME();
+    interruptInfoIndex++;
+    if(interruptInfoIndex == INTERRUPTINFOLENGTH) {
+      interruptInfoOverflow = true;
+    }
+  }
+#endif
+    return (int)type;
+  } else {
+    // not a whole msg
+#ifdef DEBUG
+#ifndef CLOSE_PRINT
+    BAMBOO_DEBUGPRINT(0xe88e);
+#endif
+#endif
+    return -2;
+  }
+}
+/* Add ptr to parameter's object set, then enqueue each parameter combination it completes; always returns 1 */
+int enqueuetasks(struct parameterwrapper *parameter,
+                 struct parameterwrapper *prevptr,   // never referenced in the body
+                 struct ___Object___ *ptr,
+                 int * enterflags,
+                 int numenterflags) {
+  void * taskpointerarray[MAXTASKPARAMS];
+  int j;
+  // the slots other than ptr's own are filled in by the iterators
+  int numiterators=parameter->task->numTotal-1;
+  int retval=1;
+
+  struct taskdescriptor * task=parameter->task;
+
+  // add the object, together with its enter flags, to the object set
+  ObjectHashadd(parameter->objectset, (int) ptr, 0, (int) enterflags,
+                numenterflags, enterflags==NULL);
+
+  /* Add enqueued object to parameter vector */
+  taskpointerarray[parameter->slot]=ptr;
+
+  /* Reset iterators */
+  for(j=0; j<numiterators; j++) {
+    toiReset(&parameter->iterators[j]);
+  }
+
+  /* Find initial state */
+  for(j=0; j<numiterators; j++) {
+backtrackinit:
+    if(toiHasNext(&parameter->iterators[j],taskpointerarray OPTARG(failed)))
+      toiNext(&parameter->iterators[j], taskpointerarray OPTARG(failed));
+    else if (j>0) {
+      /* Need to backtrack */
+      toiReset(&parameter->iterators[j]);
+      j--;
+      goto backtrackinit;   /* re-enters the loop body without running j++ */
+    } else {
+      /* Nothing to enqueue */
+      return retval;
+    }
+  }
+
+  while(1) {
+    /* Enqueue current state */
+    // copy the current combination into a freshly allocated descriptor
+    struct taskparamdescriptor *tpd=
+      RUNMALLOC(sizeof(struct taskparamdescriptor));
+    tpd->task=task;
+    tpd->numParameters=numiterators+1;
+    tpd->parameterArray=RUNMALLOC(sizeof(void *)*(numiterators+1));
+
+    for(j=0; j<=numiterators; j++) {
+      //store the actual parameters
+      tpd->parameterArray[j]=taskpointerarray[j];
+    }
+    /* Enqueue task */
+    if (( /*!gencontains(failedtasks, tpd)&&*/
+          !gencontains(activetasks,tpd))) {
+      genputtable(activetasks, tpd, tpd);
+    } else {   /* activetasks already contains it: discard this copy */
+      RUNFREE(tpd->parameterArray);
+      RUNFREE(tpd);
+    }
+
+    /* This loop iterates to the next parameter combination */
+    if (numiterators==0)
+      return retval;
+
+    for(j=numiterators-1; j<numiterators; j++) {
+backtrackinc:
+      if(toiHasNext(
+			&parameter->iterators[j],taskpointerarray OPTARG(failed)))
+		toiNext(&parameter->iterators[j], taskpointerarray OPTARG(failed));
+      else if (j>0) {
+		/* Need to backtrack */
+		toiReset(&parameter->iterators[j]);
+		j--;
+		goto backtrackinc;
+      } else {
+		/* Nothing more to enqueue */
+		return retval;
+      }
+    }
+  }
+  return retval;
+}
+/* Same steps as enqueuetasks, but through ObjectHashadd_I, RUNMALLOC_I and genputtable_I; always returns 1 */
+int enqueuetasks_I(struct parameterwrapper *parameter,
+                   struct parameterwrapper *prevptr,   // never referenced in the body
+                   struct ___Object___ *ptr,
+                   int * enterflags,
+                   int numenterflags) {
+  void * taskpointerarray[MAXTASKPARAMS];
+  int j;
+  // the slots other than ptr's own are filled in by the iterators
+  int numiterators=parameter->task->numTotal-1;
+  int retval=1;
+  // NOTE(review): the _I suffix presumably marks variants meant for
+  // interrupt/runtime mode -- confirm against the helper definitions
+
+  struct taskdescriptor * task=parameter->task;
+
+  //this add the object to parameterwrapper
+  ObjectHashadd_I(parameter->objectset, (int) ptr, 0, (int) enterflags,
+                  numenterflags, enterflags==NULL);
+
+  /* Add enqueued object to parameter vector */
+  taskpointerarray[parameter->slot]=ptr;
+
+  /* Reset iterators */
+  for(j=0; j<numiterators; j++) {
+    toiReset(&parameter->iterators[j]);
+  }
+
+  /* Find initial state */
+  for(j=0; j<numiterators; j++) {
+backtrackinit:
+    if(toiHasNext(&parameter->iterators[j],taskpointerarray OPTARG(failed)))
+      toiNext(&parameter->iterators[j], taskpointerarray OPTARG(failed));
+    else if (j>0) {
+      /* Need to backtrack */
+      toiReset(&parameter->iterators[j]);
+      j--;
+      goto backtrackinit;   /* re-enters the loop body without running j++ */
+    } else {
+      /* Nothing to enqueue */
+      return retval;
+    }
+  }
+
+  while(1) {
+    /* Enqueue current state */
+    // copy the current combination into a freshly allocated descriptor
+    struct taskparamdescriptor *tpd=
+      RUNMALLOC_I(sizeof(struct taskparamdescriptor));
+    tpd->task=task;
+    tpd->numParameters=numiterators+1;
+    tpd->parameterArray=RUNMALLOC_I(sizeof(void *)*(numiterators+1));
+
+    for(j=0; j<=numiterators; j++) {
+      //store the actual parameters
+      tpd->parameterArray[j]=taskpointerarray[j];
+    }
+    /* Enqueue task */
+    if (( /*!gencontains(failedtasks, tpd)&&*/
+          !gencontains(activetasks,tpd))) {
+      genputtable_I(activetasks, tpd, tpd);
+    } else {   /* NOTE(review): RUNMALLOC_I memory is freed with RUNFREE -- confirm the pairing */
+      RUNFREE(tpd->parameterArray);
+      RUNFREE(tpd);
+    }
+
+    /* This loop iterates to the next parameter combination */
+    if (numiterators==0)
+      return retval;
+
+    for(j=numiterators-1; j<numiterators; j++) {
+backtrackinc:
+      if(toiHasNext(
+			&parameter->iterators[j], taskpointerarray OPTARG(failed)))
+		toiNext(&parameter->iterators[j], taskpointerarray OPTARG(failed));
+      else if (j>0) {
+		/* Need to backtrack */
+		toiReset(&parameter->iterators[j]);
+		j--;
+		goto backtrackinc;
+      } else {
+		/* Nothing more to enqueue */
+		return retval;
+      }
+    }
+  }
+  return retval;
+}
+
+#ifdef MULTICORE_GC
+#define OFFSET 2
+#else
+#define OFFSET 0
+#endif
+
+int containstag(struct ___Object___ *ptr,
+                struct ___TagDescriptor___ *tag);
+
+#ifndef MULTICORE_GC
+void releasewritelock_r(void * lock, void * redirectlock) {
+  // the lock address, shifted right by 5 and taken modulo NUMCORES,
+  // selects the core whose lock table is consulted
+  int lockaddr = (int)lock;
+  int hostcore = (lockaddr >> 5) % NUMCORES;
+
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xe671);
+  BAMBOO_DEBUGPRINT_REG((int)lock);
+  BAMBOO_DEBUGPRINT_REG(lockaddr);
+  BAMBOO_DEBUGPRINT_REG(hostcore);
+#endif
+
+  if(hostcore != BAMBOO_NUM_OF_CORE) {
+    // another core hosts the lock: send it the release message with the
+    // redirect info (for 32 bit machine, the size is always 4 words)
+    send_msg_4(hostcore, REDIRECTRELEASE, 1, (int)lock,
+               (int)redirectlock, false);
+    return;
+  }
+  // the lock resides on this core
+  BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xf001);
+#endif
+  if(RuntimeHashcontainskey(locktbl, lockaddr)) {
+    int entry = 0;
+    struct LockValue * lv = NULL;
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xe672);
+#endif
+    RuntimeHashget(locktbl, lockaddr, &entry);
+    lv = (struct LockValue *)entry;
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT_REG(lv->value);
+#endif
+    lv->value++;
+    lv->redirectlock = (int)redirectlock;
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT_REG(lv->value);
+#endif
+  } else {
+    // no locks for this object, something is wrong
+    BAMBOO_EXIT(0xa00b);
+  }
+  BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xf000);
+#endif
+}
+#endif
+
+/* Run queued task instances until activetasks is empty.
+ *
+ * For each task parameter descriptor taken off activetasks this function:
+ *   1. collects the locks of its parameters into runtime_locks, sorted by
+ *      lock value with duplicates dropped (a STARTUPTYPE parameter skips
+ *      locking and checking and jumps straight to execution);
+ *   2. requests each write lock in turn; if one is refused, the locks
+ *      already granted are released and the descriptor is put back on
+ *      activetasks to be retried;
+ *   3. verifies that every parameter is still in its queue, that its flags
+ *      match one of the queue's (andmask, checkmask) terms and that it
+ *      still carries the required tags; on failure the locks are
+ *      released and the descriptor is freed;
+ *   4. calls the task through currtpd->task->taskptr, releases the locks
+ *      and frees the descriptor.
+ *
+ * With PROFILE and ACCURATEPROFILE defined, every path that abandons a
+ * descriptor closes the "tpd checking" record with profileTaskEnd(), so
+ * the next profileTaskStart() does not overwrite an unfinished record.
+ */
+void executetasks() {
+  void * taskpointerarray[MAXTASKPARAMS+OFFSET];
+  int numparams=0;
+  int numtotal=0;
+  struct ___Object___ * tmpparam = NULL;
+  struct parameterdescriptor * pd=NULL;
+  struct parameterwrapper *pw=NULL;
+  int j = 0;
+  int x = 0;
+  bool islock = true;
+
+  int grount = 0;
+  int andmask=0;
+  int checkmask=0;
+
+newtask:
+  while(hashsize(activetasks)>0) {
+#ifdef MULTICORE_GC
+    // run a pending collection before touching the next task
+    if(gcflag) gc(NULL);
+#endif
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xe990);
+#endif
+
+    int i;
+#ifdef PROFILE
+#ifdef ACCURATEPROFILE
+    profileTaskStart("tpd checking");
+#endif
+#endif
+
+    // take the first queued descriptor off activetasks
+    busystatus = true;
+    currtpd=(struct taskparamdescriptor *) getfirstkey(activetasks);
+    genfreekey(activetasks, currtpd);
+
+    numparams=currtpd->task->numParameters;
+    numtotal=currtpd->task->numTotal;
+
+    // TODO: lockRedirectTbl should be empty after all locks are released
+    // collect the required locks: runtime_locks is kept sorted by lock
+    // value and holds every lock only once
+    runtime_locklen = 0;
+    for(i = 0; i < numparams; i++) {
+      void * param = currtpd->parameterArray[i];
+      int tmplock = 0;
+      int j = 0;
+      bool insert = true;
+      if(((struct ___Object___ *)param)->type == STARTUPTYPE) {
+        // startup object: no locking, no queue checks
+        islock = false;
+        taskpointerarray[i+OFFSET]=param;
+        goto execute;
+      }
+      // an object without a lock of its own is locked by its address
+      if(((struct ___Object___ *)param)->lock == NULL) {
+        tmplock = (int)param;
+      } else {
+        tmplock = (int)(((struct ___Object___ *)param)->lock);
+      }
+      // insert into the locks array
+      for(j = 0; j < runtime_locklen; j++) {
+        if(runtime_locks[j].value == tmplock) {
+          insert = false;
+          break;
+        } else if(runtime_locks[j].value > tmplock) {
+          break;
+        }
+      }
+      if(insert) {
+        int h = runtime_locklen;
+        for(; h > j; h--) {
+          runtime_locks[h].redirectlock = runtime_locks[h-1].redirectlock;
+          runtime_locks[h].value = runtime_locks[h-1].value;
+        }
+        runtime_locks[j].value = tmplock;
+        runtime_locks[j].redirectlock = (int)param;
+        runtime_locklen++;
+      }
+    }  // for(i = 0; i < numparams; i++)
+
+    // grab these required locks
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xe991);
+#endif
+
+    for(i = 0; i < runtime_locklen; i++) {
+      int * lock = (int *)(runtime_locks[i].redirectlock);
+      islock = true;
+      // require locks for this parameter if it is not a startup object
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT_REG((int)lock);
+      BAMBOO_DEBUGPRINT_REG((int)(runtime_locks[i].value));
+#endif
+      getwritelock(lock);
+      BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT(0xf001);
+#endif
+      // wait until the answer to the lock request has arrived
+      while(!lockflag) {
+        BAMBOO_WAITING_FOR_LOCK(0);
+      }
+#ifndef INTERRUPT
+      if(reside) {
+        while(BAMBOO_WAITING_FOR_LOCK(0) != -1) {
+        }
+      }
+#endif
+      grount = lockresult;
+
+      // reset the lock request state for the next request
+      lockresult = 0;
+      lockobj = 0;
+      lock2require = 0;
+      lockflag = false;
+#ifndef INTERRUPT
+      reside = false;
+#endif
+      BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT(0xf000);
+#endif
+
+      if(grount == 0) {
+#ifdef DEBUG
+        BAMBOO_DEBUGPRINT(0xe992);
+        BAMBOO_DEBUGPRINT_REG(lock);
+#endif
+        // check if has the lock already
+        // can not get the lock, try later
+        // release all grabbed locks for previous parameters
+        for(j = 0; j < i; ++j) {
+          lock = (int*)(runtime_locks[j].redirectlock);
+          releasewritelock(lock);
+        }
+        // put the descriptor back so it is retried later
+        genputtable(activetasks, currtpd, currtpd);
+        if(hashsize(activetasks) == 1) {
+          // only one task right now, wait a little while before next try
+          int halt = 10000;
+          while(halt--) {
+          }
+        }
+#ifdef PROFILE
+#ifdef ACCURATEPROFILE
+        // fail, set the end of the checkTaskInfo
+        profileTaskEnd();
+#endif
+#endif
+        goto newtask;
+      }
+    }   // for(i = 0; i < runtime_locklen; i++)
+
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xe993);
+#endif
+    /* Make sure that the parameters are still in the queues */
+    for(i=0; i<numparams; i++) {
+      void * parameter=currtpd->parameterArray[i];
+
+      // flush the object
+#ifdef CACHEFLUSH
+      BAMBOO_CACHE_FLUSH_RANGE((int)parameter,
+          classsize[((struct ___Object___ *)parameter)->type]);
+#endif
+      tmpparam = (struct ___Object___ *)parameter;
+      pd=currtpd->task->descriptorarray[i];
+      pw=(struct parameterwrapper *) pd->queue;
+      /* Check that object is still in queue */
+      {
+        if (!ObjectHashcontainskey(pw->objectset, (int) parameter)) {
+#ifdef DEBUG
+          BAMBOO_DEBUGPRINT(0xe994);
+          BAMBOO_DEBUGPRINT_REG(parameter);
+#endif
+          // release grabbed locks
+          for(j = 0; j < runtime_locklen; ++j) {
+            int * lock = (int *)(runtime_locks[j].redirectlock);
+            releasewritelock(lock);
+          }
+          // the object left the queue: drop this task instance
+          RUNFREE(currtpd->parameterArray);
+          RUNFREE(currtpd);
+          currtpd = NULL;
+#ifdef PROFILE
+#ifdef ACCURATEPROFILE
+          // fail, set the end of the checkTaskInfo
+          profileTaskEnd();
+#endif
+#endif
+          goto newtask;
+        }
+      }
+      /* Check if the object's flags still meets requirements */
+      {
+        int tmpi = 0;
+        bool ismet = false;
+        for(tmpi = 0; tmpi < pw->numberofterms; ++tmpi) {
+          andmask=pw->intarray[tmpi*2];
+          checkmask=pw->intarray[tmpi*2+1];
+          if((((struct ___Object___ *)parameter)->flag&andmask)==checkmask) {
+            ismet = true;
+            break;
+          }
+        }
+        if (!ismet) {
+          // flags are never suitable
+          // remove this obj from the queue
+          int next;
+          int UNUSED, UNUSED2;
+          int * enterflags;
+#ifdef DEBUG
+          BAMBOO_DEBUGPRINT(0xe995);
+          BAMBOO_DEBUGPRINT_REG(parameter);
+#endif
+          ObjectHashget(pw->objectset, (int) parameter, (int *) &next,
+                        (int *) &enterflags, &UNUSED, &UNUSED2);
+          ObjectHashremove(pw->objectset, (int)parameter);
+          if (enterflags!=NULL)
+            RUNFREE(enterflags);
+          // release grabbed locks
+          for(j = 0; j < runtime_locklen; ++j) {
+            int * lock = (int *)(runtime_locks[j].redirectlock);
+            releasewritelock(lock);
+          }
+          RUNFREE(currtpd->parameterArray);
+          RUNFREE(currtpd);
+          currtpd = NULL;
+#ifdef PROFILE
+#ifdef ACCURATEPROFILE
+          // fail, set the end of the checkTaskInfo
+          profileTaskEnd();
+#endif
+#endif
+          goto newtask;
+        }   // if (!ismet)
+      }
+parameterpresent:
+      ;
+      /* Check that object still has necessary tags */
+      for(j=0; j<pd->numbertags; j++) {
+        int slotid=pd->tagarray[2*j]+numparams;
+        struct ___TagDescriptor___ *tagd=currtpd->parameterArray[slotid];
+        if (!containstag(parameter, tagd)) {
+#ifdef DEBUG
+          BAMBOO_DEBUGPRINT(0xe996);
+#endif
+          {
+            // release grabbed locks
+            int tmpj = 0;
+            for(tmpj = 0; tmpj < runtime_locklen; ++tmpj) {
+              int * lock = (int *)(runtime_locks[tmpj].redirectlock);
+              releasewritelock(lock);
+            }
+          }
+          // a required tag is gone: drop this task instance
+          RUNFREE(currtpd->parameterArray);
+          RUNFREE(currtpd);
+          currtpd = NULL;
+#ifdef PROFILE
+#ifdef ACCURATEPROFILE
+          // fail, set the end of the checkTaskInfo
+          profileTaskEnd();
+#endif
+#endif
+          goto newtask;
+        }   // if (!containstag(parameter, tagd))
+      }   // for(j=0; j<pd->numbertags; j++)
+
+      taskpointerarray[i+OFFSET]=parameter;
+    }   // for(i=0; i<numparams; i++)
+    /* Copy the tags */
+    for(; i<numtotal; i++) {
+      taskpointerarray[i+OFFSET]=currtpd->parameterArray[i];
+    }
+
+    {
+execute:
+      /* Actually call task */
+#ifdef MULTICORE_GC
+      ((int *)taskpointerarray)[0]=currtpd->numParameters;
+      taskpointerarray[1]=NULL;
+#endif
+#ifdef PROFILE
+#ifdef ACCURATEPROFILE
+      // check finish, set the end of the checkTaskInfo
+      profileTaskEnd();
+#endif
+      profileTaskStart(currtpd->task->name);
+#endif
+
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT(0xe997);
+#endif
+      // the parameters sit in taskpointerarray from index OFFSET on
+      ((void (*)(void **))currtpd->task->taskptr)(taskpointerarray);
+
+#ifdef PROFILE
+#ifdef ACCURATEPROFILE
+      // task finish, set the end of the checkTaskInfo
+      profileTaskEnd();
+      // new a PostTaskInfo for the post-task execution
+      profileTaskStart("post task execution");
+#endif
+#endif
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT(0xe998);
+      BAMBOO_DEBUGPRINT_REG(islock);
+#endif
+
+      if(islock) {
+#ifdef DEBUG
+        BAMBOO_DEBUGPRINT(0xe999);
+#endif
+        for(i = 0; i < runtime_locklen; ++i) {
+          // redirectlock holds the locked object, value the lock itself
+          void * ptr = (void *)(runtime_locks[i].redirectlock);
+          int * lock = (int *)(runtime_locks[i].value);
+#ifdef DEBUG
+          BAMBOO_DEBUGPRINT_REG((int)ptr);
+          BAMBOO_DEBUGPRINT_REG((int)lock);
+          BAMBOO_DEBUGPRINT_REG(*((int*)lock+5));
+#endif
+#ifndef MULTICORE_GC
+          if(RuntimeHashcontainskey(lockRedirectTbl, (int)lock)) {
+            int redirectlock;
+            RuntimeHashget(lockRedirectTbl, (int)lock, &redirectlock);
+            RuntimeHashremovekey(lockRedirectTbl, (int)lock);
+            releasewritelock_r(lock, (int *)redirectlock);
+          } else {
+#else
+          {
+#endif
+            releasewritelock(ptr);
+          }
+        }
+      }     // if(islock)
+
+#ifdef PROFILE
+      // post task execution finish, set the end of the postTaskInfo
+      profileTaskEnd();
+#endif
+
+      // Free up task parameter descriptor
+      RUNFREE(currtpd->parameterArray);
+      RUNFREE(currtpd);
+      currtpd = NULL;
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT(0xe99a);
+#endif
+    }   // execute block
+  } //  while(hashsize(activetasks)>0)
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xe99b);
+#endif
+}
+
+/* Give every tag of pd whose slot is still unmarked in statusarray a tag iterator */
+void processtags(struct parameterdescriptor *pd,
+                 int index,
+                 struct parameterwrapper *parameter,
+                 int * iteratorcount,
+                 int *statusarray,
+                 int numparams) {
+  int t;
+
+  for(t=0; t<pd->numbertags; t++) {
+    /* tagarray holds (slot, tag id) pairs; tag slots are offset by numparams */
+    int tagslot=pd->tagarray[2*t]+numparams;
+    struct tagobjectiterator *toi;
+    if (statusarray[tagslot]!=0)
+      continue;
+    toi=&parameter->iterators[(*iteratorcount)++];
+    toi->istag=1;
+    toi->tagid=pd->tagarray[2*t+1];
+    toi->slot=tagslot;
+    toi->tagobjectslot=index;
+    statusarray[tagslot]=1;
+  }
+}
+
+
+void processobject(struct parameterwrapper *parameter,
+                   int index,
+                   struct parameterdescriptor *pd,
+                   int *iteratorcount,
+                   int * statusarray,
+                   int numparams) {
+  /* Set up the next free iterator as an object iterator for slot index */
+  struct tagobjectiterator *toi=&parameter->iterators[*iteratorcount];
+  int bound=0;
+  int t;
+
+  toi->istag=0;
+  toi->slot=index;
+  toi->objectset=
+    ((struct parameterwrapper *)pd->queue)->objectset;
+  statusarray[index]=1;
+
+  /* Tag slots already marked in statusarray become tag bindings, which
+     the iterator uses to narrow its search */
+  for(t=0; t<pd->numbertags; t++) {
+    int tagslot=pd->tagarray[2*t]+numparams;
+    if (statusarray[tagslot]==0)
+      continue;
+    toi->tagbindings[bound]=tagslot;
+    bound++;
+  }
+  toi->numtags=bound;
+
+  (*iteratorcount)++;
+}
+
+/* This function builds the iterators for a task & parameter: starting from
+   slot index, it appends one iterator per remaining parameter and tag slot */
+void builditerators(struct taskdescriptor * task,
+                    int index,
+                    struct parameterwrapper * parameter) {
+  int statusarray[MAXTASKPARAMS];   /* 1 once a slot is covered; tag slots sit at slotid+numparams */
+  int i;
+  int numparams=task->numParameters;
+  int iteratorcount=0;
+  for(i=0; i<MAXTASKPARAMS; i++) statusarray[i]=0;
+
+  statusarray[index]=1; /* Initial parameter */
+  /* Process tags for initial iterator */
+
+  processtags(task->descriptorarray[index], index, parameter,
+              &iteratorcount, statusarray, numparams);
+
+  while(1) {
+loopstart:   /* every scan below restarts here after adding one parameter */
+    /* Check for objects with existing tags */
+    for(i=0; i<numparams; i++) {
+      if (statusarray[i]==0) {
+		struct parameterdescriptor *pd=task->descriptorarray[i];
+		int j;
+		for(j=0; j<pd->numbertags; j++) {
+		  int slotid=pd->tagarray[2*j];
+		  if(statusarray[slotid+numparams]!=0) {
+			processobject(parameter,i,pd,&iteratorcount,
+				statusarray,numparams);
+			processtags(pd,i,parameter,&iteratorcount,statusarray,numparams);
+			goto loopstart;
+		  }
+		}
+      }
+    }
+
+    /* Next do objects w/ unbound tags*/
+
+    for(i=0; i<numparams; i++) {
+      if (statusarray[i]==0) {
+		struct parameterdescriptor *pd=task->descriptorarray[i];
+		if (pd->numbertags>0) {
+		  processobject(parameter,i,pd,&iteratorcount,statusarray,numparams);
+		  processtags(pd,i,parameter,&iteratorcount,statusarray,numparams);
+		  goto loopstart;
+		}
+      }
+    }
+
+    /* Nothing with a tag enqueued */
+
+    for(i=0; i<numparams; i++) {
+      if (statusarray[i]==0) {
+		struct parameterdescriptor *pd=task->descriptorarray[i];
+		processobject(parameter,i,pd,&iteratorcount,statusarray,numparams);
+		processtags(pd,i,parameter,&iteratorcount,statusarray,numparams);
+		goto loopstart;
+      }
+    }
+
+    /* Nothing left */
+    return;   /* reached only when all three scans found no uncovered parameter */
+  }
+}
+
+/* Print the tasks of this core with the objects queued on each parameter.
+   When RAW is defined only the loops run; all printf calls are compiled out. */
+void printdebug() {
+  int i;
+  int j;
+  if(BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1) {
+    return;
+  }
+  for(i=0; i<numtasks[BAMBOO_NUM_OF_CORE]; i++) {
+    struct taskdescriptor * task=taskarray[BAMBOO_NUM_OF_CORE][i];
+#ifndef RAW
+    printf("%s\n", task->name);
+#endif
+    for(j=0; j<task->numParameters; j++) {
+      struct parameterdescriptor *param=task->descriptorarray[j];
+      struct parameterwrapper *parameter=param->queue;
+      struct ObjectHash * set=parameter->objectset;
+      struct ObjectIterator objit;
+#ifndef RAW
+      printf("  Parameter %d\n", j);
+#endif
+      ObjectHashiterator(set, &objit);
+      while(ObjhasNext(&objit)) {
+        struct ___Object___ * obj=(struct ___Object___ *)Objkey(&objit);
+        struct ___Object___ * tagptr=obj->___tags___;
+        int nonfailed=Objdata4(&objit);
+        int numflags=Objdata3(&objit);
+        int flags=Objdata2(&objit);
+        Objnext(&objit);
+#ifndef RAW
+        /* %lx takes an unsigned long, so convert the pointer explicitly */
+        printf("    Contains %lx\n", (unsigned long)obj);
+        printf("      flag=%d\n", obj->flag);
+#endif
+        if (tagptr==NULL) {
+        } else if (tagptr->type==TAGTYPE) {
+#ifndef RAW
+          printf("      tag=%lx\n", (unsigned long)tagptr);
+#endif
+        } else {
+          int tagindex=0;
+          struct ArrayObject *ao=(struct ArrayObject *)tagptr;
+          for(; tagindex<ao->___cachedCode___; tagindex++) {
+#ifndef RAW
+            printf("      tag=%lx\n",
+                   (unsigned long)ARRAYGET(ao,struct ___TagDescriptor___*,
+                                           tagindex));
+#endif
+          }
+        }
+      }
+    }
+  }
+}
+
+
+/* Set up the parameter queues of every task assigned to this core: each
+   queue gets an object set first, then its iterators are built. */
+
+void processtasks() {
+  int t;
+  int p;
+
+  if(BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1) {
+    return;
+  }
+  for(t=0; t<numtasks[BAMBOO_NUM_OF_CORE]; t++) {
+    struct taskdescriptor * td=taskarray[BAMBOO_NUM_OF_CORE][t];
+
+    /* Pass 1: every queue of the task gets its object set before any
+       iterator is built */
+    for(p=0; p<td->numParameters; p++) {
+      struct parameterwrapper *pw=td->descriptorarray[p]->queue;
+      pw->objectset=allocateObjectHash(10);
+      pw->task=td;
+    }
+
+    /* Pass 2: build the iterators of each parameter */
+    for(p=0; p<td->numParameters; p++) {
+      struct parameterwrapper *pw=td->descriptorarray[p]->queue;
+      builditerators(td, p, pw);
+    }
+  }
+}
+
+void toiReset(struct tagobjectiterator * it) {
+  /* Tag iterators and tag-bound object iterators walk by index;
+     only a plain object iterator restarts its hash iterator. */
+  if (it->istag || it->numtags>0) {
+    it->tagobjindex=0;
+    return;
+  }
+  ObjectHashiterator(it->objectset, &it->it);
+}
+/* Return 1 when iterator it has another candidate for the current objectarray contents, 0 otherwise */
+int toiHasNext(struct tagobjectiterator *it,
+               void ** objectarray OPTARG(int * failed)) {
+  if (it->istag) {
+    /* Iterate tag */
+    /* Get object with tags */
+    struct ___Object___ *obj=objectarray[it->tagobjectslot];
+    struct ___Object___ *tagptr=obj->___tags___;
+    if (tagptr->type==TAGTYPE) {   /* NOTE(review): tagptr is used without a NULL check -- confirm callers guarantee tags */
+      if ((it->tagobjindex==0)&& /* First object */
+		  (it->tagid==((struct ___TagDescriptor___ *)tagptr)->flag)) /* Right tag type */
+		return 1;
+	  else
+		return 0;
+    } else {
+      struct ArrayObject *ao=(struct ArrayObject *) tagptr;
+      int tagindex=it->tagobjindex;
+      for(; tagindex<ao->___cachedCode___; tagindex++) {
+		struct ___TagDescriptor___ *td=
+		  ARRAYGET(ao, struct ___TagDescriptor___ *, tagindex);
+		if (td->flag==it->tagid) {
+		  it->tagobjindex=tagindex; /* Found right type of tag */
+		  return 1;
+		}
+      }
+      return 0;
+    }
+  } else if (it->numtags>0) {
+    /* Use tags to locate appropriate objects */
+    struct ___TagDescriptor___ *tag=objectarray[it->tagbindings[0]];
+    struct ___Object___ *objptr=tag->flagptr;
+    int i;
+    if (objptr->type!=OBJECTARRAYTYPE) {   /* the tag refers to a single object */
+      if (it->tagobjindex>0)
+		return 0;
+      if (!ObjectHashcontainskey(it->objectset, (int) objptr))
+		return 0;
+      for(i=1; i<it->numtags; i++) {   /* the object must carry every other bound tag too */
+		struct ___TagDescriptor___ *tag2=objectarray[it->tagbindings[i]];
+		if (!containstag(objptr,tag2))
+		  return 0;
+      }
+      return 1;
+    } else {
+      struct ArrayObject *ao=(struct ArrayObject *) objptr;
+      int tagindex;
+      int i;
+      for(tagindex=it->tagobjindex;tagindex<ao->___cachedCode___;tagindex++){
+		struct ___Object___ *objptr=
+		  ARRAYGET(ao,struct ___Object___*,tagindex);
+		if (!ObjectHashcontainskey(it->objectset, (int) objptr))
+		  continue;
+		for(i=1; i<it->numtags; i++) {
+		  struct ___TagDescriptor___ *tag2=objectarray[it->tagbindings[i]];
+		  if (!containstag(objptr,tag2))
+			goto nexttag;
+		}
+		it->tagobjindex=tagindex;   /* remember the match so toiNext reads this entry */
+		return 1;
+nexttag:
+		;
+	  }
+      it->tagobjindex=tagindex;
+      return 0;
+    }
+  } else {
+    return ObjhasNext(&it->it);
+  }
+}
+
+int containstag(struct ___Object___ *ptr,
+                struct ___TagDescriptor___ *tag) {
+  /* tag->flagptr is either one tagged object or an array of them */
+  struct ___Object___ * tagged=tag->flagptr;
+  struct ArrayObject *ao;
+  int k;
+
+  if (tagged->type!=OBJECTARRAYTYPE)
+    return tagged==ptr;
+  ao=(struct ArrayObject *)tagged;
+  for(k=0; k<ao->___cachedCode___; k++) {
+    if (ARRAYGET(ao, struct ___Object___*, k)==ptr)
+      return 1;
+  }
+  return 0;
+}
+
+void toiNext(struct tagobjectiterator *it,
+             void ** objectarray OPTARG(int * failed)) {
+  /* toiHasNext did the searching; store the element in objectarray[it->slot] and advance */
+  if(it->istag) {   /* tag iterator: take a tag of the object in tagobjectslot */
+    struct ___Object___ *owner=objectarray[it->tagobjectslot];
+    struct ___Object___ *tags=owner->___tags___;
+    if (tags->type!=TAGTYPE) {
+      struct ArrayObject *tagarr=(struct ArrayObject *) tags;
+      objectarray[it->slot]=
+        ARRAYGET(tagarr, struct ___TagDescriptor___ *, it->tagobjindex++);
+      return;
+    }
+    it->tagobjindex++;
+    objectarray[it->slot]=tags;
+    return;
+  }
+  if (it->numtags<=0) {
+    /* plain object iterator: take the key under the hash iterator */
+    objectarray[it->slot]=(void *)Objkey(&it->it);
+    Objnext(&it->it);
+    return;
+  }
+  /* tag-bound object iterator: take an object from the first bound tag */
+  struct ___TagDescriptor___ *boundtag=objectarray[it->tagbindings[0]];
+  struct ___Object___ *tagged=boundtag->flagptr;
+  struct ArrayObject *objarr=(struct ArrayObject *) tagged;
+  if (tagged->type!=OBJECTARRAYTYPE) {
+    it->tagobjindex++;
+    objectarray[it->slot]=tagged;
+  } else {
+    objectarray[it->slot]=
+      ARRAYGET(objarr, struct ___Object___ *, it->tagobjindex++);
+  }
+}
+
+#ifdef PROFILE
+inline void profileTaskStart(char * taskname) {
+  // open a new record in the current slot unless the overflow flag is set
+  if(taskInfoOverflow) return;
+  TaskInfo* rec = RUNMALLOC(sizeof(struct task_info));
+  taskInfoArray[taskInfoIndex] = rec;
+  rec->taskName = taskname;
+  rec->startTime = BAMBOO_GET_EXE_TIME();
+  rec->endTime = -1;
+  rec->exitIndex = -1;
+  rec->newObjs = NULL;
+}
+
+inline void profileTaskEnd() {
+  if(taskInfoOverflow) {
+    return;
+  }
+  // stamp the record in the current slot, then move on to the next slot
+  taskInfoArray[taskInfoIndex]->endTime = BAMBOO_GET_EXE_TIME();
+  taskInfoIndex++;
+  // reaching TASKINFOLENGTH raises the overflow flag
+  if(taskInfoIndex == TASKINFOLENGTH) taskInfoOverflow = true;
+}
+
+// Output the profiling data: with USEIO a per-task listing followed by summary
+// totals through printf, otherwise the raw records through BAMBOO_DEBUGPRINT.
+void outputProfileData() {
+#ifdef USEIO
+  int i;
+  unsigned long long totaltasktime = 0;
+  unsigned long long preprocessingtime = 0;
+  unsigned long long objqueuecheckingtime = 0;
+  unsigned long long postprocessingtime = 0;
+  unsigned long long other = 0;
+  unsigned long long averagetasktime = 0;
+  int tasknum = 0;
+
+  printf("Task Name, Start Time, End Time, Duration, Exit Index(, NewObj Name, Num)+\n");
+  // output task related info
+  for(i = 0; i < taskInfoIndex; i++) {
+    TaskInfo* tmpTInfo = taskInfoArray[i];
+    unsigned long long duration = tmpTInfo->endTime - tmpTInfo->startTime;
+    printf("%s, %lld, %lld, %lld, %lld",
+           tmpTInfo->taskName, tmpTInfo->startTime, tmpTInfo->endTime,
+           duration, tmpTInfo->exitIndex);
+    // summarize new obj info
+    if(tmpTInfo->newObjs != NULL) {
+      struct RuntimeHash * nobjtbl = allocateRuntimeHash(5);  // NOTE(review): never released here
+      struct RuntimeIterator * iter = NULL;
+      while(0 == isEmpty(tmpTInfo->newObjs)) {
+        char * objtype = (char *)(getItem(tmpTInfo->newObjs));
+        if(RuntimeHashcontainskey(nobjtbl, (int)(objtype))) {
+          int num = 0;
+          RuntimeHashget(nobjtbl, (int)objtype, &num);
+          RuntimeHashremovekey(nobjtbl, (int)objtype);
+          num++;
+          RuntimeHashadd(nobjtbl, (int)objtype, num);
+        } else {
+          RuntimeHashadd(nobjtbl, (int)objtype, 1);
+        }
+      }
+
+      // output all new obj info
+      iter = RuntimeHashcreateiterator(nobjtbl);
+      while(RunhasNext(iter)) {
+        char * objtype = (char *)Runkey(iter);
+        int num = Runnext(iter);
+        printf(", %s, %d", objtype, num);
+      }
+    }
+    printf("\n");
+    if(strcmp(tmpTInfo->taskName, "tpd checking") == 0) {
+      preprocessingtime += duration;
+    } else if(strcmp(tmpTInfo->taskName, "post task execution") == 0) {
+      postprocessingtime += duration;
+    } else if(strcmp(tmpTInfo->taskName, "objqueue checking") == 0) {
+      objqueuecheckingtime += duration;
+    } else {
+      totaltasktime += duration;
+      averagetasktime += duration;
+      tasknum++;
+    }
+  }
+
+  if(taskInfoOverflow) {
+    printf("Caution: task info overflow!\n");
+  }
+
+  other = totalexetime-totaltasktime-preprocessingtime-postprocessingtime;  // NOTE(review): objqueuecheckingtime is not subtracted -- confirm
+  // no ordinary task may have been recorded: do not divide by zero
+  if(tasknum > 0) {
+    averagetasktime /= tasknum;
+  }
+
+  printf("\nTotal time: %lld\n", totalexetime);
+  printf("Total task execution time: %lld (%d%%)\n", totaltasktime,
+         (int)(((double)totaltasktime/(double)totalexetime)*100));
+  printf("Total objqueue checking time: %lld (%d%%)\n",
+         objqueuecheckingtime,
+         (int)(((double)objqueuecheckingtime/(double)totalexetime)*100));
+  printf("Total pre-processing time: %lld (%d%%)\n", preprocessingtime,
+         (int)(((double)preprocessingtime/(double)totalexetime)*100));
+  printf("Total post-processing time: %lld (%d%%)\n", postprocessingtime,
+         (int)(((double)postprocessingtime/(double)totalexetime)*100));
+  printf("Other time: %lld (%d%%)\n", other,
+         (int)(((double)other/(double)totalexetime)*100));
+
+  printf("\nAverage task execution time: %lld\n", averagetasktime);
+
+#else
+  int i = 0;
+  int j = 0;
+
+  BAMBOO_DEBUGPRINT(0xdddd);
+  // output task related info
+  for(i= 0; i < taskInfoIndex; i++) {
+    TaskInfo* tmpTInfo = taskInfoArray[i];
+    char* tmpName = tmpTInfo->taskName;
+    int nameLen = strlen(tmpName);
+    BAMBOO_DEBUGPRINT(0xddda);
+    for(j = 0; j < nameLen; j++) {
+      BAMBOO_DEBUGPRINT_REG(tmpName[j]);
+    }
+    BAMBOO_DEBUGPRINT(0xdddb);
+    BAMBOO_DEBUGPRINT_REG(tmpTInfo->startTime);
+    BAMBOO_DEBUGPRINT_REG(tmpTInfo->endTime);
+    BAMBOO_DEBUGPRINT_REG(tmpTInfo->exitIndex);
+    if(tmpTInfo->newObjs != NULL) {
+      struct RuntimeHash * nobjtbl = allocateRuntimeHash(5);  // NOTE(review): never released here
+      struct RuntimeIterator * iter = NULL;
+      while(0 == isEmpty(tmpTInfo->newObjs)) {
+        char * objtype = (char *)(getItem(tmpTInfo->newObjs));
+        if(RuntimeHashcontainskey(nobjtbl, (int)(objtype))) {
+          int num = 0;
+          RuntimeHashget(nobjtbl, (int)objtype, &num);
+          RuntimeHashremovekey(nobjtbl, (int)objtype);
+          num++;
+          RuntimeHashadd(nobjtbl, (int)objtype, num);
+        } else {
+          RuntimeHashadd(nobjtbl, (int)objtype, 1);
+        }
+      }
+
+      // output all new obj info
+      iter = RuntimeHashcreateiterator(nobjtbl);
+      while(RunhasNext(iter)) {
+        char * objtype = (char *)Runkey(iter);
+        int num = Runnext(iter);
+        int nameLen = strlen(objtype);
+        BAMBOO_DEBUGPRINT(0xddda);
+        for(j = 0; j < nameLen; j++) {
+          BAMBOO_DEBUGPRINT_REG(objtype[j]);
+        }
+        BAMBOO_DEBUGPRINT(0xdddb);
+        BAMBOO_DEBUGPRINT_REG(num);
+      }
+    }
+    BAMBOO_DEBUGPRINT(0xdddc);
+  }
+
+  if(taskInfoOverflow) {
+    BAMBOO_DEBUGPRINT(0xefee);
+  }
+
+#ifdef PROFILE_INTERRUPT
+  // output interrupt related info
+  for(i = 0; i < interruptInfoIndex; i++) {
+    InterruptInfo* tmpIInfo = interruptInfoArray[i];
+    BAMBOO_DEBUGPRINT(0xddde);
+    BAMBOO_DEBUGPRINT_REG(tmpIInfo->startTime);
+    BAMBOO_DEBUGPRINT_REG(tmpIInfo->endTime);
+    BAMBOO_DEBUGPRINT(0xdddf);
+  }
+
+  if(interruptInfoOverflow) {
+    BAMBOO_DEBUGPRINT(0xefef);
+  }
+#endif // PROFILE_INTERRUPT
+
+  BAMBOO_DEBUGPRINT(0xeeee);
+#endif
+}
+#endif  // #ifdef PROFILE
+
+#endif
diff --git a/Robust/src/Runtime/multicoregarbage.c b/Robust/src/Runtime/multicoregarbage.c
deleted file mode 100644
index 8ecfb6ee..00000000
--- a/Robust/src/Runtime/multicoregarbage.c
+++ /dev/null
@@ -1,3735 +0,0 @@
-#ifdef MULTICORE_GC
-#include "runtime.h"
-#include "multicoregarbage.h"
-#include "multicoreruntime.h"
-#include "runtime_arch.h"
-#include "SimpleHash.h"
-#include "GenericHashtable.h"
-#include "ObjectHash.h"
-#include "GCSharedHash.h"
-
-// TODO for profiling the flush phase
-#ifdef GC_PROFILE
-/*int num_mapinforequest;
-int num_markrequest;
-unsigned long long marktime;*/
-#endif
-
-extern int corenum;
-extern struct parameterwrapper ** objectqueues[][NUMCLASSES];
-extern int numqueues[][NUMCLASSES];
-
-extern struct genhashtable * activetasks;
-extern struct parameterwrapper ** objectqueues[][NUMCLASSES];
-extern struct taskparamdescriptor *currtpd;
-
-extern struct LockValue runtime_locks[MAXTASKPARAMS];
-extern int runtime_locklen;
-
-#ifdef SMEMM
-extern unsigned int gcmem_mixed_threshold;
-extern unsigned int gcmem_mixed_usedmem;
-#endif
-
-struct pointerblock {
-  void * ptrs[NUMPTRS];   // slots filled by gc_enqueue_I, one queued pointer each
-  struct pointerblock *next;   // block chained after this one once it is full
-};
-
-struct pointerblock *gchead=NULL;   // block gc_enqueue_I currently writes into
-int gcheadindex=0;                  // next free slot in gchead
-struct pointerblock *gctail=NULL;   // block gc_dequeue_I currently reads from
-int gctailindex=0;                  // next slot gc_dequeue_I returns
-struct pointerblock *gctail2=NULL;  // read block of the non-destructive gc_dequeue2_I
-int gctailindex2=0;                 // next slot gc_dequeue2_I returns
-struct pointerblock *gcspare=NULL;  // at most one drained block kept for reuse
-
-#define NUMLOBJPTRS 20
-
-struct lobjpointerblock {
-  void * lobjs[NUMLOBJPTRS];   // start addresses of the queued large objects
-  //void * dsts[NUMLOBJPTRS];
-  int lengths[NUMLOBJPTRS];    // length recorded for each entry
-  //void * origs[NUMLOBJPTRS];
-  int hosts[NUMLOBJPTRS];      // host value recorded for each entry
-  struct lobjpointerblock *next;   // block filled after this one
-  struct lobjpointerblock *prev;   // block filled before this one
-};
-
-struct lobjpointerblock *gclobjhead=NULL;   // block gc_lobjenqueue_I writes into
-int gclobjheadindex=0;                      // next free slot in gclobjhead
-struct lobjpointerblock *gclobjtail=NULL;   // block gc_lobjdequeue_I reads from
-int gclobjtailindex=0;                      // next slot gc_lobjdequeue_I returns
-struct lobjpointerblock *gclobjtail2=NULL;  // block of the secondary, non-destructive cursor
-int gclobjtailindex2=0;                     // slot of the secondary cursor
-struct lobjpointerblock *gclobjspare=NULL;  // at most one drained block kept for reuse
-
-#ifdef GC_DEBUG
-// Debug helper: print the shared memory 16 words per line, first the reserved area below gcbaseva, then the heap with a header line per sblock.
-inline void dumpSMem() {
-  int block = 0;
-  int sblock = 0;
-  int j = 0;
-  int i = 0;
-  int coren = 0;
-  int x = 0;
-  int y = 0;
-  printf("(%x,%x) Dump shared mem: \n", udn_tile_coord_x(), 
-	     udn_tile_coord_y());
-  // reserved blocks for sblocktbl
-  printf("(%x,%x) ++++ reserved sblocks ++++ \n", udn_tile_coord_x(), 
-	     udn_tile_coord_y());
-  for(i=BAMBOO_BASE_VA; i<gcbaseva; i+= 4*16) {
-    printf("(%x,%x) 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x \n",
-		   udn_tile_coord_x(), udn_tile_coord_y(),
-           *((int *)(i)), *((int *)(i + 4)),
-           *((int *)(i + 4*2)), *((int *)(i + 4*3)),
-           *((int *)(i + 4*4)), *((int *)(i + 4*5)),
-           *((int *)(i + 4*6)), *((int *)(i + 4*7)),
-           *((int *)(i + 4*8)), *((int *)(i + 4*9)),
-           *((int *)(i + 4*10)), *((int *)(i + 4*11)),
-           *((int *)(i + 4*12)), *((int *)(i + 4*13)),
-           *((int *)(i + 4*14)), *((int *)(i + 4*15)));
-  }
-  sblock = gcreservedsb;
-  bool advanceblock = false;
-  // remaining memory
-  for(i=gcbaseva; i<gcbaseva+BAMBOO_SHARED_MEM_SIZE; i+=4*16) {
-    advanceblock = false;
-    // computing sblock # and block #, core coordinate (x,y) also
-    if(j%((BAMBOO_SMEM_SIZE)/(4*16)) == 0) {
-      // finished a sblock
-      if(j < ((BAMBOO_LARGE_SMEM_BOUND)/(4*16))) {
-		if((j > 0) && (j%((BAMBOO_SMEM_SIZE_L)/(4*16)) == 0)) {
-		  // finished a block
-		  block++;
-		  advanceblock = true;
-		}
-      } else {
-		// finished a block
-		block++;
-		advanceblock = true;
-      }
-      // compute core #
-      if(advanceblock) {
-		coren = gc_block2core[block%(NUMCORES4GC*2)];
-      }
-      // compute core coordinate
-      BAMBOO_COORDS(coren, &x, &y);
-      // sblock is read twice here, so it is incremented in a separate statement
-      printf("(%x,%x) ==== %d, %d : core (%d,%d), saddr %x====\n",
-		     udn_tile_coord_x(), udn_tile_coord_y(),
-             block, sblock, x, y, sblock*(BAMBOO_SMEM_SIZE)+gcbaseva);
-      sblock++;
-    }
-    j++;
-    printf("(%x,%x) 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x \n",
-		   udn_tile_coord_x(), udn_tile_coord_y(),
-           *((int *)(i)), *((int *)(i + 4)),
-           *((int *)(i + 4*2)), *((int *)(i + 4*3)),
-           *((int *)(i + 4*4)), *((int *)(i + 4*5)),
-           *((int *)(i + 4*6)), *((int *)(i + 4*7)),
-           *((int *)(i + 4*8)), *((int *)(i + 4*9)),
-           *((int *)(i + 4*10)), *((int *)(i + 4*11)),
-           *((int *)(i + 4*12)), *((int *)(i + 4*13)),
-           *((int *)(i + 4*14)), *((int *)(i + 4*15)));
-  }
-  printf("(%x,%x) \n", udn_tile_coord_x(), udn_tile_coord_y());
-}
-#endif
-
-// Append ptr to the pointer queue; must be invoked with interruption closed.
-inline void gc_enqueue_I(void *ptr) {
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xe601);
-  BAMBOO_DEBUGPRINT_REG(ptr);
-#endif
-  if (NUMPTRS == gcheadindex) {
-    // head block is full: chain the cached spare, or a fresh allocation
-    struct pointerblock * fresh = gcspare;
-    if (fresh == NULL) {
-      fresh = RUNMALLOC_I(sizeof(struct pointerblock));
-    } else {
-      gcspare = NULL;
-    }
-    gchead->next = fresh;
-    gchead = fresh;
-    gcheadindex = 0;
-  }
-  gchead->ptrs[gcheadindex++] = ptr;
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xe602);
-#endif
-} // void gc_enqueue_I(void *ptr)
-
-// Pop the next pointer; a drained block is kept as the spare or freed.
-inline void * gc_dequeue_I() {
-  if (NUMPTRS == gctailindex) {
-    struct pointerblock *drained = gctail;
-    gctail = drained->next;
-    gctailindex = 0;
-    if (gcspare == NULL) {
-      gcspare = drained;
-    } else {
-      RUNFREE(drained);
-    }
-  }
-  return gctail->ptrs[gctailindex++];
-} // void * gc_dequeue_I()
-
-// Non-destructive dequeue: advance the secondary cursor (gctail2,
-// gctailindex2) and return the item; blocks left behind are not released.
-inline void * gc_dequeue2_I() {
-  if (gctailindex2==NUMPTRS) {
-    gctail2=gctail2->next;
-    gctailindex2=0;
-  } // if (gctailindex2==NUMPTRS)
-  return gctail2->ptrs[gctailindex2++];
-} // void * gc_dequeue2_I()
-
-inline int gc_moreItems_I() {
-  // items remain until the read cursor (gctail, gctailindex) has caught up
-  // with the write cursor (gchead, gcheadindex)
-  return (gchead!=gctail)||(gctailindex!=gcheadindex);
-} // int gc_moreItems_I()
-
-inline int gc_moreItems2_I() {
-  // items remain until the secondary cursor (gctail2, gctailindex2) has
-  // caught up with the write cursor (gchead, gcheadindex)
-  return (gchead!=gctail2)||(gctailindex2!=gcheadindex);
-} // int gc_moreItems2_I()
-
-// Record a large object (start address, length, host) in the large object
-// queue. Must be invoked with interruption closed.
-inline void gc_lobjenqueue_I(void *ptr,
-                             int length,
-                             int host) {
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xe901);
-#endif
-  if (NUMLOBJPTRS == gclobjheadindex) {
-    struct lobjpointerblock * fresh = gclobjspare;
-    if (fresh == NULL) {
-      fresh = RUNMALLOC_I(sizeof(struct lobjpointerblock));
-    } else {
-      gclobjspare = NULL;
-    }
-    fresh->prev = gclobjhead;
-    gclobjhead->next = fresh;
-    gclobjhead = fresh;
-    gclobjheadindex = 0;
-  }
-  int slot = gclobjheadindex++;
-  gclobjhead->lobjs[slot] = ptr;
-  gclobjhead->lengths[slot] = length;
-  gclobjhead->hosts[slot] = host;
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT_REG(gclobjhead->lobjs[gclobjheadindex-1]);
-  BAMBOO_DEBUGPRINT_REG(gclobjhead->lengths[gclobjheadindex-1]);
-  BAMBOO_DEBUGPRINT_REG(gclobjhead->hosts[gclobjheadindex-1]);
-#endif
-} // void gc_lobjenqueue_I(void *ptr...)
-
-// Destructive dequeue of a large object; length and host are optional outputs.
-inline void * gc_lobjdequeue_I(int * length,
-                               int * host) {
-  if (NUMLOBJPTRS == gclobjtailindex) {
-    struct lobjpointerblock *drained = gclobjtail;
-    gclobjtail = drained->next;
-    gclobjtail->prev = NULL;
-    gclobjtailindex = 0;
-    if (gclobjspare == NULL) {
-      drained->next = NULL;
-      drained->prev = NULL;
-      gclobjspare = drained;
-    } else {
-      RUNFREE(drained);
-    }
-  }
-  if (length != NULL) {
-    *length = gclobjtail->lengths[gclobjtailindex];
-  }
-  if (host != NULL) {
-    *host = (int)(gclobjtail->hosts[gclobjtailindex]);
-  }
-  return gclobjtail->lobjs[gclobjtailindex++];
-} // void * gc_lobjdequeue_I()
-
-inline int gc_lobjmoreItems_I() {
-  // entries remain until the read cursor (gclobjtail, gclobjtailindex) has
-  // caught up with the write cursor (gclobjhead, gclobjheadindex)
-  return (gclobjhead!=gclobjtail)||(gclobjtailindex!=gclobjheadindex);
-} // int gc_lobjmoreItems_I()
-
-// Advance the secondary cursor one entry (now at gclobjtailindex2-1); frees nothing.
-inline void gc_lobjdequeue2_I() {
-  if (gclobjtailindex2 != NUMLOBJPTRS) {
-    gclobjtailindex2++;
-    return;
-  }
-  gclobjtail2 = gclobjtail2->next;
-  gclobjtailindex2 = 1;
-} // void gc_lobjdequeue2_I()
-
-inline int gc_lobjmoreItems2_I() {
-  // entries remain until the secondary cursor (gclobjtail2, gclobjtailindex2)
-  // has caught up with the write cursor (gclobjhead, gclobjheadindex)
-  return (gclobjhead!=gclobjtail2)||(gclobjtailindex2!=gclobjheadindex);
-} // int gc_lobjmoreItems2_I()
-
-// Step the secondary cursor backward by one entry; frees nothing.
-inline void gc_lobjdequeue3_I() {
-  if (gclobjtailindex2 != 0) {
-    gclobjtailindex2--;
-    return;
-  }
-  gclobjtail2 = gclobjtail2->prev;
-  gclobjtailindex2 = NUMLOBJPTRS-1;
-} // void gc_lobjdequeue3_I()
-
-inline int gc_lobjmoreItems3_I() {
-  // walking backward: entries remain until the secondary cursor is back at
-  // the primary read cursor (gclobjtail, gclobjtailindex)
-  return (gclobjtail!=gclobjtail2)||(gclobjtailindex2!=gclobjtailindex);
-} // int gc_lobjmoreItems3_I()
-
-inline void gc_lobjqueueinit4_I() {
-  gclobjtailindex2 = gclobjtailindex;  // secondary cursor takes the same slot ...
-  gclobjtail2 = gclobjtail;            // ... and the same block as the primary one
-} // void gc_lobjqueueinit4_I()
-
-inline void * gc_lobjdequeue4_I(int * length,
-                                int * host) {
-  // non-destructive read through the secondary cursor; length and host are
-  // optional outputs
-  if (NUMLOBJPTRS == gclobjtailindex2) {
-    gclobjtail2 = gclobjtail2->next;
-    gclobjtailindex2 = 0;
-  }
-  const int slot = gclobjtailindex2;
-  if (length != NULL) { *length = gclobjtail2->lengths[slot]; }
-  if (host != NULL) { *host = (int)(gclobjtail2->hosts[slot]); }
-  gclobjtailindex2 = slot + 1;
-  return gclobjtail2->lobjs[slot];
-} // void * gc_lobjdequeue4_I()
-
-inline int gc_lobjmoreItems4_I() {
-  // entries remain until the secondary cursor (gclobjtail2, gclobjtailindex2)
-  // has caught up with the write cursor (gclobjhead, gclobjheadindex)
-  return (gclobjhead!=gclobjtail2)||(gclobjtailindex2!=gclobjheadindex);
-} // int gc_lobjmoreItems4_I()
-
-INTPTR gccurr_heapbound = 0;
-
-// Look up the type id and byte size of the object at ptr; both values are
-// handed back through ttype and tsize.
-inline void gettype_size(void * ptr,
-                         int * ttype,
-                         int * tsize) {
-  const int tid = *((int *)ptr);
-  int bytes = 0;
-  if(tid >= NUMCLASSES) {
-    // an array: header plus ___length___ elements of classsize[tid] bytes
-    const int count = ((struct ArrayObject *)ptr)->___length___;
-    const int elem = classsize[tid];
-    bytes = sizeof(struct ArrayObject)+count*elem;
-  } else {
-    bytes = classsize[tid];
-  }
-  *ttype = tid;
-  *tsize = bytes;
-} // void gettype_size(void * ptr, int * ttype, int * tsize)
-
-inline bool isLarge(void * ptr,
-                    int * ttype,
-                    int * tsize) {
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xe701);
-  BAMBOO_DEBUGPRINT_REG(ptr);
-#endif
-  // Report ptr as a large object when it starts exactly on a block boundary or does not fit in the rest of its block; type and size come back via ttype/tsize.
-  gettype_size(ptr, ttype, tsize);
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(*tsize);
-#endif
-  int bound = (BAMBOO_SMEM_SIZE);   // block size used for the boundary checks below
-  if(((int)ptr-gcbaseva) < (BAMBOO_LARGE_SMEM_BOUND)) {
-    bound = (BAMBOO_SMEM_SIZE_L);   // offsets below BAMBOO_LARGE_SMEM_BOUND use the _L block size
-  }
-  if((((int)ptr-gcbaseva)%(bound))==0) {
-    // ptr is the start of a block
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xe702);
-    BAMBOO_DEBUGPRINT(1);
-#endif
-    return true;
-  }
-  if((bound-(((int)ptr-gcbaseva)%bound)) < (*tsize)) {
-    // the object extends across the boundary of the current block
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xe703);
-    BAMBOO_DEBUGPRINT(1);
-#endif
-    return true;
-  }
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0);
-#endif
-  return false;
-} // bool isLarge(void * ptr, int * ttype, int * tsize)
-
-inline int hostcore(void * ptr) {
-  // RESIDECORE stores the host core of ptr through its second argument
-  int resident = 0;
-  RESIDECORE(ptr, &resident);
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xedd0);
-  BAMBOO_DEBUGPRINT_REG(ptr);
-  BAMBOO_DEBUGPRINT_REG(resident);
-#endif
-  return resident;
-} // int hostcore(void * ptr)
-
-inline bool isLocal(void * ptr) {
-  // true when the host core of ptr (see hostcore) is this core
-  return BAMBOO_NUM_OF_CORE == hostcore(ptr);
-} // bool isLocal(void * ptr)
-
-// Returns true when gccorestatus[] is 0 for every core index below
-// NUMCORES4GC, i.e. no GC core is still marked as busy.
-inline bool gc_checkCoreStatus_I() {
-  int idx = 0;
-  // stop at the first core whose status is non-zero
-  while((idx < NUMCORES4GC) && (gccorestatus[idx] == 0)) {
-    ++idx;
-  }
-  return idx >= NUMCORES4GC;
-} // bool gc_checkCoreStatus_I()
-
-// Returns true when gccorestatus[] is 0 for every core index below
-// NUMCORESACTIVE, i.e. no active core is still marked as busy.
-inline bool gc_checkAllCoreStatus_I() {
-  int idx = 0;
-  // stop at the first core whose status is non-zero
-  while((idx < NUMCORESACTIVE) && (gccorestatus[idx] == 0)) {
-    ++idx;
-  }
-  return idx >= NUMCORESACTIVE;
-} // bool gc_checkAllCoreStatus_I()
-
-inline void checkMarkStatue() {   // two-phase check of whether all cores have finished marking
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xee01);
-#endif
-  int i;
-  if((!waitconfirm) ||
-     (waitconfirm && (numconfirm == 0))) {
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xee02);
-#endif
-	int entry_index = 0;   // row of gcnumsendobjs/gcnumreceiveobjs that receives this core's counts
-	if(waitconfirm) {
-	  // phase 2: record into the row other than gcnumsrobjs_index
-	  entry_index = (gcnumsrobjs_index == 0) ? 1 : 0;
-	} else {
-	  // phase 1: record into the current row
-	  entry_index = gcnumsrobjs_index;
-	}
-    BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
-    gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
-    gcnumsendobjs[entry_index][BAMBOO_NUM_OF_CORE] = gcself_numsendobjs;
-    gcnumreceiveobjs[entry_index][BAMBOO_NUM_OF_CORE] = gcself_numreceiveobjs;
-    // check the status of all cores
-    bool allStall = gc_checkAllCoreStatus_I();
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xee03);
-#endif
-    if(allStall) {
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT(0xee04);
-#endif
-      // ask for confirm
-      if(!waitconfirm) {
-#ifdef DEBUG
-		BAMBOO_DEBUGPRINT(0xee05);
-#endif
-		// the first time found all cores stall
-		// send out status confirm msg to all other cores
-		// reset the corestatus array too
-		gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
-		waitconfirm = true;
-		numconfirm = NUMCORESACTIVE - 1;
-		BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-		for(i = 1; i < NUMCORESACTIVE; ++i) {
-		  gccorestatus[i] = 1;
-		  // send mark phase finish confirm request msg to core i
-		  send_msg_1(i, GCMARKCONFIRM, false);
-		}  // for(i = 1; i < NUMCORESACTIVE; ++i)
-      } else {
-		// Phase 2
-		// check if the sum of send objs and receive obj are the same
-		// yes->check if the info is the latest; no->go on executing
-		int sumsendobj = 0;
-		for(i = 0; i < NUMCORESACTIVE; ++i) {
-		  sumsendobj += gcnumsendobjs[gcnumsrobjs_index][i];
-		}  // for(i = 0; i < NUMCORESACTIVE; ++i)
-#ifdef DEBUG
-		BAMBOO_DEBUGPRINT(0xee06);
-		BAMBOO_DEBUGPRINT_REG(sumsendobj);
-#endif
-		for(i = 0; i < NUMCORESACTIVE; ++i) {
-		  sumsendobj -= gcnumreceiveobjs[gcnumsrobjs_index][i];
-		}  // for(i = 0; i < NUMCORESACTIVE; ++i)
-#ifdef DEBUG
-		BAMBOO_DEBUGPRINT(0xee07);
-		BAMBOO_DEBUGPRINT_REG(sumsendobj);
-#endif
-		if(0 == sumsendobj) {
-		  // Check if there are changes of the numsendobjs or numreceiveobjs on
-		  // each core
-		  bool ischanged = false;
-		  for(i = 0; i < NUMCORESACTIVE; ++i) {
-			if((gcnumsendobjs[0][i] != gcnumsendobjs[1][i]) || 
-				(gcnumreceiveobjs[0][i] != gcnumreceiveobjs[1][i]) ) {
-			  ischanged = true;
-			  break;
-			}
-		  }  // for(i = 0; i < NUMCORESACTIVE; ++i)
-#ifdef DEBUG
-		  BAMBOO_DEBUGPRINT(0xee08);
-		  BAMBOO_DEBUGPRINT_REG(ischanged);
-#endif
-		  if(!ischanged) {
-#ifdef DEBUG
-			BAMBOO_DEBUGPRINT(0xee09);
-#endif
-			// all the core status info are the latest
-			// stop mark phase
-			gcphase = COMPACTPHASE;
-			// restore the gcstatus for all cores
-			for(i = 0; i < NUMCORESACTIVE; ++i) {
-			  gccorestatus[i] = 1;
-			}  // for(i = 0; i < NUMCORESACTIVE; ++i)
-		  } else {
-			waitconfirm = false;
-			gcnumsrobjs_index = (gcnumsrobjs_index == 0) ? 1 : 0;
-		  } // if(!ischanged)
-		} else {
-		  // There were changes between phase 1 and phase 2, can not decide 
-		  // whether the mark phase has been finished
-		  waitconfirm = false;
-		  // As it fails in phase 2, flip the entries
-		  gcnumsrobjs_index = (gcnumsrobjs_index == 0) ? 1 : 0;
-		} // if(0 == sumsendobj) else ...
-		BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-      } // if(!waitconfirm) else ...
-    } else {
-      BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-    } // if(allStall)
-  }  // if((!waitconfirm)...
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xee0a);
-#endif
-} // void checkMarkStatue()
-/*
-inline bool preGC() {
-  // preparation for gc
-  // make sure to clear all incoming msgs, especially transfer obj msgs
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xec01);
-#endif
-  int i;
-  if((!waitconfirm) ||
-     (waitconfirm && (numconfirm == 0))) {
-    // send out status confirm msgs to all cores to check if there are
-    // transfer obj msgs on-the-fly
-    waitconfirm = true;
-    numconfirm = NUMCORESACTIVE - 1;
-    for(i = 1; i < NUMCORESACTIVE; ++i) {
-      corestatus[i] = 1;
-      // send status confirm msg to core i
-      send_msg_1(i, STATUSCONFIRM, false);
-    }   // for(i = 1; i < NUMCORESACTIVE; ++i)
-
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xec02);
-#endif
-    while(true) {
-      if(numconfirm == 0) {
-		break;
-      }
-    }   // wait for confirmations
-    waitconfirm = false;
-    numconfirm = 0;
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xec03);
-#endif
-    numsendobjs[BAMBOO_NUM_OF_CORE] = self_numsendobjs;
-    numreceiveobjs[BAMBOO_NUM_OF_CORE] = self_numreceiveobjs;
-    int sumsendobj = 0;
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xec04);
-#endif
-    for(i = 0; i < NUMCORESACTIVE; ++i) {
-      sumsendobj += numsendobjs[i];
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT(0xf000 + numsendobjs[i]);
-#endif
-    }             // for(i = 1; i < NUMCORESACTIVE; ++i)
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xec05);
-    BAMBOO_DEBUGPRINT_REG(sumsendobj);
-#endif
-    for(i = 0; i < NUMCORESACTIVE; ++i) {
-      sumsendobj -= numreceiveobjs[i];
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT(0xf000 + numreceiveobjs[i]);
-#endif
-    }             // for(i = 1; i < NUMCORESACTIVE; ++i)
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xec06);
-    BAMBOO_DEBUGPRINT_REG(sumsendobj);
-#endif
-    if(0 == sumsendobj) {
-      return true;
-    } else {
-      // still have some transfer obj msgs on-the-fly, can not start gc
-      return false;
-    }  // if(0 == sumsendobj)
-  } else {
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xec07);
-#endif
-    // previously asked for status confirmation and do not have all the
-    // confirmations yet, can not start gc
-    return false;
-  }       // if((!waitconfirm) ||
-} // bool preGC()*/
-
-inline void initGC() {   // reset GC bookkeeping, the two work queues and the mapping tables
-  int i;
-  if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {   // the per-core arrays are reset only on STARTUPCORE
-    for(i = 0; i < NUMCORES4GC; ++i) {
-      gccorestatus[i] = 1;
-      gcnumsendobjs[0][i] = gcnumsendobjs[1][i] = 0;
-      gcnumreceiveobjs[0][i] = gcnumreceiveobjs[1][i] = 0;
-      gcloads[i] = 0;
-      gcrequiredmems[i] = 0;
-      gcfilledblocks[i] = 0;
-      gcstopblock[i] = 0;
-    } // for(i = 0; i < NUMCORES4GC; ++i)
-    for(i = NUMCORES4GC; i < NUMCORESACTIVE; ++i) {
-      gccorestatus[i] = 1;
-      gcnumsendobjs[0][i] = gcnumsendobjs[1][i] = 0;
-      gcnumreceiveobjs[0][i] = gcnumreceiveobjs[1][i] = 0;
-    }
-    gcheaptop = 0;
-    gctopcore = 0;
-    gctopblock = 0;
-  } // if(STARTUPCORE == BAMBOO_NUM_OF_CORE)
-  gcself_numsendobjs = 0;
-  gcself_numreceiveobjs = 0;
-  gcmarkedptrbound = 0;
-  gcobj2map = 0;
-  gcmappedobj = 0;
-  //gcismapped = false;
-  gcnumlobjs = 0;
-  gcmovestartaddr = 0;
-  gctomove = false;
-  gcblock2fill = 0;
-  gcmovepending = 0;
-  gccurr_heaptop = 0;
-  gcdstcore = 0;
-
-  // initialize the pointer queue: allocate the first block once, afterwards restart both read cursors at the current write position
-  if (gchead==NULL) {
-    gcheadindex=gctailindex=gctailindex2 = 0;
-    gchead=gctail=gctail2=RUNMALLOC(sizeof(struct pointerblock));
-  } else {
-    gctailindex = gctailindex2 = gcheadindex;
-    gctail = gctail2 = gchead;
-  }
-
-  // initialize the large obj queue: allocate the first block once, afterwards rewind all cursors to slot 0 of the head block
-  if (gclobjhead==NULL) {
-    gclobjheadindex=0;
-    gclobjtailindex=0;
-    gclobjtailindex2 = 0;
-    gclobjhead=gclobjtail=gclobjtail2=
-	  RUNMALLOC(sizeof(struct lobjpointerblock));
-  } else {
-    gclobjtailindex = gclobjtailindex2 = gclobjheadindex = 0;
-    gclobjtail = gclobjtail2 = gclobjhead;
-  }
-  gclobjhead->next = gclobjhead->prev = NULL;
-
-#ifdef LOCALHASHTBL_TEST
-  freeRuntimeHash(gcpointertbl);
-  gcpointertbl = allocateRuntimeHash(20);
-#else
-  mgchashreset(gcpointertbl);
-#endif
-  //gcpointertbl = allocateMGCHash(20);
-
-  freeMGCHash(gcforwardobjtbl);
-  gcforwardobjtbl = allocateMGCHash(20, 3);
-
-  // initialize the mapping info related structures
-  if((BAMBOO_NUM_OF_CORE < NUMCORES4GC) && (gcsharedptbl != NULL)) {
-	// Never free the shared hash table, just reset it
-	/*freeGCSharedHash(gcsharedptbl);
-	gcsharedptbl = allocateGCSharedHash(20);*/
-	mgcsharedhashReset(gcsharedptbl);
-  }
-  // Zero out the remaining bamboo_cur_msp 
-  // Only zero out the first 4 bytes of the remaining memory
-  /*if((bamboo_cur_msp != 0) 
-	  && (bamboo_smem_zero_top == bamboo_cur_msp) 
-	  && (bamboo_smem_size > 0)) {
-	*((int *)bamboo_cur_msp) = 0;
-  }*/
-#ifdef GC_PROFILE
-  // TODO
-  /*num_mapinforequest = 0;
-  num_mapinforequest_i = 0;
-  flushstalltime = 0;
-  flushstalltime_i = 0;
-  num_markrequest = 0;
-  marktime = 0;*/
-  gc_num_livespace = 0;
-  gc_num_freespace = 0;
-  gc_num_lobj = 0;
-  gc_num_lobjspace = 0;
-  gc_num_liveobj = 0;
-  gc_num_forwardobj = 0;
-  gc_num_profiles = NUMCORESACTIVE - 1;
-#endif
-} // void initGC()
-
-// Sum the per-core loads, store the resulting heap top in *heaptop, record
-// the top block/core in gctopblock/gctopcore and return blocks per core.
-inline int loadbalance(int * heaptop) {
-  int i;
-
-  // get the total loads
-  int tloads = gcloads[STARTUPCORE];
-  for(i = 1; i < NUMCORES4GC; i++) {
-    tloads += gcloads[i];
-  }
-  *heaptop = gcbaseva + tloads;
-
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xdddd);
-  BAMBOO_DEBUGPRINT_REG(tloads);
-  BAMBOO_DEBUGPRINT_REG(*heaptop);
-#endif
-  int b = 0;
-  BLOCKINDEX(*heaptop, &b);
-  int numbpc = b / NUMCORES4GC;       // num of blocks per core
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT_REG(b);
-  BAMBOO_DEBUGPRINT_REG(numbpc);
-#endif
-  gctopblock = b;
-  RESIDECORE(*heaptop, &gctopcore);   // resolve the heap top address itself, not the address of the caller's variable
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT_REG(gctopcore);
-#endif
-  return numbpc;
-} // int loadbalance(int * heaptop)
-
-inline bool cacheLObjs() {
-  // Sort the queued large objs by address, then copy them to the top of the shared heap; returns false (nothing copied) when they do not fit above gcheaptop.
-  unsigned long long sumsize = 0;
-  int size = 0;
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xe801);
-#endif
-  gclobjtail2 = gclobjtail;
-  gclobjtailindex2 = gclobjtailindex;
-  int tmp_lobj = 0;
-  int tmp_len = 0;
-  int tmp_host = 0;
-  // compute total mem size required and sort the lobjs in ascending order
-  while(gc_lobjmoreItems2_I()) {
-    gc_lobjdequeue2_I();
-    tmp_lobj = gclobjtail2->lobjs[gclobjtailindex2-1];
-    tmp_host = gclobjtail2->hosts[gclobjtailindex2-1];
-    tmp_len = gclobjtail2->lengths[gclobjtailindex2 - 1];
-    sumsize += tmp_len;
-#ifdef GC_PROFILE
-	gc_num_lobj++;
-#endif
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT_REG(gclobjtail2->lobjs[gclobjtailindex2-1]);
-    BAMBOO_DEBUGPRINT_REG(tmp_len);
-    BAMBOO_DEBUGPRINT_REG(sumsize);
-#endif
-    int i = gclobjtailindex2-1;
-    struct lobjpointerblock * tmp_block = gclobjtail2;
-    // find the place to insert, shifting larger entries one slot up
-    while(true) {
-      if(i == 0) {
-		if(tmp_block->prev == NULL) {
-		  break;
-		}
-		if(tmp_block->prev->lobjs[NUMLOBJPTRS-1] > tmp_lobj) {
-		  tmp_block->lobjs[i] = tmp_block->prev->lobjs[NUMLOBJPTRS-1];
-		  tmp_block->lengths[i] = tmp_block->prev->lengths[NUMLOBJPTRS-1];
-		  tmp_block->hosts[i] = tmp_block->prev->hosts[NUMLOBJPTRS-1];
-		  tmp_block = tmp_block->prev;
-		  i = NUMLOBJPTRS-1;
-		} else {
-		  break;
-		}  // if(tmp_block->prev->lobjs[NUMLOBJPTRS-1] < tmp_lobj)
-	  } else {
-		if(tmp_block->lobjs[i-1] > tmp_lobj) {
-		  tmp_block->lobjs[i] = tmp_block->lobjs[i-1];
-		  tmp_block->lengths[i] = tmp_block->lengths[i-1];
-		  tmp_block->hosts[i] = tmp_block->hosts[i-1];
-		  i--;
-		} else {
-		  break;
-		}  // if(tmp_block->lobjs[i-1] < tmp_lobj)
-      }  // if(i ==0 ) else {}
-    }   // while(true)
-    // insert it; also needed when the scan stopped at the same index in an earlier block
-    if((tmp_block != gclobjtail2) || (i != gclobjtailindex2 - 1)) {
-      tmp_block->lobjs[i] = tmp_lobj;
-      tmp_block->lengths[i] = tmp_len;
-      tmp_block->hosts[i] = tmp_host;
-    }
-  }  // while(gc_lobjmoreItems2())
-
-#ifdef GC_PROFILE
-  gc_num_lobjspace = sumsize;
-#endif
-  // check if there are enough space to cache these large objs
-  INTPTR dst = gcbaseva + (BAMBOO_SHARED_MEM_SIZE) -sumsize;
-  if((unsigned long long)gcheaptop > (unsigned long long)dst) {
-    // do not have enough room to cache large objs
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xe802);
-    BAMBOO_DEBUGPRINT_REG(dst);
-    BAMBOO_DEBUGPRINT_REG(gcheaptop);
-	BAMBOO_DEBUGPRINT_REG(sumsize);
-#endif
-    return false;
-  }
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xe803);
-  BAMBOO_DEBUGPRINT_REG(dst);
-  BAMBOO_DEBUGPRINT_REG(gcheaptop);
-#endif
-
-  gcheaptop = dst; // Note: record the start of cached lobjs with gcheaptop
-  // cache the largeObjs to the top of the shared heap
-  // the secondary cursor is left at the end of the queue by the loop above,
-  // so walk it backward: the last (highest) entry is placed at the very top
-  dst = gcbaseva + (BAMBOO_SHARED_MEM_SIZE);
-  while(gc_lobjmoreItems3_I()) {
-    gc_lobjdequeue3_I();
-    size = gclobjtail2->lengths[gclobjtailindex2];
-    // set the mark field to COMPACTED, indicating that this obj has been
-    // moved and need to be flushed
-    ((int *)(gclobjtail2->lobjs[gclobjtailindex2]))[6] = COMPACTED;
-    dst -= size;
-    if((int)dst < (int)(gclobjtail2->lobjs[gclobjtailindex2])+size) {
-      memmove(dst, gclobjtail2->lobjs[gclobjtailindex2], size);
-    } else {
-      //BAMBOO_WRITE_HINT_CACHE(dst, size);
-      memcpy(dst, gclobjtail2->lobjs[gclobjtailindex2], size);
-    }
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0x804);
-    BAMBOO_DEBUGPRINT_REG(gclobjtail2->lobjs[gclobjtailindex2]);
-    BAMBOO_DEBUGPRINT(dst);
-    BAMBOO_DEBUGPRINT_REG(size);
-    BAMBOO_DEBUGPRINT_REG(*((int*)gclobjtail2->lobjs[gclobjtailindex2]));
-    BAMBOO_DEBUGPRINT_REG(*((int*)(dst)));
-#endif
-  }
-  return true;
-} // bool cacheLObjs()
-
-// update the bamboo_smemtbl to record current shared mem usage of core coren, whose allocation top is localtop
-void updateSmemTbl(int coren,
-                   int localtop) {
-  int ltopcore = 0;   // filled by BLOCKINDEX below; compared against block numbers (presumably the block holding localtop)
-  int bound = BAMBOO_SMEM_SIZE_L;
-  BLOCKINDEX(localtop, &ltopcore);
-  if(localtop >= (gcbaseva+(BAMBOO_LARGE_SMEM_BOUND))) {
-    bound = BAMBOO_SMEM_SIZE;
-  }
-  int load = (localtop-gcbaseva)%bound;   // offset of localtop within a block of size bound
-  int i = 0;   // selects one of the two gc_core2block entries of this core
-  int j = 0;   // round counter; each round adds NUMCORES4GC*2 to the block number
-  int toset = 0;
-  do {
-    toset = gc_core2block[2*coren+i]+(NUMCORES4GC*2)*j;
-    if(toset < ltopcore) {
-      bamboo_smemtbl[toset]=
-        (toset<NUMCORES4GC) ? BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;   // blocks below the top one are recorded as full
-#ifdef SMEMM
-	  gcmem_mixed_usedmem += bamboo_smemtbl[toset];
-#endif
-    } else if(toset == ltopcore) {
-      bamboo_smemtbl[toset] = load;   // the top block records only the used part
-#ifdef SMEMM
-	  gcmem_mixed_usedmem += bamboo_smemtbl[toset];
-#endif
-      break;
-    } else {
-      break;
-    }
-    i++;
-    if(i == 2) {
-      i = 0;
-      j++;
-    }
-  } while(true);
-} // void updateSmemTbl(int, int)
-
-inline void moveLObjs() {
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xea01);
-#endif
-#ifdef SMEMM
-  // update the gcmem_mixed_usedmem
-  gcmem_mixed_usedmem = 0;
-#endif
-  // zero out the smemtbl
-  BAMBOO_MEMSET_WH(bamboo_smemtbl, 0, sizeof(int)*gcnumblock);
-  // find current heap top
-  // flush all gcloads to indicate the real heap top on one core
-  // previous it represents the next available ptr on a core
-  if((gcloads[0] > (gcbaseva+(BAMBOO_SMEM_SIZE_L)))
-     && ((gcloads[0]%(BAMBOO_SMEM_SIZE)) == 0)) {
-    // edge of a block, check if this is exactly the heaptop
-    BASEPTR(0, gcfilledblocks[0]-1, &(gcloads[0]));
-    gcloads[0]+=(gcfilledblocks[0]>1 ?
-                 (BAMBOO_SMEM_SIZE) : (BAMBOO_SMEM_SIZE_L));
-  }
-  updateSmemTbl(0, gcloads[0]);
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xea02);
-  BAMBOO_DEBUGPRINT_REG(gcloads[0]);
-  BAMBOO_DEBUGPRINT_REG(bamboo_smemtbl[0]);
-#endif
-  for(int i = 1; i < NUMCORES4GC; i++) {
-    int tmptop = 0;
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xf000+i);
-    BAMBOO_DEBUGPRINT_REG(gcloads[i]);
-    BAMBOO_DEBUGPRINT_REG(gcfilledblocks[i]);
-#endif
-    if((gcfilledblocks[i] > 0)
-       && ((gcloads[i] % (BAMBOO_SMEM_SIZE)) == 0)) {
-      // edge of a block, check if this is exactly the heaptop
-      BASEPTR(i, gcfilledblocks[i]-1, &gcloads[i]);
-      gcloads[i] += 
-		(gcfilledblocks[i]>1 ? (BAMBOO_SMEM_SIZE) : (BAMBOO_SMEM_SIZE_L));
-      tmptop = gcloads[i];
-    }
-    updateSmemTbl(i, gcloads[i]);
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT_REG(gcloads[i]);
-#endif
-  } // for(int i = 1; i < NUMCORES4GC; i++) {
-
-  // find current heap top
-  // TODO
-  // a bug here: when using local allocation, directly move large objects
-  // to the highest free chunk might not be memory efficient
-  int tmpheaptop = 0;
-  int size = 0;
-  int bound = 0;
-  int i = 0;
-  for(i = gcnumblock-1; i >= 0; i--) {
-    if(bamboo_smemtbl[i] > 0) {
-      break;
-    }
-  }
-  if(i == -1) {
-    tmpheaptop = gcbaseva;
-  } else {
-    tmpheaptop = gcbaseva+bamboo_smemtbl[i]+((i<NUMCORES4GC) ?
-		(BAMBOO_SMEM_SIZE_L*i) :
-        (BAMBOO_SMEM_SIZE*(i-NUMCORES4GC)+BAMBOO_LARGE_SMEM_BOUND));
-  }
-
-  // move large objs from gcheaptop to tmpheaptop
-  // write the header first
-  unsigned int tomove = gcbaseva + (BAMBOO_SHARED_MEM_SIZE) -gcheaptop;
-#ifdef SMEMM
-  gcmem_mixed_usedmem += tomove;
-#endif
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xea03);
-  BAMBOO_DEBUGPRINT_REG(tomove);
-  BAMBOO_DEBUGPRINT_REG(tmpheaptop);
-  BAMBOO_DEBUGPRINT_REG(gcheaptop);
-#endif
-  // flush the sbstartbl
-  BAMBOO_MEMSET_WH(&(gcsbstarttbl[gcreservedsb]), '\0',
-	  (BAMBOO_SHARED_MEM_SIZE/BAMBOO_SMEM_SIZE-gcreservedsb)*sizeof(INTPTR));
-  if(tomove == 0) {
-    gcheaptop = tmpheaptop;
-  } else {
-    // check how many blocks it acrosses
-    int remain = tmpheaptop-gcbaseva;
-    int sb = remain/(BAMBOO_SMEM_SIZE) + gcreservedsb;//number of the sblock
-    int b = 0;  // number of the block
-    BLOCKINDEX(tmpheaptop, &b);
-    // check the remaining space in this block
-    bound = (BAMBOO_SMEM_SIZE);
-    if(remain < (BAMBOO_LARGE_SMEM_BOUND)) {
-      bound = (BAMBOO_SMEM_SIZE_L);
-    }
-    remain = bound - remain%bound;
-
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xea04);
-#endif
-    size = 0;
-    int isize = 0;
-    int host = 0;
-    int ptr = 0;
-    int base = tmpheaptop;
-    int cpysize = 0;
-    remain -= BAMBOO_CACHE_LINE_SIZE;
-    tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
-    gc_lobjqueueinit4_I();
-    while(gc_lobjmoreItems4_I()) {
-      ptr = (int)(gc_lobjdequeue4_I(&size, &host));
-      ALIGNSIZE(size, &isize);
-      if(remain < isize) {
-		// this object acrosses blocks
-		if(cpysize > 0) {
-		  // close current block, fill its header
-		  BAMBOO_MEMSET_WH(base, '\0', BAMBOO_CACHE_LINE_SIZE);
-		  *((int*)base) = cpysize + BAMBOO_CACHE_LINE_SIZE;
-		  bamboo_smemtbl[b]+=BAMBOO_CACHE_LINE_SIZE;//add the size of header
-		  cpysize = 0;
-		  base = tmpheaptop;
-		  if(remain == 0) {
-			remain = ((tmpheaptop-gcbaseva)<(BAMBOO_LARGE_SMEM_BOUND)) ?
-					 BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
-		  }
-		  remain -= BAMBOO_CACHE_LINE_SIZE;
-		  tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
-		  BLOCKINDEX(tmpheaptop, &b);
-		  sb = (tmpheaptop-gcbaseva)/(BAMBOO_SMEM_SIZE) + gcreservedsb;
-		}  // if(cpysize > 0)
-
-		// move the large obj
-		if((int)gcheaptop < (int)(tmpheaptop)+size) {
-		  memmove(tmpheaptop, gcheaptop, size);
-		} else {
-		  //BAMBOO_WRITE_HINT_CACHE(tmpheaptop, size);
-		  memcpy(tmpheaptop, gcheaptop, size);
-		}
-		// fill the remaining space with -2 padding
-		BAMBOO_MEMSET_WH(tmpheaptop+size, -2, isize-size);
-#ifdef DEBUG
-		BAMBOO_DEBUGPRINT(0xea05);
-		BAMBOO_DEBUGPRINT_REG(gcheaptop);
-		BAMBOO_DEBUGPRINT_REG(tmpheaptop);
-		BAMBOO_DEBUGPRINT_REG(size);
-		BAMBOO_DEBUGPRINT_REG(isize);
-		BAMBOO_DEBUGPRINT_REG(base);
-#endif
-		gcheaptop += size;
-		// cache the mapping info anyway
-		//if(ptr != tmpheaptop) {
-		BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
-#ifdef LOCALHASHTBL_TEST
-		RuntimeHashadd_I(gcpointertbl, ptr, tmpheaptop);
-#else
-		mgchashInsert_I(gcpointertbl, ptr, tmpheaptop);
-#endif
-		//MGCHashadd_I(gcpointertbl, ptr, tmpheaptop);
-		BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-		//}
-#ifdef DEBUG
-		BAMBOO_DEBUGPRINT(0xcdca);
-		BAMBOO_DEBUGPRINT_REG(ptr);
-		BAMBOO_DEBUGPRINT_REG(tmpheaptop);
-#endif
-		if(host != BAMBOO_NUM_OF_CORE) {
-		  // send the original host core with the mapping info
-		  send_msg_3(host, GCLOBJMAPPING, ptr, tmpheaptop, false);
-#ifdef DEBUG
-		  BAMBOO_DEBUGPRINT(0xcdcb);
-		  BAMBOO_DEBUGPRINT_REG(ptr);
-		  BAMBOO_DEBUGPRINT_REG(tmpheaptop);
-#endif
-		} // if(host != BAMBOO_NUM_OF_CORE)
-		tmpheaptop += isize;
-
-		// set the gcsbstarttbl and bamboo_smemtbl
-		int tmpsbs = 1+(isize-remain-1)/BAMBOO_SMEM_SIZE;
-		for(int k = 1; k < tmpsbs; k++) {
-		  gcsbstarttbl[sb+k] = (INTPTR)(-1);
-		}
-		sb += tmpsbs;
-		bound = (b<NUMCORES4GC) ? BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
-		BLOCKINDEX(tmpheaptop-1, &tmpsbs);
-		for(; b < tmpsbs; b++) {
-		  bamboo_smemtbl[b] = bound;
-		  if(b==NUMCORES4GC-1) {
-			bound = BAMBOO_SMEM_SIZE;
-		  }
-		}
-		if(((isize-remain)%(BAMBOO_SMEM_SIZE)) == 0) {
-		  gcsbstarttbl[sb] = (INTPTR)(-1);
-		  remain = ((tmpheaptop-gcbaseva)<(BAMBOO_LARGE_SMEM_BOUND)) ?
-				   BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
-		  bamboo_smemtbl[b] = bound;
-		} else {
-		  gcsbstarttbl[sb] = (INTPTR)(tmpheaptop);
-		  remain = tmpheaptop-gcbaseva;
-		  bamboo_smemtbl[b] = remain%bound;
-		  remain = bound - bamboo_smemtbl[b];
-		} // if(((isize-remain)%(BAMBOO_SMEM_SIZE)) == 0) else ...
-
-		// close current block and fill the header
-		BAMBOO_MEMSET_WH(base, '\0', BAMBOO_CACHE_LINE_SIZE);
-		*((int*)base) = isize + BAMBOO_CACHE_LINE_SIZE;
-		cpysize = 0;
-		base = tmpheaptop;
-		if(remain == BAMBOO_CACHE_LINE_SIZE) {
-		  // fill with 0 in case
-		  BAMBOO_MEMSET_WH(tmpheaptop, '\0', remain);
-		}
-		remain -= BAMBOO_CACHE_LINE_SIZE;
-		tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
-      } else {
-		remain -= isize;
-		// move the large obj
-		if((int)gcheaptop < (int)(tmpheaptop)+size) {
-		  memmove(tmpheaptop, gcheaptop, size);
-		} else {
-		  //BAMBOO_WRITE_HINT_CACHE(tmpheaptop, size);
-		  memcpy(tmpheaptop, gcheaptop, size);
-		}
-		// fill the remaining space with -2 padding
-		BAMBOO_MEMSET_WH(tmpheaptop+size, -2, isize-size);
-		// zero out original mem caching the lobj
-		//BAMBOO_MEMSET_WH(gcheaptop, '\0', size); // TODO ??
-#ifdef DEBUG
-		BAMBOO_DEBUGPRINT(0xea06);
-		BAMBOO_DEBUGPRINT_REG(gcheaptop);
-		BAMBOO_DEBUGPRINT_REG(tmpheaptop);
-		BAMBOO_DEBUGPRINT_REG(size);
-		BAMBOO_DEBUGPRINT_REG(isize);
-#endif
-
-		gcheaptop += size;
-		cpysize += isize;
-		// cache the mapping info anyway
-		//if(ptr != tmpheaptop) {
-		BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
-#ifdef LOCALHASHTBL_TEST
-		RuntimeHashadd_I(gcpointertbl, ptr, tmpheaptop);
-#else
-		mgchashInsert_I(gcpointertbl, ptr, tmpheaptop);
-#endif
-		//MGCHashadd_I(gcpointertbl, ptr, tmpheaptop);
-		BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-		//}
-#ifdef DEBUG
-		BAMBOO_DEBUGPRINT(0xcdcc);
-		BAMBOO_DEBUGPRINT_REG(ptr);
-		BAMBOO_DEBUGPRINT_REG(tmpheaptop);
-		BAMBOO_DEBUGPRINT_REG(*((int*)tmpheaptop));
-#endif
-		if(host != BAMBOO_NUM_OF_CORE) {
-		  // send the original host core with the mapping info
-		  send_msg_3(host, GCLOBJMAPPING, ptr, tmpheaptop, false);
-#ifdef DEBUG
-		  BAMBOO_DEBUGPRINT(0xcdcd);
-		  BAMBOO_DEBUGPRINT_REG(ptr);
-		  BAMBOO_DEBUGPRINT_REG(tmpheaptop);
-#endif
-		}                         // if(host != BAMBOO_NUM_OF_CORE)
-		tmpheaptop += isize;
-
-		// update bamboo_smemtbl
-		bamboo_smemtbl[b] += isize;
-	  }  // if(remain < isize) else ...
-    }  // while(gc_lobjmoreItems())
-    if(cpysize > 0) {
-      // close current block, fill the header
-      BAMBOO_MEMSET_WH(base, '\0', BAMBOO_CACHE_LINE_SIZE);
-      *((int*)base) = cpysize + BAMBOO_CACHE_LINE_SIZE;
-      bamboo_smemtbl[b] += BAMBOO_CACHE_LINE_SIZE;// add the size of the header
-    } else {
-      tmpheaptop -= BAMBOO_CACHE_LINE_SIZE;
-    }
-    gcheaptop = tmpheaptop;
-
-  } // if(tomove == 0)
-
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xea07);
-  BAMBOO_DEBUGPRINT_REG(gcheaptop);
-#endif
-
-  bamboo_free_block = 0;
-  int tbound = 0;
-  do {
-    tbound = (bamboo_free_block<NUMCORES4GC) ?
-             BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
-    if(bamboo_smemtbl[bamboo_free_block] == tbound) {
-      bamboo_free_block++;
-    } else {
-      // the first non-full partition
-      break;
-    }
-  } while(true);
-
-  // TODO
-  /*unsigned long long gc_num_livespace = 0;
-  for(int tmpi = 0; tmpi < gcnumblock; tmpi++) {
-	gc_num_livespace += bamboo_smemtbl[tmpi];
-  }
-  BAMBOO_DEBUGPRINT_REG(gc_num_livespace);
-  BAMBOO_DEBUGPRINT_REG(bamboo_free_block);*/
-
-#ifdef GC_PROFILE
-  // check how many live space there are
-  gc_num_livespace = 0;
-  for(int tmpi = 0; tmpi < gcnumblock; tmpi++) {
-	gc_num_livespace += bamboo_smemtbl[tmpi];
-  }
-  gc_num_freespace = (BAMBOO_SHARED_MEM_SIZE) - gc_num_livespace;
-#endif
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xea08);
-  BAMBOO_DEBUGPRINT_REG(gcheaptop);
-#endif
-} // void moveLObjs()
-
-// Record that objptr is reachable.
-//  * NULL is ignored.
-//  * An object that is not shared is put on the local mark queue directly.
-//  * A shared object hosted on this core is flagged DISCOVERED and queued,
-//    but only if its flag word (int slot 6) is still INIT.
-//  * A shared object hosted on another core is reported to that core with a
-//    GCMARKEDOBJ message, at most once (tracked in gcforwardobjtbl).
-inline void markObj(void * objptr) {
-  if(objptr == NULL) {
-    return;
-  }
-
-  bool shared = ISSHAREDOBJ(objptr);
-  if(!shared) {
-    BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
-    gc_enqueue_I(objptr);
-    BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-    return;
-  }
-
-  int host = hostcore(objptr);
-  if(host == BAMBOO_NUM_OF_CORE) {
-    // hosted here: queue it the first time it is seen
-    int * words = (int *)objptr;
-    BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
-    if(words[6] == INIT) {
-      words[6] |= DISCOVERED;
-      gc_enqueue_I(objptr);
-    }
-    BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-    return;
-  }
-
-  // hosted on another core
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xbbbb);
-  BAMBOO_DEBUGPRINT_REG(host);
-  BAMBOO_DEBUGPRINT_REG(objptr);
-#endif
-  if(MGCHashcontains(gcforwardobjtbl, (int)objptr)) {
-    // already forwarded to its host, nothing more to do
-    return;
-  }
-  // tell the host core that objptr is active
-  send_msg_2(host, GCMARKEDOBJ, objptr, false);
-#ifdef GC_PROFILE
-  gc_num_forwardobj++;
-#endif // GC_PROFILE
-  gcself_numsendobjs++;
-  MGCHashadd(gcforwardobjtbl, (int)objptr);
-} // void markObj(void * objptr)
-
-// Feed every root known to this core to markObj(): the stack frames in
-// stackptr, the per-class object sets, the parameters of the current and the
-// active tasks, the cached incoming/outgoing transfer objects and the runtime
-// lock table. Exits with 0xb101 when called outside the mark phase.
-inline void tomark(struct garbagelist * stackptr) {
-  if(MARKPHASE != gcphase) {
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT_REG(gcphase);
-#endif
-    BAMBOO_EXIT(0xb101);
-  }
-  gcbusystatus = true;
-  gcnumlobjs = 0;
-
-  // roots on the current stack
-  for(; stackptr != NULL; stackptr = stackptr->next) {
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xe501);
-    BAMBOO_DEBUGPRINT_REG(stackptr->size);
-    BAMBOO_DEBUGPRINT_REG(stackptr->next);
-    BAMBOO_DEBUGPRINT_REG(stackptr->array[0]);
-#endif
-    for(int slot = 0; slot < stackptr->size; slot++) {
-      if(stackptr->array[slot] != NULL) {
-        markObj(stackptr->array[slot]);
-      }
-    }
-  }
-
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xe503);
-#endif
-  // roots in the object sets of this core's parameter queues
-  if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) {
-    for(int cls = 0; cls < NUMCLASSES; cls++) {
-      struct parameterwrapper ** queues =
-        objectqueues[BAMBOO_NUM_OF_CORE][cls];
-      int nqueues = numqueues[BAMBOO_NUM_OF_CORE][cls];
-      for(int q = 0; q < nqueues; ++q) {
-        struct parameterwrapper * parameter = queues[q];
-        struct ObjectHash * set = parameter->objectset;
-        for(struct ObjectNode * node = set->listhead;
-            node != NULL;
-            node = node->lnext) {
-          markObj((void *)node->key);
-        }
-      }
-    }
-  }
-
-  // parameters of the current task descriptor
-  if(currtpd != NULL) {
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xe504);
-#endif
-    for(int k = 0; k < currtpd->numParameters; k++) {
-      markObj(currtpd->parameterArray[k]);
-    }
-  }
-
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xe505);
-#endif
-  // parameters of all active tasks
-  if(activetasks != NULL) {
-    for(struct genpointerlist * entry = activetasks->list;
-        entry != NULL;
-        entry = entry->inext) {
-      struct taskparamdescriptor * tpd = entry->src;
-      for(int k = 0; k < tpd->numParameters; k++) {
-        markObj(tpd->parameterArray[k]);
-      }
-    }
-  }
-
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xe506);
-#endif
-  // cached objects that were transferred to this core
-  for(struct QueueItem * in = getHead(&objqueue);
-      in != NULL;
-      in = getNextQueueItem(in)) {
-    struct transObjInfo * info = (struct transObjInfo *)(in->objectptr);
-    markObj(info->objptr);
-  }
-
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xe507);
-#endif
-  // cached objects waiting to be transferred away
-  for(struct QueueItem * out = getHead(totransobjqueue);
-      out != NULL;
-      out = getNextQueueItem(out)) {
-    struct transObjInfo * info = (struct transObjInfo *)(out->objectptr);
-    markObj(info->objptr);
-  }
-
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xe508);
-#endif
-  // objects referenced from the runtime lock table
-  for(int l = 0; l < runtime_locklen; ++l) {
-    markObj((void *)(runtime_locks[l].redirectlock));
-    if(runtime_locks[l].value != NULL) {
-      markObj((void *)(runtime_locks[l].value));
-    }
-  }
-
-} // void tomark(struct garbagelist * stackptr)
-
-inline void mark(bool isfirst,  // when true, roots are enqueued via tomark() and the per-core mark counters are reset first
-                 struct garbagelist * stackptr) {  // root list forwarded to tomark(); only read when isfirst is true
-#ifdef DEBUG
-  if(BAMBOO_NUM_OF_CORE == 0) BAMBOO_DEBUGPRINT(0xed01);
-#endif
-  if(isfirst) {
-#ifdef DEBUG
-    if(BAMBOO_NUM_OF_CORE == 0) BAMBOO_DEBUGPRINT(0xed02);
-#endif
-    // enqueue root objs
-    tomark(stackptr);
-    gccurr_heaptop = 0; // record the size of all active objs in this core
-                        // aligned but does not consider block boundaries
-    gcmarkedptrbound = 0;
-  }
-#ifdef DEBUG
-  if(BAMBOO_NUM_OF_CORE == 0) BAMBOO_DEBUGPRINT(0xed03);
-#endif
-  int isize = 0;  // NOTE(review): shadowed by the isize declared inside the drain loop; this outer one looks unused
-  bool checkfield = true;  // whether the dequeued object's pointer fields must be traced
-  bool sendStall = false;  // true once GCFINISHMARK has been sent and no new work has arrived since
-  // mark phase
-  while(MARKPHASE == gcphase) {
-#ifdef DEBUG
-    if(BAMBOO_NUM_OF_CORE == 0) BAMBOO_DEBUGPRINT(0xed04);
-#endif
-    while(true) {  // drain the local mark queue
-      BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
-      bool hasItems = gc_moreItems2_I();
-      BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT(0xed05);
-#endif
-      if(!hasItems) {
-		break;
-      }
-      sendStall = false;
-      gcbusystatus = true;
-      checkfield = true;
-      void * ptr = gc_dequeue2_I();  // NOTE(review): called outside the runtime-mode bracket used for gc_moreItems2_I above; confirm this is intended
-
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT_REG(ptr);
-#endif
-      int size = 0;
-      int isize = 0;
-      int type = 0;  // stays 0 unless isLarge() fills it in below
-      // check if it is a shared obj
-      if(ISSHAREDOBJ(ptr)) {
-		// a shared obj, check if it is a local obj on this core
-		int host = hostcore(ptr);
-		bool islocal = (host == BAMBOO_NUM_OF_CORE);
-		if(islocal) {
-		  bool isnotmarked = ((((int *)ptr)[6] & DISCOVERED) != 0);  // DISCOVERED still set: queued by markObj() but not yet processed here
-		  if(isLarge(ptr, &type, &size) && isnotmarked) {
-			// ptr is a large object and not marked or enqueued
-#ifdef DEBUG
-			BAMBOO_DEBUGPRINT(0xecec);
-			BAMBOO_DEBUGPRINT_REG(ptr);
-			BAMBOO_DEBUGPRINT_REG(*((int*)ptr));
-#endif
-			BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
-			gc_lobjenqueue_I(ptr, size, BAMBOO_NUM_OF_CORE);
-			gcnumlobjs++;
-			BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-			// mark this obj: clear DISCOVERED, set MARKED
-			((int *)ptr)[6] = ((int *)ptr)[6] & (~DISCOVERED) | MARKED;
-		  } else if(isnotmarked) {
-			// ptr is an unmarked active object on this core
-			ALIGNSIZE(size, &isize);
-			gccurr_heaptop += isize;
-#ifdef DEBUG
-			BAMBOO_DEBUGPRINT(0xaaaa);
-			BAMBOO_DEBUGPRINT_REG(ptr);
-			BAMBOO_DEBUGPRINT_REG(isize);
-			BAMBOO_DEBUGPRINT(((int *)(ptr))[0]);
-#endif
-			// mark this obj: clear DISCOVERED, set MARKED
-			((int *)ptr)[6] = ((int *)ptr)[6] & (~DISCOVERED) | MARKED;
-		  
-			if(ptr + size > gcmarkedptrbound) {  // remember the highest end address among marked local objects
-			  gcmarkedptrbound = ptr + size;
-			} // if(ptr + size > gcmarkedptrbound)
-		  } else {
-			// ptr is not an active obj or has been marked
-			checkfield = false;
-		  } // if(isLarge(ptr, &type, &size)) else ...
-		}  /* can never reach here
-		else {
-#ifdef DEBUG
-		  if(BAMBOO_NUM_OF_CORE == 0) {
-			BAMBOO_DEBUGPRINT(0xbbbb);
-			BAMBOO_DEBUGPRINT_REG(host);
-			BAMBOO_DEBUGPRINT_REG(ptr);
-		  }
-#endif
-		  // check if this obj has been forwarded
-		  if(!MGCHashcontains(gcforwardobjtbl, (int)ptr)) {
-			// send a msg to host informing that ptr is active
-			send_msg_2(host, GCMARKEDOBJ, ptr, false);
-			gcself_numsendobjs++;
-			MGCHashadd(gcforwardobjtbl, (int)ptr);
-		  }
-			checkfield = false;
-		}// if(isLocal(ptr)) else ...*/
-	  }   // if(ISSHAREDOBJ(ptr))
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT(0xed06);
-#endif
-
-      if(checkfield) {
-		// scan all pointers in ptr, using the per-type layout entry in pointerarray
-		unsigned INTPTR * pointer;
-		pointer=pointerarray[type];
-		if (pointer==0) {
-		  /* Array of primitives */
-		  /* Do nothing */
-		} else if (((INTPTR)pointer)==1) {
-		  /* Array of pointers */
-		  struct ArrayObject *ao=(struct ArrayObject *) ptr;
-		  int length=ao->___length___;
-		  int j;
-		  for(j=0; j<length; j++) {
-			void *objptr =
-			  ((void **)(((char *)&ao->___length___)+sizeof(int)))[j];  // elements are read starting right after the ___length___ field
-			markObj(objptr);
-		  }
-		} else {
-		  INTPTR size=pointer[0];  // pointer[0] is the entry count; pointer[1..size] are byte offsets of pointer fields
-		  int i;
-		  for(i=1; i<=size; i++) {
-			unsigned int offset=pointer[i];
-			void * objptr=*((void **)(((char *)ptr)+offset));
-			markObj(objptr);
-		  }
-		}     // if (pointer==0) else if ... else ...
-      }   // if(checkfield)
-    }     // while(gc_moreItems2())
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xed07);
-#endif
-    gcbusystatus = false;  // local queue is empty
-    // send mark finish msg to core coordinator
-    if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT(0xed08);
-#endif
-      // this core is the coordinator: record its own status and counters directly
-      gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
-      gcnumsendobjs[gcnumsrobjs_index][BAMBOO_NUM_OF_CORE]=gcself_numsendobjs;
-      gcnumreceiveobjs[gcnumsrobjs_index][BAMBOO_NUM_OF_CORE]=
-		gcself_numreceiveobjs;
-      gcloads[BAMBOO_NUM_OF_CORE] = gccurr_heaptop;
-    } else {
-      if(!sendStall) {  // report once per idle period; sendStall is cleared when new work is dequeued
-#ifdef DEBUG
-		BAMBOO_DEBUGPRINT(0xed09);
-#endif
-		send_msg_4(STARTUPCORE, GCFINISHMARK, BAMBOO_NUM_OF_CORE,
-				   gcself_numsendobjs, gcself_numreceiveobjs, false);
-		sendStall = true;
-      }
-    }             // if(STARTUPCORE == BAMBOO_NUM_OF_CORE) ...
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xed0a);
-#endif
-
-    if(BAMBOO_NUM_OF_CORE == STARTUPCORE) {  // the startup core returns after one drain; other cores keep looping while gcphase is MARKPHASE
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT(0xed0b);
-#endif
-      return;
-    }
-  }       // while(MARKPHASE == gcphase)
-} // mark()
-
-inline void compact2Heaptophelper_I(int coren,  // core whose pending move request is served from the current heap top
-                                    int* p,  // in/out: address of the current heap top
-                                    int* numblocks,  // in/out: block count of the top core; *numblocks+1 is announced as the block to fill
-                                    int* remain) {  // in/out: bytes left in the block that contains *p
-  int b;
-  int memneed = gcrequiredmems[coren] + BAMBOO_CACHE_LINE_SIZE;  // requested bytes plus one cache line (presumably the block header; confirm)
-  if(STARTUPCORE == coren) {
-    // the request belongs to this core: set the move parameters locally
-    gctomove = true;
-    gcmovestartaddr = *p;
-    gcdstcore = gctopcore;
-    gcblock2fill = *numblocks + 1;
-  } else {
-    send_msg_4(coren, GCMOVESTART, gctopcore, *p, (*numblocks) + 1, false);
-  }
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT_REG(coren);
-  BAMBOO_DEBUGPRINT_REG(gctopcore);
-  BAMBOO_DEBUGPRINT_REG(*p);
-  BAMBOO_DEBUGPRINT_REG(*numblocks+1);
-#endif
-  if(memneed < *remain) {
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xd104);
-#endif
-    // the request fits in the current block: advance the heap top and clear the request
-    *p = *p + memneed;
-    gcrequiredmems[coren] = 0;
-    gcloads[gctopcore] += memneed;
-    *remain = *remain - memneed;
-  } else {
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xd105);
-#endif
-    // next available block
-    *p = *p + *remain;  // overwritten below once the new top core is selected
-    gcfilledblocks[gctopcore] += 1;
-    int newbase = 0;
-    BASEPTR(gctopcore, gcfilledblocks[gctopcore], &newbase);
-    gcloads[gctopcore] = newbase;
-    gcrequiredmems[coren] -= *remain - BAMBOO_CACHE_LINE_SIZE;  // only part of the request is satisfied; the rest stays pending
-    gcstopblock[gctopcore]++;
-    gctopcore = NEXTTOPCORE(gctopblock);  // move on to the next top core/block
-    gctopblock++;
-    *numblocks = gcstopblock[gctopcore];
-    *p = gcloads[gctopcore];
-    BLOCKINDEX(*p, &b);
-    *remain=(b<NUMCORES4GC) ?  // blocks with index below NUMCORES4GC use the _L size
-             ((BAMBOO_SMEM_SIZE_L)-((*p)%(BAMBOO_SMEM_SIZE_L)))
-	     : ((BAMBOO_SMEM_SIZE)-((*p)%(BAMBOO_SMEM_SIZE)));
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xd106);
-    BAMBOO_DEBUGPRINT_REG(gctopcore);
-    BAMBOO_DEBUGPRINT_REG(*p);
-    BAMBOO_DEBUGPRINT_REG(b);
-    BAMBOO_DEBUGPRINT_REG(*remain);
-#endif
-  }       // if(memneed < remain)
-  gcmovepending--;  // one pending move request has been answered
-} // void compact2Heaptophelper_I(int, int*, int*, int*)
-
-inline void compact2Heaptop() {  // serves pending move requests from the current heap top via compact2Heaptophelper_I()
-  // no cores with spare mem and some cores are blocked with pending move
-  // find the current heap top and make them move to the heap top
-  int p;
-  int numblocks = gcfilledblocks[gctopcore];
-  //BASEPTR(gctopcore, numblocks, &p);
-  p = gcloads[gctopcore];
-  int b;
-  BLOCKINDEX(p, &b);
-  int remain = (b<NUMCORES4GC) ?  // bytes left in the block containing p
-               ((BAMBOO_SMEM_SIZE_L)-(p%(BAMBOO_SMEM_SIZE_L)))
-	       : ((BAMBOO_SMEM_SIZE)-(p%(BAMBOO_SMEM_SIZE)));
-  // check if the top core finishes
-  BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
-  if(gccorestatus[gctopcore] != 0) {
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xd101);
-    BAMBOO_DEBUGPRINT_REG(gctopcore);
-#endif
-    // let the top core finishes its own work first
-    compact2Heaptophelper_I(gctopcore, &p, &numblocks, &remain);
-    BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-    return;
-  }
-  BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xd102);
-  BAMBOO_DEBUGPRINT_REG(gctopcore);
-  BAMBOO_DEBUGPRINT_REG(p);
-  BAMBOO_DEBUGPRINT_REG(b);
-  BAMBOO_DEBUGPRINT_REG(remain);
-#endif
-  for(int i = 0; i < NUMCORES4GC; i++) {
-    BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
-    if((gccorestatus[i] != 0) && (gcrequiredmems[i] > 0)) {  // core i is still working and has an unanswered memory request
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT(0xd103);
-#endif
-      compact2Heaptophelper_I(i, &p, &numblocks, &remain);
-      if(gccorestatus[gctopcore] != 0) {  // the helper may have switched gctopcore; stop if the new top core is still busy
-#ifdef DEBUG
-		BAMBOO_DEBUGPRINT(0xd101);
-		BAMBOO_DEBUGPRINT_REG(gctopcore);
-#endif
-		BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-		// the top core is not free now
-		return;
-      }
-    }             // if((gccorestatus[i] != 0) && (gcrequiredmems[i] > 0))
-    BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-  }       // for(i = 0; i < NUMCORES4GC; i++)
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xd106);
-#endif
-} // void compact2Heaptop()
-
-inline void resolvePendingMoveRequest() {  // pairs cores that have spare blocks with cores waiting for memory; falls back to compact2Heaptop() when nothing can progress
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xeb01);
-#endif
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xeeee);
-  for(int k = 0; k < NUMCORES4GC; k++) {
-    BAMBOO_DEBUGPRINT(0xf000+k);
-    BAMBOO_DEBUGPRINT_REG(gccorestatus[k]);
-    BAMBOO_DEBUGPRINT_REG(gcloads[k]);
-    BAMBOO_DEBUGPRINT_REG(gcfilledblocks[k]);
-    BAMBOO_DEBUGPRINT_REG(gcstopblock[k]);
-  }
-  BAMBOO_DEBUGPRINT(0xffff);
-#endif
-  int i;  // cursor scanning for a source core (finished, with spare blocks)
-  int j;  // cursor scanning for a destination core (unfinished, with a pending request)
-  bool nosparemem = true;  // no source core selected yet
-  bool haspending = false;  // a destination core is currently selected
-  bool hasrunning = false;  // saw an unfinished core that is not blocked on a request
-  bool noblock = false;  // at least one request was matched in this call
-  int dstcore = 0;       // the core who need spare mem
-  int sourcecore = 0;       // the core who has spare mem
-  for(i = j = 0; (i < NUMCORES4GC) && (j < NUMCORES4GC); ) {
-    if(nosparemem) {
-      // check if there are cores with spare mem
-      if(gccorestatus[i] == 0) {
-		// finished working, check if it still have spare mem
-		if(gcfilledblocks[i] < gcstopblock[i]) {
-		  // still have spare mem
-		  nosparemem = false;
-		  sourcecore = i;
-		}  // if(gcfilledblocks[i] < gcstopblock[i]) else ...
-      }
-      i++;
-    }             // if(nosparemem)
-    if(!haspending) {
-      if(gccorestatus[j] != 0) {
-		// not finished, check if it has pending move requests
-		if((gcfilledblocks[j]==gcstopblock[j])&&(gcrequiredmems[j]>0)) {
-		  dstcore = j;
-		  haspending = true;
-		} else {
-		  hasrunning = true;
-		}  // if((gcfilledblocks[i] == gcstopblock[i])...) else ...
-      }  // if(gccorestatus[i] == 0) else ...
-      j++;
-    }  // if(!haspending)
-    if(!nosparemem && haspending) {
-      // find match
-      int tomove = 0;
-      int startaddr = 0;
-      BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
-      gcrequiredmems[dstcore] = assignSpareMem_I(sourcecore,  // the return value becomes the still-unsatisfied part of the request
-                                                 gcrequiredmems[dstcore],
-                                                 &tomove,
-                                                 &startaddr);
-      BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT(0xeb02);
-      BAMBOO_DEBUGPRINT_REG(sourcecore);
-      BAMBOO_DEBUGPRINT_REG(dstcore);
-      BAMBOO_DEBUGPRINT_REG(startaddr);
-      BAMBOO_DEBUGPRINT_REG(tomove);
-#endif
-      if(STARTUPCORE == dstcore) {
-#ifdef DEBUG
-		BAMBOO_DEBUGPRINT(0xeb03);
-#endif
-		// the waiting core is this core: set the move parameters locally
-		gcdstcore = sourcecore;
-		gctomove = true;
-		gcmovestartaddr = startaddr;
-		gcblock2fill = tomove;
-      } else {
-#ifdef DEBUG
-		BAMBOO_DEBUGPRINT(0xeb04);
-#endif
-		send_msg_4(dstcore, GCMOVESTART, sourcecore,
-				   startaddr, tomove, false);
-      }
-      gcmovepending--;
-      nosparemem = true;  // restart the search for the next source/destination pair
-      haspending = false;
-      noblock = true;
-    }
-  }       // for(i = 0; i < NUMCORES4GC; i++)
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xcccc);
-  BAMBOO_DEBUGPRINT_REG(hasrunning);
-  BAMBOO_DEBUGPRINT_REG(haspending);
-  BAMBOO_DEBUGPRINT_REG(noblock);
-#endif
-
-  if(!hasrunning && !noblock) {  // no core is making progress and nothing was matched: switch phase and hand out space at the heap top
-    gcphase = SUBTLECOMPACTPHASE;
-    compact2Heaptop();
-  }
-
-} // void resolvePendingMoveRequest()
-
-struct moveHelper {  // cursor over heap blocks, used both for the source (orig) and the destination (to) during compaction
-  int numblocks;       // block num for heap
-  INTPTR base;       // base virtual address of current heap block
-  INTPTR ptr;       // virtual address of current heap top
-  int offset;       // offset in current heap block
-  int blockbase;       // virtual address of current small block to check
-  int blockbound;       // bound virtual address of current small block
-  int sblockindex;       // index of the small blocks
-  int top;       // real size of current heap block to check
-  int bound;       // bound size of current heap block to check
-}; // struct moveHelper
-
-// Advance orig to the next small block that holds data. If out of boundary of valid shared memory, return false, else return true
-inline bool nextSBlock(struct moveHelper * orig) {
-  orig->blockbase = orig->blockbound;  // continue from the end of the small block just finished
-  bool sbchanged = false;  // set when sblockindex moves to a different small block
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xecc0);
-  BAMBOO_DEBUGPRINT_REG(orig->blockbase);
-  BAMBOO_DEBUGPRINT_REG(orig->blockbound);
-  BAMBOO_DEBUGPRINT_REG(orig->bound);
-  BAMBOO_DEBUGPRINT_REG(orig->ptr);
-#endif
-outernextSBlock:
-  // check if across a big block
-  // TODO now do not zero out the whole memory, maybe the last two conditions
-  // are useless now
-  if((orig->blockbase >= orig->bound) || (orig->ptr >= orig->bound)
-     || ((orig->ptr != NULL) && (*((int*)orig->ptr))==0)
-     || ((*((int*)orig->blockbase))==0)) {
-innernextSBlock:
-    // end of current heap block, jump to next one
-    orig->numblocks++;
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xecc1);
-    BAMBOO_DEBUGPRINT_REG(orig->numblocks);
-#endif
-    BASEPTR(BAMBOO_NUM_OF_CORE, orig->numblocks, &(orig->base));
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(orig->base);
-#endif
-    if(orig->base >= gcbaseva + BAMBOO_SHARED_MEM_SIZE) {
-      // out of boundary
-      orig->ptr = orig->base; // set current ptr to out of boundary too
-      return false;
-    }
-    //orig->bound = orig->base + BAMBOO_SMEM_SIZE;
-    orig->blockbase = orig->base;
-    orig->sblockindex = (orig->blockbase-gcbaseva)/BAMBOO_SMEM_SIZE;
-    sbchanged = true;
-    int blocknum = 0;
-    BLOCKINDEX(orig->base, &blocknum);
-    if(bamboo_smemtbl[blocknum] == 0) {  // nothing recorded as used in this block
-      // goto next block
-      goto innernextSBlock;
-    }
-	// check the bamboo_smemtbl to decide the real bound
-	orig->bound = orig->base + bamboo_smemtbl[blocknum];
-  } else if(0 == (orig->blockbase%BAMBOO_SMEM_SIZE)) {
-    // still inside the same heap block but exactly on a small-block boundary
-    orig->sblockindex += 1;
-    sbchanged = true;
-  }  // if((orig->blockbase >= orig->bound) || (orig->ptr >= orig->bound)...
-
-  // check if this sblock should be skipped or have special start point
-  if(gcsbstarttbl[orig->sblockindex] == -1) {
-    // goto next sblock
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xecc2);
-#endif
-    orig->sblockindex += 1;
-    orig->blockbase += BAMBOO_SMEM_SIZE;
-    goto outernextSBlock;
-  } else if((gcsbstarttbl[orig->sblockindex] != 0)
-            && (sbchanged)) {
-    // the first time to access this SBlock
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xecc3);
-#endif
-    // not start from the very beginning
-    orig->blockbase = gcsbstarttbl[orig->sblockindex];
-  }       // if(gcsbstarttbl[orig->sblockindex] == -1) else ...
-
-  // setup information for this sblock
-  orig->blockbound = orig->blockbase + *((int*)(orig->blockbase));  // the first int at blockbase is read as the length of this small block
-  orig->offset = BAMBOO_CACHE_LINE_SIZE;
-  orig->ptr = orig->blockbase + orig->offset;  // data starts one cache line after blockbase
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xecc4);
-  BAMBOO_DEBUGPRINT_REG(orig->base);
-  BAMBOO_DEBUGPRINT_REG(orig->bound);
-  BAMBOO_DEBUGPRINT_REG(orig->ptr);
-  BAMBOO_DEBUGPRINT_REG(orig->blockbound);
-  BAMBOO_DEBUGPRINT_REG(orig->blockbase);
-  BAMBOO_DEBUGPRINT_REG(orig->offset);
-#endif
-  if(orig->ptr >= orig->bound) {
-    // met a lobj, move to next block
-    goto innernextSBlock;
-  }
-
-  return true;
-} // bool nextSBlock(struct moveHelper * orig)
-
-// Point both cursors at block 0 of this core: 'to' at the first free byte after the header, 'orig' at the first data to scan. return false if there are no available data to compact
-inline bool initOrig_Dst(struct moveHelper * orig,
-                         struct moveHelper * to) {
-  // init the dst ptr
-  to->numblocks = 0;
-  to->top = to->offset = BAMBOO_CACHE_LINE_SIZE;  // skip the one-cache-line block header
-  to->bound = BAMBOO_SMEM_SIZE_L;
-  BASEPTR(BAMBOO_NUM_OF_CORE, to->numblocks, &(to->base));
-
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xef01);
-  BAMBOO_DEBUGPRINT_REG(to->base);
-#endif
-  to->ptr = to->base + to->offset;
-
-  // init the orig ptr
-  orig->numblocks = 0;
-  orig->base = to->base;  // source and destination start in the same block
-  int blocknum = 0;
-  BLOCKINDEX(orig->base, &blocknum);
-  // check the bamboo_smemtbl to decide the real bound
-  orig->bound = orig->base + bamboo_smemtbl[blocknum];
-  orig->blockbase = orig->base;
-  orig->sblockindex = (orig->base - gcbaseva) / BAMBOO_SMEM_SIZE;
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xef02);
-  BAMBOO_DEBUGPRINT_REG(orig->base);
-  BAMBOO_DEBUGPRINT_REG(orig->sblockindex);
-  BAMBOO_DEBUGPRINT_REG(gcsbstarttbl);
-  BAMBOO_DEBUGPRINT_REG(gcsbstarttbl[orig->sblockindex]);
-#endif
-
-  if(gcsbstarttbl[orig->sblockindex] == -1) {  // -1 marks a small block that must be skipped
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xef03);
-#endif
-    // goto next sblock
-    orig->blockbound =
-      gcbaseva+BAMBOO_SMEM_SIZE*(orig->sblockindex+1);
-    return nextSBlock(orig);
-  } else if(gcsbstarttbl[orig->sblockindex] != 0) {  // a non-zero entry is used as the start address within this small block
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xef04);
-#endif
-    orig->blockbase = gcsbstarttbl[orig->sblockindex];
-  }
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xef05);
-#endif
-  orig->blockbound = orig->blockbase + *((int*)(orig->blockbase));  // the first int at blockbase is read as the length of this small block
-  orig->offset = BAMBOO_CACHE_LINE_SIZE;
-  orig->ptr = orig->blockbase + orig->offset;
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xef06);
-  BAMBOO_DEBUGPRINT_REG(orig->base);
-#endif
-  return true;
-} // bool initOrig_Dst(struct moveHelper * orig, struct moveHelper * to)
-
-// Move the destination cursor to the start of this core's next heap block:
-// the size counters grow past the new block's header, the block number is
-// bumped, and base/ptr are re-derived from BASEPTR.
-inline void nextBlock(struct moveHelper * to) {
-  const int prevbound = to->bound;
-
-  // account for the new block's header in the running size
-  to->top = prevbound + BAMBOO_CACHE_LINE_SIZE;
-  to->bound = prevbound + BAMBOO_SMEM_SIZE;
-
-  to->numblocks += 1;
-  BASEPTR(BAMBOO_NUM_OF_CORE, to->numblocks, &(to->base));
-
-  // the first usable byte sits right behind the header
-  to->offset = BAMBOO_CACHE_LINE_SIZE;
-  to->ptr = to->base + to->offset;
-} // void nextBlock(struct moveHelper * to)
-
-// Examine the object at orig->ptr and, if it is MARKED, copy it to to->ptr and record the old->new address mapping. Returns true when compaction should stop (source exhausted, or stopblock destination blocks filled), false when more objects may follow.
-inline bool moveobj(struct moveHelper * orig,
-                    struct moveHelper * to,
-                    int stopblock) {  // number of destination blocks allowed; 0 means nothing may be moved
-  if(stopblock == 0) {
-    return true;
-  }
-
-#ifdef DEBUG
-  //if((int)orig->ptr > 0x10767a00) {
-  BAMBOO_DEBUGPRINT(0xe201);
-  BAMBOO_DEBUGPRINT_REG(orig->ptr);
-  BAMBOO_DEBUGPRINT_REG(to->ptr);
-  //}
-#endif
-
-  int type = 0;
-  int size = 0;  // exact object size in bytes
-  int mark = 0;  // copy of the object's flag word (int slot 6)
-  int isize = 0;  // size after ALIGNSIZE
-innermoveobj:
-  while((char)(*((int*)(orig->ptr))) == (char)(-2)) {  // skip -2 padding, one int at a time
-    orig->ptr = (int*)(orig->ptr) + 1;
-  }
-  if((orig->ptr >= orig->bound) || (orig->ptr == orig->blockbound)) {
-    if(!nextSBlock(orig)) {
-      // finished, no more data
-      return true;
-    }
-    goto innermoveobj;
-  }
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xe202);
-  BAMBOO_DEBUGPRINT_REG(orig->ptr);
-  BAMBOO_DEBUGPRINT(((int *)(orig->ptr))[0]);
-#endif
-  // check the obj's type, size and mark flag
-  type = ((int *)(orig->ptr))[0];
-  size = 0;
-  if(type == 0) {
-    // end of this block, go to next one
-    if(!nextSBlock(orig)) {
-      // finished, no more data
-      return true;
-    }
-    goto innermoveobj;
-  } else if(type < NUMCLASSES) {
-    // a normal object
-    size = classsize[type];
-  } else {
-    // an array: header plus length elements of classsize[type] bytes each
-    struct ArrayObject *ao=(struct ArrayObject *)(orig->ptr);
-    int elementsize=classsize[type];
-    int length=ao->___length___;
-    size=sizeof(struct ArrayObject)+length*elementsize;
-  }
-  mark = ((int *)(orig->ptr))[6];
-  bool isremote = ((((int *)(orig->ptr))[6] & REMOTEM) != 0);  // REMOTEM set: the mapping is also published in gcsharedptbl below
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xe203);
-  BAMBOO_DEBUGPRINT_REG(orig->ptr);
-  BAMBOO_DEBUGPRINT_REG(size);
-#endif
-  ALIGNSIZE(size, &isize);       // no matter is the obj marked or not
-                                 // should be able to across it
-  if((mark & MARKED) != 0) {
-#ifdef DEBUG
-//if((int)orig->ptr > 0x10760f00) {
-    BAMBOO_DEBUGPRINT(0xe204);
-//}
-#endif
-#ifdef GC_PROFILE
-	gc_num_liveobj++;
-#endif
-    // marked obj, copy it to current heap top
-    // check to see if remaining space is enough
-    if(to->top + isize > to->bound) {
-      // fill 0 indicating the end of this block
-      BAMBOO_MEMSET_WH(to->ptr,  '\0', to->bound - to->top);
-      // fill the header of this block and then go to next block
-      to->offset += to->bound - to->top;
-      BAMBOO_MEMSET_WH(to->base, '\0', BAMBOO_CACHE_LINE_SIZE);
-      (*((int*)(to->base))) = to->offset;
-      nextBlock(to);
-      if(stopblock == to->numblocks) {
-		// already fulfilled the block
-		return true;
-      }   // if(stopblock == to->numblocks)
-    }   // if(to->top + isize > to->bound)
-    // overwrite the flag word with COMPACTED, indicating that this obj has
-    // been moved and need to be flushed
-    ((int *)(orig->ptr))[6] = COMPACTED;
-    if(to->ptr != orig->ptr) {
-      if((int)(orig->ptr) < (int)(to->ptr)+size) {  // source begins before the end of the destination: use the overlap-safe copy
-		memmove(to->ptr, orig->ptr, size);
-      } else {
-		//BAMBOO_WRITE_HINT_CACHE(to->ptr, size);
-		memcpy(to->ptr, orig->ptr, size);
-      }
-      // fill the remaining space with -2
-      BAMBOO_MEMSET_WH(to->ptr+size, -2, isize-size);
-    }
-    // store mapping info
-    BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
-#ifdef LOCALHASHTBL_TEST
-    RuntimeHashadd_I(gcpointertbl, orig->ptr, to->ptr);
-#else
-	mgchashInsert_I(gcpointertbl, orig->ptr, to->ptr);
-#endif
-	//MGCHashadd_I(gcpointertbl, orig->ptr, to->ptr);
-	if(isremote) {
-#ifdef GC_PROFILE
-	//unsigned long long ttimet = BAMBOO_GET_EXE_TIME();
-#endif
-	  // add to the sharedptbl
-	  if(gcsharedptbl != NULL) {
-		//GCSharedHashadd_I(gcsharedptbl, orig->ptr, to->ptr);
-		mgcsharedhashInsert_I(gcsharedptbl, orig->ptr, to->ptr);
-		//num_mapinforequest++; // TODO
-	  }
-#ifdef GC_PROFILE
-	//flushstalltime_i += BAMBOO_GET_EXE_TIME()-ttimet;
-#endif
-	}
-    BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-    //}
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xcdce);
-    BAMBOO_DEBUGPRINT_REG(orig->ptr);
-    BAMBOO_DEBUGPRINT_REG(to->ptr);
-	BAMBOO_DEBUGPRINT_REG(isize);
-#endif
-    gccurr_heaptop -= isize;  // one fewer live object left to place
-    to->ptr += isize;
-    to->offset += isize;
-    to->top += isize;
-    if(to->top == to->bound) {
-      // fill the header of this block and then go to next block
-      BAMBOO_MEMSET_WH(to->base, '\0', BAMBOO_CACHE_LINE_SIZE);
-      (*((int*)(to->base))) = to->offset;
-      nextBlock(to);
-    }
-  }       // if((mark & MARKED) != 0)
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xe205);
-#endif
-  // move to next obj; advances by size, any -2 padding is skipped at the top of the next call
-  orig->ptr += size;
-
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT_REG(isize);
-  BAMBOO_DEBUGPRINT_REG(size);
-  BAMBOO_DEBUGPRINT_REG(orig->ptr);
-  BAMBOO_DEBUGPRINT_REG(orig->bound);
-#endif
-  if((orig->ptr > orig->bound) || (orig->ptr == orig->blockbound)) {  // NOTE(review): uses '>' here but '>=' in the equivalent check near the top; confirm which is intended
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xe206);
-#endif
-    if(!nextSBlock(orig)) {
-      // finished, no more data
-      return true;
-    }
-  }
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xe207);
-  BAMBOO_DEBUGPRINT_REG(orig->ptr);
-#endif
-  return false;
-} //bool moveobj(struct moveHelper* orig,struct moveHelper* to,int stopblock)
-
-// Hand out spare memory at the current heap top of core `sourcecore`.
-// Should be invoked with interrupt closed.
-//   sourcecore  -- core whose spare memory is assigned
-//   requiredmem -- number of bytes requested. The parameter is declared as a
-//                  pointer, but callers (see gcfindSpareMem_I) pass the byte
-//                  count itself, so it is converted back to an integer once
-//                  before any arithmetic; doing the arithmetic on the pointer
-//                  would scale the operands by sizeof(int).
-//   tomove      -- out: gcfilledblocks[sourcecore] + 1 (before any update)
-//   startaddr   -- out: gcloads[sourcecore] (before any update)
-// Returns 0 when the request plus one cache-line header fits in the rest of
-// the current block; otherwise advances the core to its next block and
-// returns requiredmem minus the bytes that were left in the old block.
-inline int assignSpareMem_I(int sourcecore,
-                            int * requiredmem,
-                            int * tomove,
-                            int * startaddr) {
-  // recover the byte count carried in the pointer-typed parameter
-  int reqbytes = (int)((INTPTR)requiredmem);
-  int b = 0;
-  BLOCKINDEX(gcloads[sourcecore], &b);
-  // end address of the block that currently holds the heap top
-  int boundptr = (b<NUMCORES4GC) ? ((b+1)*BAMBOO_SMEM_SIZE_L)
-                 : (BAMBOO_LARGE_SMEM_BOUND+(b-NUMCORES4GC+1)*BAMBOO_SMEM_SIZE);
-  int remain = boundptr - gcloads[sourcecore];
-  // the request also needs room for one cache-line sized header
-  int memneed = reqbytes + BAMBOO_CACHE_LINE_SIZE;
-  *startaddr = gcloads[sourcecore];
-  *tomove = gcfilledblocks[sourcecore] + 1;
-  if(memneed < remain) {
-    gcloads[sourcecore] += memneed;
-    return 0;
-  } else {
-    // next available block
-    gcfilledblocks[sourcecore] += 1;
-    int newbase = 0;
-    BASEPTR(sourcecore, gcfilledblocks[sourcecore], &newbase);
-    gcloads[sourcecore] = newbase;
-    return reqbytes-remain;
-  }
-} // int assignSpareMem_I(int ,int * , int * , int * )
-
-// Look for a core that can supply spare memory for a pending move.
-// Should be invoked with interrupt closed.
-// The first core whose gccorestatus is 0 and whose filled-block count is
-// still below its gcstopblock is chosen: assignSpareMem_I fills in
-// *startaddr / *tomove, *dstcore receives the core number and true is
-// returned. If no core qualifies, the request is recorded in
-// gcrequiredmems[requiredcore], gcmovepending is incremented and false is
-// returned.
-inline bool gcfindSpareMem_I(int * startaddr,
-                             int * tomove,
-                             int * dstcore,
-                             int requiredmem,
-                             int requiredcore) {
-  int core;
-  for(core = 0; core < NUMCORES4GC; ++core) {
-    if(gccorestatus[core] != 0) {
-      continue;   // this core has not stopped yet
-    }
-    if(gcfilledblocks[core] >= gcstopblock[core]) {
-      continue;   // this core has no block left to give away
-    }
-    assignSpareMem_I(core, requiredmem, tomove, startaddr);
-    *dstcore = core;
-    return true;
-  }
-
-  // nothing available right now: hold the request
-  gcrequiredmems[requiredcore] = requiredmem;
-  gcmovepending++;
-  return false;
-} //bool gcfindSpareMem_I(int* startaddr,int* tomove,int mem,int core)
-
-// Compact the marked objects described by `orig` into the space described by
-// `to`, asking for more destination memory whenever objects remain.
-//   orig         -- source cursor; scanning continues while
-//                   orig->ptr < gcmarkedptrbound
-//   to           -- destination cursor; re-initialised from gcmovestartaddr /
-//                   gcblock2fill / gcdstcore each time new memory is granted
-//   filledblocks -- out: to->numblocks, updated only while *localcompact
-//   heaptopptr   -- out: to->ptr, updated only while *localcompact
-//   localcompact -- in/out: set to (gcdstcore == BAMBOO_NUM_OF_CORE) after
-//                   each grant of new memory
-// Returns false only on the startup core when objects remain and
-// gcfindSpareMem_I could not find spare memory; returns true otherwise.
-inline bool compacthelper(struct moveHelper * orig,
-                          struct moveHelper * to,
-                          int * filledblocks,
-                          int * heaptopptr,
-                          bool * localcompact) {
-  // scan over all objs in this block, compact the marked objs
-  // loop stop when finishing either scanning all active objs or
-  // fulfilled the gcstopblock
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xe101);
-  BAMBOO_DEBUGPRINT_REG(gcblock2fill);
-  BAMBOO_DEBUGPRINT_REG(gcmarkedptrbound);
-#endif
-innercompact:
-  // re-entered via goto once more destination memory has been granted
-  while(orig->ptr < gcmarkedptrbound) {
-    // moveobj returns true when the move has to stop
-    bool stop = moveobj(orig, to, gcblock2fill);
-    if(stop) {
-      break;
-    }
-  }
-  // if no objs have been compact, do nothing,
-  // otherwise, fill the header of this block
-  if(to->offset > BAMBOO_CACHE_LINE_SIZE) {
-    // clear the header and store the used size in its first int
-    BAMBOO_MEMSET_WH(to->base, '\0', BAMBOO_CACHE_LINE_SIZE);
-    (*((int*)(to->base))) = to->offset;
-  } else {
-    // only the header was reserved: give it back
-    to->offset = 0;
-    to->ptr = to->base;
-    to->top -= BAMBOO_CACHE_LINE_SIZE;
-  }       // if(to->offset > BAMBOO_CACHE_LINE_SIZE) else ...
-  if(*localcompact) {
-    *heaptopptr = to->ptr;
-    *filledblocks = to->numblocks;
-  }
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xe102);
-  BAMBOO_DEBUGPRINT_REG(orig->ptr);
-  BAMBOO_DEBUGPRINT_REG(gcmarkedptrbound);
-  BAMBOO_DEBUGPRINT_REG(*heaptopptr);
-  BAMBOO_DEBUGPRINT_REG(*filledblocks);
-  BAMBOO_DEBUGPRINT_REG(gccurr_heaptop);
-#endif
-
-  // send msgs to core coordinator indicating that the compact is finishing
-  // send compact finish message to core coordinator
-  if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
-    // this core is the coordinator: update its own bookkeeping directly
-    gcfilledblocks[BAMBOO_NUM_OF_CORE] = *filledblocks;
-    gcloads[BAMBOO_NUM_OF_CORE] = *heaptopptr;
-    if(orig->ptr < gcmarkedptrbound) {
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT(0xe103);
-#endif
-      // ask for more mem
-      gctomove = false;
-      BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
-      if(gcfindSpareMem_I(&gcmovestartaddr, &gcblock2fill, &gcdstcore,
-                          gccurr_heaptop, BAMBOO_NUM_OF_CORE)) {
-#ifdef DEBUG
-		BAMBOO_DEBUGPRINT(0xe104);
-#endif
-		gctomove = true;
-      } else {
-		// no spare memory found: leave runtime mode and report "not finished"
-		BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-#ifdef DEBUG
-		BAMBOO_DEBUGPRINT(0xe105);
-#endif
-		return false;
-      }
-      BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-    } else {
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT(0xe106);
-#endif
-      // everything has been compacted on the coordinator
-      gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
-      gctomove = false;
-      return true;
-    }
-  } else {
-    if(orig->ptr < gcmarkedptrbound) {
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT(0xe107);
-#endif
-      // ask for more mem
-      gctomove = false;
-      send_msg_5(STARTUPCORE, GCFINISHCOMPACT, BAMBOO_NUM_OF_CORE,
-                 *filledblocks, *heaptopptr, gccurr_heaptop, false);
-    } else {
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT(0xe108);
-      BAMBOO_DEBUGPRINT_REG(*heaptopptr);
-#endif
-      // finish compacting
-      send_msg_5(STARTUPCORE, GCFINISHCOMPACT, BAMBOO_NUM_OF_CORE,
-                 *filledblocks, *heaptopptr, 0, false);
-    }
-  }       // if(STARTUPCORE == BAMBOO_NUM_OF_CORE)
-
-  if(orig->ptr < gcmarkedptrbound) {
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xe109);
-#endif
-    // still have unpacked obj
-    // spin until gctomove becomes true
-    // NOTE(review): presumably set by a message handler on non-startup
-    // cores -- confirm
-    while(true) {
-      if(gctomove) {
-		break;
-      }
-    }
-    ;
-	gctomove = false;
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xe10a);
-#endif
-
-    // rebuild the destination cursor from the granted region
-    to->ptr = gcmovestartaddr;
-    to->numblocks = gcblock2fill - 1;
-    to->bound = (to->numblocks==0) ?
-                BAMBOO_SMEM_SIZE_L :
-                BAMBOO_SMEM_SIZE_L+BAMBOO_SMEM_SIZE*to->numblocks;
-    BASEPTR(gcdstcore, to->numblocks, &(to->base));
-    to->offset = to->ptr - to->base;
-    to->top = (to->numblocks==0) ?
-              (to->offset) : (to->bound-BAMBOO_SMEM_SIZE+to->offset);
-    // from here on the granted start address is treated as the block base
-    to->base = to->ptr;
-    to->offset = BAMBOO_CACHE_LINE_SIZE;
-    to->ptr += to->offset;             // for header
-    to->top += to->offset;
-    if(gcdstcore == BAMBOO_NUM_OF_CORE) {
-      *localcompact = true;
-    } else {
-      *localcompact = false;
-    }
-    goto innercompact;
-  }
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xe10b);
-#endif
-  return true;
-} // bool compacthelper(orig, to, filledblocks, heaptopptr, localcompact)
-
-// Compact-phase entry point.
-// Calls BAMBOO_EXIT(0xb102) when gcphase is not COMPACTPHASE. Allocates the
-// source and destination move helpers, lets initOrig_Dst set them up, and
-// either runs compacthelper or, when initOrig_Dst reports nothing to
-// compact, sends a GCFINISHCOMPACT message to STARTUPCORE straight away.
-// Both helpers are released before returning.
-inline void compact() {
-  if(gcphase != COMPACTPHASE) {
-    BAMBOO_EXIT(0xb102);
-  }
-
-  // cursors over the data to move and over the space to move it into
-  struct moveHelper * src =
-    (struct moveHelper *)RUNMALLOC(sizeof(struct moveHelper));
-  struct moveHelper * dst =
-    (struct moveHelper *)RUNMALLOC(sizeof(struct moveHelper));
-
-  if(initOrig_Dst(src, dst)) {
-    int filledblocks = 0;
-    INTPTR heaptopptr = 0;
-    bool localcompact = true;
-    compacthelper(src, dst, &filledblocks, &heaptopptr, &localcompact);
-  } else {
-    // no available data to compact:
-    // send compact finish msg to STARTUP core
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xe001);
-    BAMBOO_DEBUGPRINT_REG(dst->base);
-#endif
-    send_msg_5(STARTUPCORE, GCFINISHCOMPACT, BAMBOO_NUM_OF_CORE,
-               0, dst->base, 0, false);
-  }
-
-  RUNFREE(src);
-  RUNFREE(dst);
-} // compact()
-
-// Look up the post-compaction address of objptr.
-// if return NULL, means
-//   1. objptr is NULL
-//   2. objptr is not a shared obj
-// in these cases, remain the original value is OK
-// Lookup order for a shared obj:
-//   a. the local table gcpointertbl
-//   b. the host core's shared table gcrpointertbls[hostc] (a hit is also
-//      inserted into gcpointertbl)
-//   c. a GCMAPREQUEST message to the host core, followed by a spin on
-//      gcismapped and a second lookup in gcpointertbl
-// A miss for an obj hosted on this core calls BAMBOO_EXIT(0xb103).
-inline void * flushObj(void * objptr) {
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xe401);
-#endif
-  if(objptr == NULL) {
-    return NULL;
-  }
-  void * dstptr = NULL;
-  if(ISSHAREDOBJ(objptr)) {
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xe402);
-    BAMBOO_DEBUGPRINT_REG(objptr);
-#endif
-    // a shared obj ptr, change to new address
-    // step a: search the local mapping table in runtime mode
-    BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
-#ifdef GC_PROFILE
-    //unsigned long long ttime = BAMBOO_GET_EXE_TIME();
-#endif
-#ifdef LOCALHASHTBL_TEST
-    RuntimeHashget(gcpointertbl, objptr, &dstptr);
-#else
-	dstptr = mgchashSearch(gcpointertbl, objptr);
-#endif
-	//MGCHashget(gcpointertbl, objptr, &dstptr);
-#ifdef GC_PROFILE
-    //flushstalltime += BAMBOO_GET_EXE_TIME()-ttime;
-#endif
-    BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT_REG(dstptr);
-#endif
-
-    if(NULL == dstptr) {
-      // no mapping info
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT(0xe403);
-      BAMBOO_DEBUGPRINT_REG(objptr);
-      BAMBOO_DEBUGPRINT_REG(hostcore(objptr));
-#endif
-      if(hostcore(objptr) == BAMBOO_NUM_OF_CORE) {
-		// error! the obj is right on this core, but cannot find it
-		//BAMBOO_DEBUGPRINT(0xecec);
-		BAMBOO_DEBUGPRINT_REG(objptr);
-		BAMBOO_EXIT(0xb103);
-		// assume that the obj has not been moved, use the original address
-		//dstptr = objptr;
-      } else {
-		int hostc = hostcore(objptr);
-#ifdef GC_PROFILE
-		//unsigned long long ttimet = BAMBOO_GET_EXE_TIME();
-#endif
-		// step b: check the corresponding sharedptbl of the host core
-		BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
-		//struct GCSharedHash * sptbl = gcrpointertbls[hostcore(objptr)];
-		mgcsharedhashtbl_t * sptbl = gcrpointertbls[hostc];
-		if(sptbl != NULL) {
-		  //GCSharedHashget(sptbl, (int)objptr, &dstptr);
-		  dstptr = mgcsharedhashSearch(sptbl, (int)objptr);
-		  if(dstptr != NULL) {
-			// remember the mapping locally for later lookups
-#ifdef LOCALHASHTBL_TEST
-			RuntimeHashadd_I(gcpointertbl, (int)objptr, (int)dstptr);
-#else
-			mgchashInsert_I(gcpointertbl, (int)objptr, (int)dstptr);
-#endif
-		  }
-		}
-		BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-#ifdef GC_PROFILE
-		//flushstalltime_i += BAMBOO_GET_EXE_TIME()-ttimet;
-#endif
-
-		if(dstptr == NULL) {
-		  // step c:
-		  // still can not get the mapping info,
-		  // send msg to host core for the mapping info
-		  gcobj2map = (int)objptr;
-		  gcismapped = false;
-		  gcmappedobj = NULL;
-#ifdef GC_PROFILE
-		  // TODO
-		  //num_mapinforequest++;
-		  //unsigned long long ttime = BAMBOO_GET_EXE_TIME();
-#endif
-#ifdef GC_PROFILE
-		  //unsigned long long ttimet = BAMBOO_GET_EXE_TIME();
-#endif
-		  // the first time require the mapping, send msg to the hostcore
-		  // for the mapping info
-		  send_msg_3(hostc, GCMAPREQUEST, (int)objptr,
-			  BAMBOO_NUM_OF_CORE, false);
-		  // spin until gcismapped becomes true
-		  // NOTE(review): presumably set by the handler of the reply
-		  // message, which also fills gcpointertbl -- confirm
-		  while(true) {
-			if(gcismapped) {
-			  break;
-			}
-		  }
-#ifdef GC_PROFILE
-		  //flushstalltime_i += BAMBOO_GET_EXE_TIME()-ttimet;
-#endif
-#ifdef GC_PROFILE
-		  // TODO
-		  //flushstalltime += BAMBOO_GET_EXE_TIME() - ttime;
-#endif
-		  // look the mapping up again in the local table
-		  BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
-#ifdef LOCALHASHTBL_TEST
-		  RuntimeHashget(gcpointertbl, objptr, &dstptr);
-#else
-		  dstptr = mgchashSearch(gcpointertbl, objptr);
-#endif
-		  //MGCHashget(gcpointertbl, objptr, &dstptr);
-		  BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-		} // if(dstptr == NULL)
-	  }    // if(hostcore(objptr) == BAMBOO_NUM_OF_CORE) else ...
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT_REG(dstptr);
-#endif
-    }     // if(NULL == dstptr)
-  }      // if(ISSHAREDOBJ(objptr))
-         // if not a shared obj, return NULL to indicate no need to flush
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xe404);
-#endif
-  return dstptr;
-} // void * flushObj(void * objptr)
-
-// Update every object reference held by the runtime itself with the
-// address returned by flushObj; a reference is left untouched whenever
-// flushObj returns NULL. Covered, in this order: the garbage list starting
-// at stackptr, the per-class object sets of this core, the current task
-// descriptor, the active tasks, the queue of received objects, the queue of
-// objects to be transferred, and the runtime lock table.
-inline void flushRuntimeObj(struct garbagelist * stackptr) {
-  int idx;
-  int q;
-  void * moved;
-
-  // flush current stack
-  struct garbagelist * frame;
-  for(frame = stackptr; frame != NULL; frame = frame->next) {
-    for(idx = 0; idx < frame->size; idx++) {
-      if(frame->array[idx] == NULL) {
-        continue;
-      }
-      moved = flushObj(frame->array[idx]);
-      if(moved != NULL) {
-        frame->array[idx] = moved;
-      }
-    }
-  }
-
-  // flush objectsets
-  if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) {
-    for(idx = 0; idx < NUMCLASSES; idx++) {
-      struct parameterwrapper ** queues =
-        objectqueues[BAMBOO_NUM_OF_CORE][idx];
-      int length = numqueues[BAMBOO_NUM_OF_CORE][idx];
-      for(q = 0; q < length; ++q) {
-        struct parameterwrapper * parameter = queues[q];
-        struct ObjectHash * set = parameter->objectset;
-        struct ObjectNode * node;
-        for(node = set->listhead; node != NULL; node = node->lnext) {
-          moved = flushObj((void *)node->key);
-          if(moved != NULL) {
-            node->key = moved;
-          }
-        }
-        // keys may have changed, so rebuild the hash
-        ObjectHashrehash(set);
-      }
-    }
-  }
-
-  // flush current task descriptor
-  if(currtpd != NULL) {
-    for(idx = 0; idx < currtpd->numParameters; idx++) {
-      moved = flushObj(currtpd->parameterArray[idx]);
-      if(moved != NULL) {
-        currtpd->parameterArray[idx] = moved;
-      }
-    }
-  }
-
-  // flush active tasks
-  if(activetasks != NULL) {
-    struct genpointerlist * entry;
-    for(entry = activetasks->list; entry != NULL; entry = entry->inext) {
-      struct taskparamdescriptor *tpd=entry->src;
-      int p;
-      for(p = 0; p < tpd->numParameters; p++) {
-        moved = flushObj(tpd->parameterArray[p]);
-        if(moved != NULL) {
-          tpd->parameterArray[p] = moved;
-        }
-      }
-    }
-    // parameters may have changed, so rebuild the hash
-    genrehash(activetasks);
-  }
-
-  // flush cached transferred obj
-  struct QueueItem * received;
-  for(received = getHead(&objqueue); received != NULL;
-      received = getNextQueueItem(received)) {
-    struct transObjInfo * objInfo =
-      (struct transObjInfo *)(received->objectptr);
-    moved = flushObj(objInfo->objptr);
-    if(moved != NULL) {
-      objInfo->objptr = moved;
-    }
-  }
-
-  // flush cached objs to be transferred
-  struct QueueItem * pending;
-  for(pending = getHead(totransobjqueue); pending != NULL;
-      pending = getNextQueueItem(pending)) {
-    struct transObjInfo * totransobj =
-      (struct transObjInfo *)(pending->objectptr);
-    moved = flushObj(totransobj->objptr);
-    if(moved != NULL) {
-      totransobj->objptr = moved;
-    }
-  }
-
-  // flush lock related info
-  for(idx = 0; idx < runtime_locklen; ++idx) {
-    moved = flushObj(runtime_locks[idx].redirectlock);
-    if(moved != NULL) {
-      runtime_locks[idx].redirectlock = (int)moved;
-    }
-    if(runtime_locks[idx].value != NULL) {
-      moved = flushObj(runtime_locks[idx].value);
-      if(moved != NULL) {
-        runtime_locks[idx].value = (int)moved;
-      }
-    }
-  }
-
-} // void flushRuntimeObj(struct garbagelist * stackptr)
-
-// Map phase: send this core's gcsharedptbl pointer to every other active
-// core in a GCMAPTBL message, then (unless this is the startup core) send
-// GCFINISHMAPINFO to STARTUPCORE.
-inline void transmappinginfo() {
-  int core;
-  // broadcast the sharedptbl pointer
-  for(core = 0; core < NUMCORESACTIVE; core++) {
-    if(core == BAMBOO_NUM_OF_CORE) {
-      continue;   // no message to ourselves
-    }
-    send_msg_3(core, GCMAPTBL, gcsharedptbl, BAMBOO_NUM_OF_CORE, false);
-  }
-
-  if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
-    send_msg_2(STARTUPCORE, GCFINISHMAPINFO, BAMBOO_NUM_OF_CORE, false);
-  }
-}
-
-// Replace every reference stored inside the object at ptr with the address
-// returned by flushObj (references for which flushObj returns NULL are left
-// unchanged). The layout is taken from pointerarray[type], where type is
-// the first int of the object:
-//   0         -- array of primitives, nothing to do
-//   1         -- array of pointers, elements follow ___length___
-//   otherwise -- entry 0 is the number of reference fields and entries
-//                1..n are their byte offsets inside the object
-// dbgcode is the first of five consecutive DEBUG print codes used while
-// scanning; it is ignored in non-DEBUG builds.
-static inline void flushObjPointers(void * ptr, int dbgcode) {
-  int type = ((int *)(ptr))[0];
-  // scan all pointers in ptr
-  unsigned INTPTR * pointer;
-  pointer=pointerarray[type];
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(dbgcode);
-  BAMBOO_DEBUGPRINT_REG(pointer);
-#else
-  (void)dbgcode;
-#endif
-  if (pointer==0) {
-    /* Array of primitives */
-    /* Do nothing */
-  } else if (((INTPTR)pointer)==1) {
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(dbgcode+1);
-#endif
-    /* Array of pointers */
-    struct ArrayObject *ao=(struct ArrayObject *) ptr;
-    void ** elems=(void **)(((char *)&ao->___length___)+sizeof(int));
-    int length=ao->___length___;
-    int j;
-    for(j=0; j<length; j++) {
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT(dbgcode+2);
-#endif
-      void *objptr=elems[j];
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT_REG(objptr);
-#endif
-      if(objptr != NULL) {
-        void * dst = flushObj(objptr);
-        if(dst != NULL) {
-          elems[j] = dst;
-        }
-      }
-    }
-  } else {
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(dbgcode+3);
-#endif
-    INTPTR size=pointer[0];
-    int i;
-    for(i=1; i<=size; i++) {
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT(dbgcode+4);
-#endif
-      unsigned int offset=pointer[i];
-      void ** field=(void **)(((char *)ptr)+offset);
-      void * objptr=*field;
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT_REG(objptr);
-#endif
-      if(objptr != NULL) {
-        void * dst = flushObj(objptr);
-        if(dst != NULL) {
-          *field = dst;
-        }
-      }
-    } // for(i=1; i<=size; i++)
-  }  // if (pointer==0) else if (((INTPTR)pointer)==1) else ()
-}
-
-// Flush phase of one core:
-//   1. flush the references held by the runtime (flushRuntimeObj)
-//   2. drain the gc queue; shared objs are first translated with flushObj
-//      (BAMBOO_EXIT(0xb105) if no mapping exists), then the references
-//      inside non-shared objs and inside shared objs marked COMPACTED are
-//      flushed and the mark of shared objs is reset to INIT
-//   3. drain the large-obj queue in the same way (BAMBOO_EXIT(0xb106) if
-//      no mapping exists)
-//   4. report completion: the startup core clears its own gccorestatus
-//      entry, every other core sends GCFINISHFLUSH to STARTUPCORE
-// The DEBUG output of the pre-flush address uses a local tptr; the
-// identifier was previously referenced without being declared, which broke
-// DEBUG builds.
-inline void flush(struct garbagelist * stackptr) {
-#ifdef GC_PROFILE
-  /* TODO if(BAMBOO_NUM_OF_CORE == 0) {
-    BAMBOO_DEBUGPRINT(0xcccc);
-    BAMBOO_DEBUGPRINT_REG(BAMBOO_GET_EXE_TIME());
-  }*/
-#endif
-
-  flushRuntimeObj(stackptr);
-#ifdef GC_PROFILE
-  // TODO if(BAMBOO_NUM_OF_CORE == 0) BAMBOO_DEBUGPRINT_REG(BAMBOO_GET_EXE_TIME());
-#endif
-
-  while(true) {
-    BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
-    bool hasItems = gc_moreItems_I();
-    BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-    if(!hasItems) {
-      break;
-    }
-
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xe301);
-#endif
-    BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
-    void * ptr = gc_dequeue_I();
-    BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-    if(ISSHAREDOBJ(ptr)) {
-#ifdef DEBUG
-      void * tptr = ptr;   // pre-flush address, only used for debug output
-#endif
-      // should be a local shared obj and should have mapping info
-      ptr = flushObj(ptr);
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT(0xe302);
-      BAMBOO_DEBUGPRINT_REG(ptr);
-      BAMBOO_DEBUGPRINT_REG(tptr);
-      BAMBOO_DEBUGPRINT_REG(((int *)(tptr))[0]);
-#endif
-      if(ptr == NULL) {
-        BAMBOO_EXIT(0xb105);
-      }
-    } // if(ISSHAREDOBJ(ptr))
-    if((!ISSHAREDOBJ(ptr)) || (((int *)(ptr))[6] == COMPACTED)) {
-      // debug codes 0xe303 .. 0xe307
-      flushObjPointers(ptr, 0xe303);
-      // restore the mark field, indicating that this obj has been flushed
-      if(ISSHAREDOBJ(ptr)) {
-        ((int *)(ptr))[6] = INIT;
-      }
-    }  // if((!ISSHAREDOBJ(ptr)) || (((int *)(ptr))[6] == COMPACTED))
-  }   // while(gc_moreItems())
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xe308);
-#endif
-#ifdef GC_PROFILE
-  // TODO if(BAMBOO_NUM_OF_CORE == 0) BAMBOO_DEBUGPRINT_REG(BAMBOO_GET_EXE_TIME());
-#endif
-
-  // TODO bug here: the startup core contains all lobjs' info, thus all the
-  // lobjs are flushed in sequence.
-  // flush lobjs
-  while(gc_lobjmoreItems_I()) {
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xe309);
-#endif
-    void * ptr = gc_lobjdequeue_I(NULL, NULL);
-#ifdef DEBUG
-    void * tptr = ptr;   // pre-flush address, only used for debug output
-#endif
-    ptr = flushObj(ptr);
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xe30a);
-    BAMBOO_DEBUGPRINT_REG(ptr);
-    BAMBOO_DEBUGPRINT_REG(tptr);
-    BAMBOO_DEBUGPRINT_REG(((int *)(tptr))[0]);
-#endif
-    if(ptr == NULL) {
-      BAMBOO_EXIT(0xb106);
-    }
-    if(((int *)(ptr))[6] == COMPACTED) {
-      // debug codes 0xe30b .. 0xe30f
-      flushObjPointers(ptr, 0xe30b);
-      // restore the mark field, indicating that this obj has been flushed
-      ((int *)(ptr))[6] = INIT;
-    }     // if(((int *)(ptr))[6] == COMPACTED)
-  }     // while(gc_lobjmoreItems())
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xe310);
-#endif
-#ifdef GC_PROFILE
-  // TODO if(BAMBOO_NUM_OF_CORE == 0) BAMBOO_DEBUGPRINT_REG(BAMBOO_GET_EXE_TIME());
-#endif
-
-  // send flush finish message to core coordinator
-  if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
-    gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
-  } else {
-    send_msg_2(STARTUPCORE, GCFINISHFLUSH, BAMBOO_NUM_OF_CORE, false);
-  }
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xe311);
-#endif
-} // flush()
-
-#ifdef GC_CACHE_ADAPT
-// prepare for cache adaption:
-//   -- flush the shared heap
-//   -- clean dtlb entries
-//   -- change cache strategy
-// isgccachestage is stored into the global gccachestage; the callers in
-// this file pass true right after initGC() and false in the prefinish
-// phase.
-void cacheAdapt(bool isgccachestage) {
-  // flush the shared heap
-  BAMBOO_CACHE_FLUSH_L2();
-
-  // clean the dtlb entries
-  BAMBOO_CLEAN_DTLB();
-
-  // change the cache strategy
-  gccachestage = isgccachestage;
-}
-#endif // GC_CACHE_ADAPT
-
-// Collection routine of a gc core other than the coordinator.
-// The core announces that it reached a gc safe point and then follows the
-// global gcphase: before each step it spins until gcphase holds the phase
-// it is waiting for. Steps: init, mark, compact, map, flush, (prefinish
-// when GC_CACHE_ADAPT is defined), finish.
-inline void gc_collect(struct garbagelist * stackptr) {
-  // tell the master that this core is at a gc safe point and ready for gc
-  send_msg_4(STARTUPCORE, GCFINISHPRE, BAMBOO_NUM_OF_CORE, self_numsendobjs,
-             self_numreceiveobjs, false);
-
-  // --- init ---
-  while(gcphase != INITPHASE) {
-    // spin
-  }
-#ifdef RAWPATH // TODO GC_DEBUG
-  printf("(%X,%X) Do initGC\n", udn_tile_coord_x(), udn_tile_coord_y());
-#endif
-  initGC();
-#ifdef GC_CACHE_ADAPT
-  // switch to the gc cache stage
-  cacheAdapt(true);
-#endif // GC_CACHE_ADAPT
-  // report init finished to the core coordinator
-  send_msg_2(STARTUPCORE, GCFINISHINIT, BAMBOO_NUM_OF_CORE, false);
-
-  // --- mark and compact ---
-  while(gcphase != MARKPHASE) {
-    // spin
-  }
-#ifdef RAWPATH // TODO GC_DEBUG
-  printf("(%x,%x) Start mark phase\n", udn_tile_coord_x(),
-         udn_tile_coord_y());
-#endif
-  mark(true, stackptr);
-#ifdef RAWPATH // TODO GC_DEBUG
-  printf("(%x,%x) Finish mark phase, start compact phase\n",
-         udn_tile_coord_x(), udn_tile_coord_y());
-#endif
-  compact();
-#ifdef RAWPATH // TODO GC_DEBUG
-  printf("(%x,%x) Finish compact phase\n", udn_tile_coord_x(),
-         udn_tile_coord_y());
-#endif
-
-  // --- map ---
-  while(gcphase != MAPPHASE) {
-    // spin
-  }
-#ifdef RAWPATH // TODO GC_DEBUG
-  printf("(%x,%x) Start map phase\n", udn_tile_coord_x(),
-         udn_tile_coord_y());
-#endif
-  transmappinginfo();
-#ifdef RAWPATH // TODO GC_DEBUG
-  printf("(%x,%x) Finish map phase\n", udn_tile_coord_x(),
-         udn_tile_coord_y());
-#endif
-
-  // --- flush ---
-  while(gcphase != FLUSHPHASE) {
-    // spin
-  }
-#ifdef RAWPATH // TODO GC_DEBUG
-  printf("(%x,%x) Start flush phase\n", udn_tile_coord_x(),
-         udn_tile_coord_y());
-#endif
-#ifdef GC_PROFILE
-  // send the num of obj/liveobj/forwardobj to the startupcore
-  if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
-    send_msg_4(STARTUPCORE, GCPROFILES, gc_num_obj,
-               gc_num_liveobj, gc_num_forwardobj, false);
-  }
-  gc_num_obj = 0;
-#endif // GC_PROFILE
-  flush(stackptr);
-#ifdef RAWPATH // TODO GC_DEBUG
-  printf("(%x,%x) Finish flush phase\n", udn_tile_coord_x(),
-         udn_tile_coord_y());
-#endif
-
-#ifdef GC_CACHE_ADAPT
-  // --- prefinish ---
-  while(gcphase != PREFINISHPHASE) {
-    // spin
-  }
-#ifdef RAWPATH // TODO GC_DEBUG
-  printf("(%x,%x) Start prefinish phase\n", udn_tile_coord_x(),
-         udn_tile_coord_y());
-#endif
-  // leave the gc cache stage
-  cacheAdapt(false);
-  // report prefinish done to the core coordinator
-  send_msg_2(STARTUPCORE, GCFINISHPREF, BAMBOO_NUM_OF_CORE, false);
-#ifdef RAWPATH // TODO GC_DEBUG
-  printf("(%x,%x) Finish prefinish phase\n", udn_tile_coord_x(),
-         udn_tile_coord_y());
-#endif
-#endif // GC_CACHE_ADAPT
-
-  // --- finish ---
-  while(gcphase != FINISHPHASE) {
-    // spin
-  }
-#ifdef RAWPATH // TODO GC_DEBUG
-  printf("(%x,%x) Finish gc!\n", udn_tile_coord_x(), udn_tile_coord_y());
-#endif
-} // void gc_collect(struct garbagelist * stackptr)
-
-// Collection routine of a non-gc core.
-// Same protocol as gc_collect but without the compact and map steps: the
-// core announces its gc safe point, then spins on the global gcphase before
-// each of init, mark, flush, (prefinish when GC_CACHE_ADAPT is defined) and
-// finish.
-inline void gc_nocollect(struct garbagelist * stackptr) {
-  // tell the master that this core is at a gc safe point and ready for gc
-  send_msg_4(STARTUPCORE, GCFINISHPRE, BAMBOO_NUM_OF_CORE, self_numsendobjs,
-             self_numreceiveobjs, false);
-
-  // --- init ---
-  while(gcphase != INITPHASE) {
-    // spin
-  }
-#ifdef RAWPATH // TODO GC_DEBUG
-  printf("(%x,%x) Do initGC\n", udn_tile_coord_x(), udn_tile_coord_y());
-#endif
-  initGC();
-#ifdef GC_CACHE_ADAPT
-  // switch to the gc cache stage
-  cacheAdapt(true);
-#endif // GC_CACHE_ADAPT
-  // report init finished to the core coordinator
-  send_msg_2(STARTUPCORE, GCFINISHINIT, BAMBOO_NUM_OF_CORE, false);
-
-  // --- mark ---
-  while(gcphase != MARKPHASE) {
-    // spin
-  }
-#ifdef RAWPATH // TODO GC_DEBUG
-  printf("(%x,%x) Start mark phase\n", udn_tile_coord_x(),
-         udn_tile_coord_y());
-#endif
-  mark(true, stackptr);
-#ifdef RAWPATH // TODO GC_DEBUG
-  printf("(%x,%x) Finish mark phase, wait for flush\n",
-         udn_tile_coord_x(), udn_tile_coord_y());
-#endif
-
-  // --- flush (a non-gc core goes straight from mark to flush) ---
-  while(gcphase != FLUSHPHASE) {
-    // spin
-  }
-#ifdef RAWPATH // TODO GC_DEBUG
-  printf("(%x,%x) Start flush phase\n", udn_tile_coord_x(),
-         udn_tile_coord_y());
-#endif
-#ifdef GC_PROFILE
-  // send the num of obj/liveobj/forwardobj to the startupcore
-  if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
-    send_msg_4(STARTUPCORE, GCPROFILES, gc_num_obj,
-               gc_num_liveobj, gc_num_forwardobj, false);
-  }
-  gc_num_obj = 0;
-#endif // GC_PROFILE
-  flush(stackptr);
-#ifdef RAWPATH // TODO GC_DEBUG
-  printf("(%x,%x) Finish flush phase\n", udn_tile_coord_x(),
-         udn_tile_coord_y());
-#endif
-
-#ifdef GC_CACHE_ADAPT
-  // --- prefinish ---
-  while(gcphase != PREFINISHPHASE) {
-    // spin
-  }
-#ifdef RAWPATH // TODO GC_DEBUG
-  printf("(%x,%x) Start prefinish phase\n", udn_tile_coord_x(),
-         udn_tile_coord_y());
-#endif
-  // leave the gc cache stage
-  cacheAdapt(false);
-  // report prefinish done to the core coordinator
-  send_msg_2(STARTUPCORE, GCFINISHPREF, BAMBOO_NUM_OF_CORE, false);
-#ifdef RAWPATH // TODO GC_DEBUG
-  printf("(%x,%x) Finish prefinish phase\n", udn_tile_coord_x(),
-         udn_tile_coord_y());
-#endif
-#endif // GC_CACHE_ADAPT
-
-  // --- finish ---
-  while(gcphase != FINISHPHASE) {
-    // spin
-  }
-#ifdef RAWPATH // TODO GC_DEBUG
-  printf("(%x,%x) Finish gc!\n", udn_tile_coord_x(), udn_tile_coord_y());
-#endif
-} // void gc_nocollect(struct garbagelist * stackptr)
-
-inline void gc_master(struct garbagelist * stackptr) {
-
-  gcphase = INITPHASE;
-  int i = 0;
-  waitconfirm = false;
-  numconfirm = 0;
-  initGC();
-
-  // Note: all cores need to init gc including non-gc cores
-  for(i = 1; i < NUMCORESACTIVE /*NUMCORES4GC*/; i++) {
-	// send GC init messages to all cores
-	send_msg_1(i, GCSTARTINIT, false);
-  }
-  bool isfirst = true;
-  bool allStall = false;
-
-#ifdef GC_CACHE_ADAPT
-  // prepare for cache adaption:
-  cacheAdapt(true);
-#endif // GC_CACHE_ADAPT
-
-#ifdef RAWPATH // TODO GC_DEBUG
-  printf("(%x,%x) Check core status \n", udn_tile_coord_x(), 
-		 udn_tile_coord_y());
-#endif
-
-  gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
-  while(true) {
-	BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
-	if(gc_checkAllCoreStatus_I()) {
-	  BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-	  break;
-	}
-	BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-  }
-#ifdef GC_PROFILE
-  gc_profileItem();
-#endif
-#ifdef RAWPATH // TODO GC_DEBUG
-  printf("(%x,%x) Start mark phase \n", udn_tile_coord_x(), 
-		 udn_tile_coord_y());
-#endif
-  // all cores have finished compacting
-  // restore the gcstatus of all cores
-  // Note: all cores have to do mark including non-gc cores
-  gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
-  for(i = 1; i < NUMCORESACTIVE /*NUMCORES4GC*/; ++i) {
-	gccorestatus[i] = 1;
-	// send GC start messages to all cores
-	send_msg_1(i, GCSTART, false);
-  }
-
-  gcphase = MARKPHASE;
-  // mark phase
-  while(MARKPHASE == gcphase) {
-	mark(isfirst, stackptr);
-	if(isfirst) {
-	  isfirst = false;
-	}
-
-	// check gcstatus
-	checkMarkStatue();
-  }   // while(MARKPHASE == gcphase)
-  // send msgs to all cores requiring large objs info
-  // Note: only need to ask gc cores, non-gc cores do not host any objs
-  numconfirm = NUMCORES4GC - 1;
-  for(i = 1; i < NUMCORES4GC; ++i) {
-	send_msg_1(i, GCLOBJREQUEST, false);
-  }
-  gcloads[BAMBOO_NUM_OF_CORE] = gccurr_heaptop;
-  while(true) {
-	if(numconfirm==0) {
-	  break;
-	}
-  }   // wait for responses
-  // check the heaptop
-  if(gcheaptop < gcmarkedptrbound) {
-	gcheaptop = gcmarkedptrbound;
-  }
-#ifdef GC_PROFILE
-  gc_profileItem();
-  // TODO
-  /*if(BAMBOO_NUM_OF_CORE == 0) {
-	BAMBOO_DEBUGPRINT(0xeeee);
-	BAMBOO_DEBUGPRINT_REG(num_markrequest);
-	BAMBOO_DEBUGPRINT_REG(marktime);
-  }*/
-#endif
-#ifdef RAWPATH // TODO GC_DEBUG
-  printf("(%x,%x) prepare to cache large objs \n", udn_tile_coord_x(),
-		 udn_tile_coord_y());
-  //dumpSMem();
-#endif
-  // cache all large objs
-  if(!cacheLObjs()) {
-	// no enough space to cache large objs
-	BAMBOO_EXIT(0xb107);
-  }
-  // predict number of blocks to fill for each core
-  int tmpheaptop = 0;
-  int numpbc = loadbalance(&tmpheaptop);
-  // TODO
-  numpbc = (BAMBOO_SHARED_MEM_SIZE)/(BAMBOO_SMEM_SIZE);
-#ifdef RAWPATH // TODO GC_DEBUG
-  printf("(%x,%x) mark phase finished \n", udn_tile_coord_x(), 
-		 udn_tile_coord_y());
-  //dumpSMem();
-#endif
-  //int tmptopptr = 0;
-  //BASEPTR(gctopcore, 0, &tmptopptr);
-  // TODO
-  //tmptopptr = gcbaseva + (BAMBOO_SHARED_MEM_SIZE);
-  tmpheaptop = gcbaseva + (BAMBOO_SHARED_MEM_SIZE);
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xabab);
-  BAMBOO_DEBUGPRINT_REG(tmptopptr);
-#endif
-  for(i = 0; i < NUMCORES4GC; ++i) {
-	int tmpcoreptr = 0;
-	BASEPTR(i, numpbc, &tmpcoreptr);
-	//send start compact messages to all cores
-	//TODO bug here, do not know if the direction is positive or negtive?
-	if (tmpcoreptr < tmpheaptop /*tmptopptr*/) {
-	  gcstopblock[i] = numpbc + 1;
-	  if(i != STARTUPCORE) {
-		send_msg_2(i, GCSTARTCOMPACT, numpbc+1, false);
-	  } else {
-		gcblock2fill = numpbc+1;
-	  }                         // if(i != STARTUPCORE)
-	} else {
-	  gcstopblock[i] = numpbc;
-	  if(i != STARTUPCORE) {
-		send_msg_2(i, GCSTARTCOMPACT, numpbc, false);
-	  } else {
-		gcblock2fill = numpbc;
-	  }    // if(i != STARTUPCORE)
-	}
-#ifdef DEBUG
-	BAMBOO_DEBUGPRINT(0xf000+i);
-	BAMBOO_DEBUGPRINT_REG(tmpcoreptr);
-	BAMBOO_DEBUGPRINT_REG(gcstopblock[i]);
-#endif
-	// init some data strutures for compact phase
-	gcloads[i] = 0;
-	gcfilledblocks[i] = 0;
-	gcrequiredmems[i] = 0;
-  }
-
-#ifdef GC_PROFILE
-  gc_profileItem();
-#endif
-
-  // compact phase
-  bool finalcompact = false;
-  // initialize pointers for comapcting
-  struct moveHelper * orig =
-	(struct moveHelper *)RUNMALLOC(sizeof(struct moveHelper));
-  struct moveHelper * to =
-	(struct moveHelper *)RUNMALLOC(sizeof(struct moveHelper));
-  initOrig_Dst(orig, to);
-  int filledblocks = 0;
-  INTPTR heaptopptr = 0;
-  bool finishcompact = false;
-  bool iscontinue = true;
-  bool localcompact = true;
-  while((COMPACTPHASE == gcphase) || (SUBTLECOMPACTPHASE == gcphase)) {
-	if((!finishcompact) && iscontinue) {
-#ifdef DEBUG
-	  BAMBOO_DEBUGPRINT(0xe001);
-	  BAMBOO_DEBUGPRINT_REG(numpbc);
-	  BAMBOO_DEBUGPRINT_REG(gcblock2fill);
-#endif
-	  finishcompact = compacthelper(orig, to, &filledblocks,
-									&heaptopptr, &localcompact);
-#ifdef DEBUG
-	  BAMBOO_DEBUGPRINT(0xe002);
-	  BAMBOO_DEBUGPRINT_REG(finishcompact);
-	  BAMBOO_DEBUGPRINT_REG(gctomove);
-	  BAMBOO_DEBUGPRINT_REG(gcrequiredmems[0]);
-	  BAMBOO_DEBUGPRINT_REG(gcfilledblocks[0]);
-	  BAMBOO_DEBUGPRINT_REG(gcstopblock[0]);
-#endif
-	}
-
-	BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
-	if(gc_checkCoreStatus_I()) {
-	  // all cores have finished compacting
-	  // restore the gcstatus of all cores
-	  for(i = 0; i < NUMCORES4GC; ++i) {
-		gccorestatus[i] = 1;
-	  }
-	  BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-	  break;
-	} else {
-	  BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-	  // check if there are spare mem for pending move requires
-	  if(COMPACTPHASE == gcphase) {
-#ifdef DEBUG
-		BAMBOO_DEBUGPRINT(0xe003);
-#endif
-		resolvePendingMoveRequest();
-#ifdef DEBUG
-		BAMBOO_DEBUGPRINT_REG(gctomove);
-#endif
-	  } else {
-#ifdef DEBUG
-		BAMBOO_DEBUGPRINT(0xe004);
-#endif
-		compact2Heaptop();
-	  }
-	}   // if(gc_checkCoreStatus_I()) else ...
-
-	if(gctomove) {
-#ifdef DEBUG
-	  BAMBOO_DEBUGPRINT(0xe005);
-	  BAMBOO_DEBUGPRINT_REG(gcmovestartaddr);
-	  BAMBOO_DEBUGPRINT_REG(gcblock2fill);
-	  BAMBOO_DEBUGPRINT_REG(gctomove);
-#endif
-	  to->ptr = gcmovestartaddr;
-	  to->numblocks = gcblock2fill - 1;
-	  to->bound = (to->numblocks==0) ?
-				  BAMBOO_SMEM_SIZE_L :
-				  BAMBOO_SMEM_SIZE_L+BAMBOO_SMEM_SIZE*to->numblocks;
-	  BASEPTR(gcdstcore, to->numblocks, &(to->base));
-	  to->offset = to->ptr - to->base;
-	  to->top = (to->numblocks==0) ?
-				(to->offset) : (to->bound-BAMBOO_SMEM_SIZE+to->offset);
-	  to->base = to->ptr;
-	  to->offset = BAMBOO_CACHE_LINE_SIZE;
-	  to->ptr += to->offset;                         // for header
-	  to->top += to->offset;
-	  if(gcdstcore == BAMBOO_NUM_OF_CORE) {
-		localcompact = true;
-	  } else {
-		localcompact = false;
-	  }
-	  gctomove = false;
-	  iscontinue = true;
-	} else if(!finishcompact) {
-	  // still pending
-	  iscontinue = false;
-	}  // if(gctomove)
-  }  // while(COMPACTPHASE == gcphase)
-#ifdef GC_PROFILE
-  gc_profileItem();
-#endif
-#ifdef RAWPATH // TODO GC_DEBUG
-  printf("(%x,%x) prepare to move large objs \n", udn_tile_coord_x(),
-		 udn_tile_coord_y());
-  //dumpSMem();
-#endif
-  // move largeObjs
-  moveLObjs();
-#ifdef RAWPATH // TODO GC_DEBUG
-  printf("(%x,%x) compact phase finished \n", udn_tile_coord_x(), 
-		 udn_tile_coord_y());
-  //dumpSMem();
-#endif
-  RUNFREE(orig);
-  RUNFREE(to);
-  orig = to = NULL;
-
-  gcphase = MAPPHASE;
-  gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
-  // Note: all cores should flush their runtime data including non-gc
-  //       cores
-  for(i = 1; i < NUMCORES4GC; ++i) {
-	// send start-mapinfo (GCSTARTMAPINFO) messages to the other gc cores
-	gccorestatus[i] = 1;
-	send_msg_1(i, GCSTARTMAPINFO, false);
-  }
-#ifdef GC_PROFILE
-  gc_profileItem();
-#endif
-#ifdef RAWPATH // TODO GC_DEBUG
-  printf("(%x,%x) Start map phase \n", udn_tile_coord_x(), 
-		 udn_tile_coord_y());
-#endif
-  // mapinto phase
-  transmappinginfo();
-#ifdef RAWPATH // TODO GC_DEBUG
-  printf("(%x,%x) Finish map phase \n", udn_tile_coord_x(), 
-		 udn_tile_coord_y());
-#endif
-  gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
-  while(MAPPHASE == gcphase) {
-	// check the status of all cores
-	BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
-	if(gc_checkCoreStatus_I()) {
-	  // all cores have finished sending mapping info 
-	  BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-	  break;
-	}
-	BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-  }  // while(MAPPHASE == gcphase)
-
-  gcphase = FLUSHPHASE;
-  gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
-  // Note: all cores should flush their runtime data including non-gc
-  //       cores
-  for(i = 1; i < NUMCORESACTIVE /*NUMCORES4GC*/; ++i) {
-	// send start flush messages to all cores
-	gccorestatus[i] = 1;
-	send_msg_1(i, GCSTARTFLUSH, false);
-  }
-#ifdef GC_PROFILE
-  gc_profileItem();
-#endif
-#ifdef RAWPATH // TODO GC_DEBUG
-  printf("(%x,%x) Start flush phase \n", udn_tile_coord_x(), 
-		 udn_tile_coord_y());
-#endif
-  // flush phase
-  flush(stackptr);
-  gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
-  while(FLUSHPHASE == gcphase) {
-	// check the status of all cores
-	BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
-	if(gc_checkAllCoreStatus_I()) {
-	  BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-	  break;
-	}
-	BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-  }  // while(FLUSHPHASE == gcphase)
-#ifdef RAWPATH // TODO GC_DEBUG
-  printf("(%x,%x) Finish flush phase \n", udn_tile_coord_x(), 
-		 udn_tile_coord_y());
-#endif
-
-#ifdef GC_CACHE_ADAPT
-  gcphase = PREFINISHPHASE;
-  gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
-  // Note: all cores should flush their runtime data including non-gc
-  //       cores
-  for(i = 1; i < NUMCORESACTIVE /*NUMCORES4GC*/; ++i) {
-	// send start-prefinish (GCSTARTPREF) messages to the other active cores
-	gccorestatus[i] = 1;
-	send_msg_1(i, GCSTARTPREF, false);
-  }
-#ifdef GC_PROFILE
-  gc_profileItem();
-#endif
-#ifdef RAWPATH // TODO GC_DEBUG
-  printf("(%x,%x) Start prefinish phase \n", udn_tile_coord_x(), 
-		 udn_tile_coord_y());
-#endif
-  // prefinish phase (cacheAdapt)
-  cacheAdapt(false);
-  gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
-  while(PREFINISHPHASE == gcphase) {
-	// check the status of all cores
-	BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
-	if(gc_checkAllCoreStatus_I()) {
-	  BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-	  break;
-	}
-	BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-  }  // while(PREFINISHPHASE == gcphase)
-#endif // GC_CACHE_ADAPT
-
-  gcphase = FINISHPHASE;
-
-  // invalidate all shared mem pointers
-  // put it here as it takes time to inform all the other cores to
-  // finish gc and it might cause problem when some core resumes
-  // mutator earlier than the other cores
-  bamboo_cur_msp = NULL;
-  bamboo_smem_size = 0;
-  bamboo_smem_zero_top = NULL;
-  gcflag = false;
-  gcprocessing = false;
-
-#ifdef GC_PROFILE
-  gc_profileEnd();
-#endif
-  gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
-  for(i = 1; i < NUMCORESACTIVE /*NUMCORES4GC*/; ++i) {
-	// send gc finish messages to all cores
-	send_msg_1(i, GCFINISH, false);
-	gccorestatus[i] = 1;
-  }
-#ifdef RAWPATH // TODO GC_DEBUG
-  printf("(%x,%x) gc finished \n", udn_tile_coord_x(), 
-		 udn_tile_coord_y());
-  //dumpSMem();
-#endif
-  //BAMBOO_DEBUGPRINT(0x1111); // TODO
-/*#ifdef GC_PROFILE_S
-  BAMBOO_DEBUGPRINT(0xaaaa);
-  BAMBOO_DEBUGPRINT_REG(gc_num_obj);
-  BAMBOO_DEBUGPRINT_REG(gc_num_liveobj);
-  BAMBOO_DEBUGPRINT_REG(gc_num_forwardobj);
-  BAMBOO_DEBUGPRINT_REG(gc_num_profiles);
-  BAMBOO_DEBUGPRINT(0xaaab);
-  if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
-	BAMBOO_DEBUGPRINT(0xaaac);
-	BAMBOO_DEBUGPRINT_REG(gc_num_livespace);
-	BAMBOO_DEBUGPRINT_REG(gc_num_freespace);
-	BAMBOO_DEBUGPRINT(0xaaad);
-  }
-  gc_num_obj = gc_num_liveobj;
-  gc_num_liveobj = 0;
-  gc_num_forwardobj = 0;
-#endif // GC_PROFLIE_S*/
-} // void gc_master(struct garbagelist * stackptr)
-
-inline bool gc(struct garbagelist * stackptr) { // per-core GC entry point; returns true only if a collection ran
-  // no collection has been requested (gcflag clear): report that nothing was done
-  if(!gcflag) {
-    gcprocessing = false;
-    return false;
-  }
-
-  // core 0 coordinates the collection; the other cores only collect or wait
-  if(0 == BAMBOO_NUM_OF_CORE) {
-#ifdef GC_DEBUG
-    printf("(%x,%X) Check if can do gc or not\n", udn_tile_coord_x(),
-		   udn_tile_coord_y());
-#endif
-	bool isallstall = true;
-	gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
-	BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
-	int ti = 0;
-	for(ti = 0; ti < NUMCORESACTIVE; ++ti) {
-	  if(gccorestatus[ti] != 0) {
-		isallstall = false;
-		break;
-	  }
-	}
-	if(!isallstall) {
-	  BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-	  // some of the cores are still executing the mutator and did not reach
-	  // some gc safe point, therefore it is not ready to do gc
-	  // in case that there are some pregc information msg lost, send a confirm
-	  // msg to the first 'busy' core found (index ti)
-	  send_msg_1(ti, GCSTARTPRE, false);
-	  gcflag = true;
-	  return false;
-	} else {
-	  // TODO
-#ifdef GC_PROFILE
-    gc_profileStart();
-#endif
-	  //BAMBOO_DEBUGPRINT(0x1111); // TODO
-pregccheck:
-	  //BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT(); // NOTE(review): disabled, so the goto path arrives here already in client mode -- confirm intended
-	  gcnumsendobjs[0][BAMBOO_NUM_OF_CORE] = self_numsendobjs;
-	  gcnumreceiveobjs[0][BAMBOO_NUM_OF_CORE] = self_numreceiveobjs;
-	  int sumsendobj = 0;
-#ifdef DEBUG
-	  BAMBOO_DEBUGPRINT(0xec04);
-#endif
-	  for(int i = 0; i < NUMCORESACTIVE; ++i) {
-		sumsendobj += gcnumsendobjs[0][i];
-#ifdef DEBUG
-		BAMBOO_DEBUGPRINT(0xf000 + gcnumsendobjs[0][i]);
-#endif
-	  }  // for(i = 0; i < NUMCORESACTIVE; ++i)
-#ifdef DEBUG
-	  BAMBOO_DEBUGPRINT(0xec05);
-	  BAMBOO_DEBUGPRINT_REG(sumsendobj);
-#endif
-	  for(int i = 0; i < NUMCORESACTIVE; ++i) {
-		sumsendobj -= gcnumreceiveobjs[0][i];
-#ifdef DEBUG
-		BAMBOO_DEBUGPRINT(0xf000 + gcnumreceiveobjs[0][i]);
-#endif
-	  }  // for(i = 0; i < NUMCORESACTIVE; ++i)
-#ifdef DEBUG
-	  BAMBOO_DEBUGPRINT(0xec06);
-	  BAMBOO_DEBUGPRINT_REG(sumsendobj);
-#endif
-	  if(0 != sumsendobj) {
-		// sent and received object counts differ, i.e. some msgs are still on
-		// the fly: wait for updated pregc information and check it again
-		gcprecheck = false;
-		BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-		//BAMBOO_DEBUGPRINT(0x2222); // TODO
-		while(true) {
-		  if(gcprecheck) {
-			break;
-		  }
-		}
-		goto pregccheck;
-	  } else {
-		BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-	  }
-	}
-/*
-#ifdef GC_PROFILE
-    gc_profileStart();
-#endif
-*/
-#ifdef RAWPATH // TODO GC_DEBUG
-    printf("(%x,%x) start gc! \n", udn_tile_coord_x(), udn_tile_coord_y());
-    //dumpSMem();
-#endif
-	// Zero out the remaining bamboo_cur_msp 
-	// Only zero out the first 4 bytes of the remaining memory
-	// Move the operation here because for the GC_CACHE_ADAPT version,
-	// we need to make sure during the gcinit phase the shared heap is not 
-	// touched. Otherwise, there would be problem when adapt the cache 
-	// strategy.
-	if((bamboo_cur_msp != 0) 
-		&& (bamboo_smem_zero_top == bamboo_cur_msp) 
-		&& (bamboo_smem_size > 0)) {
-	  *((int *)bamboo_cur_msp) = 0;
-	}
-#ifdef GC_FLUSH_DTLB
-	if(gc_num_flush_dtlb < GC_NUM_FLUSH_DTLB) {
-	  BAMBOO_CLEAN_DTLB();
-	  gc_num_flush_dtlb++;
-	}
-#endif
-#ifdef GC_CACHE_ADAPT
-    //BAMBOO_DEBUGPRINT(BAMBOO_GET_EXE_TIME());
-    // disable the timer interrupt
-    bamboo_mask_timer_intr();
-    // get the sampling data TODO
-    bamboo_output_dtlb_sampling();
-#endif // GC_CACHE_ADAPT
-	gcprocessing = true;
-	gc_master(stackptr);
-  } else if(BAMBOO_NUM_OF_CORE < NUMCORES4GC) {
-	// a gc core other than core 0: same preparation as above, then gc_collect()
-	// Only zero out the first 4 bytes of the remaining memory
-	// Move the operation here because for the GC_CACHE_ADAPT version,
-	// we need to make sure during the gcinit phase the shared heap is not 
-	// touched. Otherwise, there would be problem when adapt the cache 
-	// strategy.
-	if((bamboo_cur_msp != 0) 
-		&& (bamboo_smem_zero_top == bamboo_cur_msp) 
-		&& (bamboo_smem_size > 0)) {
-	  *((int *)bamboo_cur_msp) = 0;
-	}
-#ifdef GC_FLUSH_DTLB
-	if(gc_num_flush_dtlb < GC_NUM_FLUSH_DTLB) {
-	  BAMBOO_CLEAN_DTLB();
-	  gc_num_flush_dtlb++;
-	}
-#endif
-#ifdef GC_CACHE_ADAPT
-	// disable the timer interrupt
-	bamboo_mask_timer_intr();
-	// get the sampling data TODO
-	bamboo_output_dtlb_sampling();
-#endif // GC_CACHE_ADAPT
-    gcprocessing = true;
-    gc_collect(stackptr);
-
-    // invalidate all shared mem pointers
-    bamboo_cur_msp = NULL;
-    bamboo_smem_size = 0;
-	bamboo_smem_zero_top = NULL;
-    gcflag = false;
-    gcprocessing = false;
-  } else {
-	// core index >= NUMCORES4GC: same preparation, then gc_nocollect()
-	// Only zero out the first 4 bytes of the remaining memory
-	// Move the operation here because for the GC_CACHE_ADAPT version,
-	// we need to make sure during the gcinit phase the shared heap is not 
-	// touched. Otherwise, there would be problem when adapt the cache 
-	// strategy.
-	if((bamboo_cur_msp != 0) 
-		&& (bamboo_smem_zero_top == bamboo_cur_msp) 
-		&& (bamboo_smem_size > 0)) {
-	  *((int *)bamboo_cur_msp) = 0;
-	}
-#ifdef GC_FLUSH_DTLB
-	if(gc_num_flush_dtlb < GC_NUM_FLUSH_DTLB) {
-	  BAMBOO_CLEAN_DTLB();
-	  gc_num_flush_dtlb++;
-	}
-#endif
-#ifdef GC_CACHE_ADAPT
-	// disable the timer interrupt
-	bamboo_mask_timer_intr();
-	// get the sampling data TODO
-	bamboo_output_dtlb_sampling();
-#endif // GC_CACHE_ADAPT
-    // not a gc core, should wait for gcfinish msg
-    gcprocessing = true;
-    gc_nocollect(stackptr);
-
-    // invalidate all shared mem pointers
-    bamboo_cur_msp = NULL;
-    bamboo_smem_size = 0;
-    bamboo_smem_zero_top = NULL;
-	gcflag = false;
-    gcprocessing = false;
-  }
-#ifdef GC_CACHE_ADAPT
-  // reset the sampling arrays
-  bamboo_dtlb_sampling_reset();
-  // enable the timer interrupt
-  bamboo_tile_timer_set_next_event(500000000); // TODO
-  bamboo_unmask_timer_intr();
-#endif // GC_CACHE_ADAPT
-  //if(STARTUPCORE == BAMBOO_NUM_OF_CORE) BAMBOO_DEBUGPRINT(0xeeee); // TODO 
-  return true;
-} // bool gc(struct garbagelist * stackptr)
-
-#ifdef GC_PROFILE
-inline void gc_profileStart(void) { // opens a new GCInfo record at gc_infoArray[gc_infoIndex]
-  if(!gc_infoOverflow) { // profiling is skipped once the overflow flag is set
-    GCInfo* gcInfo = RUNMALLOC(sizeof(struct gc_info)); // NOTE(review): result is used without a NULL check
-    gc_infoArray[gc_infoIndex] = gcInfo;
-    gcInfo->index = 1; // slot 0 is written on the next line, so the next free slot is 1
-    gcInfo->time[0] = BAMBOO_GET_EXE_TIME();
-  }
-}
-
-inline void gc_profileItem(void) { // appends one BAMBOO_GET_EXE_TIME() sample to the current GCInfo record
-  if(!gc_infoOverflow) {
-    GCInfo* gcInfo = gc_infoArray[gc_infoIndex]; // record stored by gc_profileStart()
-    gcInfo->time[gcInfo->index++] = BAMBOO_GET_EXE_TIME(); // NOTE(review): index is not checked against GC_PROFILE_NUM_FIELD
-  }
-}
-
-inline void gc_profileEnd(void) { // closes the current record: one final time sample followed by seven counters
-  if(!gc_infoOverflow) {
-    GCInfo* gcInfo = gc_infoArray[gc_infoIndex];
-    gcInfo->time[gcInfo->index++] = BAMBOO_GET_EXE_TIME(); // final time sample
-	gcInfo->time[gcInfo->index++] = gc_num_livespace; // the counters share the time[] array with the samples
-	gcInfo->time[gcInfo->index++] = gc_num_freespace;
-	gcInfo->time[gcInfo->index++] = gc_num_lobj;
-	gcInfo->time[gcInfo->index++] = gc_num_lobjspace;
-	gcInfo->time[gcInfo->index++] = gc_num_obj;
-	gcInfo->time[gcInfo->index++] = gc_num_liveobj;
-	gcInfo->time[gcInfo->index++] = gc_num_forwardobj;
-    gc_infoIndex++; // the next gc_profileStart() uses the following slot of gc_infoArray
-    if(gc_infoIndex == GCINFOLENGTH) {
-      gc_infoOverflow = true; // gc_infoArray is full: all three profile hooks become no-ops
-      //taskInfoIndex = 0;
-    }
-  }
-}
-
-// output the profiling data of all recorded collections through BAMBOO_DEBUGPRINT
-void gc_outputProfileData() {
-/*#ifdef USEIO
-  int i,j;
-  unsigned long long totalgc = 0;
-
-  //printf("Start Time, End Time, Duration\n");
-  // output task related info
-  for(i = 0; i < gc_infoIndex; i++) {
-    GCInfo * gcInfo = gc_infoArray[i];
-    unsigned long long tmp = 0;
-    for(j = 0; j < gcInfo->index; j++) {
-      printf("%lld(%lld), ", gcInfo->time[j], (gcInfo->time[j]-tmp));
-      tmp = gcInfo->time[j];
-    }
-    tmp = (tmp-gcInfo->time[0]);
-    printf(" ++ %lld \n", tmp);
-    totalgc += tmp;
-  }
-
-  if(gc_infoOverflow) {
-    printf("Caution: gc info overflow!\n");
-  }
-
-  printf("\n\n total gc time: %lld \n", totalgc);
-#else*/
-  int i = 0;
-  int j = 0;
-  unsigned long long totalgc = 0; // sum over all records of (last time sample - first time sample)
-
-#ifndef BAMBOO_MEMPROF
-  BAMBOO_DEBUGPRINT(0xdddd);
-#endif
-  // one iteration per record written by gc_profileStart()/gc_profileEnd()
-  for(i= 0; i < gc_infoIndex; i++) {
-    GCInfo * gcInfo = gc_infoArray[i];
-#ifdef BAMBOO_MEMPROF
-    unsigned long long tmp=gcInfo->time[gcInfo->index-8]-gcInfo->time[0]; //0; index-8 is the final time sample (7 counters follow it)
-#else
-	unsigned long long tmp = 0;
-    BAMBOO_DEBUGPRINT(0xddda);
-    for(j = 0; j < gcInfo->index - 7; j++) { // time samples only; the last 7 entries are counters
-      BAMBOO_DEBUGPRINT(gcInfo->time[j]);
-      BAMBOO_DEBUGPRINT(gcInfo->time[j]-tmp); // delta to the previous sample (to 0 for the first one)
-      BAMBOO_DEBUGPRINT(0xdddb);
-      tmp = gcInfo->time[j];
-    }
-    tmp = (tmp-gcInfo->time[0]);
-    BAMBOO_DEBUGPRINT_REG(tmp);
-	BAMBOO_DEBUGPRINT(0xdddc);
-	BAMBOO_DEBUGPRINT(gcInfo->time[gcInfo->index - 7]); // counters, in the order gc_profileEnd() stored them
-	BAMBOO_DEBUGPRINT(gcInfo->time[gcInfo->index - 6]);
-	BAMBOO_DEBUGPRINT(gcInfo->time[gcInfo->index - 5]);
-	BAMBOO_DEBUGPRINT(gcInfo->time[gcInfo->index - 4]);
-	BAMBOO_DEBUGPRINT(gcInfo->time[gcInfo->index - 3]);
-	BAMBOO_DEBUGPRINT(gcInfo->time[gcInfo->index - 2]);
-	BAMBOO_DEBUGPRINT(gcInfo->time[gcInfo->index - 1]);
-    BAMBOO_DEBUGPRINT(0xddde);
-#endif
-    totalgc += tmp;
-  }
-#ifndef BAMBOO_MEMPROF
-  BAMBOO_DEBUGPRINT(0xdddf);
-#endif
-  BAMBOO_DEBUGPRINT_REG(totalgc);
-
-  if(gc_infoOverflow) {
-    BAMBOO_DEBUGPRINT(0xefee); // marker printed when gc_infoArray filled up
-  }
-
-#ifndef BAMBOO_MEMPROF
-  BAMBOO_DEBUGPRINT(0xeeee);
-#endif
-//#endif
-}
-#endif  // #ifdef GC_PROFILE
-
-#endif
diff --git a/Robust/src/Runtime/multicoregarbage.h b/Robust/src/Runtime/multicoregarbage.h
deleted file mode 100644
index a824bf75..00000000
--- a/Robust/src/Runtime/multicoregarbage.h
+++ /dev/null
@@ -1,276 +0,0 @@
-#ifndef MULTICORE_GARBAGE_H
-#define MULTICORE_GARBAGE_H
-#include "multicoregc.h"
-#include "multicorehelper.h"  // for mappins between core # and block #
-#include "structdefs.h"
-#include "MGCHash.h"
-#include "GCSharedHash.h"
-
-#ifndef bool
-#define bool int
-#endif
-
-// data structures for GC
-#ifdef GC_DEBUG
-#define BAMBOO_SMEM_SIZE_L (BAMBOO_SMEM_SIZE * 2)
-#else
-#define BAMBOO_SMEM_SIZE_L (BAMBOO_SMEM_SIZE * 2) // same value as the GC_DEBUG branch above
-#endif
-#define BAMBOO_LARGE_SMEM_BOUND (BAMBOO_SMEM_SIZE_L*NUMCORES4GC)
-// let each gc core to have one big block, this is very important
-// for the computation of NUMBLOCKS(s, n), DO NOT change this!
-
-#ifdef GC_FLUSH_DTLB
-#define GC_NUM_FLUSH_DTLB 1
-int gc_num_flush_dtlb; // gc() cleans the DTLB only while this is below GC_NUM_FLUSH_DTLB
-#endif
-
-#define NUMPTRS 100
-
-// for GC profile
-#ifdef GC_PROFILE
-#define GCINFOLENGTH 100 // capacity of gc_infoArray (one record per collection)
-
-#ifdef GC_CACHE_ADAPT
-#define GC_PROFILE_NUM_FIELD 16
-#else
-#define GC_PROFILE_NUM_FIELD 15
-#endif
-
-typedef struct gc_info {
-  unsigned long long time[GC_PROFILE_NUM_FIELD]; // time samples, then the counters appended by gc_profileEnd()
-  int index; // next free slot in time[]
-} GCInfo;
-
-GCInfo * gc_infoArray[GCINFOLENGTH];
-int gc_infoIndex; // slot of the record currently being filled
-bool gc_infoOverflow; // set by gc_profileEnd() when gc_infoIndex reaches GCINFOLENGTH
-unsigned long long gc_num_livespace;
-unsigned long long gc_num_freespace;
-unsigned long long gc_num_lobjspace;
-unsigned int gc_num_lobj;
-
-// TODO
-/*unsigned long long flushstalltime;
-unsigned long long flushstalltime_i;
-int num_mapinforequest_i;*/
-unsigned int gc_num_liveobj;
-unsigned int gc_num_obj;
-unsigned int gc_num_forwardobj;
-int gc_num_profiles;
-
-#endif // GC_PROFILE
-
-typedef enum { // GCOBJFLAG: presumably per-object GC state; usage is not visible in this chunk
-  INIT = 0,           // 0
-  DISCOVERED = 2,     // 2
-  REMOTEM = 4,        // 4
-  MARKED = 8,         // 8
-  COMPACTED = 16,     // 16
-  FLUSHED = 32,       // 32
-  END = 33            // 33 (the only non-zero value that is not a power of two)
-} GCOBJFLAG;
-
-typedef enum { // GCPHASETYPE: values of gcphase, numbered consecutively from 0
-  INITPHASE = 0x0,         // 0x0
-  MARKPHASE,               // 0x1
-  COMPACTPHASE,            // 0x2
-  SUBTLECOMPACTPHASE,      // 0x3
-  MAPPHASE,                // 0x4
-  FLUSHPHASE,              // 0x5
-#ifdef GC_CACHE_ADAPT
-  PREFINISHPHASE,          // 0x6 (exists only in GC_CACHE_ADAPT builds)
-#endif // GC_CACHE_ADAPT
-  FINISHPHASE              // 0x6 without GC_CACHE_ADAPT, 0x7 with it
-} GCPHASETYPE;
-
-volatile bool gcflag; // gc() returns false immediately while this is clear
-volatile bool gcprocessing; // set by gc() while this core is inside a collection
-volatile GCPHASETYPE gcphase; // indicating GC phase
-
-volatile bool gcpreinform; // NOTE(review): old comment said "counter for stopped cores" but this is a bool flag -- confirm meaning
-volatile bool gcprecheck; // indicates if there are updated pregc information (gc() spins until it is set)
-
-int gccurr_heaptop;
-struct MGCHash * gcforwardobjtbl; // cache forwarded objs in mark phase
-// for mark phase termination
-volatile int gccorestatus[NUMCORESACTIVE]; // records status of each core
-                                           // 1: running gc
-                                           // 0: stall
-volatile int gcnumsendobjs[2][NUMCORESACTIVE]; // the # of objects sent out
-volatile int gcnumreceiveobjs[2][NUMCORESACTIVE]; // the # of objects received
-volatile int gcnumsrobjs_index;  // indicates which entry to record the info
-                                 // received before phase 1 of the mark finish
-                                 // checking process
-                                 // the info received in phase 2 must be
-                                 // recorded in the other entry
-volatile bool gcbusystatus;
-int gcself_numsendobjs;
-int gcself_numreceiveobjs;
-
-// for load balancing
-INTPTR gcheaptop;
-int gcloads[NUMCORES4GC];
-int gctopcore; // the core host the top of the heap
-int gctopblock; // the number of current top block
-
-int gcnumlobjs;
-
-// compact instruction
-INTPTR gcmarkedptrbound;
-int gcblock2fill;
-int gcstopblock[NUMCORES4GC]; // indicate when to stop compact phase
-int gcfilledblocks[NUMCORES4GC]; //indicate how many blocks have been filled
-// move instruction;
-INTPTR gcmovestartaddr; // gc_master copies this into to->ptr when gctomove is set
-int gcdstcore; // core whose block is the move destination (passed to BASEPTR by gc_master)
-volatile bool gctomove; // a move instruction is pending; gc_master clears it after setting up the destination
-int gcrequiredmems[NUMCORES4GC]; //record pending mem requests
-volatile int gcmovepending;
-
-// data structures to record remote cores that transferred the marked 
-// objs in the mark phase (currently commented out)
-/*struct rcoreinfo{
-  int high;
-  int low;
-};
-struct RuntimeHash * gcrcoretbl;
-#define NUM_MAPPING 40
-void * gcmappingtbl[NUMCORESACTIVE][NUM_MAPPING];*/
-
-// shared memory pointer for shared pointer mapping tbls
-// In GC version, this block of memory is located at the bottom of the 
-// shared memory, right on the top of the smem tbl.
-// The bottom of the shared memory = sbstart tbl + smemtbl 
-//                                  + NUMCORES4GC bamboo_rmsp
-// These three types of table always reside at the bottom of the shared 
-// memory and will never be moved or garbage collected
-#ifdef GC_SMALLPAGESIZE
-#define BAMBOO_RMSP_SIZE (1024 * 1024)
-#else
-#define BAMBOO_RMSP_SIZE (BAMBOO_SMEM_SIZE) // (45 * 16 * 1024)
-#endif
-mspace bamboo_rmsp;
-// shared pointer mapping tbl
-//volatile struct GCSharedHash * gcsharedptbl;
-mgcsharedhashtbl_t * gcsharedptbl;
-// remote shared pointer tbls
-//struct GCSharedHash * gcrpointertbls[NUMCORES4GC];
-mgcsharedhashtbl_t * gcrpointertbls[NUMCORES4GC];
-
-#ifdef LOCALHASHTBL_TEST
-struct RuntimeHash * gcpointertbl;
-#else
-mgchashtable_t * gcpointertbl;
-#endif
-//struct MGCHash * gcpointertbl;
-int gcobj2map;
-int gcmappedobj;
-volatile bool gcismapped;
-
-// table recording the starting address of each small block
-// (size is BAMBOO_SMEM_SIZE)
-// Note: 1. this table always resides on the very bottom of the shared memory
-//       2. the first two blocks are reserved for this table, would never be
-//          moved or garbage collected.
-INTPTR * gcsbstarttbl;
-int gcreservedsb;  // number of reserved sblock for sbstarttbl
-int gcnumblock; // number of total blocks in the shared mem
-int gcbaseva; // base va for shared memory without reserved sblocks (origin used by ISSHAREDOBJ/BLOCKINDEX/BASEPTR)
-#ifdef GC_CACHE_ADAPT
-int gctopva; // top va for shared memory without reserved sblocks
-volatile bool gccachestage;
-#endif // GC_CACHE_ADAPT
-
-#define ISSHAREDOBJ(p) /* non-zero iff (int)p lies strictly between gcbaseva and gcbaseva+BAMBOO_SHARED_MEM_SIZE */ \
-  ((((int)(p))>gcbaseva)&&(((int)(p))<(gcbaseva+(BAMBOO_SHARED_MEM_SIZE)))) // p is evaluated twice; the whole argument is now cast, not just its first operand
-
-#define ALIGNSIZE(s, as) /* stores through as: s with the BAMBOO_CACHE_LINE_MASK bits cleared, plus BAMBOO_CACHE_LINE_SIZE */ \
-  (*((int*)(as))) = (((s) & (~(BAMBOO_CACHE_LINE_MASK))) + (BAMBOO_CACHE_LINE_SIZE)) // expands to an assignment; as is now parenthesized before the cast
-
-// BLOCKINDEX(p, b): store through b the global block # (from 0) that contains address p;
-// offsets below BAMBOO_LARGE_SMEM_BOUND use BAMBOO_SMEM_SIZE_L blocks, later ones BAMBOO_SMEM_SIZE
-#define BLOCKINDEX(p, b) \
-  { \
-    int t = (p) - gcbaseva; \
-    if(t < (BAMBOO_LARGE_SMEM_BOUND)) { \
-      (*((int*)(b))) = t / (BAMBOO_SMEM_SIZE_L); \
-    } else { \
-      (*((int*)(b))) = NUMCORES4GC+((t-(BAMBOO_LARGE_SMEM_BOUND))/(BAMBOO_SMEM_SIZE)); \
-    } \
-  }
-
-// RESIDECORE(p, c): store through c the core # for address p (its block # modulo NUMCORES4GC*2, looked up in gc_block2core)
-#define RESIDECORE(p, c) \
-  { \
-    if(1 == (NUMCORES4GC)) { \
-      (*((int*)(c))) = 0; \
-    } else { \
-      int b; \
-      BLOCKINDEX((p), &b); \
-      (*((int*)(c))) = gc_block2core[(b%(NUMCORES4GC*2))]; \
-    } \
-  }
-
-// NUMBLOCKS(s, n): store through n the local block number (starting from 0)
-// that corresponds to heaptop s (how many bytes there are in the local heap):
-// 0 while s is below BAMBOO_SMEM_SIZE_L, otherwise 1 plus the number of whole
-// BAMBOO_SMEM_SIZE blocks beyond the first large block
-// NOTE(review): expands to a bare if/else statement -- avoid using it directly under another if
-#define NUMBLOCKS(s, n) \
-  if((s) < (BAMBOO_SMEM_SIZE_L)) { \
-    (*((int*)(n))) = 0; \
-  } else { \
-    (*((int*)(n))) = 1 + ((s) - (BAMBOO_SMEM_SIZE_L)) / (BAMBOO_SMEM_SIZE); \
-  }
-
-#define OFFSET(s, o) \
-  if((s) < BAMBOO_SMEM_SIZE_L) { \
-    (*((int*)(o))) = (s); \
-  } else { \
-    (*((int*)(o))) = ((s) - (BAMBOO_SMEM_SIZE_L)) % (BAMBOO_SMEM_SIZE); \
-  } // stores through o the offset of heaptop s inside its local block (see NUMBLOCKS); bare if/else expansion
-
-// mapping of (core # c, index n of the block on that core) to the global block index; n is evaluated twice
-#define BLOCKINDEX2(c, n) (gc_core2block[(2*(c))+((n)%2)]+((NUMCORES4GC*2)*((n)/2)))
-
-// BASEPTR(c, n, p): store through p the base address of block n of core c (global index from BLOCKINDEX2)
-#define BASEPTR(c, n, p) \
-  { \
-    int b = BLOCKINDEX2((c), (n)); \
-    if(b < (NUMCORES4GC)) { \
-      (*((int*)(p))) = gcbaseva + b * (BAMBOO_SMEM_SIZE_L); \
-    } else { \
-      (*((int*)(p))) = gcbaseva+(BAMBOO_LARGE_SMEM_BOUND)+ \
-                     (b-(NUMCORES4GC))*(BAMBOO_SMEM_SIZE); \
-    } \
-  }
-
-// the next core in the top of the heap: the gc_block2core entry of block b+1, wrapping at NUMCORES4GC*2
-#define NEXTTOPCORE(b) (gc_block2core[((b)+1)%(NUMCORES4GC*2)])
-
-inline bool gc(struct garbagelist * stackptr); // core coordinator routine; returns false when no collection was run
-inline void gc_collect(struct garbagelist* stackptr); //core collector routine (called by gc() on cores 1..NUMCORES4GC-1)
-inline void gc_nocollect(struct garbagelist* stackptr); //non-gc core collector routine (cores >= NUMCORES4GC)
-inline void transferMarkResults_I();
-inline void gc_enqueue_I(void *ptr);
-inline void gc_lobjenqueue_I(void *ptr, int length, int host);
-inline bool gcfindSpareMem_I(int * startaddr,
-                             int * tomove,
-                             int * dstcore,
-                             int requiredmem,
-                             int requiredcore);
-
-inline void * gc_lobjdequeue4(int * length, int * host);
-inline int gc_lobjmoreItems4();
-inline void gc_lobjqueueinit4();
-
-#ifdef GC_PROFILE
-INLINE void gc_profileStart(void); // NOTE(review): declared INLINE here, but the definitions seen in this patch use plain inline
-INLINE void gc_profileItem(void);
-INLINE void gc_profileEnd(void);
-void gc_outputProfileData();
-#endif
-
-#endif
-
diff --git a/Robust/src/Runtime/multicoregc.h b/Robust/src/Runtime/multicoregc.h
deleted file mode 100644
index 0f7ddc4c..00000000
--- a/Robust/src/Runtime/multicoregc.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef MULTICORE_GC_H
-#define MULTICORE_GC_H
-
-struct garbagelist { // list node handed to gc()/FREEMALLOC as "stackptr"; presumably one frame of GC roots -- confirm
-  int size; // presumably the number of entries in array[] -- confirm
-  struct garbagelist *next;
-  void * array[]; // flexible array member: storage follows the struct
-};
-
-struct listitem { // doubly linked list node that carries a garbagelist pointer
-  struct listitem * prev;
-  struct listitem * next;
-  struct garbagelist * stackptr;
-};
-
-#endif // MULTICORE_GC_H
diff --git a/Robust/src/Runtime/multicorehelper.h b/Robust/src/Runtime/multicorehelper.h
deleted file mode 100644
index 3519f5a7..00000000
--- a/Robust/src/Runtime/multicorehelper.h
+++ /dev/null
@@ -1,56 +0,0 @@
-#ifndef MULTICORE_HELPER_H
-#define MULTICORE_HELPER_H
-
-#ifdef GC_1
-// NUMCORES4GC = 1; gc_core2block holds two global block indices per core (entries 2*c and 2*c+1)
-static int gc_core2block[2] = {0,1};
-// gc_block2core maps a global block index (modulo 2*NUMCORES4GC) back to its core
-static int gc_block2core[2] = { 0,  0};
-#elif defined GC_56
-// NUMCORES4GC = 56; two global block indices per core (entries 2*c and 2*c+1)
-static int gc_core2block[112] = {
-  0,111,  15, 96,  16,95,  31,80,  32,79,  47,64,  48,63,
-  1,110,  14, 97,  17,94,  30,81,  33,78,  46,65,  49,62,
-  2,109,  13, 98,  18,93,  29,82,  34,77,  45,66,  50,61,
-  3,108,  12, 99,  19,92,  28,83,  35,76,  44,67,  51,60,
-  4,107,  11,100,  20,91,  27,84,  36,75,  43,68,  52,59,
-  5,106,  10,101,  21,90,  26,85,  37,74,  42,69,  53,58,
-  6,105,   9,102,  22,89,  25,86,  38,73,  41,70,  54,57,
-  7,104,   8,103,  23,88,  24,87,  39,72,  40,71,  55,56
-};
-// inverse lookup: global block index (modulo 112) -> core
-static int gc_block2core[112] = {
-  0,  7, 14, 21, 28, 35, 42, 49, 50, 43, 36, 29, 22, 15,  8,  1,
-  2,  9, 16, 23, 30, 37, 44, 51, 52, 45, 38, 31, 24, 17, 10,  3,
-  4, 11, 18, 25, 32, 39, 46, 53, 54, 47, 40, 33, 26, 19, 12,  5,
-  6, 13, 20, 27, 34, 41, 48, 55, 55, 48, 41, 34, 27, 20, 13,  6,
-  5, 12, 19, 26, 33, 40, 47, 54, 53, 46, 39, 32, 25, 18, 11,  4,
-  3, 10, 17, 24, 31, 38, 45, 52, 51, 44, 37, 30, 23, 16,  9,  2,
-  1,  8, 15, 22, 29, 36, 43, 50, 49, 42, 35, 28, 21, 14,  7,  0
-};
-#elif defined GC_62
-// NUMCORES4GC = 62; two global block indices per core (entries 2*c and 2*c+1)
-static int gc_core2block[124] = {
-  0,123,  15,108,  16,107,  31,92,  32,91,  47,76,
-  1,122,  14,109,  17,106,  30,93,  33,90,  46,77,  48,75,  61,62,
-  2,121,  13,110,  18,105,  29,94,  34,89,  45,78,  49,74,  60,63,
-  3,120,  12,111,  19,104,  28,95,  35,88,  44,79,  50,73,  59,64,
-  4,119,  11,112,  20,103,  27,96,  36,87,  43,80,  51,72,  58,65,
-  5,118,  10,113,  21,102,  26,97,  37,86,  42,81,  52,71,  57,66,
-  6,117,   9,114,  22,101,  25,98,  38,85,  41,82,  53,70,  56,67,
-  7,116,   8,115,  23,100,  24,99,  39,84,  40,83,  54,69,  55,68
-};
-// inverse lookup: global block index (modulo 124) -> core
-static int gc_block2core[124] = {
-  0,  6, 14, 22, 30, 38, 46, 54, 55, 47, 39, 31, 23, 15,  7,  1,
-  2,  8, 16, 24, 32, 40, 48, 56, 57, 49, 41, 33, 25, 17,  9,  3,
-  4, 10, 18, 26, 34, 42, 50, 58, 59, 51, 43, 35, 27, 19, 11,  5,
-  12, 20, 28, 36, 44, 52, 60, 61, 53, 45, 37, 29, 21, 13,
-  13, 21, 29, 37, 45, 53, 61, 60, 52, 44, 36, 28, 20, 12,
-  5, 11, 19, 27, 35, 43, 51, 59, 58, 50, 42, 34, 26, 18, 10,  4,
-  3,  9, 17, 25, 33, 41, 49, 57, 56, 48, 40, 32, 24, 16,  8,  2,
-  1,  7, 15, 23, 31, 39, 47, 55, 54, 46, 38, 30, 22, 14,  6,  0
-};
-#endif
-
-#endif // MULTICORE_HELPER_H
diff --git a/Robust/src/Runtime/multicoreruntime.c b/Robust/src/Runtime/multicoreruntime.c
deleted file mode 100644
index 140d68f5..00000000
--- a/Robust/src/Runtime/multicoreruntime.c
+++ /dev/null
@@ -1,337 +0,0 @@
-#include "runtime.h"
-#include "structdefs.h"
-#include "mem.h"
-#ifndef MULTICORE
-#include <fcntl.h>
-#include <errno.h>
-#include <signal.h>
-#endif
-#ifndef RAW
-#include <stdio.h>
-#endif
-#ifdef MULTICORE
-#include "runtime_arch.h"
-#endif
-//#include "option.h"
-
-extern int classsize[]; // indexed by type id; allocate_new() allocates classsize[type] bytes
-extern int typearray[]; // instanceof() follows typearray[i] from a type id until it reads -1
-extern int typearray2[]; // second chain used by instanceof(), indexed by (type id - NUMCLASSES)
-#ifndef MULTICORE
-jmp_buf error_handler; // longjmp target of injectinstructionfailure() in TASK builds
-int instructioncount;
-
-char *options;
-int injectfailures=0;
-float failurechance=0;
-int errors=0;
-int injectinstructionfailures; // non-zero enables injectinstructionfailure()
-int failurecount;
-float instfailurechance=0; // a failure is injected when a random draw in [0,1] is below this value
-int numfailures; // 0 disables injection; positive values are decremented on each injected failure
-int instaccum=0;
-#ifdef DMALLOC
-#include "dmalloc.h"
-#endif
-#endif
-
-int debugtask=0;
-
-int instanceof(struct ___Object___ *ptr, int type) { // returns 1 if type is reachable from ptr->type via typearray or typearray2, else 0
-  int i=ptr->type;
-  do { // first chain: follow typearray until the -1 terminator
-    if (i==type)
-      return 1;
-    i=typearray[i];
-  } while(i!=-1);
-  i=ptr->type;
-  if (i>NUMCLASSES) { // NOTE(review): strict '>' skips the second chain when ptr->type == NUMCLASSES -- confirm intended
-    do { // second chain: typearray2 is indexed relative to NUMCLASSES
-      if (i==type)
-	return 1;
-      i=typearray2[i-NUMCLASSES];
-    } while(i!=-1);
-  }
-  return 0;
-}
-
-#ifdef MULTICORE
-void initializeexithandler() { // nothing to install in the MULTICORE build
-}
-#else
-void exithandler(int sig, siginfo_t *info, void * uap) { // signal handler: terminates the process with status 0
-#ifdef DEBUG
-  printf("exit in exithandler\n");
-#endif
-  exit(0);
-}
-
-void initializeexithandler() { // installs exithandler() as the SIGUSR2 handler
-  struct sigaction sig;
-  sig.sa_sigaction=&exithandler;
-  sig.sa_flags=SA_SIGINFO; // selects the three-argument sa_sigaction handler form
-  sigemptyset(&sig.sa_mask);
-  sigaction(SIGUSR2, &sig, 0); // NOTE(review): return value is ignored
-}
-#endif
-
-/* This function injects failures */
-// (no-op under MULTICORE; TASK builds longjmp to error_handler, THREADS builds call threadexit())
-void injectinstructionfailure() {
-#ifdef MULTICORE
-  // not supported in MULTICORE version
-  return;
-#else
-#ifdef TASK
-  if (injectinstructionfailures) {
-    if (numfailures==0) // failure budget used up
-      return;
-    instructioncount=failurecount;
-    instaccum+=failurecount;
-    if ((((double)random())/RAND_MAX)<instfailurechance) {
-      if (numfailures>0) // negative values are left unchanged
-	numfailures--;
-      printf("FAILURE!!! %d\n",numfailures);
-      longjmp(error_handler,11);
-    }
-  }
-#else
-#ifdef THREADS
-  if (injectinstructionfailures) {
-    if (numfailures==0)
-      return;
-    instaccum+=failurecount;
-    if ((((double)random())/RAND_MAX)<instfailurechance) {
-      if (numfailures>0)
-	numfailures--;
-      printf("FAILURE!!! %d\n",numfailures);
-      threadexit();
-    }
-  }
-#endif
-#endif
-#endif
-}
-
-#ifdef D___Double______nativeparsedouble____L___String___
-double CALL01(___Double______nativeparsedouble____L___String___,struct ___String___ * ___str___) { // parses at most the first 60 chars of the string with atof()
-  int length=VAR(___str___)->___count___;
-  int maxlength=(length>60) ? 60 : length; // cap the stack buffer at 60 characters
-  char str[maxlength+1];
-  struct ArrayObject * chararray=VAR(___str___)->___value___;
-  int i;
-  int offset=VAR(___str___)->___offset___;
-  for(i=0; i<maxlength; i++) {
-    str[i]=((short *)(((char *)&chararray->___length___)+sizeof(int)))[i+offset]; // elements are read as shorts located sizeof(int) bytes after ___length___ and narrowed to char
-  }
-  str[i]=0;
-  double d=atof(str); // atof() reports no errors
-  return d;
-}
-#endif
-
-#ifdef D___String______convertdoubletochar____D__AR_C
-int CALL12(___String______convertdoubletochar____D__AR_C, double ___val___, double ___val___, struct ArrayObject ___chararray___) { // formats ___val___ with "%f" into the char array; returns the number of characters produced
-  int length=VAR(___chararray___)->___length___;
-  char str[length];
-  int i;
-  int num=snprintf(str, length, "%f",___val___);
-  if (num>=length) // output was truncated: snprintf stored length-1 characters plus the terminator
-    num=length-1;
-  for(i=0; i<=num; i++) { // copy only str[0..num], the bytes snprintf wrote; the old loop ran to length and read uninitialized bytes
-    ((short *)(((char *)&VAR(___chararray___)->___length___)+sizeof(int)))[i]=(short)str[i];
-  }
-  return num;
-}
-#else
-int CALL12(___String______convertdoubletochar____D__AR_C, double ___val___, double ___val___, struct ArrayObject ___chararray___) { // stub used when the native method is not compiled in
-  return 0;
-}
-#endif
-
-void CALL11(___System______exit____I,int ___status___, int ___status___) { // exits with ___status___: BAMBOO_EXIT under MULTICORE, exit() otherwise
-#ifdef MULTICORE
-  BAMBOO_EXIT(___status___);
-#else
-#ifdef DEBUG
-  printf("exit in CALL11\n");
-#endif
-  exit(___status___);
-#endif
-}
-
-//#ifdef D___Vector______removeElement_____AR_L___Object____I_I
-void CALL23(___Vector______removeElement_____AR_L___Object____I_I, int ___index___, int ___size___, struct ArrayObject * ___array___, int ___index___, int ___size___) { // removes element ___index___ by shifting the following pointers down one slot
-  char* offset=((char *)(&VAR(___array___)->___length___))+sizeof(unsigned int)+sizeof(void *)*___index___; // address of the slot being removed
-  memmove(offset, offset+sizeof(void *),(___size___-___index___-1)*sizeof(void *)); // NOTE(review): no check that ___index___ < ___size___; a negative count would convert to a huge size
-}
-//#endif
-
-void CALL11(___System______printI____I,int ___status___, int ___status___) { // prints ___status___: debug-print registers under MULTICORE, printf otherwise
-#ifdef MULTICORE
-  BAMBOO_DEBUGPRINT(0x1111); // marker value printed before the number
-  BAMBOO_DEBUGPRINT_REG(___status___);
-#else
-#ifdef DEBUG
-  printf("printI in CALL11\n");
-#endif
-  printf("%d\n", ___status___);
-#endif
-}
-
-long CALL00(___System______currentTimeMillis____) {
-#ifdef MULTICORE
-  // not supported in MULTICORE version
-  return -1;
-#else
-  struct timeval tv; long long retval;
-  gettimeofday(&tv, NULL);
-  retval = tv.tv_sec; /* seconds */
-  retval*=1000; /* milliseconds */
-  retval+= (tv.tv_usec/1000); /* adjust milliseconds & add them in */
-  return retval;
-#endif
-}
-
-void CALL01(___System______printString____L___String___,struct ___String___ * ___s___) {
-#ifdef MULTICORE
-#else
-  struct ArrayObject * chararray=VAR(___s___)->___value___;
-  int i;
-  int offset=VAR(___s___)->___offset___;
-  for(i=0; i<VAR(___s___)->___count___; i++) {
-    short sc=((short *)(((char *)&chararray->___length___)+sizeof(int)))[i+offset];
-    putchar(sc);
-  }
-#endif
-}
-
-/* Object allocation function */
-
-#ifdef MULTICORE_GC
-void * allocate_new(void * ptr, int type) {
-  struct ___Object___ * v=(struct ___Object___ *)FREEMALLOC((struct garbagelist *) ptr, classsize[type]);
-#ifdef DEBUG
-  printf("(%x,%x): new object: %x (%d, %x) \n", udn_tile_coord_x(),
-         udn_tile_coord_y(), (int)v, type, classsize[type]);
-#endif
-  v->type=type;
-  v->version = 0;
-  v->lock = NULL;
-  v->lockcount = 0;
-  initlock(v);
-#ifdef GC_PROFILE
-  extern unsigned int gc_num_obj;
-  gc_num_obj++;
-#endif
-  return v;
-}
-
-/* Array allocation function */
-
-struct ArrayObject * allocate_newarray(void * ptr, int type, int length) {
-  struct ArrayObject * v=(struct ArrayObject *)FREEMALLOC((struct garbagelist *) ptr, sizeof(struct ArrayObject)+length*classsize[type]);
-#ifdef DEBUG
-  printf("(%x,%x): new array object: %x (%d, %x)\n", udn_tile_coord_x(),
-         udn_tile_coord_y(), (int)v, type, 
-		 sizeof(struct ArrayObject)+length*classsize[type]);
-#endif
-  v->type=type;
-  v->version = 0;
-  v->lock = NULL;
-  if (length<0) {
-    return NULL;
-  }
-  v->___length___=length;
-  initlock(v);
-#ifdef GC_PROFILE
-  extern unsigned int gc_num_obj;
-  gc_num_obj++;
-#endif
-  return v;
-}
-
-#else
-void * allocate_new(int type) {
-  struct ___Object___ * v=FREEMALLOC(classsize[type]);
-  v->type=type;
-  v->version = 0;
-  //v->numlocks = 0;
-  v->lock = NULL;
-  initlock(v);
-  return v;
-}
-
-/* Array allocation function */
-
-struct ArrayObject * allocate_newarray(int type, int length) {
-  struct ArrayObject * v=FREEMALLOC(sizeof(struct ArrayObject)+length*classsize[type]);
-  v->type=type;
-  v->version = 0;
-  //v->numlocks = 0;
-  v->lock = NULL;
-  v->___length___=length;
-  initlock(v);
-  return v;
-}
-#endif
-
-
-/* Converts C character arrays into Java strings */
-#ifdef MULTICORE_GC
-struct ___String___ * NewString(void * ptr, const char *str,int length) {
-#else
-struct ___String___ * NewString(const char *str,int length) {
-#endif
-  int i;
-#ifdef MULTICORE_GC
-  struct ArrayObject * chararray=allocate_newarray((struct garbagelist *)ptr, CHARARRAYTYPE, length);
-  int ptrarray[]={1, (int) ptr, (int) chararray};
-  struct ___String___ * strobj=allocate_new((struct garbagelist *) &ptrarray, STRINGTYPE);
-  chararray=(struct ArrayObject *) ptrarray[2];
-#else
-  struct ArrayObject * chararray=allocate_newarray(CHARARRAYTYPE, length);
-  struct ___String___ * strobj=allocate_new(STRINGTYPE);
-#endif
-  strobj->___value___=chararray;
-  strobj->___count___=length;
-  strobj->___offset___=0;
-
-  for(i=0; i<length; i++) {
-    ((short *)(((char *)&chararray->___length___)+sizeof(int)))[i]=(short)str[i];
-  }
-  return strobj;
-}
-
-/* Generated code calls this if we fail a bounds check */
-
-void failedboundschk() {
-#ifndef TASK
-  printf("Array out of bounds\n");
-#ifdef THREADS
-  threadexit();
-#else
-  exit(-1);
-#endif
-#else
-#ifndef MULTICORE
-  printf("Array out of bounds\n");
-  longjmp(error_handler,2);
-#else
-  BAMBOO_EXIT(0xa001);
-#endif
-#endif
-}
-
-/* Abort task call */
-void abort_task() {
-#ifdef TASK
-#ifndef MULTICORE
-  printf("Aborting\n");
-  longjmp(error_handler,4);
-#endif
-#else
-  printf("Aborting\n");
-  exit(-1);
-#endif
-}
diff --git a/Robust/src/Runtime/multicoreruntime.h b/Robust/src/Runtime/multicoreruntime.h
deleted file mode 100644
index c734d5cc..00000000
--- a/Robust/src/Runtime/multicoreruntime.h
+++ /dev/null
@@ -1,605 +0,0 @@
-#ifndef MULTICORE_RUNTIME
-#define MULTICORE_RUNTIME
-
-#ifndef INLINE
-#define INLINE    inline __attribute__((always_inline))
-#endif
-
-#ifndef bool
-#define bool int
-#define true 1
-#define false 0
-#endif
-
-////////////////////////////////////////////////////////////////
-// global variables                                          //
-///////////////////////////////////////////////////////////////
-
-// record the starting time
-unsigned long long bamboo_start_time;
-
-// data structures for msgs
-#define BAMBOO_OUT_BUF_LENGTH 3000
-#define BAMBOO_MSG_BUF_LENGTH 3000
-int msgdata[BAMBOO_MSG_BUF_LENGTH];
-volatile int msgdataindex;
-volatile int msgdatalast;
-int msglength;
-volatile bool msgdatafull;
-int outmsgdata[BAMBOO_OUT_BUF_LENGTH];
-int outmsgindex;
-int outmsglast;
-int outmsgleft;
-volatile bool isMsgHanging;
-//volatile bool isMsgSending;
-
-#define MSG_INDEXINC_I() \
-  msgdataindex = (msgdataindex + 1) % (BAMBOO_MSG_BUF_LENGTH)
-
-#define MSG_LASTINDEXINC_I() \
-  msgdatalast = (msgdatalast + 1) % (BAMBOO_MSG_BUF_LENGTH)
-
-#define MSG_CACHE_I(n) \
-  msgdata[msgdatalast] = (n); \
-  MSG_LASTINDEXINC_I()
-
-// NOTE: if msgdataindex == msgdatalast, it always means that the buffer if
-//       full. In the case that the buffer is empty, should never call this
-//       MACRO
-#define MSG_REMAINSIZE_I(s) \
-  if(msgdataindex < msgdatalast) { \
-    (*(int*)s) = msgdatalast - msgdataindex; \
-  } else if((msgdataindex == msgdatalast) && (!msgdatafull)) { \
-    (*(int*)s) = 0; \
-  }       else { \
-    (*(int*)s) = (BAMBOO_MSG_BUF_LENGTH) -msgdataindex + msgdatalast; \
-  }
-
-#define OUTMSG_INDEXINC() \
-  outmsgindex = (outmsgindex + 1) % (BAMBOO_OUT_BUF_LENGTH)
-
-#define OUTMSG_LASTINDEXINC() \
-  outmsglast = (outmsglast + 1) % (BAMBOO_OUT_BUF_LENGTH); \
-  if(outmsglast == outmsgindex) { \
-    BAMBOO_EXIT(0xdd01); \
-  }
-
-#define OUTMSG_CACHE(n) \
-  outmsgdata[outmsglast] = (n); \
-  OUTMSG_LASTINDEXINC();
-
-#define MAX_PACKET_WORDS 5
-
-/* Message format:
- *      type + Msgbody
- * type: 1 -- transfer object
- *       2 -- transfer stall msg
- *       3 -- lock request
- *       4 -- lock grount
- *       5 -- lock deny
- *       6 -- lock release
- *       // add for profile info
- *       7 -- transfer profile output msg
- *       8 -- transfer profile output finish msg
- *       // add for alias lock strategy
- *       9 -- redirect lock request
- *       a -- lock grant with redirect info
- *       b -- lock deny with redirect info
- *       c -- lock release with redirect info
- *       d -- status confirm request
- *       e -- status report msg
- *       f -- terminate
- *      10 -- requiring for new memory
- *      11 -- response for new memory request
- *      12 -- GC init phase start
- *      13 -- GC start
- *      14 -- compact phase start
- *      15 -- flush phase start
- *      16 -- init phase finish
- *      17 -- mark phase finish
- *      18 -- compact phase finish
- *      19 -- flush phase finish
- *      1a -- GC finish
- *      1b -- marked phase finish confirm request
- *      1c -- marked phase finish confirm response
- *      1d -- markedObj msg
- *      1e -- start moving objs msg
- *      1f -- ask for mapping info of a markedObj
- *      20 -- mapping info of a markedObj
- *      21 -- large objs info request
- *      22 -- large objs info response
- *      23 -- large objs mapping info
- *
- * ObjMsg: 1 + size of msg + obj's address + (task index + param index)+
- * StallMsg: 2 + corenum + sendobjs + receiveobjs
- *             (size is always 4 * sizeof(int))
- * LockMsg: 3 + lock type + obj pointer + lock + request core
- *            (size is always 5 * sizeof(int))
- *          4/5/6 + lock type + obj pointer + lock
- *            (size is always 4 * sizeof(int))
- *          9 + lock type + obj pointer +  redirect lock + root request core
- *            + request core
- *            (size is always 6 * sizeof(int))
- *          a/b + lock type + obj pointer + redirect lock
- *              (size is always 4 * sizeof(int))
- *          c + lock type + lock + redirect lock
- *            (size is always 4 * sizeof(int))
- *          lock type: 0 -- read; 1 -- write
- * ProfileMsg: 7 + totalexetime
- *               (size is always 2 * sizeof(int))
- *             8 + corenum
- *               (size is always 2 * sizeof(int))
- * StatusMsg: d (size is always 1 * sizeof(int))
- *            e + status + corenum + sendobjs + receiveobjs
- *              (size is always 5 * sizeof(int))
- *            status: 0 -- stall; 1 -- busy
- * TerminateMsg: f (size is always 1 * sizeof(int)
- * MemoryMsg: 10 + size + corenum
- *              (size is always 3 * sizeof(int))
- *           11 + base_va + size
- *              (size is always 3 * sizeof(int))
- * GCMsg: 12/13 (size is always 1 * sizeof(int))
- *        14 + size of msg + (num of objs to move + (start address
- *           + end address + dst core + start dst)+)?
- *           + (num of incoming objs + (start dst + orig core)+)?
- *           + (num of large obj lists + (start address + lenght
- *           + start dst)+)?
- *        15 (size is always 1 * sizeof(int))
- *        16 + corenum
- *           (size is always 2 * sizeof(int))
- *        17 + corenum + gcsendobjs + gcreceiveobjs
- *           (size if always 4 * sizeof(int))
- *        18 + corenum + fulfilled blocks num + (finish compact(1) + current
- *           heap top)/(need mem(0) + mem need)
- *           size is always 5 * sizeof(int))
- *        19 + corenum
- *              (size is always 2 * sizeof(int))
- *        1a (size is always 1 * sizeof(int))
- *        1b (size if always 1 * sizeof(int))
- *        1c + size of msg + corenum + gcsendobjs + gcreceiveobjs
- *           (size is always 5 * sizeof(int))
- *        1d + obj's address + request core
- *           (size is always 3 * sizeof(int))
- *        1e + corenum + start addr + end addr
- *           (size if always 4 * sizeof(int))
- *        1f + obj's address + corenum
- *           (size is always 3 * sizeof(int))
- *        20 + obj's address + dst address
- *           (size if always 3 * sizeof(int))
- *        21 (size is always 1 * sizeof(int))
- *        22 + size of msg + corenum + current heap size
- *           + (num of large obj lists + (start address + length)+)?
- *        23 + orig large obj ptr + new large obj ptr
- *            (size is always 3 * sizeof(int))
- */
-typedef enum {
-  MSGSTART = 0xD0,       // 0xD0
-  TRANSOBJ,              // 0xD1
-  TRANSTALL,             // 0xD2
-  LOCKREQUEST,           // 0xD3
-  LOCKGROUNT,            // 0xD4
-  LOCKDENY,              // 0xD5
-  LOCKRELEASE,           // 0xD6
-  PROFILEOUTPUT,         // 0xD7
-  PROFILEFINISH,         // 0xD8
-  REDIRECTLOCK,          // 0xD9
-  REDIRECTGROUNT,        // 0xDa
-  REDIRECTDENY,          // 0xDb
-  REDIRECTRELEASE,       // 0xDc
-  STATUSCONFIRM,         // 0xDd
-  STATUSREPORT,          // 0xDe
-  TERMINATE,             // 0xDf
-  MEMREQUEST,            // 0xE0
-  MEMRESPONSE,           // 0xE1
-#ifdef MULTICORE_GC
-  GCSTARTPRE,            // 0xE2
-  GCSTARTINIT,           // 0xE3
-  GCSTART,               // 0xE4
-  GCSTARTCOMPACT,        // 0xE5
-  GCSTARTMAPINFO,        // 0xE6
-  GCSTARTFLUSH,          // 0xE7
-  GCFINISHPRE,           // 0xE8
-  GCFINISHINIT,          // 0xE9
-  GCFINISHMARK,          // 0xEa
-  GCFINISHCOMPACT,       // 0xEb
-  GCFINISHMAPINFO,       // 0xEc
-  GCFINISHFLUSH,         // 0xEd
-  GCFINISH,              // 0xEe
-  GCMARKCONFIRM,         // 0xEf
-  GCMARKREPORT,          // 0xF0
-  GCMARKEDOBJ,           // 0xF1
-  GCMOVESTART,           // 0xF2
-  GCMAPREQUEST,          // 0xF3
-  GCMAPINFO,             // 0xF4
-  GCMAPTBL,              // 0xF5
-  GCLOBJREQUEST,         // 0xF6
-  GCLOBJINFO,            // 0xF7
-  GCLOBJMAPPING,         // 0xF8
-#ifdef GC_PROFILE
-  GCPROFILES,            // 0xF9
-#endif
-#ifdef GC_CACHE_ADAPT
-  GCSTARTPOSTINIT,       // 0xFa
-  GCSTARTPREF,           // 0xFb
-  GCFINISHPOSTINIT,      // 0xFc
-  GCFINISHPREF,          // 0xFd
-#endif // GC_CACHE_ADAPT
-#endif
-  MSGEND
-} MSGTYPE;
-
-/////////////////////////////////////////////////////////////////////////////////
-// NOTE: BAMBOO_TOTALCORE -- number of the available cores in the processor.
-//                           No greater than the number of all the cores in
-//                           the processor
-//       NUMCORES -- number of cores chosen to deploy the application. It can
-//                   be greater than that required to fully parallelize the
-//                   application. The same as NUMCORES.
-//       NUMCORESACTIVE -- number of cores that really execute the
-//                         application. No greater than NUMCORES
-//       NUMCORES4GC -- number of cores for gc. No greater than NUMCORES.
-//                      NOTE: currently only support ontinuous cores as gc
-//                            cores, i.e. 0~NUMCORES4GC-1
-////////////////////////////////////////////////////////////////////////////////
-// data structures of status for termination
-// only check working cores
-volatile int corestatus[NUMCORESACTIVE]; // records status of each core
-                                         // 1: running tasks
-                                         // 0: stall
-volatile int numsendobjs[NUMCORESACTIVE]; // records how many objects a core
-                                          // has sent out
-volatile int numreceiveobjs[NUMCORESACTIVE]; // records how many objects a
-                                             // core has received
-volatile int numconfirm;
-volatile bool waitconfirm;
-bool busystatus;
-int self_numsendobjs;
-int self_numreceiveobjs;
-
-// get rid of lock msgs for GC version
-#ifndef MULTICORE_GC
-// data structures for locking
-struct RuntimeHash locktable;
-static struct RuntimeHash* locktbl = &locktable;
-struct RuntimeHash * lockRedirectTbl;
-struct RuntimeHash * objRedirectLockTbl;
-#endif
-struct LockValue {
-  int redirectlock;
-  int value;
-};
-int lockobj;
-int lock2require;
-int lockresult;
-bool lockflag;
-
-// data structures for waiting objs
-struct Queue objqueue;
-struct Queue * totransobjqueue; // queue to hold objs to be transferred
-                                // should be cleared whenever enter a task
-
-// data structures for shared memory allocation
-#ifdef TILERA_BME
-#define BAMBOO_BASE_VA 0xd000000
-#elif defined TILERA_ZLINUX
-#ifdef MULTICORE_GC
-#define BAMBOO_BASE_VA 0xd000000
-#endif // MULTICORE_GC
-#endif // TILERA_BME
-
-#ifdef BAMBOO_MEMPROF
-#define GC_BAMBOO_NUMCORES 56
-#else
-#define GC_BAMBOO_NUMCORES 62
-#endif
-
-#ifdef GC_DEBUG
-#include "structdefs.h"
-#define BAMBOO_NUM_PAGES (NUMCORES4GC*(2+1)+3)
-#define BAMBOO_PAGE_SIZE (64 * 64)
-#define BAMBOO_SMEM_SIZE (64 * 64) // (BAMBOO_PAGE_SIZE)
-#define BAMBOO_SHARED_MEM_SIZE ((BAMBOO_PAGE_SIZE) *(BAMBOO_NUM_PAGES))
-#else
-#ifdef GC_LARGESHAREDHEAP
-#define BAMBOO_NUM_PAGES ((GC_BAMBOO_NUMCORES)*(2+2))
-#elif defined GC_LARGESHAREDHEAP2
-#define BAMBOO_NUM_PAGES ((GC_BAMBOO_NUMCORES)*(2+2))
-#else
-#define BAMBOO_NUM_PAGES ((GC_BAMBOO_NUMCORES)*(2+3)) //(15 * 1024) //(64 * 4 * 0.75) //(1024 * 1024 * 3.5)  3G
-#endif
-#ifdef GC_LARGEPAGESIZE
-#define BAMBOO_PAGE_SIZE (4 * 1024 * 1024)  // (4096)
-#define BAMBOO_SMEM_SIZE (4 * 1024 * 1024)
-#elif defined GC_SMALLPAGESIZE
-#define BAMBOO_PAGE_SIZE (256 * 1024)  // (4096)
-#define BAMBOO_SMEM_SIZE (256 * 1024)
-#elif defined GC_SMALLPAGESIZE2
-#define BAMBOO_PAGE_SIZE (64 * 1024)  // (4096)
-#define BAMBOO_SMEM_SIZE (64 * 1024)
-#else
-#define BAMBOO_PAGE_SIZE (1024 * 1024)  // (4096)
-#define BAMBOO_SMEM_SIZE (1024 * 1024)
-#endif // GC_LARGEPAGESIZE
-#define BAMBOO_SHARED_MEM_SIZE ((BAMBOO_PAGE_SIZE) * (BAMBOO_NUM_PAGES)) //(1024 * 1024 * 240)
-//((unsigned long long int)(3.0 * 1024 * 1024 * 1024)) // 3G 
-#endif // GC_DEBUG
-
-#ifdef MULTICORE_GC
-volatile bool gc_localheap_s;
-#endif
-
-#ifdef MULTICORE_GC
-#include "multicoregarbage.h"
-
-typedef enum {
-  SMEMLOCAL = 0x0,// 0x0, using local mem only
-  SMEMFIXED,      // 0x1, use local mem in lower address space(1 block only)
-                  //      and global mem in higher address space
-  SMEMMIXED,      // 0x2, like FIXED mode but use a threshold to control
-  SMEMGLOBAL,     // 0x3, using global mem only
-  SMEMEND
-} SMEMSTRATEGY;
-
-SMEMSTRATEGY bamboo_smem_mode; //-DSMEML: LOCAL; -DSMEMF: FIXED;
-                               //-DSMEMM: MIXED; -DSMEMG: GLOBAL;
-
-struct freeMemItem {
-  INTPTR ptr;
-  int size;
-  int startblock;
-  int endblock;
-  struct freeMemItem * next;
-};
-
-struct freeMemList {
-  struct freeMemItem * head;
-  struct freeMemItem * backuplist; // hold removed freeMemItem for reuse;
-                                   // only maintain 1 freemMemItem
-};
-
-// table recording the number of allocated bytes on each block
-// Note: this table resides on the bottom of the shared heap for all cores
-//       to access
-volatile int * bamboo_smemtbl;
-volatile int bamboo_free_block;
-//bool bamboo_smem_flushed;
-//struct freeMemList * bamboo_free_mem_list;
-int bamboo_reserved_smem; // reserved blocks on the top of the shared heap
-                          // e.g. 20% of the heap and should not be allocated
-                          // otherwise gc is invoked
-volatile INTPTR bamboo_smem_zero_top;
-#define BAMBOO_SMEM_ZERO_UNIT_SIZE (4 * 1024) // 4KB
-#else
-//volatile mspace bamboo_free_msp;
-INTPTR bamboo_free_smemp;
-int bamboo_free_smem_size;
-#endif
-volatile bool smemflag;
-volatile INTPTR bamboo_cur_msp;
-volatile int bamboo_smem_size;
-
-// for test TODO
-int total_num_t6;
-
-// data structures for profile mode
-#ifdef PROFILE
-
-#define TASKINFOLENGTH 3000 // 0
-#ifdef PROFILE_INTERRUPT
-#define INTERRUPTINFOLENGTH 50 //0
-#endif // PROFILE_INTERRUPT
-
-bool stall;
-//bool isInterrupt;
-int totalexetime;
-//unsigned long long interrupttime;
-
-typedef struct task_info {
-  char* taskName;
-  unsigned long long startTime;
-  unsigned long long endTime;
-  unsigned long long exitIndex;
-  struct Queue * newObjs;
-} TaskInfo;
-
-TaskInfo * taskInfoArray[TASKINFOLENGTH];
-int taskInfoIndex;
-bool taskInfoOverflow;
-#ifdef PROFILE_INTERRUPT
-typedef struct interrupt_info {
-  unsigned long long startTime;
-  unsigned long long endTime;
-} InterruptInfo;
-
-InterruptInfo * interruptInfoArray[INTERRUPTINFOLENGTH];
-int interruptInfoIndex;
-bool interruptInfoOverflow;
-#endif // PROFILE_INTERUPT
-volatile int profilestatus[NUMCORESACTIVE]; // records status of each core
-                                            // 1: running tasks
-                                            // 0: stall
-#endif // #ifdef PROFILE
-
-#ifndef INTERRUPT
-bool reside;
-#endif
-/////////////////////////////////////////////////////////////
-
-////////////////////////////////////////////////////////////
-// these are functions should be implemented in           //
-// multicore runtime for any multicore processors         //
-////////////////////////////////////////////////////////////
-#ifdef TASK
-#ifdef MULTICORE
-INLINE void initialization(void);
-INLINE void initCommunication(void);
-INLINE void fakeExecution(void);
-INLINE void terminate(void);
-INLINE void initlock(struct ___Object___ * v);
-#ifdef BAMBOO_MEMPROF
-INLINE void terminatememprof(void);
-#endif
-
-// lock related functions
-bool getreadlock(void* ptr);
-void releasereadlock(void* ptr);
-bool getwritelock(void* ptr);
-void releasewritelock(void* ptr);
-bool getwritelock_I(void* ptr);
-void releasewritelock_I(void * ptr);
-#ifndef MULTICORE_GC
-void releasewritelock_r(void * lock, void * redirectlock);
-#endif
-/* this function is to process lock requests.
- * can only be invoked in receiveObject() */
-// if return -1: the lock request is redirected
-//            0: the lock request is approved
-//            1: the lock request is denied
-INLINE int processlockrequest(int locktype,
-                              int lock,
-                              int obj,
-                              int requestcore,
-                              int rootrequestcore,
-                              bool cache);
-INLINE void processlockrelease(int locktype,
-                               int lock,
-                               int redirectlock,
-                               bool redirect);
-
-// msg related functions
-INLINE void send_hanging_msg(bool isInterrupt);
-INLINE void send_msg_1(int targetcore,
-                       unsigned long n0,
-					   bool isInterrupt);
-INLINE void send_msg_2(int targetcore,
-                       unsigned long n0,
-                       unsigned long n1,
-					   bool isInterrupt);
-INLINE void send_msg_3(int targetcore,
-                       unsigned long n0,
-                       unsigned long n1,
-                       unsigned long n2,
-					   bool isInterrupt);
-INLINE void send_msg_4(int targetcore,
-                       unsigned long n0,
-                       unsigned long n1,
-                       unsigned long n2,
-                       unsigned long n3,
-					   bool isInterrupt);
-INLINE void send_msg_5(int targetcore,
-                       unsigned long n0,
-                       unsigned long n1,
-                       unsigned long n2,
-                       unsigned long n3,
-                       unsigned long n4,
-					   bool isInterrupt);
-INLINE void send_msg_6(int targetcore,
-                       unsigned long n0,
-                       unsigned long n1,
-                       unsigned long n2,
-                       unsigned long n3,
-                       unsigned long n4,
-                       unsigned long n5,
-					   bool isInterrupt);
-INLINE void cache_msg_1(int targetcore,
-                        unsigned long n0);
-INLINE void cache_msg_2(int targetcore,
-                        unsigned long n0,
-                        unsigned long n1);
-INLINE void cache_msg_3(int targetcore,
-                        unsigned long n0,
-                        unsigned long n1,
-                        unsigned long n2);
-INLINE void cache_msg_4(int targetcore,
-                        unsigned long n0,
-                        unsigned long n1,
-                        unsigned long n2,
-                        unsigned long n3);
-INLINE void cache_msg_5(int targetcore,
-                        unsigned long n0,
-                        unsigned long n1,
-                        unsigned long n2,
-                        unsigned long n3,
-                        unsigned long n4);
-INLINE void cache_msg_6(int targetcore,
-                        unsigned long n0,
-                        unsigned long n1,
-                        unsigned long n2,
-                        unsigned long n3,
-                        unsigned long n4,
-                        unsigned long n5);
-INLINE void transferObject(struct transObjInfo * transObj);
-INLINE int receiveMsg(uint32_t send_port_pending);
-
-#ifdef MULTICORE_GC
-INLINE void transferMarkResults();
-#endif
-
-#ifdef PROFILE
-INLINE void profileTaskStart(char * taskname);
-INLINE void profileTaskEnd(void);
-void outputProfileData();
-#endif  // #ifdef PROFILE
-///////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-// For each version of BAMBOO runtime, there should be a header file named //
-// runtim_arch.h defining following MARCOS:                                //
-// BAMBOO_NUM_OF_CORE: the # of current residing core                      //
-// BAMBOO_GET_NUM_OF_CORE(): compute the # of current residing core        //
-// BAMBOO_COORDS(c, x, y): convert the cpu # to coords (*x, *y)            //
-// BAMBOO_DEBUGPRINT(x): print out integer x                               //
-// BAMBOO_DEBUGPRINT_REG(x): print out value of variable x                 //
-// BAMBOO_EXIT_APP(x): exit the whole application                          //
-// BAMBOO_EXIT(x): error exit routine with error #                         //
-// BAMBOO_DIE(x): error exit routine with error msg                        //
-// BAMBOO_GET_EXE_TIME(): rountine to get current clock cycle number       //
-// BAMBOO_MSG_AVAIL(): checking if there are msgs coming in                //
-// BAMBOO_GCMSG_AVAIL(): checking if there are gcmsgs coming in            //
-// BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT(): change to runtime mode from    //
-//                                          client mode                    //
-// BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME(): change to client mode from     //
-//                                          runtime mode                   //
-// BAMBOO_ENTER_SEND_MODE_FROM_CLIENT(): change to send mode from          //
-//                                       client mode                       //
-// BAMBOO_ENTER_CLIENT_MODE_FROM_SEND(): change to client mode from        //
-//                                       send mode                         //
-// BAMBOO_ENTER_RUNTIME_MODE_FROM_SEND(): change to runtime mode from      //
-//                                        send mode                        //
-// BAMBOO_ENTER_SEND_MODE_FROM_RUNTIME(): change to send mode from         //
-//                                        runtime mode                     //
-// BAMBOO_WAITING_FOR_LOCK(): routine executed while waiting for lock      //
-//                            request response                             //
-// BAMBOO_LOCAL_MEM_CALLOC(x, y): allocate an array of x elements each of  //
-//                                whose size in bytes is y on local memory //
-//                                which is given by the hypervisor         //
-// BAMBOO_LOCAL_MEM_FREE(x): free space with ptr x on local memory         //
-// BAMBOO_LOCAL_MEM_CLOSE(): close the local heap                          //
-// BAMBOO_LOCAL_MEM_CALLOC_S(x, y): allocate an array of x elements each of//
-//                                  whose size in bytes is y on local      //
-//                                  memory which is not from the hypervisor//
-//                                  but is allocated from the free memory  //
-// BAMBOO_LOCAL_MEM_FREE_S(x): free space with ptr x on self-allocated     //
-//                             local memory                                //
-// BAMBOO_LOCAL_MEM_CLOSE_S(): close the self-allocated local heap        //
-// BAMBOO_SHARE_MEM_CALLOC_I(x, y): allocate an array of x elements each of//
-//                                whose size in bytes is y on shared memory//
-// BAMBOO_SHARE_MEM_CLOSE(): close the shared heap                         //
-// BAMBOO_CACHE_LINE_SIZE: the cache line size                             //
-// BAMBOO_CACHE_LINE_MASK: mask for a cache line                           //
-// BAMBOO_CACHE_FLUSH_RANGE(x, y): flush cache lines started at x with     //
-//                                 length y                                //
-// BAMBOO_CACHE_FLUSH_ALL(): flush the whole cache of a core if necessary  //
-// BAMBOO_MEMSET_WH(x, y, z): memset the specified region of memory (start //
-//                            address x, size z) to value y with write     //
-//                            hint, the processor will not fetch the       //
-//                            current content of the memory and directly   //
-//                            write                                        //
-// BAMBOO_CLEAN_DTLB(): zero-out all the dtlb entries                      //
-// BAMBOO_CACHE_FLUSH_L2(): Flush the contents of this tile's L2 back to   //
-//                          main memory                                    //
-/////////////////////////////////////////////////////////////////////////////
-
-#endif  // #ifdef MULTICORE
-#endif  // #ifdef TASK
-#endif  // #ifndef MULTICORE_RUNTIME
diff --git a/Robust/src/Runtime/multicoretask.c b/Robust/src/Runtime/multicoretask.c
deleted file mode 100644
index cfcb41d8..00000000
--- a/Robust/src/Runtime/multicoretask.c
+++ /dev/null
@@ -1,4600 +0,0 @@
-#ifdef TASK
-#include "runtime.h"
-#include "multicoreruntime.h"
-#include "runtime_arch.h"
-#include "GenericHashtable.h"
-
-#ifndef INLINE
-#define INLINE    inline __attribute__((always_inline))
-#endif // #ifndef INLINE
-
-//  data structures for task invocation
-struct genhashtable * activetasks;
-struct taskparamdescriptor * currtpd;
-struct LockValue runtime_locks[MAXTASKPARAMS];
-int runtime_locklen;
-
-// specific functions used inside critical sections
-void enqueueObject_I(void * ptr,
-                     struct parameterwrapper ** queues,
-                     int length);
-int enqueuetasks_I(struct parameterwrapper *parameter,
-                   struct parameterwrapper *prevptr,
-                   struct ___Object___ *ptr,
-                   int * enterflags,
-                   int numenterflags);
-
-#ifdef MULTICORE_GC
-#ifdef SMEMF
-#define NUM_CORES2TEST 5
-#ifdef GC_1
-int core2test[1][NUM_CORES2TEST] = {
-  {0, -1, -1, -1, -1}
-};
-#elif defined GC_56
-int core2test[56][NUM_CORES2TEST] = {
-  { 0, -1,  7, -1,  1}, { 1, -1,  8,  0,  2}, { 2, -1,  9,  1,  3},
-  { 3, -1, 10,  2,  4}, { 4, -1, 11,  3,  5}, { 5, -1, 12,  4,  6},
-  { 6, -1, 13,  5, -1}, { 7,  0, 14, -1,  8}, { 8,  1, 15,  7,  9},
-  { 9,  2, 16,  8, 10}, {10,  3, 17,  9, 11}, {11,  4, 18, 10, 12},
-  {12,  5, 19, 11, 13}, {13,  6, 20, 12, -1}, {14,  7, 21, -1, 15},
-  {15,  8, 22, 14, 16}, {16,  9, 23, 15, 17}, {17, 10, 24, 16, 18},
-  {18, 11, 25, 17, 19}, {19, 12, 26, 18, 20}, {20, 13, 27, 19, -1},
-  {21, 14, 28, -1, 22}, {22, 15, 29, 21, 23}, {23, 16, 30, 22, 24},
-  {24, 17, 31, 23, 25}, {25, 18, 32, 24, 26}, {26, 19, 33, 25, 27},
-  {27, 20, 34, 26, -1}, {28, 21, 35, -1, 29}, {29, 22, 36, 28, 30},
-  {30, 23, 37, 29, 31}, {31, 24, 38, 30, 32}, {32, 25, 39, 31, 33},
-  {33, 26, 40, 32, 34}, {34, 27, 41, 33, -1}, {35, 28, 42, -1, 36},
-  {36, 29, 43, 35, 37}, {37, 30, 44, 36, 38}, {38, 31, 45, 37, 39},
-  {39, 32, 46, 38, 40}, {40, 33, 47, 39, 41}, {41, 34, 48, 40, -1},
-  {42, 35, 49, -1, 43}, {43, 36, 50, 42, 44}, {44, 37, 51, 43, 45},
-  {45, 38, 52, 44, 46}, {46, 39, 53, 45, 47}, {47, 40, 54, 46, 48},
-  {48, 41, 55, 47, -1}, {49, 42, -1, -1, 50}, {50, 43, -1, 49, 51},
-  {51, 44, -1, 50, 52}, {52, 45, -1, 51, 53}, {53, 46, -1, 52, 54},
-  {54, 47, -1, 53, 55}, {55, 48, -1, 54, -1}
-};
-#elif defined GC_62
-int core2test[62][NUM_CORES2TEST] = {
-  { 0, -1,  6, -1,  1}, { 1, -1,  7,  0,  2}, { 2, -1,  8,  1,  3},
-  { 3, -1,  9,  2,  4}, { 4, -1, 10,  3,  5}, { 5, -1, 11,  4, -1},
-  { 6,  0, 14, -1,  7}, { 7,  1, 15,  6,  8}, { 8,  2, 16,  7,  9},
-  { 9,  3, 17,  8, 10}, {10,  4, 18,  9, 11}, {11,  5, 19, 10, 12},
-  {12, -1, 20, 11, 13}, {13, -1, 21, 12, -1}, {14,  6, 22, -1, 15},
-  {15,  7, 23, 14, 16}, {16,  8, 24, 15, 17}, {17,  9, 25, 16, 18},
-  {18, 10, 26, 17, 19}, {19, 11, 27, 18, 20}, {20, 12, 28, 19, 21},
-  {21, 13, 29, 28, -1}, {22, 14, 30, -1, 23}, {23, 15, 31, 22, 24},
-  {24, 16, 32, 23, 25}, {25, 17, 33, 24, 26}, {26, 18, 34, 25, 27},
-  {27, 19, 35, 26, 28}, {28, 20, 36, 27, 29}, {29, 21, 37, 28, -1},
-  {30, 22, 38, -1, 31}, {31, 23, 39, 30, 32}, {32, 24, 40, 31, 33},
-  {33, 25, 41, 32, 34}, {34, 26, 42, 33, 35}, {35, 27, 43, 34, 36},
-  {36, 28, 44, 35, 37}, {37, 29, 45, 36, -1}, {38, 30, 46, -1, 39},
-  {39, 31, 47, 38, 40}, {40, 32, 48, 39, 41}, {41, 33, 49, 40, 42},
-  {42, 34, 50, 41, 43}, {43, 35, 51, 42, 44}, {44, 36, 52, 43, 45},
-  {45, 37, 53, 44, -1}, {46, 38, 54, -1, 47}, {47, 39, 55, 46, 48},
-  {48, 40, 56, 47, 49}, {49, 41, 57, 48, 50}, {50, 42, 58, 49, 51},
-  {51, 43, 59, 50, 52}, {52, 44, 60, 51, 53}, {53, 45, 61, 52, -1},
-  {54, 46, -1, -1, 55}, {55, 47, -1, 54, 56}, {56, 48, -1, 55, 57},
-  {57, 49, -1, 56, 59}, {58, 50, -1, 57, 59}, {59, 51, -1, 58, 60},
-  {60, 52, -1, 59, 61}, {61, 53, -1, 60, -1}
-};
-#endif // GC_1
-#elif defined SMEMM
-unsigned int gcmem_mixed_threshold = 0;
-unsigned int gcmem_mixed_usedmem = 0;
-#define NUM_CORES2TEST 9
-#ifdef GC_1
-int core2test[1][NUM_CORES2TEST] = {
-  {0, -1, -1, -1, -1, -1, -1, -1, -1}
-};
-#elif defined GC_56
-int core2test[56][NUM_CORES2TEST] = {
-  { 0, -1,  7, -1,  1, -1, 14, -1,  2}, 
-  { 1, -1,  8,  0,  2, -1, 15, -1,  3}, 
-  { 2, -1,  9,  1,  3, -1, 16,  0,  4}, 
-  { 3, -1, 10,  2,  4, -1, 17,  1,  5}, 
-  { 4, -1, 11,  3,  5, -1, 18,  2,  6}, 
-  { 5, -1, 12,  4,  6, -1, 19,  3, -1},
-  { 6, -1, 13,  5, -1, -1, 20,  4, -1}, 
-  { 7,  0, 14, -1,  8, -1, 21, -1,  9}, 
-  { 8,  1, 15,  7,  9, -1, 22, -1, 10}, 
-  { 9,  2, 16,  8, 10, -1, 23,  7, 11}, 
-  {10,  3, 17,  9, 11, -1, 24,  8, 12}, 
-  {11,  4, 18, 10, 12, -1, 25,  9, 13},
-  {12,  5, 19, 11, 13, -1, 26, 10, -1}, 
-  {13,  6, 20, 12, -1, -1, 27, 11, -1}, 
-  {14,  7, 21, -1, 15,  0, 28, -1, 16}, 
-  {15,  8, 22, 14, 16,  1, 29, -1, 17}, 
-  {16,  9, 23, 15, 17,  2, 30, 14, 18}, 
-  {17, 10, 24, 16, 18,  3, 31, 15, 19},
-  {18, 11, 25, 17, 19,  4, 32, 16, 20}, 
-  {19, 12, 26, 18, 20,  5, 33, 17, -1}, 
-  {20, 13, 27, 19, -1,  6, 34, 18, -1}, 
-  {21, 14, 28, -1, 22,  7, 35, -1, 23}, 
-  {22, 15, 29, 21, 23,  8, 36, -1, 24}, 
-  {23, 16, 30, 22, 24,  9, 37, 21, 25},
-  {24, 17, 31, 23, 25, 10, 38, 22, 26}, 
-  {25, 18, 32, 24, 26, 11, 39, 23, 27}, 
-  {26, 19, 33, 25, 27, 12, 40, 24, -1}, 
-  {27, 20, 34, 26, -1, 13, 41, 25, -1}, 
-  {28, 21, 35, -1, 29, 14, 42, -1, 30}, 
-  {29, 22, 36, 28, 30, 15, 43, -1, 31},
-  {30, 23, 37, 29, 31, 16, 44, 28, 32}, 
-  {31, 24, 38, 30, 32, 17, 45, 29, 33}, 
-  {32, 25, 39, 31, 33, 18, 46, 30, 34}, 
-  {33, 26, 40, 32, 34, 19, 47, 31, -1}, 
-  {34, 27, 41, 33, -1, 20, 48, 32, -1}, 
-  {35, 28, 42, -1, 36, 21, 49, -1, 37},
-  {36, 29, 43, 35, 37, 22, 50, -1, 38}, 
-  {37, 30, 44, 36, 38, 23, 51, 35, 39}, 
-  {38, 31, 45, 37, 39, 24, 52, 36, 40}, 
-  {39, 32, 46, 38, 40, 25, 53, 37, 41}, 
-  {40, 33, 47, 39, 41, 26, 54, 38, -1}, 
-  {41, 34, 48, 40, -1, 27, 55, 39, -1},
-  {42, 35, 49, -1, 43, 28, -1, -1, 44}, 
-  {43, 36, 50, 42, 44, 29, -1, -1, 45}, 
-  {44, 37, 51, 43, 45, 30, -1, 42, 46}, 
-  {45, 38, 52, 44, 46, 31, -1, 43, 47}, 
-  {46, 39, 53, 45, 47, 32, -1, 44, 48}, 
-  {47, 40, 54, 46, 48, 33, -1, 45, -1},
-  {48, 41, 55, 47, -1, 34, -1, 46, -1}, 
-  {49, 42, -1, -1, 50, 35, -1, -1, 51}, 
-  {50, 43, -1, 49, 51, 36, -1, -1, 52}, 
-  {51, 44, -1, 50, 52, 37, -1, 49, 53}, 
-  {52, 45, -1, 51, 53, 38, -1, 50, 54}, 
-  {53, 46, -1, 52, 54, 39, -1, 51, 55},
-  {54, 47, -1, 53, 55, 40, -1, 52, -1}, 
-  {55, 48, -1, 54, -1, 41, -1, 53, -1}
-};
-#elif defined GC_62
-int core2test[62][NUM_CORES2TEST] = {
-  { 0, -1,  6, -1,  1, -1, 14, -1,  2}, 
-  { 1, -1,  7,  0,  2, -1, 15, -1,  3}, 
-  { 2, -1,  8,  1,  3, -1, 16,  0,  4}, 
-  { 3, -1,  9,  2,  4, -1, 17,  1,  5}, 
-  { 4, -1, 10,  3,  5, -1, 18,  2, -1}, 
-  { 5, -1, 11,  4, -1, -1, 19,  3, -1},
-  { 6,  0, 14, -1,  7, -1, 22, -1,  8}, 
-  { 7,  1, 15,  6,  8, -1, 23, -1,  9}, 
-  { 8,  2, 16,  7,  9, -1, 24,  6, 10}, 
-  { 9,  3, 17,  8, 10, -1, 25,  7, 11}, 
-  {10,  4, 18,  9, 11, -1, 26,  8, 12}, 
-  {11,  5, 19, 10, 12, -1, 27,  9, 13},
-  {12, -1, 20, 11, 13, -1, 28, 10, -1}, 
-  {13, -1, 21, 12, -1, -1, 29, 11, -1}, 
-  {14,  6, 22, -1, 15,  0, 30, -1, 16}, 
-  {15,  7, 23, 14, 16,  1, 31, -1, 17}, 
-  {16,  8, 24, 15, 17,  2, 32, 14, 18}, 
-  {17,  9, 25, 16, 18,  3, 33, 15, 19},
-  {18, 10, 26, 17, 19,  4, 34, 16, 20}, 
-  {19, 11, 27, 18, 20,  5, 35, 17, 21}, 
-  {20, 12, 28, 19, 21, -1, 36, 18, -1}, 
-  {21, 13, 29, 28, -1, -1, 37, 19, -1}, 
-  {22, 14, 30, -1, 23,  6, 38, -1, 24}, 
-  {23, 15, 31, 22, 24,  7, 39, -1, 25},
-  {24, 16, 32, 23, 25,  8, 40, 22, 26}, 
-  {25, 17, 33, 24, 26,  9, 41, 23, 27}, 
-  {26, 18, 34, 25, 27, 10, 42, 24, 28}, 
-  {27, 19, 35, 26, 28, 11, 43, 25, 29}, 
-  {28, 20, 36, 27, 29, 12, 44, 26, -1}, 
-  {29, 21, 37, 28, -1, 13, 45, 27, -1},
-  {30, 22, 38, -1, 31, 22, 46, -1, 32}, 
-  {31, 23, 39, 30, 32, 15, 47, -1, 33}, 
-  {32, 24, 40, 31, 33, 16, 48, 30, 34}, 
-  {33, 25, 41, 32, 34, 17, 49, 31, 35}, 
-  {34, 26, 42, 33, 35, 18, 50, 32, 36}, 
-  {35, 27, 43, 34, 36, 19, 51, 33, 37},
-  {36, 28, 44, 35, 37, 20, 52, 34, -1}, 
-  {37, 29, 45, 36, -1, 21, 53, 35, -1}, 
-  {38, 30, 46, -1, 39, 22, 54, -1, 40}, 
-  {39, 31, 47, 38, 40, 23, 55, -1, 41}, 
-  {40, 32, 48, 39, 41, 24, 56, 38, 42}, 
-  {41, 33, 49, 40, 42, 25, 57, 39, 43},
-  {42, 34, 50, 41, 43, 26, 58, 40, 44}, 
-  {43, 35, 51, 42, 44, 27, 59, 41, 45}, 
-  {44, 36, 52, 43, 45, 28, 60, 42, -1}, 
-  {45, 37, 53, 44, -1, 29, 61, 43, -1}, 
-  {46, 38, 54, -1, 47, 30, -1, -1, 48}, 
-  {47, 39, 55, 46, 48, 31, -1, -1, 49},
-  {48, 40, 56, 47, 49, 32, -1, 46, 50}, 
-  {49, 41, 57, 48, 50, 33, -1, 47, 51}, 
-  {50, 42, 58, 49, 51, 34, -1, 48, 52}, 
-  {51, 43, 59, 50, 52, 35, -1, 49, 53}, 
-  {52, 44, 60, 51, 53, 36, -1, 50, -1}, 
-  {53, 45, 61, 52, -1, 37, -1, 51, -1},
-  {54, 46, -1, -1, 55, 38, -1, -1, 56}, 
-  {55, 47, -1, 54, 56, 39, -1, -1, 57}, 
-  {56, 48, -1, 55, 57, 40, -1, 54, 58}, 
-  {57, 49, -1, 56, 59, 41, -1, 55, 59}, 
-  {58, 50, -1, 57, 59, 42, -1, 56, 60}, 
-  {59, 51, -1, 58, 60, 43, -1, 57, 61},
-  {60, 52, -1, 59, 61, 44, -1, 58, -1}, 
-  {61, 53, -1, 60, -1, 45, -1, 59, -1}
-};
-#endif // GC_1
-#endif
-
-inline __attribute__((always_inline))
-void setupsmemmode(void) {
-#ifdef SMEML
-  // Only allocate local mem chunks to each core.
-  // If a core has used up its local shared memory, start gc.
-  bamboo_smem_mode = SMEMLOCAL;
-#elif defined SMEMF
-  // Allocate the local shared memory to each core with the highest priority,
-  // if a core has used up its local shared memory, try to allocate the 
-  // shared memory that belong to its neighbours, if also failed, start gc.
-  bamboo_smem_mode = SMEMFIXED;
-#elif defined SMEMM
-  // Allocate the local shared memory to each core with the highest priority,
-  // if a core has used up its local shared memory, try to allocate the 
-  // shared memory that belong to its neighbours first, if failed, check 
-  // current memory allocation rate, if it has already reached the threshold,
-  // start gc, otherwise, allocate the shared memory globally.  If all the 
-  // shared memory has been used up, start gc.
-  bamboo_smem_mode = SMEMMIXED;
-#elif defined SMEMG
-  // Allocate all the memory chunks globally, do not consider the host cores
-  // When all the shared memory are used up, start gc.
-  bamboo_smem_mode = SMEMGLOBAL;
-#else
-  // defaultly using local mode
-  bamboo_smem_mode = SMEMLOCAL;
-  //bamboo_smem_mode = SMEMGLOBAL;
-  //bamboo_smem_mode = SMEMFIXED;
-#endif
-} // void setupsmemmode(void)
-#endif
-
-inline __attribute__((always_inline))
-void initruntimedata() {
-  int i;
-  // initialize the arrays
-  if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
-    // startup core to initialize corestatus[]
-    for(i = 0; i < NUMCORESACTIVE; ++i) {
-      corestatus[i] = 1;
-      numsendobjs[i] = 0;
-      numreceiveobjs[i] = 0;
-#ifdef PROFILE
-      // initialize the profile data arrays
-      profilestatus[i] = 1;
-#endif
-#ifdef MULTICORE_GC
-      gccorestatus[i] = 1;
-      gcnumsendobjs[0][i] = gcnumsendobjs[1][i] = 0;
-      gcnumreceiveobjs[0][i] = gcnumreceiveobjs[1][i] = 0;
-#endif
-    } // for(i = 0; i < NUMCORESACTIVE; ++i)
-#ifdef MULTICORE_GC
-    for(i = 0; i < NUMCORES4GC; ++i) {
-      gcloads[i] = 0;
-      gcrequiredmems[i] = 0;
-      gcstopblock[i] = 0;
-      gcfilledblocks[i] = 0;
-    } // for(i = 0; i < NUMCORES4GC; ++i)
-#ifdef GC_PROFILE
-    gc_infoIndex = 0;
-    gc_infoOverflow = false;
-	gc_num_livespace = 0;
-	gc_num_freespace = 0;
-#endif
-#endif
-    numconfirm = 0;
-    waitconfirm = false;
-
-    // TODO for test
-    total_num_t6 = 0;
-  }
-
-  busystatus = true;
-  self_numsendobjs = 0;
-  self_numreceiveobjs = 0;
-
-  for(i = 0; i < BAMBOO_MSG_BUF_LENGTH; ++i) {
-    msgdata[i] = -1;
-  }
-  msgdataindex = 0;
-  msgdatalast = 0;
-  msglength = BAMBOO_MSG_BUF_LENGTH;
-  msgdatafull = false;
-  for(i = 0; i < BAMBOO_OUT_BUF_LENGTH; ++i) {
-    outmsgdata[i] = -1;
-  }
-  outmsgindex = 0;
-  outmsglast = 0;
-  outmsgleft = 0;
-  isMsgHanging = false;
-  //isMsgSending = false;
-
-  smemflag = true;
-  bamboo_cur_msp = NULL;
-  bamboo_smem_size = 0;
-  totransobjqueue = createQueue_I();
-
-#ifdef MULTICORE_GC
-  bamboo_smem_zero_top = NULL;
-  gcflag = false;
-  gcprocessing = false;
-  gcphase = FINISHPHASE;
-  //gcnumpre = 0;
-  gcprecheck = true;
-  gccurr_heaptop = 0;
-  gcself_numsendobjs = 0;
-  gcself_numreceiveobjs = 0;
-  gcmarkedptrbound = 0;
-#ifdef LOCALHASHTBL_TEST
-  gcpointertbl = allocateRuntimeHash_I(20);
-#else
-  gcpointertbl = mgchashCreate_I(2000, 0.75);
-#endif
-  //gcpointertbl = allocateMGCHash_I(20);
-  gcforwardobjtbl = allocateMGCHash_I(20, 3);
-  gcobj2map = 0;
-  gcmappedobj = 0;
-  //gcismapped = false;
-  gcnumlobjs = 0;
-  gcheaptop = 0;
-  gctopcore = 0;
-  gctopblock = 0;
-  gcmovestartaddr = 0;
-  gctomove = false;
-  gcmovepending = 0;
-  gcblock2fill = 0;
-  gcsbstarttbl = BAMBOO_BASE_VA;
-  bamboo_smemtbl = (void *)gcsbstarttbl
-               + (BAMBOO_SHARED_MEM_SIZE/BAMBOO_SMEM_SIZE)*sizeof(INTPTR);
-  if(BAMBOO_NUM_OF_CORE < NUMCORES4GC) {
-	int t_size = ((BAMBOO_RMSP_SIZE)-sizeof(mgcsharedhashtbl_t)*2
-		-128*sizeof(size_t))/sizeof(mgcsharedhashlistnode_t)-2;
-	int kk = 0;
-	unsigned int tmp_k = 1 << (sizeof(int)*8 -1);
-	while(((t_size & tmp_k) == 0) && (kk < sizeof(int)*8)) {
-	  t_size = t_size << 1;
-	  kk++;
-	}
-	t_size = tmp_k >> kk;
-	gcsharedptbl = mgcsharedhashCreate_I(t_size,0.30);//allocateGCSharedHash_I(20);
-  } else {
-	gcsharedptbl = NULL;
-  }
-  BAMBOO_MEMSET_WH(gcrpointertbls, 0, 
-	  sizeof(mgcsharedhashtbl_t *)*NUMCORES4GC);
-	  //sizeof(struct RuntimeHash *)*NUMCORES4GC);
-#ifdef SMEMM
-  gcmem_mixed_threshold = (unsigned int)((BAMBOO_SHARED_MEM_SIZE
-		-bamboo_reserved_smem*BAMBOO_SMEM_SIZE)*0.8);
-  gcmem_mixed_usedmem = 0;
-#endif
-#ifdef GC_PROFILE
-  gc_num_obj = 0;
-  gc_num_liveobj = 0;
-  gc_num_forwardobj = 0;
-  gc_num_profiles = NUMCORESACTIVE - 1;
-#endif
-#ifdef GC_FLUSH_DTLB
-  gc_num_flush_dtlb = 0;
-#endif
-  gc_localheap_s = false;
-#ifdef GC_CACHE_ADAPT
-  gccachestage = false;
-  // enable the timer interrupt
-  bamboo_tile_timer_set_next_event(500000000); // TODO
-  bamboo_unmask_timer_intr();
-  //BAMBOO_DEBUGPRINT(BAMBOO_GET_EXE_TIME());
-  bamboo_dtlb_sampling_process();
-#endif // GC_CACHE_ADAPT
-#else
-  // create the lock table, lockresult table and obj queue
-  locktable.size = 20;
-  locktable.bucket =
-    (struct RuntimeNode **) RUNMALLOC_I(sizeof(struct RuntimeNode *)*20);
-  /* Set allocation blocks*/
-  locktable.listhead=NULL;
-  locktable.listtail=NULL;
-  /*Set data counts*/
-  locktable.numelements = 0;
-  lockobj = 0;
-  lock2require = 0;
-  lockresult = 0;
-  lockflag = false;
-  lockRedirectTbl = allocateRuntimeHash_I(20);
-  objRedirectLockTbl = allocateRuntimeHash_I(20);
-#endif
-#ifndef INTERRUPT
-  reside = false;
-#endif
-  objqueue.head = NULL;
-  objqueue.tail = NULL;
-
-  currtpd = NULL;
-
-#ifdef PROFILE
-  stall = false;
-  //isInterrupt = true;
-  totalexetime = -1;
-  //interrupttime = 0;
-  taskInfoIndex = 0;
-  taskInfoOverflow = false;
-#ifdef PROFILE_INTERRUPT
-  interruptInfoIndex = 0;
-  interruptInfoOverflow = false;
-#endif // PROFILE_INTERRUPT
-#endif // PROFILE
-
-  for(i = 0; i < MAXTASKPARAMS; i++) {
-    runtime_locks[i].redirectlock = 0;
-    runtime_locks[i].value = 0;
-  }
-  runtime_locklen = 0;
-}
-
-inline __attribute__((always_inline))
-void disruntimedata() {
-#ifdef MULTICORE_GC
-#ifdef LOCALHASHTBL_TEST
-  freeRuntimeHash(gcpointertbl);
-#else
-  mgchashDelete(gcpointertbl);
-#endif
-  //freeMGCHash(gcpointertbl);
-  freeMGCHash(gcforwardobjtbl);
-  // for mapping info structures
-  //freeRuntimeHash(gcrcoretbl);
-#else
-  freeRuntimeHash(lockRedirectTbl);
-  freeRuntimeHash(objRedirectLockTbl);
-  RUNFREE(locktable.bucket);
-#endif
-  if(activetasks != NULL) {
-    genfreehashtable(activetasks);
-  }
-  if(currtpd != NULL) {
-    RUNFREE(currtpd->parameterArray);
-    RUNFREE(currtpd);
-    currtpd = NULL;
-  }
-  BAMBOO_LOCAL_MEM_CLOSE();
-  BAMBOO_SHARE_MEM_CLOSE();
-}
-
-inline __attribute__((always_inline))
-bool checkObjQueue() {
-  bool rflag = false;
-  struct transObjInfo * objInfo = NULL;
-  int grount = 0;
-
-#ifdef PROFILE
-#ifdef ACCURATEPROFILE
-  bool isChecking = false;
-  if(!isEmpty(&objqueue)) {
-    profileTaskStart("objqueue checking");
-    isChecking = true;
-  }       // if(!isEmpty(&objqueue))
-#endif
-#endif
-
-  while(!isEmpty(&objqueue)) {
-    void * obj = NULL;
-    BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xf001);
-#endif
-#ifdef PROFILE
-    //isInterrupt = false;
-#endif
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xeee1);
-#endif
-    rflag = true;
-    objInfo = (struct transObjInfo *)getItem(&objqueue);
-    obj = objInfo->objptr;
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT_REG((int)obj);
-#endif
-    // grab lock and flush the obj
-    grount = 0;
-    getwritelock_I(obj);
-    while(!lockflag) {
-      BAMBOO_WAITING_FOR_LOCK(0);
-    }   // while(!lockflag)
-    grount = lockresult;
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT_REG(grount);
-#endif
-
-    lockresult = 0;
-    lockobj = 0;
-    lock2require = 0;
-    lockflag = false;
-#ifndef INTERRUPT
-    reside = false;
-#endif
-
-    if(grount == 1) {
-      int k = 0;
-      // flush the object
-#ifdef CACHEFLUSH
-      BAMBOO_CACHE_FLUSH_RANGE((int)obj,sizeof(int));
-      BAMBOO_CACHE_FLUSH_RANGE((int)obj,
-		  classsize[((struct ___Object___ *)obj)->type]);
-#endif
-      // enqueue the object
-      for(k = 0; k < objInfo->length; ++k) {
-		int taskindex = objInfo->queues[2 * k];
-		int paramindex = objInfo->queues[2 * k + 1];
-		struct parameterwrapper ** queues =
-		  &(paramqueues[BAMBOO_NUM_OF_CORE][taskindex][paramindex]);
-#ifdef DEBUG
-		BAMBOO_DEBUGPRINT_REG(taskindex);
-		BAMBOO_DEBUGPRINT_REG(paramindex);
-		struct ___Object___ * tmpptr = (struct ___Object___ *)obj;
-		tprintf("Process %x(%d): receive obj %x(%lld), ptrflag %x\n",
-				BAMBOO_NUM_OF_CORE, BAMBOO_NUM_OF_CORE, (int)obj,
-				(long)obj, tmpptr->flag);
-#endif
-		enqueueObject_I(obj, queues, 1);
-#ifdef DEBUG
-		BAMBOO_DEBUGPRINT_REG(hashsize(activetasks));
-#endif
-      }  // for(k = 0; k < objInfo->length; ++k)
-      releasewritelock_I(obj);
-      RUNFREE(objInfo->queues);
-      RUNFREE(objInfo);
-    } else {
-      // can not get lock
-      // put it at the end of the queue if no update version in the queue
-      struct QueueItem * qitem = getHead(&objqueue);
-      struct QueueItem * prev = NULL;
-      while(qitem != NULL) {
-		struct transObjInfo * tmpinfo =
-			(struct transObjInfo *)(qitem->objectptr);
-		if(tmpinfo->objptr == obj) {
-		  // the same object in the queue, which should be enqueued
-		  // recently. Current one is outdate, do not re-enqueue it
-		  RUNFREE(objInfo->queues);
-		  RUNFREE(objInfo);
-		  goto objqueuebreak;
-		} else {
-		  prev = qitem;
-		}  // if(tmpinfo->objptr == obj)
-		qitem = getNextQueueItem(prev);
-	  }  // while(qitem != NULL)
-      // try to execute active tasks already enqueued first
-      addNewItem_I(&objqueue, objInfo);
-#ifdef PROFILE
-      //isInterrupt = true;
-#endif
-objqueuebreak:
-      BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT(0xf000);
-#endif
-      break;
-    }  // if(grount == 1)
-    BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xf000);
-#endif
-  }  // while(!isEmpty(&objqueue))
-
-#ifdef PROFILE
-#ifdef ACCURATEPROFILE
-  if(isChecking) {
-    profileTaskEnd();
-  }  // if(isChecking)
-#endif
-#endif
-
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xee02);
-#endif
-  return rflag;
-}
-
-inline __attribute__((always_inline))
-void checkCoreStatus() {
-  bool allStall = false;
-  int i = 0;
-  int sumsendobj = 0;
-  if((!waitconfirm) ||
-     (waitconfirm && (numconfirm == 0))) {
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xee04);
-    BAMBOO_DEBUGPRINT_REG(waitconfirm);
-#endif
-    BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xf001);
-#endif
-    corestatus[BAMBOO_NUM_OF_CORE] = 0;
-    numsendobjs[BAMBOO_NUM_OF_CORE] = self_numsendobjs;
-    numreceiveobjs[BAMBOO_NUM_OF_CORE] = self_numreceiveobjs;
-    // check the status of all cores
-    allStall = true;
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT_REG(NUMCORESACTIVE);
-#endif
-    for(i = 0; i < NUMCORESACTIVE; ++i) {
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT(0xe000 + corestatus[i]);
-#endif
-      if(corestatus[i] != 0) {
-		allStall = false;
-		break;
-      }
-    }  // for(i = 0; i < NUMCORESACTIVE; ++i)
-    if(allStall) {
-      // check if the sum of send objs and receive obj are the same
-      // yes->check if the info is the latest; no->go on executing
-      sumsendobj = 0;
-      for(i = 0; i < NUMCORESACTIVE; ++i) {
-		sumsendobj += numsendobjs[i];
-#ifdef DEBUG
-		BAMBOO_DEBUGPRINT(0xf000 + numsendobjs[i]);
-#endif
-      }  // for(i = 0; i < NUMCORESACTIVE; ++i)
-      for(i = 0; i < NUMCORESACTIVE; ++i) {
-		sumsendobj -= numreceiveobjs[i];
-#ifdef DEBUG
-		BAMBOO_DEBUGPRINT(0xf000 + numreceiveobjs[i]);
-#endif
-      }  // for(i = 0; i < NUMCORESACTIVE; ++i)
-      if(0 == sumsendobj) {
-		if(!waitconfirm) {
-		  // the first time found all cores stall
-		  // send out status confirm msg to all other cores
-		  // reset the corestatus array too
-#ifdef DEBUG
-		  BAMBOO_DEBUGPRINT(0xee05);
-#endif
-		  corestatus[BAMBOO_NUM_OF_CORE] = 1;
-		  waitconfirm = true;
-		  numconfirm = NUMCORESACTIVE - 1;
-		  BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-		  for(i = 1; i < NUMCORESACTIVE; ++i) {
-			corestatus[i] = 1;
-			// send status confirm msg to core i
-			send_msg_1(i, STATUSCONFIRM, false);
-		  }   // for(i = 1; i < NUMCORESACTIVE; ++i)
-		  return;
-		} else {
-		  // all the core status info are the latest
-		  // terminate; for profiling mode, send request to all
-		  // other cores to pour out profiling data
-#ifdef DEBUG
-		  BAMBOO_DEBUGPRINT(0xee06);
-#endif
-
-#ifdef USEIO
-		  totalexetime = BAMBOO_GET_EXE_TIME() - bamboo_start_time;
-#else
-#ifdef PROFILE
-		  //BAMBOO_DEBUGPRINT_REG(interrupttime);
-#endif
-
-		  BAMBOO_DEBUGPRINT(BAMBOO_GET_EXE_TIME() - bamboo_start_time);
-		  //BAMBOO_DEBUGPRINT_REG(total_num_t6); // TODO for test
-#ifdef GC_FLUSH_DTLB
-		  BAMBOO_DEBUGPRINT_REG(gc_num_flush_dtlb);
-#endif
-#ifndef BAMBOO_MEMPROF
-		  BAMBOO_DEBUGPRINT(0xbbbbbbbb);
-#endif
-#endif
-		  // profile mode, send msgs to other cores to request pouring
-		  // out progiling data
-#ifdef PROFILE
-		  BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-#ifdef DEBUG
-		  BAMBOO_DEBUGPRINT(0xf000);
-#endif
-		  for(i = 1; i < NUMCORESACTIVE; ++i) {
-			// send profile request msg to core i
-			send_msg_2(i, PROFILEOUTPUT, totalexetime, false);
-		  } // for(i = 1; i < NUMCORESACTIVE; ++i)
-#ifndef RT_TEST
-		  // pour profiling data on startup core
-		  outputProfileData();
-#endif
-		  while(true) {
-			BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
-#ifdef DEBUG
-			BAMBOO_DEBUGPRINT(0xf001);
-#endif
-			profilestatus[BAMBOO_NUM_OF_CORE] = 0;
-			// check the status of all cores
-			allStall = true;
-#ifdef DEBUG
-			BAMBOO_DEBUGPRINT_REG(NUMCORESACTIVE);
-#endif
-			for(i = 0; i < NUMCORESACTIVE; ++i) {
-#ifdef DEBUG
-			  BAMBOO_DEBUGPRINT(0xe000 + profilestatus[i]);
-#endif
-			  if(profilestatus[i] != 0) {
-				allStall = false;
-				break;
-			  }
-			}  // for(i = 0; i < NUMCORESACTIVE; ++i)
-			if(!allStall) {
-			  int halt = 100;
-			  BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-#ifdef DEBUG
-			  BAMBOO_DEBUGPRINT(0xf000);
-#endif
-			  while(halt--) {
-			  }
-			} else {
-			  BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-			  break;
-			}  // if(!allStall)
-		  }  // while(true)
-#endif
-
-		  // gc_profile mode, output gc prfiling data
-#ifdef MULTICORE_GC
-#ifdef GC_CACHE_ADAPT
-		  bamboo_mask_timer_intr(); // disable the TILE_TIMER interrupt
-#endif // GC_CACHE_ADAPT
-#ifdef GC_PROFILE
-		  gc_outputProfileData();
-#endif // #ifdef GC_PROFILE
-#endif // #ifdef MULTICORE_GC
-		  disruntimedata();
-		  BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-		  terminate();  // All done.
-		}  // if(!waitconfirm)
-      } else {
-		// still some objects on the fly on the network
-		// reset the waitconfirm and numconfirm
-#ifdef DEBUG
-		BAMBOO_DEBUGPRINT(0xee07);
-#endif
-		waitconfirm = false;
-		numconfirm = 0;
-	  }  //  if(0 == sumsendobj)
-    } else {
-      // not all cores are stall, keep on waiting
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT(0xee08);
-#endif
-      waitconfirm = false;
-      numconfirm = 0;
-    }  //  if(allStall)
-    BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xf000);
-#endif
-  }  // if((!waitconfirm) ||
-}
-
-// main function for each core
-inline void run(void * arg) {
-  int i = 0;
-  int argc = 1;
-  char ** argv = NULL;
-  bool sendStall = false;
-  bool isfirst = true;
-  bool tocontinue = false;
-
-  corenum = BAMBOO_GET_NUM_OF_CORE();
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xeeee);
-  BAMBOO_DEBUGPRINT_REG(corenum);
-  BAMBOO_DEBUGPRINT(STARTUPCORE);
-#endif
-  //BAMBOO_DEBUGPRINT(BAMBOO_GET_EXE_TIME()); // TODO
-
-  // initialize runtime data structures
-  initruntimedata();
-
-  // other architecture related initialization
-  initialization();
-  initCommunication();
-
-  initializeexithandler();
-
-  // main process of the execution module
-  if(BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1) {
-    // non-executing cores, only processing communications
-    activetasks = NULL;
-#ifdef PROFILE
-    //isInterrupt = false;
-#endif
-    fakeExecution();
-  } else {
-    /* Create queue of active tasks */
-    activetasks=
-      genallocatehashtable((unsigned int (*)(void *)) &hashCodetpd,
-                           (int (*)(void *,void *)) &comparetpd);
-
-    /* Process task information */
-    processtasks();
-
-    if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
-      /* Create startup object */
-      createstartupobject(argc, argv);
-    }
-
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xee00);
-#endif
-
-    while(true) {
-
-#ifdef MULTICORE_GC
-//#ifdef GC_CACHE_ADAPT
-	  // do dtlb sampling if necessary
-//	  bamboo_dtlb_sampling_process();
-//#endif // GC_CACHE_ADAPT
-      // check if need to do GC
-      if(gcflag) {
-		gc(NULL);
-	  }
-#endif // MULTICORE_GC
-
-      // check if there are new active tasks can be executed
-      executetasks();
-      if(busystatus) {
-		sendStall = false;
-      }
-
-#ifndef INTERRUPT
-      while(receiveObject() != -1) {
-      }
-#endif
-
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT(0xee01);
-#endif
-
-      // check if there are some pending objects,
-      // if yes, enqueue them and executetasks again
-      tocontinue = checkObjQueue();
-
-      if(!tocontinue) {
-		// check if stop
-		if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
-		  if(isfirst) {
-#ifdef DEBUG
-			BAMBOO_DEBUGPRINT(0xee03);
-#endif
-			isfirst = false;
-		  }
-		  checkCoreStatus();
-		} else {
-		  if(!sendStall) {
-#ifdef DEBUG
-			BAMBOO_DEBUGPRINT(0xee09);
-#endif
-#ifdef PROFILE
-			if(!stall) {
-#endif
-			if(isfirst) {
-			  // wait for some time
-			  int halt = 10000;
-#ifdef DEBUG
-			  BAMBOO_DEBUGPRINT(0xee0a);
-#endif
-			  while(halt--) {
-			  }
-			  isfirst = false;
-			} else {
-			  // send StallMsg to startup core
-#ifdef DEBUG
-			  BAMBOO_DEBUGPRINT(0xee0b);
-#endif
-			  // send stall msg
-			  send_msg_4(STARTUPCORE, TRANSTALL, BAMBOO_NUM_OF_CORE,
-						 self_numsendobjs, self_numreceiveobjs, false);
-			  sendStall = true;
-			  isfirst = true;
-			  busystatus = false;
-			}
-#ifdef PROFILE
-		  }
-#endif
-		  } else {
-			isfirst = true;
-			busystatus = false;
-#ifdef DEBUG
-			BAMBOO_DEBUGPRINT(0xee0c);
-#endif
-		  }   // if(!sendStall)
-		}   // if(STARTUPCORE == BAMBOO_NUM_OF_CORE)
-      }  // if(!tocontinue)
-    }  // while(true)
-  } // if(BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1)
-
-} // run()
-
-struct ___createstartupobject____I_locals {
-  INTPTR size;
-  void * next;
-  struct  ___StartupObject___ * ___startupobject___;
-  struct ArrayObject * ___stringarray___;
-}; // struct ___createstartupobject____I_locals
-
-void createstartupobject(int argc,
-                         char ** argv) {
-  int i;
-
-  /* Allocate startup object     */
-#ifdef MULTICORE_GC
-  struct ___createstartupobject____I_locals ___locals___ = 
-  {2, NULL, NULL, NULL};
-  struct ___StartupObject___ *startupobject=
-    (struct ___StartupObject___*) allocate_new(&___locals___, STARTUPTYPE);
-  ___locals___.___startupobject___ = startupobject;
-  struct ArrayObject * stringarray=
-    allocate_newarray(&___locals___, STRINGARRAYTYPE, argc-1);
-  ___locals___.___stringarray___ = stringarray;
-#else
-  struct ___StartupObject___ *startupobject=
-    (struct ___StartupObject___*) allocate_new(STARTUPTYPE);
-  struct ArrayObject * stringarray=
-    allocate_newarray(STRINGARRAYTYPE, argc-1);
-#endif
-  /* Build array of strings */
-  startupobject->___parameters___=stringarray;
-  for(i=1; i<argc; i++) {
-    int length=strlen(argv[i]);
-#ifdef MULTICORE_GC
-    struct ___String___ *newstring=NewString(&___locals___, argv[i],length);
-#else
-    struct ___String___ *newstring=NewString(argv[i],length);
-#endif
-    ((void **)(((char *)&stringarray->___length___)+sizeof(int)))[i-1]=
-      newstring;
-  }
-
-  startupobject->version = 0;
-  startupobject->lock = NULL;
-
-  /* Set initialized flag for startup object */
-  flagorandinit(startupobject,1,0xFFFFFFFF);
-  enqueueObject(startupobject, NULL, 0);
-#ifdef CACHEFLUSH
-  BAMBOO_CACHE_FLUSH_ALL();
-#endif
-}
-
-int hashCodetpd(struct taskparamdescriptor *ftd) {
-  int hash=(int)ftd->task;
-  int i;
-  for(i=0; i<ftd->numParameters; i++) {
-    hash^=(int)ftd->parameterArray[i];
-  }
-  return hash;
-}
-
-int comparetpd(struct taskparamdescriptor *ftd1,
-               struct taskparamdescriptor *ftd2) {
-  int i;
-  if (ftd1->task!=ftd2->task)
-    return 0;
-  for(i=0; i<ftd1->numParameters; i++)
-    if(ftd1->parameterArray[i]!=ftd2->parameterArray[i])
-      return 0;
-  return 1;
-}
-
-/* Attaches tag descriptor `tagd` to object `obj` and records `obj` in the
- * tag's back-reference set (tagd->flagptr).
- *
- * obj->___tags___ is NULL (no tag), a single descriptor (type==TAGTYPE), or
- * an ArrayObject of descriptors whose ___cachedCode___ field counts the used
- * slots.  tagd->flagptr follows the same NULL / single object /
- * OBJECTARRAYTYPE-array scheme for the objects carrying the tag.
- *
- * If the tag is already present on the object the function returns at once
- * and the back-reference set is left untouched.
- *
- * Under MULTICORE_GC, `ptr` is passed to allocate_newarray inside a pointer
- * array together with obj and tagd, and obj/tagd are re-read from that
- * array after each allocation.
- * NOTE(review): presumably the allocation can move objects -- confirm
- * against allocate_newarray. */
-#ifdef MULTICORE_GC
-void tagset(void *ptr,
-            struct ___Object___ * obj,
-            struct ___TagDescriptor___ * tagd) {
-#else
-void tagset(struct ___Object___ * obj,
-            struct ___TagDescriptor___ * tagd) {
-#endif
-  struct ArrayObject * ao=NULL;
-  struct ___Object___ * tagptr=obj->___tags___;
-  if (tagptr==NULL) {
-    /* First tag on this object: store the descriptor directly. */
-    obj->___tags___=(struct ___Object___ *)tagd;
-  } else {
-    /* Have to check if it is already set */
-    if (tagptr->type==TAGTYPE) {
-      struct ___TagDescriptor___ * td=(struct ___TagDescriptor___ *) tagptr;
-      if (td==tagd) {
-        return;
-      }
-      /* Second tag: promote the single descriptor to a tag array. */
-#ifdef MULTICORE_GC
-      int ptrarray[]={2, (int) ptr, (int) obj, (int)tagd};
-      struct ArrayObject * ao=
-        allocate_newarray(&ptrarray,TAGARRAYTYPE,TAGARRAYINTERVAL);
-      obj=(struct ___Object___ *)ptrarray[2];
-      tagd=(struct ___TagDescriptor___ *)ptrarray[3];
-      td=(struct ___TagDescriptor___ *) obj->___tags___;
-#else
-      ao=allocate_newarray(TAGARRAYTYPE,TAGARRAYINTERVAL);
-#endif
-
-      ARRAYSET(ao, struct ___TagDescriptor___ *, 0, td);
-      ARRAYSET(ao, struct ___TagDescriptor___ *, 1, tagd);
-      obj->___tags___=(struct ___Object___ *) ao;
-      ao->___cachedCode___=2;
-    } else {
-      /* Array Case */
-      int i;
-      struct ArrayObject *ao=(struct ArrayObject *) tagptr;
-      for(i=0; i<ao->___cachedCode___; i++) {
-        struct ___TagDescriptor___ * td=
-          ARRAYGET(ao, struct ___TagDescriptor___*, i);
-        if (td==tagd) {
-          return;
-        }
-      }
-      if (ao->___cachedCode___<ao->___length___) {
-        /* Free slot available: append in place. */
-        ARRAYSET(ao, struct ___TagDescriptor___ *,ao->___cachedCode___,tagd);
-        ao->___cachedCode___++;
-      } else {
-        /* Array is full: allocate a larger one and copy the entries over. */
-#ifdef MULTICORE_GC
-        int ptrarray[]={2,(int) ptr, (int) obj, (int) tagd};
-        struct ArrayObject * aonew=
-          allocate_newarray(&ptrarray,TAGARRAYTYPE,
-                            TAGARRAYINTERVAL+ao->___length___);
-        obj=(struct ___Object___ *)ptrarray[2];
-        tagd=(struct ___TagDescriptor___ *) ptrarray[3];
-        ao=(struct ArrayObject *)obj->___tags___;
-#else
-        struct ArrayObject * aonew=
-          allocate_newarray(TAGARRAYTYPE,TAGARRAYINTERVAL+ao->___length___);
-#endif
-
-        aonew->___cachedCode___=ao->___length___+1;
-        for(i=0; i<ao->___length___; i++) {
-          ARRAYSET(aonew, struct ___TagDescriptor___*, i,
-                   ARRAYGET(ao, struct ___TagDescriptor___*, i));
-        }
-        ARRAYSET(aonew, struct ___TagDescriptor___ *, ao->___length___,tagd);
-        /* Install the grown array on the object; previously it was built
-           but never stored, so the newly added tag was silently lost. */
-        obj->___tags___=(struct ___Object___ *) aonew;
-      }
-    }
-  }
-
-  /* Second half: add obj to the set of objects that carry tagd. */
-  {
-    struct ___Object___ * tagset=tagd->flagptr;
-    if(tagset==NULL) {
-      tagd->flagptr=obj;
-    } else if (tagset->type!=OBJECTARRAYTYPE) {
-      /* One object so far: promote to an object array of two. */
-#ifdef MULTICORE_GC
-      int ptrarray[]={2, (int) ptr, (int) obj, (int)tagd};
-      struct ArrayObject * ao=
-        allocate_newarray(&ptrarray,OBJECTARRAYTYPE,OBJECTARRAYINTERVAL);
-      obj=(struct ___Object___ *)ptrarray[2];
-      tagd=(struct ___TagDescriptor___ *)ptrarray[3];
-#else
-      struct ArrayObject * ao=
-        allocate_newarray(OBJECTARRAYTYPE,OBJECTARRAYINTERVAL);
-#endif
-      ARRAYSET(ao, struct ___Object___ *, 0, tagd->flagptr);
-      ARRAYSET(ao, struct ___Object___ *, 1, obj);
-      ao->___cachedCode___=2;
-      tagd->flagptr=(struct ___Object___ *)ao;
-    } else {
-      struct ArrayObject *ao=(struct ArrayObject *) tagset;
-      if (ao->___cachedCode___<ao->___length___) {
-        ARRAYSET(ao, struct ___Object___*, ao->___cachedCode___++, obj);
-      } else {
-        int i;
-#ifdef MULTICORE_GC
-        int ptrarray[]={2, (int) ptr, (int) obj, (int)tagd};
-        struct ArrayObject * aonew=
-          allocate_newarray(&ptrarray,OBJECTARRAYTYPE,
-                            OBJECTARRAYINTERVAL+ao->___length___);
-        obj=(struct ___Object___ *)ptrarray[2];
-        tagd=(struct ___TagDescriptor___ *)ptrarray[3];
-        ao=(struct ArrayObject *)tagd->flagptr;
-#else
-        struct ArrayObject * aonew=allocate_newarray(OBJECTARRAYTYPE,
-            OBJECTARRAYINTERVAL+ao->___length___);
-#endif
-        aonew->___cachedCode___=ao->___cachedCode___+1;
-        for(i=0; i<ao->___length___; i++) {
-          ARRAYSET(aonew, struct ___Object___*, i,
-                   ARRAYGET(ao, struct ___Object___*, i));
-        }
-        ARRAYSET(aonew, struct ___Object___ *, ao->___cachedCode___, obj);
-        tagd->flagptr=(struct ___Object___ *) aonew;
-      }
-    }
-  }
-}
-
-/* Detaches tag descriptor `tagd` from `obj` and removes `obj` from the
- * tag's back-reference set (tagd->flagptr).
- *
- * The tag is assumed to be present (to be guaranteed statically): neither
- * obj->___tags___ nor tagd->flagptr is checked for NULL before use.
- * Array removal is swap-with-last: the last used entry is moved into the
- * vacated slot, the old last slot is cleared, and the owning pointer is
- * reset to NULL once the array has no used entries left. */
-#ifdef MULTICORE_GC
-void tagclear(void *ptr,
-              struct ___Object___ * obj,
-              struct ___TagDescriptor___ * tagd) {
-#else
-void tagclear(struct ___Object___ * obj,
-              struct ___TagDescriptor___ * tagd) {
-#endif
-  struct ___Object___ * tags=obj->___tags___;
-  struct ___Object___ * holders;
-  int idx;
-
-  /* Step 1: drop tagd from the object's own tag list. */
-  if (tags->type==TAGTYPE) {
-    if ((struct ___TagDescriptor___ *)tags==tagd) {
-      obj->___tags___=NULL;
-    }
-  } else {
-    struct ArrayObject *tagarr=(struct ArrayObject *) tags;
-    for(idx=0; idx<tagarr->___cachedCode___; idx++) {
-      struct ___TagDescriptor___ * cur=
-        ARRAYGET(tagarr, struct ___TagDescriptor___ *, idx);
-      if (cur!=tagd) {
-        continue;
-      }
-      tagarr->___cachedCode___--;
-      if (idx<tagarr->___cachedCode___) {
-        ARRAYSET(tagarr, struct ___TagDescriptor___ *, idx,
-            ARRAYGET(tagarr,struct ___TagDescriptor___*,
-                     tagarr->___cachedCode___));
-      }
-      ARRAYSET(tagarr,struct ___TagDescriptor___ *,
-               tagarr->___cachedCode___, NULL);
-      if (tagarr->___cachedCode___==0) {
-        obj->___tags___=NULL;
-      }
-      break;
-    }
-  }
-
-  /* Step 2: drop obj from the set of objects carrying tagd. */
-  holders=tagd->flagptr;
-  if (holders->type!=OBJECTARRAYTYPE) {
-    if (holders==obj) {
-      tagd->flagptr=NULL;
-    }
-    return;
-  }
-  {
-    struct ArrayObject *objarr=(struct ArrayObject *) holders;
-    for(idx=0; idx<objarr->___cachedCode___; idx++) {
-      struct ___Object___ * cur=ARRAYGET(objarr, struct ___Object___ *, idx);
-      if (cur!=obj) {
-        continue;
-      }
-      objarr->___cachedCode___--;
-      if (idx<objarr->___cachedCode___) {
-        ARRAYSET(objarr, struct ___Object___ *, idx,
-            ARRAYGET(objarr, struct ___Object___ *,
-                     objarr->___cachedCode___));
-      }
-      ARRAYSET(objarr, struct ___Object___ *, objarr->___cachedCode___, NULL);
-      if (objarr->___cachedCode___==0) {
-        tagd->flagptr=NULL;
-      }
-      return;
-    }
-  }
-}
-
-/* Allocates a tag descriptor of classsize[TAGTYPE] bytes via FREEMALLOC,
- * marks it as TAGTYPE and stores `index` in its flag field.
- * Under MULTICORE_GC, `ptr` is handed to FREEMALLOC cast to
- * struct garbagelist *. */
-#ifdef MULTICORE_GC
-struct ___TagDescriptor___ * allocate_tag(void *ptr,
-                                          int index) {
-  struct ___TagDescriptor___ * tag=
-    (struct ___TagDescriptor___ *) FREEMALLOC((struct garbagelist *) ptr,
-                                              classsize[TAGTYPE]);
-#else
-struct ___TagDescriptor___ * allocate_tag(int index) {
-  struct ___TagDescriptor___ * tag=FREEMALLOC(classsize[TAGTYPE]);
-#endif
-  tag->flag=index;
-  tag->type=TAGTYPE;
-  return tag;
-}
-
-
-
-/* The functions below update the flag word of the object ptr: the new flag
-   is (oldflag | ormask) & andmask, and flagbody() then applies it. */
-
-void flagbody(struct ___Object___ *ptr,
-              int flag,
-              struct parameterwrapper ** queues,
-              int length,
-              bool isnew);
-
-/* Three-way comparison of two ints, usable as a sort comparator.
- * Returns a negative value, zero or a positive value when *val1 is less
- * than, equal to or greater than *val2.  The operands are compared rather
- * than subtracted, because the subtraction overflows (undefined behaviour,
- * wrong sign in practice) when they are far apart, e.g. INT_MAX and -1. */
-int flagcomp(const int *val1, const int *val2) {
-  return (*val1 > *val2) - (*val1 < *val2);
-}
-
-/* Computes (oldflag | ormask) & andmask, where oldflag is the second
- * int-sized word of the object at `ptr`, and applies the result through
- * flagbody() with the given queues and isnew == false. */
-void flagorand(void * ptr,
-               int ormask,
-               int andmask,
-               struct parameterwrapper ** queues,
-               int length) {
-  int current=((int *)ptr)[1];
-  int updated=(current|ormask)&andmask;
-  flagbody(ptr, updated, queues, length, false);
-}
-
-/* Like flagorand() but without explicit queues: computes
- * (oldflag | ormask) & andmask from the second int-sized word of the object
- * at `ptr`.  When the value is unchanged nothing happens and false is
- * returned; otherwise flagbody(ptr, flag, NULL, 0, false) is called and
- * true is returned. */
-bool intflagorand(void * ptr,
-                  int ormask,
-                  int andmask) {
-  int before=((int *)ptr)[1];
-  int after=(before|ormask)&andmask;
-  if (after!=before) {
-    flagbody(ptr, after, NULL, 0, false);
-    return true;
-  }
-  /* Don't do anything */
-  return false;
-}
-
-/* Flag update for a newly created object: computes
- * (oldflag | ormask) & andmask from the second int-sized word of the object
- * at `ptr` and calls flagbody() with no queues and isnew == true. */
-void flagorandinit(void * ptr,
-                   int ormask,
-                   int andmask) {
-  int initial=((int *)ptr)[1];
-  int combined=(initial|ormask)&andmask;
-  flagbody(ptr,combined,NULL,0,true);
-}
-
-/* Stores `flag` in ptr->flag and removes the object from the object set of
- * every parameter queue in `vqueues` (vlength entries), freeing the
- * enter-flags buffer reported for it, if any.
- *
- * When the object is not new and no queues were supplied, the queues
- * registered for this core and the object's type are used instead; a core
- * with BAMBOO_NUM_OF_CORE >= NUMCORESACTIVE returns immediately in that
- * case, before the flag is written. */
-void flagbody(struct ___Object___ *ptr,
-              int flag,
-              struct parameterwrapper ** vqueues,
-              int vlength,
-              bool isnew) {
-  struct parameterwrapper * flagptr = NULL;
-  int i = 0;
-  struct parameterwrapper ** queues = vqueues;
-  int length = vlength;
-  int next;
-  int UNUSED, UNUSED2;
-  int * enterflags = NULL;
-  if((!isnew) && (queues == NULL)) {
-    if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) {
-      queues = objectqueues[BAMBOO_NUM_OF_CORE][ptr->type];
-      length = numqueues[BAMBOO_NUM_OF_CORE][ptr->type];
-    } else {
-      return;
-    }
-  }
-  ptr->flag=flag;
-
-  /*Remove object from all queues */
-  for(i = 0; i < length; ++i) {
-    flagptr = queues[i];
-    /* Start every lookup from NULL so that a lookup which does not fill in
-       enterflags cannot leave the previous queue's (already freed) buffer
-       behind and free it a second time.
-       NOTE(review): assumes ObjectHashget may leave its out-parameters
-       untouched when the key is absent -- confirm. */
-    enterflags = NULL;
-    ObjectHashget(flagptr->objectset, (int) ptr, (int *) &next,
-                  (int *) &enterflags, &UNUSED, &UNUSED2);
-    ObjectHashremove(flagptr->objectset, (int)ptr);
-    if (enterflags!=NULL)
-      RUNFREE(enterflags);
-  }
-}
-
-/* Offers object `vptr` to every parameter queue in `vqueues` (vlength
- * entries) whose tag and flag requirements it meets, by calling
- * enqueuetasks() for each such queue.
- *
- * If `vqueues` is NULL, the queues registered for this core and the
- * object's type are used.  Cores with BAMBOO_NUM_OF_CORE beyond
- * NUMCORESACTIVE-1 return immediately.
- *
- * A queue is skipped when it requires tags and the object lacks any of
- * them.  Otherwise the first flag term whose (flag & andmask) equals
- * checkmask triggers the enqueue; the previously enqueued queue is passed
- * along as the second argument. */
-void enqueueObject(void * vptr,
-                   struct parameterwrapper ** vqueues,
-                   int vlength) {
-  struct ___Object___ *obj = (struct ___Object___ *)vptr;
-  struct parameterwrapper ** queues = vqueues;
-  struct parameterwrapper * lastqueued = NULL;
-  struct ___Object___ *tags = NULL;
-  int length = vlength;
-  int q;
-
-  if(BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1) {
-    return;
-  }
-  if(queues == NULL) {
-    queues = objectqueues[BAMBOO_NUM_OF_CORE][obj->type];
-    length = numqueues[BAMBOO_NUM_OF_CORE][obj->type];
-  }
-  tags = obj->___tags___;
-
-  /* Visit every parameter queue an object of this type could be in. */
-  for(q = 0; q < length; ++q) {
-    struct parameterwrapper * param = queues[q];
-    int t;
-
-    /* Tag requirements */
-    if (param->numbertags > 0) {
-      if (tags == NULL) {
-        continue;   /* queue needs tags, object has none */
-      }
-      if (tags->type == TAGTYPE) {
-        /* Object carries exactly one tag: every required id must equal it. */
-        bool mismatch = false;
-        for(t = 0; t < param->numbertags; t++) {
-          /* slot id is param->tagarray[2*t], tag id follows it */
-          if (param->tagarray[2*t+1] != tags->flag) {
-            mismatch = true;
-            break;
-          }
-        }
-        if (mismatch) {
-          continue;
-        }
-      } else {
-        /* Object carries a tag array: each required id must occur in it. */
-        struct ArrayObject * tagarr = (struct ArrayObject *) tags;
-        bool missing = false;
-        for(t = 0; t < param->numbertags && !missing; t++) {
-          int wanted = param->tagarray[2*t+1];
-          int k;
-          missing = true;
-          for(k = 0; k < tagarr->___cachedCode___; k++) {
-            if (wanted == ARRAYGET(tagarr, struct ___TagDescriptor___*, k)->flag) {
-              missing = false;
-              break;
-            }
-          }
-        }
-        if (missing) {
-          continue;
-        }
-      }
-    }
-
-    /* Flag requirements: the first matching term wins. */
-    for(t = 0; t < param->numberofterms; t++) {
-      int andmask = param->intarray[t*2];
-      int checkmask = param->intarray[t*2+1];
-      if ((obj->flag & andmask) == checkmask) {
-        enqueuetasks(param, lastqueued, obj, NULL, 0);
-        lastqueued = param;
-        break;
-      }
-    }
-  }
-}
-
-/* Variant of enqueueObject() that enqueues through enqueuetasks_I().
- *
- * Offers object `vptr` to every parameter queue in `vqueues` (vlength
- * entries) whose tag and flag requirements it meets.  If `vqueues` is NULL,
- * the queues registered for this core and the object's type are used.
- * Cores with BAMBOO_NUM_OF_CORE beyond NUMCORESACTIVE-1 return immediately.
- *
- * A queue is skipped when it requires tags and the object lacks any of
- * them.  Otherwise the first flag term whose (flag & andmask) equals
- * checkmask triggers the enqueue; the previously enqueued queue is passed
- * along as the second argument. */
-void enqueueObject_I(void * vptr,
-                     struct parameterwrapper ** vqueues,
-                     int vlength) {
-  struct ___Object___ *obj = (struct ___Object___ *)vptr;
-  struct parameterwrapper ** queues = vqueues;
-  struct parameterwrapper * lastqueued = NULL;
-  struct ___Object___ *tags = NULL;
-  int length = vlength;
-  int q;
-
-  if(BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1) {
-    return;
-  }
-  if(queues == NULL) {
-    queues = objectqueues[BAMBOO_NUM_OF_CORE][obj->type];
-    length = numqueues[BAMBOO_NUM_OF_CORE][obj->type];
-  }
-  tags = obj->___tags___;
-
-  /* Visit every parameter queue an object of this type could be in. */
-  for(q = 0; q < length; ++q) {
-    struct parameterwrapper * param = queues[q];
-    int t;
-
-    /* Tag requirements */
-    if (param->numbertags > 0) {
-      if (tags == NULL) {
-        continue;   /* queue needs tags, object has none */
-      }
-      if (tags->type == TAGTYPE) {
-        /* Object carries exactly one tag: every required id must equal it. */
-        bool mismatch = false;
-        for(t = 0; t < param->numbertags; t++) {
-          /* slot id is param->tagarray[2*t], tag id follows it */
-          if (param->tagarray[2*t+1] != tags->flag) {
-            mismatch = true;
-            break;
-          }
-        }
-        if (mismatch) {
-          continue;
-        }
-      } else {
-        /* Object carries a tag array: each required id must occur in it. */
-        struct ArrayObject * tagarr = (struct ArrayObject *) tags;
-        bool missing = false;
-        for(t = 0; t < param->numbertags && !missing; t++) {
-          int wanted = param->tagarray[2*t+1];
-          int k;
-          missing = true;
-          for(k = 0; k < tagarr->___cachedCode___; k++) {
-            if (wanted == ARRAYGET(tagarr, struct ___TagDescriptor___*, k)->flag) {
-              missing = false;
-              break;
-            }
-          }
-        }
-        if (missing) {
-          continue;
-        }
-      }
-    }
-
-    /* Flag requirements: the first matching term wins. */
-    for(t = 0; t < param->numberofterms; t++) {
-      int andmask = param->intarray[t*2];
-      int checkmask = param->intarray[t*2+1];
-      if ((obj->flag & andmask) == checkmask) {
-        enqueuetasks_I(param, lastqueued, obj, NULL, 0);
-        lastqueued = param;
-        break;
-      }
-    }
-  }
-}
-
-
-/* Picks the lock that a group of `length` objects in `ptrs` should share.
- *
- * Each object's lock value is its own lock field, or the object's address
- * when that field is NULL (values are handled as int).  Objects are scanned
- * in order:
- *  - until a lock already present in `tbl` is met, distinct lock values are
- *    collected in ascending order;
- *  - once such a lock is met, its table entry becomes the redirect target;
- *    every lock collected so far and every later lock that differs from the
- *    target is added to `tbl` as redirected to it.
- * Returns the redirect target if one was found, otherwise the smallest
- * collected lock value.  With length == 0 a freshly RUNMALLOC'ed int is
- * returned. */
-int * getAliasLock(void ** ptrs,
-                   int length,
-                   struct RuntimeHash * tbl) {
-  if(length == 0) {
-    return (int*)(RUNMALLOC(sizeof(int)));
-  }
-  {
-    int sorted[length];      /* distinct locks seen so far, ascending */
-    int count = 0;
-    bool redirected = false;
-    int target = 0;
-    int idx;
-
-    for(idx = 0; idx < length; idx++) {
-      struct ___Object___ * obj = (struct ___Object___ *)(ptrs[idx]);
-      int lock = (obj->lock == NULL) ? (int)(obj) : (int)(obj->lock);
-      int pos;
-
-      if(redirected) {
-        /* A target is already fixed: just point this lock at it. */
-        if(lock != target) {
-          RuntimeHashadd(tbl, lock, target);
-        }
-        continue;
-      }
-
-      if(RuntimeHashcontainskey(tbl, lock)) {
-        // already redirected
-        redirected = true;
-        RuntimeHashget(tbl, lock, &target);
-        for(pos = 0; pos < count; pos++) {
-          if(sorted[pos] != target) {
-            RuntimeHashadd(tbl, sorted[pos], target);
-          }
-        }
-        continue;
-      }
-
-      /* Sorted insert without duplicates. */
-      pos = 0;
-      while(pos < count && sorted[pos] < lock) {
-        pos++;
-      }
-      if(pos < count && sorted[pos] == lock) {
-        continue;
-      }
-      {
-        int h;
-        for(h = count; h > pos; h--) {
-          sorted[h] = sorted[h-1];
-        }
-      }
-      sorted[pos] = lock;
-      count++;
-    }
-
-    return redirected ? (int *)target : (int *)(sorted[0]);
-  }
-}
-
-/* Makes `lock` the alias lock of the object at `ptr`.  Nothing is written
- * when `lock` is the object's own address or is already stored in the
- * object's lock field. */
-void addAliasLock(void * ptr,
-                  int lock) {
-  struct ___Object___ * obj = (struct ___Object___ *)ptr;
-  if((int)ptr == lock) {
-    return;
-  }
-  if(obj->lock == (int*)lock) {
-    return;
-  }
-  // originally no alias lock associated or have a different alias lock
-  // flush it as the new one
-  obj->lock = (int *)lock;
-}
-
-#ifdef PROFILE
-/* Stores `index` as the exit index of the task-info entry currently
- * selected by taskInfoIndex. */
-inline void setTaskExitIndex(int index) {
-  (*taskInfoArray[taskInfoIndex]).exitIndex = index;
-}
-
-/* Appends `nobj` to the newObjs queue of the task-info entry currently
- * selected by taskInfoIndex, creating that queue on first use. */
-inline void addNewObjInfo(void * nobj) {
-  if(NULL == (*taskInfoArray[taskInfoIndex]).newObjs) {
-    (*taskInfoArray[taskInfoIndex]).newObjs = createQueue();
-  }
-  addNewItem((*taskInfoArray[taskInfoIndex]).newObjs, nobj);
-}
-#endif
-
-#ifdef MULTICORE_GC
-// Only allocate local mem chunks to each core.
-// If a core has used up its local shared memory, no block is returned
-// (NULL with *allocsize == 0); this function does not start gc itself.
-//
-// coren     -- requesting core; mapped to a gc core number modulo
-//              NUMCORES4GC
-// isize     -- minimum number of bytes required
-// allocsize -- out: number of bytes actually handed out (all free space of
-//              the chosen run of blocks, which may exceed isize), or 0
-// returns   -- start address of the allocated space, or NULL
-//
-// bamboo_smemtbl[b] is compared against the block size and subtracted from
-// it to obtain the free space, i.e. it is treated as the number of bytes
-// already in use in block b.  Blocks with index < NUMCORES4GC have size
-// BAMBOO_SMEM_SIZE_L, all others BAMBOO_SMEM_SIZE.
-void * localmalloc_I(int coren,
-                     int isize,
-                     int * allocsize) {
-  void * mem = NULL;
-  int gccorenum = (coren < NUMCORES4GC) ? (coren) : (coren % NUMCORES4GC);
-  // i selects one of the two gc_core2block entries of the core, j selects
-  // the round: each round advances the block index by NUMCORES4GC*2
-  int i = 0;
-  int j = 0;
-  // tofindb: first block of the candidate run; totest: block being examined
-  int tofindb = gc_core2block[2*gccorenum+i]+(NUMCORES4GC*2)*j;
-  int totest = tofindb;
-  int bound = BAMBOO_SMEM_SIZE_L;
-  // 0: still searching, 1: found a suitable run, 2: ran out of blocks
-  int foundsmem = 0;
-  int size = 0;
-  do {
-    bound = (totest < NUMCORES4GC) ? BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
-    int nsize = bamboo_smemtbl[totest];
-    bool islocal = true;
-    if(nsize < bound) {
-      bool tocheck = true;
-      // have some space in the block
-      if(totest == tofindb) {
-		// the first partition
-		size = bound - nsize;
-      } else if(nsize == 0) {
-		// an empty partition, can be appended
-		size += bound;
-      } else {
-		// not an empty partition, can not be appended
-		// the last continuous block is not big enough, go to check the next
-		// local block
-		islocal = true;
-		tocheck = false;
-      } // if(totest == tofindb) else if(nsize == 0) else ...
-      if(tocheck) {
-		if(size >= isize) {
-		  // have enough space in the block, malloc
-		  foundsmem = 1;
-		  break;
-		} else {
-		  // no enough space yet, try to append next continuous block
-		  islocal = false;
-		}  // if(size >= isize) else ...
-      }  // if(tocheck)
-    } // if(nsize < bound)
-    if(islocal) {
-      // no space in the block, go to check the next block
-      i++;
-      if(2==i) {
-		i = 0;
-		j++;
-      }
-      tofindb = totest = gc_core2block[2*gccorenum+i]+(NUMCORES4GC*2)*j;
-    } else {
-      totest += 1;
-    }  // if(islocal) else ...
-    if(totest > gcnumblock-1-bamboo_reserved_smem) {
-      // no more local mem, do not find suitable block
-      foundsmem = 2;
-      break;
-    }  // if(totest > gcnumblock-1-bamboo_reserved_smem) ...
-  } while(true);
-
-  if(foundsmem == 1) {
-    // find suitable block: address = base + used bytes of the first block
-    // + start offset of that block
-    mem = gcbaseva+bamboo_smemtbl[tofindb]+((tofindb<NUMCORES4GC) ?
-          (BAMBOO_SMEM_SIZE_L*tofindb) : (BAMBOO_LARGE_SMEM_BOUND+
-          (tofindb-NUMCORES4GC)*BAMBOO_SMEM_SIZE));
-    *allocsize = size;
-    // set bamboo_smemtbl: every block of the run is marked as full
-    for(i = tofindb; i <= totest; i++) {
-      bamboo_smemtbl[i]=(i<NUMCORES4GC)?BAMBOO_SMEM_SIZE_L:BAMBOO_SMEM_SIZE;
-    }
-  } else if(foundsmem == 2) {
-    // no suitable block
-    *allocsize = 0;
-  }
-
-  return mem;
-} // void * localmalloc_I(int, int, int *)
-
-#ifdef SMEMF
-// Allocate the local shared memory to each core with the highest priority,
-// if a core has used up its local shared memory, try to allocate the
-// shared memory that belongs to its neighbours (core2test[gccorenum][k] for
-// k = 1 .. NUM_CORES2TEST-1, entries equal to -1 are skipped).  If that
-// fails too, NULL is returned with *allocsize == 0.
-//
-// coren     -- requesting core; mapped to a gc core number modulo
-//              NUMCORES4GC
-// isize     -- minimum number of bytes required
-// allocsize -- out: number of bytes actually handed out (all free space of
-//              the chosen run of blocks, which may exceed isize), or 0
-// returns   -- start address of the allocated space, or NULL
-//
-// The unused locals of the earlier version (coords_x, coords_y, ii) have
-// been dropped; the search itself is unchanged.
-void * fixedmalloc_I(int coren,
-                     int isize,
-                     int * allocsize) {
-  void * mem = NULL;
-  // i selects one of the two gc_core2block entries of the core under test,
-  // j selects the round, k indexes the core under test in core2test
-  int i = 0;
-  int j = 0;
-  int k = 0;
-  int gccorenum = (coren < NUMCORES4GC) ? (coren) : (coren % NUMCORES4GC);
-  // tofindb: first block of the candidate run; totest: block being examined
-  int tofindb = gc_core2block[2*core2test[gccorenum][k]+i]+(NUMCORES4GC*2)*j;
-  int totest = tofindb;
-  int bound = BAMBOO_SMEM_SIZE_L;
-  // 0: still searching, 1: found a suitable run, 2: nothing available
-  int foundsmem = 0;
-  int size = 0;
-  do {
-    bound = (totest < NUMCORES4GC) ? BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
-    int nsize = bamboo_smemtbl[totest];
-    bool islocal = true;
-    if(nsize < bound) {
-      bool tocheck = true;
-      // have some space in the block
-      if(totest == tofindb) {
-        // the first partition
-        size = bound - nsize;
-      } else if(nsize == 0) {
-        // an empty partition, can be appended
-        size += bound;
-      } else {
-        // not an empty partition, can not be appended
-        // the last continuous block is not big enough, go to check the next
-        // local block
-        islocal = true;
-        tocheck = false;
-      } // if(totest == tofindb) else if(nsize == 0) else ...
-      if(tocheck) {
-        if(size >= isize) {
-          // have enough space in the block, malloc
-          foundsmem = 1;
-          break;
-        } else {
-          // no enough space yet, try to append next continuous block
-          // TODO may consider to go to next local block?
-          islocal = false;
-        }  // if(size >= isize) else ...
-      }  // if(tocheck)
-    } // if(nsize < bound)
-    if(islocal) {
-      // no space in the block, go to check the next block
-      i++;
-      if(2==i) {
-        i = 0;
-        j++;
-      }
-      tofindb=totest=
-        gc_core2block[2*core2test[gccorenum][k]+i]+(NUMCORES4GC*2)*j;
-    } else {
-      totest += 1;
-    }  // if(islocal) else ...
-    if(totest > gcnumblock-1-bamboo_reserved_smem) {
-      // no more local mem, do not find suitable block on local mem
-      // try to malloc shared memory assigned to the neighbour cores
-      do{
-        k++;
-        if(k >= NUM_CORES2TEST) {
-          // no more memory available on either coren or its neighbour cores
-          foundsmem = 2;
-          goto memsearchresult;
-        }
-      } while(core2test[gccorenum][k] == -1);
-      i = 0;
-      j = 0;
-      tofindb=totest=
-        gc_core2block[2*core2test[gccorenum][k]+i]+(NUMCORES4GC*2)*j;
-    }  // if(totest > gcnumblock-1-bamboo_reserved_smem) ...
-  } while(true);
-
-memsearchresult:
-  if(foundsmem == 1) {
-    // find suitable block: address = base + used bytes of the first block
-    // + start offset of that block
-    mem = gcbaseva+bamboo_smemtbl[tofindb]+((tofindb<NUMCORES4GC) ?
-          (BAMBOO_SMEM_SIZE_L*tofindb) : (BAMBOO_LARGE_SMEM_BOUND+
-          (tofindb-NUMCORES4GC)*BAMBOO_SMEM_SIZE));
-    *allocsize = size;
-    // set bamboo_smemtbl: every block of the run is marked as full
-    for(i = tofindb; i <= totest; i++) {
-      bamboo_smemtbl[i]=(i<NUMCORES4GC)?BAMBOO_SMEM_SIZE_L:BAMBOO_SMEM_SIZE;
-    }
-  } else if(foundsmem == 2) {
-    // no suitable block
-    *allocsize = 0;
-  }
-
-  return mem;
-} // void * fixedmalloc_I(int, int, int *)
-#endif // #ifdef SMEMF
-
-#ifdef SMEMM
-// Allocate the local shared memory to each core with the highest priority,
-// if a core has used up its local shared memory, try to allocate the
-// shared memory that belongs to its neighbours first (core2test entries,
-// -1 entries are skipped).  If that fails, check the current memory
-// allocation rate: if gcmem_mixed_usedmem has reached
-// gcmem_mixed_threshold, give up (NULL, *allocsize == 0); otherwise
-// delegate to globalmalloc_I and return its result directly.
-//
-// coren     -- requesting core; mapped to a gc core number modulo
-//              NUMCORES4GC
-// isize     -- minimum number of bytes required
-// allocsize -- out: number of bytes actually handed out, or 0
-// returns   -- start address of the allocated space, or NULL
-//
-// The unused local `ii` of the earlier version has been dropped; the
-// search itself is unchanged.
-void * mixedmalloc_I(int coren,
-                     int isize,
-                     int * allocsize) {
-  void * mem = NULL;
-  // i selects one of the two gc_core2block entries of the core under test,
-  // j selects the round, k indexes the core under test in core2test
-  int i = 0;
-  int j = 0;
-  int k = 0;
-  int gccorenum = (coren < NUMCORES4GC) ? (coren) : (coren % NUMCORES4GC);
-  // tofindb: first block of the candidate run; totest: block being examined
-  int tofindb = gc_core2block[2*core2test[gccorenum][k]+i]+(NUMCORES4GC*2)*j;
-  int totest = tofindb;
-  int bound = BAMBOO_SMEM_SIZE_L;
-  // 0: still searching, 1: found a suitable run, 2: nothing available
-  int foundsmem = 0;
-  int size = 0;
-  do {
-    bound = (totest < NUMCORES4GC) ? BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
-    int nsize = bamboo_smemtbl[totest];
-    bool islocal = true;
-    if(nsize < bound) {
-      bool tocheck = true;
-      // have some space in the block
-      if(totest == tofindb) {
-        // the first partition
-        size = bound - nsize;
-      } else if(nsize == 0) {
-        // an empty partition, can be appended
-        size += bound;
-      } else {
-        // not an empty partition, can not be appended
-        // the last continuous block is not big enough, go to check the next
-        // local block
-        islocal = true;
-        tocheck = false;
-      } // if(totest == tofindb) else if(nsize == 0) else ...
-      if(tocheck) {
-        if(size >= isize) {
-          // have enough space in the block, malloc
-          foundsmem = 1;
-          break;
-        } else {
-          // no enough space yet, try to append next continuous block
-          // TODO may consider to go to next local block?
-          islocal = false;
-        }  // if(size >= isize) else ...
-      }  // if(tocheck)
-    } // if(nsize < bound)
-    if(islocal) {
-      // no space in the block, go to check the next block
-      i++;
-      if(2==i) {
-        i = 0;
-        j++;
-      }
-      tofindb=totest=
-        gc_core2block[2*core2test[gccorenum][k]+i]+(NUMCORES4GC*2)*j;
-    } else {
-      totest += 1;
-    }  // if(islocal) else ...
-    if(totest > gcnumblock-1-bamboo_reserved_smem) {
-      // no more local mem, do not find suitable block on local mem
-      // try to malloc shared memory assigned to the neighbour cores
-      do{
-        k++;
-        if(k >= NUM_CORES2TEST) {
-          if(gcmem_mixed_usedmem >= gcmem_mixed_threshold) {
-            // no more memory available on either coren or its neighbour cores
-            foundsmem = 2;
-            goto memmixedsearchresult;
-          } else {
-            // try allocate globally
-            mem = globalmalloc_I(coren, isize, allocsize);
-            return mem;
-          }
-        }
-      } while(core2test[gccorenum][k] == -1);
-      i = 0;
-      j = 0;
-      tofindb=totest=
-        gc_core2block[2*core2test[gccorenum][k]+i]+(NUMCORES4GC*2)*j;
-    }  // if(totest > gcnumblock-1-bamboo_reserved_smem) ...
-  } while(true);
-
-memmixedsearchresult:
-  if(foundsmem == 1) {
-    // find suitable block: address = base + used bytes of the first block
-    // + start offset of that block
-    mem = gcbaseva+bamboo_smemtbl[tofindb]+((tofindb<NUMCORES4GC) ?
-          (BAMBOO_SMEM_SIZE_L*tofindb) : (BAMBOO_LARGE_SMEM_BOUND+
-          (tofindb-NUMCORES4GC)*BAMBOO_SMEM_SIZE));
-    *allocsize = size;
-    // set bamboo_smemtbl: every block of the run is marked as full
-    for(i = tofindb; i <= totest; i++) {
-      bamboo_smemtbl[i]=(i<NUMCORES4GC)?BAMBOO_SMEM_SIZE_L:BAMBOO_SMEM_SIZE;
-    }
-    // account for the space handed out by the local/neighbour search
-    gcmem_mixed_usedmem += size;
-    if(tofindb == bamboo_free_block) {
-      bamboo_free_block = totest+1;
-    }
-  } else if(foundsmem == 2) {
-    // no suitable block
-    *allocsize = 0;
-  }
-
-  return mem;
-} // void * mixedmalloc_I(int, int, int *)
-#endif // #ifdef SMEMM
-
-// Allocate all the memory chunks globally, do not consider the host cores.
-// When all the shared memory is used up, NULL is returned with
-// *allocsize == 0; this function does not start gc itself.
-//
-// coren     -- requesting core (not used by the search)
-// isize     -- minimum number of bytes required
-// allocsize -- out: number of bytes actually handed out (all free space of
-//              the chosen run of blocks, which may exceed isize), or 0
-// returns   -- start address of the allocated space, or NULL
-//
-// The search starts at bamboo_free_block and walks the blocks in index
-// order, gathering a run that starts with a block having free space and
-// continues over completely empty blocks.
-void * globalmalloc_I(int coren,
-                      int isize,
-                      int * allocsize) {
-  void * mem = NULL;
-  // tofindb: first block of the candidate run; totest: block being examined
-  int tofindb = bamboo_free_block;       //0;
-  int totest = tofindb;
-  int bound = BAMBOO_SMEM_SIZE_L;
-  // 0: still searching, 1: found a suitable run, 2: ran out of blocks
-  int foundsmem = 0;
-  int size = 0;
-  if(tofindb > gcnumblock-1-bamboo_reserved_smem) {
-	// Out of shared memory
-    *allocsize = 0;
-    return NULL;
-  }
-  do {
-    bound = (totest < NUMCORES4GC) ? BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
-    int nsize = bamboo_smemtbl[totest];
-    // isnext: the current run cannot be extended, restart at the next block
-    bool isnext = false;
-    if(nsize < bound) {
-      bool tocheck = true;
-      // have some space in the block
-      if(totest == tofindb) {
-		// the first partition
-		size = bound - nsize;
-      } else if(nsize == 0) {
-		// an empty partition, can be appended
-		size += bound;
-      } else {
-		// not an empty partition, can not be appended
-		// the last continuous block is not big enough, start another block
-		isnext = true;
-		tocheck = false;
-      }  // if(totest == tofindb) else if(nsize == 0) else ...
-      if(tocheck) {
-		if(size >= isize) {
-		  // have enough space in the block, malloc
-		  foundsmem = 1;
-		  break;
-		}  // if(size >= isize)
-      }   // if(tocheck)
-    } else {
-      // block is completely used
-      isnext = true;
-    }  // if(nsize < bound) else ...
-    totest += 1;
-    if(totest > gcnumblock-1-bamboo_reserved_smem) {
-      // no more mem, do not find suitable block
-      foundsmem = 2;
-      break;
-    }  // if(totest > gcnumblock-1-bamboo_reserved_smem) ...
-    if(isnext) {
-      // start another block
-      tofindb = totest;
-    } // if(isnext)
-  } while(true);
-
-  if(foundsmem == 1) {
-    // find suitable block: address = base + used bytes of the first block
-    // + start offset of that block
-    mem = gcbaseva+bamboo_smemtbl[tofindb]+((tofindb<NUMCORES4GC) ?
-          (BAMBOO_SMEM_SIZE_L*tofindb) : (BAMBOO_LARGE_SMEM_BOUND+
-          (tofindb-NUMCORES4GC)*BAMBOO_SMEM_SIZE));
-    *allocsize = size;
-    // set bamboo_smemtbl: every block of the run is marked as full
-    for(int i = tofindb; i <= totest; i++) {
-      bamboo_smemtbl[i]=(i<NUMCORES4GC)?BAMBOO_SMEM_SIZE_L:BAMBOO_SMEM_SIZE;
-    }
-    // advance the global start hint only if the run began exactly there
-    if(tofindb == bamboo_free_block) {
-      bamboo_free_block = totest+1;
-    }
-  } else if(foundsmem == 2) {
-    // no suitable block
-    *allocsize = 0;
-    mem = NULL;
-  }
-
-  return mem;
-} // void * globalmalloc_I(int, int, int *)
-#endif // #ifdef MULTICORE_GC
-
-// malloc from the shared memory
-//
-// coren     -- requesting core
-// size      -- number of bytes requested
-// allocsize -- out: number of bytes actually handed out, 0 on failure
-// returns   -- start address of the allocated space, or NULL (MULTICORE_GC
-//              only)
-//
-// With MULTICORE_GC the request is padded by BAMBOO_CACHE_LINE_SIZE and
-// dispatched to the allocator selected by bamboo_smem_mode; if no memory is
-// found and no gc is flagged yet, gcflag/gcprecheck are set, the per-core
-// gc bookkeeping is reset and GCSTARTPRE is sent to every other active
-// core before NULL is returned.
-// Without MULTICORE_GC at least BAMBOO_SMEM_SIZE bytes are carved off
-// bamboo_free_smemp; running out of memory ends in BAMBOO_EXIT(0xa001).
-void * smemalloc_I(int coren,
-                   int size,
-                   int * allocsize) {
-  void * mem = NULL;
-#ifdef MULTICORE_GC
-  int isize = size+(BAMBOO_CACHE_LINE_SIZE);
-
-  // go through the bamboo_smemtbl for suitable partitions
-  switch(bamboo_smem_mode) {
-  case SMEMLOCAL: {
-    mem = localmalloc_I(coren, isize, allocsize);
-    break;
-  }
-
-  case SMEMFIXED: {
-#ifdef SMEMF
-	mem = fixedmalloc_I(coren, isize, allocsize);
-#else
-	// not supported yet
-	BAMBOO_EXIT(0xe001);
-#endif
-    break;
-  }
-
-  case SMEMMIXED: {
-#ifdef SMEMM
-	mem = mixedmalloc_I(coren, isize, allocsize);
-#else
-	// not supported yet
-    BAMBOO_EXIT(0xe002);
-#endif
-    break;
-  }
-
-  case SMEMGLOBAL: {
-    mem = globalmalloc_I(coren, isize, allocsize);
-    break;
-  }
-
-  default:
-    // unknown mode: mem stays NULL and is handled as out-of-memory below
-    break;
-  }
-
-  // NOTE: this if-statement is opened once per preprocessor branch and
-  // closed by the single '}' after the final #endif below
-  if(mem == NULL) {
-#else 
-  int toallocate = (size>(BAMBOO_SMEM_SIZE)) ? (size) : (BAMBOO_SMEM_SIZE);
-  if(toallocate > bamboo_free_smem_size) {
-	// no enough mem
-	mem = NULL;
-  } else {
-	mem = (void *)bamboo_free_smemp;
-	bamboo_free_smemp = ((void*)bamboo_free_smemp) + toallocate;
-	bamboo_free_smem_size -= toallocate;
-  }
-  *allocsize = toallocate;
-  if(mem == NULL) {
-#endif // MULTICORE_GC
-    // no enough shared global memory
-    *allocsize = 0;
-#ifdef MULTICORE_GC
-	if(!gcflag) {
-	  gcflag = true;
-	  // inform other cores to stop and wait for gc
-	  gcprecheck = true;
-	  for(int i = 0; i < NUMCORESACTIVE; i++) {
-		// reuse the gcnumsendobjs & gcnumreceiveobjs
-		gccorestatus[i] = 1;
-		gcnumsendobjs[0][i] = 0;
-		gcnumreceiveobjs[0][i] = 0;
-	  }
-	  for(int i = 0; i < NUMCORESACTIVE; i++) {
-		if(i != BAMBOO_NUM_OF_CORE) {
-		  // either queue the message or send it right away, depending on
-		  // the current send mode
-		  if(BAMBOO_CHECK_SEND_MODE()) {
-			cache_msg_1(i, GCSTARTPRE);
-		  } else {
-			send_msg_1(i, GCSTARTPRE, true);
-		  }
-		}
-	  }
-	}
-	return NULL;
-#else
-    BAMBOO_DEBUGPRINT(0xa001);
-    BAMBOO_EXIT(0xa001);
-#endif
-  }
-  return mem;
-}  // void * smemalloc_I(int, int, int *)
-
-// Determines the length of the message whose type word is at
-// msgdata[msgdataindex] and stores it in the global msglength.
-//
-// size    -- number of message words currently available; only consulted
-//            for variable-length messages
-// returns -- msglength, or -1 when a variable-length message
-//            (TRANSOBJ / GCLOBJINFO) is announced but size <= 1, i.e. its
-//            length word is not available yet
-//
-// An unknown message type dumps diagnostic values and ends in
-// BAMBOO_EXIT(0xd005).
-// NOTE(review): the length presumably counts msgdata entries including the
-// type word -- confirm against the message handlers.
-INLINE int checkMsgLength_I(int size) {
-#ifdef DEBUG
-#ifndef TILERA
-  BAMBOO_DEBUGPRINT(0xcccc);
-#endif
-#endif
-  int type = msgdata[msgdataindex];
-  switch(type) {
-  // messages of length 1
-  case STATUSCONFIRM:
-  case TERMINATE:
-#ifdef MULTICORE_GC
-  case GCSTARTPRE:
-  case GCSTARTINIT:
-  case GCSTART:
-  case GCSTARTMAPINFO:
-  case GCSTARTFLUSH:
-  case GCFINISH:
-  case GCMARKCONFIRM:
-  case GCLOBJREQUEST:
-#ifdef GC_CACHE_ADAPT
-  case GCSTARTPREF:
-#endif // GC_CACHE_ADAPT
-#endif // MULTICORE_GC
-  {
-	msglength = 1;
-	break;
-  }
-
-  // messages of length 2
-  case PROFILEOUTPUT:
-  case PROFILEFINISH:
-#ifdef MULTICORE_GC
-  case GCSTARTCOMPACT:
-  case GCMARKEDOBJ:
-  case GCFINISHINIT:
-  case GCFINISHMAPINFO:
-  case GCFINISHFLUSH:
-#ifdef GC_CACHE_ADAPT
-  case GCFINISHPREF:
-#endif // GC_CACHE_ADAPT
-#endif // MULTICORE_GC
-  {
-	msglength = 2;
-	break;
-  }
-
-  // messages of length 3
-  case MEMREQUEST:
-  case MEMRESPONSE:
-#ifdef MULTICORE_GC
-  case GCMAPREQUEST:
-  case GCMAPINFO:
-  case GCMAPTBL:
-  case GCLOBJMAPPING:
-#endif
-  {
-	msglength = 3;
-	break;
-  }
-
-  // messages of length 4
-  case TRANSTALL:
-  case LOCKGROUNT:
-  case LOCKDENY:
-  case LOCKRELEASE:
-  case REDIRECTGROUNT:
-  case REDIRECTDENY:
-  case REDIRECTRELEASE:
-#ifdef MULTICORE_GC
-  case GCFINISHPRE:
-  case GCFINISHMARK:
-  case GCMOVESTART:
-#ifdef GC_PROFILE
-  case GCPROFILES:
-#endif
-#endif
-  {
-	msglength = 4;
-	break;
-  }
-
-  // messages of length 5
-  case LOCKREQUEST:
-  case STATUSREPORT:
-#ifdef MULTICORE_GC
-  case GCFINISHCOMPACT:
-  case GCMARKREPORT:
-#endif
-  {
-	msglength = 5;
-	break;
-  }
-
-  // messages of length 6
-  case REDIRECTLOCK:
-  {
-    msglength = 6;
-    break;
-  }
-
-  case TRANSOBJ:   // nonfixed size
-#ifdef MULTICORE_GC
-  case GCLOBJINFO:
-#endif
-  {             // nonfixed size
-	// the length is carried in the word following the type word
-	// NOTE(review): msgdataindex+1 is used without wrap-around; if msgdata
-	// is a ring buffer (MSG_INDEXINC_I is used to advance the index
-	// elsewhere) this may read past the end of the buffer -- confirm.
-	if(size > 1) {
-	  msglength = msgdata[msgdataindex+1];
-	} else {
-	  return -1;
-	}
-	break;
-  }
-
-  default:
-  {
-    // unknown message type: dump state and the next six words, then exit
-    BAMBOO_DEBUGPRINT_REG(type);
-	BAMBOO_DEBUGPRINT_REG(size);
-    BAMBOO_DEBUGPRINT_REG(msgdataindex);
-	BAMBOO_DEBUGPRINT_REG(msgdatalast);
-	BAMBOO_DEBUGPRINT_REG(msgdatafull);
-    int i = 6;
-    while(i-- > 0) {
-      BAMBOO_DEBUGPRINT(msgdata[msgdataindex+i]);
-    }
-    BAMBOO_EXIT(0xd005);
-    break;
-  }
-  }
-#ifdef DEBUG
-#ifndef TILERA
-  BAMBOO_DEBUGPRINT_REG(msgdata[msgdataindex]);
-#endif
-#endif
-#ifdef DEBUG
-#ifndef TILERA
-  BAMBOO_DEBUGPRINT(0xffff);
-#endif
-#endif
-  return msglength;
-}
-
-// Handles a TRANSOBJ message: reads the transferred object pointer and its
-// (msglength - 3) / 2 pairs of queue words from msgdata, replaces any entry
-// already queued in objqueue for the same object, appends the new entry and
-// increments self_numreceiveobjs.
-//
-// A core with BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1 must not receive this
-// message and ends in BAMBOO_EXIT(0xa002).
-// With MULTICORE_GC and gc in progress, the startup core re-arms gcprecheck
-// and every other core reports its send/receive counters to the startup
-// core with a GCFINISHPRE message.
-INLINE void processmsg_transobj_I() {
-#ifdef PROFILE_INTERRUPT
-  /*if(!interruptInfoOverflow) {
-    InterruptInfo* intInfo = RUNMALLOC_I(sizeof(struct interrupt_info));
-    interruptInfoArray[interruptInfoIndex] = intInfo;
-    intInfo->startTime = BAMBOO_GET_EXE_TIME();
-    intInfo->endTime = -1;
-  }*/
-#endif
-  // advance past one message word before the object pointer is read
-  // NOTE(review): presumably the length word, see checkMsgLength_I -- confirm
-  MSG_INDEXINC_I();
-  struct transObjInfo * transObj=RUNMALLOC_I(sizeof(struct transObjInfo));
-  int k = 0;
-#ifdef DEBUG
-#ifndef CLOSE_PRINT
-  BAMBOO_DEBUGPRINT(0xe880);
-#endif
-#endif
-  if(BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1) {
-#ifndef CLOSE_PRINT
-    BAMBOO_DEBUGPRINT_REG(msgdata[msgdataindex] /*[2]*/);
-#endif
-    BAMBOO_EXIT(0xa002);
-  }
-  // store the object and its corresponding queue info, enqueue it later
-  transObj->objptr = (void *)msgdata[msgdataindex];  //[2]
-  MSG_INDEXINC_I();
-  // the remaining msglength - 3 words are consumed as pairs
-  transObj->length = (msglength - 3) / 2;
-  transObj->queues = RUNMALLOC_I(sizeof(int)*(msglength - 3));
-  for(k = 0; k < transObj->length; ++k) {
-    transObj->queues[2*k] = msgdata[msgdataindex];   //[3+2*k];
-    MSG_INDEXINC_I();
-#ifdef DEBUG
-#ifndef CLOSE_PRINT
-    //BAMBOO_DEBUGPRINT_REG(transObj->queues[2*k]);
-#endif
-#endif
-    transObj->queues[2*k+1] = msgdata[msgdataindex]; //[3+2*k+1];
-    MSG_INDEXINC_I();
-#ifdef DEBUG
-#ifndef CLOSE_PRINT
-    //BAMBOO_DEBUGPRINT_REG(transObj->queues[2*k+1]);
-#endif
-#endif
-  }
-  // check if there is an existing duplicate item
-  {
-    struct QueueItem * qitem = getHead(&objqueue);
-    // prev: last item that was kept; used to continue the walk after an
-    // item has been removed
-    struct QueueItem * prev = NULL;
-    while(qitem != NULL) {
-      struct transObjInfo * tmpinfo =
-        (struct transObjInfo *)(qitem->objectptr);
-      if(tmpinfo->objptr == transObj->objptr) {
-		// the same object, remove the outdated one
-		RUNFREE(tmpinfo->queues);
-		RUNFREE(tmpinfo);
-		removeItem(&objqueue, qitem);
-		//break;
-      } else {
-		prev = qitem;
-      }
-      // continue after the last kept item, or from the head if none yet
-      if(prev == NULL) {
-		qitem = getHead(&objqueue);
-      } else {
-		qitem = getNextQueueItem(prev);
-      }
-    }
-    addNewItem_I(&objqueue, (void *)transObj);
-  }
-  ++(self_numreceiveobjs);
-#ifdef MULTICORE_GC
-  if(gcprocessing) {
-	if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
-	  // set the gcprecheck to enable checking again
-	  gcprecheck = true;
-	} else {
-	  // send a update pregc information msg to the master core
-	  if(BAMBOO_CHECK_SEND_MODE()) {
-		cache_msg_4(STARTUPCORE, GCFINISHPRE, BAMBOO_NUM_OF_CORE, 
-			self_numsendobjs, self_numreceiveobjs);
-	  } else {
-		send_msg_4(STARTUPCORE, GCFINISHPRE, BAMBOO_NUM_OF_CORE, 
-			self_numsendobjs, self_numreceiveobjs, true);
-	  }
-	}
-  }
-#endif 
-#ifdef PROFILE_INTERRUPT
-  /*if(!interruptInfoOverflow) {
-    interruptInfoArray[interruptInfoIndex]->endTime=BAMBOO_GET_EXE_TIME();
-    interruptInfoIndex++;
-    if(interruptInfoIndex == INTERRUPTINFOLENGTH) {
-      interruptInfoOverflow = true;
-    }
-  }*/
-#endif
-}
-
-INLINE void processmsg_transtall_I() {
-  if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
-    // non startup core can not receive stall msg
-#ifndef CLOSE_PRINT
-    BAMBOO_DEBUGPRINT_REG(msgdata[msgdataindex] /*[1]*/);
-#endif
-    BAMBOO_EXIT(0xa003);
-  }
-  int num_core = msgdata[msgdataindex]; //[1]
-  MSG_INDEXINC_I();
-  if(num_core < NUMCORESACTIVE) {
-#ifdef DEBUG
-#ifndef CLOSE_PRINT
-    BAMBOO_DEBUGPRINT(0xe881);
-#endif
-#endif
-    corestatus[num_core] = 0;
-    numsendobjs[num_core] = msgdata[msgdataindex]; //[2];
-    MSG_INDEXINC_I();
-    numreceiveobjs[num_core] = msgdata[msgdataindex]; //[3];
-    MSG_INDEXINC_I();
-  }
-}
-
-#ifndef MULTICORE_GC
-INLINE void processmsg_lockrequest_I() {
-  // check to see if there is a lock exist for the required obj
-  // msgdata[1] -> lock type
-  int locktype = msgdata[msgdataindex]; //[1];
-  MSG_INDEXINC_I();
-  int data2 = msgdata[msgdataindex];  // obj pointer
-  MSG_INDEXINC_I();
-  int data3 = msgdata[msgdataindex];  // lock
-  MSG_INDEXINC_I();
-  int data4 = msgdata[msgdataindex];  // request core
-  MSG_INDEXINC_I();
-  // -1: redirected, 0: approved, 1: denied
-  int deny=processlockrequest(locktype, data3, data2, data4, data4, true);
-  if(deny == -1) {
-    // this lock request is redirected
-    return;
-  } else {
-    // send response msg
-    // for 32 bit machine, the size is always 4 words, cache the msg first
-    int tmp = deny==1 ? LOCKDENY : LOCKGROUNT;
-    if(BAMBOO_CHECK_SEND_MODE()) {
-	  cache_msg_4(data4, tmp, locktype, data2, data3);
-    } else {
-	  send_msg_4(data4, tmp, locktype, data2, data3, true);
-    }
-  }
-}
-
-INLINE void processmsg_lockgrount_I() {
-  MSG_INDEXINC_I();
-  if(BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1) {
-#ifndef CLOSE_PRINT
-    BAMBOO_DEBUGPRINT_REG(msgdata[msgdataindex] /*[2]*/);
-#endif
-    BAMBOO_EXIT(0xa004);
-  }
-  int data2 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  int data3 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  if((lockobj == data2) && (lock2require == data3)) {
-#ifdef DEBUG
-#ifndef CLOSE_PRINT
-    BAMBOO_DEBUGPRINT(0xe882);
-#endif
-#endif
-    lockresult = 1;
-    lockflag = true;
-#ifndef INTERRUPT
-    reside = false;
-#endif
-  } else {
-    // conflicts on lockresults
-#ifndef CLOSE_PRINT
-    BAMBOO_DEBUGPRINT_REG(data2);
-#endif
-    BAMBOO_EXIT(0xa005);
-  }
-}
-
-INLINE void processmsg_lockdeny_I() {
-  MSG_INDEXINC_I();
-  int data2 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  int data3 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  if(BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1) {
-#ifndef CLOSE_PRINT
-    BAMBOO_DEBUGPRINT_REG(data2);
-#endif
-    BAMBOO_EXIT(0xa006);
-  }
-  if((lockobj == data2) && (lock2require == data3)) {
-#ifdef DEBUG
-#ifndef CLOSE_PRINT
-    BAMBOO_DEBUGPRINT(0xe883);
-#endif
-#endif
-    lockresult = 0;
-    lockflag = true;
-#ifndef INTERRUPT
-    reside = false;
-#endif
-  } else {
-    // conflicts on lockresults
-#ifndef CLOSE_PRINT
-    BAMBOO_DEBUGPRINT_REG(data2);
-#endif
-    BAMBOO_EXIT(0xa007);
-  }
-}
-
-INLINE void processmsg_lockrelease_I() {
-  int data1 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  int data2 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  // receive lock release msg
-  processlockrelease(data1, data2, 0, false);
-}
-
-INLINE void processmsg_redirectlock_I() {
-  // check to see if there is a lock exist for the required obj
-  int data1 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();    //msgdata[1]; // lock type
-  int data2 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();    //msgdata[2]; // obj pointer
-  int data3 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();    //msgdata[3]; // redirect lock
-  int data4 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();    //msgdata[4]; // root request core
-  int data5 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();    //msgdata[5]; // request core
-  int deny = processlockrequest(data1, data3, data2, data5, data4, true);
-  if(deny == -1) {
-    // this lock request is redirected
-    return;
-  } else {
-    // send response msg
-    // for 32 bit machine, the size is always 4 words, cache the msg first
-    if(BAMBOO_CHECK_SEND_MODE()) {
-	  cache_msg_4(data4, deny==1 ? REDIRECTDENY : REDIRECTGROUNT,
-				  data1, data2, data3);
-    } else {
-	  send_msg_4(data4, deny==1?REDIRECTDENY:REDIRECTGROUNT,
-				 data1, data2, data3, true);
-    }
-  }
-}
-
-INLINE void processmsg_redirectgrount_I() {
-  MSG_INDEXINC_I();
-  int data2 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  if(BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1) {
-#ifndef CLOSE_PRINT
-    BAMBOO_DEBUGPRINT_REG(data2);
-#endif
-    BAMBOO_EXIT(0xa00a);
-  }
-  if(lockobj == data2) {
-#ifdef DEBUG
-#ifndef CLOSE_PRINT
-    BAMBOO_DEBUGPRINT(0xe891);
-#endif
-#endif
-    int data3 = msgdata[msgdataindex];
-    MSG_INDEXINC_I();
-    lockresult = 1;
-    lockflag = true;
-    RuntimeHashadd_I(objRedirectLockTbl, lockobj, data3);
-#ifndef INTERRUPT
-    reside = false;
-#endif
-  } else {
-    // conflicts on lockresults
-#ifndef CLOSE_PRINT
-    BAMBOO_DEBUGPRINT_REG(data2);
-#endif
-    BAMBOO_EXIT(0xa00b);
-  }
-}
-
-INLINE void processmsg_redirectdeny_I() {
-  MSG_INDEXINC_I();
-  int data2 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  if(BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1) {
-#ifndef CLOSE_PRINT
-    BAMBOO_DEBUGPRINT_REG(data2);
-#endif
-    BAMBOO_EXIT(0xa00c);
-  }
-  if(lockobj == data2) {
-#ifdef DEBUG
-#ifndef CLOSE_PRINT
-    BAMBOO_DEBUGPRINT(0xe892);
-#endif
-#endif
-    lockresult = 0;
-    lockflag = true;
-#ifndef INTERRUPT
-    reside = false;
-#endif
-  } else {
-    // conflicts on lockresults
-#ifndef CLOSE_PRINT
-    BAMBOO_DEBUGPRINT_REG(data2);
-#endif
-    BAMBOO_EXIT(0xa00d);
-  }
-}
-
-INLINE void processmsg_redirectrelease_I() {
-  int data1 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  int data2 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  int data3 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  processlockrelease(data1, data2, data3, true);
-}
-#endif // #ifndef MULTICORE_GC
-
-#ifdef PROFILE
-INLINE void processmsg_profileoutput_I() {
-  if(BAMBOO_NUM_OF_CORE == STARTUPCORE) {
-    // startup core can not receive profile output finish msg
-    BAMBOO_EXIT(0xa008);
-  }
-#ifdef DEBUG
-#ifndef CLOSE_PRINT
-  BAMBOO_DEBUGPRINT(0xe885);
-#endif
-#endif
-  stall = true;
-  totalexetime = msgdata[msgdataindex];  //[1]
-  MSG_INDEXINC_I();
-#ifdef RT_TEST
-  BAMBOO_DEBUGPRINT_REG(dot_num);
-#else
-  outputProfileData();
-#endif
-  // cache the msg first
-  if(BAMBOO_CHECK_SEND_MODE()) {
-	cache_msg_2(STARTUPCORE, PROFILEFINISH, BAMBOO_NUM_OF_CORE);
-  } else {
-	send_msg_2(STARTUPCORE, PROFILEFINISH, BAMBOO_NUM_OF_CORE, true);
-  }
-}
-
-INLINE void processmsg_profilefinish_I() {
-  if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
-    // non startup core can not receive profile output finish msg
-#ifndef CLOSE_PRINT
-    BAMBOO_DEBUGPRINT_REG(msgdata[msgdataindex /*1*/]);
-#endif
-    BAMBOO_EXIT(0xa009);
-  }
-#ifdef DEBUG
-#ifndef CLOSE_PRINT
-  BAMBOO_DEBUGPRINT(0xe886);
-#endif
-#endif
-  int data1 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  profilestatus[data1] = 0;
-}
-#endif // #ifdef PROFILE
-
-INLINE void processmsg_statusconfirm_I() {
-  if((BAMBOO_NUM_OF_CORE == STARTUPCORE)
-     || (BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1)) {
-    // wrong core to receive such msg
-    BAMBOO_EXIT(0xa00e);
-  } else {
-    // send response msg
-#ifdef DEBUG
-#ifndef CLOSE_PRINT
-    BAMBOO_DEBUGPRINT(0xe887);
-#endif
-#endif
-    // cache the msg first
-    if(BAMBOO_CHECK_SEND_MODE()) {
-	  cache_msg_5(STARTUPCORE, STATUSREPORT,
-				  busystatus ? 1 : 0, BAMBOO_NUM_OF_CORE,
-				  self_numsendobjs, self_numreceiveobjs);
-    } else {
-	  send_msg_5(STARTUPCORE, STATUSREPORT, busystatus?1:0,
-				 BAMBOO_NUM_OF_CORE, self_numsendobjs,
-				 self_numreceiveobjs, true);
-    }
-  }
-}
-
-INLINE void processmsg_statusreport_I() {
-  int data1 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  int data2 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  int data3 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  int data4 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  // receive a status confirm info
-  if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
-    // wrong core to receive such msg
-#ifndef CLOSE_PRINT
-    BAMBOO_DEBUGPRINT_REG(data2);
-#endif
-    BAMBOO_EXIT(0xa00f);
-  } else {
-#ifdef DEBUG
-#ifndef CLOSE_PRINT
-    BAMBOO_DEBUGPRINT(0xe888);
-#endif
-#endif
-    if(waitconfirm) {
-      numconfirm--;
-    }
-    corestatus[data2] = data1;
-    numsendobjs[data2] = data3;
-    numreceiveobjs[data2] = data4;
-  }
-}
-
-INLINE void processmsg_terminate_I() {
-#ifdef DEBUG
-#ifndef CLOSE_PRINT
-  BAMBOO_DEBUGPRINT(0xe889);
-#endif
-#endif
-  disruntimedata();
-#ifdef MULTICORE_GC
-#ifdef GC_CACHE_ADAPT
-  bamboo_mask_timer_intr(); // disable the TILE_TIMER interrupt
-#endif // GC_CACHE_ADAPT
-#endif // MULTICORE_GC
-  BAMBOO_EXIT_APP(0);
-}
-
-INLINE void processmsg_memrequest_I() {
-#ifdef PROFILE_INTERRUPT
-  /*if(!interruptInfoOverflow) {
-    InterruptInfo* intInfo = RUNMALLOC_I(sizeof(struct interrupt_info));
-    interruptInfoArray[interruptInfoIndex] = intInfo;
-    intInfo->startTime = BAMBOO_GET_EXE_TIME();
-    intInfo->endTime = -1;
-  }*/
-#endif
-  int data1 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  int data2 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  // receive a shared memory request msg
-  if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
-    // wrong core to receive such msg
-#ifndef CLOSE_PRINT
-    BAMBOO_DEBUGPRINT_REG(data2);
-#endif
-    BAMBOO_EXIT(0xa010);
-  } else {
-#ifdef DEBUG
-#ifndef CLOSE_PRINT
-    BAMBOO_DEBUGPRINT(0xe88a);
-#endif
-#endif
-    int allocsize = 0;
-    void * mem = NULL;
-#ifdef MULTICORE_GC
-    if(gcprocessing) {
-      // is currently doing gc, dump this msg
-      if(INITPHASE == gcphase) {
-		// if still in the initphase of gc, send a startinit msg again,
-		// cache the msg first
-		if(BAMBOO_CHECK_SEND_MODE()) {
-		  cache_msg_1(data2, GCSTARTINIT);
-		} else {
-		  send_msg_1(data2, GCSTARTINIT, true);
-		}
-      }
-    } else {
-#endif
-    mem = smemalloc_I(data2, data1, &allocsize);
-    if(mem != NULL) {
-      // send the start_va to request core, cache the msg first
-      if(BAMBOO_CHECK_SEND_MODE()) {
-		cache_msg_3(data2, MEMRESPONSE, mem, allocsize);
-      } else {
-		send_msg_3(data2, MEMRESPONSE, mem, allocsize, true);
-	  }
-    } //else 
-	  // if mem == NULL, the gcflag of the startup core has been set
-	  // and all the other cores have been informed to start gc
-#ifdef MULTICORE_GC
-  }
-#endif
-  }
-#ifdef PROFILE_INTERRUPT
-  /*if(!interruptInfoOverflow) {
-    interruptInfoArray[interruptInfoIndex]->endTime=BAMBOO_GET_EXE_TIME();
-    interruptInfoIndex++;
-    if(interruptInfoIndex == INTERRUPTINFOLENGTH) {
-      interruptInfoOverflow = true;
-    }
-  }*/
-#endif
-}
-
-INLINE void processmsg_memresponse_I() {
-  int data1 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  int data2 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  // receive a shared memory response msg
-#ifdef DEBUG
-#ifndef CLOSE_PRINT
-  BAMBOO_DEBUGPRINT(0xe88b);
-#endif
-#endif
-#ifdef MULTICORE_GC
-  // if is currently doing gc, dump this msg
-  if(!gcprocessing) {
-#endif
-  if(data2 == 0) {
-    bamboo_smem_size = 0;
-    bamboo_cur_msp = 0;
-#ifdef MULTICORE_GC
-	bamboo_smem_zero_top = 0;
-#endif
-  } else {
-#ifdef MULTICORE_GC
-    // fill header to store the size of this mem block
-    BAMBOO_MEMSET_WH(data1, '\0', BAMBOO_CACHE_LINE_SIZE); 
-	//memset(data1, 0, BAMBOO_CACHE_LINE_SIZE);
-    (*((int*)data1)) = data2;
-    bamboo_smem_size = data2 - BAMBOO_CACHE_LINE_SIZE;
-    bamboo_cur_msp = data1 + BAMBOO_CACHE_LINE_SIZE;
-	bamboo_smem_zero_top = bamboo_cur_msp;
-#else
-    bamboo_smem_size = data2;
-    bamboo_cur_msp =(void*)(data1);
-#endif
-  }
-  smemflag = true;
-#ifdef MULTICORE_GC
-}
-#endif
-}
-
-#ifdef MULTICORE_GC
-INLINE void processmsg_gcstartpre_I() {
-  if(gcprocessing) {
-	// already stall for gc
-	// send a update pregc information msg to the master core
-	if(BAMBOO_CHECK_SEND_MODE()) {
-	  cache_msg_4(STARTUPCORE, GCFINISHPRE, BAMBOO_NUM_OF_CORE, 
-		  self_numsendobjs, self_numreceiveobjs);
-	} else {
-	  send_msg_4(STARTUPCORE, GCFINISHPRE, BAMBOO_NUM_OF_CORE, 
-		  self_numsendobjs, self_numreceiveobjs, true);
-	}
-  } else {
-	// the first time to be informed to start gc
-	gcflag = true;
-	if(!smemflag) {
-	  // is waiting for response of mem request
-	  // let it return NULL and start gc
-	  bamboo_smem_size = 0;
-	  bamboo_cur_msp = NULL;
-	  smemflag = true;
-	  bamboo_smem_zero_top = NULL;
-	}
-  }
-}
-
-INLINE void processmsg_gcstartinit_I() {
-  gcphase = INITPHASE;
-}
-
-INLINE void processmsg_gcstart_I() {
-#ifdef DEBUG
-#ifndef CLOSE_PRINT
-  BAMBOO_DEBUGPRINT(0xe88c);
-#endif
-#endif
-  // set the GC flag
-  gcphase = MARKPHASE;
-}
-
-INLINE void processmsg_gcstartcompact_I() {
-  gcblock2fill = msgdata[msgdataindex];
-  MSG_INDEXINC_I();  //msgdata[1];
-  gcphase = COMPACTPHASE;
-}
-
-INLINE void processmsg_gcstartmapinfo_I() {
-  gcphase = MAPPHASE;
-}
-
-INLINE void processmsg_gcstartflush_I() {
-  gcphase = FLUSHPHASE;
-}
-
-INLINE void processmsg_gcfinishpre_I() {
-  int data1 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  int data2 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  int data3 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  // received a init phase finish msg
-  if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
-    // non startup core can not receive this msg
-#ifndef CLOSE_PRINT
-    BAMBOO_DEBUGPRINT_REG(data1);
-#endif
-    BAMBOO_EXIT(0xb000);
-  }
-  // All cores should do init GC
-  if(!gcprecheck) {
-	gcprecheck = true;
-  }
-  gccorestatus[data1] = 0;
-  gcnumsendobjs[0][data1] = data2;
-  gcnumreceiveobjs[0][data1] = data3;
-}
-
-INLINE void processmsg_gcfinishinit_I() {
-  int data1 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  // received a init phase finish msg
-  if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
-    // non startup core can not receive this msg
-#ifndef CLOSE_PRINT
-    BAMBOO_DEBUGPRINT_REG(data1);
-#endif
-    BAMBOO_EXIT(0xb001);
-  }
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xe88c);
-  BAMBOO_DEBUGPRINT_REG(data1);
-#endif
-  // All cores should do init GC
-  if(data1 < NUMCORESACTIVE) {
-    gccorestatus[data1] = 0;
-  }
-}
-
-INLINE void processmsg_gcfinishmark_I() {
-  int data1 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  int data2 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  int data3 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  // received a mark phase finish msg
-  if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
-    // non startup core can not receive this msg
-#ifndef CLOSE_PRINT
-    BAMBOO_DEBUGPRINT_REG(data1);
-#endif
-    BAMBOO_EXIT(0xb002);
-  }
-  // all cores should do mark
-  if(data1 < NUMCORESACTIVE) {
-    gccorestatus[data1] = 0;
-	int entry_index = 0;
-	if(waitconfirm)  {
-	  // phase 2
-	  entry_index = (gcnumsrobjs_index == 0) ? 1 : 0;
-	} else {
-	  // phase 1
-	  entry_index = gcnumsrobjs_index;
-	}
-    gcnumsendobjs[entry_index][data1] = data2;
-    gcnumreceiveobjs[entry_index][data1] = data3;
-  }
-}
-
-INLINE void processmsg_gcfinishcompact_I() {
-  if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
-    // non startup core can not receive this msg
-    // return -1
-#ifndef CLOSE_PRINT
-    BAMBOO_DEBUGPRINT_REG(msgdata[msgdataindex] /*[1]*/);
-#endif
-    BAMBOO_EXIT(0xb003);
-  }
-  int cnum = msgdata[msgdataindex];
-  MSG_INDEXINC_I();       //msgdata[1];
-  int filledblocks = msgdata[msgdataindex];
-  MSG_INDEXINC_I();       //msgdata[2];
-  int heaptop = msgdata[msgdataindex];
-  MSG_INDEXINC_I();       //msgdata[3];
-  int data4 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();       //msgdata[4];
-  // only gc cores need to do compact
-  if(cnum < NUMCORES4GC) {
-    if(COMPACTPHASE == gcphase) {
-      gcfilledblocks[cnum] = filledblocks;
-      gcloads[cnum] = heaptop;
-    }
-    if(data4 > 0) {
-      // ask for more mem
-      int startaddr = 0;
-      int tomove = 0;
-      int dstcore = 0;
-      if(gcfindSpareMem_I(&startaddr, &tomove, &dstcore, data4, cnum)) {
-		// cache the msg first
-		if(BAMBOO_CHECK_SEND_MODE()) {
-		  cache_msg_4(cnum, GCMOVESTART, dstcore, startaddr, tomove);
-		} else {
-		  send_msg_4(cnum, GCMOVESTART, dstcore, startaddr, tomove, true);
-		}
-      }
-    } else {
-      gccorestatus[cnum] = 0;
-    }  // if(data4>0)
-  }  // if(cnum < NUMCORES4GC)
-}
-
-INLINE void processmsg_gcfinishmapinfo_I() {
-  int data1 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  // received a map phase finish msg
-  if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
-    // non startup core can not receive this msg
-#ifndef CLOSE_PRINT
-    BAMBOO_DEBUGPRINT_REG(data1);
-#endif
-    BAMBOO_EXIT(0xb004);
-  }
-  // all cores should do flush
-  if(data1 < NUMCORES4GC) {
-    gccorestatus[data1] = 0;
-  }
-}
-
-
-INLINE void processmsg_gcfinishflush_I() {
-  int data1 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  // received a flush phase finish msg
-  if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
-    // non startup core can not receive this msg
-#ifndef CLOSE_PRINT
-    BAMBOO_DEBUGPRINT_REG(data1);
-#endif
-    BAMBOO_EXIT(0xb005);
-  }
-  // all cores should do flush
-  if(data1 < NUMCORESACTIVE) {
-    gccorestatus[data1] = 0;
-  }
-}
-
-INLINE void processmsg_gcmarkconfirm_I() {
-  if((BAMBOO_NUM_OF_CORE == STARTUPCORE)
-     || (BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1)) {
-    // wrong core to receive such msg
-    BAMBOO_EXIT(0xb006);
-  } else {
-    // send response msg, cahce the msg first
-    if(BAMBOO_CHECK_SEND_MODE()) {
-	  cache_msg_5(STARTUPCORE, GCMARKREPORT, BAMBOO_NUM_OF_CORE,
-				  gcbusystatus, gcself_numsendobjs,
-				  gcself_numreceiveobjs);
-    } else {
-	  send_msg_5(STARTUPCORE, GCMARKREPORT, BAMBOO_NUM_OF_CORE,
-				 gcbusystatus, gcself_numsendobjs,
-				 gcself_numreceiveobjs, true);
-    }
-  }
-}
-
-INLINE void processmsg_gcmarkreport_I() {
-  int data1 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  int data2 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  int data3 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  int data4 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  // received a marked phase finish confirm response msg
-  if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
-    // wrong core to receive such msg
-#ifndef CLOSE_PRINT
-    BAMBOO_DEBUGPRINT_REG(data2);
-#endif
-    BAMBOO_EXIT(0xb007);
-  } else {
-	int entry_index = 0;
-    if(waitconfirm) {
-	  // phse 2
-      numconfirm--;
-	  entry_index = (gcnumsrobjs_index == 0) ? 1 : 0;
-    } else {
-	  // can never reach here
-	  // phase 1
-	  entry_index = gcnumsrobjs_index;
-	}
-    gccorestatus[data1] = data2;
-    gcnumsendobjs[entry_index][data1] = data3;
-    gcnumreceiveobjs[entry_index][data1] = data4;
-  }
-}
-
-INLINE void processmsg_gcmarkedobj_I() {
-  int data1 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  // received a markedObj msg
-  if(((int *)data1)[6] == INIT) {
-    // this is the first time that this object is discovered,
-    // set the flag as DISCOVERED
-    ((int *)data1)[6] = DISCOVERED;
-    gc_enqueue_I(data1);
-  } 
-  // set the remote flag
-  ((int *)data1)[6] |= REMOTEM;
-  gcself_numreceiveobjs++;
-  gcbusystatus = true;
-}
-
-INLINE void processmsg_gcmovestart_I() {
-  gctomove = true;
-  gcdstcore = msgdata[msgdataindex];
-  MSG_INDEXINC_I();       //msgdata[1];
-  gcmovestartaddr = msgdata[msgdataindex];
-  MSG_INDEXINC_I();       //msgdata[2];
-  gcblock2fill = msgdata[msgdataindex];
-  MSG_INDEXINC_I();       //msgdata[3];
-}
-
-INLINE void processmsg_gcmaprequest_I() {
-#ifdef GC_PROFILE
-  //unsigned long long ttime = BAMBOO_GET_EXE_TIME();
-#endif
-  void * dstptr = NULL;
-  int data1 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-#ifdef GC_PROFILE
-  // TODO unsigned long long ttime = BAMBOO_GET_EXE_TIME();
-#endif
-#ifdef LOCALHASHTBL_TEST
-  RuntimeHashget(gcpointertbl, data1, &dstptr);
-#else
-  dstptr = mgchashSearch(gcpointertbl, data1);
-#endif
-  //MGCHashget(gcpointertbl, data1, &dstptr);
-#ifdef GC_PROFILE
-  // TODO flushstalltime += BAMBOO_GET_EXE_TIME() - ttime;
-#endif
-  int data2 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-#ifdef GC_PROFILE
-  // TODO unsigned long long ttimei = BAMBOO_GET_EXE_TIME();
-#endif
-  if(NULL == dstptr) {
-    // no such pointer in this core, something is wrong
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT_REG(data1);
-    BAMBOO_DEBUGPRINT_REG(data2);
-#endif
-    BAMBOO_EXIT(0xb008);
-    //assume that the object was not moved, use the original address
-    /*if(isMsgSending) {
-            cache_msg_3(msgdata[2], GCMAPINFO, msgdata[1], msgdata[1]);
-       } else {
-            send_msg_3(msgdata[2], GCMAPINFO, msgdata[1], msgdata[1]);
-       }*/
-  } else {
-    // send back the mapping info, cache the msg first
-    if(BAMBOO_CHECK_SEND_MODE()) {
-	  cache_msg_3(data2, GCMAPINFO, data1, (int)dstptr);
-    } else {
-	  send_msg_3(data2, GCMAPINFO, data1, (int)dstptr, true);
-    }
-  }
-#ifdef GC_PROFILE
-  // TODO flushstalltime_i += BAMBOO_GET_EXE_TIME()-ttimei;
-  //num_mapinforequest_i++;
-#endif
-}
-
-INLINE void processmsg_gcmapinfo_I() {
-#ifdef GC_PROFILE
-  //unsigned long long ttime = BAMBOO_GET_EXE_TIME();
-#endif
-  int data1 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  gcmappedobj = msgdata[msgdataindex];  // [2]
-  MSG_INDEXINC_I();
-#ifdef LOCALHASHTBL_TEST
-  RuntimeHashadd_I(gcpointertbl, data1, gcmappedobj);
-#else
-  mgchashInsert_I(gcpointertbl, data1, gcmappedobj);
-#endif
-  //MGCHashadd_I(gcpointertbl, data1, gcmappedobj);
-  if(data1 == gcobj2map) {
-	gcismapped = true;
-  }
-#ifdef GC_PROFILE
-  //flushstalltime += BAMBOO_GET_EXE_TIME() - ttime;
-#endif
-}
-
-INLINE void processmsg_gcmaptbl_I() {
-  int data1 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  int data2 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  gcrpointertbls[data2] = (mgcsharedhashtbl_t *)data1; //(struct GCSharedHash *)data1;
-}
-
-INLINE void processmsg_gclobjinfo_I() {
-  numconfirm--;
-
-  int data1 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  int data2 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  if(BAMBOO_NUM_OF_CORE > NUMCORES4GC - 1) {
-#ifndef CLOSE_PRINT
-    BAMBOO_DEBUGPRINT_REG(data2);
-#endif
-    BAMBOO_EXIT(0xb009);
-  }
-  // store the mark result info
-  int cnum = data2;
-  gcloads[cnum] = msgdata[msgdataindex];
-  MSG_INDEXINC_I();       // msgdata[3];
-  int data4 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  if(gcheaptop < data4) {
-    gcheaptop = data4;
-  }
-  // large obj info here
-  for(int k = 5; k < data1; ) {
-    int lobj = msgdata[msgdataindex];
-    MSG_INDEXINC_I();   //msgdata[k++];
-    int length = msgdata[msgdataindex];
-    MSG_INDEXINC_I();   //msgdata[k++];
-    gc_lobjenqueue_I(lobj, length, cnum);
-    gcnumlobjs++;
-  }  // for(int k = 5; k < msgdata[1];)
-}
-
-INLINE void processmsg_gclobjmapping_I() {
-  int data1 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  int data2 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-#ifdef LOCALHASHTBL_TEST
-  RuntimeHashadd_I(gcpointertbl, data1, data2);
-#else
-  mgchashInsert_I(gcpointertbl, data1, data2);
-#endif
-  //MGCHashadd_I(gcpointertbl, data1, data2);
-  mgcsharedhashInsert_I(gcsharedptbl, data1, data2);
-}
-
-#ifdef GC_PROFILE
-INLINE void processmsg_gcprofiles_I() {
-  int data1 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  int data2 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  int data3 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  gc_num_obj += data1;
-  gc_num_liveobj += data2;
-  gc_num_forwardobj += data3;
-  gc_num_profiles--;
-}
-#endif // GC_PROFILE
-
-#ifdef GC_CACHE_ADAPT
-INLINE void processmsg_gcstartpref_I() {
-  gcphase = PREFINISHPHASE;
-}
-
-INLINE void processmsg_gcfinishpref_I() {
-  int data1 = msgdata[msgdataindex];
-  MSG_INDEXINC_I();
-  // received a flush phase finish msg
-  if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
-    // non startup core can not receive this msg
-#ifndef CLOSE_PRINT
-    BAMBOO_DEBUGPRINT_REG(data1);
-#endif
-    BAMBOO_EXIT(0xb00a);
-  }
-  // all cores should do flush
-  if(data1 < NUMCORESACTIVE) {
-    gccorestatus[data1] = 0;
-  }
-}
-#endif // GC_CACHE_ADAPT
-#endif // #ifdef MULTICORE_GC
-
-// receive object transferred from other cores
-// or the terminate message from other cores
-// Should be invoked in critical sections!!
-// NOTICE: following format is for threadsimulate version only
-//         RAW version please see previous description
-// format: type + object
-// type: -1--stall msg
-//      !-1--object
-// return value: 0--received an object
-//               1--received nothing
-//               2--received a Stall Msg
-//               3--received a lock Msg
-//               RAW version: -1 -- received nothing
-//                            otherwise -- received msg type
-int receiveObject(int send_port_pending) {
-#ifdef PROFILE_INTERRUPT
-  if(!interruptInfoOverflow) {
-    InterruptInfo* intInfo = RUNMALLOC_I(sizeof(struct interrupt_info));
-    interruptInfoArray[interruptInfoIndex] = intInfo;
-    intInfo->startTime = BAMBOO_GET_EXE_TIME();
-    intInfo->endTime = -1;
-  }
-#endif
-msg:
-  // get the incoming msgs
-  if(receiveMsg(send_port_pending) == -1) {
-    return -1;
-  }
-processmsg:
-  // processing received msgs
-  int size = 0;
-  MSG_REMAINSIZE_I(&size);
-  if((size == 0) || (checkMsgLength_I(size) == -1)) {
-    // not a whole msg
-    // have new coming msg
-    if((BAMBOO_MSG_AVAIL() != 0) && !msgdatafull) {
-      goto msg;
-    } else {
-      return -1;
-    }
-  }
-
-  if(msglength <= size) {
-    // have some whole msg
-    MSGTYPE type;
-    type = msgdata[msgdataindex]; //[0]
-    MSG_INDEXINC_I();
-    msgdatafull = false;
-    // TODO
-    //tprintf("msg type: %x\n", type);
-    switch(type) {
-    case TRANSOBJ: {
-      // receive a object transfer msg
-      processmsg_transobj_I();
-      break;
-    }   // case TRANSOBJ
-
-    case TRANSTALL: {
-      // receive a stall msg
-      processmsg_transtall_I();
-      break;
-    }   // case TRANSTALL
-
-// GC version have no lock msgs
-#ifndef MULTICORE_GC
-    case LOCKREQUEST: {
-      // receive lock request msg, handle it right now
-      processmsg_lockrequest_I();
-      break;
-    }   // case LOCKREQUEST
-
-    case LOCKGROUNT: {
-      // receive lock grount msg
-      processmsg_lockgrount_I();
-      break;
-    }   // case LOCKGROUNT
-
-    case LOCKDENY: {
-      // receive lock deny msg
-      processmsg_lockdeny_I();
-      break;
-    }   // case LOCKDENY
-
-    case LOCKRELEASE: {
-      processmsg_lockrelease_I();
-      break;
-    }   // case LOCKRELEASE
-#endif // #ifndef MULTICORE_GC
-
-#ifdef PROFILE
-    case PROFILEOUTPUT: {
-      // receive an output profile data request msg
-      processmsg_profileoutput_I();
-      break;
-    }   // case PROFILEOUTPUT
-
-    case PROFILEFINISH: {
-      // receive a profile output finish msg
-      processmsg_profilefinish_I();
-      break;
-    }   // case PROFILEFINISH
-#endif // #ifdef PROFILE
-
-// GC version has no lock msgs
-#ifndef MULTICORE_GC
-    case REDIRECTLOCK: {
-      // receive a redirect lock request msg, handle it right now
-      processmsg_redirectlock_I();
-      break;
-    }   // case REDIRECTLOCK
-
-    case REDIRECTGROUNT: {
-      // receive a lock grant msg with redirect info
-      processmsg_redirectgrount_I();
-      break;
-    }   // case REDIRECTGROUNT
-
-    case REDIRECTDENY: {
-      // receive a lock deny msg with redirect info
-      processmsg_redirectdeny_I();
-      break;
-    }   // case REDIRECTDENY
-
-    case REDIRECTRELEASE: {
-      // receive a lock release msg with redirect info
-      processmsg_redirectrelease_I();
-      break;
-    }   // case REDIRECTRELEASE
-#endif // #ifndef MULTICORE_GC
-
-    case STATUSCONFIRM: {
-      // receive a status confirm info
-      processmsg_statusconfirm_I();
-      break;
-    }   // case STATUSCONFIRM
-
-    case STATUSREPORT: {
-      processmsg_statusreport_I();
-      break;
-    }   // case STATUSREPORT
-
-    case TERMINATE: {
-      // receive a terminate msg
-      processmsg_terminate_I();
-      break;
-    }   // case TERMINATE
-
-    case MEMREQUEST: {
-      processmsg_memrequest_I();
-      break;
-    }   // case MEMREQUEST
-
-    case MEMRESPONSE: {
-      processmsg_memresponse_I();
-      break;
-    }   // case MEMRESPONSE
-
-#ifdef MULTICORE_GC
-    // GC msgs
-    case GCSTARTPRE: {
-      processmsg_gcstartpre_I();
-      break;
-    }   // case GCSTARTPRE
-	
-	case GCSTARTINIT: {
-      processmsg_gcstartinit_I();
-      break;
-    }   // case GCSTARTINIT
-
-    case GCSTART: {
-      // receive a start GC msg
-      processmsg_gcstart_I();
-      break;
-    }   // case GCSTART
-
-    case GCSTARTCOMPACT: {
-      // a compact phase start msg
-      processmsg_gcstartcompact_I();
-      break;
-    }   // case GCSTARTCOMPACT
-
-	case GCSTARTMAPINFO: {
-      // received a flush phase start msg
-      processmsg_gcstartmapinfo_I();
-      break;
-    }   // case GCSTARTFLUSH
-
-    case GCSTARTFLUSH: {
-      // received a flush phase start msg
-      processmsg_gcstartflush_I();
-      break;
-    }   // case GCSTARTFLUSH
-
-    case GCFINISHPRE: {
-      processmsg_gcfinishpre_I();
-      break;
-    }   // case GCFINISHPRE
-	
-	case GCFINISHINIT: {
-      processmsg_gcfinishinit_I();
-      break;
-    }   // case GCFINISHINIT
-
-    case GCFINISHMARK: {
-      processmsg_gcfinishmark_I();
-      break;
-    }   // case GCFINISHMARK
-
-    case GCFINISHCOMPACT: {
-      // received a compact phase finish msg
-      processmsg_gcfinishcompact_I();
-      break;
-    }   // case GCFINISHCOMPACT
-
-	case GCFINISHMAPINFO: {
-      processmsg_gcfinishmapinfo_I();
-      break;
-    }   // case GCFINISHMAPINFO
-
-    case GCFINISHFLUSH: {
-      processmsg_gcfinishflush_I();
-      break;
-    }   // case GCFINISHFLUSH
-
-    case GCFINISH: {
-      // received a GC finish msg
-      gcphase = FINISHPHASE;
-      break;
-    }   // case GCFINISH
-
-    case GCMARKCONFIRM: {
-      // received a marked phase finish confirm request msg
-      // all cores should do mark
-      processmsg_gcmarkconfirm_I();
-      break;
-    }   // case GCMARKCONFIRM
-
-    case GCMARKREPORT: {
-      processmsg_gcmarkreport_I();
-      break;
-    }   // case GCMARKREPORT
-
-    case GCMARKEDOBJ: {
-      processmsg_gcmarkedobj_I();
-      break;
-    }   // case GCMARKEDOBJ
-
-    case GCMOVESTART: {
-      // received a start moving objs msg
-      processmsg_gcmovestart_I();
-      break;
-    }   // case GCMOVESTART
-
-    case GCMAPREQUEST: {
-      // received a mapping info request msg
-      processmsg_gcmaprequest_I();
-      break;
-    }   // case GCMAPREQUEST
-
-    case GCMAPINFO: {
-      // received a mapping info response msg
-      processmsg_gcmapinfo_I();
-      break;
-    }   // case GCMAPINFO
-
-    case GCMAPTBL: {
-      // received a mapping tbl response msg
-      processmsg_gcmaptbl_I();
-      break;
-    }   // case GCMAPTBL
-	
-	case GCLOBJREQUEST: {
-      // received a large objs info request msg
-      transferMarkResults_I();
-      break;
-    }   // case GCLOBJREQUEST
-
-    case GCLOBJINFO: {
-      // received a large objs info response msg
-      processmsg_gclobjinfo_I();
-      break;
-    }   // case GCLOBJINFO
-
-    case GCLOBJMAPPING: {
-      // received a large obj mapping info msg
-      processmsg_gclobjmapping_I();
-      break;
-    }  // case GCLOBJMAPPING
-
-#ifdef GC_PROFILE
-	case GCPROFILES: {
-      // received a gcprofiles msg
-      processmsg_gcprofiles_I();
-      break;
-    }
-#endif // GC_PROFILE
-
-#ifdef GC_CACHE_ADAPT
-	case GCSTARTPREF: {
-      // received a gcstartpref msg
-      processmsg_gcstartpref_I();
-      break;
-    }
-
-	case GCFINISHPREF: {
-      // received a gcfinishpref msg
-      processmsg_gcfinishpref_I();
-      break;
-    }
-#endif // GC_CACHE_ADAPT
-#endif // #ifdef MULTICORE_GC
-
-    default:
-      break;
-    }  // switch(type)
-    msglength = BAMBOO_MSG_BUF_LENGTH;
-    // TODO
-    //printf("++ msg: %x \n", type);
-
-    if(msgdataindex != msgdatalast) {
-      // still have available msg
-      goto processmsg;
-    }
-#ifdef DEBUG
-#ifndef CLOSE_PRINT
-    BAMBOO_DEBUGPRINT(0xe88d);
-#endif
-#endif
-
-    // have new coming msg
-    if(BAMBOO_MSG_AVAIL() != 0) {
-      goto msg;
-    } // TODO
-
-#ifdef PROFILE_INTERRUPT
-  if(!interruptInfoOverflow) {
-    interruptInfoArray[interruptInfoIndex]->endTime=BAMBOO_GET_EXE_TIME();
-    interruptInfoIndex++;
-    if(interruptInfoIndex == INTERRUPTINFOLENGTH) {
-      interruptInfoOverflow = true;
-    }
-  }
-#endif
-    return (int)type;
-  } else {
-    // not a whole msg
-#ifdef DEBUG
-#ifndef CLOSE_PRINT
-    BAMBOO_DEBUGPRINT(0xe88e);
-#endif
-#endif
-    return -2;
-  }
-}
-
-int enqueuetasks(struct parameterwrapper *parameter,
-                 struct parameterwrapper *prevptr,
-                 struct ___Object___ *ptr,
-                 int * enterflags,
-                 int numenterflags) {
-  void * taskpointerarray[MAXTASKPARAMS];
-  int j;
-  //int numparams=parameter->task->numParameters;
-  int numiterators=parameter->task->numTotal-1;
-  int retval=1;
-
-  struct taskdescriptor * task=parameter->task;
-
-  //this add the object to parameterwrapper
-  ObjectHashadd(parameter->objectset, (int) ptr, 0, (int) enterflags,
-                numenterflags, enterflags==NULL);
-
-  /* Add enqueued object to parameter vector */
-  taskpointerarray[parameter->slot]=ptr;
-
-  /* Reset iterators */
-  for(j=0; j<numiterators; j++) {
-    toiReset(&parameter->iterators[j]);
-  }
-
-  /* Find initial state */
-  for(j=0; j<numiterators; j++) {
-backtrackinit:
-    if(toiHasNext(&parameter->iterators[j],taskpointerarray OPTARG(failed)))
-      toiNext(&parameter->iterators[j], taskpointerarray OPTARG(failed));
-    else if (j>0) {
-      /* Need to backtrack */
-      toiReset(&parameter->iterators[j]);
-      j--;
-      goto backtrackinit;
-    } else {
-      /* Nothing to enqueue */
-      return retval;
-    }
-  }
-
-  while(1) {
-    /* Enqueue current state */
-    //int launch = 0;
-    struct taskparamdescriptor *tpd=
-      RUNMALLOC(sizeof(struct taskparamdescriptor));
-    tpd->task=task;
-    tpd->numParameters=numiterators+1;
-    tpd->parameterArray=RUNMALLOC(sizeof(void *)*(numiterators+1));
-
-    for(j=0; j<=numiterators; j++) {
-      //store the actual parameters
-      tpd->parameterArray[j]=taskpointerarray[j];
-    }
-    /* Enqueue task */
-    if (( /*!gencontains(failedtasks, tpd)&&*/
-          !gencontains(activetasks,tpd))) {
-      genputtable(activetasks, tpd, tpd);
-    } else {
-      RUNFREE(tpd->parameterArray);
-      RUNFREE(tpd);
-    }
-
-    /* This loop iterates to the next parameter combination */
-    if (numiterators==0)
-      return retval;
-
-    for(j=numiterators-1; j<numiterators; j++) {
-backtrackinc:
-      if(toiHasNext(
-			&parameter->iterators[j],taskpointerarray OPTARG(failed)))
-		toiNext(&parameter->iterators[j], taskpointerarray OPTARG(failed));
-      else if (j>0) {
-		/* Need to backtrack */
-		toiReset(&parameter->iterators[j]);
-		j--;
-		goto backtrackinc;
-      } else {
-		/* Nothing more to enqueue */
-		return retval;
-      }
-    }
-  }
-  return retval;
-}
-
-int enqueuetasks_I(struct parameterwrapper *parameter,
-                   struct parameterwrapper *prevptr,
-                   struct ___Object___ *ptr,
-                   int * enterflags,
-                   int numenterflags) {
-  void * taskpointerarray[MAXTASKPARAMS];
-  int j;
-  //int numparams=parameter->task->numParameters;
-  int numiterators=parameter->task->numTotal-1;
-  int retval=1;
-  //int addnormal=1;
-  //int adderror=1;
-
-  struct taskdescriptor * task=parameter->task;
-
-  //this add the object to parameterwrapper
-  ObjectHashadd_I(parameter->objectset, (int) ptr, 0, (int) enterflags,
-                  numenterflags, enterflags==NULL);
-
-  /* Add enqueued object to parameter vector */
-  taskpointerarray[parameter->slot]=ptr;
-
-  /* Reset iterators */
-  for(j=0; j<numiterators; j++) {
-    toiReset(&parameter->iterators[j]);
-  }
-
-  /* Find initial state */
-  for(j=0; j<numiterators; j++) {
-backtrackinit:
-    if(toiHasNext(&parameter->iterators[j],taskpointerarray OPTARG(failed)))
-      toiNext(&parameter->iterators[j], taskpointerarray OPTARG(failed));
-    else if (j>0) {
-      /* Need to backtrack */
-      toiReset(&parameter->iterators[j]);
-      j--;
-      goto backtrackinit;
-    } else {
-      /* Nothing to enqueue */
-      return retval;
-    }
-  }
-
-  while(1) {
-    /* Enqueue current state */
-    //int launch = 0;
-    struct taskparamdescriptor *tpd=
-      RUNMALLOC_I(sizeof(struct taskparamdescriptor));
-    tpd->task=task;
-    tpd->numParameters=numiterators+1;
-    tpd->parameterArray=RUNMALLOC_I(sizeof(void *)*(numiterators+1));
-
-    for(j=0; j<=numiterators; j++) {
-      //store the actual parameters
-      tpd->parameterArray[j]=taskpointerarray[j];
-    }
-    /* Enqueue task */
-    if (( /*!gencontains(failedtasks, tpd)&&*/
-          !gencontains(activetasks,tpd))) {
-      genputtable_I(activetasks, tpd, tpd);
-    } else {
-      RUNFREE(tpd->parameterArray);
-      RUNFREE(tpd);
-    }
-
-    /* This loop iterates to the next parameter combination */
-    if (numiterators==0)
-      return retval;
-
-    for(j=numiterators-1; j<numiterators; j++) {
-backtrackinc:
-      if(toiHasNext(
-			&parameter->iterators[j], taskpointerarray OPTARG(failed)))
-		toiNext(&parameter->iterators[j], taskpointerarray OPTARG(failed));
-      else if (j>0) {
-		/* Need to backtrack */
-		toiReset(&parameter->iterators[j]);
-		j--;
-		goto backtrackinc;
-      } else {
-		/* Nothing more to enqueue */
-		return retval;
-      }
-    }
-  }
-  return retval;
-}
-
-#ifdef MULTICORE_GC
-#define OFFSET 2
-#else
-#define OFFSET 0
-#endif
-
-int containstag(struct ___Object___ *ptr,
-                struct ___TagDescriptor___ *tag);
-
-#ifndef MULTICORE_GC
-void releasewritelock_r(void * lock, void * redirectlock) {
-  int targetcore = 0;
-  int reallock = (int)lock;
-  targetcore = (reallock >> 5) % NUMCORES;
-
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xe671);
-  BAMBOO_DEBUGPRINT_REG((int)lock);
-  BAMBOO_DEBUGPRINT_REG(reallock);
-  BAMBOO_DEBUGPRINT_REG(targetcore);
-#endif
-
-  if(targetcore == BAMBOO_NUM_OF_CORE) {
-    BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xf001);
-#endif
-    // reside on this core
-    if(!RuntimeHashcontainskey(locktbl, reallock)) {
-      // no locks for this object, something is wrong
-      BAMBOO_EXIT(0xa00b);
-    } else {
-      int rwlock_obj = 0;
-      struct LockValue * lockvalue = NULL;
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT(0xe672);
-#endif
-      RuntimeHashget(locktbl, reallock, &rwlock_obj);
-      lockvalue = (struct LockValue *)rwlock_obj;
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT_REG(lockvalue->value);
-#endif
-      lockvalue->value++;
-      lockvalue->redirectlock = (int)redirectlock;
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT_REG(lockvalue->value);
-#endif
-    }
-    BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xf000);
-#endif
-    return;
-  } else {
-    // send lock release with redirect info msg
-    // for 32 bit machine, the size is always 4 words
-    send_msg_4(targetcore, REDIRECTRELEASE, 1, (int)lock,
-               (int)redirectlock, false);
-  }
-}
-#endif
-
-void executetasks() {
-  void * taskpointerarray[MAXTASKPARAMS+OFFSET];
-  int numparams=0;
-  int numtotal=0;
-  struct ___Object___ * tmpparam = NULL;
-  struct parameterdescriptor * pd=NULL;
-  struct parameterwrapper *pw=NULL;
-  int j = 0;
-  int x = 0;
-  bool islock = true;
-
-  int grount = 0;
-  int andmask=0;
-  int checkmask=0;
-
-newtask:
-  while(hashsize(activetasks)>0) {
-#ifdef MULTICORE_GC
-//#ifdef GC_CACHE_ADAPT
-	  // do dtlb sampling if necessary
-//	  bamboo_dtlb_sampling_process();
-//#endif // GC_CACHE_ADAPT
-    if(gcflag) gc(NULL);
-#endif
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xe990);
-#endif
-
-    /* See if there are any active tasks */
-    //if (hashsize(activetasks)>0) {
-    int i;
-#ifdef PROFILE
-#ifdef ACCURATEPROFILE
-    profileTaskStart("tpd checking");
-#endif
-#endif
-    //long clock1;
-    //clock1 = BAMBOO_GET_EXE_TIME();
-
-    busystatus = true;
-    currtpd=(struct taskparamdescriptor *) getfirstkey(activetasks);
-    genfreekey(activetasks, currtpd);
-
-    numparams=currtpd->task->numParameters;
-    numtotal=currtpd->task->numTotal;
-
-    // clear the lockRedirectTbl
-    // (TODO, this table should be empty after all locks are released)
-    // reset all locks
-    /*for(j = 0; j < MAXTASKPARAMS; j++) {
-            runtime_locks[j].redirectlock = 0;
-            runtime_locks[j].value = 0;
-       }*/
-    // get all required locks
-    runtime_locklen = 0;
-    // check which locks are needed
-    for(i = 0; i < numparams; i++) {
-      void * param = currtpd->parameterArray[i];
-      int tmplock = 0;
-      int j = 0;
-      bool insert = true;
-      if(((struct ___Object___ *)param)->type == STARTUPTYPE) {
-		islock = false;
-		taskpointerarray[i+OFFSET]=param;
-		goto execute;
-      }
-      if(((struct ___Object___ *)param)->lock == NULL) {
-		tmplock = (int)param;
-      } else {
-		tmplock = (int)(((struct ___Object___ *)param)->lock);
-      }
-      // insert into the locks array
-      for(j = 0; j < runtime_locklen; j++) {
-		if(runtime_locks[j].value == tmplock) {
-		  insert = false;
-		  break;
-		} else if(runtime_locks[j].value > tmplock) {
-		  break;
-		}
-      }
-      if(insert) {
-		int h = runtime_locklen;
-		for(; h > j; h--) {
-		  runtime_locks[h].redirectlock = runtime_locks[h-1].redirectlock;
-		  runtime_locks[h].value = runtime_locks[h-1].value;
-		}
-		runtime_locks[j].value = tmplock;
-		runtime_locks[j].redirectlock = (int)param;
-		runtime_locklen++;
-      }
-    }  // line 2713: for(i = 0; i < numparams; i++)
-       // grab these required locks
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xe991);
-#endif
-    //long clock2;
-    //clock2 = BAMBOO_GET_EXE_TIME();
-
-    for(i = 0; i < runtime_locklen; i++) {
-      int * lock = (int *)(runtime_locks[i].redirectlock);
-      islock = true;
-      // require locks for this parameter if it is not a startup object
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT_REG((int)lock);
-      BAMBOO_DEBUGPRINT_REG((int)(runtime_locks[i].value));
-#endif
-      getwritelock(lock);
-      BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT(0xf001);
-#endif
-#ifdef PROFILE
-      //isInterrupt = false;
-#endif
-      while(!lockflag) {
-		BAMBOO_WAITING_FOR_LOCK(0);
-	  }
-#ifndef INTERRUPT
-      if(reside) {
-		while(BAMBOO_WAITING_FOR_LOCK(0) != -1) {
-		}
-      }
-#endif
-      grount = lockresult;
-
-      lockresult = 0;
-      lockobj = 0;
-      lock2require = 0;
-      lockflag = false;
-#ifndef INTERRUPT
-      reside = false;
-#endif
-#ifdef PROFILE
-      //isInterrupt = true;
-#endif
-      BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT(0xf000);
-#endif
-
-      if(grount == 0) {
-#ifdef DEBUG
-		BAMBOO_DEBUGPRINT(0xe992);
-		BAMBOO_DEBUGPRINT_REG(lock);
-#endif
-		// check if has the lock already
-		// can not get the lock, try later
-		// release all grabbed locks for previous parameters
-		for(j = 0; j < i; ++j) {
-		  lock = (int*)(runtime_locks[j].redirectlock);
-		  releasewritelock(lock);
-		}
-		genputtable(activetasks, currtpd, currtpd);
-		if(hashsize(activetasks) == 1) {
-		  // only one task right now, wait a little while before next try
-		  int halt = 10000;
-		  while(halt--) {
-		  }
-		}
-#ifdef PROFILE
-#ifdef ACCURATEPROFILE
-		// fail, set the end of the checkTaskInfo
-		profileTaskEnd();
-#endif
-#endif
-		goto newtask;
-	//}
-      }
-    }   // line 2752:  for(i = 0; i < runtime_locklen; i++)
-
-    /*long clock3;
-       clock3 = BAMBOO_GET_EXE_TIME();
-       //tprintf("sort: %d, grab: %d \n", clock2-clock1, clock3-clock2);*/
-
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xe993);
-#endif
-    /* Make sure that the parameters are still in the queues */
-    for(i=0; i<numparams; i++) {
-      void * parameter=currtpd->parameterArray[i];
-
-      // flush the object
-#ifdef CACHEFLUSH
-      BAMBOO_CACHE_FLUSH_RANGE((int)parameter,
-		  classsize[((struct ___Object___ *)parameter)->type]);
-#endif
-      tmpparam = (struct ___Object___ *)parameter;
-      pd=currtpd->task->descriptorarray[i];
-      pw=(struct parameterwrapper *) pd->queue;
-      /* Check that object is still in queue */
-      {
-		if (!ObjectHashcontainskey(pw->objectset, (int) parameter)) {
-#ifdef DEBUG
-		  BAMBOO_DEBUGPRINT(0xe994);
-		  BAMBOO_DEBUGPRINT_REG(parameter);
-#endif
-		  // release grabbed locks
-		  for(j = 0; j < runtime_locklen; ++j) {
-			int * lock = (int *)(runtime_locks[j].redirectlock);
-			releasewritelock(lock);
-		  }
-		  RUNFREE(currtpd->parameterArray);
-		  RUNFREE(currtpd);
-		  currtpd = NULL;
-		  goto newtask;
-		}
-      }   // line2865
-          /* Check if the object's flags still meets requirements */
-      {
-		int tmpi = 0;
-		bool ismet = false;
-		for(tmpi = 0; tmpi < pw->numberofterms; ++tmpi) {
-		  andmask=pw->intarray[tmpi*2];
-		  checkmask=pw->intarray[tmpi*2+1];
-		  if((((struct ___Object___ *)parameter)->flag&andmask)==checkmask) {
-			ismet = true;
-			break;
-		  }
-		}
-		if (!ismet) {
-		  // flags are never suitable
-		  // remove this obj from the queue
-		  int next;
-		  int UNUSED, UNUSED2;
-		  int * enterflags;
-#ifdef DEBUG
-		  BAMBOO_DEBUGPRINT(0xe995);
-		  BAMBOO_DEBUGPRINT_REG(parameter);
-#endif
-		  ObjectHashget(pw->objectset, (int) parameter, (int *) &next,
-						(int *) &enterflags, &UNUSED, &UNUSED2);
-		  ObjectHashremove(pw->objectset, (int)parameter);
-		  if (enterflags!=NULL)
-			RUNFREE(enterflags);
-		  // release grabbed locks
-		  for(j = 0; j < runtime_locklen; ++j) {
-			int * lock = (int *)(runtime_locks[j].redirectlock);
-			releasewritelock(lock);
-		  }
-		  RUNFREE(currtpd->parameterArray);
-		  RUNFREE(currtpd);
-		  currtpd = NULL;
-#ifdef PROFILE
-#ifdef ACCURATEPROFILE
-		  // fail, set the end of the checkTaskInfo
-		  profileTaskEnd();
-#endif
-#endif
-		  goto newtask;
-		}   // line 2878: if (!ismet)
-      }   // line 2867
-parameterpresent:
-      ;
-      /* Check that object still has necessary tags */
-      for(j=0; j<pd->numbertags; j++) {
-		int slotid=pd->tagarray[2*j]+numparams;
-		struct ___TagDescriptor___ *tagd=currtpd->parameterArray[slotid];
-		if (!containstag(parameter, tagd)) {
-#ifdef DEBUG
-		  BAMBOO_DEBUGPRINT(0xe996);
-#endif
-		  {
-			// release grabbed locks
-			int tmpj = 0;
-			for(tmpj = 0; tmpj < runtime_locklen; ++tmpj) {
-			  int * lock = (int *)(runtime_locks[tmpj].redirectlock);
-			  releasewritelock(lock);
-			}
-		  }
-		  RUNFREE(currtpd->parameterArray);
-		  RUNFREE(currtpd);
-		  currtpd = NULL;
-		  goto newtask;
-		}   // line2911: if (!containstag(parameter, tagd))
-      }   // line 2808: for(j=0; j<pd->numbertags; j++)
-
-      taskpointerarray[i+OFFSET]=parameter;
-    }   // line 2824: for(i=0; i<numparams; i++)
-        /* Copy the tags */
-    for(; i<numtotal; i++) {
-      taskpointerarray[i+OFFSET]=currtpd->parameterArray[i];
-    }
-
-    {
-execute:
-      /* Actually call task */
-#ifdef MULTICORE_GC
-      ((int *)taskpointerarray)[0]=currtpd->numParameters;
-      taskpointerarray[1]=NULL;
-#endif
-#ifdef PROFILE
-#ifdef ACCURATEPROFILE
-      // check finish, set the end of the checkTaskInfo
-      profileTaskEnd();
-#endif
-      profileTaskStart(currtpd->task->name);
-#endif
-      // TODO
-      //long clock4;
-      //clock4 = BAMBOO_GET_EXE_TIME();
-      //tprintf("sort: %d, grab: %d, check: %d \n", (int)(clock2-clock1), (int)(clock3-clock2), (int)(clock4-clock3));
-
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT(0xe997);
-#endif
-      ((void (*)(void **))currtpd->task->taskptr)(taskpointerarray);
-      // TODO
-      //long clock5;
-      //clock5 = BAMBOO_GET_EXE_TIME();
-      // tprintf("sort: %d, grab: %d, check: %d \n", (int)(clock2-clock1), (int)(clock3-clock2), (int)(clock4-clock3));
-
-#ifdef PROFILE
-#ifdef ACCURATEPROFILE
-      // task finish, set the end of the checkTaskInfo
-      profileTaskEnd();
-      // new a PostTaskInfo for the post-task execution
-      profileTaskStart("post task execution");
-#endif
-#endif
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT(0xe998);
-      BAMBOO_DEBUGPRINT_REG(islock);
-#endif
-
-      if(islock) {
-#ifdef DEBUG
-		BAMBOO_DEBUGPRINT(0xe999);
-#endif
-		for(i = 0; i < runtime_locklen; ++i) {
-		  void * ptr = (void *)(runtime_locks[i].redirectlock);
-		  int * lock = (int *)(runtime_locks[i].value);
-#ifdef DEBUG
-		  BAMBOO_DEBUGPRINT_REG((int)ptr);
-		  BAMBOO_DEBUGPRINT_REG((int)lock);
-		  BAMBOO_DEBUGPRINT_REG(*((int*)lock+5));
-#endif
-#ifndef MULTICORE_GC
-		  if(RuntimeHashcontainskey(lockRedirectTbl, (int)lock)) {
-			int redirectlock;
-			RuntimeHashget(lockRedirectTbl, (int)lock, &redirectlock);
-			RuntimeHashremovekey(lockRedirectTbl, (int)lock);
-			releasewritelock_r(lock, (int *)redirectlock);
-		  } else {
-#else
-		  {
-#endif
-			releasewritelock(ptr);
-		  }
-		}
-      }     // line 3015: if(islock)
-
-      //long clock6;
-      //clock6 = BAMBOO_GET_EXE_TIME();
-      //tprintf("sort: %d, grab: %d, check: %d \n", (int)(clock2-clock1), (int)(clock3-clock2), (int)(clock4-clock3));
-
-#ifdef PROFILE
-      // post task execution finish, set the end of the postTaskInfo
-      profileTaskEnd();
-#endif
-
-      // Free up task parameter descriptor
-      RUNFREE(currtpd->parameterArray);
-      RUNFREE(currtpd);
-      currtpd = NULL;
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT(0xe99a);
-#endif
-      //long clock7;
-      //clock7 = BAMBOO_GET_EXE_TIME();
-      //tprintf("sort: %d, grab: %d, check: %d, release: %d, other %d \n", (int)(clock2-clock1), (int)(clock3-clock2), (int)(clock4-clock3), (int)(clock6-clock5), (int)(clock7-clock6));
-
-    }   //
-    //} //  if (hashsize(activetasks)>0)
-  } //  while(hashsize(activetasks)>0)
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xe99b);
-#endif
-}
-
-/* This function processes an objects tags */
-void processtags(struct parameterdescriptor *pd,
-                 int index,
-                 struct parameterwrapper *parameter,
-                 int * iteratorcount,
-                 int *statusarray,
-                 int numparams) {
-  int i;
-
-  for(i=0; i<pd->numbertags; i++) {
-    int slotid=pd->tagarray[2*i];
-    int tagid=pd->tagarray[2*i+1];
-
-    if (statusarray[slotid+numparams]==0) {
-      parameter->iterators[*iteratorcount].istag=1;
-      parameter->iterators[*iteratorcount].tagid=tagid;
-      parameter->iterators[*iteratorcount].slot=slotid+numparams;
-      parameter->iterators[*iteratorcount].tagobjectslot=index;
-      statusarray[slotid+numparams]=1;
-      (*iteratorcount)++;
-    }
-  }
-}
-
-
-void processobject(struct parameterwrapper *parameter,
-                   int index,
-                   struct parameterdescriptor *pd,
-                   int *iteratorcount,
-                   int * statusarray,
-                   int numparams) {
-  int i;
-  int tagcount=0;
-  struct ObjectHash * objectset=
-    ((struct parameterwrapper *)pd->queue)->objectset;
-
-  parameter->iterators[*iteratorcount].istag=0;
-  parameter->iterators[*iteratorcount].slot=index;
-  parameter->iterators[*iteratorcount].objectset=objectset;
-  statusarray[index]=1;
-
-  for(i=0; i<pd->numbertags; i++) {
-    int slotid=pd->tagarray[2*i];
-    //int tagid=pd->tagarray[2*i+1];
-    if (statusarray[slotid+numparams]!=0) {
-      /* This tag has already been enqueued, use it to narrow search */
-      parameter->iterators[*iteratorcount].tagbindings[tagcount]=
-        slotid+numparams;
-      tagcount++;
-    }
-  }
-  parameter->iterators[*iteratorcount].numtags=tagcount;
-
-  (*iteratorcount)++;
-}
-
-/* This function builds the iterators for a task & parameter */
-
-void builditerators(struct taskdescriptor * task,
-                    int index,
-                    struct parameterwrapper * parameter) {
-  int statusarray[MAXTASKPARAMS];
-  int i;
-  int numparams=task->numParameters;
-  int iteratorcount=0;
-  for(i=0; i<MAXTASKPARAMS; i++) statusarray[i]=0;
-
-  statusarray[index]=1; /* Initial parameter */
-  /* Process tags for initial iterator */
-
-  processtags(task->descriptorarray[index], index, parameter,
-              &iteratorcount, statusarray, numparams);
-
-  while(1) {
-loopstart:
-    /* Check for objects with existing tags */
-    for(i=0; i<numparams; i++) {
-      if (statusarray[i]==0) {
-		struct parameterdescriptor *pd=task->descriptorarray[i];
-		int j;
-		for(j=0; j<pd->numbertags; j++) {
-		  int slotid=pd->tagarray[2*j];
-		  if(statusarray[slotid+numparams]!=0) {
-			processobject(parameter,i,pd,&iteratorcount,
-				statusarray,numparams);
-			processtags(pd,i,parameter,&iteratorcount,statusarray,numparams);
-			goto loopstart;
-		  }
-		}
-      }
-    }
-
-    /* Next do objects w/ unbound tags*/
-
-    for(i=0; i<numparams; i++) {
-      if (statusarray[i]==0) {
-		struct parameterdescriptor *pd=task->descriptorarray[i];
-		if (pd->numbertags>0) {
-		  processobject(parameter,i,pd,&iteratorcount,statusarray,numparams);
-		  processtags(pd,i,parameter,&iteratorcount,statusarray,numparams);
-		  goto loopstart;
-		}
-      }
-    }
-
-    /* Nothing with a tag enqueued */
-
-    for(i=0; i<numparams; i++) {
-      if (statusarray[i]==0) {
-		struct parameterdescriptor *pd=task->descriptorarray[i];
-		processobject(parameter,i,pd,&iteratorcount,statusarray,numparams);
-		processtags(pd,i,parameter,&iteratorcount,statusarray,numparams);
-		goto loopstart;
-      }
-    }
-
-    /* Nothing left */
-    return;
-  }
-}
-
-void printdebug() {
-  int i;
-  int j;
-  if(BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1) {
-    return;
-  }
-  for(i=0; i<numtasks[BAMBOO_NUM_OF_CORE]; i++) {
-    struct taskdescriptor * task=taskarray[BAMBOO_NUM_OF_CORE][i];
-#ifndef RAW
-    printf("%s\n", task->name);
-#endif
-    for(j=0; j<task->numParameters; j++) {
-      struct parameterdescriptor *param=task->descriptorarray[j];
-      struct parameterwrapper *parameter=param->queue;
-      struct ObjectHash * set=parameter->objectset;
-      struct ObjectIterator objit;
-#ifndef RAW
-      printf("  Parameter %d\n", j);
-#endif
-      ObjectHashiterator(set, &objit);
-      while(ObjhasNext(&objit)) {
-		struct ___Object___ * obj=(struct ___Object___ *)Objkey(&objit);
-		struct ___Object___ * tagptr=obj->___tags___;
-		int nonfailed=Objdata4(&objit);
-		int numflags=Objdata3(&objit);
-		int flags=Objdata2(&objit);
-		Objnext(&objit);
-#ifndef RAW
-		printf("    Contains %lx\n", obj);
-		printf("      flag=%d\n", obj->flag);
-#endif
-		if (tagptr==NULL) {
-		} else if (tagptr->type==TAGTYPE) {
-#ifndef RAW
-		  printf("      tag=%lx\n",tagptr);
-#else
-		  ;
-#endif
-		} else {
-		  int tagindex=0;
-		  struct ArrayObject *ao=(struct ArrayObject *)tagptr;
-		  for(; tagindex<ao->___cachedCode___; tagindex++) {
-#ifndef RAW
-			printf("      tag=%lx\n",ARRAYGET(ao,struct ___TagDescriptor___*,
-											  tagindex));
-#else
-			;
-#endif
-		  }
-		}
-      }
-    }
-  }
-}
-
-
-/* This function processes the task information to create queues for
-   each parameter type. */
-
-void processtasks() {
-  int i;
-  if(BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1) {
-    return;
-  }
-  for(i=0; i<numtasks[BAMBOO_NUM_OF_CORE]; i++) {
-    struct taskdescriptor * task=taskarray[BAMBOO_NUM_OF_CORE][i];
-    int j;
-
-    /* Build objectsets */
-    for(j=0; j<task->numParameters; j++) {
-      struct parameterdescriptor *param=task->descriptorarray[j];
-      struct parameterwrapper *parameter=param->queue;
-      parameter->objectset=allocateObjectHash(10);
-      parameter->task=task;
-    }
-
-    /* Build iterators for parameters */
-    for(j=0; j<task->numParameters; j++) {
-      struct parameterdescriptor *param=task->descriptorarray[j];
-      struct parameterwrapper *parameter=param->queue;
-      builditerators(task, j, parameter);
-    }
-  }
-}
-
-void toiReset(struct tagobjectiterator * it) {
-  if (it->istag) {
-    it->tagobjindex=0;
-  } else if (it->numtags>0) {
-    it->tagobjindex=0;
-  } else {
-    ObjectHashiterator(it->objectset, &it->it);
-  }
-}
-
-int toiHasNext(struct tagobjectiterator *it,
-               void ** objectarray OPTARG(int * failed)) {
-  if (it->istag) {
-    /* Iterate tag */
-    /* Get object with tags */
-    struct ___Object___ *obj=objectarray[it->tagobjectslot];
-    struct ___Object___ *tagptr=obj->___tags___;
-    if (tagptr->type==TAGTYPE) {
-      if ((it->tagobjindex==0)&& /* First object */
-		  (it->tagid==((struct ___TagDescriptor___ *)tagptr)->flag)) /* Right tag type */
-		return 1;
-	  else
-		return 0;
-    } else {
-      struct ArrayObject *ao=(struct ArrayObject *) tagptr;
-      int tagindex=it->tagobjindex;
-      for(; tagindex<ao->___cachedCode___; tagindex++) {
-		struct ___TagDescriptor___ *td=
-		  ARRAYGET(ao, struct ___TagDescriptor___ *, tagindex);
-		if (td->flag==it->tagid) {
-		  it->tagobjindex=tagindex; /* Found right type of tag */
-		  return 1;
-		}
-      }
-      return 0;
-    }
-  } else if (it->numtags>0) {
-    /* Use tags to locate appropriate objects */
-    struct ___TagDescriptor___ *tag=objectarray[it->tagbindings[0]];
-    struct ___Object___ *objptr=tag->flagptr;
-    int i;
-    if (objptr->type!=OBJECTARRAYTYPE) {
-      if (it->tagobjindex>0)
-		return 0;
-      if (!ObjectHashcontainskey(it->objectset, (int) objptr))
-		return 0;
-      for(i=1; i<it->numtags; i++) {
-		struct ___TagDescriptor___ *tag2=objectarray[it->tagbindings[i]];
-		if (!containstag(objptr,tag2))
-		  return 0;
-      }
-      return 1;
-    } else {
-      struct ArrayObject *ao=(struct ArrayObject *) objptr;
-      int tagindex;
-      int i;
-      for(tagindex=it->tagobjindex;tagindex<ao->___cachedCode___;tagindex++){
-		struct ___Object___ *objptr=
-		  ARRAYGET(ao,struct ___Object___*,tagindex);
-		if (!ObjectHashcontainskey(it->objectset, (int) objptr))
-		  continue;
-		for(i=1; i<it->numtags; i++) {
-		  struct ___TagDescriptor___ *tag2=objectarray[it->tagbindings[i]];
-		  if (!containstag(objptr,tag2))
-			goto nexttag;
-		}
-		it->tagobjindex=tagindex;
-		return 1;
-nexttag:
-		;
-	  }
-      it->tagobjindex=tagindex;
-      return 0;
-    }
-  } else {
-    return ObjhasNext(&it->it);
-  }
-}
-
-int containstag(struct ___Object___ *ptr,
-                struct ___TagDescriptor___ *tag) {
-  int j;
-  struct ___Object___ * objptr=tag->flagptr;
-  if (objptr->type==OBJECTARRAYTYPE) {
-    struct ArrayObject *ao=(struct ArrayObject *)objptr;
-    for(j=0; j<ao->___cachedCode___; j++) {
-      if (ptr==ARRAYGET(ao, struct ___Object___*, j)) {
-		return 1;
-      }
-    }
-    return 0;
-  } else {
-    return objptr==ptr;
-  }
-}
-
-void toiNext(struct tagobjectiterator *it,
-             void ** objectarray OPTARG(int * failed)) {
-  /* hasNext has all of the intelligence */
-  if(it->istag) {
-    /* Iterate tag */
-    /* Get object with tags */
-    struct ___Object___ *obj=objectarray[it->tagobjectslot];
-    struct ___Object___ *tagptr=obj->___tags___;
-    if (tagptr->type==TAGTYPE) {
-      it->tagobjindex++;
-      objectarray[it->slot]=tagptr;
-    } else {
-      struct ArrayObject *ao=(struct ArrayObject *) tagptr;
-      objectarray[it->slot]=
-        ARRAYGET(ao, struct ___TagDescriptor___ *, it->tagobjindex++);
-    }
-  } else if (it->numtags>0) {
-    /* Use tags to locate appropriate objects */
-    struct ___TagDescriptor___ *tag=objectarray[it->tagbindings[0]];
-    struct ___Object___ *objptr=tag->flagptr;
-    if (objptr->type!=OBJECTARRAYTYPE) {
-      it->tagobjindex++;
-      objectarray[it->slot]=objptr;
-    } else {
-      struct ArrayObject *ao=(struct ArrayObject *) objptr;
-      objectarray[it->slot]=
-        ARRAYGET(ao, struct ___Object___ *, it->tagobjindex++);
-    }
-  } else {
-    /* Iterate object */
-    objectarray[it->slot]=(void *)Objkey(&it->it);
-    Objnext(&it->it);
-  }
-}
-
-#ifdef PROFILE
-inline void profileTaskStart(char * taskname) {
-  if(!taskInfoOverflow) {
-    TaskInfo* taskInfo = RUNMALLOC(sizeof(struct task_info));
-    taskInfoArray[taskInfoIndex] = taskInfo;
-    taskInfo->taskName = taskname;
-    taskInfo->startTime = BAMBOO_GET_EXE_TIME();
-    taskInfo->endTime = -1;
-    taskInfo->exitIndex = -1;
-    taskInfo->newObjs = NULL;
-  }
-}
-
-inline void profileTaskEnd() {
-  if(!taskInfoOverflow) {
-    taskInfoArray[taskInfoIndex]->endTime = BAMBOO_GET_EXE_TIME();
-    taskInfoIndex++;
-    if(taskInfoIndex == TASKINFOLENGTH) {
-      taskInfoOverflow = true;
-      //taskInfoIndex = 0;
-    }
-  }
-}
-
-// output the profiling data
-void outputProfileData() {
-#ifdef USEIO
-  int i;
-  unsigned long long totaltasktime = 0;
-  unsigned long long preprocessingtime = 0;
-  unsigned long long objqueuecheckingtime = 0;
-  unsigned long long postprocessingtime = 0;
-  //int interruptiontime = 0;
-  unsigned long long other = 0;
-  unsigned long long averagetasktime = 0;
-  int tasknum = 0;
-
-  printf("Task Name, Start Time, End Time, Duration, Exit Index(, NewObj Name, Num)+\n");
-  // output task related info
-  for(i = 0; i < taskInfoIndex; i++) {
-    TaskInfo* tmpTInfo = taskInfoArray[i];
-    unsigned long long duration = tmpTInfo->endTime - tmpTInfo->startTime;
-    printf("%s, %lld, %lld, %lld, %lld",
-           tmpTInfo->taskName, tmpTInfo->startTime, tmpTInfo->endTime,
-           duration, tmpTInfo->exitIndex);
-    // summarize new obj info
-    if(tmpTInfo->newObjs != NULL) {
-      struct RuntimeHash * nobjtbl = allocateRuntimeHash(5);
-      struct RuntimeIterator * iter = NULL;
-      while(0 == isEmpty(tmpTInfo->newObjs)) {
-		char * objtype = (char *)(getItem(tmpTInfo->newObjs));
-		if(RuntimeHashcontainskey(nobjtbl, (int)(objtype))) {
-		  int num = 0;
-		  RuntimeHashget(nobjtbl, (int)objtype, &num);
-		  RuntimeHashremovekey(nobjtbl, (int)objtype);
-		  num++;
-		  RuntimeHashadd(nobjtbl, (int)objtype, num);
-		} else {
-		  RuntimeHashadd(nobjtbl, (int)objtype, 1);
-		}
-		//printf(stderr, "new obj!\n");
-      }
-
-      // output all new obj info
-      iter = RuntimeHashcreateiterator(nobjtbl);
-      while(RunhasNext(iter)) {
-		char * objtype = (char *)Runkey(iter);
-		int num = Runnext(iter);
-		printf(", %s, %d", objtype, num);
-      }
-    }
-    printf("\n");
-    if(strcmp(tmpTInfo->taskName, "tpd checking") == 0) {
-      preprocessingtime += duration;
-    } else if(strcmp(tmpTInfo->taskName, "post task execution") == 0) {
-      postprocessingtime += duration;
-    } else if(strcmp(tmpTInfo->taskName, "objqueue checking") == 0) {
-      objqueuecheckingtime += duration;
-    } else {
-      totaltasktime += duration;
-      averagetasktime += duration;
-      tasknum++;
-    }
-  }
-
-  if(taskInfoOverflow) {
-    printf("Caution: task info overflow!\n");
-  }
-
-  other = totalexetime-totaltasktime-preprocessingtime-postprocessingtime;
-  averagetasktime /= tasknum;
-
-  printf("\nTotal time: %lld\n", totalexetime);
-  printf("Total task execution time: %lld (%d%%)\n", totaltasktime,
-         (int)(((double)totaltasktime/(double)totalexetime)*100));
-  printf("Total objqueue checking time: %lld (%d%%)\n",
-         objqueuecheckingtime,
-         (int)(((double)objqueuecheckingtime/(double)totalexetime)*100));
-  printf("Total pre-processing time: %lld (%d%%)\n", preprocessingtime,
-         (int)(((double)preprocessingtime/(double)totalexetime)*100));
-  printf("Total post-processing time: %lld (%d%%)\n", postprocessingtime,
-         (int)(((double)postprocessingtime/(double)totalexetime)*100));
-  printf("Other time: %lld (%d%%)\n", other,
-         (int)(((double)other/(double)totalexetime)*100));
-
-
-  printf("\nAverage task execution time: %lld\n", averagetasktime);
-
-  //printf("\nTotal time spent for interruptions: %lld\n", interrupttime);
-#else
-  int i = 0;
-  int j = 0;
-
-  BAMBOO_DEBUGPRINT(0xdddd);
-  // output task related info
-  for(i= 0; i < taskInfoIndex; i++) {
-    TaskInfo* tmpTInfo = taskInfoArray[i];
-    char* tmpName = tmpTInfo->taskName;
-    int nameLen = strlen(tmpName);
-    BAMBOO_DEBUGPRINT(0xddda);
-    for(j = 0; j < nameLen; j++) {
-      BAMBOO_DEBUGPRINT_REG(tmpName[j]);
-    }
-    BAMBOO_DEBUGPRINT(0xdddb);
-    BAMBOO_DEBUGPRINT_REG(tmpTInfo->startTime);
-    BAMBOO_DEBUGPRINT_REG(tmpTInfo->endTime);
-    BAMBOO_DEBUGPRINT_REG(tmpTInfo->exitIndex);
-    if(tmpTInfo->newObjs != NULL) {
-      struct RuntimeHash * nobjtbl = allocateRuntimeHash(5);
-      struct RuntimeIterator * iter = NULL;
-      while(0 == isEmpty(tmpTInfo->newObjs)) {
-		char * objtype = (char *)(getItem(tmpTInfo->newObjs));
-		if(RuntimeHashcontainskey(nobjtbl, (int)(objtype))) {
-		  int num = 0;
-		  RuntimeHashget(nobjtbl, (int)objtype, &num);
-		  RuntimeHashremovekey(nobjtbl, (int)objtype);
-		  num++;
-		  RuntimeHashadd(nobjtbl, (int)objtype, num);
-		} else {
-		  RuntimeHashadd(nobjtbl, (int)objtype, 1);
-		}
-      }
-
-      // ouput all new obj info
-      iter = RuntimeHashcreateiterator(nobjtbl);
-      while(RunhasNext(iter)) {
-		char * objtype = (char *)Runkey(iter);
-		int num = Runnext(iter);
-		int nameLen = strlen(objtype);
-		BAMBOO_DEBUGPRINT(0xddda);
-		for(j = 0; j < nameLen; j++) {
-		  BAMBOO_DEBUGPRINT_REG(objtype[j]);
-		}
-		BAMBOO_DEBUGPRINT(0xdddb);
-		BAMBOO_DEBUGPRINT_REG(num);
-	  }
-	}
-	BAMBOO_DEBUGPRINT(0xdddc);
-  }
-
-  if(taskInfoOverflow) {
-	BAMBOO_DEBUGPRINT(0xefee);
-  }
-
-#ifdef PROFILE_INTERRUPT
-  // output interrupt related info
-  for(i = 0; i < interruptInfoIndex; i++) {
-	InterruptInfo* tmpIInfo = interruptInfoArray[i];
-	BAMBOO_DEBUGPRINT(0xddde);
-	BAMBOO_DEBUGPRINT_REG(tmpIInfo->startTime);
-	BAMBOO_DEBUGPRINT_REG(tmpIInfo->endTime);
-	BAMBOO_DEBUGPRINT(0xdddf);
-  }
-
-  if(interruptInfoOverflow) {
-	BAMBOO_DEBUGPRINT(0xefef);
-  }
-#endif // PROFILE_INTERRUPT
-
-  BAMBOO_DEBUGPRINT(0xeeee);
-#endif
-}
-#endif  // #ifdef PROFILE
-
-#endif
diff --git a/Robust/src/buildscript b/Robust/src/buildscript
index 4c943b1e..a0d05e25 100755
--- a/Robust/src/buildscript
+++ b/Robust/src/buildscript
@@ -631,6 +631,8 @@ done
 
 BUILDDIR="$CURDIR/$tmpbuilddirectory"
 
+BAMBOORUNTIME=$ROBUSTROOT/Runtime/bamboo
+
 cd $1
 cd $CURDIR
 shift
@@ -789,15 +791,18 @@ else
 MAKEFILE="$MAKEFILE.$RAWCONFIG"
 fi #useio version
 
-cp $ROBUSTROOT/Runtime/RAW/$MAKEFILE ./Makefile
+cp $BAMBOORUNTIME/RAW/$MAKEFILE ./Makefile
 cp ../Runtime/*.c ./
 cp ../Runtime/*.h ./
 cp ../Runtime/*.S ./
 cp ../Runtime/*.s ./
-cp ../Runtime/RAW/*.c ./
-cp ../Runtime/RAW/*.h ./
-cp ../Runtime/RAW/*.S ./
-cp ../Runtime/RAW/*.s ./
+# The multicore runtime sources moved from ../Runtime to $BAMBOORUNTIME, so the
+# globs above no longer pick them up; copy them before the RAW-specific files.
+cp $BAMBOORUNTIME/*.c ./
+cp $BAMBOORUNTIME/*.h ./
+cp $BAMBOORUNTIME/RAW/*.c ./
+cp $BAMBOORUNTIME/RAW/*.h ./
+cp $BAMBOORUNTIME/RAW/*.S ./
+cp $BAMBOORUNTIME/RAW/*.s ./
 cp ../$tmpbuilddirectory/*.c ./
 cp ../$tmpbuilddirectory/*.h ./
 
@@ -980,33 +982,33 @@ else
 cp $ROBUSTROOT/Tilera/Runtime/$TILERA_INDIR/bamboo-vmlinux-pci.hvc ./bamboo-vmlinux-pci.hvc
 fi
 fi
-cp ../Runtime/multicoretask.c ./
-cp ../Runtime/multicoreruntime.c ./
 cp ../Runtime/Queue.c ./
 cp ../Runtime/file.c ./
 cp ../Runtime/math.c ./
 cp ../Runtime/object.c ./
 cp ../Runtime/GenericHashtable.c ./
 cp ../Runtime/SimpleHash.c ./
-cp ../Runtime/GCSharedHash.c ./
 cp ../Runtime/ObjectHash.c ./
 cp ../Runtime/socket.c ./
 cp ../Runtime/mem.c ./
-cp ../Runtime/multicoregarbage.c ./
-cp ../Runtime/MGCHash.c ./
 cp ../Runtime/GenericHashtable.h ./
 cp ../Runtime/mem.h ./
-cp ../Runtime/multicoreruntime.h ./
 cp ../Runtime/object.h ./
 cp ../Runtime/ObjectHash.h ./
 cp ../Runtime/Queue.h ./
 cp ../Runtime/runtime.h ./
 cp ../Runtime/SimpleHash.h ./
-cp ../Runtime/GCSharedHash.h ./
-cp ../Runtime/multicoregc.h ./
-cp ../Runtime/multicoregarbage.h ./
-cp ../Runtime/multicorehelper.h ./
-cp ../Runtime/MGCHash.h ./
+cp $BAMBOORUNTIME/multicoretask.c ./
+cp $BAMBOORUNTIME/multicoreruntime.c ./
+cp $BAMBOORUNTIME/GCSharedHash.c ./
+cp $BAMBOORUNTIME/multicoregarbage.c ./
+cp $BAMBOORUNTIME/MGCHash.c ./
+cp $BAMBOORUNTIME/multicoreruntime.h ./
+cp $BAMBOORUNTIME/GCSharedHash.h ./
+cp $BAMBOORUNTIME/multicoregc.h ./
+cp $BAMBOORUNTIME/multicoregarbage.h ./
+cp $BAMBOORUNTIME/multicorehelper.h ./
+cp $BAMBOORUNTIME/MGCHash.h ./
 cp ../Tilera/Runtime/*.c ./
 cp ../Tilera/Runtime/*.h ./
 cp ../Tilera/Runtime/$TILERA_INDIR/*.c ./
@@ -1033,7 +1035,8 @@ INCLUDES="$INCLUDES -I$ROBUSTROOT/Runtime -I. -IRuntime/include \
 
 if $MULTICOREFLAG
 then
-RUNTIMEFILE="$ROBUSTROOT/Runtime/multicoreruntime.c $ROBUSTROOT/Runtime/multicoretask.c"
+RUNTIMEFILE="$BAMBOORUNTIME/multicoreruntime.c $BAMBOORUNTIME/multicoretask.c"
+INCLUDES="$INCLUDES -I$BAMBOORUNTIME"
 else
 RUNTIMEFILE="$ROBUSTROOT/Runtime/runtime.c $ROBUSTROOT/Runtime/task.c"
 fi