remove some redundant outputs
[IRC.git] / Robust / src / Runtime / multicoretask.c
index 4f629a9357fd4a2b1cf33b274c66c4b9786445ee..cfe56c73b9e74327a186047efda568b30f8870ff 100644 (file)
 #ifdef TASK
 #include "runtime.h"
-#ifndef RAW
-#include "structdefs.h"
-#include "mem.h"
-#include "checkpoint.h"
-#include "Queue.h"
-#include "SimpleHash.h"
+#include "multicoreruntime.h"
+#include "runtime_arch.h"
 #include "GenericHashtable.h"
-#include <sys/select.h>
-#include <sys/types.h>
-#include <sys/mman.h>
-#include <string.h>
-#include <signal.h>
-#include <assert.h>
-#include <errno.h>
-#endif
-#ifdef RAW
-#ifdef RAWPROFILE
-#ifdef RAWUSEIO
-#include "stdio.h"
-#include "string.h"
-#endif
-#endif
-#include <raw.h>
-#include <raw_compiler_defs.h>
-#elif defined THREADSIMULATE
-// use POSIX message queue
-// for each core, its message queue named as
-// /msgqueue_corenum
-#include <mqueue.h>
-#include <sys/stat.h>
-#endif
-/*
-   extern int injectfailures;
-   extern float failurechance;
- */
-extern int debugtask;
-extern int instaccum;
-
-#ifdef RAW
-#define TOTALCORE raw_get_num_tiles()
-#endif
 
-#ifdef CONSCHECK
-#include "instrument.h"
-#endif
+#ifndef INLINE
+#define INLINE    inline __attribute__((always_inline))
+#endif // #ifndef INLINE
 
+//  data structures for task invocation
 struct genhashtable * activetasks;
-struct genhashtable * failedtasks;
 struct taskparamdescriptor * currtpd;
-#ifndef RAW
-struct RuntimeHash * forward;
-struct RuntimeHash * reverse;
-#endif
-
-int corestatus[NUMCORES]; // records status of each core
-                          // 1: running tasks
-// 0: stall
-int numsendobjs[NUMCORES]; // records how many objects a core has sent out
-int numreceiveobjs[NUMCORES]; // records how many objects a core has received
-#ifdef RAW
-struct RuntimeHash locktable;
-static struct RuntimeHash* locktbl = &locktable;
-void * curr_heapbase=0;
-void * curr_heaptop=0;
-int self_numsendobjs;
-int self_numreceiveobjs;
-int lockobj;
-int lockresult;
-bool lockflag;
-#ifndef INTERRUPT
-bool reside;
-#endif
-struct Queue objqueue;
-int msgdata[30];
-int msgtype;
-int msgdataindex;
-int msglength;
-int outmsgdata[30];
-int outmsgindex;
-int outmsglast;
-int outmsgleft;
-bool isMsgHanging;
-bool isMsgSending;
-void calCoords(int core_num, int* coordY, int* coordX);
-#elif defined THREADSIMULATE
-static struct RuntimeHash* locktbl;
-struct thread_data {
-  int corenum;
-  int argc;
-  char** argv;
-  int numsendobjs;
-  int numreceiveobjs;
-};
-struct thread_data thread_data_array[NUMCORES];
-mqd_t mqd[NUMCORES];
-static pthread_key_t key;
-static pthread_rwlock_t rwlock_tbl;
-static pthread_rwlock_t rwlock_init;
-
-void run(void * arg);
-#endif
-
-bool transStallMsg(int targetcore);
-void transTerminateMsg(int targetcore);
-int receiveObject();
-bool getreadlock(void* ptr);
-void releasereadlock(void* ptr);
-#ifdef RAW
-bool getreadlock_I(void* ptr);
-void releasereadlock_I(void* ptr);
-#endif
-bool getwritelock(void* ptr);
-void releasewritelock(void* ptr);
-
-// profiling mode of RAW version
-#ifdef RAWPROFILE
-
-#define TASKINFOLENGTH 150
-//#define INTERRUPTINFOLENGTH 500
-
-bool stall;
-//bool isInterrupt;
-int totalexetime;
-
-typedef struct task_info {
-  char* taskName;
-  int startTime;
-  int endTime;
-} TaskInfo;
-
-/*typedef struct interrupt_info {
-  int startTime;
-  int endTime;
-} InterruptInfo;*/
-
-TaskInfo * taskInfoArray[TASKINFOLENGTH];
-int taskInfoIndex;
-bool taskInfoOverflow;
-/*InterruptInfo * interruptInfoArray[INTERRUPTINFOLENGTH];
-int interruptInfoIndex;
-bool interruptInfoOverflow;*/
-int profilestatus[NUMCORES]; // records status of each core
-                             // 1: running tasks
-// 0: stall
-bool transProfileRequestMsg(int targetcore);
-void outputProfileData();
-#endif
-
-#ifdef RAW
-//#ifdef RAWPROFILE
-#ifdef RAWUSEIO
-int main(void) {
-#else
-void begin() {
-#endif
+struct LockValue runtime_locks[MAXTASKPARAMS];
+int runtime_locklen;
+
+// specific functions used inside critical sections
+void enqueueObject_I(void * ptr, 
+                                struct parameterwrapper ** queues, 
+                                                                                int length);
+int enqueuetasks_I(struct parameterwrapper *parameter, 
+                              struct parameterwrapper *prevptr, 
+                                                                        struct ___Object___ *ptr, 
+                                                                        int * enterflags, 
+                                                                        int numenterflags);
+
+#ifdef MULTICORE_GC
+inline __attribute__((always_inline)) 
+void setupsmemmode(void) {
+#ifdef SMEML
+       bamboo_smem_mode = SMEMLOCAL;
+#elif defined SMEMF
+       bamboo_smem_mode = SMEMFIXED;
+#elif defined SMEMM
+       bamboo_smem_mode = SMEMMIXED;
+#elif defined SMEMG
+       bamboo_smem_mode = SMEMGLOBAL;
 #else
-int main(int argc, char **argv) {
+       // default to global mode (the local-mode default below is disabled)
+       //bamboo_smem_mode = SMEMLOCAL;
+       bamboo_smem_mode = SMEMGLOBAL;
 #endif
-#ifdef RAW
-  int i = 0;
-  int argc = 1;
-  char ** argv = NULL;
-  bool sendStall = false;
-  bool isfirst = true;
-  bool tocontinue = false;
-  struct QueueItem * objitem = NULL;
-  struct transObjInfo * objInfo = NULL;
-  int grount = 0;
-  bool allStall = true;
-  int sumsendobj = 0;
-
-#ifdef RAWDEBUG
-  raw_test_pass(0xee01);
+} // void setupsmemmode(void)
 #endif
-  corenum = raw_get_abs_pos_x() + 4 * raw_get_abs_pos_y();
 
-  // initialize the arrays
-  if(STARTUPCORE == corenum) {
+inline __attribute__((always_inline)) 
+void initruntimedata() {
+       int i;
+       // initialize the arrays
+  if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
     // startup core to initialize corestatus[]
-    for(i = 0; i < NUMCORES; ++i) {
+    for(i = 0; i < NUMCORESACTIVE; ++i) {
       corestatus[i] = 1;
-      numsendobjs[i] = 0;                   // assume all variables in RAW are local variables! MAY BE WRONG!!!
+      numsendobjs[i] = 0; 
       numreceiveobjs[i] = 0;
-    }
-#ifdef RAWPROFILE
-    for(i = 0; i < NUMCORES; ++i) {
-      profilestatus[i] = 1;
-    }
-#endif
-  }
+#ifdef PROFILE
+                       // initialize the profile data arrays
+                       profilestatus[i] = 1;
+#endif
+#ifdef MULTICORE_GC
+                       gccorestatus[i] = 1;
+                       gcnumsendobjs[i] = 0; 
+      gcnumreceiveobjs[i] = 0;
+#endif
+    } // for(i = 0; i < NUMCORESACTIVE; ++i)
+#ifdef MULTICORE_GC
+               for(i = 0; i < NUMCORES4GC; ++i) {
+                       gcloads[i] = 0;
+                       gcrequiredmems[i] = 0;
+                       gcstopblock[i] = 0;
+                       gcfilledblocks[i] = 0;
+    } // for(i = 0; i < NUMCORES4GC; ++i)
+#ifdef GC_PROFILE
+               gc_infoIndex = 0;
+               gc_infoOverflow = false;
+#endif
+#endif
+               numconfirm = 0;
+               waitconfirm = false; 
+               
+               // TODO for test
+               total_num_t6 = 0;
+  }
+
+  busystatus = true;
   self_numsendobjs = 0;
   self_numreceiveobjs = 0;
-  for(i = 0; i < 30; ++i) {
+
+  for(i = 0; i < BAMBOO_MSG_BUF_LENGTH; ++i) {
     msgdata[i] = -1;
   }
-  //msgdata = NULL;
-  msgtype = -1;
   msgdataindex = 0;
-  msglength = 30;
-
-  for(i = 0; i < 30; ++i) {
+       msgdatalast = 0;
+  msglength = BAMBOO_MSG_BUF_LENGTH;
+       msgdatafull = false;
+  for(i = 0; i < BAMBOO_OUT_BUF_LENGTH; ++i) {
     outmsgdata[i] = -1;
   }
   outmsgindex = 0;
@@ -208,655 +101,582 @@ int main(int argc, char **argv) {
   outmsgleft = 0;
   isMsgHanging = false;
   isMsgSending = false;
-#ifdef RAWDEBUG
-  raw_test_pass(0xee02);
-#endif
 
-  // create the lock table, lockresult table and obj queue
+  smemflag = true;
+  bamboo_cur_msp = NULL;
+  bamboo_smem_size = 0;
+       totransobjqueue = createQueue();
+
+#ifdef MULTICORE_GC
+       gcflag = false;
+       gcprocessing = false;
+       gcphase = FINISHPHASE;
+       gccurr_heaptop = 0;
+       gcself_numsendobjs = 0;
+       gcself_numreceiveobjs = 0;
+       gcmarkedptrbound = 0;
+       //mgchashCreate(2000, 0.75);
+       gcpointertbl = allocateRuntimeHash(20);
+       //gcpointertbl = allocateMGCHash(20);
+       gcforwardobjtbl = allocateMGCHash(20, 3);
+       gcobj2map = 0;
+       gcmappedobj = 0;
+       gcismapped = false;
+       gcnumlobjs = 0;
+       gcheaptop = 0;
+       gctopcore = 0;
+       gctopblock = 0;
+       gcmovestartaddr = 0;
+       gctomove = false;
+       gcmovepending = 0;
+       gcblock2fill = 0;
+       gcsbstarttbl = BAMBOO_BASE_VA;
+       bamboo_smemtbl = (void *)gcsbstarttbl
+               + (BAMBOO_SHARED_MEM_SIZE/BAMBOO_SMEM_SIZE)*sizeof(INTPTR); 
+#else
+       // create the lock table, lockresult table and obj queue
   locktable.size = 20;
-  locktable.bucket = (struct RuntimeNode **) RUNMALLOC_I(sizeof(struct RuntimeNode *)*20);
+  locktable.bucket = 
+               (struct RuntimeNode **) RUNMALLOC_I(sizeof(struct RuntimeNode *)*20);
   /* Set allocation blocks*/
   locktable.listhead=NULL;
   locktable.listtail=NULL;
   /*Set data counts*/
   locktable.numelements = 0;
   lockobj = 0;
+  lock2require = 0;
   lockresult = 0;
   lockflag = false;
+       lockRedirectTbl = allocateRuntimeHash(20);
+  objRedirectLockTbl = allocateRuntimeHash(20);
+#endif
 #ifndef INTERRUPT
   reside = false;
-#endif
+#endif  
   objqueue.head = NULL;
   objqueue.tail = NULL;
-#ifdef RAWDEBUG
-  raw_test_pass(0xee03);
-#endif
 
-#ifdef RAWPROFILE
+       currtpd = NULL;
+
+#ifdef PROFILE
   stall = false;
   //isInterrupt = true;
   totalexetime = -1;
   taskInfoIndex = 0;
-  /*interruptInfoIndex = 0;
   taskInfoOverflow = false;
+  /*interruptInfoIndex = 0;
   interruptInfoOverflow = false;*/
 #endif
 
-#ifdef INTERRUPT
-  if (corenum < NUMCORES) {
-    // set up interrupts
-    setup_ints();
-    raw_user_interrupts_on();
-#ifdef RAWDEBUG
-    raw_test_pass(0xee04);
-#endif
-  }
-#endif
-
-#elif defined THREADSIMULATE
-  errno = 0;
-  int tids[NUMCORES];
-  int rc[NUMCORES];
-  pthread_t threads[NUMCORES];
-  int i = 0;
-
-  // initialize three arrays and msg queue array
-  char * pathhead = "/msgqueue_";
-  int targetlen = strlen(pathhead);
-  for(i = 0; i < NUMCORES; ++i) {
-    corestatus[i] = 1;
-    numsendobjs[i] = 0;
-    numreceiveobjs[i] = 0;
-
-    char corenumstr[3];
-    int sourcelen = 0;
-    if(i < 10) {
-      corenumstr[0] = i + '0';
-      corenumstr[1] = '\0';
-      sourcelen = 1;
-    } else if(i < 100) {
-      corenumstr[1] = i %10 + '0';
-      corenumstr[0] = (i / 10) + '0';
-      corenumstr[2] = '\0';
-      sourcelen = 2;
-    } else {
-      printf("Error: i >= 100\n");
-      fflush(stdout);
-      exit(-1);
-    }
-    char path[targetlen + sourcelen + 1];
-    strcpy(path, pathhead);
-    strncat(path, corenumstr, sourcelen);
-    int oflags = O_RDONLY|O_CREAT|O_NONBLOCK;
-    int omodes = S_IRWXU|S_IRWXG|S_IRWXO;
-    mq_unlink(path);
-    mqd[i]= mq_open(path, oflags, omodes, NULL);
-    if(mqd[i] == -1) {
-      printf("[Main] mq_open %s fails: %d, error: %s\n", path, mqd[i], strerror(errno));
-      exit(-1);
-    } else {
-      printf("[Main] mq_open %s returns: %d\n", path, mqd[i]);
-    }
-  }
-
-  // create the key
-  pthread_key_create(&key, NULL);
-
-  // create the lock table and initialize its mutex
-  locktbl = allocateRuntimeHash(20);
-  int rc_locktbl = pthread_rwlock_init(&rwlock_tbl, NULL);
-  printf("[Main] initialize the rwlock for lock table: %d error: \n", rc_locktbl, strerror(rc_locktbl));
-
-  for(i = 0; i < NUMCORES; ++i) {
-    thread_data_array[i].corenum = i;
-    thread_data_array[i].argc = argc;
-    thread_data_array[i].argv = argv;
-    thread_data_array[i].numsendobjs = 0;
-    thread_data_array[i].numreceiveobjs = 0;
-    printf("[main] creating thread %d\n", i);
-    rc[i] = pthread_create(&threads[i], NULL, run, (void *)&thread_data_array[i]);
-    if (rc[i]) {
-      printf("[main] ERROR; return code from pthread_create() is %d\n", rc[i]);
-      fflush(stdout);
-      exit(-1);
-    }
-  }
-
-  //pthread_exit(NULL);
-  while(true) {
-  }
+       for(i = 0; i < MAXTASKPARAMS; i++) {
+               runtime_locks[i].redirectlock = 0;
+               runtime_locks[i].value = 0;
+       }
+       runtime_locklen = 0;
 }
 
-void run(void* arg) {
-  struct thread_data * my_tdata = (struct thread_data *)arg;
-  pthread_setspecific(key, (void *)my_tdata->corenum);
-  int argc = my_tdata->argc;
-  char** argv = my_tdata->argv;
-  printf("[run, %d] Thread %d runs: %x\n", my_tdata->corenum, my_tdata->corenum, (int)pthread_self());
-  fflush(stdout);
-
+inline __attribute__((always_inline))
+void disruntimedata() {
+#ifdef MULTICORE_GC
+       //mgchashDelete();
+       freeRuntimeHash(gcpointertbl);
+       //freeMGCHash(gcpointertbl);
+       freeMGCHash(gcforwardobjtbl);
+#else
+       freeRuntimeHash(lockRedirectTbl);
+       freeRuntimeHash(objRedirectLockTbl);
+       RUNFREE(locktable.bucket);
 #endif
+       if(activetasks != NULL) {
+               genfreehashtable(activetasks);
+       }
+       if(currtpd != NULL) {
+               RUNFREE(currtpd->parameterArray);
+               RUNFREE(currtpd);
+               currtpd = NULL;
+       }
+}
 
-#ifdef BOEHM_GC
-  GC_init(); // Initialize the garbage collector
-#endif
-#ifdef CONSCHECK
-  initializemmap();
-#endif
-#ifndef RAW
-  processOptions();
-#endif
-  initializeexithandler();
-#ifdef RAWDEBUG
-  raw_test_pass(0xee05);
-#endif
-  /* Create table for failed tasks */
-#ifdef RAW
-  if(corenum > NUMCORES - 1) {
-    failedtasks = NULL;
-    activetasks = NULL;
-/*#ifdef RAWPROFILE
-        raw_test_pass(0xee01);
-        raw_test_pass_reg(taskInfoIndex);
-        raw_test_pass_reg(taskInfoOverflow);
-        if(!taskInfoOverflow) {
-        TaskInfo* taskInfo = RUNMALLOC(sizeof(struct task_info));
-        taskInfoArray[taskInfoIndex] = taskInfo;
-        taskInfo->taskName = "msg handling";
-        taskInfo->startTime = raw_get_cycle();
-        taskInfo->endTime = -1;
-        }
- #endif*/
-#ifdef RAWPROFILE
-    //isInterrupt = false;
-#endif
-    while(true) {
-      receiveObject();
-    }
-  } else {
-#ifdef RAWDEBUG
-    raw_test_pass(0xee06);
-#endif
-#endif
-  /*failedtasks=genallocatehashtable((unsigned int (*)(void *)) &hashCodetpd,
-                                   (int (*)(void *,void *)) &comparetpd);*/
-  failedtasks = NULL;
-#ifdef RAWDEBUG
-  raw_test_pass(0xee07);
+inline __attribute__((always_inline))
+bool checkObjQueue() {
+       bool rflag = false;
+       struct transObjInfo * objInfo = NULL;
+       int grount = 0;
+
+#ifdef PROFILE
+#ifdef ACCURATEPROFILE
+       bool isChecking = false;
+       if(!isEmpty(&objqueue)) {
+               profileTaskStart("objqueue checking");
+               isChecking = true;
+       } // if(!isEmpty(&objqueue))
 #endif
-  /* Create queue of active tasks */
-  activetasks=genallocatehashtable((unsigned int(*) (void *)) &hashCodetpd,
-                                   (int(*) (void *,void *)) &comparetpd);
-#ifdef RAWDEBUG
-  raw_test_pass(0xee08);
 #endif
 
-  /* Process task information */
-  processtasks();
-#ifdef RAWDEBUG
-  raw_test_pass(0xee09);
+       while(!isEmpty(&objqueue)) {
+               void * obj = NULL;
+               BAMBOO_START_CRITICAL_SECTION_OBJ_QUEUE();
+#ifdef DEBUG
+               BAMBOO_DEBUGPRINT(0xf001);
 #endif
-
-  if(STARTUPCORE == corenum) {
-    /* Create startup object */
-    createstartupobject(argc, argv);
-  }
-#ifdef RAWDEBUG
-  raw_test_pass(0xee0a);
+#ifdef PROFILE
+               //isInterrupt = false;
+#endif 
+#ifdef DEBUG
+               BAMBOO_DEBUGPRINT(0xeee1);
 #endif
-
-#ifdef RAW
-#ifdef RAWDEBUG
-  raw_test_pass(0xee0b);
+               rflag = true;
+               objInfo = (struct transObjInfo *)getItem(&objqueue); 
+               obj = objInfo->objptr;
+#ifdef DEBUG
+               BAMBOO_DEBUGPRINT_REG((int)obj);
+#endif
+               // grab lock and flush the obj
+               grount = 0;
+               getwritelock_I(obj);
+               while(!lockflag) {
+                       BAMBOO_WAITING_FOR_LOCK();
+               } // while(!lockflag)
+               grount = lockresult;
+#ifdef DEBUG
+               BAMBOO_DEBUGPRINT_REG(grount);
 #endif
 
-  while(true) {
-/*#ifndef INTERRUPT
-          while(receiveObject() != -1) {
-          }
- #endif*/
-
-    // check if there are new active tasks can be executed
-    executetasks();
-
+               lockresult = 0;
+               lockobj = 0;
+               lock2require = 0;
+               lockflag = false;
 #ifndef INTERRUPT
-    while(receiveObject() != -1) {
-    }
+               reside = false;
+#endif
+
+               if(grount == 1) {
+                       int k = 0;
+                       // flush the object
+#ifdef CACHEFLUSH
+                       BAMBOO_CACHE_FLUSH_RANGE((int)obj,sizeof(int));
+                       BAMBOO_CACHE_FLUSH_RANGE((int)obj, 
+                                       classsize[((struct ___Object___ *)obj)->type]);
+#endif
+                       // enqueue the object
+                       for(k = 0; k < objInfo->length; ++k) {
+                               int taskindex = objInfo->queues[2 * k];
+                               int paramindex = objInfo->queues[2 * k + 1];
+                               struct parameterwrapper ** queues = 
+                                       &(paramqueues[BAMBOO_NUM_OF_CORE][taskindex][paramindex]);
+#ifdef DEBUG
+                               BAMBOO_DEBUGPRINT_REG(taskindex);
+                               BAMBOO_DEBUGPRINT_REG(paramindex);
+                               struct ___Object___ * tmpptr = (struct ___Object___ *)obj;
+                               tprintf("Process %x(%d): receive obj %x(%lld), ptrflag %x\n", 
+                                                               BAMBOO_NUM_OF_CORE, BAMBOO_NUM_OF_CORE, (int)obj, 
+                                                               (long)obj, tmpptr->flag);
+#endif
+                               enqueueObject_I(obj, queues, 1);
+#ifdef DEBUG                            
+                               BAMBOO_DEBUGPRINT_REG(hashsize(activetasks));
+#endif
+                       } // for(k = 0; k < objInfo->length; ++k)
+                       releasewritelock_I(obj);
+                       RUNFREE(objInfo->queues);
+                       RUNFREE(objInfo);
+               } else {
+                       // cannot get the lock
+                       // put it at the end of the queue unless a newer version is already queued
+                       struct QueueItem * qitem = getHead(&objqueue);
+                       struct QueueItem * prev = NULL;
+                       while(qitem != NULL) {
+                               struct transObjInfo * tmpinfo = 
+                                       (struct transObjInfo *)(qitem->objectptr);
+                               if(tmpinfo->objptr == obj) {
+                                       // the same object is already in the queue and must have been
+                                       // enqueued more recently; this one is outdated, do not re-enqueue it
+                                       RUNFREE(objInfo->queues);
+                                       RUNFREE(objInfo);
+                                       goto objqueuebreak;
+                               } else {
+                                       prev = qitem;
+                               } // if(tmpinfo->objptr == obj)
+                               qitem = getNextQueueItem(prev);
+                       } // while(qitem != NULL)
+                       // try to execute active tasks already enqueued first
+                       addNewItem_I(&objqueue, objInfo);
+#ifdef PROFILE
+                       //isInterrupt = true;
+#endif
+objqueuebreak:
+                       BAMBOO_CLOSE_CRITICAL_SECTION_OBJ_QUEUE();
+#ifdef DEBUG
+                       BAMBOO_DEBUGPRINT(0xf000);
 #endif
-
-#ifdef RAWDEBUG
-    raw_test_pass(0xee0c);
+                       break;
+               } // if(grount == 1)
+               BAMBOO_CLOSE_CRITICAL_SECTION_OBJ_QUEUE();
+#ifdef DEBUG
+               BAMBOO_DEBUGPRINT(0xf000);
 #endif
+       } // while(!isEmpty(&objqueue))
 
-    // check if there are some pending objects, if yes, enqueue them and executetasks again
-    tocontinue = false;
-#ifdef RAWDEBUG
-    raw_test_pass(0xee0d);
+#ifdef PROFILE
+#ifdef ACCURATEPROFILE
+       if(isChecking) {
+               profileTaskEnd();
+       } // if(isChecking)
 #endif
-#ifdef RAWPROFILE
-    {
-      bool isChecking = false;
-      if(!isEmpty(&objqueue)) {
-       if(!taskInfoOverflow) {
-         TaskInfo* taskInfo = RUNMALLOC(sizeof(struct task_info));
-         taskInfoArray[taskInfoIndex] = taskInfo;
-         taskInfo->taskName = "objqueue checking";
-         taskInfo->startTime = raw_get_cycle();
-         taskInfo->endTime = -1;
-       }
-       isChecking = true;
-      }
-#endif
-    while(!isEmpty(&objqueue)) {
-      void * obj = NULL;
-#ifdef INTERRUPT
-      raw_user_interrupts_off();
-#endif
-#ifdef RAWPROFILE
-      //isInterrupt = false;
-#endif
-#ifdef RAWDEBUG
-      raw_test_pass(0xeee1);
-#endif
-      sendStall = false;
-      tocontinue = true;
-      objitem = getTail(&objqueue);
-      //obj = objitem->objectptr;
-      objInfo = (struct transObjInfo *)objitem->objectptr;
-      obj = objInfo->objptr;
-#ifdef RAWDEBUG
-      raw_test_pass_reg((int)obj);
-#endif
-      // grab lock and flush the obj
-      getreadlock_I(obj);
-      while(!lockflag) {
-       receiveObject();
-      }
-      grount = lockresult;
-#ifdef RAWDEBUG
-      raw_test_pass_reg(grount);
 #endif
 
-      lockresult = 0;
-      lockobj = 0;
-      lockflag = false;
-#ifndef INTERRUPT
-      reside = false;
+#ifdef DEBUG
+       BAMBOO_DEBUGPRINT(0xee02);
 #endif
+       return rflag;
+}
 
-      if(grount == 1) {
-       int k = 0;
-       raw_invalidate_cache_range((int)obj, classsize[((struct ___Object___ *)obj)->type]);
-       // flush the obj
-       /*for(k = 0; k < classsize[((struct ___Object___ *)obj)->type]; ++k) {
-               invalidateAddr(obj + k);
-          }*/
-       // enqueue the object
-       for(k = 0; k < objInfo->length; ++k) {
-         int taskindex = objInfo->queues[2 * k];
-         int paramindex = objInfo->queues[2 * k + 1];
-         struct parameterwrapper ** queues = &(paramqueues[corenum][taskindex][paramindex]);
-#ifdef RAWDEBUG
-         raw_test_pass_reg(taskindex);
-         raw_test_pass_reg(paramindex);
-#endif
-         enqueueObject_I(obj, queues, 1);
-       }
-       removeItem(&objqueue, objitem);
-       releasereadlock_I(obj);
-       RUNFREE(objInfo->queues);
-       RUNFREE(objInfo);
-       /*enqueueObject_I(obj, NULL, 0);
-          removeItem(&objqueue, objitem);
-          releasereadlock_I(obj);*/
-      } else {
-       // can not get lock
-       // put it at the end of the queue
-       // and try to execute active tasks already enqueued first
-       removeItem(&objqueue, objitem);
-       addNewItem_I(&objqueue, objInfo);
-#ifdef RAWPROFILE
-       //isInterrupt = true;
-#endif
-#ifdef INTERRUPT
-       raw_user_interrupts_on();
-#endif
-       break;
-      }
-#ifdef INTERRUPT
-      raw_user_interrupts_on();
-#endif
-#ifdef RAWDEBUG
-      raw_test_pass(0xee0e);
-#endif
-    }
-#ifdef RAWPROFILE
-    if(isChecking && (!taskInfoOverflow)) {
-      taskInfoArray[taskInfoIndex]->endTime = raw_get_cycle();
-      taskInfoIndex++;
-      if(taskInfoIndex == TASKINFOLENGTH) {
-       taskInfoOverflow = true;
-      }
-    }
-  }
-#endif
-#ifdef RAWDEBUG
-    raw_test_pass(0xee0f);
+inline __attribute__((always_inline))
+void checkCoreStatus() {
+       bool allStall = false;
+       int i = 0;
+       int sumsendobj = 0;
+       if((!waitconfirm) || 
+                       (waitconfirm && (numconfirm == 0))) {
+#ifdef DEBUG
+               BAMBOO_DEBUGPRINT(0xee04);
+               BAMBOO_DEBUGPRINT_REG(waitconfirm);
 #endif
-
-    if(!tocontinue) {
-      // check if stop
-      if(STARTUPCORE == corenum) {
-       if(isfirst) {
-#ifdef RAWDEBUG
-         raw_test_pass(0xee10);
+               BAMBOO_START_CRITICAL_SECTION_STATUS();
+#ifdef DEBUG
+               BAMBOO_DEBUGPRINT(0xf001);
 #endif
-         isfirst = false;
-       }
-#ifdef INTERRUPT
-       raw_user_interrupts_off();
-#endif
-       corestatus[corenum] = 0;
-       numsendobjs[corenum] = self_numsendobjs;
-       numreceiveobjs[corenum] = self_numreceiveobjs;
-       // check the status of all cores
-       allStall = true;
-#ifdef RAWDEBUG
-       raw_test_pass_reg(NUMCORES);
-#endif
-       for(i = 0; i < NUMCORES; ++i) {
-#ifdef RAWDEBUG
-         raw_test_pass(0xe000 + corestatus[i]);
-#endif
-         if(corestatus[i] != 0) {
-           allStall = false;
-           break;
-         }
-       }
-       if(allStall) {
-         // check if the sum of send objs and receive obj are the same
-         // yes->terminate; for profiling mode, yes->send request to all
-         // other cores to pour out profiling data
-         // no->go on executing
-         sumsendobj = 0;
-         for(i = 0; i < NUMCORES; ++i) {
-           sumsendobj += numsendobjs[i];
-#ifdef RAWDEBUG
-           raw_test_pass(0xf000 + numsendobjs[i]);
+               corestatus[BAMBOO_NUM_OF_CORE] = 0;
+               numsendobjs[BAMBOO_NUM_OF_CORE] = self_numsendobjs;
+               numreceiveobjs[BAMBOO_NUM_OF_CORE] = self_numreceiveobjs;
+               // check the status of all cores
+               allStall = true;
+#ifdef DEBUG
+               BAMBOO_DEBUGPRINT_REG(NUMCORESACTIVE);
 #endif
-         }
-         for(i = 0; i < NUMCORES; ++i) {
-           sumsendobj -= numreceiveobjs[i];
-#ifdef RAWDEBUG
-           raw_test_pass(0xf000 + numreceiveobjs[i]);
+               for(i = 0; i < NUMCORESACTIVE; ++i) {
+#ifdef DEBUG
+                       BAMBOO_DEBUGPRINT(0xe000 + corestatus[i]);
+#endif
+                       if(corestatus[i] != 0) {
+                               allStall = false;
+                               break;
+                       }
+               } // for(i = 0; i < NUMCORESACTIVE; ++i)
+               if(allStall) {
+                       // check whether the total numbers of sent and received objs match
+                       // yes->check if the info is the latest; no->go on executing
+                       sumsendobj = 0;
+                       for(i = 0; i < NUMCORESACTIVE; ++i) {
+                               sumsendobj += numsendobjs[i];
+#ifdef DEBUG
+                               BAMBOO_DEBUGPRINT(0xf000 + numsendobjs[i]);
 #endif
-         }
-         if(0 == sumsendobj) {
-           // terminate
-#ifdef RAWDEBUG
-           raw_test_pass(0xee11);
-#endif
-//#ifdef RAWPROFILE
-#ifdef RAWUSEIO
-           totalexetime = raw_get_cycle();
+                       } // for(i = 0; i < NUMCORESACTIVE; ++i)        
+                       for(i = 0; i < NUMCORESACTIVE; ++i) {
+                               sumsendobj -= numreceiveobjs[i];
+#ifdef DEBUG
+                               BAMBOO_DEBUGPRINT(0xf000 + numreceiveobjs[i]);
+#endif
+                       } // for(i = 0; i < NUMCORESACTIVE; ++i)
+                       if(0 == sumsendobj) {
+                               if(!waitconfirm) {
+                                       // first time all cores are found stalled:
+                                       // send a status confirm msg to all other cores
+                                       // and reset the corestatus array too
+#ifdef DEBUG
+                                       BAMBOO_DEBUGPRINT(0xee05);
+#endif
+                                       corestatus[BAMBOO_NUM_OF_CORE] = 1;
+                                       for(i = 1; i < NUMCORESACTIVE; ++i) {   
+                                               corestatus[i] = 1;
+                                               // send status confirm msg to core i
+                                               send_msg_1(i, STATUSCONFIRM, false);
+                                       } // for(i = 1; i < NUMCORESACTIVE; ++i)
+                                       waitconfirm = true;
+                                       numconfirm = NUMCORESACTIVE - 1;
+                               } else {
+                                       // all the core status info is up to date:
+                                       // terminate; in profiling mode, send a request to all
+                                       // other cores to pour out profiling data
+#ifdef DEBUG
+                                       BAMBOO_DEBUGPRINT(0xee06);
+#endif                                           
+                        
+#ifdef USEIO
+                                       totalexetime = BAMBOO_GET_EXE_TIME() - bamboo_start_time;
 #else
-           raw_test_pass(0xbbbbbbbb);
-           raw_test_pass(raw_get_cycle());
-#endif
-
-           // profile mode, send msgs to other cores to request pouring
-           // out progiling data
-#ifdef RAWPROFILE
-#ifdef INTERRUPT
-           // reopen gdn_avail interrupts
-           raw_user_interrupts_on();
-#endif
-           for(i = 1; i < NUMCORES; ++i) {
-             transProfileRequestMsg(i);
-           }
-           // pour profiling data on startup core
-           outputProfileData();
-           while(true) {
-#ifdef INTERRUPT
-             raw_user_interrupts_off();
-#endif
-             profilestatus[corenum] = 0;
-             // check the status of all cores
-             allStall = true;
-#ifdef RAWDEBUG
-             raw_test_pass_reg(NUMCORES);
-#endif
-             for(i = 0; i < NUMCORES; ++i) {
-#ifdef RAWDEBUG
-               raw_test_pass(0xe000 + profilestatus[i]);
-#endif
-               if(profilestatus[i] != 0) {
-                 allStall = false;
-                 break;
-               }
-             }
-             if(!allStall) {
-               int halt = 10000;
-#ifdef INTERRUPT
-               raw_user_interrupts_on();
-#endif
-               while(halt--) {
-               }
-             } else {
-               break;
-             }
-           }
-#endif
 
-           raw_test_done(1);                                   // All done.
-         }
-       }
-#ifdef INTERRUPT
-       raw_user_interrupts_on();
-#endif
-      } else {
-       if(!sendStall) {
-#ifdef RAWDEBUG
-         raw_test_pass(0xee12);
+                                       BAMBOO_DEBUGPRINT(BAMBOO_GET_EXE_TIME() - bamboo_start_time);
+                                       //BAMBOO_DEBUGPRINT_REG(total_num_t6); // TODO for test
+                                       BAMBOO_DEBUGPRINT(0xbbbbbbbb);
 #endif
-#ifdef RAWPROFILE
-         if(!stall) {
-#endif
-         if(isfirst) {
-           // wait for some time
-           int halt = 10000;
-#ifdef RAWDEBUG
-           raw_test_pass(0xee13);
-#endif
-           while(halt--) {
-           }
-           isfirst = false;
-#ifdef RAWDEBUG
-           raw_test_pass(0xee14);
+                                       // profile mode, send msgs to other cores to request pouring
+                                       // out profiling data
+#ifdef PROFILE
+                                       BAMBOO_CLOSE_CRITICAL_SECTION_STATUS();
+#ifdef DEBUG
+                                       BAMBOO_DEBUGPRINT(0xf000);
+#endif
+                                       for(i = 1; i < NUMCORESACTIVE; ++i) {
+                                               // send profile request msg to core i
+                                               send_msg_2(i, PROFILEOUTPUT, totalexetime, false);
+                                       } // for(i = 1; i < NUMCORESACTIVE; ++i)
+                                       // pour profiling data on startup core
+                                       outputProfileData();
+                                       while(true) {
+                                               BAMBOO_START_CRITICAL_SECTION_STATUS();
+#ifdef DEBUG
+                                               BAMBOO_DEBUGPRINT(0xf001);
 #endif
-         } else {
-           // send StallMsg to startup core
-#ifdef RAWDEBUG
-           raw_test_pass(0xee15);
+                                               profilestatus[BAMBOO_NUM_OF_CORE] = 0;
+                                               // check the status of all cores
+                                               allStall = true;
+#ifdef DEBUG
+                                               BAMBOO_DEBUGPRINT_REG(NUMCORESACTIVE);
+#endif 
+                                               for(i = 0; i < NUMCORESACTIVE; ++i) {
+#ifdef DEBUG
+                                                       BAMBOO_DEBUGPRINT(0xe000 + profilestatus[i]);
+#endif
+                                                       if(profilestatus[i] != 0) {
+                                                               allStall = false;
+                                                               break;
+                                                       }
+                                               }  // for(i = 0; i < NUMCORESACTIVE; ++i)
+                                               if(!allStall) {
+                                                       int halt = 100;
+                                                       BAMBOO_CLOSE_CRITICAL_SECTION_STATUS();
+#ifdef DEBUG
+                                                       BAMBOO_DEBUGPRINT(0xf000);
+#endif
+                                                       while(halt--) {
+                                                       }
+                                               } else {
+                                                       break;
+                                               } // if(!allStall)
+                                       } // while(true)
+#endif
+
+                                       // gc_profile mode, output gc profiling data
+#ifdef MULTICORE_GC
+#ifdef GC_PROFILE
+                                       gc_outputProfileData();
+#endif // #ifdef GC_PROFILE
+#endif // #ifdef MULTICORE_GC
+                                       disruntimedata();
+                                       terminate(); // All done.
+                               } // if(!waitconfirm)
+                       } else {
+                               // still some objects on the fly on the network
+                               // reset the waitconfirm and numconfirm
+#ifdef DEBUG
+                                       BAMBOO_DEBUGPRINT(0xee07);
 #endif
-           sendStall = transStallMsg(STARTUPCORE);
-           isfirst = true;
-         }
-#ifdef RAWPROFILE
-       }
+                               waitconfirm = false;
+                               numconfirm = 0;
+                       } //  if(0 == sumsendobj)
+               } else {
+                       // not all cores are stalled, keep on waiting
+#ifdef DEBUG
+                       BAMBOO_DEBUGPRINT(0xee08);
 #endif
-       } else {
-         isfirst = true;
-#ifdef RAWDEBUG
-         raw_test_pass(0xee16);
+                       waitconfirm = false;
+                       numconfirm = 0;
+               } //  if(allStall)
+               BAMBOO_CLOSE_CRITICAL_SECTION_STATUS();
+#ifdef DEBUG
+               BAMBOO_DEBUGPRINT(0xf000);
 #endif
-       }
-      }
-    }
-  }
+       } // if((!waitconfirm) ||
 }
-#elif defined THREADSIMULATE
-  /* Start executing the tasks */
-  executetasks();
 
+// main function for each core
+inline void run(void * arg) {
   int i = 0;
-  // check if there are new objects coming
+  int argc = 1;
+  char ** argv = NULL;
   bool sendStall = false;
+  bool isfirst = true;
+  bool tocontinue = false;
 
-  int numofcore = pthread_getspecific(key);
-  while(true) {
-    switch(receiveObject()) {
-    case 0: {
-      printf("[run, %d] receive an object\n", numofcore);
-      sendStall = false;
-      // received an object
-      // check if there are new active tasks can be executed
-      executetasks();
-      break;
-    }
+  corenum = BAMBOO_GET_NUM_OF_CORE();
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xeeee);
+  BAMBOO_DEBUGPRINT_REG(corenum);
+  BAMBOO_DEBUGPRINT(STARTUPCORE);
+#endif
 
-    case 1: {
-      //printf("[run, %d] no msg\n", numofcore);
-      // no msg received
-      if(STARTUPCORE == numofcore) {
-       corestatus[numofcore] = 0;
-       // check the status of all cores
-       bool allStall = true;
-       for(i = 0; i < NUMCORES; ++i) {
-         if(corestatus[i] != 0) {
-           allStall = false;
-           break;
-         }
-       }
-       if(allStall) {
-         // check if the sum of send objs and receive obj are the same
-         // yes->terminate
-         // no->go on executing
-         int sumsendobj = 0;
-         for(i = 0; i < NUMCORES; ++i) {
-           sumsendobj += numsendobjs[i];
-         }
-         for(i = 0; i < NUMCORES; ++i) {
-           sumsendobj -= numreceiveobjs[i];
-         }
-         if(0 == sumsendobj) {
-           // terminate
-
-           // release all locks
-           int rc_tbl = pthread_rwlock_wrlock(&rwlock_tbl);
-           printf("[run, %d] getting the write lock for locktbl: %d error: \n", numofcore, rc_tbl, strerror(rc_tbl));
-           struct RuntimeIterator* it_lock = RuntimeHashcreateiterator(locktbl);
-           while(0 != RunhasNext(it_lock)) {
-             int key = Runkey(it_lock);
-             pthread_rwlock_t* rwlock_obj = (pthread_rwlock_t*)Runnext(it_lock);
-             int rc_des = pthread_rwlock_destroy(rwlock_obj);
-             printf("[run, %d] destroy the rwlock for object: %d error: \n", numofcore, key, strerror(rc_des));
-             RUNFREE(rwlock_obj);
-           }
-           freeRuntimeHash(locktbl);
-           locktbl = NULL;
-           RUNFREE(it_lock);
-
-           // destroy all message queues
-           char * pathhead = "/msgqueue_";
-           int targetlen = strlen(pathhead);
-           for(i = 0; i < NUMCORES; ++i) {
-             char corenumstr[3];
-             int sourcelen = 0;
-             if(i < 10) {
-               corenumstr[0] = i + '0';
-               corenumstr[1] = '\0';
-               sourcelen = 1;
-             } else if(i < 100) {
-               corenumstr[1] = i %10 + '0';
-               corenumstr[0] = (i / 10) + '0';
-               corenumstr[2] = '\0';
-               sourcelen = 2;
-             } else {
-               printf("Error: i >= 100\n");
-               fflush(stdout);
-               exit(-1);
-             }
-             char path[targetlen + sourcelen + 1];
-             strcpy(path, pathhead);
-             strncat(path, corenumstr, sourcelen);
-             mq_unlink(path);
-           }
+       // initialize runtime data structures
+       initruntimedata();
+
+  // other architecture related initialization
+  initialization();
+  initCommunication();
 
-           printf("[run, %d] terminate!\n", numofcore);
-           fflush(stdout);
-           exit(0);
+  initializeexithandler();
+
+  // main process of the execution module
+  if(BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1) {
+       // non-executing cores, only processing communications
+    activetasks = NULL;
+/*#ifdef PROFILE
+        BAMBOO_DEBUGPRINT(0xee01);
+        BAMBOO_DEBUGPRINT_REG(taskInfoIndex);
+        BAMBOO_DEBUGPRINT_REG(taskInfoOverflow);
+               profileTaskStart("msg handling");
+        }
+ #endif*/
+#ifdef PROFILE
+    //isInterrupt = false;
+#endif
+               fakeExecution();
+  } else {
+         /* Create queue of active tasks */
+         activetasks=
+                       genallocatehashtable((unsigned int(*) (void *)) &hashCodetpd,
+                           (int(*) (void *,void *)) &comparetpd);
+         
+         /* Process task information */
+         processtasks();
+         
+         if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
+                 /* Create startup object */
+                 createstartupobject(argc, argv);
          }
-       }
-      } else {
-       if(!sendStall) {
-         // send StallMsg to startup core
-         sendStall = transStallMsg(STARTUPCORE);
-       }
-      }
-      break;
-    }
 
-    case 2: {
-      printf("[run, %d] receive a stall msg\n", numofcore);
-      // receive a Stall Msg, do nothing
-      assert(STARTUPCORE == numofcore);                                     // only startup core can receive such msg
-      sendStall = false;
-      break;
-    }
+#ifdef DEBUG
+         BAMBOO_DEBUGPRINT(0xee00);
+#endif
 
-      /* case 3: {
-                               printf("[run, %d] receive a terminate msg\n", numofcore);
-                               // receive a terminate Msg
-                               assert(STARTUPCORE != corenum); // only non-startup core can receive such msg
-                               mq_close(mqd[corenum]);
-                               fflush(stdout);
-                               exit(0);
-                               break;
-                       }*/
-    default: {
-      printf("[run, %d] Error: invalid message type.\n", numofcore);
-      fflush(stdout);
-      exit(-1);
-      break;
-    }
-    }
-  }
+         while(true) {
+#ifdef MULTICORE_GC
+                       // check if need to do GC
+                       gc(NULL);
 #endif
-}
 
-void createstartupobject(int argc, char ** argv) {
+                 // check if there are new active tasks that can be executed
+                 executetasks();
+                       if(busystatus) {
+                               sendStall = false;
+                       }
+
+#ifndef INTERRUPT
+                 while(receiveObject() != -1) {
+                 }
+#endif  
+
+#ifdef DEBUG
+                 BAMBOO_DEBUGPRINT(0xee01);
+#endif  
+                 
+                 // check if there are some pending objects, 
+                       // if yes, enqueue them and executetasks again
+                 tocontinue = checkObjQueue();
+
+                 if(!tocontinue) {
+                         // check if stop
+                         if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
+                                 if(isfirst) {
+#ifdef DEBUG
+                                         BAMBOO_DEBUGPRINT(0xee03);
+#endif
+                                         isfirst = false;
+                                 }
+                                       checkCoreStatus();
+                         } else {
+                                 if(!sendStall) {
+#ifdef DEBUG
+                                         BAMBOO_DEBUGPRINT(0xee09);
+#endif
+#ifdef PROFILE
+                                         if(!stall) {
+#endif
+                                                 if(isfirst) {
+                                                         // wait for some time
+                                                         int halt = 10000;
+#ifdef DEBUG
+                                                         BAMBOO_DEBUGPRINT(0xee0a);
+#endif
+                                                         while(halt--) {
+                                                         }
+                                                         isfirst = false;
+                                                 } else {
+                                                         // send StallMsg to startup core
+#ifdef DEBUG
+                                                         BAMBOO_DEBUGPRINT(0xee0b);
+#endif
+                                                         // send stall msg
+                                                               send_msg_4(STARTUPCORE, TRANSTALL, BAMBOO_NUM_OF_CORE, 
+                                                                                      self_numsendobjs, self_numreceiveobjs, false);
+                                                         sendStall = true;
+                                                         isfirst = true;
+                                                         busystatus = false;
+                                                 }
+#ifdef PROFILE
+                                         }
+#endif
+                                 } else {
+                                         isfirst = true;
+                                         busystatus = false;
+#ifdef DEBUG
+                                         BAMBOO_DEBUGPRINT(0xee0c);
+#endif
+                                 } // if(!sendStall)
+                         } // if(STARTUPCORE == BAMBOO_NUM_OF_CORE) 
+                 } // if(!tocontinue)
+         } // while(true) 
+  } // if(BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1)
+
+} // run()
+
+struct ___createstartupobject____I_locals {
+  INTPTR size;
+  void * next;
+  struct  ___StartupObject___ * ___startupobject___;
+  struct ArrayObject * ___stringarray___;
+}; // struct ___createstartupobject____I_locals
+
+void createstartupobject(int argc, 
+                                    char ** argv) {
   int i;
 
   /* Allocate startup object     */
-#ifdef PRECISE_GC
-  struct ___StartupObject___ *startupobject=(struct ___StartupObject___*) allocate_new(NULL, STARTUPTYPE);
-  struct ArrayObject * stringarray=allocate_newarray(NULL, STRINGARRAYTYPE, argc-1);
+#ifdef MULTICORE_GC
+       struct ___createstartupobject____I_locals ___locals___={2, NULL, NULL, NULL};
+  struct ___StartupObject___ *startupobject=
+               (struct ___StartupObject___*) allocate_new(&___locals___, STARTUPTYPE);
+       ___locals___.___startupobject___ = startupobject;
+  struct ArrayObject * stringarray=
+               allocate_newarray(&___locals___, STRINGARRAYTYPE, argc-1);
+       ___locals___.___stringarray___ = stringarray;
 #else
-  struct ___StartupObject___ *startupobject=(struct ___StartupObject___*) allocate_new(STARTUPTYPE);
-  struct ArrayObject * stringarray=allocate_newarray(STRINGARRAYTYPE, argc-1);
+  struct ___StartupObject___ *startupobject=
+               (struct ___StartupObject___*) allocate_new(STARTUPTYPE);
+  struct ArrayObject * stringarray=
+               allocate_newarray(STRINGARRAYTYPE, argc-1);
 #endif
   /* Build array of strings */
   startupobject->___parameters___=stringarray;
   for(i=1; i<argc; i++) {
     int length=strlen(argv[i]);
-#ifdef PRECISE_GC
-    struct ___String___ *newstring=NewString(NULL, argv[i],length);
+#ifdef MULTICORE_GC
+    struct ___String___ *newstring=NewString(&___locals___, argv[i],length);
 #else
     struct ___String___ *newstring=NewString(argv[i],length);
 #endif
-    ((void **)(((char *)&stringarray->___length___)+sizeof(int)))[i-1]=newstring;
+    ((void **)(((char *)&stringarray->___length___)+sizeof(int)))[i-1]=
+                       newstring;
   }
 
-  startupobject->isolate = 1;
   startupobject->version = 0;
+  startupobject->lock = NULL;
 
   /* Set initialized flag for startup object */
   flagorandinit(startupobject,1,0xFFFFFFFF);
   enqueueObject(startupobject, NULL, 0);
-#ifdef RAW
-  //flushAll();
-  raw_flush_entire_cache();
+#ifdef CACHEFLUSH
+  BAMBOO_CACHE_FLUSH_ALL();
 #endif
 }
 
@@ -869,7 +689,8 @@ int hashCodetpd(struct taskparamdescriptor *ftd) {
   return hash;
 }
 
-int comparetpd(struct taskparamdescriptor *ftd1, struct taskparamdescriptor *ftd2) {
+int comparetpd(struct taskparamdescriptor *ftd1, 
+                          struct taskparamdescriptor *ftd2) {
   int i;
   if (ftd1->task!=ftd2->task)
     return 0;
@@ -880,178 +701,135 @@ int comparetpd(struct taskparamdescriptor *ftd1, struct taskparamdescriptor *ftd
 }
 
 /* This function sets a tag. */
-#ifdef PRECISE_GC
-void tagset(void *ptr, struct ___Object___ * obj, struct ___TagDescriptor___ * tagd) {
+#ifdef MULTICORE_GC
+void tagset(void *ptr, 
+                       struct ___Object___ * obj, 
+                                               struct ___TagDescriptor___ * tagd) {
 #else
-void tagset(struct ___Object___ * obj, struct ___TagDescriptor___ * tagd) {
+void tagset(struct ___Object___ * obj, 
+                       struct ___TagDescriptor___ * tagd) {
 #endif
   struct ArrayObject * ao=NULL;
   struct ___Object___ * tagptr=obj->___tags___;
-#ifdef RAWDEBUG
-  raw_test_pass(0xebb0);
-#endif
   if (tagptr==NULL) {
-#ifdef RAWDEBUG
-    raw_test_pass(0xebb1);
-#endif
     obj->___tags___=(struct ___Object___ *)tagd;
   } else {
     /* Have to check if it is already set */
     if (tagptr->type==TAGTYPE) {
       struct ___TagDescriptor___ * td=(struct ___TagDescriptor___ *) tagptr;
-#ifdef RAWDEBUG
-      raw_test_pass(0xebb2);
-#endif
       if (td==tagd) {
-#ifdef RAWDEBUG
-       raw_test_pass(0xebb3);
-#endif
        return;
       }
-#ifdef PRECISE_GC
+#ifdef MULTICORE_GC
       int ptrarray[]={2, (int) ptr, (int) obj, (int)tagd};
-      struct ArrayObject * ao=allocate_newarray(&ptrarray,TAGARRAYTYPE,TAGARRAYINTERVAL);
+      struct ArrayObject * ao=
+                               allocate_newarray(&ptrarray,TAGARRAYTYPE,TAGARRAYINTERVAL);
       obj=(struct ___Object___ *)ptrarray[2];
       tagd=(struct ___TagDescriptor___ *)ptrarray[3];
       td=(struct ___TagDescriptor___ *) obj->___tags___;
 #else
-#ifdef RAWDEBUG
-      raw_test_pass(0xebb4);
-#endif
       ao=allocate_newarray(TAGARRAYTYPE,TAGARRAYINTERVAL);
 #endif
-#ifdef RAWDEBUG
-      raw_test_pass(0xebb5);
-#endif
+
       ARRAYSET(ao, struct ___TagDescriptor___ *, 0, td);
       ARRAYSET(ao, struct ___TagDescriptor___ *, 1, tagd);
       obj->___tags___=(struct ___Object___ *) ao;
       ao->___cachedCode___=2;
-#ifdef RAWDEBUG
-      raw_test_pass(0xebb6);
-#endif
     } else {
       /* Array Case */
       int i;
       struct ArrayObject *ao=(struct ArrayObject *) tagptr;
-#ifdef RAWDEBUG
-      raw_test_pass(0xebb7);
-#endif
       for(i=0; i<ao->___cachedCode___; i++) {
-       struct ___TagDescriptor___ * td=ARRAYGET(ao, struct ___TagDescriptor___*, i);
-#ifdef RAWDEBUG
-       raw_test_pass(0xebb8);
-#endif
+       struct ___TagDescriptor___ * td=
+               ARRAYGET(ao, struct ___TagDescriptor___*, i);
        if (td==tagd) {
-#ifdef RAWDEBUG
-         raw_test_pass(0xebb9);
-#endif
          return;
        }
       }
       if (ao->___cachedCode___<ao->___length___) {
-#ifdef RAWDEBUG
-       raw_test_pass(0xebba);
-#endif
        ARRAYSET(ao, struct ___TagDescriptor___ *, ao->___cachedCode___, tagd);
        ao->___cachedCode___++;
-#ifdef RAWDEBUG
-       raw_test_pass(0xebbb);
-#endif
       } else {
-#ifdef PRECISE_GC
+#ifdef MULTICORE_GC
        int ptrarray[]={2,(int) ptr, (int) obj, (int) tagd};
-       struct ArrayObject * aonew=allocate_newarray(&ptrarray,TAGARRAYTYPE,TAGARRAYINTERVAL+ao->___length___);
+       struct ArrayObject * aonew=
+               allocate_newarray(&ptrarray,TAGARRAYTYPE,
+                                             TAGARRAYINTERVAL+ao->___length___);
        obj=(struct ___Object___ *)ptrarray[2];
        tagd=(struct ___TagDescriptor___ *) ptrarray[3];
        ao=(struct ArrayObject *)obj->___tags___;
 #else
-       struct ArrayObject * aonew=allocate_newarray(TAGARRAYTYPE,TAGARRAYINTERVAL+ao->___length___);
-#endif
-#ifdef RAWDEBUG
-       raw_test_pass(0xebbc);
+       struct ArrayObject * aonew=
+               allocate_newarray(TAGARRAYTYPE,TAGARRAYINTERVAL+ao->___length___);
 #endif
+
        aonew->___cachedCode___=ao->___length___+1;
        for(i=0; i<ao->___length___; i++) {
-#ifdef RAWDEBUG
-         raw_test_pass(0xebbd);
-#endif
-         ARRAYSET(aonew, struct ___TagDescriptor___*, i, ARRAYGET(ao, struct ___TagDescriptor___*, i));
+         ARRAYSET(aonew, struct ___TagDescriptor___*, i, 
+                                    ARRAYGET(ao, struct ___TagDescriptor___*, i));
        }
-#ifdef RAWDEBUG
-       raw_test_pass(0xebbe);
-#endif
        ARRAYSET(aonew, struct ___TagDescriptor___ *, ao->___length___, tagd);
-#ifdef RAWDEBUG
-       raw_test_pass(0xebbf);
-#endif
       }
     }
   }
 
   {
     struct ___Object___ * tagset=tagd->flagptr;
-#ifdef RAWDEBUG
-    raw_test_pass(0xb008);
-#endif
     if(tagset==NULL) {
-#ifdef RAWDEBUG
-      raw_test_pass(0xb009);
-#endif
       tagd->flagptr=obj;
     } else if (tagset->type!=OBJECTARRAYTYPE) {
-#ifdef PRECISE_GC
+#ifdef MULTICORE_GC
       int ptrarray[]={2, (int) ptr, (int) obj, (int)tagd};
-      struct ArrayObject * ao=allocate_newarray(&ptrarray,OBJECTARRAYTYPE,OBJECTARRAYINTERVAL);
+      struct ArrayObject * ao=
+                               allocate_newarray(&ptrarray,OBJECTARRAYTYPE,OBJECTARRAYINTERVAL);
       obj=(struct ___Object___ *)ptrarray[2];
       tagd=(struct ___TagDescriptor___ *)ptrarray[3];
 #else
-      struct ArrayObject * ao=allocate_newarray(OBJECTARRAYTYPE,OBJECTARRAYINTERVAL);
+      struct ArrayObject * ao=
+                               allocate_newarray(OBJECTARRAYTYPE,OBJECTARRAYINTERVAL);
 #endif
       ARRAYSET(ao, struct ___Object___ *, 0, tagd->flagptr);
       ARRAYSET(ao, struct ___Object___ *, 1, obj);
       ao->___cachedCode___=2;
       tagd->flagptr=(struct ___Object___ *)ao;
-#ifdef RAWDEBUG
-      raw_test_pass(0xb00a);
-#endif
     } else {
       struct ArrayObject *ao=(struct ArrayObject *) tagset;
       if (ao->___cachedCode___<ao->___length___) {
-#ifdef RAWDEBUG
-       raw_test_pass(0xb00b);
-#endif
        ARRAYSET(ao, struct ___Object___*, ao->___cachedCode___++, obj);
       } else {
        int i;
-#ifdef PRECISE_GC
+#ifdef MULTICORE_GC
        int ptrarray[]={2, (int) ptr, (int) obj, (int)tagd};
-       struct ArrayObject * aonew=allocate_newarray(&ptrarray,OBJECTARRAYTYPE,OBJECTARRAYINTERVAL+ao->___length___);
+       struct ArrayObject * aonew=
+               allocate_newarray(&ptrarray,OBJECTARRAYTYPE,
+                                             OBJECTARRAYINTERVAL+ao->___length___);
        obj=(struct ___Object___ *)ptrarray[2];
        tagd=(struct ___TagDescriptor___ *)ptrarray[3];
        ao=(struct ArrayObject *)tagd->flagptr;
 #else
-       struct ArrayObject * aonew=allocate_newarray(OBJECTARRAYTYPE,OBJECTARRAYINTERVAL);
+       struct ArrayObject * aonew=
+               allocate_newarray(OBJECTARRAYTYPE,OBJECTARRAYINTERVAL+ao->___length___);
 #endif
        aonew->___cachedCode___=ao->___cachedCode___+1;
        for(i=0; i<ao->___length___; i++) {
-         ARRAYSET(aonew, struct ___Object___*, i, ARRAYGET(ao, struct ___Object___*, i));
+         ARRAYSET(aonew, struct ___Object___*, i, 
+                                    ARRAYGET(ao, struct ___Object___*, i));
        }
        ARRAYSET(aonew, struct ___Object___ *, ao->___cachedCode___, obj);
        tagd->flagptr=(struct ___Object___ *) aonew;
-#ifdef RAWDEBUG
-       raw_test_pass(0xb00c);
-#endif
       }
     }
   }
 }
 
 /* This function clears a tag. */
-#ifdef PRECISE_GC
-void tagclear(void *ptr, struct ___Object___ * obj, struct ___TagDescriptor___ * tagd) {
+#ifdef MULTICORE_GC
+void tagclear(void *ptr, 
+                         struct ___Object___ * obj, 
+                                                       struct ___TagDescriptor___ * tagd) {
 #else
-void tagclear(struct ___Object___ * obj, struct ___TagDescriptor___ * tagd) {
+void tagclear(struct ___Object___ * obj, 
+                         struct ___TagDescriptor___ * tagd) {
 #endif
   /* We'll assume that tag is alway there.
      Need to statically check for this of course. */
@@ -1060,30 +838,23 @@ void tagclear(struct ___Object___ * obj, struct ___TagDescriptor___ * tagd) {
   if (tagptr->type==TAGTYPE) {
     if ((struct ___TagDescriptor___ *)tagptr==tagd)
       obj->___tags___=NULL;
-    else
-#ifndef RAW
-      printf("ERROR 1 in tagclear\n");
-#endif
-      ;
   } else {
     struct ArrayObject *ao=(struct ArrayObject *) tagptr;
     int i;
     for(i=0; i<ao->___cachedCode___; i++) {
-      struct ___TagDescriptor___ * td=ARRAYGET(ao, struct ___TagDescriptor___ *, i);
+      struct ___TagDescriptor___ * td=
+                               ARRAYGET(ao, struct ___TagDescriptor___ *, i);
       if (td==tagd) {
        ao->___cachedCode___--;
        if (i<ao->___cachedCode___)
-         ARRAYSET(ao, struct ___TagDescriptor___ *, i, ARRAYGET(ao, struct ___TagDescriptor___ *, ao->___cachedCode___));
+         ARRAYSET(ao, struct ___TagDescriptor___ *, i, 
+                               ARRAYGET(ao, struct ___TagDescriptor___ *, ao->___cachedCode___));
        ARRAYSET(ao, struct ___TagDescriptor___ *, ao->___cachedCode___, NULL);
        if (ao->___cachedCode___==0)
          obj->___tags___=NULL;
        goto PROCESSCLEAR;
       }
     }
-#ifndef RAW
-    printf("ERROR 2 in tagclear\n");
-#endif
-    ;
   }
 PROCESSCLEAR:
   {
@@ -1091,11 +862,6 @@ PROCESSCLEAR:
     if (tagset->type!=OBJECTARRAYTYPE) {
       if (tagset==obj)
        tagd->flagptr=NULL;
-      else
-#ifndef RAW
-       printf("ERROR 3 in tagclear\n");
-#endif
-       ;
     } else {
       struct ArrayObject *ao=(struct ArrayObject *) tagset;
       int i;
@@ -1104,16 +870,14 @@ PROCESSCLEAR:
        if (tobj==obj) {
          ao->___cachedCode___--;
          if (i<ao->___cachedCode___)
-           ARRAYSET(ao, struct ___Object___ *, i, ARRAYGET(ao, struct ___Object___ *, ao->___cachedCode___));
+           ARRAYSET(ao, struct ___Object___ *, i, 
+                                       ARRAYGET(ao, struct ___Object___ *, ao->___cachedCode___));
          ARRAYSET(ao, struct ___Object___ *, ao->___cachedCode___, NULL);
          if (ao->___cachedCode___==0)
            tagd->flagptr=NULL;
          goto ENDCLEAR;
        }
       }
-#ifndef RAW
-      printf("ERROR 4 in tagclear\n");
-#endif
     }
   }
 ENDCLEAR:
@@ -1121,9 +885,12 @@ ENDCLEAR:
 }
 
 /* This function allocates a new tag. */
-#ifdef PRECISE_GC
-struct ___TagDescriptor___ * allocate_tag(void *ptr, int index) {
-  struct ___TagDescriptor___ * v=(struct ___TagDescriptor___ *) mygcmalloc((struct garbagelist *) ptr, classsize[TAGTYPE]);
+#ifdef MULTICORE_GC
+struct ___TagDescriptor___ * allocate_tag(void *ptr, 
+                                                     int index) {
+  struct ___TagDescriptor___ * v=
+               (struct ___TagDescriptor___ *) FREEMALLOC((struct garbagelist *) ptr, 
+                                                                     classsize[TAGTYPE]);
 #else
 struct ___TagDescriptor___ * allocate_tag(int index) {
   struct ___TagDescriptor___ * v=FREEMALLOC(classsize[TAGTYPE]);
@@ -1138,27 +905,32 @@ struct ___TagDescriptor___ * allocate_tag(int index) {
 /* This function updates the flag for object ptr.  It or's the flag
    with the or mask and and's it with the andmask. */
 
-void flagbody(struct ___Object___ *ptr, int flag, struct parameterwrapper ** queues, int length, bool isnew);
+void flagbody(struct ___Object___ *ptr, 
+                         int flag, 
+                                                       struct parameterwrapper ** queues, 
+                                                       int length, 
+                                                       bool isnew);
 
 int flagcomp(const int *val1, const int *val2) {
   return (*val1)-(*val2);
 }
 
-void flagorand(void * ptr, int ormask, int andmask, struct parameterwrapper ** queues, int length) {
+void flagorand(void * ptr, 
+                          int ormask, 
+                                                        int andmask, 
+                                                        struct parameterwrapper ** queues, 
+                                                        int length) {
   {
     int oldflag=((int *)ptr)[1];
     int flag=ormask|oldflag;
     flag&=andmask;
-#ifdef RAWDEBUG
-    raw_test_pass_reg((int)ptr);
-    raw_test_pass(0xaa000000 + oldflag);
-    raw_test_pass(0xaa000000 + flag);
-#endif
     flagbody(ptr, flag, queues, length, false);
   }
 }
 
-bool intflagorand(void * ptr, int ormask, int andmask) {
+bool intflagorand(void * ptr, 
+                             int ormask, 
+                                                                       int andmask) {
   {
     int oldflag=((int *)ptr)[1];
     int flag=ormask|oldflag;
@@ -1172,18 +944,20 @@ bool intflagorand(void * ptr, int ormask, int andmask) {
   }
 }
 
-void flagorandinit(void * ptr, int ormask, int andmask) {
+void flagorandinit(void * ptr, 
+                              int ormask, 
+                                                                        int andmask) {
   int oldflag=((int *)ptr)[1];
   int flag=ormask|oldflag;
   flag&=andmask;
-#ifdef RAWDEBUG
-  raw_test_pass(0xaa100000 + oldflag);
-  raw_test_pass(0xaa100000 + flag);
-#endif
   flagbody(ptr,flag,NULL,0,true);
 }
 
-void flagbody(struct ___Object___ *ptr, int flag, struct parameterwrapper ** vqueues, int vlength, bool isnew) {
+void flagbody(struct ___Object___ *ptr, // object that receives the new flag and is then removed from each queue's objectset
+                         int flag, // value stored into ptr->flag
+                                                       struct parameterwrapper ** vqueues, // queues to purge; may be NULL
+                                                       int vlength, // NOTE(review): presumably the entry count of vqueues; the declaration of `length` is hidden by the hunk boundary below
+                                                       bool isnew) { // when false and no queues were passed, the queues are looked up by core number and object type
   struct parameterwrapper * flagptr = NULL;
   int i = 0;
   struct parameterwrapper ** queues = vqueues;
@@ -1192,2105 +966,1711 @@ void flagbody(struct ___Object___ *ptr, int flag, struct parameterwrapper ** vqu
   int UNUSED, UNUSED2;
   int * enterflags = NULL;
   if((!isnew) && (queues == NULL)) {
-#ifdef THREADSIMULATE
-    int numofcore = pthread_getspecific(key);
-    queues = objectqueues[numofcore][ptr->type];
-    length = numqueues[numofcore][ptr->type];
-#else
-#ifdef RAW
-    if(corenum < NUMCORES) {
-#endif
-    queues = objectqueues[corenum][ptr->type];
-    length = numqueues[corenum][ptr->type];
-#ifdef RAW
-  } else {
-    return;
-  }
-#endif
-#endif
+    if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) { // only core numbers below NUMCORESACTIVE index the queue tables
+               queues = objectqueues[BAMBOO_NUM_OF_CORE][ptr->type]; // queue list for this core and object type
+               length = numqueues[BAMBOO_NUM_OF_CORE][ptr->type]; // matching number of queues
+       } else {
+               return; // other cores leave before ptr->flag is written
+       }
   }
   ptr->flag=flag;
-#ifdef RAWDEBUG
-  raw_test_pass(0xbb000000 + ptr->flag);
-#endif
 
   /*Remove object from all queues */
   for(i = 0; i < length; ++i) {
     flagptr = queues[i];
-    ObjectHashget(flagptr->objectset, (int) ptr, (int *) &next, (int *) &enterflags, &UNUSED, &UNUSED2);
+    ObjectHashget(flagptr->objectset, (int) ptr, (int *) &next, 
+                                         (int *) &enterflags, &UNUSED, &UNUSED2); // NOTE(review): enterflags is not reset per iteration; if ObjectHashget leaves it untouched on a miss, a pointer freed in an earlier iteration would be freed again - confirm
     ObjectHashremove(flagptr->objectset, (int)ptr);
     if (enterflags!=NULL)
       RUNFREE(enterflags);
   }
 }
 
-void enqueueObject(void * vptr, struct parameterwrapper ** vqueues, int vlength) {
-  struct ___Object___ *ptr = (struct ___Object___ *)vptr;
-
-  {
-    struct QueueItem *tmpptr;
-    struct parameterwrapper * parameter=NULL;
-    int j;
-    int i;
-    struct parameterwrapper * prevptr=NULL;
-    struct ___Object___ *tagptr=NULL;
-    struct parameterwrapper ** queues = vqueues;
-    int length = vlength;
-#ifdef RAW
-    if(corenum > NUMCORES - 1) {
-      return;
-    }
-#endif
-    if(queues == NULL) {
-#ifdef THREADSIMULATE
-      int numofcore = pthread_getspecific(key);
-      queues = objectqueues[numofcore][ptr->type];
-      length = numqueues[numofcore][ptr->type];
-#else
-      queues = objectqueues[corenum][ptr->type];
-      length = numqueues[corenum][ptr->type];
-#endif
-    }
-    tagptr=ptr->___tags___;
-
-    /* Outer loop iterates through all parameter queues an object of
-       this type could be in.  */
-    for(j = 0; j < length; ++j) {
-      parameter = queues[j];
-      /* Check tags */
-      if (parameter->numbertags>0) {
-       if (tagptr==NULL)
-         goto nextloop; //that means the object has no tag but that param needs tag
-       else if(tagptr->type==TAGTYPE) { //one tag
-         struct ___TagDescriptor___ * tag=(struct ___TagDescriptor___*) tagptr;
-         for(i=0; i<parameter->numbertags; i++) {
-           //slotid is parameter->tagarray[2*i];
-           int tagid=parameter->tagarray[2*i+1];
-           if (tagid!=tagptr->flag)
-             goto nextloop; /*We don't have this tag */
-         }
-       } else { //multiple tags
-         struct ArrayObject * ao=(struct ArrayObject *) tagptr;
-         for(i=0; i<parameter->numbertags; i++) {
-           //slotid is parameter->tagarray[2*i];
-           int tagid=parameter->tagarray[2*i+1];
-           int j;
-           for(j=0; j<ao->___cachedCode___; j++) {
-             if (tagid==ARRAYGET(ao, struct ___TagDescriptor___*, j)->flag)
-               goto foundtag;
-           }
-           goto nextloop;
+void enqueueObject(void * vptr, // object (used as struct ___Object___ *) to offer to every parameter queue whose tag and flag requirements it meets
+                              struct parameterwrapper ** vqueues, // queues to test; NULL means look them up by core number and object type
+                                                                        int vlength) { // number of entries in vqueues; replaced by the table value when vqueues is NULL
+       struct ___Object___ *ptr = (struct ___Object___ *)vptr;
+       
+       {
+               //struct QueueItem *tmpptr;
+               struct parameterwrapper * parameter=NULL;
+               int j;
+               int i;
+               struct parameterwrapper * prevptr=NULL;
+               struct ___Object___ *tagptr=NULL;
+               struct parameterwrapper ** queues = vqueues;
+               int length = vlength;
+               if(BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1) { // core numbers at or above NUMCORESACTIVE do nothing here
+                       return;
+               }
+               if(queues == NULL) {
+                       queues = objectqueues[BAMBOO_NUM_OF_CORE][ptr->type]; // queue list for this core and object type
+                       length = numqueues[BAMBOO_NUM_OF_CORE][ptr->type]; // matching number of queues
+               }
+               tagptr=ptr->___tags___;
+
+               /* Outer loop iterates through all parameter queues an object of
+                  this type could be in.  */
+               for(j = 0; j < length; ++j) {
+                       parameter = queues[j];     
+                       /* Check tags */
+                       if (parameter->numbertags>0) {
+                               if (tagptr==NULL)
+                                       goto nextloop; //that means the object has no tag 
+                                                //but that param needs tag
+                               else if(tagptr->type==TAGTYPE) { //one tag
+                                       //struct ___TagDescriptor___ * tag=
+                                       //(struct ___TagDescriptor___*) tagptr;  
+                                       for(i=0; i<parameter->numbertags; i++) { // every required tag id must equal the flag of the single attached tag
+                                               //slotid is parameter->tagarray[2*i];
+                                               int tagid=parameter->tagarray[2*i+1];
+                                               if (tagid!=tagptr->flag)
+                                                       goto nextloop; /*We don't have this tag */
+                                       }
+                               } else { //multiple tags
+                                       struct ArrayObject * ao=(struct ArrayObject *) tagptr;
+                                       for(i=0; i<parameter->numbertags; i++) {
+                                               //slotid is parameter->tagarray[2*i];
+                                               int tagid=parameter->tagarray[2*i+1];
+                                               int j; // shadows the outer queue index j, so the scan below leaves the outer loop counter untouched
+                                               for(j=0; j<ao->___cachedCode___; j++) { // ___cachedCode___ is used as the count of tag entries to scan
+                                                       if (tagid==ARRAYGET(ao, struct ___TagDescriptor___*, j)->flag)
+                                                               goto foundtag;
+                                               }
+                                               goto nextloop; // none of the scanned tags carries this required id
 foundtag:
-           ;
-         }
-       }
-      }
-
-      /* Check flags */
-      for(i=0; i<parameter->numberofterms; i++) {
-       int andmask=parameter->intarray[i*2];
-       int checkmask=parameter->intarray[i*2+1];
-       if ((ptr->flag&andmask)==checkmask) {
-#ifdef RAWDEBUG
-         raw_test_pass(0xcc000000 + andmask);
-         raw_test_pass_reg((int)ptr);
-         raw_test_pass(0xcc000000 + ptr->flag);
-         raw_test_pass(0xcc000000 + checkmask);
-#endif
-         enqueuetasks(parameter, prevptr, ptr, NULL, 0);
-         prevptr=parameter;
-         break;
-       }
-      }
+                                               ;
+                                       }
+                               }
+                       }
+       
+                       /* Check flags */
+                       for(i=0; i<parameter->numberofterms; i++) { // intarray holds (andmask, checkmask) pairs, one pair per term
+                               int andmask=parameter->intarray[i*2];
+                               int checkmask=parameter->intarray[i*2+1];
+                               if ((ptr->flag&andmask)==checkmask) {
+                                       enqueuetasks(parameter, prevptr, ptr, NULL, 0); // prevptr is the previously matched queue, or NULL for the first match
+                                       prevptr=parameter;
+                                       break; // the first matching term is enough for this queue
+                               }
+                       }
 nextloop:
-      ;
-    }
-  }
+                       ;
+               }
+       }
 }
 
-#ifdef RAW
-void enqueueObject_I(void * vptr, struct parameterwrapper ** vqueues, int vlength) {
-  struct ___Object___ *ptr = (struct ___Object___ *)vptr;
-
-  {
-    struct QueueItem *tmpptr;
-    struct parameterwrapper * parameter=NULL;
-    int j;
-    int i;
-    struct parameterwrapper * prevptr=NULL;
-    struct ___Object___ *tagptr=NULL;
-    struct parameterwrapper ** queues = vqueues;
-    int length = vlength;
-#ifdef RAW
-    if(corenum > NUMCORES - 1) {
-      return;
-    }
-#endif
-    if(queues == NULL) {
-#ifdef THREADSIMULATE
-      int numofcore = pthread_getspecific(key);
-      queues = objectqueues[numofcore][ptr->type];
-      length = numqueues[numofcore][ptr->type];
-#else
-      queues = objectqueues[corenum][ptr->type];
-      length = numqueues[corenum][ptr->type];
-#endif
-    }
-#ifdef RAWDEBUG
-    raw_test_pass(0xeaa1);
-    raw_test_pass_reg(queues);
-    raw_test_pass_reg(length);
-#endif
-    tagptr=ptr->___tags___;
-
-    /* Outer loop iterates through all parameter queues an object of
-       this type could be in.  */
-    for(j = 0; j < length; ++j) {
-      parameter = queues[j];
-      /* Check tags */
-      if (parameter->numbertags>0) {
-#ifdef RAWDEBUG
-       raw_test_pass(0xeaa2);
-       raw_test_pass_reg(tagptr);
-#endif
-       if (tagptr==NULL)
-         goto nextloop; //that means the object has no tag but that param needs tag
-       else if(tagptr->type==TAGTYPE) { //one tag
-         struct ___TagDescriptor___ * tag=(struct ___TagDescriptor___*) tagptr;
-#ifdef RAWDEBUG
-         raw_test_pass(0xeaa3);
-#endif
-         for(i=0; i<parameter->numbertags; i++) {
-           //slotid is parameter->tagarray[2*i];
-           int tagid=parameter->tagarray[2*i+1];
-           if (tagid!=tagptr->flag) {
-#ifdef RAWDEBUG
-             raw_test_pass(0xeaa4);
-#endif
-             goto nextloop; /*We don't have this tag */
-           }
-         }
-       } else { //multiple tags
-         struct ArrayObject * ao=(struct ArrayObject *) tagptr;
-#ifdef RAWDEBUG
-         raw_test_pass(0xeaa5);
-#endif
-         for(i=0; i<parameter->numbertags; i++) {
-           //slotid is parameter->tagarray[2*i];
-           int tagid=parameter->tagarray[2*i+1];
-           int j;
-           for(j=0; j<ao->___cachedCode___; j++) {
-             if (tagid==ARRAYGET(ao, struct ___TagDescriptor___*, j)->flag) {
-               goto foundtag;
-             }
-           }
-#ifdef RAWDEBUG
-           raw_test_pass(0xeaa6);
-#endif
-           goto nextloop;
+void enqueueObject_I(void * vptr, // same visible matching logic as enqueueObject, but matches are handed to enqueuetasks_I; NOTE(review): the meaning of the _I suffix is not shown here
+                                struct parameterwrapper ** vqueues, // queues to test; NULL means look them up by core number and object type
+                                                                                int vlength) { // number of entries in vqueues; replaced by the table value when vqueues is NULL
+       struct ___Object___ *ptr = (struct ___Object___ *)vptr;
+       
+       {
+               //struct QueueItem *tmpptr;
+               struct parameterwrapper * parameter=NULL;
+               int j;
+               int i;
+               struct parameterwrapper * prevptr=NULL;
+               struct ___Object___ *tagptr=NULL;
+               struct parameterwrapper ** queues = vqueues;
+               int length = vlength;
+               if(BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1) { // core numbers at or above NUMCORESACTIVE do nothing here
+                       return;
+               }
+               if(queues == NULL) {
+                       queues = objectqueues[BAMBOO_NUM_OF_CORE][ptr->type]; // queue list for this core and object type
+                       length = numqueues[BAMBOO_NUM_OF_CORE][ptr->type]; // matching number of queues
+               }
+               tagptr=ptr->___tags___;
+
+               /* Outer loop iterates through all parameter queues an object of
+                  this type could be in.  */
+               for(j = 0; j < length; ++j) {
+                       parameter = queues[j];     
+                       /* Check tags */
+                       if (parameter->numbertags>0) {
+                               if (tagptr==NULL)
+                                       goto nextloop; //that means the object has no tag 
+                                                //but that param needs tag
+                               else if(tagptr->type==TAGTYPE) { //one tag
+                                       //struct ___TagDescriptor___ * tag=(struct ___TagDescriptor___*) tagptr;         
+                                       for(i=0; i<parameter->numbertags; i++) { // every required tag id must equal the flag of the single attached tag
+                                               //slotid is parameter->tagarray[2*i];
+                                               int tagid=parameter->tagarray[2*i+1];
+                                               if (tagid!=tagptr->flag)
+                                                       goto nextloop; /*We don't have this tag */
+                                       }
+                               } else { //multiple tags
+                                       struct ArrayObject * ao=(struct ArrayObject *) tagptr;
+                                       for(i=0; i<parameter->numbertags; i++) {
+                                               //slotid is parameter->tagarray[2*i];
+                                               int tagid=parameter->tagarray[2*i+1];
+                                               int j; // shadows the outer queue index j, so the scan below leaves the outer loop counter untouched
+                                               for(j=0; j<ao->___cachedCode___; j++) { // ___cachedCode___ is used as the count of tag entries to scan
+                                                       if (tagid==ARRAYGET(ao, struct ___TagDescriptor___*, j)->flag)
+                                                               goto foundtag;
+                                               }
+                                               goto nextloop; // none of the scanned tags carries this required id
 foundtag:
-           ;
-         }
-       }
-      }
-
-      /* Check flags */
-      for(i=0; i<parameter->numberofterms; i++) {
-       int andmask=parameter->intarray[i*2];
-       int checkmask=parameter->intarray[i*2+1];
-#ifdef RAWDEBUG
-       raw_test_pass(0xeaa7);
-       raw_test_pass(0xcc000000 + andmask);
-       raw_test_pass_reg(ptr);
-       raw_test_pass(0xcc000000 + ptr->flag);
-       raw_test_pass(0xcc000000 + checkmask);
-#endif
-       if ((ptr->flag&andmask)==checkmask) {
-#ifdef RAWDEBUG
-         raw_test_pass(0xeaa8);
-#endif
-         enqueuetasks_I(parameter, prevptr, ptr, NULL, 0);
-         prevptr=parameter;
-         break;
-       }
-      }
+                                               ;
+                                       }
+                               }
+                       }
+
+                       /* Check flags */
+                       for(i=0; i<parameter->numberofterms; i++) { // intarray holds (andmask, checkmask) pairs, one pair per term
+                               int andmask=parameter->intarray[i*2];
+                               int checkmask=parameter->intarray[i*2+1];
+                               if ((ptr->flag&andmask)==checkmask) {
+                                       enqueuetasks_I(parameter, prevptr, ptr, NULL, 0); // prevptr is the previously matched queue, or NULL for the first match
+                                       prevptr=parameter;
+                                       break; // the first matching term is enough for this queue
+                               }
+                       }
 nextloop:
-      ;
-    }
-  }
-}
-
-// helper function to compute the coordinates of a core from the core number
-void calCoords(int core_num, int* coordY, int* coordX) {
-  *coordX = core_num % 4;
-  *coordY = core_num / 4;
+                       ;
+               }
+       }
 }
-#endif
 
-/* Message format for RAW version:
- *      type + Msgbody
- * type: 0 -- transfer object
- *       1 -- transfer stall msg
- *       2 -- lock request
- *       3 -- lock grount
- *       4 -- lock deny
- *       5 -- lock release
- *       6 -- transfer profile output msg
- *       7 -- transfer profile ouput finish msg
- *
- * ObjMsg: 0 + size of msg + obj's address + (task index + param index)+
- * StallMsg: 1 + corenum + sendobjs + receiveobjs (size is always 4 * sizeof(int))
- * LockMsg: 2 + lock type + obj pointer + request core (size is always 4 * sizeof(int))
- *          3/4/5 + lock type + obj pointer (size is always 3 * sizeof(int))
- *          lock type: 0 -- read; 1 -- write
- * ProfileMsg: 6 + totalexetime (size is always 2 * sizeof(int))
- *             7 + corenum (size is always 2 * sizeof(int))
- */
-
-// transfer an object to targetcore
-// format: object
-void transferObject(struct transObjInfo * transObj) {
-  void * obj = transObj->objptr;
-  int type=((int *)obj)[0];
-  int size=classsize[type];
-  int targetcore = transObj->targetcore;
-  //assert(type < NUMCLASSES); // can only transfer normal object
-
-#ifdef RAW
-  unsigned msgHdr;
-  int self_y, self_x, target_y, target_x;
-  //int isshared = 0;
-  // for 32 bit machine, the size of fixed part is always 3 words
-  //int msgsize = sizeof(int) * 2 + sizeof(void *);
-  int msgsize = 3 + transObj->length * 2;
-  int i = 0;
 
-  struct ___Object___ * newobj = (struct ___Object___ *)obj;
-  /*if(0 == newobj->isolate) {
-          isshared = 1;
-     }*/
-
-  calCoords(corenum, &self_y, &self_x);
-  calCoords(targetcore, &target_y, &target_x);
-  // Build the message header
-  msgHdr = construct_dyn_hdr(0, msgsize, 0,             // msgsize word sent.
-                             self_y, self_x,
-                             target_y, target_x);
-  // start sending msg, set sand msg flag
-  isMsgSending = true;
-  gdn_send(msgHdr);                     // Send the message header to EAST to handle fab(n - 1).
-#ifdef RAWDEBUG
-  raw_test_pass(0xbbbb);
-  raw_test_pass(0xb000 + targetcore);       // targetcore
-#endif
-  gdn_send(0);
-#ifdef RAWDEBUG
-  raw_test_pass(0);
-#endif
-  gdn_send(msgsize);
-#ifdef RAWDEBUG
-  raw_test_pass_reg(msgsize);
-#endif
-  gdn_send(obj);
-#ifdef RAWDEBUG
-  raw_test_pass_reg(obj);
-#endif
-  for(i = 0; i < transObj->length; ++i) {
-    int taskindex = transObj->queues[2*i];
-    int paramindex = transObj->queues[2*i+1];
-    gdn_send(taskindex);
-#ifdef RAWDEBUG
-    raw_test_pass_reg(taskindex);
-#endif
-    gdn_send(paramindex);
-#ifdef RAWDEBUG
-    raw_test_pass_reg(paramindex);
-#endif
-  }
-#ifdef RAWDEBUG
-  raw_test_pass(0xffff);
-#endif
-  ++(self_numsendobjs);
-  // end of sending this msg, set sand msg flag false
-  isMsgSending = false;
-  // check if there are pending msgs
-  while(isMsgHanging) {
-    // get the msg from outmsgdata[]
-    // length + target + msg
-    outmsgleft = outmsgdata[outmsgindex++];
-    targetcore = outmsgdata[outmsgindex++];
-    calCoords(targetcore, &target_y, &target_x);
-    // Build the message header
-    msgHdr = construct_dyn_hdr(0, outmsgleft, 0,                        // msgsize word sent.
-                               self_y, self_x,
-                               target_y, target_x);
-    isMsgSending = true;
-    gdn_send(msgHdr);                           // Send the message header to EAST to handle fab(n - 1).
-#ifdef RAWDEBUG
-    raw_test_pass(0xbbbb);
-    raw_test_pass(0xb000 + targetcore);             // targetcore
-#endif
-    while(outmsgleft-- > 0) {
-      gdn_send(outmsgdata[outmsgindex++]);
-#ifdef RAWDEBUG
-      raw_test_pass_reg(outmsgdata[outmsgindex - 1]);
-#endif
-    }
-#ifdef RAWDEBUG
-    raw_test_pass(0xffff);
-#endif
-    isMsgSending = false;
-#ifdef INTERRUPT
-    raw_user_interrupts_off();
-#endif
-    // check if there are still msg hanging
-    if(outmsgindex == outmsglast) {
-      // no more msgs
-      outmsgindex = outmsglast = 0;
-      isMsgHanging = false;
-    }
-#ifdef INTERRUPT
-    raw_user_interrupts_on();
-#endif
-  }
-#elif defined THREADSIMULATE
-  int numofcore = pthread_getspecific(key);
-
-  // use POSIX message queue to transfer objects between cores
-  mqd_t mqdnum;
-  char corenumstr[3];
-  int sourcelen = 0;
-  if(targetcore < 10) {
-    corenumstr[0] = targetcore + '0';
-    corenumstr[1] = '\0';
-    sourcelen = 1;
-  } else if(targetcore < 100) {
-    corenumstr[1] = targetcore % 10 + '0';
-    corenumstr[0] = (targetcore / 10) + '0';
-    corenumstr[2] = '\0';
-    sourcelen = 2;
-  } else {
-    printf("Error: targetcore >= 100\n");
-    fflush(stdout);
-    exit(-1);
-  }
-  char * pathhead = "/msgqueue_";
-  int targetlen = strlen(pathhead);
-  char path[targetlen + sourcelen + 1];
-  strcpy(path, pathhead);
-  strncat(path, corenumstr, sourcelen);
-  int oflags = O_WRONLY|O_NONBLOCK;
-  int omodes = S_IRWXU|S_IRWXG|S_IRWXO;
-  mqdnum = mq_open(path, oflags, omodes, NULL);
-  if(mqdnum==-1) {
-    printf("[transferObject, %d] mq_open %s fail: %d, error: %s\n", numofcore, path, mqdnum, strerror(errno));
-    fflush(stdout);
-    exit(-1);
-  }
-  /*struct ___Object___ * newobj = (struct ___Object___ *)obj;
-     if(0 == newobj->isolate) {
-          newobj = RUNMALLOC(size);
-          memcpy(newobj, obj, size);
-          newobj->original=obj;
-     }*/
-  struct transObjInfo * tmptransObj = RUNMALLOC(sizeof(struct transObjInfo));
-  memcpy(tmptransObj, transObj, sizeof(struct transObjInfo));
-  int * tmpqueue = RUNMALLOC(sizeof(int)*2*tmptransObj->length);
-  memcpy(tmpqueue, tmptransObj->queues, sizeof(int)*2*tmptransObj->length);
-  tmptransObj->queues = tmpqueue;
-  struct ___Object___ * newobj = RUNMALLOC(sizeof(struct ___Object___));
-  newobj->type = ((struct ___Object___ *)obj)->type;
-  newobj->original = (struct ___Object___ *)tmptransObj;
-  int ret;
-  do {
-    ret=mq_send(mqdnum, (void *)newobj, sizeof(struct ___Object___), 0);             // send the object into the queue
-    if(ret != 0) {
-      printf("[transferObject, %d] mq_send to %s returned: %d, error: %s\n", numofcore, path, ret, strerror(errno));
-    }
-  } while(ret!=0);
-  RUNFREE(newobj);
-  if(numofcore == STARTUPCORE) {
-    ++numsendobjs[numofcore];
-  } else {
-    ++(thread_data_array[numofcore].numsendobjs);
-  }
-  printf("[transferObject, %d] mq_send to %s returned: $%x\n", numofcore, path, ret);
-#endif
+int * getAliasLock(void ** ptrs, // objects (used as struct ___Object___ *) that should end up sharing one lock value
+                              int length, // number of entries in ptrs; 0 yields a freshly allocated lock word
+                                                                        struct RuntimeHash * tbl) { // redirect table: lock value -> lock value it is redirected to; may gain entries
+       if(length == 0) {
+               return (int*)(RUNMALLOC(sizeof(int))); // no objects: the new int-sized allocation itself is returned as the lock
+       } else {
+               int i = 0;
+               int locks[length]; // distinct lock values seen so far, kept in ascending order
+               int locklen = 0;
+               bool redirect = false; // set once any lock is found in tbl
+               int redirectlock = 0; // the tbl value found for that lock
+               for(; i < length; i++) {
+                       struct ___Object___ * ptr = (struct ___Object___ *)(ptrs[i]);
+                       int lock = 0;
+                       int j = 0;
+                       if(ptr->lock == NULL) {
+                               lock = (int)(ptr); // no lock field set: the object's own address is its lock value; NOTE(review): assumes a pointer fits in int
+                       } else {
+                               lock = (int)(ptr->lock);
+                       }
+                       if(redirect) {
+                               if(lock != redirectlock) {
+                                       RuntimeHashadd(tbl, lock, redirectlock); // NOTE(review): no containskey check on this path, so the same key may be added more than once - confirm RuntimeHashadd tolerates that
+                               }
+                       } else {
+                               if(RuntimeHashcontainskey(tbl, lock)) {
+                                       // already redirected
+                                       redirect = true;
+                                       RuntimeHashget(tbl, lock, &redirectlock);
+                                       for(; j < locklen; j++) { // map every lock collected so far to the redirect target
+                                               if(locks[j] != redirectlock) {
+                                                       RuntimeHashadd(tbl, locks[j], redirectlock);
+                                               }
+                                       }
+                               } else {
+                                       bool insert = true;
+                                       for(j = 0; j < locklen; j++) { // find the sorted position; stop early on a duplicate
+                                               if(locks[j] == lock) {
+                                                       insert = false;
+                                                       break;
+                                               } else if(locks[j] > lock) {
+                                                       break;
+                                               }
+                                       }
+                                       if(insert) {
+                                               int h = locklen;
+                                               for(; h > j; h--) { // shift larger entries up by one to open slot j
+                                                       locks[h] = locks[h-1];
+                                               }       
+                                               locks[j] = lock;
+                                               locklen++;
+                                       }
+                               }
+                       }
+               }
+               if(redirect) {
+                       return (int *)redirectlock;
+               } else {
+                       return (int *)(locks[0]); // no redirect found: the smallest collected lock value is returned
+               }
+       }
 }
 
-// send terminate message to targetcore
-// format: -1
-bool transStallMsg(int targetcore) {
-#ifdef RAW
-  unsigned msgHdr;
-  int self_y, self_x, target_y, target_x;
-  // for 32 bit machine, the size is always 4 words
-  //int msgsize = sizeof(int) * 4;
-  int msgsize = 4;
-
-  calCoords(corenum, &self_y, &self_x);
-  calCoords(targetcore, &target_y, &target_x);
-  // Build the message header
-  msgHdr = construct_dyn_hdr(0, msgsize, 0,             // msgsize word sent.
-                             self_y, self_x,
-                             target_y, target_x);
-  // start sending msgs, set msg sending flag
-  isMsgSending = true;
-  gdn_send(msgHdr);                     // Send the message header to EAST to handle fab(n - 1).
-#ifdef RAWDEBUG
-  raw_test_pass(0xbbbb);
-  raw_test_pass(0xb000 + targetcore);       // targetcore
-#endif
-  gdn_send(1);
-#ifdef RAWDEBUG
-  raw_test_pass(1);
-#endif
-  gdn_send(corenum);
-#ifdef RAWDEBUG
-  raw_test_pass_reg(corenum);
-#endif
-  gdn_send(self_numsendobjs);
-#ifdef RAWDEBUG
-  raw_test_pass_reg(self_numsendobjs);
-#endif
-  gdn_send(self_numreceiveobjs);
-#ifdef RAWDEBUG
-  raw_test_pass_reg(self_numreceiveobjs);
-  raw_test_pass(0xffff);
-#endif
-  // end of sending this msg, set sand msg flag false
-  isMsgSending = false;
-  // check if there are pending msgs
-  while(isMsgHanging) {
-    // get the msg from outmsgdata[]
-    // length + target + msg
-    outmsgleft = outmsgdata[outmsgindex++];
-    targetcore = outmsgdata[outmsgindex++];
-    calCoords(targetcore, &target_y, &target_x);
-    // Build the message header
-    msgHdr = construct_dyn_hdr(0, outmsgleft, 0,                        // msgsize word sent.
-                               self_y, self_x,
-                               target_y, target_x);
-    isMsgSending = true;
-    gdn_send(msgHdr);                           // Send the message header to EAST to handle fab(n - 1).
-#ifdef RAWDEBUG
-    raw_test_pass(0xbbbb);
-    raw_test_pass(0xb000 + targetcore);             // targetcore
-#endif
-    while(outmsgleft-- > 0) {
-      gdn_send(outmsgdata[outmsgindex++]);
-#ifdef RAWDEBUG
-      raw_test_pass_reg(outmsgdata[outmsgindex - 1]);
-#endif
-    }
-#ifdef RAWDEBUG
-    raw_test_pass(0xffff);
-#endif
-    isMsgSending = false;
-#ifdef INTERRUPT
-    raw_user_interrupts_off();
-#endif
-    // check if there are still msg hanging
-    if(outmsgindex == outmsglast) {
-      // no more msgs
-      outmsgindex = outmsglast = 0;
-      isMsgHanging = false;
-    }
-#ifdef INTERRUPT
-    raw_user_interrupts_on();
-#endif
+void addAliasLock(void * ptr, // object (used as struct ___Object___ *) whose lock field may be overwritten
+                             int lock) { // lock value to install, compared against pointers as an int
+  struct ___Object___ * obj = (struct ___Object___ *)ptr;
+  if(((int)ptr != lock) && (obj->lock != (int*)lock)) { // skipped when lock is the object's own address or is already installed
+    // originally no alias lock associated or have a different alias lock
+    // flush it as the new one
+    obj->lock = (int *)lock;
   }
-  return true;
-#elif defined THREADSIMULATE
-  struct ___Object___ *newobj = RUNMALLOC(sizeof(struct ___Object___));
-  // use the first four int field to hold msgtype/corenum/sendobj/receiveobj
-  newobj->type = -1;
-  int numofcore = pthread_getspecific(key);
-  newobj->flag = numofcore;
-  newobj->___cachedHash___ = thread_data_array[numofcore].numsendobjs;
-  newobj->___cachedCode___ = thread_data_array[numofcore].numreceiveobjs;
-
-  // use POSIX message queue to send stall msg to startup core
-  assert(targetcore == STARTUPCORE);
-  mqd_t mqdnum;
-  char corenumstr[3];
-  int sourcelen = 0;
-  if(targetcore < 10) {
-    corenumstr[0] = targetcore + '0';
-    corenumstr[1] = '\0';
-    sourcelen = 1;
-  } else if(targetcore < 100) {
-    corenumstr[1] = targetcore % 10 + '0';
-    corenumstr[0] = (targetcore / 10) + '0';
-    corenumstr[2] = '\0';
-    sourcelen = 2;
-  } else {
-    printf("Error: targetcore >= 100\n");
-    fflush(stdout);
-    exit(-1);
-  }
-  char * pathhead = "/msgqueue_";
-  int targetlen = strlen(pathhead);
-  char path[targetlen + sourcelen + 1];
-  strcpy(path, pathhead);
-  strncat(path, corenumstr, sourcelen);
-  int oflags = O_WRONLY|O_NONBLOCK;
-  int omodes = S_IRWXU|S_IRWXG|S_IRWXO;
-  mqdnum = mq_open(path, oflags, omodes, NULL);
-  if(mqdnum==-1) {
-    printf("[transStallMsg, %d] mq_open %s fail: %d, error: %s\n", numofcore, path, mqdnum, strerror(errno));
-    fflush(stdout);
-    exit(-1);
-  }
-  int ret;
-  ret=mq_send(mqdnum, (void *)newobj, sizeof(struct ___Object___), 0);       // send the object into the queue
-  if(ret != 0) {
-    printf("[transStallMsg, %d] mq_send to %s returned: %d, error: %s\n", numofcore, path, ret, strerror(errno));
-    RUNFREE(newobj);
-    return false;
-  } else {
-    printf("[transStallMsg, %d] mq_send to %s returned: $%x\n", numofcore, path, ret);
-    printf("<transStallMsg> to %s index: %d, sendobjs: %d, receiveobjs: %d\n", path, newobj->flag, newobj->___cachedHash___, newobj->___cachedCode___);
-    RUNFREE(newobj);
-    return true;
-  }
-#endif
 }
 
-#ifdef RAWPROFILE
-// send profile request message to targetcore
-// format: 6
-bool transProfileRequestMsg(int targetcore) {
-  unsigned msgHdr;
-  int self_y, self_x, target_y, target_x;
-  // for 32 bit machine, the size is always 4 words
-  //int msgsize = sizeof(int) * 4;
-  int msgsize = 2;
-
-  calCoords(corenum, &self_y, &self_x);
-  calCoords(targetcore, &target_y, &target_x);
-  // Build the message header
-  msgHdr = construct_dyn_hdr(0, msgsize, 0,             // msgsize word sent.
-                             self_y, self_x,
-                             target_y, target_x);
-  // start sending msgs, set msg sending flag
-  isMsgSending = true;
-  gdn_send(msgHdr);                     // Send the message header to EAST to handle fab(n - 1).
-#ifdef RAWDEBUG
-  raw_test_pass(0xbbbb);
-  raw_test_pass(0xb000 + targetcore);       // targetcore
-#endif
-  gdn_send(6);
-#ifdef RAWDEBUG
-  raw_test_pass(6);
-#endif
-  gdn_send(totalexetime);
-#ifdef RAWDEBUG
-  raw_test_pass_reg(totalexetime);
-  raw_test_pass(0xffff);
-#endif
-  // end of sending this msg, set sand msg flag false
-  isMsgSending = false;
-  // check if there are pending msgs
-  while(isMsgHanging) {
-    // get the msg from outmsgdata[]
-    // length + target + msg
-    outmsgleft = outmsgdata[outmsgindex++];
-    targetcore = outmsgdata[outmsgindex++];
-    calCoords(targetcore, &target_y, &target_x);
-    // Build the message header
-    msgHdr = construct_dyn_hdr(0, outmsgleft, 0,                        // msgsize word sent.
-                               self_y, self_x,
-                               target_y, target_x);
-    isMsgSending = true;
-    gdn_send(msgHdr);
-#ifdef RAWDEBUG
-    raw_test_pass(0xbbbb);
-    raw_test_pass(0xb000 + targetcore);             // targetcore
-#endif
-    while(outmsgleft-- > 0) {
-      gdn_send(outmsgdata[outmsgindex++]);
-#ifdef RAWDEBUG
-      raw_test_pass_reg(outmsgdata[outmsgindex - 1]);
-#endif
-    }
-#ifdef RAWDEBUG
-    raw_test_pass(0xffff);
-#endif
-    isMsgSending = false;
-#ifdef INTERRUPT
-    raw_user_interrupts_off();
-#endif
-    // check if there are still msg hanging
-    if(outmsgindex == outmsglast) {
-      // no more msgs
-      outmsgindex = outmsglast = 0;
-      isMsgHanging = false;
-    }
-#ifdef INTERRUPT
-    raw_user_interrupts_on();
-#endif
-  }
-  return true;
+#ifdef PROFILE
+inline void setTaskExitIndex(int index) { // PROFILE builds only: store `index` as the exitIndex of the task-info record at taskInfoArray[taskInfoIndex]
+       taskInfoArray[taskInfoIndex]->exitIndex = index; // NOTE(review): the slot is dereferenced without a NULL or range check -- presumably the caller guarantees a live record; confirm
 }
 
-// output the profiling data
-void outputProfileData() {
-#ifdef RAWUSEIO
-  FILE * fp;
-  char fn[50];
-  int self_y, self_x;
-  char c_y, c_x;
-  int i;
-  int totaltasktime = 0;
-  int preprocessingtime = 0;
-  int objqueuecheckingtime = 0;
-  int postprocessingtime = 0;
-  //int interruptiontime = 0;
-  int other = 0;
-  int averagetasktime = 0;
-  int tasknum = 0;
-
-  for(i = 0; i < 50; i++) {
-    fn[i] = 0;
-  }
-
-  calCoords(corenum, &self_y, &self_x);
-  c_y = (char)self_y + '0';
-  c_x = (char)self_x + '0';
-  strcat(fn, "profile_");
-  strcat(fn, &c_x);
-  strcat(fn, "_");
-  strcat(fn, &c_y);
-  strcat(fn, ".rst");
-
-  if((fp = fopen(fn, "w+")) == NULL) {
-    fprintf(stderr, "fopen error\n");
-    return -1;
-  }
-
-  fprintf(fp, "Task Name, Start Time, End Time, Duration\n");
-  // output task related info
-  for(i = 0; i < taskInfoIndex; i++) {
-    TaskInfo* tmpTInfo = taskInfoArray[i];
-    int duration = tmpTInfo->endTime - tmpTInfo->startTime;
-    fprintf(fp, "%s, %d, %d, %d\n", tmpTInfo->taskName, tmpTInfo->startTime, tmpTInfo->endTime, duration);
-    if(strcmp(tmpTInfo->taskName, "tpd checking") == 0) {
-      preprocessingtime += duration;
-    } else if(strcmp(tmpTInfo->taskName, "post task execution") == 0) {
-      postprocessingtime += duration;
-    } else if(strcmp(tmpTInfo->taskName, "objqueue checking") == 0) {
-      objqueuecheckingtime += duration;
-    } else {
-      totaltasktime += duration;
-      averagetasktime += duration;
-      tasknum++;
-    }
-  }
-
-  if(taskInfoOverflow) {
-    fprintf(stderr, "Caution: task info overflow!\n");
-  }
-
-  other = totalexetime - totaltasktime - preprocessingtime - postprocessingtime;
-  averagetasktime /= tasknum;
-
-  fprintf(fp, "\nTotal time: %d\n", totalexetime);
-  fprintf(fp, "Total task execution time: %d (%f%%)\n", totaltasktime, ((double)totaltasktime/(double)totalexetime)*100);
-  fprintf(fp, "Total objqueue checking time: %d (%f%%)\n", objqueuecheckingtime, ((double)objqueuecheckingtime/(double)totalexetime)*100);
-  fprintf(fp, "Total pre-processing time: %d (%f%%)\n", preprocessingtime, ((double)preprocessingtime/(double)totalexetime)*100);
-  fprintf(fp, "Total post-processing time: %d (%f%%)\n", postprocessingtime, ((double)postprocessingtime/(double)totalexetime)*100);
-  fprintf(fp, "Other time: %d (%f%%)\n", other, ((double)other/(double)totalexetime)*100);
-
-  fprintf(fp, "\nAverage task execution time: %d\n", averagetasktime);
-
-  fclose(fp);
-#else
-     int i = 0;
-     int j = 0;
-
-     raw_test_pass(0xdddd);
-     // output task related info
-     for(i= 0; i < taskInfoIndex; i++) {
-          TaskInfo* tmpTInfo = taskInfoArray[i];
-          char* tmpName = tmpTInfo->taskName;
-          int nameLen = strlen(tmpName);
-          raw_test_pass(0xddda);
-          for(j = 0; j < nameLen; j++) {
-                  raw_test_pass_reg(tmpName[j]);
-          }
-          raw_test_pass(0xdddb);
-          raw_test_pass_reg(tmpTInfo->startTime);
-          raw_test_pass_reg(tmpTInfo->endTime);
-          raw_test_pass(0xdddc);
-     }
-
-     if(taskInfoOverflow) {
-          raw_test_pass(0xefee);
-     }
-
-     // output interrupt related info
-     /*for(i = 0; i < interruptInfoIndex; i++) {
-          InterruptInfo* tmpIInfo = interruptInfoArray[i];
-          raw_test_pass(0xddde);
-          raw_test_pass_reg(tmpIInfo->startTime);
-          raw_test_pass_reg(tmpIInfo->endTime);
-          raw_test_pass(0xdddf);
-     }
-
-     if(interruptInfoOverflow) {
-          raw_test_pass(0xefef);
-     }*/
-
-     raw_test_pass(0xeeee);
-#endif
+inline void addNewObjInfo(void * nobj) { // PROFILE builds only: append nobj to the newObjs queue of the task-info record at taskInfoArray[taskInfoIndex]
+       if(taskInfoArray[taskInfoIndex]->newObjs == NULL) { // the queue is created lazily on the first object recorded for this record
+               taskInfoArray[taskInfoIndex]->newObjs = createQueue();
+       }
+       addNewItem(taskInfoArray[taskInfoIndex]->newObjs, nobj); // NOTE(review): the record pointer itself is not checked before use -- confirm the caller guarantees it
 }
 #endif
 
-// receive object transferred from other cores
-// or the terminate message from other cores
-// NOTICE: following format is for threadsimulate version only
-//         RAW version please see previous description
-// format: type + object
-// type: -1--stall msg
-//      !-1--object
-// return value: 0--received an object
-//               1--received nothing
-//               2--received a Stall Msg
-//               3--received a lock Msg
-//               RAW version: -1 -- received nothing
-//                            otherwise -- received msg type
-int receiveObject() {
-#ifdef RAW
-  bool deny = false;
-  unsigned msgHdr;
-  int self_y, self_x, target_y, target_x;
-  int targetcore = 0;
-  if(gdn_input_avail() == 0) {
-#ifdef RAWDEBUG
-    if(corenum < NUMCORES) {
-      raw_test_pass(0xd001);
-    }
-#endif
-    return -1;
-  }
-#ifdef RAWPROFILE
-  /*if(isInterrupt && (!interruptInfoOverflow)) {
-    // raw_test_pass(0xffff);
-    interruptInfoArray[interruptInfoIndex] = RUNMALLOC_I(sizeof(struct interrupt_info));
-    interruptInfoArray[interruptInfoIndex]->startTime = raw_get_cycle();
-    interruptInfoArray[interruptInfoIndex]->endTime = -1;
-  }*/
-#endif
-msg:
-#ifdef RAWDEBUG
-  raw_test_pass(0xcccc);
-#endif
-  while((gdn_input_avail() != 0) && (msgdataindex < msglength)) {
-    msgdata[msgdataindex] = gdn_receive();
-    if(msgdataindex == 0) {
-      if(msgdata[0] == 7) {
-       msglength = 2;
-      } else if(msgdata[0] == 6) {
-       msglength = 2;
-      } else if(msgdata[0] > 2) {
-       msglength = 3;
-      } else if(msgdata[0] > 0) {
-       msglength = 4;
-      }
-    } else if((msgdataindex == 1) && (msgdata[0] == 0)) {
-      msglength = msgdata[msgdataindex];
-    }
-#ifdef RAWDEBUG
-    raw_test_pass_reg(msgdata[msgdataindex]);
-#endif
-    msgdataindex++;
-
-    /*if(msgdataindex == 0) {
-            // type
-            msgtype = gdn_receive();
-            if(msgtype > 2) {
-                    msglength = 3;
-            } else {
-                    msglength = 4;
-            }
-            if(msgtype != 0) {
-                    msgdata = (int *)RUNMALLOC_I(msglength * sizeof(int));
-                    msgdata[msgdataindex] = msgtype;
-            }
-     #ifdef RAWDEBUG
-            raw_test_pass_reg(msgtype);
-     #endif
-       } else if((msgdataindex == 1) && (msgtype == 0)) {
-            // object transfer msg
-            msglength = gdn_receive();
-            msgdata = (int *)RUNMALLOC_I(msglength * sizeof(int));
-            msgdata[0] = msgtype;
-            msgdata[msgdataindex] = msglength;
-     #ifdef RAWDEBUG
-            raw_test_pass_reg(msgdata[msgdataindex]);
-     #endif
-       } else {
-            msgdata[msgdataindex] = gdn_receive();
-     #ifdef RAWDEBUG
-            raw_test_pass_reg(msgdata[msgdataindex]);
-     #endif
-       }
-       msgdataindex++;*/
-  }
-#ifdef RAWDEBUG
-  raw_test_pass(0xffff);
-#endif
-  if(msgdataindex == msglength) {
-    // received a whole msg
-    int type, data1, data2;             // will receive at least 3 words including type
-    type = msgdata[0];
-    data1 = msgdata[1];
-    data2 = msgdata[2];
-    switch(type) {
-    case 0: {
-      // receive a object transfer msg
-      struct transObjInfo * transObj = RUNMALLOC_I(sizeof(struct transObjInfo));
-      int k = 0;
-      if(corenum > NUMCORES - 1) {
-       raw_test_done(0xa00a);
-      }
-      // store the object and its corresponding queue info, enqueue it later
-      transObj->objptr = (void *)data2;                                           // data1 is now size of the msg
-      transObj->length = (msglength - 3) / 2;
-      transObj->queues = RUNMALLOC_I(sizeof(int)*(msglength - 3));
-      for(k = 0; k < transObj->length; ++k) {
-       transObj->queues[2*k] = msgdata[3+2*k];
-#ifdef RAWDEBUG
-       raw_test_pass_reg(transObj->queues[2*k]);
-#endif
-       transObj->queues[2*k+1] = msgdata[3+2*k+1];
-#ifdef RAWDEBUG
-       raw_test_pass_reg(transObj->queues[2*k+1]);
-#endif
-      }
-      // check if there is an existing duplicate item
-      {
-       struct QueueItem * qitem = getTail(&objqueue);
-       struct QueueItem * prev = NULL;
-       while(qitem != NULL) {
-         struct transObjInfo * tmpinfo = (struct transObjInfo *)(qitem->objectptr);
-         if(tmpinfo->objptr == transObj->objptr) {
-           // the same object, remove outdate one
-           removeItem(&objqueue, qitem);
-         } else {
-           prev = qitem;
-         }
-         if(prev == NULL) {
-           qitem = getTail(&objqueue);
-         } else {
-           qitem = getNext(prev);
-         }
+#ifdef MULTICORE_GC
+void * localmalloc_I(int coren, // Reserve at least isize bytes from a run of contiguous blocks that starts at a block listed for core coren in gc_core2block
+                                int isize, // minimum number of bytes the returned region must hold
+                                int * allocsize) { // out: bytes actually reserved (>= isize), or 0 when the search is exhausted
+       void * mem = NULL; // result; stays NULL when no suitable run is found
+       int i = 0; // selects one of the two gc_core2block entries of coren
+       int j = 0; // round counter; each round shifts the candidate by NUMCORES4GC*2 blocks
+       int tofindb = gc_core2block[2*coren+i]+(NUMCORES4GC*2)*j; // first block of the candidate run
+       int totest = tofindb; // block currently inspected (current end of the run)
+       int bound = BAMBOO_SMEM_SIZE_L; // capacity of block totest
+       int foundsmem = 0; // 0: still searching, 1: run found, 2: search exhausted
+       int size = 0; // free bytes gathered in blocks tofindb..totest
+       do {
+               bound = (totest < NUMCORES4GC) ? BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE; // blocks below NUMCORES4GC use the _L capacity
+               int nsize = bamboo_smemtbl[totest]; // bytes already used in block totest; NOTE(review): the very first candidate is read before any range check -- confirm it is always valid
+               bool islocal = true; // true: restart at the next block listed for coren; false: extend the run into block totest+1
+               if(nsize < bound) {
+                       bool tocheck = true; // cleared when block totest cannot be part of the run
+                       // have some space in the block
+                       if(totest == tofindb) {
+                               // the first partition
+                               size = bound - nsize;
+                       } else if(nsize == 0) {
+                               // an empty partition, can be appended
+                               size += bound;
+                       } else {
+                               // not an empty partition, can not be appended
+                               // the last continuous block is not big enough, go to check the next
+                               // local block
+                               islocal = true; // no-op: islocal is already true at this point
+                               tocheck = false;
+                       } // if(totest == tofindb) else if(nsize == 0) else ...
+                       if(tocheck) {
+                               if(size >= isize) {
+                                       // have enough space in the block, malloc
+                                       foundsmem = 1;
+                                       break;
+                               } else {
+                                       // not enough space yet, try to append the next contiguous block
+                                       islocal = false;
+                               } // if(size >= isize) else ...
+                       } // if(tocheck)
+               } // if(nsize < bound)
+               if(islocal) {
+                       // no usable space here, restart the run at the next block listed for coren
+                       i++;
+                       if(2==i) {
+                               i = 0;
+                               j++;
+                       }
+                       tofindb = totest = gc_core2block[2*coren+i]+(NUMCORES4GC*2)*j;
+               } else {
+                       totest += 1;
+               } // if(islocal) else ...
+               if(totest > gcnumblock-1-bamboo_reserved_smem) {
+                       // ran past the last usable block without finding a suitable run
+                       foundsmem = 2;
+                       break;
+               } // if(totest > gcnumblock-1-bamboo_reserved_smem) ...
+       } while(true);
+
+       if(foundsmem == 1) {
+               // found a run: address = gcbaseva + bytes already used in block tofindb + start offset of block tofindb
+               mem = gcbaseva+bamboo_smemtbl[tofindb]+((tofindb<NUMCORES4GC)?
+                               (BAMBOO_SMEM_SIZE_L*tofindb):(BAMBOO_LARGE_SMEM_BOUND+
+                                       (tofindb-NUMCORES4GC)*BAMBOO_SMEM_SIZE));
+               *allocsize = size; // all free space of the run is handed out, not just isize
+               // mark every block of the run (tofindb..totest) as completely used
+               for(i = tofindb; i <= totest; i++) {
+                       bamboo_smemtbl[i]=(i<NUMCORES4GC)?BAMBOO_SMEM_SIZE_L:BAMBOO_SMEM_SIZE;
+               }
+       } else if(foundsmem == 2) {
+               // no suitable block
+               *allocsize = 0;
        }
-       //memcpy(transObj->queues, msgdata[3], sizeof(int)*(msglength - 3));
-       addNewItem_I(&objqueue, (void *)transObj);
-      }
-      ++(self_numreceiveobjs);
-#ifdef RAWDEBUG
-      raw_test_pass(0xe881);
-#endif
-      /*
-         addNewItem_I(&objqueue, (void *)data2);
-       ++(self_numreceiveobjs);
-       #ifdef RAWDEBUG
-         raw_test_pass(0xe881);
-       #endif
-       */
-      break;
-    }
-
-    case 1: {
-      // receive a stall msg
-      if(corenum != STARTUPCORE) {
-       // non startup core can not receive stall msg
-       // return -1
-       raw_test_done(0xa001);
-      }
-      if(data1 < NUMCORES) {
-#ifdef RAWDEBUG
-       raw_test_pass(0xe882);
-#endif
-       corestatus[data1] = 0;
-       numsendobjs[data1] = data2;
-       numreceiveobjs[data1] = msgdata[3];
-      }
-      break;
-    }
 
-    case 2: {
-      // receive lock request msg
-      // for 32 bit machine, the size is always 3 words
-      //int msgsize = sizeof(int) * 3;
-      int msgsize = 3;
-      // lock request msg, handle it right now
-      // check to see if there is a lock exist in locktbl for the required obj
-      int data3 = msgdata[3];
-      deny = false;
-      if(!RuntimeHashcontainskey(locktbl, data2)) {
-       // no locks for this object
-       // first time to operate on this shared object
-       // create a lock for it
-       // the lock is an integer: 0 -- stall, >0 -- read lock, -1 -- write lock
-#ifdef RAWDEBUG
-       raw_test_pass(0xe883);
-#endif
-       if(data1 == 0) {
-         RuntimeHashadd_I(locktbl, data2, 1);
-       } else {
-         RuntimeHashadd_I(locktbl, data2, -1);
-       }
-      } else {
-       int rwlock_obj = 0;
-#ifdef RAWDEBUG
-       raw_test_pass(0xe884);
-#endif
-       RuntimeHashget(locktbl, data2, &rwlock_obj);
-#ifdef RAWDEBUG
-       raw_test_pass_reg(rwlock_obj);
-#endif
-       if(0 == rwlock_obj) {
-         if(data1 == 0) {
-           rwlock_obj = 1;
-         } else {
-           rwlock_obj = -1;
-         }
-         RuntimeHashremovekey(locktbl, data2);
-         RuntimeHashadd_I(locktbl, data2, rwlock_obj);
-       } else if((rwlock_obj > 0) && (data1 == 0)) {
-         // read lock request and there are only read locks
-         rwlock_obj++;
-         RuntimeHashremovekey(locktbl, data2);
-         RuntimeHashadd_I(locktbl, data2, rwlock_obj);
-       } else {
-         deny = true;
+       return mem;
+} // void * localmalloc_I(int, int, int *)
+
+void * globalmalloc_I(int coren, // Reserve at least isize bytes from the first fitting run of contiguous blocks at or after bamboo_free_block; coren is not referenced in the body
+                                 int isize, // minimum number of bytes the returned region must hold
+                                 int * allocsize) { // out: bytes actually reserved (>= isize), or 0 when nothing was found
+       void * mem = NULL; // result; NULL when the search fails
+       int tofindb = bamboo_free_block; // first block of the candidate run; the search starts at bamboo_free_block
+       int totest = tofindb; // block currently inspected (current end of the run)
+       int bound = BAMBOO_SMEM_SIZE_L; // capacity of block totest
+       int foundsmem = 0; // 0: still searching, 1: run found, 2: search exhausted
+       int size = 0; // free bytes gathered in blocks tofindb..totest
+       if(tofindb > gcnumblock-1-bamboo_reserved_smem) { // the starting block is already past the last usable block
+               *allocsize = 0;
+               return NULL;
        }
-#ifdef RAWDEBUG
-       raw_test_pass_reg(rwlock_obj);
-#endif
-      }
-      targetcore = data3;
-      // check if there is still some msg on sending
-      if(isMsgSending) {
-#ifdef RAWDEBUG
-       raw_test_pass(0xe885);
-#endif
-       isMsgHanging = true;
-       // cache the msg in outmsgdata and send it later
-       // msglength + target core + msg
-       outmsgdata[outmsglast++] = msgsize;
-       outmsgdata[outmsglast++] = targetcore;
-       if(deny == true) {
-         outmsgdata[outmsglast++] = 4;
-       } else {
-         outmsgdata[outmsglast++] = 3;
-       }
-       outmsgdata[outmsglast++] = data1;
-       outmsgdata[outmsglast++] = data2;
-      } else {
-#ifdef RAWDEBUG
-       raw_test_pass(0xe886);
-#endif
-       // no msg on sending, send it out
-       calCoords(corenum, &self_y, &self_x);
-       calCoords(targetcore, &target_y, &target_x);
-       // Build the message header
-       msgHdr = construct_dyn_hdr(0, msgsize, 0,                                                               // msgsize word sent.
-                                  self_y, self_x,
-                                  target_y, target_x);
-       gdn_send(msgHdr);                                                               // Send the message header to EAST to handle fab(n - 1).
-#ifdef RAWDEBUG
-       raw_test_pass(0xbbbb);
-       raw_test_pass(0xb000 + targetcore);                                                 // targetcore
-#endif
-       if(deny == true) {
-         // deny the lock request
-         gdn_send(4);                                                       // lock request
-#ifdef RAWDEBUG
-         raw_test_pass(4);
-#endif
-       } else {
-         // grount the lock request
-         gdn_send(3);                                                       // lock request
-#ifdef RAWDEBUG
-         raw_test_pass(3);
-#endif
+       do {
+               bound = (totest < NUMCORES4GC) ? BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE; // blocks below NUMCORES4GC use the _L capacity
+               int nsize = bamboo_smemtbl[totest]; // bytes already used in block totest
+               bool isnext = false; // set when the run cannot continue and must restart at the following block
+               if(nsize < bound) {
+                       bool tocheck = true; // cleared when block totest cannot be part of the run
+                       // have some space in the block
+                       if(totest == tofindb) {
+                               // the first partition
+                               size = bound - nsize;
+                       } else if(nsize == 0) {
+                               // an empty partition, can be appended
+                               size += bound;
+                       } else {
+                               // not an empty partition, can not be appended
+                               // the last continuous block is not big enough, start another block
+                               isnext = true;
+                               tocheck = false;
+                       } // if(totest == tofindb) else if(nsize == 0) else ...
+                       if(tocheck) {
+                               if(size >= isize) {
+                                       // have enough space in the block, malloc
+                                       foundsmem = 1;
+                                       break;
+                               } // if(size >= isize)
+                       } // if(tocheck)
+               } else {
+                       isnext = true; // block totest is full
+               }// if(nsize < bound) else ...
+               totest += 1;
+               if(totest > gcnumblock-1-bamboo_reserved_smem) {
+                       // ran past the last usable block without finding a suitable run
+                       foundsmem = 2;
+                       break;
+               } // if(totest > gcnumblock-1-bamboo_reserved_smem) ...
+               if(isnext) {
+                       // start another block
+                       tofindb = totest;
+               } // if(isnext)
+       } while(true);
+
+       if(foundsmem == 1) {
+               // found a run: address = gcbaseva + bytes already used in block tofindb + start offset of block tofindb
+               mem = gcbaseva+bamboo_smemtbl[tofindb]+((tofindb<NUMCORES4GC)?
+                               (BAMBOO_SMEM_SIZE_L*tofindb):(BAMBOO_LARGE_SMEM_BOUND+
+                                       (tofindb-NUMCORES4GC)*BAMBOO_SMEM_SIZE));
+               *allocsize = size; // all free space of the run is handed out, not just isize
+               // mark every block of the run (tofindb..totest) as completely used
+               for(int i = tofindb; i <= totest; i++) {
+                       bamboo_smemtbl[i]=(i<NUMCORES4GC)?BAMBOO_SMEM_SIZE_L:BAMBOO_SMEM_SIZE;
+               }
+               if(tofindb == bamboo_free_block) { // the run began at the current start position, so move that position past the run
+                       bamboo_free_block = totest+1;
+               }
+       } else if(foundsmem == 2) {
+               // no suitable block
+               *allocsize = 0;
+               mem = NULL;
        }
-       gdn_send(data1);                                                 // lock type
-#ifdef RAWDEBUG
-       raw_test_pass_reg(data1);
-#endif
-       gdn_send(data2);                                                 // lock target
-#ifdef RAWDEBUG
-       raw_test_pass_reg(data2);
-       raw_test_pass(0xffff);
-#endif
-      }
-      break;
-    }
 
-    case 3: {
-      // receive lock grount msg
-      if(corenum > NUMCORES - 1) {
-       raw_test_done(0xa00b);
-      }
-      if(lockobj == data2) {
-       lockresult = 1;
-       lockflag = true;
-#ifndef INTERRUPT
-       reside = false;
-#endif
-      } else {
-       // conflicts on lockresults
-       raw_test_done(0xa002);
-      }
-      break;
-    }
+       return mem;
+} // void * globalmalloc_I(int, int, int *)
+#endif // #ifdef MULTICORE_GC
+
+// Allocate at least `size` bytes of shared memory; *allocsize receives the number of bytes actually reserved (0 on failure). With MULTICORE_GC the request is dispatched on bamboo_smem_mode and a failed search sets gcflag and returns NULL; otherwise the memory comes from mspace_calloc on bamboo_free_msp and failure ends in BAMBOO_EXIT(0xa001).
+void * smemalloc_I(int coren, // core the request is made for; only used in the MULTICORE_GC build, where it is passed to localmalloc_I/globalmalloc_I
+                              int size, // requested number of bytes
+                              int * allocsize) { // out: bytes actually reserved, 0 on failure
+       void * mem = NULL;
+#ifdef MULTICORE_GC
+       int isize = size+(BAMBOO_CACHE_LINE_SIZE); // the search is done for size plus BAMBOO_CACHE_LINE_SIZE bytes
+
+       // go through the bamboo_smemtbl for suitable partitions
+       switch(bamboo_smem_mode) {
+               case SMEMLOCAL: {
+                 mem = localmalloc_I(coren, isize, allocsize);
+                       break;
+         }
 
-    case 4: {
-      // receive lock grount/deny msg
-      if(corenum > NUMCORES - 1) {
-       raw_test_done(0xa00c);
-      }
-      if(lockobj == data2) {
-       lockresult = 0;
-       lockflag = true;
-#ifndef INTERRUPT
-       reside = false;
-#endif
-      } else {
-       // conflicts on lockresults
-       raw_test_done(0xa003);
-      }
-      break;
-    }
+               case SMEMFIXED: {
+                       // TODO not supported yet
+                       BAMBOO_EXIT(0xe001);
+                       break;
+               }
 
-    case 5: {
-      // receive lock release msg
-      if(!RuntimeHashcontainskey(locktbl, data2)) {
-       // no locks for this object, something is wrong
-       raw_test_done(0xa004);
-      } else {
-       int rwlock_obj = 0;
-       RuntimeHashget(locktbl, data2, &rwlock_obj);
-#ifdef RAWDEBUG
-       raw_test_pass(0xe887);
-       raw_test_pass_reg(rwlock_obj);
-#endif
-       if(data1 == 0) {
-         rwlock_obj--;
-       } else {
-         rwlock_obj++;
-       }
-       RuntimeHashremovekey(locktbl, data2);
-       RuntimeHashadd_I(locktbl, data2, rwlock_obj);
-#ifdef RAWDEBUG
-       raw_test_pass_reg(rwlock_obj);
-#endif
-      }
-      break;
-    }
+               case SMEMMIXED: {
+                       // TODO not supported yet
+                       BAMBOO_EXIT(0xe002);
+                       break;
+               }
 
-#ifdef RAWPROFILE
-    case 6: {
-      // receive an output request msg
-      if(corenum == STARTUPCORE) {
-       // startup core can not receive profile output finish msg
-       // return -1
-       raw_test_done(0xa00a);
-      }
-      {
-       int msgsize = 2;
-       stall = true;
-       totalexetime = data1;
-       outputProfileData();
-       /*if(data1 >= NUMCORES) {
-               raw_test_pass(0xee04);
-          raw_test_pass_reg(taskInfoIndex);
-          raw_test_pass_reg(taskInfoOverflow);
-               if(!taskInfoOverflow) {
-                       taskInfoArray[taskInfoIndex]->endTime = raw_get_cycle();
-                       taskInfoIndex++;
-                       if(taskInfoIndex == TASKINFOLENGTH) {
-                               taskInfoOverflow = true;
-                       }
-               }
-          }*/
-       // no msg on sending, send it out
-       targetcore = STARTUPCORE;
-       calCoords(corenum, &self_y, &self_x);
-       calCoords(targetcore, &target_y, &target_x);
-       // Build the message header
-       msgHdr = construct_dyn_hdr(0, msgsize, 0,                                                               // msgsize word sent.
-                                  self_y, self_x,
-                                  target_y, target_x);
-       gdn_send(msgHdr);
-#ifdef RAWDEBUG
-       raw_test_pass(0xbbbb);
-       raw_test_pass(0xb000 + targetcore);                                                 // targetcore
-#endif
-       gdn_send(7);
-#ifdef RAWDEBUG
-       raw_test_pass(7);
-#endif
-       gdn_send(corenum);
-#ifdef RAWDEBUG
-       raw_test_pass_reg(corenum);
-       raw_test_pass(0xffff);
-#endif
-      }
-      break;
-    }
+               case SMEMGLOBAL: {
+                       mem = globalmalloc_I(coren, isize, allocsize);
+                       break;
+               }
 
-    case 7: {
-      // receive a profile output finish msg
-      if(corenum != STARTUPCORE) {
-       // non startup core can not receive profile output finish msg
-       // return -1
-       raw_test_done(0xa00b);
-      }
-      profilestatus[data1] = 0;
-      break;
-    }
-#endif
+               default: // any other mode leaves mem NULL and is handled as an allocation failure below
+                       break;
+       }
 
-    default:
-      break;
-    }
-    //RUNFREE(msgdata);
-    //msgdata = NULL;
-    for(msgdataindex--; msgdataindex > 0; --msgdataindex) {
-      msgdata[msgdataindex] = -1;
-    }
-    msgtype = -1;
-    //msgdataindex = 0;
-    msglength = 30;
-#ifdef RAWDEBUG
-    raw_test_pass(0xe888);
-#endif
-    if(gdn_input_avail() != 0) {
-      goto msg;
-    }
-#ifdef RAWPROFILE
-/*    if(isInterrupt && (!interruptInfoOverflow)) {
-      interruptInfoArray[interruptInfoIndex]->endTime = raw_get_cycle();
-      interruptInfoIndex++;
-      if(interruptInfoIndex == INTERRUPTINFOLENGTH) {
-       interruptInfoOverflow = true;
-      }
-    }*/
-#endif
-    return type;
-  } else {
-    // not a whole msg
-#ifdef RAWDEBUG
-    raw_test_pass(0xe889);
-#endif
-#ifdef RAWPROFILE
-/*    if(isInterrupt && (!interruptInfoOverflow)) {
-      interruptInfoArray[interruptInfoIndex]->endTime = raw_get_cycle();
-      interruptInfoIndex++;
-      if(interruptInfoIndex == INTERRUPTINFOLENGTH) {
-       interruptInfoOverflow = true;
-      }
-    }*/
-#endif
-    return -2;
-  }
-#elif defined THREADSIMULATE
-  int numofcore = pthread_getspecific(key);
-  // use POSIX message queue to transfer object
-  int msglen = 0;
-  struct mq_attr mqattr;
-  mq_getattr(mqd[numofcore], &mqattr);
-  void * msgptr =RUNMALLOC(mqattr.mq_msgsize);
-  msglen=mq_receive(mqd[numofcore], msgptr, mqattr.mq_msgsize, NULL);       // receive the object into the queue
-  if(-1 == msglen) {
-    // no msg
-    free(msgptr);
-    return 1;
-  }
-  //printf("msg: %s\n",msgptr);
-  if(((int*)msgptr)[0] == -1) {
-    // StallMsg
-    struct ___Object___ * tmpptr = (struct ___Object___ *)msgptr;
-    int index = tmpptr->flag;
-    corestatus[index] = 0;
-    numsendobjs[index] = tmpptr->___cachedHash___;
-    numreceiveobjs[index] = tmpptr->___cachedCode___;
-    printf("<receiveObject> index: %d, sendobjs: %d, reveiveobjs: %d\n", index, numsendobjs[index], numreceiveobjs[index]);
-    free(msgptr);
-    return 2;
-  }       /*else if(((int*)msgptr)[0] == -2) {
-               // terminate msg
-               return 3;
-            } */
-  else {
-    // an object
-    if(numofcore == STARTUPCORE) {
-      ++(numreceiveobjs[numofcore]);
-    } else {
-      ++(thread_data_array[numofcore].numreceiveobjs);
-    }
-    struct ___Object___ * tmpptr = (struct ___Object___ *)msgptr;
-    struct transObjInfo * transObj = (struct transObjInfo *)tmpptr->original;
-    tmpptr = (struct ___Object___ *)(transObj->objptr);
-    int type = tmpptr->type;
-    int size=classsize[type];
-    struct ___Object___ * newobj=RUNMALLOC(size);
-    memcpy(newobj, tmpptr, size);
-    if(0 == newobj->isolate) {
-      newobj->original=tmpptr;
-    }
-    RUNFREE(msgptr);
-    tmpptr = NULL;
-    int k = 0;
-    for(k = 0; k < transObj->length; ++k) {
-      int taskindex = transObj->queues[2 * k];
-      int paramindex = transObj->queues[2 * k + 1];
-      struct parameterwrapper ** queues = &(paramqueues[numofcore][taskindex][paramindex]);
-      enqueueObject(newobj, queues, 1);
-    }
-    RUNFREE(transObj->queues);
-    RUNFREE(transObj);
-    return 0;
-  }
+       if(mem == NULL) { // MULTICORE_GC build: this brace is closed by the shared `}` after the final #endif
+#else
+       int toallocate = (size>(BAMBOO_SMEM_SIZE)) ? (size):(BAMBOO_SMEM_SIZE); // allocate at least BAMBOO_SMEM_SIZE bytes
+       mem = mspace_calloc(bamboo_free_msp, 1, toallocate);
+       *allocsize = toallocate;
+       if(mem == NULL) { // non-GC build: this brace is closed by the same shared `}`
+#endif
+               // not enough shared global memory
+               *allocsize = 0;
+#ifdef MULTICORE_GC
+               gcflag = true; // presumably requests a garbage collection -- confirm against the code that reads gcflag
+               return NULL;
+#else
+               BAMBOO_DEBUGPRINT(0xa001);
+               BAMBOO_EXIT(0xa001); // NOTE(review): if BAMBOO_EXIT returns, control falls through to `return mem` with mem == NULL
 #endif
-}
+       }
+       return mem;
+}  // void * smemalloc_I(int, int, int *)
 
-bool getreadlock(void * ptr) {
-#ifdef RAW
-  unsigned msgHdr;
-  int self_y, self_x, target_y, target_x;
-  int targetcore = 0;       //((int)ptr >> 5) % TOTALCORE;
-  // for 32 bit machine, the size is always 4 words
-  //int msgsize = sizeof(int) * 4;
-  int msgsize = 4;
-  int tc = TOTALCORE;
-#ifdef INTERRUPT
-  raw_user_interrupts_off();
-#endif
-  targetcore = ((int)ptr >> 5) % tc;
-#ifdef INTERRUPT
-  raw_user_interrupts_on();
+INLINE int checkMsgLength_I(int size) { // Set the global msglength from the message type at msgdata[msgdataindex] and return it; returns -1 (msglength untouched) for a variable-length message when size <= 1. size is only consulted in that case -- presumably the number of entries received so far; confirm against the caller
+#ifdef DEBUG
+#ifndef TILERA
+  BAMBOO_DEBUGPRINT(0xcccc);
+#endif
+#endif
+       int type = msgdata[msgdataindex]; // first entry of the message holds its type
+       switch(type) {
+               case STATUSCONFIRM:
+               case TERMINATE:
+#ifdef MULTICORE_GC
+               case GCSTARTINIT: 
+               case GCSTART: 
+               case GCSTARTFLUSH: 
+               case GCFINISH: 
+               case GCMARKCONFIRM: 
+               case GCLOBJREQUEST: 
+#endif 
+               {
+                       msglength = 1; // fixed-length messages of length 1
+                       break;
+               }
+               case PROFILEOUTPUT:
+               case PROFILEFINISH:
+#ifdef MULTICORE_GC
+               case GCSTARTCOMPACT:
+               case GCFINISHINIT: 
+               case GCFINISHFLUSH: 
+               case GCMARKEDOBJ: 
+#endif
+               {
+                       msglength = 2; // fixed-length messages of length 2
+                       break;
+               }
+               case MEMREQUEST: 
+               case MEMRESPONSE:
+#ifdef MULTICORE_GC
+               case GCMAPREQUEST: 
+               case GCMAPINFO: 
+               case GCLOBJMAPPING: 
+#endif 
+               {
+                       msglength = 3; // fixed-length messages of length 3
+                       break;
+               }
+               case TRANSTALL:
+               case LOCKGROUNT:
+               case LOCKDENY:
+               case LOCKRELEASE:
+               case REDIRECTGROUNT:
+               case REDIRECTDENY:
+               case REDIRECTRELEASE:
+#ifdef MULTICORE_GC
+               case GCFINISHMARK:
+               case GCMOVESTART:
+#endif
+               { 
+                       msglength = 4; // fixed-length messages of length 4
+                       break;
+               }
+               case LOCKREQUEST:
+               case STATUSREPORT:
+#ifdef MULTICORE_GC
+               case GCFINISHCOMPACT:
+               case GCMARKREPORT: 
+#endif 
+               {
+                       msglength = 5; // fixed-length messages of length 5
+                       break;
+               }
+               case REDIRECTLOCK: 
+               {
+                       msglength = 6; // fixed-length message of length 6
+                       break;
+               }
+               case TRANSOBJ:  // nonfixed size
+#ifdef MULTICORE_GC
+               case GCLOBJINFO: 
+#endif
+               { // variable length: the length is carried in the entry right after the type
+                       if(size > 1) {
+                               msglength = msgdata[msgdataindex+1]; // NOTE(review): index msgdataindex+1 is used without a range or wrap-around check -- confirm msgdata is large enough
+                       } else {
+                               return -1; // the length entry is not available yet; msglength is left unchanged
+                       }
+                       break;
+               }
+               default: 
+               {
+                       BAMBOO_DEBUGPRINT_REG(type); // unknown type: print it, dump six entries, then exit
+                       int i = 6;
+                       while(i-- > 0) { // prints msgdata[msgdataindex+5] down to msgdata[msgdataindex]
+                               BAMBOO_DEBUGPRINT(msgdata[msgdataindex+i]); // NOTE(review): these reads are not range-checked
+                       }
+                       BAMBOO_EXIT(0xd005);
+                       break;
+               }
+       }
+#ifdef DEBUG
+#ifndef TILERA
+       BAMBOO_DEBUGPRINT_REG(msgdata[msgdataindex]);
 #endif
-
-  lockobj = (int)ptr;
-  lockflag = false;
-#ifndef INTERRUPT
-  reside = false;
 #endif
-  lockresult = 0;
-
-  if(targetcore == corenum) {
-    // reside on this core
-    bool deny = false;
-#ifdef INTERRUPT
-    raw_user_interrupts_off();
-#endif
-    if(!RuntimeHashcontainskey(locktbl, (int)ptr)) {
-      // no locks for this object
-      // first time to operate on this shared object
-      // create a lock for it
-      // the lock is an integer: 0 -- stall, >0 -- read lock, -1 -- write lock
-      RuntimeHashadd_I(locktbl, (int)ptr, 1);
-    } else {
-      int rwlock_obj = 0;
-      RuntimeHashget(locktbl, (int)ptr, &rwlock_obj);
-      if(-1 != rwlock_obj) {
-       rwlock_obj++;
-       RuntimeHashremovekey(locktbl, (int)ptr);
-       RuntimeHashadd_I(locktbl, (int)ptr, rwlock_obj);
-      } else {
-       deny = true;
-      }
-    }
-#ifdef INTERRUPT
-    raw_user_interrupts_on();
+#ifdef DEBUG
+#ifndef TILERA
+  BAMBOO_DEBUGPRINT(0xffff);
 #endif
-    if(lockobj == (int)ptr) {
-      if(deny) {
-       lockresult = 0;
-      } else {
-       lockresult = 1;
-      }
-      lockflag = true;
-#ifndef INTERRUPT
-      reside = true;
 #endif
-    } else {
-      // conflicts on lockresults
-      raw_test_done(0xa005);
-    }
-    return true;
-  }
+       return msglength;
+}
 
-  calCoords(corenum, &self_y, &self_x);
-  calCoords(targetcore, &target_y, &target_x);
-  // Build the message header
-  msgHdr = construct_dyn_hdr(0, msgsize, 0,             // msgsize word sent.
-                             self_y, self_x,
-                             target_y, target_x);
-  // start sending the msg, set send msg flag
-  isMsgSending = true;
-  gdn_send(msgHdr);                     // Send the message header to EAST to handle fab(n - 1).
-#ifdef RAWDEBUG
-  raw_test_pass(0xbbbb);
-  raw_test_pass(0xb000 + targetcore);       // targetcore
-#endif
-  gdn_send(2);   // lock request
- #ifdef RAWDEBUG
-  raw_test_pass(2);
-#endif
-  gdn_send(0);       // read lock
-#ifdef RAWDEBUG
-  raw_test_pass(0);
-#endif
-  gdn_send(ptr);
-#ifdef RAWDEBUG
-  raw_test_pass_reg(ptr);
-#endif
-  gdn_send(corenum);
-#ifdef RAWDEBUG
-  raw_test_pass_reg(corenum);
-  raw_test_pass(0xffff);
-#endif
-  // end of sending this msg, set sand msg flag false
-  isMsgSending = false;
-  // check if there are pending msgs
-  while(isMsgHanging) {
-    // get the msg from outmsgdata[]
-    // length + target + msg
-    outmsgleft = outmsgdata[outmsgindex++];
-    targetcore = outmsgdata[outmsgindex++];
-    calCoords(targetcore, &target_y, &target_x);
-    // Build the message header
-    msgHdr = construct_dyn_hdr(0, outmsgleft, 0,                        // msgsize word sent.
-                               self_y, self_x,
-                               target_y, target_x);
-    isMsgSending = true;
-    gdn_send(msgHdr);                           // Send the message header to EAST to handle fab(n - 1).
-#ifdef RAWDEBUG
-    raw_test_pass(0xbbbb);
-    raw_test_pass(0xb000 + targetcore);             // targetcore
-#endif
-    while(outmsgleft-- > 0) {
-      gdn_send(outmsgdata[outmsgindex++]);
-#ifdef RAWDEBUG
-      raw_test_pass_reg(outmsgdata[outmsgindex - 1]);
+INLINE void processmsg_transobj_I() {
+       MSG_INDEXINC_I();
+       struct transObjInfo * transObj = RUNMALLOC_I(sizeof(struct transObjInfo));
+       int k = 0;
+#ifdef DEBUG
+#ifndef CLOSE_PRINT
+       BAMBOO_DEBUGPRINT(0xe880);
+#endif
+#endif
+       if(BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1) {
+#ifndef CLOSE_PRINT
+               BAMBOO_DEBUGPRINT_REG(msgdata[msgdataindex]/*[2]*/);
+#endif
+               BAMBOO_EXIT(0xa002);
+       } 
+       // store the object and its corresponding queue info, enqueue it later
+       transObj->objptr = (void *)msgdata[msgdataindex]; //[2]
+       MSG_INDEXINC_I();
+       transObj->length = (msglength - 3) / 2;
+       transObj->queues = RUNMALLOC_I(sizeof(int)*(msglength - 3));
+       for(k = 0; k < transObj->length; ++k) {
+               transObj->queues[2*k] = msgdata[msgdataindex]; //[3+2*k];
+               MSG_INDEXINC_I();
+#ifdef DEBUG
+#ifndef CLOSE_PRINT
+               //BAMBOO_DEBUGPRINT_REG(transObj->queues[2*k]);
 #endif
-    }
-#ifdef RAWDEBUG
-    raw_test_pass(0xffff);
-#endif
-    isMsgSending = false;
-#ifdef INTERRUPT
-    raw_user_interrupts_off();
-#endif
-    // check if there are still msg hanging
-    if(outmsgindex == outmsglast) {
-      // no more msgs
-      outmsgindex = outmsglast = 0;
-      isMsgHanging = false;
-    }
-#ifdef INTERRUPT
-    raw_user_interrupts_on();
 #endif
-  }
-  return true;
-#elif defined THREADSIMULATE
-  int numofcore = pthread_getspecific(key);
-
-  int rc = pthread_rwlock_tryrdlock(&rwlock_tbl);
-  printf("[getreadlock, %d] getting the read lock for locktbl: %d error: \n", numofcore, rc, strerror(rc));
-  if(0 != rc) {
-    return false;
-  }
-  if(!RuntimeHashcontainskey(locktbl, (int)ptr)) {
-    // no locks for this object
-    // first time to operate on this shared object
-    // create a lock for it
-    rc = pthread_rwlock_unlock(&rwlock_tbl);
-    printf("[getreadlock, %d] release the read lock for locktbl: %d error: \n", numofcore, rc, strerror(rc));
-    pthread_rwlock_t* rwlock = (pthread_rwlock_t *)RUNMALLOC(sizeof(pthread_rwlock_t));
-    memcpy(rwlock, &rwlock_init, sizeof(pthread_rwlock_t));
-    rc = pthread_rwlock_init(rwlock, NULL);
-    printf("[getreadlock, %d] initialize the rwlock for object %d: %d error: \n", numofcore, (int)ptr, rc, strerror(rc));
-    rc = pthread_rwlock_trywrlock(&rwlock_tbl);
-    printf("[getreadlock, %d] getting the write lock for locktbl: %d error: \n", numofcore, rc, strerror(rc));
-    if(0 != rc) {
-      RUNFREE(rwlock);
-      return false;
-    } else {
-      if(!RuntimeHashcontainskey(locktbl, (int)ptr)) {
-       // check again
-       RuntimeHashadd(locktbl, (int)ptr, (int)rwlock);
-      } else {
-       RUNFREE(rwlock);
-       RuntimeHashget(locktbl, (int)ptr, (int*)&rwlock);
-      }
-      rc = pthread_rwlock_unlock(&rwlock_tbl);
-      printf("[getreadlock, %d] release the write lock for locktbl: %d error: \n", numofcore, rc, strerror(rc));
-    }
-    rc = pthread_rwlock_tryrdlock(rwlock);
-    printf("[getreadlock, %d] getting read lock for object %d: %d error: \n", numofcore, (int)ptr, rc, strerror(rc));
-    if(0 != rc) {
-      return false;
-    } else {
-      return true;
-    }
-  } else {
-    pthread_rwlock_t* rwlock_obj = NULL;
-    RuntimeHashget(locktbl, (int)ptr, (int*)&rwlock_obj);
-    rc = pthread_rwlock_unlock(&rwlock_tbl);
-    printf("[getreadlock, %d] release the read lock for locktbl: %d error: \n", numofcore, rc, strerror(rc));
-    int rc_obj = pthread_rwlock_tryrdlock(rwlock_obj);
-    printf("[getreadlock, %d] getting read lock for object %d: %d error: \n", numofcore, (int)ptr, rc_obj, strerror(rc_obj));
-    if(0 != rc_obj) {
-      return false;
-    } else {
-      return true;
-    }
-  }
+               transObj->queues[2*k+1] = msgdata[msgdataindex]; //[3+2*k+1];
+               MSG_INDEXINC_I();
+#ifdef DEBUG
+#ifndef CLOSE_PRINT
+               //BAMBOO_DEBUGPRINT_REG(transObj->queues[2*k+1]);
 #endif
+#endif
+       }
+       // check if there is an existing duplicate item
+       {
+               struct QueueItem * qitem = getHead(&objqueue);
+               struct QueueItem * prev = NULL;
+               while(qitem != NULL) {
+                       struct transObjInfo * tmpinfo = 
+                               (struct transObjInfo *)(qitem->objectptr);
+                       if(tmpinfo->objptr == transObj->objptr) {
+                               // the same object, remove the outdated one
+                               RUNFREE(tmpinfo->queues);
+                               RUNFREE(tmpinfo);
+                               removeItem(&objqueue, qitem);
+                               //break;
+                       } else {
+                               prev = qitem;
+                       }
+                       if(prev == NULL) {
+                               qitem = getHead(&objqueue);
+                       } else {
+                               qitem = getNextQueueItem(prev);
+                       }
+               }
+               addNewItem_I(&objqueue, (void *)transObj);
+       }
+       ++(self_numreceiveobjs);
 }
 
-void releasereadlock(void * ptr) {
-#ifdef RAW
-  unsigned msgHdr;
-  int self_y, self_x, target_y, target_x;
-  int targetcore = 0;       //((int)ptr >> 5) % TOTALCORE;
-  // for 32 bit machine, the size is always 3 words
-  //int msgsize = sizeof(int) * 3;
-  int msgsize = 3;
-  int tc = TOTALCORE;
-#ifdef INTERRUPT
-  raw_user_interrupts_off();
+INLINE void processmsg_transtall_I() {
+       if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
+       // a non-startup core cannot receive a stall msg
+#ifndef CLOSE_PRINT
+               BAMBOO_DEBUGPRINT_REG(msgdata[msgdataindex]/*[1]*/);
+#endif
+               BAMBOO_EXIT(0xa003);
+       } 
+       int num_core = msgdata[msgdataindex]; //[1]
+       MSG_INDEXINC_I();
+       if(num_core < NUMCORESACTIVE) {
+#ifdef DEBUG
+#ifndef CLOSE_PRINT
+               BAMBOO_DEBUGPRINT(0xe881);
 #endif
-  targetcore = ((int)ptr >> 5) % tc;
-#ifdef INTERRUPT
-  raw_user_interrupts_on();
 #endif
+               corestatus[num_core] = 0;
+               numsendobjs[num_core] = msgdata[msgdataindex]; //[2];
+               MSG_INDEXINC_I();
+               numreceiveobjs[num_core] = msgdata[msgdataindex]; //[3];
+               MSG_INDEXINC_I();
+       }
+}
+
+#ifndef MULTICORE_GC
+INLINE void processmsg_lockrequest_I() {
+       // check whether a lock exists for the required obj
+       // msgdata[1] -> lock type
+       int locktype = msgdata[msgdataindex]; //[1];
+       MSG_INDEXINC_I();
+       int data2 = msgdata[msgdataindex]; // obj pointer
+       MSG_INDEXINC_I();
+       int data3 = msgdata[msgdataindex]; // lock
+       MSG_INDEXINC_I();
+       int data4 = msgdata[msgdataindex]; // request core
+       MSG_INDEXINC_I();
+       // -1: redirected, 0: approved, 1: denied
+       int deny = processlockrequest(locktype, data3, data2, data4, data4, true);  
+       if(deny == -1) {
+               // this lock request is redirected
+               return;
+       } else {
+               // send response msg
+               // for 32 bit machine, the size is always 4 words
+               int tmp = deny==1?LOCKDENY:LOCKGROUNT;
+               if(isMsgSending) {
+                       cache_msg_4(data4, tmp, locktype, data2, data3);
+               } else {
+                       send_msg_4(data4, tmp, locktype, data2, data3, true);
+               }
+       }
+}
 
-  if(targetcore == corenum) {
-#ifdef INTERRUPT
-    raw_user_interrupts_off();
+INLINE void processmsg_lockgrount_I() {
+       MSG_INDEXINC_I();
+       if(BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1) {
+#ifndef CLOSE_PRINT
+               BAMBOO_DEBUGPRINT_REG(msgdata[msgdataindex]/*[2]*/);
+#endif
+               BAMBOO_EXIT(0xa004);
+       } 
+       int data2 = msgdata[msgdataindex];
+       MSG_INDEXINC_I();
+       int data3 = msgdata[msgdataindex];
+       MSG_INDEXINC_I();
+       if((lockobj == data2) && (lock2require == data3)) {
+#ifdef DEBUG
+#ifndef CLOSE_PRINT
+               BAMBOO_DEBUGPRINT(0xe882);
 #endif
-    // reside on this core
-    if(!RuntimeHashcontainskey(locktbl, (int)ptr)) {
-      // no locks for this object, something is wrong
-      raw_test_done(0xa006);
-    } else {
-      int rwlock_obj = 0;
-      RuntimeHashget(locktbl, (int)ptr, &rwlock_obj);
-      rwlock_obj--;
-      RuntimeHashremovekey(locktbl, (int)ptr);
-      RuntimeHashadd_I(locktbl, (int)ptr, rwlock_obj);
-    }
-#ifdef INTERRUPT
-    raw_user_interrupts_on();
 #endif
-    return;
-  }
+               lockresult = 1;
+               lockflag = true;
+#ifndef INTERRUPT
+               reside = false;
+#endif
+       } else {
+               // conflicts on lockresults
+#ifndef CLOSE_PRINT
+               BAMBOO_DEBUGPRINT_REG(data2);
+#endif
+               BAMBOO_EXIT(0xa005);
+       }
+}
 
-  calCoords(corenum, &self_y, &self_x);
-  calCoords(targetcore, &target_y, &target_x);
-  // Build the message header
-  msgHdr = construct_dyn_hdr(0, msgsize, 0,             // msgsize word sent.
-                             self_y, self_x,
-                             target_y, target_x);
-  // start sending the msg, set send msg flag
-  isMsgSending = true;
-  gdn_send(msgHdr);                     // Send the message header to EAST to handle fab(n - 1).
-#ifdef RAWDEBUG
-  raw_test_pass(0xbbbb);
-  raw_test_pass(0xb000 + targetcore);       // targetcore
-#endif
-  gdn_send(5);   // lock release
-#ifdef RAWDEBUG
-  raw_test_pass(5);
-#endif
-  gdn_send(0);       // read lock
-#ifdef RAWDEBUG
-  raw_test_pass(0);
-#endif
-  gdn_send(ptr);
-#ifdef RAWDEBUG
-  raw_test_pass_reg(ptr);
-  raw_test_pass(0xffff);
-#endif
-  // end of sending this msg, set sand msg flag false
-  isMsgSending = false;
-  // check if there are pending msgs
-  while(isMsgHanging) {
-    // get the msg from outmsgdata[]
-    // length + target + msg
-    outmsgleft = outmsgdata[outmsgindex++];
-    targetcore = outmsgdata[outmsgindex++];
-    calCoords(targetcore, &target_y, &target_x);
-    // Build the message header
-    msgHdr = construct_dyn_hdr(0, outmsgleft, 0,                        // msgsize word sent.
-                               self_y, self_x,
-                               target_y, target_x);
-    isMsgSending = true;
-    gdn_send(msgHdr);                           // Send the message header to EAST to handle fab(n - 1).
-#ifdef RAWDEBUG
-    raw_test_pass(0xbbbb);
-    raw_test_pass(0xb000 + targetcore);             // targetcore
-#endif
-    while(outmsgleft-- > 0) {
-      gdn_send(outmsgdata[outmsgindex++]);
-#ifdef RAWDEBUG
-      raw_test_pass_reg(outmsgdata[outmsgindex - 1]);
+INLINE void processmsg_lockdeny_I() {
+       MSG_INDEXINC_I();
+       int data2 = msgdata[msgdataindex];
+       MSG_INDEXINC_I();
+       int data3 = msgdata[msgdataindex];
+       MSG_INDEXINC_I();
+       if(BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1) {
+#ifndef CLOSE_PRINT
+               BAMBOO_DEBUGPRINT_REG(data2);
+#endif
+               BAMBOO_EXIT(0xa006);
+       } 
+       if((lockobj == data2) && (lock2require == data3)) {
+#ifdef DEBUG
+#ifndef CLOSE_PRINT
+               BAMBOO_DEBUGPRINT(0xe883);
 #endif
-    }
-#ifdef RAWDEBUG
-    raw_test_pass(0xffff);
-#endif
-    isMsgSending = false;
-#ifdef INTERRUPT
-    raw_user_interrupts_off();
-#endif
-    // check if there are still msg hanging
-    if(outmsgindex == outmsglast) {
-      // no more msgs
-      outmsgindex = outmsglast = 0;
-      isMsgHanging = false;
-    }
-#ifdef INTERRUPT
-    raw_user_interrupts_on();
 #endif
-  }
-#elif defined THREADSIMULATE
-  int numofcore = pthread_getspecific(key);
-  int rc = pthread_rwlock_rdlock(&rwlock_tbl);
-  printf("[releasereadlock, %d] getting the read lock for locktbl: %d error: \n", numofcore, rc, strerror(rc));
-  if(!RuntimeHashcontainskey(locktbl, (int)ptr)) {
-    printf("[releasereadlock, %d] Error: try to release a lock without previously grab it\n", numofcore);
-    exit(-1);
-  }
-  pthread_rwlock_t* rwlock_obj = NULL;
-  RuntimeHashget(locktbl, (int)ptr, (int*)&rwlock_obj);
-  int rc_obj = pthread_rwlock_unlock(rwlock_obj);
-  printf("[releasereadlock, %d] unlocked object %d: %d error: \n", numofcore, (int)ptr, rc_obj, strerror(rc_obj));
-  rc = pthread_rwlock_unlock(&rwlock_tbl);
-  printf("[releasereadlock, %d] release the read lock for locktbl: %d error: \n", numofcore, rc, strerror(rc));
+               lockresult = 0;
+               lockflag = true;
+#ifndef INTERRUPT
+               reside = false;
+#endif
+               } else {
+               // conflicts on lockresults
+#ifndef CLOSE_PRINT
+               BAMBOO_DEBUGPRINT_REG(data2);
 #endif
+               BAMBOO_EXIT(0xa007);
+       }
 }
 
-#ifdef RAW
-bool getreadlock_I(void * ptr) {
-  unsigned msgHdr;
-  int self_y, self_x, target_y, target_x;
-  int targetcore = ((int)ptr >> 5) % TOTALCORE;
-  // for 32 bit machine, the size is always 4 words
-  //int msgsize = sizeof(int) * 4;
-  int msgsize = 4;
+INLINE void processmsg_lockrelease_I() {
+       int data1 = msgdata[msgdataindex];
+       MSG_INDEXINC_I();
+       int data2 = msgdata[msgdataindex];
+       MSG_INDEXINC_I();
+       // receive lock release msg
+       processlockrelease(data1, data2, 0, false);
+}
 
-  lockobj = (int)ptr;
-  lockflag = false;
+INLINE void processmsg_redirectlock_I() {
+       // check whether a lock exists for the required obj
+       int data1 = msgdata[msgdataindex];
+       MSG_INDEXINC_I(); //msgdata[1]; // lock type
+       int data2 = msgdata[msgdataindex];
+       MSG_INDEXINC_I();//msgdata[2]; // obj pointer
+       int data3 = msgdata[msgdataindex];
+       MSG_INDEXINC_I(); //msgdata[3]; // redirect lock
+       int data4 = msgdata[msgdataindex];
+       MSG_INDEXINC_I(); //msgdata[4]; // root request core
+       int data5 = msgdata[msgdataindex];
+       MSG_INDEXINC_I(); //msgdata[5]; // request core
+       int deny = processlockrequest(data1, data3, data2, data5, data4, true);
+       if(deny == -1) {
+               // this lock request is redirected
+               return;
+       } else {
+               // send response msg
+               // for 32 bit machine, the size is always 4 words
+               if(isMsgSending) {
+                       cache_msg_4(data4, deny==1?REDIRECTDENY:REDIRECTGROUNT, 
+                                                                       data1, data2, data3);
+               } else {
+                       send_msg_4(data4, deny==1?REDIRECTDENY:REDIRECTGROUNT, 
+                                                                data1, data2, data3, true);
+               }
+       }
+}
+
+INLINE void processmsg_redirectgrount_I() {
+       MSG_INDEXINC_I();
+       int data2 = msgdata[msgdataindex];
+       MSG_INDEXINC_I();
+       if(BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1) {
+#ifndef CLOSE_PRINT
+               BAMBOO_DEBUGPRINT_REG(data2);
+#endif
+               BAMBOO_EXIT(0xa00a);
+       }
+       if(lockobj == data2) {
+#ifdef DEBUG
+#ifndef CLOSE_PRINT
+               BAMBOO_DEBUGPRINT(0xe891);
+#endif
+#endif
+               int data3 = msgdata[msgdataindex];
+               MSG_INDEXINC_I();
+               lockresult = 1;
+               lockflag = true;
+               RuntimeHashadd_I(objRedirectLockTbl, lockobj, data3);
 #ifndef INTERRUPT
-  reside = false;
+               reside = false;
 #endif
-  lockresult = 0;
+       } else {
+               // conflicts on lockresults
+#ifndef CLOSE_PRINT
+               BAMBOO_DEBUGPRINT_REG(data2);
+#endif
+               BAMBOO_EXIT(0xa00b);
+       }
+}
 
-  if(targetcore == corenum) {
-    // reside on this core
-    bool deny = false;
-    if(!RuntimeHashcontainskey(locktbl, (int)ptr)) {
-      // no locks for this object
-      // first time to operate on this shared object
-      // create a lock for it
-      // the lock is an integer: 0 -- stall, >0 -- read lock, -1 -- write lock
-      RuntimeHashadd_I(locktbl, (int)ptr, 1);
-    } else {
-      int rwlock_obj = 0;
-      RuntimeHashget(locktbl, (int)ptr, &rwlock_obj);
-      if(-1 != rwlock_obj) {
-       rwlock_obj++;
-       RuntimeHashremovekey(locktbl, (int)ptr);
-       RuntimeHashadd_I(locktbl, (int)ptr, rwlock_obj);
-      } else {
-       deny = true;
-      }
-    }
-    if(lockobj == (int)ptr) {
-      if(deny) {
-       lockresult = 0;
-      } else {
-       lockresult = 1;
-      }
-      lockflag = true;
+INLINE void processmsg_redirectdeny_I() {
+       MSG_INDEXINC_I();
+       int data2 = msgdata[msgdataindex];
+       MSG_INDEXINC_I();
+       if(BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1) {
+#ifndef CLOSE_PRINT
+               BAMBOO_DEBUGPRINT_REG(data2);
+#endif
+               BAMBOO_EXIT(0xa00c);
+       }
+       if(lockobj == data2) {
+#ifdef DEBUG
+#ifndef CLOSE_PRINT
+               BAMBOO_DEBUGPRINT(0xe892);
+#endif
+#endif
+               lockresult = 0;
+               lockflag = true;
 #ifndef INTERRUPT
-      reside = true;
+               reside = false;
 #endif
-    } else {
-      // conflicts on lockresults
-      raw_test_done(0xa005);
-    }
-    return true;
-  }
-
-  calCoords(corenum, &self_y, &self_x);
-  calCoords(targetcore, &target_y, &target_x);
-  // Build the message header
-  msgHdr = construct_dyn_hdr(0, msgsize, 0,             // msgsize word sent.
-                             self_y, self_x,
-                             target_y, target_x);
-  gdn_send(msgHdr);                     // Send the message header to EAST to handle fab(n - 1).
-#ifdef RAWDEBUG
-  raw_test_pass(0xbbbb);
-  raw_test_pass(0xb000 + targetcore);       // targetcore
-#endif
-  gdn_send(2);   // lock request
-#ifdef RAWDEBUG
-  raw_test_pass(2);
-#endif
-  gdn_send(0);       // read lock
-#ifdef RAWDEBUG
-  raw_test_pass(0);
-#endif
-  gdn_send(ptr);
-#ifdef RAWDEBUG
-  raw_test_pass_reg(ptr);
-#endif
-  gdn_send(corenum);
-#ifdef RAWDEBUG
-  raw_test_pass_reg(corenum);
-  raw_test_pass(0xffff);
-#endif
-  return true;
+       } else {
+               // conflicts on lockresults
+#ifndef CLOSE_PRINT
+               BAMBOO_DEBUGPRINT_REG(data2);
+#endif
+               BAMBOO_EXIT(0xa00d);
+       }
 }
 
-void releasereadlock_I(void * ptr) {
-  unsigned msgHdr;
-  int self_y, self_x, target_y, target_x;
-  int targetcore = ((int)ptr >> 5) % TOTALCORE;
-  // for 32 bit machine, the size is always 3 words
-  //int msgsize = sizeof(int) * 3;
-  int msgsize = 3;
-
-  if(targetcore == corenum) {
-    // reside on this core
-    if(!RuntimeHashcontainskey(locktbl, (int)ptr)) {
-      // no locks for this object, something is wrong
-      raw_test_done(0xa006);
-    } else {
-      int rwlock_obj = 0;
-      RuntimeHashget(locktbl, (int)ptr, &rwlock_obj);
-      rwlock_obj--;
-      RuntimeHashremovekey(locktbl, (int)ptr);
-      RuntimeHashadd_I(locktbl, (int)ptr, rwlock_obj);
-    }
-    return;
-  }
+INLINE void processmsg_redirectrelease_I() {
+       int data1 = msgdata[msgdataindex];
+       MSG_INDEXINC_I();
+       int data2 = msgdata[msgdataindex];
+       MSG_INDEXINC_I();
+       int data3 = msgdata[msgdataindex];
+       MSG_INDEXINC_I();
+       processlockrelease(data1, data2, data3, true);
+}
+#endif // #ifndef MULTICORE_GC
 
-  calCoords(corenum, &self_y, &self_x);
-  calCoords(targetcore, &target_y, &target_x);
-  // Build the message header
-  msgHdr = construct_dyn_hdr(0, msgsize, 0,             // msgsize word sent.
-                             self_y, self_x,
-                             target_y, target_x);
-  gdn_send(msgHdr);                     // Send the message header to EAST to handle fab(n - 1).
-#ifdef RAWDEBUG
-  raw_test_pass(0xbbbb);
-  raw_test_pass(0xb000 + targetcore);       // targetcore
-#endif
-  gdn_send(5);   // lock release
-#ifdef RAWDEBUG
-  raw_test_pass(5);
-#endif
-  gdn_send(0);       // read lock
-#ifdef RAWDEBUG
-  raw_test_pass(0);
-#endif
-  gdn_send(ptr);
-#ifdef RAWDEBUG
-  raw_test_pass_reg(ptr);
-  raw_test_pass(0xffff);
+#ifdef PROFILE
+INLINE void processmsg_profileoutput_I() {
+       if(BAMBOO_NUM_OF_CORE == STARTUPCORE) {
+               // the startup core cannot receive a profile output msg
+               BAMBOO_EXIT(0xa008);
+       }
+#ifdef DEBUG
+#ifndef CLOSE_PRINT
+       BAMBOO_DEBUGPRINT(0xe885);
 #endif
+#endif
+       stall = true;
+       totalexetime = msgdata[msgdataindex]; //[1]
+       MSG_INDEXINC_I();
+       outputProfileData();
+       if(isMsgSending) {
+               cache_msg_2(STARTUPCORE, PROFILEFINISH, BAMBOO_NUM_OF_CORE);
+       } else {
+               send_msg_2(STARTUPCORE, PROFILEFINISH, BAMBOO_NUM_OF_CORE, true);
+       }
 }
+
+INLINE void processmsg_profilefinish_I() {
+       if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
+               // a non-startup core cannot receive a profile output finish msg
+#ifndef CLOSE_PRINT
+               BAMBOO_DEBUGPRINT_REG(msgdata[msgdataindex/*1*/]);
+#endif
+               BAMBOO_EXIT(0xa009);
+       }
+#ifdef DEBUG
+#ifndef CLOSE_PRINT
+       BAMBOO_DEBUGPRINT(0xe886);
+#endif
 #endif
+       int data1 = msgdata[msgdataindex];
+       MSG_INDEXINC_I();
+       profilestatus[data1] = 0;
+}
+#endif // #ifdef PROFILE
+
+INLINE void processmsg_statusconfirm_I() {
+       if((BAMBOO_NUM_OF_CORE == STARTUPCORE) 
+                       || (BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1)) {
+               // wrong core to receive such msg
+               BAMBOO_EXIT(0xa00e);
+       } else {
+               // send response msg
+#ifdef DEBUG
+#ifndef CLOSE_PRINT
+               BAMBOO_DEBUGPRINT(0xe887);
+#endif
+#endif
+               if(isMsgSending) {
+                       cache_msg_5(STARTUPCORE, STATUSREPORT, 
+                                                                       busystatus?1:0, BAMBOO_NUM_OF_CORE,
+                                                                       self_numsendobjs, self_numreceiveobjs);
+               } else {
+                       send_msg_5(STARTUPCORE, STATUSREPORT, busystatus?1:0, 
+                                                                BAMBOO_NUM_OF_CORE, self_numsendobjs, 
+                                                                self_numreceiveobjs, true);
+               }
+       }
+}
 
-// not reentrant
-bool getwritelock(void * ptr) {
-#ifdef RAW
-  unsigned msgHdr;
-  int self_y, self_x, target_y, target_x;
-  int targetcore = 0;       //((int)ptr >> 5) % TOTALCORE;
-  // for 32 bit machine, the size is always 4 words
-  //int msgsize = sizeof(int) * 4;
-  int msgsize= 4;
-  int tc = TOTALCORE;
-#ifdef INTERRUPT
-  raw_user_interrupts_off();
+INLINE void processmsg_statusreport_I() {
+       int data1 = msgdata[msgdataindex];
+       MSG_INDEXINC_I();
+       int data2 = msgdata[msgdataindex];
+       MSG_INDEXINC_I();
+       int data3 = msgdata[msgdataindex];
+       MSG_INDEXINC_I();
+       int data4 = msgdata[msgdataindex];
+       MSG_INDEXINC_I();
+       // receive a status confirm info
+       if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
+               // wrong core to receive such msg
+#ifndef CLOSE_PRINT
+               BAMBOO_DEBUGPRINT_REG(data2);
+#endif
+               BAMBOO_EXIT(0xa00f);
+       } else {
+#ifdef DEBUG
+#ifndef CLOSE_PRINT
+               BAMBOO_DEBUGPRINT(0xe888);
 #endif
-  targetcore = ((int)ptr >> 5) % tc;
-#ifdef INTERRUPT
-  raw_user_interrupts_on();
 #endif
+               if(waitconfirm) {
+                       numconfirm--;
+               }
+               corestatus[data2] = data1;
+               numsendobjs[data2] = data3;
+               numreceiveobjs[data2] = data4;
+       }
+}
 
-#ifdef RAWDEBUG
-  raw_test_pass(0xe551);
-  raw_test_pass_reg(ptr);
-  raw_test_pass_reg(targetcore);
-  raw_test_pass_reg(tc);
+INLINE void processmsg_terminate_I() {
+#ifdef DEBUG
+#ifndef CLOSE_PRINT
+       BAMBOO_DEBUGPRINT(0xe889);
+#endif
 #endif
+       disruntimedata();
+       BAMBOO_EXIT(0);
+}
 
-  lockobj = (int)ptr;
-  lockflag = false;
-#ifndef INTERRUPT
-  reside = false;
+INLINE void processmsg_memrequest_I() {
+       int data1 = msgdata[msgdataindex];
+       MSG_INDEXINC_I();
+       int data2 = msgdata[msgdataindex];
+       MSG_INDEXINC_I();
+       // receive a shared memory request msg
+       if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
+               // wrong core to receive such msg
+#ifndef CLOSE_PRINT
+               BAMBOO_DEBUGPRINT_REG(data2);
+#endif
+               BAMBOO_EXIT(0xa010);
+       } else {
+#ifdef DEBUG
+#ifndef CLOSE_PRINT
+               BAMBOO_DEBUGPRINT(0xe88a);
+#endif
+#endif
+               int allocsize = 0;
+               void * mem = NULL;
+#ifdef MULTICORE_GC
+               if(gcprocessing) {
+                       // gc is currently in progress, discard this msg
+                       if(INITPHASE == gcphase) {
+                               // if still in the initphase of gc, send a startinit msg again
+                               if(isMsgSending) {
+                                       cache_msg_1(data2, GCSTARTINIT);
+                               } else {
+                                       send_msg_1(data2, GCSTARTINIT, true);
+                               }
+                       }
+               } else { 
+#endif
+               mem = smemalloc_I(data2, data1, &allocsize);
+               if(mem != NULL) {
+                       // send the start_va to request core
+                       if(isMsgSending) {
+                               cache_msg_3(data2, MEMRESPONSE, mem, allocsize);
+                       } else {
+                               send_msg_3(data2, MEMRESPONSE, mem, allocsize, true);
+                       } 
+               } // if mem == NULL, the gcflag of the startup core has been set
+                       // and the gc should be started later, then a GCSTARTINIT msg
+                       // will be sent to the requesting core to notify it to start gc
+                       // and try malloc again
+#ifdef MULTICORE_GC
+               }
 #endif
-  lockresult = 0;
+       }
+}
 
-  if(targetcore == corenum) {
-    // reside on this core
-    bool deny = false;
-#ifdef INTERRUPT
-    raw_user_interrupts_off();
-#endif
-    if(!RuntimeHashcontainskey(locktbl, (int)ptr)) {
-      // no locks for this object
-      // first time to operate on this shared object
-      // create a lock for it
-      // the lock is an integer: 0 -- stall, >0 -- read lock, -1 -- write lock
-#ifdef RAWDEBUG
-      raw_test_pass(0xe552);
-#endif
-      RuntimeHashadd_I(locktbl, (int)ptr, -1);
-    } else {
-      int rwlock_obj = 0;
-      RuntimeHashget(locktbl, (int)ptr, &rwlock_obj);
-#ifdef RAWDEBUG
-      raw_test_pass(0xe553);
-      raw_test_pass_reg(rwlock_obj);
-#endif
-      if(0 == rwlock_obj) {
-       rwlock_obj = -1;
-       RuntimeHashremovekey(locktbl, (int)ptr);
-       RuntimeHashadd_I(locktbl, (int)ptr, rwlock_obj);
-      } else {
-       deny = true;
-      }
-    }
-#ifdef INTERRUPT
-    raw_user_interrupts_on();
+INLINE void processmsg_memresponse_I() {
+       int data1 = msgdata[msgdataindex];
+       MSG_INDEXINC_I();
+       int data2 = msgdata[msgdataindex];
+       MSG_INDEXINC_I();
+       // receive a shared memory response msg
+#ifdef DEBUG
+#ifndef CLOSE_PRINT
+       BAMBOO_DEBUGPRINT(0xe88b);
 #endif
-#ifdef RAWDEBUG
-    raw_test_pass(0xe554);
-    raw_test_pass_reg(lockresult);
 #endif
-    if(lockobj == (int)ptr) {
-      if(deny) {
-       lockresult = 0;
-#ifdef RAWDEBUG
-       raw_test_pass(0);
+#ifdef MULTICORE_GC
+       // if gc is currently in progress, discard this msg
+       if(!gcprocessing) {
 #endif
-      } else {
-       lockresult = 1;
-#ifdef RAWDEBUG
-       raw_test_pass(1);
+       if(data2 == 0) {
+               bamboo_smem_size = 0;
+               bamboo_cur_msp = 0;
+       } else {
+#ifdef MULTICORE_GC
+               // fill header to store the size of this mem block
+               memset(data1, 0, BAMBOO_CACHE_LINE_SIZE);
+               (*((int*)data1)) = data2;
+               bamboo_smem_size = data2 - BAMBOO_CACHE_LINE_SIZE;
+               bamboo_cur_msp = data1 + BAMBOO_CACHE_LINE_SIZE;
+#else
+               bamboo_smem_size = data2;
+               bamboo_cur_msp =(void*)(data1);
 #endif
-      }
-      lockflag = true;
-#ifndef INTERRUPT
-      reside = true;
+       }
+       smemflag = true;
+#ifdef MULTICORE_GC
+       }
 #endif
-    } else {
-      // conflicts on lockresults
-      raw_test_done(0xa007);
-    }
-    return true;
-  }
+}
 
-#ifdef RAWDEBUG
-  raw_test_pass(0xe555);
-#endif
-  calCoords(corenum, &self_y, &self_x);
-  calCoords(targetcore, &target_y, &target_x);
-  // Build the message header
-  msgHdr = construct_dyn_hdr(0, msgsize, 0,             // msgsize word sent.
-                             self_y, self_x,
-                             target_y, target_x);
-  // start sending the msg, set send msg flag
-  isMsgSending = true;
-  gdn_send(msgHdr);                     // Send the message header to EAST to handle fab(n - 1).
-#ifdef RAWDEBUG
-  raw_test_pass(0xbbbb);
-  raw_test_pass(0xb000 + targetcore);       // targetcore
-#endif
-  gdn_send(2);   // lock request
-#ifdef RAWDEBUG
-  raw_test_pass(2);
-#endif
-  gdn_send(1);       // write lock
-#ifdef RAWDEBUG
-  raw_test_pass(1);
-#endif
-  gdn_send(ptr);
-#ifdef RAWDEBUG
-  raw_test_pass_reg(ptr);
-#endif
-  gdn_send(corenum);
-#ifdef RAWDEBUG
-  raw_test_pass_reg(corenum);
-  raw_test_pass(0xffff);
-#endif
-  // end of sending this msg, set sand msg flag false
-  isMsgSending = false;
-  // check if there are pending msgs
-  while(isMsgHanging) {
-    // get the msg from outmsgdata[]
-    // length + target + msg
-    outmsgleft = outmsgdata[outmsgindex++];
-    targetcore = outmsgdata[outmsgindex++];
-    calCoords(targetcore, &target_y, &target_x);
-    // Build the message header
-    msgHdr = construct_dyn_hdr(0, outmsgleft, 0,                        // msgsize word sent.
-                               self_y, self_x,
-                               target_y, target_x);
-    isMsgSending = true;
-    gdn_send(msgHdr);                           // Send the message header to EAST to handle fab(n - 1).
-#ifdef RAWDEBUG
-    raw_test_pass(0xbbbb);
-    raw_test_pass(0xb000 + targetcore);             // targetcore
-#endif
-    while(outmsgleft-- > 0) {
-      gdn_send(outmsgdata[outmsgindex++]);
-#ifdef RAWDEBUG
-      raw_test_pass_reg(outmsgdata[outmsgindex - 1]);
+#ifdef MULTICORE_GC
+// Handles a GCSTARTINIT message: raises gcflag and puts this core into
+// INITPHASE. When smemflag is still clear (the original author notes this
+// means a memory request is awaiting its response), the request is
+// completed here with an empty result so that gc can start.
+INLINE void processmsg_gcstartinit_I() {
+       gcflag = true;
+       gcphase = INITPHASE;
+       if(smemflag) {
+               // no pending memory request to unblock
+               return;
+       }
+       // hand back "no memory" (size 0, NULL block) and mark the request done
+       bamboo_smem_size = 0;
+       bamboo_cur_msp = NULL;
+       smemflag = true;
+}
+
+INLINE void processmsg_gcstart_I() { // handles a GCSTART msg; reads no payload words
+#ifdef DEBUG
+#ifndef CLOSE_PRINT
+       BAMBOO_DEBUGPRINT(0xe88c);
 #endif
-    }
-#ifdef RAWDEBUG
-    raw_test_pass(0xffff);
-#endif
-    isMsgSending = false;
-#ifdef INTERRUPT
-    raw_user_interrupts_off();
-#endif
-    // check if there are still msg hanging
-    if(outmsgindex == outmsglast) {
-      // no more msgs
-      outmsgindex = outmsglast = 0;
-      isMsgHanging = false;
-    }
-#ifdef INTERRUPT
-    raw_user_interrupts_on();
 #endif
-  }
-  return true;
-#elif defined THREADSIMULATE
-  int numofcore = pthread_getspecific(key);
-
-  int rc = pthread_rwlock_tryrdlock(&rwlock_tbl);
-  printf("[getwritelock, %d] getting the read lock for locktbl: %d error: \n", numofcore, rc, strerror(rc));
-  if(0 != rc) {
-    return false;
-  }
-  if(!RuntimeHashcontainskey(locktbl, (int)ptr)) {
-    // no locks for this object
-    // first time to operate on this shared object
-    // create a lock for it
-    rc = pthread_rwlock_unlock(&rwlock_tbl);
-    printf("[getwritelock, %d] release the read lock for locktbl: %d error: \n", numofcore, rc, strerror(rc));
-    pthread_rwlock_t* rwlock = (pthread_rwlock_t *)RUNMALLOC(sizeof(pthread_rwlock_t));
-    memcpy(rwlock, &rwlock_init, sizeof(pthread_rwlock_t));
-    rc = pthread_rwlock_init(rwlock, NULL);
-    printf("[getwritelock, %d] initialize the rwlock for object %d: %d error: \n", numofcore, (int)ptr, rc, strerror(rc));
-    rc = pthread_rwlock_trywrlock(&rwlock_tbl);
-    printf("[getwritelock, %d] getting the write lock for locktbl: %d error: \n", numofcore, rc, strerror(rc));
-    if(0 != rc) {
-      pthread_rwlock_destroy(rwlock);
-      RUNFREE(rwlock);
-      return false;
-    } else {
-      if(!RuntimeHashcontainskey(locktbl, (int)ptr)) {
-       // check again
-       RuntimeHashadd(locktbl, (int)ptr, (int)rwlock);
-      } else {
-       pthread_rwlock_destroy(rwlock);
-       RUNFREE(rwlock);
-       RuntimeHashget(locktbl, (int)ptr, (int*)&rwlock);
-      }
-      rc = pthread_rwlock_unlock(&rwlock_tbl);
-      printf("[getwritelock, %d] release the write lock for locktbl: %d error: \n", numofcore, rc, strerror(rc));
-    }
-    rc = pthread_rwlock_trywrlock(rwlock);
-    printf("[getwritelock, %d] getting write lock for object %d: %d error: \n", numofcore, (int)ptr, rc, strerror(rc));
-    if(0 != rc) {
-      return false;
-    } else {
-      return true;
-    }
-  } else {
-    pthread_rwlock_t* rwlock_obj = NULL;
-    RuntimeHashget(locktbl, (int)ptr, (int*)&rwlock_obj);
-    rc = pthread_rwlock_unlock(&rwlock_tbl);
-    printf("[getwritelock, %d] release the read lock for locktbl: %d error: \n", numofcore, rc, strerror(rc));
-    int rc_obj = pthread_rwlock_trywrlock(rwlock_obj);
-    printf("[getwritelock, %d] getting write lock for object %d: %d error: \n", numofcore, (int)ptr, rc_obj, strerror(rc_obj));
-    if(0 != rc_obj) {
-      return false;
-    } else {
-      return true;
-    }
-  }
+       // enter the mark phase (only gcphase changes; gcflag is not touched here)
+       gcphase = MARKPHASE;
+}
 
-#endif
+INLINE void processmsg_gcstartcompact_I() { // handles a GCSTARTCOMPACT msg
+       gcblock2fill = msgdata[msgdataindex]; // single payload word, stored as gcblock2fill
+       MSG_INDEXINC_I(); // consumed the word formerly addressed as msgdata[1]
+       gcphase = COMPACTPHASE; // switch this core to the compact phase
 }
 
-void releasewritelock(void * ptr) {
-#ifdef RAW
-  unsigned msgHdr;
-  int self_y, self_x, target_y, target_x;
-  int targetcore = 0;       //((int)ptr >> 5) % TOTALCORE;
-  // for 32 bit machine, the size is always 3 words
-  //int msgsize = sizeof(int) * 3;
-  int msgsize = 3;
-  int tc = TOTALCORE;
-#ifdef INTERRUPT
-  raw_user_interrupts_off();
+INLINE void processmsg_gcstartflush_I() { // handles a GCSTARTFLUSH msg; reads no payload words
+       gcphase = FLUSHPHASE; // switch this core to the flush phase
+}
+
+INLINE void processmsg_gcfinishinit_I() { // handles a GCFINISHINIT msg
+       int data1 = msgdata[msgdataindex]; // payload word: number of the reporting core
+       MSG_INDEXINC_I();
+       // received an init phase finish msg
+       if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
+               // only the startup core may receive this msg; anything else aborts
+#ifndef CLOSE_PRINT
+               BAMBOO_DEBUGPRINT_REG(data1);
 #endif
-  targetcore = ((int)ptr >> 5) % tc;
-#ifdef INTERRUPT
-  raw_user_interrupts_on();
+               BAMBOO_EXIT(0xb001);
+       }
+#ifdef DEBUG
+       BAMBOO_DEBUGPRINT(0xe88c);
+       BAMBOO_DEBUGPRINT_REG(data1);
 #endif
+       // all active cores take part in the init phase; clear the reporter's status
+       if(data1 < NUMCORESACTIVE) {
+               gccorestatus[data1] = 0;
+       }
+}
+
+// Handles a GCFINISHMARK message on the startup core.
+// Words consumed, in order: number of the reporting core, the value
+// stored as its gcnumsendobjs entry, the value stored as its
+// gcnumreceiveobjs entry. Any core other than STARTUPCORE aborts through
+// BAMBOO_EXIT(0xb002).
+INLINE void processmsg_gcfinishmark_I() {
+       // drain the three payload words before any validation
+       int srccore = msgdata[msgdataindex];
+       MSG_INDEXINC_I();
+       int nsent = msgdata[msgdataindex];
+       MSG_INDEXINC_I();
+       int nreceived = msgdata[msgdataindex];
+       MSG_INDEXINC_I();
+
+       if(STARTUPCORE != BAMBOO_NUM_OF_CORE) {
+               // mark-finish reports are only valid on the startup core
+#ifndef CLOSE_PRINT
+               BAMBOO_DEBUGPRINT_REG(srccore);
+#endif
+               BAMBOO_EXIT(0xb002);
+       }
+
+       if(srccore >= NUMCORESACTIVE) {
+               // reporter is outside the active core range: nothing to record
+               return;
+       }
+       // clear the reporter's status and keep its object counters
+       gccorestatus[srccore] = 0;
+       gcnumsendobjs[srccore] = nsent;
+       gcnumreceiveobjs[srccore] = nreceived;
+}
 
-  if(targetcore == corenum) {
-#ifdef INTERRUPT
-    raw_user_interrupts_off();
+INLINE void processmsg_gcfinishcompact_I() { // handles a GCFINISHCOMPACT msg
+       if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
+               // only the startup core may receive this msg
+               // abort via BAMBOO_EXIT (the payload is left unconsumed)
+#ifndef CLOSE_PRINT
+               BAMBOO_DEBUGPRINT_REG(msgdata[msgdataindex]/*[1]*/);
 #endif
-    // reside on this core
-    if(!RuntimeHashcontainskey(locktbl, (int)ptr)) {
-      // no locks for this object, something is wrong
-      raw_test_done(0xa008);
-    } else {
-      int rwlock_obj = 0;
-#ifdef RAWDEBUG
-      raw_test_pass(0xe662);
+               BAMBOO_EXIT(0xb003);
+       }
+       int cnum = msgdata[msgdataindex]; // number of the reporting core
+       MSG_INDEXINC_I(); //msgdata[1];
+       int filledblocks = msgdata[msgdataindex]; // recorded in gcfilledblocks[cnum]
+       MSG_INDEXINC_I(); //msgdata[2];
+       int heaptop = msgdata[msgdataindex]; // recorded in gcloads[cnum]
+       MSG_INDEXINC_I(); //msgdata[3];
+       int data4 = msgdata[msgdataindex]; // > 0 means the core asks for more memory
+       MSG_INDEXINC_I(); //msgdata[4];
+       // only gc cores need to do compact
+       if(cnum < NUMCORES4GC) {
+               if(COMPACTPHASE == gcphase) {
+                       gcfilledblocks[cnum] = filledblocks;
+                       gcloads[cnum] = heaptop;
+               }
+               if(data4 > 0) {
+                       // ask for more mem
+                       int startaddr = 0;
+                       int tomove = 0;
+                       int dstcore = 0;
+                       if(gcfindSpareMem_I(&startaddr, &tomove, &dstcore, data4, cnum)) {
+                               if(isMsgSending) {
+                                       cache_msg_4(cnum, GCMOVESTART, dstcore, startaddr, tomove);
+                         } else {
+                                       send_msg_4(cnum, GCMOVESTART, dstcore, startaddr, tomove, true);
+                               }
+                       }
+               } else {
+                       gccorestatus[cnum] = 0; // no further request: clear the core's status
+               } // if(data4>0)
+       } // if(cnum < NUMCORES4GC)
+}
+
+// Handles a GCFINISHFLUSH message on the startup core.
+// Consumes one word, the number of the reporting core, and clears that
+// core's gccorestatus entry when the number is below NUMCORESACTIVE.
+// Any core other than STARTUPCORE aborts through BAMBOO_EXIT(0xb004).
+INLINE void processmsg_gcfinishflush_I() {
+       int srccore = msgdata[msgdataindex];
+       MSG_INDEXINC_I();
+
+       if(STARTUPCORE != BAMBOO_NUM_OF_CORE) {
+               // flush-finish reports are only valid on the startup core
+#ifndef CLOSE_PRINT
+               BAMBOO_DEBUGPRINT_REG(srccore);
+#endif
+               BAMBOO_EXIT(0xb004);
+       }
+
+       if(srccore >= NUMCORESACTIVE) {
+               // reporter is outside the active core range: nothing to record
+               return;
+       }
+       gccorestatus[srccore] = 0;
+}
+
+// Handles a GCMARKCONFIRM message: answers the startup core with a
+// GCMARKREPORT carrying this core's number, gcbusystatus and its
+// gcself_numsendobjs / gcself_numreceiveobjs counters. The reply is
+// queued with cache_msg_5 while isMsgSending is set and sent directly
+// otherwise. The startup core itself, or a core numbered beyond the
+// active range, aborts through BAMBOO_EXIT(0xb005).
+INLINE void processmsg_gcmarkconfirm_I() {
+       if((STARTUPCORE == BAMBOO_NUM_OF_CORE)
+                       || (BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1)) {
+               // wrong core to receive such msg
+               BAMBOO_EXIT(0xb005);
+               return;
+       }
+
+       if(!isMsgSending) {
+               // channel is free: send the report right away
+               send_msg_5(STARTUPCORE, GCMARKREPORT, BAMBOO_NUM_OF_CORE,
+                                                                gcbusystatus, gcself_numsendobjs,
+                                                                gcself_numreceiveobjs, true);
+               return;
+       }
+       // a send is in progress: cache the report instead
+       cache_msg_5(STARTUPCORE, GCMARKREPORT, BAMBOO_NUM_OF_CORE,
+                                                       gcbusystatus, gcself_numsendobjs,
+                                                       gcself_numreceiveobjs);
+}
+
+// Handles a GCMARKREPORT message (reply to a mark confirm request) on
+// the startup core.
+// Words consumed, in order: number of the reporting core, its status,
+// and the values stored as its gcnumsendobjs and gcnumreceiveobjs
+// entries. While waitconfirm is set, numconfirm is decremented.
+// Any core other than STARTUPCORE aborts through BAMBOO_EXIT(0xb006).
+INLINE void processmsg_gcmarkreport_I() {
+       // drain the four payload words before any validation
+       int srccore = msgdata[msgdataindex];
+       MSG_INDEXINC_I();
+       int status = msgdata[msgdataindex];
+       MSG_INDEXINC_I();
+       int nsent = msgdata[msgdataindex];
+       MSG_INDEXINC_I();
+       int nreceived = msgdata[msgdataindex];
+       MSG_INDEXINC_I();
+
+       if(STARTUPCORE != BAMBOO_NUM_OF_CORE) {
+               // wrong core to receive such msg
+#ifndef CLOSE_PRINT
+               BAMBOO_DEBUGPRINT_REG(status);
+#endif
+               BAMBOO_EXIT(0xb006);
+               return;
+       }
+
+       if(waitconfirm) {
+               // one fewer confirmation outstanding
+               numconfirm--;
+       }
+       gccorestatus[srccore] = status;
+       gcnumsendobjs[srccore] = nsent;
+       gcnumreceiveobjs[srccore] = nreceived;
+}
+
+// Handles a GCMARKEDOBJ message. The single payload word is treated as
+// the address of an object whose int slot 6 holds its mark state: an
+// object still in state INIT becomes DISCOVERED and is enqueued with
+// gc_enqueue_I. In every case gcself_numreceiveobjs is incremented and
+// gcbusystatus is set.
+INLINE void processmsg_gcmarkedobj_I() {
+       int objaddr = msgdata[msgdataindex];
+       MSG_INDEXINC_I();
+
+       int * objwords = (int *)objaddr;
+       if(INIT == objwords[6]) {
+               // first time this object is seen: flag it and queue it
+               objwords[6] = DISCOVERED;
+               gc_enqueue_I(objaddr);
+       }
+
+       gcself_numreceiveobjs++;
+       gcbusystatus = true;
+}
+
+INLINE void processmsg_gcmovestart_I() { // handles a GCMOVESTART msg
+       gctomove = true; // flag that move information has arrived
+       gcdstcore = msgdata[msgdataindex]; // word 1: destination core
+       MSG_INDEXINC_I(); //msgdata[1];
+       gcmovestartaddr = msgdata[msgdataindex]; // word 2: start address
+       MSG_INDEXINC_I(); //msgdata[2];
+       gcblock2fill = msgdata[msgdataindex]; // word 3: stored as gcblock2fill
+       MSG_INDEXINC_I(); //msgdata[3];
+}
+
+INLINE void processmsg_gcmaprequest_I() { // handles a GCMAPREQUEST msg
+#ifdef GC_PROFILE
+       //unsigned long long ttime = BAMBOO_GET_EXE_TIME();
+#endif
+       void * dstptr = NULL; // stays NULL when the lookup below finds nothing
+       int data1 = msgdata[msgdataindex]; // word 1: key looked up in gcpointertbl
+       MSG_INDEXINC_I();
+       //dstptr = mgchashSearch(msgdata[1]);
+#ifdef GC_PROFILE
+       unsigned long long ttime = BAMBOO_GET_EXE_TIME();
+#endif
+       RuntimeHashget(gcpointertbl, data1, &dstptr);
+#ifdef GC_PROFILE
+       flushstalltime += BAMBOO_GET_EXE_TIME() - ttime;
+#endif
+       int data2 = msgdata[msgdataindex]; // word 2: core the reply is sent to
+       MSG_INDEXINC_I();
+       //MGCHashget(gcpointertbl, msgdata[1], &dstptr);
+#ifdef GC_PROFILE
+       unsigned long long ttimei = BAMBOO_GET_EXE_TIME();
+#endif
+       if(NULL == dstptr) {
+               // no mapping for this pointer on this core, something is wrong
+#ifdef DEBUG
+               BAMBOO_DEBUGPRINT_REG(data1);
+               BAMBOO_DEBUGPRINT_REG(data2);
+#endif
+               BAMBOO_EXIT(0xb007);
+               //assume that the object was not moved, use the original address
+               /*if(isMsgSending) {
+                       cache_msg_3(msgdata[2], GCMAPINFO, msgdata[1], msgdata[1]);
+               } else {
+                       send_msg_3(msgdata[2], GCMAPINFO, msgdata[1], msgdata[1], true);
+               }*/
+       } else {
+               // send back the mapping info (cached while another send is in progress)
+               if(isMsgSending) {
+                       cache_msg_3(data2, GCMAPINFO, data1, (int)dstptr);
+               } else {
+                       send_msg_3(data2, GCMAPINFO, data1, (int)dstptr, true);
+               }
+       }
+#ifdef GC_PROFILE
+       flushstalltime_i += BAMBOO_GET_EXE_TIME()-ttimei;
+       //num_mapinforequest_i++;
 #endif
-      RuntimeHashget(locktbl, (int)ptr, &rwlock_obj);
-#ifdef RAWDEBUG
-      raw_test_pass_reg(rwlock_obj);
+}
+
+// Handles a GCMAPINFO message (reply to a mapping info request).
+// Words consumed, in order: the object the mapping is for, which must
+// equal gcobj2map, and the address it maps to. A matching reply is
+// stored in gcmappedobj and added to gcpointertbl; a mismatch aborts
+// through BAMBOO_EXIT(0xb008). gcismapped is set afterwards.
+INLINE void processmsg_gcmapinfo_I() {
+#ifdef GC_PROFILE
+       //unsigned long long ttime = BAMBOO_GET_EXE_TIME();
 #endif
-      rwlock_obj++;
-      RuntimeHashremovekey(locktbl, (int)ptr);
-      RuntimeHashadd_I(locktbl, (int)ptr, rwlock_obj);
-#ifdef RAWDEBUG
-      raw_test_pass_reg(rwlock_obj);
+       int data1 = msgdata[msgdataindex];
+       MSG_INDEXINC_I();
+       if(data1 != gcobj2map) {
+               // obj not matched, something is wrong
+#ifdef DEBUG
+               BAMBOO_DEBUGPRINT_REG(gcobj2map);
+               // print the word just consumed, not the absolute slot msgdata[1]
+               BAMBOO_DEBUGPRINT_REG(data1);
+#endif
+               BAMBOO_EXIT(0xb008);
+       } else {
+               gcmappedobj = msgdata[msgdataindex]; // [2]
+               MSG_INDEXINC_I();
+               //mgchashReplace_I(msgdata[1], msgdata[2]);
+               //mgchashInsert_I(gcobj2map, gcmappedobj);
+               RuntimeHashadd_I(gcpointertbl, gcobj2map, gcmappedobj);
+               //MGCHashadd_I(gcpointertbl, gcobj2map, gcmappedobj);
+       }
+       gcismapped = true;
+#ifdef GC_PROFILE
+       //flushstalltime += BAMBOO_GET_EXE_TIME() - ttime;
+#endif
+}
+
+// Handles a GCLOBJINFO message (large object info reported by a core).
+// Words consumed, in order:
+//   data1 - total length of the message in words (type word included)
+//   data2 - number of the reporting core
+//   load  - stored in gcloads[] for that core
+//   data4 - heap top of that core; gcheaptop keeps the maximum seen
+// followed by (lobj, length) pairs occupying words 5 .. data1-1, each of
+// which is enqueued with gc_lobjenqueue_I and counted in gcnumlobjs.
+// numconfirm is decremented before anything else. A core numbered
+// beyond NUMCORES4GC - 1 aborts through BAMBOO_EXIT(0xb009).
+INLINE void processmsg_gclobjinfo_I() {
+       numconfirm--;
+
+       int data1 = msgdata[msgdataindex];
+       MSG_INDEXINC_I();
+       int data2 = msgdata[msgdataindex];
+       MSG_INDEXINC_I();
+       if(BAMBOO_NUM_OF_CORE > NUMCORES4GC - 1) {
+#ifndef CLOSE_PRINT
+               BAMBOO_DEBUGPRINT_REG(data2);
+#endif
+               BAMBOO_EXIT(0xb009);
+       }
+       // store the mark result info
+       int cnum = data2;
+       gcloads[cnum] = msgdata[msgdataindex];
+       MSG_INDEXINC_I(); // msgdata[3];
+       int data4 = msgdata[msgdataindex];
+       MSG_INDEXINC_I();
+       if(gcheaptop < data4) {
+               gcheaptop = data4;
+       }
+       // large obj info here: k tracks the word position inside the message
+       // and must advance by the two words consumed per iteration, otherwise
+       // the loop never terminates
+       for(int k = 5; k < data1; k += 2) {
+               int lobj = msgdata[msgdataindex];
+               MSG_INDEXINC_I(); // word k
+               int length = msgdata[msgdataindex];
+               MSG_INDEXINC_I(); // word k+1
+               gc_lobjenqueue_I(lobj, length, cnum);
+               gcnumlobjs++;
+       } // for(int k = 5; k < data1; k += 2)
+}
+
+// Handles a GCLOBJMAPPING message: consumes two words and records them
+// in gcpointertbl as a (key, value) pair.
+INLINE void processmsg_gclobjmapping_I() {
+       int mapkey = msgdata[msgdataindex];
+       MSG_INDEXINC_I();
+       int mapvalue = msgdata[msgdataindex];
+       MSG_INDEXINC_I();
+
+       RuntimeHashadd_I(gcpointertbl, mapkey, mapvalue);
+}
+#endif // #ifdef MULTICORE_GC
+
+// Receives pending messages from other cores (object transfers, stall
+// and terminate notices, lock, profile, memory and GC traffic) and
+// dispatches every complete message to its processmsg_*_I handler.
+// Should be invoked in critical sections!!
+// NOTICE: the following format is for the threadsimulate version only;
+//         for the RAW version please see the previous description
+// format: type + object
+// type: -1--stall msg
+//      !-1--object
+// return value: 0--received an object
+//               1--received nothing
+//               2--received a Stall Msg
+//               3--received a lock Msg
+//               RAW version: -1 -- received nothing
+//                            otherwise -- received msg type
+// As implemented below: -1 when receiveMsg() reports nothing or only a
+// partial message is buffered and no more data is available, -2 from the
+// final "not a whole msg" branch, otherwise the type of the last message
+// that was dispatched.
+int receiveObject() {
+msg:
+       // get the incoming msgs
+  if(receiveMsg() == -1) {
+         return -1;
+  }
+processmsg: ; // empty statement: before C23 a label cannot directly precede a declaration
+       // processing received msgs
+       int size = 0;
+       MSG_REMAINSIZE_I(&size);
+  if(checkMsgLength_I(size) == -1) {
+               // not a whole msg
+               // have new coming msg
+               if(BAMBOO_MSG_AVAIL() != 0) {
+                       goto msg;
+               } else {
+                       return -1;
+               }
+       }
+
+       if(msglength <= size) {
+               // have some whole msg
+  //if(msgdataindex == msglength) {
+    // received a whole msg
+    MSGTYPE type;
+    type = msgdata[msgdataindex]; //[0]
+               MSG_INDEXINC_I();
+               msgdatafull = false;
+               // TODO
+               //tprintf("msg type: %x\n", type);
+    switch(type) {
+                       case TRANSOBJ: {
+                               // receive a object transfer msg
+                               processmsg_transobj_I();
+                               break;
+                       } // case TRANSOBJ
+
+                       case TRANSTALL: {
+                               // receive a stall msg
+                               processmsg_transtall_I();
+                               break;
+                       } // case TRANSTALL
+
+// GC version have no lock msgs
+#ifndef MULTICORE_GC
+                       case LOCKREQUEST: {
+                               // receive lock request msg, handle it right now
+                               processmsg_lockrequest_I();
+                               break;
+                       } // case LOCKREQUEST
+
+                       case LOCKGROUNT: {
+                               // receive a lock grant msg
+                               processmsg_lockgrount_I();
+                               break;
+                       } // case LOCKGROUNT
+
+                       case LOCKDENY: {
+                               // receive lock deny msg
+                               processmsg_lockdeny_I();
+                               break;
+                       } // case LOCKDENY
+
+                       case LOCKRELEASE: {
+                               processmsg_lockrelease_I();
+                               break;
+                       } // case LOCKRELEASE
+#endif // #ifndef MULTICORE_GC
+
+#ifdef PROFILE
+                       case PROFILEOUTPUT: {
+                               // receive an output profile data request msg
+                               processmsg_profileoutput_I();
+                               break;
+                       } // case PROFILEOUTPUT
+
+                       case PROFILEFINISH: {
+                               // receive a profile output finish msg
+                               processmsg_profilefinish_I();
+                               break;
+                       } // case PROFILEFINISH
+#endif // #ifdef PROFILE
+
+// GC version has no lock msgs
+#ifndef MULTICORE_GC
+                       case REDIRECTLOCK: {
+                               // receive a redirect lock request msg, handle it right now
+                               processmsg_redirectlock_I();
+                               break;
+                       } // case REDIRECTLOCK
+
+                       case REDIRECTGROUNT: {
+                               // receive a lock grant msg with redirect info
+                               processmsg_redirectgrount_I();
+                               break;
+                       } // case REDIRECTGROUNT
+
+                       case REDIRECTDENY: {
+                               // receive a lock deny msg with redirect info
+                               processmsg_redirectdeny_I();
+                               break;
+                       } // case REDIRECTDENY
+
+                       case REDIRECTRELEASE: {
+                               // receive a lock release msg with redirect info
+                               processmsg_redirectrelease_I();
+                               break;
+                       } // case REDIRECTRELEASE
+#endif // #ifndef MULTICORE_GC
+
+                       case STATUSCONFIRM: {
+                               // receive a status confirm info
+                               processmsg_statusconfirm_I();
+                               break;
+                       } // case STATUSCONFIRM
+
+                       case STATUSREPORT: {
+                               processmsg_statusreport_I();
+                               break;
+                       } // case STATUSREPORT
+
+                       case TERMINATE: {
+                               // receive a terminate msg
+                               processmsg_terminate_I();
+                               break;
+                       } // case TERMINATE
+
+                       case MEMREQUEST: {
+                               processmsg_memrequest_I();
+                               break;
+                       } // case MEMREQUEST
+
+                       case MEMRESPONSE: {
+                               processmsg_memresponse_I();
+                               break;
+                       } // case MEMRESPONSE
+
+#ifdef MULTICORE_GC
+                       // GC msgs
+                       case GCSTARTINIT: {
+                               processmsg_gcstartinit_I();
+                               break;
+                       } // case GCSTARTINIT
+
+                       case GCSTART: {
+                               // receive a start GC msg
+                               processmsg_gcstart_I();
+                               break;
+                       } // case GCSTART
+
+                       case GCSTARTCOMPACT: {
+                               // a compact phase start msg
+                               processmsg_gcstartcompact_I();
+                               break;
+                       } // case GCSTARTCOMPACT
+
+                       case GCSTARTFLUSH: {
+                               // received a flush phase start msg
+                               processmsg_gcstartflush_I();
+                               break;
+                       } // case GCSTARTFLUSH
+
+                       case GCFINISHINIT: {
+                               processmsg_gcfinishinit_I();
+                               break;
+                       } // case GCFINISHINIT
+
+                       case GCFINISHMARK: {
+                               processmsg_gcfinishmark_I();
+                               break;
+                       } // case GCFINISHMARK
+
+                       case GCFINISHCOMPACT: {
+                               // received a compact phase finish msg
+                               processmsg_gcfinishcompact_I();
+                               break;
+                       } // case GCFINISHCOMPACT
+
+                       case GCFINISHFLUSH: {
+                               processmsg_gcfinishflush_I();
+                               break;
+                       } // case GCFINISHFLUSH
+
+                       case GCFINISH: {
+                               // received a GC finish msg
+                               gcphase = FINISHPHASE;
+                               break;
+                       } // case GCFINISH
+
+                       case GCMARKCONFIRM: {
+                               // received a marked phase finish confirm request msg
+                               // all cores should do mark
+                               processmsg_gcmarkconfirm_I();
+                               break;
+                       } // case GCMARKCONFIRM
+
+                       case GCMARKREPORT: {
+                               processmsg_gcmarkreport_I();
+                               break;
+                       } // case GCMARKREPORT
+
+                       case GCMARKEDOBJ: {
+                               processmsg_gcmarkedobj_I();
+                               break;
+                       } // case GCMARKEDOBJ
+
+                       case GCMOVESTART: {
+                               // received a start moving objs msg
+                               processmsg_gcmovestart_I();
+                               break;
+                       } // case GCMOVESTART
+
+                       case GCMAPREQUEST: {
+                               // received a mapping info request msg
+                               processmsg_gcmaprequest_I();
+                               break;
+                       } // case GCMAPREQUEST
+
+                       case GCMAPINFO: {
+                               // received a mapping info response msg
+                               processmsg_gcmapinfo_I();
+                               break;
+                       } // case GCMAPINFO
+
+                       case GCLOBJREQUEST: {
+                               // received a large objs info request msg
+                               transferMarkResults_I();
+                               break;
+                       } // case GCLOBJREQUEST
+
+                       case GCLOBJINFO: {
+                               // received a large objs info response msg
+                               processmsg_gclobjinfo_I();
+                               break;
+                       } // case GCLOBJINFO
+
+                       case GCLOBJMAPPING: {
+                               // received a large obj mapping info msg
+                               processmsg_gclobjmapping_I();
+                               break;
+                       } // case GCLOBJMAPPING
+
+#endif // #ifdef MULTICORE_GC
+
+                       default:
+                               break;
+               } // switch(type)
+               //memset(msgdata, '\0', sizeof(int) * msgdataindex);
+               //msgdataindex = 0;
+               // reset the expected length before looking at the next msg
+               msglength = BAMBOO_MSG_BUF_LENGTH;
+               // TODO
+               //printf("++ msg: %x \n", type);
+               if(msgdataindex != msgdatalast) {
+                       // still have available msg
+                       goto processmsg;
+               }
+#ifdef DEBUG
+#ifndef CLOSE_PRINT
+               BAMBOO_DEBUGPRINT(0xe88d);
 #endif
-    }
-#ifdef INTERRUPT
-    raw_user_interrupts_on();
 #endif
-    return;
-  }
 
-#ifdef RAWDEBUG
-  raw_test_pass(0xe663);
-#endif
-  calCoords(corenum, &self_y, &self_x);
-  calCoords(targetcore, &target_y, &target_x);
-  // Build the message header
-  msgHdr = construct_dyn_hdr(0, msgsize, 0,             // msgsize word sent.
-                             self_y, self_x,
-                             target_y, target_x);
-  // start sending the msg, set send msg flag
-  isMsgSending = true;
-  gdn_send(msgHdr);                     // Send the message header to EAST to handle fab(n - 1).
-#ifdef RAWDEBUG
-  raw_test_pass(0xbbbb);
-  raw_test_pass(0xb000 + targetcore);
-#endif
-  gdn_send(5);   // lock release
- #ifdef RAWDEBUG
-  raw_test_pass(5);
-#endif
-  gdn_send(1);       // write lock
-#ifdef RAWDEBUG
-  raw_test_pass(1);
-#endif
-  gdn_send(ptr);
-#ifdef RAWDEBUG
-  raw_test_pass_reg(ptr);
-  raw_test_pass(0xffff);
-#endif
-  // end of sending this msg, set sand msg flag false
-  isMsgSending = false;
-  // check if there are pending msgs
-  while(isMsgHanging) {
-    // get the msg from outmsgdata[]
-    // length + target + msg
-    outmsgleft = outmsgdata[outmsgindex++];
-    targetcore = outmsgdata[outmsgindex++];
-    calCoords(targetcore, &target_y, &target_x);
-    // Build the message header
-    msgHdr = construct_dyn_hdr(0, outmsgleft, 0,                        // msgsize word sent.
-                               self_y, self_x,
-                               target_y, target_x);
-    isMsgSending = true;
-    gdn_send(msgHdr);                           // Send the message header to EAST to handle fab(n - 1).
-#ifdef RAWDEBUG
-    raw_test_pass(0xbbbb);
-    raw_test_pass(0xb000 + targetcore);             // targetcore
-#endif
-    while(outmsgleft-- > 0) {
-      gdn_send(outmsgdata[outmsgindex++]);
-#ifdef RAWDEBUG
-      raw_test_pass_reg(outmsgdata[outmsgindex - 1]);
+               // have new coming msg
+               if(BAMBOO_MSG_AVAIL() != 0) {
+                       goto msg;
+               }
+
+#ifdef PROFILE
+/*if(isInterrupt) {
+               profileTaskEnd();
+       }*/
 #endif
-    }
-#ifdef RAWDEBUG
-    raw_test_pass(0xffff);
-#endif
-    isMsgSending = false;
-#ifdef INTERRUPT
-    raw_user_interrupts_off();
-#endif
-    // check if there are still msg hanging
-    if(outmsgindex == outmsglast) {
-      // no more msgs
-      outmsgindex = outmsglast = 0;
-      isMsgHanging = false;
-    }
-#ifdef INTERRUPT
-    raw_user_interrupts_on();
+               return (int)type;
+       } else {
+               // not a whole msg
+#ifdef DEBUG
+#ifndef CLOSE_PRINT
+               BAMBOO_DEBUGPRINT(0xe88e);
 #endif
-  }
-#elif defined THREADSIMULATE
-  int numofcore = pthread_getspecific(key);
-  int rc = pthread_rwlock_rdlock(&rwlock_tbl);
-  printf("[releasewritelock, %d] getting the read lock for locktbl: %d error: \n", numofcore, rc, strerror(rc));
-  if(!RuntimeHashcontainskey(locktbl, (int)ptr)) {
-    printf("[releasewritelock, %d] Error: try to release a lock without previously grab it\n", numofcore);
-    exit(-1);
-  }
-  pthread_rwlock_t* rwlock_obj = NULL;
-  RuntimeHashget(locktbl, (int)ptr, (int*)&rwlock_obj);
-  int rc_obj = pthread_rwlock_unlock(rwlock_obj);
-  printf("[releasewritelock, %d] unlocked object %d: %d error:\n", numofcore, (int)ptr, rc_obj, strerror(rc_obj));
-  rc = pthread_rwlock_unlock(&rwlock_tbl);
-  printf("[releasewritelock, %d] release the read lock for locktbl: %d error: \n", numofcore, rc, strerror(rc));
 #endif
+#ifdef PROFILE
+       /*  if(isInterrupt) {
+                               profileTaskEnd();
+                       }*/
+#endif
+    return -2;
+  }
 }
 
-int enqueuetasks(struct parameterwrapper *parameter, struct parameterwrapper *prevptr, struct ___Object___ *ptr, int * enterflags, int numenterflags) {
+int enqueuetasks(struct parameterwrapper *parameter, 
+                            struct parameterwrapper *prevptr, 
+                                                                struct ___Object___ *ptr, 
+                                                                int * enterflags, 
+                                                                int numenterflags) {
   void * taskpointerarray[MAXTASKPARAMS];
   int j;
-  int numparams=parameter->task->numParameters;
+  //int numparams=parameter->task->numParameters;
   int numiterators=parameter->task->numTotal-1;
   int retval=1;
-  int addnormal=1;
-  int adderror=1;
 
   struct taskdescriptor * task=parameter->task;
 
-  ObjectHashadd(parameter->objectset, (int) ptr, 0, (int) enterflags, numenterflags, enterflags==NULL);      //this add the object to parameterwrapper
+   //this add the object to parameterwrapper
+   ObjectHashadd(parameter->objectset, (int) ptr, 0, (int) enterflags, 
+                                  numenterflags, enterflags==NULL);
 
   /* Add enqueued object to parameter vector */
   taskpointerarray[parameter->slot]=ptr;
@@ -3303,7 +2683,7 @@ int enqueuetasks(struct parameterwrapper *parameter, struct parameterwrapper *pr
   /* Find initial state */
   for(j=0; j<numiterators; j++) {
 backtrackinit:
-    if(toiHasNext(&parameter->iterators[j], taskpointerarray OPTARG(failed)))
+    if(toiHasNext(&parameter->iterators[j],taskpointerarray OPTARG(failed)))
       toiNext(&parameter->iterators[j], taskpointerarray OPTARG(failed));
     else if (j>0) {
       /* Need to backtrack */
@@ -3316,20 +2696,23 @@ backtrackinit:
     }
   }
 
-
   while(1) {
     /* Enqueue current state */
-    int launch = 0;
-    struct taskparamdescriptor *tpd=RUNMALLOC(sizeof(struct taskparamdescriptor));
+    //int launch = 0;
+    struct taskparamdescriptor *tpd=
+                       RUNMALLOC(sizeof(struct taskparamdescriptor));
     tpd->task=task;
     tpd->numParameters=numiterators+1;
     tpd->parameterArray=RUNMALLOC(sizeof(void *)*(numiterators+1));
+
     for(j=0; j<=numiterators; j++) {
-      tpd->parameterArray[j]=taskpointerarray[j]; //store the actual parameters
+                       //store the actual parameters
+      tpd->parameterArray[j]=taskpointerarray[j]; 
     }
     /* Enqueue task */
-    if ((/*!gencontains(failedtasks, tpd)&&*/ !gencontains(activetasks,tpd))) {
-      genputtable(activetasks, tpd, tpd);
+    if ((/*!gencontains(failedtasks, tpd)&&*/ 
+                                       !gencontains(activetasks,tpd))) {
+               genputtable(activetasks, tpd, tpd);
     } else {
       RUNFREE(tpd->parameterArray);
       RUNFREE(tpd);
@@ -3341,7 +2724,7 @@ backtrackinit:
 
     for(j=numiterators-1; j<numiterators; j++) {
 backtrackinc:
-      if(toiHasNext(&parameter->iterators[j], taskpointerarray OPTARG(failed)))
+      if(toiHasNext(&parameter->iterators[j],taskpointerarray OPTARG(failed)))
        toiNext(&parameter->iterators[j], taskpointerarray OPTARG(failed));
       else if (j>0) {
        /* Need to backtrack */
@@ -3357,19 +2740,24 @@ backtrackinc:
   return retval;
 }
 
-#ifdef RAW
-int enqueuetasks_I(struct parameterwrapper *parameter, struct parameterwrapper *prevptr, struct ___Object___ *ptr, int * enterflags, int numenterflags) {
+int enqueuetasks_I(struct parameterwrapper *parameter, 
+                              struct parameterwrapper *prevptr, 
+                                                                        struct ___Object___ *ptr, 
+                                                                        int * enterflags, 
+                                                                        int numenterflags) {
   void * taskpointerarray[MAXTASKPARAMS];
   int j;
-  int numparams=parameter->task->numParameters;
+  //int numparams=parameter->task->numParameters;
   int numiterators=parameter->task->numTotal-1;
   int retval=1;
-  int addnormal=1;
-  int adderror=1;
+  //int addnormal=1;
+  //int adderror=1;
 
   struct taskdescriptor * task=parameter->task;
 
-  ObjectHashadd_I(parameter->objectset, (int) ptr, 0, (int) enterflags, numenterflags, enterflags==NULL);      //this add the object to parameterwrapper
+   //this add the object to parameterwrapper
+   ObjectHashadd_I(parameter->objectset, (int) ptr, 0, (int) enterflags, 
+                                    numenterflags, enterflags==NULL);  
 
   /* Add enqueued object to parameter vector */
   taskpointerarray[parameter->slot]=ptr;
@@ -3382,7 +2770,7 @@ int enqueuetasks_I(struct parameterwrapper *parameter, struct parameterwrapper *
   /* Find initial state */
   for(j=0; j<numiterators; j++) {
 backtrackinit:
-    if(toiHasNext(&parameter->iterators[j], taskpointerarray OPTARG(failed)))
+    if(toiHasNext(&parameter->iterators[j],taskpointerarray OPTARG(failed)))
       toiNext(&parameter->iterators[j], taskpointerarray OPTARG(failed));
     else if (j>0) {
       /* Need to backtrack */
@@ -3397,17 +2785,21 @@ backtrackinit:
 
   while(1) {
     /* Enqueue current state */
-    int launch = 0;
-    struct taskparamdescriptor *tpd=RUNMALLOC_I(sizeof(struct taskparamdescriptor));
+    //int launch = 0;
+    struct taskparamdescriptor *tpd=
+                       RUNMALLOC_I(sizeof(struct taskparamdescriptor));
     tpd->task=task;
     tpd->numParameters=numiterators+1;
     tpd->parameterArray=RUNMALLOC_I(sizeof(void *)*(numiterators+1));
+
     for(j=0; j<=numiterators; j++) {
-      tpd->parameterArray[j]=taskpointerarray[j]; //store the actual parameters
+                       //store the actual parameters
+      tpd->parameterArray[j]=taskpointerarray[j]; 
     }
     /* Enqueue task */
-    if ((/*!gencontains(failedtasks, tpd)&&*/ !gencontains(activetasks,tpd))) {
-      genputtable_I(activetasks, tpd, tpd);
+    if ((/*!gencontains(failedtasks, tpd)&&*/ 
+                                       !gencontains(activetasks,tpd))) {
+               genputtable_I(activetasks, tpd, tpd);
     } else {
       RUNFREE(tpd->parameterArray);
       RUNFREE(tpd);
@@ -3434,47 +2826,67 @@ backtrackinc:
   }
   return retval;
 }
-#endif
 
-/* Handler for signals. The signals catch null pointer errors and
-   arithmatic errors. */
-#ifndef RAW
-void myhandler(int sig, siginfo_t *info, void *uap) {
-  sigset_t toclear;
-#ifdef DEBUG
-  printf("sig=%d\n",sig);
-  printf("signal\n");
-#endif
-  sigemptyset(&toclear);
-  sigaddset(&toclear, sig);
-  sigprocmask(SIG_UNBLOCK, &toclear,NULL);
-  longjmp(error_handler,1);
-}
+#ifdef MULTICORE_GC
+#define OFFSET 2 /* executetasks stores numParameters in taskpointerarray[0] and NULL in taskpointerarray[1], so task parameters start at index 2 */
+#else
+#define OFFSET 0 /* no header slots: task parameters start at taskpointerarray[0] */
 #endif
 
-fd_set readfds;
-int maxreadfd;
-struct RuntimeHash *fdtoobject;
+int containstag(struct ___Object___ *ptr, 
+                           struct ___TagDescriptor___ *tag);
 
-void addreadfd(int fd) {
-  if (fd>=maxreadfd)
-    maxreadfd=fd+1;
-  FD_SET(fd, &readfds);
-}
+#ifndef MULTICORE_GC
+void releasewritelock_r(void * lock, void * redirectlock) { // lock release that also carries redirect info: updates the local locktbl entry, or sends a REDIRECTRELEASE message to the core computed from the lock address
+  int targetcore = 0;
+  int reallock = (int)lock; // lock address as an int, used as the locktbl key; NOTE(review): assumes a pointer fits in an int (the comment near send_msg_4 mentions a 32 bit machine)
+  targetcore = (reallock >> 5) % BAMBOO_TOTALCORE; // core index derived from the address: (address >> 5) modulo the total core count
+
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xe671);
+  BAMBOO_DEBUGPRINT_REG((int)lock);
+  BAMBOO_DEBUGPRINT_REG(reallock);
+  BAMBOO_DEBUGPRINT_REG(targetcore);
+#endif
 
-void removereadfd(int fd) {
-  FD_CLR(fd, &readfds);
-  if (maxreadfd==(fd+1)) {
-    maxreadfd--;
-    while(maxreadfd>0&&!FD_ISSET(maxreadfd-1, &readfds))
-      maxreadfd--;
+  if(targetcore == BAMBOO_NUM_OF_CORE) {
+       BAMBOO_START_CRITICAL_SECTION_LOCK(); // paired with BAMBOO_CLOSE_CRITICAL_SECTION_LOCK() below, bracketing the locktbl lookup and update
+#ifdef DEBUG
+       BAMBOO_DEBUGPRINT(0xf001);
+#endif
+    // the computed core is this core, so the locktbl entry is updated directly
+    if(!RuntimeHashcontainskey(locktbl, reallock)) {
+      // no entry in locktbl for this lock, something is wrong
+      BAMBOO_EXIT(0xa011);
+    } else {
+      int rwlock_obj = 0; // receives the value stored in locktbl, then reinterpreted as a struct LockValue *
+         struct LockValue * lockvalue = NULL;
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT(0xe672);
+#endif
+      RuntimeHashget(locktbl, reallock, &rwlock_obj);
+         lockvalue = (struct LockValue *)rwlock_obj;
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT_REG(lockvalue->value);
+#endif
+      lockvalue->value++; // NOTE(review): presumably undoes the change made to value when the write lock was granted - confirm against the lock-acquire path
+         lockvalue->redirectlock = (int)redirectlock; // record the redirect target in the table entry
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT_REG(lockvalue->value);
+#endif
+    }
+       BAMBOO_CLOSE_CRITICAL_SECTION_LOCK();
+#ifdef DEBUG
+       BAMBOO_DEBUGPRINT(0xf000);
+#endif
+    return;
+  } else {
+         // the computed core is a different core: send it a lock release with redirect info msg
+         // for 32 bit machine, the size is always 4 words; NOTE(review): the meaning of the literal 1 argument is not visible here - confirm against the REDIRECTRELEASE handler
+               send_msg_4(targetcore, REDIRECTRELEASE, 1, (int)lock, 
+                                      (int)redirectlock, false);
   }
 }
-
-#ifdef PRECISE_GC
-#define OFFSET 2
-#else
-#define OFFSET 0
 #endif
 
 void executetasks() {
@@ -3486,275 +2898,200 @@ void executetasks() {
   struct parameterwrapper *pw=NULL;
   int j = 0;
   int x = 0;
-  bool lock = true;
+  bool islock = true;
 
-#ifdef RAW
   int grount = 0;
   int andmask=0;
   int checkmask=0;
-#ifdef RAWDEBUG
-  raw_test_pass(0xe991);
-#endif
-#endif
-
-#ifndef RAW
-  /* Set up signal handlers */
-  struct sigaction sig;
-  sig.sa_sigaction=&myhandler;
-  sig.sa_flags=SA_SIGINFO;
-  sigemptyset(&sig.sa_mask);
-
-  /* Catch bus errors, segmentation faults, and floating point exceptions*/
-  sigaction(SIGBUS,&sig,0);
-  sigaction(SIGSEGV,&sig,0);
-  sigaction(SIGFPE,&sig,0);
-  sigaction(SIGPIPE,&sig,0);
-#endif
-
-#ifndef RAW
-  /* Zero fd set */
-  FD_ZERO(&readfds);
-#endif
-  maxreadfd=0;
-#ifndef RAW
-  fdtoobject=allocateRuntimeHash(100);
-#endif
-
-#ifndef RAW
-  /* Map first block of memory to protected, anonymous page */
-  mmap(0, 0x1000, 0, MAP_SHARED|MAP_FIXED|MAP_ANON, -1, 0);
-#endif
 
 newtask:
-  while((hashsize(activetasks)>0)||(maxreadfd>0)) {
-
-#ifdef RAW
-#ifdef RAWDEBUG
-    raw_test_pass(0xe992);
+  while(hashsize(activetasks)>0) {
+#ifdef MULTICORE_GC
+               gc(NULL);
 #endif
-#else
-    /* Check if any filedescriptors have IO pending */
-    if (maxreadfd>0) {
-      int i;
-      struct timeval timeout={0,0};
-      fd_set tmpreadfds;
-      int numselect;
-      tmpreadfds=readfds;
-      numselect=select(maxreadfd, &tmpreadfds, NULL, NULL, &timeout);
-      if (numselect>0) {
-       /* Process ready fd's */
-       int fd;
-       for(fd=0; fd<maxreadfd; fd++) {
-         if (FD_ISSET(fd, &tmpreadfds)) {
-           /* Set ready flag on object */
-           void * objptr;
-           //      printf("Setting fd %d\n",fd);
-           if (RuntimeHashget(fdtoobject, fd,(int *) &objptr)) {
-             if(intflagorand(objptr,1,0xFFFFFFFF)) { /* Set the first flag to 1 */
-               enqueueObject(objptr, NULL, 0);
-             }
-           }
-         }
-       }
-      }
-    }
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xe990);
 #endif
 
     /* See if there are any active tasks */
-    if (hashsize(activetasks)>0) {
+    //if (hashsize(activetasks)>0) {
       int i;
-#ifdef RAWPROFILE
-      if(!taskInfoOverflow) {
-       TaskInfo* checkTaskInfo = RUNMALLOC(sizeof(struct task_info));
-       taskInfoArray[taskInfoIndex] = checkTaskInfo;
-       checkTaskInfo->taskName = "tpd checking";
-       checkTaskInfo->startTime = raw_get_cycle();
-       checkTaskInfo->endTime = -1;
-      }
-#endif
-      currtpd=(struct taskparamdescriptor *) getfirstkey(activetasks);
-      genfreekey(activetasks, currtpd);
-
-      /* Check if this task has failed, allow a task that contains optional objects to fire */
-      /*if (gencontains(failedtasks, currtpd)) {
-         // Free up task parameter descriptor
-         RUNFREE(currtpd->parameterArray);
-         RUNFREE(currtpd);
-         goto newtask;
-         }*/
-      numparams=currtpd->task->numParameters;
-      numtotal=currtpd->task->numTotal;
-
-#ifdef THREADSIMULATE
-      int isolateflags[numparams];
-#endif
-      /* Make sure that the parameters are still in the queues */
-      for(i=0; i<numparams; i++) {
-       void * parameter=currtpd->parameterArray[i];
-#ifdef RAW
-#ifdef RAWDEBUG
-       raw_test_pass(0xe993);
+#ifdef PROFILE
+#ifdef ACCURATEPROFILE
+         profileTaskStart("tpd checking");
+#endif
+#endif
+         //long clock1;
+         //clock1 = BAMBOO_GET_EXE_TIME();
+
+         busystatus = true;
+               currtpd=(struct taskparamdescriptor *) getfirstkey(activetasks);
+               genfreekey(activetasks, currtpd);
+
+               numparams=currtpd->task->numParameters;
+               numtotal=currtpd->task->numTotal;
+
+         // clear the lockRedirectTbl 
+               // (TODO, this table should be empty after all locks are released)
+         // reset all locks
+         /*for(j = 0; j < MAXTASKPARAMS; j++) {
+                 runtime_locks[j].redirectlock = 0;
+                 runtime_locks[j].value = 0;
+         }*/
+         // get all required locks
+         runtime_locklen = 0;
+         // check which locks are needed
+         for(i = 0; i < numparams; i++) {
+                 void * param = currtpd->parameterArray[i];
+                 int tmplock = 0;
+                 int j = 0;
+                 bool insert = true;
+                 if(((struct ___Object___ *)param)->type == STARTUPTYPE) {
+                         islock = false;
+                         taskpointerarray[i+OFFSET]=param;
+                         goto execute;
+                 }
+                 if(((struct ___Object___ *)param)->lock == NULL) {
+                         tmplock = (int)param;
+                 } else {
+                         tmplock = (int)(((struct ___Object___ *)param)->lock);
+                 }
+                 // insert into the locks array
+                 for(j = 0; j < runtime_locklen; j++) {
+                         if(runtime_locks[j].value == tmplock) {
+                                 insert = false;
+                                 break;
+                         } else if(runtime_locks[j].value > tmplock) {
+                                 break;
+                         }
+                 }
+                 if(insert) {
+                         int h = runtime_locklen;
+                         for(; h > j; h--) {
+                                 runtime_locks[h].redirectlock = runtime_locks[h-1].redirectlock;
+                                 runtime_locks[h].value = runtime_locks[h-1].value;
+                         }
+                         runtime_locks[j].value = tmplock;
+                         runtime_locks[j].redirectlock = (int)param;
+                         runtime_locklen++;
+                 }               
+         } // line 2713: for(i = 0; i < numparams; i++) 
+         // grab these required locks
+#ifdef DEBUG
+         BAMBOO_DEBUGPRINT(0xe991);
 #endif
+         //long clock2;
+         //clock2 = BAMBOO_GET_EXE_TIME();
 
-       if(((struct ___Object___ *)parameter)->type == STARTUPTYPE) {
-         lock = false;
-         taskpointerarray[i+OFFSET]=parameter;
-         goto execute;
-       }
-       lock = true;
-       // require locks for this parameter if it is not a startup object
-       getwritelock(parameter);
-       grount = 0;
-
-#ifdef INTERRUPT
-       raw_user_interrupts_off();
-#endif
-#ifdef RAWPROFILE
-       //isInterrupt = false;
+         for(i = 0; i < runtime_locklen; i++) {
+                 int * lock = (int *)(runtime_locks[i].redirectlock);
+                 islock = true;
+                 // require locks for this parameter if it is not a startup object
+#ifdef DEBUG
+                 BAMBOO_DEBUGPRINT_REG((int)lock);
+                 BAMBOO_DEBUGPRINT_REG((int)(runtime_locks[i].value));
 #endif
-       while(!lockflag) {
-         receiveObject();
-       }
+                 getwritelock(lock);
+                 BAMBOO_START_CRITICAL_SECTION();
+#ifdef DEBUG
+                 BAMBOO_DEBUGPRINT(0xf001);
+#endif
+#ifdef PROFILE
+                 //isInterrupt = false;
+#endif 
+                 while(!lockflag) { 
+                         BAMBOO_WAITING_FOR_LOCK();
+                 }
 #ifndef INTERRUPT
-       if(reside) {
-         while(receiveObject() != -1) {
-         }
-       }
+                 if(reside) {
+                         while(BAMBOO_WAITING_FOR_LOCK() != -1) {
+                         }
+                 }
 #endif
-       grount = lockresult;
+                 grount = lockresult;
 
-       lockresult = 0;
-       lockobj = 0;
-       lockflag = false;
+                 lockresult = 0;
+                 lockobj = 0;
+                 lock2require = 0;
+                 lockflag = false;
 #ifndef INTERRUPT
-       reside = false;
+                 reside = false;
 #endif
-#ifdef RAWPROFILE
-       //isInterrupt = true;
+#ifdef PROFILE
+                 //isInterrupt = true;
 #endif
-#ifdef INTERRUPT
-       raw_user_interrupts_on();
+                 BAMBOO_CLOSE_CRITICAL_SECTION();
+#ifdef DEBUG
+                 BAMBOO_DEBUGPRINT(0xf000);
 #endif
 
-       if(grount == 0) {
-#ifdef RAWDEBUG
-         raw_test_pass(0xe994);
-#endif
-         // can not get the lock, try later
-         for(j = 0; j < i; ++j) {
-           releasewritelock(taskpointerarray[j+OFFSET]);
-         }
-         genputtable(activetasks, currtpd, currtpd);
-         if(hashsize(activetasks) == 1) {
-           // only one task right now, wait a little while before next try
-           int halt = 10000;
-           while(halt--) {
-           }
-         }
-#ifdef RAWPROFILE
-         // fail, set the end of the checkTaskInfo
-         if(!taskInfoOverflow) {
-           taskInfoArray[taskInfoIndex]->endTime = raw_get_cycle();
-           taskInfoIndex++;
-           if(taskInfoIndex == TASKINFOLENGTH) {
-             taskInfoOverflow = true;
-           }
-         }
+                 if(grount == 0) {
+#ifdef DEBUG
+                         BAMBOO_DEBUGPRINT(0xe992);
+                               BAMBOO_DEBUGPRINT_REG(lock);
+#endif
+                               // check if has the lock already
+                         // can not get the lock, try later
+                         // release all grabbed locks for previous parameters
+                         for(j = 0; j < i; ++j) { 
+                                 lock = (int*)(runtime_locks[j].redirectlock);
+                                 releasewritelock(lock);
+                         }
+                         genputtable(activetasks, currtpd, currtpd);
+                         if(hashsize(activetasks) == 1) {
+                                 // only one task right now, wait a little while before next try
+                                 int halt = 10000;
+                                 while(halt--) {
+                                 }
+                         }
+#ifdef PROFILE
+#ifdef ACCURATEPROFILE
+                         // fail, set the end of the checkTaskInfo
+                         profileTaskEnd();
+#endif
+#endif
+                         goto newtask;
+                               //}
+                 }
+         } // line 2752:  for(i = 0; i < runtime_locklen; i++)
+
+         /*long clock3;
+         clock3 = BAMBOO_GET_EXE_TIME();
+         //tprintf("sort: %d, grab: %d \n", clock2-clock1, clock3-clock2);*/
+
+#ifdef DEBUG
+       BAMBOO_DEBUGPRINT(0xe993);
 #endif
-         goto newtask;
-       }
+      /* Make sure that the parameters are still in the queues */
+      for(i=0; i<numparams; i++) {
+       void * parameter=currtpd->parameterArray[i];
+
        // flush the object
-       {
-         raw_invalidate_cache_range((int)parameter, classsize[((struct ___Object___ *)parameter)->type]);
-         /*int tmp = 0;
-            for(tmp = 0; tmp < classsize[((struct ___Object___ *)parameter)->type]; ++tmp) {
-                 invalidateAddr(parameter + tmp);
-            }*/
-       }
+#ifdef CACHEFLUSH
+       BAMBOO_CACHE_FLUSH_RANGE((int)parameter, 
+                       classsize[((struct ___Object___ *)parameter)->type]);
 #endif
        tmpparam = (struct ___Object___ *)parameter;
-#ifdef THREADSIMULATE
-       if(((struct ___Object___ *)parameter)->type == STARTUPTYPE) {
-         lock = false;
-         taskpointerarray[i+OFFSET]=parameter;
-         goto execute;
-       }
-       lock = true;
-       if(0 == tmpparam->isolate) {
-         isolateflags[i] = 0;
-         // shared object, need to flush with current value
-         //if(!getreadlock(tmpparam->original)) {
-         //    // fail to get read lock of the original object, try this task later
-         if(!getwritelock(tmpparam->original)) {
-           // fail to get write lock, release all obtained locks and try this task later
-           int j = 0;
-           for(j = 0; j < i; ++j) {
-             if(0 == isolateflags[j]) {
-               releasewritelock(((struct ___Object___ *)taskpointerarray[j+OFFSET])->original);
-             }
-           }
-           genputtable(activetasks, currtpd, currtpd);
-           goto newtask;
-         }
-         if(tmpparam->version != tmpparam->original->version) {
-           // some task on another core has changed this object
-           // flush this object
-           //memcpy(tmpparam, tmpparam->original, classsize[tmpparam->type]);
-           // release all obtained locks
-           int j = 0;
-           for(j = 0; j < i; ++j) {
-             if(0 == isolateflags[j]) {
-               releasewritelock(((struct ___Object___ *)taskpointerarray[j+OFFSET])->original);
-             }
-           }
-           releasewritelock(tmpparam->original);
-
-           // dequeue this object
-           int numofcore = pthread_getspecific(key);
-           struct parameterwrapper ** queues = objectqueues[numofcore][tmpparam->type];
-           int length = numqueues[numofcore][tmpparam->type];
-           for(j = 0; j < length; ++j) {
-             struct parameterwrapper * pw = queues[j];
-             if(ObjectHashcontainskey(pw->objectset, (int)tmpparam)) {
-               int next;
-               int UNUSED, UNUSED2;
-               int * enterflags;
-               ObjectHashget(pw->objectset, (int) tmpparam, (int *) &next, (int *) &enterflags, &UNUSED, &UNUSED2);
-               ObjectHashremove(pw->objectset, (int)tmpparam);
-               if (enterflags!=NULL)
-                 free(enterflags);
-             }
-           }
-           // try to enqueue it again to check if it feeds other tasks;
-           //enqueueObject(tmpparam, NULL, 0);
-           // Free up task parameter descriptor
-           RUNFREE(currtpd->parameterArray);
-           RUNFREE(currtpd);
-           goto newtask;
-         }
-       } else {
-         isolateflags[i] = 1;
-       }
-#endif
        pd=currtpd->task->descriptorarray[i];
        pw=(struct parameterwrapper *) pd->queue;
        /* Check that object is still in queue */
        {
          if (!ObjectHashcontainskey(pw->objectset, (int) parameter)) {
-#ifdef RAWDEBUG
-           raw_test_pass(0xe995);
+#ifdef DEBUG
+           BAMBOO_DEBUGPRINT(0xe994);
+                       BAMBOO_DEBUGPRINT_REG(parameter);
 #endif
            // release grabbed locks
-           for(j = 0; j < i; ++j) {
-             releasewritelock(taskpointerarray[j+OFFSET]);
+           for(j = 0; j < runtime_locklen; ++j) {
+               int * lock = (int *)(runtime_locks[j].redirectlock);
+               releasewritelock(lock);
            }
-           releasewritelock(parameter);
            RUNFREE(currtpd->parameterArray);
            RUNFREE(currtpd);
+                       currtpd = NULL;
            goto newtask;
          }
-       }
-#ifdef RAW
+       } // line2865
        /* Check if the object's flags still meets requirements */
        {
          int tmpi = 0;
@@ -3762,12 +3099,6 @@ newtask:
          for(tmpi = 0; tmpi < pw->numberofterms; ++tmpi) {
            andmask=pw->intarray[tmpi*2];
            checkmask=pw->intarray[tmpi*2+1];
-#ifdef RAWDEBUG
-           raw_test_pass(0xdd000000 + andmask);
-           raw_test_pass_reg((int)parameter);
-           raw_test_pass(0xdd000000 + ((struct ___Object___ *)parameter)->flag);
-           raw_test_pass(0xdd000000 + checkmask);
-#endif
            if((((struct ___Object___ *)parameter)->flag&andmask)==checkmask) {
              ismet = true;
              break;
@@ -3779,34 +3110,32 @@ newtask:
            int next;
            int UNUSED, UNUSED2;
            int * enterflags;
-#ifdef RAWDEBUG
-           raw_test_pass(0xe996);
+#ifdef DEBUG
+           BAMBOO_DEBUGPRINT(0xe995);
+                       BAMBOO_DEBUGPRINT_REG(parameter);
 #endif
-           ObjectHashget(pw->objectset, (int) parameter, (int *) &next, (int *) &enterflags, &UNUSED, &UNUSED2);
+           ObjectHashget(pw->objectset, (int) parameter, (int *) &next, 
+                                                 (int *) &enterflags, &UNUSED, &UNUSED2);
            ObjectHashremove(pw->objectset, (int)parameter);
            if (enterflags!=NULL)
-             free(enterflags);
+             RUNFREE(enterflags);
            // release grabbed locks
-           for(j = 0; j < i; ++j) {
-             releasewritelock(taskpointerarray[j+OFFSET]);
+           for(j = 0; j < runtime_locklen; ++j) {
+                int * lock = (int *)(runtime_locks[j].redirectlock);
+               releasewritelock(lock);
            }
-           releasewritelock(parameter);
            RUNFREE(currtpd->parameterArray);
            RUNFREE(currtpd);
-#ifdef RAWPROFILE
+                       currtpd = NULL;
+#ifdef PROFILE
+#ifdef ACCURATEPROFILE
            // fail, set the end of the checkTaskInfo
-           if(!taskInfoOverflow) {
-             taskInfoArray[taskInfoIndex]->endTime = raw_get_cycle();
-             taskInfoIndex++;
-             if(taskInfoIndex == TASKINFOLENGTH) {
-               taskInfoOverflow = true;
-             }
-           }
+               profileTaskEnd();
 #endif
-           goto newtask;
-         }
-       }
 #endif
+           goto newtask;
+         } // line 2878: if (!ismet)
+       } // line 2867
 parameterpresent:
        ;
        /* Check that object still has necessary tags */
@@ -3814,203 +3143,134 @@ parameterpresent:
          int slotid=pd->tagarray[2*j]+numparams;
          struct ___TagDescriptor___ *tagd=currtpd->parameterArray[slotid];
          if (!containstag(parameter, tagd)) {
-#ifdef RAWDEBUG
-           raw_test_pass(0xe997);
-#endif
+#ifdef DEBUG
+           BAMBOO_DEBUGPRINT(0xe996);
+#endif
+               {
+               // release grabbed locks
+               int tmpj = 0;
+           for(tmpj = 0; tmpj < runtime_locklen; ++tmpj) {
+                int * lock = (int *)(runtime_locks[tmpj].redirectlock);
+               releasewritelock(lock);
+           }
+               }
            RUNFREE(currtpd->parameterArray);
            RUNFREE(currtpd);
+                       currtpd = NULL;
            goto newtask;
-         }
-       }
+         } // line2911: if (!containstag(parameter, tagd))
+       } // line 2808: for(j=0; j<pd->numbertags; j++)
 
        taskpointerarray[i+OFFSET]=parameter;
-      }
+      } // line 2824: for(i=0; i<numparams; i++)
       /* Copy the tags */
       for(; i<numtotal; i++) {
        taskpointerarray[i+OFFSET]=currtpd->parameterArray[i];
       }
 
-#ifdef THREADSIMULATE
-      for(i = 0; i < numparams; ++i) {
-       if(0 == isolateflags[i]) {
-         struct ___Object___ * tmpparam = (struct ___Object___ *)taskpointerarray[i+OFFSET];
-         if(tmpparam != tmpparam->original) {
-           taskpointerarray[i+OFFSET] = tmpparam->original;
-         }
-       }
-      }
-#endif
-
       {
-#if 0
-#ifndef RAW
-       /* Checkpoint the state */
-       forward=allocateRuntimeHash(100);
-       reverse=allocateRuntimeHash(100);
-       //void ** checkpoint=makecheckpoint(currtpd->task->numParameters, currtpd->parameterArray, forward, reverse);
-#endif
-#endif
-       if (x=setjmp(error_handler)) {
-         int counter;
-         /* Recover */
-#ifndef RAW
-#ifdef DEBUG
-         printf("Fatal Error=%d, Recovering!\n",x);
-#endif
-#endif
-         /*
-            genputtable(failedtasks,currtpd,currtpd);
-            //restorecheckpoint(currtpd->task->numParameters, currtpd->parameterArray, checkpoint, forward, reverse);
-
-            freeRuntimeHash(forward);
-            freeRuntimeHash(reverse);
-            freemalloc();
-            forward=NULL;
-            reverse=NULL;
-          */
-         //fflush(stdout);
-#ifdef RAW
-#ifdef RAWDEBUG
-         raw_test_pass_reg(x);
-#endif
-         raw_test_done(0xa009);
-#else
-         exit(-1);
-#endif
-       } else {
-         /*if (injectfailures) {
-            if ((((double)random())/RAND_MAX)<failurechance) {
-             printf("\nINJECTING TASK FAILURE to %s\n", currtpd->task->name);
-             longjmp(error_handler,10);
-            }
-            }*/
+execute:
          /* Actually call task */
-#ifdef PRECISE_GC
-                                                                           ((int *)taskpointerarray)[0]=currtpd->numParameters;
+#ifdef MULTICORE_GC
+         ((int *)taskpointerarray)[0]=currtpd->numParameters;
          taskpointerarray[1]=NULL;
 #endif
-execute:
-#ifdef RAWPROFILE
-         {
-           // check finish, set the end of the checkTaskInfo
-           if(!taskInfoOverflow) {
-             taskInfoArray[taskInfoIndex]->endTime = raw_get_cycle();
-             taskInfoIndex++;
-             if(taskInfoIndex == TASKINFOLENGTH) {
-               taskInfoOverflow = true;
-             }
-           }
-         }
-         if(!taskInfoOverflow) {
-           // new a taskInfo for the task execution
-           TaskInfo* taskInfo = RUNMALLOC(sizeof(struct task_info));
-           taskInfoArray[taskInfoIndex] = taskInfo;
-           taskInfo->taskName = currtpd->task->name;
-           taskInfo->startTime = raw_get_cycle();
-           taskInfo->endTime = -1;
-         }
+#ifdef PROFILE
+#ifdef ACCURATEPROFILE
+         // check finish, set the end of the checkTaskInfo
+         profileTaskEnd();
 #endif
-
-         if(debugtask) {
-#ifndef RAW
-           printf("ENTER %s count=%d\n",currtpd->task->name, (instaccum-instructioncount));
+         profileTaskStart(currtpd->task->name);
 #endif
-           ((void(*) (void **))currtpd->task->taskptr)(taskpointerarray);
-#ifndef RAW
-           printf("EXIT %s count=%d\n",currtpd->task->name, (instaccum-instructioncount));
+         // TODO
+         //long clock4;
+         //clock4 = BAMBOO_GET_EXE_TIME();
+         //tprintf("sort: %d, grab: %d, check: %d \n", (int)(clock2-clock1), (int)(clock3-clock2), (int)(clock4-clock3));
+
+#ifdef DEBUG
+               BAMBOO_DEBUGPRINT(0xe997);
 #endif
-         } else {
-           ((void(*) (void **))currtpd->task->taskptr)(taskpointerarray);
-         }
-#ifdef RAWPROFILE
+               ((void(*) (void **))currtpd->task->taskptr)(taskpointerarray);
+               // TODO
+               //long clock5;
+         //clock5 = BAMBOO_GET_EXE_TIME();
+        // tprintf("sort: %d, grab: %d, check: %d \n", (int)(clock2-clock1), (int)(clock3-clock2), (int)(clock4-clock3));
+
+#ifdef PROFILE
+#ifdef ACCURATEPROFILE
          // task finish, set the end of the checkTaskInfo
-         if(!taskInfoOverflow) {
-           taskInfoArray[taskInfoIndex]->endTime = raw_get_cycle();
-           taskInfoIndex++;
-           if(taskInfoIndex == TASKINFOLENGTH) {
-             taskInfoOverflow = true;
-           }
-         }
+         profileTaskEnd();
          // new a PostTaskInfo for the post-task execution
-         if(!taskInfoOverflow) {
-           TaskInfo* postTaskInfo = RUNMALLOC(sizeof(struct task_info));
-           taskInfoArray[taskInfoIndex] = postTaskInfo;
-           postTaskInfo->taskName = "post task execution";
-           postTaskInfo->startTime = raw_get_cycle();
-           postTaskInfo->endTime = -1;
-         }
+         profileTaskStart("post task execution");
+#endif
 #endif
-#ifdef RAWDEBUG
-         raw_test_pass(0xe998);
-         raw_test_pass_reg(lock);
+#ifdef DEBUG
+         BAMBOO_DEBUGPRINT(0xe998);
+         BAMBOO_DEBUGPRINT_REG(islock);
 #endif
 
-         if(lock) {
-#ifdef RAW
-           for(i = 0; i < numparams; ++i) {
-             int j = 0;
-             struct ___Object___ * tmpparam = (struct ___Object___ *)taskpointerarray[i+OFFSET];
-#ifdef RAWDEBUG
-             raw_test_pass(0xe999);
-             raw_test_pass(0xdd100000 + tmpparam->flag);
+         if(islock) {
+#ifdef DEBUG
+                 BAMBOO_DEBUGPRINT(0xe999);
+#endif 
+           for(i = 0; i < runtime_locklen; ++i) {
+                               void * ptr = (void *)(runtime_locks[i].redirectlock);
+             int * lock = (int *)(runtime_locks[i].value);
+#ifdef DEBUG
+                 BAMBOO_DEBUGPRINT_REG((int)ptr);
+                 BAMBOO_DEBUGPRINT_REG((int)lock);
+                       BAMBOO_DEBUGPRINT_REG(*((int*)lock+5));
+#endif
+#ifndef MULTICORE_GC
+                 if(RuntimeHashcontainskey(lockRedirectTbl, (int)lock)) {
+                         int redirectlock;
+                         RuntimeHashget(lockRedirectTbl, (int)lock, &redirectlock);
+                         RuntimeHashremovekey(lockRedirectTbl, (int)lock);
+                         releasewritelock_r(lock, (int *)redirectlock);
+                 } else {
+#else
+                               {
 #endif
-             releasewritelock(tmpparam);
+               releasewritelock(ptr);
+                 }
            }
-#elif defined THREADSIMULATE
-           for(i = 0; i < numparams; ++i) {
-             if(0 == isolateflags[i]) {
-               struct ___Object___ * tmpparam = (struct ___Object___ *)taskpointerarray[i+OFFSET];
-               releasewritelock(tmpparam);
-             }
-           }
-#endif
-         }
+         } // line 3015: if(islock)
+
+               //long clock6;
+         //clock6 = BAMBOO_GET_EXE_TIME();
+         //tprintf("sort: %d, grab: %d, check: %d \n", (int)(clock2-clock1), (int)(clock3-clock2), (int)(clock4-clock3));
 
-#ifdef RAWPROFILE
+#ifdef PROFILE
          // post task execution finish, set the end of the postTaskInfo
-         if(!taskInfoOverflow) {
-           taskInfoArray[taskInfoIndex]->endTime = raw_get_cycle();
-           taskInfoIndex++;
-           if(taskInfoIndex == TASKINFOLENGTH) {
-             taskInfoOverflow = true;
-           }
-         }
+         profileTaskEnd();
 #endif
 
-#if 0
-#ifndef RAW
-         freeRuntimeHash(forward);
-         freeRuntimeHash(reverse);
-         freemalloc();
-#endif
-#endif
          // Free up task parameter descriptor
          RUNFREE(currtpd->parameterArray);
          RUNFREE(currtpd);
-#if 0
-#ifndef RAW
-         forward=NULL;
-         reverse=NULL;
-#endif
-#endif
-#ifdef RAWDEBUG
-         raw_test_pass(0xe99a);
-#endif
-#ifdef RAWPATH
-         raw_test_pass(0xe99a);
+               currtpd = NULL;
+#ifdef DEBUG
+         BAMBOO_DEBUGPRINT(0xe99a);
 #endif
+         //long clock7;
+         //clock7 = BAMBOO_GET_EXE_TIME();
+         //tprintf("sort: %d, grab: %d, check: %d, release: %d, other %d \n", (int)(clock2-clock1), (int)(clock3-clock2), (int)(clock4-clock3), (int)(clock6-clock5), (int)(clock7-clock6));
 
-       }
-      }
-    }
-  }
-#ifdef RAWDEBUG
-  raw_test_pass(0xe999);
+      } //  
+    //} //  if (hashsize(activetasks)>0)  
+  } //  while(hashsize(activetasks)>0)
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xe99b);
 #endif
 }
 
 /* This function processes an objects tags */
-void processtags(struct parameterdescriptor *pd, int index, struct parameterwrapper *parameter, int * iteratorcount, int *statusarray, int numparams) {
+void processtags(struct parameterdescriptor *pd, 
+                            int index, 
+                                                                struct parameterwrapper *parameter, 
+                                                                int * iteratorcount, 
+                                                                int *statusarray, 
+                                                                int numparams) {
   int i;
 
   for(i=0; i<pd->numbertags; i++) {
@@ -4029,10 +3289,16 @@ void processtags(struct parameterdescriptor *pd, int index, struct parameterwrap
 }
 
 
-void processobject(struct parameterwrapper *parameter, int index, struct parameterdescriptor *pd, int *iteratorcount, int * statusarray, int numparams) {
+void processobject(struct parameterwrapper *parameter, 
+                              int index, 
+                                                                        struct parameterdescriptor *pd, 
+                                                                        int *iteratorcount, 
+                                                                        int * statusarray, 
+                                                                        int numparams) {
   int i;
   int tagcount=0;
-  struct ObjectHash * objectset=((struct parameterwrapper *)pd->queue)->objectset;
+  struct ObjectHash * objectset=
+               ((struct parameterwrapper *)pd->queue)->objectset;
 
   parameter->iterators[*iteratorcount].istag=0;
   parameter->iterators[*iteratorcount].slot=index;
@@ -4041,10 +3307,11 @@ void processobject(struct parameterwrapper *parameter, int index, struct paramet
 
   for(i=0; i<pd->numbertags; i++) {
     int slotid=pd->tagarray[2*i];
-    int tagid=pd->tagarray[2*i+1];
+    //int tagid=pd->tagarray[2*i+1];
     if (statusarray[slotid+numparams]!=0) {
       /* This tag has already been enqueued, use it to narrow search */
-      parameter->iterators[*iteratorcount].tagbindings[tagcount]=slotid+numparams;
+      parameter->iterators[*iteratorcount].tagbindings[tagcount]=
+                               slotid+numparams;
       tagcount++;
     }
   }
@@ -4055,7 +3322,9 @@ void processobject(struct parameterwrapper *parameter, int index, struct paramet
 
 /* This function builds the iterators for a task & parameter */
 
-void builditerators(struct taskdescriptor * task, int index, struct parameterwrapper * parameter) {
+void builditerators(struct taskdescriptor * task, 
+                               int index, 
+                                                                               struct parameterwrapper * parameter) {
   int statusarray[MAXTASKPARAMS];
   int i;
   int numparams=task->numParameters;
@@ -4065,7 +3334,8 @@ void builditerators(struct taskdescriptor * task, int index, struct parameterwra
   statusarray[index]=1; /* Initial parameter */
   /* Process tags for initial iterator */
 
-  processtags(task->descriptorarray[index], index, parameter, &iteratorcount, statusarray, numparams);
+  processtags(task->descriptorarray[index], index, parameter, 
+                               &iteratorcount, statusarray, numparams);
 
   while(1) {
 loopstart:
@@ -4077,7 +3347,8 @@ loopstart:
        for(j=0; j<pd->numbertags; j++) {
          int slotid=pd->tagarray[2*j];
          if(statusarray[slotid+numparams]!=0) {
-           processobject(parameter, i, pd, &iteratorcount, statusarray, numparams);
+           processobject(parameter, i, pd, &iteratorcount, statusarray, 
+                                                 numparams);
            processtags(pd, i, parameter, &iteratorcount, statusarray, numparams);
            goto loopstart;
          }
@@ -4117,21 +3388,13 @@ loopstart:
 void printdebug() {
   int i;
   int j;
-#ifdef THREADSIMULATE
-  int numofcore = pthread_getspecific(key);
-  for(i=0; i<numtasks[numofcore]; i++) {
-    struct taskdescriptor * task=taskarray[numofcore][i];
-#else
-#ifdef RAW
-  if(corenum > NUMCORES - 1) {
+  if(BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1) {
     return;
   }
-#endif
-  for(i=0; i<numtasks[corenum]; i++) {
-    struct taskdescriptor * task=taskarray[corenum][i];
-#endif
-#ifndef RAW
-    printf("%s\n", task->name);
+  for(i=0; i<numtasks[BAMBOO_NUM_OF_CORE]; i++) {
+    struct taskdescriptor * task=taskarray[BAMBOO_NUM_OF_CORE][i];
+#ifndef RAW 
+       printf("%s\n", task->name);
 #endif
     for(j=0; j<task->numParameters; j++) {
       struct parameterdescriptor *param=task->descriptorarray[j];
@@ -4139,7 +3402,7 @@ void printdebug() {
       struct ObjectHash * set=parameter->objectset;
       struct ObjectIterator objit;
 #ifndef RAW
-      printf("  Parameter %d\n", j);
+         printf("  Parameter %d\n", j);
 #endif
       ObjectHashiterator(set, &objit);
       while(ObjhasNext(&objit)) {
@@ -4157,14 +3420,18 @@ void printdebug() {
        } else if (tagptr->type==TAGTYPE) {
 #ifndef RAW
          printf("      tag=%lx\n",tagptr);
-#endif
+#else
          ;
+#endif
        } else {
          int tagindex=0;
          struct ArrayObject *ao=(struct ArrayObject *)tagptr;
          for(; tagindex<ao->___cachedCode___; tagindex++) {
 #ifndef RAW
-           printf("      tag=%lx\n",ARRAYGET(ao, struct ___TagDescriptor___*, tagindex));
+                 printf("      tag=%lx\n",ARRAYGET(ao, struct ___TagDescriptor___*, 
+                                                tagindex));
+#else
+                 ;
 #endif
          }
        }
@@ -4179,19 +3446,11 @@ void printdebug() {
 
 void processtasks() {
   int i;
-#ifdef RAW
-  if(corenum > NUMCORES - 1) {
+  if(BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1) {
     return;
   }
-#endif
-#ifdef THREADSIMULATE
-  int numofcore = pthread_getspecific(key);
-  for(i=0; i<numtasks[numofcore]; i++) {
-    struct taskdescriptor *task=taskarray[numofcore][i];
-#else
-  for(i=0; i<numtasks[corenum]; i++) {
-    struct taskdescriptor * task=taskarray[corenum][i];
-#endif
+  for(i=0; i<numtasks[BAMBOO_NUM_OF_CORE]; i++) {
+    struct taskdescriptor * task=taskarray[BAMBOO_NUM_OF_CORE][i];
     int j;
 
     /* Build objectsets */
@@ -4221,7 +3480,8 @@ void toiReset(struct tagobjectiterator * it) {
   }
 }
 
-int toiHasNext(struct tagobjectiterator *it, void ** objectarray OPTARG(int * failed)) {
+int toiHasNext(struct tagobjectiterator *it, 
+                          void ** objectarray OPTARG(int * failed)) {
   if (it->istag) {
     /* Iterate tag */
     /* Get object with tags */
@@ -4237,7 +3497,8 @@ int toiHasNext(struct tagobjectiterator *it, void ** objectarray OPTARG(int * fa
       struct ArrayObject *ao=(struct ArrayObject *) tagptr;
       int tagindex=it->tagobjindex;
       for(; tagindex<ao->___cachedCode___; tagindex++) {
-       struct ___TagDescriptor___ *td=ARRAYGET(ao, struct ___TagDescriptor___ *, tagindex);
+       struct ___TagDescriptor___ *td=
+               ARRAYGET(ao, struct ___TagDescriptor___ *, tagindex);
        if (td->flag==it->tagid) {
          it->tagobjindex=tagindex; /* Found right type of tag */
          return 1;
@@ -4265,7 +3526,7 @@ int toiHasNext(struct tagobjectiterator *it, void ** objectarray OPTARG(int * fa
       struct ArrayObject *ao=(struct ArrayObject *) objptr;
       int tagindex;
       int i;
-      for(tagindex=it->tagobjindex; tagindex<ao->___cachedCode___; tagindex++) {
+      for(tagindex=it->tagobjindex;tagindex<ao->___cachedCode___;tagindex++) {
        struct ___Object___ *objptr=ARRAYGET(ao, struct ___Object___*, tagindex);
        if (!ObjectHashcontainskey(it->objectset, (int) objptr))
          continue;
@@ -4287,21 +3548,25 @@ nexttag:
   }
 }
 
-int containstag(struct ___Object___ *ptr, struct ___TagDescriptor___ *tag) {
+/* Report whether the object `ptr` is referenced by the tag `tag`.
+ *
+ * tag->flagptr is read unconditionally, so `tag` and tag->flagptr must be
+ * non-NULL.  flagptr is treated either as a single object or, when its
+ * type field equals OBJECTARRAYTYPE, as an ArrayObject of object pointers.
+ *
+ * Returns 1 (or the nonzero result of the pointer comparison) when `ptr`
+ * is found, 0 otherwise. */
+int containstag(struct ___Object___ *ptr, 
+                           struct ___TagDescriptor___ *tag) {
   int j;
   struct ___Object___ * objptr=tag->flagptr;
   if (objptr->type==OBJECTARRAYTYPE) {
     struct ArrayObject *ao=(struct ArrayObject *)objptr;
+    /* ___cachedCode___ is used as the loop bound here, i.e. as the number
+     * of array slots to examine -- presumably the count of valid entries;
+     * TODO confirm against the ArrayObject definition. */
     for(j=0; j<ao->___cachedCode___; j++) {
-      if (ptr==ARRAYGET(ao, struct ___Object___*, j))
+      if (ptr==ARRAYGET(ao, struct ___Object___*, j)) {
        return 1;
+                       }
     }
     return 0;
-  } else
+  } else {
+    /* Single-object case: the tag matches only that one object. */
     return objptr==ptr;
+       }
 }
 
-void toiNext(struct tagobjectiterator *it, void ** objectarray OPTARG(int * failed)) {
+void toiNext(struct tagobjectiterator *it, 
+                        void ** objectarray OPTARG(int * failed)) {
   /* hasNext has all of the intelligence */
   if(it->istag) {
     /* Iterate tag */
@@ -4313,7 +3578,8 @@ void toiNext(struct tagobjectiterator *it, void ** objectarray OPTARG(int * fail
       objectarray[it->slot]=tagptr;
     } else {
       struct ArrayObject *ao=(struct ArrayObject *) tagptr;
-      objectarray[it->slot]=ARRAYGET(ao, struct ___TagDescriptor___ *, it->tagobjindex++);
+      objectarray[it->slot]=
+                               ARRAYGET(ao, struct ___TagDescriptor___ *, it->tagobjindex++);
     }
   } else if (it->numtags>0) {
     /* Use tags to locate appropriate objects */
@@ -4324,7 +3590,8 @@ void toiNext(struct tagobjectiterator *it, void ** objectarray OPTARG(int * fail
       objectarray[it->slot]=objptr;
     } else {
       struct ArrayObject *ao=(struct ArrayObject *) objptr;
-      objectarray[it->slot]=ARRAYGET(ao, struct ___Object___ *, it->tagobjindex++);
+      objectarray[it->slot]=
+                               ARRAYGET(ao, struct ___Object___ *, it->tagobjindex++);
     }
   } else {
     /* Iterate object */
@@ -4332,4 +3599,184 @@ void toiNext(struct tagobjectiterator *it, void ** objectarray OPTARG(int * fail
     Objnext(&it->it);
   }
 }
+
+#ifdef PROFILE
+/* Open a profiling record for the task (or runtime phase) called
+ * `taskname`.
+ *
+ * Does nothing once taskInfoOverflow is set.  Otherwise a fresh TaskInfo
+ * is allocated with RUNMALLOC, stored in taskInfoArray at the current
+ * taskInfoIndex, stamped with the current BAMBOO_GET_EXE_TIME() as its
+ * start time, and its remaining fields are reset (endTime and exitIndex
+ * to -1, newObjs to NULL).  taskInfoIndex itself is not advanced here.
+ * The `taskname` pointer is stored as-is, not copied. */
+inline void profileTaskStart(char * taskname) {
+  TaskInfo* record;
+
+  if(taskInfoOverflow) {
+    return;
+  }
+
+  record = RUNMALLOC(sizeof(struct task_info));
+  // publish the slot first, then fill it in (same order as before)
+  taskInfoArray[taskInfoIndex] = record;
+  record->taskName = taskname;
+  record->startTime = BAMBOO_GET_EXE_TIME();
+  record->endTime = -1;
+  record->exitIndex = -1;
+  record->newObjs = NULL;
+}
+
+/* Close the profiling record at taskInfoArray[taskInfoIndex].
+ *
+ * Does nothing once taskInfoOverflow is set.  Otherwise the record's
+ * endTime is stamped with BAMBOO_GET_EXE_TIME() and taskInfoIndex is
+ * advanced; when the index reaches TASKINFOLENGTH the overflow flag is
+ * raised (the index is left at TASKINFOLENGTH, it does not wrap). */
+inline void profileTaskEnd() {
+  if(taskInfoOverflow) {
+    return;
+  }
+
+  taskInfoArray[taskInfoIndex]->endTime = BAMBOO_GET_EXE_TIME();
+  if(++taskInfoIndex == TASKINFOLENGTH) {
+    taskInfoOverflow = true;
+  }
+}
+
+/*
+ * Output the profiling data collected in taskInfoArray[0..taskInfoIndex).
+ *
+ * With USEIO defined, one comma-separated line is printed per record
+ * (name, start, end, duration, exit index, then a ", type, count" pair
+ * for every distinct new-object type), followed by a summary that splits
+ * totalexetime into task execution, objqueue checking, pre-processing,
+ * post-processing and "other" time.  Records named "tpd checking",
+ * "post task execution" and "objqueue checking" are counted as runtime
+ * bookkeeping; every other record counts as a task.
+ *
+ * Without USEIO the same per-record data is emitted through
+ * BAMBOO_DEBUGPRINT / BAMBOO_DEBUGPRINT_REG, framed by the marker values
+ * 0xdddd (begin) and 0xeeee (end); no summary is produced.
+ *
+ * Side effect: tallying the new-object types loops on
+ * isEmpty()/getItem() over each record's newObjs queue, which relies on
+ * getItem() consuming an item -- so the queues are presumably empty
+ * afterwards and the data can only be output once.
+ *
+ * NOTE(review): the per-record hash table and its iterator are never
+ * released.  This looks like an end-of-run dump where that is harmless;
+ * confirm, or free them with the matching RuntimeHash release calls.
+ */
+void outputProfileData() {
+#ifdef USEIO
+  int i;
+  unsigned long long totaltasktime = 0;
+  unsigned long long preprocessingtime = 0;
+  unsigned long long objqueuecheckingtime = 0;
+  unsigned long long postprocessingtime = 0;
+  unsigned long long other = 0;
+  unsigned long long averagetasktime = 0;
+  int tasknum = 0;
+
+  printf("Task Name, Start Time, End Time, Duration, Exit Index(, NewObj Name, Num)+\n");
+  // output task related info
+  for(i = 0; i < taskInfoIndex; i++) {
+    TaskInfo* tmpTInfo = taskInfoArray[i];
+    unsigned long long duration = tmpTInfo->endTime - tmpTInfo->startTime;
+    printf("%s, %lld, %lld, %lld, %lld",
+           tmpTInfo->taskName, tmpTInfo->startTime, tmpTInfo->endTime,
+           duration, tmpTInfo->exitIndex);
+    // summarize new obj info: count how many objects of each type
+    // (keyed by the type-name pointer) this record created
+    if(tmpTInfo->newObjs != NULL) {
+      struct RuntimeHash * nobjtbl = allocateRuntimeHash(5);
+      struct RuntimeIterator * iter = NULL;
+      while(0 == isEmpty(tmpTInfo->newObjs)) {
+        char * objtype = (char *)(getItem(tmpTInfo->newObjs));
+        if(RuntimeHashcontainskey(nobjtbl, (int)(objtype))) {
+          // bump the existing count: remove and re-add with count+1
+          int num = 0;
+          RuntimeHashget(nobjtbl, (int)objtype, &num);
+          RuntimeHashremovekey(nobjtbl, (int)objtype);
+          num++;
+          RuntimeHashadd(nobjtbl, (int)objtype, num);
+        } else {
+          RuntimeHashadd(nobjtbl, (int)objtype, 1);
+        }
+      }
+
+      // output all new obj info
+      iter = RuntimeHashcreateiterator(nobjtbl);
+      while(RunhasNext(iter)) {
+        char * objtype = (char *)Runkey(iter);
+        int num = Runnext(iter);
+        printf(", %s, %d", objtype, num);
+      }
+    }
+    printf("\n");
+    // attribute the duration to the matching summary bucket
+    if(strcmp(tmpTInfo->taskName, "tpd checking") == 0) {
+      preprocessingtime += duration;
+    } else if(strcmp(tmpTInfo->taskName, "post task execution") == 0) {
+      postprocessingtime += duration;
+    } else if(strcmp(tmpTInfo->taskName, "objqueue checking") == 0) {
+      objqueuecheckingtime += duration;
+    } else {
+      totaltasktime += duration;
+      averagetasktime += duration;
+      tasknum++;
+    }
+  }
+
+  if(taskInfoOverflow) {
+    printf("Caution: task info overflow!\n");
+  }
+
+  // "other" is whatever is left after ALL reported buckets; objqueue
+  // checking is reported separately below, so it must be excluded here
+  // too, otherwise the percentages add up to more than 100%.
+  other = totalexetime-totaltasktime-preprocessingtime-postprocessingtime
+          -objqueuecheckingtime;
+  // tasknum is 0 when nothing but bookkeeping records (or no records at
+  // all) were collected; avoid dividing by zero and report an average of 0
+  if(tasknum > 0) {
+    averagetasktime /= tasknum;
+  }
+
+  printf("\nTotal time: %lld\n", totalexetime);
+  printf("Total task execution time: %lld (%d%%)\n", totaltasktime,
+         (int)(((double)totaltasktime/(double)totalexetime)*100));
+  printf("Total objqueue checking time: %lld (%d%%)\n",
+         objqueuecheckingtime,
+         (int)(((double)objqueuecheckingtime/(double)totalexetime)*100));
+  printf("Total pre-processing time: %lld (%d%%)\n", preprocessingtime,
+         (int)(((double)preprocessingtime/(double)totalexetime)*100));
+  printf("Total post-processing time: %lld (%d%%)\n", postprocessingtime,
+         (int)(((double)postprocessingtime/(double)totalexetime)*100));
+  printf("Other time: %lld (%d%%)\n", other,
+         (int)(((double)other/(double)totalexetime)*100));
+
+  printf("\nAverage task execution time: %lld\n", averagetasktime);
+#else
+  int i = 0;
+  int j = 0;
+
+  BAMBOO_DEBUGPRINT(0xdddd);
+  // output task related info
+  for(i= 0; i < taskInfoIndex; i++) {
+    TaskInfo* tmpTInfo = taskInfoArray[i];
+    char* tmpName = tmpTInfo->taskName;
+    int nameLen = strlen(tmpName);
+    // the name is emitted one character per debug word, between the
+    // 0xddda and 0xdddb markers
+    BAMBOO_DEBUGPRINT(0xddda);
+    for(j = 0; j < nameLen; j++) {
+      BAMBOO_DEBUGPRINT_REG(tmpName[j]);
+    }
+    BAMBOO_DEBUGPRINT(0xdddb);
+    BAMBOO_DEBUGPRINT_REG(tmpTInfo->startTime);
+    BAMBOO_DEBUGPRINT_REG(tmpTInfo->endTime);
+    BAMBOO_DEBUGPRINT_REG(tmpTInfo->exitIndex);
+    if(tmpTInfo->newObjs != NULL) {
+      struct RuntimeHash * nobjtbl = allocateRuntimeHash(5);
+      struct RuntimeIterator * iter = NULL;
+      while(0 == isEmpty(tmpTInfo->newObjs)) {
+        char * objtype = (char *)(getItem(tmpTInfo->newObjs));
+        if(RuntimeHashcontainskey(nobjtbl, (int)(objtype))) {
+          // bump the existing count: remove and re-add with count+1
+          int num = 0;
+          RuntimeHashget(nobjtbl, (int)objtype, &num);
+          RuntimeHashremovekey(nobjtbl, (int)objtype);
+          num++;
+          RuntimeHashadd(nobjtbl, (int)objtype, num);
+        } else {
+          RuntimeHashadd(nobjtbl, (int)objtype, 1);
+        }
+      }
+
+      // output all new obj info
+      iter = RuntimeHashcreateiterator(nobjtbl);
+      while(RunhasNext(iter)) {
+        char * objtype = (char *)Runkey(iter);
+        int num = Runnext(iter);
+        // separate name so the task's nameLen above is not shadowed
+        int objNameLen = strlen(objtype);
+        BAMBOO_DEBUGPRINT(0xddda);
+        for(j = 0; j < objNameLen; j++) {
+          BAMBOO_DEBUGPRINT_REG(objtype[j]);
+        }
+        BAMBOO_DEBUGPRINT(0xdddb);
+        BAMBOO_DEBUGPRINT_REG(num);
+      }
+    }
+    // 0xdddc terminates one record
+    BAMBOO_DEBUGPRINT(0xdddc);
+  }
+
+  if(taskInfoOverflow) {
+    BAMBOO_DEBUGPRINT(0xefee);
+  }
+
+  // output interrupt related info
+  /*for(i = 0; i < interruptInfoIndex; i++) {
+       InterruptInfo* tmpIInfo = interruptInfoArray[i];
+       BAMBOO_DEBUGPRINT(0xddde);
+       BAMBOO_DEBUGPRINT_REG(tmpIInfo->startTime);
+       BAMBOO_DEBUGPRINT_REG(tmpIInfo->endTime);
+       BAMBOO_DEBUGPRINT(0xdddf);
+     }
+
+     if(interruptInfoOverflow) {
+       BAMBOO_DEBUGPRINT(0xefef);
+     }*/
+
+  BAMBOO_DEBUGPRINT(0xeeee);
+#endif
+}
+#endif  // #ifdef PROFILE
+
 #endif