From 1f49ebdf8afeab77c5e1b251bd991175925c4cb2 Mon Sep 17 00:00:00 2001 From: jzhou Date: Sat, 21 Aug 2010 00:20:34 +0000 Subject: [PATCH] Fix a performance bug in the multicore gc version. In hvc files, we should not reserve more than 16M for a core's stack+heap; otherwise performance is greatly affected. This is because all the reserved memory in hvc files takes *wired* DTLB entries. The more of these entries are reserved, the fewer entries the program can use for other memory, and thus the more DTLB misses happen during execution. Also fixed some small bugs that broke the non-gc version and added code for DTLB flushing --- .../GC/RayTracer/RayTracerBench.java | 2 +- Robust/src/Runtime/mem.c | 36 ++++++++++++++++++- Robust/src/Runtime/multicoregarbage.c | 18 ++++++++++ Robust/src/Runtime/multicoregarbage.h | 5 +++ Robust/src/Runtime/multicoreruntime.h | 15 +++++++- Robust/src/Runtime/multicoretask.c | 7 ++++ Robust/src/buildscript | 12 ------- 7 files changed, 80 insertions(+), 15 deletions(-) diff --git a/Robust/src/Benchmarks/Scheduling/GC/RayTracer/RayTracerBench.java b/Robust/src/Benchmarks/Scheduling/GC/RayTracer/RayTracerBench.java index 3e562ea3..5a288fda 100644 --- a/Robust/src/Benchmarks/Scheduling/GC/RayTracer/RayTracerBench.java +++ b/Robust/src/Benchmarks/Scheduling/GC/RayTracer/RayTracerBench.java @@ -1,7 +1,7 @@ task t1(StartupObject s{initialstate}) { //System.printString("task t1\n"); - int threadnum = 56; // 62; // 56; + int threadnum = 62; // 56; int size = threadnum * 25; Composer comp = new Composer(threadnum, size){compose}; RayTracer rt = new RayTracer(); diff --git a/Robust/src/Runtime/mem.c b/Robust/src/Runtime/mem.c index 227c4ee2..016312ed 100644 --- a/Robust/src/Runtime/mem.c +++ b/Robust/src/Runtime/mem.c @@ -9,8 +9,21 @@ void * mycalloc(int m, void * p = NULL; int isize = size; BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT(); +#ifdef MULTICORE_GC + extern bool gc_localheap_s; +inermycalloc_i: + p = gc_localheap_s ? 
BAMBOO_LOCAL_MEM_CALLOC_S(m, isize) : + BAMBOO_LOCAL_MEM_CALLOC(m, isize); +#else p = BAMBOO_LOCAL_MEM_CALLOC(m, isize); // calloc(m, isize); +#endif if(p == NULL) { +#ifdef MULTICORE_GC + if(!gc_localheap_s) { + gc_localheap_s = true; + goto inermycalloc_i; + } +#endif BAMBOO_EXIT(0xc001); } BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME(); @@ -121,18 +134,39 @@ void * mycalloc_i(int m, #ifdef DEBUG tprintf("ask for local mem: %x \n", isize); #endif +#ifdef MULTICORE_GC + extern bool gc_localheap_s; +inermycalloc_i: + p = gc_localheap_s ? BAMBOO_LOCAL_MEM_CALLOC_S(m, isize) : + BAMBOO_LOCAL_MEM_CALLOC(m, isize); +#else p = BAMBOO_LOCAL_MEM_CALLOC(m, isize); // calloc(m, isize); +#endif #ifdef DEBUG tprintf("new obj in local mem: %x, %x \n", p, isize); #endif if(p == NULL) { +#ifdef MULTICORE_GC + if(!gc_localheap_s) { + gc_localheap_s = true; + goto inermycalloc_i; + } +#endif BAMBOO_EXIT(0xc004); } return p; } void myfree(void * ptr) { - BAMBOO_LOCAL_MEM_FREE(ptr); +#ifdef MULTICORE_GC + if(ptr >= BAMBOO_LOCAL_HEAP_START_VA ) { +#endif + BAMBOO_LOCAL_MEM_FREE(ptr); +#ifdef MULTICORE_GC + } else if(ptr >= BAMBOO_LOCAL_HEAP_START_VA_S) { + BAMBOO_LOCAL_MEM_FREE_S(ptr); + } +#endif return; } diff --git a/Robust/src/Runtime/multicoregarbage.c b/Robust/src/Runtime/multicoregarbage.c index ca476ad0..8d669e49 100644 --- a/Robust/src/Runtime/multicoregarbage.c +++ b/Robust/src/Runtime/multicoregarbage.c @@ -3040,6 +3040,12 @@ pregccheck: #ifdef RAWPATH // TODO GC_DEBUG printf("(%x,%x) start gc! 
\n", udn_tile_coord_x(), udn_tile_coord_y()); //dumpSMem(); +#endif +#ifdef GC_FLUSH_DTLB + if(gc_num_flush_dtlb < GC_NUM_FLUSH_DTLB) { + BAMBOO_CLEAN_DTLB(); + gc_num_flush_dtlb++; + } #endif gcprocessing = true; gcphase = INITPHASE; @@ -3405,6 +3411,12 @@ pregccheck: gc_num_forwardobj = 0; #endif // GC_PROFLIE_S*/ } else if(BAMBOO_NUM_OF_CORE < NUMCORES4GC) { +#ifdef GC_FLUSH_DTLB + if(gc_num_flush_dtlb < GC_NUM_FLUSH_DTLB) { + BAMBOO_CLEAN_DTLB(); + gc_num_flush_dtlb++; + } +#endif gcprocessing = true; gc_collect(stackptr); @@ -3415,6 +3427,12 @@ pregccheck: gcflag = false; gcprocessing = false; } else { +#ifdef GC_FLUSH_DTLB + if(gc_num_flush_dtlb < GC_NUM_FLUSH_DTLB) { + BAMBOO_CLEAN_DTLB(); + gc_num_flush_dtlb++; + } +#endif // not a gc core, should wait for gcfinish msg gcprocessing = true; gc_nocollect(stackptr); diff --git a/Robust/src/Runtime/multicoregarbage.h b/Robust/src/Runtime/multicoregarbage.h index a6722e87..93a1b252 100644 --- a/Robust/src/Runtime/multicoregarbage.h +++ b/Robust/src/Runtime/multicoregarbage.h @@ -20,6 +20,11 @@ // let each gc core to have one big block, this is very important // for the computation of NUMBLOCKS(s, n), DO NOT change this! 
+#ifdef GC_FLUSH_DTLB +#define GC_NUM_FLUSH_DTLB 1 +int gc_num_flush_dtlb; +#endif + #define NUMPTRS 100 // for GC profile diff --git a/Robust/src/Runtime/multicoreruntime.h b/Robust/src/Runtime/multicoreruntime.h index 675821d6..5147ed2d 100644 --- a/Robust/src/Runtime/multicoreruntime.h +++ b/Robust/src/Runtime/multicoreruntime.h @@ -318,6 +318,10 @@ struct Queue * totransobjqueue; // queue to hold objs to be transferred //((unsigned long long int)(3.0 * 1024 * 1024 * 1024)) // 3G #endif // GC_DEBUG +#ifdef MULTICORE_GC +volatile bool gc_localheap_s; +#endif + #ifdef MULTICORE_GC #include "multicoregarbage.h" @@ -562,11 +566,19 @@ void outputProfileData(); // request response // // BAMBOO_LOCAL_MEM_CALLOC(x, y): allocate an array of x elements each of // // whose size in bytes is y on local memory // +// which is given by the hypervisor // // BAMBOO_LOCAL_MEM_FREE(x): free space with ptr x on local memory // // BAMBOO_LOCAL_MEM_CLOSE(): close the local heap // +// BAMBOO_LOCAL_MEM_CALLOC_S(x, y): allocate an array of x elements each of// +// whose size in bytes is y on local // +// memory which is not from the hypervisor// +// but is allocated from the free memory // +// BAMBOO_LOCAL_MEM_FREE_S(x): free space with ptr x on self-allocated // +// local memory // +// BAMBOO_LOCAL_MEM_CLOSE_S(): close the self-allocated local heap // // BAMBOO_SHARE_MEM_CALLOC_I(x, y): allocate an array of x elements each of// // whose size in bytes is y on shared memory// -// BAMBOO_SHARE_MEM_CLOSE(): close the shared heap // +// BAMBOO_SHARE_MEM_CLOSE(): close the shared heap // // BAMBOO_CACHE_LINE_SIZE: the cache line size // // BAMBOO_CACHE_LINE_MASK: mask for a cache line // // BAMBOO_CACHE_FLUSH_RANGE(x, y): flush cache lines started at x with // @@ -577,6 +589,7 @@ void outputProfileData(); // hint, the processor will not fetch the // // current content of the memory and directly // // write // +// BAMBOO_CLEAN_DTLB(): zero-out all the dtlb entries // 
///////////////////////////////////////////////////////////////////////////// #endif // #ifdef MULTICORE diff --git a/Robust/src/Runtime/multicoretask.c b/Robust/src/Runtime/multicoretask.c index 9a6e6ae8..8bf89843 100644 --- a/Robust/src/Runtime/multicoretask.c +++ b/Robust/src/Runtime/multicoretask.c @@ -311,6 +311,10 @@ void initruntimedata() { gc_num_forwardobj = 0; gc_num_profiles = NUMCORESACTIVE - 1; #endif +#ifdef GC_FLUSH_DTLB + gc_num_flush_dtlb = 0; +#endif + gc_localheap_s = false; #else // create the lock table, lockresult table and obj queue locktable.size = 20; @@ -602,6 +606,9 @@ void checkCoreStatus() { BAMBOO_DEBUGPRINT(BAMBOO_GET_EXE_TIME() - bamboo_start_time); //BAMBOO_DEBUGPRINT_REG(total_num_t6); // TODO for test +#ifdef GC_FLUSH_DTLB + BAMBOO_DEBUGPRINT_REG(gc_num_flush_dtlb); +#endif #ifndef BAMBOO_MEMPROF BAMBOO_DEBUGPRINT(0xbbbbbbbb); #endif diff --git a/Robust/src/buildscript b/Robust/src/buildscript index 4b9ea31d..b8171764 100755 --- a/Robust/src/buildscript +++ b/Robust/src/buildscript @@ -201,7 +201,6 @@ OPTIONALFLAG=false EXITAFTERANALYSIS=false ASSEMBLY=false GCCORES='' -GC1COREFLAG=false TILERAN1COREFLAG=false TILERA56COREFLAG=false @@ -390,10 +389,6 @@ elif [[ $1 = '-numcore4gc' ]] then JAVAOPTS="$JAVAOPTS -numcore4gc $2" GCCORES="GC_$2" -if [[ "$2" -eq "1" ]] -then -GC1COREFLAG=true -fi shift elif [[ $1 = '-raw' ]] then @@ -812,13 +807,6 @@ TILERA_INDIR="BME" MAKEFILE="Makefile.tilera.$TILERACONFIG" SIMHVC="sim.hvc.$TILERACONFIG" PCIHVC="pci.hvc.$TILERACONFIG" -if $GC1COREFLAG -then # 1-core gc - if $TILERAN1COREFLAG - then # not only with 1 core - PCIHVC="$PCIHVC.1gc" - fi -fi if $TILERA56COREFLAG then PCIHVC="$PCIHVC.56" -- 2.34.1