From 9f8a96e73e9c809c2823047802af7b53f9c7fd84 Mon Sep 17 00:00:00 2001 From: jihoonl Date: Tue, 27 Apr 2010 16:57:25 +0000 Subject: [PATCH] polling thread ping only leader machine when a machine failure is detected --- .../Runtime/DSTM/interface_recovery/dstm.h | 2 +- .../DSTM/interface_recovery/dstmserver.c | 60 +++++++++++++------ .../Runtime/DSTM/interface_recovery/trans.c | 3 +- 3 files changed, 45 insertions(+), 20 deletions(-) diff --git a/Robust/src/Runtime/DSTM/interface_recovery/dstm.h b/Robust/src/Runtime/DSTM/interface_recovery/dstm.h index c7c9f216..ae3c6d02 100644 --- a/Robust/src/Runtime/DSTM/interface_recovery/dstm.h +++ b/Robust/src/Runtime/DSTM/interface_recovery/dstm.h @@ -301,7 +301,7 @@ unsigned int getBackupMachine(unsigned int mid); unsigned int getDuplicatedPrimaryMachine(unsigned int mid); int getNumLiveHostsInSystem(); int getMyStatus(); -void* startAsking(); +void* startPolling(); unsigned int checkIfAnyMachineDead(int*); void clearDeadThreadsNotification(); /* end duplication */ diff --git a/Robust/src/Runtime/DSTM/interface_recovery/dstmserver.c b/Robust/src/Runtime/DSTM/interface_recovery/dstmserver.c index d1049347..8bf19a18 100644 --- a/Robust/src/Runtime/DSTM/interface_recovery/dstmserver.c +++ b/Robust/src/Runtime/DSTM/interface_recovery/dstmserver.c @@ -43,6 +43,7 @@ tlist_t* transList; int okCommit; // machine flag extern numWaitMachine; extern unsigned int currentEpoch; +unsigned int leader_index; #endif @@ -176,7 +177,7 @@ void *dstmListen(void *lfd) { #ifdef RECOVERY if(firsttime) { do { - retval = pthread_create(&thread_dstm_asking, NULL, startAsking, NULL); + retval = pthread_create(&thread_dstm_asking, NULL, startPolling, NULL); }while(retval!=0); firsttime=0; pthread_detach(thread_dstm_asking); @@ -191,7 +192,7 @@ void *dstmListen(void *lfd) { } #ifdef RECOVERY -void* startAsking() +void* startPolling() { unsigned int deadMachineIndex = -1; int i; @@ -236,24 +237,41 @@ unsigned int checkIfAnyMachineDead(int* socklist) char response; while(1){ - for(i = 0; i< numHostsInSystem;i++) { - if(socklist[i] > 0) { - send_data(socklist[i], &control,sizeof(char)); - if(recv_data(socklist[i], &response, sizeof(char)) < 0) { - // if machine is dead, returns index of socket - return i; - } - else { - // machine responded - if(response != LIVE) { + if(okCommit == TRANS_OK) { + for(i = 0; i< numHostsInSystem;i++) { + if(socklist[i] > 0) { + send_data(socklist[i], &control,sizeof(char)); + + if(recv_data(socklist[i], &response, sizeof(char)) < 0) { + // if machine is dead, returns index of socket return i; } - } // end else - }// end if(socklist[i] - } // end for() + else { + // machine responded + if(response != LIVE) { + return i; + } + } // end else + }// end if(socklist[i] + } // end for() - clearDeadThreadsNotification(); + clearDeadThreadsNotification(); + } + else { + send_data(socklist[i],&control,sizeof(char)); + + if(recv_data(socklist[i], &response, sizeof(char)) < 0) { + // if machine is dead, returns index of socket + return i; + } + else { + // machine responded + if(response != LIVE) { + return i; + } + } // end else + } sleep(numLiveHostsInSystem); // wait for seconds for next checking } // end while(1) @@ -498,7 +516,9 @@ void *dstmAccept(void *acceptfd) { #ifdef RECOVERY case REQUEST_TRANS_WAIT: { + unsigned int new_leader_index; recv_data((int)acceptfd,&epoch_num,sizeof(unsigned int)); + recv_data((int)acceptfd,&new_leader_index,sizeof(unsigned int)); if(inspectEpoch(epoch_num) < 0) { response = RESPOND_HIGHER_EPOCH; @@ -506,8 +526,14 @@ void *dstmAccept(void *acceptfd) { } else { printf("Got new Leader! : %d\n",epoch_num); - currentEpoch = epoch_num; + stopTransactions(TRANS_BEFORE); + + pthread_mutex_lock(&recovery_mutex); + currentEpoch = epoch_num; + leader_index = new_leader_index; + pthread_mutex_unlock(&recovery_mutex); + response = RESPOND_TRANS_WAIT; send_data((int)acceptfd,&response,sizeof(char)); sendMyList((int)acceptfd); diff --git a/Robust/src/Runtime/DSTM/interface_recovery/trans.c b/Robust/src/Runtime/DSTM/interface_recovery/trans.c index 073daa5f..d5d433ef 100644 --- a/Robust/src/Runtime/DSTM/interface_recovery/trans.c +++ b/Robust/src/Runtime/DSTM/interface_recovery/trans.c @@ -532,8 +532,6 @@ int dstmStartup(const char * option) { updateLiveHosts(); setLocateObjHosts(); updateLiveHostsCommit(); -// leader = paxos(hostIpAddrs,liveHosts,myIpAddr,numHostsInSystem,numLiveHostsInSystem); -// printHostsStatus(); if(!allHostsLive()) { printf("Not all hosts live. Exiting.\n"); exit(-1); @@ -1895,6 +1893,7 @@ int pingMachines(unsigned int epoch_num,int* sdlist,tlist_t** tList) request = REQUEST_TRANS_WAIT; send_data(sdlist[i],&request, sizeof(char)); send_data(sdlist[i],&epoch_num,sizeof(unsigned int)); + send_data(sdlist[i],&myIndexInHostArray,sizeof(unsigned int)); } /* stop all local transactions */ -- 2.34.1