From: jihoonl Date: Fri, 21 May 2010 21:48:38 +0000 (+0000) Subject: single failure X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=aeda07c16896c37b04b5943359247fbcc6e83363;p=IRC.git single failure --- diff --git a/Robust/src/Runtime/DSTM/interface_recovery/dstmserver.c b/Robust/src/Runtime/DSTM/interface_recovery/dstmserver.c index ac5defb4..180f67b5 100644 --- a/Robust/src/Runtime/DSTM/interface_recovery/dstmserver.c +++ b/Robust/src/Runtime/DSTM/interface_recovery/dstmserver.c @@ -97,6 +97,7 @@ int dstmInit(void) { okCommit = TRANS_OK; currentEpoch = 1; + leader_index = -1; #endif @@ -259,18 +260,20 @@ unsigned int checkIfAnyMachineDead(int* socklist) clearDeadThreadsNotification(); } else { - send_data(socklist[i],&control,sizeof(char)); - - if(recv_data(socklist[i], &response, sizeof(char)) < 0) { - // if machine is dead, returns index of socket - return i; - } - else { - // machine responded - if(response != LIVE) { + if(leader_index >= 0 ) { + send_data(socklist[i],&control,sizeof(char)); + + if(recv_data(socklist[i], &response, sizeof(char)) < 0) { + // if machine is dead, returns index of socket return i; } - } // end else + else { + // machine responded + if(response != LIVE) { + return i; + } + } // end else + } } sleep(numLiveHostsInSystem); // wait for seconds for next checking @@ -304,9 +307,6 @@ void *dstmAccept(void *acceptfd) { unsigned int *oidarry, numoid, mid, threadid; int n, v; -#ifdef DEBUG - printf("%s-> Entering dstmAccept\n", __func__); fflush(stdout); -#endif /* Receive control messages from other machines */ while(1) { int ret=recv_data_errorcode((int)acceptfd, &control, sizeof(char)); @@ -320,9 +320,6 @@ void *dstmAccept(void *acceptfd) { // exit(0); break; } -#ifdef DEBUG - printf("%s-> dstmAccept control = %d\n", __func__, (int)control); -#endif switch(control) { case READ_REQUEST: #ifdef DEBUG @@ -566,6 +563,11 @@ void *dstmAccept(void *acceptfd) { printf("RESTART!!!\n"); okCommit = TRANS_OK; pthread_mutex_unlock(&liveHosts_mutex); + + pthread_mutex_lock(&recovery_mutex); + leader_index = -1; + pthread_mutex_unlock(&recovery_mutex); + break; case UPDATE_LIVE_HOSTS: #ifdef DEBUG @@ -1954,7 +1956,6 @@ char inspectTransaction(char finalResponse,unsigned int transid,char* debug,int // if decision is not lost and okCommit is not TRANS_FLAG, get out of this loop while(!((tNode->decision != DECISION_LOST) && (okCommit != TRANS_FLAG))) { // printf("%s -> transID : %u decision : %d is waiting flag : %d\n",debug,tNode->transid,tNode->decision,TRANS_FLAG); -// sleep(3); randomdelay(); } diff --git a/Robust/src/Runtime/DSTM/interface_recovery/trans.c b/Robust/src/Runtime/DSTM/interface_recovery/trans.c index 731fab8b..63431d08 100644 --- a/Robust/src/Runtime/DSTM/interface_recovery/trans.c +++ b/Robust/src/Runtime/DSTM/interface_recovery/trans.c @@ -160,10 +160,6 @@ GDBSEND1: else if( numbytes < 0) { // Receive returned an error. // Analyze underlying cause -#ifdef DEBUG - printf("%s -> fd : %d errno = %d %s\n",__func__, fd, errno,strerror(errno)); - fflush(stdout); -#endif if(errno == ECONNRESET || errno == EAGAIN || errno == EWOULDBLOCK) { // machine has failed // @@ -171,9 +167,6 @@ GDBSEND1: // when we start send and finish send see if it is longer // than our threshold // -#ifdef DEBUG - printf("%s -> EAGAIN : %s\n",__func__,(errno == EAGAIN)?"TRUE":"FALSE"); -#endif return -1; } else { #ifdef GDBDEBUG @@ -181,9 +174,6 @@ GDBSEND1: goto GDBSEND1; #endif -#ifdef DEBUG - printf("%s -> Unexpected ERROR!\n",__func__); -#endif return -2; } } @@ -202,9 +192,6 @@ GDBSEND1: } #endif } // close while loop -#ifdef DEBUG - printf("%s-> Exiting\n", __func__); -#endif return 0; // completed sending data } @@ -348,17 +335,11 @@ void recv_data_buf(int fd, struct readstruct * readbuffer, void *buffer, int buf } int recv_data_errorcode(int fd, void *buf, int buflen) { -#ifdef DEBUG - printf("%s-> Start; fd:%d, buflen:%d\n", __func__, fd, buflen); -#endif char *buffer = (char *)(buf); int size = buflen; int numbytes; while (size > 0) { numbytes = recv(fd, buffer, size, 0); -#ifdef DEBUG - printf("%s-> numbytes: %d\n", __func__, numbytes); -#endif if (numbytes==0) return 0; else if (numbytes == -1) { @@ -370,9 +351,6 @@ int recv_data_errorcode(int fd, void *buf, int buflen) { buffer += numbytes; size -= numbytes; } -#ifdef DEBUG - printf("%s-> Exiting\n", __func__); -#endif return 1; } @@ -1832,12 +1810,12 @@ void restoreDuplicationState(unsigned int deadHost,unsigned int epoch_num) printf("%s -> Entering\n",__func__); int* sdlist; tlist_t* tList; + int flag = 0; #ifdef RECOVERYSTATS printf("Recovery Start\n"); long long st; long long fi; - int flag = 0; unsigned int dupeSize = 0; // to calculate the size of backed up data st = myrdtsc(); // to get clock @@ -3832,8 +3810,8 @@ void reqClearNotifyList(unsigned int oid) return; } else { - printf("%s -> Pmid = %s\n",__func__,midtoIPString(pmid)); - printf("%s -> Bmid = %s\n",__func__,midtoIPString(bmid)); +// printf("%s -> Pmid = %s\n",__func__,midtoIPString(pmid)); +// printf("%s -> Bmid = %s\n",__func__,midtoIPString(bmid)); msg[0] = CLEAR_NOTIFY_LIST; *((unsigned int *)(&msg[1])) = oid; @@ -3861,12 +3839,10 @@ void printRecoveryStat() { int i; for(i=0; i < numRecovery;i++) { printf("Dead Machine = %s\n",midtoIPString(recoverStat[i].deadMachine)); - printf("Recoveryed data(byte) = %u\n",recoverStat[i].recoveredData); printf("Recovery Time(ms) = %ld\n",recoverStat[i].elapsedTime); } printf("**************************\n\n"); fflush(stdout); - fflush(stdout); #else printf("No stat\n"); #endif