From: bdemsky
Date: Sat, 2 Jun 2007 18:22:06 +0000 (+0000)
Subject: Tag version of Web Spider
X-Git-Tag: preEdgeChange~561
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=eb9029a544f0563187d3f88f10821606d7bd670c;p=IRC.git

Tag version of Web Spider
---

diff --git a/Robust/src/Benchmarks/Spider/BRTag/Query.java b/Robust/src/Benchmarks/Spider/BRTag/Query.java
new file mode 100644
index 00000000..338f7d92
--- /dev/null
+++ b/Robust/src/Benchmarks/Spider/BRTag/Query.java
@@ -0,0 +1,109 @@
+// One page request issued by the spider, together with the response text
+// collected for it.
+public class Query {
+    // Set by the requestQuery task after the GET request has been written.
+    flag requested;
+    // Set by the processPage task after the response has been scanned.
+    flag processed;
+    // Set by the readResponse task when the socket reports no more data.
+    flag received;
+    // Number of characters of the header terminator "\r\n\r\n" matched so far
+    // by the readResponse task; 4 means the header has been consumed.
+    public int state;
+
+    private String hostname;
+    private String path;
+
+    // Response text appended by the readResponse task.
+    private StringBuffer response;
+
+    // Creates a query for the given host and path with an empty response.
+    public Query(String hostname, String path) {
+        this.hostname=hostname;
+        this.path=path;
+        response=new StringBuffer();
+        state=0;
+    }
+
+    // Resolves this query's host name and connects socket s to port 80 of
+    // that address.
+    public void makeConnection(Socket s) {
+        InetAddress address=InetAddress.getByName(hostname);
+        int port=80;
+        s.fd=Socket.nativeBind(address.getAddress(), port);
+        s.nativeConnect(s.fd, address.getAddress(), port);
+    }
+
+    // Returns the host this query is sent to.
+    public String getHostName() {
+        return hostname;
+    }
+
+    // Returns the path requested from the host.
+    public String getPath() {
+        return path;
+    }
+
+    // Writes the collected response to a file named hostname+path, with every
+    // '/' in that name replaced by '#'.
+    public void outputFile() {
+        StringBuffer sb=new StringBuffer(hostname);
+        sb.append(path);
+        FileOutputStream fos=new FileOutputStream(sb.toString().replace('/','#'));
+        fos.write(response.toString().getBytes());
+        fos.close();
+    }
+
+    // Builds the "host/path" form of a link found on this page; page may be
+    // absolute ("http://...") or relative to this query.
+    public String makewebcanonical(String page) {
+        StringBuffer b=new StringBuffer(getHostName(page));
+        b.append("/");
+        b.append(getPathName(page));
+        return b.toString();
+    }
+
+    // Returns the host part of an absolute "http://" link, or this query's
+    // host when page does not contain "http://".
+    public String getHostName(String page) {
+        String http=new String("http://");
+        int httpindex=page.indexOf(http);
+        if (httpindex==-1)
+            return getHostName();
+
+        // httpindex>=0 here, so beginindex is always an offset just past
+        // "http://" and needs no -1 check of its own.
+        int beginindex=httpindex+http.length();
+        int endindex=page.indexOf('/',beginindex+1);
+        if (endindex==-1)
+            endindex=page.length();
+        return page.subString(beginindex, endindex);
+    }
+
+    // Returns the path a link refers to.  A relative link is appended to the
+    // directory part of this query's path (or returned unchanged when that
+    // path has no '/').  For an absolute link the result is everything after
+    // the first '/' following the host, or "index.html" when there is none.
+    public String getPathName(String page) {
+        String http=new String("http://");
+        int httpindex=page.indexOf(http);
+        if (httpindex==-1) {
+            String path=getPath();
+            int lastindex=path.lastindexOf('/');
+            if (lastindex==-1)
+                return page;
+
+            StringBuffer sb=new StringBuffer(path.subString(0,lastindex+1));
+            sb.append(page);
+            return sb.toString();
+        }
+
+        int beginindex=httpindex+http.length();
+        int nextindex=page.indexOf('/',beginindex+1);
+        if (nextindex==-1)
+            return new String("index.html");
+        // End at page.length(), the same exclusive end getHostName passes, so
+        // that the final character of the link is kept.
+        return page.subString(nextindex+1, page.length());
+    }
+}
diff --git a/Robust/src/Benchmarks/Spider/BRTag/QueryList.java b/Robust/src/Benchmarks/Spider/BRTag/QueryList.java
new file mode 100644
index 00000000..90dc9dfe
--- /dev/null
+++ b/Robust/src/Benchmarks/Spider/BRTag/QueryList.java
@@ -0,0 +1,21 @@
+// Set of "host/path" names for which a Query has already been created.
+public class QueryList {
+    // Set on the instance created by the Startup task.
+    flag initialized;
+    HashSet queries;
+
+    // Creates an empty list.
+    public QueryList() {
+        queries=new HashSet();
+    }
+
+    // Returns true if x has already been added.
+    public boolean checkQuery(String x) {
+        return queries.contains(x);
+    }
+
+    // Records x as seen.
+    public void addQuery(String x) {
+        queries.add(x);
+    }
+}
diff --git a/Robust/src/Benchmarks/Spider/BRTag/Spider.java b/Robust/src/Benchmarks/Spider/BRTag/Spider.java
new file mode 100644
index 00000000..80be0274
--- /dev/null
+++ b/Robust/src/Benchmarks/Spider/BRTag/Spider.java
@@ -0,0 +1,110 @@
+// Creates the shared QueryList and the first Query, pairing that query with
+// a new Socket through a "connect" tag.  parameters[0] is the first host
+// name and parameters[1] the first page.
+task Startup(StartupObject s {initialstate}) {
+    String firstmachine=s.parameters[0];
+    String firstpage=s.parameters[1];
+    QueryList ql=new QueryList() {initialized};
+    tag t=new tag(connect);
+    Socket sock=new Socket(){}{t};
+    Query firstquery=new Query(firstmachine, firstpage){}{t};
+    taskexit(s{!initialstate});
+}
+
+// Connects the socket that shares the query's connect tag and writes the
+// HTTP GET request for the query's path.
+task requestQuery(Query q{!requested}{connect t}, Socket s{}{connect t}) {
+    q.makeConnection(s);
+    StringBuffer req=new StringBuffer("GET ");
+    req.append("/");
+    req.append(q.getPath());
+    req.append(" HTTP/1.1\r\nHost:");
+    req.append(q.getHostName());
+    req.append("\r\n\r\n");
+    s.write(req.toString().getBytes());
+    taskexit(q{requested});
+}
+
+// Consumes pending socket data for a requested query.  The header is skipped
+// one byte per invocation until "\r\n\r\n" has been seen; after that, data is
+// appended to the query's response until the socket reports no more data.
+task readResponse(Query q{requested && ! received}{connect t},Socket s{IOPending}{connect t}) {
+    // state 0 - nothing
+    // state 1 - \r
+    // state 2 - \r\n
+    // state 3 - \r\n\r
+    // state 4 - \r\n\r\n
+    if (q.state<4) {
+        byte[] b=new byte[1];
+        int numchars=s.read(b);
+        if (numchars==1) {
+            if (q.state==0||q.state==2) {
+                // A '\r' is needed to advance from these states.
+                if (b[0]=='\r')
+                    q.state++;
+                else
+                    q.state=0;
+            } else {
+                // States 1 and 3 need a '\n'.  Another '\r' may itself start
+                // the terminator, so fall back to state 1 rather than 0.
+                if (b[0]=='\n')
+                    q.state++;
+                else if (b[0]=='\r')
+                    q.state=1;
+                else
+                    q.state=0;
+            }
+        }
+    } else {
+        byte[] buffer=new byte[1024];
+        int numchars=s.read(buffer);
+        if (numchars<=0) {
+            // A count of 0 ends the response.  A negative count is handled the
+            // same way, since it cannot be used as a subString length below.
+            s.close();
+            taskexit(q{received});
+        } else {
+            String curr=(new String(buffer)).subString(0,numchars);
+            q.response.append(curr);
+        }
+    }
+}
+
+// Saves the response to disk, then scans it for href="..." links.  Each link
+// whose "host/path" form is not yet in the QueryList is recorded, printed,
+// and turned into a new Query.
+task processPage(Query q{received&&!processed}, QueryList ql{initialized}) {
+    int index=0;
+    String href=new String("href=\"");
+    String searchstr=q.response.toString();
+    boolean cont=true;
+    q.outputFile();
+
+    while(cont) {
+        int mindex=searchstr.indexOf(href,index);
+        if (mindex!=-1) {
+            int endquote=searchstr.indexOf('"', mindex+href.length());
+            if (endquote!=-1) {
+                String match=searchstr.subString(mindex+href.length(), endquote);
+                String match2=q.makewebcanonical(match);
+                if (match2!=null&&!ql.checkQuery(match2)) {
+                    ql.addQuery(match2);
+                    String newhost=q.getHostName(match);
+                    String newpath=q.getPathName(match);
+                    System.printString(newhost);
+                    System.printString(" ");
+                    System.printString(newpath);
+                    System.printString("\n");
+                    // requestQuery only accepts a Query that shares a connect
+                    // tag with a Socket, so pair each new query with its own
+                    // socket in the same way Startup does.
+                    tag t=new tag(connect);
+                    Socket sock=new Socket(){}{t};
+                    Query newq=new Query(newhost, newpath){}{t};
+                }
+                index=endquote;
+            } else cont=false;
+        } else cont=false;
+    }
+    taskexit(q{processed});
+}