Tag version of Web Spider
author bdemsky <bdemsky>
Sat, 2 Jun 2007 18:22:06 +0000 (18:22 +0000)
committer bdemsky <bdemsky>
Sat, 2 Jun 2007 18:22:06 +0000 (18:22 +0000)
Robust/src/Benchmarks/Spider/BRTag/Query.java [new file with mode: 0644]
Robust/src/Benchmarks/Spider/BRTag/QueryList.java [new file with mode: 0644]
Robust/src/Benchmarks/Spider/BRTag/Spider.java [new file with mode: 0644]

diff --git a/Robust/src/Benchmarks/Spider/BRTag/Query.java b/Robust/src/Benchmarks/Spider/BRTag/Query.java
new file mode 100644 (file)
index 0000000..338f7d9
--- /dev/null
@@ -0,0 +1,84 @@
+public class Query  {
+    flag requested;
+    flag processed;
+    flag received;
+    public int state;
+
+    private String hostname;
+    private String path;
+
+    private StringBuffer response;
+
+    public Query(String hostname, String path) {
+       this.hostname=hostname;
+       this.path=path;
+       response=new StringBuffer();
+       state=0;
+    }
+
+    public void makeConnection(Socket s) {
+       InetAddress address=InetAddress.getByName(hostname);
+       int port=80;
+        s.fd=Socket.nativeBind(address.getAddress(), port);
+        s.nativeConnect(s.fd, address.getAddress(), port);
+    }
+
+    public String getHostName() {
+       return hostname;
+    }
+
+    public String getPath() {
+       return path;
+    }
+
+    public void outputFile() {
+       StringBuffer sb=new StringBuffer(hostname);
+       sb.append(path);
+       FileOutputStream fos=new FileOutputStream(sb.toString().replace('/','#'));
+       fos.write(response.toString().getBytes());
+       fos.close();
+    }
+    
+    public String makewebcanonical(String page) {
+       StringBuffer b=new StringBuffer(getHostName(page));
+       b.append("/");
+       b.append(getPathName(page));
+       return b.toString();
+    }
+
+    public String getHostName(String page) {
+       String http=new String("http://");
+       if (page.indexOf(http)==-1) {
+           return getHostName();
+       } else {
+           int beginindex=page.indexOf(http)+http.length();
+           int endindex=page.indexOf('/',beginindex+1);
+           if ((beginindex==-1)) {
+               System.printString("ERROR");
+           }
+           if (endindex==-1)
+               endindex=page.length();
+           return page.subString(beginindex, endindex);
+       }
+    }
+
+    public String getPathName(String page) {
+       String http=new String("http://");
+       if (page.indexOf(http)==-1) {
+           String path=getPath();
+           int lastindex=path.lastindexOf('/');
+           if (lastindex==-1)
+               return page;
+           
+           StringBuffer sb=new StringBuffer(path.subString(0,lastindex+1));
+           sb.append(page);
+           return sb.toString();
+       } else {
+           int beginindex=page.indexOf(http)+http.length();
+           int nextindex=page.indexOf('/',beginindex+1);
+           if ((beginindex==-1)||(nextindex==-1))
+               return new String("index.html");
+           return page.subString(nextindex+1, page.length()-1);
+       }
+    }
+}
diff --git a/Robust/src/Benchmarks/Spider/BRTag/QueryList.java b/Robust/src/Benchmarks/Spider/BRTag/QueryList.java
new file mode 100644 (file)
index 0000000..90dc9df
--- /dev/null
@@ -0,0 +1,14 @@
+public class QueryList {
+    flag initialized;
+    HashSet queries;
+
+    public QueryList() {
+       queries=new HashSet();
+    }
+    public boolean checkQuery(String x) {
+       return queries.contains(x);
+    }
+    public void addQuery(String x) {
+       queries.add(x);
+    }
+}
diff --git a/Robust/src/Benchmarks/Spider/BRTag/Spider.java b/Robust/src/Benchmarks/Spider/BRTag/Spider.java
new file mode 100644 (file)
index 0000000..80be027
--- /dev/null
@@ -0,0 +1,113 @@
+// Seeds the crawl from s.parameters[0] (host name) and s.parameters[1] (page
+// path): creates the shared QueryList and the first Query, tied by a fresh
+// connect tag to the Socket that will fetch it.
+task Startup(StartupObject s {initialstate}) {
+    String firstmachine=s.parameters[0];
+    String firstpage=s.parameters[1];
+    QueryList ql=new QueryList() {initialized};
+    tag t=new tag(connect);
+    Socket sock=new Socket(){}{t};
+    Query firstquery=new Query(firstmachine, firstpage){}{t};
+    taskexit(s{!initialstate});
+}
+
+// Connects the Socket that shares q's connect tag and sends the GET request
+// for q's path.
+task requestQuery(Query q{!requested}{connect t}, Socket s{}{connect t}) {
+    q.makeConnection(s);
+    StringBuffer req=new StringBuffer("GET ");
+    req.append("/");
+    req.append(q.getPath());
+    req.append(" HTTP/1.1\r\nHost:");
+    req.append(q.getHostName());
+    // readResponse only finishes when the server closes the connection, so
+    // ask for that rather than HTTP/1.1's default persistent connection.
+    req.append("\r\nConnection: close\r\n\r\n");
+    s.write(req.toString().getBytes());
+    taskexit(q{requested});
+}
+
+// Consumes whatever the socket has ready. Until the blank line that ends the
+// response header has been seen, bytes are read one at a time and q.state
+// counts how much of "\r\n\r\n" has been matched:
+//    state 0 - nothing
+//    state 1 - \r
+//    state 2 - \r\n
+//    state 3 - \r\n\r
+//    state 4 - \r\n\r\n
+// In state 4 the body is appended to q.response in chunks. A read that
+// returns no bytes ends the query in either phase.
+task readResponse(Query q{requested && ! received}{connect t},Socket s{IOPending}{connect t}) {
+    if (q.state<4) {
+        byte[] b=new byte[1];
+        int numchars=s.read(b);
+        if (numchars<=0) {
+            // Connection ended before the header did; finish with what we
+            // have instead of waiting for bytes that will never arrive.
+            s.close();
+            taskexit(q{received});
+        } else {
+            // States 0 and 2 are waiting for '\r', states 1 and 3 for '\n'.
+            boolean wantcr=(q.state==0)||(q.state==2);
+            boolean gotcr=(b[0]=='\r');
+            boolean gotlf=(b[0]=='\n');
+            if ((wantcr&&gotcr)||(!wantcr&&gotlf))
+                q.state++;
+            else if (gotcr)
+                q.state=1;   // a stray '\r' may itself start the terminator
+            else
+                q.state=0;
+        }
+    } else {
+        byte[] buffer=new byte[1024];
+        int numchars=s.read(buffer);
+        if (numchars<=0) {
+            s.close();
+            taskexit(q{received});
+        } else {
+            String curr=(new String(buffer)).subString(0,numchars);
+            q.response.append(curr);
+        }
+    }
+}
+
+// Saves the fetched page, then scans it for href="..." links and creates a
+// Query for every link whose canonical name is not yet recorded in ql.
+task processPage(Query q{received&&!processed}, QueryList ql{initialized}) {
+    int index=0;
+    String href=new String("href=\"");
+    String searchstr=q.response.toString();
+    boolean cont=true;
+    q.outputFile();
+
+    while(cont) {
+        int mindex=searchstr.indexOf(href,index);
+        int endquote=-1;
+        if (mindex!=-1)
+            endquote=searchstr.indexOf('"', mindex+href.length());
+        if (endquote==-1) {
+            // No further link, or a link with no closing quote.
+            cont=false;
+        } else {
+            String match=searchstr.subString(mindex+href.length(), endquote);
+            String match2=q.makewebcanonical(match);
+            if (match2!=null&&!ql.checkQuery(match2)) {
+                String host=q.getHostName(match);
+                String path=q.getPathName(match);
+                ql.addQuery(match2);
+                System.printString(host);
+                System.printString("        ");
+                System.printString(path);
+                System.printString("\n");
+                // requestQuery only pairs a Query with a Socket that shares
+                // its connect tag, so each new Query gets its own tagged
+                // Socket, as Startup does for the first one.
+                tag t=new tag(connect);
+                Socket sock=new Socket(){}{t};
+                Query newq=new Query(host, path){}{t};
+            }
+            index=endquote;
+        }
+    }
+    taskexit(q{processed});
+}