From f689f0945aa34d327a901de8906d5b1d9a8cb8e9 Mon Sep 17 00:00:00 2001
From: hkhang
Date: Thu, 29 Oct 2009 20:23:57 +0000
Subject: [PATCH] *** empty log message ***

---
 GlobalQuery.java | 104 +++++++++++++++++++
 LocalQuery.java  | 106 +++++++++++++++++++
 QueryList.java   |  25 +++++
 QueryQueue.java  |  35 ++++++
 QueryTask.java   | 256 +++++++++++++++++++++++++++++++++++++++++++++++
 Spider.java      |  81 +++++++++++++++
 dstm.conf        |   3 +
 7 files changed, 610 insertions(+)
 create mode 100644 GlobalQuery.java
 create mode 100644 LocalQuery.java
 create mode 100644 QueryList.java
 create mode 100644 QueryQueue.java
 create mode 100644 QueryTask.java
 create mode 100644 Spider.java
 create mode 100644 dstm.conf

diff --git a/GlobalQuery.java b/GlobalQuery.java
new file mode 100644
index 00000000..0a9d1630
--- /dev/null
+++ b/GlobalQuery.java
@@ -0,0 +1,104 @@
+/**
+ * A crawl request kept in the global (distributed) heap: a host name, a path
+ * on that host, and the link depth at which the page was found.
+ */
+public class GlobalQuery {
+  GlobalString hostname;
+  GlobalString path;
+  int depth;
+
+  /** Creates a depth-0 query for the empty path on hostname. */
+  public GlobalQuery(GlobalString hostname) {
+    this.hostname = global new GlobalString(hostname);
+    this.path = global new GlobalString("");
+    this.depth = 0;
+  }
+
+  /** Creates a query for path on hostname, discovered at the given depth. */
+  public GlobalQuery(GlobalString hostname, GlobalString path, int depth) {
+    this.hostname = global new GlobalString(hostname);
+    this.path = global new GlobalString(path);
+    this.depth = depth;
+  }
+
+  public int getDepth() {
+    return depth;
+  }
+
+  public GlobalString getHostName() {
+    return hostname;
+  }
+
+  public GlobalString getPath() {
+    return path;
+  }
+
+  /** Returns page rewritten as "host/path", resolved against this query. */
+  public GlobalString makewebcanonical(GlobalString page) {
+    GlobalStringBuffer b = global new GlobalStringBuffer(getHostName(page));
+    b.append("/");
+    b.append(getPathName(page));
+    return b.toGlobalString();
+  }
+
+  /**
+   * Returns the host named in page, or this query's own host when page
+   * contains neither "http://" nor "https://".
+   */
+  public GlobalString getHostName(GlobalString page) {
+    int beginindex = schemeEnd(page);
+    if (beginindex == -1) {
+      return getHostName();
+    }
+
+    // The host ends at the first '/' found after the scheme, else at the end.
+    int endindex = page.indexOf('/', beginindex + 1);
+    if (endindex == -1) {
+      endindex = page.length();
+    }
+    return page.subString(beginindex, endindex);
+  }
+
+  /**
+   * Returns the path named in page. A link without a scheme is appended to
+   * the directory part of this query's path (or returned unchanged when that
+   * path has no '/'); a link with a scheme but no path yields "index.html".
+   */
+  public GlobalString getPathName(GlobalString page) {
+    int beginindex = schemeEnd(page);
+    if (beginindex == -1) {
+      GlobalString path = getPath();
+      int lastindex = path.lastindexOf('/');
+      if (lastindex == -1) {
+        return page;
+      }
+      GlobalStringBuffer sb = global new GlobalStringBuffer(path.subString(0, lastindex + 1));
+      sb.append(page);
+      return sb.toGlobalString();
+    }
+
+    int nextindex = page.indexOf('/', beginindex + 1);
+    if (nextindex == -1) {
+      return global new GlobalString("index.html");
+    }
+    return page.subString(nextindex + 1, page.length());
+  }
+
+  /**
+   * Returns the index just past the first "https://" in page, else just past
+   * the first "http://", else -1.
+   */
+  private int schemeEnd(GlobalString page) {
+    GlobalString https = global new GlobalString("https://");
+    int at = page.indexOf(https);
+    if (at != -1) {
+      return at + https.length();
+    }
+    GlobalString http = global new GlobalString("http://");
+    at = page.indexOf(http);
+    if (at != -1) {
+      return at + http.length();
+    }
+    return -1;
+  }
+}
diff --git a/LocalQuery.java b/LocalQuery.java
new file mode 100644
index 00000000..1beeadbe
--- /dev/null
+++ b/LocalQuery.java
@@ -0,0 +1,106 @@
+/**
+ * Local-heap counterpart of GlobalQuery that also holds the response text.
+ */
+public class LocalQuery {
+  String hostname;
+  String path;
+  StringBuffer response;
+  int depth;
+
+  public LocalQuery(String hostname, String path, int depth) {
+    this.hostname = new String(hostname);
+    this.path = new String(path);
+    response = new StringBuffer();
+    this.depth = depth;
+  }
+
+  public int getDepth() {
+    return depth;
+  }
+
+  public String getHostName() {
+    return hostname;
+  }
+
+  public String getPath() {
+    return path;
+  }
+
+  /** Writes the response to a file named hostname+path with '/' replaced by '#'. */
+  public void outputFile() {
+    StringBuffer sb = new StringBuffer(hostname);
+    sb.append(path);
+    FileOutputStream fos = new FileOutputStream(sb.toString().replace('/','#'));
+    fos.write(response.toString().getBytes());
+    fos.close();
+  }
+
+  /** Returns page rewritten as "host/path", resolved against this query. */
+  public String makewebcanonical(String page) {
+    StringBuffer b = new StringBuffer(getHostName(page));
+    b.append("/");
+    b.append(getPathName(page));
+    return b.toString();
+  }
+
+  /**
+   * Returns the host named in page, or this query's own host when page
+   * contains neither "http://" nor "https://".
+   */
+  public String getHostName(String page) {
+    int beginindex = schemeEnd(page);
+    if (beginindex == -1) {
+      return getHostName();
+    }
+
+    // The host ends at the first '/' found after the scheme, else at the end.
+    int endindex = page.indexOf('/', beginindex + 1);
+    if (endindex == -1) {
+      endindex = page.length();
+    }
+    return page.subString(beginindex, endindex);
+  }
+
+  /**
+   * Returns the path named in page. A link without a scheme is appended to
+   * the directory part of this query's path (or returned unchanged when that
+   * path has no '/'); a link with a scheme but no path yields "index.html".
+   */
+  public String getPathName(String page) {
+    int beginindex = schemeEnd(page);
+    if (beginindex == -1) {
+      String path = getPath();
+      int lastindex = path.lastindexOf('/');
+      if (lastindex == -1) {
+        return page;
+      }
+      StringBuffer sb = new StringBuffer(path.subString(0, lastindex + 1));
+      sb.append(page);
+      return sb.toString();
+    }
+
+    int nextindex = page.indexOf('/', beginindex + 1);
+    if (nextindex == -1) {
+      return new String("index.html");
+    }
+    return page.subString(nextindex + 1, page.length());
+  }
+
+  /**
+   * Returns the index just past the first "https://" in page, else just past
+   * the first "http://", else -1.
+   */
+  private int schemeEnd(String page) {
+    String https = new String("https://");
+    int at = page.indexOf(https);
+    if (at != -1) {
+      return at + https.length();
+    }
+    String http = new String("http://");
+    at = page.indexOf(http);
+    if (at != -1) {
+      return at + http.length();
+    }
+    return -1;
+  }
+}
diff --git a/QueryList.java b/QueryList.java
new file mode 100644
index 00000000..d09167b0
--- /dev/null
+++ b/QueryList.java
@@ -0,0 +1,25 @@
+/** A list of query strings with a linear membership test. */
+public class QueryList extends Queue {
+  Queue queries;
+
+  public QueryList() {
+    queries = global new Queue();
+  }
+
+  /** Returns true when x equals a string previously passed to addQuery. */
+  public boolean checkQuery(GlobalString x) {
+    // Scan the queue addQuery() fills; the inherited storage the loop used
+    // to read is never written by addQuery().
+    // NOTE(review): assumes Queue keeps its entries in elements[0..size()).
+    for (int i = 0; i < queries.size(); i++) {
+      if (x.equals((GlobalString) queries.elements[i])) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  public void addQuery(GlobalString x) {
+    queries.push(x);
+  }
+}
diff --git a/QueryQueue.java b/QueryQueue.java
new file mode 100644
index 00000000..915bb4b9
--- /dev/null
+++ b/QueryQueue.java
@@ -0,0 +1,35 @@
+/** Set-backed pool of LocalQuery objects; pop() takes the set iterator's first element. */
+public class QueryQueue {
+  HashSet queries;
+  int size;
+
+  public QueryQueue() {
+    queries = new HashSet();
+    size = 0;
+  }
+
+  /** Removes and returns one query, or null when the set is empty. */
+  public LocalQuery pop() {
+    if (queries.isEmpty())
+      return null;
+    LocalQuery q = (LocalQuery) queries.iterator().next();
+    queries.remove(q);
+    size--;
+    return q;
+  }
+
+  // NOTE(review): size is bumped even if the set already holds x -- confirm
+  // callers never push the same object twice.
+  public void push(LocalQuery x) {
+    queries.add(x);
+    size++;
+  }
+
+  public int size() {
+    return size;
+  }
+
+  public boolean isEmpty() {
+    return size == 0;
+  }
+}
diff --git a/QueryTask.java b/QueryTask.java
new file mode 100644
index 00000000..e3339846
--- /dev/null
+++ b/QueryTask.java
@@ -0,0 +1,256 @@
+/**
+ * Crawler task: fetches the page named by the current work item, indexes the
+ * words of its title into results, and collects the links found on it.
+ */
+public class QueryTask extends Task {
+  int maxDepth;
+  Queue toprocess;
+  DistributedHashMap results;
+  GlobalString workingURL;
+
+  public QueryTask(Queue todoList, DistributedHashMap doneList, int maxDepth, DistributedHashMap results) {
+    this.todoList = todoList;
+    this.doneList = doneList;
+    this.maxDepth = maxDepth;
+    this.results = results;
+  }
+
+  /** Fetches and processes myWork, unless its depth has reached maxDepth. */
+  public void execute() {
+    int depth;
+    int max;
+
+    atomic {
+      depth = ((GlobalQuery)myWork).getDepth();
+      max = this.maxDepth;
+    }
+
+    if (depth < max) {
+      /* global variables */
+      GlobalQuery gq;
+
+      /* local variables */
+      LocalQuery lq;
+      String hostname;
+      String path;
+
+      atomic {
+        gq = (GlobalQuery)myWork;
+        hostname = new String(GlobalString.toLocalCharArray(gq.getHostName()));
+        path = new String(GlobalString.toLocalCharArray(gq.getPath()));
+
+        GlobalStringBuffer gsb = global new GlobalStringBuffer(hostname);
+        gsb.append("/");
+        gsb.append(path);
+        workingURL = global new GlobalString(gsb.toGlobalString());
+      }
+      lq = new LocalQuery(hostname, path, depth);
+
+      System.printString(lq.getDepth()+" ");
+      System.printString("Processing - Hostname : ");
+      System.printString(hostname);
+      System.printString(", Path : ");
+      System.printString(path);
+      System.printString("\n");
+
+      Socket s = new Socket(hostname, 80);
+
+      requestQuery(hostname, path, s);
+      readResponse(lq, s);
+      // The whole response is buffered in lq, so the socket is no longer needed.
+      s.close();
+
+      atomic {
+        processList(lq, workingURL, results);
+      }
+
+      atomic {
+        toprocess = processPage(lq);
+      }
+    }
+  }
+
+  /**
+   * Marks the fetched URL as done and moves every collected link that is not
+   * yet in doneList onto todoList.
+   */
+  public void done(Object obj) {
+    // Both fields are still unset if execute() skipped its first work item
+    // for being too deep; the original dereferenced them unconditionally.
+    if (workingURL != null) {
+      GlobalString str = global new GlobalString("true");
+      doneList.put(workingURL, str);
+    }
+    if (toprocess == null) {
+      return;
+    }
+
+    while(!toprocess.isEmpty()) {
+      GlobalQuery q = (GlobalQuery)toprocess.pop();
+
+      GlobalString hostname = global new GlobalString(q.getHostName());
+      GlobalString path = global new GlobalString(q.getPath());
+
+      GlobalStringBuffer gsb = global new GlobalStringBuffer(hostname);
+      gsb.append("/");
+      gsb.append(path);
+
+      if (!doneList.containsKey(gsb.toGlobalString())) {
+        todoList.push(q);
+      }
+    }
+  }
+
+  /** Sends a GET request for /path to hostname over sock. */
+  public static void requestQuery(String hostname, String path, Socket sock) {
+    StringBuffer req = new StringBuffer("GET ");
+    req.append("/");
+    req.append(path);
+    req.append(" HTTP/1.1\r\nHost:");
+    req.append(hostname);
+    // readResponse() reads until the peer stops sending, so ask the server
+    // not to keep the connection open after the reply.
+    req.append("\r\nConnection: close\r\n\r\n");
+    sock.write(req.toString().getBytes());
+  }
+
+  /**
+   * Skips the response header (everything up to the first blank line) and
+   * appends the rest of the stream to lq.response.
+   */
+  public static void readResponse(LocalQuery lq, Socket sock) {
+    // state = number of leading characters of "\r\n\r\n" matched so far.
+    int state = 0;
+    byte[] b = new byte[1];
+    while (state < 4) {
+      if (sock.read(b) != 1) {
+        return;
+      }
+      boolean matched;
+      if (state == 0 || state == 2) {
+        matched = (b[0] == '\r');
+      } else {
+        matched = (b[0] == '\n');
+      }
+      if (matched) {
+        state++;
+      } else if (b[0] == '\r') {
+        // A stray '\r' can itself start the terminator; do not discard it.
+        state = 1;
+      } else {
+        state = 0;
+      }
+    }
+
+    while (true) {
+      byte[] buffer = new byte[1024];
+      int numchars = sock.read(buffer);
+      // Stop on zero bytes as before, and also on a negative return, which
+      // would otherwise reach subString() as a negative end index.
+      if (numchars <= 0) {
+        return;
+      }
+      String curr = (new String(buffer)).subString(0, numchars);
+      lq.response.append(curr);
+    }
+  }
+
+  /**
+   * Extracts the text between <title> and </title> from the response and, for
+   * every word in it, appends url to the queue stored under that word in
+   * results.
+   */
+  public static void processList(LocalQuery lq, GlobalString url, DistributedHashMap results) {
+    // The delimiters were empty strings, so the extracted title was always "".
+    String sTitle = new String("<title>");
+    String eTitle = new String("</title>");
+    String searchstr = lq.response.toString();
+
+    int sIndex = searchstr.indexOf(sTitle);
+    if (sIndex == -1) {
+      return;
+    }
+    int eIndex = searchstr.indexOf(eTitle, sIndex+sTitle.length());
+    if (eIndex == -1) {
+      // Unterminated title: nothing reliable to index.
+      return;
+    }
+
+    String title = new String(searchstr.subString(sIndex+sTitle.length(), eIndex));
+    LinkedList ll = tokenize(title);
+
+    while (!ll.isEmpty()) {
+      GlobalString word = global new GlobalString(ll.pop().toString());
+      Queue q;
+      if (!results.containsKey(word)) {
+        q = global new Queue();
+      }
+      else {
+        q = (Queue)(results.get(word));
+      }
+      q.push(url);
+      results.put(word, q);
+
+      System.out.println("Key : ["+word.toLocalString()+"],["+q.size()+"]");
+    }
+  }
+
+  /** Splits str on single spaces and returns the non-empty pieces in order. */
+  public static LinkedList tokenize(String str) {
+    LinkedList ll = new LinkedList();
+    int sIndex = 0;
+
+    // and, or, of, at, but, '.', ',', ':' ';', '"', ' ', '-', '='
+    while (true) {
+      int eIndex = str.indexOf(' ', sIndex);
+      String token;
+      if (eIndex == -1) {
+        token = str.subString(sIndex);
+      }
+      else {
+        token = str.subString(sIndex, eIndex);
+      }
+      // Runs of spaces (or an empty title) would otherwise index "" as a word.
+      if (token.length() > 0) {
+        ll.add(token);
+      }
+      if (eIndex == -1) {
+        break;
+      }
+      sIndex = eIndex+1;
+    }
+
+    return ll;
+  }
+
+  /**
+   * Returns a global queue holding one GlobalQuery, at depth lq.getDepth()+1,
+   * for every href="..." attribute found in the response.
+   */
+  public static Queue processPage(LocalQuery lq) {
+    String href = new String("href=\"");
+    String searchstr = lq.response.toString();
+    int depth = lq.getDepth() + 1;
+    int index = 0;
+    Queue toprocess = global new Queue();
+
+    while (true) {
+      int mindex = searchstr.indexOf(href, index);
+      if (mindex == -1) {
+        break;
+      }
+      int endquote = searchstr.indexOf('"', mindex+href.length());
+      if (endquote == -1) {
+        break;
+      }
+      String match = searchstr.subString(mindex+href.length(), endquote);
+
+      GlobalString ghostname = global new GlobalString(lq.getHostName(match));
+      GlobalString gpath = global new GlobalString(lq.getPathName(match));
+      toprocess.push(global new GlobalQuery(ghostname, gpath, depth));
+
+      index = endquote;
+    }
+    return toprocess;
+  }
+}
diff --git a/Spider.java b/Spider.java
new file mode 100644
index 00000000..9335ef33
--- /dev/null
+++ b/Spider.java
@@ -0,0 +1,81 @@
+/**
+ * Entry point of the crawler.
+ * Usage: Spider <num_threads> <first_host> [max_depth]
+ */
+public class Spider {
+  public static void main(String[] args) {
+    int NUM_THREADS = 3;
+    int maxDepth = 3;
+    int i;
+    Work[] works;
+    QueryTask[] qt;
+    GlobalQuery[] currentWorkList;
+
+    if (args.length < 2) {
+      System.printString("usage: Spider <num_threads> <first_host> [max_depth]\n");
+      return;
+    }
+
+    NUM_THREADS = Integer.parseInt(args[0]);
+    if (NUM_THREADS < 1) {
+      System.printString("num_threads must be at least 1\n");
+      return;
+    }
+
+    if (args.length >= 3) {
+      maxDepth = Integer.parseInt(args[2]);
+    }
+
+    GlobalString firstmachine;
+
+    // Packed IPv4 addresses of the three machines listed in dstm.conf.
+    int[] machines = new int[3];
+    machines[0] = (128<<24)|(195<<16)|(180<<8)|21;
+    machines[1] = (128<<24)|(195<<16)|(180<<8)|24;
+    machines[2] = (128<<24)|(195<<16)|(180<<8)|26;
+
+    // The original filled mid[0..2] regardless of NUM_THREADS, overrunning
+    // the array for fewer than three threads and leaving 0 beyond the third.
+    int[] mid = new int[NUM_THREADS];
+    for (i = 0; i < NUM_THREADS; i++) {
+      mid[i] = machines[i % machines.length];
+    }
+
+    atomic {
+      firstmachine = global new GlobalString(args[1]);
+
+      works = global new Work[NUM_THREADS];
+      qt = global new QueryTask[NUM_THREADS];
+      currentWorkList = global new GlobalQuery[NUM_THREADS];
+
+      GlobalQuery firstquery = global new GlobalQuery(firstmachine);
+
+      Queue todoList = global new Queue();
+      DistributedHashMap doneList = global new DistributedHashMap(500, 500, 0.75f);
+      DistributedHashMap results = global new DistributedHashMap(100, 100, 0.75f);
+
+      todoList.push(firstquery);
+
+      for (i = 0; i < NUM_THREADS; i++) {
+        qt[i] = global new QueryTask(todoList, doneList, maxDepth, results);
+        works[i] = global new Work(qt[i], NUM_THREADS, i, currentWorkList);
+      }
+    }
+    System.printString("Finished to create Objects\n");
+
+    Work tmp;
+    for (i = 0; i < NUM_THREADS; i++) {
+      atomic {
+        tmp = works[i];
+      }
+      Thread.myStart(tmp, mid[i]);
+    }
+
+    for (i = 0; i < NUM_THREADS; i++) {
+      atomic {
+        tmp = works[i];
+      }
+      tmp.join();
+    }
+  }
+}
diff --git a/dstm.conf b/dstm.conf
new file mode 100644
index 00000000..935ef319
--- /dev/null
+++ b/dstm.conf
@@ -0,0 +1,3 @@
+128.195.180.21
+128.195.180.24
+128.195.180.26
-- 
2.34.1