--- /dev/null
+/**
+ * A crawl request kept in global (shared) memory: the host to contact, the
+ * path of the page on that host, and the link depth at which the page was
+ * discovered. Also provides helpers that resolve a link found on this page
+ * into a host name and a path.
+ */
+public class GlobalQuery {
+  GlobalString hostname;
+  GlobalString path;
+  int depth;
+
+  /**
+   * Creates a depth-0 query for the root of a host (empty path).
+   *
+   * @param hostname host to query; a copy is stored
+   */
+  public GlobalQuery(GlobalString hostname) {
+    this.hostname = global new GlobalString(hostname);
+    this.path = global new GlobalString("");
+    this.depth = 0;
+  }
+
+  /**
+   * Creates a query for a specific page.
+   *
+   * @param hostname host to query; a copy is stored
+   * @param path     path of the page on that host; a copy is stored
+   * @param depth    link depth at which the page was found
+   */
+  public GlobalQuery(GlobalString hostname, GlobalString path, int depth) {
+    this.hostname = global new GlobalString(hostname);
+    this.path = global new GlobalString(path);
+    this.depth = depth;
+  }
+
+  /** Returns the link depth of this query. */
+  public int getDepth() {
+    return depth;
+  }
+
+  /** Returns the host name of this query. */
+  public GlobalString getHostName() {
+    return hostname;
+  }
+
+  /** Returns the path of this query. */
+  public GlobalString getPath() {
+    return path;
+  }
+
+  /**
+   * Builds the canonical "host/path" form of a link found on this page.
+   *
+   * @param page link text, either absolute (contains "http://" or "https://")
+   *             or relative to this query
+   * @return host name, a "/" separator, and the path, as one string
+   */
+  public GlobalString makewebcanonical(GlobalString page) {
+    GlobalStringBuffer b = global new GlobalStringBuffer(getHostName(page));
+    b.append("/");
+    b.append(getPathName(page));
+    return b.toGlobalString();
+  }
+
+  /**
+   * Extracts the host name a link points to.
+   *
+   * @param page link text
+   * @return the text between the scheme and the next '/' (or the end of the
+   *         link) when the link carries a scheme; otherwise this query's own
+   *         host name
+   */
+  public GlobalString getHostName(GlobalString page) {
+    int beginindex = schemeEnd(page);
+    if (beginindex == -1) {
+      // No scheme: the link is relative, so it stays on the current host.
+      return getHostName();
+    }
+
+    // The search starts one character into the host part, so a '/' sitting
+    // directly after the scheme is not taken as the end of the host.
+    int endindex = page.indexOf('/', beginindex+1);
+    if (endindex == -1)
+      endindex = page.length();
+
+    return page.subString(beginindex, endindex);
+  }
+
+  /**
+   * Extracts the path a link points to.
+   *
+   * @param page link text
+   * @return for a link with a scheme, everything after the first '/' that
+   *         follows the host, or "index.html" when there is no such '/';
+   *         for a link without a scheme, the link appended to the directory
+   *         part of this query's path (or the link itself when this query's
+   *         path contains no '/')
+   */
+  public GlobalString getPathName(GlobalString page) {
+    int beginindex = schemeEnd(page);
+
+    if (beginindex == -1) {
+      // Relative link: resolve against the directory of the current path.
+      GlobalString path = getPath();
+      int lastindex = path.lastindexOf('/');
+      if (lastindex == -1)
+        return page;
+
+      GlobalStringBuffer sb = global new GlobalStringBuffer(path.subString(0,lastindex+1));
+      sb.append(page);
+      return sb.toGlobalString();
+    }
+
+    int nextindex = page.indexOf('/', beginindex+1);
+    if (nextindex == -1)
+      return global new GlobalString("index.html");
+    return page.subString(nextindex+1, page.length());
+  }
+
+  /**
+   * Locates the end of the URL scheme inside a link.
+   *
+   * "https://" is looked for first and "http://" second, matching the
+   * precedence both parsers used before this helper was factored out. The
+   * scheme may appear anywhere in the link, not only at its start.
+   *
+   * @param page link text
+   * @return index of the first character after the scheme, or -1 when the
+   *         link contains neither scheme
+   */
+  private int schemeEnd(GlobalString page) {
+    GlobalString https = global new GlobalString("https://");
+    int start = page.indexOf(https);
+    if (start != -1) {
+      return start + https.length();
+    }
+
+    GlobalString http = global new GlobalString("http://");
+    start = page.indexOf(http);
+    if (start != -1) {
+      return start + http.length();
+    }
+    return -1;
+  }
+}
public String getHostName(String page) {
String http = new String("http://");
- if (page.indexOf(http) == -1) {
+ String https = new String("https://");
+ int beginindex;
+ int endindex;
+
+ if ((page.indexOf(http) == -1) && (page.indexOf(https) == -1)) {
return getHostName();
- } else {
- int beginindex = page.indexOf(http) + http.length();
- int endindex = page.indexOf('/',beginindex+1);
- if ((beginindex == -1)) {
- System.printString("ERROR");
- }
- if (endindex == -1)
- endindex=page.length();
- return page.subString(beginindex, endindex);
+ }
+ else if (page.indexOf(https) != -1) {
+ beginindex = page.indexOf(https) + https.length();
+ }
+ else {
+ beginindex = page.indexOf(http) + http.length();
}
+ endindex = page.indexOf('/',beginindex+1);
+
+ if ((beginindex == -1)) {
+ System.printString("ERROR");
+ }
+ if (endindex == -1)
+ endindex = page.length();
+
+ return page.subString(beginindex, endindex);
}
public String getPathName(String page) {
String http = new String("http://");
- if (page.indexOf(http) == -1) {
+ String https = new String("https://");
+ int beginindex;
+ int nextindex;
+
+ if ((page.indexOf(http) == -1) && (page.indexOf(https) == -1)) {
String path = getPath();
int lastindex = path.lastindexOf('/');
if (lastindex == -1)
StringBuffer sb = new StringBuffer(path.subString(0,lastindex+1));
sb.append(page);
return sb.toString();
- } else {
- int beginindex = page.indexOf(http) + http.length();
- int nextindex = page.indexOf('/',beginindex+1);
- if ((beginindex==-1) || (nextindex==-1))
- return new String("index.html");
- return page.subString(nextindex+1, page.length());
}
+ else if (page.indexOf(https) != -1) {
+ beginindex = page.indexOf(https) + https.length();
+ }
+ else {
+ beginindex = page.indexOf(http) + http.length();
+ }
+ nextindex = page.indexOf('/',beginindex+1);
+
+ if ((beginindex==-1) || (nextindex==-1))
+ return new String("index.html");
+ return page.subString(nextindex+1, page.length());
}
}
public class QueryQueue {
HashSet queries;
int size;
- int ddddddddddd;
public QueryQueue() {
queries = new HashSet();
--- /dev/null
+/**
+ * Crawler task. execute() fetches the page named by the current GlobalQuery
+ * over HTTP, indexes the words of its title into `results`, and collects the
+ * page's href links; done() records the page as visited and schedules the
+ * links that have not been visited yet.
+ */
+public class QueryTask extends Task {
+  int maxDepth;
+  Queue toprocess;             // links collected by the most recent fetch
+  DistributedHashMap results;  // title word -> Queue of "host/path" URLs
+  GlobalString workingURL;     // "host/path" of the most recently fetched page
+
+  /**
+   * @param todoList queue of pending GlobalQuery objects
+   * @param doneList map whose keys are the "host/path" URLs already fetched
+   * @param maxDepth queries at this depth or deeper are not fetched
+   * @param results  index of title words to the URLs whose title contains them
+   */
+  public QueryTask(Queue todoList, DistributedHashMap doneList, int maxDepth, DistributedHashMap results) {
+    this.todoList = todoList;
+    this.doneList = doneList;
+    this.maxDepth = maxDepth;
+    this.results = results;
+  }
+
+  /**
+   * Processes the query held in myWork: if its depth is below maxDepth, the
+   * page is downloaded from port 80 of its host, its title is indexed and
+   * its links are stored in `toprocess`. Queries at or beyond maxDepth are
+   * left untouched.
+   */
+  public void execute() {
+    int depth;
+    int max;
+
+    atomic {
+      depth = ((GlobalQuery)myWork).getDepth();
+      max = this.maxDepth;
+    }
+
+    if (depth < max) {
+      /* global variables */
+      GlobalQuery gq;
+
+      /* local variables */
+      LocalQuery lq;
+      String hostname;
+      String path;
+
+      atomic {
+        gq = (GlobalQuery)myWork;
+        // Copy host and path into local strings for use outside the atomic block.
+        hostname = new String(GlobalString.toLocalCharArray(gq.getHostName()));
+        path = new String(GlobalString.toLocalCharArray(gq.getPath()));
+
+        GlobalStringBuffer gsb = global new GlobalStringBuffer(hostname);
+        gsb.append("/");
+        gsb.append(path);
+        workingURL = global new GlobalString(gsb.toGlobalString());
+      }
+      lq = new LocalQuery(hostname, path, depth);
+
+      System.printString(lq.getDepth()+" ");
+      System.printString("Processing - Hostname : ");
+      System.printString(hostname);
+      System.printString(", Path : ");
+      System.printString(path);
+      System.printString("\n");
+
+      Socket s = new Socket(hostname, 80);
+
+      requestQuery(hostname, path, s);
+      readResponse(lq, s);
+
+      atomic {
+        processList(lq, workingURL, results);
+      }
+
+      atomic {
+        toprocess = processPage(lq);
+      }
+
+      s.close();
+    }
+  }
+
+  /**
+   * Marks the fetched page as visited and pushes every collected link that is
+   * not yet in doneList onto todoList.
+   *
+   * @param obj not used
+   */
+  public void done(Object obj) {
+    // execute() assigns workingURL and toprocess only when it actually fetches
+    // a page; if it has never done so (e.g. the depth limit excludes every
+    // query) both are still unset and must not be dereferenced.
+    if (workingURL != null) {
+      GlobalString str = global new GlobalString("true");
+      doneList.put(workingURL, str);
+    }
+    if (toprocess == null) {
+      return;
+    }
+
+    while(!toprocess.isEmpty()) {
+      GlobalQuery q = (GlobalQuery)toprocess.pop();
+
+      GlobalString hostname = global new GlobalString(q.getHostName());
+      GlobalString path = global new GlobalString(q.getPath());
+
+      GlobalStringBuffer gsb = global new GlobalStringBuffer(hostname);
+      gsb.append("/");
+      gsb.append(path);
+
+      if (!doneList.containsKey(gsb.toGlobalString())) {
+        todoList.push(q);
+      }
+    }
+  }
+
+  /**
+   * Sends an HTTP GET request for "/" + path to the given host.
+   *
+   * @param hostname value of the Host header
+   * @param path     path to request, without the leading '/'
+   * @param sock     connected socket to write the request to
+   */
+  public static void requestQuery(String hostname, String path, Socket sock) {
+    StringBuffer req = new StringBuffer("GET ");
+    req.append("/");
+    req.append(path);
+    req.append(" HTTP/1.1\r\nHost:");
+    req.append(hostname);
+    // HTTP/1.1 connections are persistent by default, and readResponse()
+    // reads until the peer closes; ask the server to close after replying.
+    req.append("\r\nConnection: close");
+    req.append("\r\n\r\n");
+    sock.write(req.toString().getBytes());
+  }
+
+  /**
+   * Reads an HTTP response from the socket: skips everything up to and
+   * including the blank line that ends the headers, then appends the rest to
+   * lq.response until the read reports no more data.
+   *
+   * @param lq   query whose response buffer receives the body
+   * @param sock socket the request was written to
+   */
+  public static void readResponse(LocalQuery lq, Socket sock) {
+    // Header scan, one byte at a time:
+    // state 0 - nothing
+    // state 1 - \r
+    // state 2 - \r\n
+    // state 3 - \r\n\r
+    // state 4 - \r\n\r\n
+    int state=0;
+    byte[] b=new byte[1];
+    while (state<4) {
+      int numchars=sock.read(b);
+      if (numchars!=1)
+        return;
+
+      if (b[0]=='\r') {
+        // A '\r' continues the terminator only after "\r\n"; anywhere else it
+        // may be the first byte of a new terminator, so restart at state 1
+        // instead of discarding it.
+        if (state==2)
+          state=3;
+        else
+          state=1;
+      } else if ((b[0]=='\n') && ((state==1) || (state==3))) {
+        state++;
+      } else {
+        state=0;
+      }
+    }
+
+    // Body: read in chunks until the stream ends. A non-positive count is
+    // treated as end of data so that a negative return never reaches subString.
+    byte[] buffer=new byte[1024];
+    while(true) {
+      int numchars=sock.read(buffer);
+      if (numchars<=0)
+        return;
+      String curr=(new String(buffer)).subString(0,numchars);
+      lq.response.append(curr);
+    }
+  }
+
+  /**
+   * Indexes the page title: each space-separated word between "<title>" and
+   * "</title>" in the response is mapped in `results` to a queue to which
+   * `url` is appended. Pages without a complete title element are skipped.
+   *
+   * @param lq      query holding the downloaded response
+   * @param url     URL to record for every title word
+   * @param results index to update
+   */
+  public static void processList(LocalQuery lq, GlobalString url, DistributedHashMap results) {
+    String sTitle = new String("<title>");
+    String eTitle = new String("</title>");
+    String searchstr = lq.response.toString();
+
+    int sIndex = searchstr.indexOf(sTitle);
+    if (sIndex == -1) {
+      return;
+    }
+    int titleStart = sIndex+sTitle.length();
+    int eIndex = searchstr.indexOf(eTitle, titleStart);
+    if (eIndex == -1) {
+      // Opening tag without a closing tag (e.g. truncated response): there is
+      // no well-defined title, and subString must not be given -1.
+      return;
+    }
+
+    String title = new String(searchstr.subString(titleStart, eIndex));
+    LinkedList ll = tokenize(title);
+
+    while (!ll.isEmpty()) {
+      GlobalString word = global new GlobalString(ll.pop().toString());
+      Queue q;
+
+      if (!results.containsKey(word)) {
+        q = global new Queue();
+      }
+      else {
+        q = (Queue)(results.get(word));
+      }
+      q.push(url);
+      results.put(word, q);
+
+      System.out.println("Key : ["+word.toLocalString()+"],["+q.size()+"]");
+    }
+  }
+
+  /**
+   * Splits a string on single space characters.
+   *
+   * @param str text to split
+   * @return the non-empty pieces in order of appearance; leading, trailing
+   *         and repeated spaces produce no tokens
+   */
+  public static LinkedList tokenize(String str) {
+    LinkedList ll = new LinkedList();
+    int sIndex = 0;
+    int eIndex = 0;
+    String token;
+
+    // Only ' ' separates tokens; other punctuation stays part of the word.
+    while (true) {
+      eIndex = str.indexOf(' ', sIndex);
+      if (eIndex == -1) {
+        token = str.subString(sIndex);
+      }
+      else {
+        token = str.subString(sIndex, eIndex);
+      }
+
+      // An empty piece would otherwise be indexed as a word.
+      if (token.length() > 0) {
+        ll.add(token);
+      }
+
+      if (eIndex == -1) {
+        break;
+      }
+      sIndex = eIndex+1;
+    }
+
+    return ll;
+  }
+
+  /**
+   * Scans the response for href="..." attributes and turns each link into a
+   * GlobalQuery one level deeper than lq.
+   *
+   * @param lq query holding the downloaded response
+   * @return queue of GlobalQuery objects, one per link found; scanning stops
+   *         at the first href whose closing quote is missing
+   */
+  public static Queue processPage(LocalQuery lq) {
+    int index = 0;
+    String href = new String("href=\"");
+    String searchstr = lq.response.toString();
+    boolean cont = true;
+    int depth = lq.getDepth() + 1;
+    Queue toprocess = global new Queue();
+
+    while(cont) {
+      int mindex = searchstr.indexOf(href,index);
+      if (mindex != -1) {
+        int endquote = searchstr.indexOf('"', mindex+href.length());
+        if (endquote != -1) {
+          String match = searchstr.subString(mindex+href.length(), endquote);
+          String match2 = lq.makewebcanonical(match);
+
+          if (match2 != null) {
+            GlobalString ghostname = global new GlobalString(lq.getHostName(match));
+            GlobalString gpath = global new GlobalString(lq.getPathName(match));
+            GlobalQuery gq = global new GlobalQuery(ghostname, gpath, depth);
+            toprocess.push(gq);
+          }
+          // Continue searching after this attribute value.
+          index = endquote;
+        } else cont = false;
+      } else cont = false;
+    }
+    return toprocess;
+  }
+}
public class Spider {
public static void main(String[] args) {
- int NUM_THREADS = 4;
- int maxDepth = 5;
- int searchDepth = 10;
+ int NUM_THREADS = 3;
+ int maxDepth = 3;
int i, j;
Work[] works;
- QueryThread[] qt;
- Query[] currentWorkList;
+ QueryTask[] qt;
+ GlobalQuery[] currentWorkList;
NUM_THREADS = Integer.parseInt(args[0]);
+
+ if (args.length == 3) {
+ maxDepth = Integer.parseInt(args[2]);
+ }
+
GlobalString firstmachine;
- GlobalString firstpage;
-// int[] mid = getMID(NUM_THREADS);
int mid[] = new int[NUM_THREADS];
-/* mid[0] = (128<<24)|(195<<16)|(180<<8)|21; //dc-4
- mid[1] = (128<<24)|(195<<16)|(180<<8)|24; //dc-5
- mid[2] = (128<<24)|(195<<16)|(180<<8)|26; //dc-6
- */
- mid[0] = (128<<24)|(195<<16)|(136<<8)|162; //dc-1
- mid[1] = (128<<24)|(195<<16)|(136<<8)|163; //dc-2
- mid[2] = (128<<24)|(195<<16)|(136<<8)|164; //dc-3
- mid[3] = (128<<24)|(195<<16)|(136<<8)|165; //dc-3
- mid[4] = (128<<24)|(195<<16)|(136<<8)|166; //dc-3
- mid[5] = (128<<24)|(195<<16)|(136<<8)|167; //dc-3
+ mid[0] = (128<<24)|(195<<16)|(180<<8)|21;
+ mid[1] = (128<<24)|(195<<16)|(180<<8)|24;
+ mid[2] = (128<<24)|(195<<16)|(180<<8)|26;
atomic {
firstmachine = global new GlobalString(args[1]);
- firstpage = global new GlobalString(args[2]);
works = global new Work[NUM_THREADS];
- qt = global new QueryThread[NUM_THREADS];
- currentWorkList = global new Query[NUM_THREADS];
+ qt = global new QueryTask[NUM_THREADS];
+ currentWorkList = global new GlobalQuery[NUM_THREADS];
- Query firstquery = global new Query(firstmachine, firstpage, 0);
+ GlobalQuery firstquery = global new GlobalQuery(firstmachine);
Queue todoList = global new Queue();
- Queue doneList = global new Queue();
+ DistributedHashMap doneList = global new DistributedHashMap(500, 500, 0.75f);
+ DistributedHashMap results = global new DistributedHashMap(100, 100, 0.75f);
+
todoList.push(firstquery);
for (i = 0; i < NUM_THREADS; i++) {
- qt[i] = global new QueryThread(todoList, doneList, maxDepth, searchDepth);
+ qt[i] = global new QueryTask(todoList, doneList, maxDepth, results);
works[i] = global new Work(qt[i], NUM_THREADS, i, currentWorkList);
}
}
tmp.join();
}
}
-
- public static int[] getMID (int num_threads) {
- int[] mid = new int[num_threads];
-
- FileInputStream ifs = new FileInputStream("dstm.conf");
- String str;
- String sub;
- int fromIndex;
- int endIndex;
- double num;
-
- for (int i = 0; i < num_threads; i++) {
- int power = 3 - i;
- fromIndex = 0;
- num = 0;
-
- str = ifs.readLine();
-
- endIndex = str.indexOf('.', fromIndex);
- sub = str.subString(fromIndex, endIndex);
- num += (Integer.parseInt(sub) << 24);
-
- fromIndex = endIndex + 1;
- endIndex = str.indexOf('.', fromIndex);
- sub = str.subString(fromIndex, endIndex);
- num += (Integer.parseInt(sub) << 16);
-
- fromIndex = endIndex + 1;
- endIndex = str.indexOf('.', fromIndex);
- sub = str.subString(fromIndex, endIndex);
- num += (Integer.parseInt(sub) << 8);
-
- fromIndex = endIndex + 1;
- sub = str.subString(fromIndex);
- num += Integer.parseInt(sub);
-
- mid[i] = (int)num;
- }
- return mid;
- }
}
-#128.195.180.21
-#128.195.180.24
-#128.195.180.26
-128.195.136.162
-128.195.136.163
-128.195.136.164
-128.195.136.165
-128.195.136.166
-128.195.136.167
+128.195.180.21
+128.195.180.24
+128.195.180.26
MAINCLASS=Spider
SUBCLASS=Query
SRC1=${MAINCLASS}.java
-SRC2=${SUBCLASS}.java
+SRC2=Global${SUBCLASS}.java
SRC3=${SUBCLASS}Queue.java
SRC4=${SUBCLASS}Task.java
-FLAGS= -dsm -dsmtask -32bit -nooptimize -debug -recovery -mainclass ${MAINCLASS}
+FLAGS= -recovery -dsmtask -dsm -dsmtask -32bit -nooptimize -debug -mainclass ${MAINCLASS}
default:
../../../buildscript ${FLAGS} -o ${MAINCLASS} ${SRC2} ${SRC3} ${SRC4} ${SRC1}
clean:
rm -rf tmpbuilddirectory
rm *.bin
- rm *.php
- rm *.css
- rm www*
- rm eee*
- rm web*
for(int i=0; i<oldtable.length; i++) {
DHashEntry e=oldtable[i];
while(e!=null) {
- DHashEntry next=e.next;
- int bin=hash2(e.hashval, table.length, newCapacity);
- e.next=newtable[bin];
- newtable[bin]=e;
- e=next;
+ DHashEntry next=e.next;
+ int bin=hash2(e.hashval, table.length, newCapacity);
+ e.next=newtable[bin];
+ newtable[bin]=e;
+ e=next;
}
}
}
if (ptr!=null) {
if (ptr.hashval==hashcode&&ptr.key.equals(key)) {
- dhe.array[index2]=ptr.next;
- dhe.count--;
- return ptr.value;
+ dhe.array[index2]=ptr.next;
+ dhe.count--;
+ return ptr.value;
}
while(ptr.next!=null) {
- if (ptr.hashval==hashcode&&ptr.next.key.equals(key)) {
- Object oldvalue=ptr.value;
- ptr.next=ptr.next.next;
- dhe.count--;
- return oldvalue;
- }
- ptr=ptr.next;
- }
+ if (ptr.hashval==hashcode&&ptr.next.key.equals(key)) {
+ Object oldvalue=ptr.value;
+ ptr.next=ptr.next.next;
+ dhe.count--;
+ return oldvalue;
+ }
+ ptr=ptr.next;
+ }
}
return null;
}
while(ptr!=null) {
if (ptr.hashval==hashcode
&&ptr.key.equals(key)) {
- return ptr.value;
+ return ptr.value;
}
ptr=ptr.next;
}
while(ptr!=null) {
if (ptr.hashval==hashcode
&&ptr.key.equals(key)) {
- return true;
+ return true;
}
ptr=ptr.next;
}
while(ptr!=null) {
if (ptr.hashval==hashcode&&ptr.key.equals(key)) {
- Object oldvalue=ptr.value;
- ptr.value=value;
- return oldvalue;
+ Object oldvalue=ptr.value;
+ ptr.value=value;
+ return oldvalue;
}
ptr=ptr.next;
}
}
return null;
}
+
+ public int size() { // NOTE(review): returns table.length, the length of the `table` array, not a count of stored entries - confirm that callers expect this
+ return table.length;
+ }
}
public class Task {
Queue todoList;
- Queue doneList;
+ DistributedHashMap doneList;
Object myWork;
Task() {}
atomic {
o = todoList.pop();
}
+// System.out.println("Size of TodoList : " + todoList.size());
return o;
}