From 4e420e80f5557b0f69439b536a1c2b14a48c5b69 Mon Sep 17 00:00:00 2001 From: adash Date: Fri, 13 Nov 2009 01:27:02 +0000 Subject: [PATCH] modifications to Whiplash signature bugs in string.split --- .../Distributed/SpamFilter/Mail.java | 33 ++- .../SpamFilter/SignatureComputer.java | 168 ++++++++++++- .../Distributed/SpamFilter/SpamFilter.java | 1 + .../SpamFilter/WhiplashSignature.java | 226 +++++++++++++++++- 4 files changed, 411 insertions(+), 17 deletions(-) diff --git a/Robust/src/Benchmarks/Distributed/SpamFilter/Mail.java b/Robust/src/Benchmarks/Distributed/SpamFilter/Mail.java index 81f56691..99824e8e 100644 --- a/Robust/src/Benchmarks/Distributed/SpamFilter/Mail.java +++ b/Robust/src/Benchmarks/Distributed/SpamFilter/Mail.java @@ -21,6 +21,12 @@ public class Mail { //same as hashcode of a class boolean isSpam; + /** + * this is a really simple implementation of a tokenizer + * used to build tokens from an email and divide email into parts + **/ + int MAX_TOKEN_SIZE; + public Mail() { messageID=null; } @@ -65,7 +71,9 @@ public class Mail { } } // parsed messageID, To, from, cc, Title - + /** + * error checking + **/ if(!chk) System.out.println("no line read"); @@ -79,6 +87,8 @@ public class Mail { } fileinput.close(); + + MAX_TOKEN_SIZE = 1024; } // ------------------------------------------------------- @@ -277,16 +287,16 @@ public class Mail { return body; } - /* TODO add this to process entire email public Vector returnEmail() { Vector myemail = new Vector(); - myemail.addElement(getCommonPart()); + System.out.println("DEBUG: getCommonPart.size= " + getCommonPart().size()); myemail.addElement(getURLs()); - myemail.addElement(getSplittedBody()); + System.out.println("DEBUG: getURLs.size= " + getURLs().size()); + myemail.addElement(getSplittedBody(MAX_TOKEN_SIZE)); + System.out.println("DEBUG: getSplittedBody.size= " + getSplittedBody(MAX_TOKEN_SIZE).size()); return myemail; } - */ public Vector getURLs() { @@ -324,9 +334,11 @@ public class Mail { noURLBody = new String(); Vector splittedBody = body.split(); + System.out.println("DEBUG: splittedBody.size()= " + splittedBody.size()); for(int i=0; i< splittedBody.size();i ++) { - String segment = (String)splittedBody.elementAt(i); + String segment = (String)(splittedBody.elementAt(i)); + System.out.println("DEBUG: segment= " + segment); if(!(segment.startsWith("http://") || isEmailAccount(segment))) noURLBody += segment; @@ -345,6 +357,8 @@ public class Mail { String tmpStr = new String(); tmpStr += charArray[0]; + System.out.println("tmpStr= " + tmpStr); + for(int i=1; i< noURLBody.length(); i++) { if((i % size) == 0) { @@ -378,12 +392,15 @@ public class Mail { public Vector checkMail(int userid) { //Preprocess emails //Vector partsOfMailStrings = mail.createMailStringsWithURL(); - Vector partsOfMailStrings = getCommonPart(); + //Vector partsOfMailStrings = getCommonPart(); //partsOfMailStrings.addElement(getBodyString()); + Vector partsOfMailStrings = returnEmail(); + //Compute signatures SignatureComputer sigComp = new SignatureComputer(); - Vector signatures = sigComp.computeSigs(partsOfMailStrings);//vector of strings + //Vector signatures = sigComp.computeSigs(partsOfMailStrings);//vector of strings + Vector signatures = sigComp.computeSigs(partsOfMailStrings);//vector of vector of strings return signatures; } diff --git a/Robust/src/Benchmarks/Distributed/SpamFilter/SignatureComputer.java b/Robust/src/Benchmarks/Distributed/SpamFilter/SignatureComputer.java index b4d2c77d..600dee42 100644 --- a/Robust/src/Benchmarks/Distributed/SpamFilter/SignatureComputer.java +++ b/Robust/src/Benchmarks/Distributed/SpamFilter/SignatureComputer.java @@ -74,29 +74,72 @@ public class SignatureComputer { if (EmailParts == null) return null; Vector printableSigs = new Vector(); // vector of strings - for (int mailIndex = 0; mailIndex < EmailParts.size(); mailIndex++) { - String mail = (String) (EmailParts.elementAt(mailIndex)); + + /** + * Step -I + * Get signatures for the common parts + **/ + + Vector commonpart = (Vector) (EmailParts.elementAt(0)); + for (int mailIndex = 0; mailIndex < commonpart.size(); mailIndex++) { + String mail = (String) (commonpart.elementAt(mailIndex)); if (mail == null) continue; /* - * Compute Sig for bodyparts that are cleaned. + * Compute Sig for email header that are cleaned. */ for (int engineIndex = 0; engineIndex < enginesToUseForCheck.length; engineIndex++) { int engineNo = enginesToUseForCheck[engineIndex]; String sig = null; + /* EphemeralSignature calculator */ if(engineNo==4) { sig = computeSignature(engineNo,mail); } - /* - if(engineNo==8) { + + if(engineNo!=4 || engineNo!=8) { + System.out.println("Err: Couldn't find the signature engine: " + engineNo); + } + + if (sig != null) { + String hash = engineNo + ":" + sig; + printableSigs.addElement(hash); + } else { + // we didn't produce a signature for the mail. + } + }//engine + } + + /** + * Step -II + * Get signatures for the body parts without URLs + **/ + Vector getBodywithNoURLs = (Vector)(EmailParts.elementAt(2)); + for (int mailIndex = 0; mailIndex < getBodywithNoURLs.size(); mailIndex++) { + String mail = (String) (getBodywithNoURLs.elementAt(mailIndex)); + + System.out.println("mail= " + mail); + + if (mail == null) continue; + + /* + * Compute Sig for email header that are cleaned. + */ + for (int engineIndex = 0; engineIndex < enginesToUseForCheck.length; engineIndex++) { + int engineNo = enginesToUseForCheck[engineIndex]; + if(engineNo==8) + continue; + String sig = null; + + /* EphemeralSignature calculator */ + if(engineNo==4) { sig = computeSignature(engineNo,mail); } + if(engineNo!=4 || engineNo!=8) { System.out.println("Err: Couldn't find the signature engine: " + engineNo); } - */ if (sig != null) { String hash = engineNo + ":" + sig; @@ -105,7 +148,114 @@ public class SignatureComputer { // we didn't produce a signature for the mail. } }//engine - }//each emails part + } + + /** + * Step -III + * Get signatures for the body parts with URLs + **/ + Vector getURLs = (Vector)(EmailParts.elementAt(1)); + for (int mailIndex = 0; mailIndex < getURLs.size(); mailIndex++) { + String mail = (String) (getURLs.elementAt(mailIndex)); + System.out.println("mail= " + mail); + + /* + * Compute Sig for bodyparts that are cleaned. + */ + for (int engineIndex = 0; engineIndex < enginesToUseForCheck.length; engineIndex++) { + int engineNo = enginesToUseForCheck[engineIndex]; + + if(engineNo==4) { + continue; + } + + /* WhiplashSignature calculator */ + String[] hosts = null; + String sig = null; + if(engineNo==8) { + //hosts = computeSignature(engineNo,mail); + hosts = sig8.computeSignature(mail); + if(hosts != null) { + for(int i=0; ipack() which + * we must do manually in java. + */ + private int convertHexToRazorEncoding(String hex3) { + if((hex3 == null)) + return 0; //error + int res = 0; + int cur = Integer.parseInt(hex3.substring(0,1),16); + cur = mirror4LSBits(cur); + res |= ( (cur&0xf) << 8); + if(hex3.length() >=2) { + cur = Integer.parseInt(hex3.substring(1,2),16); + } else { + cur = 0; + } + //cur = ( hex3.length() >=2 ? Integer.parseInt(hex3.substring(1,2),16) : 0); + cur = mirror4LSBits(cur); + res |= ((cur & 0xf) << 4); + if(hex3.length() >= 3) { + cur = Integer.parseInt(hex3.substring(2,3),16); + } else { + cur = 0; + } + //cur = ( hex3.length() >= 3 ? Integer.parseInt(hex3.substring(2,3),16): 0); + cur = mirror4LSBits(cur); + res |= (cur & 0xf); + return res; } + + /** + * mirrors the 4 least significant bytes of an integer + * @param cur an int containing 4 Least Singificant bytes like 00000...00abcd + * @return the mirrored 4 least significant bytes 00000...00dcba. all bits except a-b are lost. + */ + public int mirror4LSBits(int cur) { + int res = 0; + res |= (cur & 0x8)>>>3; + res |= (cur & 0x4)>>>1; + res |= (cur & 0x2)<<1; + res |= (cur & 0x1)<<3; + return res; + } + + public String[] whiplash(String text) { + + //System.out.println("Inside whiplash"); + if (text == null) { + return null; + } + String[] hosts = extractHosts(text); + if (hosts == null || hosts.length < 1) { + return null; + } + String[] sigs = new String[hosts.length]; + + for (int i = 0; i < hosts.length; i++) { + MD5 md = new MD5(); + String host = hosts[i]; + int len = host.length(); + byte buf[] = host.getBytes(); + byte sig[] = new byte[16]; + md.update(buf, len); + md.md5final(sig); + String signature = new String(sig); + sigs[i] = signature; + } + return sigs; + } + + public String[] extractHosts(String text) { + //System.out.println("Inside extractHosts"); + Vector hosts = new Vector(); + String buf = new String(text); + + System.out.println("buf= " + buf); + + String strwww = new String("www."); + int idx; + while ((idx = buf.indexOf(strwww)) != -1) { + int startidx = idx + strwww.length(); + //System.out.println("idx= " + idx + " startidx= " + startidx); + String strcom = new String("."); + buf = buf.substring(startidx); + int endidx = buf.indexOf(strcom); + String host = buf.substring(0, endidx); + System.out.println(host); + hosts.addElement(host); + buf = buf.substring(endidx+strcom.length()); + } + + if (hosts.size() == 0) { + return null; + } + + String[] retbuf = new String[hosts.size()]; + for (int i = 0; i < hosts.size(); i++) { + retbuf[i] = (String) (hosts.elementAt(i)); + } + + return retbuf; + } + +// Testing the signature computation +// public static void main(String[] args) { +// /* String testVector = " Test Vectors: \n"+ +// "\n" + +// "1. http:www.nodg.com@www.geocities.com/nxcisdsfdfdsy/off\n"+ +// "2. http:www.ksleybiuh.com@213.171.60.74/getoff/\n"+ +// "3. \n"+ +// "4. http:217.12.4.7/rmi/http:definethis.net/526/index.html\n"+ +// "5. http:magalygr8sex.free-host.com/h.html\n"+ +// "6. http:%3CVenkatrs%3E@218.80.74.102/thecard/4index.htm\n"+ +// "7. http:EBCDVKIGURGGCEOKXHINOCANVQOIDOXJWTWGPC@218.80.74.102/thecard/5in\n"+ +// "8. http:g.india2.bag.gs/remove_page.htm\n"+ +// "9. https:220.97.40.149\n"+ +// "10. http:mjaked.biz/unsubscribe.ddd?leaving\n"+ +// "11. http:g5j99m8@it.rd.yahoo.com/bassi/*http:www.lekobas.com/c/index.php\n"+ +// "12. look great / feel great\n"+ +// "13. \n"+ +// "14. www.pillzthatwork.com # anything that starts with www. \n"; +// */ +// String testVector = "\n"+ +// "\n"+ +// "

Our first autolink: www.autolink1.com or another link like www.autolink2.co.uk or how about https:plaintextlink1.co.uk or http:plaintextlink2.com

\n"+ +// "

now a masked link http://www.coveringlink1.com and another link http:plaintextlink3.net and how about https:plaintextlink4.to

\n"+ +// "

another masked link https:coveringlink2.com and another link https:plaintextlink5.com

\n"+ +// "\n"+ +// "\n"; +// String test1 = "Our first autolink: www.autolink1.com or another link like www.autolink2.co.uk or how about https:plaintextlink1.co.uk or http:plaintextlink2.com

\n"; +// WhiplashSignature whiplash = new WhiplashSignature(); +// String[] hosts = whiplash.computeSignature(testVector); +// //String[] hosts = whiplash.computeSignature(test1); +// for (int i = 0; i < hosts.length; i++) { +// String string = hosts[i]; +// System.out.println("host " + i + ":\t" + string); +// } +// } + } -- 2.34.1