--- /dev/null
+/*\r
+ * This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html\r
+ *\r
+ * Copyright (c) 2001 Brian Pitcher\r
+ *\r
+ * Permission is hereby granted, free of charge, to any person obtaining a\r
+ * copy of this software and associated documentation files (the "Software"),\r
+ * to deal in the Software without restriction, including without limitation\r
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,\r
+ * and/or sell copies of the Software, and to permit persons to whom the\r
+ * Software is furnished to do so, subject to the following conditions:\r
+ *\r
+ * The above copyright notice and this permission notice shall be included in\r
+ * all copies or substantial portions of the Software.\r
+ *\r
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\r
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\r
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\r
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\r
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\r
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\r
+ * SOFTWARE.\r
+ */\r
+\r
+// $Header: /project/jiss/smhuang/leap/weblech/src/spider/HTMLParser.java,v 1.1 2010/06/30 15:45:26 smhuang Exp $\r
+\r
+package spider;\r
+\r
+import org.apache.log4j.Category;\r
+\r
+import java.util.List;\r
+import java.util.ArrayList;\r
+import java.util.HashSet;\r
+import java.util.Set;\r
+import java.net.URL;\r
+import java.net.MalformedURLException;\r
+import java.io.ByteArrayInputStream;\r
+import java.io.IOException;\r
+import java.io.FileWriter;\r
+import java.io.PrintWriter;\r
+\r
+import weblech.util.Log4j;\r
+\r
+/**\r
+ * Extracts link URLs from the text of an HTML page using plain substring\r
+ * searches rather than a real HTML parser.\r
+ *\r
+ * Only double-quoted attribute values are recognised, and tag/attribute names\r
+ * are matched with exact case (all-lowercase and all-uppercase spellings are\r
+ * tried; mixed-case spellings such as "Href" are not found).\r
+ */\r
+public class HTMLParser\r
+{\r
+    /** Log4j category used for this class's diagnostics. */\r
+    private final static Category _logClass = Category.getInstance(HTMLParser.class);\r
+\r
+    /** Spider configuration; read by logMailURL() to locate the mailto log file. */\r
+    private SpiderConfig config;\r
+\r
+    static\r
+    {\r
+        Log4j.init();\r
+    }\r
+\r
+    /**\r
+     * @param config spider configuration, kept for later use when logging mailto links\r
+     */\r
+    public HTMLParser(SpiderConfig config)\r
+    {\r
+        this.config = config;\r
+    }\r
+\r
+    /**\r
+     * Find the URLs referenced by a document.\r
+     *\r
+     * @param sourceURL   URL of the document; relative links are resolved against it\r
+     * @param textContent text of the document, may be null\r
+     * @return list of java.net.URL objects in order of discovery, without\r
+     *         duplicates; empty (never null) when nothing was found or\r
+     *         textContent is null\r
+     */\r
+    public List parseLinksInDocument(URL sourceURL, String textContent)\r
+    {\r
+        if(textContent == null)\r
+        {\r
+            // Nothing to scan; avoid a NullPointerException further down.\r
+            return new ArrayList();\r
+        }\r
+        return parseAsHTML(sourceURL, textContent);\r
+    }\r
+\r
+    /**\r
+     * Scan the text for img/src, a/href, body/background and frame/src\r
+     * attributes and collect the URLs they point at.\r
+     *\r
+     * @param sourceURL   base URL used to resolve relative links\r
+     * @param textContent HTML text to scan\r
+     * @return list of distinct URLs found, in order of discovery\r
+     */\r
+    private List parseAsHTML(URL sourceURL, String textContent)\r
+    {\r
+        _logClass.debug("parseAsHTML()");\r
+        ArrayList newURLs = new ArrayList();\r
+        // Companion set to the list above, used only to reject duplicates.\r
+        HashSet newURLSet = new HashSet();\r
+\r
+        extractAttributesFromTags("img", "src", sourceURL, newURLs, newURLSet, textContent);\r
+        extractAttributesFromTags("a", "href", sourceURL, newURLs, newURLSet, textContent);\r
+        extractAttributesFromTags("body", "background", sourceURL, newURLs, newURLSet, textContent);\r
+        extractAttributesFromTags("frame", "src", sourceURL, newURLs, newURLSet, textContent);\r
+        extractAttributesFromTags("IMG", "SRC", sourceURL, newURLs, newURLSet, textContent);\r
+        extractAttributesFromTags("A", "HREF", sourceURL, newURLs, newURLSet, textContent);\r
+        extractAttributesFromTags("BODY", "BACKGROUND", sourceURL, newURLs, newURLSet, textContent);\r
+        extractAttributesFromTags("FRAME", "SRC", sourceURL, newURLs, newURLSet, textContent);\r
+\r
+        if(newURLs.size() == 0)\r
+        {\r
+            _logClass.debug("Got 0 new URLs from HTML parse, check HTML\n" + textContent);\r
+        }\r
+        _logClass.debug("Returning " + newURLs.size() + " urls extracted from page");\r
+        return newURLs;\r
+    }\r
+\r
+    /**\r
+     * Find every occurrence of the given tag whose given attribute appears\r
+     * before the tag's closing '>' and record the attribute value as a URL.\r
+     * Any '#fragment' suffix is removed first. Values that look like mailto\r
+     * links are written to the mailto log instead of being returned.\r
+     *\r
+     * @param tag       tag name to look for, matched with exact case\r
+     * @param attr      attribute name to look for, matched with exact case\r
+     * @param sourceURL base URL used to resolve relative values\r
+     * @param newURLs   receives each newly seen URL, in order of discovery\r
+     * @param newURLSet URLs already seen; used to suppress duplicates\r
+     * @param input     HTML text to scan\r
+     */\r
+    private void extractAttributesFromTags(String tag, String attr, URL sourceURL, List newURLs, Set newURLSet, String input)\r
+    {\r
+        _logClass.debug("extractAttributesFromTags(" + tag + ", " + attr + ", ...)");\r
+\r
+        String startTag = "<" + tag + " ";\r
+        String attrStr = attr + "=\"";\r
+        int startPos = 0;\r
+        while(true)\r
+        {\r
+            int tagPos = input.indexOf(startTag, startPos);\r
+            if(tagPos < 0)\r
+            {\r
+                return;\r
+            }\r
+            int attrPos = input.indexOf(attrStr, tagPos + 1);\r
+            if(attrPos < 0)\r
+            {\r
+                // The attribute text does not occur anywhere after this tag,\r
+                // so no later tag can carry it either.\r
+                return;\r
+            }\r
+            // Whatever happens below, the next search resumes just past this tag.\r
+            startPos = tagPos + 1;\r
+\r
+            int nextClosePos = input.indexOf(">", tagPos + 1);\r
+            if(attrPos >= nextClosePos)\r
+            {\r
+                // Attribute belongs to some later tag, or this tag is never closed.\r
+                continue;\r
+            }\r
+\r
+            // Search for the closing quote from the first character of the\r
+            // value so that an empty value (attr="") ends at its own quote.\r
+            int valueStart = attrPos + attrStr.length();\r
+            int closeQuotePos = input.indexOf("\"", valueStart);\r
+            if(closeQuotePos < 0)\r
+            {\r
+                continue;\r
+            }\r
+\r
+            String urlStr = input.substring(valueStart, closeQuotePos);\r
+            int fragmentPos = urlStr.indexOf('#');\r
+            if(fragmentPos != -1)\r
+            {\r
+                urlStr = urlStr.substring(0, fragmentPos);\r
+            }\r
+\r
+            if(isMailTo(urlStr))\r
+            {\r
+                logMailURL(urlStr);\r
+                continue;\r
+            }\r
+\r
+            try\r
+            {\r
+                URL u = new URL(sourceURL, urlStr);\r
+                // NOTE(review): java.net.URL equals()/hashCode() may resolve host\r
+                // names, so this duplicate check can block on DNS lookups.\r
+                if(newURLSet.add(u))\r
+                {\r
+                    newURLs.add(u);\r
+                }\r
+            }\r
+            catch(MalformedURLException murle)\r
+            {\r
+                // Best effort: an unparseable link is skipped, but leave a trace.\r
+                _logClass.debug("Skipping malformed URL \"" + urlStr + "\" on " + sourceURL + ": " + murle.getMessage());\r
+            }\r
+        }\r
+    }\r
+\r
+    /**\r
+     * Append a mailto link to the file named by the configuration's mailto\r
+     * log file setting. Failures are logged as warnings and otherwise ignored.\r
+     *\r
+     * @param url the mailto link text to record\r
+     */\r
+    private void logMailURL(String url)\r
+    {\r
+        _logClass.debug("logMailURL()");\r
+\r
+        PrintWriter pW = null;\r
+        try\r
+        {\r
+            // Second FileWriter argument 'true' opens the file in append mode.\r
+            pW = new PrintWriter(new FileWriter(config.getMailtoLogFile().toString(), true));\r
+            pW.println(url);\r
+            // PrintWriter never throws on write; checkError() flushes and\r
+            // reports whether any write failed.\r
+            if(pW.checkError())\r
+            {\r
+                _logClass.warn("Error writing mailto URL to log file: " + url);\r
+            }\r
+        }\r
+        catch(IOException ioe)\r
+        {\r
+            _logClass.warn("Caught IO exception writing mailto URL:" + ioe.getMessage(), ioe);\r
+        }\r
+        finally\r
+        {\r
+            if(pW != null)\r
+            {\r
+                pW.close();\r
+            }\r
+        }\r
+    }\r
+\r
+    /**\r
+     * Check if a particular URL looks like it's a mailto: style link.\r
+     *\r
+     * @param url link text to test, may be null\r
+     * @return true if the text contains "mailto:" in any letter case\r
+     */\r
+    private boolean isMailTo(String url)\r
+    {\r
+        if(url == null)\r
+        {\r
+            return false;\r
+        }\r
+\r
+        // Fixed locale: with a Turkish default locale 'i' upper-cases to a\r
+        // dotted capital I and the comparison below would never match.\r
+        String upper = url.toUpperCase(java.util.Locale.ENGLISH);\r
+        return (upper.indexOf("MAILTO:") != -1);\r
+    }\r
+}\r