Adding JMCR-Stable version
[Benchmarks_CSolver.git] / JMCR-Stable / real-world application / weblech / src / spider / HTMLParser.java
diff --git a/JMCR-Stable/real-world application/weblech/src/spider/HTMLParser.java b/JMCR-Stable/real-world application/weblech/src/spider/HTMLParser.java
new file mode 100644 (file)
index 0000000..648078a
--- /dev/null
@@ -0,0 +1,190 @@
+/*\r
+ * This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html\r
+ *\r
+ * Copyright (c) 2001 Brian Pitcher\r
+ *\r
+ * Permission is hereby granted, free of charge, to any person obtaining a\r
+ * copy of this software and associated documentation files (the "Software"),\r
+ * to deal in the Software without restriction, including without limitation\r
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,\r
+ * and/or sell copies of the Software, and to permit persons to whom the\r
+ * Software is furnished to do so, subject to the following conditions:\r
+ *\r
+ * The above copyright notice and this permission notice shall be included in\r
+ * all copies or substantial portions of the Software.\r
+ *\r
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\r
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\r
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\r
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\r
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\r
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\r
+ * SOFTWARE.\r
+ */\r
+\r
+// $Header: /project/jiss/smhuang/leap/weblech/src/spider/HTMLParser.java,v 1.1 2010/06/30 15:45:26 smhuang Exp $\r
+\r
+package spider;\r
+\r
+import org.apache.log4j.Category;\r
+\r
+import java.util.List;\r
+import java.util.ArrayList;\r
+import java.util.HashSet;\r
+import java.util.Set;\r
+import java.net.URL;\r
+import java.net.MalformedURLException;\r
+import java.io.ByteArrayInputStream;\r
+import java.io.IOException;\r
+import java.io.FileWriter;\r
+import java.io.PrintWriter;\r
+\r
+import weblech.util.Log4j;\r
+\r
+public class HTMLParser\r
+{\r
+    private final static Category _logClass = Category.getInstance(URLObject.class);\r
+\r
+    private SpiderConfig config;\r
+\r
+    static\r
+    {\r
+        Log4j.init();\r
+    }\r
+\r
+    public HTMLParser(SpiderConfig config)\r
+    {\r
+        this.config = config;\r
+    }\r
+\r
+    public List parseLinksInDocument(URL sourceURL, String textContent)\r
+    {\r
+        return parseAsHTML(sourceURL, textContent);\r
+    }\r
+\r
+    private List parseAsHTML(URL sourceURL, String textContent)\r
+    {\r
+        _logClass.debug("parseAsHTML()");\r
+        ArrayList newURLs = new ArrayList();\r
+        HashSet newURLSet = new HashSet();\r
+\r
+        extractAttributesFromTags("img", "src", sourceURL, newURLs, newURLSet, textContent);\r
+        extractAttributesFromTags("a", "href", sourceURL, newURLs, newURLSet, textContent);\r
+        extractAttributesFromTags("body", "background", sourceURL, newURLs, newURLSet, textContent);\r
+        extractAttributesFromTags("frame", "src", sourceURL, newURLs, newURLSet, textContent);\r
+        extractAttributesFromTags("IMG", "SRC", sourceURL, newURLs, newURLSet, textContent);\r
+        extractAttributesFromTags("A", "HREF", sourceURL, newURLs, newURLSet, textContent);\r
+        extractAttributesFromTags("BODY", "BACKGROUND", sourceURL, newURLs, newURLSet, textContent);\r
+        extractAttributesFromTags("FRAME", "SRC", sourceURL, newURLs, newURLSet, textContent);\r
+\r
+        if(newURLs.size() == 0)\r
+        {\r
+            _logClass.debug("Got 0 new URLs from HTML parse, check HTML\n" + textContent);\r
+        }\r
+        _logClass.debug("Returning " + newURLs.size() + " urls extracted from page");\r
+        return newURLs;\r
+    }\r
+\r
+    private void extractAttributesFromTags(String tag, String attr, URL sourceURL, List newURLs, Set newURLSet, String input)\r
+    {\r
+        _logClass.debug("extractAttributesFromTags(" + tag + ", " + attr + ", ...)");\r
+\r
+        int startPos = 0;\r
+        String startTag = "<" + tag + " ";\r
+        String attrStr = attr + "=\"";\r
+        while(true)\r
+        {\r
+            int tagPos = input.indexOf(startTag, startPos);\r
+            if(tagPos < 0)\r
+            {\r
+                return;\r
+            }\r
+            int attrPos = input.indexOf(attrStr, tagPos + 1);\r
+            if(attrPos < 0)\r
+            {\r
+                startPos = tagPos + 1;\r
+                continue;\r
+            }\r
+            int nextClosePos = input.indexOf(">", tagPos + 1);\r
+            if(attrPos < nextClosePos)\r
+            {\r
+                // Ooh, found one\r
+                int closeQuotePos = input.indexOf("\"", attrPos + attrStr.length() + 1);\r
+                if(closeQuotePos > 0)\r
+                {\r
+                    String urlStr = input.substring(attrPos + attrStr.length(), closeQuotePos);\r
+                    if(urlStr.indexOf('#') != -1)\r
+                    {\r
+                        urlStr = urlStr.substring(0, urlStr.indexOf('#'));\r
+                    }\r
+                    //_logClass.debug("Found possible URL string: " + URL);\r
+\r
+                    if(isMailTo(urlStr))\r
+                    {\r
+                        logMailURL(urlStr);\r
+                    }\r
+                    else\r
+                    {\r
+                        try\r
+                        {\r
+\r
+                            URL u = new URL(sourceURL, urlStr);\r
+                            if(newURLSet.contains(u))\r
+                            {\r
+                                //_logClass.debug("Already found URL on page: " + u);\r
+                            }\r
+                            else\r
+                            {\r
+                                newURLs.add(u);\r
+                                newURLSet.add(u);\r
+                                //_logClass.debug("Found new URL on page: " + u);\r
+                            }\r
+                        }\r
+                        catch(MalformedURLException murle)\r
+                        {\r
+                        }\r
+                    }\r
+                }\r
+                startPos = tagPos + 1;\r
+                continue;\r
+            }\r
+            else\r
+            {\r
+                startPos = tagPos + 1;\r
+                continue;\r
+            }\r
+        }\r
+    }\r
+\r
+    private void logMailURL(String url)\r
+    {\r
+        _logClass.debug("logMailURL()");\r
+\r
+        try\r
+        {\r
+            FileWriter appendedFile = new FileWriter(config.getMailtoLogFile().toString(), true);\r
+            PrintWriter pW = new PrintWriter(appendedFile);\r
+            pW.println(url);\r
+            pW.flush();\r
+            pW.close();\r
+        }\r
+        catch(IOException ioe)\r
+        {\r
+            _logClass.warn("Caught IO exception writing mailto URL:" + ioe.getMessage(), ioe);\r
+        }\r
+    }\r
+\r
+    /**\r
+     * Check if a particular URL looks like it's a mailto: style link.\r
+     */\r
+    private boolean isMailTo(String url)\r
+    {\r
+        if(url == null)\r
+        {\r
+            return false;\r
+        }\r
+\r
+        url = url.toUpperCase();\r
+        return (url.indexOf("MAILTO:") != -1);\r
+    }\r
+}\r