Adding JMCR-Stable version
[Benchmarks_CSolver.git] / JMCR-Stable / real-world application / weblech / src / spider / Spider.java
diff --git a/JMCR-Stable/real-world application/weblech/src/spider/Spider.java b/JMCR-Stable/real-world application/weblech/src/spider/Spider.java
new file mode 100644 (file)
index 0000000..b132bf7
--- /dev/null
@@ -0,0 +1,325 @@
+/*\r
+ * This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html\r
+ *\r
+ * Copyright (c) 2001 Brian Pitcher\r
+ *\r
+ * Permission is hereby granted, free of charge, to any person obtaining a\r
+ * copy of this software and associated documentation files (the "Software"),\r
+ * to deal in the Software without restriction, including without limitation\r
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,\r
+ * and/or sell copies of the Software, and to permit persons to whom the\r
+ * Software is furnished to do so, subject to the following conditions:\r
+ *\r
+ * The above copyright notice and this permission notice shall be included in\r
+ * all copies or substantial portions of the Software.\r
+ *\r
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\r
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\r
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\r
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\r
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\r
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\r
+ * SOFTWARE.\r
+ */\r
+\r
+// $Header: /project/jiss/smhuang/leap/weblech/src/spider/Spider.java,v 1.1 2010/06/30 15:45:26 smhuang Exp $\r
+\r
+package spider;\r
+\r
+import weblech.util.Log4j;\r
+import weblech.util.Logger;\r
+\r
+import java.util.*;\r
+import java.io.*;\r
+import java.net.URL;\r
+\r
+import org.apache.log4j.Category;\r
+\r
+public class Spider extends Logger implements Runnable, Constants\r
+{\r
+    /** Config for the spider */\r
+    private SpiderConfig config;\r
+    /**\r
+     * Download queue.\r
+     * Thread safety: To access the queue, first synchronize on it.\r
+     */\r
+    private DownloadQueue queue;\r
+    /**\r
+     * Set of URLs downloaded or scheduled, so we don't download a\r
+     * URL more than once.\r
+     * Thread safety: To access the set, first synchronize on it.\r
+     */\r
+    private Set urlsDownloadedOrScheduled;\r
+    /**\r
+     * Set of URLs currently being downloaded by Spider threads.\r
+     * Thread safety: To access the set, first synchronize on it.\r
+     */\r
+    private Set urlsDownloading;\r
+    /**\r
+     * Number of downloads currently taking place.\r
+     * Thread safety: To modify this value, first synchronize on\r
+     *                the download queue.\r
+     */\r
+    private int downloadsInProgress;\r
+    /** Whether the spider should quit */\r
+    private boolean quit;\r
+    /** Count of running Spider threads. */\r
+    private int running;\r
+    /** Time we last checkpointed. */\r
+    private long lastCheckpoint;\r
+\r
+    /**\r
+     * Create a spider whose download queue initially holds only the\r
+     * configured start location, at depth 0.\r
+     *\r
+     * @param config configuration used for the queue, the start location\r
+     *               and all later crawling decisions.\r
+     */\r
+    public Spider(SpiderConfig config)\r
+    {\r
+        this.config = config;\r
+        urlsDownloadedOrScheduled = new HashSet();\r
+        urlsDownloading = new HashSet();\r
+        downloadsInProgress = 0;\r
+        lastCheckpoint = 0;\r
+\r
+        queue = new DownloadQueue(config);\r
+        queue.queueURL(new URLToDownload(config.getStartLocation(), 0));\r
+        // Record the start location as scheduled; otherwise a page that\r
+        // links back to it would cause it to be queued and fetched again.\r
+        // NOTE(review): assumes getStartLocation() returns the same URL\r
+        // type that run() stores in this set - confirm against SpiderConfig.\r
+        urlsDownloadedOrScheduled.add(config.getStartLocation());\r
+    }\r
+\r
+    public void start()\r
+    {\r
+        quit = false;\r
+        running = 0;\r
+\r
+        for(int i = 0; i < config.getSpiderThreads(); i++)\r
+        {\r
+            _logClass.info("Starting Spider thread");\r
+            Thread t = new Thread(this, "Spider-Thread-" + (i + 1));\r
+            t.start();\r
+            running++;\r
+        }\r
+    }\r
+\r
+    public void stop()\r
+    {\r
+        quit = true;\r
+    }\r
+\r
+    public boolean isRunning()\r
+    {\r
+        return running == 0;//a correct version should be return running!=0;\r
+    }\r
+\r
+    /**\r
+     * Write a checkpoint when checkpointing is enabled (non-zero interval)\r
+     * and more than that interval has passed since lastCheckpoint.\r
+     * The elapsed-time test is repeated once the queue lock is held, so a\r
+     * thread that had to wait for the lock does not checkpoint again right\r
+     * after another thread has just done so.\r
+     */\r
+    private void checkpointIfNeeded()\r
+    {\r
+        // An interval of 0 disables checkpointing altogether.\r
+        if(config.getCheckpointInterval() == 0)\r
+        {\r
+            return;\r
+        }\r
+\r
+        // Cheap test without the lock: nothing to do yet.\r
+        if(System.currentTimeMillis() - lastCheckpoint <= config.getCheckpointInterval())\r
+        {\r
+            return;\r
+        }\r
+\r
+        synchronized(queue)\r
+        {\r
+            long sinceLast = System.currentTimeMillis() - lastCheckpoint;\r
+            if(sinceLast > config.getCheckpointInterval())\r
+            {\r
+                writeCheckpoint();\r
+                lastCheckpoint = System.currentTimeMillis();\r
+            }\r
+        }\r
+    }\r
+\r
+    /**\r
+     * Serialize the download queue followed by the set of in-progress URLs\r
+     * to the file "spider.checkpoint", replacing any previous contents.\r
+     * I/O failures are logged as warnings and otherwise ignored.\r
+     * The only caller in this class, checkpointIfNeeded(), invokes this\r
+     * while holding the queue lock.\r
+     */\r
+    private void writeCheckpoint()\r
+    {\r
+        _logClass.debug("writeCheckpoint()");\r
+        try\r
+        {\r
+            FileOutputStream fos = new FileOutputStream("spider.checkpoint", false);\r
+            try\r
+            {\r
+                ObjectOutputStream oos = new ObjectOutputStream(fos);\r
+                oos.writeObject(queue);\r
+                // The set is documented as requiring its own lock; hold it\r
+                // so workers cannot modify the set while it is serialized.\r
+                synchronized(urlsDownloading)\r
+                {\r
+                    oos.writeObject(urlsDownloading);\r
+                }\r
+                oos.close();\r
+            }\r
+            finally\r
+            {\r
+                // Releases the file handle even when serialization fails;\r
+                // closing an already closed FileOutputStream is harmless.\r
+                fos.close();\r
+            }\r
+        }\r
+        catch(IOException ioe)\r
+        {\r
+            _logClass.warn("IO Exception attempting checkpoint: " + ioe.getMessage(), ioe);\r
+        }\r
+    }\r
+\r
+    public void readCheckpoint()\r
+    {\r
+        try\r
+        {\r
+            FileInputStream fis = new FileInputStream("spider.checkpoint");\r
+            ObjectInputStream ois = new ObjectInputStream(fis);\r
+            queue = (DownloadQueue) ois.readObject();\r
+            urlsDownloading = (Set) ois.readObject();\r
+            queue.queueURLs(urlsDownloading);\r
+            urlsDownloading.clear();\r
+        }\r
+        catch(Exception e)\r
+        {\r
+            _logClass.error("Caught exception reading checkpoint: " + e.getMessage(), e);\r
+        }\r
+    }\r
+\r
+    public void run()\r
+    {\r
+        HTMLParser htmlParser = new HTMLParser(config);\r
+        URLGetter urlGetter = new URLGetter(config);\r
+\r
+        int TIMES = 0;\r
+        while((queueSize() > 0 || downloadsInProgress > 0) && quit == false)\r
+        {\r
+            checkpointIfNeeded();\r
+            if(queueSize() == 0 && downloadsInProgress > 0)\r
+            {\r
+                // Wait for a download to finish before seeing if this thread should stop\r
+                try\r
+                {\r
+                    Thread.sleep(QUEUE_CHECK_INTERVAL);\r
+                    TIMES++;\r
+                    if(TIMES>2)break;\r
+                }\r
+                catch(InterruptedException ignored)\r
+                {\r
+                }\r
+                // Have another go at the loop\r
+                continue;\r
+            }\r
+            else if(queueSize() == 0)\r
+            {\r
+                break;\r
+            }\r
+            URLToDownload nextURL;\r
+            synchronized(queue)\r
+            {\r
+                nextURL = queue.getNextInQueue();\r
+                downloadsInProgress++;\r
+            }\r
+            synchronized(urlsDownloading)\r
+            {\r
+                urlsDownloading.add(nextURL);\r
+            }\r
+            int newDepth = nextURL.getDepth() + 1;\r
+            int maxDepth = config.getMaxDepth();\r
+            synchronized(urlsDownloading)\r
+            {\r
+                urlsDownloading.remove(nextURL);\r
+            }\r
+            List newURLs = downloadURL(nextURL, urlGetter, htmlParser);\r
+\r
+            newURLs = filterURLs(newURLs);\r
+\r
+            ArrayList u2dsToQueue = new ArrayList();\r
+            for(Iterator i = newURLs.iterator(); i.hasNext(); )\r
+            {\r
+                URL u = (URL) i.next();\r
+                // Download if not yet downloaded, and the new depth is less than the maximum\r
+                synchronized(urlsDownloadedOrScheduled)\r
+                {\r
+                    if(!urlsDownloadedOrScheduled.contains(u)\r
+                    && (maxDepth == 0 || newDepth <= maxDepth))\r
+                    {\r
+                        u2dsToQueue.add(new URLToDownload(u, nextURL.getURL(), newDepth));\r
+                        urlsDownloadedOrScheduled.add(u);\r
+                    }\r
+                }\r
+            }\r
+            synchronized(queue)\r
+            {\r
+                queue.queueURLs(u2dsToQueue);\r
+                downloadsInProgress--;\r
+            }\r
+        }\r
+        _logClass.info("Spider thread stopping");\r
+        running--;\r
+    }\r
+\r
+    /**\r
+     * Read the number of entries in the download queue while holding the\r
+     * queue's lock.\r
+     *\r
+     * @return the size reported by the queue at the time of the call.\r
+     */\r
+    private int queueSize()\r
+    {\r
+        int size;\r
+        synchronized(queue)\r
+        {\r
+            size = queue.size();\r
+        }\r
+        return size;\r
+    }\r
+\r
+    /**\r
+     * Fetch a URL if required and return the URLs referenced from it.\r
+     * The URL is fetched when it is not yet on disk, or when it is on disk\r
+     * and the configuration asks for that kind of object (HTML/XML or\r
+     * image) to be refreshed. Otherwise the object built for the on-disk\r
+     * copy is used as-is.\r
+     *\r
+     * @return A List of URL objects; empty when the fetch yielded nothing,\r
+     *         or when the content is an image or an unsupported type.\r
+     */\r
+    private List downloadURL(URLToDownload url, URLGetter urlGetter, HTMLParser htmlParser)\r
+    {\r
+        _logClass.debug("downloadURL(" + url + ")");\r
+\r
+        URLObject obj = new URLObject(url.getURL(), config);\r
+\r
+        // Decide whether a network fetch is needed at all.\r
+        boolean fetch;\r
+        if(!obj.existsOnDisk())\r
+        {\r
+            fetch = true;\r
+        }\r
+        else if(config.refreshHTMLs() && (obj.isHTML() || obj.isXML()))\r
+        {\r
+            fetch = true;\r
+        }\r
+        else\r
+        {\r
+            fetch = config.refreshImages() && obj.isImage();\r
+        }\r
+\r
+        if(fetch)\r
+        {\r
+            _logClass.info("Q: [" + queue + "] " + url);\r
+            obj = urlGetter.getURL(url);\r
+        }\r
+\r
+        // The getter may hand back nothing; then there are no links either.\r
+        if(obj == null)\r
+        {\r
+            return new ArrayList();\r
+        }\r
+\r
+        if(!obj.existsOnDisk())\r
+        {\r
+            obj.writeToFile();\r
+        }\r
+\r
+        // Only HTML and XML documents are scanned for further links.\r
+        if(obj.isHTML() || obj.isXML())\r
+        {\r
+            return htmlParser.parseLinksInDocument(url.getURL(), obj.getStringContent());\r
+        }\r
+\r
+        if(!obj.isImage())\r
+        {\r
+            _logClass.warn("Unsupported content type received: " + obj.getContentType());\r
+            _logClass.info("URL was " + url);\r
+        }\r
+        return new ArrayList();\r
+    }\r
+\r
+    /**\r
+     * Select the URLs worth scheduling from a list of candidates.\r
+     * A URL is kept when it is not already in the downloaded-or-scheduled\r
+     * set and its external form contains the configured match string.\r
+     *\r
+     * @param URLs candidate URL objects.\r
+     * @return a new list holding the accepted URLs in their original order.\r
+     */\r
+    private List filterURLs(List URLs)\r
+    {\r
+        String match = config.getURLMatch();\r
+        ArrayList accepted = new ArrayList();\r
+\r
+        synchronized(urlsDownloadedOrScheduled)\r
+        {\r
+            Iterator it = URLs.iterator();\r
+            while(it.hasNext())\r
+            {\r
+                URL candidate = (URL) it.next();\r
+                if(!urlsDownloadedOrScheduled.contains(candidate)\r
+                && candidate.toExternalForm().indexOf(match) >= 0)\r
+                {\r
+                    accepted.add(candidate);\r
+                }\r
+            }\r
+        }\r
+        return accepted;\r
+    }\r
+\r
+}\r