--- /dev/null
+/*\r
+ * This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html\r
+ *\r
+ * Copyright (c) 2001 Brian Pitcher\r
+ *\r
+ * Permission is hereby granted, free of charge, to any person obtaining a\r
+ * copy of this software and associated documentation files (the "Software"),\r
+ * to deal in the Software without restriction, including without limitation\r
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,\r
+ * and/or sell copies of the Software, and to permit persons to whom the\r
+ * Software is furnished to do so, subject to the following conditions:\r
+ *\r
+ * The above copyright notice and this permission notice shall be included in\r
+ * all copies or substantial portions of the Software.\r
+ *\r
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\r
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\r
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\r
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\r
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\r
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\r
+ * SOFTWARE.\r
+ */\r
+\r
+// $Header: /project/jiss/smhuang/leap/weblech/src/spider/Spider.java,v 1.1 2010/06/30 15:45:26 smhuang Exp $\r
+\r
+package spider;\r
+\r
+import weblech.util.Log4j;\r
+import weblech.util.Logger;\r
+\r
+import java.util.*;\r
+import java.io.*;\r
+import java.net.URL;\r
+\r
+import org.apache.log4j.Category;\r
+\r
+public class Spider extends Logger implements Runnable, Constants\r
+{\r
+ /** Config for the spider */\r
+ private SpiderConfig config;\r
+ /**\r
+ * Download queue.\r
+ * Thread safety: To access the queue, first synchronize on it.\r
+ */\r
+ private DownloadQueue queue;\r
+ /**\r
+ * Set of URLs downloaded or scheduled, so we don't download a\r
+ * URL more than once.\r
+ * Thread safety: To access the set, first synchronize on it.\r
+ */\r
+ private Set urlsDownloadedOrScheduled;\r
+ /**\r
+ * Set of URLs currently being downloaded by Spider threads.\r
+ * Thread safety: To access the set, first synchronize on it.\r
+ */\r
+ private Set urlsDownloading;\r
+ /**\r
+ * Number of downloads currently taking place.\r
+ * Thread safety: To modify this value, first synchronize on\r
+ * the download queue.\r
+ */\r
+ private int downloadsInProgress;\r
+ /** Whether the spider should quit */\r
+ private boolean quit;\r
+ /** Count of running Spider threads. */\r
+ private int running;\r
+ /** Time we last checkpointed. */\r
+ private long lastCheckpoint;\r
+\r
+ public Spider(SpiderConfig config)\r
+ {\r
+ this.config = config;\r
+ queue = new DownloadQueue(config);\r
+ queue.queueURL(new URLToDownload(config.getStartLocation(), 0));\r
+ urlsDownloadedOrScheduled = new HashSet();\r
+ urlsDownloading = new HashSet();\r
+ downloadsInProgress = 0;\r
+ lastCheckpoint = 0;\r
+ }\r
+\r
+ public void start()\r
+ {\r
+ quit = false;\r
+ running = 0;\r
+\r
+ for(int i = 0; i < config.getSpiderThreads(); i++)\r
+ {\r
+ _logClass.info("Starting Spider thread");\r
+ Thread t = new Thread(this, "Spider-Thread-" + (i + 1));\r
+ t.start();\r
+ running++;\r
+ }\r
+ }\r
+\r
+ public void stop()\r
+ {\r
+ quit = true;\r
+ }\r
+\r
+ public boolean isRunning()\r
+ {\r
+ return running == 0;//a correct version should be return running!=0;\r
+ }\r
+\r
+ private void checkpointIfNeeded()\r
+ {\r
+ if(config.getCheckpointInterval() == 0)\r
+ {\r
+ return;\r
+ }\r
+\r
+ if(System.currentTimeMillis() - lastCheckpoint > config.getCheckpointInterval())\r
+ {\r
+ synchronized(queue)\r
+ {\r
+ if(System.currentTimeMillis() - lastCheckpoint > config.getCheckpointInterval())\r
+ {\r
+ writeCheckpoint();\r
+ lastCheckpoint = System.currentTimeMillis();\r
+ }\r
+ }\r
+ }\r
+ }\r
+\r
+ private void writeCheckpoint()\r
+ {\r
+ _logClass.debug("writeCheckpoint()");\r
+ try\r
+ {\r
+ FileOutputStream fos = new FileOutputStream("spider.checkpoint", false);\r
+ ObjectOutputStream oos = new ObjectOutputStream(fos);\r
+ oos.writeObject(queue);\r
+ oos.writeObject(urlsDownloading);\r
+ oos.close();\r
+ }\r
+ catch(IOException ioe)\r
+ {\r
+ _logClass.warn("IO Exception attempting checkpoint: " + ioe.getMessage(), ioe);\r
+ }\r
+ }\r
+\r
+ public void readCheckpoint()\r
+ {\r
+ try\r
+ {\r
+ FileInputStream fis = new FileInputStream("spider.checkpoint");\r
+ ObjectInputStream ois = new ObjectInputStream(fis);\r
+ queue = (DownloadQueue) ois.readObject();\r
+ urlsDownloading = (Set) ois.readObject();\r
+ queue.queueURLs(urlsDownloading);\r
+ urlsDownloading.clear();\r
+ }\r
+ catch(Exception e)\r
+ {\r
+ _logClass.error("Caught exception reading checkpoint: " + e.getMessage(), e);\r
+ }\r
+ }\r
+\r
+ public void run()\r
+ {\r
+ HTMLParser htmlParser = new HTMLParser(config);\r
+ URLGetter urlGetter = new URLGetter(config);\r
+\r
+ int TIMES = 0;\r
+ while((queueSize() > 0 || downloadsInProgress > 0) && quit == false)\r
+ {\r
+ checkpointIfNeeded();\r
+ if(queueSize() == 0 && downloadsInProgress > 0)\r
+ {\r
+ // Wait for a download to finish before seeing if this thread should stop\r
+ try\r
+ {\r
+ Thread.sleep(QUEUE_CHECK_INTERVAL);\r
+ TIMES++;\r
+ if(TIMES>2)break;\r
+ }\r
+ catch(InterruptedException ignored)\r
+ {\r
+ }\r
+ // Have another go at the loop\r
+ continue;\r
+ }\r
+ else if(queueSize() == 0)\r
+ {\r
+ break;\r
+ }\r
+ URLToDownload nextURL;\r
+ synchronized(queue)\r
+ {\r
+ nextURL = queue.getNextInQueue();\r
+ downloadsInProgress++;\r
+ }\r
+ synchronized(urlsDownloading)\r
+ {\r
+ urlsDownloading.add(nextURL);\r
+ }\r
+ int newDepth = nextURL.getDepth() + 1;\r
+ int maxDepth = config.getMaxDepth();\r
+ synchronized(urlsDownloading)\r
+ {\r
+ urlsDownloading.remove(nextURL);\r
+ }\r
+ List newURLs = downloadURL(nextURL, urlGetter, htmlParser);\r
+\r
+ newURLs = filterURLs(newURLs);\r
+\r
+ ArrayList u2dsToQueue = new ArrayList();\r
+ for(Iterator i = newURLs.iterator(); i.hasNext(); )\r
+ {\r
+ URL u = (URL) i.next();\r
+ // Download if not yet downloaded, and the new depth is less than the maximum\r
+ synchronized(urlsDownloadedOrScheduled)\r
+ {\r
+ if(!urlsDownloadedOrScheduled.contains(u)\r
+ && (maxDepth == 0 || newDepth <= maxDepth))\r
+ {\r
+ u2dsToQueue.add(new URLToDownload(u, nextURL.getURL(), newDepth));\r
+ urlsDownloadedOrScheduled.add(u);\r
+ }\r
+ }\r
+ }\r
+ synchronized(queue)\r
+ {\r
+ queue.queueURLs(u2dsToQueue);\r
+ downloadsInProgress--;\r
+ }\r
+ }\r
+ _logClass.info("Spider thread stopping");\r
+ running--;\r
+ }\r
+\r
+ /**\r
+ * Get the size of the download queue in a thread-safe manner.\r
+ */\r
+ private int queueSize()\r
+ {\r
+ synchronized(queue)\r
+ {\r
+ return queue.size();\r
+ }\r
+ }\r
+\r
+ /**\r
+ * Get a URL, and return new URLs that are referenced from it.\r
+ *\r
+ * @return A List of URL objects.\r
+ */\r
+ private List downloadURL(URLToDownload url, URLGetter urlGetter, HTMLParser htmlParser)\r
+ {\r
+ _logClass.debug("downloadURL(" + url + ")");\r
+\r
+ // Bail out early if image and already on disk\r
+ URLObject obj = new URLObject(url.getURL(), config);\r
+ if(obj.existsOnDisk())\r
+ {\r
+ if(config.refreshHTMLs() && (obj.isHTML() || obj.isXML()))\r
+ {\r
+ _logClass.info("Q: [" + queue + "] " + url);\r
+ obj = urlGetter.getURL(url);\r
+ }\r
+ else if(config.refreshImages() && obj.isImage())\r
+ {\r
+ _logClass.info("Q: [" + queue + "] " + url);\r
+ obj = urlGetter.getURL(url);\r
+ }\r
+ }\r
+ else\r
+ {\r
+ _logClass.info("Q: [" + queue + "] " + url);\r
+ obj = urlGetter.getURL(url);\r
+ }\r
+\r
+ if(obj == null)\r
+ {\r
+ return new ArrayList();\r
+ }\r
+\r
+ if(!obj.existsOnDisk())\r
+ {\r
+ obj.writeToFile();\r
+ }\r
+\r
+ if(obj.isHTML() || obj.isXML())\r
+ {\r
+ return htmlParser.parseLinksInDocument(url.getURL(), obj.getStringContent());\r
+ }\r
+ else if(obj.isImage())\r
+ {\r
+ return new ArrayList();\r
+ }\r
+ else\r
+ {\r
+ _logClass.warn("Unsupported content type received: " + obj.getContentType());\r
+ _logClass.info("URL was " + url);\r
+ return new ArrayList();\r
+ }\r
+ }\r
+\r
+ private List filterURLs(List URLs)\r
+ {\r
+ String match = config.getURLMatch();\r
+ ArrayList retVal = new ArrayList();\r
+\r
+ synchronized(urlsDownloadedOrScheduled)\r
+ {\r
+ for(Iterator i = URLs.iterator(); i.hasNext(); )\r
+ {\r
+ URL u = (URL) i.next();\r
+ if(urlsDownloadedOrScheduled.contains(u))\r
+ {\r
+ continue;\r
+ }\r
+\r
+ String s = u.toExternalForm();\r
+ if(s.indexOf(match) != -1)\r
+ {\r
+ retVal.add(u);\r
+ }\r
+ }\r
+ }\r
+ return retVal;\r
+ }\r
+\r
+}\r