Diffstat (limited to 'batchtools/headless/src/main/java/com/galois/fiveui/BasicCrawlerController.java')
-rw-r--r-- | batchtools/headless/src/main/java/com/galois/fiveui/BasicCrawlerController.java | 150 |
1 file changed, 150 insertions, 0 deletions
diff --git a/batchtools/headless/src/main/java/com/galois/fiveui/BasicCrawlerController.java b/batchtools/headless/src/main/java/com/galois/fiveui/BasicCrawlerController.java
new file mode 100644
index 0000000..79338ec
--- /dev/null
+++ b/batchtools/headless/src/main/java/com/galois/fiveui/BasicCrawlerController.java
@@ -0,0 +1,150 @@
+package com.galois.fiveui;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import com.google.common.base.Function;
+
+import edu.uci.ics.crawler4j.crawler.CrawlConfig;
+import edu.uci.ics.crawler4j.crawler.CrawlController;
+import edu.uci.ics.crawler4j.fetcher.PageFetcher;
+import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
+import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
+
+/**
+ * @author bjones
+ */
+public class BasicCrawlerController {
+
+    private String seed;
+    private String tmpDir;
+    private int depth;
+    private int maxFetch;
+    private int politeness;
+    private int threads;
+    private Function<String, Boolean> predicate;
+
+    /**
+     * Initialize a basic web crawler controller.
+     *
+     * @param seed       URL to start the crawl from
+     * @param domain     string that all crawled page URLs must start with
+     * @param depth      maximum depth to crawl
+     * @param maxFetch   maximum number of pages to crawl
+     * @param politeness time in milliseconds to wait between requests to the same domain
+     * @param threads    number of concurrent threads to use while crawling
+     * @param tmpDir     temporary directory for intermediate crawl data
+     *                   (must exist and be readable/writable before the crawl starts)
+     */
+    public BasicCrawlerController(String seed, final String domain, int depth, int maxFetch,
+                                  int politeness, int threads, String tmpDir) {
+        this.seed = seed;
+        this.predicate = new Function<String, Boolean>() {
+            public Boolean apply(String s) {
+                return s.startsWith(domain);
+            }
+        };
+        this.depth = depth;
+        this.maxFetch = maxFetch;
+        this.politeness = politeness;
+        this.threads = threads;
+        this.tmpDir = tmpDir;
+    }
+
+    /**
+     * Initialize a basic web crawler controller.
+     *
+     * @param seed       URL to start the crawl from
+     * @param pred       predicate that every crawled URL must satisfy
+     * @param depth      maximum depth to crawl
+     * @param maxFetch   maximum number of pages to crawl
+     * @param politeness time in milliseconds to wait between requests to the same domain
+     * @param threads    number of concurrent threads to use while crawling
+     * @param tmpDir     temporary directory for intermediate crawl data
+     *                   (must exist and be readable/writable before the crawl starts)
+     */
+    public BasicCrawlerController(String seed, Function<String, Boolean> pred, int depth, int maxFetch,
+                                  int politeness, int threads, String tmpDir) {
+        this.seed = seed;
+        this.predicate = pred;
+        this.depth = depth;
+        this.maxFetch = maxFetch;
+        this.politeness = politeness;
+        this.threads = threads;
+        this.tmpDir = tmpDir;
+    }
+
+    public List<String> go() throws Exception {
+
+        /*
+         * crawlStorageFolder is the folder where intermediate crawl data
+         * is stored.
+         */
+        String crawlStorageFolder = this.tmpDir;
+
+        /*
+         * numberOfCrawlers is the number of concurrent threads that
+         * should be started for crawling.
+         */
+        int numberOfCrawlers = this.threads;
+
+        CrawlConfig config = new CrawlConfig();
+
+        config.setCrawlStorageFolder(crawlStorageFolder);
+
+        /*
+         * Be polite: wait at least `politeness` milliseconds between
+         * requests to the same domain.
+         */
+        config.setPolitenessDelay(this.politeness);
+
+        /*
+         * Set the maximum crawl depth. The default value is -1 for
+         * unlimited depth.
+         */
+        config.setMaxDepthOfCrawling(this.depth);
+
+        /*
+         * Set the maximum number of pages to crawl. The default value
+         * is -1 for an unlimited number of pages.
+         */
+        config.setMaxPagesToFetch(this.maxFetch);
+
+        /*
+         * Disable resumable crawling so the temporary crawl storage is
+         * discarded once we're done.
+         */
+        config.setResumableCrawling(false);
+
+        /*
+         * Instantiate the controller for this crawl.
+         */
+        PageFetcher pageFetcher = new PageFetcher(config);
+        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
+        // robotstxtConfig.setEnabled(false); // uncomment to ignore robots.txt
+        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
+        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
+
+        // Add the seed URL.
+        controller.addSeed(this.seed);
+
+        /*
+         * Set up storage for data collected by the BasicCrawler class.
+         */
+        List<String> store = new ArrayList<String>();
+        BasicCrawler.configure(this.predicate, store);
+
+        /*
+         * Start the crawl. This is a blocking operation.
+         */
+        try {
+            controller.start(BasicCrawler.class, numberOfCrawlers);
+        } finally {
+            controller.Shutdown();
+        }
+
+        /*
+         * Extract and return the data collected by BasicCrawler.
+         */
+        return store;
+    }
+}
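For context, here is a minimal usage sketch of the new class (not part of this commit). The seed URL, crawl limits, and temporary directory below are illustrative values; go() blocks until the crawl completes and returns the list of URLs that passed the predicate.

import java.util.List;

public class CrawlExample {
    public static void main(String[] args) throws Exception {
        BasicCrawlerController con = new BasicCrawlerController(
                "http://example.com",  // seed URL (illustrative)
                "http://example.com",  // prefix every crawled URL must match
                2,                     // maximum crawl depth
                50,                    // maximum number of pages to fetch
                1000,                  // politeness delay, in milliseconds
                2,                     // number of crawler threads
                "/tmp/crawl");         // pre-existing, writable temp directory
        List<String> urls = con.go();  // blocks until the crawl finishes
        for (String url : urls) {
            System.out.println(url);
        }
    }
}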
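BasicCrawler itself is not part of this diff. Given the BasicCrawler.configure(predicate, store) call above and the crawler4j 3.x API this code appears to target, it is presumably a WebCrawler subclass shaped roughly like the sketch below; this is an assumption about its interface, not the actual FiveUI implementation.

package com.galois.fiveui;

import java.util.List;

import com.google.common.base.Function;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;

// Hypothetical sketch of the BasicCrawler referenced by the controller.
public class BasicCrawler extends WebCrawler {

    // Static fields because crawler4j instantiates crawler classes
    // reflectively; configure() must run before the crawl starts.
    private static Function<String, Boolean> predicate;
    private static List<String> store;

    public static void configure(Function<String, Boolean> pred, List<String> sink) {
        predicate = pred;
        store = sink;
    }

    @Override
    public boolean shouldVisit(WebURL url) {
        // Follow only URLs that satisfy the configured predicate.
        return predicate.apply(url.getURL());
    }

    @Override
    public void visit(Page page) {
        // Record each visited URL for the controller to return. With
        // multiple crawler threads, access to the shared list may need
        // external synchronization.
        store.add(page.getWebURL().getURL());
    }
}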