path: root/batchtools/headless/src/main/java/com/galois/fiveui/BasicCrawlerController.java
package com.galois.fiveui;

import java.util.ArrayList;
import java.util.List;

import com.google.common.base.Function;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

/**
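 * Configures and runs a single crawler4j crawl from a seed URL. Crawled URLs
 * must satisfy the supplied predicate, and the data collected by
 * {@link BasicCrawler} is returned from {@link #go()}.
 *
 * <p>A minimal usage sketch; the seed URL, domain, and parameter values below
 * are illustrative assumptions, not defaults:
 *
 * <pre>{@code
 * BasicCrawlerController con =
 *     new BasicCrawlerController("http://example.com/", "http://example.com",
 *                                2,    // depth
 *                                50,   // maxFetch
 *                                1000, // politeness delay in ms
 *                                1,    // threads
 *                                "/tmp/crawl-data");
 * List<String> results = con.go();
 * }</pre>
 *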
 * @author bjones
 */
public class BasicCrawlerController {

	private String seed;
	private String tmpDir;
	private int depth;
	private int maxFetch;
	private int politeness;
	private int threads;
	private Function<String, Boolean> predicate;
	
	/**
	 * Initialize a basic web crawler controller.
	 * 
	 * @param seed URL to start the crawl
	 * @param domain string that all crawled page URLs must start with
	 * @param depth maximum depth to crawl
	 * @param maxFetch maximum number of pages to crawl
	 * @param politeness minimum time in milliseconds to wait between requests to the same domain
	 * @param threads number of concurrent threads to use while crawling
	 * @param tmpDir temporary directory to store intermediate crawl data
	 *               (must exist and be read/write before crawl starts)
	 */
	public BasicCrawlerController (String seed, final String domain, int depth, int maxFetch,
			                       int politeness, int threads, String tmpDir) {
		this.seed = seed;
		this.predicate = new Function<String, Boolean>() {
			public Boolean apply(String s) {
				return s.startsWith(domain);
			}
		};
		this.depth = depth;
		this.maxFetch = maxFetch;
		this.politeness = politeness;
		this.threads = threads;
		this.tmpDir = tmpDir;
	}
	
	/**
	 * Initialize a basic web crawler controller.
	 * 
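	 * <p>A sketch of supplying a custom predicate, for example restricting the
	 * crawl to an (assumed, illustrative) sub-path of a site:
	 *
	 * <pre>{@code
	 * Function<String, Boolean> underDocs = new Function<String, Boolean>() {
	 *     public Boolean apply(String url) {
	 *         return url.startsWith("http://example.com/docs/");
	 *     }
	 * };
	 * BasicCrawlerController con =
	 *     new BasicCrawlerController("http://example.com/docs/", underDocs,
	 *                                2, 50, 1000, 1, "/tmp/crawl-data");
	 * }</pre>
	 *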
	 * @param seed URL to start the crawl
	 * @param pred a {@code Function<String, Boolean>} used as a predicate that all crawled URLs must pass
	 * @param depth maximum depth to crawl
	 * @param maxFetch maximum number of pages to crawl
	 * @param politeness minimum time in milliseconds to wait between requests to the same domain
	 * @param threads number of concurrent threads to use while crawling
	 * @param tmpDir temporary directory to store intermediate crawl data
	 *               (must exist and be read/write before crawl starts)
	 */
	public BasicCrawlerController (String seed, Function<String, Boolean> pred, int depth, int maxFetch,
			                       int politeness, int threads, String tmpDir) {
		this.seed = seed;
		this.predicate = pred;
		this.depth = depth;
		this.maxFetch = maxFetch;
		this.politeness = politeness;
		this.threads = threads;
		this.tmpDir = tmpDir;
	}
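
	/**
	 * Run the crawl and return the results collected by {@link BasicCrawler}.
	 * This call blocks until the crawl finishes.
	 *
	 * @return the data collected by {@link BasicCrawler} during the crawl
	 * @throws Exception if crawler4j fails to initialize or run the crawl
	 */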
	
	public List<String> go() throws Exception {

		/*
		 * crawlStorageFolder is a folder where intermediate crawl data is
		 * stored.
		 */
		String crawlStorageFolder = this.tmpDir;

		/*
		 * numberOfCrawlers is the number of concurrent crawler threads to
		 * start.
		 */
		int numberOfCrawlers = this.threads;

		CrawlConfig config = new CrawlConfig();

		config.setCrawlStorageFolder(crawlStorageFolder);

		/*
		 * Be polite: wait at least the configured politeness delay (in
		 * milliseconds) between successive requests to the same host.
		 */
		config.setPolitenessDelay(this.politeness);

		/*
		 * Set the maximum crawl depth. The default value is -1, for unlimited
		 * depth.
		 */
		config.setMaxDepthOfCrawling(this.depth);

		/*
		 * Set the maximum number of pages to fetch. The default value is -1,
		 * for an unlimited number of pages.
		 */
		config.setMaxPagesToFetch(this.maxFetch);
		
		/*
		 * Disable resumable crawling so that intermediate crawl data is not
		 * kept for resuming an interrupted crawl.
		 */
		config.setResumableCrawling(false);
		
		/*
		 * Instantiate the controller for this crawl.
		 */
		PageFetcher pageFetcher = new PageFetcher(config);
		RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
		//robotstxtConfig.setEnabled(false); // uncomment if you want to ignore robots.txt
		RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
		CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

		// add a seed URL
		controller.addSeed(this.seed);

		/*
		 * Set up storage for data collected by the BasicCrawler class. The
		 * configure call is static, so the predicate and the result list are
		 * shared by every crawler instance started below.
		 */
		List<String> store = new ArrayList<String>();
		BasicCrawler.configure(this.predicate, store);
		
		/*
		 * Start the crawl. This is a blocking operation.
		 */
		try {
			controller.start(BasicCrawler.class, numberOfCrawlers);
		} finally {
			controller.Shutdown();
		}
		
		/*
		 * Extract and return data collected by BasicCrawler
		 */
		return store;
	}
}