aboutsummaryrefslogtreecommitdiff
path: root/src/batchtools/headless/src/main/java/com/galois/fiveui/CrawlParameters.java
blob: a07d43aecb26c3a05ff8da653d0d9187041cd85c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
package com.galois.fiveui;

import com.google.common.base.Function;
import java.util.regex.Pattern;
import java.util.regex.Matcher;

import org.apache.log4j.Logger;

/**
 * Parameters controlling a web crawl, parsed from a short textual description.
 *
 * An instance either describes a real crawl (depth, page limit, politeness
 * delay and a URL glob) or represents "do not crawl at all"; use
 * {@link #isNone()} to tell the two apart. When {@link #isNone()} is true the
 * numeric fields keep their default value of 0 and {@link #match} and
 * {@link #matchFcn} are null.
 */
public class CrawlParameters {
	
	private static final Logger logger = Logger.getLogger("com.galois.fiveui.CrawlParameters");
	
	/** Depth of the crawl. */
	public int depth;
	/** Maximum number of pages to crawl. */
	public int maxFetch;
	/** Number of milliseconds between hits on the same domain. */
	public int politeness;
	/** Glob pattern that URLs must match; null when the crawl is skipped. */
	public String match;
	/** Compiled form of {@link #match}; null when the crawl is skipped. */
	public Function<String, Boolean> matchFcn;
	
	private final boolean _doNotCrawl;
	private final String _str;
	
	/**
	 * Construct (parse) a crawl type object from a string description.
	 * 
	 * A valid description is a whitespace separated list as follows:
	 * {@code "<depth> <maxFetch> <politeness> <match>"}
	 * where:
	 * <ol>
	 *   <li> (depth :: int) depth of the crawl </li>
	 *   <li> (maxFetch :: int) maximum number of pages to crawl </li>
	 *   <li> (politeness :: int) number of milliseconds between hits on same domain </li>
	 *   <li> (match :: String) glob pattern to match URLs </li>
	 * </ol>
	 * or the string "none" which is, in spirit, equivalent to "0 1 1000 *",
	 * but in practice the webcrawl is skipped entirely in this case.
	 * 
	 * Leading and trailing whitespace is ignored. Any description that does
	 * not consist of exactly four fields is treated like "none".
	 * 
	 * @param desc a string description of the crawl type
	 * @throws NumberFormatException if the description has four fields but one
	 *         of the first three is not a valid integer
	 * @throws NullPointerException if {@code desc} is null
	 */
	public CrawlParameters(String desc) {
		// Trim first: String.split keeps a leading empty token, which would
		// otherwise make " 0 1 1000 *" look like a five-field description.
		String trimmed = desc.trim();
		String[] fields = trimmed.split("\\s+");
		this._str = desc;
		// Compare by content; == on strings only tests object identity.
		if ("none".equals(trimmed) || fields.length != 4) {
			this._doNotCrawl = true;
			logger.debug("setting doNotCrawl = True");
		} else {
			this.depth = Integer.parseInt(fields[0]);
			this.maxFetch = Integer.parseInt(fields[1]);
			this.politeness = Integer.parseInt(fields[2]);
			this.match = fields[3];
			this._doNotCrawl = false;
			this.matchFcn = compileMatchFcn(this.match);
			logger.debug("setting depth: " + this.depth);
			logger.debug("setting maxFetch: " + this.maxFetch);
			logger.debug("setting politeness: " + this.politeness);
			logger.debug("setting match: " + this.match);
		}
	}
	
	/**
	 * Compile a glob into a predicate over strings.
	 * 
	 * The only wildcard is {@code *}, which matches any (possibly empty)
	 * sequence of characters. Every other character, including regex
	 * metacharacters such as {@code .} and {@code ?}, matches only itself.
	 * The whole input must match the glob.
	 * 
	 * @param glob the glob pattern, must not be null
	 * @return a function returning true iff its argument matches the glob;
	 *         it returns false for a null argument
	 */
	public static Function<String, Boolean> compileMatchFcn(String glob) {
		// Quote each literal run between '*' characters so that nothing in it
		// is interpreted as regex syntax, and join the runs with ".*".
		StringBuilder regex = new StringBuilder();
		int from = 0;
		int star = glob.indexOf('*', from);
		while (star >= 0) {
			if (star > from) {
				regex.append(Pattern.quote(glob.substring(from, star)));
			}
			regex.append(".*");
			from = star + 1;
			star = glob.indexOf('*', from);
		}
		if (from < glob.length()) {
			regex.append(Pattern.quote(glob.substring(from)));
		}
		final Pattern pat = Pattern.compile(regex.toString());
		return new Function<String, Boolean>() {
			public Boolean apply(String input) {
				if (input == null) {
					return false;
				}
				Matcher m = pat.matcher(input);
				return m.matches();
			}
		};
	}
	
	/**
	 * @return parameters representing "do not crawl"
	 */
	public static CrawlParameters none() {
		return new CrawlParameters("none");
	}
	
	/**
	 * @return true if the crawl should be skipped entirely
	 */
	public Boolean isNone() {
		return this._doNotCrawl;
	}
	
	/**
	 * @return the description string this object was constructed from
	 */
	@Override
	public String toString() {
		return _str;
	}
}