blob: aa3cc34593cb76165de50b85921a9fabfef62a66 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
|
package com.galois.fiveui;
import com.google.common.base.Function;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import org.apache.log4j.Logger;
public class CrawlParameters {
private static Logger logger = Logger.getLogger("com.galois.fiveui.CrawlParameters");
public int depth;
public int maxFetch;
public int politeness;
public String match;
public Function<String, Boolean> matchFcn;
private Boolean _doNotCrawl;
private String _str;
/**
* Construct (parse) a crawl type object from a string description
*
* A valid description is a whitespace separated list as follows:
* "<depth> <maxFetch> <politeness> <start>"
* where:
* <ol>
* <li> (depth :: int) depth of the crawl </li>
* <li> (maxFetch :: int) maximum number of pages to crawl </li>
* <li> (politeness :: int) number of milliseconds between hits on same domain </li>
* <li> (match :: String) glob pattern to match URLs </li>
* </ol>
* or the string "none" which is, in spirit, equivalent to "0 1 1000 *",
* but in practice the webcrawl is skipped entirely in this case.
*
* @param desc a string description of the crawl type
* @throws Exception
*/
public CrawlParameters(String desc) {
String[] l = desc.split("\\s+");
if (desc == "none" || l.length != 4) {
this._doNotCrawl = true;
this._str = desc;
logger.debug("setting doNotCrawl = True");
return;
} else {
this.depth = Integer.parseInt(l[0]);
this.maxFetch = Integer.parseInt(l[1]);
this.politeness = Integer.parseInt(l[2]);
this.match = l[3];
this._doNotCrawl = false;
this._str = desc;
this.matchFcn = compileMatchFcn(this.match);
logger.debug("setting depth: " + this.depth);
logger.debug("setting maxFetch: " + this.maxFetch);
logger.debug("setting politeness: " + this.politeness);
logger.debug("setting match: " + this.match);
}
}
public static Function<String, Boolean> compileMatchFcn(String glob) {
String reg = glob.replaceAll("\\.", "\\\\.").replaceAll("\\*", ".*");
final Pattern pat = Pattern.compile(reg);
return new Function<String, Boolean>() {
public Boolean apply(String input) {
Matcher m = pat.matcher(input);
return m.matches();
}
};
}
public static CrawlParameters none() {
return new CrawlParameters("none");
}
public Boolean isNone() {
return this._doNotCrawl;
}
public String toString() {
return _str;
}
}
|