diff options
author | Rogan Creswick <creswick@gmail.com> | 2013-06-07 16:00:22 -0700 |
---|---|---|
committer | Rogan Creswick <creswick@gmail.com> | 2013-06-07 16:00:22 -0700 |
commit | 88c95d18a81e4f107cc4e5967bfa45d1bf4882a1 (patch) | |
tree | f1b8f5bb1bffd8ea84078d829248dddbdc2f3544 /src/batchtools/headless/src/main/java/com/galois/fiveui/CrawlParameters.java | |
parent | 04d3c6e96ed4dd528418fe71a85e72316ae5bba4 (diff) |
cleaned up some test files that broke during a merge
Diffstat (limited to 'src/batchtools/headless/src/main/java/com/galois/fiveui/CrawlParameters.java')
-rw-r--r-- | src/batchtools/headless/src/main/java/com/galois/fiveui/CrawlParameters.java | 84 |
1 file changed, 84 insertions, 0 deletions
package com.galois.fiveui;

import com.google.common.base.Function;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;

/**
 * Parameters controlling a headless web crawl, parsed from a compact
 * whitespace-separated string description (see {@link #CrawlParameters(String)}).
 */
public class CrawlParameters {

    private static final Logger logger =
            Logger.getLogger("com.galois.fiveui.CrawlParameters");

    // Public fields are part of the existing API of this class; callers read
    // them directly, so they are kept public and mutable.
    public int depth;                          // depth of the crawl
    public int maxFetch;                       // maximum number of pages to fetch
    public int politeness;                     // milliseconds between hits on the same domain
    public String match;                       // glob pattern crawled URLs must match
    public Function<String, Boolean> matchFcn; // compiled predicate form of `match`

    private Boolean _doNotCrawl; // true when the crawl should be skipped entirely
    private String _str;         // original description string, echoed by toString()

    /**
     * Construct (parse) a crawl parameters object from a string description.
     *
     * A valid description is a whitespace-separated list as follows:
     * "&lt;depth&gt; &lt;maxFetch&gt; &lt;politeness&gt; &lt;match&gt;"
     * where:
     * <ol>
     *   <li> (depth :: int) depth of the crawl </li>
     *   <li> (maxFetch :: int) maximum number of pages to crawl </li>
     *   <li> (politeness :: int) number of milliseconds between hits on same domain </li>
     *   <li> (match :: String) glob pattern to match URLs </li>
     * </ol>
     * or the string "none" which is, in spirit, equivalent to "0 1 1000 *",
     * but in practice the webcrawl is skipped entirely in this case.
     *
     * Any description that does not split into exactly four fields is treated
     * the same as "none" (doNotCrawl is set).
     *
     * @param desc a string description of the crawl type
     * @throws NumberFormatException if a numeric field fails to parse
     */
    public CrawlParameters(String desc) {
        String[] l = desc.split("\\s+");
        // BUG FIX: the original used `desc == "none"`, which compares object
        // references, not string contents; use equals() so any "none" string
        // is recognized, not just interned literals.
        if ("none".equals(desc) || l.length != 4) {
            this._doNotCrawl = true;
            this._str = desc;
            logger.debug("setting doNotCrawl = True");
        } else {
            this.depth = Integer.parseInt(l[0]);
            this.maxFetch = Integer.parseInt(l[1]);
            this.politeness = Integer.parseInt(l[2]);
            this.match = l[3];
            this._doNotCrawl = false;
            this._str = desc;
            this.matchFcn = compileMatchFcn(this.match);
            logger.debug("setting depth: " + this.depth);
            logger.debug("setting maxFetch: " + this.maxFetch);
            logger.debug("setting politeness: " + this.politeness);
            logger.debug("setting match: " + this.match);
        }
    }

    /**
     * Compile a glob pattern ("." literal, "*" wildcard) into a URL-matching
     * predicate backed by a regular expression.
     *
     * @param glob glob pattern, e.g. "http://*.example.com/*"
     * @return a function returning true iff its whole input matches the glob
     */
    public static Function<String, Boolean> compileMatchFcn(String glob) {
        // BUG FIX: the original wrote replaceAll("\\.", "\\.") — in a
        // replacement string "\\." collapses to just ".", so dots were left
        // as regex wildcards. "\\\\." yields the replacement "\." which
        // escapes the dot correctly. Dots must be escaped BEFORE "*" is
        // widened to ".*" so the inserted dot is not itself escaped.
        String reg = glob.replaceAll("\\.", "\\\\.").replaceAll("\\*", ".*");
        final Pattern pat = Pattern.compile(reg);
        return new Function<String, Boolean>() {
            public Boolean apply(String input) {
                Matcher m = pat.matcher(input);
                return m.matches();
            }
        };
    }

    /**
     * @return the distinguished "do not crawl" parameters object
     */
    public static CrawlParameters none() {
        return new CrawlParameters("none");
    }

    /**
     * @return true iff this object indicates the crawl should be skipped
     */
    public Boolean isNone() {
        return this._doNotCrawl;
    }

    @Override
    public String toString() {
        return _str;
    }
}