package com.galois.fiveui; import com.google.common.base.Function; import java.util.regex.Pattern; import java.util.regex.Matcher; import org.apache.log4j.Logger; public class CrawlParameters { private static Logger logger = Logger.getLogger("com.galois.fiveui.CrawlParameters"); public int depth; public int maxFetch; public int politeness; public String match; public Function matchFcn; private Boolean _doNotCrawl; private String _str; /** * Construct (parse) a crawl type object from a string description * * A valid description is a whitespace separated list as follows: * " " * where: *
    *
  1. (depth :: int) depth of the crawl
  2. *
  3. (maxFetch :: int) maximum number of pages to crawl
  4. *
  5. (politeness :: int) number of milliseconds between hits on same domain
  6. *
  7. (match :: String) glob pattern to match URLs
  8. *
* or the string "none" which is, in spirit, equivalent to "0 1 1000 *", * but in practice the webcrawl is skipped entirely in this case. * * @param desc a string description of the crawl type * @throws Exception */ public CrawlParameters(String desc) { String[] l = desc.split("\\s+"); if (desc == "none" || l.length != 4) { this._doNotCrawl = true; this._str = desc; logger.debug("setting doNotCrawl = True"); return; } else { this.depth = Integer.parseInt(l[0]); this.maxFetch = Integer.parseInt(l[1]); this.politeness = Integer.parseInt(l[2]); this.match = l[3]; this._doNotCrawl = false; this._str = desc; this.matchFcn = compileMatchFcn(this.match); logger.debug("setting depth: " + this.depth); logger.debug("setting maxFetch: " + this.maxFetch); logger.debug("setting politeness: " + this.politeness); logger.debug("setting match: " + this.match); } } public static Function compileMatchFcn(String glob) { String reg = glob.replaceAll("\\.", "\\\\.").replaceAll("\\*", ".*"); final Pattern pat = Pattern.compile(reg); return new Function() { public Boolean apply(String input) { Matcher m = pat.matcher(input); return m.matches(); } }; } public static CrawlParameters none() { return new CrawlParameters("none"); } public Boolean isNone() { return this._doNotCrawl; } public String toString() { return _str; } }