headless/src/main/java/com/galois/fiveui/BasicCrawler.java


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90

package com.galois.fiveui;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

import java.util.List;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;

import com.google.common.base.Function;

/**
 * A crawler4j {@link WebCrawler} that decides which URLs to follow using a
 * caller-supplied predicate and records every visited URL into a
 * caller-supplied list.
 *
 * <p>Configuration is held in static fields and must be set with
 * {@link #configure(Function, List)} before a crawl is started.
 *
 * <p>NOTE(review): the URL list is shared through a static field and is
 * appended to without synchronization here; if the crawl runs several
 * crawler instances concurrently, the caller presumably needs to pass a
 * thread-safe list -- confirm against the code that starts the crawl.
 *
 * @author bjones
 */
public class BasicCrawler extends WebCrawler {

    private static final Logger logger = Logger.getLogger(BasicCrawler.class);

    /**
     * URLs ending in one of these file extensions are never crawled. The
     * match is case-sensitive and anchored at the end of the whole URL.
     */
    private static final Pattern FILTERS = Pattern.compile(
            ".*(\\.(css|js|bmp|gif|jpe?g"
            + "|png|tiff?|mid|mp2|mp3|mp4"
            + "|wav|avi|mov|mpeg|ram|m4v|pdf"
            + "|rm|smil|wmv|swf|wma|zip|rar|gz))$");

    /** Decides whether a URL may be crawled; {@code null} until configured. */
    public static Function<String, Boolean> _predicate;

    /** Receives the URL of every visited page; may be {@code null}. */
    public static List<String> _urls;

    /**
     * Configure static properties of the class before a crawl.
     *
     * @param pred URLs will be crawled only if pred.apply(URL) is
     *             true
     * @param urls reference to a list of strings which the crawler will
     *             append URLs to as it works
     */
    public static void configure(Function<String, Boolean> pred, List<String> urls) {
        _predicate = pred;
        _urls = urls;
    }

    /**
     * Specify whether the given url should be crawled or not.
     *
     * <p>A URL is crawled only when it does not end in one of the filtered
     * file extensions and the configured predicate returns {@code true} for
     * it. If no predicate has been configured, or the predicate returns
     * {@code null}, the URL is not crawled.
     *
     * @param url the candidate URL
     * @return {@code true} if the URL should be visited
     */
    @Override
    public boolean shouldVisit(WebURL url) {
        String href = url.getURL();
        // Read the static field once so the null check and the call below
        // see the same object even if configure() runs in between.
        Function<String, Boolean> pred = _predicate;

        boolean visit = false;
        if (!FILTERS.matcher(href).matches()) {
            if (pred == null) {
                logger.debug("no predicate configured; call configure() before crawling");
            } else {
                // Boolean.TRUE.equals() avoids unboxing a null answer.
                visit = Boolean.TRUE.equals(pred.apply(href));
            }
        }

        logger.debug("saying " + (visit ? "yes" : "no") + " to " + href);
        return visit;
    }

    /**
     * This function is called when a page is fetched and ready to be processed
     * by the program. It logs details of the page at debug level and appends
     * the page's URL to the configured URL list, if there is one.
     *
     * @param page the fetched page
     */
    @Override
    public void visit(Page page) {
        WebURL webUrl = page.getWebURL();
        String url = webUrl.getURL();

        // Skip building the diagnostic strings entirely unless they will
        // actually be emitted.
        if (logger.isDebugEnabled()) {
            logger.debug(" - Docid: " + webUrl.getDocid());
            logger.debug(" - URL: " + url);
            logger.debug(" - Domain: '" + webUrl.getDomain() + "'");
            logger.debug(" - Sub-domain: '" + webUrl.getSubDomain() + "'");
            logger.debug(" - Path: '" + webUrl.getPath() + "'");
            logger.debug(" - Parent page: " + webUrl.getParentUrl());

            if (page.getParseData() instanceof HtmlParseData) {
                HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
                String text = htmlParseData.getText();
                String html = htmlParseData.getHtml();
                List<WebURL> links = htmlParseData.getOutgoingUrls();

                logger.debug(" -- Text length: " + text.length());
                logger.debug(" -- Html length: " + html.length());
                logger.debug(" -- Number of outgoing links: " + links.size());
            }
            logger.debug(" - =============");
        }

        // append to URLs list
        List<String> urls = _urls;
        if (urls != null) {
            urls.add(url);
        }
    }
}