package com.galois.fiveui;

import java.io.File;
import java.io.IOException;
import java.net.BindException;
import java.util.List;

import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;

import org.apache.log4j.Level;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.log4j.BasicConfigurator;

import com.google.common.base.Function;
import com.google.common.io.Files;

import edu.uci.ics.crawler4j.util.IO;

public class CrawlTest {
	
	// TODO need a system independent way of getting the resources path
	private static String resourceDir = "src/test/resources/crawlTest/";
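	// One system-independent alternative (a sketch, not verified against this
	// build setup): construct the path with java.nio.file.Paths so separators
	// are correct on every platform, assuming tests run from the module root:
	//   Paths.get("src", "test", "resources", "crawlTest").toString() + File.separator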
	private static Logger logger = Logger.getLogger("com.galois.fiveui.CrawlTest");
	private static NanoHTTPD httpServer = null;
	
	@BeforeClass
	public static void setupCrawlTests() {
		// Set up a simple configuration that logs to the console.
		BasicConfigurator.configure();
		logger.setLevel(Level.DEBUG);
		Logger root = Logger.getRootLogger();
		root.setLevel(Level.ERROR);

		// Start up a local web server for the crawl tests.
		logger.info("Starting NanoHTTPD webserver in " + resourceDir + " on port 8080 ...");
		try {
			httpServer = new NanoHTTPD(8080, new File(resourceDir));
		} catch (BindException e) {
			logger.info("assuming that a local web server is already running");
		} catch (IOException e1) {
			e1.printStackTrace();
			Assert.fail("failed to start NanoHTTPD in resource directory");
		}
	}
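	// Sketch of an alternative to the hard-coded port (assumes the NanoHTTPD
	// constructor used above accepts any port): ask the OS for a free port
	// first, which avoids the BindException race entirely. The tests below
	// would then need to build their URLs from `port` instead of 8080.
	//   ServerSocket probe = new ServerSocket(0);
	//   int port = probe.getLocalPort();
	//   probe.close();
	//   httpServer = new NanoHTTPD(port, new File(resourceDir));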
	
	@AfterClass
	public static void teardown() {
		LogManager.shutdown();
		if (httpServer != null) { // null if setup found the port already bound
			httpServer.stop();
		}
	}
	
	@Ignore("Requires Internet access")
	@Test
	public void corpDotGaloisCrawlTest() {
		File tmpPath = Files.createTempDir();
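		// The constructor arguments below appear to be: seed URL, crawl domain,
		// depth, max pages to fetch, politeness delay (ms), number of crawler
		// threads, and a scratch directory. This reading is inferred from the
		// local-crawl overload used later in this file, not confirmed against
		// BasicCrawlerController itself.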
		BasicCrawlerController con = 
				new BasicCrawlerController("http://corp.galois.com",
						                   "http://corp.galois.com", 
						                   2, 5, 1000, 1,
						                   tmpPath.getAbsolutePath());
		List<String> urls = null;
		try {
			urls = con.go();
			System.out.println(urls.toString());
		} catch (Exception e) {
			e.printStackTrace();
			Assert.fail("failed to complete webcrawl");
		} finally {
		} finally {
			IO.deleteFolder(tmpPath);
		}
		
		Assert.assertNotNull("webcrawl returned no URL list", urls);
		Assert.assertEquals("unexpected number of URLs crawled", 5, urls.size());
	}
	
	@Test
	public void testLocalCrawlDepth3one() {
		doLocalCrawlTest("http://localhost:8080/one.html", 3, 10, 9);
	}
	
	@Test
	public void testLocalCrawlDepth3two() {
		doLocalCrawlTest("http://localhost:8080/two.html", 3, 10, 3);
	}
	
	@Test
	public void testLocalCrawlDepth0one() {
		doLocalCrawlTest("http://localhost:8080/one.html", 0, 10, 1);
	}
	
	@Test
	public void testCrawlWithPredicate() {
		CrawlParameters c = new CrawlParameters("5 5 100 *one.html");
		doLocalCrawlTest("http://localhost:8080/one.html", c.matchFcn, c.depth, c.maxFetch, 1);
	}
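	// Note on the CrawlParameters string above: the four space-separated fields
	// appear to encode depth, max pages to fetch, politeness delay (ms), and a
	// URL glob pattern. This reading is inferred from the numeric arguments in
	// the other tests, not confirmed against CrawlParameters itself.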
	
	public void doLocalCrawlTest(String seed, int depth, int maxFetch, int oracle) {
		Function<String, Boolean> pred = new Function<String, Boolean>() {
			public Boolean apply(String s) {
				return s.startsWith("http");
			}
		};
		doLocalCrawlTest(seed, pred, depth, maxFetch, oracle);
	}
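	// Aside: on Java 8 or later, the anonymous Guava Function above could be
	// written as a lambda, since com.google.common.base.Function declares a
	// single abstract method:
	//   Function<String, Boolean> pred = s -> s.startsWith("http");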
	
	public void doLocalCrawlTest(String seed, Function<String, Boolean> pred, int depth, int maxFetch, int oracle) {	

	    logger.info("Starting localCrawlTest ...");
	    logger.info("  seed " + seed + ", depth " + depth);
	    
	    File tmpPath = Files.createTempDir();
		BasicCrawlerController con = 
			new BasicCrawlerController(seed, pred, depth, maxFetch, 100, 1,
					                   tmpPath.getAbsolutePath());
		List<String> urls = null;
		try {
			logger.info("Starting webcrawl ...");
			urls = con.go();
			logger.info("RETURN -- " + urls.toString());
		} catch (Exception e) {
			e.printStackTrace();
			Assert.assertTrue("failed to run webcrawler", false);
		} finally {
			IO.deleteFolder(tmpPath);
		}
		
		// Assert that the crawl returned exactly `oracle` URLs. Check for null
		// first so a failed crawl reports cleanly instead of raising an NPE
		// while building the assertion message.
		Assert.assertNotNull("webcrawl returned no URL list", urls);
		Assert.assertEquals("unexpected number of URLs crawled", oracle, urls.size());
	}

}