package com.galois.fiveui;

import java.io.File;
import java.io.IOException;
import java.net.BindException;
import java.util.List;

import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;

import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.Level;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;

import com.google.common.base.Function;
import com.google.common.io.Files;

import edu.uci.ics.crawler4j.util.IO;

public class CrawlTest {

    // TODO need a system-independent way of getting the resources path
    private static String resourceDir = "src/test/resources/crawlTest/";
    private static Logger logger = Logger.getLogger("com.galois.fiveui.CrawlTest");
    private static NanoHTTPD httpServer = null;

    @BeforeClass
    public static void setupCrawlTests() {
        // Set up a simple configuration that logs to the console.
        BasicConfigurator.configure();
        logger.setLevel(Level.DEBUG);
        Logger root = Logger.getRootLogger();
        root.setLevel(Level.ERROR);

        // Start a local web server for the crawl tests.
        logger.info("Starting NanoHTTPD webserver in " + resourceDir + " on port 8080 ...");
        try {
            httpServer = new NanoHTTPD(8080, new File(resourceDir));
        } catch (BindException e) {
            // Port 8080 is taken; assume a server is already serving the test resources.
            logger.info("assuming that local web server is already running");
        } catch (IOException e1) {
            e1.printStackTrace();
            Assert.fail("failed to start NanoHTTPD in resource directory");
        }
    }

    @AfterClass
    public static void teardown() {
        LogManager.shutdown();
        // httpServer is null if setup fell through to the BindException branch.
        if (httpServer != null) {
            httpServer.stop();
        }
    }

    // Requires Internet access
    // @Test
    public void corpDotGaloisCrawlTest() {
        File tmpPath = Files.createTempDir();
        BasicCrawlerController con =
                new BasicCrawlerController("http://corp.galois.com",
                                           "http://corp.galois.com",
                                           2, 5, 1000, 1,
                                           tmpPath.getAbsolutePath());
        List<String> urls = null;
        try {
            urls = con.go();
            System.out.println(urls.toString());
        } catch (Exception e) {
            // Print the trace before failing, or it is lost when fail() throws.
            e.printStackTrace();
            Assert.fail("failed to complete webcrawl");
        } finally {
            IO.deleteFolder(tmpPath);
        }
        Assert.assertTrue("expected 5 URLs from crawl of corp.galois.com",
                          urls != null && urls.size() == 5);
    }

    @Test
    public void testLocalCrawlDepth3one() {
        doLocalCrawlTest("http://localhost:8080/one.html", 3, 10, 9);
    }

    @Test
    public void testLocalCrawlDepth3two() {
        doLocalCrawlTest("http://localhost:8080/two.html", 3, 10, 3);
    }

    @Test
    public void testLocalCrawlDepth0one() {
        doLocalCrawlTest("http://localhost:8080/one.html", 0, 10, 1);
    }

    @Test
    public void testCrawlWithPredicate() {
        CrawlParameters c = new CrawlParameters("5 5 100 *one.html");
        doLocalCrawlTest("http://localhost:8080/one.html", c.matchFcn, c.depth, c.maxFetch, 1);
    }

    public void doLocalCrawlTest(String seed, int depth, int maxFetch, int oracle) {
        // Default predicate: follow any http(s) URL.
        Function<String, Boolean> pred = new Function<String, Boolean>() {
            public Boolean apply(String s) {
                return s.startsWith("http");
            }
        };
        doLocalCrawlTest(seed, pred, depth, maxFetch, oracle);
    }

    public void doLocalCrawlTest(String seed, Function<String, Boolean> pred,
                                 int depth, int maxFetch, int oracle) {
        logger.info("Starting localCrawlTest ...");
        logger.info("  seed " + seed + ", depth " + depth);
        File tmpPath = Files.createTempDir();
        BasicCrawlerController con =
                new BasicCrawlerController(seed, pred, depth, maxFetch, 100, 1,
                                           tmpPath.getAbsolutePath());
        List<String> urls = null;
        try {
            logger.info("Starting webcrawl ...");
            urls = con.go();
            logger.info("RETURN -- " + urls.toString());
        } catch (Exception e) {
            e.printStackTrace();
            Assert.fail("failed to run webcrawler");
        } finally {
            IO.deleteFolder(tmpPath);
        }
        // Assert that the crawl returned exactly `oracle` URLs. Check for null
        // first so building the failure message cannot itself throw an NPE.
        Assert.assertNotNull("webcrawl returned no URL list", urls);
        Assert.assertTrue("got " + urls.size() + " URLs, expected " + oracle,
                          urls.size() == oracle);
    }
}