Webcrawler
Again, this is not really working code, as a number of its dependencies are missing; it is here purely for demonstration purposes.
public class WebCrawler<T extends WebPage> implements Iterable<T> { private final HashSet<T> visitedPages = new HashSet<T>(); private final LinkedList<Object> workQueue = new LinkedList<Object>(); private final PageProcessor<T> processor; // Map of URLs to pages. private final Map<String, WebPage> pages = new HashMap<String, WebPage>(); private final Predicate<Object> unvisited = new Predicate<Object>() { public boolean satisfies(Object page){ return !WebCrawler.this.visitedPages.contains(page);}}; public WebCrawler(PageProcessor<T> processor, String... urls){ this.processor = processor; for (String url : urls){ this.workQueue.add(processor.page(url));}} /* Iterator which iterates over all WebPages that haven't yet been visited. * It is thoroughly lazy and a web page will never be visited until it turns * up in this iterator.*/ public final Iterator<T> pageIterator = (Iterator<T>) new FlatteningIterator( new ListeningIterator<Object>( new FilterIterator<Object>( unvisited, new PoppingIterator<Object>(this.workQueue))){ @Override public void onNext(Object next){ if (next instanceof WebPage){ WebCrawler.this.visitedPages.add((T)next); WebCrawler.this.workQueue.add( new FilterIterator(unvisited, WebCrawler.this.processor.linkedPages((WebPage)next)));}}}); public Iterator<T> iterator(){ return IteratorUtils.link(this.visitedPages.iterator(), this.pageIterator); } } /** * Abstract class representing a mechanism for processing urls into pages. Contains * utility methods and a cacheing strategy. 
* * @author david */ public abstract class PageProcessor<T extends WebPage> implements Transformer<String, T> { private final PageCache<T> cache; private final IteratorTransformer<String, T> iteratorTransformer = new IteratorTransformer<String, T>(this); private Predicate<String> domain; public PageProcessor(Predicate<String> domain, PageCache<T> cache){ this.domain = domain; this.cache = cache;} public PageProcessor(String domainPrefix, PageCache<T> cache){ this(StringUtils.startsWith(domainPrefix), cache);} /** * Take the Url and return a WebPage corresponding to it. */ protected abstract T process(String url); public T transform(String url){ return this.page(url); } /** * If the page has previously been processed, retrieve it from the internal cache. * Else process it and put it in the eternal cache. */ public T page(String url){ T page = cache.getCachedPage(url); if (page == null){ page = this.process(url); cache.cachePage(page);} return page;} /** * Returns an iterator over all pages linked to by this page. */ public Iterator<T> linkedPages(WebPage page){ return iteratorTransformer.transform(new FilterIterator(domain, page.getLinkUrls()));} } /** * A very simple PageProcessor<WebPage> implementation based on the HTMLParser library * which uses a MapBackedPageCache. * * @author david */ public class HtmlParserPageProcessor extends PageProcessor<WebPage> { private static NodeFilter ALLOWED_TAGS = new NodeFilter(){ public boolean accept(Node node){ return (node instanceof LinkTag) || (node instanceof TitleTag);}}; public HtmlParserPageProcessor(Predicate<String> domain){ super(domain, new MapBackedPageCache<WebPage>());} public HtmlParserPageProcessor(String domain){ super(domain, new MapBackedPageCache<WebPage>());} /** * Fetches the resource represented by the URL, parses the HTML and extracts * the title element and all the links and uses them to build a WebPage object. 
*/ public WebPage process(String url){ try{ Parser parser = new Parser(url); NodeIterator iterator = parser.parse(ALLOWED_TAGS).elements(); String title = ""; List<String> links = new ArrayList<String>(); while (iterator.hasMoreNodes()){ Node node = iterator.nextNode(); if (node instanceof TitleTag) title = ((TitleTag)node).getTitle(); else if (node instanceof LinkTag) links.add(((LinkTag)node).extractLink());} return new WebPage(url, title, links);} catch (Exception e){ throw new RuntimeException(e); }} }