|
用 HttpUnit 写的 spider 程序:可以监测网站的错误页面!(1)
这个程序出自《Java Tools for Extreme Programming》一书。
import com.meterware.httpunit.*; import Java.util.HashSet; import Java.util.Set; public class CheckSite { private WebConversation conversation; private Set checkedLinks; private String host = "www.sohu.com"; public static void main(String[] args) throws Exception { CheckSite CS = new CheckSite(); CS.setUp(); CS.testEntireSite(); } public void setUp() { conversation = new WebConversation(); checkedLinks = new HashSet(); } public void testEntireSite() throws Exception { WebResponse response = conversation.getResponse("http://" + host); checkAllLinks(response); System.out.println("Site check finished. Link's checked: " + checkedLinks.size() + " : " + checkedLinks); } private void checkAllLinks(WebResponse response) throws Exception { if (!isHtml(response)) { return; } WebLink[] links = response.getLinks(); System.out.println(response.getTitle() + " -- links found = " + links.length); for (int i = 0; i < links.length; i++) { boolean newLink = checkedLinks.add(links[i].getURLString());
|