spider简单的爬虫程序!!!经典.doc_第1页
spider简单的爬虫程序!!!经典.doc_第2页
spider简单的爬虫程序!!!经典.doc_第3页
spider简单的爬虫程序!!!经典.doc_第4页
spider简单的爬虫程序!!!经典.doc_第5页
已阅读5页,还剩8页未读 继续免费阅读

下载本文档

版权说明:本文档由用户提供并上传,收益归属内容提供方,若内容存在侵权,请进行举报或认领

文档简介

/*
 * spider 简单的爬虫程序 — simple spider/crawler program (article dated 2008-10-10 16:29).
 *
 * 1. Prerequisites (NOTE(review): host names of the download URLs were stripped by the
 *    document extraction — presumably the projects' SourceForge pages; confirm before use):
 *      - htmlparser : htmlparser1_6_20060610.zip          (org.htmlparser, v1.6)
 *      - cpdetector : cpdetector_eclipse_project_1.0.7.zip (cpdetector, v1.0.5)
 *      - spindle    : original project page no longer reachable
 *
 * 2. The spider was obtained by modifying the spindle code; it simply prints the URLs
 *    it discovers — the parsed content itself is not processed further.
 *
 * HtmlParserUtil.java — base utility class for parsing HTML.
 *
 * NOTE(review): this source was recovered from a quote-/brace-stripped document scrape.
 * String literals, the package name and the return type of parseHtml were reconstructed
 * from context (the garbled text contained "result = new String body, title", i.e. a
 * String[] literal) — verify against the original article.
 */
package com.sillycat.api.commons.utils.html; // scrape showed "mons.utils.html" — prefix eaten by extraction; TODO confirm

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.UnknownHostException;
import java.nio.charset.Charset;

import org.htmlparser.Parser;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;

import cpdetector.io.ASCIIDetector;
import cpdetector.io.CodepageDetectorProxy;
import cpdetector.io.JChardetFacade;
import cpdetector.io.ParsingDetector;
import cpdetector.io.UnicodeDetector;

public class HtmlParserUtil {

    /** Initial capacity of the StringBuffer used while reading a page. */
    public static int TRANSFER_SIZE = 4096;

    /** Line separator of the current platform. */
    public static String lineSep = System.getProperty("line.separator");

    /**
     * Auto-detects the character encoding of the page at {@code url}, to avoid
     * garbled Chinese text (中文乱码).
     *
     * @param url page to probe
     * @return the detected charset name; falls back to the platform default
     *         charset when detection fails or throws
     */
    public static String autoDetectCharset(URL url) {
        CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
        /*
         * ParsingDetector inspects HTML/XML files or character streams for an
         * encoding declaration; the constructor flag controls whether the
         * detection process is logged verbosely (false = silent).
         */
        detector.add(new ParsingDetector(false));
        detector.add(JChardetFacade.getInstance());
        detector.add(ASCIIDetector.getInstance());
        detector.add(UnicodeDetector.getInstance());
        Charset charset = null;
        try {
            charset = detector.detectCodepage(url);
        } catch (MalformedURLException mue) {
            mue.printStackTrace();
        } catch (IOException ie) {
            ie.printStackTrace();
        }
        // FIX: original scrape read "if (charset = null)" — assignment, not comparison.
        if (charset == null) {
            charset = Charset.defaultCharset();
        }
        return charset.name();
    }

    /**
     * Downloads and parses a standard HTML page using the given encoding, in
     * preparation for building an index.
     *
     * @param url     page address
     * @param charset encoding to decode the page with
     * @return a two-element array {@code { body, title }} (title defaults to
     *         "Untitled"), or {@code null} when the page could not be fetched
     *         or parsed
     */
    public static String[] parseHtml(String url, String charset) {
        String[] result = null;
        String content = null;
        InputStream in = null;
        BufferedReader reader = null;
        try {
            URL source = new URL(url);
            in = source.openStream();
            reader = new BufferedReader(new InputStreamReader(in, charset));
            String line;
            StringBuffer temp = new StringBuffer(TRANSFER_SIZE);
            // FIX: original read "while (line = reader.readLine() != null)" —
            // precedence bug that compares the String to null before assigning.
            while ((line = reader.readLine()) != null) {
                temp.append(line);
                temp.append(lineSep);
            }
            content = temp.toString();
        } catch (UnsupportedEncodingException uee) {
            uee.printStackTrace();
        } catch (MalformedURLException mue) {
            System.err.println("Invalid URL : " + url);
        } catch (UnknownHostException uhe) {
            System.err.println("UnknowHost : " + url);
        } catch (SocketException se) {
            System.err.println("Socket Error : " + se.getMessage() + " " + url);
        } catch (SocketTimeoutException ste) {
            System.err.println("Socket Connection Time Out : " + url);
        } catch (FileNotFoundException fnfe) {
            // FIX: original dereferenced fnfe.getCause().getMessage(), which NPEs
            // when the exception has no cause; report the URL instead.
            System.err.println("broken link " + url + " ignored");
        } catch (IOException ie) {
            ie.printStackTrace();
        } finally {
            // FIX: original closed the streams inside the try block, leaking
            // them whenever an exception was thrown mid-read.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // best-effort close; nothing useful to do here
                }
            }
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // best-effort close
                }
            }
        }
        if (content != null) {
            Parser myParser = Parser.createParser(content, charset);
            HtmlPage visitor = new HtmlPage(myParser);
            try {
                myParser.visitAllNodesWith(visitor);
                String body = null;
                String title = "Untitled";
                if (visitor.getBody() != null) {
                    NodeList nodelist = visitor.getBody();
                    body = nodelist.asString().trim();
                }
                if (visitor.getTitle() != null) {
                    title = visitor.getTitle();
                }
                result = new String[] { body, title };
            } catch (ParserException pe) {
                pe.printStackTrace();
            }
        }
        return result;
    }
}

多线程爬虫类 HtmlCaptureRunner.javapackage com.sillycat.api.thread.runner;import java.io.FileNotFoundException;import java.io.IOException;import .HttpURLConnection;import .MalformedURLException;import .SocketException;import .SocketTimeoutException;import .URL;import .UnknownHostException;import java.util.ArrayList;import java.util.HashSet;import mons.logging.Log;import mons.logging.LogFactory;import org.htmlparser.Parser;import org.htmlparser.PrototypicalNodeFactory;import org.htmlparser.filters.AndFilter;import 
org.htmlparser.filters.HasAttributeFilter;import org.htmlparser.filters.NodeClassFilter;import org.htmlparser.tags.BaseHrefTag;import org.htmlparser.tags.FrameTag;import org.htmlparser.tags.LinkTag;import org.htmlparser.tags.MetaTag;import org.htmlparser.util.EncodingChangeException;import org.htmlparser.util.NodeIterator;import org.htmlparser.util.NodeList;import org.htmlparser.util.ParserException;import mons.utils.StringUtil;import mons.utils.html.HtmlParserUtil;public class HtmlCaptureRunner implements Runnable public Log logger = LogFactory.getLog(getClass();/* 基准(初始)URL */protected String baseURL = null;private String contentPath = null;/* 待解析的URL地址集合,所有新检测到的链接均存放于此; 解析时按照先入先出(First-In First-Out)法则线性取出*/protected ArrayList URLs = new ArrayList();/* 已存储的URL地址集合,避免链接的重复抓取 */protected HashSet indexedURLs = new HashSet();protected Parser parser = new Parser();/* 程序运行线程数,默认2个线程 */protected int threads = 2;/* 解析页面时的字符编码 */protected String charset;/* 基准端口 */protected int basePort;/* 基准主机 */protected String baseHost;/* 是否存储,默认true */protected boolean justDatabase = true;/* 检测索引中是否存在当前URL信息,避免重复抓取 */protected boolean isRepeatedCheck = false;public HtmlCaptureRunner() PrototypicalNodeFactory factory = new PrototypicalNodeFactory(); factory.registerTag(new LocalLinkTag(); factory.registerTag(new LocalFrameTag(); factory.registerTag(new LocalBaseHrefTag(); parser.setNodeFactory(factory);public void capture() URLs.clear(); URLs.add(getBaseURL(); int responseCode = 0; String contentType = ; try HttpURLConnection uc = (HttpURLConnection) new URL(baseURL) .openConnection(); responseCode = uc.getResponseCode(); contentType = uc.getContentType(); catch (MalformedURLException mue) logger.error(Invalid URL : + getBaseURL(); catch (UnknownHostException uhe) logger.error(UnknowHost : + getBaseURL(); catch (SocketException se) logger.error(Socket Error : + se.getMessage() + + getBaseURL(); catch (IOException ie) logger.error(IOException : + ie); if (responseCode = 
HttpURLConnection.HTTP_OK & contentType.startsWith(text/html) try charset = HtmlParserUtil.autoDetectCharset(new URL(baseURL); basePort = new URL(baseURL).getPort(); baseHost = new URL(baseURL).getHost(); if (charset.equals(windows-1252) charset = GBK; long start = System.currentTimeMillis(); ArrayList threadList = new ArrayList(); for (int i = 0; i 0) Thread child = (Thread) threadList.remove(0); try child.join(); catch (InterruptedException ie) logger.error(InterruptedException : + ie); / for (int i = 0; i 0) / content System.out.println(url); / title / DateTools.timeToString(System.currentTimeMillis() /* 从URL队列mPages里取出单个的URL */public synchronized String dequeueURL() while (true) if (URLs.size() 0) String url = (String) URLs.remove(0); indexedURLs.add(url); if (isToBeCaptured(url) NodeList list; try int bookmark = URLs.size(); /* 获取页面所有节点 */ parser.setURL(url); try list = new NodeList(); for (NodeIterator e = parser.elements(); e .hasMoreNodes();) list.add(e.nextNode(); catch (EncodingChangeException ece) /* 解码出错的异常处理 */ parser.reset(); list = new NodeList(); for (NodeIterator e = parser.elements(); e .hasMoreNodes();) list.add(e.nextNode(); /* * 依据/wc/meta-user.html处理 * Robots tag */ NodeList robots = list .extractAllNodesThatMatch( new AndFilter(new NodeClassFilter( MetaTag.class), new HasAttributeFilter(name, robots), true); if (0 != robots.size() MetaTag robot = (MetaTag) robots.elementAt(0); String content = robot.getAttribute(content) .toLowerCase(); if (-1 != content.indexOf(none) | (-1 != content.indexOf(nofollow) for (int i = bookmark; i 0) try wait(); threads+; catch (InterruptedException ie) logger.error(InterruptedException : + ie); else notifyAll(); return null; private boolean isHTML(String url) if (!url.endsWith(.html) return false; if (StringUtil.isNotBlank(contentPath) if (!url.startsWith(baseURL + / + contentPath) return false; return true;/* 判断提取到的链接是否符合解析条件;标准为Port及Host与基准URL相同且类型为text/html或text/plain*/public boolean isToBeCaptured(String 
url) boolean flag = false; HttpURLConnection uc = null; int responseCode = 0; String contentType = ; String host = ; int port = 0; try URL source = new URL(url); String protocol = source.getProtocol(); if (protocol != null & protocol.equals(http) host = source.getHost(); port = source.getPort(); uc = (HttpURLConnection) source.openConnection(); uc.setConnectTimeout(8000); responseCode = uc.getResponseCode(); contentType = uc.getContentType(); catch (MalformedURLException mue) logger.error(Invalid URL : + url); catch (UnknownHostException uhe) logger.error(UnknowHost : + url); catch (SocketException se) logger.error(Socket Error : + se.getMessage() + + url); catch (SocketTimeoutException ste) logger.error(Socket Connection Time Out : + url); catch (FileNotFoundException fnfe) logger.error(broken link + url + ignored); catch (IOException ie) logger.error(IOException : + ie); if (port = basePort & responseCode = HttpURLConnection.HTTP_OK & host.equals(baseHost) & (contentType.startsWith(text/html) | contentType .startsWith(text/plain) flag = true; return flag;class LocalLinkTag extends LinkTag public void doSemanticAction() String link = getLink(); if (link.endsWith(/) link = link.substring(0, link.length() - 1); int pos = link.indexOf(#); if (pos != -1) link = link.substring(0, pos); /* 将链接加入到处理队列中 */ if (!(indexedURLs.contains(link) | URLs.contains(link) if (isHTML(link) URLs.add(link); setLink(link); /* Frame tag that rewrites the SRC URLs. The SRC URLs are mapped to local* targets if they match the source.*/class LocalFrameTag extends FrameTag public void doSemanticAction() String link = getFrameLocation(); if (link.endsWith(/) link = link.substring(0, link.length() - 1); int pos = link.indexOf(#); if (pos != -1) link = link.substring(0, pos); /* 将链接加入到处理队列中 */ if (!(indexedURLs.contains(link) | URLs.contains(link) if (isHTML(link) URLs.add(link); setFrameLocation(link); /* Base tag that doesnt show. 
The toHtml() method is overridden to return* an empty string, effectively shutting off the base reference.*/class LocalBaseHrefTag extends BaseHrefTag public String toHtml() return (); public String getBaseURL() return baseURL;public void setBaseURL(String baseURL) this.baseURL = baseURL;public int getThreads() return threads;public void setThreads(int threads) this.threads = threads;public String getCharset() return charset;public void setCharset(String charset) this.charset = charset;public int getBasePort() return basePort;public void setBasePort(int basePort) this.basePort = basePort;public String getBaseHost() return baseHost;public void setBaseHost(String baseHost) this.baseHost = baseHost;public boolean isJustDatabase() return justDatabase;public void setJustDatabase(boolean justDatabase) this

温馨提示

  • 1. 本站所有资源如无特殊说明,都需要本地电脑安装OFFICE2007和PDF阅读器。图纸软件为CAD,CAXA,PROE,UG,SolidWorks等.压缩文件请下载最新的WinRAR软件解压。
  • 2. 本站的文档不包含任何第三方提供的附件图纸等,如果需要附件,请联系上传者。文件的所有权益归上传用户所有。
  • 3. 本站RAR压缩包中若带图纸,网页内容里面会有图纸预览,若没有图纸预览就没有图纸。
  • 4. 未经权益所有人同意不得将文件中的内容挪作商业或盈利用途。
  • 5. 人人文库网仅提供信息存储空间,仅对用户上传内容的表现方式做保护处理,对用户上传分享的文档内容本身不做任何修改或编辑,并不能对任何下载内容负责。
  • 6. 下载文件中如有侵权或不适当内容,请与我们联系,我们立即纠正。
  • 7. 本站不保证下载资源的准确性、安全性和完整性, 同时也不承担用户因使用这些下载资源对自己和他人造成任何形式的伤害或损失。

评论

0/150

提交评论