crawl详解.doc

上传人：清*** IP属地：河南 上传时间：2020-02-02 格式：DOC 页数：6 大小：39KB 积分：12 举报 版权申诉

已阅读5页，还剩1页未读，继续免费阅读

版权说明：本文档由用户提供并上传，收益归属内容提供方，若内容存在侵权，请进行举报或认领

文档简介

nutch crawl类详解2010-02-02 13:53/* Licensed to the Apache Software Foundation (ASF) under one or more* contributor license agreements. See the NOTICE file distributed with* this work for additional information regarding copyright ownership.* The ASF licenses this file to You under the Apache License, Version 2.0* (the License); you may not use this file except in compliance with* the License. You may obtain a copy of the License at* /licenses/LICENSE-2.0* Unless required by applicable law or agreed to in writing, software* distributed under the License is distributed on an AS IS BASIS,* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.* See the License for the specific language governing permissions and* limitations under the License.*/package org.apache.nutch.crawl;import java.util.*;import java.text.*;/ Commons Logging importsimport mons.logging.Log;import mons.logging.LogFactory;import org.apache.hadoop.fs.*;import org.apache.hadoop.conf.*;import org.apache.hadoop.mapred.*;import org.apache.nutch.parse.ParseSegment;import org.apache.nutch.indexer.DeleteDuplicates;import org.apache.nutch.indexer.IndexMerger;import org.apache.nutch.indexer.Indexer;import org.apache.nutch.util.HadoopFSUtil;import org.apache.nutch.util.NutchConfiguration;import org.apache.nutch.util.NutchJob;import org.apache.nutch.fetcher.Fetcher;public class Crawl /*返回一个Log工厂申请*/public static final Log LOG = LogFactory.getLog(Crawl.class);/*静态时间方法*/private static String getDate() return new SimpleDateFormat(yyyyMMddHHmmss).format (new Date(System.currentTimeMillis();/* Perform complete crawling and indexing given a set of root urls. 
*/*主函数类*/*该知道，Nutch查找文件系统是基于Linux系统的机制的，所以提供启动的命令与Linux的Shell命令很相似。*/*输入爬行命令后，调用Crwal类，完成爬虫的工作.*/public static void main(String args) throws Exception /*检查命令参数是否合法，如果不合法给出提示.*/ if (args.length 1) System.out.println (Usage: Crawl -dir d -threads n -depth i -topN N); return; /*生成一个NutchConfiguration的对象，NutchConfiguration是管理Nutch自己的配置文件的类，*/ /*Configuration类是管理Hadoop配置文件的类。*/ /*可以在Hadoop的源代码中查看到该类的定义(读取hadoop-site.xml配置文件)*/ Configuration conf = NutchConfiguration.create(); /*这个配置文件主要是用于配置抓取企业内部网*/ /*hadoop-site.xml要高于hadoop-default.xml*/ /*优先级crawl-tool.xml高于nutch-site.xml，nutch-site.xml高于nutch-default.xml */ conf.addResource(crawl-tool.xml); /*抓取任务配置实例的创建*/ JobConf job = new NutchJob(conf); /*新建一个url*/ /*定义存放抓取结果的目标路径*/ Path rootUrlDir = null; /* *以下四句话， *作用应该是如果用户没有输入这四个中的值，在这里为添加默认 *dir depth -threads topN *可以看出，这四个数据就是使用Crawl命令时候用到的参数 */ /*Path是新建生成的文件夹*/ Path dir = new Path(crawl- + getDate(); /*并发访问数,默认为10*/ int threads = job.getInt(fetcher.threads.fetch, 10); /*抓取深度,默认为5*/ int depth = 5; /*抓取网页的数量*/ long topN = Long.MAX_VALUE; /*bin/nutch crawl urls -dir crawled -depth 5 -threads 10 -topN 30 & logs.log */ for (int i = 0; i args.length; i+) if (-dir.equals(argsi) /如果用户输入了-dir,新建一个文件夹，对应上面的crawled dir = new Path(argsi+1); i+; else if (-threads.equals(argsi) /如果用户输入了-threads，将其写入threads，对应5 threads = Integer.parseInt(argsi+1); i+; else if (-depth.equals(argsi) /如果用户输入了-depth，将其写入depth，对应10 depth = Integer.parseInt(argsi+1); i+; else if (-topN.equals(argsi) /如果用户输入了topN，写入topN 对应30 topN = Integer.parseInt(argsi+1); i+; else if (argsi != null) /此处应该是获得用户所要用到的根节点 /也就是爬行起始的地方 /rootUrlDir用来获得该爬行起始点所在的目录，对应上面的urls rootUrlDir = new Path(argsi); /*获得文件系统*/ /*fs应该是获得抓取的页面的文件夹，因为从下面可以看出如果存在同名的文件，将会调用异常处理*/ FileSystem fs = FileSystem.get(job); if(fs.exists(dir) System.out.println(文件已经存在:+dir); throw new RuntimeException(dir + already exists.); /*如果需要生成日志文件，加入相应的开始的信息*/ /*登录日志信息*/ if (LOG.isInfoEnabled() LOG.info(crawl started in: + dir); LOG.info(rootUrlDir = + rootUrlDir); 
LOG.info(threads = + threads); LOG.info(depth = + depth); if (topN != Long.MAX_VALUE) LOG.info(topN = + topN); /*新建5个文件夹*/ /*在目录dir下面创建下面5个目录，用来存放，抓取工作过程中不同操作生成的文件或者目录*/ Path crawlDb = new Path(dir + /crawldb); Path linkDb = new Path(dir + /linkdb); Path segments = new Path(dir + /segments); Path indexes = new Path(dir + /indexes); Path index = new Path(dir + /index); /*看不清这句是什么意思*/ Path tmpDir = job.getLocalPath(crawl+Path.SEPARATOR+getDate(); /*根据Configuration conf创建一个Injector实例*/ /*crawldb数据库内容得到更新，包括URL及其状态*/ /*注入新URL到crawldb中*/ /* *1.将URL集合进行格式化和过滤，消除其中的非法URL，并设定URL状态(UNFETCHED),按照一定方法进行初始化分值；*2.将URL进行合并，消除重复的URL入口；*3.将URL及其状态、分值存入crawldb数据库，与原数据库中重复的则删除旧的，更换新的。 * */ Injector injector = new Injector(conf); /根据Configuration conf创建一个Generator实例 /*从Crawldb中抓取新的Segment*/ Generator generator = new Generator(conf); /*它负责一个segment的爬取*/ Fetcher fetcher = new Fetcher(conf); /*它对一个segment运行ParseSegment*/ /*依我看是对html文本内容的提取*/ ParseSegment parseSegment = new ParseSegment(conf); /*初始化crawlDb文件夹，构建一个插入器*/ /*用fetch过程中获取的信息更新crawldb*/ CrawlDb crawlDbTool = new CrawlDb(conf); /*它用从segment中获取到的信息更新linkdb*/ LinkDb linkDbTool = new LinkDb(conf); /*创建一个segment的索引，利用crawldb和linkdb中的数据对索引中的页面打分*/ Indexer indexer = new Indexer(conf); /根据Configuration conf创建一个DeleteDuplicates实例 DeleteDuplicates dedup = new DeleteDuplicates(conf); /*它合并多个segment索引*/ IndexMerger merger = new IndexMerger(conf); / initialize crawlDb /*初始化crawlDb文件夹，构建一个插入器*/ injector.inject(crawlDb, rootUrlDir); int i; /* *可以看出每一层爬取会生成Segment里面的一个新的文件夹 *Segment代表一个网页集合，这个集合中的网页被作为一个小的单元统一地进行抓取和索引。 *它里面存储的数据主要有三个类型： *a fetchlist: 将要被抓取的网页的名称列表 *the fetcher output: 被抓取回来的网页的文件集合 *the index:利用lucene为 the fetcher output 建立的索引 */ for (i = 0; i 0) linkDbTool.invert(linkDb, segments, true, true, false); / invert links if(indexes != null) / Delete old indexes /如果索引存在则删除 if (fs.exists(indexes) /写入日志 LOG.info(Deleting old indexes: + indexes); fs.delete(indexes, true); / Delete old index if (fs.exists(index) LOG.info(Deleting old 
merged index: + index); fs.delete(index, true); / index, dedup & merge FileStatus fstats = fs.listStatus(segments, HadoopFSUtil.

人人文库 > 全部分类 > 教育资料 > 课设设计

温馨提示

1. 本站所有资源如无特殊说明，都需要本地电脑安装OFFICE2007和PDF阅读器。图纸软件为CAD、CAXA、PROE、UG、SolidWorks等。压缩文件请下载最新的WinRAR软件解压。
2. 本站的文档不包含任何第三方提供的附件图纸等，如果需要附件，请联系上传者。文件的所有权益归上传用户所有。
3. 本站RAR压缩包中若带图纸，网页内容里面会有图纸预览，若没有图纸预览就没有图纸。
4. 未经权益所有人同意不得将文件中的内容挪作商业或盈利用途。
5. 人人文库网仅提供信息存储空间，仅对用户上传内容的表现方式做保护处理，对用户上传分享的文档内容本身不做任何修改或编辑，并不能对任何下载内容负责。
6. 下载文件中如有侵权或不适当内容，请与我们联系，我们立即纠正。
7. 本站不保证下载资源的准确性、安全性和完整性，同时也不承担用户因使用这些下载资源对自己和他人造成任何形式的伤害或损失。

crawl详解.doc

文档简介

温馨提示

最新文档

评论

crawl详解.doc

文档简介

温馨提示

最新文档

评论

相关文档