尚学堂科技_张志宇_lucene_构建一个简单的WEB搜索程序.doc_第1页
尚学堂科技_张志宇_lucene_构建一个简单的WEB搜索程序.doc_第2页
尚学堂科技_张志宇_lucene_构建一个简单的WEB搜索程序.doc_第3页
尚学堂科技_张志宇_lucene_构建一个简单的WEB搜索程序.doc_第4页
尚学堂科技_张志宇_lucene_构建一个简单的WEB搜索程序.doc_第5页
已阅读5页,还剩145页未读 继续免费阅读

下载本文档

版权说明:本文档由用户提供并上传,收益归属内容提供方,若内容存在侵权,请进行举报或认领

文档简介

Lucene_构建一个简单的WEB搜索程序

软件及版本:
- lucene 2.3.2
- tomcat 6.0.16
- je-analysis 1.4.0
- lukeall 0.7.1
- Mysql jdbc driver 3.1.13
- Tidy 04aug2000r7
- MyEclipse 6.0M1_E3.3

- 项目周期
  - 3-4天
- 目标
  - Lucene入门
    - 全文检索的概念,倒排索引的概念
    - 建立索引
    - 搜索
    - 中文分词的实现
  - Nutch入门
  - 串知识点:Html,css,javascript,servlet,jsp,mysql
  - 介绍MVC的概念
  - 演示借用一些javascript的成熟的框架实现页面的特殊效果。例如:rico
  - 学会使用myeclipse
  - 熟悉mysql数据库的用法
- 什么时候用lucene
  - 数据库大量数据,文本字段内容很多
  - 非结构化文档

1. 安装myeclipse
- 建立工程web project
  - 工程名称:lucene
- 如何配置tomcat服务器
  - 好处:自动部署
  - Window → show view → servers
- 如何部署web app
  - Deploy按钮,添加tomcat项目
- Web browser窗口
  - 最好不用此browser
  - Show view → web browser
- 引入jar包:Lucene工程文件夹下,建立lib目录,拷贝如下jar包到lib目录
  - lucene-core-2.2.0.jar
  - Tidy.jar
  - lucene-2.2.0\lucene-2.2.0\contrib\analyzers\lucene-analyzers-2.2.0.jar
  - je-analysis-1.4.0.jar
  - mysql-connector-java-3.1.13-bin.jar
- 显示line number
- Alt+/ 自动完成快捷键效果出不来
- . 快捷键效果出不来

2. 为一个文件建立索引(英文)
确认已经引入包lucene-core-2.2.0.jar
Field.Store.YES和Field.Store.NO区别
- termVector是Lucene 1.4.3新增的,它提供一种向量机制来进行模糊查询,很少用。
- DateTools.timeToString

IndexHTML.java
```java
import java.io.File;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;

public class IndexHTML {
    static String index = "D:\\share\\05_Servlet_JSP\\tomcat\\apache-tomcat-5.5.17\\index";
    static String root = "D:\\share\\lucene\\soft\\lucene-2.2.0\\lucene-2.2.0\\docs\\api\\index.html";

    public static void main(String[] args) throws Exception {
        IndexWriter writer = new IndexWriter(index, new StandardAnalyzer(), true);
        Document doc = new Document();
        File f = new File(root);
        doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.UN_TOKENIZED));
        doc.add(new Field("content", "我们是共产主义接班人", Field.Store.NO, Field.Index.TOKENIZED));
        writer.addDocument(doc);
        writer.optimize();
        writer.close();
    }
}
```

3. 如何确认索引已经正确建立?
java -jar lukeall-0.7.1.jar

4. tomcat配置
- WEB-INF\lib
  - lucene-core-2.2.0.jar
  - je-analysis-1.4.0.jar
- 确保8080端口可用
- reloadable
  - C:\tomcat\conf\context.xml

5. 
为一个文件建立索引(递归)
```java
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.LockObtainFailedException;

public class IndexHTML1 {
    static IndexWriter writer;

    public static void main(String[] args) throws Exception {
        String root = "D:\\share\\01_J2SE\\soft\\html_zh_CN\\html\\zh_CN\\api\\java\\lang";
        String index = "D:\\share\\tools\\apache-tomcat-6.0.14\\apache-tomcat-6.0.14\\index_cn";
        writer = new IndexWriter(index, new StandardAnalyzer(), true);
        File f = new File(root);
        indexDocs(f);
        writer.optimize();
        writer.close();
    }

    private static void indexDocs(File f) throws Exception {
        if (f.isDirectory()) {
            File[] subs = f.listFiles();
            for (int i = 0; i < subs.length; i++) {
                indexDocs(subs[i]);
            }
        } else {
            indexDoc(f);
        }
    }

    private static void indexDoc(File f) throws Exception {
        System.out.println(f.getPath());
        Document doc = new Document();
        doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.UN_TOKENIZED));
        doc.add(new Field("content", new FileReader(f)));
        writer.addDocument(doc);
    }
}
```

6. 
为一个文件建立索引(使用Tidy)l 确认已经引入包Tidy.jarl 确认已经引入包je-analysis-1.4.0.jarimport java.io.File;import java.io.FileInputStream;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import java.io.Reader;import java.text.DecimalFormat;import jeasy.analysis.MMAnalyzer;import org.apache.lucene.document.DateTools;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.index.IndexWriter;import org.w3c.dom.Element;import org.w3c.dom.Node;import org.w3c.dom.NodeList;import org.w3c.dom.Text;import org.w3c.tidy.Tidy;public class IndexHTMLTidy / 索引建立到那个目录static String index = C:tomcatindex_cn;/ 英文内容/ static String root =/ G:lessonslucenestudentsoftlucene-2.2.0lucene-2.2.0docsapiindex.html;/ 中文内容,java.lang下面的内容即可static String root = E:appdevelopjavaapihtml_zh_CNhtmlzh_CNapijavalang;static Document doc = null;static IndexWriter writer = null;public static void main(String args) throws Exception writer = new IndexWriter(index, new MMAnalyzer(), true);File f = new File(root);indexDocs(f);writer.addDocument(doc);writer.optimize();writer.close();System.out.println(ok.);public static void indexDocs(File f) throws Exception if (f.isDirectory() String file = f.list();for (int i = 0; i 读取单个字符。InputStreamReader ips = new InputStreamReader(new FileInputStream(f),gb2312);/ 适配器模式InputStream is = new ReaderToInputStream(ips);org.w3c.dom.Document root = tidy.parseDOM(is, null);/ 得到根元素Element rawDoc = root.getDocumentElement();String title = getTitle(rawDoc);String body = getBody(rawDoc);System.out.println(title);/ System.out.println(body);doc.add(new Field(title, title, Field.Store.YES,Field.Index.TOKENIZED);String summary = body;if (body.length() = 200) summary = body.substring(0, 200);doc.add(new Field(summary, summary, Field.Store.YES,Field.Index.TOKENIZED);doc.add(new Field(content, body, Field.Store.YES,Field.Index.TOKENIZED);writer.addDocument(doc);/ 适配器public static class ReaderToInputStream extends InputStream 
Reader reader;public ReaderToInputStream(Reader reader) super();this.reader = reader;Overridepublic int read() throws IOException try return reader.read(); catch (IOException e) throw e;/ 得到title标签内容protected static String getTitle(Element rawDoc) if (rawDoc = null) return ;String title = ;NodeList children = rawDoc.getElementsByTagName(title);if (children.getLength() 0) Element titleElement = (Element) children.item(0);Text text = (Text) titleElement.getFirstChild();if (text != null) title = text.getData();return title;/ 得到body标签内容protected static String getBody(Element rawDoc) if (rawDoc = null) return ;String body = ;NodeList children = rawDoc.getElementsByTagName(body);if (children.getLength() 0) body = getText(children.item(0);return body;/ 递归调用,因为标签里面还有标签protected static String getText(Node node) NodeList children = node.getChildNodes();StringBuffer sb = new StringBuffer();for (int i = 0; i children.getLength(); i+) Node child = children.item(i);switch (child.getNodeType() case Node.ELEMENT_NODE:sb.append(getText(child);sb.append( );break;case Node.TEXT_NODE:sb.append(Text) child).getData();break;return sb.toString();7. 
支持Pdf格式的文件IndexHTMLTidy.javaimport java.io.BufferedInputStream;import java.io.File;import java.io.FileInputStream;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import java.io.Reader;import java.io.UnsupportedEncodingException;import java.text.DecimalFormat;import jeasy.analysis.MMAnalyzer;import org.apache.lucene.document.DateTools;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.index.CorruptIndexException;import org.apache.lucene.index.IndexWriter;import org.w3c.dom.Element;import org.w3c.dom.Node;import org.w3c.dom.NodeList;import org.w3c.dom.Text;import org.w3c.tidy.Tidy;public class IndexHTMLTidy / 索引建立到那个目录static String index = C:tomcatindex_cn;/ 英文内容/ static String root =/ G:lessonslucenestudentsoftlucene-2.2.0lucene-2.2.0docsapiindex.html;/ 中文内容,java.lang下面的内容即可static String root = E:appdevelopjavaapihtml_zh_CNhtmlzh_CNapijavalang;/ static String root =/ D:shareTOOLShtml_zh_CNhtmlzh_CNapijavalang;static Document doc = null;static IndexWriter writer = null;public static void main(String args) throws Exception writer = new IndexWriter(index, new MMAnalyzer(), true);File f = new File(root);indexDocs(f);/ writer.addDocument(doc);writer.optimize();writer.close();System.out.println(ok.);public static void indexDocs(File f) throws Exception if (f.isDirectory() String file = f.list();for (int i = 0; i 读取单个字符。InputStreamReader ips = new InputStreamReader(new FileInputStream(f),gb2312);/ 适配器模式InputStream is = new ReaderToInputStream(ips);org.w3c.dom.Document root = tidy.parseDOM(is, null);/ 得到根元素Element rawDoc = root.getDocumentElement();String title = getTitle(rawDoc);String body = getBody(rawDoc);System.out.println(title);/ System.out.println(body);doc.add(new Field(title, title, Field.Store.YES,Field.Index.TOKENIZED);String summary = body;if (body.length() = 200) summary = body.substring(0, 200);doc.add(new Field(summary, summary, 
Field.Store.YES,Field.Index.TOKENIZED);doc.add(new Field(content, body, Field.Store.YES,Field.Index.TOKENIZED);writer.addDocument(doc);/ 适配器public static class ReaderToInputStream extends InputStream Reader reader;public ReaderToInputStream(Reader reader) super();this.reader = reader;Overridepublic int read() throws IOException try return reader.read(); catch (IOException e) throw e;/ 得到title标签内容protected static String getTitle(Element rawDoc) if (rawDoc = null) return ;String title = ;NodeList children = rawDoc.getElementsByTagName(title);if (children.getLength() 0) Element titleElement = (Element) children.item(0);Text text = (Text) titleElement.getFirstChild();if (text != null) title = text.getData();return title;/ 得到body标签内容protected static String getBody(Element rawDoc) if (rawDoc = null) return ;String body = ;NodeList children = rawDoc.getElementsByTagName(body);if (children.getLength() 0) body = getText(children.item(0);return body;/ 递归调用,因为标签里面还有标签protected static String getText(Node node) NodeList children = node.getChildNodes();StringBuffer sb = new StringBuffer();for (int i = 0; i children.getLength(); i+) Node child = children.item(i);switch (child.getNodeType() case Node.ELEMENT_NODE:sb.append(getText(child);sb.append( );break;case Node.TEXT_NODE:sb.append(Text) child).getData();break;return sb.toString();8. 
为doc、xls、ppt文件建立索引Word.javaimport org.apache.poi.hwpf.extractor.WordExtractor;import java.io.File;import java.io.InputStream;public class Word public static void main(String args) throws Exception System.out.println(getContent(c:lucene.doc);public static String getContent(String s) throws Exception return getContent(new java.io.FileInputStream(s);public static String getContent(File f) throws Exception return getContent(new java.io.FileInputStream(f);public static String getContent(InputStream is) throws Exception String bodyText = null;WordExtractor ex = new WordExtractor(is);bodyText = ex.getText();return bodyText;Excel.javaimport org.apache.poi.hssf.usermodel.HSSFDateUtil;import org.apache.poi.hssf.usermodel.HSSFWorkbook;import org.apache.poi.hssf.usermodel.HSSFSheet;import org.apache.poi.hssf.usermodel.HSSFRow;import org.apache.poi.hssf.usermodel.HSSFCell;import java.io.File;import java.io.InputStream;import java.text.SimpleDateFormat;import java.util.Date;public class Excel public static void main(String args) throws Exception System.out.println(getContent(c:schedule.xls);public static String getContent(String s) throws Exception return getContent(new java.io.FileInputStream(s);public static String getContent(File f) throws Exception return getContent(new java.io.FileInputStream(f);public static String getContent(InputStream is) throws Exception StringBuffer content = new StringBuffer();/工作簿HSSFWorkbook workbook = new HSSFWorkbook(is);/循环每一个sheetfor (int numSheets = 0; numSheets workbook.getNumberOfSheets(); numSheets+) / 获得一个sheetHSSFSheet aSheet = workbook.getSheetAt(numSheets);content.append(n);if (null = aSheet) continue;/循环每一行for (int rowNum = 0; rowNum = aSheet.getLastRowNum(); rowNum+) content.append(n);/得到某一行HSSFRow aRow = aSheet.getRow(rowNum);if (null = aRow) continue;/循环每一列for (short cellNum = 0; cellNum = aRow.getLastCellNum(); cellNum+) /得到每一列HSSFCell aCell = aRow.getCell(cellNum);if (null = aCell) continue;/如果是字符串类型if (aCell.getCellType() = 
HSSFCell.CELL_TYPE_STRING) content.append(aCell.getRichStringCellValue().getString();/否则,如果是数值类型 else if (aCell.getCellType() = HSSFCell.CELL_TYPE_NUMERIC) /如果Cell的Type为CELL_TYPE_NUMERIC时,还需要进一步判断该Cell的数据格式,/因为它有可能是Date类型,在Excel中的Date类型也是以Double类型的数字存储的。boolean b = HSSFDateUtil.isCellDateFormatted(aCell);if (b) Date date = aCell.getDateCellValue();SimpleDateFormat df = new SimpleDateFormat(yyyy-MM-dd);content.append(df.format(date);elsecontent.append(aCell.getNumericCellValue();return content.toString();PowerPoint.javaimport java.io.File;import java.io.InputStream;import org.apache.poi.hslf.HSLFSlideShow;import org.apache.poi.hslf.model.TextRun;import org.apache.poi.hslf.model.Slide;import org.apache.poi.hslf.usermodel.SlideShow;public class PowerPoint public static void main(String args) throws E

温馨提示

  • 1. 本站所有资源如无特殊说明,都需要本地电脑安装OFFICE2007和PDF阅读器。图纸软件为CAD,CAXA,PROE,UG,SolidWorks等。压缩文件请下载最新的WinRAR软件解压。
  • 2. 本站的文档不包含任何第三方提供的附件图纸等,如果需要附件,请联系上传者。文件的所有权益归上传用户所有。
  • 3. 本站RAR压缩包中若带图纸,网页内容里面会有图纸预览,若没有图纸预览就没有图纸。
  • 4. 未经权益所有人同意不得将文件中的内容挪作商业或盈利用途。
  • 5. 人人文库网仅提供信息存储空间,仅对用户上传内容的表现方式做保护处理,对用户上传分享的文档内容本身不做任何修改或编辑,并不能对任何下载内容负责。
  • 6. 下载文件中如有侵权或不适当内容,请与我们联系,我们立即纠正。
  • 7. 本站不保证下载资源的准确性、安全性和完整性,同时也不承担用户因使用这些下载资源对自己和他人造成任何形式的伤害或损失。

评论

0/150

提交评论