打开APP
userphoto
未登录

开通VIP,畅享免费电子书等14项超值服务

开通VIP
使用Lucene对html文件进行索引
我修改了lucene的demo包的IndexHTML类,使其可以被其他Java类调用。

IndexHTML类


import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import java.io.File;
import java.util.Date;
import java.util.Arrays;

//还需调用demo的其他类。
import org.apache.lucene.demo;
/**
* Create html file index for searching
* @author tyrone
*
*/
public class IndexHTML {
private String DocsPath=null;

/**
  * the path for index file;
  */
private String IndexFilePath=null;

/**
  * true during deletion pass
  */
   private boolean deleting = false;
   /**
    * existing index
    */
   private IndexReader reader;
   /**
    * new index being built
    */
   private IndexWriter writer;
   /**
    * document id iterator
    */
   private TermEnum uidIter;
  
  
private void indexDocs(File file)throws Exception {
  if (file.isDirectory()) {     // if a directory
   String[] files = file.list();     // list its files
   Arrays.sort(files);     // sort the files
   for (int i = 0; i < files.length; i++)   // recursively index them
    this.indexDocs(new File(file, files[i]));
  
  } else if (file.getPath().endsWith(".html") || // index .html files
    file.getPath().endsWith(".htm") || // index .htm files
    file.getPath().endsWith(".txt")) { // index .txt files
  
   if (this.uidIter != null) {
    String uid = HTMLDocument.uid(file); // construct uid for doc
    
    while (uidIter.term() != null && uidIter.term().field() == "uid" &&
      uidIter.term().text().compareTo(uid) < 0) {
     if (deleting) {     // delete stale docs
      System.out.println("deleting " +
        HTMLDocument.uid2url(uidIter.term().text()));
      reader.delete(uidIter.term());
     }
     uidIter.next();
    }
    if (uidIter.term() != null && uidIter.term().field() == "uid" &&
      uidIter.term().text().compareTo(uid) == 0) {
     uidIter.next();     // keep matching docs
    } else if (!deleting) {     // add new docs
     Document doc = HTMLDocument.Document(file);
     System.out.println("adding " + doc.get("url"));
     writer.addDocument(doc);
    }
   } else { // creating a new index
    Document doc = HTMLDocument.Document(file);
    System.out.println("adding " + doc.get("url"));
    writer.addDocument(doc); // add docs unconditionally
   }
  }
  return;
}

   /**
    * Walk directory hierarchy in uid order, while keeping uid iterator from
    * existing index in sync.  Mismatches indicate one of:
    * (a) old documents to be deleted;
    * (b) unchanged documents, to be left alone;
    * or (c) new documents, to be indexed.
    */

   private void indexDocs(File file, String index, boolean create)
        throws Exception {
    if (!create) {      // incrementally update
    
     reader = IndexReader.open(index);    // open existing index
     uidIter = reader.terms(new Term("uid", "")); // init uid iterator
    
     this.indexDocs(file);
    
     if (deleting) {      // delete rest of stale docs
      while (uidIter.term() != null && uidIter.term().field() == "uid") {
       System.out.println("deleting " +
         HTMLDocument.uid2url(uidIter.term().text()));
       reader.delete(uidIter.term());
       uidIter.next();
      }
      deleting = false;
     }
    
     uidIter.close();      // close uid iterator
     reader.close();      // close existing index
    
    } else       // don‘t have exisiting
     this.indexDocs(file);    
    
   }
   /**
    * if create=true, create a new index, else refresh old index.
    * @param create
    */
public void run(boolean create) {
     try {
        String index = "index";
        File root = null;
   if (this.IndexFilePath!=null) {    // index file path
     index = this.IndexFilePath;
   }
      if (this.DocsPath==null){
         System.out.println("root directory is not set");
         return;
        }
        root = new File(this.DocsPath);
        Date start = new Date();
        /**
         * not create then maintenance
         */
        if (!create) {      // delete stale docs
         this.deleting = true;
         this.indexDocs(root, index, create);
        }

        writer = new IndexWriter(index, new StandardAnalyzer(), create);
        writer.maxFieldLength = 1000000;

        this.indexDocs(root, index, create);    // add new docs

        System.out.println("Optimizing index...");
        writer.optimize();
        writer.close();

        Date end = new Date();

        System.out.print(end.getTime() - start.getTime());
        System.out.println(" total milliseconds");
      } catch (Exception e) {
        System.out.println(" caught a " + e.getClass() +
      "\n with message: " + e.getMessage());
      }
  return;
}

/**
  * @return Returns the IndexFilePath.
  */
public String getIndexFilePath() {
  return IndexFilePath;
}
/**
  * @param IndexFilePath The IndexFilePath to set.
  */
public void setIndexFilePath(String property1) {
  this.IndexFilePath = property1;
}
/**
  * @return Returns the DocsPath.
  */
public String getDocsPath() {
  return DocsPath;
}
/**
  * @param DocsPath The DocsPath to set.
  */
public void setDocsPath(String property1) {
  this.DocsPath = property1;
}

/**
  * test
  * @param args
  */
public static void main(String[] args){
  IndexHTML ih=new IndexHTML();
  ih.setDocsPath("D:\\MyProject\\colimas\\clms-doc2\\html");
  ih.setIndexFilePath("D:\\MyProject\\colimas\\index");
  ih.run(true);
}
}


运行后生成3个文件:_3i8.cfs、deletable 和 segments。

搜索文件类:

/*
* Created on 2005/07/28
*
* TODO To change the template for this generated file go to
* Window - Preferences - Java - Code Style - Code Templates
*/
package com.nova.colimas.search.query;

/**
 * Plain value holder for one search hit: its title, path and URL.
 * All three properties start out as null and are independent of each other.
 *
 * @author tyrone
 */
public class HitsHTMLDoc {

    private String title;
    private String path;
    private String url;

    /**
     * @return the title, or null if none has been set
     */
    public String getTitle() {
        return this.title;
    }

    /**
     * @param property1 the title to store
     */
    public void setTitle(String property1) {
        this.title = property1;
    }

    /**
     * @return the path, or null if none has been set
     */
    public String getPath() {
        return this.path;
    }

    /**
     * @param property1 the path to store
     */
    public void setPath(String property1) {
        this.path = property1;
    }

    /**
     * @return the URL, or null if none has been set
     */
    public String getUrl() {
        return this.url;
    }

    /**
     * @param property1 the URL to store
     */
    public void setUrl(String property1) {
        this.url = property1;
    }
}



import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Hits;
import org.apache.lucene.queryParser.QueryParser;
/**
* @author tyrone
*
* TODO To change the template for this generated type comment go to
* Window - Preferences - Java - Code Style - Code Templates
*/
public class SearchFiles {

   private Hits hits;

   public Hits getHits(){
    return hits;
   }

   public HitsHTMLDoc[] run(String indexFilePath,String line){
    HitsHTMLDoc[] hitdocs;
    try {
     Searcher searcher = new IndexSearcher(indexFilePath);
     Analyzer analyzer = new StandardAnalyzer();
     Query query = QueryParser.parse(line, "contents", analyzer);
     System.out.println("Searching for: " + query.toString("contents"));
     this.hits = searcher.search(query);
     if (this.hits.length()==0) return null;
     System.out.println(this.hits.length() + " total matching documents");
     hitdocs=new HitsHTMLDoc[this.hits.length()];
     for (int i = 0; i < hits.length(); i++) {
      Document doc = this.hits.doc(i);
      String path = doc.get("path");
      if (path != null) {
       hitdocs[i].setPath(path);
      } else {
       String url=doc.get("url");
       if (url != null) {
        hitdocs[i]=new HitsHTMLDoc();
        hitdocs[i].setUrl(url);
        String title=doc.get("title");
        if (title!=null)
         hitdocs[i].setTitle(title);
       } else {
        System.out.println(i + ". " + "No path nor URL for this document");
       }
      }
      
     }  
     searcher.close();
     return hitdocs;
    }catch(Exception e){
     System.out.println(" caught a " + e.getClass() +
       "\n with message: " + e.getMessage());    
    }
    return null;
   }
   /**
    * test
    * args=queries
    * @author tyrone
    *
    */
   public static void main(String[] args){
    SearchFiles se=new SearchFiles();
    String query="";
    HitsHTMLDoc[] hitsdoc;
    for (int i=0;i<args.length;i++)
     query=query+args[i]+" ";
    hitsdoc=se.run("D:\\MyProject\\colimas\\index",query);
    if (hitsdoc==null){
     System.out.println("nothing");
     return;
    }
    for (int l=0;l<hitsdoc.length;l++){
     System.out.println("url:"+hitsdoc[l].getUrl());
     System.out.println("path:"+hitsdoc[l].getPath());
     System.out.println("title:"+hitsdoc[l].getTitle());
    }
   }

}




注意事项

1 引用 Lucene 调试应用程序时,虽然并不需要下面这些 jar 包,但每次都会提示 URLClassPath.class 异常;为方便起见,还是建议下载这些 jar 包。
relaxngDatatype.jar
commons-beanutils.jar
commons-collections.jar
commons-digester.jar
commons-logging.jar
commons-validator.jar
jakarta-oro.jar
struts-legacy.jar

2 生成 index 文件的目录里不能有其他目录;如果有,程序会试图将其删除,或者直接报错。
本站仅提供存储服务,所有内容均由用户发布,如发现有害或侵权内容,请点击举报
打开APP,阅读全文并永久保存 查看更多类似文章
猜你喜欢
类似文章
[原创]全文搜索引擎Lucene学习笔记(页 1) - 『 编程设计 』 - 青韶论坛 湘...
Lucene版Hello world(世界,你好)
用Lucene检索数据库
lucene多种搜索方式详解例子
lucene爬数据库中的数据无非也是查询数据。所有我们用lucene搜索数据主要有下面几个步骤
Lucene 3.6.1:中文分词、创建索引库、排序、多字段分页查询以及高亮显示
更多类似文章 >>
生活服务
热点新闻
分享 收藏 导长图 关注 下载文章
绑定账号成功
后续可登录账号畅享VIP特权!
如果VIP功能使用有故障,
可点击这里联系客服!

联系客服