使用Lucene对html文件进行索引

我修改了lucene的demo包的IndexHTML类，使其可以被其他Java类调用。

IndexHTML类


import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import java.io.File;
import java.util.Date;
import java.util.Arrays;

//还需调用demo的其他类。
import org.apache.lucene.demo;
/**
 * Create html file index for searching
 * @author tyrone
 *
 */
public class IndexHTML {
 private String DocsPath=null;

 /**
  * the path for index file;
  */
 private String IndexFilePath=null;

 /**
  * true during deletion pass
  */
   private boolean deleting = false;
   /**
    * existing index
    */
   private IndexReader reader;
   /**
    * new index being built
    */
   private IndexWriter writer;
   /**
    * document id iterator
    */
   private TermEnum uidIter;
   
   
 private void indexDocs(File file)throws Exception {
  if (file.isDirectory()) {     // if a directory
   String[] files = file.list();     // list its files
   Arrays.sort(files);     // sort the files
   for (int i = 0; i < files.length; i++)   // recursively index them
    this.indexDocs(new File(file, files[i]));
   
  } else if (file.getPath().endsWith(".html") || // index .html files
    file.getPath().endsWith(".htm") || // index .htm files
    file.getPath().endsWith(".txt")) { // index .txt files
   
   if (this.uidIter != null) {
    String uid = HTMLDocument.uid(file); // construct uid for doc
    
    while (uidIter.term() != null && uidIter.term().field() == "uid" &&
      uidIter.term().text().compareTo(uid) < 0) {
     if (deleting) {     // delete stale docs
      System.out.println("deleting " +
        HTMLDocument.uid2url(uidIter.term().text()));
      reader.delete(uidIter.term());
     }
     uidIter.next();
    }
    if (uidIter.term() != null && uidIter.term().field() == "uid" &&
      uidIter.term().text().compareTo(uid) == 0) {
     uidIter.next();     // keep matching docs
    } else if (!deleting) {     // add new docs
     Document doc = HTMLDocument.Document(file);
     System.out.println("adding " + doc.get("url"));
     writer.addDocument(doc);
    }
   } else { // creating a new index
    Document doc = HTMLDocument.Document(file);
    System.out.println("adding " + doc.get("url"));
    writer.addDocument(doc); // add docs unconditionally
   }
  }
  return;
 }
 
   /** 
    * Walk directory hierarchy in uid order, while keeping uid iterator from
    * existing index in sync.  Mismatches indicate one of: 
    * (a) old documents to be deleted; 
    * (b) unchanged documents, to be left alone; 
    * or (c) new documents, to be indexed.
    */

   private void indexDocs(File file, String index, boolean create)
        throws Exception {
    if (!create) {      // incrementally update
     
     reader = IndexReader.open(index);    // open existing index
     uidIter = reader.terms(new Term("uid", "")); // init uid iterator
     
     this.indexDocs(file);
     
     if (deleting) {      // delete rest of stale docs
      while (uidIter.term() != null && uidIter.term().field() == "uid") {
       System.out.println("deleting " +
         HTMLDocument.uid2url(uidIter.term().text()));
       reader.delete(uidIter.term());
       uidIter.next();
      }
      deleting = false;
     }
     
     uidIter.close();      // close uid iterator
     reader.close();      // close existing index
     
    } else       // don‘t have exisiting
     this.indexDocs(file);    
    
   }
   /**
    * if create=true, create a new index, else refresh old index.
    * @param create
    */
 public void run(boolean create) {
     try {
        String index = "index";
        File root = null;
   if (this.IndexFilePath!=null) {    // index file path
     index = this.IndexFilePath;
   }
      if (this.DocsPath==null){
         System.out.println("root directory is not set");
         return;
        }
        root = new File(this.DocsPath);
        Date start = new Date();
        /**
         * not create then maintenance
         */
        if (!create) {      // delete stale docs
         this.deleting = true;
         this.indexDocs(root, index, create);
        }

        writer = new IndexWriter(index, new StandardAnalyzer(), create);
        writer.maxFieldLength = 1000000;

        this.indexDocs(root, index, create);    // add new docs

        System.out.println("Optimizing index...");
        writer.optimize();
        writer.close();

        Date end = new Date();

        System.out.print(end.getTime() - start.getTime());
        System.out.println(" total milliseconds");
      } catch (Exception e) {
        System.out.println(" caught a " + e.getClass() +
      "\n with message: " + e.getMessage());
      }
  return;
 }

 /**
  * @return Returns the IndexFilePath.
  */
 public String getIndexFilePath() {
  return IndexFilePath;
 }
 /**
  * @param IndexFilePath The IndexFilePath to set.
  */
 public void setIndexFilePath(String property1) {
  this.IndexFilePath = property1;
 }
 /**
  * @return Returns the DocsPath.
  */
 public String getDocsPath() {
  return DocsPath;
 }
 /**
  * @param DocsPath The DocsPath to set.
  */
 public void setDocsPath(String property1) {
  this.DocsPath = property1;
 }

 /**
  * test
  * @param args
  */
 public static void main(String[] args){
  IndexHTML ih=new IndexHTML();
  ih.setDocsPath("D:\\MyProject\\colimas\\clms-doc2\\html");
  ih.setIndexFilePath("D:\\MyProject\\colimas\\index");
  ih.run(true);
 }
}

运行后会在索引目录下生成3个文件：_3i8.cfs、deletable、segments。

搜索文件类：

/*
 * Created on 2005/07/28
 *
 * TODO To change the template for this generated file go to
 * Window - Preferences - Java - Code Style - Code Templates
 */
package com.nova.colimas.search.query;

/**
 * Simple mutable holder for one search result: the title, file path and URL of a matched
 * document. Every property is {@code null} until its setter is called.
 *
 * @author tyrone
 */
public class HitsHTMLDoc {

    /** Title of the matched document. */
    private String title;

    /** File path of the matched document. */
    private String path;

    /** URL of the matched document. */
    private String url;

    /**
     * @return the URL, or {@code null} if none was set
     */
    public String getUrl() {
        return this.url;
    }

    /**
     * @param property1 the URL to store
     */
    public void setUrl(String property1) {
        url = property1;
    }

    /**
     * @return the file path, or {@code null} if none was set
     */
    public String getPath() {
        return this.path;
    }

    /**
     * @param property1 the file path to store
     */
    public void setPath(String property1) {
        path = property1;
    }

    /**
     * @return the title, or {@code null} if none was set
     */
    public String getTitle() {
        return this.title;
    }

    /**
     * @param property1 the title to store
     */
    public void setTitle(String property1) {
        title = property1;
    }
}

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Hits;
import org.apache.lucene.queryParser.QueryParser;
/**
 * @author tyrone
 *
 * TODO To change the template for this generated type comment go to
 * Window - Preferences - Java - Code Style - Code Templates
 */
public class SearchFiles {
 
   private Hits hits;
 
   public Hits getHits(){
    return hits;
   }
 
   public HitsHTMLDoc[] run(String indexFilePath,String line){
    HitsHTMLDoc[] hitdocs;
    try {
     Searcher searcher = new IndexSearcher(indexFilePath);
     Analyzer analyzer = new StandardAnalyzer();
     Query query = QueryParser.parse(line, "contents", analyzer);
     System.out.println("Searching for: " + query.toString("contents"));
     this.hits = searcher.search(query);
     if (this.hits.length()==0) return null;
     System.out.println(this.hits.length() + " total matching documents");
     hitdocs=new HitsHTMLDoc[this.hits.length()];
     for (int i = 0; i < hits.length(); i++) {
      Document doc = this.hits.doc(i);
      String path = doc.get("path");
      if (path != null) {
       hitdocs[i].setPath(path);
      } else {
       String url=doc.get("url");
       if (url != null) {
        hitdocs[i]=new HitsHTMLDoc();
        hitdocs[i].setUrl(url);
        String title=doc.get("title");
        if (title!=null)
         hitdocs[i].setTitle(title);
       } else {
        System.out.println(i + ". " + "No path nor URL for this document");
       }
      }
      
     }  
     searcher.close();
     return hitdocs;
    }catch(Exception e){
     System.out.println(" caught a " + e.getClass() +
       "\n with message: " + e.getMessage());     
    }
    return null;
   }
   /**
    * test
    * args=queries
    * @author tyrone
    *
    */
   public static void main(String[] args){
    SearchFiles se=new SearchFiles();
    String query="";
    HitsHTMLDoc[] hitsdoc;
    for (int i=0;i<args.length;i++)
     query=query+args[i]+" ";
    hitsdoc=se.run("D:\\MyProject\\colimas\\index",query);
    if (hitsdoc==null){
     System.out.println("nothing");
     return;
    }
    for (int l=0;l<hitsdoc.length;l++){
     System.out.println("url:"+hitsdoc[l].getUrl());
     System.out.println("path:"+hitsdoc[l].getPath());
     System.out.println("title:"+hitsdoc[l].getTitle());
    }
   }

}

注意事项：

1 调试引用了Lucene的应用程序时，虽然并不需要下面的jar包，但每次都会提示URLClassPath.class异常；为方便起见，建议还是下载这些jar包。
relaxngDatatype.jar
commons-beanutils.jar
commons-collections.jar
commons-digester.jar
commons-logging.jar
commons-validator.jar
jakarta-oro.jar
struts-legacy.jar

2 生成index文件的目录里不能有其他目录；如果有，程序会试图删除它们或者报错。

本站仅提供存储服务，所有内容均由用户发布，如发现有害或侵权内容，请点击举报。