Use Lucene to generate an HTML file index

Source: Internet
Author: User

I modified the indexhtml class of the Lucene demo package so that it can be called by other Java classes.

Import org. Apache. Lucene. analysis. Standard. standardanalyzer;
Import org.apache.e.doc ument. Document;
Import org. Apache. Lucene. Index. indexreader;
Import org. Apache. Lucene. Index. indexwriter;
Import org. Apache. Lucene. Index. term;
Import org. Apache. Lucene. Index. termenum;
Import java. Io. file;
Import java. util. date;
Import java. util. arrays;

// Other classes of the demo need to be called.
Import org. Apache. Lucene. Demo;
/**
* Create HTML file index for searching
* @ Author Tyrone
*
*/
Public class indexhtml {
Private string docspath = NULL;

/**
* The path for index file;
*/
Private string indexfilepath = NULL;

/**
* True during deletion pass
*/
Private Boolean deleting = false;
/**
* Existing Index
*/
Private indexreader reader;
/**
* New index being built
*/
Private indexwriter writer;
/**
* Document ID iterator
*/
Private termenum uiditer;


Private void indexdocs (File file) throws exception {
If (file. isdirectory () {// If a directory
String [] files = file. List (); // list its files
Arrays. Sort (files); // sort the files
For (INT I = 0; I <files. length; I ++) // recursively index them
This. indexdocs (new file (file, files [I]);

} Else if (file. getpath (). endswith (". html") | // index. html files
File. getpath (). endswith (". htm") | // index. HTM files
File. getpath (). endswith (". txt") {// index. txt files

If (this. uiditer! = NULL ){
String uid = htmldocument. UID (File); // construct uid for Doc

While (uiditer. term ()! = NULL & uiditer. term (). Field () = "uid "&&
Uiditer. term (). Text (). compareto (UID) <0 ){
If (deleting) {// Delete stale docs
System. Out. println ("deleting" +
Htmldocument. uid2url (uiditer. term (). Text ()));
Reader. Delete (uiditer. term ());
}
Uiditer. Next ();
}
If (uiditer. term ()! = NULL & uiditer. term (). Field () = "uid "&&
Uiditer. term (). Text (). compareto (UID) = 0 ){
Uiditer. Next (); // keep matching docs
} Else if (! Deleting) {// Add new docs
Document Doc = htmldocument. Document (File );
System. Out. println ("adding" + Doc. Get ("url "));
Writer. adddocument (DOC );
}
} Else {// creating a new index
Document Doc = htmldocument. Document (File );
System. Out. println ("adding" + Doc. Get ("url "));
Writer. adddocument (DOC); // Add docs unconditionally
}
}
Return;
}
 
/**
* Walk directory hierarchy in uid order, while keeping uid iterator from
* Existing index in sync. mismatches indicate one:
* (A) Old documents to be deleted;
* (B) unchanged documents, to be left alone;
* Or (c) new documents, to be indexed.
*/

Private void indexdocs (File file, string index, Boolean create)
Throws exception {
If (! Create) {// incrementally update

Reader = indexreader. Open (INDEX); // open existing index
Uiditer = reader. Terms (new term ("uid", ""); // init uid iterator

This. indexdocs (File );

If (deleting) {// delete rest of stale docs
While (uiditer. term ()! = NULL & uiditer. term (). Field () = "uid "){
System. Out. println ("deleting" +
Htmldocument. uid2url (uiditer. term (). Text ()));
Reader. Delete (uiditer. term ());
Uiditer. Next ();
}
Deleting = false;
}

Uiditer. Close (); // close uid iterator
Reader. Close (); // close existing index

} Else // don't have exisiting
This. indexdocs (File );

}
/**
* If create = true, create a new index, else refresh old index.
* @ Param create
*/
Public void run (Boolean create ){
Try {
String Index = "Index ";
File root = NULL;
If (this. indexfilepath! = NULL) {// index file path
Index = This. indexfilepath;
}
If (this. docspath = NULL ){
System. Out. println ("root directory is not set ");
Return;
}
Root = new file (this. docspath );
Date start = new date ();
/**
* Not create then maintenance
*/
If (! Create) {// Delete stale docs
This. Deleting = true;
This. indexdocs (root, index, create );
}

Writer = new indexwriter (index, new standardanalyzer (), create );
Writer. maxfieldlength = 1000000;

This. indexdocs (root, index, create); // Add new docs

System. Out. println ("optimizing index ...");
Writer. Optimize ();
Writer. Close ();

Date end = new date ();

System. Out. Print (end. gettime ()-start. gettime ());
System. Out. println ("Total milliseconds ");
} Catch (exception e ){
System. Out. println ("caught a" + E. getclass () +
"/N with message:" + E. getmessage ());
}
Return;
}

/**
* @ Return returns the indexfilepath.
*/
Public String getindexfilepath (){
Return indexfilepath;
}
/**
* @ Param indexfilepath the indexfilepath to set.
*/
Public void setindexfilepath (string property1 ){
This. indexfilepath = property1;
}
/**
* @ Return returns the docspath.
*/
Public String getdocspath (){
Return docspath;
}
/**
* @ Param docspath the docspath to set.
*/
Public void setdocspath (string property1 ){
This. docspath = property1;
}

/**
* Test
* @ Param ARGs
*/
Public static void main (string [] ARGs ){
Indexhtml ih = new indexhtml ();
Ih. setdocspath ("D: // myproject // colimas // clms-doc2 // html ");
Ih. setindexfilepath ("D: // myproject // colimas // Index ");
Ih. Run (true );
}
}

Running this generates three index files: _3i8.cfs, deletable, and segments.

Search File class:

/*
* Created on 2005/07/28
*
* Todo to change the template for this generated file go
* Window-preferences-Java-code style-code templates
*/
Package com. Nova. colimas. Search. query;

/**
 * Simple value holder for one search hit: the title, file path and URL
 * read from a matching indexed document. Every property is null until set.
 *
 * @author Tyrone
 */
public class HitsHTMLDoc {

    private String title;

    private String path;

    private String url;

    /**
     * @return the URL, or null if none was set
     */
    public String getUrl() {
        return url;
    }

    /**
     * @param url the URL to set
     */
    public void setUrl(String url) {
        this.url = url;
    }

    /**
     * @return the path, or null if none was set
     */
    public String getPath() {
        return path;
    }

    /**
     * @param path the path to set
     */
    public void setPath(String path) {
        this.path = path;
    }

    /**
     * @return the title, or null if none was set
     */
    public String getTitle() {
        return title;
    }

    /**
     * @param title the title to set
     */
    public void setTitle(String title) {
        this.title = title;
    }
}

Import org. Apache. Lucene. analysis. analyzer;
Import org. Apache. Lucene. analysis. Standard. standardanalyzer;
Import org.apache.e.doc ument. Document;
Import org. Apache. Lucene. Search. searcher;
Import org. Apache. Lucene. Search. indexsearcher;
Import org. Apache. Lucene. Search. query;
Import org. Apache. Lucene. Search. Hits;
Import org. Apache. Lucene. queryparser. queryparser;
/**
* @ Author Tyrone
*
* Todo to change the template for this generated type comment go
* Window-preferences-Java-code style-code templates
*/
Public class searchfiles {
 
Private hits;
 
Public hits gethits (){
Return hits;
}
 
Public hitshtmldoc [] Run (string indexfilepath, string line ){
Hitshtmldoc [] hitdocs;
Try {
Searcher searcher = new indexsearcher (indexfilepath );
Analyzer analyzer = new standardanalyzer ();
Query query = queryparser. parse (line, "contents", analyzer );
System. Out. println ("Searching for:" + query. tostring ("contents "));
This. Hits = searcher. Search (query );
If (this. Hits. Length () = 0) return NULL;
System. Out. println (this. Hits. Length () + "Total matching documents ");
Hitdocs = new hitshtmldoc [This. Hits. Length ()];
For (INT I = 0; I Document Doc = this.hits.doc (I );
String Path = Doc. Get ("path ");
If (path! = NULL ){
Hitdocs [I]. setpath (PATH );
} Else {
String url = Doc. Get ("url ");
If (URL! = NULL ){
Hitdocs [I] = new hitshtmldoc ();
Hitdocs [I]. seturl (URL );
String title = Doc. Get ("title ");
If (title! = NULL)
Hitdocs [I]. settitle (title );
} Else {
System. Out. println (I + "." + "no path nor URL for this document ");
}
}

}
Searcher. Close ();
Return hitdocs;
} Catch (exception e ){
System. Out. println ("caught a" + E. getclass () +
"/N with message:" + E. getmessage ());
}
Return NULL;
}
/**
* Test
* ARGs = queries
* @ Author Tyrone
*
*/
Public static void main (string [] ARGs ){
Searchfiles Se = new searchfiles ();
String query = "";
Hitshtmldoc [] hitsdoc;
For (INT I = 0; I <args. length; I ++)
Query = query + ARGs [I] + "";
Hitsdoc = Se. Run ("D: // myproject // colimas // Index", query );
If (hitsdoc = NULL ){
System. Out. println ("nothing ");
Return;
}
For (int l = 0; L System. Out. println ("url:" + hitsdoc [l]. geturl ());
System. Out. println ("Path:" + hitsdoc [l]. getpath ());
System. Out. println ("title:" + hitsdoc [l]. gettitle ());
}
}

}

Note:

1. The following JAR packages are not strictly required when you debug an application that references Lucene, but without them a URLClassPath.class exception occurs every time, so download them for convenience.
Relaxngdatatype. Jar
Commons-beanutils.jar
Commons-collections.jar
Commons-digester.jar
Commons-logging.jar
Commons-validator.jar
Jakarta-oro.jar
Struts-legacy.jar

2. The directory in which the index files are generated must not contain any other directories. If it does, an attempt will be made to delete them, or an error will be reported.

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please send us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.