CREATE INDEX: Import java.io.bufferedreader;import java.io.File; Import Java.io.fileinputstream;import Java.io.filenotfoundexception;import Java.io.FileReader; Import java.io.IOException; Import Java.io.inputstream;import java.io.inputstreamreader;import java.io.reader;import java.io.StringReader; Import Java.text.simpledateformat;import java.util.Date; Import Org.apache.lucene.analysis.standard.StandardAnalyzer; Import Org.apache.lucene.document.DateTools; Import org.apache.lucene.document.Document; Import Org.apache.lucene.document.Field; Import Org.apache.lucene.index.IndexWriter; Import Org.apache.lucene.store.Directory; Import Org.apache.lucene.store.SimpleFSDirectory; Import org.apache.lucene.util.Version; Import Org.apache.pdfbox.pdfparser.pdfparser;import Org.apache.pdfbox.pdmodel.pddocument;import Org.apache.pdfbox.util.pdftextstripper;import Org.apache.poi.hslf.hslfslideshow;import Org.apache.poi.hslf.model.slide;import Org.apache.poi.hslf.model.textrun;impoRT Org.apache.poi.hslf.usermodel.richtextrun;import Org.apache.poi.hslf.usermodel.slideshow;import Org.apache.poi.hssf.usermodel.hssfcell;import Org.apache.poi.hssf.usermodel.hssfdateutil;import Org.apache.poi.hssf.usermodel.hssfrow;import Org.apache.poi.hssf.usermodel.hssfsheet;import Org.apache.poi.hssf.usermodel.hssfworkbook;import Org.apache.poi.hwpf.hwpfdocument;import Org.apache.poi.hwpf.usermodel.paragraph;import Org.apache.poi.hwpf.usermodel.range;import Org.apache.poi.poifs.filesystem.documententry;import Org.apache.poi.poifs.filesystem.documentinputstream;import Org.apache.poi.poifs.filesystem.poifsfilesystem;import org.apache.poi.util.littleendian;/** * CREATE INDEX Lucene 3.0+ * @aut Hor Administrator * */public class Indexer {/** * @param args * @throws Exception */PU Blic static void Main (string[] args) throws Exception {//Where the index file is saved String Indexdir = "Data\\test\\ind Exdir "; Where the txt file will be searched String Datedir = "Data\\test\\datedir"; IndexWriter 
indexwriter = null; Create directory objects Directory dir = new Simplefsdirectory (new File (Indexdir)); Create a IndexWriter object,//The first parameter is directory, the second is a word breaker,//The third represents whether it is created, if False is modified on this basis,//fourth represents the maximum value of the word breaker, for example, new M Axfieldlength (2), which means two words a minute,//general with IndexWriter.MaxFieldLength.LIMITED IndexWriter = new IndexWriter (dir,new StandardAnalyzer (version.lucene_30), true, IndexWriter.MaxFieldLength.UNLIMITED); file[] files = new File (datedir). Listfiles (); for (int i = 0; i < files.length; i++) {Document doc = null; if (Files[i].getname (). EndsWith (". txt")) {doc = new Document (); Create a Field object and put in the Doc object Doc.add (new field ("Contents", New FileReader (Files[i])); Doc.add (New Field ("FileName", Files[i].getname (), Field.Store.YES, Field.Index.NOT_ANAL yzed)); Doc.adD (New Field ("Indexdate", Datetools.datetostring (New Date (), DateTools.Resolution.DAY), Field.store.yes,field.ind Ex. not_analyzed)); }else if (Files[i].getname (). EndsWith (". Doc")) {doc = GetDocument (files[i]); }else if (Files[i].getname (). EndsWith (". ppt")) {doc = Getppt (files[i]); }else if (Files[i].getname (). EndsWith (". xls")) {doc = Getexcel (files[i]); }else if (Files[i].getname (). EndsWith (". pdf")) {doc = Getpdf (files[i]); }else{doc = new Document (); Create a Field object and put in the Doc object Doc.add (new field ("Contents", New FileReader (Files[i])); Doc.add (New Field ("FileName", Files[i].getname (), Field.Store.YES, Field.Index.NOT_ANAL yzed)); Doc.add (New Field ("Indexdate", Datetools.datetostring (New Date (), DateTools.Resolution.DAY), Field.store.yes,fie Ld. 
index.not_analyzed)); }//Write IndexWriter if (doc!= null) indexwriter.adddocument (DOC); }//See how many index System.out.println are in IndexWriter ("Numdocs:" +indexwriter.numdocs ()); Indexwriter.close (); public static Document getdocument (file file) throws Exception {String DocPath = File.getabsolutepath (); String title = File.getname ();//create Documentdocument document = new document ();/*inputstream inputstream = null; Reader contents = null;try {InputStream = new FileInputStream (file);} catch (FileNotFoundException e) {e.printstacktrace ( );} Wordextractor extractor = new Wordextractor ()//try{//poifsfilesystem Fsys = new Poifsfilesystem (inputstream);// Documententry headerprops =//(documententry) fsys.getroot () getentry ("Worddocument");//documentinputstream din = f Sys.createdocumentinputstream ("Worddocument");//byte[] Header = new byte[headerprops.getsize ()]; Din.read (header);//din.close (); int info = Littleendian.getshort (header, 0xa);//if ((Info & 0x4)! = 0)//{//throw NewFastsavedexception ("fast-saved files is unsupported at this time")//}//if ((Info & 0x100)! = 0)//{//throw new Passwo Rdprotectedexception ("This document is password protected");//}//}finally{//}try {contents = new StringReader ( Extractor.extracttext (InputStream));} catch (Exception e) {e.printstacktrace ();} */stringbuffer contents = new StringBuffer ("");//Document contents try {FileInputStream fs = new FileInputStream (docpat h); Hwpfdocument doc = new hwpfdocument (FS); Range range = Doc.getrange (); int paragraphcount = range.numparagraphs ();//paragraph for (int i = 0; i < Paragraphcount; i++) {//traversal paragraph read data Paragraph pp = range.getparagraph (i); Contents.append (Pp.text ()); }} catch (Exception e) {} String cont = Contents.tostring (). 
Trim ();d Ocument.add (New Field ("Filenam E ", title, field.store.yes,field.index.analyzed));//tokenized//document.add (New Field (" Contents ", contents));d OCUment.add (new Field ("Contents", cont,field.store.yes,field.index.analyzed)),//document.add (new Field ("Path", DocPath, field.store.yes,field.index.analyzed));d Ocument.add (New Field ("Indexdate", datetools.datetostring (new Date (), DateTools.Resolution.DAY), field.store.yes,field.index.not_analyzed)); return document;} public static Document getppt (File pptfile) throws ioexception{String DocPath = Pptfile.getabsolutepath (); String title = Pptfile.getname (); StringBuffer contents = new StringBuffer ("");//document content InputStream is = new FileInputStream (pptfile); Slideshow ppt = new Slideshow (new Hslfslideshow (IS)); slide[] Slides = ppt.getslides (); Extract text information/*for (Slide each:slides) {//system.out.println ("title:" + Each.gettitle ()); System.out.println ("content:"); textrun[] Textruns = Each.gettextruns (); for (int i=0;i< textruns.length; i++) {//system.out.println (Textruns[i].gettext ()); richtextrun[] Richtextruns = TextrunS[i].getrichtextruns (); for (int j = 0; J < Richtextruns.length; J + +) {//system.out.println (Richtextruns[j].gettext ()); Contents.append (Richtextruns[j].gettext ()); }} contents.append (Each.gettitle ()); }*/for (int i=0;i <slides.length;i++) {textrun[] t = slides[i].gettextruns ();//In order to get the text content of the slide, set up Textrun for (int j=0;j <t.length;j++) {contents.append (T[j].gettext ());//The text is added to the content }//contents.append (Slides[i].gettitle ()); Document document = new document (); String cont = contents.tostring (). 
Trim ();d Ocument.add (New Field ("filename", title, Field.Store.YES, Field.Index.ANALYZED)//tokenized//document.add (new Field ("Contents", contents));d Ocument.add (" Contents ", cont,field.store.yes,field.index.analyzed));//document.add (New Field (" path ", DocPath, Field.Store.YES, Field.Index.ANALYZED));d Ocument.add (New Field ("Indexdate", Datetools.datetostring (New Date (), DatetooLs. Resolution.day), field.store.yes,field.index.not_analyzed)); return document; public static Document getpdf (file pdf) {String Pdfpath = Pdf.getabsolutepath ();//create input stream to read pdf file String title = PDF.G Etname (); String result = ""; FileInputStream is = null; PDDocument doc = null;try {is = new FileInputStream (PDF); Pdfparser parser = new Pdfparser (IS);p arser.parse ();d oc = Parser.getpddocument (); Pdftextstripper stripper = new Pdftextstripper (); result = Stripper.gettext (doc);} catch (Exception e) {e.printstacktrace ();} finally {if (is = = null) {try {is.close ();} catch (Exception e) {e.printstacktr Ace ();}} if (doc! = null) {try {doc.close ()} catch (Exception e) {e.printstacktrace ();}}} Document document = new document ();d Ocument.add (New Field ("filename", title, field.store.yes,field.index.analyzed)); /tokenizeddocument.add (New Field ("contents", result, field.store.yes,field.index.analyzed));//document.add (new Field ("Path", Pdfpath, field.store.yes,field.index.analyzed)); return doCument;} public static Document Getexcel (File fileexcel) throws Exception {InputStream is = new FileInputStream (fileexcel); StringBuffer content = new StringBuffer (); Hssfworkbook workbook = new Hssfworkbook (IS); for (int numsheets = 0; numsheets < workbook.getnumberofsheets (); numsheets++) {Hssfsheet Asheet = workbook . 
Getsheetat (numsheets);//obtain a sheet content.append ("\ n"); if (null = = Asheet) {continue; } for (int rowNum = 0; RowNum <= asheet.getlastrownum (); rownum++) {content.append ("\ n"); Hssfrow Arow = Asheet.getrow (RowNum); if (null = = Arow) {continue; } for (short cellnum = 0; Cellnum <= arow.getlastcellnum (); cellnum++) {Hssfcell Acell = Arow.getcell (Cellnum); if (null = = Acell) {continue; } if (acell.getcelltype () = = hssfcell.cell_type_string) {content.append (Acell.getrichstringcell Value (). getString ()); } else if (acell.getcelltype () = = Hssfcell.cell_type_numeric) {Boolean b = Hssfdateutil.iscelldatefo Rmatted (Acell); if (b) {Date date = Acell.getdatecellvalue (); SimpleDateFormat df = new SimpleDateFormat ("Yyyy-mm-dd"); Content.append (Df.format (date)); }}}}} String cont = Content.tostring (); Document document = new document ();d Ocument.add (New Field ("FileName", Fileexcel.getname (), Field.Store.YES, Field.Index.ANALYZED));//tokenizeddocument.add (New Field ("Contents", Cont, field.store.yes,field.index.analyzed)) ;//document.add (New Field ("path", Pdfpath, field.store.yes,field.index.analyzed); return document; } public static string readhtml (String urlstring) {StringBuffer content = new StringBuffer (""); File File = new file (urlstring); FileInputStream FIS = null; try {fis = new FileInputStream (file); Read page BufferedReader reader = new BufferedReader (FIS, "utf-8");//Here The character encoding to note, to the HTML header file consistency, otherwise it will be garbled String line = null; while (line = Reader.readline ())! = null) {Content.append (line + "\ n"); } reader.close (); } catch (Exception e) {e.printstacktrace (); } String contentstring = Content.tostring (); return contentstring; }}
Search the index:
Import Java.io.File; Import java.io.IOException; Import Org.apache.lucene.analysis.standard.StandardAnalyzer; Import org.apache.lucene.document.Document; Import org.apache.lucene.queryParser.ParseException; Import Org.apache.lucene.queryParser.QueryParser; Import Org.apache.lucene.search.IndexSearcher; Import Org.apache.lucene.search.Query; Import Org.apache.lucene.search.ScoreDoc; Import Org.apache.lucene.search.TopDocs; Import Org.apache.lucene.store.Directory; Import Org.apache.lucene.store.SimpleFSDirectory; Import org.apache.lucene.util.Version; /** * Search Index Lucene 3.0+ * @author Administrator * */public class Searcher {public static void main (Strin G[] args) throws IOException, ParseException {//Where to save the index file String indexdir = "Data\\test\\indexdir" ; Directory dir = new Simplefsdirectory (new File (Indexdir)); To create a Indexsearcher object, this parameter will provide an indexed directory when compared to the IndexWriter object Indexsearcher indexseArch = new Indexsearcher (dir); Create the Queryparser object, the first parameter represents the Lucene version, the second represents the field to search for fields, and the third indicates that the search uses a word breaker queryparser queryparser = new Queryparser (versio N.lucene_30, "Contents", New StandardAnalyzer (version.lucene_30)); Generate Query Object Query query = Queryparser.parse ("ArcGIS"); Search results Topdocs inside there are scoredocs[] array, which holds the index value topdocs hits = Indexsearch.search (query,10); Hits.totalhits said the total number of search System.out.println ("found" +hits.totalhits+ "a"); Loop Hits.scoredocs the data and use the Indexsearch.doc method to restore the document, and then take out the corresponding field value for (int i = 0; i < hits.scoreDocs.length; i+ +) {Scoredoc SDoC = hits.scoredocs[i]; Document doc = Indexsearch.doc (sdoc.doc); System.out.println (Doc.get ("filename")); } indexsearch.close (); } }
Source code for full-text search of Word, PDF, Excel, and PPT files with Lucene