Full-text search of Word, PDF, Excel, and PPT files with Lucene (source code)

Last Update:2015-08-25 Source: Internet

Author: User

Tags: gettext, html, header

Developer on Alibaba Cloud: Build your first app with APIs, SDKs, and tutorials on Alibaba Cloud. Read more >

CREATE INDEX: Import java.io.bufferedreader;import java.io.File;   Import Java.io.fileinputstream;import Java.io.filenotfoundexception;import Java.io.FileReader;   Import java.io.IOException; Import Java.io.inputstream;import java.io.inputstreamreader;import java.io.reader;import java.io.StringReader;     Import Java.text.simpledateformat;import java.util.Date;   Import Org.apache.lucene.analysis.standard.StandardAnalyzer;   Import Org.apache.lucene.document.DateTools;   Import org.apache.lucene.document.Document;   Import Org.apache.lucene.document.Field;   Import Org.apache.lucene.index.IndexWriter;   Import Org.apache.lucene.store.Directory;   Import Org.apache.lucene.store.SimpleFSDirectory;   Import org.apache.lucene.util.Version; Import Org.apache.pdfbox.pdfparser.pdfparser;import Org.apache.pdfbox.pdmodel.pddocument;import Org.apache.pdfbox.util.pdftextstripper;import Org.apache.poi.hslf.hslfslideshow;import Org.apache.poi.hslf.model.slide;import Org.apache.poi.hslf.model.textrun;impoRT Org.apache.poi.hslf.usermodel.richtextrun;import Org.apache.poi.hslf.usermodel.slideshow;import Org.apache.poi.hssf.usermodel.hssfcell;import Org.apache.poi.hssf.usermodel.hssfdateutil;import Org.apache.poi.hssf.usermodel.hssfrow;import Org.apache.poi.hssf.usermodel.hssfsheet;import Org.apache.poi.hssf.usermodel.hssfworkbook;import Org.apache.poi.hwpf.hwpfdocument;import Org.apache.poi.hwpf.usermodel.paragraph;import Org.apache.poi.hwpf.usermodel.range;import Org.apache.poi.poifs.filesystem.documententry;import Org.apache.poi.poifs.filesystem.documentinputstream;import Org.apache.poi.poifs.filesystem.poifsfilesystem;import org.apache.poi.util.littleendian;/** * CREATE INDEX Lucene 3.0+ * @aut Hor Administrator * */public class Indexer {/** * @param args * @throws Exception */PU Blic static void Main (string[] args) throws Exception {//Where the index file is saved String Indexdir = "Data\\test\\ind           Exdir "; Where the txt file will be searched String Datedir = 
"Data\\test\\datedir";           IndexWriter indexwriter = null;           Create directory objects Directory dir = new Simplefsdirectory (new File (Indexdir)); Create a IndexWriter object,//The first parameter is directory, the second is a word breaker,//The third represents whether it is created, if False is modified on this basis,//fourth represents the maximum value of the word breaker, for example, new M  Axfieldlength (2), which means two words a minute,//general with IndexWriter.MaxFieldLength.LIMITED IndexWriter = new IndexWriter (dir,new           StandardAnalyzer (version.lucene_30), true, IndexWriter.MaxFieldLength.UNLIMITED);           file[] files = new File (datedir). Listfiles ();        for (int i = 0; i < files.length; i++) {Document doc = null;               if (Files[i].getname (). EndsWith (". txt")) {doc = new Document ();                Create a Field object and put in the Doc object Doc.add (new field ("Contents", New FileReader (Files[i])); Doc.add (New Field ("FileName", Files[i].getname (), Field.Store.YES, Field.Index.NOT_ANAL               yzed)); Doc.adD (New Field ("Indexdate", Datetools.datetostring (New Date (), DateTools.Resolution.DAY), Field.store.yes,field.ind Ex.         not_analyzed));        }else if (Files[i].getname (). EndsWith (". Doc")) {doc = GetDocument (files[i]);        }else if (Files[i].getname (). EndsWith (". ppt")) {doc = Getppt (files[i]);        }else if (Files[i].getname (). EndsWith (". xls")) {doc = Getexcel (files[i]);         }else if (Files[i].getname (). EndsWith (". pdf")) {doc = Getpdf (files[i]);               }else{doc = new Document ();                Create a Field object and put in the Doc object Doc.add (new field ("Contents", New FileReader (Files[i])); Doc.add (New Field ("FileName", Files[i].getname (), Field.Store.YES, Field.Index.NOT_ANAL               yzed)); Doc.add (New Field ("Indexdate", Datetools.datetostring (New Date (), DateTools.Resolution.DAY), Field.store.yes,fie Ld.              
           index.not_analyzed));     }//Write IndexWriter   if (doc!= null) indexwriter.adddocument (DOC);        }//See how many index System.out.println are in IndexWriter ("Numdocs:" +indexwriter.numdocs ());            Indexwriter.close (); public static Document getdocument (file file) throws Exception {String DocPath = File.getabsolutepath (); String title = File.getname ();//create Documentdocument document = new document ();/*inputstream inputstream = null; Reader contents = null;try {InputStream = new FileInputStream (file);} catch (FileNotFoundException e) {e.printstacktrace ( );} Wordextractor extractor = new Wordextractor ()//try{//poifsfilesystem Fsys = new Poifsfilesystem (inputstream);// Documententry headerprops =//(documententry) fsys.getroot () getentry ("Worddocument");//documentinputstream din = f  Sys.createdocumentinputstream ("Worddocument");//byte[] Header = new byte[headerprops.getsize ()]; Din.read (header);//din.close (); int info = Littleendian.getshort (header, 0xa);//if ((Info & 0x4)! = 0)//{//throw NewFastsavedexception ("fast-saved files is unsupported at this time")//}//if ((Info & 0x100)! = 0)//{//throw new Passwo Rdprotectedexception ("This document is password protected");//}//}finally{//}try {contents = new StringReader ( Extractor.extracttext (InputStream));} catch (Exception e) {e.printstacktrace ();} */stringbuffer contents = new StringBuffer ("");//Document contents try {FileInputStream fs = new FileInputStream (docpat            h);            Hwpfdocument doc = new hwpfdocument (FS);            Range range = Doc.getrange ();                int paragraphcount = range.numparagraphs ();//paragraph for (int i = 0; i < Paragraphcount; i++) {//traversal paragraph read data                Paragraph pp = range.getparagraph (i);            Contents.append (Pp.text ()); }} catch (Exception e) {} String cont = Contents.tostring (). 
Trim ();d Ocument.add (New Field ("Filenam E ", title, field.store.yes,field.index.analyzed));//tokenized//document.add (New Field (" Contents ", contents));d OCUment.add (new Field ("Contents", cont,field.store.yes,field.index.analyzed)),//document.add (new Field ("Path", DocPath, field.store.yes,field.index.analyzed));d Ocument.add (New Field ("Indexdate", datetools.datetostring (new        Date (), DateTools.Resolution.DAY), field.store.yes,field.index.not_analyzed)); return document;} public static Document getppt (File pptfile) throws ioexception{String DocPath = Pptfile.getabsolutepath ();            String title = Pptfile.getname ();    StringBuffer contents = new StringBuffer ("");//document content InputStream is = new FileInputStream (pptfile);    Slideshow ppt = new Slideshow (new Hslfslideshow (IS));    slide[] Slides = ppt.getslides ();    Extract text information/*for (Slide each:slides) {//system.out.println ("title:" + Each.gettitle ());    System.out.println ("content:");    textrun[] Textruns = Each.gettextruns ();    for (int i=0;i< textruns.length; i++) {//system.out.println (Textruns[i].gettext ()); richtextrun[] Richtextruns = TextrunS[i].getrichtextruns ();    for (int j = 0; J < Richtextruns.length; J + +) {//system.out.println (Richtextruns[j].gettext ());    Contents.append (Richtextruns[j].gettext ());    }} contents.append (Each.gettitle ());             }*/for (int i=0;i <slides.length;i++) {textrun[] t = slides[i].gettextruns ();//In order to get the text content of the slide, set up Textrun             for (int j=0;j <t.length;j++) {contents.append (T[j].gettext ());//The text is added to the content         }//contents.append (Slides[i].gettitle ());    Document document = new document (); String cont = contents.tostring (). 
Trim ();d Ocument.add (New Field ("filename", title, Field.Store.YES, Field.Index.ANALYZED)//tokenized//document.add (new Field ("Contents", contents));d Ocument.add (" Contents ", cont,field.store.yes,field.index.analyzed));//document.add (New Field (" path ", DocPath, Field.Store.YES, Field.Index.ANALYZED));d Ocument.add (New Field ("Indexdate", Datetools.datetostring (New Date (), DatetooLs.    Resolution.day), field.store.yes,field.index.not_analyzed));    return document; public static Document getpdf (file pdf) {String Pdfpath = Pdf.getabsolutepath ();//create input stream to read pdf file String title = PDF.G Etname (); String result = ""; FileInputStream is = null; PDDocument doc = null;try {is = new FileInputStream (PDF); Pdfparser parser = new Pdfparser (IS);p arser.parse ();d oc = Parser.getpddocument (); Pdftextstripper stripper = new Pdftextstripper (); result = Stripper.gettext (doc);} catch (Exception e) {e.printstacktrace ();} finally {if (is = = null) {try {is.close ();} catch (Exception e) {e.printstacktr Ace ();}} if (doc! = null) {try {doc.close ()} catch (Exception e) {e.printstacktrace ();}}} Document document = new document ();d Ocument.add (New Field ("filename", title, field.store.yes,field.index.analyzed)); /tokenizeddocument.add (New Field ("contents", result, field.store.yes,field.index.analyzed));//document.add (new Field ("Path", Pdfpath, field.store.yes,field.index.analyzed)); return doCument;}        public static Document Getexcel (File fileexcel) throws Exception {InputStream is = new FileInputStream (fileexcel);        StringBuffer content = new StringBuffer ();        Hssfworkbook workbook = new Hssfworkbook (IS); for (int numsheets = 0; numsheets < workbook.getnumberofsheets (); numsheets++) {Hssfsheet Asheet = workbook            . 
Getsheetat (numsheets);//obtain a sheet content.append ("\ n");            if (null = = Asheet) {continue;               } for (int rowNum = 0; RowNum <= asheet.getlastrownum (); rownum++) {content.append ("\ n");               Hssfrow Arow = Asheet.getrow (RowNum);               if (null = = Arow) {continue;  } for (short cellnum = 0; Cellnum <= arow.getlastcellnum (); cellnum++) {Hssfcell Acell                   = Arow.getcell (Cellnum);                   if (null = = Acell) {continue;            }       if (acell.getcelltype () = = hssfcell.cell_type_string) {content.append (Acell.getrichstringcell                   Value (). getString ()); } else if (acell.getcelltype () = = Hssfcell.cell_type_numeric) {Boolean b = Hssfdateutil.iscelldatefo                      Rmatted (Acell);                          if (b) {Date date = Acell.getdatecellvalue ();                          SimpleDateFormat df = new SimpleDateFormat ("Yyyy-mm-dd");                      Content.append (Df.format (date));        }}}}} String cont = Content.tostring (); Document document = new document ();d Ocument.add (New Field ("FileName", Fileexcel.getname (), Field.Store.YES, Field.Index.ANALYZED));//tokenizeddocument.add (New Field ("Contents", Cont, field.store.yes,field.index.analyzed))     ;//document.add (New Field ("path", Pdfpath, field.store.yes,field.index.analyzed); return document;      }  public static string readhtml (String urlstring) {StringBuffer content = new StringBuffer ("");        File File = new file (urlstring);        FileInputStream FIS = null;            try {fis = new FileInputStream (file); Read page BufferedReader reader = new BufferedReader (FIS, "utf-8");//Here            The character encoding to note, to the HTML header file consistency, otherwise it will be garbled String line = null;            while (line = Reader.readline ())! 
= null) {Content.append (line + "\ n");        } reader.close ();        } catch (Exception e) {e.printstacktrace ();        } String contentstring = Content.tostring ();    return contentstring;  }}

Search index

     Import Java.io.File;     Import java.io.IOException;   Import Org.apache.lucene.analysis.standard.StandardAnalyzer;   Import org.apache.lucene.document.Document;   Import org.apache.lucene.queryParser.ParseException;   Import Org.apache.lucene.queryParser.QueryParser;   Import Org.apache.lucene.search.IndexSearcher;   Import Org.apache.lucene.search.Query;   Import Org.apache.lucene.search.ScoreDoc;   Import Org.apache.lucene.search.TopDocs;   Import Org.apache.lucene.store.Directory;   Import Org.apache.lucene.store.SimpleFSDirectory;   Import org.apache.lucene.util.Version; /** * Search Index Lucene 3.0+ * @author Administrator * */public class Searcher {public static void main (Strin G[] args) throws IOException, ParseException {//Where to save the index file String indexdir = "Data\\test\\indexdir"         ;           Directory dir = new Simplefsdirectory (new File (Indexdir)); To create a Indexsearcher object, this parameter will provide an indexed directory when compared to the IndexWriter object Indexsearcher indexseArch = new Indexsearcher (dir); Create the Queryparser object, the first parameter represents the Lucene version, the second represents the field to search for fields, and the third indicates that the search uses a word breaker queryparser queryparser = new Queryparser (versio           N.lucene_30, "Contents", New StandardAnalyzer (version.lucene_30));           Generate Query Object Query query = Queryparser.parse ("ArcGIS");           Search results Topdocs inside there are scoredocs[] array, which holds the index value topdocs hits = Indexsearch.search (query,10);           Hits.totalhits said the total number of search System.out.println ("found" +hits.totalhits+ "a"); Loop Hits.scoredocs the data and use the Indexsearch.doc method to restore the document, and then take out the corresponding field value for (int i = 0; i < hits.scoreDocs.length; i+               +) {Scoredoc SDoC = hits.scoredocs[i];            Document doc = 
Indexsearch.doc (sdoc.doc);        System.out.println (Doc.get ("filename"));       } indexsearch.close ();  }   }

Full-text search of Word, PDF, Excel, and PPT files with Lucene (source code)

This article is an English version of an article originally published in Chinese on aliyun.com and is provided for information purposes only. This website makes no representation or warranty of any kind, either expressed or implied, as to the accuracy, completeness, ownership, or reliability of the article or any translations thereof. If you have any concerns or complaints relating to the article, please send an email, providing a detailed description of the concern or complaint, to info-contact@alibabacloud.com. A staff member will contact you within 5 working days. Once verified, infringing content will be removed immediately.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support 6 Free Tickets per Quarter Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More