Word:
Import org.apache.lucene.document.Document;
Import Org.apache.lucene.document.Field;
Import Org.apache.poi.hwpf.extractor.WordExtractor;
Import Java.io.File;
Import Java.io.InputStream;
Import Java.io.FileInputStream;
Import Com.search.code.Index;
Public Document getdocument (index index, string URL, string title, InputStream is) throws Doccenterexception {
String bodyText = null;
try {
Wordextractor ex = new Wordextractor (IS),//is is the inputstream of the Word file
BodyText = Ex.gettext ();
if (!bodytext.equals ("")) {
Index. Addindex (URL, title, bodyText);
}
}catch (Doccenterexception e) {
throw new Doccenterexception ("Cannot extract content from this mocriosoft Word document", e);
}catch (Exception e) {
E.printstacktrace ();
}
}
return null;
}
Excel:
Import org.apache.lucene.document.Document;
Import Org.apache.lucene.document.Field;
Import Org.apache.poi.hwpf.extractor.WordExtractor;
Import Org.apache.poi.hssf.usermodel.HSSFWorkbook;
Import Org.apache.poi.hssf.usermodel.HSSFSheet;
Import Org.apache.poi.hssf.usermodel.HSSFRow;
Import Org.apache.poi.hssf.usermodel.HSSFCell;
Import Java.io.File;
Import Java.io.InputStream;
Import Java.io.FileInputStream;
Import Com.search.code.Index;
Public Document getdocument (index index, string URL, string title, InputStream is) throws Doccenterexception {
StringBuffer content = new StringBuffer ();
try{
Hssfworkbook workbook = new Hssfworkbook (IS);//Create a reference to the Excel workbook file
for (int numsheets = 0; numsheets < workbook.getnumberofsheets (); numsheets++) {
if (null!= workbook.getsheetat (numsheets)) {
Hssfsheet Asheet = Workbook.getsheetat (numsheets);//Get a sheet
for (int rownumofsheet = 0; Rownumofsheet <= asheet.getlastrownum (); rownumofsheet++) {
if (null!= asheet.getrow (Rownumofsheet)) {
Hssfrow Arow = Asheet.getrow (Rownumofsheet); Get a line
for (short cellnumofrow = 0; Cellnumofrow <= arow.getlastcellnum (); cellnumofrow++) {
if (null!= Arow.getcell (Cellnumofrow)) {
Hssfcell Acell = Arow.getcell (Cellnumofrow);//Get column value
Content.append (Acell.getstringcellvalue ());
}
}
}
}
}
}
if (!content.equals ("")) {
Index. Addindex (URL, title, content.tostring ());
}
}catch (Doccenterexception e) {
throw new Doccenterexception ("Cannot extract content from this mocriosoft Word document", e);
}catch (Exception e) {
System.out.println ("has run Xlread ():" + E);
}
return null;
}
Powerpoint:
Import Java.io.InputStream;
Import org.apache.lucene.document.Document;
Import Org.apache.poi.hslf.HSLFSlideShow;
Import Org.apache.poi.hslf.model.TextRun;
Import Org.apache.poi.hslf.model.Slide;
Import Org.apache.poi.hslf.usermodel.SlideShow;
Public Document getdocument (index index, string URL, string title, InputStream is)
Throws Doccenterexception {
StringBuffer content = new StringBuffer ("");
try{
Slideshow SS = new Slideshow (new Hslfslideshow (IS)),//is for file InputStream, establishing slideshow
slide[] Slides = ss.getslides ()//Get each slide
for (int i=0;i<slides.length;i++) {
textrun[] t = slides[i].gettextruns ();//To get the text content of the slide, build the Textrun
for (int j=0;j<t.length;j++) {
Content.append (T[j].gettext ());//This will add the text to the content.
}
Content.append (Slides[i].gettitle ());
}
Index. Addindex (URL, title, content.tostring ());
}catch (Exception ex) {
System.out.println (Ex.tostring ());
}
return null;
}
PDF:
Import Java.io.InputStream;
Import java.io.IOException;
Import org.apache.lucene.document.Document;
Import org.pdfbox.cos.COSDocument;
Import Org.pdfbox.pdfparser.PDFParser;
Import org.pdfbox.pdmodel.PDDocument;
Import org.pdfbox.pdmodel.PDDocumentInformation;
Import Org.pdfbox.util.PDFTextStripper;
Import Com.search.code.Index;
Public Document getdocument (index index, string URL, string title, InputStream is) throws Doccenterexception {
Cosdocument cosdoc = null;
try {
Cosdoc = Parsedocument (IS);
catch (IOException e) {
Closecosdocument (Cosdoc);
throw new Doccenterexception ("Unable to process the PDF document", E);
}
if (cosdoc.isencrypted ()) {
if (Cosdoc!= null)
Closecosdocument (Cosdoc);
throw new Doccenterexception ("The PDF document is an encrypted document and cannot be processed");
}
String doctext = null;
try {
Pdftextstripper stripper = new Pdftextstripper ();
DocText = Stripper.gettext (new pddocument (Cosdoc));
catch (IOException e) {
Closecosdocument (Cosdoc);
throw new Doccenterexception ("Unable to process the PDF document", E);
}
PDDocument pddoc = null;
try {
Pddoc = new PDDocument (Cosdoc);
Pddocumentinformation DocInfo = Pddoc.getdocumentinformation ();
if (Docinfo.gettitle ()!=null &&!docinfo.gettitle (). Equals ("")) {
title = Docinfo.gettitle ();
}
catch (Exception e) {
Closecosdocument (Cosdoc);
Closepddocument (Pddoc);
SYSTEM.ERR.PRINTLN ("Cannot get metadata for this PDF document" + e.getmessage ());
finally {
Closecosdocument (Cosdoc);
Closepddocument (Pddoc);
}
return null;
}
private static Cosdocument Parsedocument (InputStream is) throws IOException {
Pdfparser parser = new Pdfparser (IS);
Parser.parse ();
return Parser.getdocument ();
}
private void Closecosdocument (Cosdocument cosdoc) {
if (Cosdoc!= null) {
try {
Cosdoc.close ();
catch (IOException e) {
}
}
}
private void Closepddocument (PDDocument pddoc) {
if (Pddoc!= null) {
try {
Pddoc.close ();
catch (IOException e) {
}
}
}
The code may contain copy-and-paste errors, but it has been tested and works with POI 3.0-rc4 and PDFBox 0.7.3.
POI: http://jakarta.apache.org/poi/index.html
PDFBox: http://www.pdfbox.org/