WORD:
Import org.apache.e.doc ument. Document;
Import org.apache.e.doc ument. Field;
Import org. apache. poi. hwpf. extractor. WordExtractor;
Import java. io. File;
Import java. io. InputStream;
Import java. io. FileInputStream;
Import com. search. code. Index;
Public Document getDocument (Index index, String url, String title, InputStream is) throws DocCenterException {
String bodyText = null;
Try {
WordExtractor ex = new WordExtractor (is); // is the InputStream of the word file
BodyText = ex. getText ();
If (! BodyText. equals ("")){
Index. AddIndex (url, title, bodyText );
}
} Catch (DocCenterException e ){
Throw new DocCenterException ("cannot extract content from this Mocriosoft Word file", e );
} Catch (Exception e ){
E. printStackTrace ();
}
}
Return null;
}
Excel:
Import org.apache.e.doc ument. Document;
Import org.apache.e.doc ument. Field;
Import org. apache. poi. hwpf. extractor. WordExtractor;
Import org. apache. poi. hssf. usermodel. HSSFWorkbook;
Import org. apache. poi. hssf. usermodel. HSSFSheet;
Import org. apache. poi. hssf. usermodel. HSSFRow;
Import org. apache. poi. hssf. usermodel. HSSFCell;
Import java. io. File;
Import java. io. InputStream;
Import java. io. FileInputStream;
Import com. search. code. Index;
Public Document getDocument (Index index, String url, String title, InputStream is) throws DocCenterException {
StringBuffer content = new StringBuffer ();
Try {
HSSFWorkbook workbook = new HSSFWorkbook (is); // create a reference to an Excel workbook File
For (int numSheets = 0; numSheets <workbook. getNumberOfSheets (); numSheets ++ ){
If (null! = Workbook. getSheetAt (numSheets )){
HSSFSheet aSheet = workbook. getSheetAt (numSheets); // obtain a sheet
For (int rowNumOfSheet = 0; rowNumOfSheet <= aSheet. getLastRowNum (); rowNumOfSheet ++ ){
If (null! = ASheet. getRow (rowNumOfSheet )){
HSSFRow aRow = aSheet. getRow (rowNumOfSheet); // obtain a row
For (short cellNumOfRow = 0; cellNumOfRow <= aRow. getLastCellNum (); cellNumOfRow ++ ){
If (null! = ARow. getCell (cellNumOfRow )){
HSSFCell aCell = aRow. getCell (cellNumOfRow); // obtain the column Value
Content. append (aCell. getStringCellValue ());
}
}
}
}
}
}
If (! Content. equals ("")){
Index. AddIndex (url, title, content. toString ());
}
} Catch (DocCenterException e ){
Throw new DocCenterException ("cannot extract content from this Mocriosoft Word file", e );
} Catch (Exception e ){
System. out. println ("xlRead ():" + e );
}
Return null;
}
PowerPoint:
Import java. io. InputStream;
Import org.apache.e.doc ument. Document;
Import org. apache. poi. hslf. HSLFSlideShow;
Import org. apache. poi. hslf. model. TextRun;
Import org. apache. poi. hslf. model. Slide;
Import org. apache. poi. hslf. usermodel. SlideShow;
Public Document getDocument (Index index, String url, String title, InputStream is)
Throws DocCenterException {
StringBuffer content = new StringBuffer ("");
Try {
SlideShow ss = new SlideShow (new HSLFSlideShow (is); // is the InputStream of the file, and the SlideShow
Slide [] slides = ss. getSlides (); // obtain each Slide
For (int I = 0; I <slides. length; I ++ ){
TextRun [] t = slides [I]. getTextRuns (); // create a TextRun
For (int j = 0; j <t. length; j ++ ){
Content. append (t [j]. getText (); // The text content is added to the content.
}
Content. append (slides [I]. getTitle ());
}
Index. AddIndex (url, title, content. toString ());
} Catch (Exception ex ){
System. out. println (ex. toString ());
}
Return null;
}
PDF:
Import java. io. InputStream;
Import java. io. IOException;
Import org.apache.e.doc ument. Document;
Import orgdomainbox. cos. COSDocument;
Import org.w.boxw.parser. extends parser;
Import orgdomainbox. pdmodel. PDDocument;
Import orgdomainbox. pdmodel. PDDocumentInformation;
Import orgdomainbox. util. extends textstripper;
Import com. search. code. Index;
Public Document getDocument (Index index, String url, String title, InputStream is) throws DocCenterException {
COSDocument cosDoc = null;
Try {
CosDoc = parseDocument (is );
} Catch (IOException e ){
CloseCOSDocument (cosDoc );
Throw new DocCenterException ("unable to process this PDF Document", e );
}
If (cosDoc. isEncrypted ()){
If (cosDoc! = Null)
CloseCOSDocument (cosDoc );
Throw new DocCenterException ("This PDF document is encrypted and cannot be processed ");
}
String docText = null;
Try {
Required textstripper stripper = new required textstripper ();
DocText = stripper. getText (new PDDocument (cosDoc ));
} Catch (IOException e ){
CloseCOSDocument (cosDoc );
Throw new DocCenterException ("unable to process this PDF Document", e );
}
PDDocument pdDoc = null;
Try {
PdDoc = new PDDocument (cosDoc );
PDDocumentInformation docInfo = pdDoc. getDocumentInformation ();
If (docInfo. getTitle ()! = Null &&! DocInfo. getTitle (). equals ("")){
Title = docInfo. getTitle ();
}
} Catch (Exception e ){
CloseCOSDocument (cosDoc );
ClosePDDocument (pdDoc );
System. err. println ("unable to obtain the metadata of this PDF document" + e. getMessage ());
} Finally {
CloseCOSDocument (cosDoc );
ClosePDDocument (pdDoc );
}
Return null;
}
Private static COSDocument parseDocument (InputStream is) throws IOException {
Partition parser = new partition parser (is );
Parser. parse ();
Return parser. getDocument ();
}
Private void closeCOSDocument (COSDocument cosDoc ){
If (cosDoc! = Null ){
Try {
CosDoc. close ();
} Catch (IOException e ){
}
}
}
Private void closePDDocument (PDDocument pdDoc ){
If (pdDoc! = Null ){
Try {
PdDoc. close ();
} Catch (IOException e ){
}
}
}
Copying and pasting the code may introduce errors, but the code itself has been tested and works. The POI version used is 3.0-rc4, and the PDFBox version is 0.7.3.
POI: http://jakarta.apache.org/poi/index.html
PDFBox: http://www.pdfbox.org/