Environment preparation
TXT files are read with Apache Commons IO
PDF files are read with PDFBox
All other formats are read with Apache POI
About POI: reading XLS needs nothing special; the extra work is for reading DOC and PPT.
For those you need to download the POI source code, then either copy all the files under poi-src-3.7-20101029.zip\poi-3.7\src\scratchpad\src into the project or package them into a JAR yourself.
Jar Package Dependency
Code is as follows:
package test;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.text.NumberFormat;

import org.apache.commons.io.FileUtils;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.usermodel.RichTextRun;
import org.apache.poi.hslf.usermodel.SlideShow;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.xmlbeans.XmlException;
Public class Readfileutils {
/**
* @param args
* @throws Exception
*/
public static void Main (string[] args) throws Exception {
Readfileutils RF = new Readfileutils ();
String s = "";
s = rf.readtxt ("E:/ITSM Document suffix Analysis report 2.txt");
s = rf.readpdf ("e:/memcached Comprehensive anatomy. pdf");
s = Rf.readexcel ("e:/copy workload and cost template. xls");
s = rf.readexcel2007 ("e:/function point estimation scheme. xlsx");
//s = Rf.readword ("E:/pms Chinese. Doc");
//s = rf.readword2007 ("e:/function point estimation method. docx");
//s = rf.readppt ("e:/Refinement Management Information System Project report v1.0.ppt");
s = rf.readppt2007 ("e:/Refinement Management Information System Project report V1.0.pptx");
System.out.println (s);
}
Read ppt
public string readppt (string file) throws IOException {
StringBuilder sb = new StringBuilder ();
Slideshow ppt = new Slideshow (new Hslfslideshow (file));
slide[] Slides = ppt.getslides ();
Extracting text information
for (Slide each:slides) {
textrun[] Textruns = Each.gettextruns ();
for (int i=0;i< textruns.length; i++) {
richtextrun[] Richtextruns = textruns. Getrichtextruns ();
for (int j = 0; J < Richtextruns.length; J + +) {
Sb.append (Richtextruns[j].gettext ());
}
Sb.append ("\ n");
}
Sb.append ("\ n");
}
return sb.tostring ();
}
Read pptx
public string readPPT2007 (string file) throws IOException, XmlException, openxml4jexception {
return new Xslfpowerpointextractor (poixmldocument.openpackage (file)). GetText ();
}
Read XLS file
public string Readexcel (string file) throws IOException {
StringBuilder content = new StringBuilder ();
Hssfworkbook workbook = new Hssfworkbook (new FileInputStream);//Create a reference to an Excel workbook file
for (int numsheets = 0; numsheets < workbook.getnumberofsheets (); numsheets++) {
if (null! = Workbook.getsheetat (numsheets)) {
Hssfsheet Asheet = Workbook.getsheetat (numsheets);//Get a sheet
for (int rownumofsheet = 0; Rownumofsheet <= asheet
. Getlastrownum (); rownumofsheet++) {
if (null! = Asheet.getrow (Rownumofsheet)) {
Hssfrow Arow = Asheet.getrow (Rownumofsheet); Get a row
for (short cellnumofrow = 0; Cellnumofrow <= arow
. Getlastcellnum (); cellnumofrow++) {
if (null! = Arow.getcell (Cellnumofrow)) {
Hssfcell Acell = Arow.getcell (Cellnumofrow);//Get column values
if (This.convertcell (Acell). Length () > 0) {
Content.append (This.convertcell (Acell));
}
}
Content.append ("\ n");
}
}
}
}
}
return content.tostring ();
}
Read xlsx file
public string readEXCEL2007 (string file) throws IOException {
StringBuilder content = new StringBuilder ();
Xssfworkbook workbook = new Xssfworkbook (file);
for (int numsheets = 0; numsheets < workbook.getnumberofsheets (); numsheets++) {
if (null! = Workbook.getsheetat (numsheets)) {
Xssfsheet Asheet = Workbook.getsheetat (numsheets);//Get a sheet
for (int rownumofsheet = 0; Rownumofsheet <= asheet
. Getlastrownum (); rownumofsheet++) {
if (null! = Asheet.getrow (Rownumofsheet)) {
Xssfrow Arow = Asheet.getrow (Rownumofsheet); Get a row
for (short cellnumofrow = 0; Cellnumofrow <= arow
. Getlastcellnum (); cellnumofrow++) {
if (null! = Arow.getcell (Cellnumofrow)) {
Xssfcell Acell = Arow.getcell (Cellnumofrow);//Get column values
if (This.convertcell (Acell). Length () > 0) {
Content.append (This.convertcell (Acell));
}
}
Content.append ("\ n");
}
}
}
}
}
return content.tostring ();
}
Private String Convertcell (cell cell) {
NumberFormat formater = Numberformat.getinstance ();
Formater.setgroupingused (FALSE);
String cellvalue = "";
if (cell = = null) {
return cellvalue;
}
Switch (Cell.getcelltype ()) {
Case Hssfcell.cell_type_numeric:
Cellvalue = Formater.format (Cell.getnumericcellvalue ());
Break
Case hssfcell.cell_type_string:
Cellvalue = Cell.getstringcellvalue ();
Break
Case Hssfcell.cell_type_blank:
Cellvalue = Cell.getstringcellvalue ();
Break
Case Hssfcell.cell_type_boolean:
Cellvalue = boolean.valueof (Cell.getbooleancellvalue ()). ToString ();
Break
Case HSSFCELL.CELL_TYPE_ERROR:
Cellvalue = string.valueof (Cell.geterrorcellvalue ());
Break
Default
Cellvalue = "";
}
return Cellvalue.trim ();
}
Reading PDF files
public string readpdf (string file) throws IOException {
String result = null;
FileInputStream is = null;
PDDocument document = null;
try {
is = new FileInputStream (file);
Pdfparser parser = new Pdfparser (IS);
Parser.parse ();
Document = Parser.getpddocument ();
Pdftextstripper stripper = new Pdftextstripper ();
result = Stripper.gettext (document);
} finally {
if (is = null) {
Is.close ();
}
if (document! = NULL) {
Document.close ();
}
}
return result;
}
Read Doc file
public string Readword (string file) throws Exception {
String returnstr = "";
try {
Wordextractor wordextractor = new Wordextractor (new FileInputStream (file));
Returnstr = Wordextractor.gettext ();
} catch (FileNotFoundException e) {
E.printstacktrace ();
} catch (IOException e) {
E.printstacktrace ();
}
return returnstr;
}
Read Docx file
public string readWORD2007 (string file) throws Exception {
return new Xwpfwordextractor (poixmldocument.openpackage (file)). GetText ();
}
Read TXT file
public string Readtxt (string file) throws IOException {
String encoding = readfileutils.get_charset (new file);
if (Encoding.equalsignorecase ("GBK")) {
Return fileutils.readfiletostring (new file, "GBK");
} else {
Return fileutils.readfiletostring (new file, "UTF8");
}
}
private static String get_charset (file file) throws IOException {
String charset = "GBK";
byte[] first3bytes = new Byte[3];
Bufferedinputstream bis = null;
try {
Boolean checked = false;
bis = new Bufferedinputstream (new FileInputStream (file));
Bis.mark (0);
int read = Bis.read (first3bytes, 0, 3);
if (read = =-1)
return charset;
if (first3bytes[0] = = (byte) 0xFF && first3bytes[1] = = (byte) 0xFE) {
CharSet = "Utf-16le";
Checked = true;
} else if (first3bytes[0] = = (byte) 0xFE
&& first3bytes[1] = = (byte) 0xFF) {
CharSet = "Utf-16be";
Checked = true;
} else if (first3bytes[0] = = (byte) 0xEF
&& first3bytes[1] = = (byte) 0xBB
&& first3bytes[2] = = (byte) 0xBF) {
CharSet = "UTF-8";
Checked = true;
}
Bis.reset ();
if (!checked) {
int len = 0;
int loc = 0;
while (read = Bis.read ())! =-1) {
loc++;
if (read >= 0xF0)
Break
if (0x80 <= read && read <= 0xBF)//single-occurrence bf below, also considered GBK
Break
if (0xC0 <= read && read <= 0xDF) {
Read = Bis.read ();
if (0x80 <= read && read <= 0xBF)//Double byte (0XC0-0XDF)
(0x80
-0xBF), may also be within GB encoding
Continue
Else
Break
} else if (0xE0 <= read && read <= 0xEF) {//may also be error, but less likely
Read = Bis.read ();
if (0x80 <= read && read <= 0xBF) {
Read = Bis.read ();
if (0x80 <= read && read <= 0xBF) {
CharSet = "UTF-8";
Break
} else
Break
} else
Break
}
}
System.out.println (loc + "" + integer.tohexstring (Read)
// );
}
} catch (Exception e) {
E.printstacktrace ();
} finally {
if (bis! = null) {
Bis.close ();
}
}
return charset;
}
}
Reading txt/pdf/xls/xlsx/doc/docx/ppt/pptx files in Java