package office;

/**
 * Reads DOC, Excel, PDF and HTML files and generates TXT files from them; also reads a TXT file
 * and generates an Excel file.
 *
 * @author javaalpha
 * @date 2011-8-1
 * @version V 1.0
 */
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

import javax.swing.text.BadLocationException;
import javax.swing.text.DefaultStyledDocument;
import javax.swing.text.rtf.RTFEditorKit;

import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;
import org.textmining.text.extraction.WordExtractor;

Public class readoffice {/*** @ Param ARGs */public static void main (string [] ARGs) {// readdoc ("E:/1.doc "); // readexcel ("E:/1.xls"); // readpdf (" E:/1.20."); // readhtml ("E:/1.html"); readhtmlall ("E: /1.html ");}/*** create a TXT file and write the file content ** @ Param text */static void createtxtandwritedoc (string text, string path) {fileoutputstream Fos = NULL; fileoutputstream out = NULL; try {// create an output file stream. If the file exists, delete the file F = ne first. W file (PATH); If (F. exists () {f. delete ();} Fos = new fileoutputstream (f); out = new fileoutputstream (f); byte [] B = text. getbytes ("gb2312"); out. write (B); out. flush (); system. out. println ("file generation... ");} catch (exception e) {system. out. println ("exception:" + E);} finally {try {If (null! = FOS) {FOS. Close () ;}catch (ioexception e) {e. printstacktrace () ;}try {If (null! = Out) {out. close () ;}} catch (ioexception e) {e. printstacktrace ();} Fos = NULL; out = NULL;}/*** read doc file ** @ Param dir * @ throws exception */static void readdoc (string DIR) {// create an input stream to read the DOC file fileinputstream in = NULL; wordextractor extractor = NULL; string text = NULL; try {In = new fileinputstream (new file (DIR )); // create wordextractorextractor = new wordextractor (); // extract text = extractor from the doc file. extracttex T (in); system. out. println ("text1:" + text);} catch (filenotfoundexception e) {e. printstacktrace ();} catch (exception e) {e. printstacktrace ();} finally {try {If (null! = In) {In. close () ;}} catch (ioexception e) {e. 
printstacktrace () ;}in = NULL;} // write the file content createtxtandwritedoc (text, "E:/doc.txt ");} /*** read the Excel file ** @ Param dir */@ suppresswarnings ("deprecation") Static void readexcel (string DIR) {/*** @ Param filepath * file path * @ return refers to the Excel content read */stringbuffer buff = new stringbuffer (); try {// create a reference to the Excel Workbook file hssfworkbook WB = new hssfworkbook (New fileinputstream (Dir ); // Create a reference to the worksheet. For (INT numsheets = 0; numsheets <WB. getnumberofsheets (); numsheets ++) {If (null! = WB. getsheetat (numsheets) {hssfsheet asheet = WB. getsheetat (numsheets); // obtain a sheetfor (INT rownumofsheet = 0; rownumofsheet <= asheet. getlastrownum (); rownumofsheet ++) {If (null! = Asheet. getrow (rownumofsheet) {hssfrow Arow = asheet. getrow (rownumofsheet); // get a row for (INT cellnumofrow = 0; cellnumofrow <= Arow. getlastcellnum (); cellnumofrow ++) {If (null! = Arow. getcell (short) cellnumofrow) {hssfcell acell = Arow. getcell (short) cellnumofrow); // obtain the column value switch (acell. getcelltype () {Case hssfcell. cell_type_formula: break; Case hssfcell. cell_type_numeric: buff. append (acell. getnumericcellvalue ()). append (''); break; Case hssfcell. cell_type_string: buff. append (acell. getstringcellvalue ()). append (''); break ;}} buff. append ('') ;}}} catch (filenotfoundexception E) {e. printstacktrace ();} catch (ioexception e) {e. printstacktrace ();} // write the file content createtxtandwritedoc (buff. tostring (), "E:/excel.txt");}/*** read the PowerPoint file ** @ Param dir */static void readppt (string DIR) {}/*** read the PDF file ** @ Param dir */static void readpdf (string DIR) {string result = NULL; fileinputstream is = NULL; pddocument document = NULL; try {is = new fileinputstream (DIR); partition parser = ne W pdfparser (is); parser. parse (); document = parser. 
getpddocument (); extends textstripper stripper = new extends textstripper (); Result = stripper. gettext (document);} catch (filenotfoundexception e) {e. printstacktrace ();} catch (ioexception e) {e. printstacktrace ();} finally {If (null! = Is) {try {is. Close () ;}catch (ioexception e) {e. printstacktrace () ;}} if (null! = Document) {try {document. close ();} catch (ioexception e) {e. printstacktrace () ;}}// write the file content createtxtandwritedoc (result, "E:/cmd.txt ");} /*** // read the PDF file ** @ Param file * @ throws exception */Public void readpdf (string file) throws exception {// sort Boolean sort = false; // PDF file name string pdffile = file; // input text file name string textfile = NULL; // encoding string encoding = "gb2312 "; // start page extraction int startpage = 1; // end Number of extracted pages int endpage = integer. max_value; // file input stream, generating a text file writer output = NULL; // PDF documentpddocument document stored in the memory = NULL; try {try {// first load the file as a URL. If an exception occurs, load the File URL from the local file system // URL = new URL (pdffile ); // note that the parameter is not a URL of a previous version. but file. Document = pddocument. load (pdffile); // obtain the PDF file name string filename = URL. getFile (); // name the newly generated TXT file if (filename. length ()> 4) {file outputfile = new file (filename. substring (0, filename. length ()-4) + ". TXT "); textfile = outputfile. getname () ;}} catch (malformedurlexception e) {// if an exception occurs when loading as a URL, load it from the file system. // note that the parameter is not a URL of a previous version. but file. Document = pddocument. load (pdffile); If (pdffile. length ()> 4) {textfile = pdffile. substring (0, pdffile. length ()-4) + ". 
TXT ";}}// file input stream, Write File textfileoutput = new outputstreamwriter (New fileoutputstream (textfile), encoding); // extract textstripper to extract text into textstripper stripper = NULL; stripper = new sort textstripper (); // you can specify whether to sort stripper. setsortbyposition (SORT); // set the start page of stripper. setstartpage (startpage); // sets the end System. out. print (stripper. gettext (document); stripper. setendpage (endpage); // call javastextstripper's writetext to extract and output the stripper text. writetext (document, output);} finally {If (output! = NULL) {// close the output stream output. Close () ;}if (document! = NULL) {// close the PDF documentdocument. close () ;}}/*** read the TXT file ** @ Param filepath * @ return * @ throws exception */Public String gettextfromtxt (string filepath) throws exception {filereader Fr = new filereader (filepath); bufferedreader BR = new bufferedreader (FR); stringbuffer buff = new stringbuffer (); string temp = NULL; while (temp = BR. readline ())! = NULL) {buff. append (temp + "");} BR. close (); Return buff. tostring ();}/*** read the content of the RTF File ** @ Param filepath * @ return */Public String gettextfromrtf (string filepath) {string result = NULL; file file = new file (filepath); try {defastystyleddocument styleddoc = new defaultstyleddocument (); inputstream is = new fileinputstream (File); New rtfeditorkit (). read (is, styleddoc, 0); Result = new string (styleddoc. get Text (0, styleddoc. getlength ()). getbytes ("iso8859_1"); // extract text. to read Chinese characters, use iso8859_1 encoding. Otherwise, garbled characters may occur.} catch (ioexception e) {e. printstacktrace ();} catch (badlocationexception e) {e. 
printstacktrace ();} return result;}/*** @ Param filepath * file path * @ return get all HTML content */public static string readhtml (string filepath) {bufferedreader BR = NULL; stringbuffer sb = new stringbuffer (); try {BR = new bufferedreader (New Inputstreamreader (New fileinputstream (filepath), "gb2312"); string temp = NULL; while (temp = Br. Readline ())! = NULL) {sb. append (temp) ;}} catch (filenotfoundexception e) {e. printstacktrace ();} catch (ioexception e) {e. printstacktrace ();} // write the file content createtxtandwritedoc (sb. tostring (), "E:/html.txt"); return sb. tostring ();}/*** @ Param filepath * file path * @ return HTML text content */public static void readhtmlall (string filepath) {// obtain the body TAG content string STR = readhtml (filepath); stringbuffer buff = new stringbuffer (); int Maxindex = Str. length ()-1; int begin = 0; int end; // content between> and <while (begin = Str. indexof ('>', begin) <maxindex) {end = Str. indexof ('<', begin); If (end-begin> 1) {buff. append (Str. substring (++ begin, end);} begin = end + 1;} // write the file content createtxtandwritedoc (buff. tostring (), "E:/htmlall.txt"); // return buff. tostring ();}/*** read the file (Text File) in the unit of behavior ** @ Param filepath */public static void readfilebyline (Str Ing filepath) {file = new file (filepath); bufferedreader BD = NULL; Map <string, string> STR = new hashmap <string, string> (); string S1 = ""; string S2 = ""; try {BD = new bufferedreader (New inputstreamreader (New fileinputstream (file), "gb2312 ")); // encoding conversion (key) string temp = ""; int line = 1; while (temp = BD. readline ())! = NULL) {If (temp. length ()> 0) {S1 = temp. substring (0, 3); S1 = s1.trim (); S2 = temp. substring (4); S2 = s2.trim (); Str. put (S1, S2) ;}++ line ;}createexcel (STR) ;}catch (filenotfoundexception e) {e. printstacktrace ();} catch (ioexception e) {e. printstacktrace ();} finally {try {If (BD! = NULL) BD. 
close ();} catch (ioexception e) {e. printstacktrace () ;}}/ *** output Excel file. The output format is ** @ Param map */@ suppresswarnings ({"deprecation ", "unchecked"}) Static void createexcel (Map <string, string> map) {try {// create an output file stream fileoutputstream fout = new fileoutputstream ("E:/2.xls "); file file = new file ("E:/2.xls"); If (file. exists () {file. delete () ;}// create a new Excel Workbook hssfworkbook workbook = new hssfworkbook (); // create a worksheet in the Excel Workbook, its name is the default value. // to create a worksheet named "Contact user name and phone number", the statement is hssfsheet sheet = workbook. createsheet ("Contact user name and phone number"); hssfrow ROW = NULL; // create a cell (upper left) hssfcell cell1 = NULL at the index 0; hssfcell cell2 = NULL; iterator iter = map. entryset (). iterator (); int I = 0; while (ITER. hasnext () {map. entry entry = (map. entry) ITER. next (); object key = entry. getkey (); object val = entry. getvalue (); ROW = sheet. createrow (short) I ++); cell1 = row. createcell (short) 0); cell2 = row. createcell (short) 1); // defines the cell as the string type cell1.setcelltype (hssfcell. cell_type_string); cell2.setcelltype (hssfcell. cell_type_string); // enter some cell1.setcellvalue (key. tostring (); cell2.setcellvalue (Val. tostring (); if (I> 255) {break ;}// Save the corresponding workbook to the workbook. write (fout); fout. flush (); // The operation ends. close the file fout. close (); system. out. println ("file generation... ");} catch (exception e) {system. out. println ("exception:" + E );}}}