Read DOC, Excel, PDF, and HTML files to generate a TXT file; read a TXT file to generate an Excel file

Source: Internet
Author: User
Tags file url

 

package Office;

/*
 * Reads DOC, Excel, PDF and HTML files and generates TXT files;
 * reads a TXT file and generates an Excel file.
 *
 * Author:  javaalpha
 * Date:    2011-8-1
 * Version: V1.0
 */

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

import javax.swing.text.BadLocationException;
import javax.swing.text.DefaultStyledDocument;
import javax.swing.text.rtf.RTFEditorKit;

import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;
import org.textmining.text.extraction.WordExtractor;

 
Public class readoffice {/*** @ Param ARGs */public static void main (string [] ARGs) {// readdoc ("E:/1.doc "); // readexcel ("E:/1.xls"); // readpdf (" E:/1.20."); // readhtml ("E:/1.html"); readhtmlall ("E: /1.html ");}/*** create a TXT file and write the file content ** @ Param text */static void createtxtandwritedoc (string text, string path) {fileoutputstream Fos = NULL; fileoutputstream out = NULL; try {// create an output file stream. If the file exists, delete the file F = ne first. W file (PATH); If (F. exists () {f. delete ();} Fos = new fileoutputstream (f); out = new fileoutputstream (f); byte [] B = text. getbytes ("gb2312"); out. write (B); out. flush (); system. out. println ("file generation... ");} catch (exception e) {system. out. println ("exception:" + E);} finally {try {If (null! = FOS) {FOS. Close () ;}catch (ioexception e) {e. printstacktrace () ;}try {If (null! = Out) {out. close () ;}} catch (ioexception e) {e. printstacktrace ();} Fos = NULL; out = NULL;}/*** read doc file ** @ Param dir * @ throws exception */static void readdoc (string DIR) {// create an input stream to read the DOC file fileinputstream in = NULL; wordextractor extractor = NULL; string text = NULL; try {In = new fileinputstream (new file (DIR )); // create wordextractorextractor = new wordextractor (); // extract text = extractor from the doc file. extracttex T (in); system. out. println ("text1:" + text);} catch (filenotfoundexception e) {e. printstacktrace ();} catch (exception e) {e. printstacktrace ();} finally {try {If (null! = In) {In. close () ;}} catch (ioexception e) {e. 
printstacktrace () ;}in = NULL;} // write the file content createtxtandwritedoc (text, "E:/doc.txt ");} /*** read the Excel file ** @ Param dir */@ suppresswarnings ("deprecation") Static void readexcel (string DIR) {/*** @ Param filepath * file path * @ return refers to the Excel content read */stringbuffer buff = new stringbuffer (); try {// create a reference to the Excel Workbook file hssfworkbook WB = new hssfworkbook (New fileinputstream (Dir ); // Create a reference to the worksheet. For (INT numsheets = 0; numsheets <WB. getnumberofsheets (); numsheets ++) {If (null! = WB. getsheetat (numsheets) {hssfsheet asheet = WB. getsheetat (numsheets); // obtain a sheetfor (INT rownumofsheet = 0; rownumofsheet <= asheet. getlastrownum (); rownumofsheet ++) {If (null! = Asheet. getrow (rownumofsheet) {hssfrow Arow = asheet. getrow (rownumofsheet); // get a row for (INT cellnumofrow = 0; cellnumofrow <= Arow. getlastcellnum (); cellnumofrow ++) {If (null! = Arow. getcell (short) cellnumofrow) {hssfcell acell = Arow. getcell (short) cellnumofrow); // obtain the column value switch (acell. getcelltype () {Case hssfcell. cell_type_formula: break; Case hssfcell. cell_type_numeric: buff. append (acell. getnumericcellvalue ()). append (''); break; Case hssfcell. cell_type_string: buff. append (acell. getstringcellvalue ()). append (''); break ;}} buff. append ('') ;}}} catch (filenotfoundexception E) {e. printstacktrace ();} catch (ioexception e) {e. printstacktrace ();} // write the file content createtxtandwritedoc (buff. tostring (), "E:/excel.txt");}/*** read the PowerPoint file ** @ Param dir */static void readppt (string DIR) {}/*** read the PDF file ** @ Param dir */static void readpdf (string DIR) {string result = NULL; fileinputstream is = NULL; pddocument document = NULL; try {is = new fileinputstream (DIR); partition parser = ne W pdfparser (is); parser. parse (); document = parser. 
getpddocument (); extends textstripper stripper = new extends textstripper (); Result = stripper. gettext (document);} catch (filenotfoundexception e) {e. printstacktrace ();} catch (ioexception e) {e. printstacktrace ();} finally {If (null! = Is) {try {is. Close () ;}catch (ioexception e) {e. printstacktrace () ;}} if (null! = Document) {try {document. close ();} catch (ioexception e) {e. printstacktrace () ;}}// write the file content createtxtandwritedoc (result, "E:/cmd.txt ");} /*** // read the PDF file ** @ Param file * @ throws exception */Public void readpdf (string file) throws exception {// sort Boolean sort = false; // PDF file name string pdffile = file; // input text file name string textfile = NULL; // encoding string encoding = "gb2312 "; // start page extraction int startpage = 1; // end Number of extracted pages int endpage = integer. max_value; // file input stream, generating a text file writer output = NULL; // PDF documentpddocument document stored in the memory = NULL; try {try {// first load the file as a URL. If an exception occurs, load the File URL from the local file system // URL = new URL (pdffile ); // note that the parameter is not a URL of a previous version. but file. Document = pddocument. load (pdffile); // obtain the PDF file name string filename = URL. getFile (); // name the newly generated TXT file if (filename. length ()> 4) {file outputfile = new file (filename. substring (0, filename. length ()-4) + ". TXT "); textfile = outputfile. getname () ;}} catch (malformedurlexception e) {// if an exception occurs when loading as a URL, load it from the file system. // note that the parameter is not a URL of a previous version. but file. Document = pddocument. load (pdffile); If (pdffile. length ()> 4) {textfile = pdffile. substring (0, pdffile. length ()-4) + ". 
TXT ";}}// file input stream, Write File textfileoutput = new outputstreamwriter (New fileoutputstream (textfile), encoding); // extract textstripper to extract text into textstripper stripper = NULL; stripper = new sort textstripper (); // you can specify whether to sort stripper. setsortbyposition (SORT); // set the start page of stripper. setstartpage (startpage); // sets the end System. out. print (stripper. gettext (document); stripper. setendpage (endpage); // call javastextstripper's writetext to extract and output the stripper text. writetext (document, output);} finally {If (output! = NULL) {// close the output stream output. Close () ;}if (document! = NULL) {// close the PDF documentdocument. close () ;}}/*** read the TXT file ** @ Param filepath * @ return * @ throws exception */Public String gettextfromtxt (string filepath) throws exception {filereader Fr = new filereader (filepath); bufferedreader BR = new bufferedreader (FR); stringbuffer buff = new stringbuffer (); string temp = NULL; while (temp = BR. readline ())! = NULL) {buff. append (temp + "");} BR. close (); Return buff. tostring ();}/*** read the content of the RTF File ** @ Param filepath * @ return */Public String gettextfromrtf (string filepath) {string result = NULL; file file = new file (filepath); try {defastystyleddocument styleddoc = new defaultstyleddocument (); inputstream is = new fileinputstream (File); New rtfeditorkit (). read (is, styleddoc, 0); Result = new string (styleddoc. get Text (0, styleddoc. getlength ()). getbytes ("iso8859_1"); // extract text. to read Chinese characters, use iso8859_1 encoding. Otherwise, garbled characters may occur.} catch (ioexception e) {e. printstacktrace ();} catch (badlocationexception e) {e. 
printstacktrace ();} return result;}/*** @ Param filepath * file path * @ return get all HTML content */public static string readhtml (string filepath) {bufferedreader BR = NULL; stringbuffer sb = new stringbuffer (); try {BR = new bufferedreader (New Inputstreamreader (New fileinputstream (filepath), "gb2312"); string temp = NULL; while (temp = Br. Readline ())! = NULL) {sb. append (temp) ;}} catch (filenotfoundexception e) {e. printstacktrace ();} catch (ioexception e) {e. printstacktrace ();} // write the file content createtxtandwritedoc (sb. tostring (), "E:/html.txt"); return sb. tostring ();}/*** @ Param filepath * file path * @ return HTML text content */public static void readhtmlall (string filepath) {// obtain the body TAG content string STR = readhtml (filepath); stringbuffer buff = new stringbuffer (); int Maxindex = Str. length ()-1; int begin = 0; int end; // content between> and <while (begin = Str. indexof ('>', begin) <maxindex) {end = Str. indexof ('<', begin); If (end-begin> 1) {buff. append (Str. substring (++ begin, end);} begin = end + 1;} // write the file content createtxtandwritedoc (buff. tostring (), "E:/htmlall.txt"); // return buff. tostring ();}/*** read the file (Text File) in the unit of behavior ** @ Param filepath */public static void readfilebyline (Str Ing filepath) {file = new file (filepath); bufferedreader BD = NULL; Map <string, string> STR = new hashmap <string, string> (); string S1 = ""; string S2 = ""; try {BD = new bufferedreader (New inputstreamreader (New fileinputstream (file), "gb2312 ")); // encoding conversion (key) string temp = ""; int line = 1; while (temp = BD. readline ())! = NULL) {If (temp. length ()> 0) {S1 = temp. substring (0, 3); S1 = s1.trim (); S2 = temp. substring (4); S2 = s2.trim (); Str. put (S1, S2) ;}++ line ;}createexcel (STR) ;}catch (filenotfoundexception e) {e. printstacktrace ();} catch (ioexception e) {e. printstacktrace ();} finally {try {If (BD! = NULL) BD. 
close ();} catch (ioexception e) {e. printstacktrace () ;}}/ *** output Excel file. The output format is ** @ Param map */@ suppresswarnings ({"deprecation ", "unchecked"}) Static void createexcel (Map <string, string> map) {try {// create an output file stream fileoutputstream fout = new fileoutputstream ("E:/2.xls "); file file = new file ("E:/2.xls"); If (file. exists () {file. delete () ;}// create a new Excel Workbook hssfworkbook workbook = new hssfworkbook (); // create a worksheet in the Excel Workbook, its name is the default value. // to create a worksheet named "Contact user name and phone number", the statement is hssfsheet sheet = workbook. createsheet ("Contact user name and phone number"); hssfrow ROW = NULL; // create a cell (upper left) hssfcell cell1 = NULL at the index 0; hssfcell cell2 = NULL; iterator iter = map. entryset (). iterator (); int I = 0; while (ITER. hasnext () {map. entry entry = (map. entry) ITER. next (); object key = entry. getkey (); object val = entry. getvalue (); ROW = sheet. createrow (short) I ++); cell1 = row. createcell (short) 0); cell2 = row. createcell (short) 1); // defines the cell as the string type cell1.setcelltype (hssfcell. cell_type_string); cell2.setcelltype (hssfcell. cell_type_string); // enter some cell1.setcellvalue (key. tostring (); cell2.setcellvalue (Val. tostring (); if (I> 255) {break ;}// Save the corresponding workbook to the workbook. write (fout); fout. flush (); // The operation ends. close the file fout. close (); system. out. println ("file generation... ");} catch (exception e) {system. out. println ("exception:" + E );}}}

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please email us; we will handle the problem within 5 days of receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.