Android source code for parsing files in doc, docx, xls, xlsx, and pptx formats.

Source: Internet
Author: User

Android source code for parsing files in doc, docx, xls, xlsx, and pptx formats.

You can copy the code below directly and try it out.
Parsing doc files requires the tm-extractors-0.4.jar package.
Parsing xls files requires the jxl.jar package.

 

import java.io.File;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStream;import java.net.MalformedURLException;import java.net.URL;import java.net.URLConnection;import java.util.ArrayList;import java.util.List;import java.util.zip.ZipEntry;import java.util.zip.ZipException;import java.util.zip.ZipFile;import jxl.Cell;import jxl.CellType;import jxl.DateCell;import jxl.NumberCell;import jxl.Sheet;import jxl.Workbook;import org.apache.http.util.EncodingUtils;import org.textmining.text.extraction.WordExtractor;import org.xmlpull.v1.XmlPullParser;import org.xmlpull.v1.XmlPullParserException;import android.os.Environment;import android.util.Xml;
Public static String readDOC (String path) {// create an input stream to read the doc file FileInputStream in; String text = null; // Environment. getExternalStorageDirectory (). getAbsolutePath () + "/aa.doc") try {in = new FileInputStream (new File (path); int a = in. available (); WordExtractor extractor = null; // create WordExtractor extractor = new WordExtractor (); // extract text = extractor from the doc file. extractText (in); System. out. println ("parsed things" + text);} catch (FileNotFoundException e) {e. printStackTrace ();} catch (Exception e) {e. printStackTrace ();} if (text = null) {text = "parsing file problems";} return text ;}
Public static String readXLS (String path) {String str = ""; try {Workbook workbook = null; workbook = Workbook. getWorkbook (new File (path); Sheet sheet = workbook. getSheet (0); Cell cell = null; int columnCount = sheet. getColumns (); int rowCount = sheet. getRows (); for (int I = 0; I <rowCount; I ++) {for (int j = 0; j <columnCount; j ++) {cell = sheet. getCell (j, I); String temp2 = ""; if (cell. getType () = CellType. NUMBER) {temp2 = (NumberCell) cell ). getValue () + "";} else if (cell. getType () = CellType. DATE) {temp2 = "" + (DateCell) cell ). getDate ();} else {temp2 = "" + cell. getContents ();} str = str + "" + temp2;} str + = "\ n";} workbook. close () ;}catch (Exception e) {}if (str = null) {str = "An error occurred while parsing the file" ;}return str ;}
Public static String readDOCX (String path) {String river = ""; try {ZipFile xlsxFile = new ZipFile (new File (path); ZipEntry sharedStringXML = xlsxFile. getEntry ("word/document. xml "); InputStream inputStream = xlsxFile. getInputStream (sharedStringXML); XmlPullParser xmlParser = Xml. newPullParser (); xmlParser. setInput (inputStream, "UTF-8"); int evtType = xmlParser. getEventType (); while (evtType! = XmlPullParser. END_DOCUMENT) {switch (evtType) {case XmlPullParser. START_TAG: String tag = xmlParser. getName (); System. out. println (tag); if (tag. equalsIgnoreCase ("t") {river + = xmlParser. nextText () + "\ n" ;}break; case XmlPullParser. END_TAG: break; default: break;} evtType = xmlParser. next () ;}} catch (ZipException e) {e. printStackTrace ();} catch (IOException e) {e. printStackTrace ();} catch (XmlPullParserException e) {e. printStackTrace ();} if (river = null) {river = "parsing file problems";} return river ;}
Public static String readXLSX (String path) {String str = ""; String v = null; boolean flat = false; List
 
  
Ls = new ArrayList
  
   
(); Try {ZipFile xlsxFile = new ZipFile (new File (path); ZipEntry sharedStringXML = xlsxFile. getEntry ("xl/sharedStrings. xml "); InputStream inputStream = xlsxFile. getInputStream (sharedStringXML); XmlPullParser xmlParser = Xml. newPullParser (); xmlParser. setInput (inputStream, "UTF-8"); int evtType = xmlParser. getEventType (); while (evtType! = XmlPullParser. END_DOCUMENT) {switch (evtType) {case XmlPullParser. START_TAG: String tag = xmlParser. getName (); if (tag. equalsIgnoreCase ("t") {ls. add (xmlParser. nextText ();} break; case XmlPullParser. END_TAG: break; default: break;} evtType = xmlParser. next ();} ZipEntry sheetXML = xlsxFile. getEntry ("xl/worksheets/sheet1.xml"); InputStream inputStreamsheet = xlsxFile. getInputStream (sheetXML); XmlPullParser xmlParsersheet = Xml. newPullParser (); xmlParsersheet. setInput (inputStreamsheet, "UTF-8"); int evtTypesheet = xmlParsersheet. getEventType (); while (Bytes! = XmlPullParser. END_DOCUMENT) {switch (evtTypesheet) {case XmlPullParser. START_TAG: String tag = xmlParsersheet. getName (); if (tag. equalsIgnoreCase ("row") {} else if (tag. equalsIgnoreCase ("c") {String t = xmlParsersheet. getAttributeValue (null, "t"); if (t! = Null) {flat = true; System. out. println (flat + "");} else {System. out. println (flat + ""); flat = false ;}} else if (tag. equalsIgnoreCase ("v") {v = xmlParsersheet. nextText (); if (v! = Null) {if (flat) {str + = ls. get (Integer. parseInt (v) + "" ;}else {str + = v + "" ;}} break; case XmlPullParser. END_TAG: if (xmlParsersheet. getName (). equalsIgnoreCase ("row") & v! = Null) {str + = "\ n";} break;} evtTypesheet = xmlParsersheet. next ();} System. out. println (str);} catch (ZipException e) {e. printStackTrace ();} catch (IOException e) {e. printStackTrace ();} catch (XmlPullParserException e) {e. printStackTrace ();} if (str = null) {str = "An error occurred while parsing the file";} return str ;}
  
 
Public static String readPPTX (String path) {List
 
  
Ls = new ArrayList
  
   
(); String river = ""; ZipFile xlsxFile = null; try {xlsxFile = new ZipFile (new File (path )); // pptx reads in the read zip format} catch (ZipException e1) {e1.printStackTrace ();} catch (IOException e1) {e1.printStackTrace ();} try {ZipEntry sharedStringXML = xlsxFile. getEntry ("[Content_Types]. xml "); // find the file containing the content InputStream inputStream = xlsxFile. getInputStream (sharedStringXML); // get the file stream XmlPullParser xmlParse R = Xml. newPullParser (); // instantiate pull xmlParser. setInput (inputStream, "UTF-8"); // The int evtType = xmlParser. getEventType (); // get the label type status while (evtType! = XmlPullParser. END_DOCUMENT) {// cyclically read stream switch (evtType) {case XmlPullParser. START_TAG: // judge the tag to start reading String tag = xmlParser. getName (); // obtain the tag if (tag. repeated signorecase ("Override") {String s = xmlParser. getAttributeValue (null, "PartName"); if (s. lastIndexOf ("/ppt/slides/slide") = 0) {ls. add (s) ;}} break; case XmlPullParser. END_TAG: // tag read end break; default: break;} evtType = xmlParser. next ();// Read the next tag} catch (ZipException e) {e. printStackTrace ();} catch (IOException e) {e. printStackTrace ();} catch (XmlPullParserException e) {e. printStackTrace () ;}for (int I = 1; I <(ls. size () + 1); I ++) {// suppose there are 6 slides: river + = "no." + I + "Zhang ·" + "\ n "; try {ZipEntry sharedStringXML = xlsxFile. getEntry ("ppt/slides/slide" + I + ". xml "); // find the file in which the content is stored. InputStream inputStream = xlsxFil E. getInputStream (sharedStringXML); // get the file stream XmlPullParser xmlParser = Xml. newPullParser (); // instantiate pull xmlParser. setInput (inputStream, "UTF-8"); // The int evtType = xmlParser. getEventType (); // get the label type status while (evtType! = XmlPullParser. END_DOCUMENT) {// cyclically read stream switch (evtType) {case XmlPullParser. 
START_TAG: // judge the tag to start reading String tag = xmlParser. getName (); // obtain the tag if (tag. equalsIgnoreCase ("t") {river + = xmlParser. nextText () + "\ n" ;}break; case XmlPullParser. END_TAG: // tag read end break; default: break;} evtType = xmlParser. next (); // read the next tag} catch (ZipException e) {e. printStackTrace ();} catch (IOException e) {e. printStackTrace ();} catch (XmlPullParserException e) {e. printStackTrace () ;}}if (river = null) {river = "parsing file problems";} return river ;}
  
 

 

Related Article

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please send us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.