Open-source Android code for parsing doc, docx, xls, xlsx, and pptx files.
You can copy the code below and try it out directly.
Parsing doc files requires the tm-extractors-0.4.jar package.
Parsing xls files requires the jxl.jar package.
import java.io.File;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStream;import java.net.MalformedURLException;import java.net.URL;import java.net.URLConnection;import java.util.ArrayList;import java.util.List;import java.util.zip.ZipEntry;import java.util.zip.ZipException;import java.util.zip.ZipFile;import jxl.Cell;import jxl.CellType;import jxl.DateCell;import jxl.NumberCell;import jxl.Sheet;import jxl.Workbook;import org.apache.http.util.EncodingUtils;import org.textmining.text.extraction.WordExtractor;import org.xmlpull.v1.XmlPullParser;import org.xmlpull.v1.XmlPullParserException;import android.os.Environment;import android.util.Xml;
Public static String readDOC (String path) {// create an input stream to read the doc file FileInputStream in; String text = null; // Environment. getExternalStorageDirectory (). getAbsolutePath () + "/aa.doc") try {in = new FileInputStream (new File (path); int a = in. available (); WordExtractor extractor = null; // create WordExtractor extractor = new WordExtractor (); // extract text = extractor from the doc file. extractText (in); System. out. println ("parsed things" + text);} catch (FileNotFoundException e) {e. printStackTrace ();} catch (Exception e) {e. printStackTrace ();} if (text = null) {text = "parsing file problems";} return text ;}
Public static String readXLS (String path) {String str = ""; try {Workbook workbook = null; workbook = Workbook. getWorkbook (new File (path); Sheet sheet = workbook. getSheet (0); Cell cell = null; int columnCount = sheet. getColumns (); int rowCount = sheet. getRows (); for (int I = 0; I <rowCount; I ++) {for (int j = 0; j <columnCount; j ++) {cell = sheet. getCell (j, I); String temp2 = ""; if (cell. getType () = CellType. NUMBER) {temp2 = (NumberCell) cell ). getValue () + "";} else if (cell. getType () = CellType. DATE) {temp2 = "" + (DateCell) cell ). getDate ();} else {temp2 = "" + cell. getContents ();} str = str + "" + temp2;} str + = "\ n";} workbook. close () ;}catch (Exception e) {}if (str = null) {str = "An error occurred while parsing the file" ;}return str ;}
Public static String readDOCX (String path) {String river = ""; try {ZipFile xlsxFile = new ZipFile (new File (path); ZipEntry sharedStringXML = xlsxFile. getEntry ("word/document. xml "); InputStream inputStream = xlsxFile. getInputStream (sharedStringXML); XmlPullParser xmlParser = Xml. newPullParser (); xmlParser. setInput (inputStream, "UTF-8"); int evtType = xmlParser. getEventType (); while (evtType! = XmlPullParser. END_DOCUMENT) {switch (evtType) {case XmlPullParser. START_TAG: String tag = xmlParser. getName (); System. out. println (tag); if (tag. equalsIgnoreCase ("t") {river + = xmlParser. nextText () + "\ n" ;}break; case XmlPullParser. END_TAG: break; default: break;} evtType = xmlParser. next () ;}} catch (ZipException e) {e. printStackTrace ();} catch (IOException e) {e. printStackTrace ();} catch (XmlPullParserException e) {e. printStackTrace ();} if (river = null) {river = "parsing file problems";} return river ;}
Public static String readXLSX (String path) {String str = ""; String v = null; boolean flat = false; List
Ls = new ArrayList
(); Try {ZipFile xlsxFile = new ZipFile (new File (path); ZipEntry sharedStringXML = xlsxFile. getEntry ("xl/sharedStrings. xml "); InputStream inputStream = xlsxFile. getInputStream (sharedStringXML); XmlPullParser xmlParser = Xml. newPullParser (); xmlParser. setInput (inputStream, "UTF-8"); int evtType = xmlParser. getEventType (); while (evtType! = XmlPullParser. END_DOCUMENT) {switch (evtType) {case XmlPullParser. START_TAG: String tag = xmlParser. getName (); if (tag. equalsIgnoreCase ("t") {ls. add (xmlParser. nextText ();} break; case XmlPullParser. END_TAG: break; default: break;} evtType = xmlParser. next ();} ZipEntry sheetXML = xlsxFile. getEntry ("xl/worksheets/sheet1.xml"); InputStream inputStreamsheet = xlsxFile. getInputStream (sheetXML); XmlPullParser xmlParsersheet = Xml. newPullParser (); xmlParsersheet. setInput (inputStreamsheet, "UTF-8"); int evtTypesheet = xmlParsersheet. getEventType (); while (Bytes! = XmlPullParser. END_DOCUMENT) {switch (evtTypesheet) {case XmlPullParser. START_TAG: String tag = xmlParsersheet. getName (); if (tag. equalsIgnoreCase ("row") {} else if (tag. equalsIgnoreCase ("c") {String t = xmlParsersheet. getAttributeValue (null, "t"); if (t! = Null) {flat = true; System. out. println (flat + "");} else {System. out. println (flat + ""); flat = false ;}} else if (tag. equalsIgnoreCase ("v") {v = xmlParsersheet. nextText (); if (v! = Null) {if (flat) {str + = ls. get (Integer. parseInt (v) + "" ;}else {str + = v + "" ;}} break; case XmlPullParser. END_TAG: if (xmlParsersheet. getName (). equalsIgnoreCase ("row") & v! = Null) {str + = "\ n";} break;} evtTypesheet = xmlParsersheet. next ();} System. out. println (str);} catch (ZipException e) {e. printStackTrace ();} catch (IOException e) {e. printStackTrace ();} catch (XmlPullParserException e) {e. printStackTrace ();} if (str = null) {str = "An error occurred while parsing the file";} return str ;}
Public static String readPPTX (String path) {List
Ls = new ArrayList
(); String river = ""; ZipFile xlsxFile = null; try {xlsxFile = new ZipFile (new File (path )); // pptx reads in the read zip format} catch (ZipException e1) {e1.printStackTrace ();} catch (IOException e1) {e1.printStackTrace ();} try {ZipEntry sharedStringXML = xlsxFile. getEntry ("[Content_Types]. xml "); // find the file containing the content InputStream inputStream = xlsxFile. getInputStream (sharedStringXML); // get the file stream XmlPullParser xmlParse R = Xml. newPullParser (); // instantiate pull xmlParser. setInput (inputStream, "UTF-8"); // The int evtType = xmlParser. getEventType (); // get the label type status while (evtType! = XmlPullParser. END_DOCUMENT) {// cyclically read stream switch (evtType) {case XmlPullParser. START_TAG: // judge the tag to start reading String tag = xmlParser. getName (); // obtain the tag if (tag. repeated signorecase ("Override") {String s = xmlParser. getAttributeValue (null, "PartName"); if (s. lastIndexOf ("/ppt/slides/slide") = 0) {ls. add (s) ;}} break; case XmlPullParser. END_TAG: // tag read end break; default: break;} evtType = xmlParser. next ();// Read the next tag} catch (ZipException e) {e. printStackTrace ();} catch (IOException e) {e. printStackTrace ();} catch (XmlPullParserException e) {e. printStackTrace () ;}for (int I = 1; I <(ls. size () + 1); I ++) {// suppose there are 6 slides: river + = "no." + I + "Zhang ·" + "\ n "; try {ZipEntry sharedStringXML = xlsxFile. getEntry ("ppt/slides/slide" + I + ". xml "); // find the file in which the content is stored. InputStream inputStream = xlsxFil E. getInputStream (sharedStringXML); // get the file stream XmlPullParser xmlParser = Xml. newPullParser (); // instantiate pull xmlParser. setInput (inputStream, "UTF-8"); // The int evtType = xmlParser. getEventType (); // get the label type status while (evtType! = XmlPullParser. END_DOCUMENT) {// cyclically read stream switch (evtType) {case XmlPullParser. 
START_TAG: // judge the tag to start reading String tag = xmlParser. getName (); // obtain the tag if (tag. equalsIgnoreCase ("t") {river + = xmlParser. nextText () + "\ n" ;}break; case XmlPullParser. END_TAG: // tag read end break; default: break;} evtType = xmlParser. next (); // read the next tag} catch (ZipException e) {e. printStackTrace ();} catch (IOException e) {e. printStackTrace ();} catch (XmlPullParserException e) {e. printStackTrace () ;}}if (river = null) {river = "parsing file problems";} return river ;}