/**
 * com.jiaoyiping.pdstest.TestTika.java
 * Copyright (c) Hewlett-Packard Development Company, L.P.
 * All Rights Reserved.
 */
package com.jiaoyiping.pdstest;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.mail.RFC822Parser;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;

/**
 * <pre>
 * Desc: Sample tests that use Apache Tika to extract the body text of a PDF,
 *       a Word document and an e-mail, writing the text to a result file.
 * @author Jiao Yiping
 * @refactor Jiao Yiping
 * @date December 4, 2014 PM 1:31:09
 * @version 1.0
 * @see
 * Revisions:
 * Version   Date               Author        Description
 * -------------------------------------------------------------------
 * 1.0       December 4, 2014   Jiao Yiping   1. Created this class.
 * </pre>
 */
public class TestTika {

    /*
     * NOTE(review): every path below is specific to the original author's
     * Windows machine; adjust them before running the tests locally.
     */

    /** File that receives the extracted body text; overwritten by each test. */
    private static final String RESULT_PATH =
            "c:\\users\\Administrator\\desktop\\result.txt";

    /** Sample PDF input. */
    private static final String PDF_PATH =
            "d:\\my micro disk\\document\\Refer to Document\\Linux Shell script introduction.pdf";

    /** Sample Word (.doc) input. */
    private static final String WORD_PATH =
            "D:\\my Micro\\documentation\\Reference Document\\JBPM5_User Guide (Chinese version).doc";

    /** Sample e-mail (.eml) input. */
    private static final String EMAIL_PATH =
            "c:\\users\\administrator\\downloads\\"
                    + "Reply_re_data import work-External System enumeration with U-cloud enumeration map.eml";

    /**
     * Parses a PDF with {@link PDFParser}.
     *
     * @throws Exception if the file cannot be read, parsed or written
     */
    @Test
    public void testPdf() throws Exception {
        extractBody(new PDFParser(), PDF_PATH);
    }

    /**
     * Parses a Word document with {@link OfficeParser}.
     * (The method name keeps its original spelling.)
     *
     * @throws Exception if the file cannot be read, parsed or written
     */
    @Test
    public void testWrod() throws Exception {
        extractBody(new OfficeParser(), WORD_PATH);
    }

    /**
     * Parses an e-mail with {@link RFC822Parser}.
     *
     * <p>Only the standard EML format is handled here; the Microsoft MSG
     * format cannot be parsed by this parser. The original author used
     * commons-email to read the recipient, sender, subject, content and other
     * metadata, and had not tried Tika's support for those fields.
     *
     * @throws Exception if the file cannot be read, parsed or written
     */
    @Test
    public void testEmail() throws Exception {
        extractBody(new RFC822Parser(), EMAIL_PATH);
    }

    /**
     * Runs {@code parser} over the file at {@code inputPath}, writes the body
     * text to {@link #RESULT_PATH} and prints the elapsed whole seconds.
     *
     * @param parser    the Tika parser matching the input file's format
     * @param inputPath path of the document to parse
     * @throws Exception if the file cannot be read, parsed or written
     */
    private static void extractBody(Parser parser, String inputPath) throws Exception {
        long start = System.currentTimeMillis();

        // try-with-resources closes both streams even when parsing fails.
        // Closing the BufferedOutputStream also flushes it, so the extracted
        // text is actually written to the result file.
        try (InputStream is = new BufferedInputStream(new FileInputStream(inputPath));
             OutputStream os = new BufferedOutputStream(
                     new FileOutputStream(new File(RESULT_PATH)))) {
            Metadata meta = new Metadata();
            meta.add(Metadata.CONTENT_ENCODING, "utf-8");
            ContentHandler iHandler = new BodyContentHandler(os);
            parser.parse(is, iHandler, meta, new ParseContext());
        }

        long end = System.currentTimeMillis();
        // Integer division: reports whole seconds only.
        long used = (end - start) / 1000;
        System.out.println("Time-consuming:" + used + "seconds");
    }
}
Code snippet: using Apache Tika to parse PDF, Word, and e-mail files