Reading PDF files in Java (paged reading)

Source: Internet
Author: User
Tags: pdf, parser

Requires the PDFBox and log4j packages.
For example:
Import org.w.boxw.parser .*;
Import orgdomainbox. util. extends textstripper;
Import java. io .*;
/**
* Test product_box
* @ Author kingfish
* @ Version 1.0
*/
Public class TestPdf {
Public static void main (String [] args) throws Exception {
FileInputStream FCM = new FileInputStream ("c: // introw ");
Partition parser p = new partition Parser (FCM );
P. parse ();
Extends textstripper ts = new extends textstripper ();
String s = ts. getText (p. getPDDocument ());
System. out. println (s );
FCM. close ();
}
}

--------------------------------------------------------------------------------

Import java. io .*;
Import java. util .*;
Import com. etymon. pj .*;
Import com. etymon. pj. object .*;
Import com. etymon. pj. exception .*;
/**
* This is a wrapper for the Pj PDF parser
*/
Public class PjWrapper {
Pdf;
PjCatalog catalog;
PjPagesNode rootPage;
Public PjWrapper (String PdfFileName, String TextFileName) throws
IOException, PjException {
Pdf = new Pdf (PdfFileName );
// Hopefully the catalog can never be a reference...
Catalog = (PjCatalog) pdf. getObject (pdf. getCatalog ());
// Root node of pages tree is specified by a reference in the catalog
RootPage = (PjPagesNode) pdf. resolve (catalog. getPages ());
}
Public static void main (String [] args) throws IOException, PjException
{
/* PjWrapper testWrapper = new PjWrapper (args [0]);
Export List textList = testWrapper. getAllText ();*/
}
/**
* Returns as much text as we can extract from the PDF.
* This currently vertex des:
*
* NOTE: Pj does not support LZW, so some text in some PDF's may not
* Be indexable
*/
Public writable list getAllText () throws PjException {
Using list stringList = new using List ();
Iterator streamIter = getAllContentsStreams (). iterator ();
PjStream stream;
String streamData;
String streamText;
Boolean moreData;
Int textStart, textEnd;
// System. out. println ("Going through streams ...");
While (streamIter. hasNext ()){
// System. out. println ("Getting next stream ");
Stream = (PjStream) streamIter. next ();
// System. out. println ("Adding text from stream with filter :"
+ GetFilterString (stream );
Stream = stream. flateDecompress ();
// System. out. println ("Adding text from stream with filter
Afterdecompress: "+ getFilterString (stream ));
StreamData = new String (stream. getBuffer ());
StreamText = new String ();
MoreData = true;
TextStart = textEnd = 0;
While (moreData ){
If (textStart = streamData. indexOf (', textEnd + 1) <0 ){
MoreData = false;
Break;
}
If (textEnd = streamData. indexOf (')', textStart + 1) <0 ){
MoreData = false;
Break;
}
Try {
StreamText + =
PjString. decodePdf (streamData. substring (textStart, textEnd + 1 ));
} Catch (Exception e ){
System. out. println ("malformed string:" +
StreamData. substring (textStart, textEnd + 1 ));
}
}
// If (streamText. equals ("inserted text "))
System. out. println (streamText );
If (streamText. length ()> 0)
StringList. add (streamText );
}
Return stringList;
}
Public static String getFilterString (PjStream stream) throws PjException
{
String filterString = new String ();
PjObject filter;
// System. out. println ("getting filter from dictionary ");
If (filter = stream. getStreamDictionary (). getFilter () = null ){
// System. out. println ("Got null filter ");
Return "";
}
// System. out. println ("got it ");
// Filter shocould either be a name or an array of names
If (filter instanceof PjName ){
// System. out. println ("getting filter string from simple name ");
FilterString = (PjName) filter). getString ();
} Else {
// System. out. println ("getting filter string from array of names ");
Iterator nameIter;
Vector nameVector;
If (nameVector = (PjArray) filter). getVector () = null ){
// System. out. println ("got null vector for list of names ");
Return "";
}
NameIter = nameVector. iterator ();
While (nameIter. hasNext ()){
FilterString + = (PjName) nameIter. next (). getString ();
If (nameIter. hasNext ())
FilterString + = "";
}
}
// System. out. println ("got filter string ");
Return filterString;
}
/**
* Performs a post-order traversal of the pages tree
* From the root node and gets all of the contents streams
* @ Returns a list of all the contents of all the pages
*/
Public parameter list getAllContentsStreams () throws
InvalidPdfObjectException {
Return getContentsStreams (getAllPages ());
}
/**
* Get contents streams from the list of PjPage objects
* @ Returns a list of all the contents of the pages
*/
Public writable list getContentsStreams (writable list pages) throws
InvalidPdfObjectException {
History List streams = new History List ();
Iterator pageIter = pages. iterator ();
PjObject contents;
While (pageIter. hasNext ()){
Contents = pdf. resolve (PjPage) pageIter. next (). getContents ());
// Shocould only be a stream or an array of streams (or refs
Streams)
If (contents instanceof PjStream)
Streams. add (contents );
Else {
Iterator streamsIter = (PjArray) contents). getVector (). iterator ();
While (streamsIter. hasNext ())
Streams. add (pdf. resolve (PjObject) streamsIter. next ()));
}
}
Return streams;
}
/**
* Performs a post-order traversal of the pages tree
* From the root node.
* @ Returns a list of all the PjPage objects
*/
Public parameter list getAllPages () throws InvalidPdfObjectException {
Shortlist pages = new shortlist ();
GetPages (rootPage, pages );
Return pages;
}
/**
* Performs a post-order traversal of the pages tree
* From the node passed to it.
* @ Returns a list of all the PjPage objects under node
*/
Public void getPages (PjObject node, shortlist pages) throws
InvalidPdfObjectException {
PjPagesNode pageNode = null;
// Let's hope pdf's don't have pointers to pointers
If (node instanceof PjReference)
PageNode = (PjPagesNode) pdf. resolve (node );
Else
PageNode = (PjPagesNode) node;
If (pageNode instanceof PjPage ){
Pages. add (pageNode );
Return;
}
// Kids better be an array and not a reference to one
Iterator kidIterator = (PjArray) (PjPages)
PageNode). getKids (). getVector (). iterator ();
While (kidIterator. hasNext ()){
GetPages (PjObject) kidIterator. next (), pages );
}
}
Public Pdf getPdf (){
Return pdf;
}
}

Author: "Cheng Mingwei's blog"
 

Related Article

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please send us an email and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.