This example requires the PDFBox and log4j packages.
For example:
import org.pdfbox.pdfparser.*;
import org.pdfbox.util.PDFTextStripper;
import java.io.*;
/**
* Test product_box
* @ Author kingfish
* @ Version 1.0
*/
Public class TestPdf {
Public static void main (String [] args) throws Exception {
FileInputStream FCM = new FileInputStream ("c: // introw ");
Partition parser p = new partition Parser (FCM );
P. parse ();
Extends textstripper ts = new extends textstripper ();
String s = ts. getText (p. getPDDocument ());
System. out. println (s );
FCM. close ();
}
}
--------------------------------------------------------------------------------
import java.io.*;
import java.util.*;
import com.etymon.pj.*;
import com.etymon.pj.object.*;
import com.etymon.pj.exception.*;
/**
* This is a wrapper for the Pj PDF parser
*/
Public class PjWrapper {
Pdf;
PjCatalog catalog;
PjPagesNode rootPage;
Public PjWrapper (String PdfFileName, String TextFileName) throws
IOException, PjException {
Pdf = new Pdf (PdfFileName );
// Hopefully the catalog can never be a reference...
Catalog = (PjCatalog) pdf. getObject (pdf. getCatalog ());
// Root node of pages tree is specified by a reference in the catalog
RootPage = (PjPagesNode) pdf. resolve (catalog. getPages ());
}
Public static void main (String [] args) throws IOException, PjException
{
/* PjWrapper testWrapper = new PjWrapper (args [0]);
Export List textList = testWrapper. getAllText ();*/
}
/**
* Returns as much text as we can extract from the PDF.
* This currently vertex des:
*
* NOTE: Pj does not support LZW, so some text in some PDF's may not
* Be indexable
*/
Public writable list getAllText () throws PjException {
Using list stringList = new using List ();
Iterator streamIter = getAllContentsStreams (). iterator ();
PjStream stream;
String streamData;
String streamText;
Boolean moreData;
Int textStart, textEnd;
// System. out. println ("Going through streams ...");
While (streamIter. hasNext ()){
// System. out. println ("Getting next stream ");
Stream = (PjStream) streamIter. next ();
// System. out. println ("Adding text from stream with filter :"
+ GetFilterString (stream );
Stream = stream. flateDecompress ();
// System. out. println ("Adding text from stream with filter
Afterdecompress: "+ getFilterString (stream ));
StreamData = new String (stream. getBuffer ());
StreamText = new String ();
MoreData = true;
TextStart = textEnd = 0;
While (moreData ){
If (textStart = streamData. indexOf (', textEnd + 1) <0 ){
MoreData = false;
Break;
}
If (textEnd = streamData. indexOf (')', textStart + 1) <0 ){
MoreData = false;
Break;
}
Try {
StreamText + =
PjString. decodePdf (streamData. substring (textStart, textEnd + 1 ));
} Catch (Exception e ){
System. out. println ("malformed string:" +
StreamData. substring (textStart, textEnd + 1 ));
}
}
// If (streamText. equals ("inserted text "))
System. out. println (streamText );
If (streamText. length ()> 0)
StringList. add (streamText );
}
Return stringList;
}
Public static String getFilterString (PjStream stream) throws PjException
{
String filterString = new String ();
PjObject filter;
// System. out. println ("getting filter from dictionary ");
If (filter = stream. getStreamDictionary (). getFilter () = null ){
// System. out. println ("Got null filter ");
Return "";
}
// System. out. println ("got it ");
// Filter shocould either be a name or an array of names
If (filter instanceof PjName ){
// System. out. println ("getting filter string from simple name ");
FilterString = (PjName) filter). getString ();
} Else {
// System. out. println ("getting filter string from array of names ");
Iterator nameIter;
Vector nameVector;
If (nameVector = (PjArray) filter). getVector () = null ){
// System. out. println ("got null vector for list of names ");
Return "";
}
NameIter = nameVector. iterator ();
While (nameIter. hasNext ()){
FilterString + = (PjName) nameIter. next (). getString ();
If (nameIter. hasNext ())
FilterString + = "";
}
}
// System. out. println ("got filter string ");
Return filterString;
}
/**
* Performs a post-order traversal of the pages tree
* From the root node and gets all of the contents streams
* @ Returns a list of all the contents of all the pages
*/
Public parameter list getAllContentsStreams () throws
InvalidPdfObjectException {
Return getContentsStreams (getAllPages ());
}
/**
* Get contents streams from the list of PjPage objects
* @ Returns a list of all the contents of the pages
*/
Public writable list getContentsStreams (writable list pages) throws
InvalidPdfObjectException {
History List streams = new History List ();
Iterator pageIter = pages. iterator ();
PjObject contents;
While (pageIter. hasNext ()){
Contents = pdf. resolve (PjPage) pageIter. next (). getContents ());
// Shocould only be a stream or an array of streams (or refs
Streams)
If (contents instanceof PjStream)
Streams. add (contents );
Else {
Iterator streamsIter = (PjArray) contents). getVector (). iterator ();
While (streamsIter. hasNext ())
Streams. add (pdf. resolve (PjObject) streamsIter. next ()));
}
}
Return streams;
}
/**
* Performs a post-order traversal of the pages tree
* From the root node.
* @ Returns a list of all the PjPage objects
*/
Public parameter list getAllPages () throws InvalidPdfObjectException {
Shortlist pages = new shortlist ();
GetPages (rootPage, pages );
Return pages;
}
/**
* Performs a post-order traversal of the pages tree
* From the node passed to it.
* @ Returns a list of all the PjPage objects under node
*/
Public void getPages (PjObject node, shortlist pages) throws
InvalidPdfObjectException {
PjPagesNode pageNode = null;
// Let's hope pdf's don't have pointers to pointers
If (node instanceof PjReference)
PageNode = (PjPagesNode) pdf. resolve (node );
Else
PageNode = (PjPagesNode) node;
If (pageNode instanceof PjPage ){
Pages. add (pageNode );
Return;
}
// Kids better be an array and not a reference to one
Iterator kidIterator = (PjArray) (PjPages)
PageNode). getKids (). getVector (). iterator ();
While (kidIterator. hasNext ()){
GetPages (PjObject) kidIterator. next (), pages );
}
}
Public Pdf getPdf (){
Return pdf;
}
}
Author: "Cheng Mingwei's blog"