First, add the following JARs to the classpath: pdfbox-app-1.6.0.jar, fontbox-1.6.0.jar, pdfbox-1.6.0.jar, and jempbox-1.6.0.jar.
The Lucene 3.0 package, lucene-core-3.0.0.jar, is also required.
Download URL: http://pdfbox.apache.org/download.html
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URL;

import org.apache.lucene.document.Document;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
Public class pdflucenetest {
Public static void main (string [] ARGs ){
Try {
Getext ("D: \ test \ small \ 1.20 ");
} Catch (exception e ){
// Todo auto-generated Catch Block
E. printstacktrace ();
}
}
Public static void getext (string file) throws exception {
// Sort or not
Boolean sort = false;
// PDF file name
String pdffile = file;
// Enter the text file name
String textfile = NULL;
// Encoding method
String encoding = "UTF-8 ";
// Start page Extraction
Int startpage = 1;
// End number of extracted pages
Int endpage = integer. max_value;
// File input stream to generate a text file
Writer output = NULL;
// PDF document stored in memory
Pddocument document = NULL;
Try {
Try {
// First load the file as a URL, and then load the file from the local file system if an exception occurs. //
URL url = new URL (pdffile );
Document = pddocument. Load (URL );
// Obtain the PDF file name
String filename = URL. GetFile ();
// Name the generated TXT file with the original PDF name
If (filename. Length ()> 4 ){
File outputfile = new file (filename. substring (0, filename. Length ()-4) + ". txt ");
Textfile = outputfile. getname ();
}
} Catch (malformedurlexception e ){
// If an exception occurs during URL loading, it will be loaded from the file system
Document = pddocument. Load (pdffile );
If (pdffile. Length ()> 4 ){
Textfile = pdffile. substring (0, pdffile. Length ()-4) + ". txt ";
}
}
// File input stream, which is written into the file inverted textfile
Output = new outputstreamwriter (New fileoutputstream (textfile), encoding );
// Extract textstripper to extract text
Optional textstripper stripper = NULL;
Stripper = new jsontextstripper ();
// Set whether to sort
Stripper. setsortbyposition (SORT );
// Set the start page
Stripper. setstartpage (startpage );
// Set the end page
Stripper. setendpage (endpage );
// Call javastextstripper's writetext to extract and output the text
Stripper. writetext (document, output );
} Finally {
If (output! = NULL ){
// Close the output stream
Output. Close ();
}
If (document! = NULL ){
// Close the PDF document
Document. Close ();
}
}
}
}