/*
 * This code uses the PDFBox library (from the PDFBox zip distribution),
 * as part of using Lucene to parse the text content of PDF files.
 * It reads the contents of a PDF file and writes the extracted text
 * to a .txt file of the same name.
 */
// Results:
package PDFBox;

import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URL;

import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;
public class PDFBox {
Public PDFBox () {
TODO auto-generated Constructor stub
}
public void Getext (String file) throws exception{
Whether to sort
Boolean sort = false;
PDF name
String pdffile = file;
Enter a text file name
String textfile = null;
Setting the Encoding method
String encoding = "UTF-8";
Start extracting pages
int startpage = 1;
End Fetch Pages
int endpage = Integer.max_value;
File input stream, generating a text file
Writer output = null;
In-Memory stored PDF Document
PDDocument document = null;
try{
try{
To load a file first as a URL,
If you get an exception and then load the file from the local file system
URL url = new URL (pdffile);
Document = Pddocument.load (URL);
Get the file name of the PDF
String fileName = Url.getfile ();
Original PDF names to name the newly generated TXT file
if (Filename.length () > 4) {
File OutputFile = new file (filename.substring (0, Filename.length ()-4) + ". txt");
Textfile = Outputfile.getname ();
}
}catch (Malformedurlexception R) {
Mount from file system if loaded as URL to exception
Document = Pddocument.load (Pdffile);
if (Pdffile.length () > 4) {
Textfile = pdffile.substring (0, Pdffile.length ()-4) + ". txt";
}
}
File input stream, write file to Textfile
Output = new OutputStreamWriter (new FileOutputStream (textfile), encoding);
Pdftextstripprt to extract Files
Pdftextstripper stripper = null;
Stripper = new Pdftextstripper ();
Set whether to sort
Stripper.setsortbyposition (sort);
Set Start Page
Stripper.setstartpage (StartPage);
Set End page
Stripper.setendpage (EndPage);
Call Pdftextstripper's Writertext extract and output
Stripper.writetext (document, output);
}finally{
Turn off the output stream
if (output! = NULL)
Output.close ();
}
Close PDF Document
if (document! = NULL) {
Document.close ();
}
}
public static void Main (string[] args) {
TODO auto-generated Method Stub
PDFBox test = new PDFBox ();
try{
Gets the C language code under the E:\Lucene project. PDF content
Test.getext ("E:\\lucene Project \\c language code. pdf");
}catch (Exception e) {
E.printstacktrace ();
}
}
}
// Using Lucene to parse the content of PDF text