I recently used PDFBox to read the text and images of a PDF file. It can extract the text and the images of each page, but there is one problem: it cannot tell where on the page an image is located. The source code is as follows:
package com.util;

import java.awt.image.BufferedImage;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.imageio.ImageIO;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
import org.apache.pdfbox.util.PDFTextStripper;
public class Pdfboxutil {
try {
InputStream InputStream = new Bufferedinputstream (new FileInputStream (New File ("D:/android/a.pdf"));
Pdfparser parser = new Pdfparser (InputStream);
Parser.parse ();
PDDocument pdfdocument = Pddocument.load (InputStream);
PDDocument pdfdocument = Parser.getpddocument ();
StringWriter writer = new StringWriter ();
Pdftextstripper stripper = new Pdftextstripper ();
Stripper.writetext (pdfdocument, writer);
String contents = Writer.getbuffer (). toString ();
/*
Pddocumentinformation documentinformation = Pdfdocument.getdocumentinformation ();
System.out.println ("title:" + Documentinformation.gettitle ());
Pddocumentinformation info = pdfdocument.getdocumentinformation ();
System.out.println ("title:" + Info.gettitle ());
System.out.println ("Subject:" + info.getsubject ());
System.out.println ("Author:" + Info.getauthor ());
System.out.println ("Keyword:" + info.getkeywords ());
System.out.println ("Application:" + info.getcreator ());
System.out.println ("PDF Production Program:" + Info.getproducer ());
System.out.println ("Author:" + info.gettrapped ());
SYSTEM.OUT.PRINTLN ("Creation time:" + DateFormat (Info.getcreationdate ()));
SYSTEM.OUT.PRINTLN ("Modification Time:" + DateFormat (Info.getmodificationdate ()));
*/
/** Document page Information **/
Pddocumentcatalog cata = Pdfdocument.getdocumentcatalog ();
List pages = Cata.getallpages ();
int count = 1;
for (int i = 0; i < pages.size (); i++) {
Pdpage page = (pdpage) pages.get (i);
if (null! = page) {
Text content on this page
StringWriter SW = new StringWriter ();
Pdftextstripper PST = new Pdftextstripper ();
Pst.setstartpage (i+1);
Pst.setendpage (i+1);
Pst.writetext (Pdfdocument, SW);
String content = Sw.getbuffer (). toString ();
SYSTEM.OUT.PRINTLN (content);
pdresources res = page.findresources ();
Get page picture information
Map IMGs = Res.getimages ();
if (null! = IMGs) {
Set KeySet = Imgs.keyset ();
Iterator it = Keyset.iterator ();
while (It.hasnext ()) {
Object obj = It.next ();
Pdxobjectimage img = (pdxobjectimage) imgs.get (obj);
Img.write2file ("d:/" + count);
count++;
}
}
}
}
} catch (Exception e) {
TODO automatically generates catch blocks
E.printstacktrace ();
}
}
public static String DateFormat (Calendar calendar) throws Exception {
if (null = = Calendar)
return null;
String date = null;
try {
String pattern = "YYYY-MM-DD";
SimpleDateFormat format = new SimpleDateFormat (pattern);
Date = Format.format (Calendar.gettime ());
} catch (Exception e) {
Throw e;
}
return date = = null? "": Date;
}
}
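In the end I found no way to get the position of the images, so as a workaround I converted the content of each page into a picture instead. Replace the part of the code above that was highlighted in red (the `if (null != page) { ... }` block inside the page loop) with the following code to save each page as a picture.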
if (null != page) {
    // Render the whole page to an image and save it as d:/<page index>.png.
    BufferedImage img1 = page.convertToImage();
    File file = new File("d:/" + i + ".png");
    ImageIO.write(img1, "png", file);
}
Environmental net