This program uses the POI, Jacob, OpenOffice, ICEpdf, and iText packages; all of them can be found by searching the Internet.
PS: When OpenOffice is used in the document-to-PDF-to-picture conversion, OpenOffice must be installed locally and its service must be running:
After installing OpenOffice:
1. Open a command prompt and change to the default installation directory:
cd C:\Program Files (x86)\OpenOffice 4\program
Then run:
soffice -headless -accept="socket,host=127.0.0.1,port=8100;urp;" -nofirststartwizard
2. Check that the service started successfully
2.1 Find the PID of the process listening on the port:
netstat -ano | findstr "8100"
2.2 Look up the program name that corresponds to that PID:
tasklist | findstr "<PID value>"
Without further ado, here is the code:
package com.officefileparser.utils;

import com.artofsolving.jodconverter.DefaultDocumentFormatRegistry;
import com.artofsolving.jodconverter.DocumentConverter;
import com.artofsolving.jodconverter.DocumentFormat;
import com.artofsolving.jodconverter.openoffice.connection.OpenOfficeConnection;
import com.artofsolving.jodconverter.openoffice.connection.SocketOpenOfficeConnection;
import com.artofsolving.jodconverter.openoffice.converter.OpenOfficeDocumentConverter;
import com.jacob.activeX.ActiveXComponent;
import com.jacob.com.Dispatch;
import java.awt.image.BufferedImage;
import java.awt.image.RenderedImage;
import java.io.ByteArrayInputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.ConnectException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import javax.imageio.ImageIO;
import org.apache.commons.io.FilenameUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hslf.model.Picture;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.usermodel.PictureData;
import org.apache.poi.hslf.usermodel.SlideShow;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.icepdf.core.pobjects.Document;
import org.icepdf.core.pobjects.Page;
import org.icepdf.core.util.GraphicsRenderingHints;
public class Officefileparserutils {/** * use POI to read doc file * @param docurl * doc file path * @return * A String representing the contents of the doc file
* @throws IOException */public static void Readoc (String docpath) throws ioexception{FileInputStream in;
in = new FileInputStream (Docpath);
Wordextractor extractor = new Wordextractor (in); Hwpfdocument hwpfdocument = new Hwpfdocument (in);
An element is a piece of content string[] paratexts = Extractor.getparagraphtext (); for (int i=0 i<paratexts.length; i++) { //a period of deposit in &N Bsp &NBSP;SYSTEM.OUT.PRINTLN (/* "Paragraph" + (i+1) + ":" +*/paratexts[i]); } }/** * use POI to write Word documents * @param destfile Purpose Address * @param filecon Pending write string */ public void Exportdoc (String de Stfile,string Filecon) { try { //doc content & nbsp
Bytearrayinputstream Bais = new Bytearrayinputstream (Filecon.getbytes ());
Poifsfilesystem fs = new Poifsfilesystem (); DirectoryEntry directory = Fs.getroot (); &NBSP;
Directory.createdocument ("Worddocument", Bais);
FileOutputStream ostream = new FileOutputStream (destfile);
Fs.writefilesystem (ostream);
bais.close ();
ostream.close (); } catch (IOException e) { e.printstacktrace ();   ; } }/** * use PDFBox to read PDF Document information * @param pdfpath PDF file path * @return PDF information String * @th
Rows Exception */public static string Getpdftext (String pdfpath) throws Exception {Boolean sort = false;
int startpage = 1;
int endpage = 10;
PDDocument document = null;
File File = new file (Pdfpath); if (!file.exists ()) {System.out.println (File.getabsolutepath () +) does not exist ...
");
Return "";
} System.out.println (file);
Try {try {document = Pddocument.load (file); } CATCH (Malformedurlexception e) {} pdftextstripper stripper = new Pdftextstripper ();
Stripper.setsortbyposition (sort);
Stripper.setstartpage (StartPage);
Stripper.setendpage (EndPage);
return Stripper.gettext (document);
catch (Exception e) {e.printstacktrace ();
Return "";
finally {if (document!= null) {document.close ();
}}//function checks whether the file is ppt public static Boolean checkfile (file file) {Boolean isppt = false;
String filename = File.getname ();
String suffixname = null; if (filename!= null && filename.indexof (".")!=-1) {suffixname = filename.substring (Filename.indexof ("."))
;
System.out.println (Suffixname);
if (Suffixname.equals (". ppt")) {isppt = true;
return isppt;
else {return isppt; /** * Use POI to extract all content of ppt * @param pptpath * @return * @throws ioexception/public static String READPP TAll (String pptpath) throws ioexception{PowerpointextractoR powerpointextractor = new Powerpointextractor (Pptpath);
System.out.println (Powerpointextractor.gettext ());
return Powerpointextractor.gettext (); /** * Use POI to extract PPT content * @param pptpath * @return * @throws ioexception * * public static HASHMAP Readppto Nebyone (String pptpath,string outpath) throws ioexception{content = new//stringbuffer (""); StringBuffer the
Content of ppt InputStream InputStream = new FileInputStream (Pptpath); Hashmap<integer, string> contentpagemap =new Hashmap<integer, string> ();
//constructs a Powerpoint document from a input stream.
Slideshow slideshow = new Slideshow (new Hslfslideshow (InputStream));
To get each slide, think of a slides element as a ppt slide[] slides = slideshow.getslides ();
for (int i = 0; i < slides.length i++) {//Get PPT page number pagenum int pagenum = Slides[i].getslidenumber ();
Read the contents of a slide (including the title), in order to get the text content of the slide, establish textrun,textrun to represent a ppt textrun[] textruns = Slides[i].gettextruns (); System.out.println ("The first" +pagenum+ "Zhang");
Read the title of a slide String title=slides[i].gettitle (); * //get system line break String lineseparator = (St Ring) java.security.AccessController.doPrivileged ( N
EW sun.security.action.GetPropertyAction ("Line.separator")); System.out.print (lineseparator);///If there is no title in a ppt, take the contents of the first text box as the title &NB Sp if (title = null) { title = Textruns[0].gettext (). Re
Placeall ("[\n\r]", "");/Remove line break }
SYSTEM.OUT.PRINTLN ("title:" +title); for (int j = 0; J < Textruns.length J + +) {//Get content in a text box for ppt text String text = Textruns[j].gettext (). ReplaceAll ("[\n\r]", "");
Contentpagemap.put (i, text);
System.out.print (text);
System.out.println ();
}//Extract all pictures in ppt//Get PPT page number pagenum file OutDir = new file (Outpath);
if (!outdir.exists ()) {Outdir.mkdir ();
} extractimageofppt (Pptpath,outpath);
return contentpagemap; ///Use POI to extract pictures in ppt private static void Extractimageofppt (String pptpath,string outpath) throws IOException {//sli
The Desshow class represents a ppt file slideshow ppt = new Slideshow (new Hslfslideshow (Pptpath));
An element in the Sildes array represents a ppt slide[] slides = ppt.getslides ();
Extract all pictures contained in the presentation picturedata[] pdata = Ppt.getpicturedata ();
for (int i = 0; i < pdata.length i++) {PictureData pict = pdata[i];
Picture data byte[] data = Pict.getdata ();
int type = Pict.gettype ();
String ext;
Switch (type) {case Picture.JPEG:ext = '. jpg ';
Break
Case Picture.PNG:ext = ". PNG";
Break Case PictUre.
Wmf:ext = ". Wmf";
Break
Case Picture.EMF:ext = ". EMF";
Break
Case Picture.PICT:ext = ". PICT";
Break
Default:continue;
FileOutputStream out = new FileOutputStream (outpath+ "pict_" + i + ext);
Out.write (data);
Out.close ();
/** * Use OpenOffice to convert Doc's various types of files to PDF format * @param docpath * @param pdfpath * @throws connectexception */public static void Doc2pdf (String docpath, String pdfpath) throws Connectexception { File Inputfile = new file (docpath);//Pre-turn file File outputfile = new file (Pdfpath); PDF file openofficeconnection connection = new socketopenofficeconnection (8100); connection.connect ()//Set up connection Documentconverter Co Nverter = new Openofficedocumentconverter (connection); DEFAUltdocumentformatregistry Formatreg = new Defaultdocumentformatregistry (); Documentformat txt = formatreg.getformatbyfileextension ("ODT");//Set file format Documentformat PDF = formatreg.getformatbyfileextension ("PDF");/Set file format &N Bsp Converter.convert (inputfile, TXT, outputfile, pdf)/File conversion CONNEC Tion.disconnect ()//close connection } /** * use OpenOffice to turn doc documents into pictures, it works well, thinking: first doc document into PDF, and then convert the PDF to a picture * @param docpath * @param imgdirpath */public static void Doc2imags (String do CPath, String imgdirpath) { string pdfpath =string.format ("%s%s.pdf", filenameutils.getfullpath ( Docpath), Filenameutils.getbasename (Docpath)); try { System.out.println (Filenameutils.getfullpath (Docpath) + " " +file Nameutils.getbasename (DoCPath));
doc2pdf (Docpath, Pdfpath); pdf2imgs (Pdfpath, Imgdirpath); file PDF = new File (pdfpath); if (Pdf.isfile ()) { /pdf.delete ()  
; &NBSP}  } catch (Connectexception e) { &NBSP ; e.printstacktrace (); &NBSP} catch (Exception e) { e.printstacktrace (); &NBSP} }/** * convert a PDF to a picture using icepdf and return the picture name * @param pdfpath * @param imagepath * @return Returns the name of the converted picture * @throws exception */ public s Tatic list<string> Pdf2imgs (String pdfpath, String imgdirpath) throws Exception { Document do Cument = new Document ();
Document.setfile (Pdfpath); FLOAT scale = 5f;//magnification float rotation = 0f;//rotation degree Li st<string> imgnames = new arraylist<string> (); int pagenum = Document.getnumberofpages (); File imgdir = new file (Imgdirpath); if (!imgdir.exists ()) { Imgdir.mkdirs () } for (int i = 0; i < pagenum; i++) { BufferedImage image = (bufferedimage) document.getpageimage (i, Graphicsrenderinghints.screen, &NBS P
Page.boundary_cropbox, rotation, scale); renderedimage rendimage = image; try { String FilePath = Imgdirpath + File.separator + i + ". jpg"; &NBSp File File = new file (FilePath); Imageio.write (rendimage, "JPG", file); Imgnames.add (Filenameutils.getname (FilePath)); } catch (IOException e) { e.prints Tacktrace (); return null; { Image.flush (); } document.dispose (); return imgnames; } /** * use Jacob to turn ppt into pictures * @param inputfile * @param imgfilepath */PU Blic static synchronized void ppt2img (String inputfile,string imgfilepath) { &NBSP;SYSTEM.OUT.PRINTL
N ("ppt2pdf========== entry");
activexcomponent app = null; try { app = new Activexcomponent ("PowerPoint.Application");
&NBSP} catch (Exception e) { e.printstacktrace ();
&NBSP} system.out.println ("--------------------");
app.setproperty ("Visible", true);
dispatch ppts = App.getproperty ("Presentations"). Todispatch ();
&NBSP;SYSTEM.OUT.PRINTLN ("ppt2pdf========== ready to open PPT document");
&NBSP;SYSTEM.OUT.PRINTLN (new File (Inputfile). exists ()); dispatch ppt = Dispatch.call (ppts, &N Bsp "Open", & nbsp inputfile, &NBsp true,//read Only &NB Sp true,//untitled Specify whether the file has a title &NBSP ; true//withwindow Specify whether file is visible &NBSP ;
Todispatch ();
&NBSP;SYSTEM.OUT.PRINTLN ("ppt2pdf========== prepare to convert PPT document"); dispatch.call (ppt, "SaveCopyAs", imgfilepath,17);//convert to JPG & nbsp &NBSP;SYSTEM.OUT.PRINTLN ("ppt2pdf========== prepare to close PPT document");
dispatch.call (ppt, "close"); &NBSP;
app.invoke ("Quit"); &NBSP}}