I. Description of the problem
Use Python to read PDF text content.
II. Result
III. Runtime environment
Python 2.7
IV. Libraries that need to be installed
pip install pdfminer
V. Source code
Code 1 (Win64)
# coding=utf-8 Import sys reload (SYS) sys.setdefaultencoding (' utf-8 ') Import time Time1=time.time () import Os.path from PD
Fminer.pdfparser Import pdfparser,pdfdocument from pdfminer.pdfinterp import Pdfresourcemanager, Pdfpageinterpreter From Pdfminer.converter import Pdfpageaggregator to pdfminer.layout import lttextboxhorizontal,laparams from
PDFMINER.PDFINTERP Import pdftextextractionnotallowed result=[] class Cpdf2txtmanager (): Def __init__ (self): "' Constructor ' Def changepdftotext (self, filePath): File = open (path, ' RB ') # in binary read mode Open #用文件对象来创建一个pdf文档分析器 Praser = pdfparser (file) # Create a PDF document DOC = Pdfdocument () # even
Connection Analyzer and Document Object Praser.set_document (DOC) Doc.set_parser (praser) # provides initialization password # If you don't have a password, create an empty string. Doc.initialize () # detects whether the document provides TXT conversion and ignores if not doc.is_extractable:raise Pdftextextra Ctionnotallowed # Create PDF Explorer to pipeShared Resources Rsrcmgr = Pdfresourcemanager () # Create a PDF device Object laparams = Laparams () device = Pdfpagea
Ggregator (Rsrcmgr, Laparams=laparams) # Create a PDF interpreter Object interpreter = Pdfpageinterpreter (rsrcmgr, device)
Pdfstr = ' # Loop through the list, process one page at a time for page in Doc.get_pages (): # doc.get_pages () get page list Interpreter.process_page (page) # The Ltpage object that accepts the page layout = Device.get_result () for X i n layout:if hasattr (x, "Get_text"): # Print X.get_text () result.a Ppend (X.get_text ()) FileNames = Os.path.splitext (FilePath) with open (filenames[0)
+ ' txt ', ' WB ') as F:results = X.get_text () print (results) F.write (results + ' \ n ') if __name__ = = ' __main__ ': ' "' Parse PDF text, save to TXT file ' path = U ' C: /data3.pdf ' Pdf2txtmAnager = Cpdf2txtmanager () pdf2txtmanager.changepdftotext (path) # print result[0] time2 = time.time () pr
int U ' OK, parse pdf end! ' Print U ' total time consuming: ' + str (time2-time1) + ' s '
Code 2 (Win32)
# coding=utf-8 Import sys reload (SYS) sys.setdefaultencoding (' utf-8 ') Import time Time1=time.time () import Os.path from PD Fminer.pdfinterp import Pdfresourcemanager, pdfpageinterpreter from pdfminer.converter import pdfpageaggregator from Pdfminer.layout Import laparams from pdfminer.pdfpage import pdftextextractionnotallowed from Pdfminer.pdfparser Import Pdfparser from pdfminer.pdfdocument import pdfdocument from pdfminer.pdfpage import pdfpage result=[] class Cpdf2txtmanag
ER (): def __init__ (self): ' constructor ' Def changepdftotext (self, FilePath):
File = open (path, ' RB ') # opens in binary read mode #用文件对象来创建一个pdf文档分析器 praser = pdfparser (file) # Create a PDF document doc = Pdfdocument (praser) # detects whether a document provides a TXT conversion and ignores if not doc.is_extractable:raise pdfte
Xtextractionnotallowed # Create PDF Explorer to manage shared resources Rsrcmgr = Pdfresourcemanager () # Create a PDF device object
Laparams = Laparams ()device = Pdfpageaggregator (Rsrcmgr, Laparams=laparams) # Create a PDF interpreter Object interpreter = Pdfpageinterpreter (RSR Cmgr, device) Pdfstr = ' # Looping through the list, processing one page at a time for page in Pdfpage.create_pages (DOC): # doc.get_ Pages () get page List Interpreter.process_page (page) # Ltpage object that accepts the page layout = Device.get_r
Esult () for x in Layout:if hasattr (x, "Get_text"): # Print X.get_text ()
Result.append (X.get_text ()) FileNames = Os.path.splitext (FilePath) With open (Filenames[0] + '. txt ', ' WB ') as F:results = X.get_text () pri NT (Results) F.write (results + ' \ n ') if __name__ = = ' __main__ ': ' ' parse PDF text, save to TXT In the "Path" = U ' c:/36.pdf ' Pdf2txtmanager = Cpdf2txtmanager () pdf2txtmanager.changepdftotext (path) # Print Result[0] TimE2 = Time.time () print u ' OK, parse pdf end! '
Print U ' total time consuming: ' + str (time2-time1) + ' s '