################################## Working with PDF and Word documents ###################################
'''
PDF and Word documents are binary files: in addition to text,
they also store a lot of font, color, and layout information.
'''
'''
Extracting text from a PDF
'''
################################## Extracting text from a PDF ###################################
import PyPDF2

# Open the PDF in binary mode; the context manager guarantees the handle is
# closed even if PyPDF2 fails while parsing the document.
with open(r'C:\Users\Administrator\Desktop\test.pdf', 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    # Report the total number of pages in the document.
    print(pdfReader.numPages)
    # Page indices are zero-based, so 0 is the first page.
    pageObj = pdfReader.getPage(0)
    # Extract the text while the underlying file is still open.
    print(pageObj.extractText())
################################## Decrypting a PDF #########################################
import PyPDF2

# The context manager closes the input file even if decryption fails.
with open(r'C:\Users\Administrator\Desktop\test.pdf', 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    # Pages of an encrypted PDF cannot be read before decrypt() succeeds, so
    # unlock the document first instead of calling getPage() up front.
    if pdfReader.isEncrypted:  # is the document password-protected?
        # Supply the decryption password (case-sensitive); decrypt() returns 0
        # when the password does not match.
        if not pdfReader.decrypt('rosebud'):
            raise ValueError('wrong password for encrypted PDF')
    pageObj = pdfReader.getPage(0)
################################## Creating PDFs #########################################
'''
PyPDF2 cannot write arbitrary text to a PDF.
Its PDF-writing abilities are limited to copying pages from other PDFs, rotating pages, overlaying pages, and encrypting files.
'''
'''
General approach:
1. Open one or more existing PDFs (the source PDFs) to get PdfFileReader objects
2. Create a new PdfFileWriter object
3. Copy pages from the PdfFileReader objects into the PdfFileWriter object
4. Use the PdfFileWriter object to write the output PDF
'''
#################################################### Copying pages ###########################################################
def merge(pdf_one, pdf_two, filename='my.pdf', output_dir=r'C:\Users\Administrator\Desktop'):
    """Concatenate two PDF files into a single output PDF.

    All pages of ``pdf_one`` are copied first, followed by all pages of
    ``pdf_two``. Progress messages are printed while copying.

    Args:
        pdf_one: Path of the first source PDF.
        pdf_two: Path of the second source PDF.
        filename: File name of the merged PDF.
        output_dir: Directory the merged PDF is written to.

    Returns:
        None. The merged document is written to ``output_dir``/``filename``.
    """
    # Imported locally so the function is usable on its own.
    import os

    import PyPDF2

    pdf_output = PyPDF2.PdfFileWriter()
    # Both inputs must stay open until the writer has produced the output;
    # the context managers close them even when an error occurs.
    with open(pdf_one, 'rb') as input_one, open(pdf_two, 'rb') as input_two:
        pdf_input_one = PyPDF2.PdfFileReader(input_one)
        pdf_input_two = PyPDF2.PdfFileReader(input_two)
        num_one = pdf_input_one.getNumPages()
        num_two = pdf_input_two.getNumPages()
        print('%d %d' % (num_one, num_two))
        for page_num in range(num_one):
            print('hereo')
            pdf_output.addPage(pdf_input_one.getPage(page_num))
        for page_num in range(num_two):
            print('heret')
            pdf_output.addPage(pdf_input_two.getPage(page_num))
        # Join with a path separator; plain concatenation would glue the file
        # name onto the directory name ("...Desktopmy.pdf").
        pdf_name = os.path.join(output_dir, filename)
        print(pdf_name)
        with open(pdf_name, 'wb') as output_stream:
            pdf_output.write(output_stream)
    print('done!')
# Demo: merge the two sample PDFs using merge()'s default output name and directory.
merge(r'C:\Users\Administrator\Desktop\Pairs_Trading_Quantitative Methods and Analysis.pdf',
      r'C:\Users\Administrator\Desktop\deMontjoye.SM.pdf')
#################################################### Rotating pages ###########################################################
'''
The pages of a PDF document can be rotated in 90-degree increments using the
rotateClockwise() and rotateCounterClockwise() methods.
Pass one of the integers 90, 180, or 270 to these methods.
'''
def merge(pdf_one, pdf_two, filename='my.pdf', output_dir=r'C:\Users\Administrator\Desktop',
          rotation=90):
    """Concatenate two PDF files, rotating every page clockwise.

    All pages of ``pdf_one`` are copied first, followed by all pages of
    ``pdf_two``; each page is rotated before it is added to the output.
    Progress messages are printed while copying.

    Args:
        pdf_one: Path of the first source PDF.
        pdf_two: Path of the second source PDF.
        filename: File name of the merged PDF.
        output_dir: Directory the merged PDF is written to.
        rotation: Clockwise rotation in degrees applied to every page; must
            be a multiple of 90. Defaults to 90.

    Returns:
        None. The merged document is written to ``output_dir``/``filename``.
    """
    # Imported locally so the function is usable on its own.
    import os

    import PyPDF2

    pdf_output = PyPDF2.PdfFileWriter()
    # Both inputs must stay open until the writer has produced the output;
    # the context managers close them even when an error occurs.
    with open(pdf_one, 'rb') as input_one, open(pdf_two, 'rb') as input_two:
        pdf_input_one = PyPDF2.PdfFileReader(input_one)
        pdf_input_two = PyPDF2.PdfFileReader(input_two)
        num_one = pdf_input_one.getNumPages()
        num_two = pdf_input_two.getNumPages()
        print('%d %d' % (num_one, num_two))
        # Same copy-and-rotate step for both sources, in order.
        sources = (
            ('hereo', pdf_input_one, num_one),
            ('heret', pdf_input_two, num_two),
        )
        for progress_tag, reader, page_count in sources:
            for page_num in range(page_count):
                print(progress_tag)
                page_obj = reader.getPage(page_num)
                # rotateClockwise() returns the rotated page object.
                page_obj = page_obj.rotateClockwise(rotation)
                pdf_output.addPage(page_obj)
        # Join with a path separator; plain concatenation would glue the file
        # name onto the directory name ("...Desktopmy.pdf").
        pdf_name = os.path.join(output_dir, filename)
        print(pdf_name)
        with open(pdf_name, 'wb') as output_stream:
            pdf_output.write(output_stream)
    print('done!')
# Demo: run the merge() defined directly above on the two sample PDFs.
merge(r'C:\Users\Administrator\Desktop\Pairs_Trading_Quantitative Methods and Analysis.pdf',
      r'C:\Users\Administrator\Desktop\deMontjoye.SM.pdf')
#################################################### Overlaying pages ###########################################################
import PyPDF2

# Both source files must stay open until the result has been written; the
# context managers close them (including the watermark file) on any outcome.
with open(r'C:\Users\Administrator\Desktop\Pairs_Trading_Quantitative Methods and Analysis.pdf', 'rb') as minutesFile, \
        open(r'C:\Users\Administrator\Desktop\deMontjoye.SM.pdf', 'rb') as watermarkFile:
    pdfReader = PyPDF2.PdfFileReader(minutesFile)
    minutesFirstPage = pdfReader.getPage(0)
    pdfWatermarkReader = PyPDF2.PdfFileReader(watermarkFile)
    # Overlay the first page of the watermark PDF onto the first page only.
    minutesFirstPage.mergePage(pdfWatermarkReader.getPage(0))
    pdfWriter = PyPDF2.PdfFileWriter()
    pdfWriter.addPage(minutesFirstPage)
    # Copy the remaining pages unchanged (page 0 was already added above).
    for pageNum in range(1, pdfReader.numPages):
        pageObj = pdfReader.getPage(pageNum)
        pdfWriter.addPage(pageObj)
    with open(r'C:\Users\Administrator\Desktop\merge.pdf', 'wb') as resultPdfFile:
        pdfWriter.write(resultPdfFile)
#################################################### Encrypting a PDF ###########################################################
import PyPDF2

# Keep the source open until the encrypted copy has been written; the context
# manager closes it afterwards, even if an error occurs.
with open(r'C:\Users\Administrator\Desktop\deMontjoye.SM.pdf', 'rb') as pdfFile:
    pdfReader = PyPDF2.PdfFileReader(pdfFile)
    pdfWriter = PyPDF2.PdfFileWriter()
    # Copy every page into the writer before applying encryption.
    for pageNum in range(pdfReader.numPages):
        pdfWriter.addPage(pdfReader.getPage(pageNum))
    # Password-protect the output document (the password is case-sensitive).
    pdfWriter.encrypt('swordfish')
    with open(r'C:\Users\Administrator\Desktop\t.pdf', 'wb') as resultPdf:
        pdfWriter.write(resultPdf)
# Python automation: working with PDFs