Crawl site content and save it in PDF format
1. Install the package that PDF generation depends on: pip install pdfkit
However, using pdfkit at this point still raises an error:
Traceback (most recent call last):
  File "C:\Users\zhan\AppData\Roaming\Python\Python36\site-packages\pdfkit\configuration.py", line 21, in __init__
    with open(self.wkhtmltopdf) as f:
FileNotFoundError: [Errno 2] No such file or directory: b''

During handling of the above exception, another exception occurred:

OSError: No wkhtmltopdf executable found: "b''"
If this file exists please check that this process can read it. Otherwise please install wkhtmltopdf - https://github.com/JazzCore/python-pdfkit/wiki/Installing-wkhtmltopdf
Follow the hint in the error message: download and install wkhtmltopdf, and make a note of the installation path.
Then use pdfkit with the following code, passing it the path of the wkhtmltopdf executable:
# path_wk = r'D:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'
# config = pdfkit.configuration(wkhtmltopdf=path_wk)
# pdfkit.from_string("helloWorld", "1.pdf", configuration=config)
Start your code implementation when the preparation is complete:
#!/usr/bin/env python
# coding: utf8
"""Crawl the chapter index of an online book and save every chapter as a PDF.

Requires the ``requests`` and ``pdfkit`` packages plus a local installation
of the ``wkhtmltopdf`` executable (pdfkit is only a wrapper around it).
"""
import sys
import requests
import pdfkit
import re
import os


class Htmltopdf():
    """Download the chapters linked from a table-of-contents page as PDFs.

    Typical use: call ``get_html()`` to collect the chapter links, then
    ``get_chapter()`` to render each linked page into ``self.dirname``.
    """

    def __init__(self):
        # Absolute path of the wkhtmltopdf executable; pdfkit cannot work
        # without it, so it is handed over explicitly instead of relying
        # on PATH.
        self.path_wk = r'D:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'
        self.config = pdfkit.configuration(wkhtmltopdf=self.path_wk)
        # Index page; chapter hrefs are appended to this URL as-is.
        self.url = "http://www.apelearn.com/study_v2/"
        # Captures (href, title) for every top-level TOC entry.
        # NOTE(review): the pattern was restored from a whitespace- and
        # case-mangled copy; confirm it against the live page markup.
        self.reg = re.compile(
            r'<li class="toctree-l1"><a.*?href="(.*?)">(.*?)</a></li>')
        # Directory (relative to the working directory) receiving the PDFs.
        self.dirname = "Aminglinuxbook"
        # List of (href, title) tuples, filled in by get_html().
        self.result = []
        self.chapter = ""
        self.chapter_content = ""

    def get_html(self):
        """Fetch the index page and store the chapter links in self.result.

        Each entry is an ``(href, title)`` tuple. Duplicates are dropped
        while the order of first appearance on the page is kept.
        """
        # The context manager closes the session's connections when done.
        with requests.session() as session:
            response = session.get(self.url)
            response.encoding = 'Utf-8'
            matches = self.reg.findall(response.text)
        # dict.fromkeys de-duplicates like set() but keeps page order, so
        # chapters are processed in a stable sequence.
        self.result = list(dict.fromkeys(matches))

    def get_pdfdir(self):
        """Create the output directory if it does not exist yet."""
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(self.dirname, exist_ok=True)

    def get_chapter(self):
        """Render every chapter in self.result to a PDF file.

        The file is named ``<href without extension>-<title>.pdf`` and is
        written into ``self.dirname``. A failure on one chapter is printed
        and skipped so the remaining chapters are still processed.
        """
        self.get_pdfdir()
        for chapter in self.result:
            href, title = chapter[0], chapter[1]
            pdf_file_name = "{0}-{1}.pdf".format(href.split('.')[0], title)
            pdf_url = "{0}{1}".format(self.url, href)
            file_path = os.path.join(self.dirname, pdf_file_name).strip()
            print(pdf_url)
            print(file_path)
            try:
                pdfkit.from_url(pdf_url, file_path, configuration=self.config)
            except Exception as e:
                # Deliberately best-effort: report and continue with the
                # next chapter instead of aborting the whole crawl.
                print(e)


def main():
    """Collect the chapter links and convert all of them to PDF."""
    html2pdf = Htmltopdf()
    html2pdf.get_html()
    html2pdf.get_chapter()


if __name__ == "__main__":
    main()
Operation Result:
View the downloaded PDF files in the output directory
Data processing (HTML to PDF)