This article describes how to use Python to crawl HTML web pages and save them as PDF files. It covers installing the PyPDF2 module (used to merge PDFs) and, with a complete example, the techniques for fetching HTML pages and generating PDF files with pdfkit and wkhtmltopdf. To share with you for your reference, the details are as follows:
I. Introduction
In this example we will scrape the HTML pages of Liao Xuefeng's Python tutorial and save them as a single PDF.
II. Preparation
1. Installation and use of PyPDF2 (for merging PDFs):
PyPDF2 version: 1.25.1
Installation:
pip install PyPDF2
Example of use:
from PyPDF2 import PdfFileMerger

merger = PdfFileMerger()
input1 = open("hql_1_20.pdf", "rb")
input2 = open("hql_21_40.pdf", "rb")
merger.append(input1)
merger.append(input2)
# Write to an output PDF document
output = open("hql_all.pdf", "wb")
merger.write(output)
output.close()
2. requests and BeautifulSoup are the two workhorses of web crawling: requests handles the network requests and BeautifulSoup parses and manipulates the HTML. With these two, the job is done with ease; a full crawler framework such as Scrapy would be overkill for such a small program. In addition, since we are converting HTML files to PDF, we also need library support for that step. wkhtmltopdf is a very useful tool that converts HTML to PDF on multiple platforms, and pdfkit is a Python wrapper around wkhtmltopdf. First, install the following dependencies.
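The Python-side packages can be installed with pip (beautifulsoup4 is the PyPI package name for BeautifulSoup; the others are named as shown):

pip install requests beautifulsoup4 pdfkit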
On Windows, download the stable version of wkhtmltopdf from http://wkhtmltopdf.org/downloads.html and install it. After the installation is complete, add the program's execution path to the system's $PATH environment variable; otherwise pdfkit cannot find wkhtmltopdf and fails with the error "No wkhtmltopdf executable found". On Ubuntu and CentOS, wkhtmltopdf can be installed directly from the command line.
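For example (a minimal sketch; exact package availability depends on the distribution release, and CentOS typically needs an extra repository such as EPEL enabled first):

sudo apt-get install wkhtmltopdf   # Ubuntu / Debian
sudo yum install wkhtmltopdf       # CentOS

1. Get the URL of each article through the tutorial's directory page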
def get_url_list():
    """
    Get a list of all URLs in the directory
    :return:
    """
    response = requests.get("http://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000")
    soup = BeautifulSoup(response.content, "html.parser")
    menu_tag = soup.find_all(class_="uk-nav uk-nav-side")[1]
    urls = []
    for li in menu_tag.find_all("li"):
        url = "http://www.liaoxuefeng.com" + li.a.get('href')
        urls.append(url)
    return urls
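A quick sanity check might look like this (a sketch; the selector above assumes the tutorial index page still uses the uk-nav uk-nav-side menu class, so the actual count depends on the live page):

urls = get_url_list()
print(len(urls))    # number of article URLs found in the menu
print(urls[:3])     # first few article URLs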
2. Given an article URL, save the article's HTML to a file using a template
def parse_url_to_html(url, name):
    """
    Parse the URL and save the article body as an HTML file
    :param url: URL to parse
    :param name: name of the saved html file
    :return: html file name
    """
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        # Body
        body = soup.find_all(class_="x-wiki-content")[0]
        # Title
        title = soup.find('h4').get_text()
        # Add the title in front of the body and center it
        center_tag = soup.new_tag("center")
        title_tag = soup.new_tag('h1')
        title_tag.string = title
        center_tag.insert(1, title_tag)
        body.insert(1, center_tag)
        html = str(body)
        # Change the relative src paths of the img tags in the body to absolute paths
        pattern = "(<img .*?src=\")(.*?)(\")"
        def func(m):
            if not m.group(2).startswith("http"):
                rtn = m.group(1) + "http://www.liaoxuefeng.com" + m.group(2) + m.group(3)
                return rtn
            else:
                return m.group(1) + m.group(2) + m.group(3)
        html = re.compile(pattern).sub(func, html)
        html = html_template.format(content=html)
        html = html.encode("utf-8")
        with open(name, 'wb') as f:
            f.write(html)
        return name
    except Exception as e:
        logging.error("Parse error", exc_info=True)
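Note that parse_url_to_html references a module-level html_template string. It is defined in the full source at the end of this article and is simply a minimal page wrapper around the extracted body:

html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
</head>
<body>
{content}
</body>
</html>
"""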
3. Convert html to pdf
def save_pdf(htmls, file_name):
    """
    Save the html file(s) to a pdf file
    :param htmls: html file name, or list of html file names
    :param file_name: pdf file name
    :return:
    """
    options = {
        'page-size': 'Letter',
        'margin-top': '0.75in',
        'margin-right': '0.75in',
        'margin-bottom': '0.75in',
        'margin-left': '0.75in',
        'encoding': "UTF-8",
        'custom-header': [
            ('Accept-Encoding', 'gzip')
        ],
        'cookie': [
            ('cookie-name1', 'cookie-value1'),
            ('cookie-name2', 'cookie-value2'),
        ],
        'outline-depth': 10,
    }
    pdfkit.from_file(htmls, file_name, options=options)
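pdfkit.from_file accepts either a single HTML file path or a list of paths, so save_pdf can render one article at a time or several articles into one PDF in a single call. For example (file names here are purely illustrative):

save_pdf("0.html", "article_0.pdf")
save_pdf(["0.html", "1.html"], "articles_0_1.pdf")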
4. Merge the individually converted PDFs into one PDF
merger = PdfFileMerger()
for pdf in pdfs:
    merger.append(open(pdf, 'rb'))
    print(u"Merge completed: " + pdf)
# Write the merged result (see the full source below)
output = open(u"廖雪峰 Python_all.pdf", "wb")
merger.write(output)
Full source code:
# coding=utf-8
import os
import re
import time
import logging
import pdfkit
import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfFileMerger

html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
</head>
<body>
{content}
</body>
</html>
"""
def parse_url_to_html(url, name):
    """
    Parse the URL and save the article body as an HTML file
    :param url: URL to parse
    :param name: name of the saved html file
    :return: html file name
    """
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        # Body
        body = soup.find_all(class_="x-wiki-content")[0]
        # Title
        title = soup.find('h4').get_text()
        # Add the title in front of the body and center it
        center_tag = soup.new_tag("center")
        title_tag = soup.new_tag('h1')
        title_tag.string = title
        center_tag.insert(1, title_tag)
        body.insert(1, center_tag)
        html = str(body)
        # Change the relative src paths of the img tags in the body to absolute paths
        pattern = "(<img .*?src=\")(.*?)(\")"
        def func(m):
            if not m.group(2).startswith("http"):
                rtn = m.group(1) + "http://www.liaoxuefeng.com" + m.group(2) + m.group(3)
                return rtn
            else:
                return m.group(1) + m.group(2) + m.group(3)
        html = re.compile(pattern).sub(func, html)
        html = html_template.format(content=html)
        html = html.encode("utf-8")
        with open(name, 'wb') as f:
            f.write(html)
        return name
    except Exception as e:
        logging.error("Parse error", exc_info=True)
def get_url_list():
    """
    Get a list of all URLs in the directory
    :return:
    """
    response = requests.get("http://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000")
    soup = BeautifulSoup(response.content, "html.parser")
    menu_tag = soup.find_all(class_="uk-nav uk-nav-side")[1]
    urls = []
    for li in menu_tag.find_all("li"):
        url = "http://www.liaoxuefeng.com" + li.a.get('href')
        urls.append(url)
    return urls
def save_pdf(htmls, file_name):
    """
    Save the html file(s) to a pdf file
    :param htmls: html file name, or list of html file names
    :param file_name: pdf file name
    :return:
    """
    options = {
        'page-size': 'Letter',
        'margin-top': '0.75in',
        'margin-right': '0.75in',
        'margin-bottom': '0.75in',
        'margin-left': '0.75in',
        'encoding': "UTF-8",
        'custom-header': [
            ('Accept-Encoding', 'gzip')
        ],
        'cookie': [
            ('cookie-name1', 'cookie-value1'),
            ('cookie-name2', 'cookie-value2'),
        ],
        'outline-depth': 10,
    }
    pdfkit.from_file(htmls, file_name, options=options)
def main():
    start = time.time()
    file_name = u"liaoxuefeng_Python3_tutorial"
    urls = get_url_list()
    for index, url in enumerate(urls):
        parse_url_to_html(url, str(index) + ".html")
    htmls = []
    pdfs = []
    # One html file was saved per URL above, so iterate over the same range
    for i in range(len(urls)):
        htmls.append(str(i) + '.html')
        pdfs.append(file_name + str(i) + '.pdf')
        save_pdf(str(i) + '.html', file_name + str(i) + '.pdf')
        print(u"Conversion completed: " + str(i) + ".html")
    merger = PdfFileMerger()
    for pdf in pdfs:
        merger.append(open(pdf, 'rb'))
        print(u"Merge completed: " + pdf)
    output = open(u"廖雪峰 Python_all.pdf", "wb")
    merger.write(output)
    output.close()
    print(u"PDF output succeeded!")
    for html in htmls:
        os.remove(html)
        print(u"Deleted temporary file " + html)
    for pdf in pdfs:
        os.remove(pdf)
        print(u"Deleted temporary file " + pdf)
    total_time = time.time() - start
    print(u"Total time: %f seconds" % total_time)

if __name__ == '__main__':
    main()