After learning BeautifulSoup, I wrote a web crawler that scrapes the articles of Reader (Duzhe) magazine and uses ReportLab to turn them into a PDF.
crawler.py
The code is as follows:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Author:anemone
Filename:getmain.py
Last modified:2015-02-19 16:47
E-mail:anemone@82flex.com
"""
import re
import sys
import urllib2

from bs4 import BeautifulSoup

# Python 2 only: setdefaultencoding() is removed from ``sys`` at interpreter
# start-up, so the module has to be reloaded before the default codec for
# implicit str <-> unicode conversion can be switched to UTF-8.
# NOTE(review): presumably needed because the scraped text is non-ASCII and
# is later mixed with byte strings -- confirm before removing.
reload(sys)
sys.setdefaultencoding('utf-8')
def geteacharticle(url):
    """Download one article page and extract its title, byline and body.

    Args:
        url: Absolute URL of an article page, for example
            ``http://www.52duzhe.com/2015_01/duzh20150104.html``.

    Returns:
        A dict with four keys:
        ``"title"``   -- string of the page's first ``<h1>``;
        ``"writer"``  -- stripped string of the element with id ``pub_date``;
        ``"from"``    -- stripped string of the element with id ``media_name``;
        ``"context"`` -- the page text that follows the first
        ``baidu_clb...;`` fragment (up to the next such fragment, if any).

    Raises:
        AttributeError: if one of the expected elements is missing or has no
            single string child.
        IndexError: if the ``baidu_clb...;`` marker does not occur in the page.
        Whatever ``urllib2.urlopen`` raises on a network or HTTP failure.
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the socket even when the read fails.
        response.close()

    soup = BeautifulSoup(html)
    # Tag names are matched in lowercase.
    title = soup.find("h1").string
    # NOTE(review): the ids below are assumed to be lowercase on the site;
    # the extracted source had them capitalised -- confirm against a live page.
    writer = soup.find(id="pub_date").string.strip()
    _from = soup.find(id="media_name").string.strip()

    # Split the flattened page text on the "baidu_clb...;" fragment and keep
    # what comes right after its first occurrence.
    # NOTE(review): presumably an inline ad-script call that precedes the
    # article body -- confirm.
    text = soup.get_text()
    main = re.split("baidu_clb.*;", text)
    return {"title": title, "writer": writer, "from": _from, "context": main[1]}
def _fetchsoup(url):
    """Download *url* and return its content parsed by BeautifulSoup."""
    response = urllib2.urlopen(url)
    try:
        return BeautifulSoup(response.read())
    finally:
        response.close()


def getcatalog(issue, max_retries=3):
    """Crawl every article of one issue, grouped by magazine column.

    The issue's ``index.html`` is loaded first; the first link inside its
    first table leads to a page whose ``<h2>`` headings name the columns.
    Every ``<a>`` found under the parent of a heading is fetched with
    ``geteacharticle``.

    Args:
        issue: Issue identifier such as ``"201501"``. The first four
            characters and the last two characters build the directory name
            (``2015_01``).
        max_retries: How many times each article download is attempted
            before it is skipped. The original code retried forever, which
            never terminates when a page fails permanently.

    Returns:
        An ordered dict mapping each column heading string to the list of
        article dicts returned by ``geteacharticle``, in page order.
    """
    # Local import: keeps this block self-contained without touching the
    # file-level imports. OrderedDict is a dict subclass, so callers that
    # expect a plain dict keep working while the column order is preserved.
    from collections import OrderedDict

    url = "http://www.52duzhe.com/" + issue[:4] + "_" + issue[-2:] + "/"
    duzhe = OrderedDict()

    # NOTE(review): "index.html" is assumed lowercase (the extracted source
    # had it capitalised) -- confirm against the live site.
    soup = _fetchsoup(url + "index.html")
    firsturl = url + soup.table.a.get("href")
    soup = _fetchsoup(firsturl)

    for heading in soup.find_all("h2"):
        print(heading.string)
        articles = duzhe[heading.string] = list()
        for link in heading.parent.find_all("a"):
            href = url + link.get("href")
            print(href)
            for attempt in range(1, max_retries + 1):
                try:
                    article = geteacharticle(href)
                except Exception as err:
                    # Exception (not a bare except) so Ctrl-C still aborts.
                    print("Attempt {0}/{1} failed for {2}: {3}".format(
                        attempt, max_retries, href, err))
                else:
                    articles.append(article)
                    break
            else:
                # Best effort: one broken page must not stall the whole crawl.
                print("Giving up on {0}".format(href))
    return duzhe
def readduzhe(duzhe):
    """Print the title of every article in a catalog, one per line.

    Args:
        duzhe: Mapping of column name to a list of article dicts, each of
            which has a ``"title"`` key (the structure ``getcatalog``
            returns).
    """
    for articles in duzhe.values():
        for eacharticle in articles:
            print(eacharticle["title"])
if __name__ == '__main__':
    # Demo run: crawl a fixed issue and list its article titles.
    # issue = raw_input("issue(201501):")
    readduzhe(getcatalog("201424"))
getpdf.py
The code is as follows:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Author:anemone
Filename:writetopdf.py
Last modified:2015-02-20 19:19
E-mail:anemone@82flex.com
"""
import copy

import reportlab.rl_config
from reportlab.lib import fonts
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.platypus import Paragraph, SimpleDocTemplate, flowables

import crawler
def _songstyle(base, fontsize, leading, firstlineindent=None):
    """Return a copy of *base* set in the 'song' font at the given metrics."""
    style = copy.deepcopy(base)
    style.fontName = 'song'
    style.fontSize = fontsize
    style.leading = leading
    if firstlineindent is not None:
        style.firstLineIndent = firstlineindent
    return style


def writepdf(issue, duzhe):
    """Render a crawled issue into the file ``Duzhe<issue>.pdf``.

    The document starts with a cover/contents section (issue heading, then
    each column name followed by its article titles), followed by every
    article: title, a small "writer from" line, the body paragraphs and a
    page break.

    Args:
        issue: Issue identifier string; used in the heading and file name.
        duzhe: Mapping of column name to a list of article dicts with the
            keys ``"title"``, ``"writer"``, ``"from"`` and ``"context"``.

    The font files ``SIMSUN.TTC`` and ``MSYH.TTC`` must be locatable by
    ReportLab; registering them fails otherwise.
    """
    reportlab.rl_config.warnOnMissingFontGlyphs = 0
    pdfmetrics.registerFont(TTFont('song', "SIMSUN.TTC"))
    pdfmetrics.registerFont(TTFont('Hei', "MSYH.TTC"))
    # Family 'song': regular and italic use 'song', bold and bold-italic
    # use 'Hei'.
    fonts.addMapping('song', 0, 0, 'song')
    fonts.addMapping('song', 0, 1, 'song')
    fonts.addMapping('song', 1, 0, 'Hei')
    fonts.addMapping('song', 1, 1, 'Hei')

    base = getSampleStyleSheet()['Normal']
    normalstyle = _songstyle(base, 11, 11, firstlineindent=20)
    titlestyle = _songstyle(base, 15, 20)
    firsttitlestyle = _songstyle(base, 20, 20, firstlineindent=50)
    smallstyle = _songstyle(base, 8, 8)

    # NOTE(review): in the extracted source the heading literals below were
    # broken across lines, so inline markup (possibly bold tags, given the
    # bold font mapping above) may have been lost -- confirm against the
    # author's original.
    story = []
    story.append(Paragraph("Reader {0} period".format(issue), firsttitlestyle))

    # Contents: a rule, the column name, then the titles in that column.
    for eachcolumn in duzhe:
        story.append(Paragraph('__' * 28, titlestyle))
        story.append(Paragraph('{0}'.format(eachcolumn), titlestyle))
        for eacharticle in duzhe[eachcolumn]:
            story.append(Paragraph(eacharticle["title"], normalstyle))
    story.append(flowables.PageBreak())

    # Articles, each ending with a page break.
    for eachcolumn in duzhe:
        for eacharticle in duzhe[eachcolumn]:
            story.append(
                Paragraph("{0}".format(eacharticle["title"]), titlestyle))
            story.append(
                Paragraph("{0} {1}".format(eacharticle["writer"],
                                           eacharticle["from"]), smallstyle))
            # NOTE(review): the extracted source split on an empty string,
            # which raises ValueError; line breaks are assumed to be the
            # intended paragraph separator -- confirm.
            for eachpara in eacharticle["context"].splitlines():
                if eachpara.strip():
                    story.append(Paragraph(eachpara, normalstyle))
            story.append(flowables.PageBreak())

    doc = SimpleDocTemplate("Duzhe" + issue + ".pdf")
    print("Writing PDF ...")
    doc.build(story)
def main(issue):
    """Crawl the given issue and write it out as a PDF.

    Args:
        issue: Issue identifier string such as ``"201501"``.
    """
    duzhe = crawler.getcatalog(issue)
    writepdf(issue, duzhe)
if __name__ == '__main__':
    # Strip surrounding whitespace so a stray space or newline does not end
    # up in the URL and file name built from the issue id.
    issue = raw_input("Enter issue (201501):").strip()
    main(issue)
That is all for this article; I hope you find it useful.