1) ② Crawl part of the travel news from Guangming Network (travel.gmw.cn)
# coding: UTF-8
"""Crawl travel articles from Guangming Network (travel.gmw.cn).

Run as a script, this module downloads the travel index page, collects every
link on it, keeps the links whose URL starts with a 2014 or 2015 date prefix,
extracts the paragraph text of each linked article, prints it, and appends it
to ``gmw/travel/<n>.txt`` (one numbered file per non-empty article).
"""
__author__ = 'minmin'

import io
import os
import re

try:  # Python 3
    from html.parser import HTMLParser
    from urllib.request import urlopen
except ImportError:  # Python 2, which the original script targeted
    from HTMLParser import HTMLParser
    from urllib import urlopen

# Index page listing the travel articles.
INDEX_URL = "http://travel.gmw.cn/node_39034.htm"
# Links on the index page are treated as relative to this prefix.
BASE_URL = "http://travel.gmw.cn/"
# Directory that receives the numbered article files.
OUTPUT_DIR = "gmw/travel/"
# Article URLs start with the publication year; only 2014 and 2015 are kept.
ARTICLE_URL_RE = re.compile(r"http://travel\.gmw\.cn/(?:2014|2015)-")
# Body paragraphs carry this inline style on some pages ...
STYLED_PARAGRAPH_RE = re.compile(
    r'<p style="TEXT-INDENT: 30px; MARGIN: 0px 3px 15px">([^<>]*)</p>', re.M)
# ... and are plain <p> elements on others.
PLAIN_PARAGRAPH_RE = re.compile(r"<p>([^<>]*)</p>", re.M)


def getHtml(url):
    """Download *url* and return its body as text.

    The response is always closed, even when reading fails.
    """
    page = urlopen(url)
    try:
        raw = page.read()
    finally:
        page.close()
    # NOTE(review): assumes the site serves UTF-8 (matching this file's
    # declared coding); undecodable bytes are replaced rather than raising.
    return raw.decode("utf-8", "replace")


def _extract_article(html):
    """Return the article text found in *html*, one paragraph per line."""
    # Prefer the styled body paragraphs; fall back to plain <p> elements only
    # when the page has none of the styled ones.
    paragraphs = (STYLED_PARAGRAPH_RE.findall(html)
                  or PLAIN_PARAGRAPH_RE.findall(html))
    lines = []
    for paragraph in paragraphs:
        if not paragraph:
            continue
        for junk in ("&nbsp;", "<strong>", "</strong>"):
            paragraph = paragraph.replace(junk, "")
        lines.append(paragraph + "\n")
    return "".join(lines)


def func(str):
    """Fetch the page at the given URL and return its article text.

    The parameter keeps its original name ``str`` for backward compatibility;
    it is the URL of the article page. The page is downloaded once and the
    result is the concatenation of its paragraphs, each followed by a
    newline (an empty string when no paragraph matches).
    """
    return _extract_article(getHtml(str))


class URLPaser(HTMLParser):
    """HTML parser that collects the ``href`` value of every ``<a>`` tag.

    After ``feed()``, the collected values are available in ``self.urls`` in
    document order.
    """

    def reset(self):
        """Reset the parser state and clear the collected links."""
        HTMLParser.reset(self)
        self.urls = []

    def start_a(self, attrs):
        """Record the href value(s) from the attributes of one ``<a>`` tag."""
        href = [v for k, v in attrs if k == 'href' and v]
        if href:
            self.urls.extend(href)

    def handle_starttag(self, tag, attrs):
        """Dispatch ``<a>`` start tags to :meth:`start_a`."""
        if tag == 'a':
            self.start_a(attrs)


def main():
    """Crawl the index page and save every 2014/2015 article it links to."""
    link_parser = URLPaser()
    link_parser.feed(getHtml(INDEX_URL))

    if not os.path.isdir(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    seen = set()  # URLs already processed, so each article is saved once
    saved = 0     # number of article files written so far
    for href in link_parser.urls:
        url = BASE_URL + href
        if url in seen or not ARTICLE_URL_RE.match(url):
            continue
        seen.add(url)
        print(url)
        artical = func(url)
        print(artical)
        if not artical:
            continue
        saved += 1
        out_path = os.path.join(OUTPUT_DIR, "%d.txt" % saved)
        with io.open(out_path, 'a', encoding='utf-8') as out:
            out.write(artical)


if __name__ == "__main__":
    main()