# coding:utf-8
"""Crawl travel news articles from travel.gmw.cn and save them as text files.

The index page is scanned for <a href=...> links.  Each link is prefixed with
the site root; links whose resulting URL matches one of the dated article
patterns (2015-... or 2014-...) are downloaded once, their paragraph text is
extracted, and every non-empty result is appended to gmw/travel/<n>.txt.
"""
__author__ = 'Minmin'

import os
import re
import urllib.request
from html.parser import HTMLParser

BASE_URL = "http://travel.gmw.cn/"
INDEX_URL = "http://travel.gmw.cn/node_39034.htm"
OUTPUT_DIR = "gmw/travel/"

# Only links whose absolute URL starts with one of these dated prefixes are
# treated as articles.
ARTICLE_PATTERNS = (
    re.compile('http://travel.gmw.cn/2015-.*'),
    re.compile('http://travel.gmw.cn/2014-.*'),
)

# Paragraph layouts tried in order; the plain <p> form is only used when the
# styled form yields nothing.  Both capture tag-free text only.
_STYLED_PARAGRAPH = re.compile(
    r'<p style="text-indent:30px; margin:0px 3px 15px">([^<>]*)</p>')
_PLAIN_PARAGRAPH = re.compile(r'<p>([^<>]*)</p>')
# A further layout the original author left disabled:
#   <p style="text-justify:distribute; text-align:justify" align="justify">(.*?)</p>


def gethtml(url):
    """Download ``url`` and return the response body as text.

    The body is decoded with the charset announced in the response headers.
    NOTE(review): when no charset is announced UTF-8 is assumed, with
    undecodable bytes replaced -- confirm this matches the target site.
    """
    # The context manager closes the connection even if read() raises.
    with urllib.request.urlopen(url) as page:
        raw = page.read()
        charset = page.headers.get_content_charset()
    return raw.decode(charset or "utf-8", "replace")


def func(str, html=None):
    """Return the article text found in a page, one paragraph per line.

    Parameters:
        str:  URL of the article page (the name shadows the builtin but is
              kept for compatibility with existing callers).
        html: optional page source; when given, nothing is downloaded.

    Returns:
        The concatenated non-empty paragraphs, each followed by a newline,
        or an empty string when no paragraph matched.
    """
    if html is None:
        # Fetch the page named by the argument, and fetch it only once.
        html = gethtml(str)

    paragraphs = _STYLED_PARAGRAPH.findall(html) or _PLAIN_PARAGRAPH.findall(html)

    pieces = []
    for text in paragraphs:
        if len(text) == 0:
            continue
        # NOTE(review): the literal being stripped here may originally have
        # been "&nbsp;" or a full-width space that was lost when the script
        # was copied -- confirm against real article pages.
        text = text.replace(" ", "")
        # The capture groups above exclude '<' and '>', so these two only take
        # effect if a looser paragraph pattern is enabled.
        text = text.replace("<strong>", " ")
        text = text.replace("</strong>", " ")
        pieces.append(text + '\n')
    return ''.join(pieces)


class Urlpaser(HTMLParser):
    """Collect the ``href`` value of every <a> tag fed to the parser.

    After ``feed()`` the values are available, in document order, in
    ``self.urls``.
    """

    def reset(self):
        """Reset parser state and start with an empty list of links."""
        HTMLParser.reset(self)
        self.urls = []

    def handle_starttag(self, tag, attrs):
        """Forward anchor tags to ``start_a``; other tags are ignored."""
        if tag == 'a':
            self.start_a(attrs)

    def start_a(self, attrs):
        """Record the href attribute(s) of one <a> tag.

        ``attrs`` is a list of (name, value) pairs.  Valueless or empty href
        attributes are skipped so later string concatenation cannot fail.
        """
        href = [v for k, v in attrs if k == 'href' and v]
        if href:
            self.urls.extend(href)


def _save_article(number, artical):
    """Append ``artical`` to OUTPUT_DIR/<number>.txt, creating the directory."""
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    with open(OUTPUT_DIR + str(number) + '.txt', 'a', encoding='utf-8') as f:
        f.write(artical)


def main():
    """Crawl the index page and save every matching article exactly once."""
    iparser = Urlpaser()
    iparser.feed(gethtml(INDEX_URL))

    i = 0          # running number used for the output file names
    url2 = set()   # article URLs that have already been handled
    for href in iparser.urls:
        url = BASE_URL + href
        if url in url2:
            continue
        if not any(pattern.match(url) for pattern in ARTICLE_PATTERNS):
            continue
        url2.add(url)
        print(url)
        # Extract the text for this URL in every case, so a stale article is
        # never printed or saved under a different link.
        artical = func(url)
        print(artical)
        if len(artical) != 0:
            i = i + 1
            _save_article(i, artical)


if __name__ == "__main__":
    main()
# (2) Crawl the travel news section of the Guangming Online website (travel.gmw.cn).