# -*- coding: utf-8 -*-
"""Crawl the Sina military-news front page and save each article locally.

For every link on http://mil.news.sina.com.cn/ that points back into the
same section, the paragraph text (``<p>…</p>``) is extracted and written
to ``sina_military/<n>.txt``.

Note: the original version targeted Python 2 and ``sgmllib`` (removed in
Python 3); this port uses ``html.parser`` and ``urllib.request``.
"""
__author__ = 'Minmin'

import os
import re
import urllib.request
from html.parser import HTMLParser

# Runtime constants kept byte-identical to the original script.
START_URL = "http://mil.news.sina.com.cn/"
OUT_DIR = 'sina_military'


def gethtml(url):
    """Fetch *url* and return the response body decoded to text.

    The charset is taken from the HTTP headers when present, falling back
    to UTF-8 with replacement (the page encoding is not guaranteed —
    NOTE(review): original code read raw bytes and never decoded).
    """
    page = urllib.request.urlopen(url)
    try:
        charset = page.headers.get_content_charset() or 'utf-8'
        return page.read().decode(charset, errors='replace')
    finally:
        # Original leaked the handle if read() raised; always close.
        page.close()


def extract_article(html):
    """Return the article text found in *html*.

    Collects every ``<p>…</p>`` whose content holds no nested tags
    (the ``[^<>]*`` pattern from the original), drops empty paragraphs,
    removes spaces (the pages are Chinese text, so spaces are noise),
    and joins the rest with newlines. Returns '' when nothing matches.
    """
    paragraphs = re.findall(r"<p>([^<>]*)</p>", html, re.M)
    # The original also replaced <strong>/</strong>, but [^<>]* can never
    # capture a tag, so that branch was dead code and is omitted here.
    return ''.join(p.replace(" ", "") + '\n' for p in paragraphs if p)


def func(url):
    """Fetch *url* and return its extracted article text.

    Bug fix: the original ignored its parameter and read a module-level
    URL variable; it now operates on the argument it is given.
    """
    return extract_article(gethtml(url))


class URLPaser(HTMLParser):
    """HTML parser that accumulates every ``href`` of every ``<a>`` tag.

    Collected links are available in ``self.urls`` after ``feed()``.
    (Class name kept from the original, typo included, for compatibility.)
    """

    def reset(self):
        # HTMLParser.__init__ calls reset(), so self.urls always exists.
        HTMLParser.reset(self)
        self.urls = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            self.urls.extend(v for k, v in attrs if k == 'href' and v is not None)


def main():
    """Crawl the front page and store each non-empty article on disk."""
    parser = URLPaser()
    page = urllib.request.urlopen(START_URL)
    try:
        parser.feed(page.read().decode('utf-8', errors='replace'))
    finally:
        page.close()

    # Dots escaped — the original pattern let '.' match any character.
    pattern = re.compile(r'http://mil\.news\.sina\.com\.cn/.*')

    # exist_ok avoids the check-then-create race of the original
    # os.path.exists(...) == False / makedirs pair.
    os.makedirs(OUT_DIR, exist_ok=True)

    seen = set()
    count = 0
    for url in parser.urls:
        if not pattern.match(url) or url in seen:
            continue
        seen.add(url)
        artical = func(url)
        print(artical)
        if artical:
            count += 1
            # 'a+' append mode and '<n>.txt' naming kept from the original.
            path = os.path.join(OUT_DIR, str(count) + '.txt')
            with open(path, 'a+', encoding='utf-8') as f:
                f.write(artical)


if __name__ == '__main__':
    # Guarded so importing the module does not trigger a network crawl.
    main()
④ Crawl Sina military news and store each article's text in the corresponding folder.