# -*- coding: utf-8 -*-
"""Crawl technology-related news from China.com (economy.china.com/internet/).

Fetches the section index page, collects every article link, rewrites each
link to its single-page ("_all.html") form, extracts the paragraph text and
writes each non-empty article to ``china_news_technology/<i>.txt``.

Reconstructed from a garbled Python 2 paste and ported to Python 3
(``sgmllib`` was removed in Python 3; ``html.parser.HTMLParser`` is the
stdlib replacement).
"""
__author__ = 'Minmin'

import os
import re
import urllib.request
from html.parser import HTMLParser


def gethtml(url):
    """Fetch *url* and return its body as text.

    The response is decoded as UTF-8 with replacement characters for any
    invalid bytes, and the connection is always closed.
    """
    page = urllib.request.urlopen(url)
    try:
        return page.read().decode('utf-8', errors='replace')
    finally:
        page.close()


def extract_article(html):
    """Return the cleaned article text found in *html*.

    Captures the text of every ``<p>…</p>`` whose content holds no nested
    angle brackets, strips a small set of markup tokens and quote characters,
    and joins the surviving paragraphs with newlines (trailing newline after
    each paragraph, as in the original script).
    """
    paragraphs = re.findall(r"<p.*?>([^<>]*)</p>", html, re.M)
    artical = ''
    for j in paragraphs:
        if len(j) != 0:
            # NOTE(review): the "&nbsp;" token was garbled in the original
            # source (it replaced a literal space with a space) — presumably
            # it was meant to strip non-breaking-space entities; confirm.
            for token in ("<strong>", "</strong>", "<br>", "&nbsp;",
                          "“", "”", "·"):
                j = j.replace(token, " ")
            artical = artical + j + '\n'
    return artical


def func(url):
    """Fetch *url* and return its cleaned article text.

    Original entry point; the original body ignored its parameter and read a
    global ``url`` instead — fixed to use the argument.
    """
    return extract_article(gethtml(url))


class URLPaser(HTMLParser):
    """HTML parser that collects the ``href`` value of every ``<a>`` tag.

    After ``feed(...)``, the collected links are available in ``self.urls``.
    (Name kept from the original script.)
    """

    def reset(self):
        HTMLParser.reset(self)
        self.urls = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            href = [v for k, v in attrs if k == 'href']
            if href:
                self.urls.extend(href)


def main():
    """Crawl the section index and save every matching article to disk."""
    iparser = URLPaser()
    socket = urllib.request.urlopen("http://economy.china.com/internet/")
    try:
        iparser.feed(socket.read().decode('utf-8', errors='replace'))
    finally:
        socket.close()

    # Only follow links that stay inside the technology/internet section.
    pattern = re.compile(r'http://economy\.china\.com/internet/.*')

    if not os.path.exists('china_news_technology'):
        os.makedirs('china_news_technology')

    i = 0
    seen = []
    for url in iparser.urls:
        url = "http://economy.china.com" + url
        if pattern.match(url) and url not in seen:
            seen.append(url)
            # Articles may be paginated; the "_all" variant shows the
            # complete text on one page.
            url = url.replace(".html", "_all.html#page_2")
            artical = func(url)
            if len(artical) != 0:
                print(artical)
                print(url)
                i = i + 1
                with open("china_news_technology/" + str(i) + '.txt',
                          'a+') as f:
                    f.write(artical)


if __name__ == '__main__':
    main()