# -*- coding: utf-8 -*-
"""
Created on Thu June 09:37:48 2014

@author: lifeix

Print the article links (anchor tag plus title text) found on the paginated
article-list pages of a cnblogs.com blog.
"""

import re

try:  # Python 2 module names, as used by the original script
    import cookielib
    import urllib2
except ImportError:  # Python 3: the same APIs under their new module names
    import http.cookiejar as cookielib
    import urllib.request as urllib2

# Listing URL; the page number is appended to it for every request.
url = 'http://www.cnblogs.com/wendingding/tag/IOS%E5%BC%80%E5%8F%91/default.html?page='
# url = 'http://www.cnblogs.com/smileevday/category/578973.html?page='

# One article anchor: <a id="..." href="http://www.cnblogs.com/<user>/p/<id>.html">,
# optional surrounding whitespace, the title text, and the closing </a>.
reg = r'<a id="\w+" href="http://www.cnblogs.com/\w+/p/\w+.html">\s*\t*\n*\s*\t*\s*.*?\t*\n*\t*\s*</a>'

# A page yielding fewer matches than this is treated as the last page.
PAGE_SIZE = 20

# Compiled once at import time instead of on every loop iteration.
_LINK_RE = re.compile(reg, flags=re.MULTILINE)


def find_article_links(html):
    """Return every article anchor found in *html*.

    Args:
        html: Page markup as text.

    Returns:
        A list with the full matched ``<a ...>title</a>`` snippets, in
        document order; empty when nothing matches.
    """
    return _LINK_RE.findall(html)


def startparse(author, page=1):
    """Fetch listing pages starting at *page* and print each article link.

    Pages are requested one after another from the module-level ``url``
    until a page yields fewer than ``PAGE_SIZE`` links. A final line reports
    the number of the last page that was fetched.

    Note that this installs a cookie-aware opener globally via
    ``install_opener``, so later ``urlopen`` calls in the same process
    use it too.

    Args:
        author: Currently unused; the blog is selected by ``url``. Kept so
            existing callers keep working.
        page: Number of the first page to fetch.

    Raises:
        Whatever ``urlopen`` raises on network or HTTP errors; these are
        not caught here.
    """
    cj = cookielib.LWPCookieJar()
    cookie_support = urllib2.HTTPCookieProcessor(cj)
    opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
    urllib2.install_opener(opener)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) '
                      'Gecko/20100101 Firefox/14.0.1',
        'Referer': "http://www.cnblogs.com",
    }

    while True:
        req = urllib2.Request(url + str(page), headers=headers)
        resp = urllib2.urlopen(req)
        try:
            data = resp.read()
        finally:
            # Always release the connection, even if reading fails.
            resp.close()

        # On Python 3 the body is bytes and must be decoded before a text
        # pattern can be applied; on Python 2 it is already ``str`` and is
        # left untouched.
        if not isinstance(data, str):
            data = data.decode('utf-8', 'replace')

        result = find_article_links(data)
        for d in result:
            print(d)

        if len(result) < PAGE_SIZE:
            break
        page += 1

    print('Finished----------------------page:%d' % page)


if __name__ == '__main__':
    startparse('', 1)
Use Python to fetch the hyperlinks and titles from a cnblogs (Blog Park) author's article list