Resources:
Python basics: http://www.runoob.com/python/python-intro.html
Python Crawler series Tutorial: http://www.cnblogs.com/xin-xin/p/4297852.html
Regular expression: http://www.cnblogs.com/deerchao/archive/2006/08/24/zhengzhe30fengzhongjiaocheng.html
Goals of this crawler:
1. Crawl any post from a Baidu Tieba (post bar)
2. Let the user specify whether to crawl only the original poster's content
3. Parse the crawled content and save it to a file
4. Download the images that appear in the post
#-*-coding:utf-8-*-"""Created on Fri Apr 11:47:02 2016@author:wuhan"""ImportUrllibImportUrllib2ImportReImport TimeImportOS#Reload (SYS)#sys.setdefaultencoding ("Utf-8")classtool:removeimg= Re.compile ('| {A}') removeaddr= Re.compile ('<a.*?>|</a>') ReplaceLine= Re.compile ('<tr>|<div>|</div>|</p>') replacetd= Re.compile ('<td>') Replacepara= Re.compile ('<p.*?>') Replacebr= Re.compile ('<br><br>|<br>') Removeextratag= Re.compile ('<.*?>') defreplace (self,x): x= Re.sub (self.removeimg,"", x) x= Re.sub (SELF.REMOVEADDR,"", x) x= Re.sub (Self.replaceline,"\ n", x) x= Re.sub (Self.replacebr,"\ n", x) x= Re.sub (Self.replacepara,"\ n", x) x= Re.sub (self.replacetd,"\ t", x) x= Re.sub (Self.removeextratag,"", X)returnX.strip ()classBDTB:def __init__(self, baseUrl, Seelz, Floortag): Self.baseurl=BASEURL Self.seelz='? see_lz='+Str (Seelz) Self.tool=Tool () self.file=None Self.floor= 1Self.defaulttitle= u'Baidu Paste'Self.floortag=Floortagdefgetpage (Self, pagenum):Try: URL= Self.baseurl + Self.seelz +'&pn='+Str (pagenum) Request=Urllib2. Request (URL) Response=Urllib2.urlopen (Request)returnResponse.read (). Decode ('Utf-8') exceptUrllib2. Urlerror, E:ifHasattr (E,"reason"): PrintU'Baidu Post link failure, the cause of the error:', E.reasonreturnNonedefGetTitle (Self, page): Pattern= Re.compile ('', Re. S) Result=re.search (Pattern, page)ifResult:returnResult.group (1). Strip ()Else: returnNonedefgetpagenum (Self, page): Pattern= Re.compile ('<li class= "l_reply_num.*?</span>.*?<span.*?> (. *?) </span>', Re. S) Result=re.search (Pattern, page)ifResult:returnResult.group (1). Strip ()Else: returnNonedefgetcontents (self,page): Pattern= Re.compile ('<div id= "post_content.*?> (. *?) </div>', Re. S) Items=Re.findall (Pattern, page) Contents= [] forIteminchitems:content="\ n"+ self.tool.replace (item) +"\ n"contents.append (Content.encode ('Utf-8')) returnContentsdefSetfiletitle (self, title):ifTitle is notNone:self.file= Open (title +". 
txt","w+") Else: Self.file= Open (Self.defaulttitle +". txt","w+") defWriteData (self, contents): forIteminchContents:ifSelf.floortag = ='1': Floorline="\ n"+ STR (self.floor) + u"----------------------------------------------------------------------------------------------------------- ------------------------------\ n"self.file.write (floorline) self.file.write (item) Self.floor+ = 1defStart (self): Indexpage= Self.getpage (1) Pagenum=Self.getpagenum (indexpage) title=self.gettitle (indexpage) self.setfiletitle (title)ifPagenum = =None:Print "URL is invalid, please try again" return Try: Print "the posts"+ STR (pagenum) +"page" forIinchRange (1, int (pagenum) +1): Print "is writing the article"+ STR (i) +"page Data"page=self.getpage (i) Contents=self.getcontents (page) self.writedata (contents) self.getpicture (page, i)exceptIOError, E:Print "Write exception, Reason"+E.messagefinally: Print "Write Task Completion" defgetpicture (self, page, pagenum): Reg= R''Imgre= Re.compile (reg)#The regular expression can be compiled into a regular expression objectImglist = Re.findall (imgre,page)#reading data in HTML that contains Imgre (regular expressions)t =Time.localtime (Time.time ()) FolderName= str (t.__getattribute__("Tm_year"))+"-"+str (T.__getattribute__("Tm_mon"))+"-"+str (T.__getattribute__("Tm_mday")) Picpath='e:\\python\\imagedownload\\%s'% (foldername)#local directory to which to download if notOs.path.exists (Picpath):#creates a path when it does not existos.makedirs (picpath) x=0 forImgurlinchImglist:target= picpath+'\\%s_%s.jpg'%(Pagenum, x) Urllib.urlretrieve (Imgurl, target)#download Remote data directly to a localX+=1PrintU"Please enter the post code"BaseURL='http://tieba.baidu.com/p/'+ STR (raw_input (U'http://tieba.baidu.com/p/')) Seelz= Raw_input ("whether only to get the landlord to speak, is input 1, no input 0\n". Decode ('Utf-8'). Encode ('GBK')) Floortag= Raw_input ("whether to write floor information, is input 1, no input 0\n". Decode ('Utf-8'). 
Encode ('GBK')) Bdtb=BDTB (Baseurl,seelz,floortag) Bdtb.start ()
Introduction to Web Crawlers — Case 1: Crawling a Baidu Tieba Post