#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Crawl novels from quanshuwang.com and store them in a MySQL database.

For every category page (``/map/<n>.html``, n = 1..9) the script collects the
novels listed there, inserts one row per novel into the ``novel`` table, then
downloads each chapter page and inserts its text into the ``chapter`` table.

NOTE(review): the regular expressions and the User-Agent string below were
reconstructed from a whitespace/case-damaged copy of this file -- confirm
them against the live site markup before relying on them.
"""
import re
import socket
import urllib.request

domain = 'http://www.quanshuwang.com'

# Browser-like request headers sent with every page request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
}

# Default socket timeout (seconds) installed before crawling starts.
REQUEST_TIMEOUT = 30

# Category number (as used in the /map/<n>.html URL) -> label stored in the DB.
SORT_NAMES = {
    1: "Fantasy Magic",
    2: "martial Arts and real repair",
    3: "Historical Military",
    4: "Female Frequency Romance",
    5: "Detective Reasoning",
    6: "Online Animation",
    7: "Science Fiction",
    8: "Horror Supernatural",
    9: "American and Korean",
}

# Compiled once at import time instead of on every call.
TYPE_LIST_RE = re.compile(r'<a href="(/book/.+?)" target="_blank">(.+?)</a>')
NOVEL_LIST_RE = re.compile(r'<li><a href="(.+?)" title="(.+?)">(.+?)</a></li>')
CONTENT_RE = re.compile(
    r'style5\(\);</script>(.*?)<script type="text/javascript">style6\(\)',
    re.S,  # chapter text spans several lines
)


def _fetch(url, errors='strict'):
    """Download *url* with the shared headers and return it decoded as GBK.

    The response is closed as soon as the body has been read.
    """
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req) as res:
        return res.read().decode('gbk', errors)


def gettypelist(type):
    """Return ``(href, novel_name)`` tuples found on category page *type*.

    The parameter name shadows the ``type`` builtin; it is kept unchanged so
    existing keyword callers continue to work.
    """
    html = _fetch('%s/map/%s.html' % (domain, type))
    return TYPE_LIST_RE.findall(html)


def getnovellist(href):
    """Return ``(url, title_attribute, link_text)`` tuples for each chapter.

    *href* is the site-relative path of a novel's index page.
    """
    html = _fetch(domain + href)
    return NOVEL_LIST_RE.findall(html)


def getnovelcontent(url):
    """Return the chapter text found on the page at site-relative path *url*.

    Bytes that are not valid GBK are dropped while decoding.

    Raises:
        IndexError: if the page does not contain the expected content markers.
    """
    html = _fetch(domain + url, errors='ignore')
    print(domain + url)
    match = CONTENT_RE.search(html)
    if match is None:
        # Same exception type the original ``findall(...)[0]`` raised.
        raise IndexError('no chapter content found at %s%s' % (domain, url))
    return match.group(1)


class Sql(object):
    """Thin wrapper around the MySQL connection used to store crawl results.

    The connection is opened on first use rather than at import time, so the
    module can be imported without a reachable database.

    NOTE(review): ``port``, ``user`` and ``password`` were redacted
    placeholders in the original source; 3306 is the standard MySQL port and
    'x' is carried over as a placeholder -- supply the real values.
    """

    def __init__(self, host='localhost', port=3306, user='x', password='x',
                 db='novel', charset='utf8'):
        self._connect_kwargs = {
            'host': host,
            'port': port,
            'user': user,
            'password': password,
            'db': db,
            'charset': charset,
        }
        self._conn = None

    @property
    def conn(self):
        """Return the database connection, opening it on first access."""
        if self._conn is None:
            # Imported here so that merely importing this module does not
            # require the MySQL driver to be installed.
            import MySQLdb
            self._conn = MySQLdb.connect(**self._connect_kwargs)
        return self._conn

    def addnovels(self, sort, novelname):
        """Insert a novel row, commit, and return the new row id."""
        cur = self.conn.cursor()
        try:
            # Parameterized query: scraped titles may contain quotes.
            cur.execute(
                "INSERT INTO novel (sort, novelname) VALUES (%s, %s)",
                (sort, novelname),
            )
            lastrowid = cur.lastrowid
        finally:
            cur.close()
        self.conn.commit()
        return lastrowid

    def addchapters(self, novelid, chaptername, content):
        """Insert one chapter row for novel *novelid* and commit."""
        cur = self.conn.cursor()
        try:
            cur.execute(
                "INSERT INTO chapter (novelid, chaptername, content) "
                "VALUES (%s, %s, %s)",
                (novelid, chaptername, content),
            )
        finally:
            cur.close()
        self.conn.commit()


mysql = Sql()


def _crawl_category(type_id):
    """Crawl every novel and chapter listed under category *type_id*."""
    sort = SORT_NAMES.get(type_id)
    if sort is None:
        print("The requested novel type is wrong!!! ")
        return
    for href, novelname in gettypelist(type_id):
        lastrowid = mysql.addnovels(sort, novelname)
        # The link text (third group) is what gets stored as chapter name.
        for url, _title_attr, title in getnovellist(href):
            try:
                print("crawling------------%s '%s '%s"
                      % (sort, novelname, title))
                content = getnovelcontent(href.replace('index.html', url))
                mysql.addchapters(novelid=lastrowid, chaptername=title,
                                  content=content)
            except Exception as e:
                # Best effort: report the failed chapter and keep crawling.
                print("connection interrupted with error:%s!!!!" % e)


def main():
    """Crawl categories 1 through 9."""
    # Install the timeout before the first request is made.
    socket.setdefaulttimeout(REQUEST_TIMEOUT)
    for type_id in range(1, 10):
        _crawl_category(type_id)


if __name__ == '__main__':
    main()
# Crawling the novel content of a whole site - "Dog Hi-Silent record"