Crawl novels from a novel website and save them to a database.
Step one: fetch the novel content.
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Crawl novel listings and chapter text from www.quanshu.net (step 1: fetch only).

Reconstructed from a garbled Python 2 scrape and ported to Python 3
(urllib2 was removed; urllib.request is its replacement).
"""
import re
import urllib.request

domain = 'http://www.quanshu.net'
headers = {
    "User-Agent": ("Mozilla/5.0 (Windows NT 6.3; Win64; x64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/58.0.3029.110 Safari/537.36"),
}

# Patterns are compiled once at module level instead of on every call.
_TYPE_RE = re.compile(r'<a href="(/book/.*?)" target="_blank">(.*?)</a>')
_CHAPTER_RE = re.compile(r'<li><a href="(.*?)" title=".*?">(.*?)</a></li>')
# Chapter body sits between the style5()/style6() script markers; re.S lets
# `.` span newlines inside the chapter text.
_CONTENT_RE = re.compile(
    r'style5\(\);</script>(.*?)<script type="text/javascript">style6\(\)',
    re.S)


def _fetch(url):
    """Download *url* with the browser User-Agent and decode the GBK payload.

    errors='replace' keeps a single bad byte from aborting a whole crawl.
    """
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req) as res:  # closes the socket on exit
        return res.read().decode('gbk', errors='replace')


def gettypelist(pn=1):
    """Return [(book_url, book_title), ...] parsed from category page *pn*."""
    html = _fetch('http://www.quanshu.net/map/%s.html' % pn)
    return _TYPE_RE.findall(html)


def getnovellist(url):
    """Return [(chapter_url, chapter_title), ...] from a book index page *url*
    (a site-relative path such as '/book/.../index.html')."""
    return _CHAPTER_RE.findall(_fetch(domain + url))


def getnovelcontent(url):
    """Return the raw HTML body of the chapter page at site-relative *url*.

    Raises IndexError if the page does not contain the expected markers.
    """
    return _CONTENT_RE.findall(_fetch(domain + url))[0]


if __name__ == '__main__':
    # Demo: walk categories 1-9 but stop after the first chapter of the
    # first book (the `break`s), mirroring the original dry run.
    for type_id in range(1, 10):
        for url, title in gettypelist(type_id):
            for zurl, ztitle in getnovellist(url):
                print('Crawling ---- %s' % ztitle)
                content = getnovelcontent(url.replace('index.html', zurl))
                print(content)
                break
            break
Running the script produces the following results:
![Qq20170527144625.png](https://s3.51cto.com/wyfs02/M02/97/1E/wKiom1kpIdThRNdaAAIyGKHHJXo728.png)
Step two: Store to database
1. Design Database
1.1 New library: novel
![1.png](https://s4.51cto.com/wyfs02/M01/97/23/wKiom1kpOr7yfa9aAABoSmd-OnA036.png)
1.2 Design Table: Novel
![2.png](https://s2.51cto.com/wyfs02/M00/97/24/wKioL1kpOvXi6UDzAABUheeLxB0623.png)
1.3 Design Table: Chapter
![3-1.png](https://s3.51cto.com/wyfs02/M02/97/23/wKiom1kpO4CgKxdUAAB6tbFOxwU677.png)
and set the foreign key
![3-2.png](https://s3.51cto.com/wyfs02/M01/97/24/wKioL1kpO4CANrBSAABInMXtb-E068.png)
2. Writing scripts
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Crawl www.quanshu.net and store novels/chapters into a MySQL database (step 2).

Reconstructed from a garbled Python 2 scrape and ported to Python 3.
Fixes over the original:
  * SQL values are passed as parameters instead of %-formatted into the
    statement, so chapter text containing quotes cannot break the query
    (and SQL injection is impossible).
  * The MySQL connection is opened in __init__ rather than in the class
    body, so importing this module no longer connects to the database.
"""
import re
import urllib.request

import MySQLdb


class sql(object):
    """Thin wrapper around one MySQL connection for the `novel` schema."""

    def __init__(self):
        # Connect per instance; credentials/host are as in the original
        # article — replace with your own deployment values.
        self.conn = MySQLdb.connect(
            host='192.168.19.213', port=3306, user='root',
            passwd='Admin123', db='novel', charset='utf8')

    def addnovels(self, sort, novelname):
        """Insert one novel row and return its auto-increment id."""
        cur = self.conn.cursor()
        # Parameterized execute: the driver quotes/escapes the values.
        cur.execute(
            "INSERT INTO novel (sort, novelname) VALUES (%s, %s)",
            (sort, novelname))
        lastrowid = cur.lastrowid
        cur.close()
        self.conn.commit()
        return lastrowid

    def addchapters(self, novelid, chaptername, content):
        """Insert one chapter belonging to the novel with id *novelid*."""
        cur = self.conn.cursor()
        cur.execute(
            "INSERT INTO chapter (novelid, chaptername, content) "
            "VALUES (%s, %s, %s)",
            (novelid, chaptername, content))
        cur.close()
        self.conn.commit()


domain = 'http://www.quanshu.net'
headers = {
    "User-Agent": ("Mozilla/5.0 (Windows NT 6.3; Win64; x64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/58.0.3029.110 Safari/537.36"),
}

# Patterns are compiled once at module level instead of on every call.
_TYPE_RE = re.compile(r'<a href="(/book/.*?)" target="_blank">(.*?)</a>')
_CHAPTER_RE = re.compile(r'<li><a href="(.*?)" title=".*?">(.*?)</a></li>')
_CONTENT_RE = re.compile(
    r'style5\(\);</script>(.*?)<script type="text/javascript">style6\(\)',
    re.S)


def _fetch(url):
    """Download *url* with the browser User-Agent and decode the GBK payload."""
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req) as res:
        return res.read().decode('gbk', errors='replace')


def gettypelist(pn=1):
    """Return [(book_url, book_title), ...] parsed from category page *pn*."""
    html = _fetch('http://www.quanshu.net/map/%s.html' % pn)
    return _TYPE_RE.findall(html)


def getnovellist(url):
    """Return [(chapter_url, chapter_title), ...] from a book index page."""
    return _CHAPTER_RE.findall(_fetch(domain + url))


def getnovelcontent(url):
    """Return the raw HTML body of the chapter page at site-relative *url*."""
    return _CONTENT_RE.findall(_fetch(domain + url))[0]


if __name__ == '__main__':
    mysql = sql()  # constructed inside the guard so imports stay side-effect free
    for sort in range(1, 10):
        for url, title in gettypelist(sort):
            lastrowid = mysql.addnovels(sort, title)
            for zurl, ztitle in getnovellist(url):
                print('Crawling ---- %s' % ztitle)
                content = getnovelcontent(url.replace('index.html', zurl))
                print('Storing ---- %s' % ztitle)
                mysql.addchapters(lastrowid, ztitle, content)
3. Execute script
![5.1.png](https://s3.51cto.com/wyfs02/M02/97/24/wKioL1kpO_SwfYOdAAHAvlowP2k275.png)
4. View the database
![8.png](https://s4.51cto.com/wyfs02/M02/97/25/wKioL1kpRJ7hsGI_AABKvJGXbtY583.png)
![9.png](https://s4.51cto.com/wyfs02/M01/97/25/wKioL1kpRKvyQVNUAAEwW-UKPY8568.png)
You can see that the storage has succeeded.
Error:
_mysql_exceptions.OperationalError: (1364, "Field 'novelid' doesn't have a default value")
Solution: Execute SQL statement
SELECT @@GLOBAL.sql_mode;
SET @@GLOBAL.sql_mode="NO_ENGINE_SUBSTITUTION";
![7.png](https://s4.51cto.com/wyfs02/M00/97/24/wKioL1kpPCXjTHm5AABPJKISpZs573.png)
Error Reference: http://blog.sina.com.cn/s/blog_6d2b3e4901011j9w.html
This article is from the "M April Days" blog, please be sure to keep this source http://msiyuetian.blog.51cto.com/8637744/1931102
Python crawler: Crawling fiction and storing it in a database