Downloading the content of Qiushibaike (the "embarrassing stories" encyclopedia) with Python

Downloading the content of Qiushibaike (the "embarrassing stories" encyclopedia) with Python: a Python example

Last Update:2017-01-18 Source: Internet

Author: User

Tags: commit, sleep, sqlite

Developer on Alibaba Cloud: Build your first app with APIs, SDKs, and tutorials on Alibaba Cloud. Read more >

Copy the code. The code is as follows:

#coding: Utf-8

import contextlib
import sqlite3
import threading
import time
import urllib.request
import xml.dom.minidom

class Logger(object):
    """Minimal console logger shared by the whole crawler."""

    def log(self, *msg):
        """Print each positional argument on its own line.

        Args:
            *msg: Values to report; every one is handed to ``print``
                separately, so a call with three arguments emits three lines.
        """
        for item in msg:
            print(item)


# Module-wide logger instance used by every class and function in this file.
Log = Logger()
Log.log('Test')

class Downloader(object):
    """Fetch the raw body of a single URL."""

    def __init__(self, url):
        """Remember the address to fetch.

        Args:
            url: Absolute URL passed unchanged to ``urllib.request.urlopen``.
        """
        self.url = url

    def download(self):
        """Download ``self.url`` and return its body.

        Returns:
            The response body as ``bytes``, or ``None`` when the request
            failed for any reason (the failure is logged, not raised).
        """
        Log.log('Start download', self.url)
        try:
            # The context manager closes the HTTP response even if read() fails.
            with urllib.request.urlopen(self.url) as response:
                content = response.read()
        except Exception as err:
            # Best-effort download: report and carry on. ``Exception`` (rather
            # than a bare ``except``) lets Ctrl-C and SystemExit propagate.
            Log.log('download error', err)
            return None
        Log.log('Download complete')
        return content

class Parser(object):
    """Extract the post text and the neighbouring post ids from one page."""

    def __init__(self, content):
        """Parse *content* into a DOM tree.

        Args:
            content: Page markup (``str`` or ``bytes``) handed to
                ``xml.dom.minidom.parseString``; it must be well-formed XML,
                otherwise that call raises.
        """
        # Root of the parsed document.
        self.html = xml.dom.minidom.parseString(content)

    def parse(self):
        """Collect the data of interest from the parsed page.

        Returns:
            A dict with two keys:

            * ``'content'`` -- data of the first child node of the last
              ``<div class="content">`` found, or ``''`` if there is none.
            * ``'url'`` -- list of ids cut out of the ``href`` of every
              ``<a>`` element that directly contains a ``<span>``.
        """
        Log.log('Start extracting data')
        contents = {'content': '', 'url': []}

        # The post text lives in <div class="content">.
        for div in self.html.getElementsByTagName('div'):
            # getAttribute() yields '' for a missing attribute, so no separate
            # hasAttribute() check is required.
            if div.getAttribute('class') != 'content':
                continue
            # An empty div (firstChild is None) or one that starts with an
            # element (no ``data``) is skipped instead of raising.
            text = getattr(div.firstChild, 'data', None)
            if text is not None:
                contents['content'] = text

        # Links to the previous / next post are <a> elements wrapping a <span>.
        for span in self.html.getElementsByTagName('span'):
            parent = span.parentNode
            # A root-level span has the Document as parent, which has no
            # tagName; XML tag names are case-sensitive, hence lower-case 'a'.
            if getattr(parent, 'tagName', None) != 'a':
                continue
            href = parent.getAttribute('href')
            # Drop the first 10 and the last 4 characters of the href; this
            # presumably strips a '/articles/' prefix and '.htm' suffix.
            # TODO confirm against the live page markup.
            contents['url'].append(href[10:-4])

        Log.log('Extract data Complete')
        return contents

def downloadpage(qid, db):
    """Download one post, store its text and queue the posts it links to.

    Args:
        qid: Id of the post; it is interpolated into the article URL.
        db: Storage object providing ``updatecontent(qid, content)``,
            ``addqid(qid)`` and ``updatestatus(qid, flag)``.

    Nothing is written when the download fails or returns an empty body.
    """
    url = 'http://www.qiushibaike.com/articles/' + str(qid) + '.htm'
    content = Downloader(url).download()
    if not content:
        return

    contents = Parser(content).parse()
    if contents['content']:
        db.updatecontent(qid, contents['content'])
    for linked_qid in contents['url']:
        db.addqid(linked_qid)
    # Status 2 is written only when exactly two neighbouring ids were found.
    if len(contents['url']) == 2:
        db.updatestatus(qid, 2)

# Download pool: bounds how many pages may be downloading at the same time.
class Downloaderpool(object):
    """Run ``downloadpage`` for queued ids on a bounded set of threads.

    ``downloaders`` holds one slot per allowed concurrent download; a slot is
    ``None`` when free and a ``threading.Thread`` otherwise.
    """

    def __init__(self, maxlength=15):
        """Create an empty pool.

        Args:
            maxlength: Number of slots, i.e. the maximum number of downloads
                started per call to ``daemon``.
        """
        self.downloaders = [None] * maxlength
        self.downloadlist = []
        self.db = None

    def setdownloadlist(self, downloadlist):
        """Merge *downloadlist* into the pending ids, dropping duplicates."""
        self.downloadlist = list(set(self.downloadlist + downloadlist))

    def setdb(self, db):
        """Set the storage object that is passed on to ``downloadpage``."""
        self.db = db

    def daemon(self):
        """Run one scheduling pass over the pool.

        Frees the slots of finished threads, starts a download thread in
        every free slot while ids are pending, waits for the threads started
        in this pass, then sleeps for one second.
        """
        Log.log('Set daemon')
        # Release the slots whose thread has already finished.
        for index, downloader in enumerate(self.downloaders):
            if downloader is not None and not downloader.is_alive():
                Log.log('Put the downloader on Empty', index)
                self.downloaders[index] = None

        # Fill every free slot with a new download thread.
        started = []
        for index, downloader in enumerate(self.downloaders):
            if downloader is not None:
                continue
            qid = self.getqid()
            if not qid:
                continue
            worker = threading.Thread(target=downloadpage,
                                      args=(qid, self.db))
            self.downloaders[index] = worker
            worker.start()
            started.append(worker)
            Log.log('Setup Downloader', index)

        # Join only after the whole batch is running; joining right after
        # each start() would execute the downloads one at a time.
        for worker in started:
            worker.join()

        # Pause one second between passes.
        time.sleep(1)

    def getqid(self):
        """Remove and return the first pending id, or ``None`` if none is left."""
        if not self.downloadlist:
            return None
        return self.downloadlist.pop(0)

    def begindownload(self):
        """Run ``daemon`` once on a daemon thread and wait for it to finish."""
        supervisor = threading.Thread(target=self.daemon, daemon=True)
        supervisor.start()
        supervisor.join()

    def getdownloader(self):
        """Return the index of the first free slot, or ``None`` if all are taken."""
        for index, downloader in enumerate(self.downloaders):
            if not downloader:
                return index
        return None

# SQL statements used by Dbconnect; all values are bound through parameters.
ADD_Q_ID = 'insert into qiushibaike (id,success) values (?,?)'
UPDATE_Q_CONTENT = 'update qiushibaike set content=? where id=?'
UPDATE_Q_STATUS = 'update qiushibaike set success=? where id=?'
Q_LIST = 'select id from qiushibaike where success=?'
Q_LIST_BY_ID = 'select count(*) from qiushibaike where id=?'


class Dbconnect(object):
    """Small data-access layer over the ``qiushibaike`` SQLite table.

    No method here creates the table; it must already exist with this
    layout::

        CREATE TABLE qiushibaike (
            id      INTEGER,
            content VARCHAR,
            success INTEGER
        )

    * ``id``      -- id of the post.
    * ``content`` -- text of the post.
    * ``success`` -- download state: 1 means not completed, 2 means
      complete. A post is complete once its content has been downloaded and
      the ids of the previous and next posts have been obtained.

    Every method opens its own connection and closes it before returning.
    """

    def __init__(self, dbpath='Db.sqlite'):
        """Remember the database file.

        Args:
            dbpath: Path handed to ``sqlite3.connect`` on every operation.
        """
        self.dbpath = dbpath

    def _connection(self):
        """Return a context manager that yields a connection and closes it."""
        # A bare sqlite3 connection used in ``with`` only commits / rolls
        # back; ``closing`` is what actually releases the file handle.
        return contextlib.closing(sqlite3.connect(self.dbpath))

    def addqid(self, qid):
        """Register *qid* as a not-yet-downloaded post (``success`` = 1).

        An id that is already stored is left untouched, so a finished post
        is not queued again. Database errors are logged, not raised.

        Args:
            qid: Id of the post to register.
        """
        Log.log('Insert embarrassing encyclopedia', qid)
        with self._connection() as cn:
            try:
                (known,) = cn.execute(Q_LIST_BY_ID, (qid,)).fetchone()
                if known:
                    Log.log('Add ID skipped, already stored', qid)
                    return
                cn.execute(ADD_Q_ID, (qid, 1))
                cn.commit()
            except sqlite3.Error as err:
                Log.log('Add ID error', qid, err)
                return
        Log.log('Insert success')

    def updatecontent(self, qid, content):
        """Store *content* as the text of post *qid*.

        Args:
            qid: Id of the post to update.
            content: Text of the post.

        Raises:
            sqlite3.Error: If the statement or the commit fails.
        """
        Log.log('Update Encyclopedia', qid, content)
        with self._connection() as cn:
            cn.execute(UPDATE_Q_CONTENT, (content, qid))
            cn.commit()
        Log.log('Update successful')

    def updatestatus(self, qid, flag):
        """Set the ``success`` column of post *qid* to *flag*.

        Args:
            qid: Id of the post to update.
            flag: New state (1 = not completed, 2 = complete).

        Raises:
            sqlite3.Error: If the statement or the commit fails.
        """
        Log.log('Update status', qid, flag)
        with self._connection() as cn:
            cn.execute(UPDATE_Q_STATUS, (flag, qid))
            cn.commit()
        Log.log('Update status succeeded')

    def getlist(self, undonloaded=1):
        """Return the ids of all posts whose ``success`` equals *undonloaded*.

        Args:
            undonloaded: State to select; the default 1 selects posts that
                are not completed yet.

        Returns:
            A list with one id per matching row.

        Raises:
            sqlite3.Error: If the query fails.
        """
        Log.log('Get list')
        with self._connection() as cn:
            rows = cn.execute(Q_LIST, (undonloaded,)).fetchall()
        Log.log('Get list successful')
        return [row[0] for row in rows]

class Singledownloader(object):
    """Sequential downloader: processes the pending ids one after another.

    Offers the same ``setdb`` / ``setdownloadlist`` / ``begindownload``
    methods as ``Downloaderpool``.
    """

    def __init__(self):
        self.downloadlist = []
        # Set through setdb(); initialised here so the attribute always exists.
        self.db = None

    def setdb(self, db):
        """Set the storage object that is passed on to ``downloadpage``."""
        self.db = db

    def setdownloadlist(self, downloadlist):
        """Merge *downloadlist* into the pending ids, dropping duplicates."""
        self.downloadlist = list(set(self.downloadlist + downloadlist))

    def begindownload(self):
        """Download every pending id in turn and leave the queue empty."""
        # Detach the batch first: without this the ids would stay queued and
        # be downloaded again on every later call.
        pending, self.downloadlist = self.downloadlist, []
        for qid in pending:
            downloadpage(qid, self.db)

def main():
    """Keep downloading until no post with ``success`` = 1 is left."""
    db = Dbconnect('Db.sqlite')

    # Downloaderpool() can be used here instead; it has the same
    # setdb / setdownloadlist / begindownload methods.
    dp = Singledownloader()
    dp.setdb(db)

    undownloadedlist = db.getlist()
    # Continue for as long as there are posts that are not downloaded yet.
    while undownloadedlist:
        # Feed the current backlog to the downloader and process it.
        dp.setdownloadlist(undownloadedlist)
        dp.begindownload()

        time.sleep(1)
        # Re-read the backlog; downloads may have queued new ids.
        undownloadedlist = db.getlist()


if __name__ == '__main__':
    main()

The code works as it is, but two improvements are still wanted:
1. Multi-threaded downloading.
2. Better separation of the code, using a more object-oriented design.

This article is an English version of an article which is originally in the Chinese language on aliyun.com and is provided for information purposes only. This website makes no representation or warranty of any kind, either expressed or implied, as to the accuracy, completeness, ownership or reliability of the article or any translations thereof. If you have any concerns or complaints relating to the article, please send an email, providing a detailed description of the concern or complaint, to info-contact@alibabacloud.com. A staff member will contact you within 5 working days. Once verified, infringing content will be removed immediately.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support 6 Free Tickets per Quarter Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More