# The code is as follows:
#coding: Utf-8
import contextlib
import sqlite3
import threading
import time
import urllib.request
import xml.dom.minidom
class Logger(object):
    """Minimal console logger shared by the components in this script."""

    def log(self, *msg):
        """Print every positional argument on its own line.

        Args:
            *msg: Values to report; each one is handed to ``print`` separately.
        """
        for item in msg:
            print(item)


# Module-level logger instance used by the rest of the file.
Log = Logger()
Log.log(' Test ')
class Downloader(object):
    """Fetch the raw body of a single URL."""

    def __init__(self, url):
        """Remember the address to fetch.

        Args:
            url: Absolute URL passed unchanged to ``urllib.request.urlopen``.
        """
        self.url = url

    def download(self):
        """Download ``self.url`` and return its body.

        Returns:
            The response body as ``bytes``, or ``None`` when the request
            fails for any reason (the failure is logged, not raised).
        """
        Log.log(' Start download ', self.url)
        try:
            # The context manager closes the response even if read() fails.
            with urllib.request.urlopen(self.url) as response:
                content = response.read()
        except Exception as exc:
            # Best-effort download: report the failure and let the caller
            # decide what to do. Unlike a bare ``except`` this still lets
            # KeyboardInterrupt / SystemExit propagate.
            Log.log(' download error ', exc)
            return None
        Log.log(' Download complete ')
        return content
class Parser(object):
    """Extract the post text and the linked post ids from one page.

    The page is parsed with ``xml.dom.minidom``, so it has to be well-formed
    XML; ``parseString`` raises ``xml.parsers.expat.ExpatError`` otherwise.
    """

    def __init__(self, content):
        """Parse *content* (markup as ``str`` or ``bytes``) into a DOM tree."""
        # Document node of the parsed page.
        self.html = xml.dom.minidom.parseString(content)

    def parse(self):
        """Collect the post text and the ids of the linked posts.

        Returns:
            dict: ``{'content': <text>, 'url': [<id>, ...]}``. ``'content'``
            is ``''`` when no ``<div class="content">`` with leading text
            exists; ``'url'`` is empty when no ``<a>`` wraps a ``<span>``.
        """
        Log.log(' Start extracting data ')
        contents = {
            'content': self._extract_content(),
            'url': self._extract_ids(),
        }
        Log.log(' Extract data Complete ')
        return contents

    def _extract_content(self):
        """Return the leading text of the last ``<div class="content">``."""
        text = ''
        for div in self.html.getElementsByTagName('div'):
            if div.getAttribute('class') != 'content':
                continue
            # The post body is expected to be the first child of the div.
            # Empty divs (no child) and element children (no ``data``) are
            # skipped instead of raising.
            data = getattr(div.firstChild, 'data', None)
            if data is not None:
                text = data
        return text

    def _extract_ids(self):
        """Return the post ids found in ``<a href=...><span>`` links."""
        ids = []
        for span in self.html.getElementsByTagName('span'):
            link = span.parentNode
            # The document node has no tagName; treat it as "not a link".
            if getattr(link, 'tagName', '').lower() != 'a':
                continue
            href = link.getAttribute('href')
            # Drops the first 10 and last 4 characters of the href.
            # NOTE(review): assumes hrefs look like '/articles/<id>.htm'
            # (10-char prefix, 4-char suffix) -- confirm against real pages.
            qid = href[10:-4]
            if qid:
                ids.append(qid)
        return ids
def downloadpage(qid, db):
    """Download one post page and record what was found in the database.

    Args:
        qid: Id of the post; it is interpolated into the article URL.
        db: Object providing ``updatecontent(qid, text)``, ``addqid(qid)``
            and ``updatestatus(qid, flag)``.

    Nothing is written when the download fails. Errors raised while
    parsing the page are not caught here.
    """
    url = 'http://www.qiushibaike.com/articles/' + str(qid) + '.htm'
    content = Downloader(url).download()
    if not content:
        return

    contents = Parser(content).parse()
    if contents['content']:
        db.updatecontent(qid, contents['content'])

    # Queue every linked post so that it gets downloaded later.
    linked_ids = contents['url']
    for linked_id in linked_ids:
        db.addqid(linked_id)

    # Status 2 is only set once exactly two linked ids were found.
    if len(linked_ids) == 2:
        db.updatestatus(qid, 2)
# Download pool: bounds the number of pages downloaded simultaneously.
class Downloaderpool(object):
    """Run ``downloadpage`` for queued ids on a bounded set of threads."""

    def __init__(self, maxlength=15):
        """Create an idle pool.

        Args:
            maxlength: Number of worker slots, i.e. the maximum number of
                downloads running at the same time.
        """
        # One entry per slot: ``None`` when free, else the worker Thread.
        self.downloaders = [None] * maxlength
        self.downloadlist = []
        self.db = None

    def setdownloadlist(self, downloadlist):
        """Merge *downloadlist* into the pending ids, dropping duplicates.

        The order of first appearance is kept.
        """
        self.downloadlist = list(
            dict.fromkeys([*self.downloadlist, *downloadlist]))

    def setdb(self, db):
        """Set the database object handed to every worker."""
        self.db = db

    def daemon(self):
        """Supervise the workers until every pending id has been processed.

        Once per second: free the slots of finished threads, then start a
        new worker in every free slot while ids are pending. Returns when
        no worker is running and nothing could be started.
        """
        Log.log(' Set daemon ')
        while True:
            # Release the slots whose thread has finished.
            for index, worker in enumerate(self.downloaders):
                if worker is not None and not worker.is_alive():
                    Log.log(' Put the downloader on Empty ', index)
                    self.downloaders[index] = None

            # Fill free slots with new workers.
            for index, worker in enumerate(self.downloaders):
                if worker is not None:
                    continue
                qid = self.getqid()
                if qid is None:
                    break  # nothing left to schedule
                thread = threading.Thread(
                    target=downloadpage, args=(qid, self.db))
                self.downloaders[index] = thread
                # Not joined here: joining right after start() would run
                # the downloads one at a time.
                thread.start()
                Log.log(' Setup Downloader ', index)

            # All slots free after the fill step means either the list is
            # exhausted or the pool has no slots at all: we are done.
            if all(worker is None for worker in self.downloaders):
                break

            # Poll once per second.
            time.sleep(1)

    def getqid(self):
        """Remove and return the next pending id, or ``None`` if empty."""
        if not self.downloadlist:
            return None
        return self.downloadlist.pop(0)

    def begindownload(self):
        """Run the supervisor in a daemon thread and wait for it to end."""
        supervisor = threading.Thread(target=self.daemon, daemon=True)
        supervisor.start()
        supervisor.join()

    def getdownloader(self):
        """Return the index of the first free slot, or ``None`` if full."""
        for index, worker in enumerate(self.downloaders):
            if not worker:
                return index
        return None
# SQL statements used by Dbconnect. The names keep the spelling of their
# definitions in this file.
add_q_id = ' INSERT INTO Qiushibaike (id,success) VALUES (?,?) '
update_q_content = ' Update qiushibaike set content=? where id=? '
Update_q_status = ' Update qiushibaike set success=? where id=? '
Q_list = ' Select id from qiushibaike where success=? '
# Not referenced by any code visible in this file.
q_list_by_id = ' SELECT count (*) from qiushibaike where id=? '


class Dbconnect(object):
    """SQLite access layer for the ``qiushibaike`` table.

    Expected table layout::

        CREATE TABLE qiushibaike (
            id      integer,
            content varchar,
            success integer
        )

    * ``id``      -- id of the post
    * ``content`` -- text of the post
    * ``success`` -- download state: ``1`` means not completed, ``2`` means
      complete (the text was downloaded and the ids of the previous and
      next page were obtained)

    Every method opens its own connection and closes it before returning,
    also when the statement raises.
    """

    def __init__(self, dbpath='Db.sqlite'):
        """Remember the database file path; no connection is opened yet."""
        self.dbpath = dbpath

    def addqid(self, qid):
        """Insert *qid* with status ``1`` (not completed).

        A failing insert is logged and otherwise ignored.
        """
        Log.log(' Insert embarrassing encyclopedia ', qid)
        with contextlib.closing(sqlite3.connect(self.dbpath)) as cn:
            try:
                cn.execute(add_q_id, (qid, 1))
                cn.commit()
            except sqlite3.Error as exc:
                Log.log(' Add ID error ', qid, exc)
                return
        # Only reported when the insert really went through.
        Log.log(' Insert success ')

    def updatecontent(self, qid, content):
        """Store *content* as the text of post *qid*."""
        Log.log(' Update Encyclopedia ', qid, content)
        with contextlib.closing(sqlite3.connect(self.dbpath)) as cn:
            cn.execute(update_q_content, (content, qid))
            cn.commit()
        Log.log(' Update successful ')

    def updatestatus(self, qid, flag):
        """Set the ``success`` column of post *qid* to *flag*."""
        Log.log(' Update status ', qid, flag)
        with contextlib.closing(sqlite3.connect(self.dbpath)) as cn:
            cn.execute(Update_q_status, (flag, qid))
            cn.commit()
        Log.log(' Update status succeeded ')

    def getlist(self, undonloaded=1):
        """Return the ids of all rows whose ``success`` equals *undonloaded*.

        Args:
            undonloaded: Status value to select; defaults to ``1``
                (not completed).

        Returns:
            list: The matching ids, in the order the query returns them.
        """
        Log.log(' Get list ')
        with contextlib.closing(sqlite3.connect(self.dbpath)) as cn:
            rows = cn.execute(Q_list, (undonloaded,)).fetchall()
        ids = [row[0] for row in rows]
        Log.log(' Get list successful ')
        return ids
class Singledownloader(object):
    """Download queued ids one after another in the calling thread."""

    def __init__(self):
        """Create a downloader with nothing queued and no database set."""
        self.downloadlist = []
        self.db = None

    def setdb(self, db):
        """Set the database object passed to ``downloadpage``."""
        self.db = db

    def setdownloadlist(self, downloadlist):
        """Merge *downloadlist* into the pending ids, dropping duplicates.

        The order of first appearance is kept.
        """
        self.downloadlist = list(
            dict.fromkeys([*self.downloadlist, *downloadlist]))

    def begindownload(self):
        """Download every pending id, leaving the queue empty."""
        # Take ownership of the queue first so that ids handled in this run
        # are not downloaded again by the next call.
        pending, self.downloadlist = self.downloadlist, []
        for qid in pending:
            downloadpage(qid, self.db)
def main():
    """Download posts until no row with status ``1`` is left in the database."""
    db = Dbconnect('Db.sqlite')

    # Sequential downloader. Downloaderpool exposes the same
    # setdb / setdownloadlist / begindownload methods and can be used here
    # instead for threaded downloads.
    dp = Singledownloader()
    dp.setdb(db)

    undownloadedlist = db.getlist()
    # Keep going while there are posts that have not been downloaded yet.
    while undownloadedlist:
        # Queue the outstanding ids and process them.
        dp.setdownloadlist(undownloadedlist)
        dp.begindownload()
        time.sleep(1)
        # Re-read the outstanding ids: downloads may have added new ones.
        undownloadedlist = db.getlist()


if __name__ == '__main__':
    main()
# The code above works, but two improvements are still wanted:
# 1. Multi-threaded downloading.
# 2. Better separation of concerns, with a more object-oriented design.