Downloading Qiushibaike ("Embarrassing Encyclopedia") content with Python

Source: Internet
Author: User
Tags: commit, sleep, sqlite
The code is as follows:

# coding: utf-8

import sqlite3
import threading
import time
import urllib.request
import xml.dom.minidom

class Logger(object):
    """Minimal console logger shared by the components of this script."""

    def log(self, *msg):
        """Print every positional argument on its own line.

        Args:
            *msg: Any number of objects; each one is handed to ``print``
                in the order given.
        """
        for item in msg:
            print(item)

# Module-wide logger instance; the call below prints a start-up marker.
log = Logger()
log.log('Test')

class Downloader(object):
    """Fetch the raw body of a single URL."""

    def __init__(self, url):
        """Remember the address to fetch.

        Args:
            url: Absolute URL later passed to ``urllib.request.urlopen``.
        """
        self.url = url

    def download(self):
        """Download ``self.url`` and return the response body.

        Returns:
            The body as ``bytes``, or ``None`` when the request fails.
        """
        log.log('Start download', self.url)
        try:
            # The context manager closes the connection even if read() fails.
            with urllib.request.urlopen(self.url) as response:
                content = response.read()
        except (OSError, ValueError) as err:
            # URLError/HTTPError and socket errors are OSError subclasses;
            # ValueError covers a malformed URL. Anything else (for example
            # KeyboardInterrupt) is deliberately allowed to propagate.
            log.log('download error', err)
            return None
        log.log('Download complete')
        return content


class Parser(object):
    """Extract the post text and the linked post ids from one article page."""

    def __init__(self, content):
        """Parse *content* into a DOM tree.

        Args:
            content: Markup as ``str`` or ``bytes``; it must be well-formed
                XML because it is parsed with ``xml.dom.minidom``.

        Raises:
            xml.parsers.expat.ExpatError: If the markup is not well-formed.
        """
        # Document (root) node of the page.
        self.html = xml.dom.minidom.parseString(content)

    def parse(self):
        """Collect the post body and the ids referenced by the page links.

        Returns:
            A dict with two keys: ``'content'`` holds the leading text of the
            last ``<div class="content">`` found (``''`` if there is none),
            and ``'url'`` is a list of id strings cut out of every
            ``<a href=...>`` that directly wraps a ``<span>``.
        """
        log.log('Start extracting data')
        contents = {'content': '', 'url': []}

        # The post text is the first child of <div class="content">.
        for div in self.html.getElementsByTagName('div'):
            # getAttribute returns '' for a missing attribute, so no
            # separate hasAttribute check is needed.
            if div.getAttribute('class') != 'content':
                continue
            # Guard against an empty div or a first child without text data
            # (e.g. a nested element), which previously raised.
            text = getattr(div.firstChild, 'data', None)
            if text is not None:
                contents['content'] = text

        # Previous/next post links: a <span> whose direct parent is an <a>.
        for span in self.html.getElementsByTagName('span'):
            link = span.parentNode
            # A Document parent has no tagName, hence the getattr default.
            if getattr(link, 'tagName', '').lower() != 'a':
                continue
            href = link.getAttribute('href')
            # Drop the first 10 and the last 4 characters; presumably hrefs
            # look like "/articles/<id>.htm" -- verify against the site.
            contents['url'].append(href[10:][:-4])

        log.log('Extract data Complete')
        return contents

def downloadpage(qid, db):
    """Download one post, store its text and queue the posts it links to.

    Args:
        qid: Id of the post; it is inserted into the article URL.
        db: Storage object providing ``updatecontent(qid, text)``,
            ``addqid(qid)`` and ``updatestatus(qid, flag)``.
    """
    url = 'http://www.qiushibaike.com/articles/' + str(qid) + '.htm'
    content = Downloader(url).download()
    if not content:
        # Download failed or the page was empty; leave the post pending.
        return

    contents = Parser(content).parse()
    if contents['content']:
        db.updatecontent(qid, contents['content'])
    for linked_qid in contents['url']:
        db.addqid(linked_qid)
    # Exactly two links (previous and next) mark the post as finished.
    # NOTE(review): the original indentation was lost; this assumes the
    # status update does not additionally require non-empty content.
    if len(contents['url']) == 2:
        db.updatestatus(qid, 2)

# Download pool: bounds how many pages are downloaded at the same time.
class Downloaderpool(object):
    """Fixed-size pool of download threads fed from a queue of post ids."""

    def __init__(self, maxlength=15):
        """Create an empty pool.

        Args:
            maxlength: Number of slots, i.e. the maximum number of download
                threads that may run at the same time.
        """
        self.downloaders = [None] * maxlength
        self.downloadlist = []
        self.db = None

    def setdownloadlist(self, downloadlist):
        """Merge *downloadlist* into the pending queue, dropping duplicates.

        First-seen order is kept so that scheduling is deterministic.
        """
        self.downloadlist = list(
            dict.fromkeys(self.downloadlist + list(downloadlist)))

    def setdb(self, db):
        """Set the storage object handed to every download thread."""
        self.db = db

    def daemon(self):
        """Run one scheduling pass over the pool.

        Finished threads are removed from their slots, every free slot is
        filled with a new thread while ids are pending, and the pass then
        waits for the threads it started before pausing for one second.
        """
        log.log('Set daemon')
        # Release the slots whose thread has finished.
        for index, downloader in enumerate(self.downloaders):
            if downloader is not None and not downloader.is_alive():
                log.log('Put the downloader on Empty', index)
                self.downloaders[index] = None

        # Fill every free slot with a new download thread.
        started = []
        for index, downloader in enumerate(self.downloaders):
            if downloader is not None:
                continue
            qid = self.getqid()
            if qid is None:
                # Queue exhausted; the remaining slots stay free.
                break
            t = threading.Thread(target=downloadpage, args=(qid, self.db))
            self.downloaders[index] = t
            t.start()
            started.append(t)
            log.log('Setup Downloader', index)

        # Join only after all threads are running: joining right after each
        # start() would serialise the downloads and defeat the pool.
        for t in started:
            t.join()
        # Pause one second between passes.
        time.sleep(1)

    def getqid(self):
        """Pop and return the next pending id, or ``None`` if none is left."""
        if not self.downloadlist:
            return None
        return self.downloadlist.pop(0)

    def begindownload(self):
        """Run one scheduling pass on a daemon thread and wait for it."""
        scheduler = threading.Thread(target=self.daemon)
        # Attribute form; Thread.setDaemon() is deprecated.
        scheduler.daemon = True
        scheduler.start()
        scheduler.join()

    def getdownloader(self):
        """Return the index of the first free slot, or ``None`` if all are busy."""
        for index, downloader in enumerate(self.downloaders):
            if not downloader:
                return index
        return None


# SQL statements for the qiushibaike table; every value is bound through
# "?" placeholders, never formatted into the statement text.
add_q_id = 'INSERT INTO Qiushibaike (id,success) VALUES (?,?)'
update_q_content = 'Update qiushibaike set content=? where id=?'
update_q_status = 'Update qiushibaike set success=? where id=?'
q_list = 'Select id from qiushibaike where success=?'
q_list_by_id = 'SELECT count(*) from qiushibaike where id=?'
class Dbconnect(object):
    """Small SQLite access layer for the ``qiushibaike`` table.

    Expected table layout::

        CREATE TABLE qiushibaike (
            id      integer,
            content varchar,
            success integer
        )

    ``id`` is the id of a post and ``content`` is its text. ``success``
    records whether the download is finished: a post counts as complete once
    its content has been downloaded and the ids of the previous and next
    page have been obtained.

        1 means not completed
        2 means complete

    Every method opens its own connection and closes it before returning.
    """

    def __init__(self, dbpath='Db.sqlite'):
        """Remember the database file to use.

        Args:
            dbpath: Path handed to ``sqlite3.connect`` on every call.
        """
        self.dbpath = dbpath

    def _execute(self, sql, params, fetch=False):
        """Run one statement on a fresh connection and always close it.

        Args:
            sql: Statement text with ``?`` placeholders.
            params: Tuple of values bound to the placeholders.
            fetch: When true, return all rows instead of committing.

        Returns:
            The fetched rows when *fetch* is true, otherwise ``None``.

        Raises:
            sqlite3.Error: If connecting or executing fails.
        """
        cn = sqlite3.connect(self.dbpath)
        try:
            c = cn.cursor()
            try:
                c.execute(sql, params)
                if fetch:
                    return c.fetchall()
                cn.commit()
                return None
            finally:
                c.close()
        finally:
            # Runs on errors too, so a failing statement no longer leaks
            # the connection.
            cn.close()

    def addqid(self, qid):
        """Insert *qid* as a not-yet-downloaded post (``success`` = 1).

        A failing insert is logged and otherwise ignored (best effort).
        """
        log.log('Insert embarrassing encyclopedia', qid)
        # Connecting stays outside the try block: only statement and commit
        # failures are treated as a tolerable "could not add" condition.
        cn = sqlite3.connect(self.dbpath)
        try:
            c = cn.cursor()
            try:
                c.execute(add_q_id, (qid, 1))
                cn.commit()
            except sqlite3.Error as err:
                log.log('Add ID error', qid, err)
            else:
                # Success is reported only when the insert really happened.
                log.log('Insert success')
            finally:
                c.close()
        finally:
            cn.close()

    def updatecontent(self, qid, content):
        """Store *content* as the text of post *qid*."""
        log.log('Update Encyclopedia', qid, content)
        self._execute(update_q_content, (content, qid))
        log.log('Update successful')

    def updatestatus(self, qid, flag):
        """Set the ``success`` column of post *qid* to *flag*."""
        log.log('Update status', qid, flag)
        self._execute(update_q_status, (flag, qid))
        log.log('Update status succeeded')

    def getlist(self, undonloaded=1):
        """Return the ids of all posts whose ``success`` equals *undonloaded*.

        Args:
            undonloaded: Status value to select; the default 1 selects the
                posts that are not completed yet.

        Returns:
            A list holding the first column of every matching row.
        """
        log.log('Get list')
        rows = self._execute(q_list, (undonloaded,), fetch=True)
        ids = [row[0] for row in rows]
        log.log('Get list successful')
        return ids

class Singledownloader(object):
    """Sequential downloader: fetches the queued posts one after another."""

    def __init__(self):
        """Create a downloader with an empty queue and no storage object."""
        self.downloadlist = []
        # Set through setdb(); initialised here so the attribute always exists.
        self.db = None

    def setdb(self, db):
        """Set the storage object handed to every download."""
        self.db = db

    def setdownloadlist(self, downloadlist):
        """Merge *downloadlist* into the pending queue, dropping duplicates.

        First-seen order is kept so that downloads run in a stable order.
        """
        self.downloadlist = list(
            dict.fromkeys(self.downloadlist + list(downloadlist)))

    def begindownload(self):
        """Download every queued post in order, emptying the queue.

        Ids are removed as they are processed; leaving them in the queue
        made every later call download the already finished posts again.
        """
        while self.downloadlist:
            qid = self.downloadlist.pop(0)
            downloadpage(qid, self.db)

def main():
    """Download every pending post until the database lists none.

    The ids with status 1 are read from the database, handed to the
    downloader, and re-read after each pass, because a downloaded page can
    add further ids.
    """
    db = Dbconnect('Db.sqlite')
    # Downloaderpool offers the same setdb/setdownloadlist/begindownload
    # methods, so the threaded variant can be swapped in here:
    #     dp = Downloaderpool()
    dp = Singledownloader()
    dp.setdb(db)

    undownloadedlist = db.getlist()
    # Keep going while there are posts that have not been downloaded.
    while undownloadedlist:
        # Feed the pending ids to the downloader.
        dp.setdownloadlist(undownloadedlist)
        dp.begindownload()

        time.sleep(1)
        # Refresh the pending list for the next pass.
        undownloadedlist = db.getlist()

# Run the downloader only when the file is executed as a script.
if __name__ == '__main__':
    main()


The code works, but two improvements are still wanted:
1. Multi-threaded downloading.
2. Better separation of concerns, with a more object-oriented design.

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please write us an email, and we will handle the problem within 5 days after receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.