Downloading the content of Qiushibaike (the "Embarrassing Things Encyclopedia") with Python

Source: Internet
Author: User
The code is as follows:


# coding: utf-8

import sqlite3
import threading
import time
import urllib.request
import xml.dom.minidom

class Logger(object):
    """Minimal console logger used by the rest of this script."""

    def log(self, *msg):
        """Print every positional argument on its own line.

        Args:
            *msg: Values to print; each one is passed to ``print`` separately.
        """
        for item in msg:
            print(item)

# Shared logger instance; every class and function in this file logs through it.
log = Logger()
# Import-time smoke test kept from the original script.
log.log(' under test ')

class Downloader(object):
    """Fetch the raw body of a single URL."""

    def __init__(self, url):
        """Remember the URL to fetch.

        Args:
            url: URL handed to ``urllib.request.urlopen`` by ``download``.
        """
        self.url = url

    def download(self):
        """Download ``self.url`` and return the response body.

        Returns:
            The body returned by ``read()`` on the response, or ``None`` when
            the request failed (the failure is logged, not raised).
        """
        log.log(' Start download ', self.url)
        try:
            # The context manager closes the response even if read() fails.
            with urllib.request.urlopen(self.url) as response:
                content = response.read()
        except Exception:
            # Best effort by design: any failure is reported as "no content".
            # Unlike a bare ``except``, this lets KeyboardInterrupt and
            # SystemExit propagate.
            log.log(' download error ')
            return None
        log.log(' Download complete ')
        return content


class Parser(object):
    """Extract the post text and the linked post ids from a downloaded page."""

    def __init__(self, content):
        """Parse the page into a DOM tree.

        Args:
            content: Page markup. It is parsed with
                ``xml.dom.minidom.parseString``, so markup that is not
                well-formed XML makes the constructor raise.
        """
        # Keep the document root; both extraction helpers query it.
        self.html = xml.dom.minidom.parseString(content)

    def parse(self):
        """Collect the post text and the ids referenced by the page.

        Returns:
            A dict with two keys: ``'content'`` (the post text, ``''`` when
            none was found) and ``'url'`` (a list of id strings, one per
            ``<span>`` that sits directly inside an ``<a>``).
        """
        log.log(' Start extracting data ')
        contents = {
            'content': self._extract_content(),
            'url': self._extract_qids(),
        }
        log.log(' Extract data Complete ')
        return contents

    def _extract_content(self):
        """Return the text of the last ``<div class="content">``, or ``''``."""
        text = ''
        for div in self.html.getElementsByTagName('div'):
            # getAttribute returns '' for a missing attribute, so this also
            # covers divs that have no class at all.
            if div.getAttribute('class') != 'content':
                continue
            # Only the first child is inspected. An empty div, or one whose
            # first child is an element, has no ``data`` and is skipped
            # instead of raising.
            data = getattr(div.firstChild, 'data', None)
            if data is not None:
                text = data
        return text

    def _extract_qids(self):
        """Return the ids taken from every ``<a>`` that directly wraps a ``<span>``."""
        qids = []
        for span in self.html.getElementsByTagName('span'):
            link = span.parentNode
            # Non-element parents (e.g. the document) have no tagName.
            if getattr(link, 'tagName', None) != 'a':
                continue
            href = link.getAttribute('href')
            # Drop the first 10 and the last 4 characters. Presumably hrefs
            # look like "/articles/<id>.htm" -- verify against the live site.
            qids.append(href[10:-4])
        return qids

def downloadpage(qid, db):
    """Download one post page and record what was found in the database.

    Args:
        qid: Id of the post; it is interpolated into the article URL.
        db: Storage object providing ``updatecontent``, ``addqid`` and
            ``updatestatus`` (a ``Dbconnect`` in this file).

    Any exception raised while parsing the page propagates to the caller.
    """
    url = 'http://www.qiushibaike.com/articles/' + str(qid) + '.htm'
    content = Downloader(url).download()
    if not content:
        # The download failed or returned nothing; the post's status is left
        # unchanged.
        return

    contents = Parser(content).parse()
    if contents['content']:
        db.updatecontent(qid, contents['content'])
    # Register every linked post so it gets downloaded later.
    for linked_qid in contents['url']:
        db.addqid(linked_qid)
    # Exactly two links (previous and next post) marks this post as done
    # (status 2).
    if len(contents['url']) == 2:
        db.updatestatus(qid, 2)

# Download pool: bounds how many pages may be downloaded at the same time.
class Downloaderpool(object):
    """Fixed-size pool of worker threads that download queued post ids."""

    def __init__(self, maxlength=15):
        """Create an empty pool.

        Args:
            maxlength: Number of worker slots, i.e. the maximum number of
                downloads started in one ``daemon`` pass.
        """
        # One slot per worker; ``None`` marks a free slot.
        self.downloaders = [None] * maxlength
        self.downloadlist = []
        self.db = None

    def setdownloadlist(self, downloadlist):
        """Merge ``downloadlist`` into the queue, dropping duplicate ids.

        Args:
            downloadlist: List of post ids to enqueue.
        """
        # dict.fromkeys de-duplicates while keeping first-seen order.
        self.downloadlist = list(dict.fromkeys(self.downloadlist + downloadlist))

    def setdb(self, db):
        """Set the storage object handed to every download worker."""
        self.db = db

    def daemon(self):
        """Run one scheduling pass over the worker slots.

        Finished workers are released, every free slot is given the next
        queued id, and the pass returns once the workers it started have
        finished and one further second has elapsed.
        """
        log.log(' Set daemon ')
        # Release the slots whose worker thread is no longer running.
        for index, downloader in enumerate(self.downloaders):
            if downloader and not downloader.is_alive():
                log.log(' Empty The Downloader ', index)
                self.downloaders[index] = None

        # Fill the free slots with new workers.
        started = []
        for index, downloader in enumerate(self.downloaders):
            if downloader:
                continue
            qid = self.getqid()
            if not qid:
                continue
            worker = threading.Thread(target=downloadpage, args=(qid, self.db))
            self.downloaders[index] = worker
            worker.start()
            started.append(worker)
            log.log(' Set Downloader ', index)

        # Join only after every worker has been started. Joining right after
        # start() ran the downloads strictly one at a time and defeated the
        # purpose of the pool.
        for worker in started:
            worker.join()
        # Pause one second before the next pass.
        time.sleep(1)

    def getqid(self):
        """Remove and return the first queued id, or ``None`` if the queue is empty."""
        if not self.downloadlist:
            return None
        return self.downloadlist.pop(0)

    def begindownload(self):
        """Run one ``daemon`` pass in a daemon thread and wait for it to finish."""
        scheduler = threading.Thread(target=self.daemon)
        scheduler.daemon = True
        scheduler.start()
        scheduler.join()

    def getdownloader(self):
        """Return the index of the first free worker slot, or ``None`` if all are taken."""
        for index, downloader in enumerate(self.downloaders):
            if not downloader:
                return index
        return None


# SQL statements used by Dbconnect; the "?" placeholders are bound by sqlite3.
add_q_id = ' INSERT INTO Qiushibaike (id,success) VALUES (?,?) '
update_q_content = ' Update qiushibaike set content=? where id=? '
Update_q_status = ' Update qiushibaike set success=? where id=? '
Q_list = ' Select id from qiushibaike where success=? '
# NOTE(review): not referenced by any code visible in this file.
q_list_by_id = ' SELECT count (*) from qiushibaike where id=? '


class Dbconnect(object):
    """SQLite-backed store for the downloaded posts.

    Table layout sketched by the original author (this class does not create
    the table itself)::

        create table qiushibaike (
            id      integer,
            content varchar,
            success integer
        )

    * ``id``      -- id of the post
    * ``content`` -- text of the post
    * ``success`` -- download state: 1 means not completed, 2 means completed
      (the content was downloaded and the linked ids were obtained)

    Every method opens its own connection and closes it before returning.
    """

    def __init__(self, dbpath='Db.sqlite'):
        """Remember the database file to use.

        Args:
            dbpath: Path handed to ``sqlite3.connect`` by every method.
        """
        self.dbpath = dbpath

    def _run(self, sql, params, fetch=False):
        """Execute one statement on a fresh connection and always close it.

        Args:
            sql: Statement text with ``?`` placeholders.
            params: Sequence of values bound to the placeholders.
            fetch: When true, return all result rows instead of committing.

        Returns:
            The list of rows when ``fetch`` is true, otherwise ``None``.
        """
        cn = sqlite3.connect(self.dbpath)
        try:
            c = cn.cursor()
            try:
                c.execute(sql, params)
                if fetch:
                    return c.fetchall()
                cn.commit()
                return None
            finally:
                c.close()
        finally:
            # Runs even when execute() raises, so the connection cannot leak.
            cn.close()

    def addqid(self, qid):
        """Insert ``qid`` with status 1; a database error is logged, not raised.

        Args:
            qid: Id of the post to register.
        """
        log.log(' Insert embarrassing encyclopedia ', qid)
        try:
            self._run(add_q_id, (qid, 1))
        except sqlite3.Error:
            log.log(' Error adding ID ', qid)
        else:
            # Report success only when the insert actually went through.
            log.log(' Insert success ')

    def updatecontent(self, qid, content):
        """Store ``content`` as the text of post ``qid``.

        Args:
            qid: Id of the post to update.
            content: Text to store.
        """
        log.log(' Update embarrassing encyclopedia ', qid, content)
        self._run(update_q_content, (content, qid))
        log.log(' Update succeeded ')

    def updatestatus(self, qid, flag):
        """Set the ``success`` column of post ``qid`` to ``flag``.

        Args:
            qid: Id of the post to update.
            flag: New status value (1 not completed, 2 completed).
        """
        log.log(' Update status ', qid, flag)
        self._run(Update_q_status, (flag, qid))
        log.log(' Update status succeeded ')

    def getList(self, undonloaded=1):
        """Return the ids of all posts whose status equals ``undonloaded``.

        Args:
            undonloaded: Status value to select; the default 1 selects the
                posts that are not completed yet.

        Returns:
            A list holding the first column of every matching row.
        """
        log.log(' Get list ')
        rows = self._run(Q_list, (undonloaded,), fetch=True)
        ids = [row[0] for row in rows]
        log.log(' Get list success ')
        return ids

class Singledownloader(object):
    """Sequential downloader exposing the same methods as ``Downloaderpool``."""

    def __init__(self):
        """Create a downloader with an empty queue and no storage object."""
        self.downloadlist = []
        # Initialised here so the attribute exists before setdb() is called.
        self.db = None

    def setdb(self, db):
        """Set the storage object handed to ``downloadpage``."""
        self.db = db

    def setdownloadlist(self, downloadlist):
        """Merge ``downloadlist`` into the queue, dropping duplicate ids.

        Args:
            downloadlist: List of post ids to enqueue.
        """
        # dict.fromkeys de-duplicates while keeping first-seen order.
        self.downloadlist = list(dict.fromkeys(self.downloadlist + downloadlist))

    def begindownload(self):
        """Download every queued id in order, removing each from the queue.

        Draining the queue matters: ids left in it would be merged with the
        next batch and downloaded again on every later call.
        """
        while self.downloadlist:
            downloadpage(self.downloadlist.pop(0), self.db)

def main():
    """Download pending posts until the database reports none left."""
    db = Dbconnect('Db.sqlite')
    # Downloaderpool offers the same setdb / setdownloadlist / begindownload
    # methods and can be substituted here for threaded downloading.
    dp = Singledownloader()
    dp.setdb(db)

    undownloadedlist = db.getList()
    # Keep going while there are still posts that have not been downloaded.
    while undownloadedlist:
        # Feed the pending ids to the downloader and process them.
        dp.setdownloadlist(undownloadedlist)
        dp.begindownload()

        time.sleep(1)
        # Re-read the pending ids; downloading may have registered new posts.
        undownloadedlist = db.getList()

# Run the crawler only when this file is executed as a script.
if __name__ == '__main__':
    main()


The code works as written, but two improvements are still planned:
1. Multi-threaded downloading
2. Better separation of concerns through a more object-oriented design
  • Related Article

    Contact Us

    The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please write us an email, and we will handle the problem within 5 days of receiving it.

    If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

    A Free Trial That Lets You Build Big!

    Start building with 50+ products and up to 12 months usage for Elastic Compute Service

    • Sales Support

      1 on 1 presale consultation

    • After-Sales Support

      24/7 Technical Support 6 Free Tickets per Quarter Faster Response

    • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.