Download the content of the embarrassing encyclopedia version

Download the content of the embarrassing encyclopedia version _python

Last Update:2016-06-06 Source: Internet

Author: User

Developer on Alibaba Cloud: Build your first app with APIs, SDKs, and tutorials on Alibaba Cloud. Read more >

The code is as follows:

# coding: utf-8

import contextlib
import sqlite3
import threading
import time
import urllib.request
import xml.dom.minidom

class Logger(object):
    """Minimal logger that writes every message to standard output."""

    def log(self, *msg):
        """Print each positional argument on its own line.

        Args:
            *msg: Values to print; each one is passed to ``print`` separately.
        """
        for item in msg:
            print(item)


# Shared module-level logger used by the classes and functions in this file.
Log = Logger()
Log.log(' under test ')

class Downloader(object):
    """Fetch the raw body of a single URL."""

    def __init__(self, url):
        """Remember the address to fetch.

        Args:
            url: URL handed to ``urllib.request.urlopen`` by ``download``.
        """
        self.url = url

    def download(self):
        """Download ``self.url`` and return its body.

        Returns:
            The response body as returned by ``read()``, or ``None`` when
            the request fails.
        """
        Log.log(' Start download ', self.url)
        try:
            # The context manager closes the response even if read() raises.
            with urllib.request.urlopen(self.url) as response:
                content = response.read()
        except Exception as error:
            # Deliberately broad: callers treat any failure as "no content"
            # and retry later, so a bad page must not abort the crawl.
            # Unlike a bare ``except`` this lets KeyboardInterrupt through.
            Log.log(' download error ', error)
            return None
        Log.log(' Download complete ')
        return content

class Parser(object):
    """Extract the story text and the linked story ids from one page."""

    def __init__(self, content):
        """Parse the downloaded page into a DOM tree.

        Args:
            content: Page markup accepted by ``xml.dom.minidom.parseString``.

        Raises:
            xml.parsers.expat.ExpatError: If ``content`` is not well-formed
                XML (raised by ``parseString``).
        """
        # Keep the parsed document; parse() walks it later.
        self.html = xml.dom.minidom.parseString(content)

    def parse(self):
        """Collect the story text and the ids of the linked stories.

        Returns:
            A dict with two keys: ``'content'`` holds the data of the first
            child of the last ``<div class="content">`` (``''`` when none is
            found) and ``'url'`` holds the list of ids cut out of the
            ``href`` of every ``<a>`` that directly contains a ``<span>``.
        """
        Log.log(' Start extracting data ')
        contents = {'content': '', 'url': []}

        # Story text: first child of each <div class="content">.
        for div in self.html.getElementsByTagName('div'):
            # getAttribute() returns '' for a missing attribute, so a
            # separate hasAttribute() check is not needed.
            if div.getAttribute('class') != 'content':
                continue
            first = div.firstChild
            # Skip empty divs and children without character data (for
            # example a nested element) instead of raising.
            text = getattr(first, 'data', None)
            if text is not None:
                contents['content'] = text

        # Previous / next story links: <a href="..."><span>...</span></a>.
        for span in self.html.getElementsByTagName('span'):
            link = span.parentNode
            # The parent can be the Document node, which has no tagName.
            if getattr(link, 'tagName', '').lower() != 'a':
                continue
            url = link.getAttribute('href')
            # Drop the first 10 and the last 4 characters to get the id.
            # NOTE(review): assumes href looks like '/articles/<id>.htm',
            # mirroring the URL built in downloadpage -- confirm against
            # the live markup.
            contents['url'].append(url[10:-4])

        Log.log(' Extract data Complete ')
        return contents

def downloadpage(qid, db):
    """Download one story page and record what it contains.

    Args:
        qid: Story id; converted with ``str`` to build the page URL.
        db: Storage object providing ``updatecontent``, ``addqid`` and
            ``updatestatus``.
    """
    url = 'http://www.qiushibaike.com/articles/' + str(qid) + '.htm'
    content = Downloader(url).download()
    if not content:
        # Nothing was downloaded; the status is left untouched, so the id
        # is still reported as pending by the database.
        return

    contents = Parser(content).parse()
    if contents['content']:
        db.updatecontent(qid, contents['content'])
    for linked_qid in contents['url']:
        db.addqid(linked_qid)
    # Status 2 (complete) is only set once exactly two linked ids were found.
    if len(contents['url']) == 2:
        db.updatestatus(qid, 2)

# Download pool: bounds how many links may be downloaded at the same time.
class Downloaderpool(object):
    """Fixed number of download-thread slots fed from a queue of story ids."""

    def __init__(self, maxlength=15):
        """Create an empty pool.

        Args:
            maxlength: Number of thread slots in the pool.
        """
        self.downloaders = [None] * maxlength
        self.downloadlist = []
        self.db = None

    def setdownloadlist(self, downloadlist):
        """Merge ``downloadlist`` into the pending ids, dropping duplicates.

        Args:
            downloadlist: Iterable of story ids to queue. The resulting
                order is unspecified because the merge goes through a set.
        """
        self.downloadlist = list(set(self.downloadlist).union(downloadlist))

    def setdb(self, db):
        """Set the storage object passed to every download thread."""
        self.db = db

    def daemon(self):
        """Run one maintenance pass over the pool, then sleep one second.

        Finished threads are removed from their slots, then every free slot
        is given the next pending id, if any.
        """
        Log.log(' Set daemon ')
        # Free the slots whose thread is no longer running.
        for index, downloader in enumerate(self.downloaders):
            if downloader and not downloader.is_alive():
                Log.log(' Empty The Downloader ', index)
                self.downloaders[index] = None

        # Hand a pending id to every free slot.
        for index, downloader in enumerate(self.downloaders):
            if downloader:
                continue
            qid = self.getqid()
            if qid:
                worker = threading.Thread(target=downloadpage,
                                          args=(qid, self.db))
                self.downloaders[index] = worker
                worker.start()
                # NOTE: join() waits for this download before the next slot
                # is filled, so downloads effectively run one at a time.
                worker.join()
                Log.log(' Set Downloader ', index)
        # Pause one second before returning to the caller.
        time.sleep(1)

    def getqid(self):
        """Remove and return the next pending id, or ``None`` if none is left."""
        if not self.downloadlist:
            return None
        return self.downloadlist.pop(0)

    def begindownload(self):
        """Run ``daemon`` in a daemon thread and wait for it to finish."""
        daemon = threading.Thread(target=self.daemon, daemon=True)
        daemon.start()
        daemon.join()

    def getdownloader(self):
        """Return the index of the first free slot, or ``None`` if all are used."""
        for index, downloader in enumerate(self.downloaders):
            if not downloader:
                return index
        return None

# SQL statements used by Dbconnect (all parameterised with ``?``).
add_q_id = ' INSERT INTO Qiushibaike (id,success) VALUES (?,?) '
update_q_content = ' Update qiushibaike set content=? where id=? '
update_q_status = ' Update qiushibaike set success=? where id=? '
q_list = ' Select id from qiushibaike where success=? '
q_list_by_id = ' SELECT count (*) from qiushibaike where id=? '


class Dbconnect(object):
    """SQLite-backed store for the downloaded stories.

    Every method opens its own connection and closes it again before
    returning. The table is expected to exist already; this class never
    creates it::

        CREATE TABLE qiushibaike (
            id      INTEGER,
            content VARCHAR,
            success INTEGER
        )

    * ``id`` -- id of the story.
    * ``content`` -- text of the story.
    * ``success`` -- download state: 1 means not completed, 2 means
      completed (the content was downloaded and the linked ids obtained).
    """

    def __init__(self, dbpath='Db.sqlite'):
        """Remember the database file to use.

        Args:
            dbpath: Path of the SQLite database file.
        """
        self.dbpath = dbpath

    def addqid(self, qid):
        """Insert ``qid`` with status 1 (not completed).

        A failing insert is logged and otherwise ignored.

        Args:
            qid: Story id to insert.
        """
        Log.log(' Insert embarrassing encyclopedia ', qid)
        # closing() guarantees the connection is released on every path.
        with contextlib.closing(sqlite3.connect(self.dbpath)) as cn:
            try:
                cn.execute(add_q_id, (qid, 1))
                cn.commit()
            except sqlite3.Error as error:
                Log.log(' Error adding ID ', qid, error)
                # Do not report success for a row that was not inserted.
                return
        Log.log(' Insert success ')

    def updatecontent(self, qid, content):
        """Store ``content`` as the text of story ``qid``.

        Args:
            qid: Story id whose row is updated.
            content: Story text to store.
        """
        Log.log(' Update embarrassing encyclopedia ', qid, content)
        with contextlib.closing(sqlite3.connect(self.dbpath)) as cn:
            cn.execute(update_q_content, (content, qid))
            cn.commit()
        Log.log(' Update succeeded ')

    def updatestatus(self, qid, flag):
        """Set the ``success`` column of story ``qid`` to ``flag``.

        Args:
            qid: Story id whose row is updated.
            flag: New status value (1 not completed, 2 completed).
        """
        Log.log(' Update status ', qid, flag)
        with contextlib.closing(sqlite3.connect(self.dbpath)) as cn:
            cn.execute(update_q_status, (flag, qid))
            cn.commit()
        Log.log(' Update status succeeded ')

    def getList(self, undonloaded=1):
        """Return the ids of all stories whose status equals ``undonloaded``.

        Args:
            undonloaded: Status value to select; defaults to 1 (not
                completed).

        Returns:
            A list holding the ``id`` column of every matching row.
        """
        Log.log(' Get list ')
        with contextlib.closing(sqlite3.connect(self.dbpath)) as cn:
            rows = cn.execute(q_list, (undonloaded,)).fetchall()
        ids = [row[0] for row in rows]
        Log.log(' Get list success ')
        return ids

class Singledownloader(object):
    """Sequential downloader that processes the queued ids one by one."""

    def __init__(self):
        """Create a downloader with an empty queue and no storage object."""
        self.downloadlist = []
        # Initialised here so begindownload() never hits a missing attribute
        # when setdb() has not been called yet.
        self.db = None

    def setdb(self, db):
        """Set the storage object passed to every download."""
        self.db = db

    def setdownloadlist(self, downloadlist):
        """Merge ``downloadlist`` into the pending ids, dropping duplicates.

        Args:
            downloadlist: List of story ids to queue. The resulting order is
                unspecified because the merge goes through a set.
        """
        self.downloadlist = list(set(self.downloadlist + downloadlist))

    def begindownload(self):
        """Download every queued id in turn, emptying the queue.

        Ids are removed as they are processed, so an id that is already
        done is not downloaded again on the next call.
        """
        while self.downloadlist:
            downloadpage(self.downloadlist.pop(0), self.db)

def main():
    """Download stories until the database reports none as pending.

    NOTE: the loop only ends when ``getList`` returns an empty list, so ids
    that never reach status 2 keep it running.
    """
    db = Dbconnect('Db.sqlite')
    # To download through the thread pool instead, use:
    # dp = Downloaderpool()
    # dp.setdb(db)
    dp = Singledownloader()
    dp.setdb(db)

    undownloadedlist = db.getList()
    # Keep going while there are stories that have not been downloaded.
    while undownloadedlist:
        # Queue the pending ids and process them.
        dp.setdownloadlist(undownloadedlist)
        dp.begindownload()

        time.sleep(1)
        # Refresh the pending ids; downloads may have added new ones.
        undownloadedlist = db.getList()


if __name__ == '__main__':
    main()

The code has no known problems and works fine, but there are two things you will want it to do:
1. Multi-threaded download
2. Better separation of the code, using an object-oriented design



This article is an English version of an article which is originally in the Chinese language on aliyun.com and is provided for information purposes only. This website makes no representation or warranty of any kind, either expressed or implied, as to the accuracy, completeness, ownership, or reliability of the article or any translations thereof. If you have any concerns or complaints relating to the article, please send an email, providing a detailed description of the concern or complaint, to info-contact@alibabacloud.com. A staff member will contact you within 5 working days. Once verified, infringing content will be removed immediately.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support 6 Free Tickets per Quarter Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More

Download the content of the embarrassing encyclopedia version _python

Contact Us

What's Trending

Top 10 Tags

Top 10 Keywords

A Free Trial That Lets You Build Big!

Sales Support

After-Sales Support

Download the content of the embarrassing encyclopedia version _python

Contact Us

What's Trending

Top 10 Tags

Top 10 Keywords

Trending Topic

A Free Trial That Lets You Build Big!

Sales Support

After-Sales Support