Using Python for Distributed Web Page Data Capture (III): Coding Implementation


Heh, the first two parts didn't have much to do with Python. This part is all about code.

This is my first time writing Python, so plenty of places are messy; focus mainly on the logical flow.

Encodings gave me a big headache. You never know in advance what encoding a fetched page uses, so I first look for the charset, then convert everything to Unicode and do all the processing in Unicode. But the database is UTF-8, a Windows console has to be GBK, and my IDE console has to be UTF-8, which is why the DEBUG variable exists: its main purpose is to control the output encoding.
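To make that concrete, here is a minimal sketch of the decode-early, encode-late idea (my own illustration, not part of the program below; the function names are mine and the charset regex is simplified):

# -*- coding: utf-8 -*-
# Sketch: sniff the charset, decode to Unicode early, encode only on output.
import re
import urllib2

def sniff_charset(raw):
    # look for charset=... in the raw page bytes (simplified)
    m = re.search(r'charset=["\']?([\w-]+)', raw, re.I)
    return m.group(1) if m else 'utf-8'

def fetch_unicode(url):
    raw = urllib2.urlopen(url).read()
    return unicode(raw, sniff_charset(raw), 'replace')  # all-Unicode from here on

def emit(u, debug=0):
    # encode only at the output boundary: GBK for a Windows console,
    # UTF-8 for the IDE console (this is what the DEBUG flag controls)
    print u.encode('utf-8' if debug else 'gbk', 'replace')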

The program has run for 24 hours straight, and after being deployed across 10 machines for a long stretch it has been basically problem-free.

Running that way, it fetches on the order of 100,000 pages a day.

The source code is as follows:

Content crawling and tools

'''
Created on 2010-9-15
@author: Chenggong
'''
import urllib2
import re
import socket

DEBUG = 0

''' tools '''
class Tools():
    # log function
    @staticmethod
    def writeLog(level, info, notify=False):
        if DEBUG == 0:
            try:
                print "[" + level + "]" + info.decode('utf-8').encode('gbk')
            except:
                print "[" + level + "]" + info.encode('gbk')
        else:
            print "[" + level + "]" + info
        #if notify:
        #    print "[notify] report to the administrator!!"

    # convert to Unicode
    @staticmethod
    def toUnicode(s, charset):
        if charset == "":
            return s
        else:
            try:
                u = unicode(s, charset)
            except:
                u = ""
            return u

    # regex extraction
    # @param single: whether to grab only the first match
    @staticmethod
    def getFromPatten(patten, src, single=False):
        rst = ""
        p = re.compile(patten, re.S)
        all = p.findall(src)
        for matcher in all:
            rst += matcher + " "
            if single:
                break
        return rst.strip()

''' web content crawler '''
class PageGripper():
    URL_OPEN_TIMEOUT = 10  # page timeout in seconds
    MAX_RETRY = 3          # maximum number of retries

    def __init__(self):
        socket.setdefaulttimeout(self.URL_OPEN_TIMEOUT)

    # detect the character set
    def getCharset(self, s):
        rst = Tools.getFromPatten(u'charset=(.*?)"', s, True)
        if rst != "":
            if rst == "utf8":
                rst = "utf-8"
        return rst

    # try to fetch the page
    def downloadUrl(self, url):
        charset = ""
        page = ""
        retry = 0
        while True:
            try:
                fp = urllib2.urlopen(url)
                break
            except urllib2.HTTPError, e:  # HTTP status error
                Tools.writeLog('error', 'HTTP status error, code=' + str(e.code))
                raise
            except urllib2.URLError, e:   # network error or timeout
                Tools.writeLog('warn', 'page access timed out, retrying...')
                retry += 1
                if retry > self.MAX_RETRY:
                    Tools.writeLog('warn', 'exceeded maximum retries, giving up')
                    raise
        while True:
            line = fp.readline()
            if charset == "":
                charset = self.getCharset(line)
            if not line:
                break
            page += Tools.toUnicode(line, charset)
        fp.close()
        return page

    # fetch a page
    def getPageInfo(self, url):
        Tools.writeLog('info', 'start crawling page, url=' + url)
        info = ""
        try:
            info = self.downloadUrl(url)
        except:
            raise
        Tools.writeLog('debug', 'page crawled successfully')
        return info

''' content extraction class '''
class InfoGripper():
    pageGripper = PageGripper()

    def __init__(self):
        Tools.writeLog('debug', 'crawler booted')

    # grab the title
    def gripTitle(self, data):
        title = Tools.getFromPatten(u'box2t sp">
        [... the rest of the InfoGripper listing is cut off in the source ...]
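For reference, the scheduler in the next listing drives these classes roughly like this (my own example: the URL is a placeholder, and gripInfo is the InfoGripper entry point the task thread calls, which the truncated listing above no longer shows):

# Hypothetical driver for the classes above. The URL is a placeholder, and
# gripInfo is assumed to return a dict with keys like 'title' and 'views'
# (that is how the task thread in the next listing uses it).
if __name__ == '__main__':
    gripper = InfoGripper()
    rst = gripper.gripInfo('http://example.com/video/123')
    print rst['title'].encode('gbk', 'replace')
    print rst['views']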

Web services and task scheduling

# -*- coding: utf-8 -*-
'''
Created on 2010-9-15
@author: Chenggong
'''
import string, cgi, time
from os import curdir, sep
from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
from InfoGripper import *
import re
import MySQLdb
import threading
import urllib
import urllib2

PORT = 8079
VERSION = 0.1
DBCHARSET = "utf8"
PARAMS = ['callback', 'sessionId', 'retry', 'retryInterval',
          'dbhost', 'dbport', 'db', 'dbuser', 'dbpass', 'videoId']
DBMAP = ['video_id', 'ccsiteId', 'ccVid', 'desc_url', 'site_id', 'title',
         'post_time', 'author', 'elapse', 'channel', 'tags',
         'create_time', 'check_time', 'status']

''' error code definitions '''
ERR_OK = 0
ERR_PARAM = 1
ERR_HTTP_TIMEOUT = 5
ERR_HTTP_STATUS = 6
ERR_DB_CONNECT_FAIL = 8
ERR_DB_SQL_FAIL = 9
ERR_GRIPVIEW = 11
ERR_UNKNOW = 12

''' database adapter '''
class DBAdapter(object):
    def __init__(self):
        self.param = {'ip': '', 'port': 0, 'user': '', 'pw': '', 'db': ''}
        self.connect_once = False  # whether a connection has been made before

    ''' create or update the database connection '''
    def connect(self, ip, port, user, pw, db):
        if (ip != self.param['ip'] or port != self.param['port']
                or user != self.param['user'] or pw != self.param['pw']
                or db != self.param['db']):
            Tools.writeLog('info', 'changing database connection, ip=' + ip + ', port=' + port +
                           ', user=' + user + ', pw=' + pw + ', db=' + db)
            try:
                if self.connect_once == True:
                    # release the previous connection
                    self.cur.close()
                    self.conn.close()
                self.conn = MySQLdb.connect(user=user, passwd=pw, db=db, host=ip, port=int(port))
                self.conn.set_character_set(DBCHARSET)
                self.connect_once = True
                self.cur = self.conn.cursor(MySQLdb.cursors.Cursor)
                self.param['ip'] = ip
                self.param['port'] = port
                self.param['user'] = user
                self.param['pw'] = pw
                self.param['db'] = db
            except:
                Tools.writeLog('error', u'database connection failed', True)
                raise
            else:
                Tools.writeLog('info', u'database connection succeeded')

    ''' execute an SQL statement '''
    def execute(self, sql):
        Tools.writeLog('debug', u'executing sql: ' + sql)
        try:
            self.cur.execute(sql)
        except:
            Tools.writeLog('error', u'sql execution error: ' + sql)
            raise

    ''' query the database '''
    def query(self, sql):
        row = {}
        self.execute(sql)
        row = self.cur.fetchall()
        return row

    ''' mark a video as failed '''
    def updateErr(self, videoId):
        nowtime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        sql = "UPDATE videos SET "
        sql += "check_time='" + nowtime + "',"
        sql += "status=-1 "
        sql += "WHERE video_id=" + videoId
        self.execute(sql)
        self.conn.commit()

    ''' write back the crawl results '''
    def update(self, obj, videoId, isUpdateTitle=True):
        Tools.writeLog('debug', 'start updating the database')
        try:
            # update the videos table
            sql = "UPDATE videos SET "
            if obj['ccsiteId'] != "":
                sql += "ccsiteId='" + obj['ccsiteId'] + "',"
            if obj['ccVid'] != "":
                sql += "ccVid='" + obj['ccVid'] + "',"
            if isUpdateTitle:
                sql += "title='" + obj['title'] + "',"
            sql += "post_time='" + obj['release'] + "',"
            sql += "author='" + obj['user'] + "',"
            sql += "channel='" + obj['channel'] + "',"
            sql += "tags='" + obj['tag'] + "',"
            nowtime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            sql += "check_time='" + nowtime + "',"
            sql += "status=0 "
            sql += "WHERE video_id=" + videoId
            self.execute(sql)
            # update the counts table
            if obj['views'] != 'ERROR':
                nowdate = time.strftime('%Y-%m-%d', time.localtime(time.time()))
                sql = "SELECT * FROM counts WHERE "
                sql += "date='" + nowdate + "' AND video_id=" + videoId
                rst = self.query(sql)
                if len(rst) > 0:
                    # a record already exists for today, so update it
                    sql = "UPDATE counts SET count=" + obj['views']
                    sql += " WHERE video_id=" + videoId + " AND date='" + nowdate + "'"
                else:
                    # otherwise insert a new one
                    sql = "INSERT INTO counts VALUES "
                    sql += "(null," + videoId + ",'" + nowdate + "'," + obj['views'] + ")"
                self.execute(sql)
            self.conn.commit()
            Tools.writeLog('debug', "db commit ok")
            return ERR_OK
        except Exception, e:
            print e
            return ERR_DB_SQL_FAIL

''' task thread class '''
class TaskThread(threading.Thread):
    def setTaskTool(self, dbAdapter, gripper):
        self.dbAdapter = dbAdapter
        self.gripper = gripper

    def setParam(self, param):
        self.param = param
        self.videoId = param['videoId']
        assert self.videoId != ""

    def init(self):
        self.views = "0"
        self.errcode = ERR_OK

    def run(self):
        Tools.writeLog('debug', 'starting crawler task, sessionId=' + self.param['sessionId'])
        self.init()
        try:
            # refresh the database connection
            self.dbAdapter.connect(self.param['dbhost'], self.param['dbport'],
                                   self.param['dbuser'], self.param['dbpass'], self.param['db'])
        except:
            self.errcode = ERR_DB_CONNECT_FAIL  # database connection failed
            self.callback(self.errcode)
            return
        # look up the video for this id
        sql = "SELECT "
        for column in DBMAP:
            sql += column
            if column != DBMAP[len(DBMAP) - 1]:
                sql += ","
        sql += " FROM videos "
        sql += "WHERE video_id=" + self.videoId
        video = self.dbAdapter.query(sql)
        assert not (len(video) > 1 or len(video) == 0)  # there must be exactly one record
        url = video[0][3]
        assert url != ""
        try:
            rst = self.gripper.gripInfo(url)
        except urllib2.HTTPError, e:
            self.errcode = ERR_HTTP_STATUS    # HTTP status error
            self.dbAdapter.updateErr(self.videoId)
        except urllib2.URLError, e:
            self.errcode = ERR_HTTP_TIMEOUT   # HTTP connection timeout
            self.dbAdapter.updateErr(self.videoId)
        except:
            self.errcode = ERR_UNKNOW         # unknown error
            self.dbAdapter.updateErr(self.videoId)
        else:
            self.views = rst['views']
            if self.views == "ERROR":
                self.views = "-1"
                self.errcode = ERR_GRIPVIEW   # data grabbed, but the view count failed
            # update the database (special case: if the original title
            # contains "-", do not update the title field)
            title = video[0][5]
            assert title != ""
            if re.match('.*-.*', title):
                self.errcode = self.dbAdapter.update(rst, self.videoId, False)
            else:
                self.errcode = self.dbAdapter.update(rst, self.videoId)
        self.callback(self.errcode)
        Tools.writeLog('info', 'task over, sessionId=' + self.param['sessionId'])
        return

    def callback(self, errcode):
        results = {'errorCode': errcode, 'count': int(self.views)}
        results = urllib.urlencode(results)
        results = results.replace('&', '%26')
        url = self.param['callback']
        url += "?"
        url += "sessionId=" + self.param['sessionId']
        url += "&results=" + results
        retry = 0
        while True:
            try:
                Tools.writeLog('debug', "calling back the master, url=" + url)
                urllib2.urlopen(url)
                Tools.writeLog('debug', 'callback succeeded')
                break
            except urllib2.URLError, e:
                # timeout or error
                Tools.writeLog('debug', 'callback to master timed out, retrying in %s seconds'
                               % self.param['retryInterval'])
                retry += 1
                time.sleep(int(self.param['retryInterval']))
                if retry > int(self.param['retry']):
                    Tools.writeLog('error', 'callback to master failed')
                    return

''' web service class '''
class MyHandler(BaseHTTPRequestHandler):
    dbAdapter = DBAdapter()
    gripper = InfoGripper()

    def pageSuccess(self):
        self.send_response(200)
        self.send_header('Content-type', 'text/html')
        self.end_headers()

    def pageFail(self):
        self.send_error(404, "Not Found")

    def getValue(self, param):
        src = self.path + '&'
        reg = param + '=' + '(.*?)&'
        value = Tools.getFromPatten(reg, src, True)
        return value

    def do_GET(self):
        isGetVersion = re.match('.*vinfoant/version.*', self.path)
        isTask = re.match('.*vinfoant/run.*', self.path)
        if isGetVersion:
            self.pageSuccess()
            self.wfile.write(str(VERSION))
        elif isTask:
            self.pageSuccess()
            param = {}
            for p in PARAMS:
                param[p] = self.getValue(p)  # read each request parameter
            taskThread = TaskThread()
            taskThread.setTaskTool(self.dbAdapter, self.gripper)
            taskThread.setParam(param)
            taskThread.start()  # start the task thread
            self.wfile.write("OK")
        else:
            self.pageFail()
        return

''' start the web service; global entry point '''
def startHttpd():
    try:
        Tools.writeLog('debug', 'httpd starting, listening on ' + str(PORT))
        httpd = HTTPServer(('', PORT), MyHandler)
        Tools.writeLog('debug', 'success')
        httpd.serve_forever()
    except KeyboardInterrupt:
        Tools.writeLog('debug', 'httpd closing...')
        httpd.socket.close()

if __name__ == '__main__':
    startHttpd()
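To see the whole thing in motion: the master drives each slave with a plain HTTP GET against /vinfoant/run, the slave answers "OK" immediately, and the task thread reports the result to the callback URL when it finishes. A hypothetical master-side request (host names, credentials, and IDs are all placeholders):

# Hypothetical master-side request; every value below is a placeholder.
import urllib
import urllib2

query = urllib.urlencode({
    'callback': 'http://master:8080/report',  # where the slave reports results
    'sessionId': '42',
    'retry': '3', 'retryInterval': '5',       # callback retry policy
    'dbhost': '127.0.0.1', 'dbport': '3306',
    'db': 'videodb', 'dbuser': 'crawler', 'dbpass': 'secret',
    'videoId': '1001',
})
print urllib2.urlopen('http://slave1:8079/vinfoant/run?' + query).read()  # prints "OK"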
