This article presents the source code of a crawler program written in Python. Writing a crawler is a complex, noisy, and repetitive task: collection efficiency, link exception handling, and data quality (which depends heavily on how well the target site follows coding conventions) all have to be considered. The crawler below was organized and written from scratch; a single server can run 1 to 8 instances collecting at the same time, and the results are then stored in a database.
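Before the listing itself, here is a rough, hypothetical sketch (not part of the original source) of how one server might run several instances at once: it splits a comm-ID range into disjoint chunks and starts one TySpider.py process per chunk, relying only on the -s/--set and -l/--log options that the script below defines. The wrapper itself and the example range values are made up for illustration.

# launch_spiders.py -- hypothetical helper, not from the original article.
# Splits [start, end) into `instances` disjoint chunks and starts one
# TySpider.py process per chunk, using the script's own -s and -l options.
import subprocess

def launch_instances(start, end, instances=8, log_switch="on"):
    step = max(1, (end - start) // instances)
    procs = []
    for chunk_start in range(start, end, step):
        chunk_end = min(chunk_start + step, end)
        cmd = ["python", "TySpider.py",
               "-s", str(chunk_start), str(chunk_end),
               "-l", log_switch]
        procs.append(subprocess.Popen(cmd))
    # Wait for every instance to finish
    for p in procs:
        p.wait()

if __name__ == "__main__":
    # Example range only; the actual comm IDs depend on the target site.
    launch_instances(1000, 1800, instances=8)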
#!/usr/local/bin/python
# -*- coding: UTF-8 -*-

import sys, time, os, string
import mechanize
import urlparse
import re
import logging
import cgi
import MySQLdb  # required by BaseTySpider; missing from the original listing
from BeautifulSoup import BeautifulSoup
from optparse import OptionParser

#---------------------------------------------------------------------------
# Name:        TySpider.py
# Purpose:     WebSite Spider Module
# Author:      Liu Tiens
# Email:       liutiansi@gamil.com
# Created:     2010/02/16
# Copyright:   (c) 2010
#---------------------------------------------------------------------------


"""
|--------------------------------------------------------------------
| Pubclilog class
|--------------------------------------------------------------------
| Function: records system-related log information.
|
"""
class Pubclilog():
    def __init__(self):
        self.logfile = 'website_log.txt'

    def iniLog(self):
        logger = logging.getLogger()
        filehandler = logging.FileHandler(self.logfile)
        streamhandler = logging.StreamHandler()
        fmt = logging.Formatter('%(asctime)s, %(funcName)s, %(message)s')
        filehandler.setFormatter(fmt)
        streamhandler.setFormatter(fmt)
        logger.setLevel(logging.DEBUG)
        logger.addHandler(filehandler)
        logger.addHandler(streamhandler)
        return [logger, filehandler]


"""
|--------------------------------------------------------------------
| BaseTySpider class
|--------------------------------------------------------------------
| Function: captures classification, title, and other information.
|
"""
class BaseTySpider:
    # Initialize related members
    def __init__(self, x, log_switch):
        # Database connection
        self.conn = MySQLdb.connect(db='dbname', host='123.168.0.10', user='dbuser',
                                    passwd='sdflkj934y5jsdgfjh435', charset='utf8')
        # Category and title list page of the community
        self.CLASS_URL = 'http://test.abc.com/aa/CommTopicsPage?'
        # Post content page
        self.Content_URL = 'http://test.bac.com/aa/CommMsgsPage?'
        # Starting comm value
        self.x = x
        # Modulo of the current comm id, used to spread records across tables
        self.mod = self.x % 5
        # Downloaded community page
        self.body = ""
        # BeautifulSoup object for self.body
        self.soup = None
        # Downloaded post content page
        self.contentbody = ""
        # BeautifulSoup object for self.contentbody
        self.contentsoup = None
        # Log switch
        self.log_switch = log_switch

    #===================== get the name and classification =====================
    def _SpiderClass(self, nextpage=None):
        if nextpage == None:
            FIXED_QUERY = 'cmm=' + str(self.x)
        else:
            FIXED_QUERY = nextpage[1:]
        try:
            rd = mechanize.Browser()
            rd.addheaders = [("User-agent", "Tianya/2010 (compatible; MSIE 6.0; Windows NT 5.1)")]
            rd.open(self.CLASS_URL + FIXED_QUERY)
            self.body = rd.response().read()
            # rd = mechanize.Request(self.CLASS_URL + FIXED_QUERY)
            # response = mechanize.urlopen(rd)
            # self.body = response.read()
        except Exception, e:
            if self.log_switch == "on":
                logapp = Pubclilog()
                logger, hdlr = logapp.iniLog()
                logger.info(self.CLASS_URL + FIXED_QUERY + str(e))
                hdlr.flush()
                logger.removeHandler(hdlr)
            return
        self.soup = BeautifulSoup(self.body)
        NextPageObj = self.soup("a", {'class': re.compile("fs-paging-item fs-paging-next")})
        self.cursor = self.conn.cursor()
        if nextpage == None:
            try:
                Ttag = str(self.soup.table)
                # print Ttag
                """
                ------------------ analysis structure -----------------
                """
                soupTable = BeautifulSoup(Ttag)
                # Locate the first h1 tag
                tableh1 = soupTable("h1")
                # print self.x
                # print "Name:" + tableh1[0].string.strip().encode('utf-8')
                # Handle communities without a classification
                try:
                    # Locate the links in the table whose href matches "^TopByCategory";
                    # tablea[0] is the first qualified link, tablea[1] the second, and so on.
                    tablea = soupTable("a", {'href': re.compile("^TopByCategory")})
                    if tablea[0].string.strip() == "":
                        pass
                    # print "BigClass:" + tablea[0].string.strip().encode('utf-8')
                    # print "SubClass:" + tablea[1].string.strip().encode('utf-8')
                except Exception, e:
                    if self.log_switch == "on":
                        logapp = Pubclilog()
                        logger, hdlr = logapp.iniLog()
                        logger.info("[noClassInfo]" + str(self.x) + str(e))
                        hdlr.flush()
                        logger.removeHandler(hdlr)
                    self.cursor.execute("insert into baname" + str(self.mod) + " values ('%d','%d','%s')"
                                        % (self.x, -1, tableh1[0].string.strip().encode('utf-8')))
                    self.conn.commit()
                    self._SpiderTitle()
                    if NextPageObj:
                        NextPageURL = NextPageObj[0]['href']
                        self._SpiderClass(NextPageURL)
                        return
                    else:
                        return
                # Obtain the href value of the second link object
                classlink = tablea[1]['href']
                par_dict = cgi.parse_qs(urlparse.urlparse(classlink).query)
                # print "CID:" + par_dict["cid"][0]
                # print "SubCID:" + par_dict["subcid"][0]
                # print "-------------------------------------"
                # Insert into the database
                self.cursor.execute("insert into class values ('%d','%s')"
                                    % (int(par_dict["cid"][0]), tablea[0].string.strip().encode('utf-8')))
                self.cursor.execute("insert into subclass values ('%d','%d','%s')"
                                    % (int(par_dict["subcid"][0]), int(par_dict["cid"][0]),
                                       tablea[1].string.strip().encode('utf-8')))
                self.cursor.execute("insert into baname" + str(self.mod) + " values ('%d','%d','%s')"
                                    % (self.x, int(par_dict["subcid"][0]), tableh1[0].string.strip().encode('utf-8')))
                self.conn.commit()
                self._SpiderTitle()
                if NextPageObj:
                    NextPageURL = NextPageObj[0]['href']
                    self._SpiderClass(NextPageURL)
                self.body = None
                self.soup = None
                Ttag = None
                soupTable = None
                table = None
                table1 = None
                classlink = None
                par_dict = None
            except Exception, e:
                if self.log_switch == "on":
                    logapp = Pubclilog()
                    logger, hdlr = logapp.iniLog()
                    logger.info("[ClassInfo]" + str(self.x) + str(e))
                    hdlr.flush()
                    logger.removeHandler(hdlr)
        else:
            self._SpiderTitle()
            if NextPageObj:
                NextPageURL = NextPageObj[0]['href']
                self._SpiderClass(NextPageURL)

    #===================== method to obtain the title =====================
    def _SpiderTitle(self):
        # Search for the title table object (table)
        soupTitleTable = self.soup("table", {'class': "fs-topic-list"})
        # Search for the title row objects (tr)
        TitleTr = soupTitleTable[0]("tr", {'onmouseover': re.compile("^this\.className='fs-row-hover'")})
        """
        ----------- analysis structure --------------
        [New Arrival] welcome the American people to join    0/12    Chinese    2-14
        """
        for CurrTr in TitleTr:
            try:
                # Initialize the sticky (top) and starred (extracted) status
                Title_starred = 'n'
                Title_sticky = 'n'
                # Get the BeautifulSoup object of the current record
                soupCurrTr = BeautifulSoup(str(CurrTr))
                # BeautifulSoup parses this HTML incorrectly, so the status can only be
                # inferred from the number of span tags; a starred-only post will also be
                # treated as sticky, which introduces some errors.
                TitleStatus = soupCurrTr("span", {'title': ""})
                TitlePhotoViewer = soupCurrTr("a", {'href': re.compile("^PhotoViewer")})
                if TitlePhotoViewer.__len__() == 1:
                    TitlePhotoViewerBool = 0
                else:
                    TitlePhotoViewerBool = 1
                if TitleStatus.__len__() == 3 - TitlePhotoViewerBool:
                    Title_starred = 'y'
                    Title_sticky = 'y'
                elif TitleStatus.__len__() == 2 - TitlePhotoViewerBool:
                    Title_sticky = 'y'
                # Get the title of the post
                Title = soupCurrTr.a.next.strip()
                # Get the post ID
                par_dict = cgi.parse_qs(urlparse.urlparse(soupCurrTr.a['href']).query)
                # Get the number of replies and views
                TitleNum = soupCurrTr("td", {'class': "fs-topic-name"})
                TitleArray = string.split(str(TitleNum[0]), '\n')
                Title_ReplyNum = string.split(TitleArray[len(TitleArray) - 4], '>')[2]
                Title_ViewNum = string.split(TitleArray[len(TitleArray) - 2], '>')[2][:-6]
                # Get the author
                TitleAuthorObj = soupCurrTr("td", {'style': "padding-left:4px"})
                Title_Author = TitleAuthorObj[0].next.next.next.string.strip().encode('utf-8')
                # Get the last reply time
                TitleTime = soupCurrTr("td", {'class': re.compile("^fs-topic-last-mdfy fs-meta")})
                """
                print "X:" + str(self.x)
                print "Title_starred:" + Title_starred
                print "Title_sticky:" + Title_sticky
                print "Title:" + Title
                # Get the post content URL
                print "Title_link:" + soupCurrTr.a['href']
                print "CID:" + par_dict["tid"][0]
                print "Title_ReplyNum:" + Title_ReplyNum
                print "Title_ViewNum:" + Title_ViewNum
                print "Title_Author:" + Title_Author
                print "TitleTime:" + TitleTime[0].string.strip().encode('utf-8')
                """
                # Insert into the database
                self.cursor.execute("insert into Title" + str(self.mod) +
                                    " values ('%s','%d','%s','%d','%d','%s','%s','%s','%s')"
                                    % (par_dict["tid"][0], \
                                       self.x, Title, int(Title_ReplyNum), int(Title_ViewNum),
                                       Title_starred, Title_sticky, \
                                       Title_Author.decode('utf-8'),
                                       TitleTime[0].string.strip().encode('utf-8')))
                self.conn.commit()
                self._SpiderContent(par_dict["tid"][0])
            except Exception, e:
                if self.log_switch == "on":
                    logapp = Pubclilog()
                    logger, hdlr = logapp.iniLog()
                    logger.info("[Title]" + str(self.x) + '-' + par_dict["tid"][0] + '-' + str(e))
                    hdlr.flush()
                    logger.removeHandler(hdlr)

    #===================== get a post and its replies =====================
    def _SpiderContent(self, ID, nextpage=None):
        if nextpage == None:
            FIXED_QUERY = 'cmm=' + str(self.x) + '&tid=' + ID + '&ref=regulartopics'
        else:
            FIXED_QUERY = nextpage[9:]
        rd = mechanize.Browser()
        rd.addheaders = [("User-agent", "Tianya/2010 (compatible; MSIE 6.0; Windows NT 5.1)")]
        rd.open(self.Content_URL + FIXED_QUERY)
        self.contentbody = rd.response().read()
        # rd = mechanize.Request(self.Content_URL + FIXED_QUERY)
        # response = mechanize.urlopen(rd)
        # self.contentbody = response.read()
        self.contentsoup = BeautifulSoup(self.contentbody)
        NextPageObj = self.contentsoup("a", {'class': re.compile("fs-paging-item fs-paging-next")})
        try:
            Tp = self.contentsoup("p", {'class': "fs-user-action"})
            i = 0
            for Currp in Tp:
                # The first block is the original post, the rest are replies
                if i == 0:
                    Ctype = 'y'
                else:
                    Ctype = 'n'
                # Posting time
                soupCurrp = BeautifulSoup(str(Currp))
                PosttimeObj = soupCurrp("span", {'class': "fs-meta"})
                Posttime = PosttimeObj[0].next[1:]
                Posttime = Posttime[0:-3]
                # IP address
                IPObj = soupCurrp("a", {'href': re.compile("CommMsgAddress")})
                if IPObj:
                    IP = IPObj[0].next.strip()
                else:
                    IP = ''
                # Post/reply content
                ContentObj = soupCurrp("p", {'class': "fs-user-action-body"})
                Content = ContentObj[0].renderContents().strip()
                """
                print "X:" + str(self.x)
                print "ID:" + ID
                print "Ctype:" + Ctype
                print "POSTTIME:" + Posttime
                print "IP:" + IP
                print "Content:" + Content
                """
                self.cursor.execute("insert into Content" + str(self.mod) +
                                    " values ('%s','%d','%s','%s','%s','%s')"
                                    % (ID, self.x, Ctype, Posttime, IP, Content.decode('utf-8')))
                self.conn.commit()
                i += 1
        except Exception, e:
            if self.log_switch == "on":
                logapp = Pubclilog()
                logger, hdlr = logapp.iniLog()
                logger.info("[Content]" + str(self.x) + '-' + ID + '-' + str(e))
                hdlr.flush()
                logger.removeHandler(hdlr)
        # If there is a link to the next page, keep traversing
        if NextPageObj:
            NextPageURL = NextPageObj[0]['href']
            self._SpiderContent(ID, NextPageURL)

    def __del__(self):
        try:
            self.cursor.close()
            self.conn.close()
        except Exception, e:
            pass


# Traverse the comm range
def initapp(StartValue, EndValue, log_switch):
    for x in range(StartValue, EndValue):
        app = BaseTySpider(x, log_switch)
        app._SpiderClass()
        app = None


if __name__ == "__main__":
    # Define the command line parameters
    MSG_USAGE = "TySpider.py [-s StartNumber EndNumber] -l [on|off] [-v] [-h]"
    parser = OptionParser(MSG_USAGE)
    parser.add_option("-s", "--set", nargs=2, action="store", dest="comm_value",
                      type="int", default=False,
                      help="configure the comm ID value range.".decode('utf-8'))
    parser.add_option("-l", "--log", action="store", dest="log_switch",
                      type="string", default="on",
                      help="error log switch".decode('utf-8'))
    parser.add_option("-v", "--version", action="store_true", dest="verbose",
                      help="show version information".decode('utf-8'))
    opts, args = parser.parse_args()
    if opts.comm_value:
        if opts.comm_value[0] > opts.comm_value[1]:
            print "Is the end value smaller than the start value?"
            exit()
        if opts.log_switch == "on":
            log_switch = "on"
        else:
            log_switch = "off"
        initapp(opts.comm_value[0], opts.comm_value[1], log_switch)
        exit()
    if opts.verbose:
        print "WebSite Spider V1.0 beta."
        exit()