Share the source code of a crawler written in Python

Source: Internet
Author: User
This article shares the source code of a crawler program written in Python. Writing a crawler is a complex, noisy, and repetitive task: collection efficiency, link exception handling, and data quality (which depends heavily on how well the target site follows coding conventions) all have to be considered. The crawler below was organized and written from scratch; a single server can run one to eight collector instances at the same time, each storing the collected data into the database.
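
Since each instance simply works through its own slice of the community ID space, running several collectors side by side amounts to starting the script several times with non-overlapping ranges. Below is a minimal launcher sketch, not part of the original script: launch_spiders.py is a hypothetical helper name, the ID-space size and the instance count are placeholders, and it assumes the -s start/end and -l on|off options defined in TySpider.py further down.

# launch_spiders.py -- hypothetical helper, not part of the original script
import subprocess

TOTAL_IDS = 20000      # assumed size of the community ID space (placeholder)
INSTANCES = 4          # anywhere from 1 to 8 per server, per the text above
step = TOTAL_IDS / INSTANCES

procs = []
for i in range(INSTANCES):
    start = i * step
    end = TOTAL_IDS if i == INSTANCES - 1 else (i + 1) * step
    # each instance gets its own non-overlapping [start, end) ID range
    cmd = ["python", "TySpider.py", "-s", str(start), str(end), "-l", "on"]
    procs.append(subprocess.Popen(cmd))

# wait for all collectors to finish
for p in procs:
    p.wait()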

# -*- coding: UTF-8 -*-
#!/usr/local/bin/python
import sys, time, os, string
import mechanize
import urlparse
import re
import MySQLdb
import logging
import cgi
from BeautifulSoup import BeautifulSoup
from optparse import OptionParser
#-----------------------------------------------------------------------------
# Name:        TySpider.py
# Purpose:     WebSite Spider Module
# Author:      Liu Tiansi
# Email:       liutiansi@gamil.com
# Created:     2010/02/16
# Copyright:   (c) 2010
#-----------------------------------------------------------------------------


"""
|--------------------------------------------------------------------
| Defines the logging class;
|--------------------------------------------------------------------
| Function: records system-related log information.
|
"""
class Pubclilog():
    def __init__(self):
        self.logfile = 'website_log.txt'

    def iniLog(self):
        logger = logging.getLogger()
        filehandler = logging.FileHandler(self.logfile)
        streamhandler = logging.StreamHandler()
        fmt = logging.Formatter('%(asctime)s, %(funcName)s, %(message)s')
        logger.setLevel(logging.DEBUG)
        logger.addHandler(filehandler)
        logger.addHandler(streamhandler)
        return [logger, filehandler]


"""
|--------------------------------------------------------------------
| Defines the TySpider class;
|--------------------------------------------------------------------
| Function: captures classification, title, and other information.
|
"""
class BaseTySpider:

    # Initialize related members
    def __init__(self, X, log_switch):
        # Database connection
        self.conn = MySQLdb.connect(db='dbname', host='123.168.0.10', user='dbuser',
                                    passwd='sdflkj934y5jsdgfjh435', charset='utf8')
        # Category and title page of the community
        self.CLASS_URL = 'http://test.abc.com/aa/CommTopicsPage?'
        # Post content page of the community
        self.CONTENT_URL = 'http://test.bac.com/aa/CommMsgsPage?'
        # Start comm value
        self.X = X
        # Modulo of the current comm id, used to spread rows across tables
        self.mod = self.X % 5
        # Downloaded community page
        self.body = ""
        # Soup object of self.body
        self.soup = None
        # Downloaded content page
        self.contentbody = ""
        # Soup object of self.contentbody
        self.contentsoup = None
        # Log switch
        self.log_switch = log_switch

    #====================== get the name and classification ======================
    def _SpiderClass(self, nextpage=None):
        if nextpage == None:
            FIXED_QUERY = 'cmm=' + str(self.X)
        else:
            FIXED_QUERY = nextpage[1:]
        try:
            rd = mechanize.Browser()
            rd.addheaders = [("User-agent", "Tianya/2010 (compatible; MSIE 6.0; Windows NT 5.1)")]
            rd.open(self.CLASS_URL + FIXED_QUERY)
            self.body = rd.response().read()
            #rd = mechanize.Request(self.CLASS_URL + FIXED_QUERY)
            #response = mechanize.urlopen(rd)
            #self.body = response.read()
        except Exception, e:
            if self.log_switch == "on":
                logapp = Pubclilog()
                logger, hdlr = logapp.iniLog()
                logger.info(self.CLASS_URL + FIXED_QUERY + str(e))
                hdlr.flush()
                logger.removeHandler(hdlr)
            return
        self.soup = BeautifulSoup(self.body)
        NextPageObj = self.soup("a", {'class': re.compile("fs-paging-item fs-paging-next")})
        self.cursor = self.conn.cursor()
        if nextpage == None:
            try:
                Ttag = str(self.soup.table)
                #print Ttag
                """------------------ analysis structure -----------------
                Dunhill
                CHINA > People
                """
                soupTable = BeautifulSoup(Ttag)
                # Locate the first h1 tag
                tableh1 = soupTable("h1")
                #print self.X
                #print "Name:" + tableh1[0].string.strip().encode('utf-8')
                # Handle a board without classification
                try:
                    # Locate the links in the table whose href matches "^TopByCategory";
                    # tablea[0] is the first matching link, tablea[1] the second...
                    tablea = soupTable("a", {'href': re.compile("^TopByCategory")})
                    if tablea[0].string.strip() == "":
                        pass
                    #print "BigClass:" + tablea[0].string.strip().encode('utf-8')
                    #print "SubClass:" + tablea[1].string.strip().encode('utf-8')
                except Exception, e:
                    if self.log_switch == "on":
                        logapp = Pubclilog()
                        logger, hdlr = logapp.iniLog()
                        logger.info("[noClassInfo]" + str(self.X) + str(e))
                        hdlr.flush()
                        logger.removeHandler(hdlr)
                    self.cursor.execute("insert into baname" + str(self.mod) + " values('%d','%d','%s')" %
                                        (self.X, -1, tableh1[0].string.strip().encode('utf-8')))
                    self.conn.commit()
                    self._SpiderTitle()
                    if NextPageObj:
                        NextPageURL = NextPageObj[0]['href']
                        self._SpiderClass(NextPageURL)
                        return
                    else:
                        return
                # Obtain the href value of the second link object
                classlink = tablea[1]['href']
                par_dict = cgi.parse_qs(urlparse.urlparse(classlink).query)
                #print "CID:" + par_dict["cid"][0]
                #print "SubCID:" + par_dict["subcid"][0]
                #print "-------------------------------------"
                # Insert into the database
                self.cursor.execute("insert into class values('%d','%s')" %
                                    (int(par_dict["cid"][0]), tablea[0].string.strip().encode('utf-8')))
                self.cursor.execute("insert into subclass values('%d','%d','%s')" %
                                    (int(par_dict["subcid"][0]), int(par_dict["cid"][0]),
                                     tablea[1].string.strip().encode('utf-8')))
                self.cursor.execute("insert into baname" + str(self.mod) + " values('%d','%d','%s')" %
                                    (self.X, int(par_dict["subcid"][0]),
                                     tableh1[0].string.strip().encode('utf-8')))
                self.conn.commit()
                self._SpiderTitle()
                if NextPageObj:
                    NextPageURL = NextPageObj[0]['href']
                    self._SpiderClass(NextPageURL)
                self.body = None
                self.soup = None
                Ttag = None
                soupTable = None
                table = None
                table1 = None
                classlink = None
                par_dict = None
            except Exception, e:
                if self.log_switch == "on":
                    logapp = Pubclilog()
                    logger, hdlr = logapp.iniLog()
                    logger.info("[ClassInfo]" + str(self.X) + str(e))
                    hdlr.flush()
                    logger.removeHandler(hdlr)
        else:
            self._SpiderTitle()
            if NextPageObj:
                NextPageURL = NextPageObj[0]['href']
                self._SpiderClass(NextPageURL)

    #=========================== method to obtain the title ===========================
    def _SpiderTitle(self):
        # Search for the title table object (table)
        soupTitleTable = self.soup("table", {'class': "fs-topic-list"})
        # Search for the title row objects (tr)
        TitleTr = soupTitleTable[0]("tr", {'onmouseover': re.compile("^this\.className='fs-row-hover'")})
        """----------- analysis structure --------------
        [New Arrival] welcome the American people to join   0/12   Chinese   2-14
        """
        for CurrTr in TitleTr:
            try:
                # Initialize the sticky and starred flags
                Title_starred = 'n'
                Title_sticky = 'n'
                # Obtain the BeautifulSoup object of the current row
                soupCurrTr = BeautifulSoup(str(CurrTr))
                # BeautifulSoup mis-parses part of the HTML, so the starred/sticky state can only be
                # inferred from the number of flag spans, which introduces some errors;
                # a topic that is only starred is also treated as sticky.
                TitleStatus = soupCurrTr("span", {'title': ""})
                TitlePhotoViewer = soupCurrTr("a", {'href': re.compile("^PhotoViewer")})
                if TitlePhotoViewer.__len__() == 1:
                    TitlePhotoViewerBool = 0
                else:
                    TitlePhotoViewerBool = 1
                if TitleStatus.__len__() == 3 - TitlePhotoViewerBool:
                    Title_starred = 'y'
                    Title_sticky = 'y'
                elif TitleStatus.__len__() == 2 - TitlePhotoViewerBool:
                    Title_sticky = 'y'
                # Obtain the title of the post
                Title = soupCurrTr.a.next.strip()
                # Obtain the post ID
                par_dict = cgi.parse_qs(urlparse.urlparse(soupCurrTr.a['href']).query)
                # Obtain the number of replies and views
                TitleNum = soupCurrTr("td", {'class': "fs-topic-name"})
                TitleArray = string.split(str(TitleNum[0]), '\n')
                Title_ReplyNum = string.split(TitleArray[len(TitleArray) - 4], '>')[2]
                Title_ViewNum = string.split(TitleArray[len(TitleArray) - 2], '>')[2][:-6]
                # Obtain the author
                TitleAuthorObj = soupCurrTr("td", {'style': "padding-left:4px"})
                Title_Author = TitleAuthorObj[0].next.next.next.string.strip().encode('utf-8')
                # Obtain the reply time
                TitleTime = soupCurrTr("td", {'class': re.compile("^fs-topic-last-mdfy fs-meta")})
                """
                print "X:" + str(self.X)
                print "Title_starred:" + Title_starred
                print "Title_sticky:" + Title_sticky
                print "Title:" + Title
                # URL of the post content
                print "Title_link:" + soupCurrTr.a['href']
                print "CID:" + par_dict["tid"][0]
                print "Title_ReplyNum:" + Title_ReplyNum
                print "Title_ViewNum:" + Title_ViewNum
                print "Title_Author:" + Title_Author
                print "TitleTime:" + TitleTime[0].string.strip().encode('utf-8')
                """
                # Insert into the database
                self.cursor.execute("insert into title" + str(self.mod) +
                                    " values('%s','%d','%s','%d','%d','%s','%s','%s','%s')" %
                                    (par_dict["tid"][0], self.X, Title, int(Title_ReplyNum),
                                     int(Title_ViewNum), Title_starred, Title_sticky,
                                     Title_Author.decode('utf-8'),
                                     TitleTime[0].string.strip().encode('utf-8')))
                self.conn.commit()
                self._SpiderContent(par_dict["tid"][0])
            except Exception, e:
                if self.log_switch == "on":
                    logapp = Pubclilog()
                    logger, hdlr = logapp.iniLog()
                    logger.info("[Title]" + str(self.X) + '-' + par_dict["tid"][0] + '-' + str(e))
                    hdlr.flush()
                    logger.removeHandler(hdlr)

    #=========================== get posts and replies ===========================
    def _SpiderContent(self, ID, nextpage=None):
        if nextpage == None:
            FIXED_QUERY = 'cmm=' + str(self.X) + '&tid=' + ID + '&ref=regulartopics'
        else:
            FIXED_QUERY = nextpage[9:]
        rd = mechanize.Browser()
        rd.addheaders = [("User-agent", "Tianya/2010 (compatible; MSIE 6.0; Windows NT 5.1)")]
        rd.open(self.CONTENT_URL + FIXED_QUERY)
        self.contentbody = rd.response().read()
        #rd = mechanize.Request(self.CONTENT_URL + FIXED_QUERY)
        #response = mechanize.urlopen(rd)
        #self.contentbody = response.read()
        self.contentsoup = BeautifulSoup(self.contentbody)
        NextPageObj = self.contentsoup("a", {'class': re.compile("fs-paging-item fs-paging-next")})
        try:
            Tp = self.contentsoup("p", {'class': "fs-user-action"})
            i = 0
            for Currp in Tp:
                if i == 0:
                    Ctype = 'y'
                else:
                    Ctype = 'n'
                # Posting time
                soupCurrp = BeautifulSoup(str(Currp))
                PosttimeObj = soupCurrp("span", {'class': "fs-meta"})
                Posttime = PosttimeObj[0].next[1:]
                Posttime = Posttime[0:-3]
                # IP address
                IPObj = soupCurrp("a", {'href': re.compile("CommMsgAddress")})
                if IPObj:
                    IP = IPObj[0].next.strip()
                else:
                    IP = ''
                # Post/reply content
                ContentObj = soupCurrp("p", {'class': "fs-user-action-body"})
                Content = ContentObj[0].renderContents().strip()
                """
                print "ID:" + str(self.X)
                print "ID:" + ID
                print "Ctype:" + Ctype
                print "POSTTIME:" + Posttime
                print "IP:" + IP
                print "Content:" + Content
                """
                self.cursor.execute("insert into content" + str(self.mod) +
                                    " values('%s','%d','%s','%s','%s','%s')" %
                                    (ID, self.X, Ctype, Posttime, IP, Content.decode('utf-8')))
                self.conn.commit()
                i += 1
        except Exception, e:
            if self.log_switch == "on":
                logapp = Pubclilog()
                logger, hdlr = logapp.iniLog()
                logger.info("[Content]" + str(self.X) + '-' + ID + '-' + str(e))
                hdlr.flush()
                logger.removeHandler(hdlr)
        # If there is a link to the next page, traverse it as well
        if NextPageObj:
            NextPageURL = NextPageObj[0]['href']
            self._SpiderContent(ID, NextPageURL)

    def __del__(self):
        try:
            self.cursor.close()
            self.conn.close()
        except Exception, e:
            pass


# Traverse the comm range
def initapp(StartValue, EndValue, log_switch):
    for x in range(StartValue, EndValue):
        app = BaseTySpider(x, log_switch)
        app._SpiderClass()
        app = None

if __name__ == "__main__":
    # Define the command line parameters
    MSG_USAGE = "TySpider.py [-s StartNumber EndNumber] -l [on|off] [-v] [-h]"
    parser = OptionParser(MSG_USAGE)
    parser.add_option("-s", "--set", nargs=2, action="store", dest="comm_value",
                      type="int", default=False,
                      help="configure the board ID value range.".decode('utf-8'))
    parser.add_option("-l", "--log", action="store", dest="log_switch",
                      type="string", default="on",
                      help="error log switch".decode('utf-8'))
    parser.add_option("-v", "--version", action="store_true", dest="verbose",
                      help="show version information".decode('utf-8'))
    opts, args = parser.parse_args()
    if opts.comm_value:
        if opts.comm_value[0] > opts.comm_value[1]:
            print "Is the end value smaller than the initial value?"
            exit()
        if opts.log_switch == "on":
            log_switch = "on"
        else:
            log_switch = "off"
        initapp(opts.comm_value[0], opts.comm_value[1], log_switch)
        exit()
    if opts.verbose:
        print "WebSite Spider V1.0 beta."
        exit()

