Python-based crawler source code
Writing a crawler is a complex, noisy, and repetitive task. It has to balance collection efficiency, link exception handling, and data quality (which is closely tied to each site's HTML conventions). Below is a crawler I organized and wrote myself: a single server can run 1 to 8 instances collecting at the same time, and the collected data is then stored in a database.
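To illustrate the multi-instance setup, here is a minimal launcher sketch. It is not from the original article; the script name TySpider.py and the --id/--log options are assumptions that match the hypothetical entry point shown after the listing:

# Minimal launcher sketch (assumed, not from the original source): start
# several crawler instances on one server, each from a different comm id
# so the workload is partitioned. INSTANCES may be anywhere from 1 to 8.
import subprocess

INSTANCES = 8
procs = [subprocess.Popen(["python", "TySpider.py", "--id", str(i), "--log", "on"])
         for i in range(INSTANCES)]
for p in procs:
    p.wait()  # wait for every instance to finish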
#!/usr/local/bin/python
# -*- coding: UTF-8 -*-
import sys, time, os, string
import mechanize
import urlparse
from BeautifulSoup import BeautifulSoup
import re
import MySQLdb
import logging
import cgi
from optparse import OptionParser
#-----------------------------------------------------------------------------
# Name:        TySpider.py
# Purpose:     WebSite Spider Module
# Author:      liutiansi
# Email:       liutiansi@gamil.com
# Created:     2010/02/16
# Copyright:   (c) 2010
#-----------------------------------------------------------------------------

"""
|--------------------------------------------------------------------
| Define the logging class;
|--------------------------------------------------------------------
| Function: records system-related log information.
|
"""
class Pubclilog():
    def __init__(self):
        self.logfile = 'website_log.txt'

    def iniLog(self):
        # Build a logger that writes to both the log file and the console.
        logger = logging.getLogger()
        filehandler = logging.FileHandler(self.logfile)
        streamhandler = logging.StreamHandler()
        fmt = logging.Formatter('%(asctime)s, %(funcName)s, %(message)s')
        filehandler.setFormatter(fmt)
        streamhandler.setFormatter(fmt)
        logger.setLevel(logging.DEBUG)
        logger.addHandler(filehandler)
        logger.addHandler(streamhandler)
        return [logger, filehandler]


"""
|--------------------------------------------------------------------
| Define the TySpider class;
|--------------------------------------------------------------------
| Function: captures information such as category and title.
|
"""
class BaseTySpider:
    # Initialize related members
    def __init__(self, x, log_switch):
        # Database connection
        self.conn = MySQLdb.connect(db='dbname', host='192.168.0.10',
                                    user='dbuser', passwd='sdflkj934y5jsdgfjh435',
                                    charset='utf8')
        # Category and title page
        self.CLASS_URL = 'http://test.abc.com/aa/CommTopicsPage?'
        # Post and reply page
        self.Content_URL = 'http://test.bac.com/aa/CommMsgsPage?'
        # Starting comm value
        self.x = x
        # Modulo of the current comm id, used to spread rows evenly across tables
        self.mod = self.x % 5
        # Community page body
        self.body = ""
        # BeautifulSoup object for self.body
        self.soup = None
        # Content page body
        self.contentbody = ""
        # BeautifulSoup object for self.contentbody
        self.contentsoup = None
        # Log switch
        self.log_switch = log_switch

    #================ Get the name and classification ================
    def _SpiderClass(self, nextpage=None):
        if nextpage is None:
            FIXED_QUERY = 'cmm=' + str(self.x)
        else:
            FIXED_QUERY = nextpage[1:]
        try:
            rd = mechanize.Browser()
            rd.addheaders = [("User-agent",
                              "Tianya/2010 (compatible; MSIE 6.0; Windows NT 5.1)")]
            rd.open(self.CLASS_URL + FIXED_QUERY)
            self.body = rd.response().read()
            # rd = mechanize.Request(self.CLASS_URL + FIXED_QUERY)
            # response = mechanize.urlopen(rd)
            # self.body = response.read()
        except Exception, e:
            # On a fetch error, log it (if logging is on) and give up on this page.
            if self.log_switch == "on":
                logapp = Pubclilog()
                logger, hdlr = logapp.iniLog()
                logger.info(self.CLASS_URL + FIXED_QUERY + str(e))
                hdlr.flush()
                logger.removeHandler(hdlr)
            return
        self.soup = BeautifulSoup(self.body)
        # Link to the next page of the listing, if any.
        NextPageObj = self.soup("a", {'class': re.compile("fs-paging-item fs-paging-next")})
        self.cursor = self.conn.cursor()
        if nextpage is None:
            try:
                Ttag = str(self.soup.table)
                # print Ttag
                """
                ------------------ analysis structure -----------------
                <table cellspacing="0" cellpadding="0">
                  <tr>
                    <td>
                """
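The listing breaks off mid-method in the source. Given the OptionParser import and the BaseTySpider(x, log_switch) constructor, a plausible entry point might look like the sketch below; the option names and defaults are assumptions, not part of the original code:

# Hypothetical entry point (not in the original source); the option names
# are assumptions inferred from the OptionParser import and the constructor.
if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option("-i", "--id", dest="comm_id", type="int", default=0,
                      help="starting comm id for this instance")
    parser.add_option("-l", "--log", dest="log_switch", default="off",
                      help="log switch: 'on' writes fetch errors to website_log.txt")
    options, args = parser.parse_args()
    spider = BaseTySpider(options.comm_id, options.log_switch)
    spider._SpiderClass()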