Basic modules
Python crawler, web spider. Crawl a Web site to fetch Web page data and extract it for analysis.
The basic examples use the urllib, urllib2 and re modules from the (Python 2) standard library.
Basic usage, Examples:
(1) Make a basic GET request to get the Web page HTML
#!coding=utf-8import urllibimport urllib2 url = ' http://www.baidu.com/' # gets the request requested = Urllib2. Request (URL) Try: # Gets the return response response = Urllib2.urlopen (request) except URLLIB2 according to request. Httperror, E: if Hasattr (E, ' reason '): print e.reason# Read response = bodyhtml () # Read response headersheaders = Response.info ()
(2) Form submission
#!coding=utf-8import urllib2import urllib post_url = ' Post_data = Urllib.urlencode ({ ' username ': ' username ', ' Password ': ' Password ',} ' post_headers = { ' user-agent ': ' mozilla/5.0 (X11; Ubuntu; Linux i686; rv:31.0) gecko/20100101 firefox/31.0 ',} request = Urllib2. Request ( Url=post_url, data=post_data, headers=post_headers,) response = Urllib2.urlopen (Request) HTML = Response.read ()
(3)
#!coding=utf-8 Import Urllib2import Re page_num = 1url = ' http://tieba.baidu.com/p/3238280985?see_lz=1&pn= ' +str ( Page_num) mypage = Urllib2.urlopen (URL). read (). Decode (' GBK ') Myre = Re.compile (R ' class= "D_post_content j_d_post_ Content "> (. *?) ', Re. Dotall) items = Myre.findall (mypage) f = open (' Baidu.txt ', ' A + ') Import sysreload (SYS) sys.setdefaultencoding (' utf-8 ') i = 0texts = []for item in items: i + = 1 print i text = Item.replace ('
', ') text.replace (' \ n ', '). Replace (', ') + ' \ n ' print text f.write (text) f.close ()
(4)
#coding: Utf-8 ' demo login 163 email and download message content ' ' Import urllibimport urllib2import cookielibimport reimport timeimport JSON class Em Ail163:header = {' user-agent ': ' mozilla/5.0 (Windows; U Windows NT 6.1; En-us; rv:1.9.1.6) gecko/20091201 firefox/3.5.6 '} user = ' cookie = none sid = None Mailbaseurl= ' http://twebmail.mail.163.co M ' Def __init__ (self): Self.cookie = Cookielib. Cookiejar () Cookiepro = Urllib2. Httpcookieprocessor (Self.cookie) Urllib2.install_opener (Urllib2.build_opener (cookiepro)) def login (self,user,pwd) : ' Login ' ' postdata = Urllib.urlencode ({' username ': User, ' password ':p wd, ' type ': 1 }) #注意版本不同, the login URL is also different req = Urllib2. Request (url= ' https://ssl.mail.163.com/entry/coremail/fcg/ntesdoor2?funcid=loginone&language=-1& passtype=1&iframe=1&product=mail163&from=web&df=email163&race=-2_45_-2_hz&module=& Uid= ' +user+ ' &style=10&net=t&skinid=null ', data=postdata, heAders=self.header,) res = str (urllib2.urlopen (req). Read ()) #print res Patt = re.compile (' sid= ' ([^ ']+ ') ', re. I) Patt = Patt.search (res) uname = user.split (' @ ') [0] self.user = user if Patt:self.sid = patt.group (1) . Strip () #print self.sid print '%s login successful ... '% (uname) Else:print '%s Login failed ... '% (una ME) def getinbox (self): ' Get mailbox list ' ' print ' \nget mail lists.....\n ' sid = Self.sid URL = self.ma ilbaseurl+ '/jy3/list/list.do?sid= ' +sid+ ' &fid=1&fr=folder ' res = urllib2.urlopen (URL). Read () #获取邮件列表 MailL ist = [] Patt = Re.compile (']+>.*?href= "([^"]+) "[^>]+> (. *?) <\/a>.*?] +>.*?href= "[^>]+> (. *?) <\/a> ', Re. I|re. S) Patt = Patt.findall (res) if Patt==none:return maillist for i in patt:line = {' From ': i[1 ].decode (' UTF8 '), ' URL ': self.mailbaseurl+i[0], ' Subject ': I[2].decode (' UTF8 ')} maillist . 
Append (liNE) return maillist def getmailmsg (self,url): ' Download message content ' ' content= ' ' print ' \ n download.....%s\ n '% (URL) res = urllib2.urlopen (URL). Read () Patt = Re.compile (' contentURL: ' ([^ ']+ ') ', re. I) Patt = Patt.search (res) if Patt==none:return content url = '%s%s '% (Self.mailbaseurl,patt.group (1)) Ti Me.sleep (1) res = Urllib2.urlopen (URL). Read () Djson = json. Jsondecoder (encoding= ' utf8 ') jsonres = Djson.decode (res) if ' resultvar ' in jsonres:content = Djson.decode (res) [' Resultvar '] time.sleep (3) return content ' Demon ' #初始化mail163 = Email163 () #登录mail163. Login (' lpe234@163.com ', ' 944898186 ') time.sleep (2) #获取收件箱elist = Mail163.getinbox () #获取邮件内容for i in Elist:print ' theme:%s from:%s content: \n%s '% (i[' subject '].encode (' UTF8 '), i[' from '].encode (' UTF8 '), mail163.getmailmsg (i[' url ')). Encode (' UTF8 '))
(5) Handling pages that require a login
Processing of #1 Cookies import urllib2, cookielibcookie_support= urllib2. Httpcookieprocessor (Cookielib. Cookiejar ()) opener = Urllib2.build_opener (Cookie_support, Urllib2. HttpHandler) Urllib2.install_opener (opener) content = Urllib2.urlopen (' Http://XXXX '). Read () #2 with proxy and cookie opener = Urllib2.build_opener (Proxy_support, Cookie_support, Urllib2. HttpHandler) #3 processing of the form import urllibpostdata=urllib.urlencode ({' username ': ' XXXXX ', ' Password ': ' XXXXX ', ' Continueuri ': ' http://www.verycd.com/', ' FK ': FK, ' login_submit ': ' Login '} ' req = Urllib2. Request (url = ' http://secure.verycd.com/signin/*/http://www.verycd.com/', data = postdata) result = Urllib2.urlopen ( REQ). Read () #4 disguised as browser access headers = {' user-agent ': ' mozilla/5.0 (Windows; U Windows NT 6.1; En-us; rv:1.9.1.6) gecko/20091201 firefox/3.5.6 '}req = urllib2. Request (url = ' http://secure.verycd.com/signin/*/http://www.verycd.com/', data = postdata, headers = headers) #5 anti-"anti-hotlinking "headers = {' Referer ': ' Http://www.cnbeta.com/articles '}
(6) Multithreading
From threading import threadfrom queue import queuefrom time import Sleep#q is the task queue #num is the total number of concurrent threads #jobs is how many tasks q = queue () NUM = 2JO BS = 10# specific processing function, responsible for handling a single task Def do_somthing_using (arguments): print arguments# This is a worker process, responsible for constantly fetching data from the queue and processing def working (): While True: arguments = Q.get () do_somthing_using (arguments) sleep (1) Q.task_done () #fork Num threads wait for a queue for I in range (NUM): t = Thread (target=working) T.setdaemon (True) T.start () #把JOBS排入队列for I in Range (JOBS): q.put (i) #等待所有JOBS完成q. Join ()
Scrapy Frame
Scrapy is a fast, high-level screen-scraping and web-crawling framework developed in Python, used for crawling web sites and extracting structured data from their pages. It has a wide range of applications: data mining, monitoring, and automated testing.
I have only just started learning this framework, so I cannot judge it thoroughly yet. It just feels like it has a somewhat Java-like flavor and requires too much support from other modules.
(i) Creation of Scrapy project
# Create a scrapy project with: scrapy startproject scrapy_test
├── scrapy_test
│   ├── scrapy.cfg
│   └── scrapy_test
│       ├── __init__.py
│       ├── items.py
│       ├── pipelines.py
│       ├── settings.py
│       └── spiders
│           ├── __init__.py
(ii) Description
scrapy.cfg: project configuration file
items.py: Data structure definition file that needs to be extracted
pipelines.py: Pipeline definition for further processing of data extracted from items, such as preservation, etc.
settings.py: Crawler configuration file
Spiders: The directory where the spider is placed
(iii) Dependency packages
Installing the dependency packages can be somewhat troublesome.
# Python-dev Package Installation apt-get Install Python-dev # twisted, W3lib, six, Queuelib, Cssselect, libxslt pip install W3libpip Insta ll twistedpip install lxmlapt-get install libxml2-dev libxslt-dev apt-get install python-lxmlpip install cssselect pip ins Tall pyopenssl sudo pip install service_identity # Once installed, you can create a project using Scrapy startproject test
(iv) A crawling example.
(1) Create Scrapy Project
dizzy@dizzy-pc:~/python/spit$ scrapy startproject itzhaopin
New Scrapy project 'itzhaopin' created in: /home/dizzy/python/spit/itzhaopin

You can start your first spider with:
    cd itzhaopin
    scrapy genspider example example.com
dizzy@dizzy-pc:~/python/spit$ cd itzhaopin
dizzy@dizzy-pc:~/python/spit/itzhaopin$ tree
.
├── itzhaopin
│   ├── __init__.py
│   ├── items.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       └── __init__.py
└── scrapy.cfg

# scrapy.cfg: project configuration file
# items.py: definition of the data structures to extract
# pipelines.py: pipelines for further processing of extracted items, e.g. saving them
# settings.py: crawler configuration file
# spiders: directory holding the spiders
(2) Define the data structure to be crawled items.py
From Scrapy.item Import Item, field# defines the data we want to crawl class Tencentitem (item): name = field () # job name Catalog = field () # Job category worklocation = field () # duty station recruitnumber = field () # Number of recruits Detaillink = field () # Job Details link publishtim E = Field () # Publish Time
(3) Realization of Spider class
- The spider is a Python class that inherits from scrapy.contrib.spiders.CrawlSpider and has 3 members that must be defined:
- name: the identifier of the spider.
- start_urls: a list of URLs from which the spider starts crawling.
- parse(): a method. When a page from start_urls has been fetched, this method is called to parse the page content; it returns the next pages to crawl, or a list of items.
Create a new spider file under the spiders directory (the simple example below is dmoz_spider.py):
#coding =utf-8 from Scrapy.spider import Basespider class Dmozspider (basespider): name = ' DMOZ ' allowed_ domains = [' dmoz.org '] start_urls = [ ' http://www.dmoz.org/Computers/Programming/Languages/Python/Books/', ' http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/' ] Def parse (self, response): filename = response.url.split ('/') [-2] open (filename, ' WB '). Write (Response.info)
This one is a bit simpler. Run the spider with: scrapy crawl dmoz