Python web crawler (web spider): crawl a website to fetch its page data, then extract the parts you need for analysis.
The basic examples use the standard-library modules urllib, urllib2, re, and a few others (these are Python 2 modules).
(i) Basic usage, examples
(1) Make a basic GET request to fetch a web page's HTML
# coding=utf-8
"""Fetch a web page with a plain GET request (Python 2, ``urllib2``).

On success ``html`` holds the response body and ``headers`` the response
headers; on failure both stay ``None`` and the error reason is printed.
"""
import urllib
import urllib2

url = 'http://www.baidu.com/'

# Build the request object first so headers/data could be attached later.
request = urllib2.Request(url)

html = None
headers = None
try:
    response = urllib2.urlopen(request)
except urllib2.URLError as e:
    # HTTPError is a subclass of URLError, so this one handler covers both
    # HTTP error statuses and connection-level failures.
    print(getattr(e, 'reason', e))
else:
    # Only touch the response when the request actually succeeded; reading
    # it after a handled error would fail because `response` is unbound.
    try:
        headers = response.info()   # response headers
        html = response.read()      # response body
    finally:
        response.close()
(2) Form submission
# coding=utf-8
"""Submit a form with a POST request (Python 2, ``urllib`` / ``urllib2``).

Passing ``data`` to ``urllib2.Request`` is what turns the request into a
POST; the body is the URL-encoded form fields.
"""
import urllib2
import urllib

# Left blank in the listing: set this to the form's action URL before running.
post_url = ''

# Form fields, URL-encoded into the request body.
post_data = urllib.urlencode({
    'username': 'username',
    'password': 'password',
})

# Present a browser-like User-Agent with the request.
post_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0',
}

request = urllib2.Request(
    url=post_url,
    data=post_data,
    headers=post_headers,
)

response = urllib2.urlopen(request)
try:
    html = response.read()
finally:
    # Release the connection even if reading the body fails.
    response.close()
(3) Scrape the post contents from a Baidu Tieba thread page and save them to a file
# coding=utf-8
"""Scrape the post bodies from one page of a Baidu Tieba thread.

Targets Python 2 (``urllib2``). Every post body matched on the page is
printed and appended to ``baidu.txt`` as UTF-8 text, one post per line.
"""
import io
import re
import urllib2

page_num = 1
url = 'http://tieba.baidu.com/p/3238280985?see_lz=1&pn=' + str(page_num)

# Decode the raw bytes as GBK once, at the I/O boundary, and work with
# unicode text from here on.
mypage = urllib2.urlopen(url).read().decode('gbk')

# Capture everything between the post-content <div ...> and its closing tag.
# NOTE(review): the class attribute was garbled in the listing; this is the
# most plausible reconstruction -- confirm against the live page markup.
myre = re.compile(r'class="d_post_content j_d_post_content ">(.*?)</div>', re.DOTALL)
items = myre.findall(mypage)

# An explicit file encoding replaces the reload(sys)/setdefaultencoding hack,
# and the context manager closes the file even if a write fails.
with io.open('baidu.txt', 'a+', encoding='utf-8') as f:
    for i, item in enumerate(items, 1):
        print(i)
        text = item.replace('<br>', '')
        # str.replace returns a new string, so the result must be assigned
        # back; the listing discarded it and the cleanup never took effect.
        text = text.replace('\n', '').replace(' ', '') + '\n'
        print(text)
        f.write(text)
(4) Log in to a 163.com mailbox and download its messages
# coding: utf-8
"""Demo: log in to a 163.com mailbox and download its messages.

Targets Python 2 (``urllib``, ``urllib2``, ``cookielib``).
"""
import urllib
import urllib2
import cookielib
import re
import time
import json


class email163:
    """Small 163.com webmail client built on ``urllib2``.

    Usage: create an instance, call ``login``, then ``getinbox`` to list
    messages and ``getmailmsg`` to fetch the content of one message.
    """

    # Browser-like User-Agent sent with the login request.
    header = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    user = ''
    cookie = None
    # Session id extracted from the login response; None until login succeeds.
    sid = None
    mailbaseurl = 'http://twebmail.mail.163.com'

    def __init__(self):
        """Install a process-wide urllib2 opener that stores cookies in ``self.cookie``."""
        self.cookie = cookielib.CookieJar()
        cookiepro = urllib2.HTTPCookieProcessor(self.cookie)
        # install_opener is global: every later urllib2.urlopen call in this
        # process shares the cookie jar, which is how the session is kept.
        urllib2.install_opener(urllib2.build_opener(cookiepro))

    def login(self, user, pwd):
        """Log in as ``user`` with password ``pwd``.

        On success the session id is stored in ``self.sid``; on failure
        ``self.sid`` is reset to ``None``. A status line is printed either way.
        """
        postdata = urllib.urlencode({
            'username': user,
            'password': pwd,
            'type': 1,
        })
        # Note: the login URL differs between webmail versions.
        req = urllib2.Request(
            url='https://ssl.mail.163.com/entry/coremail/fcg/ntesdoor2?funcid=loginone&language=-1&passtype=1&iframe=1&product=mail163&from=web&df=email163&race=-2_45_-2_hz&module=&uid=' + user + '&style=10&net=t&skinid=null',
            data=postdata,
            headers=self.header,
        )
        res = str(urllib2.urlopen(req).read())

        # NOTE(review): the character class was garbled in the listing;
        # "up to the next double quote" is the assumed reconstruction.
        patt = re.compile(r'sid=([^"]+)', re.I)
        found = patt.search(res)

        uname = user.split('@')[0]
        self.user = user
        if found:
            self.sid = found.group(1).strip()
            print('%s login successful ... ' % (uname))
        else:
            # Drop any session id left over from an earlier successful login.
            self.sid = None
            print('%s login failed ... ' % (uname))

    def getinbox(self):
        """Return the inbox as a list of dicts with keys ``from``, ``url``, ``subject``.

        Returns an empty list when there is no session id (not logged in)
        or when no message rows are found in the page.
        """
        print('\nget mail lists.....\n')
        if not self.sid:
            # Without a session id the list URL cannot be built.
            return []
        url = self.mailbaseurl + '/jy3/list/list.do?sid=' + self.sid + '&fid=1&fr=folder'
        res = urllib2.urlopen(url).read()

        # Groups: (1) message href, (2) sender text, (3) subject text.
        patt = re.compile(
            r'<div\s+class="tdLike Ibx_Td_From"[^>]+>.*?href="([^"]+)"[^>]+>(.*?)<\/a>'
            r'.*?<div\s+class="tdLike Ibx_Td_Subject"[^>]+>.*?href="[^>]+>(.*?)<\/a>',
            re.I | re.S,
        )
        # findall always returns a list (possibly empty), never None, so no
        # separate "no match" check is needed.
        return [
            {
                'from': sender.decode('utf8'),
                'url': self.mailbaseurl + href,
                'subject': subject.decode('utf8'),
            }
            for href, sender, subject in patt.findall(res)
        ]

    def getmailmsg(self, url):
        """Download and return the content of the message page at ``url``.

        Returns an empty string when the page carries no content URL or the
        JSON reply has no content field.
        """
        content = ''
        print('\n download.....%s\n' % (url))
        res = urllib2.urlopen(url).read()

        patt = re.compile(r'contentURL:"([^"]+)"', re.I)
        found = patt.search(res)
        if found is None:
            return content

        url = '%s%s' % (self.mailbaseurl, found.group(1))
        time.sleep(1)  # pause between requests
        res = urllib2.urlopen(url).read()

        djson = json.JSONDecoder(encoding='utf8')
        jsonres = djson.decode(res)  # decode once and reuse the result
        # NOTE(review): key casing was mangled in the listing; 'resultVar'
        # is the assumed original -- confirm against a real response.
        if 'resultVar' in jsonres:
            content = jsonres['resultVar']
        time.sleep(3)  # pause before the caller fetches the next message
        return content


# Demo
if __name__ == '__main__':
    # Initialise
    mail163 = email163()
    # Log in. The address in the listing was redacted and a password was
    # embedded in the source; use your own credentials here and never commit
    # real ones.
    mail163.login('your_name@163.com', 'your_password')
    time.sleep(2)

    # Fetch the inbox
    elist = mail163.getinbox()

    # Fetch and print each message
    for mail in elist:
        print('theme:%s from:%s content: \n%s' % (
            mail['subject'].encode('utf8'),
            mail['from'].encode('utf8'),
            mail163.getmailmsg(mail['url']).encode('utf8'),
        ))