<title>Directed crawler Practical notes</title> Directed crawler Practical notes
The flowchart is as follows:
Source code of the "Goddess-Chasing Assistant" v0.1:
# -*- coding: utf8 -*-

import io
import os
import smtplib
import sys
import time
from email.mime.text import MIMEText

import requests
from lxml import etree

# Python 2 only: make implicit str <-> unicode conversions use UTF-8.
# Python 3 has neither reload() as a builtin nor sys.setdefaultencoding().
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding('utf-8')


class mailhelper(object):
    """Send plain-text notification e-mails through an SMTP server.

    The connection settings are placeholders assigned in ``__init__`` and
    must be replaced with real values before use.
    """

    def __init__(self):
        self.mail_host = "Smtp.xxxx.com"  # SMTP server host
        self.mail_user = "xxxx"  # account user name
        self.mail_pass = "xxxx"  # account password
        self.mail_postfix = "Xxxx.com"  # domain suffix of the sender address

    def _build_message(self, to_list, sub, content):
        """Return ``(sender, message)`` for a UTF-8 plain-text e-mail."""
        me = "Xxoohelper" + "<" + self.mail_user + "@" + self.mail_postfix + ">"
        msg = MIMEText(content, _subtype='plain', _charset='utf-8')
        msg['Subject'] = sub
        msg['From'] = me
        # NOTE(review): RFC 5322 separates addresses with commas; the ';'
        # separator is kept exactly as the original listing had it.
        msg['To'] = ";".join(to_list)
        return me, msg

    def send_mail(self, to_list, sub, content):
        """Send ``content`` with subject ``sub`` to every address in ``to_list``.

        Args:
            to_list: list of recipient address strings.
            sub: subject line.
            content: message body text.

        Returns:
            True when the message was handed to the server, False when any
            step (connect, login, send) raised; the error text is printed.
        """
        me, msg = self._build_message(to_list, sub, content)
        server = smtplib.SMTP()
        try:
            server.connect(self.mail_host)
            server.login(self.mail_user, self.mail_pass)
            server.sendmail(me, to_list, msg.as_string())
            return True
        except Exception as e:
            # Best-effort delivery: report the failure and let the caller
            # decide what to do with the False result.
            print(str(e))
            return False
        finally:
            # Release the socket on the failure path as well.
            server.close()

class xxoohelper(object):
    """Log in to weibo.cn and fetch the text of one post from a user page.

    The URLs, account and password below are placeholders that must be
    replaced before use.
    """

    # File that stores every post already reported, one per line.
    save_path = 'weibo.txt'

    def __init__(self):
        self.url = 'http://weibo.cn/u/xxxxxxx'  # Weibo page to crawl
        self.url_login = 'https://login.weibo.cn/login/'
        self.new_url = self.url_login

    def getsource(self):
        """Download ``self.url`` and return the raw response body."""
        html = requests.get(self.url).content
        return html

    def getData(self, html):
        """Build the login form payload from the login page ``html``.

        Reads the dynamic password field name, the ``vk`` token and the form
        action from the page, stores the resulting post URL in
        ``self.new_url`` and returns the form fields as a dict.

        Raises:
            IndexError: if the page lacks any of the three expected elements.
        """
        selector = etree.HTML(html)
        password = selector.xpath('//input[@type="password"]/@name')[0]
        vk = selector.xpath('//input[@name="vk"]/@value')[0]
        action = selector.xpath('//form[@method="post"]/@action')[0]
        self.new_url = self.url_login + action
        # NOTE(review): the casing of the field names and the two u'' values
        # was corrupted in the listing this code was recovered from; confirm
        # them against the live login form before relying on this payload.
        data = {
            'mobile': '[email protected]',  # login account placeholder
            password: 'xxxxxx',  # the field name is generated per page load
            'remember': 'on',
            'backURL': 'http://weibo.cn/u/xxxxxx',  # change to the Weibo page
            'backTitle': u'Micro blog',
            'tryCount': '',
            'vk': vk,
            'submit': u'login',
        }
        return data

    def getcontent(self, data):
        """Post the login form and return post text followed by its timestamp.

        Raises:
            IndexError: if the response has fewer than three ``ctt`` spans or
                no ``ct`` span.
        """
        text_type = type(u'')  # unicode on Python 2, str on Python 3
        newhtml = requests.post(self.new_url, data=data).content
        new_selector = etree.HTML(newhtml)
        content = new_selector.xpath('//span[@class="ctt"]')
        # The third "ctt" span is used; presumably the first two are not
        # posts -- TODO confirm against the page layout.
        newcontent = text_type(content[2].xpath('string(.)')).replace('http://', '')
        sendtime = new_selector.xpath('//span[@class="ct"]/text()')[0]
        sendtext = newcontent + sendtime
        return sendtext

    def tosave(self, text):
        """Append ``text`` as one line to the history file."""
        with io.open(self.save_path, 'a', encoding='utf-8') as f:
            f.write(text + u'\n')

    def tocheck(self, data):
        """Return True when ``data`` is not yet a line of the history file.

        A missing history file counts as "not seen yet".
        """
        if not os.path.exists(self.save_path):
            return True
        with io.open(self.save_path, 'r', encoding='utf-8') as f:
            existweibo = f.readlines()
        return (data + u'\n') not in existweibo

if __name__ == '__main__':
    # Poll the Weibo page forever; mail and record each post not seen before.
    # NOTE(review): the original sleep value was lost in the listing this
    # script was recovered from; 30 seconds is a placeholder -- confirm.
    POLL_INTERVAL_SECONDS = 30
    mailto_list = ['[email protected]']  # recipient mailbox addresses
    helper = xxoohelper()
    while True:
        source = helper.getsource()
        data = helper.getData(source)
        content = helper.getcontent(data)
        if helper.tocheck(content):
            if mailhelper().send_mail(mailto_list, u"Goddess Update", content):
                print(u"Send Success")
            else:
                print(u"Send Failed")
            # Record the post even when mailing failed, so it is not retried.
            helper.tosave(content)
            print(content)
        else:
            print(u'pass')
        time.sleep(POLL_INTERVAL_SECONDS)
Directed crawler Practical notes