Background: a crawler fetches a web page's ad elements, counts how many were found, and periodically sends a monitoring report by e-mail.
#!/usr/bin/env python2.7# -*- coding: utf-8 -*-' @xiayun @[email protected]# Crawl site content, use PHANTOMJS:IP Agent + Modify ua+ dynamic page to execute JS ' from selenium import webdriverfrom Selenium.webdriver.common.desired_capabilities import desiredcapabilitiesimport timeimport urllib,urllib2import smtplibimport refrom email.mime.text import mimetextfrom email.header import headerimport sysdef reptile (): global result, data #proxy_ip. txt is an IP proxy pool, you can crawl IP, or buy, but are not stable, #需要在前面再加一个IP验证程序. ips = [i for i in open ("./proxy_ip.txt", ' R '). ReadLine (). Split (' \ n ') if i] print ips for i in IPS: service_args = [] service_args = ['--proxy-type=http ',] ip_str = ". Join (i) print IP_str proxy_IP = '--proxy=%s ' % IP_str service_args.append (PROXY_IP) dcap = dict (DESIREDCAPABILITIES.PHANTOMJS) #创建UA头 dcap["Phantomjs.page.settings.userAgent"] = (' mozilla/5.0 ([Email protected]; cpu iphone os 9_1 like mac os x) AppleWebKit/601.1.46 (Khtml, like gecko) version/9.0 mobile/13b143 safari/601.1 ') &nbSP; #利用phantomjs仿浏览器动作, parameter 2 is proxy ip driver = webdriver. PHANTOMJS (Desired_capabilities=dcap, service_args=service_args) #设置访问超时时间 driver.implicitly_wait () driver.set_page_load_timeout ( ) Try: driver.get (' web address ') except: print "Timeout" finally: data = driver.page_source time.sleep ( ) req&Nbsp;= r "ad elements" rule1 = Re.compile (req) lists = Re.findall (Rule1, data) counts = len (lists) print counts # print data driver.quit () #判断广告元素是否为22 if counts == 22: print "the webpage is ok!" result = "the webpage is ok! find 22 AD Element! proxy_IP:%s " % IP_str break if counts != 22: #IPS. Remove (i) print "%s is bad!" 
% i.strip () result = "The webpage maybe bad" print "close " #返回结果和网页代码 return result, datadef Send_mail (result,data): receivers = [' [email Protected] '] #接收人 mail_host = ' smtp.exmail.qq.com ' #代理邮箱smtp协议 mail_user = ' [email protected] ' #发送人 mail_ pass = ' xxxx ' #密码 mail_postfix = ' xxxx ' # The Outbox suffix title = str (result) msg = mimetext ( data, ' plain ', ' utf-8 ') #文本格式内容 me = title.decode (' Utf-8 ') + "<" + mail_user + ">" msg[' Subject '] = header (title, ' utf-8 ') msg[' from '] = header (me, ' utf-8 ') msg[' to '] = header (";". Join (receivers), ' utf-8 ') try: &nbSp; s = smtplib. SMTP () s.connect (mail_host) s.login (Mail_user, mail_pass) s.sendmail (Me, Receivers , msg.as_string ()) s.close () print "Send Success" return true except smtplib. smtpexception: print "error: Unable to send mail" return Falseif __name__ == ' __main__ ': while 1: print ' Start ' + ' ' + '. Join (Time.ctime (Time.time ())) result, data = reptile () &nbsP; send_mail (Result=result, data=data) print ' Stop ' + ' ' + '. Join (Time.ctime (Time.time ())) time.sleep ( sys.exit) (0)
This article is from the "Echo Xiayun" blog, so be sure to keep this source http://linuxerxy.blog.51cto.com/10707334/1893893
Python — crawling dynamic-page ad sources with Selenium + PhantomJS