Python: crawling the ad sources of a dynamic page with Selenium + PhantomJS

Source: Internet
Author: User

Background: use a crawler to fetch the ad elements of a web page, monitor how many elements were found, and periodically send a monitoring message with the result.

#!/usr/bin/env python2.7# -*- coding: utf-8 -*-' @xiayun @[email protected]# Crawl site content, use PHANTOMJS:IP Agent + Modify ua+ dynamic page to execute JS ' from selenium import webdriverfrom  Selenium.webdriver.common.desired_capabilities import desiredcapabilitiesimport timeimport  urllib,urllib2import smtplibimport refrom email.mime.text import mimetextfrom  email.header import headerimport sysdef reptile ():     global  result, data     #proxy_ip. txt is an IP proxy pool, you can crawl IP, or buy, but are not stable,      #需要在前面再加一个IP验证程序.     ips = [i for i in open ("./proxy_ip.txt",  ' R '). ReadLine (). Split (' \ n ')  if i]    print ips    for i  in IPS:        service_args = []         service_args = ['--proxy-type=http ',]        ip_str  =  ". Join (i)         print IP_str         proxy_IP =  '--proxy=%s '  % IP_str         service_args.append (PROXY_IP)         dcap  = dict (DESIREDCAPABILITIES.PHANTOMJS)          #创建UA头          dcap["Phantomjs.page.settings.userAgent"] =  (' mozilla/5.0   ([Email protected];         cpu iphone os  9_1 like mac os x)  AppleWebKit/601.1.46  (Khtml, like gecko)          version/9.0 mobile/13b143 safari/601.1 ')         &nbSP; #利用phantomjs仿浏览器动作, parameter 2 is proxy ip        driver = webdriver. PHANTOMJS (Desired_capabilities=dcap, service_args=service_args)           #设置访问超时时间         driver.implicitly_wait ()          driver.set_page_load_timeout (        ) Try:            driver.get (' web address ')          except:             print  "Timeout"         finally:             data = driver.page_source             time.sleep (        )     req&Nbsp;= r "ad elements"             rule1 =  Re.compile (req)             lists =  Re.findall (Rule1, data)             counts  = len (lists)             print  counts            # print data             driver.quit ()               #判断广告元素是否为22              if counts == 22:                 print  "the webpage is ok!"                 result =  "the webpage is ok! 
find 22  AD Element!                 proxy_IP:%s  " % IP_str                 break             if counts != 22:                  #IPS. Remove (i)                  print  "%s is bad!"  % i.strip ()                  result =  "The webpage maybe bad"     print  "close "     #返回结果和网页代码     return result, datadef  Send_mail (result,data):         receivers = [' [email  Protected] ']  #接收人     mail_host =  ' smtp.exmail.qq.com '   #代理邮箱smtp协议     mail_user =  ' [email protected] '   #发送人     mail_ pass =  ' xxxx '    #密码     mail_postfix =  ' xxxx '   # The Outbox suffix     title = str (result)     msg = mimetext ( data,  ' plain ',  ' utf-8 ')    #文本格式内容     me = title.decode (' Utf-8 ')  +  "<"  + mail_user +  ">"     msg[' Subject '] =  header (title,  ' utf-8 ')     msg[' from '] = header (me,  ' utf-8 ')     msg[' to '] = header (";". Join (receivers),  ' utf-8 ')     try:    &nbSp;   s = smtplib. SMTP ()         s.connect (mail_host)          s.login (Mail_user, mail_pass)         s.sendmail (Me, Receivers , msg.as_string ())         s.close ()          print  "Send Success"         return  true    except smtplib. smtpexception:        print  "error:  Unable to send mail"          return Falseif __name__ ==  ' __main__ ':     while 1:        print  ' Start '  +  '   '  +  '. Join (Time.ctime (Time.time ()))         result,  data = reptile ()   &nbsP;     send_mail (Result=result, data=data)          print  ' Stop '  +  '   '  +  '. Join (Time.ctime (Time.time ()))          time.sleep (    sys.exit) (0)


This article is from the "Echo Xiayun" blog; please keep this source link when reposting: http://linuxerxy.blog.51cto.com/10707334/1893893

Python: crawling the ad sources of a dynamic page with Selenium + PhantomJS

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of the page confusing, please write us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support · 6 Free Tickets per Quarter · Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.