The complete crawler source code:
# coding=utf-8
import os
import re
import time

import selenium.webdriver.support.ui as ui
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys

import IniFile
class Weibo:
    """Scrape trending topics from weibo.com using an IE WebDriver.

    Configuration (driver path, per-page delay) is read from ``config.conf``
    in the current working directory via the project-local ``IniFile`` helper.
    """

    def __init__(self):
        # Locate IEDriverServer.exe through the config file.
        # NOTE(review): IniFile.ConfigFile / GetValue are project-local —
        # exact schema not visible here, reconstructed from usage.
        configfile = os.path.join(os.getcwd(), 'config.conf')
        cf = IniFile.ConfigFile(configfile)
        ie_driver_server = cf.GetValue("section", "IEDriverServer")
        # Delay after fetching each page, in seconds; defaults to 5.
        self.pageDelay = 5
        page_interval_delay = cf.GetValue("section", "pageInteralDelay")
        if page_interval_delay:
            self.pageDelay = int(page_interval_delay)
        os.environ["webdriver.ie.driver"] = ie_driver_server
        self.driver = webdriver.Ie(ie_driver_server)

    def printTop(self, topic):
        """Parse one topic element's text, print its fields, and return them.

        :param topic: raw element text; assumed shape (from the parsing
            below — TODO confirm against the live page):
            "<topic>\n@<author> <date/time>\n<likes> <comments> <forwards>"
        :return: dict with keys topic/author/time/likes/comments/forwards
            (new return value; original returned None, so backward-compatible)
        """
        items = topic.split('@')
        ht = items[0].replace('\n', '')
        author_time_nums = items[1].split('\n')
        author_time = author_time_nums[0]
        nums = author_time_nums[1]  # e.g. "100 20 3" (今天 00:14 style dates above)
        # Matches "今天 HH:MM" or "M月D日 HH:MM" — the Chinese date formats Weibo
        # renders; reconstructed from a garbled paste, TODO confirm.
        pattern1 = re.compile(r'今天\s\d{2}:\d{2}|\d{1,2}月\d{1,2}日\s\d{2}:\d{2}')
        time1 = re.findall(pattern1, author_time)
        counts = nums.split(' ')
        info = {
            'topic': ht,
            'author': author_time.split(' ')[0],
            # Guard against no regex match instead of raising IndexError.
            'time': time1[0] if time1 else '',
            'likes': counts[0] if len(counts) > 0 else '',
            'comments': counts[1] if len(counts) > 1 else '',
            'forwards': counts[2] if len(counts) > 2 else '',
        }
        print('topic: %s' % info['topic'])
        print('author: @%s' % info['author'])
        print('time: %s' % info['time'])
        print('likes: %s' % info['likes'])
        print('comments: %s' % info['comments'])
        print('forwards: %s' % info['forwards'])
        print('')
        return info

    def catchData(self, classname, firsturl):
        """Load the home page, wait for topic elements, and print each one.

        :param classname: XPath expression selecting the topic elements
        :param firsturl: home-page URL to load
        """
        # time.clock() was removed in Python 3.8; perf_counter is its successor.
        start = time.perf_counter()
        wait = ui.WebDriverWait(self.driver, 10)
        self.driver.get(firsturl)
        print(self.driver.title)
        # Crude fixed wait for dynamically loaded content — TODO: prefer the
        # explicit `wait.until` below and drop the sleep.
        time.sleep(20)
        wait.until(lambda driver: driver.find_elements_by_xpath(classname))
        elements = self.driver.find_elements_by_xpath(classname)
        try:
            for element in elements:
                print('')
                self.printTop(element.text)
        finally:
            # Always release the browser, even if parsing raises.
            self.driver.close()
            self.driver.quit()
        end = time.perf_counter()
        print("Entire process time: %f seconds" % (end - start))
# Demo: scrape trending Weibo topics from the home page.
if __name__ == "__main__":
    obj = Weibo()
    first_url = "http://weibo.com/?category=0"
    # XPath reconstructed from a garbled paste — 'S_bg2' is Weibo's usual
    # skin class; TODO confirm the exact class list against the live page.
    obj.catchData("//li[@class='pt_li pt_li_1 S_bg2']", first_url)
[Python Crawler] Part 4: Scraping Weibo data with Selenium