Fetching Baidu Browsing History with Python

Detailed example of simulating Baidu login with Python

http://www.jb51.net/article/78406.htm

Python Practical Plan, learning assignment 2-1

http://blog.csdn.net/python012/article/details/53344501

I started from the Baidu login code found online (linked above): first obtain a token, then log in with the username and password to get a cookie, and finally request the browsing history.

Sometimes this works and sometimes it does not, returning an empty list. Comparing the cookies shows that the failing sessions are missing entries such as Hm_lvt_*.

This may be related to the captcha Baidu requires when switching accounts. I then tried to modify the cookie directly (a sketch of building a cookie by hand follows below), but neither of these two cookie classes is easy to edit:

SimpleCookie

MozillaCookieJar
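
For what it's worth, the script below references a make_cookie helper in a commented-out line but never defines it. A minimal sketch of such a helper (Python 2, cookielib); the cookie name, value and domain here are placeholders, not values Baidu is known to expect:

# Minimal sketch: build a cookielib.Cookie by hand and add it to the jar.
# Name, value and domain are hypothetical placeholders.
import cookielib

def make_cookie(name, value, domain='.baidu.com'):
    return cookielib.Cookie(
        version=0, name=name, value=value,
        port=None, port_specified=False,
        domain=domain, domain_specified=True, domain_initial_dot=True,
        path='/', path_specified=True,
        secure=False, expires=None, discard=False,
        comment=None, comment_url=None, rest={})

cj = cookielib.MozillaCookieJar('cookie.txt')
cj.set_cookie(make_cookie('Hm_lvt_example', '1482059818'))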

# -*- coding: utf8 -*-
import urllib2
import urllib
import cookielib
import re
import bs4
import json
import time
import Cookie
import random
import datetime
#import syslog
#import requests

URL_BAIDU_INDEX = u'http://www.baidu.com/'
# https://passport.baidu.com/v2/api/?getapi&class=login&tpl=mn&tangram=true also works
URL_BAIDU_TOKEN = 'https://passport.baidu.com/v2/api/?getapi&tpl=pp&apiver=v3&class=login'
URL_BAIDU_LOGIN = 'https://passport.baidu.com/v2/api/?login'
SAVE_FILE = 'D:\\bduhis.txt'
SAVE_JFILE = 'D:\\json.txt'
SAVE_CFILE = 'D:\\cookie.txt'

# username and password
username = ''
password = ''

# set up the cookie jar; it manages cookies automatically, no manual handling needed
#cj = cookielib.CookieJar()
filename = 'cookie.txt'
cj = cookielib.MozillaCookieJar(SAVE_CFILE)
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
urllib2.install_opener(opener)
#print cj
reqReturn = urllib2.urlopen(URL_BAIDU_INDEX)
#cj.set_cookie(make_cookie('testname', 'testvalue'))
'''changing the cookie this way did not work
c = Cookie.SimpleCookie()
c["Manageopen"] = "cards"
c['Manageopen']['expires'] = 0
c['Manageopen']['path'] = "/"
c['Manageopen']['domain'] = ".domain.com"
c['Manageopen']['secure'] = ""
cj.set_cookie(c["Manageopen"])
'''
print cj
cj.save(ignore_discard=True, ignore_expires=False)

# get the token
tokenReturn = urllib2.urlopen(URL_BAIDU_TOKEN)
matchVal = re.search(u'"token" : "(?P<tokenVal>.*?)"', tokenReturn.read())
tokenVal = matchVal.group('tokenVal')

# build the login request parameters; the fields were obtained by packet capture
# and correspond to the https://passport.baidu.com/v2/api/?login request
postData = {
    'username': username,
    'password': password,
    'u': 'https://passport.baidu.com/',
    'tpl': 'pp',
    'token': tokenVal,
    'staticpage': 'https://passport.baidu.com/static/passpc-account/html/v3Jump.html',
    'isPhone': 'false',
    'charset': 'utf-8',
    'callback': 'parent.bd__pcbs__ra48vi'
}
postData = urllib.urlencode(postData)

# send the login request
loginRequest = urllib2.Request(URL_BAIDU_LOGIN, postData)
loginRequest.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
loginRequest.add_header('Accept-Encoding', 'gzip,deflate,sdch')
loginRequest.add_header('Accept-Language', 'zh-CN,zh;q=0.8')
loginRequest.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36')
loginRequest.add_header('Content-Type', 'application/x-www-form-urlencoded')
sendPost = urllib2.urlopen(loginRequest)

# visit the Tieba personal homepage to check whether the login succeeded;
# since the cookies are managed automatically, this is straightforward
# http://tieba.baidu.com/home/main?un=XXXX&fr=index is the Tieba personal homepage; links to everything can be found there
#teibaUrl = 'http://tieba.baidu.com/f/like/mylike?v=1387441831248'
# http://i.baidu.com/my/history
# http://map.baidu.com/
# http://map.baidu.com/?newmap=1&reqflag=pcmap&biz=1&from=webmap&da_par=direct&pcevaname=pc3&qt=fav&mode=get&type=favdata&limit=100&lastver=0&t=1481726657277
#teibaUrl = 'http://i.baidu.com/my/historylogin'
teibaUrl = 'http://i.baidu.com/login/historyCheck/?autoLogin=true'
content = urllib2.urlopen(teibaUrl).read()
#print content
teibaUrl = 'http://i.baidu.com/history/list'
content = urllib2.urlopen(teibaUrl).read()
content = content.decode('utf-8').encode('GB18030')
print content
teibaUrl = 'http://map.baidu.com/?qt=ssn&t=1482059818916'
content2 = urllib2.urlopen(teibaUrl).read()
content2 = content2.decode('utf-8').encode('GB18030')
print content2

'''1. save to html file
def cbk(a, b, c):
    # download callback
    # a: blocks downloaded so far
    # b: block size
    # c: size of the remote file
    per = 100.0 * a * b / c
    if per > 100:
        per = 100
    print '%.2f%%' % per
urllib.urlretrieve('http://www.cmfish.com/bbs/forum.php', 'D:\\baidu1.html', cbk)
'''

def save(filename, contents):
    fh = open(filename, 'w')
    fh.write(contents)
    fh.close()

'''2. save to txt file
t = json.dumps(content, ensure_ascii=False)
hjson = json.loads(content, encoding='utf-8')
#t2 = content.decode('utf-8')
#print hjson['data']['list'][0]['query']
print hjson
arr = hjson['data']['list']
tdata = 'Start----------------------------\nTime:' + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) + ' Total:' + str(len(arr)) + ':\n'
for elem in arr:
    data = str(elem['ts']) + ',' + elem['query'] + ',' + str(len(elem['clicks']))
    if len(elem['clicks']) > 0:
        try:
            for cd in elem['clicks']:
                data = data + ',' + str(cd['title']) + ',' + str(cd['url'])
        except Exception, e:
            tdata = tdata + 'Error:' + str(e) + '\n'
    tdata = tdata + data + '\n'
    print tdata
tdata = tdata + 'End----------------------------\n'
save(SAVE_FILE, tdata.encode('gbk'))
'''

# note: content was re-encoded to GB18030 above for console printing, so the
# json.loads(..., encoding='utf-8') call below can fail on non-ASCII text
hjson = json.loads(content, encoding='utf-8')
save(SAVE_JFILE, content.encode('utf-8'))

# 3. save to mongodb
from pymongo import MongoClient
client = MongoClient('127.0.0.1', 27017)
db = client["Collections"]  # database name
table = db['his']           # collection name
table.save(hjson)
hjson2 = json.loads(content2, encoding='utf-8')
table.save(hjson2)
#table.insert({'id':'1','name':'cnki'})

'''
# parse the data with BeautifulSoup4; it does not feel as nice as jsoup
soup = bs4.BeautifulSoup(content)
#print soup.prettify()
list = soup.findAll('a', attrs={"href": re.compile(r"^http:")})
#list = soup.findAll(name='a', attrs={'href': re.compile(r"kw="), 'title': re.compile(r".")})
list = list[1:len(list)]
careTeibalist = []
print 'Tieba link\tBar name\tLevel'
print len(list)
for elem in list:
    soup1 = bs4.BeautifulSoup(str(elem))
    print 'http://tieba.baidu.com/' + elem['href'] + '\\' + elem['title']
'''
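
To check which cookies actually ended up in the jar (for example whether the Hm_lvt_* entries mentioned above are present), the jar can be dumped after the login step. A minimal sketch, assuming cj is the MozillaCookieJar created in the script above:

# Minimal sketch: list every cookie currently held by the jar.
# Run after the login request; compare the output of a working and a failing session.
for ck in cj:
    print ck.domain, ck.name, ck.value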


Only then did I discover that the requests module makes it easy to set headers: log in to Baidu in a browser, copy the cookie from there into the request header, and the correct list comes back.
The resulting JSON is saved in MongoDB.

# -*- coding: utf8 -*-
import urllib2
import urllib
import cookielib
import re
import bs4
import json
import time
import Cookie
import random
import datetime
import requests
#import syslog

url = 'http://i.baidu.com/history/list'
paras = {'channel': '201,202,300,301,302,303,400,100,500,501,505', 'ts': 1475402100, 'after': 0, 'page': 25, 'query': ''}
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) ' +
                  'Chrome/54.0.2840.87 Safari/537.3',
    'Cookie': '...'  # paste the cookie copied from a logged-in browser session here
}
#data = requests.get(url, headers=headers)
data = requests.post(url, data=paras, headers=headers)
content = data.text
print content
#hjson = json.loads(content, encoding='utf-8')
#save(SAVE_JFILE, content.encode('utf-8'))

# 3. save to mongodb
from pymongo import MongoClient
client = MongoClient('127.0.0.1', 27017)
db = client["Collections"]  # database name
table = db['history']       # collection name
hjson = json.loads(content, encoding='utf-8')
arr = hjson['data']['list']
print len(arr)
table.save(hjson)
cnt = 0
#table.insert({'id':'1','name':'cnki'})

# page backwards through the history: each response's data.ts_start becomes the next 'ts' parameter
while (len(arr) > 0):
    cnt = cnt + len(arr)
    tdate = hjson['data']['ts_start']
    table.save(hjson)
    paras = {'channel': '201,202,300,301,302,303,400,100,500,501,505', 'ts': tdate, 'after': 0, 'page': 25, 'query': ''}
    data = requests.post(url, data=paras, headers=headers)
    content = data.text
    print content.encode('gbk')
    hjson = json.loads(content, encoding='utf-8')
    arr = hjson['data']['list']
    print tdate
    print cnt

# record when the dump finished and where it stopped
table = db['ts']  # collection name
table.insert({'name': 'test', 'time': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), 'ts_start': tdate})
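
To verify what was stored, the pages can be read back from MongoDB. A minimal sketch, assuming the documents were saved into db['history'] exactly as returned by http://i.baidu.com/history/list (entries under data.list with the ts and query fields used above):

# Minimal sketch: read the stored pages back and print each history entry.
from pymongo import MongoClient
import time

client = MongoClient('127.0.0.1', 27017)
db = client['Collections']
for page in db['history'].find():
    for item in page.get('data', {}).get('list', []):
        ts = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(item['ts']))
        print ts, item['query']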




