Retrieving Baidu browsing history with Python

Source: Internet
Author: User
Tags mongoclient mongodb syslog

Python simulation Baidu Login instance detailed

Http://www.jb51.net/article/78406.htm

Python Combat Plan Learning assignment 2-1

http://blog.csdn.net/python012/article/details/53344501

The code below is adapted from an online Baidu login example: it first obtains a token, then logs in with the username and password to acquire cookies, and finally uses those cookies to fetch the browsing history.

However, it only works intermittently — sometimes it returns an empty list. Comparing the requests suggests this happens because cookies such as hm_lvt_* are missing.

This may be related to the captcha that appears when switching accounts. I then tried to modify the cookies directly, but both of the following cookie classes proved hard to manipulate:

Cookie.SimpleCookie

cookielib.MozillaCookieJar

#-*-Coding:utf8-*-import urllib2 import urllib import cookielib import re import BS4 import JSON import time import C
Ookie Import Random import datetime #import syslog #import requests url_baidu_index = U ' http://www.baidu.com/'; #https://passport.baidu.com/v2/api/?getapi&class=login&tpl=mn&tangram=true can also use this URL_BAIDU_TOKEN = '
Https://passport.baidu.com/v2/api/?getapi&tpl=pp&apiver=v3&class=login ';
Url_baidu_login = ' Https://passport.baidu.com/v2/api/?login ';
Save_file = ' d:\\bduhis.txt ';
Save_jfile = ' d:\\json.txt ';
Save_cfile = ' d:\\cookie.txt ';
#设置用户名, password username = ';
Password = '; #设置cookie, this cookiejar can be automatically managed without manually specifying #cj = Cookielib.
Cookiejar (); filename = ' cookie.txt ' CJ = cookielib.

Mozillacookiejar (Save_cfile); Opener = Urllib2.build_opener (urllib2.
Httpcookieprocessor (CJ));
Urllib2.install_opener (opener);
#print CJ;
Reqreturn = Urllib2.urlopen (Url_baidu_index); #cj. Set_cookie (Make_cookie (' testname ', ' TestValue ')) ' Change cookie is unsuccessful C=cookie.simplecookie();
c["Manageopen"]= "cards";
c[' Manageopen ' [' expires '] = 0;
c[' manageopen ' [' path '] = '/';
c[' manageopen ' [' domain '] = '. domain.com ';
c[' Manageopen ' [' secure '] = ' ";
Cj.set_cookie (c["Manageopen"]);
"' Print CJ;
Cj.save (Ignore_discard=true, Ignore_expires=false) #获取token, Tokenreturn = Urllib2.urlopen (Url_baidu_token); Matchval = Re.search (U ' "token": "(?)
p<tokenval>.*) "', Tokenreturn.read ());
Tokenval = Matchval.group (' Tokenval '); #构造登录请求参数, the request data is obtained by grasping the packet, corresponding to Https://passport.baidu.com/v2/api/?login request PostData = {' username ': username, ' password ': Password, ' u ': ' https://passport.baidu.com/', ' TPL ': ' PP ', ' token ': tokenval, ' staticpage ': ' https://passport.baidu.co M/static/passpc-account/html/v3jump.html ', ' Isphone ': ' false ', ' CharSet ': ' Utf-8 ', ' callback ': ' Parent.bd__pcbs__
Ra48vi '};
PostData = Urllib.urlencode (postdata); #发送登录请求 loginrequest = urllib2.
Request (Url_baidu_login,postdata); Loginrequest.add_header (' Accept ', ' text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8 ');
Loginrequest.add_header (' accept-encoding ', ' gzip,deflate,sdch ');
Loginrequest.add_header (' accept-language ', ' zh-cn,zh;q=0.8 '); Loginrequest.add_header (' user-agent ', ' mozilla/5.0 (Windows NT 6.1;
WOW64) applewebkit/537.36 (khtml, like Gecko) chrome/28.0.1500.72 safari/537.36 ');

Loginrequest.add_header (' Content-type ', ' application/x-www-form-urlencoded ');
Sendpost = Urllib2.urlopen (loginrequest); #查看贴吧个人主页, test whether the landing success, because the cookie automatic management, here to deal with a lot of convenience #http://tieba.baidu.com/home/main?un=xxxx&fr=index This is the Post bar Personal homepage, All information can be found in this link #teibaUrl = ' http://tieba.baidu.com/f/like/mylike?v=1387441831248 ' # http://i.baidu.com/my/history # http://map.baidu.com/#http://map.baidu.com/?newmap=1&reqflag=pcmap&biz=1&from=webmap&da_par= direct&pcevaname=pc3&qt=fav&mode=get&type=favdata&limit=100&lastver=0&t= 1481726657277 #teibaUrl = ' http://i.baidu.com/my/historylogin ' Teibaurl = ' http://i.baidu.com/login/historyCheck/? Autologin=true ' content = Urllib2.urlopeN (teibaurl). read ();
#print content;
Teibaurl = ' http://i.baidu.com/history/list ' content = Urllib2.urlopen (teibaurl). read ();
Content = Content.decode (' Utf-8 '). Encode (' GB18030 ');
Print content;
Teibaurl = ' http://map.baidu.com/?qt=ssn&t=1482059818916 ' Content2 = Urllib2.urlopen (Teibaurl). read ();
Content2 = Content2.decode (' Utf-8 '). Encode (' GB18030 ');
Print Content2;  
	' 1. Save to HTML file Def CBK (A, B, c): #回调函数 #@a: Data blocks that have been downloaded #@b: size of the data block #@c: Size of the remote file per = 100.0 * A * b/c if > 100:per = print '%.2f%% '% per urllib.urlretrieve (' http://www.cmfish.com/bbs/forum.php ', ' d:\\b
Aidu1.html ', CBK); ' Def save (filename, contents): FH = open (filename, ' W ') fh.write (contents) fh.close () ' 2.
Save to TXT file T = json.dumps (content, ensure_ascii=false);
Hjson = json.loads (content, encoding= ' utf-8 ');
#t2 =content.decode (' utf-8 ');
#print hjson[' data '] [' list '][0][' query '];
Print Hjson; arr = hjson[' data ' [' list '] tdata = ' Start----------------------------\ntime: ' +time.strftime ('%y-%m-%d%h:%m:%s ', Time.localtime (Time.time ())) + ' total: ' +str (len (arr)) + ': \ n ';
	For elem in arr:data = str (elem[' ts ']) + ', ' +elem[' query ']+ ', ' +str (len (elem[' clicks ')); If Len (elem[' clicks ']) >0:try:for CD in elem[' clicks ': data = data + ', ' +str (cd[' title ']) + ', ' +str (cd[' url '])		
		;
	Except exception,e:tdata = Tdata + ' Error: ' +str (e) + ' \ n ';		
Tdata = tdata + data+ ' \ n ';
Print Tdata;
Tdata = Tdata + ' end----------------------------\ n ';
Save (Save_file, Tdata.encode (' GBK '));
' Hjson = json.loads (content, encoding= ' utf-8 ');
Save (Save_jfile, Content.encode (' utf-8 ')); # 3. Save to MongoDB from Pymongo import mongoclient client = mongoclient (' 127.0.0.1 ', 27017) db = client["collections"] #数据库名 t
Able=db[' his '] #表名 table.save (hjson) Hjson2 = Json.loads (Content2, encoding= ' utf-8 '); Table.save (Hjson2) #table. Insert ({' id ': ' 1 ', ' name ': ' Cnki '}) ' #解析数据, with BeautifulSoup4, feel no jsoup with the cool soup = bs4.
BeautifulSoup (content);
#print soup.prettify (); List = Soup. FindAll (' A ', attrs={"href": Re.compile (r "^http:")});
#list = Soup.findall (name= ' A ', attrs={' href ': Re.compile (r "kw="), ' title ': Re.compile (r ".")});
List = List[1:len (list)];
Careteibalist = [];
print ' Stick link \\t bar name \\t grade ';
Print Len (list); For elem in list:soup1 = BS4.
	BeautifulSoup (str (elem));
print ' http://tieba.baidu.com/' +elem[' href ']+ ' \ ' +elem[' title ';
 '''


I then found that the requests module makes it very easy to set headers: as long as you copy the cookies from a logged-in browser session, you can retrieve the correct list.
The resulting JSON is then saved into MongoDB.

#-*-Coding:utf8-*-import urllib2 import urllib import cookielib import re import BS4 import JSON import time import C Ookie Import Random import datetime import requests #import syslog #import Requests url = ' Http://i.baidu.com/history/li ST ' paras = {' channel ': ' 201,202,300,301,302,303,400,100,500,501,505 ', ' ts ': 1475402100, ' after ': 0, ' page ': ' Query ': '} headers = {    ' user-agent ': ' mozilla/5.0 ' (Macintosh;  Intel Mac OS X 10_11_6) applewebkit/537.36 (khtml, like Gecko) ' +                


  ' chrome/54.0.2840.87 safari/537.3 ', ' Cookie ': ' ... '}
#data =requests.get (URL, headers=headers);
Data=requests.post (URL, data=paras,headers=headers);
Content=data.text;


Print content;
#hjson = json.loads (content, encoding= ' utf-8 ');
#save (Save_jfile, Content.encode (' utf-8 ')); # 3. Save to MongoDB from Pymongo import mongoclient client = mongoclient (' 127.0.0.1 ', 27017) db = client["collections"] #数据库名 t able=db[' history '] #表名 Hjson = json.Loads (content, encoding= ' utf-8 '); arr = hjson[' data '] [' list '] print len (arr) table.save (hjson) cnt = 0 #table. Insert ({' id ': ' 1 ', ' name ': ' Cnki '}) while Len ( ARR) >0): CNT = cnt + len (arr) tdate = hjson[' data ' [' Ts_start '] table.save (HJSON) paras = {' channel ': ' 201,202,300, 301,302,303,400,100,500,501,505 ', ' ts ': tdate, ' after ': 0, ' page ': ', ' query ': ' Data=requests.post (URL, Data=paras,
	Headers=headers);
	Content=data.text;
	Print Content.encode (' GBK ');
	Hjson = json.loads (content, encoding= ' utf-8 '); arr = hjson[' data '] [' list '] print tdate print cnt table=db[' ts '] #表名 table.insert ({' name ': ' Test ', ' Time ': Time.strftime ('


 %y-%m-%d%h:%m:%s ', Time.localtime (Time.time ())), ' Ts_start ': tdate}





Related Article

Contact Us

The content source of this page is from Internet, which doesn't represent Alibaba Cloud's opinion; products and services mentioned on that page don't have any relationship with Alibaba Cloud. If the content of the page makes you feel confusing, please write us an email, we will handle the problem within 5 days after receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.