Python simulation Baidu Login instance detailed
Http://www.jb51.net/article/78406.htm
Python Combat Plan Learning assignment 2-1
http://blog.csdn.net/python012/article/details/53344501
Based on Baidu login code found online: first obtain a token, then log in with the password to get cookies, and finally fetch the browsing history.
However, it only works intermittently — sometimes an empty list comes back. Comparing the requests shows this happens when cookies such as hm_lvt_* are missing.
It may be related to a captcha being required after switching accounts. I then tried modifying the cookies directly, but both of these cookie mechanisms are hard to change:
Cookie.SimpleCookie
cookielib.MozillaCookieJar
#-*-Coding:utf8-*-import urllib2 import urllib import cookielib import re import BS4 import JSON import time import C
Ookie Import Random import datetime #import syslog #import requests url_baidu_index = U ' http://www.baidu.com/'; #https://passport.baidu.com/v2/api/?getapi&class=login&tpl=mn&tangram=true can also use this URL_BAIDU_TOKEN = '
Https://passport.baidu.com/v2/api/?getapi&tpl=pp&apiver=v3&class=login ';
Url_baidu_login = ' Https://passport.baidu.com/v2/api/?login ';
Save_file = ' d:\\bduhis.txt ';
Save_jfile = ' d:\\json.txt ';
Save_cfile = ' d:\\cookie.txt ';
#设置用户名, password username = ';
Password = '; #设置cookie, this cookiejar can be automatically managed without manually specifying #cj = Cookielib.
Cookiejar (); filename = ' cookie.txt ' CJ = cookielib.
Mozillacookiejar (Save_cfile); Opener = Urllib2.build_opener (urllib2.
Httpcookieprocessor (CJ));
Urllib2.install_opener (opener);
#print CJ;
Reqreturn = Urllib2.urlopen (Url_baidu_index); #cj. Set_cookie (Make_cookie (' testname ', ' TestValue ')) ' Change cookie is unsuccessful C=cookie.simplecookie();
c["Manageopen"]= "cards";
c[' Manageopen ' [' expires '] = 0;
c[' manageopen ' [' path '] = '/';
c[' manageopen ' [' domain '] = '. domain.com ';
c[' Manageopen ' [' secure '] = ' ";
Cj.set_cookie (c["Manageopen"]);
"' Print CJ;
Cj.save (Ignore_discard=true, Ignore_expires=false) #获取token, Tokenreturn = Urllib2.urlopen (Url_baidu_token); Matchval = Re.search (U ' "token": "(?)
p<tokenval>.*) "', Tokenreturn.read ());
Tokenval = Matchval.group (' Tokenval '); #构造登录请求参数, the request data is obtained by grasping the packet, corresponding to Https://passport.baidu.com/v2/api/?login request PostData = {' username ': username, ' password ': Password, ' u ': ' https://passport.baidu.com/', ' TPL ': ' PP ', ' token ': tokenval, ' staticpage ': ' https://passport.baidu.co M/static/passpc-account/html/v3jump.html ', ' Isphone ': ' false ', ' CharSet ': ' Utf-8 ', ' callback ': ' Parent.bd__pcbs__
Ra48vi '};
PostData = Urllib.urlencode (postdata); #发送登录请求 loginrequest = urllib2.
Request (Url_baidu_login,postdata); Loginrequest.add_header (' Accept ', ' text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8 ');
Loginrequest.add_header (' accept-encoding ', ' gzip,deflate,sdch ');
Loginrequest.add_header (' accept-language ', ' zh-cn,zh;q=0.8 '); Loginrequest.add_header (' user-agent ', ' mozilla/5.0 (Windows NT 6.1;
WOW64) applewebkit/537.36 (khtml, like Gecko) chrome/28.0.1500.72 safari/537.36 ');
Loginrequest.add_header (' Content-type ', ' application/x-www-form-urlencoded ');
Sendpost = Urllib2.urlopen (loginrequest); #查看贴吧个人主页, test whether the landing success, because the cookie automatic management, here to deal with a lot of convenience #http://tieba.baidu.com/home/main?un=xxxx&fr=index This is the Post bar Personal homepage, All information can be found in this link #teibaUrl = ' http://tieba.baidu.com/f/like/mylike?v=1387441831248 ' # http://i.baidu.com/my/history # http://map.baidu.com/#http://map.baidu.com/?newmap=1&reqflag=pcmap&biz=1&from=webmap&da_par= direct&pcevaname=pc3&qt=fav&mode=get&type=favdata&limit=100&lastver=0&t= 1481726657277 #teibaUrl = ' http://i.baidu.com/my/historylogin ' Teibaurl = ' http://i.baidu.com/login/historyCheck/? Autologin=true ' content = Urllib2.urlopeN (teibaurl). read ();
#print content;
Teibaurl = ' http://i.baidu.com/history/list ' content = Urllib2.urlopen (teibaurl). read ();
Content = Content.decode (' Utf-8 '). Encode (' GB18030 ');
Print content;
Teibaurl = ' http://map.baidu.com/?qt=ssn&t=1482059818916 ' Content2 = Urllib2.urlopen (Teibaurl). read ();
Content2 = Content2.decode (' Utf-8 '). Encode (' GB18030 ');
Print Content2;
' 1. Save to HTML file Def CBK (A, B, c): #回调函数 #@a: Data blocks that have been downloaded #@b: size of the data block #@c: Size of the remote file per = 100.0 * A * b/c if > 100:per = print '%.2f%% '% per urllib.urlretrieve (' http://www.cmfish.com/bbs/forum.php ', ' d:\\b
Aidu1.html ', CBK); ' Def save (filename, contents): FH = open (filename, ' W ') fh.write (contents) fh.close () ' 2.
Save to TXT file T = json.dumps (content, ensure_ascii=false);
Hjson = json.loads (content, encoding= ' utf-8 ');
#t2 =content.decode (' utf-8 ');
#print hjson[' data '] [' list '][0][' query '];
Print Hjson; arr = hjson[' data ' [' list '] tdata = ' Start----------------------------\ntime: ' +time.strftime ('%y-%m-%d%h:%m:%s ', Time.localtime (Time.time ())) + ' total: ' +str (len (arr)) + ': \ n ';
For elem in arr:data = str (elem[' ts ']) + ', ' +elem[' query ']+ ', ' +str (len (elem[' clicks ')); If Len (elem[' clicks ']) >0:try:for CD in elem[' clicks ': data = data + ', ' +str (cd[' title ']) + ', ' +str (cd[' url '])
;
Except exception,e:tdata = Tdata + ' Error: ' +str (e) + ' \ n ';
Tdata = tdata + data+ ' \ n ';
Print Tdata;
Tdata = Tdata + ' end----------------------------\ n ';
Save (Save_file, Tdata.encode (' GBK '));
' Hjson = json.loads (content, encoding= ' utf-8 ');
Save (Save_jfile, Content.encode (' utf-8 ')); # 3. Save to MongoDB from Pymongo import mongoclient client = mongoclient (' 127.0.0.1 ', 27017) db = client["collections"] #数据库名 t
Able=db[' his '] #表名 table.save (hjson) Hjson2 = Json.loads (Content2, encoding= ' utf-8 '); Table.save (Hjson2) #table. Insert ({' id ': ' 1 ', ' name ': ' Cnki '}) ' #解析数据, with BeautifulSoup4, feel no jsoup with the cool soup = bs4.
BeautifulSoup (content);
#print soup.prettify (); List = Soup. FindAll (' A ', attrs={"href": Re.compile (r "^http:")});
#list = Soup.findall (name= ' A ', attrs={' href ': Re.compile (r "kw="), ' title ': Re.compile (r ".")});
List = List[1:len (list)];
Careteibalist = [];
print ' Stick link \\t bar name \\t grade ';
Print Len (list); For elem in list:soup1 = BS4.
BeautifulSoup (str (elem));
print ' http://tieba.baidu.com/' +elem[' href ']+ ' \ ' +elem[' title ';
'''
Then I found that the requests module makes it very convenient to set headers: as long as you copy the cookies from a logged-in browser session, you get the correct list.
Save the resulting JSON into MongoDB.
#-*-Coding:utf8-*-import urllib2 import urllib import cookielib import re import BS4 import JSON import time import C Ookie Import Random import datetime import requests #import syslog #import Requests url = ' Http://i.baidu.com/history/li ST ' paras = {' channel ': ' 201,202,300,301,302,303,400,100,500,501,505 ', ' ts ': 1475402100, ' after ': 0, ' page ': ' Query ': '} headers = { ' user-agent ': ' mozilla/5.0 ' (Macintosh; Intel Mac OS X 10_11_6) applewebkit/537.36 (khtml, like Gecko) ' +
' chrome/54.0.2840.87 safari/537.3 ', ' Cookie ': ' ... '}
#data =requests.get (URL, headers=headers);
Data=requests.post (URL, data=paras,headers=headers);
Content=data.text;
Print content;
#hjson = json.loads (content, encoding= ' utf-8 ');
#save (Save_jfile, Content.encode (' utf-8 ')); # 3. Save to MongoDB from Pymongo import mongoclient client = mongoclient (' 127.0.0.1 ', 27017) db = client["collections"] #数据库名 t able=db[' history '] #表名 Hjson = json.Loads (content, encoding= ' utf-8 '); arr = hjson[' data '] [' list '] print len (arr) table.save (hjson) cnt = 0 #table. Insert ({' id ': ' 1 ', ' name ': ' Cnki '}) while Len ( ARR) >0): CNT = cnt + len (arr) tdate = hjson[' data ' [' Ts_start '] table.save (HJSON) paras = {' channel ': ' 201,202,300, 301,302,303,400,100,500,501,505 ', ' ts ': tdate, ' after ': 0, ' page ': ', ' query ': ' Data=requests.post (URL, Data=paras,
Headers=headers);
Content=data.text;
Print Content.encode (' GBK ');
Hjson = json.loads (content, encoding= ' utf-8 '); arr = hjson[' data '] [' list '] print tdate print cnt table=db[' ts '] #表名 table.insert ({' name ': ' Test ', ' Time ': Time.strftime ('
%y-%m-%d%h:%m:%s ', Time.localtime (Time.time ())), ' Ts_start ': tdate}