The main file is as follows:
#coding =utf-8import requestsimport urllibimport urllib2import cookielib import WeiboEncodeimport WeiboSearch import timeimport reimport randomimport Httplibclass weibologin:def __init__ (Self, user, pwd, enableproxy = false): #构造方法, the parameter is itself, the user, the password, whether to use a proxy server "Initialize Weibologin,enableproxy indicates whether to use a proxy server, the default shutdown" print "Initializing weibologin ... "self.username = userself.password = pwdself.enableproxy = enableproxyself.serverurl = "http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback= Sinassocontroller.prelogincallback&su=&rsakt=mod&client=ssologin.js (v1.4.11) &_=1379834957683 " self.loginurl = "Http://login.sina.com.cn/sso/login.php?client=ssologin.js (v1.4.11)" Self.postheader = {' user-agent ': ' mozilla/5.0 (windows nt 6.1; rv:24.0) Gecko/ 20100101 firefox/24.0 '} #用户代 user agent, refers to the browser, its information includes hardware platform, system software, application software and user's personal preferences. Def login (self): #登陆程序 "Login Program" self. Enablecookie (Self.enableproxy) #cookie或代理服务器配置serverTime, nonce, pubkey, rsakv = self. GetServerTime () #登陆的第一步postData = weiboencode.postencode (self.username, self.password, SERVERTIME,&NBSP;NONCE,&NBSP;PUBKEY,&NBSP;RSAKV) #加密用户和密码print "post data length:\n", len ( PostData) req = urllib2. Request (self.loginurl, postdata, self.postheader) #构造网络请求print "Posting request ..." result = urllib2.urlopen (req) #发出网络请求text = result.read () try:loginurl = Weibosearch.sredirectdata (text) #解析重定位结果 (automatically jump to the page after landing) Urllib2.urlopen (loginurl) except:print ' login error! ' return falseprint ' login sucess! ' Return truedef enablecookie (Self, enableproxy): # "enable cookie & proxy (if needed). " Cookiejar = cookielib. Lwpcookiejar () #建立cookiecookie_support = urllib2. 
Httpcookieprocessor (Cookiejar) #HTTPCookieProcessor instances have one Attribute: #HTTPCookieProcessor .cookiejar (the cookielib. Cookiejar in which cookies are stored.) If enableproxy:proxy_support = urllib2. Proxyhandler ({' http ': ' 59.59.100.123:8118 '}) #使用代理opener = urllib2.build_opener (proxy_support, Cookie_support, urllib2. HttpHandler) #Return an openerdirector instance#the openerdirector class opens URLs via BaseHandlers chained together.print "Proxy enabled" Else:opener = urllib2.build_opener (COOKIE_SUPPORT,&NBSP;URLLIB2. HttpHandler) urllib2.install_opener (opener) #构建cookie对应的openerdef getservertime (self): # "get Server time and nonce, which are used to encode the password "#在摘要认证中服务器让客户选一个随机数 (called" NOnce "), then the browser uses a one-way cryptographic function to generate a message digest (message #digest), which is about the user name, password, given nonce value, HTTP method, and the requested URL. print "Getting server time and nonce ..." Serverdata = urllib2.urlopen ( Self.serverurl). Read () #得到网页内容print serverdatatry:servertime, nonce, pubkey, rsakv = weibosearch.sserverdata (serverdata) #解析得到serverTime, nonce etc Return servertime, nonce, pubkey , rsakvexcept:print ' get server time & nonce error! ' Return nonedef fetch_weibo (id, filename): #不借助API取回微博列表, but only the first few, parameters are user ID, file name target = Open (filename, ' a ') myurl= ' http://weibo.com/u/' +idline = urllib2.urlopen (myurl). Read () Target.write (line) if re.search (R ' \ "Wb_detail ', line):p rint " Success "P = re.compile (R ' \ "Wb_detail\" ') Linelist = p.split (line) for fraction in linelist:matchobj = Re.search (R ' nick-name=\ ". +?\" >\\n + (. +?) 
< ', fraction) if matchobj:target.write (Matchobj.group (1)) Target.write ("\ n") Def fetchqueryresult (): # This method can retrieve the search results of myurl= "Http://s.weibo.com/user/&auth=ord&age=22y&gender=women®ion=custom : 33:1&page= "#找人页面的urltarget = open (" filename ", ' a ') #输出文件名称for i in range ( 37,51): #起止页码line = urllib2.urlopen (Myurl). Read () While re.search (R ' ids\= (\d+?) \ \ ', line): Matchobj = re.search (R ' ids\= (\d+?) \ \ ', line) print matchobj.group (1) target.write (Matchobj.group (1)) Target.write ("\ n") p = Re.compile (R ' +matchobj.group (1)) Linelist = p.split (line) Line = linelist[len (linelist)-1] Print itime.sleep (2+random.random ());d Ef getjson (): #本方法可调用微博API, retrieving the list of users who have logged in to their blog headers = {' user-agent ': ' mozilla/5.0 (windows nt 6.1; rv:24.0) gecko/20100101 firefox/24.0 '} #定义一些文件头url = "Https://api.weibo.com/2/statuses/user_timEline.json " # here is urlyour_param = {' source ': ' 1675437817 '} # Here is the request parameter! Result = requests.get (Url, params=your_param) # send request if URL is http://s.weibo.com/ weibo/s so The effect of this sentence is &NBSP;HTTP://S.WEIBO.COM/WEIBO/S? refer=sina_indexresult_final = result.text #这样就获取到了你发送的这个URL + Parameters After the result print result.textif __name__ == ' __main__ ': #if the python interpreter is running that module (The source file) as the main program, #it sets the special __name__ variable to have a value # "__main__". #If this file is being imported from another module, #__name__ will be set to the module ' s name.weibologin = Weibologin (' [email protected] ', ' XXXXXXXX ') #邮箱 (account), password If&nbsP;weibologin.login () == True:print "Landing success!" "Myurl=" http://api.weibo.com/2/statuses/timeline_batch.json?source=1675437817&uids=5029941840 "htmlcontent = urllib2.urlopen (Myurl). Read () print htmlcontent
The other two modules it uses are weibosearch.py and weiboencode.py.
The weiboencode.py code is as follows:
#coding =utf-8import urllibimport base64import rsaimport binascii def Postencode (USERNAME,&NBSP;PASSWORD,&NBSP;SERVERTIME,&NBSP;NONCE,&NBSP;PUBKEY,&NBSP;RSAKV): "Used to generate post data" encodedUserName = GetUserName (userName) #用户名使用base64加密 encodedpassword = get_pwd (password, Servertime, nonce, pubkey) #目前密码采用rsa加密 postPara = { ' entry ': ' Weibo ', ' Gateway ': ' 1 ', ' from ': ', ' savestate ': ' 7 ', ' Userticket ': ' 1 ', ' ssosimplelogin ': ' 1 ', ' VSNF ': &nbsP; ' 1 ', ' vsnval ': ', ' su ': encodedusername, ' service ': ' Miniblog ', ' servertime ': servertime, ' nonce ': nonce, ' pwencode ': ' rsa2 ', ' SP ': encodedpassword, ' encoding ': ' UTF-8 ', ' Prelt ' : ', ' rsakv ': rsakv, ' url ': ' http://weibo.com/ajaxlogin.php?framelogin=1 &callback=parent.sinassocontroller.feedbackurlcallback ', ' ReturnType ': ' META ' &NBSP;&NBSp; } postdata = urllib.urlencode (PostPara) #网络编码 return postdata def getusername (userName): "Used to Encode user name " usernametemp = urllib.quote (userName) usernameencoded = base64.encodestring (usernametemp) [: -1] return usernameencoded def get_pwd (Password, servertime, nonce, pubkey): rsapublickey = int (pubkey, 16) key = Rsa. PublicKey (rsapublickey, 65537) #创建公钥 message = str (servertime) + ' \ t ' + str (nonce) + ' \ n ' + str (password) #拼接明文js加密文件中得到 passwd = rsa.encrypt (message, key) #加密 passwd = binascii.b2a_hex (passwd) #将加密信息转换为16进制. return passwd
weibosearch.py
#coding =utf-8import reimport json def sserverdata (serverdata): #解析得到serverTime, nonce, etc. "Search the server time & nonce from server data " p = re.compile (' \ (. *) \) ') #re .compile The regular expression can be compiled into a regular expression object jsondata = p.search (Serverdata). Group (1) #查找 data = json.loads (jsondata) #对encodedjson进行decode, get raw data, need to use Json.loads () function servertime = str (data[' servertime ') nonce = data[' nonce '] pubkey = data[' PubKey ']# rsakv = data[' rsakv ']# print "Server time is:", serverTime print "Nonce is:", nonce return servertime, nonce, pubkey, rsakv def sredirectdata (text): p = re.compile (' location\.replace\ ([\ ' "] (. *?) [\ ' "]\) ') loginurl = p.search (text). Group (1) print ' loginurl: ', Loginurl return loginurl
Currently, the crawler can log in automatically and invoke the normal Sina Weibo API. However, bulk retrieval of other people's Weibo posts requires a higher level of API authorization, which is still being applied for.
Sina Weibo simulated login based on Python