Simulating Sina Weibo login with Python

Source: Internet
Author: User

The main file is as follows:

#coding =utf-8import requestsimport urllibimport urllib2import cookielib  import  WeiboEncodeimport WeiboSearch import timeimport reimport randomimport  Httplibclass weibologin:def __init__ (Self, user, pwd, enableproxy = false): #构造方法, the parameter is itself, the user, the password, whether to use a proxy server "Initialize Weibologin,enableproxy indicates whether to use a proxy server, the default shutdown"   print  "Initializing  weibologin ... "self.username = userself.password = pwdself.enableproxy =  enableproxyself.serverurl =  "http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback= Sinassocontroller.prelogincallback&su=&rsakt=mod&client=ssologin.js (v1.4.11) &_=1379834957683 " self.loginurl =  "Http://login.sina.com.cn/sso/login.php?client=ssologin.js (v1.4.11)" Self.postheader  = {' user-agent ':  ' mozilla/5.0  (windows nt 6.1; rv:24.0)  Gecko/ 20100101 firefox/24.0 '} #用户代 user agent, refers to the browser, its information includes hardware platform, system software, application software and user's personal preferences. Def login (self): #登陆程序 "Login Program"   self. Enablecookie (Self.enableproxy) #cookie或代理服务器配置serverTime,  nonce, pubkey, rsakv = self. GetServerTime () #登陆的第一步postData  = weiboencode.postencode (self.username, self.password,  SERVERTIME,&NBSP;NONCE,&NBSP;PUBKEY,&NBSP;RSAKV) #加密用户和密码print   "post data length:\n",  len ( PostData)   req = urllib2. Request (self.loginurl, postdata, self.postheader) #构造网络请求print   "Posting request ..." result  = urllib2.urlopen (req) #发出网络请求text  = result.read () try:loginurl =  Weibosearch.sredirectdata (text) #解析重定位结果 (automatically jump to the page after landing) Urllib2.urlopen (loginurl) except:print  ' login  error! ' return falseprint  ' login sucess! ' Return truedef enablecookie (Self, enableproxy): # "enable cookie & proxy  (if needed). " Cookiejar = cookielib. Lwpcookiejar () #建立cookiecookie_support  = urllib2. 
Httpcookieprocessor (Cookiejar)    #HTTPCookieProcessor  instances have one  Attribute: #HTTPCookieProcessor .cookiejar  (the cookielib. Cookiejar in which cookies are stored.) If enableproxy:proxy_support = urllib2. Proxyhandler ({' http ': ' 59.59.100.123:8118 '}) #使用代理opener  = urllib2.build_opener (proxy_support,  Cookie_support, urllib2. HttpHandler) #Return  an openerdirector instance#the openerdirector class opens  URLs via BaseHandlers chained together.print  "Proxy enabled" Else:opener  = urllib2.build_opener (COOKIE_SUPPORT,&NBSP;URLLIB2. HttpHandler)  urllib2.install_opener (opener) #构建cookie对应的openerdef  getservertime (self): # "get  Server time and nonce, which are used to encode the password "#在摘要认证中服务器让客户选一个随机数 (called" NOnce "), then the browser uses a one-way cryptographic function to generate a message digest (message  #digest), which is about the user name, password, given nonce value, HTTP method, and the requested URL. print  "Getting server time and nonce ..." Serverdata = urllib2.urlopen ( Self.serverurl). Read () #得到网页内容print  serverdatatry:servertime, nonce, pubkey, rsakv =  weibosearch.sserverdata (serverdata) #解析得到serverTime, nonce etc Return servertime, nonce, pubkey , rsakvexcept:print  ' get server time & nonce error! ' Return nonedef fetch_weibo (id, filename): #不借助API取回微博列表, but only the first few, parameters are user ID, file name target =  Open (filename,  ' a ') myurl= ' http://weibo.com/u/' +idline = urllib2.urlopen (myurl). Read () Target.write (line) if re.search (R ' \ "Wb_detail ',  line):p rint " Success "P = re.compile (R ' \ "Wb_detail\" ') Linelist = p.split (line) for fraction in linelist:matchobj =  Re.search (R ' nick-name=\ ". +?\" >\\n + (. +?) 
< ',  fraction) if matchobj:target.write (Matchobj.group (1)) Target.write ("\ n") Def fetchqueryresult (): # This method can retrieve the search results of myurl= "Http://s.weibo.com/user/&auth=ord&age=22y&gender=women&region=custom : 33:1&page= "#找人页面的urltarget  = open (" filename ",  ' a ') #输出文件名称for  i in range ( 37,51): #起止页码line  = urllib2.urlopen (Myurl). Read () While re.search (R ' ids\= (\d+?) \ \ ',  line): Matchobj = re.search (R ' ids\= (\d+?) \ \ ',  line) print matchobj.group (1) target.write (Matchobj.group (1)) Target.write ("\ n") p =  Re.compile (R ' +matchobj.group (1)) Linelist = p.split (line) Line = linelist[len (linelist)-1] Print itime.sleep (2+random.random ());d Ef getjson (): #本方法可调用微博API, retrieving the list of users who have logged in to their blog       headers = {' user-agent ':  ' mozilla/5.0  (windows nt 6.1; rv:24.0)   gecko/20100101 firefox/24.0 '} #定义一些文件头url  =  "Https://api.weibo.com/2/statuses/user_timEline.json "  #  here is urlyour_param = {' source ':  ' 1675437817 '}  #  Here is the request parameter! Result = requests.get (Url, params=your_param)   #  send request if URL is http://s.weibo.com/ weibo/s  so   The effect of this sentence is &NBSP;HTTP://S.WEIBO.COM/WEIBO/S? refer=sina_indexresult_final = result.text   #这样就获取到了你发送的这个URL  +  Parameters   After the result print result.textif __name__ ==  ' __main__ ': #if  the python interpreter  is running that module  (The source file)  as the main  program,  #it  sets the special __name__ variable to have a  value # "__main__". #If  this file is being imported from another  module, #__name__  will be set to the module ' s name.weibologin =  Weibologin (' [email protected] ',  ' XXXXXXXX ') #邮箱 (account), password If&nbsP;weibologin.login ()  == True:print  "Landing success!" "Myurl=" http://api.weibo.com/2/statuses/timeline_batch.json?source=1675437817&uids=5029941840 "htmlcontent  = urllib2.urlopen (Myurl). Read () print htmlcontent

The main file depends on two helper modules, WeiboSearch.py and WeiboEncode.py.

The weiboencode.py code is as follows:

# coding=utf-8
"""Build the form-encoded POST body for the Sina Weibo SSO login."""
import base64
import binascii

try:
    from urllib.parse import quote, urlencode  # Python 3
except ImportError:
    from urllib import quote, urlencode  # Python 2


def _native_str(data):
    """Return ASCII *data* as the interpreter's native ``str`` type."""
    return data if isinstance(data, str) else data.decode('ascii')


def PostEncode(userName, passWord, serverTime, nonce, pubkey, rsakv):
    """Generate the URL-encoded POST data for the login request.

    Args:
        userName: Plain account name; sent base64-encoded as ``su``.
        passWord: Plain password; sent RSA-encrypted as ``sp``.
        serverTime: Server time taken from the pre-login response.
        nonce: Nonce taken from the pre-login response.
        pubkey: Hexadecimal RSA modulus taken from the pre-login response.
        rsakv: RSA key version taken from the pre-login response.

    Returns:
        The URL-encoded form body as a string.
    """
    encodedUserName = GetUserName(userName)
    encodedPassWord = get_pwd(passWord, serverTime, nonce, pubkey)
    # NOTE(review): field names and values are written in the lower-case
    # form used by ssologin.js; confirm against a captured login request.
    postPara = {
        'entry': 'weibo',
        'gateway': '1',
        'from': '',
        'savestate': '7',
        'userticket': '1',
        'ssosimplelogin': '1',
        'vsnf': '1',
        'vsnval': '',
        'su': encodedUserName,
        'service': 'miniblog',
        'servertime': serverTime,
        'nonce': nonce,
        'pwencode': 'rsa2',
        'sp': encodedPassWord,
        'encoding': 'UTF-8',
        'prelt': '',
        'rsakv': rsakv,
        'url': ('http://weibo.com/ajaxlogin.php?framelogin=1'
                '&callback=parent.sinaSSOController.feedBackUrlCallBack'),
        'returntype': 'META',
    }
    return urlencode(postPara)


def GetUserName(userName):
    """Encode the user name: URL-quote it, then base64-encode the result.

    ``b64encode`` is used instead of ``encodestring(...)[:-1]`` because the
    latter inserts a newline after every 76 output characters (corrupting
    long names) and no longer exists in Python 3.9+.
    """
    quoted = quote(userName)
    return _native_str(base64.b64encode(quoted.encode('ascii')))


def get_pwd(password, servertime, nonce, pubkey):
    """Encrypt the password with the server's RSA public key.

    The plaintext layout (``servertime \\t nonce \\n password``) is the one
    used by the site's JavaScript login code.

    Returns:
        The ciphertext as a hexadecimal string.
    """
    # Third-party package; imported lazily so the stdlib-only helpers of
    # this module stay usable when it is not installed.
    import rsa

    rsaPublickey = int(pubkey, 16)
    key = rsa.PublicKey(rsaPublickey, 65537)  # 65537 (0x10001) is the exponent
    message = '%s\t%s\n%s' % (servertime, nonce, password)
    if not isinstance(message, bytes):
        message = message.encode('utf-8')  # rsa.encrypt() works on bytes
    encrypted = rsa.encrypt(message, key)
    return _native_str(binascii.b2a_hex(encrypted))

weibosearch.py

# coding=utf-8
"""Parsers for the responses of the Sina Weibo SSO login endpoints."""
import json
import re

# Payload wrapped in the callback call: ``callback({...})``.
_CALLBACK_PAYLOAD = re.compile(r'\((.*)\)')
# Target of the ``location.replace('...')`` redirect in the login response.
_REDIRECT_TARGET = re.compile(r'location\.replace\([\'"](.*?)[\'"]\)')


def _to_text(data):
    """Decode a raw bytes response so the text patterns can be applied."""
    if isinstance(data, bytes) and not isinstance(data, str):
        return data.decode('utf-8', 'replace')
    return data


def sServerData(serverData):
    """Extract server time, nonce, public key and rsakv from server data.

    Args:
        serverData: Body of the pre-login response, i.e. a JSON object
            wrapped in a callback call.

    Returns:
        A ``(serverTime, nonce, pubkey, rsakv)`` tuple; ``serverTime`` is
        converted to a string, the other values are returned as parsed.

    Raises:
        ValueError: No parenthesised payload was found, or it is not JSON.
        KeyError: The payload lacks one of the expected fields.
    """
    match = _CALLBACK_PAYLOAD.search(_to_text(serverData))
    if match is None:
        raise ValueError("no JSON payload found in the pre-login response")
    data = json.loads(match.group(1))
    serverTime = str(data['servertime'])
    nonce = data['nonce']
    pubkey = data['pubkey']
    rsakv = data['rsakv']
    print("Server time is: %s" % serverTime)
    print("Nonce is: %s" % nonce)
    return serverTime, nonce, pubkey, rsakv


def sRedirectData(text):
    """Return the URL passed to ``location.replace(...)`` in *text*.

    Args:
        text: Body of the login response page.

    Raises:
        ValueError: The page contains no ``location.replace`` redirect.
    """
    match = _REDIRECT_TARGET.search(_to_text(text))
    if match is None:
        raise ValueError("no location.replace() redirect in login response")
    loginUrl = match.group(1)
    print('loginUrl: %s' % loginUrl)
    return loginUrl

Currently, the crawler can log in automatically and call the ordinary Sina Weibo API. However, bulk retrieval of other users' posts requires a higher level of authorization, which is currently being applied for.

Simulating Sina Weibo login with Python

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find any content on this page confusing, please send us an email and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.