This is a simple Python function for querying Baidu keyword rankings. Its main features:
1. A random User-Agent is used for each request.
2. Simple to use: just call getRank(keyword, domain).
3. Encoding conversion is handled, so UTF-8 and GB2312 keywords should both work without problems.
4. Rich results: besides the ranking, it returns the title, URL, and snapshot (cache) date of the matching result, which covers common SEO needs.
5. Easy to wrap into a tool or use on its own.
The function runs in a single thread, so it is slow on large keyword lists; modify it as needed (a thread-pool sketch is given after the code listing).
The code is as follows:
# coding=utf-8
import requests
import BeautifulSoup   # BeautifulSoup 3 (this script is written for Python 2)
import re
import random
def decodeAnyWord(w):   # decode a keyword, trying UTF-8 first and falling back to GB2312
    try:
        w.decode('utf-8')
    except:
        w = w.decode('gb2312')
    else:
        w = w.decode('utf-8')
    return w
def createURL(checkWord):   # create the Baidu SERP URL for the search word
    checkWord = checkWord.strip()
    checkWord = checkWord.replace(' ', '+').replace('\n', '')
    baiduURL = 'http://www.baidu.com/s?wd=%s&rn=100' % checkWord
    return baiduURL
def getContent(baiduURL):   # fetch the SERP HTML with a random User-Agent
    uaList = ['Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)',
              'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.4506.2152;+.NET+CLR+3.5.30729)',
              'Mozilla/5.0+(Windows+NT+5.1)+AppleWebKit/537.1+(KHTML,+like+Gecko)+Chrome/21.0.1180.89+Safari/537.1',
              'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1)',
              'Mozilla/5.0+(Windows+NT+6.1;+rv:11.0)+Gecko/20100101+Firefox/11.0',
              'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+SV1)',
              'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+GTB7.1;+.NET+CLR+2.0.50727)',
              'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+KB974489)']
    headers = {'User-Agent': random.choice(uaList)}
    r = requests.get(baiduURL, headers=headers)
    return r.content
def getLastURL(rawurl):   # follow redirects and return the final URL
    r = requests.get(rawurl)
    return r.url
def getAtext(atext):   # extract the text between <a> and </a>
    pat = re.compile(r'<a.*?>(.*?)</a>')
    match = pat.findall(atext.replace('\n', ''))
    pureText = match[0].replace('<em>', '').replace('</em>', '')
    return pureText.replace('\n', '')
def getCacheDate(t):   # extract the snapshot (cache) date from the result
    pat = re.compile(r'<span class="g">.*?(\d{4}-\d{1,2}-\d{1,2})</span>')
    match = pat.findall(t)
    cacheDate = match[0]
    return cacheDate
def getRank(checkWord, domain):   # main routine: return the rank of domain for checkWord
    checkWord = checkWord.replace('\n', '')
    checkWord = decodeAnyWord(checkWord)
    baiduURL = createURL(checkWord)
    cont = getContent(baiduURL)
    soup = BeautifulSoup.BeautifulSoup(cont)
    results = soup.findAll('table', {'class': 'result'})   # all natural results on this page
    for result in results:
        checkData = unicode(result.find('span', {'class': 'g'}))
        if re.compile(r'^[^/]*%s.*?' % domain).match(checkData.replace('<b>', '').replace('</b>', '')):   # does this result belong to the domain?
            nowRank = result['id']   # the table id is the rank on the page
            resLink = result.find('h3').a
            resURL = resLink['href']
            domainURL = getLastURL(resURL)   # resolve Baidu's redirect to the target URL
            resTitle = getAtext(unicode(resLink))   # title of the target page
            resCache = result.find('span', {'class': 'g'})
            cacheDate = getCacheDate(unicode(resCache))   # snapshot date of the target page
            res = u'%s, rank %s, %s, %s, %s' % (checkWord, nowRank, resTitle, cacheDate, domainURL)
            return res.encode('gb2312')
    return '>100'   # the domain is not in the top 100 results
domain = 'www.baidu.com'   # set the domain whose ranking you want to check
print getRank('Baidu', domain)
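Because getRank runs one sequential request per keyword, checking a long keyword list is slow. Below is a minimal sketch of one way to speed it up with a thread pool; it assumes the functions above are already defined in the same file, and the checkKeywords helper, the pool size, and the sample keyword list are illustrative assumptions rather than part of the original script.

# Hypothetical batch wrapper: query several keywords in parallel threads.
from multiprocessing.dummy import Pool   # thread pool from the Python 2 standard library

def checkKeywords(keywords, domain, poolSize=5):
    # poolSize is an assumed default; keep it small to avoid being blocked by Baidu
    pool = Pool(poolSize)
    ranks = pool.map(lambda kw: getRank(kw, domain), keywords)
    pool.close()
    pool.join()
    return ranks

# Example usage with a made-up keyword list:
# for line in checkKeywords(['keyword1', 'keyword2', 'keyword3'], 'www.baidu.com'):
#     print line

Each thread simply calls the original getRank, so the output format is unchanged; only the requests overlap in time.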