When you run a site, everyone watches indexing and keyword rankings, so I built this batch keyword ranking query tool.
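In outline: read a keyword list from a file, fetch each keyword's Baidu results page, extract the organic results, map each position 1-10 to a click-weighted score, and merge the scores per root domain into score.csv. The full script is below (Python 2; it needs pycurl and BeautifulSoup from bs4).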
#encoding: utf-8
import urllib
import re
import random
import time
import StringIO

try:
    import pycurl
except ImportError:
    pass
from bs4 import BeautifulSoup

# Click-weighted scores for Baidu positions 1-10.
score = {1: 28.56, 2: 19.23, 3: 10.2, 4: 8.14, 5: 7.5,
         6: 5.72, 7: 4.01, 8: 4.41, 9: 5.53, 10: 6.70}

# Get the root domain; Baidu's own products display the subdomain directly.
def root_domain(url):
    if 'baidu.com' in url:
        return url
    try:
        url = url.replace('http://', '')
        for suffix in ['.com.cn', '.org.cn', '.net.cn', '.gov.cn']:
            if suffix in url:
                return re.search(r'^(.*?\..*?)*([^.]+?\.[^.]+?\.[^.]+)', url).group(2)
        return re.search(r'^(.*?\..*?)*([^.]+?\.[^.]+)', url).group(2)
    except:
        return '-'

def curl(url, debug=False, **kwargs):
    # Pick a random desktop user agent for each call.
    useragents = [
        'Mozilla/5.0 (Windows NT 5.1; rv:37.0) Gecko/20100101 Firefox/37.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36',
    ]
    randhead = random.sample(useragents, 1)
    while 1:
        try:
            s = StringIO.StringIO()
            c = pycurl.Curl()
            c.setopt(pycurl.URL, url)
            c.setopt(pycurl.REFERER, url)
            c.setopt(pycurl.FOLLOWLOCATION, True)
            c.setopt(pycurl.TIMEOUT, 60)  # original timeout value was lost; 60s is a guess
            c.setopt(pycurl.ENCODING, 'gzip')
            c.setopt(pycurl.USERAGENT, '%s' % randhead[0])
            c.setopt(pycurl.NOSIGNAL, True)
            c.setopt(pycurl.WRITEFUNCTION, s.write)
            for k, v in kwargs.iteritems():
                c.setopt(vars(pycurl)[k], v)
            c.perform()
            c.close()
            return s.getvalue()
        except:
            if debug:
                raise
            continue

def get_baidudata(keyword, rn):
    # Fetch the Baidu SERP source for the keyword (rn = results per page).
    search_url = 'http://www.baidu.com/s?wd=%s&rn=%d' % (urllib.quote(keyword), rn)
    pagetext = curl(search_url)
    # If a CAPTCHA appears during the query, warn, rest 10 minutes, then re-query.
    while 'http://verify.baidu.com' in pagetext:
        print u"Hit a verification code while querying, resting 10 minutes:", keyword
        time.sleep(600)
        pagetext = curl(search_url)
    soup = BeautifulSoup(pagetext)
    # Extract the organic ranking results.
    return soup.find_all("div", attrs={'class': 'result c-container'})

# Return the top-ten data for a single keyword.
def get_rank_data(keyword, rn):
    data = get_baidudata(keyword, rn)
    items = {}
    for result in data:
        g = result.find_all("a", attrs={'class': 'c-showurl'})  # displayed host
        if g:
            site = re.search(r'([a-zA-Z0-9\.\-]+)', g[0].text)
            host = root_domain(site.group(1))  # reduce to root domain
            rank = int(result['id'])           # ranking position
            if host not in items:
                items[host] = []
            items[host].append(score[rank])
    return items

def get_keywords(filename):
    # Read keywords from the file, one per line, into a list.
    kwfile = open(filename, 'r')
    keywords = kwfile.readline()
    kw_list = []
    while keywords:
        kw = keywords.strip()
        kw_list.append(kw)
        keywords = kwfile.readline()
    kwfile.close()
    return kw_list

def get_all_data(filename, rn):
    # Merge the per-keyword scores by domain.
    kw_list = get_keywords(filename)
    items = {}
    for i, kw in enumerate(kw_list, 1):
        print i, kw
        item = get_rank_data(kw, rn)
        for host, rank in item.items():
            if host not in items:
                items[host] = rank
            else:
                items[host].extend(rank)
    return items

def get_score(filename, rn):
    data = get_all_data(filename, rn)
    fh = open('score.csv', 'a+')
    fh.write('host,kws,average_score,host_score,\n')
    for host, rank in data.items():
        if host is not None:
            host = host.encode('utf-8')
        else:
            host = 'ERROR page'
        kws = len(rank)                    # number of keywords the domain ranks for
        host_score = sum(rank)             # total score
        average_score = host_score / kws   # average score per keyword
        fh.write(host + ',' + str(kws) + ',' + str(average_score) + ',' + str(host_score) + '\n')
    return

if __name__ == "__main__":
    filename = raw_input("Please enter the file name containing the keywords: ")
    get_score(filename, 10)  # final call was cut off in the source; rn=10 matches the ten-entry score table
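To make the scoring concrete, here is a small worked example (the domain and positions are hypothetical, purely to illustrate the arithmetic): if example.com ranks #1 for one keyword and #3 for another, get_all_data collects the weights [28.56, 10.2] for it, so score.csv reports kws=2, host_score=38.76, and average_score=19.38. A minimal sketch of just that aggregation step:

# Hypothetical illustration of the math behind get_score (Python 2).
score = {1: 28.56, 2: 19.23, 3: 10.2}  # subset of the weight table above
ranks = [1, 3]                         # example.com's positions for two keywords (made up)
weights = [score[r] for r in ranks]
print sum(weights)                  # host_score    -> 38.76
print sum(weights) / len(weights)   # average_score -> 19.38

To use the tool, save the script, run it under Python 2, and enter the keyword file (one keyword per line) when prompted; the per-domain results accumulate in score.csv. Note that score.csv is opened in append mode ('a+'), so repeated runs add new rows rather than overwriting earlier ones.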