In this article, we describe how to use Python to extract Baidu search results, shared here for your reference. The implementation is as follows:
# coding=utf8
"""Scrape Baidu search results for a keyword and print each hit's resolved URL.

Ported from Python 2 (urllib2) to Python 3 (urllib.request / urllib.parse).
Fixes from review: `random.randint(0, 11)` could index past the 10-element
user-agent list; `for page in range:` called `range` without arguments; a
misplaced parenthesis applied `.format(pn)` to the response object instead of
the URL template; `getmatch` returns a string but the caller sliced `link[0]`,
yielding only the first character of the URL.
"""
import random
import re
import urllib.parse
import urllib.request

# Rotate several User-Agent strings so Baidu is less likely to rate-limit or
# block a single client signature.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533+ '
    '(KHTML, like Gecko) Element Browser 5.0',
    'IBM WebExplorer /v0.94',
    'Galaxy/1.0 [en] (Mac OS X 10.5.6; U; en)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
    'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
    'Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 '
    '(KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/28.0.1468.0 Safari/537.36',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; TheWorld)',
]

# Strip-HTML pattern, compiled once at module level (hoisted out of cleartag).
_TAG_RE = re.compile(r'<[^>]+>')


def baidu_search(keyword, pn):
    """Fetch one page of Baidu results (100 per page) starting at offset *pn*.

    Returns the raw response body as bytes.
    """
    params = urllib.parse.urlencode({'wd': keyword})
    url = 'http://www.baidu.com/s?{0}&pn={1}&cl=3&rn=100'.format(params, pn)
    # random.choice never goes out of range, unlike the original
    # random.randint(0, 11) against a 10-element list.
    request = urllib.request.Request(
        url, headers={'User-Agent': random.choice(USER_AGENTS)})
    with urllib.request.urlopen(request) as response:
        return response.read()


def getList(regex, text):
    """Return every match of *regex* in *text* (empty list when none)."""
    return re.findall(regex, text)


def getmatch(regex, text):
    """Return the first match of *regex* in *text*, or '' when there is none."""
    matches = re.findall(regex, text)
    return matches[0] if matches else ''


def cleartag(text):
    """Return *text* with all HTML tags removed."""
    return _TAG_RE.sub('', text)


def geturl(keyword):
    """Print the resolved (post-redirect) URL of every result for *keyword*.

    Scans the first 10 result pages (100 results each). Results whose
    redirect cannot be resolved are skipped best-effort.
    """
    for page in range(10):
        pn = page * 100 + 1
        html = baidu_search(keyword, pn)
        content = html.decode('utf-8', 'ignore')
        # NOTE(review): the anchor regex was garbled in the original source;
        # this matches Baidu's redirect links — confirm against live markup.
        links = getList(
            r'<a[^>]+href="(http://www\.baidu\.com/link\?[^"]+)"[^>]*>.*?</a>',
            content)
        for url in links:
            try:
                request = urllib.request.Request(url, headers={
                    'User-Agent': random.choice(USER_AGENTS),
                    'Connection': 'keep-alive',
                })
                with urllib.request.urlopen(request) as response:
                    # geturl() reflects the final URL after any redirects.
                    print(response.geturl())
            except Exception:
                # Best-effort scrape: skip unreachable/blocked results.
                continue


if __name__ == '__main__':
    geturl('python')
Hopefully this article will help you with Python programming.