This article describes how to use Python to collect the titles of the websites listed on Baidu search result pages (SERPs). BeautifulSoup is used to parse the HTML. Details follow.
For example, suppose you want to collect the SERP results whose titles contain "58 City" and filter out the results that contain "Beijing" or "Xiamen".
The Python script below implements exactly this.
BeautifulSoup is used to parse the HTML. For more information, see my other article on installing BeautifulSoup on Windows 8.
The code is as follows:
# -*- coding: utf-8 -*-
__author__ = 'civil engineer'
# Collect the titles of Baidu search result (SERP) entries.
import time

try:  # Python 2
    import urllib2
    from urllib import quote
except ImportError:  # Python 3: the same Request/urlopen API under a new name
    import urllib.request as urllib2
    from urllib.parse import quote

from bs4 import BeautifulSoup
def WriteFile(fileName, content):
    """Append ``content`` to ``fileName``, followed by a carriage return.

    ``content`` may be text or an already-encoded byte string; text is
    encoded as UTF-8 before it is written.

    Writing is best-effort: if the file cannot be opened or written, the
    error is ignored and nothing is raised.
    """
    if not isinstance(content, bytes):
        content = content.encode("utf-8")
    try:
        # Binary append writes the bytes exactly as given on every platform,
        # and the context manager closes the file even if the write fails.
        with open(fileName, "ab") as fp:
            # Each record is terminated by a bare "\r", as in the original
            # script.
            fp.write(content + b"\r")
    except (IOError, OSError):
        # Deliberately ignored: a failed write must not stop the caller.
        pass
def GetHtml(url):
    """Download ``url`` and return its body decoded as UTF-8 text.

    Bytes that are not valid UTF-8 are dropped while decoding.  The request
    is opened with a 3 second timeout.

    Returns an empty string when the URL cannot be opened or read, so the
    caller always receives text.
    """
    try:
        response = urllib2.urlopen(urllib2.Request(url), None, 3)  # 3 = timeout (s)
        try:
            return response.read().decode('utf-8', 'ignore')
        finally:
            # Release the connection whether or not the read succeeded.
            response.close()
    except Exception:
        # Opening a URL can fail with unrelated exception families (I/O
        # errors, ValueError for a malformed URL, HTTP protocol errors), so
        # the catch is broad.  Unlike a bare ``except`` it still lets
        # KeyboardInterrupt and SystemExit through.
        return ""
def FetchTitle(html):
    """Print and save the result titles found in a search result page.

    Every ``<h3>`` element in ``html`` is treated as one title.  Titles
    containing "Beijing" or "Xiamen" are skipped; every other title is
    printed and appended, UTF-8 encoded, to ``Result.txt`` via ``WriteFile``.

    ``html`` may be a string or an iterable of string chunks.  Empty or
    missing input produces no output, and markup that cannot be parsed is
    reported on stderr and skipped.
    """
    import sys

    if not html:
        return
    try:
        # Name the parser explicitly so the result does not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(''.join(html), "html.parser")
    except Exception as exc:
        sys.stderr.write("FetchTitle: could not parse page: %s\n" % exc)
        return

    excluded = ("Beijing", "Xiamen")
    for heading in soup.find_all("h3"):
        text = heading.text
        if any(word in text for word in excluded):
            continue
        title = text.encode("UTF-8")
        try:
            # Python 2 prints the encoded bytes (as the original script did);
            # Python 3 prints the text itself.
            print(title if str is bytes else text)
        except UnicodeEncodeError:
            # The console cannot display this title; it is still saved below.
            pass
        WriteFile("Result.txt", title)
# Search term that must appear in the result titles.
keyword = "58 City"


def BuildSearchUrl(word, page):
    """Return the Baidu ``intitle:`` search URL for ``word``.

    ``page`` is a 0-based page index; it is sent as ``pn = page * 100``
    alongside ``rn=100``.  ``word`` is percent-encoded so that spaces and
    non-ASCII characters produce a valid URL.
    """
    return ("http://www.baidu.com/s?wd=intitle:" + quote(word)
            + "&rn=100&pn=" + str(page * 100))


if __name__ == "__main__":
    start = time.time()
    for i in range(0, 8):
        html = GetHtml(BuildSearchUrl(keyword, i))
        FetchTitle(html)
        time.sleep(1)  # pause one second between requests
    c = time.time() - start
    print('runtime: %0.2f seconds' % (c))