This article uses the requests and BeautifulSoup third-party libraries to crawl short-term rental listings from the Xiaozhu short-rental site in Beijing. The code is adapted from the book "Learning Python Web Crawlers from Scratch".
The complete code is as follows:
from bs4 import BeautifulSoup
import requests
import time

# Request header that mimics a desktop browser so the site serves normal pages
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
}

def judgment_sex(class_name):
    # The landlord avatar's CSS class encodes gender; Tag.get("class")
    # returns a list, hence the comparison against a one-element list
    if class_name == ['member_ico1']:
        return 'female'
    else:
        return 'male'

def get_links(url):
    # Collect every listing's detail-page link from one search-result page
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('#page_list > ul > li > a')
    for link in links:
        href = link.get("href")
        get_info(href)

def get_info(url):
    # Scrape title, address, price, avatar, landlord name and gender
    # from a single detail page and append the record to a text file
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    titles = soup.select('div.pho_info > h4')
    addresses = soup.select('span.pr5')
    prices = soup.select('#pricePart > div.day_l > span')
    imgs = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > a > img')
    names = soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')
    sexes = soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > span')
    f = open('xiaozhu_data.txt', 'a+', encoding='utf-8')
    for title, address, price, img, name, sex in zip(titles, addresses, prices, imgs, names, sexes):
        data = {
            'title': title.get_text().strip(),
            'address': address.get_text().strip(),
            'price': price.get_text(),
            'img': img.get("src"),
            'name': name.get_text(),
            'sex': judgment_sex(sex.get("class"))
        }
        print(data, file=f)    # writes one dict literal per line
    f.close()

if __name__ == '__main__':
    urls = ['http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(number)
            for number in range(1, 50)]
    for single_url in urls:
        get_links(single_url)
        time.sleep(2)    # pause between pages to avoid hammering the server
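Because each record is written with print(data, file=f), the output file ends up with one Python dict literal per line. As a minimal sketch (not part of the original code), the saved records can be loaded back with ast.literal_eval for later analysis:

import ast

# Read xiaozhu_data.txt back into a list of dicts; each non-empty line
# is the repr of a dict of strings, which ast.literal_eval parses safely
records = []
with open('xiaozhu_data.txt', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line:                      # skip blank lines
            records.append(ast.literal_eval(line))

print(len(records), 'listings loaded')
if records:
    print(records[0]['price'])        # inspect one field of the first record

ast.literal_eval only evaluates Python literals, so it is a safer choice than eval for parsing this kind of file.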
Everyone is welcome to exchange ideas and study together.