Straight to the code. Note that without adding a time interval between requests, the crawler gets blocked after scraping a few pages.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import re

# Fetch the detail information for a single listing page
def get_detail(url):
    web_data = requests.get(url)
    soup = BeautifulSoup(web_data.text, 'lxml')
    # Get the title; select() returns a list, and [0] picks the first match, which is the one we want
    titles = soup.select('div.con_l > div.pho_info > h4')[0].text
    # Get the address
    addr = soup.select('div.pho_info > p > span')[0].text
    # Get the daily rent
    day_cost = soup.select('#pricePart > div.day_l > span')[0].text
    # Listing picture
    imgs = soup.select('img[id="curBigImage"]')[0].get('src')
    # Landlord avatar
    load_imgs = soup.select('div.member_pic > a > img')[0].get('src')
    # Landlord nickname
    load_names = soup.select('div.w_240 > h6 > a')[0].text
    # Landlord gender, taken from the gender icon's CSS class
    load_sexs = soup.select('div.member_pic > div')[0].get('class')[0]
    # Decide whether the landlord is male or female
    if load_sexs == 'member_ico':
        load_sexs = 'male'
    else:
        load_sexs = 'female'
    datas = [titles, addr, day_cost, imgs, load_imgs, load_names, load_sexs]
    data = {
        'title': datas[0],
        'addr': datas[1],
        'day_cost': datas[2],
        'img': datas[3],
        'load_img': datas[4],
        'load_name': datas[5],
        'load_sexs': datas[6],
    }
    print(data)

# Scrape the 24 listing links on one search-results page
def get_link(url):
    # url = 'http://sh.xiaozhu.com/search-duanzufang-0/?startDate=2017-11-30&endDate=2017-12-01'
    web_datas = requests.get(url)
    soup = BeautifulSoup(web_datas.text, 'lxml')
    # Collect the listing links and fetch each detail page
    for i in range(24):
        link = soup.find_all(href=re.compile(r'http://sh.xiaozhu.com/fangzi/.'))[i].get('href')
        get_detail(link)

get_link('http://sh.xiaozhu.com/search-duanzufang-0/?startDate=2017-11-30&endDate=2017-12-01')
for i in range(2, 10):
    get_link('http://sh.xiaozhu.com/search-duanzufang-p%d-0/?startDate=2017-11-30&endDate=2017-12-01' % i)
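As noted at the top, firing requests back to back gets the crawler blocked after a few pages. A minimal mitigation is to sleep between detail requests. Below is a sketch; the function name get_link_throttled and the 2 to 4 second pause are my own assumptions, not something the original specifies:

import time
import random

def get_link_throttled(url):
    # Same as get_link above, but pauses between detail requests
    # so the site is less likely to block the crawler.
    web_datas = requests.get(url)
    soup = BeautifulSoup(web_datas.text, 'lxml')
    for a in soup.find_all(href=re.compile(r'http://sh.xiaozhu.com/fangzi/.')):
        get_detail(a.get('href'))
        # Assumed delay; tune it to whatever the site tolerates
        time.sleep(random.uniform(2, 4))

A randomized pause is usually preferable to a fixed one, since perfectly regular request timing is itself a signal that traffic is automated.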
Process