# Function: House price survey
import copy
import datetime
import re
import socket
from urllib import parse, request

from bs4 import BeautifulSoup as BS
from multiprocessing import Pool

import xlsxwriter

starttime = datetime.datetime.now()

base_url = r'http://bj.fangjia.com/ershoufang/'

# Example of the nested dict the three crawl levels build:
# district -> plate -> subway line -> listing index URL.
test_search_dict = {'changping': {'huoying': {'line 13': 'http://bj.fangjia.com/ershoufang/--r-%E6%98%8C%E5%B9%B3|w-13%E5%8F%B7%E7%BA%BF|b-%E9%9C%8D%E8%90%A5'}}}

search_list = []  # accumulated listing index entries
tmp_list = []     # listing URL / key-path cache
layer = -1        # starting depth for the nested-dict walk

# Browser-like headers so the site serves normal pages.
headers = {
    'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  r'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
    'Referer': r'http://bj.fangjia.com/ershoufang/',
    'Host': r'bj.fangjia.com',
    'Connection': 'keep-alive',
}


def get_page(url, timeout=60):
    """Fetch *url* with the crawler headers and return the body decoded as UTF-8.

    Raises whatever urllib/socket raises on failure; callers wrap this in
    try/except and skip the entry on timeout.
    """
    socket.setdefaulttimeout(timeout)  # process-wide socket timeout
    req = request.Request(url, headers=headers)
    response = request.urlopen(req).read()
    page = response.decode('utf-8')
    return page
def get_search(page, key):
    """Extract search links from *page*.

    Finds every anchor whose href matches the regex *key* (and has an
    empty ``target`` attribute) and returns a dict mapping the link text
    to its href, e.g. {district name: district URL}.
    """
    soup = BS(str(page), 'lxml')
    anchors = soup.find_all(href=re.compile(key), target='')
    search_dict = {}
    for anchor in anchors:
        # Re-parse each matched fragment so .select()/.a work on it alone.
        item = BS(str(anchor), 'lxml')
        name = item.select('a')[0].get_text()
        search_dict[name] = item.a.attrs['href']
    return search_dict
def get_info_list(search_dict, layer, tmp_list, search_list):
    """Flatten the nested {district: {plate: {line: url}}} dict.

    Walks *search_dict* depth-first.  Every string leaf (a listing index
    URL) produces one entry ``[key1, key2, ..., url]`` appended to
    *search_list*.  *layer* tracks the current depth so *tmp_list* (the
    key-path cache) can be trimmed back when a subtree is finished.
    Returns *search_list*.
    """
    layer += 1  # descend one dictionary level
    for tmp_key in list(search_dict.keys()):
        tmp_list.append(tmp_key)  # current key joins the index path
        tmp_value = search_dict[tmp_key]
        if tmp_value == '':
            # Empty value: nothing below this key — back out of it.
            # NOTE: this must be tested BEFORE isinstance(..., str),
            # because '' is itself a str and would be treated as a URL.
            layer -= 2
            tmp_list = tmp_list[:layer]
        elif isinstance(tmp_value, str):
            tmp_list.append(tmp_value)                   # leaf: the URL
            search_list.append(copy.deepcopy(tmp_list))  # snapshot the full path
            tmp_list = tmp_list[:layer]                  # trim back to this level
        else:
            # Nested dict: recurse, then trim the path back to this level.
            get_info_list(tmp_value, layer, tmp_list, search_list)
            tmp_list = tmp_list[:layer]
    return search_list
def get_info_pn_list(search_list):
    """Expand each listing-index entry into one URL per result page.

    Each element of *search_list* is ``[district, plate, line, url]``.
    For every entry, fetch the index page, read the maximum page number,
    and emit ``[district, plate, line, page_url]`` for each page 1..max.
    Entries whose index page cannot be fetched are skipped.
    """
    fin_search_list = []
    for entry in search_list:
        print('>>> crawling %s' % entry[:3])
        search_url = entry[3]
        try:
            page = get_page(search_url)
        except Exception:
            print('get page timeout')
            continue
        soup = BS(page, 'lxml')
        # The maximum page count lives in a span of class "mr5".
        pn_num = soup.select('span[class="mr5"]')[0].get_text()
        rule = re.compile(r'\d+')
        max_pn = int(rule.findall(pn_num)[1])
        for pn in range(1, max_pn + 1):
            print('************************ crawling page %s ************************' % pn)
            # Insert the page marker |e-N| at the first '|' of the URL.
            pn_rule = re.compile('[|]')
            fin_url = pn_rule.sub(r'|e-%s|' % pn, search_url, 1)
            tmp_url_list = copy.deepcopy(entry[:3])
            tmp_url_list.append(fin_url)
            fin_search_list.append(tmp_url_list)
    return fin_search_list
def get_info(fin_search_list, process_i):
    """Scrape listing details for every page URL assigned to process *process_i*.

    Each element of *fin_search_list* is ``[district, plate, line, page_url]``.
    Returns a list of rows ``[district, plate, line, title, address, area,
    layout, floor, price, unit_price]``.  Pages that time out and listings
    whose fields cannot be parsed are skipped.
    """
    print('process %s start' % process_i)
    fin_info_list = []
    for entry in fin_search_list:
        url = entry[3]
        try:
            page = get_page(url)
        except Exception:
            print('get tag timeout')
            continue
        soup = BS(page, 'lxml')
        title_list = soup.select('a[class="h_name"]')
        address_list = soup.select('span[class="address"]')
        attr_list = soup.select('span[class="attribute"]')
        # select() cannot match class values containing spaces; use
        # find_all(attrs={...}) for the price element instead.
        price_list = soup.find_all(attrs={"class": "xq_aprice xq_esf_width"})
        for num in range(len(title_list)):
            try:
                title = title_list[num].attrs["title"]
                print(r'************************ fetching %s ************************' % title)
                address = re.sub(r'\n', '', address_list[num].get_text())
                attr_text = attr_list[num].get_text()
                price_text = price_list[num].get_text()
                area = re.search(r'\d+[\u4e00-\u9fa5]{2}', attr_text).group(0)
                layout = re.search(r'\d[^0-9]\d.', attr_text).group(0)
                floor = re.search(r'\d/\d', attr_text).group(0)
                price = re.search(r'\d+[\u4e00-\u9fa5]', price_text).group(0)
                unit_price = re.search(r'\d+[\u4e00-\u9fa5]/.', price_text).group(0)
            except (AttributeError, IndexError, KeyError):
                # Malformed listing: a regex failed to match or a field
                # was missing — skip just this listing.
                continue
            tag_tmp_list = copy.deepcopy(entry[:3])
            for tag in [title, address, area, layout, floor, price, unit_price]:
                tag_tmp_list.append(tag)
            fin_info_list.append(tag_tmp_list)
    print('process %s end' % process_i)
    return fin_info_list
def assignment_search_list(fin_search_list, project_num):
    """Split *fin_search_list* into chunks of *project_num* tasks.

    One chunk is handed to each worker process, so a smaller
    *project_num* means more (smaller) process tasks.
    """
    assignment_list = []
    fin_search_list_len = len(fin_search_list)
    for start in range(0, fin_search_list_len, project_num):
        end = start + project_num
        assignment_list.append(fin_search_list[start:end])  # list fragment
    return assignment_list
def save_excel(fin_info_list, file_name):
    """Write the scraped rows to ``%s.xls`` on the desktop.

    Row 1 is the header; each element of *fin_info_list* becomes one
    data row starting at row 2.
    """
    tag_name = ['area', 'plate', 'subway', 'title', 'position', 'square meter',
                'layout', 'floor', 'total price', 'unit price']
    # Saved to the desktop by default.
    book = xlsxwriter.Workbook(r'C:\Users\Administrator\Desktop\%s.xls' % file_name)
    tmp = book.add_worksheet()
    tmp.write_row('A1', tag_name)  # header row
    # Data rows are offset by one because the header occupies row 1.
    for row, content in enumerate(fin_info_list, start=2):
        tmp.write_row('A%s' % row, content)
    book.close()
if __name__ == '__main__':
    file_name = input(r'enter filename to save as: ')
    fin_save_list = []  # merged results from all worker processes

    # Level 1: districts.
    page = get_page(base_url)
    search_dict = get_search(page, 'r')
    for k in list(search_dict.keys()):
        print(r'************************ level 1 crawl: crawling "%s" ************************' % k)
        url = search_dict[k]
        second_page = get_page(url)
        # Level 2: plates within the district.
        second_search_dict = get_search(second_page, 'b')
        search_dict[k] = second_search_dict
        for s_k in list(second_search_dict.keys()):
            print(r'************************ level 2 crawl: crawling "%s" ************************' % s_k)
            third_page = get_page(second_search_dict[s_k])
            # Level 3: subway lines within the plate.
            third_search_dict = get_search(third_page, 'w')
            print('%s>%s' % (k, s_k))
            second_search_dict[s_k] = third_search_dict

    # Flatten the nested dict, then expand to per-page URLs.
    fin_info_list = get_info_list(search_dict, layer, tmp_list, search_list)
    fin_info_pn_list = get_info_pn_list(fin_info_list)

    # Fan the page URLs out over a process pool.
    p = Pool(4)
    assignment_list = assignment_search_list(fin_info_pn_list, 2)
    result = []  # async results, one per task chunk
    for i in range(len(assignment_list)):
        result.append(p.apply_async(get_info, args=(assignment_list[i], i)))
    p.close()
    p.join()
    for res in result:
        fin_save_list.extend(res.get())  # merge each process's rows

    save_excel(fin_save_list, file_name)
    endtime = datetime.datetime.now()
    time = (endtime - starttime).seconds
    print('Total: %s s' % time)