Objective:
Hard times forced me to move from XX city to another, slightly bigger city, where the first problem is buying a house: after struggling for more than ten years, I am back at the starting point. Enough rambling; let's look at how to design a program that crawls the city's house price data.
Solution: The idea is simple: first fetch the content of the web page, then parse that content according to a set of rules, and save the result in the desired format.
The difficulty is parsing the web page. It is painstaking work, and you have to print output and debug as you go.
Specific implementation:
The first step is to fetch the web page content:
from urllib import request
import socket

def get_page(url):
    headers = {
        'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      r'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
        'Referer': r'http://jn.58.com/ershoufang/',
        'Host': r'jn.58.com',
        'Connection': 'keep-alive'
    }
    timeout = 60  # the post leaves the value blank; 60 seconds is an assumption
    socket.setdefaulttimeout(timeout)  # set the global socket timeout
    req = request.Request(url, headers=headers)
    response = request.urlopen(req).read()
    page = response.decode('utf-8', 'ignore')
    return page
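A quick sanity check of the fetcher (a minimal sketch, using the listing URL from the post):

html = get_page('http://jn.58.com/ershoufang/')
print(html[:500])  # show the first 500 characters to confirm the download worked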
The second step is to parse the page. The parsing must handle invalid or missing content, or the run will throw errors, which is very troublesome to debug.
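The pattern used below is to wrap every optional field in its own try/except, so one malformed listing cannot abort the whole page. A minimal illustration with a hypothetical HTML fragment:

from bs4 import BeautifulSoup

# A hypothetical listing fragment whose title link <a class="t"> is missing.
snippet = '<div class="listing"><p class="bthead"></p></div>'
row = BeautifulSoup(snippet, 'lxml').find('div', 'listing')
try:
    name = row.find('p', 'bthead').find('a', 't').string.strip()
except AttributeError:
    name = ''  # tolerate the missing tag instead of crashing
print(repr(name))  # -> ''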
import re
import time
import traceback

import pandas as pd
from bs4 import BeautifulSoup

def get_58_house(url):
    html = get_page(url)
    soup = BeautifulSoup(html, "lxml")
    table = soup.find(id="main")
    df = pd.DataFrame(columns=["op_time", "web", "house_name", "xq", "xq1",
                               "price", "per_price", "room", "m2", "href", "ts"])
    for tr in table.find_all('tr'):
        try:
            str_name = tr.find("p", "bthead").find("a", "t").string.strip()
            str_link = tr.find("p", "bthead").find("a", "t")["href"]
            ## location of the residential complex
            str_xq = list()
            str_xq1 = ''
            str_xq2 = ''
            try:
                for s in tr.find_all("a", "a_xq1"):
                    str_xq.append(s.string.strip())
                str_xq1 = str_xq[0]
                str_xq2 = str_xq[1]
            except:
                pass
            ## selling points of the property
            str_ts = list()
            try:
                for s in tr.find("div", "qj-listleft").stripped_strings:
                    str_ts.append(s)
            except:
                pass
            ## price information
            str_price = list()
            str_total = ''
            str_per = ''
            str_room = ''
            str_m2 = ''
            try:
                for s in tr.find("div", "qj-listright btall").stripped_strings:
                    str_price.append(s)
                str_total = str_price[0]
                str_per = re.findall(r"(\d+\.*\d+)", str_price[1])
                str_room = str_price[2]
                str_m2 = re.findall(r"(\d+\.*\d+)", str_price[3])
            except:
                pass
        except Exception as e:
            print('Exception', ":", e)
        try:
            row = {'web': '58同城', 'house_name': str_name, 'xq': str_xq1,
                   'xq1': str_xq2, 'price': str_total, 'per_price': str_per,
                   'room': str_room, 'm2': str_m2,
                   'ts': ' '.join(str_ts), 'href': str_link}
            newrow = pd.DataFrame(data=row, index=["0"])
            # on pandas >= 2.0 use pd.concat; DataFrame.append has been removed there
            df = df.append(newrow, ignore_index=True)
        except Exception as e:
            print('Exception', ":", e)
            f = open("log.txt", 'a')
            traceback.print_exc(file=f)
            f.write(str(row))
            f.flush()
            f.close()
    df["op_time"] = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    return df
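Before wiring up the full loop, one page can be parsed and inspected on its own (a minimal sketch; the output file name is illustrative):

df = get_58_house('http://jn.58.com/ershoufang/')
print(df.head())                                   # eyeball a few parsed rows
df.to_csv('page1.csv', sep=',', encoding='utf-8')  # or dump them for a closer look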
The third step is to loop through every page, parse it, and save the data:
from sqlalchemy import create_engine

def get_58_house_all():
    ## open the database connection
    engine = create_engine('oracle+cx_oracle://user:[email protected]/orcl')
    cnx = engine.connect()
    ## clear today's data first (kept disabled here)
    '''
    strsql = 'delete from house where op_time=\'{}\''.format(
        time.strftime('%Y-%m-%d', time.localtime(time.time())))
    cnx.execute(strsql)
    '''
    ## fetch the house data from the first page
    str_http = "http://jn.58.com/ershoufang/"
    writelog(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) + ' start: ' + str_http)
    df1 = get_58_house(str_http)
    try:
        df1.to_sql('house', cnx, if_exists='append')
    except Exception as e:
        '''Log the exception.
        This example uses an Oracle database whose default encoding is GBK, and
        special characters break the save. The error below prompted adjusting the
        Oracle character set to UTF8:
            NLS_LANG: AMERICAN_AMERICA.AL32UTF8
            NLS_CHARACTERSET: UTF8
            NLS_NCHAR_CHARACTERSET: UTF8
        The error message was:
            UnicodeEncodeError: 'gbk' codec can't encode character '\xb2' in
            position 13: illegal multibyte sequence
        The offending character is the superscript 2 in ㎡ (square metre).
        '''
        writelog(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) + ' except: ' + str_http)
        df1.to_csv('record.csv', sep=',', encoding='utf-8')
        writelog(traceback.format_exc())
    writelog(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) + ' end: ' + str_http)
    time.sleep(20)
    ## fetch the remaining 69 pages of house data
    for i in range(2, 70 + 1):
        try:
            str_http = "http://jn.58.com/ershoufang/pn" + str(i)
            writelog(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) + ' start: ' + str_http)
            df1 = get_58_house(str_http)
            df1.to_sql('house', cnx, if_exists='append')
        except Exception as e:
            ##writelog('{}: {}'.format('save to database exception', e))
            writelog(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) + ' except: ' + str_http)
            df1.to_csv('record.csv', sep=',', encoding='utf-8')
            writelog(traceback.format_exc())
        writelog(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) + ' end: ' + str_http)
        time.sleep(20)
    ## close the database connection
    cnx.close()
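The character-set fix described in the except branch above can also be applied from the Python side by setting NLS_LANG in the environment before cx_Oracle opens its first connection; a minimal sketch, reusing the placeholder connection string from the post:

import os

# NLS_LANG must be set before cx_Oracle opens its first connection.
os.environ['NLS_LANG'] = 'AMERICAN_AMERICA.AL32UTF8'

from sqlalchemy import create_engine
engine = create_engine('oracle+cx_oracle://user:[email protected]/orcl')  # placeholder credentials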
Run the program and check that it works normally.
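Note that the script calls a writelog helper that the post never defines; a minimal sketch, assuming a plain append-to-file logger (the file name is an assumption):

def writelog(msg, logfile='crawler.log'):
    # Append one message per line; 'crawler.log' is an assumed file name.
    with open(logfile, 'a', encoding='utf-8') as f:
        f.write(msg + '\n')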