標籤:技術 開發
前言:
苦逼的我從某某城市換到另一個稍微大點的某某城市,面臨的第一個問題就是買房,奮鬥10多年,又回到起點,廢話就不多說了,看看如何設計程式把某同城上的樓價資料抓取過來。
方案:方案思路很簡單,先把網頁內容擷取下來,通過一定規則對內容解析,儲存成想要的格式
痛點是對網頁的解析,是一個比較細緻的活,必須邊輸出,邊調試。
具體實現:
擷取網頁內容:
def get_page(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Sends browser-like headers (the target site rejects obvious bots).
    Undecodable bytes are dropped (errors='ignore') because the site
    occasionally serves mixed encodings.

    :param url: page URL to fetch.
    :return: decoded page text (str).
    :raises urllib.error.URLError: on connection failure or timeout.
    """
    headers = {
        'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      r'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
        'Referer': r'http://jn.58.com/ershoufang/',
        'Host': r'jn.58.com',
        'Connection': 'keep-alive',
    }
    req = request.Request(url, headers=headers)
    # Pass the timeout per request instead of socket.setdefaulttimeout(),
    # which mutates process-wide state and affects unrelated connections;
    # the with-block also guarantees the HTTP response is closed.
    with request.urlopen(req, timeout=60) as response:
        return response.read().decode('utf-8', 'ignore')
第二步解析網頁:解析時要注意無效內容的處理,不然跑起來會報錯,調試很麻煩
def get_58_house(url):
    """Parse one 58.com second-hand-house listing page into a DataFrame.

    Each <tr> of the ``#main`` table yields one row: listing title, link,
    neighbourhood, price details and feature tags.  Rows without a title
    link (ads, separators) are skipped.  Individual fields that fail to
    parse are left as empty strings so partial rows are still kept — the
    page layout varies and parsing is best effort.

    :param url: listing-page URL, e.g. http://jn.58.com/ershoufang/pn2
    :return: pandas.DataFrame with columns
             op_time, web, house_name, xq, xq1, price, per_price,
             room, m2, href, ts.
    """
    soup = BeautifulSoup(get_page(url), "lxml")
    table = soup.find(id="main")
    columns = ["op_time", "web", "house_name", "xq", "xq1", "price",
               "per_price", "room", "m2", "href", "ts"]
    rows = []
    for tr in table.find_all('tr'):
        # Title and link — mandatory; skip rows that lack them instead of
        # referencing possibly-unbound names later (bug in the original).
        try:
            title = tr.find("p", "bthead").find("a", "t")
            str_name = title.string.strip()
            str_link = title["href"]
        except (AttributeError, TypeError, KeyError):
            continue

        # Neighbourhood (小區): up to two location anchors per listing.
        str_xq1 = str_xq2 = ''
        locations = [a.string.strip() for a in tr.find_all("a", "a_xq1")
                     if a.string]
        if locations:
            str_xq1 = locations[0]
        if len(locations) > 1:
            str_xq2 = locations[1]

        # Feature tags (房產特色).
        str_ts = []
        try:
            str_ts = list(tr.find("div", "qj-listleft").stripped_strings)
        except AttributeError:
            pass

        # Price block, expected order: [total, per-m2, rooms, area].
        str_toal = str_per = str_room = str_m2 = ''
        try:
            parts = list(tr.find("div", "qj-listright btall").stripped_strings)
        except AttributeError:
            parts = []
        if len(parts) > 0:
            str_toal = parts[0]
        if len(parts) > 1:
            # Keep the first numeric match as a scalar instead of storing
            # the raw findall() list in the column.
            found = re.findall(r"(\d+\.*\d+)", parts[1])
            str_per = found[0] if found else ''
        if len(parts) > 2:
            str_room = parts[2]
        if len(parts) > 3:
            found = re.findall(r"(\d+\.*\d+)", parts[3])
            str_m2 = found[0] if found else ''

        rows.append({'web': '58同城', 'house_name': str_name,
                     'xq': str_xq1, 'xq1': str_xq2, 'price': str_toal,
                     'per_price': str_per, 'room': str_room, 'm2': str_m2,
                     'ts': ''.join(str_ts), 'href': str_link})

    # Build the frame once from the collected rows: DataFrame.append was
    # removed in pandas 2.0 and was O(n^2) anyway.
    df = pd.DataFrame(rows, columns=columns)
    # Stamp every row with the crawl date.
    df["op_time"] = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    return df
第三步迴圈處理每頁資料並儲存資料:
def _timestamp():
    """Current local time formatted as 'YYYY-MM-DD HH:MM:SS' for log lines."""
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))


def _crawl_page(cnx, url):
    """Fetch one listing page and append it to the `house` table.

    Best effort: on any failure the parsed data (if any) is dumped to
    record.csv, the traceback is logged, and the crawl continues.
    """
    writelog(_timestamp() + ' Start:' + url)
    df = None
    try:
        df = get_58_house(url)
        df.to_sql('house', cnx, if_exists='append')
    except Exception:
        # NOTE(review): with an Oracle instance whose character set is GBK
        # this raises UnicodeEncodeError on e.g. U+00B2 (superscript two,
        # "平方米"); switch the database to UTF8
        # (NLS_LANG=AMERICAN_AMERICA.AL32UTF8, NLS_CHARACTERSET=UTF8) —
        # confirm on the target database.
        writelog(_timestamp() + ' Except:' + url)
        if df is not None:
            # Guard: the original wrote a possibly stale/unbound df1 here
            # when get_58_house itself failed.
            df.to_csv('record.csv', sep=',', encoding='utf-8')
        writelog(traceback.format_exc())
    writelog(_timestamp() + ' End:' + url)


def get_58_house_all():
    """Crawl all 70 pages of jn.58.com/ershoufang and store them in Oracle.

    Pages are fetched sequentially with a 20-second pause between
    requests to avoid hammering the site.  Per-page failures are logged
    and skipped; the database connection is always closed.
    """
    engine = create_engine('oracle+cx_oracle://user:[email protected]/orcl')
    cnx = engine.connect()
    try:
        # Optionally purge today's rows first (kept disabled, as in the
        # original version):
        # cnx.execute("delete from house where op_time='{}'".format(
        #     time.strftime('%Y-%m-%d', time.localtime(time.time()))))
        base_url = "http://jn.58.com/ershoufang/"
        _crawl_page(cnx, base_url)            # first page has no pn suffix
        time.sleep(20)
        for page_no in range(2, 71):          # remaining 69 pages
            _crawl_page(cnx, base_url + "pn" + str(page_no))
            time.sleep(20)
    finally:
        # Close the connection even if a crawl raises (the original leaked
        # it on any escaping exception).
        cnx.close()
跑跑看看是不是程式一切運行正常。
Python開發網路爬蟲抓取某同城樓價資訊