標籤:[] code file sea inux div 失敗 欄位 foo
下面是原始碼,大神勿噴。。。。。。
# -*- coding: utf-8 -*-
import requests,time,urllib.request,os,re,xlwt
import threading,random,threadpool
import pymongo,pymysql,logging
from multiprocessing import Process
from lxml import etree
from pymongo import MongoClient
import log
# Pool of desktop Chrome User-Agent strings; one is picked at random for
# each outgoing request.
user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]
# Module-level sample URL and an xlwt workbook/sheet.
# NOTE(review): none of `url`, `workbook` or `sheet` is read by the code
# visible in this file (Ft.get_data builds its own URL) — confirm before
# removing.
url = 'http://newhouse.sz.fang.com/house/s/b911/?ctm=1.sz.xf_search.page.9'
workbook = xlwt.Workbook()
sheet = workbook.add_sheet("Sheet Name")
# Disabled xlwt example:
# sheet.write(0, 2, 'foobar')  # row, column, value
# workbook.save("foobar.xls")
# Disabled regex that extracts every URL found in the page:
# links = re.findall('"((http|ftp)s?://.*?)"', str(html.text))
# Disabled MongoDB connection:
# client = MongoClient('localhost', int(27017))
class Ft(object):
    """Download fang.com new-house listing pages and store them in MySQL.

    ``get_data`` fetches and parses one listing page and hands the rows to
    ``save_mysql``. ``save_data`` and ``log`` are auxiliary helpers that the
    visible entry point does not call.
    """

    # Seconds to wait for the listing page before giving up, so a stalled
    # connection cannot block a worker thread forever.
    REQUEST_TIMEOUT = 10

    def save_mysql(self, d_t):
        """Insert scraped listing rows into the ``ftx`` MySQL table.

        Args:
            d_t: Iterable of sequences with at least five items, ordered as
                (link, name, address, price, phone); they map to the columns
                ``link, adr, adress, price, phone``. Every field is passed
                through ``str`` before insertion.

        Raises:
            IndexError: If a row has fewer than five items.

        Each row is inserted exactly once, all rows share one connection and
        one transaction, and no connection is opened when ``d_t`` is empty.
        """
        rows = [tuple(str(row[k]) for k in range(5)) for row in d_t]
        if not rows:
            return
        # NOTE(review): host and credentials are hard-coded as in the
        # original; consider moving them to configuration.
        conn = pymysql.connect(host='192.168.191.1', user='root',
                               passwd='123456789', db='data', port=3306,
                               charset='utf8')
        try:
            with conn.cursor() as cur:
                # Driver-side parameter binding: scraped text containing
                # quotes can neither break nor inject into the statement.
                cur.executemany(
                    'INSERT INTO ftx(link,adr,adress,price,phone) '
                    'VALUES (%s,%s,%s,%s,%s)',
                    rows,
                )
            conn.commit()
        finally:
            # Release the connection even when the insert fails.
            conn.close()

    def get_data(self, url):
        """Fetch one listing page, parse it, print the rows and store them.

        Args:
            url: Value inserted after ``b9`` in the path of the
                ``newhouse.gz.fang.com`` listing URL (presumably a page
                number — confirm against the caller).

        Returns early, storing nothing, when the request fails or the
        response status is not 200.
        """
        headers = {'User-Agent': random.choice(user_agent_list)}
        page_url = ('http://newhouse.gz.fang.com/house/s/b9' + str(url)
                    + '/?ctm=1.gz.xf_search.page.6')
        try:
            html = requests.get(page_url, headers=headers,
                                timeout=self.REQUEST_TIMEOUT)
        except requests.exceptions.RequestException as e:
            # NOTE(review): the success path calls log.kk but this path
            # calls log.gg.kk; the `log` module is not visible here —
            # confirm that `log.gg` exists.
            log.gg.kk(e)
            return
        if html.status_code != 200:
            print('下載失敗!!!')
            return
        html.encoding = 'gbk'
        log.kk('下載網頁資料成功')

        selector = etree.HTML(html.text)
        links = selector.xpath('//div[@class="nlc_img"]/a/@href')
        names = [name.strip() for name in
                 selector.xpath('//div[@class="nlcd_name"]/a/text()')]
        addrs = selector.xpath('//div[@class="address"]/a/@title')
        prices = selector.xpath('//div[@class="nhouse_price"]/span/text()')
        tels = selector.xpath('//div[@class="tel"]/p/text()')
        # zip() stops at the shortest list, so a listing that lacks one of
        # the fields shifts the following rows out of alignment.
        r = list(zip(links, names, addrs, prices, tels))
        print(r)
        self.save_mysql(r)

    def save_data(self, get_dat):
        """Open and close a MongoDB connection; nothing is stored yet.

        Args:
            get_dat: Currently unused.
        """
        client = MongoClient('localhost', 27017)  # connect to MongoDB
        # Persistence is not implemented; close the client so it is not
        # leaked.
        client.close()

    def log(self):
        """Configure root logging to ``myapp.log`` and emit three samples.

        The file is opened with mode ``'w'``, so an existing log is
        overwritten.
        """
        logging.basicConfig(
            level=logging.DEBUG,
            format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
            datefmt='%a, %d %b %Y %H:%M:%S',
            filename='myapp.log',
            filemode='w')
        logging.debug('This is debug message')
        logging.info('This is info message')
        logging.warning('This is warning message')
if __name__ == "__main__":
    # Run Ft.get_data once for each value of range(2) on a pool of 50
    # worker threads, then block until every queued request has finished.
    scraper = Ft()
    workers = threadpool.ThreadPool(50)
    for job in threadpool.makeRequests(scraper.get_data, range(2)):
        workers.putRequest(job)
    workers.wait()
下面附上建表代碼:
-- Table that receives the scraped listings. The default character set is
-- declared as utf8 so that non-ASCII text can be inserted into every column.
create table ftx(
    id int not null auto_increment,
    link varchar(100) not null,
    adr varchar(100) not null,
    adress varchar(100) not null,
    price varchar(100) not null,
    phone varchar(100) not null,
    PRIMARY KEY (id)
) default charset=utf8;
-- For a table that was created without the utf8 default: change the
-- character set of a single column (shown here for price).
alter table ftx modify column price varchar(100) character set utf8 not null;
值得注意的是:插入資料時,相關欄位的字元集必須是 utf8,否則會報錯;最好在建表時就直接指定該表的字元集為 utf8。
多線程爬取房天下資料,並且儲存到mysql