Here is the source code — experts, please go easy on me ...
# -*- coding: utf-8 -*-
import logging
import os
import random
import re
import threading
import time
import urllib.request
from multiprocessing import Process

import pymongo
import pymysql
import requests
import threadpool
import xlwt
from lxml import etree
from pymongo import MongoClient

import log
# Pool of desktop Chrome User-Agent strings; one is picked at random for each
# request so that consecutive downloads do not all present the same client.
# The name is lower-case because that is how the scraper code refers to it.
user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]
# Sample Shenzhen listing page (not referenced by the scraper class itself).
url = 'http://newhouse.sz.fang.com/house/s/b911/?ctm=1.sz.xf_search.page.9'

# Excel workbook with a single sheet, created at import time via xlwt.
workbook = xlwt.Workbook()
sheet = workbook.add_sheet("Sheet Name")
# Usage reminders for the workbook:
#   sheet.write(0, 2, 'foobar')   # row, column, value
#   workbook.save("foobar.xls")
# Regular expression that extracts every URL found in a downloaded page:
#   links = re.findall('"((http|ftp)s?://.*?)"', str(html.text))
# Connecting to the local MongoDB database:
#   client = MongoClient('localhost', int(27017))
class Ft(object):
    """Scrape new-house listings from fang.com and store them in MySQL.

    ``get_data`` downloads and parses one listing page and passes the
    resulting rows to ``save_mysql``. ``save_data`` and ``log`` are small
    auxiliary helpers that ``get_data`` does not call.
    """

    # Parameterized statement: the driver escapes the values, so listing
    # text that contains quotes can no longer break the query.
    _INSERT_SQL = (
        "INSERT INTO FTX (link, adr, adress, price, phone) "
        "VALUES (%s, %s, %s, %s, %s)"
    )

    @staticmethod
    def _page_url(page):
        """Return the Guangzhou listing URL for page number ``page``."""
        # The Shenzhen listing uses the same path on newhouse.sz.fang.com
        # with the query string ``?ctm=1.sz.xf_search.page.9``.
        return ('http://newhouse.gz.fang.com/house/s/b9' + str(page)
                + '/?ctm=1.gz.xf_search.page.6')

    def save_mysql(self, d_t):
        """Insert scraped rows into the MySQL table ``FTX``.

        Args:
            d_t: Iterable of rows. Each row is an indexable sequence whose
                items 0-4 are stored, in order, in the columns ``link``,
                ``adr``, ``adress``, ``price`` and ``phone``.

        Raises:
            IndexError: If a row has fewer than five items.

        All rows of one call share a single connection and are committed
        together; if an insert fails, nothing from that call is committed.
        """
        rows = [tuple(str(row[k]) for k in range(5)) for row in d_t]
        if not rows:
            return  # nothing to store, so do not open a connection at all
        conn = pymysql.connect(host='192.168.191.1', user='root',
                               passwd='123456789', db='data', port=3306,
                               charset='utf8')
        try:
            with conn.cursor() as cur:
                cur.executemany(self._INSERT_SQL, rows)
            conn.commit()
        finally:
            conn.close()  # release the connection even when an insert fails

    def get_data(self, url, timeout=None):
        """Download one listing page, parse it, print and store the rows.

        Args:
            url: Page number; it is spliced into the listing URL right
                after ``b9``.
            timeout: Value forwarded to ``requests.get``. The default
                ``None`` sets no timeout, which matches a call made
                without the argument.

        Returns:
            None. When the download times out, the error is logged and
            nothing is parsed or stored.
        """
        headers = {'user-agent': random.choice(user_agent_list)}
        try:
            html = requests.get(self._page_url(url), headers=headers,
                                timeout=timeout)
        except requests.exceptions.ReadTimeout as e:
            # NOTE(review): `log` is the project-local module imported at
            # file level; the attribute path is reproduced from the source
            # as written - confirm that `log.gg.kk` exists.
            log.gg.kk(e)
            return  # there is no response object to parse
        html.encoding = 'GBK'
        if html.status_code == 200:
            log.kk('download Web data Success')
        else:
            print('Download failed!!!')

        selector = etree.HTML(str(html.text))
        links = selector.xpath('//div[@class="nlc_img"]/a/@href')
        names = [name.strip() for name in
                 selector.xpath('//div[@class="nlcd_name"]/a/text()')]
        addrs = selector.xpath('//div[@class="address"]/a/@title')
        prices = selector.xpath('//div[@class="nhouse_price"]/span/text()')
        tels = selector.xpath('//div[@class="tel"]/p/text()')
        # zip() stops at the shortest list, so the number of rows equals the
        # number of matches of the least frequent field.
        r = list(zip(links, names, addrs, prices, tels))
        print(r)
        self.save_mysql(r)

    def save_data(self, get_dat):
        """Open and close a connection to the local MongoDB server.

        Args:
            get_dat: Currently unused.
        """
        client = MongoClient('localhost', 27017)
        # TODO(review): nothing is written yet; `get_dat` is ignored.
        client.close()

    def log(self):
        """Configure root logging to a file and emit three sample records."""
        logging.basicConfig(
            level=logging.DEBUG,
            format='%(asctime)s %(filename)s[line:%(lineno)d] '
                   '%(levelname)s %(message)s',
            datefmt='%a, %d %b %Y %H:%M:%S',
            filename='Myapp.log',
            filemode='w',  # truncate the log file on every run
        )
        logging.debug('This is Debug message')
        logging.info('This is Info message')
        logging.warning('This is warning message')
if __name__ == "__main__":
    # Fan the page numbers out over a thread pool; each work request calls
    # Ft.get_data with one page number.
    dt = Ft()
    gd = dt.get_data
    pool = threadpool.ThreadPool(50)
    reqs = threadpool.makeRequests(gd, range(2))
    for req in reqs:
        pool.putRequest(req)
    pool.wait()  # block until every queued request has been processed
The table-creation SQL is attached below:
-- Table that receives the scraped rows (columns match the INSERT statement).
-- NOTE(review): the column lengths were lost in the source ("varchar (+)");
-- 255 is an assumed placeholder - confirm the intended size.
CREATE TABLE FTX (
    id INT NOT NULL AUTO_INCREMENT,
    link VARCHAR(255) NOT NULL,
    adr VARCHAR(255) NOT NULL,
    adress VARCHAR(255) NOT NULL,
    price VARCHAR(255) NOT NULL,
    phone VARCHAR(255) NOT NULL,
    PRIMARY KEY (id)
);
-- Change the character set of a column.
ALTER TABLE FTX MODIFY COLUMN price VARCHAR(255) CHARACTER SET utf8 NOT NULL;
Note: before inserting data, make sure the character set of the relevant columns is UTF8, otherwise the insert will fail with an error. It is best to specify UTF8 as the table's character set when the table is first created.
Multi-threaded crawling of Fang.com (房天下) listing data, stored in MySQL