Multi-threaded crawler for Fang.com (房天下) new-house listing data, stored to MySQL

Source: Internet
Author: User
Tags mongoclient xpath

Here is the source code — experienced developers, please go easy on the criticism ...

# -*- coding: utf-8 -*-
"""Multi-threaded crawler for Fang.com new-house listings, persisted to MySQL."""

import requests, time, urllib.request, os, re, xlwt
import threading, random, threadpool
import pymongo, pymysql, logging
from multiprocessing import Process
from lxml import etree
from pymongo import MongoClient
import log

# Pool of desktop-browser User-Agent strings; get_data() picks one at random
# per request so the crawler looks less like a bot.
# NOTE(review): casing/spacing reconstructed from a scraped copy — the exact
# original strings may have differed slightly, but any plausible UA works here.
user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]

# Shenzhen new-house search entry URL (page 11); get_data() builds its own
# per-page URLs, so this module-level value is effectively unused.
url = 'http://newhouse.sz.fang.com/house/s/b911/?ctm=1.sz.xf_search.page.9'

# Excel workbook scaffolding — created but never written/saved in this script.
workbook = xlwt.Workbook()
sheet = workbook.add_sheet("Sheet Name")
# sheet.write(0, 2, 'foobar')  # row, column, value
# workbook.save("foobar.xls")

# links = re.findall('"((http|ftp)s?://.*?)"', str(html.text))  # regex to grab every URL on a page
# client = MongoClient('localhost', int(27017))  # connect to MongoDB

class Ft(object):
    """Crawls Fang.com new-house listing pages and persists rows to MySQL."""

    # def __init__(self, url):
    #     self.url = url

    def save_mysql(self, d_t):
        """Insert listing tuples into the MySQL table ``ftx``.

        d_t: iterable of (link, adr, adress, price, phone) tuples, as produced
        by get_data().

        Fixes vs. the original: the spurious inner loop that re-inserted each
        row len(row) times is removed; the connection is opened once instead of
        per row; the hand-formatted (and injection-prone) SQL string is replaced
        by a parameterized query; the pointless fetchall() after INSERT is gone.
        """
        # NOTE(review): credentials/host are hard-coded; move to config in real use.
        conn = pymysql.connect(host='192.168.191.1', user='root', passwd='123456789',
                               db='data', port=3306, charset='utf8')
        try:
            cur = conn.cursor()
            sql = "INSERT INTO ftx (link, adr, adress, price, phone) VALUES (%s, %s, %s, %s, %s)"
            for i in d_t:
                # Coerce every field to str — xpath results may be lxml smart strings.
                row = (str(i[0]), str(i[1]), str(i[2]), str(i[3]), str(i[4]))
                cur.execute(sql, row)
            cur.close()
            conn.commit()
        finally:
            conn.close()

    def get_data(self, url):
        """Download one listing page (page index *url*) and persist its rows.

        Builds the page URL, fetches it with a random User-Agent, extracts
        link / name / address / price / phone via XPath, then hands the
        zipped rows to save_mysql().
        """
        headers = {}
        addr = []
        # Guangzhou and Shenzhen page URLs; only url_2 (Guangzhou) is fetched.
        url_2 = 'http://newhouse.gz.fang.com/house/s/b9' + str(url) + '/?ctm=1.gz.xf_search.page.6'
        url_1 = 'http://newhouse.sz.fang.com/house/s/b9' + str(url) + '/?ctm=1.sz.xf_search.page.9'
        headers['user-agent'] = random.choice(user_agent_list)
        try:
            html = requests.get(url_2, headers=headers)
            # The site serves GBK-encoded pages.
            html.encoding = 'gbk'
            if html.status_code == 200:
                log.kk('download Web data Success')
            else:
                print('Download failed!!!')
        except requests.exceptions.ReadTimeout as e:
            log.gg.kk(e)
        selector = etree.HTML(str(html.text))
        # NOTE(review): class names reconstructed in lowercase — the scraped copy
        # title-cased some of them, and XPath attribute matching is case-sensitive.
        links = selector.xpath('//div[@class="nlc_img"]/a/@href')
        addrnames = selector.xpath('//div[@class="nlcd_name"]/a/text()')
        for i in addrnames:
            addr.append(i.strip())
        addrs = selector.xpath('//div[@class="address"]/a/@title')
        prices = selector.xpath('//div[@class="nhouse_price"]/span/text()')
        tels = selector.xpath('//div[@class="tel"]/p/text()')
        # zip truncates to the shortest list, so ragged extractions still pair up.
        r = list(zip(links, addr, addrs, prices, tels))
        print(r)
        self.save_mysql(r)

    def save_data(self, get_dat):
        """Placeholder: connects to local MongoDB but never writes anything."""
        client = MongoClient('localhost', int(27017))  # connect to MongoDB

    def log(self):
        """Configure file logging (myapp.log, truncated each run) and emit samples."""
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                            datefmt='%a, %d %b %Y %H:%M:%S',
                            filename='myapp.log',
                            filemode='w')
        logging.debug('This is Debug message')
        logging.info('This is Info message')
        logging.warning('This is warning message')

if __name__ == "__main__":
    # Fan the page indices 0..1 out over a 50-worker thread pool; each worker
    # fetches one listing page and writes its rows to MySQL.
    dt = Ft()
    gd = dt.get_data
    pool = threadpool.ThreadPool(50)
    reqs = threadpool.makeRequests(gd, range(2))
    [pool.putRequest(req) for req in reqs]
    pool.wait()

The following table code is attached:

-- Table for the crawled listings. The column lengths were destroyed in the
-- scraped copy ("varchar (+)"); 255 is a safe reconstruction -- adjust to taste.
CREATE TABLE ftx (
    id INT NOT NULL AUTO_INCREMENT,
    link VARCHAR(255) NOT NULL,
    adr VARCHAR(255) NOT NULL,
    adress VARCHAR(255) NOT NULL,
    price VARCHAR(255) NOT NULL,
    phone VARCHAR(255) NOT NULL,
    PRIMARY KEY (id)
);

-- Change the column's character set to utf8 (original note: 修改字段的字符集)
ALTER TABLE ftx MODIFY COLUMN price VARCHAR(255) CHARACTER SET utf8 NOT NULL;

Note: when inserting data, make sure the character set of the relevant columns is UTF-8, otherwise the insert will fail with an encoding error. Better still, declare UTF-8 as the table's character set when creating the table in the first place.

Multi-threaded crawler for Fang.com (房天下) new-house listing data, stored to MySQL

Related Article

Contact Us

The content on this page is sourced from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If the content of this page is confusing, please write us an email and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.