多線程爬取房天下資料,並且儲存到mysql

來源:互聯網
上載者:User

標籤:[]   code   file   sea   inux   div   失敗   欄位   foo   

下面是原始碼,大神勿噴……

 

# -*- coding: utf-8 -*-

import requests,time,urllib.request,os,re,xlwt

import threading,random,threadpool

import pymongo,pymysql,logging

from multiprocessing import Process

from lxml import etree

from pymongo import MongoClient

import log

 

# Pool of desktop Chrome User-Agent strings; one is picked at random for each
# request. Line continuation is implicit inside the brackets.
user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]

# Sample Shenzhen listing-page URL. The literal was delimited by typographic
# quotes, which Python rejects; plain ASCII quotes restore a valid string.
# NOTE(review): no visible code reads this name - confirm it is still needed.
url = 'http://newhouse.sz.fang.com/house/s/b911/?ctm=1.sz.xf_search.page.9'

# Module-level xlwt workbook holding a single sheet named "Sheet Name".
# NOTE(review): the only uses of these objects in the visible code are
# commented out, so nothing is ever written to or saved from the workbook.
workbook = xlwt.Workbook()

sheet = workbook.add_sheet("Sheet Name")

 

#sheet.write(0, 2, ‘foobar‘)# row, column, value

#workbook.save("foobar.xls")

#links = re.findall(‘"((http|ftp)s?://.*?)"‘, str(html.text))#擷取網站所有url的Regex

#client = MongoClient(‘localhost‘,int(27017))#連結資料庫

 

class Ft(object):
    """Scrape new-house listings from fang.com and store them in MySQL.

    ``get_data`` downloads and parses one listing page and passes the
    extracted rows to ``save_mysql``.
    """

    def save_mysql(self, d_t):
        """Insert scraped listing rows into the MySQL table ``ftx``.

        Args:
            d_t: Iterable of indexable records. Items 0-4 of each record are
                converted with ``str()`` and stored, in order, in the columns
                ``link``, ``adr``, ``adress``, ``price`` and ``phone``.

        Every record is inserted exactly once. All records of one call share
        a single connection and are committed together, so nothing is
        committed if any insert raises.
        """
        rows = [tuple(str(record[k]) for k in range(5)) for record in d_t]
        if not rows:
            # Nothing was scraped: do not open a database connection at all.
            return

        # NOTE(review): host and credentials are hard-coded here; consider
        # moving them to configuration.
        conn = pymysql.connect(host='192.168.191.1', user='root', passwd='123456789', db='data', port=3306,
                               charset='utf8')
        try:
            with conn.cursor() as cur:
                # Placeholders let the driver escape the scraped text, so
                # quotes inside a value can neither break nor alter the SQL.
                cur.executemany(
                    'INSERT INTO ftx(link,adr,adress,price,phone) VALUES (%s,%s,%s,%s,%s)',
                    rows,
                )
            conn.commit()
        finally:
            # Release the connection even when an insert fails.
            conn.close()

    def get_data(self, url):
        """Download one Guangzhou listing page, parse it and store its rows.

        Args:
            url: Page identifier placed after ``b9`` in the listing URL; it is
                converted with ``str()``.

        Returns early, storing nothing, when the request times out or the
        response status is not 200.
        """
        page_url = 'http://newhouse.gz.fang.com/house/s/b9' + str(url) + '/?ctm=1.gz.xf_search.page.6'
        headers = {'User-Agent': random.choice(user_agent_list)}

        try:
            # requests never times out unless a timeout is given, so without
            # one the handler below could not run and a stalled server would
            # block the worker thread indefinitely.
            html = requests.get(page_url, headers=headers, timeout=10)
        except requests.exceptions.Timeout as e:
            # NOTE(review): `log.gg.kk` is kept from the original code; confirm
            # that the project-local `log` module really exposes it.
            log.gg.kk(e)
            # There is no response to parse.
            return

        html.encoding = 'gbk'
        if html.status_code != 200:
            print('下載失敗!!!')
            return
        log.kk('下載網頁資料成功')

        selector = etree.HTML(str(html.text))
        links = selector.xpath('//div[@class="nlc_img"]/a/@href')
        addr = [name.strip() for name in selector.xpath('//div[@class="nlcd_name"]/a/text()')]
        addrs = selector.xpath('//div[@class="address"]/a/@title')
        prices = selector.xpath('//div[@class="nhouse_price"]/span/text()')
        tels = selector.xpath('//div[@class="tel"]/p/text()')

        # zip() stops at the shortest list.
        # NOTE(review): if a listing lacks one of the fields, the columns of
        # the following rows no longer line up - verify against the live page.
        r = list(zip(links, addr, addrs, prices, tels))
        print(r)
        self.save_mysql(r)

    def save_data(self, get_dat):
        """Create a MongoDB client for ``localhost:27017``.

        Placeholder: ``get_dat`` is not used and nothing is written.
        """
        client = MongoClient('localhost', int(27017))  # connect to the MongoDB server

    def log(self):
        """Configure root logging to ``myapp.log`` and emit three sample messages.

        The log file is opened with mode ``'w'``, so it is truncated when the
        configuration takes effect.
        """
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                            datefmt='%a, %d %b %Y %H:%M:%S',
                            filename='myapp.log',
                            filemode='w')

        logging.debug('This is debug message')
        logging.info('This is info message')
        logging.warning('This is warning message')

 

if __name__ == "__main__":
    # Fetch listing pages 0 and 1 concurrently; each job parses one page and
    # stores its rows through Ft.get_data.
    dt = Ft()
    pages = range(2)

    # There is no point in starting more worker threads than there are pages.
    pool = threadpool.ThreadPool(min(50, len(pages)))

    # A plain loop instead of a list comprehension: the calls are made only
    # for their side effect and the resulting list was thrown away.
    for req in threadpool.makeRequests(dt.get_data, pages):
        pool.putRequest(req)

    # Block until every queued page has been processed.
    pool.wait()

 

下面附上建表代碼:

 

-- Table that receives the rows inserted by Ft.save_mysql.
-- The utf8 default character set is declared up front so that Chinese text
-- can be stored in every column without a later ALTER.
create table ftx(
id int not null auto_increment,
link varchar(100) not null,
adr varchar(100) not null,
adress varchar(100) not null,
price varchar(100) not null,
phone varchar(100) not null,
PRIMARY KEY (id)
) default charset=utf8;

-- Only needed for a table that was already created without the utf8 default:
-- change the character set of a single column.
alter table ftx modify column price varchar(100) character set utf8 not null;

 

值得注意的是:在插入資料的時候,記得要將相關欄位的字元集改成 utf8,否則會報錯;最好在建表時就指定該表的字元集為 utf8。

 


多線程爬取房天下資料,並且儲存到mysql

相關文章

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support, 6 Free Tickets per Quarter, Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.