標籤:基本配置 .exe exce 環境 sel l資料庫 usr 拼接 char
需求:
擷取西刺網代理ip資訊,包括ip地址、連接埠號碼、ip類型
那,如何解決這個問題?
分析頁面結構和url設計得知:
資料都在本頁面可以全部擷取,沒有單獨的詳情頁面
下一頁通過更改當前頁面最後url尾碼進行跳轉頁面,那我實現URL的拼接不就解決這個問題了
那,軟體的運行環境?
python3.5
scrapy
twisted
requests
pymysql
以上是第三方包,通過pip安裝
MySQL服務
其中db,user,password的值根據實際情況而定
#!/usr/bin/python3
"""Scrape proxy entries (IP, port, type) from xicidaili.com and store them in MySQL."""
__author__ = 'beimenchuixue'
__blog__ = 'http://www.cnblogs.com/2bjiujiu/'

import requests
import pymysql
from time import sleep
from random import randint, choice
from scrapy.selector import Selector
from twisted.enterprise import adbapi
from twisted.internet import defer, reactor

# Basic database configuration -- adjust to your own environment.
db_settings = {
    'host': 'localhost',
    'db': 'db_name',
    'user': 'user_name',
    'password': 'password',
    'charset': 'utf8',
    'use_unicode': True
}

# Connection pool used to run the inserts through twisted.
db_conn = adbapi.ConnectionPool('pymysql', **db_settings)


def go_sleep():
    """Block for a random amount of time to imitate a human visitor.

    Each round has a 50% chance of sleeping 0.1-0.6 s and trying again,
    so the total delay varies from call to call (and may be zero).
    """
    while randint(0, 1):
        sleep(choice([0.1, 0.2, 0.3, 0.4, 0.5, 0.6]))


def get_sql(ip, port, ip_type):
    """Build the upsert statement for one proxy entry.

    Args:
        ip: IP address text scraped from the page.
        port: Port text scraped from the page; converted with ``int()``.
        ip_type: Proxy type text scraped from the page.

    Returns:
        A ``(sql, params)`` tuple, or ``None`` when any field is missing/empty
        or the port cannot be converted to an integer.
    """
    if not (ip and port and ip_type):
        return None

    sql = ("insert into ip_server(ip, port, ip_type) value (%s, %s, %s) "
           "on DUPLICATE key update ip=values(ip), port=values(port), "
           "ip_type=values(ip_type)")
    try:
        params = (ip, int(port), ip_type)
    except (TypeError, ValueError) as e:
        print(e)
        return None
    return sql, params


def go_insert(cursor, sql, params):
    """Execute one insert on the cursor supplied by ``runInteraction``.

    Errors are printed rather than raised so a single bad row does not
    abort the rest of the run.
    """
    try:
        cursor.execute(sql, params)
    except Exception as e:
        print(e)


def get_ip(pages=50):
    """Scrape proxy entries page by page and queue them for insertion.

    Args:
        pages: Number of listing pages to fetch, starting at page 1.

    Returns:
        A list of the Deferreds returned by ``db_conn.runInteraction``, one
        per queued insert, so the caller can wait for them to finish.
    """
    # Request headers.
    headers = {
        'Referer': 'http://www.xicidaili.com/nn/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
    }
    # Column positions (1-based, for td:nth-child) named for readability.
    ip_index, port_index, type_index = 2, 3, 6

    pending = []
    # range() excludes its upper bound, hence pages + 1.
    for page in range(1, pages + 1):
        url = 'http://www.xicidaili.com/nn/{page}'.format(page=page)
        go_sleep()
        try:
            # A timeout keeps one stalled connection from hanging the run.
            response = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException as e:
            print(e)
            continue
        # Print the HTTP status code.
        print(response.status_code)

        # Parse the page.
        selectors = Selector(text=response.text)
        all_trs = selectors.css('#ip_list .odd')
        for tr in all_trs:
            ip = tr.css('td:nth-child(%s)::text' % ip_index).extract_first()
            port = tr.css('td:nth-child(%s)::text' % port_index).extract_first()
            ip_type = tr.css('td:nth-child(%s)::text' % type_index).extract_first()

            # get_sql returns None for an unusable row; check before
            # unpacking, otherwise the tuple assignment raises TypeError.
            statement = get_sql(ip, port, ip_type)
            if statement is None:
                # Stop processing this page at the first unusable row.
                break
            sql, params = statement
            try:
                # Queue the insert on the connection pool.
                pending.append(db_conn.runInteraction(go_insert, sql, params))
            except Exception as e:
                print(e)
    return pending


def _stop_when_done(pending):
    """Stop the reactor once every queued insert has fired its Deferred."""
    finished = defer.DeferredList(pending, consumeErrors=True)
    finished.addBoth(lambda _: reactor.stop())


if __name__ == '__main__':
    inserts = get_ip()
    # Wait for the queued inserts instead of stopping after a fixed delay.
    # Scheduled via callWhenRunning so that an empty list (DeferredList fires
    # immediately) does not call reactor.stop() before the reactor is running.
    reactor.callWhenRunning(_stop_when_done, inserts)
    reactor.run()
爬取西刺網代理ip,並把其存放mysql資料庫