Requirement:
Scrape web proxy IP information, including the IP address, port number, and IP type.
So, how do we solve this problem?
Analyzing the page structure and URL design shows that:
All of the data is available on the list page itself; there is no separate detail page.
The next page is reached by changing the numeric suffix at the end of the current page's URL, so simply formatting the page number into the URL solves the pagination problem, as the short sketch below shows.
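For illustration, a minimal sketch of the pagination pattern (the base URL and the 50-page limit are taken from the full script further down):

for page in range(1, 50):
    # Each page is addressed by appending the page number to the base URL
    url = 'http://www.xicidaili.com/nn/{page}'.format(page=page)
    print(url)  # http://www.xicidaili.com/nn/1, .../nn/2, ...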
So, what is the runtime environment for this script?
python3.5
Scrapy
Twisted
Requests
PyMySQL
The above are third-party packages, installed via pip.
A running MySQL service
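The script below inserts into a table named ip_server, which must already exist. Here is a minimal sketch of creating it with PyMySQL; the column types and the unique key on ip are assumptions, chosen so that the on duplicate key update clause in the crawler has a key to act on:

import pymysql

# Hypothetical connection values; match them to your own db_settings
conn = pymysql.connect(host='localhost', db='db_name', user='user_name',
                       password='password', charset='utf8')
with conn.cursor() as cursor:
    # ip is unique so repeated crawls upsert instead of duplicating rows
    cursor.execute("""
        create table if not exists ip_server (
            ip      varchar(20) not null,
            port    int         not null,
            ip_type varchar(20) not null,
            unique key uk_ip (ip)
        )
    """)
conn.commit()
conn.close()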
#!/usr/bin/python3
__author__ = 'Beimenchuixue'
__blog__ = 'http://www.cnblogs.com/2bjiujiu/'

import requests
import pymysql
from time import sleep
from random import randint, choice
from scrapy.selector import Selector
from twisted.enterprise import adbapi
from twisted.internet import reactor

# Basic database configuration, adjust to your own environment
db_settings = {
    'host': 'localhost',
    'db': 'db_name',
    'user': 'user_name',
    'password': 'password',
    'charset': 'utf8',
    'use_unicode': True
}

# Synchronous alternative:
# conn = pymysql.connect(**db_settings)
# cursor = conn.cursor()

# Create a Twisted connection pool so inserts run asynchronously
db_conn = adbapi.ConnectionPool('pymysql', **db_settings)


def go_sleep():
    """Block for a random interval to simulate human access"""
    while randint(0, 1):
        sleep(choice([0.1, 0.2, 0.3, 0.4, 0.5, 0.6]))


def get_sql(ip, port, ip_type):
    """Build the SQL statement and its parameters, or return None on bad input"""
    if ip and port and ip_type:
        sql = """insert into ip_server(ip, port, ip_type)
                 values (%s, %s, %s)
                 on duplicate key update
                     ip=values(ip), port=values(port), ip_type=values(ip_type)"""
        try:
            params = (ip, int(port), ip_type)
        except Exception as e:
            print(e)
            return None
        return sql, params
    else:
        return None


def go_insert(cursor, sql, params):
    """Database insert operation, run inside the connection pool"""
    try:
        cursor.execute(sql, params)
    except Exception as e:
        print(e)


def get_ip():
    """Crawl IP information and store it in the database"""
    # Set request headers
    headers = {
        'Referer': 'http://www.xicidaili.com/nn/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/59.0.3071.115 Safari/537.36'
    }
    # Fetch 50 pages of data
    for page in range(1, 50):
        # Map column positions to names for readability
        ip_index, port_index, type_index = 2, 3, 6
        # Build the URL for this page
        url = 'http://www.xicidaili.com/nn/{page}'.format(page=page)
        go_sleep()
        response = requests.get(url, headers=headers)
        # Print the status code
        print(response.status_code)
        # Parse the page
        selectors = Selector(text=response.text)
        all_trs = selectors.css('#ip_list .odd')
        for tr in all_trs:
            ip = tr.css('td:nth-child(%s)::text' % ip_index).extract_first()
            port = tr.css('td:nth-child(%s)::text' % port_index).extract_first()
            ip_type = tr.css('td:nth-child(%s)::text' % type_index).extract_first()
            result = get_sql(ip, port, ip_type)
            if result:
                sql, params = result
                try:
                    # Synchronous alternative:
                    # cursor.execute(sql, params)
                    # conn.commit()
                    db_conn.runInteraction(go_insert, sql, params)
                except Exception as e:
                    print(e)
            else:
                break


if __name__ == '__main__':
    get_ip()
    # Give the queued Twisted SQL operations time to run, then stop the reactor
    reactor.callLater(4, reactor.stop)
    reactor.run()
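Once the reactor has stopped, you can check that the rows actually landed in MySQL. A minimal verification sketch, assuming the same connection values as above:

import pymysql

conn = pymysql.connect(host='localhost', db='db_name', user='user_name',
                       password='password', charset='utf8')
with conn.cursor() as cursor:
    # Read back a few of the proxies the crawler inserted
    cursor.execute('select ip, port, ip_type from ip_server limit 5')
    for ip, port, ip_type in cursor.fetchall():
        print(ip, port, ip_type)
conn.close()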
That completes the task: the script crawls network proxy IPs and stores them in a MySQL database.