A year ago I suddenly had an inspiration: I wanted to build a powerful web search engine. My undergraduate software engineering program leaned toward embedded development, though, so my web skills were weak: I didn't know JSP, didn't understand HTML, and hadn't touched SQL in ages. Still, with the stubborn energy of youth, I simply set out to learn everything I had skipped, and now the web feels like no big deal. Enough rambling. Before reading this article, you can take a look at what I built: Quzhuanpan (去转盘网, www.quzhuanpan.com)
OK Sousou: www.oksousou.com (this one is a magnet-link search engine, shown here just in passing)
Anyway, I wanted to crawl the Baidu network disk, and Baidu, as you know, made its name with crawlers, so its anti-crawler defenses are formidable. Crawling from my own computer made it worse: after a few days Baidu blocked my machine and the crawler stopped getting anything back. Searching around online, I found the problem could be solved by going through proxies, so I went off to crawl proxy IPs. This is the site I crawled them from:
http://www.xicidaili.com/. After a while it seemed to start fighting back too, so I turned my claws toward http://www.kuaidaili.com.
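The fix boils down to one trick: register a proxy with urllib2 so that every subsequent request is routed through it. Here is a minimal sketch of just that mechanism before the full crawler below; the proxy address is only a placeholder (the same throwaway value the full script starts with):

    # coding: utf-8
    # minimal sketch: route all urllib2 traffic through one proxy
    import urllib2

    # placeholder address; in practice this comes from the proxy table the crawler fills
    proxy_support = urllib2.ProxyHandler({'http': 'http://118.99.66.106:8080'})
    opener = urllib2.build_opener(proxy_support)
    urllib2.install_opener(opener)  # from here on, urllib2.urlopen() goes through the proxy

    # quick smoke test: fetch a page through the proxy (fails if the placeholder proxy is dead)
    print urllib2.urlopen('http://www.baidu.com', None, 5).read()[:200]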
Presumably most people reading this post are programmers, so here is the full code (I've added comments; rest assured, this version of the crawler takes http://www.kuaidaili.com/ as its target):
# coding: utf-8
import json
import sys
import urllib, urllib2
import datetime
import time
reload(sys)
sys.setdefaultencoding('utf-8')
from Queue import Queue
from bs4 import BeautifulSoup
import MySQLdb as mdb
db_host = '127.0.0.1'
db_user = 'root'
db_pass = 'root'
id = 0
st = 1000
uk = '3758096603'
classify = "inha"
proxy = {u'https': u'118.99.66.106:8080'}
class ProxyServer:

    def __init__(self):  # database initialization; I'm using MySQL
        self.dbconn = mdb.connect(db_host, db_user, db_pass, 'ebook', charset='utf8')
        self.dbconn.autocommit(False)
        self.next_proxy_set = set()
        self.chance = 0
        self.fail = 0
        self.count_errno = 0
        self.dbcurr = self.dbconn.cursor()
        self.dbcurr.execute('SET NAMES utf8')
    def get_prxy(self, num):  # this function crawls the proxy lists
        while num > 0:
            global proxy, id, uk, classify, st
            count = 0
            for page in range(1, 718):  # total number of pages on the proxy site; I set it to 718
                if self.chance > 0:
                    # fight fire with fire: when the target site starts blocking me, I hide behind
                    # a proxy I crawled from it earlier; self.chance marks when to switch proxies
                    if st % 100 == 0:
                        self.dbcurr.execute("SELECT count(*) FROM proxy")
                        for r in self.dbcurr:
                            count = r[0]
                        if st > count:
                            st = 1000  # I start rotating from row 1000; you can change this, e.g. pick a random row, I kept it simple
                    self.dbcurr.execute("SELECT * FROM proxy WHERE id=%s", (st,))
                    results = self.dbcurr.fetchall()
                    for r in results:
                        protocol = r[1]
                        ip = r[2]
                        port = r[3]
                        pro = (protocol, ip + ":" + port)
                        if pro not in self.next_proxy_set:
                            self.next_proxy_set.add(pro)
                    self.chance = 0
                    st += 1
                proxy_support = urllib2.ProxyHandler(proxy)  # register the proxy
                # opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler(debuglevel=1))
                opener = urllib2.build_opener(proxy_support)
                urllib2.install_opener(opener)
                # add header info to imitate a browser and avoid 403 Forbidden responses
                # i_headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
                i_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}
                # url = 'http://www.kuaidaili.com/free/inha/' + str(page)
                url = 'http://www.kuaidaili.com/free/' + classify + '/' + str(page)
                html_doc = ""
                try:
                    req = urllib2.Request(url, headers=i_headers)
                    response = urllib2.urlopen(req, None, 5)
                    html_doc = response.read()  # this is the page we want to parse
                except Exception as ex:  # an exception here probably means the site is fighting back, so I switch proxies
                    print "ex=", ex
                    self.chance += 1
                    if self.chance > 0:
                        if len(self.next_proxy_set) > 0:
                            protocol, socket = self.next_proxy_set.pop()
                            proxy = {protocol: socket}
                            print "proxy", proxy
                            print "change proxy success."
                    continue
                # html_doc = urllib2.urlopen('http://www.xici.net.co/nn/' + str(page)).read()
                if html_doc != "":  # parse the fetched page with BeautifulSoup
                    soup = BeautifulSoup(html_doc, from_encoding="utf8")
                    # print "soup", soup
                    # trs = soup.find('table', id='ip_list').find_all('tr')  # all rows
                    trs = ""
                    try:
                        trs = soup.find('table').find_all('tr')
                    except:
                        print "error"
                        continue
                    for tr in trs[1:]:
                        tds = tr.find_all('td')
                        ip = tds[0].text.strip()        # ip
                        port = tds[1].text.strip()      # port
                        protocol = tds[3].text.strip()
                        # tds = tr.find_all('td')
                        # ip = tds[2].text.strip()
                        # port = tds[3].text.strip()
                        # protocol = tds[6].text.strip()
                        get_time = tds[6].text.strip()  # last-checked time shown on the page
                        check_time = datetime.datetime.strptime(get_time, '%Y-%m-%d %H:%M:%S')
                        temp = time.time()
                        x = time.localtime(float(temp))
                        time_now = time.strftime("%Y-%m-%d %H:%M:%S", x)  # current time
                        http_ip = protocol + '://' + ip + ':' + port
                        if protocol == 'HTTP' or protocol == 'HTTPS':  # only keep HTTP/HTTPS proxies, nothing else
                            content = ""
                            try:
                                # I don't fully trust this site, so once a proxy is scraped I check whether it really works
                                proxy_support = urllib2.ProxyHandler({protocol: http_ip})
                                # proxy_support = urllib2.ProxyHandler({'http': 'http://124.200.100.50:8080'})
                                opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
                                urllib2.install_opener(opener)
                                if self.count_errno > 50:
                                    # this is my own database: I pull a value called uk from it to build the test link;
                                    # use whatever test URL matches the site you actually want to crawl
                                    self.dbcurr.execute("SELECT uid FROM visited WHERE id=%s", (id,))
                                    for uid in self.dbcurr:
                                        uk = str(uid[0])
                                    id += 1
                                    if id > 50000:
                                        id = 0
                                    self.count_errno = 0
                                test_url = "http://yun.baidu.com/pcloud/friend/getfanslist?start=0&query_uk=" + uk + "&limit=24"  # the link I use for checking
                                print "download:", http_ip + ">>" + uk
                                req1 = urllib2.Request(test_url, headers=i_headers)
                                response1 = urllib2.urlopen(req1, None, 5)
                                content = response1.read()
                            except Exception as ex:  # handling after an exception is thrown
                                # print "ex2=", ex
                                self.fail += 1
                                if self.fail > 10:
                                    self.fail = 0
                                    break
                                continue
                            if content != "":
                                json_body = json.loads(content)
                                errno = json_body['errno']
                                self.count_errno += 1
                                if errno != -55:  # the proxy is usable: content != "" and Baidu did not return -55
                                    print "success."
                                    self.dbcurr.execute('SELECT id FROM proxy WHERE ip=%s', (ip,))  # time to store it
                                    y = self.dbcurr.fetchone()
                                    if not y:
                                        print 'add', '%s://%s:%s' % (protocol, ip, port)
                                        self.dbcurr.execute('INSERT INTO proxy(protocol, ip, port, check_time, acq_time) VALUES(%s, %s, %s, %s, %s)', (protocol, ip, port, check_time, time_now))
                                        self.dbconn.commit()
            num -= 1
            if num % 4 == 0:
                classify = "intr"  # these are the tab names on the proxy site; I crawl it one column at a time
            if num % 4 == 1:
                classify = "outha"
            if num % 4 == 2:
                classify = "outtr"
            if num % 4 == 3:
                classify = "inha"


if __name__ == '__main__':
    proSer = ProxyServer()
    proSer.get_prxy(10000)  # 10000 passes; single threaded, running it for 12 weeks was no problem
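Once the proxy table starts filling up, the disk crawler itself only needs to pull rows out of it and install one before each batch of requests. Below is a rough sketch of that consumption side, assuming the same 'ebook' database and proxy table created above; the helper name pick_proxy is mine, not part of the original script:

    # coding: utf-8
    # sketch: pick one stored proxy at random and install it for urllib2
    import urllib2
    import MySQLdb as mdb

    def pick_proxy(dbconn):
        # table and column names match the INSERT in the crawler above
        cur = dbconn.cursor()
        cur.execute("SELECT protocol, ip, port FROM proxy ORDER BY RAND() LIMIT 1")
        row = cur.fetchone()
        if not row:
            return None
        protocol, ip, port = row
        return {protocol.lower(): protocol.lower() + '://' + ip + ':' + port}

    if __name__ == '__main__':
        conn = mdb.connect('127.0.0.1', 'root', 'root', 'ebook', charset='utf8')
        proxy = pick_proxy(conn)
        if proxy:
            urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(proxy)))
            print "using proxy:", proxy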
That is all of my proxy crawler code. If you are interested, add me on QQ: 3047689758, or visit the www.quzhuanpan.com home page and follow our Weibo (if you can't find the Weibo link, I may not have pushed the new code yet). Thanks for reading, and you are welcome to repost.