The complete source code of the crawler follows; nothing is held back. Even if you cannot write such a program yourself you can still use it: install a Linux system with a public network connection, then run:
python startcrawler.py
A reminder: the code writes to a database table whose schema is not included here, so please create the table yourself — it is simple enough that I will not go into detail. I also provide download links for the source code: Download address 1, Download address 2
#!/usr/bin/env python
# Encoding:utf-8
"""
Author:haoning
Create time:2015.8.1
"""
# Standard library
import datetime
import hashlib
import json
import os
import random
import socket
import sys
import threading
import time
import traceback
from collections import deque
from hashlib import sha1  # SHA-1 digest, used to derive node ids
from random import randint
from socket import inet_ntoa
from struct import unpack
from threading import Timer, Thread
from time import sleep

try:
    from Queue import Queue  # Python 2 module name
except ImportError:
    from queue import Queue  # Python 3 module name

# Third-party
import MySQLdb as mdb  # database connector
import pygeoip
from bencode import bencode, bdecode

# Project-local modules.
# NOTE(review): the capitalisation of these module names was lost in the
# published listing; the file names on disk must match the spelling used here.
import downloadtorrent
import metautils
# MySQL connection settings.
DB_HOST = '127.0.0.1'
DB_USER = 'root'
DB_PASS = 'root'

# DHT nodes contacted to join the network.
BOOTSTRAP_NODES = (
    ("67.215.246.10", 6881),
    ("82.221.103.244", 6881),
    ("23.21.224.150", 6881),
)

RATE = 1  # throttle: only every RATE-th incoming query is processed
TID_LENGTH = 2  # bytes in a KRPC transaction id
RE_JOIN_DHT_INTERVAL = 3  # seconds between re-join checks
TOKEN_LENGTH = 2  # bytes of the info-hash echoed back as the token
INFO_HASH_LEN = 500000  # queue cap; 500k entries is small and bounds memory use
# Database write-back cache size.
# NOTE(review): the value was missing from the published listing; 100 is a
# placeholder that keeps the module importable -- confirm against the original.
CACHE_LEN = 100
WAIT_DOWNLOAD = 80  # pending downloads collected before threads are spawned
# GeoIP database handle, opened once at import time from the working directory.
geoip = pygeoip.GeoIP('GeoIP.dat')


def is_ip_allowed(ip):
    """Return True when *ip* geolocates to one of the accepted country codes.

    Only peers whose address resolves to CN, TW, JP, HK or KR are accepted.

    :param ip: dotted-quad IPv4 address string.
    :returns: bool
    """
    return geoip.country_code_by_addr(ip) in ('CN', 'TW', 'JP', 'HK', 'KR')
def entropy(length):
    """Return *length* random bytes as a byte string.

    Built through ``bytearray`` so that the result is a byte string on both
    Python 2 (``str``) and Python 3 (``bytes``); on Python 2 this is the same
    value the ``chr``/``join`` form produced.

    :param length: number of random bytes to generate (0 yields an empty string).
    """
    return bytes(bytearray(randint(0, 255) for _ in range(length)))
def random_id():
    """Return a random 20-byte id: the SHA-1 digest of 20 random bytes."""
    return sha1(entropy(20)).digest()
def decode_nodes(nodes):
    """Decode a packed node list into ``(nid, ip, port)`` tuples.

    Each entry is 26 bytes: a 20-byte node id, a 4-byte IPv4 address and a
    2-byte big-endian port.

    :param nodes: byte string holding zero or more 26-byte entries.
    :returns: list of ``(nid, ip, port)``; empty when the length is not a
        multiple of 26 (malformed input is dropped as a whole).
    """
    length = len(nodes)
    if length % 26 != 0:
        return []

    decoded = []
    for offset in range(0, length, 26):
        nid = nodes[offset:offset + 20]
        ip = inet_ntoa(nodes[offset + 20:offset + 24])
        port = unpack("!H", nodes[offset + 24:offset + 26])[0]
        decoded.append((nid, ip, port))
    return decoded
def timer(t, f):
    """Run callable *f* once after *t* seconds on a ``threading.Timer`` thread."""
    Timer(t, f).start()
def get_neighbor(target, nid, end=10):
    """Return an id made of the first *end* bytes of *target* plus the rest of *nid*.

    The result shares a prefix with *target* while keeping the tail of our own id.

    :param target: id whose leading bytes are copied.
    :param nid: id supplying the remaining bytes.
    :param end: number of leading bytes taken from *target*.
    """
    return target[:end] + nid[end:]
class Knode(object):
    """A remote DHT node: its id, IP address and UDP port."""

    def __init__(self, nid, ip, port):
        self.nid = nid
        self.ip = ip
        self.port = port

    def __repr__(self):
        return "%s(nid=%r, ip=%r, port=%r)" % (
            type(self).__name__, self.nid, self.ip, self.port)
class Dhtclient(Thread):
    """Outgoing half of the DHT node: discovers nodes with ``find_node`` queries.

    This class does not create ``self.ufd`` (the UDP socket) or ``self.bind_ip``;
    both are read here and must be set by a subclass before any query is sent.
    """

    def __init__(self, max_node_qsize):
        """
        :param max_node_qsize: maximum number of discovered nodes kept queued;
            also determines the pause between outgoing queries.
        """
        Thread.__init__(self)
        self.daemon = True
        self.max_node_qsize = max_node_qsize
        self.nid = random_id()
        # Bounded queue: once full, the oldest nodes are discarded.
        self.nodes = deque(maxlen=max_node_qsize)

    def send_krpc(self, msg, address):
        """Bencode *msg* and send it to *address*; send failures are ignored."""
        try:
            self.ufd.sendto(bencode(msg), address)
        except Exception:
            # Deliberately best effort: one unreachable peer must not stop the crawl.
            pass

    def send_find_node(self, address, nid=None):
        """Send a ``find_node`` query for a random target to *address*.

        :param address: ``(host, port)`` of the node to query.
        :param nid: id of the queried node, if known; our advertised id is then
            made to share its prefix (see ``get_neighbor``).
        """
        nid = get_neighbor(nid, self.nid) if nid else self.nid
        msg = {
            "t": entropy(TID_LENGTH),
            "y": "q",
            "q": "find_node",
            "a": {
                "id": nid,
                "target": random_id(),
            },
        }
        self.send_krpc(msg, address)

    def join_dht(self):
        """Query every bootstrap node."""
        for address in BOOTSTRAP_NODES:
            self.send_find_node(address)

    def re_join_dht(self):
        """Re-bootstrap when no nodes are queued, then reschedule this check."""
        if not self.nodes:
            self.join_dht()
        # Uses the module-level timer() helper so the timer is actually started.
        timer(RE_JOIN_DHT_INTERVAL, self.re_join_dht)

    def auto_send_find_node(self):
        """Loop forever, querying queued nodes one at a time.

        Sleeps ``1.0 / max_node_qsize`` seconds between iterations. A
        KeyboardInterrupt during the sleep terminates the process.
        """
        wait = 1.0 / self.max_node_qsize
        while True:
            try:
                node = self.nodes.popleft()
            except IndexError:
                # Queue empty; re_join_dht() refills it.
                pass
            else:
                self.send_find_node((node.ip, node.port), node.nid)
            try:
                sleep(wait)
            except KeyboardInterrupt:
                os._exit(0)

    def process_find_node_response(self, msg, address):
        """Queue the nodes carried in a ``find_node`` response.

        Entries whose id is not 20 bytes long, or whose IP equals our own bind
        address, are skipped.
        """
        for nid, ip, port in decode_nodes(msg["r"]["nodes"]):
            if len(nid) != 20:
                continue
            if ip == self.bind_ip:
                continue
            self.nodes.append(Knode(nid, ip, port))
class Dhtserver(Dhtclient):
    """Incoming half of the DHT node: harvests info-hashes from peers' queries.

    Binds a UDP socket, answers ``get_peers`` and ``announce_peer`` queries and
    forwards every info-hash seen to ``master``.
    """

    def __init__(self, master, bind_ip, bind_port, max_node_qsize):
        """
        :param master: object exposing ``log(infohash, address)`` and
            ``log_announce(infohash, address)``.
        :param bind_ip: local address to bind the UDP socket to.
        :param bind_port: local UDP port.
        :param max_node_qsize: forwarded to ``Dhtclient``.
        """
        Dhtclient.__init__(self, max_node_qsize)
        self.master = master
        self.bind_ip = bind_ip
        self.bind_port = bind_port
        self.speed = 0  # count of incoming queries, drives the RATE throttle
        self.process_request_actions = {
            "get_peers": self.on_get_peers_request,
            "announce_peer": self.on_announce_peer_request,
        }
        self.ufd = socket.socket(socket.AF_INET, socket.SOCK_DGRAM, socket.IPPROTO_UDP)
        self.ufd.bind((self.bind_ip, self.bind_port))
        # NOTE(review): run() also calls re_join_dht(), so two self-rescheduling
        # timer chains end up running; kept as in the original.
        timer(RE_JOIN_DHT_INTERVAL, self.re_join_dht)

    def run(self):
        """Thread body: receive datagrams forever and dispatch them."""
        self.re_join_dht()
        while True:
            try:
                data, address = self.ufd.recvfrom(65536)
                self.on_message(bdecode(data), address)
            except Exception:
                # Malformed packets from the public network are expected; drop them.
                pass

    def on_message(self, msg, address):
        """Dispatch one decoded KRPC message.

        Responses carrying ``nodes`` feed the node queue. Queries are throttled:
        only every RATE-th one is handled, and RATE is re-drawn from (1, 1, 10)
        every 10000 queries to limit CPU usage.
        """
        global RATE
        try:
            kind = msg["y"]
            if kind == "r":
                if "nodes" in msg["r"]:
                    self.process_find_node_response(msg, address)
            elif kind == "q":
                self.speed += 1
                if self.speed % 10000 == 0:
                    # Same distribution as randint(1, 3) mapped 1->1, 2->1, 3->10.
                    RATE = random.choice((1, 1, 10))
                if self.speed > 100000:
                    self.speed = 0
                if self.speed % RATE == 0:
                    try:
                        handler = self.process_request_actions[msg["q"]]
                    except KeyError:
                        # Unknown or missing query type.
                        self.play_dead(msg, address)
                    else:
                        handler(msg, address)
        except KeyError:
            pass

    def on_get_peers_request(self, msg, address):
        """Answer a ``get_peers`` query and log the requested info-hash.

        The reply carries no nodes and uses the first TOKEN_LENGTH bytes of the
        info-hash as token. Messages missing a required key are ignored.
        """
        try:
            infohash = msg["a"]["info_hash"]
            tid = msg["t"]
            # Looked up only to reject queries that lack a sender id.
            nid = msg["a"]["id"]
            reply = {
                "t": tid,
                "y": "r",
                "r": {
                    "id": get_neighbor(infohash, self.nid),
                    "nodes": "",
                    "token": infohash[:TOKEN_LENGTH],
                },
            }
            self.master.log(infohash, address)
            self.send_krpc(reply, address)
        except KeyError:
            pass

    def on_announce_peer_request(self, msg, address):
        """Log an ``announce_peer`` query whose token matches, then acknowledge it.

        With a non-zero ``implied_port`` the UDP source port is used as the
        peer's port; otherwise the ``port`` argument is used. The acknowledgement
        is sent whether or not logging succeeded.
        """
        try:
            args = msg["a"]
            infohash = args["info_hash"]
            token = args["token"]
            # Looked up only so that queries lacking these keys are not logged.
            nid = args["id"]
            tid = msg["t"]
            if infohash[:TOKEN_LENGTH] == token:
                if args.get("implied_port", 0) != 0:
                    port = address[1]
                else:
                    port = args["port"]
                self.master.log_announce(infohash, (address[0], port))
        except Exception:
            print('ERROR')
        finally:
            self.ok(msg, address)

    def play_dead(self, msg, address):
        """Reply to an unsupported query with KRPC error 202 (Server Error)."""
        try:
            reply = {
                "t": msg["t"],
                "y": "e",
                "e": [202, "Server Error"],
            }
            self.send_krpc(reply, address)
        except KeyError:
            pass

    def ok(self, msg, address):
        """Send a bare acknowledgement carrying only our (neighbour) id."""
        try:
            reply = {
                "t": msg["t"],
                "y": "r",
                "r": {
                    "id": get_neighbor(msg["a"]["id"], self.nid),
                },
            }
            self.send_krpc(reply, address)
        except KeyError:
            pass
class Master(Thread):
    """Turn harvested info-hashes into rows of the ``search_hash`` table.

    Pipeline (driven by the worker threads started in ``start_work``):
    ``queue`` (raw hashes) -> ``cache`` (de-duplicated) -> database lookup ->
    ``waitDownload`` (unknown hashes) -> downloader threads ->
    ``metadata_queue`` -> ``got_torrent`` (parse and INSERT).

    Written for Python 2: byte strings are hex-encoded with ``.encode('hex')``.
    """

    def __init__(self):
        Thread.__init__(self)
        self.daemon = True
        self.queue = Queue()  # [address, binhash] from the DHT server
        self.cache = Queue()  # (address, binhash, utcnow) awaiting DB lookup
        self.count = 0  # rows inserted, drives the batched commit
        # Re-entrant lock serialising use of the shared DB connection/cursor.
        self.mutex = threading.RLock()
        self.waitDownload = Queue()  # (address, binhash) not yet in the DB
        self.metadata_queue = Queue()  # results pushed by downloader threads
        # NOTE(review): database-name capitalisation was lost in the published
        # listing; 'oksousou' is assumed -- confirm against the real schema.
        self.dbconn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'oksousou', charset='utf8')
        self.dbconn.autocommit(False)
        self.dbcurr = self.dbconn.cursor()
        self.dbcurr.execute('SET NAMES utf8')
        self.visited = set()  # info-hashes already seen recently

    def lock(self):
        """Acquire the shared mutex."""
        self.mutex.acquire()

    def unlock(self):
        """Release the shared mutex."""
        self.mutex.release()

    def work(self, item):
        """Worker loop; *item* is only the worker's index, used for logging.

        The lock is held through ``with`` so it is released even when a step
        raises, and errors are logged instead of killing the worker (previously
        an exception left the lock held and stalled every other worker).
        """
        print("Start thread %s" % item)
        while True:
            try:
                self.prepare_download_metadata()
                with self.mutex:
                    self.download_metadata()
                with self.mutex:
                    self.got_torrent()
            except Exception:
                traceback.print_exc()

    def start_work(self, max):
        """Start *max* daemon worker threads running ``work``."""
        for item in range(max):
            t = threading.Thread(target=self.work, args=(item,))
            t.daemon = True
            t.start()

    def _enqueue(self, binhash, address):
        """Queue *binhash* unless the queue is full or the peer's IP is not allowed."""
        if address is None:
            return
        # Beyond INFO_HASH_LEN entries the workers cannot keep up; drop instead.
        if self.queue.qsize() < INFO_HASH_LEN and is_ip_allowed(address[0]):
            self.queue.put([address, binhash])

    def log_announce(self, binhash, address=None):
        """Record an info-hash seen in an ``announce_peer`` query."""
        self._enqueue(binhash, address)

    def log(self, infohash, address=None):
        """Record an info-hash seen in a ``get_peers`` query."""
        self._enqueue(infohash, address)

    def prepare_download_metadata(self):
        """Move one info-hash from ``queue`` to ``cache`` unless seen recently."""
        if self.queue.qsize() == 0:
            sleep(2)
        address, binhash = self.queue.get()
        if binhash in self.visited:
            return
        # Forget everything once the set grows past 100000 entries to cap memory.
        if len(self.visited) > 100000:
            self.visited = set()
        self.visited.add(binhash)
        utcnow = datetime.datetime.utcnow()
        self.cache.put((address, binhash, utcnow))

    def download_metadata(self):
        """Flush ``cache`` to the database and start downloads for new hashes.

        Runs only once the cache holds more than ``CACHE_LEN / 2`` entries.
        Known hashes get ``last_seen``/``requests`` updated; unknown ones go to
        ``waitDownload``. When more than WAIT_DOWNLOAD are pending, one
        downloader thread is started per pending hash.
        """
        if self.cache.qsize() <= CACHE_LEN / 2:
            return
        while self.cache.qsize() > 0:
            address, binhash, utcnow = self.cache.get()
            info_hash = binhash.encode('hex')
            self.dbcurr.execute(
                'SELECT id FROM search_hash WHERE info_hash=%s', (info_hash,))
            if self.dbcurr.fetchone():
                # Already stored: bump last-seen time and request counter.
                self.dbcurr.execute(
                    'UPDATE search_hash SET last_seen=%s, requests=requests+1 '
                    'WHERE info_hash=%s', (utcnow, info_hash))
            else:
                # One tuple: Queue.put's second positional argument is `block`.
                self.waitDownload.put((address, binhash))
        self.dbconn.commit()
        if self.waitDownload.qsize() > WAIT_DOWNLOAD:
            while self.waitDownload.qsize() > 0:
                address, binhash = self.waitDownload.get()
                t = threading.Thread(
                    target=downloadtorrent.download_metadata,
                    args=(address, binhash, self.metadata_queue))
                t.daemon = True
                t.start()

    def decode(self, s):
        """Decode byte string *s* (or a list of them, joined with ';') to text.

        Tries ``self.encoding``, then utf8, gbk and big5; as a last resort
        decodes with ``self.encoding`` ignoring undecodable bytes.
        """
        if isinstance(s, list):
            s = ';'.join(s)
        for codec in (self.encoding, 'utf8', 'gbk', 'big5'):
            try:
                return s.decode(codec)
            except (UnicodeError, LookupError):
                continue
        return s.decode(self.encoding, 'ignore')

    def decode_utf8(self, d, i):
        """Return field *i* of dict *d* as text, preferring its ``.utf-8`` variant."""
        utf8_key = i + '.utf-8'
        if utf8_key in d:
            return d[utf8_key].decode('utf8')
        return self.decode(d[i])

    def parse_metadata(self, data):
        """Parse a bencoded torrent info dictionary.

        :param data: bencoded metadata as returned by the downloader.
        :returns: dict with ``name``, ``length``, ``data_hash`` and, for
            multi-file torrents, ``files``; None when *data* cannot be decoded
            or has no name.
        """
        self.encoding = 'utf8'
        try:
            detail = bdecode(data)
            if not detail.get('name'):
                return None
        except Exception:
            return None

        info = {'name': self.decode_utf8(detail, 'name')}
        if 'files' in detail:
            info['files'] = []
            for entry in detail['files']:
                path_key = 'path.utf-8' if 'path.utf-8' in entry else 'path'
                item = {
                    'path': self.decode('/'.join(entry[path_key])),
                    'length': entry['length'],
                }
                if 'filehash' in entry:
                    item['filehash'] = entry['filehash'].encode('hex')
                info['files'].append(item)
            info['length'] = sum(item['length'] for item in info['files'])
        else:
            info['length'] = detail['length']
        info['data_hash'] = hashlib.md5(detail['pieces']).hexdigest()
        return info

    def got_torrent(self):
        """Take one downloaded metadata blob, parse it and INSERT it.

        Does nothing when ``metadata_queue`` is empty, the download produced no
        data, or the metadata cannot be parsed. Commits every 50th insert.
        """
        if self.metadata_queue.qsize() == 0:
            return
        binhash, address, data, start_time = self.metadata_queue.get()
        if not data:
            return
        try:
            info = self.parse_metadata(data)
        except Exception:
            traceback.print_exc()
            return
        if not info:
            return

        # NOTE(review): this is *local* time, whereas download_metadata() writes
        # UTC into last_seen -- the two should probably agree; left unchanged.
        utcnow = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
        info['info_hash'] = binhash.encode('hex')
        # Tagging/classification is done later by another process.
        info['tagged'] = False
        info['classified'] = False
        info['requests'] = 1
        info['last_seen'] = utcnow
        info['create_time'] = utcnow
        info['source_ip'] = address[0]

        if info.get('files'):
            # Ignore entries whose path starts with '_' unless nothing else is left.
            files = [z for z in info['files'] if not z['path'].startswith('_')]
            if not files:
                files = info['files']
        else:
            files = [{'path': info['name'], 'length': info['length']}]
        # The largest file decides the extension and category.
        files.sort(key=lambda z: z['length'], reverse=True)
        bigfname = files[0]['path']
        info['extension'] = metautils.get_extension(bigfname).lower()
        info['category'] = metautils.get_category(info['extension'])

        try:
            try:
                print('\nSaved %s %s %s s %s' % (
                    info['info_hash'], info['name'],
                    time.time() - start_time, address[0]))
            except Exception:
                # Names the console encoding cannot represent: log the hash only.
                print('\nSaved %s' % info['info_hash'])
            self.dbcurr.execute(
                'INSERT INTO search_hash(info_hash,category,data_hash,name,'
                'extension,classified,source_ip,tagged,'
                'length,create_time,last_seen,requests) '
                'VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                (info['info_hash'], info['category'], info['data_hash'],
                 info['name'], info['extension'], info['classified'],
                 info['source_ip'], info['tagged'], info['length'],
                 info['create_time'], info['last_seen'], info['requests']))
            # The counter was never advanced before, so every insert committed.
            self.count += 1
            if self.count % 50 == 0:
                self.dbconn.commit()
                if self.count > 100000:
                    self.count = 0
        except Exception:
            print('%s save error %r' % (self.name, info))
            traceback.print_exc()
            return
if __name__ == "__main__":
    # Start the consumer side: 150 worker threads that look up, download and
    # store metadata.
    master = Master()
    master.start_work(150)
    # Start the DHT node: the server thread receives queries while the main
    # thread keeps sending find_node queries (this call never returns).
    dht = Dhtserver(master, "0.0.0.0", 6881, max_node_qsize=200)
    dht.start()
    dht.auto_send_find_node()
Note that the code above contains a section that downloads the torrent ("seed") metadata, so the following code is important:
#!/usr/bin/env python
# Encoding:utf-8
"""
Author:haoning
Create time:2015.8.1
"""
# Standard library
import math
import socket
from hashlib import sha1
from socket import inet_ntoa
from struct import pack, unpack
from threading import Timer, Thread
from time import sleep, time

# Third-party
from bencode import bencode, bdecode

# Project-local.
# NOTE(review): module-name capitalisation was lost in the published listing;
# the crawler's file name on disk must match the spelling used here.
from startcrawler import entropy
# Protocol identifier sent in the BitTorrent handshake (19 bytes, case-sensitive).
BT_PROTOCOL = b"BitTorrent protocol"
# Message id of extension-protocol messages.
BT_MSG_ID = 20
# Extended message id reserved for the extension handshake.
EXT_HANDSHAKE_ID = 0
def random_id():
    """Return a random 20-byte peer id: the SHA-1 digest of 20 random bytes."""
    return sha1(entropy(20)).digest()
def send_packet(the_socket, msg):
    """Write all of *msg* to *the_socket*.

    Uses ``sendall`` because ``send`` may transmit only part of the buffer and
    the return value was not being checked.
    """
    the_socket.sendall(msg)
def send_message(the_socket, msg):
    """Send *msg* prefixed with its length as a 4-byte big-endian unsigned integer."""
    msg_len = pack(">I", len(msg))
    send_packet(the_socket, msg_len + msg)
def send_handshake(the_socket, infohash):
    """Send the BitTorrent handshake for *infohash* with a random peer id.

    Layout: 1-byte protocol-name length, protocol name, 8 reserved bytes,
    20-byte info-hash, 20-byte peer id.
    """
    bt_header = pack("B", len(BT_PROTOCOL)) + BT_PROTOCOL
    # Reserved bytes with 0x10 in the sixth byte, announcing extension-protocol
    # support.
    ext_bytes = b"\x00\x00\x00\x00\x00\x10\x00\x00"
    peer_id = random_id()
    send_packet(the_socket, bt_header + ext_bytes + infohash + peer_id)
def check_handshake(packet, self_infohash):
    """Return True when *packet* is a handshake for *self_infohash*.

    Checks the protocol-name length byte, the protocol name, and the 20-byte
    info-hash that follows the 8 reserved bytes. Short or empty packets yield
    False.
    """
    header_len = len(BT_PROTOCOL)
    if packet[:1] != pack("B", header_len):
        return False
    if packet[1:1 + header_len] != BT_PROTOCOL:
        return False
    # Skip the 8 reserved bytes to reach the info-hash.
    start = 1 + header_len + 8
    return packet[start:start + 20] == self_infohash
def send_ext_handshake(the_socket):
    """Send the extension handshake, mapping ``ut_metadata`` to message id 1."""
    msg = pack("BB", BT_MSG_ID, EXT_HANDSHAKE_ID) + bencode({"m": {"ut_metadata": 1}})
    send_message(the_socket, msg)
def request_metadata(the_socket, ut_metadata, piece):
    """Request one metadata piece (BEP 9).

    :param ut_metadata: the peer's extended message id for ``ut_metadata``.
    :param piece: zero-based index of the piece to request.
    """
    msg = pack("BB", BT_MSG_ID, ut_metadata) + bencode({"msg_type": 0, "piece": piece})
    send_message(the_socket, msg)
def get_ut_metadata(data):
    """Extract the peer's ``ut_metadata`` message id from its extension handshake.

    Scans the raw bencoded bytes for ``..._metadatai<N>e`` and returns N. All
    digits up to the terminating ``e`` are read, so ids above 9 are parsed
    correctly (previously only the first digit was used).

    :raises ValueError: when the marker or a valid integer is not found.
    """
    marker = b"_metadata"
    # +1 skips the bencode integer prefix 'i'.
    start = data.index(marker) + len(marker) + 1
    return int(data[start:data.index(b"e", start)])
def get_metadata_size(data):
    """Extract ``metadata_size`` (in bytes) from a peer's extension handshake.

    Scans the raw bencoded bytes for ``metadata_sizei<N>e`` and returns N.

    :raises ValueError: when the marker or a valid integer is not found.
    """
    marker = b"metadata_size"
    # +1 skips the bencode integer prefix 'i'.
    start = data.index(marker) + len(marker) + 1
    return int(data[start:data.index(b"e", start)])
def recvall(the_socket, timeout=5):
    """Collect data from *the_socket* until it goes quiet.

    The socket is switched to non-blocking mode and polled every 50 ms. Reading
    stops when:

    * the peer closes the connection (previously the loop kept polling the
      closed socket until the timeout expired),
    * some data has arrived and nothing further came for *timeout* seconds, or
    * nothing at all arrived within ``2 * timeout`` seconds.

    :returns: everything received, as one byte string (possibly empty).
    """
    the_socket.setblocking(0)
    chunks = []
    begin = time()
    while True:
        sleep(0.05)
        idle = time() - begin
        if chunks and idle > timeout:
            break
        if idle > timeout * 2:
            break
        try:
            data = the_socket.recv(1024)
        except socket.error:
            # Nothing available yet on the non-blocking socket.
            continue
        if not data:
            # Orderly shutdown by the peer: no more data will ever arrive.
            break
        chunks.append(data)
        begin = time()
    return b"".join(chunks)
def download_metadata(address, infohash, metadata_queue, timeout=5):
    """Fetch a torrent's metadata from one peer using the BEP 9 extension.

    On success ``(infohash, address, metadata, start_time)`` is put on
    *metadata_queue*. On any failure nothing is queued; previously a failure
    part-way through the piece loop queued a partial *list* of pieces.

    :param address: ``(ip, port)`` of the peer.
    :param infohash: 20-byte binary info-hash.
    :param metadata_queue: queue receiving the result tuple.
    :param timeout: socket timeout and per-piece quiet period, in seconds.
    """
    metadata = None
    start_time = time()
    the_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        the_socket.settimeout(timeout)
        the_socket.connect(address)

        # BitTorrent handshake.
        send_handshake(the_socket, infohash)
        packet = the_socket.recv(4096)
        if not check_handshake(packet, infohash):
            return

        # Extension handshake: learn the peer's ut_metadata id and metadata size.
        send_ext_handshake(the_socket)
        packet = the_socket.recv(4096)
        ut_metadata = get_ut_metadata(packet)
        metadata_size = get_metadata_size(packet)

        # Metadata is transferred in 16 KiB pieces.
        pieces = []
        for piece in range(int(math.ceil(metadata_size / (16.0 * 1024)))):
            request_metadata(the_socket, ut_metadata, piece)
            packet = recvall(the_socket, timeout)
            # Payload starts after the first "ee", taken to be the end of the
            # bencoded header dictionary preceding the piece data.
            pieces.append(packet[packet.index(b"ee") + 2:])
        # Assigned only once every piece has arrived.
        metadata = b"".join(pieces)
    except socket.timeout:
        pass
    except Exception:
        # Unreachable or misbehaving peers are the normal case; give up quietly.
        pass
    finally:
        # Always close the socket, whatever happened above.
        the_socket.close()
        if metadata is not None:
            metadata_queue.put((infohash, address, metadata, start_time))
In fact, there is another way to download the torrent metadata, using libtorrent, but it is too CPU-intensive, so I generally do not use it. It looks like this:
#coding: UTF8
# Standard library
import os
import random
import socket
import threading
import time
import traceback

# Third-party
import libtorrent as lt
# Use a 200 KiB stack for threads created from here on.
threading.stack_size(200 * 1024)
# Default timeout, in seconds, for sockets created without an explicit one.
socket.setdefaulttimeout(30)
def fetch_torrent(session, ih, timeout):
    """Fetch a torrent's metadata through libtorrent using a magnet link.

    :param session: a libtorrent session.
    :param ih: hex-encoded info-hash.
    :param timeout: maximum number of one-second polls to wait for metadata.
    :returns: the raw metadata, or None when the torrent could not be added or
        no metadata arrived in time.
    """
    import shutil  # local import: only needed for the cleanup below

    save_dir = '/tmp/downloads/'
    url = 'magnet:?xt=urn:btih:%s' % (ih.upper(),)
    params = {
        'save_path': save_dir,
        'storage_mode': lt.storage_mode_t(2),
        'paused': False,
        'auto_managed': False,
        'duplicate_is_error': True,
    }
    try:
        handle = lt.add_magnet_uri(session, url, params)
    except Exception:
        return None

    handle.set_sequential_download(1)
    meta = None
    down_path = None
    for _ in range(timeout):
        if handle.has_metadata():
            info = handle.get_torrent_info()
            down_path = '/tmp/downloads/%s' % info.name()
            meta = info.metadata()
            break
        time.sleep(1)

    # Remove whatever payload was written while waiting. The torrent name is
    # untrusted input, so no shell command is built from it and nothing outside
    # the download directory is ever deleted.
    if down_path and os.path.exists(down_path):
        real_path = os.path.realpath(down_path)
        if real_path.startswith(os.path.realpath(save_dir) + os.sep):
            if os.path.isdir(real_path):
                shutil.rmtree(real_path, ignore_errors=True)
            else:
                try:
                    os.remove(real_path)
                except OSError:
                    pass
    session.remove_torrent(handle)
    return meta
def download_metadata(address, binhash, metadata_queue, timeout=20):
    """Fetch metadata for *binhash* with a throw-away libtorrent session.

    Always puts ``(binhash, address, metadata, start_time)`` on
    *metadata_queue*; ``metadata`` is None when the fetch failed.

    :param address: peer address, passed through to the result unchanged.
    :param binhash: 20-byte binary info-hash (hex-encoded with the Python 2
        ``'hex'`` codec).
    :param metadata_queue: queue receiving the result tuple.
    :param timeout: seconds to wait for the metadata.
    """
    metadata = None
    start_time = time.time()
    try:
        session = lt.session()
        # Random port range so that concurrent sessions do not collide.
        port = random.randrange(10000, 50000)
        session.listen_on(port, port + 10)
        session.add_dht_router('router.bittorrent.com', 6881)
        session.add_dht_router('router.utorrent.com', 6881)
        # Was 'dht.transmission.com'; the Transmission project's bootstrap
        # router is dht.transmissionbt.com.
        session.add_dht_router('dht.transmissionbt.com', 6881)
        session.add_dht_router('127.0.0.1', 6881)
        session.start_dht()
        metadata = fetch_torrent(session, binhash.encode('hex'), timeout)
        # Drop the reference so the session can be released promptly.
        session = None
    except Exception:
        traceback.print_exc()
    finally:
        metadata_queue.put((binhash, address, metadata, start_time))