Open-source Python DHT magnet-link crawler: complete source code

Source: Internet
Author: User
Tags chr commit geoip mutex pack sleep socket unpack


Below is the crawler's complete source code, published in full. You can use it even if you cannot write programs yourself, but you need a Linux machine that is reachable from the public Internet. Then run:

Python startcrawler.py
Note that the database schema is not included: you need to create the table yourself (it is simple enough that I will not go into it here). The source code is also available for download: Download address 1, Download address 2.

#!/usr/bin/env python
# Encoding:utf-8

"""
Author:haoning
Create time:2015.8.1
"""

import binascii
import datetime
import hashlib
import json
import os
import random
import socket
import sys
import threading
import time
import traceback
from collections import deque
from hashlib import sha1  # SHA-1 is used to derive node ids
from random import randint
from socket import inet_ntoa
from struct import unpack
from threading import Timer, Thread
from time import sleep

try:
    from queue import Queue  # Python 3
except ImportError:
    from Queue import Queue  # Python 2

import MySQLdb as mdb  # database connector
import pygeoip
from bencode import bencode, bdecode

# NOTE(review): the capitalisation of these project-local module names was
# lost in the published listing; adjust to match the actual file names.
import downloadTorrent
import metautils





# MySQL connection settings used by Master.
DB_HOST = '127.0.0.1'
DB_USER = 'root'
DB_PASS = 'root'

# Routers contacted to (re)join the DHT when the node table is empty.
BOOTSTRAP_NODES = (
    ("67.215.246.10", 6881),
    ("82.221.103.244", 6881),
    ("23.21.224.150", 6881),
)

RATE = 1  # throttle divisor: only every RATE-th incoming query is handled
TID_LENGTH = 2  # bytes in a KRPC transaction id
RE_JOIN_DHT_INTERVAL = 3  # seconds between re-join checks
TOKEN_LENGTH = 2  # leading info_hash bytes handed out as the get_peers token
INFO_HASH_LEN = 500000  # upper bound on queued info_hashes, to cap memory use
# NOTE(review): the value of CACHE_LEN was lost in the published listing;
# 100 is an assumption -- confirm against the original source.
CACHE_LEN = 100  # the DB cache is flushed once it holds more than CACHE_LEN / 2 entries
WAIT_DOWNLOAD = 80  # pending downloads are started once more than this many are queued

# GeoIP country database, loaded once at import time.
geoip = pygeoip.GeoIP('GeoIP.dat')





def is_ip_allowed(ip):
    """Return True when `ip` geolocates to one of the crawled regions.

    The lookup uses the module-level `geoip` database; only addresses whose
    country code is CN, TW, JP, HK or KR are accepted.
    """
    return geoip.country_code_by_addr(ip) in ('CN', 'TW', 'JP', 'HK', 'KR')





def entropy(length):
    """Return `length` pseudo-random bytes as a byte string.

    Built through `bytearray` so the result is a byte string on both
    Python 2 and Python 3 and can be fed straight to `sha1()` or a socket.
    The values come from `random`, which is not cryptographically secure.
    """
    return bytes(bytearray(randint(0, 255) for _ in range(length)))





def random_id():
    """Return a random 20-byte id: the SHA-1 digest of 20 random bytes."""
    return sha1(entropy(20)).digest()








def decode_nodes(nodes):
    """Decode compact node info into a list of (nid, ip, port) tuples.

    `nodes` is a byte string made of 26-byte records: a 20-byte node id,
    a 4-byte IPv4 address and a 2-byte big-endian port. An input whose
    length is not a multiple of 26 is treated as malformed and yields [].
    """
    record_size = 26
    length = len(nodes)
    if length % record_size != 0:
        return []

    decoded = []
    for i in range(0, length, record_size):
        nid = nodes[i:i + 20]
        ip = inet_ntoa(nodes[i + 20:i + 24])
        port = unpack("!H", nodes[i + 24:i + 26])[0]
        decoded.append((nid, ip, port))
    return decoded








def timer(t, f):
    """Run callable `f` once on a new timer thread after `t` seconds."""
    Timer(t, f).start()








def get_neighbor(target, nid, end=10):
    """Return an id sharing its first `end` bytes with `target`.

    The remaining bytes are taken from `nid`, so the result looks close to
    `target` while still depending on our own node id.
    """
    return target[:end] + nid[end:]








class KNode(object):
    """A DHT node as learned from a find_node response."""

    def __init__(self, nid, ip, port):
        self.nid = nid    # 20-byte node id
        self.ip = ip      # dotted-quad IPv4 address
        self.port = port  # UDP port








class DHTClient(Thread):
    """Outgoing half of the crawler: keeps issuing find_node queries.

    The class expects a subclass to provide `self.ufd` (a bound UDP socket)
    and `self.bind_ip`; both are read here but never assigned.
    """

    def __init__(self, max_node_qsize):
        Thread.__init__(self)
        self.daemon = True
        self.max_node_qsize = max_node_qsize
        self.nid = random_id()
        # Bounded queue: once full, the oldest nodes are silently dropped.
        self.nodes = deque(maxlen=max_node_qsize)

    def send_krpc(self, msg, address):
        """Bencode `msg` and send it to `address`, ignoring any failure."""
        try:
            self.ufd.sendto(bencode(msg), address)
        except Exception:
            # Best effort by design: a lost UDP query is simply not answered.
            pass

    def send_find_node(self, address, nid=None):
        """Send a find_node query for a random target to `address`.

        When `nid` (the recipient's id) is given we present an id that looks
        close to it; otherwise our own node id is used.
        """
        sender_id = get_neighbor(nid, self.nid) if nid else self.nid
        msg = {
            "t": entropy(TID_LENGTH),
            "y": "q",
            "q": "find_node",
            "a": {
                "id": sender_id,
                "target": random_id(),
            },
        }
        self.send_krpc(msg, address)

    def join_dht(self):
        """Query every bootstrap router."""
        for address in BOOTSTRAP_NODES:
            self.send_find_node(address)

    def re_join_dht(self):
        """Re-bootstrap if the node queue is empty, then re-arm the timer."""
        if not self.nodes:
            self.join_dht()
        # `timer` (not `Timer`) so the timer thread is actually started.
        timer(RE_JOIN_DHT_INTERVAL, self.re_join_dht)

    def auto_send_find_node(self):
        """Drain the node queue forever, one find_node per `wait` seconds."""
        wait = 1.0 / self.max_node_qsize
        while True:
            try:
                node = self.nodes.popleft()
            except IndexError:
                pass  # queue is empty; just wait for new nodes
            else:
                self.send_find_node((node.ip, node.port), node.nid)
            try:
                sleep(wait)
            except KeyboardInterrupt:
                # Exit immediately without waiting for the other threads.
                os._exit(0)

    def process_find_node_response(self, msg, address):
        """Queue every usable node listed in a find_node response."""
        for nid, ip, port in decode_nodes(msg["r"]["nodes"]):
            if len(nid) != 20:
                continue
            if ip == self.bind_ip:
                continue  # skip nodes reported at our own bind address
            self.nodes.append(KNode(nid, ip, port))








class DHTServer(DHTClient):
    """Incoming half of the crawler: harvests info_hashes from queries.

    Listens on a UDP socket, feeds find_node responses back into the node
    queue and reports the info_hash of every get_peers / announce_peer
    query to `master`.
    """

    def __init__(self, master, bind_ip, bind_port, max_node_qsize):
        DHTClient.__init__(self, max_node_qsize)

        self.master = master
        self.bind_ip = bind_ip
        self.bind_port = bind_port
        self.speed = 0  # number of queries seen, used for throttling

        self.process_request_actions = {
            "get_peers": self.on_get_peers_request,
            "announce_peer": self.on_announce_peer_request,
        }

        self.ufd = socket.socket(socket.AF_INET, socket.SOCK_DGRAM, socket.IPPROTO_UDP)
        self.ufd.bind((self.bind_ip, self.bind_port))
        # The re-join timer chain is started by run(); arming a second one
        # here would only duplicate the bootstrap traffic.

    def run(self):
        """Receive and dispatch datagrams forever."""
        self.re_join_dht()
        while True:
            try:
                (data, address) = self.ufd.recvfrom(65536)
                msg = bdecode(data)
                self.on_message(msg, address)
            except Exception:
                # Packets come from untrusted peers; a malformed one must
                # never take the receive loop down.
                pass

    def on_message(self, msg, address):
        """Dispatch one decoded KRPC message."""
        global RATE  # shared throttle divisor
        try:
            if msg["y"] == "r":
                if "nodes" in msg["r"]:
                    self.process_find_node_response(msg, address)
            elif msg["y"] == "q":
                try:
                    self.speed += 1
                    if self.speed % 10000 == 0:
                        # Re-roll the divisor: 1, 1 or 10 with equal odds.
                        RATE = random.choice((1, 1, 10))
                        if self.speed > 100000:
                            self.speed = 0
                    # Too many queries cost too much CPU, so only every
                    # RATE-th one is handled.
                    if self.speed % RATE == 0:
                        self.process_request_actions[msg["q"]](msg, address)
                except KeyError:
                    # Unknown query type or missing "q" field.
                    self.play_dead(msg, address)
        except KeyError:
            pass

    def on_get_peers_request(self, msg, address):
        """Log the requested info_hash and reply with an empty node list."""
        try:
            infohash = msg["a"]["info_hash"]
            tid = msg["t"]
            msg["a"]["id"]  # queries without a sender id are ignored (KeyError)
            reply = {
                "t": tid,
                "y": "r",
                "r": {
                    "id": get_neighbor(infohash, self.nid),
                    "nodes": "",
                    "token": infohash[:TOKEN_LENGTH],
                },
            }
            self.master.log(infohash, address)
            self.send_krpc(reply, address)
        except KeyError:
            pass

    def on_announce_peer_request(self, msg, address):
        """Log an announced info_hash together with the peer's address."""
        try:
            infohash = msg["a"]["info_hash"]
            token = msg["a"]["token"]
            msg["a"]["id"]  # presence check only
            msg["t"]        # presence check only

            # The token we hand out is the info_hash prefix, so a valid
            # announce must echo that prefix back.
            if infohash[:TOKEN_LENGTH] == token:
                if "implied_port" in msg["a"] and msg["a"]["implied_port"] != 0:
                    port = address[1]
                else:
                    port = msg["a"]["port"]
                self.master.log_announce(infohash, (address[0], port))
        except Exception:
            print('error')
        finally:
            self.ok(msg, address)

    def play_dead(self, msg, address):
        """Answer `msg` with KRPC error 202 (Server Error)."""
        try:
            reply = {
                "t": msg["t"],
                "y": "e",
                "e": [202, "Server Error"],
            }
            self.send_krpc(reply, address)
        except KeyError:
            pass

    def ok(self, msg, address):
        """Acknowledge `msg` with a minimal response."""
        try:
            tid = msg["t"]
            nid = msg["a"]["id"]
            reply = {
                "t": tid,
                "y": "r",
                "r": {
                    "id": get_neighbor(nid, self.nid),
                },
            }
            self.send_krpc(reply, address)
        except KeyError:
            pass








class Master(Thread):
    """Turns harvested info_hashes into rows of the `search_hash` table.

    Worker threads started by `start_work` move each info_hash through
    three stages: de-duplication (`prepare_download_metadata`), database
    lookup and download scheduling (`download_metadata`), and parsing and
    storing of downloaded metadata (`got_torrent`).

    NOTE(review): the dictionary keys used when parsing metadata are native
    strings, as returned by the Python 2 `bencode` module -- confirm before
    running against a bencode library that returns bytes keys.
    """

    def __init__(self):
        Thread.__init__(self)
        self.daemon = True
        self.queue = Queue()           # [address, info_hash] from the DHT server
        self.cache = Queue()           # de-duplicated entries awaiting a DB lookup
        self.count = 0                 # inserts performed, used to batch commits
        self.mutex = threading.RLock()  # serialises access to the DB cursor
        self.waitDownload = Queue()    # (address, info_hash) not yet in the DB
        self.metadata_queue = Queue()  # results posted by the download threads
        self.dbconn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'oksousou', charset='utf8')
        self.dbconn.autocommit(False)
        self.dbcurr = self.dbconn.cursor()
        self.dbcurr.execute('SET NAMES utf8')
        self.visited = set()           # info_hashes already processed
        self.encoding = 'utf8'         # preferred codec for decode()

    @staticmethod
    def _hex(raw):
        """Return the hex form of byte string `raw` as a native str."""
        text = binascii.hexlify(raw)
        return text if isinstance(text, str) else text.decode('ascii')

    def lock(self):
        """Acquire the database lock."""
        self.mutex.acquire()

    def unlock(self):
        """Release the database lock."""
        self.mutex.release()

    def work(self, item):
        """Worker loop; `item` is only the worker's number, for logging."""
        print("start thread %s" % item)
        while True:
            try:
                self.prepare_download_metadata()
                # `with` releases the lock even when the DB call raises;
                # otherwise one failure would block every other worker.
                with self.mutex:
                    self.download_metadata()
                with self.mutex:
                    self.got_torrent()
            except Exception:
                # Keep the worker alive; report and move on.
                traceback.print_exc()

    def start_work(self, max):
        """Start `max` daemon worker threads."""
        for item in range(max):
            t = threading.Thread(target=self.work, args=(item,))
            t.daemon = True
            t.start()

    def _enqueue(self, info_hash, address):
        """Queue an info_hash if there is room and its source IP is allowed."""
        if address is None:
            return
        # Beyond INFO_HASH_LEN entries we cannot keep up anyway; drop.
        if self.queue.qsize() < INFO_HASH_LEN and is_ip_allowed(address[0]):
            self.queue.put([address, info_hash])

    def log_announce(self, binhash, address=None):
        """Record an info_hash seen in an announce_peer query."""
        self._enqueue(binhash, address)

    def log(self, infohash, address=None):
        """Record an info_hash seen in a get_peers query."""
        self._enqueue(infohash, address)

    def prepare_download_metadata(self):
        """Take one info_hash from the queue and cache it if it is new."""
        if self.queue.qsize() == 0:
            sleep(2)
        address, binhash = self.queue.get()
        if binhash in self.visited:
            return
        # Forget everything once the set grows large, to bound memory.
        if len(self.visited) > 100000:
            self.visited = set()
        self.visited.add(binhash)
        utcnow = datetime.datetime.utcnow()
        self.cache.put((address, binhash, utcnow))

    def download_metadata(self):
        """Flush the cache: refresh known hashes, schedule unknown ones."""
        if self.cache.qsize() > CACHE_LEN / 2:
            while self.cache.qsize() > 0:
                address, binhash, utcnow = self.cache.get()
                info_hash = self._hex(binhash)
                self.dbcurr.execute('SELECT id FROM search_hash WHERE info_hash=%s', (info_hash,))
                if self.dbcurr.fetchone():
                    # Known torrent: bump last-seen time and request count.
                    self.dbcurr.execute('UPDATE search_hash SET last_seen=%s, requests=requests+1 WHERE info_hash=%s', (utcnow, info_hash))
                else:
                    # One tuple, so get() below unpacks to (address, binhash).
                    self.waitDownload.put((address, binhash))
            self.dbconn.commit()
            if self.waitDownload.qsize() > WAIT_DOWNLOAD:
                while self.waitDownload.qsize() > 0:
                    address, binhash = self.waitDownload.get()
                    t = threading.Thread(target=downloadTorrent.download_metadata, args=(address, binhash, self.metadata_queue))
                    t.daemon = True
                    t.start()

    def decode(self, s):
        """Decode byte string `s` (or a list of them) to text.

        Tries `self.encoding`, then utf8, gbk and big5; as a last resort
        decodes with `self.encoding` and drops undecodable bytes.
        """
        if isinstance(s, list):
            s = b';'.join(s)
        for codec in (self.encoding, 'utf8', 'gbk', 'big5'):
            try:
                return s.decode(codec)
            except (UnicodeError, LookupError):
                pass
        return s.decode(self.encoding, 'ignore')

    def decode_utf8(self, d, i):
        """Return field `i` of dict `d` as text, preferring `i + '.utf-8'`."""
        if i + '.utf-8' in d:
            return d[i + '.utf-8'].decode('utf8')
        return self.decode(d[i])

    def parse_metadata(self, data):
        """Parse a bencoded info dictionary.

        Returns a dict with name, length, data_hash and, for multi-file
        torrents, files; returns None if `data` cannot be decoded or has
        no name.
        """
        self.encoding = 'utf8'
        try:
            detail = bdecode(data)
            if not detail.get('name'):
                return None
        except Exception:
            return None

        info = {'name': self.decode_utf8(detail, 'name')}
        if 'files' in detail:
            info['files'] = []
            for x in detail['files']:
                parts = x['path.utf-8'] if 'path.utf-8' in x else x['path']
                v = {'path': self.decode(b'/'.join(parts)), 'length': x['length']}
                if 'filehash' in x:
                    v['filehash'] = self._hex(x['filehash'])
                info['files'].append(v)
            info['length'] = sum(f['length'] for f in info['files'])
        else:
            info['length'] = detail['length']
        info['data_hash'] = hashlib.md5(detail['pieces']).hexdigest()
        return info

    def got_torrent(self):
        """Store one downloaded torrent, if a result is waiting."""
        if self.metadata_queue.qsize() == 0:
            return
        binhash, address, data, start_time = self.metadata_queue.get()
        if not data:
            return
        try:
            info = self.parse_metadata(data)
            if not info:
                return
        except Exception:
            traceback.print_exc()
            return

        # UTC, to match the last_seen values written by download_metadata().
        utcnow = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())

        info['info_hash'] = self._hex(binhash)  # hex form used in magnet links
        info['tagged'] = False
        info['classified'] = False
        info['requests'] = 1
        info['last_seen'] = utcnow
        info['create_time'] = utcnow
        info['source_ip'] = address[0]

        if info.get('files'):
            # Ignore entries whose path starts with '_', unless nothing
            # else is left.
            files = [z for z in info['files'] if not z['path'].startswith('_')]
            if not files:
                files = info['files']
        else:
            files = [{'path': info['name'], 'length': info['length']}]
        # The largest file decides the extension and category.
        bigfname = max(files, key=lambda z: z['length'])['path']
        info['extension'] = metautils.get_extension(bigfname).lower()
        info['category'] = metautils.get_category(info['extension'])

        try:
            try:
                print('\nSaved %s %s %s s %s' % (info['info_hash'], info['name'], time.time() - start_time, address[0]))
            except Exception:
                # The name may not be printable on this console.
                print('\nSaved %s' % info['info_hash'])
            self.dbcurr.execute(
                'INSERT INTO search_hash(info_hash,category,data_hash,name,extension,classified,source_ip,tagged,'
                'length,create_time,last_seen,requests) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                (info['info_hash'], info['category'], info['data_hash'], info['name'], info['extension'], info['classified'],
                 info['source_ip'], info['tagged'], info['length'], info['create_time'], info['last_seen'], info['requests']))
            # Count the insert so the commit below really happens once per
            # 50 rows; the counter was never advanced before.
            self.count += 1
            if self.count % 50 == 0:
                self.dbconn.commit()
                if self.count > 100000:
                    self.count = 0
        except Exception:
            print('%s save error %s' % (self.name, info))
            traceback.print_exc()
            return





if __name__ == "__main__":
    # Start the metadata workers.
    master = Master()
    master.start_work(150)

    # Start the DHT server thread, then use the main thread to keep
    # sending find_node queries (this call never returns).
    dht = DHTServer(master, "0.0.0.0", 6881, max_node_qsize=200)
    dht.start()
    dht.auto_send_find_node()


Note that the code above relies on a separate module to download the torrent metadata (the "seed"), so the following code is important:

#!/usr/bin/env python
# Encoding:utf-8

"""
Author:haoning
Create time:2015.8.1
"""

From Hashlib Import SHA1


Import Math


From socket import Inet_ntoa


Import socket


From struct import pack, unpack


From threading import Timer, Thread


From time to import sleep, time





From Bencode import Bencode, Bdecode


From Startcrawler import entropy








# Protocol identifier sent in the BitTorrent handshake. Peers compare it
# byte for byte, so the exact spelling and case matter.
BT_PROTOCOL = b"BitTorrent protocol"
BT_MSG_ID = 20        # message id of extension-protocol messages (BEP 10)
EXT_HANDSHAKE_ID = 0  # extended message id of the extension handshake





def random_id():
    """Return a random 20-byte peer id: the SHA-1 digest of 20 random bytes."""
    return sha1(entropy(20)).digest()





def send_packet(the_socket, msg):
    """Write all of byte string `msg` to `the_socket`.

    `sendall` is used because `send` may transmit only part of the buffer
    on a stream socket, which would corrupt the message framing.
    """
    the_socket.sendall(msg)





def send_message(the_socket, msg):
    """Send `msg` framed as a peer-wire message.

    The payload is prefixed with its length as a 4-byte big-endian
    unsigned integer.
    """
    msg_len = pack(">I", len(msg))
    send_packet(the_socket, msg_len + msg)





def send_handshake(the_socket, infohash):
    """Send the BitTorrent handshake for `infohash`.

    Layout: protocol-name length (1 byte), protocol name, 8 reserved bytes,
    the 20-byte info_hash and a random 20-byte peer id.
    """
    bt_header = pack("B", len(BT_PROTOCOL)) + BT_PROTOCOL
    # Reserved bytes with 0x10 in the sixth byte, which advertises support
    # for the extension protocol (BEP 10).
    ext_bytes = b"\x00\x00\x00\x00\x00\x10\x00\x00"
    peer_id = random_id()
    send_packet(the_socket, bt_header + ext_bytes + infohash + peer_id)





def check_handshake(packet, self_infohash):
    """Return True if `packet` is a handshake reply for `self_infohash`.

    Checks the protocol-name length, the protocol name and the info_hash
    that follows the 8 reserved bytes.
    """
    try:
        # ord() raises TypeError on an empty packet.
        bt_header_len, packet = ord(packet[:1]), packet[1:]
    except TypeError:
        return False
    if bt_header_len != len(BT_PROTOCOL):
        return False

    bt_header, packet = packet[:bt_header_len], packet[bt_header_len:]
    if bt_header != BT_PROTOCOL:
        return False

    packet = packet[8:]  # skip the reserved bytes
    return packet[:20] == self_infohash





def send_ext_handshake(the_socket):
    """Send an extension handshake advertising ut_metadata as message id 1."""
    msg = pack("BB", BT_MSG_ID, EXT_HANDSHAKE_ID) + bencode({"m": {"ut_metadata": 1}})
    send_message(the_socket, msg)





def request_metadata(the_socket, ut_metadata, piece):
    """Request metadata piece number `piece` (BEP 9).

    `ut_metadata` is the message id the peer assigned to the ut_metadata
    extension in its extension handshake.
    """
    msg = pack("BB", BT_MSG_ID, ut_metadata) + bencode({"msg_type": 0, "piece": piece})
    send_message(the_socket, msg)





def get_ut_metadata(data):
    """Extract the peer's ut_metadata message id from its handshake.

    Scans the raw bytes for `..._metadatai<N>e` and returns N. All digits
    up to the closing `e` are read, so ids of 10 and above are not
    truncated to their first digit. Raises ValueError if the key is absent
    or the value is not an integer.
    """
    marker = b"_metadata"
    start = data.index(marker) + len(marker) + 1  # +1 skips the leading "i"
    end = data.index(b"e", start)
    return int(data[start:end])





def get_metadata_size(data):
    """Extract the metadata_size value from an extension handshake.

    Scans the raw bytes for `metadata_sizei<N>e` and returns N. Raises
    ValueError if the key is absent or the value is not an integer.
    """
    marker = b"metadata_size"
    start = data.index(marker) + len(marker) + 1  # +1 skips the leading "i"
    end = data.index(b"e", start)
    return int(data[start:end])





def recvall(the_socket, timeout=5):
    """Read from `the_socket` until the peer goes quiet, and return the bytes.

    The socket is polled in non-blocking mode every 50 ms. Reading stops
    when
      * data has arrived and nothing more came for `timeout` seconds,
      * nothing at all arrived for `timeout * 2` seconds,
      * the peer closed the connection, or
      * the socket reported an error other than "no data yet".
    The socket's previous timeout setting is restored before returning.
    """
    import errno

    previous_timeout = the_socket.gettimeout()
    the_socket.setblocking(0)
    chunks = []
    begin = time()
    try:
        while True:
            sleep(0.05)
            idle = time() - begin
            if (chunks and idle > timeout) or idle > timeout * 2:
                break
            try:
                data = the_socket.recv(1024)
            except socket.error as exc:
                if exc.errno in (errno.EAGAIN, errno.EWOULDBLOCK):
                    continue  # nothing to read yet
                break  # hard error: no more data will arrive
            if not data:
                break  # orderly shutdown by the peer
            chunks.append(data)
            begin = time()  # restart the idle clock
    finally:
        the_socket.settimeout(previous_timeout)
    return b"".join(chunks)





def download_metadata(address, infohash, metadata_queue, timeout=5):
    """Fetch a torrent's metadata from one peer via ut_metadata (BEP 9).

    Connects to `address`, performs the BitTorrent and extension
    handshakes, requests every 16 KiB metadata piece and, on success, puts
    `(infohash, address, metadata, start_time)` on `metadata_queue`.
    Nothing is queued when any step fails or when the assembled metadata
    does not hash to `infohash`.
    """
    metadata = None
    start_time = time()
    the_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        the_socket.settimeout(timeout)
        the_socket.connect(address)

        # BitTorrent handshake
        send_handshake(the_socket, infohash)
        packet = the_socket.recv(4096)
        if not check_handshake(packet, infohash):
            return

        # extension handshake
        send_ext_handshake(the_socket)
        packet = the_socket.recv(4096)
        ut_metadata = get_ut_metadata(packet)
        metadata_size = get_metadata_size(packet)

        # Collect pieces locally so that a failure half-way through never
        # leaves a partial list to be queued.
        pieces = []
        for piece in range(int(math.ceil(metadata_size / (16.0 * 1024)))):
            request_metadata(the_socket, ut_metadata, piece)
            packet = recvall(the_socket, timeout)
            # The payload follows the bencoded header dict, which ends "ee".
            pieces.append(packet[packet.index(b"ee") + 2:])
        candidate = b"".join(pieces)

        # The peer is untrusted: accept the data only if it really is the
        # info dictionary we asked for.
        if sha1(candidate).digest() == infohash:
            metadata = candidate
    except Exception:
        # Timeouts, refused connections and malformed replies are all
        # expected from random peers; treat them as "no metadata".
        pass
    finally:
        the_socket.close()  # always release the socket
        if metadata is not None:
            metadata_queue.put((infohash, address, metadata, start_time))


There is also a way to download the torrent metadata with the help of libtorrent, but it is too CPU-intensive, so I generally do not use it. It looks like this:

#coding: UTF8


Import threading


Import Traceback


Import Random


Import time


Import OS


Import socket





Import Libtorrent as Lt





# Use a 200 KiB stack for threads created from here on.
threading.stack_size(200 * 1024)
# Default timeout, in seconds, for sockets created after this point.
socket.setdefaulttimeout(30)





def fetch_torrent(session, ih, timeout):
    """Fetch the metadata of hex info_hash `ih` through a libtorrent session.

    Adds a magnet link to `session`, polls once per second for up to
    `timeout` seconds until the metadata is available, removes whatever
    was written under the save directory, removes the torrent from the
    session and returns the metadata (or None).
    """
    import shutil

    save_dir = '/tmp/downloads/'
    url = 'magnet:?xt=urn:btih:%s' % (ih.upper(),)
    params = {
        'save_path': save_dir,
        'storage_mode': lt.storage_mode_t(2),
        'paused': False,
        'auto_managed': False,
        'duplicate_is_error': True}
    try:
        handle = lt.add_magnet_uri(session, url, params)
    except Exception:
        return None
    handle.set_sequential_download(1)

    meta = None
    down_path = None
    for _ in range(timeout):
        if handle.has_metadata():
            info = handle.get_torrent_info()
            down_path = os.path.join(save_dir, info.name())
            meta = info.metadata()
            break
        time.sleep(1)

    if down_path:
        # The torrent name is chosen by untrusted peers. Delete through the
        # filesystem API rather than a shell command, and only inside the
        # save directory, so a crafted name can neither inject commands nor
        # point outside it.
        root = os.path.realpath(save_dir)
        target = os.path.realpath(down_path)
        if target.startswith(root + os.sep) and os.path.exists(target):
            if os.path.isdir(target):
                shutil.rmtree(target, ignore_errors=True)
            else:
                try:
                    os.remove(target)
                except OSError:
                    pass
    session.remove_torrent(handle)
    return meta








def download_metadata(address, binhash, metadata_queue, timeout=20):
    """Download metadata for `binhash` with libtorrent and queue the result.

    Always puts `(binhash, address, metadata, start_time)` on
    `metadata_queue`; `metadata` is None when the download failed or timed
    out.
    """
    import binascii

    metadata = None
    start_time = time.time()
    try:
        session = lt.session()
        port = random.randrange(10000, 50000)
        session.listen_on(port, port + 10)
        # NOTE(review): the listing reads 'dht.transmission.com'; the
        # Transmission project's bootstrap router is
        # dht.transmissionbt.com -- confirm.
        for router in ('router.bittorrent.com', 'router.utorrent.com',
                       'dht.transmissionbt.com', '127.0.0.1'):
            session.add_dht_router(router, 6881)
        session.start_dht()

        hex_hash = binascii.hexlify(binhash)
        if not isinstance(hex_hash, str):  # Python 3 returns bytes
            hex_hash = hex_hash.decode('ascii')
        metadata = fetch_torrent(session, hex_hash, timeout)
    except Exception:
        traceback.print_exc()
    finally:
        metadata_queue.put((binhash, address, metadata, start_time))

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please send us an email and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.