The qqzeng-ip.dat file is a specially formatted DAT file that allows the geolocation information for an IP address to be looked up very quickly. According to the author's tests, 1 million IP lookups take 0.5 seconds.
Of course, this depends heavily on the language, and Python's loop performance has always drawn criticism. The current Python version takes about 3.x seconds for 100,000 IP lookups in testing, which is still sufficient: in a real deployment, a batch of logs covering 30 seconds to 5 minutes is unlikely to contain more than 100,000 distinct IPs.
The author provides Java, C, and PHP scripts that parse the DAT file, but no Python version. So I wrote one, for cases where IP geolocation information needs to be read from Python.
The code is as follows:
# coding: utf-8
"""Look up IP geolocation records in a qqzeng-ip ``.dat`` file.

File layout as read by this module (every integer is little-endian):

* bytes 0-15: four unsigned 32-bit file offsets -- the first IP-index
  record, the last IP-index record, the first prefix entry and the last
  prefix entry;
* prefix table: 9-byte entries (1-byte first octet of the address, 4-byte
  start index, 4-byte end index into the IP index);
* IP index: 12-byte records (4-byte start IP, 4-byte end IP, 3-byte file
  offset of the location record, 1-byte length of that record).
"""
import os
import math
import socket
import struct
import io
from io import SEEK_SET

# Default database location: next to this script.
path = os.path.normpath(
    os.path.dirname(os.path.abspath(__file__)) + "/qqzeng-ip-utf8.dat")

_HEADER = struct.Struct("<IIII")
_PREFIX_ENTRY = struct.Struct("<BII")
# start IP, end IP, then offset (low 24 bits) and length (high 8 bits).
_INDEX_RECORD = struct.Struct("<III")
_UINT32 = struct.Struct("<I")


class Ipsearch(object):
    """Reader for a qqzeng-ip database file.

    The prefix table is loaded into memory when the object is created; the
    IP index and the location records are read from the open file on each
    lookup.  The object can be used as a context manager, or closed
    explicitly with :meth:`close`.
    """

    fp = None
    firststartipoffset = None
    laststartipoffset = None
    prestartoffset = None
    preendoffset = None
    ipcount = None
    prefixcount = None
    # Populated per instance in __init__; a class-level dict would be shared
    # by (and mixed between) every instance.
    prefixlist = None

    def __init__(self, dat_path=None):
        """Open the database and load its header and prefix table.

        Args:
            dat_path: Path of the ``.dat`` file.  Defaults to the
                module-level ``path``.

        Raises:
            IOError/OSError: If the file cannot be opened.
            ValueError: If the header or the prefix table is truncated.
        """
        self.prefixlist = {}
        self.fp = io.open(path if dat_path is None else dat_path, "rb")
        try:
            self._load_tables()
        except Exception:
            # Do not leak the handle when the file turns out to be unusable.
            self.close()
            raise

    def _load_tables(self):
        """Read the 16-byte header and the prefix table from ``self.fp``."""
        header = self.fp.read(_HEADER.size)
        if len(header) != _HEADER.size:
            raise ValueError("truncated qqzeng-ip header")
        (self.firststartipoffset, self.laststartipoffset,
         self.prestartoffset, self.preendoffset) = _HEADER.unpack(header)

        # Both "last" offsets point at the final entry, hence the "+ 1".
        # Floor division keeps the counts integral on Python 3.
        self.ipcount = (
            (self.laststartipoffset - self.firststartipoffset)
            // _INDEX_RECORD.size + 1)
        self.prefixcount = (
            (self.preendoffset - self.prestartoffset)
            // _PREFIX_ENTRY.size + 1)

        self.fp.seek(self.prestartoffset, SEEK_SET)
        table = self.fp.read(self.prefixcount * _PREFIX_ENTRY.size)
        if len(table) != self.prefixcount * _PREFIX_ENTRY.size:
            raise ValueError("truncated qqzeng-ip prefix table")
        for k in range(self.prefixcount):
            prefix, start_index, end_index = _PREFIX_ENTRY.unpack_from(
                table, k * _PREFIX_ENTRY.size)
            self.prefixlist[prefix] = {
                "start_index": start_index, "end_index": end_index}

    def close(self):
        """Close the underlying file; calling it more than once is harmless."""
        if self.fp is not None:
            self.fp.close()
            self.fp = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def __del__(self):
        self.close()

    def get(self, ip):
        """Return the raw location record for *ip*.

        Args:
            ip: Dotted IPv4 address string, e.g. ``"210.51.200.123"``.

        Returns:
            The record as bytes exactly as stored in the file, or ``b""``
            when *ip* is empty or no range in the database contains it.

        Raises:
            ValueError: If the text before the first dot is not an integer.
            OSError (``socket.error``): If ``socket.inet_aton`` rejects *ip*.
        """
        if not ip:
            return b""
        prefix = int(ip.split(".")[0])
        ipnum = self.ip2unit(ip)
        bounds = self.prefixlist.get(prefix)
        if bounds is None:
            return b""
        low = bounds["start_index"]
        high = bounds["end_index"]
        left = low if low == high else self.binarysearch(low, high, ipnum)
        _, startip, endip, localoffset, locallength = self.getindex(left)
        if startip <= ipnum <= endip:
            return self.getlocal(localoffset, locallength)
        return b""

    def getlocal(self, localoffset, locallength):
        """Read *locallength* bytes starting at file offset *localoffset*."""
        self.fp.seek(localoffset, SEEK_SET)
        return self.fp.read(locallength)

    def getindex(self, left, startip=0, endip=0, localoffset=0,
                 locallength=0):
        """Read IP-index record number *left*.

        The four trailing parameters are accepted only for compatibility
        with existing callers; their values are ignored.

        Returns:
            ``[left, startip, endip, localoffset, locallength]``.
        """
        self.fp.seek(
            self.firststartipoffset + left * _INDEX_RECORD.size, SEEK_SET)
        buff = self.fp.read(_INDEX_RECORD.size)
        startip, endip, packed = _INDEX_RECORD.unpack(buff)
        # Bytes 8-10 hold the offset and byte 11 the length; read as one
        # little-endian word they are its low 24 and high 8 bits.
        localoffset = packed & 0xFFFFFF
        locallength = packed >> 24
        return [left, startip, endip, localoffset, locallength]

    def binarysearch(self, low, high, k):
        """Return the lowest index in [low, high] whose end IP is >= *k*.

        Returns 0 when no record in the range qualifies; the caller rejects
        that case through its own start/end range check.
        """
        m = 0
        while low <= high:
            mid = (low + high) // 2
            if self.getendipnum(mid) >= k:
                m = mid
                if mid == 0:
                    break
                high = mid - 1
            else:
                low = mid + 1
        return m

    def getendipnum(self, left):
        """Return the end IP (as an integer) of index record *left*."""
        self.fp.seek(
            self.firststartipoffset + left * _INDEX_RECORD.size + 4,
            SEEK_SET)
        return _UINT32.unpack(self.fp.read(_UINT32.size))[0]

    def ip2unit(self, ip):
        """Return *ip* as an unsigned 32-bit integer."""
        # "!L" already unpacks as unsigned, so no sign correction is needed.
        return self.ip2long(ip)

    def ip2long(self, ip):
        """Convert a dotted IPv4 string to an integer (network byte order)."""
        return struct.unpack("!L", socket.inet_aton(ip))[0]

    def bytestolong(self, a, b, c, d):
        """Combine four bytes, least significant first, into an integer.

        Each argument may be an int in 0..255 (what indexing ``bytes``
        yields on Python 3) or a length-1 byte string (Python 2).
        """
        a, b, c, d = [
            x if isinstance(x, int) else ord(x) for x in (a, b, c, d)]
        return a | (b << 8) | (c << 16) | (d << 24)


if __name__ == "__main__":
    import time

    # Named differently from the class so the class is not rebound.
    searcher = Ipsearch()
    print(searcher.get("210.51.200.123").decode("utf-8"))
    starttime = time.time()
    for i in range(0, 100000):
        searcher.get("210.51.200.123")
    endtime = time.time()
    print("Time waste: %s" % (endtime - starttime))
The test results are as follows:
Compare with the IP information isolated by Baidu:
The results match up pretty well, don't they?
Reading the qqzeng-ip.dat IP library: Python edition