In short, the Simhash algorithm quickly searches a massive collection of texts for the set of simhashes that differ from a known simhash by less than K bits. Each text is represented by a single simhash value of 64 bits, and similar texts have similar 64-bit values; the empirical value of K given in the paper is 3. The disadvantages of this method are as obvious as its advantages, and there are two main ones: first, for short texts the result is very sensitive to the value of K; second, because the algorithm trades space for time, it consumes a large amount of system memory.
#!/usr/bin/python
# coding=utf-8
class Simhash:
    """Simhash fingerprint of a collection of string tokens.

    Every token is hashed to an integer; for each of the ``hashbits`` bit
    positions a counter is incremented when the token hash has that bit set
    and decremented otherwise.  The fingerprint is the integer whose bit
    ``i`` is set exactly when counter ``i`` ended up >= 0.

    Attributes:
        hashbits: Number of bit positions in the fingerprint.
        hash: The fingerprint, a non-negative integer below ``2 ** hashbits``.
    """

    def __init__(self, tokens='', hashbits=128):
        """Build the fingerprint of *tokens*.

        Args:
            tokens: Iterable of strings.  A plain string is iterated
                character by character, so the default ``''`` means
                "no tokens".
            hashbits: Width of the fingerprint in bits.
        """
        self.hashbits = hashbits
        self.hash = self.simhash(tokens)

    def __str__(self):
        """Return the fingerprint as a decimal string."""
        return str(self.hash)

    def simhash(self, tokens):
        """Compute and return the simhash fingerprint of *tokens*.

        Args:
            tokens: Iterable of strings to fingerprint.

        Returns:
            An integer with bit ``i`` set when at least as many token hashes
            had bit ``i`` set as had it clear.  With no tokens every counter
            stays at 0, so all ``hashbits`` bits are set.
        """
        weights = [0] * self.hashbits
        for token_hash in (self._string_hash(token) for token in tokens):
            for i in range(self.hashbits):
                # Vote +1 for this position if the token hash has the bit
                # set, -1 otherwise.
                if token_hash & (1 << i):
                    weights[i] += 1
                else:
                    weights[i] -= 1

        fingerprint = 0
        for i, weight in enumerate(weights):
            if weight >= 0:
                fingerprint |= 1 << i
        return fingerprint

    def hamming_distance(self, other):
        """Return the number of bit positions where the fingerprints differ.

        Only the low ``self.hashbits`` bits of the XOR are counted.

        Args:
            other: Another object exposing an integer ``hash`` attribute.
        """
        differing = (self.hash ^ other.hash) & ((1 << self.hashbits) - 1)
        return bin(differing).count("1")

    def similarity(self, other):
        """Return the ratio smaller/larger of the two fingerprint values.

        Args:
            other: Another object exposing an integer ``hash`` attribute.

        Returns:
            A float; ``1.0`` when both fingerprints are equal, including the
            case where both are 0 (which would otherwise divide by zero).
        """
        a = float(self.hash)
        b = float(other.hash)
        low, high = (b, a) if a > b else (a, b)
        if high == 0:
            # Both fingerprints are 0: identical, and 0/0 must not be evaluated.
            return 1.0
        return low / high

    def _string_hash(self, source):
        """Hash the string *source* to a non-negative integer.

        This is a variable-width variant of the classic multiplicative
        string hash: the running value is kept to ``hashbits`` bits while the
        characters are mixed in, then XOR-ed with the string length.

        Args:
            source: The token to hash.

        Returns:
            ``0`` for the empty string, otherwise the hash value.
        """
        if source == "":
            return 0

        multiplier = 1000003
        mask = 2 ** self.hashbits - 1
        x = ord(source[0]) << 7
        for c in source:
            x = ((x * multiplier) ^ ord(c)) & mask
        x ^= len(source)
        # The value is built from non-negative operands only, so it can never
        # be -1; no sentinel remapping to a negative value is performed,
        # which keeps the per-bit tests in simhash() meaningful.
        return x
# Demo: fingerprint three sample sentences and print, for the first sentence
# against each of the other two, the Hamming distance and the similarity ratio.
if __name__ == '__main__':
    s = ' This was a test string for testing '
    hash1 = Simhash(s.split())
    s = ' This was a test string for testing also '
    hash2 = Simhash(s.split())
    s = ' Nai nai ge Xiong Cao '
    hash3 = Simhash(s.split())
    print(hash1.hamming_distance(hash2), "", hash1.similarity(hash2))
    print(hash1.hamming_distance(hash3), "", hash1.similarity(hash3))