Put simply, the Simhash algorithm quickly searches a massive collection of texts for the set of Simhash values that differ from a known Simhash value by fewer than K bits. Here each text is represented by a single Simhash value; a Simhash has 64 bits, and similar texts have similar 64-bit values. The empirical value of K given in the paper is 3. The disadvantages of the method are as obvious as its advantages, and there are two of them: first, for short texts the result is very sensitive to the value of K; second, because the algorithm trades space for time, it consumes a large amount of system memory.
The code is as follows:
#!/usr/bin/python
# coding=utf-8
class Simhash:
    """Simhash fingerprint of a sequence of tokens.

    Each token is hashed to a ``hashbits``-wide integer; the fingerprint
    is built bit by bit from a vote over all token hashes (see
    :meth:`simhash`). The result is stored in ``self.hash``.
    """

    # Constructor
    def __init__(self, tokens='', hashbits=128):
        """Build the fingerprint for *tokens*.

        Args:
            tokens: Iterable of string tokens. Note that passing a plain
                string iterates over its characters; the default empty
                string yields no tokens at all.
            hashbits: Width of the fingerprint in bits.
        """
        self.hashbits = hashbits
        self.hash = self.simhash(tokens)

    # String conversion
    def __str__(self):
        """Return the fingerprint as a decimal string."""
        return str(self.hash)

    # Generate the simhash value
    def simhash(self, tokens):
        """Return the simhash fingerprint (an int) of *tokens*.

        For every bit position a counter is incremented when a token's
        hash has that bit set and decremented otherwise. Bit ``i`` of the
        fingerprint is 1 when counter ``i`` ends up >= 0, so an empty
        token sequence produces a fingerprint with all bits set.
        """
        v = [0] * self.hashbits
        for token in tokens:
            t = self._string_hash(token)  # plain hash of this token
            for i in range(self.hashbits):
                if t & (1 << i):
                    v[i] += 1  # bit i is set in the token hash: vote up
                else:
                    v[i] -= 1  # bit i is clear: vote down
        fingerprint = 0
        for i, weight in enumerate(v):
            if weight >= 0:
                # Each position contributes a distinct bit, so OR-ing is
                # the same as adding 1 << i.
                fingerprint |= 1 << i
        return fingerprint

    # Hamming distance
    def hamming_distance(self, other):
        """Return the number of differing bits between two fingerprints.

        Only the low ``self.hashbits`` bits of the XOR are counted.
        """
        x = (self.hash ^ other.hash) & ((1 << self.hashbits) - 1)
        return bin(x).count("1")

    # Similarity
    def similarity(self, other):
        """Return the ratio smaller/larger of the two fingerprint values.

        This compares the fingerprints as numbers (converted to float),
        not bit by bit. Equal fingerprints give 1.0, including the case
        where both are 0, which would otherwise divide by zero.
        """
        a = float(self.hash)
        b = float(other.hash)
        if a == b:
            return 1.0
        if a > b:
            return b / a
        return a / b

    # Hash a single source string (a variable-width version of Python's
    # built-in string hash)
    def _string_hash(self, source):
        """Return a hash of *source* masked to ``self.hashbits`` bits.

        The empty string hashes to 0.
        """
        if source == "":
            return 0
        x = ord(source[0]) << 7
        m = 1000003
        mask = 2 ** self.hashbits - 1
        for c in source:
            x = ((x * m) ^ ord(c)) & mask
        x ^= len(source)
        # Carried over from the hash this is modelled on; x is
        # non-negative after masking, so this is not expected to trigger.
        if x == -1:
            x = -2
        return x
# Demo: fingerprint three sentences and print, for the first sentence
# against each of the other two, the Hamming distance and the similarity.
if __name__ == '__main__':
    s = ' This was a test string for testing '
    hash1 = Simhash(s.split())
    s = ' This was a test string for testing also '
    hash2 = Simhash(s.split())
    s = ' Nai nai ge Xiong Cao '
    hash3 = Simhash(s.split())
    print(hash1.hamming_distance(hash2), "", hash1.similarity(hash2))
    print(hash1.hamming_distance(hash3), "", hash1.similarity(hash3))