Article from: https://www.cnblogs.com/kayy/p/9967494.html
#-*- coding: utf-8 -*-

import jieba.analyse
import numpy as np
import hashlib

cts = 128
# Number of binary digits in the hash string: simhash() pads/truncates every
# per-keyword hash to this length and returns a fingerprint of this length.

def word_hash(t, w):
    """Return the signed vote that one hash digit contributes.

    Args:
        t: A single binary digit (anything ``int()`` accepts, e.g. '0' or '1').
        w: The weight of the keyword the digit belongs to.

    Returns:
        ``w`` when the digit equals 1, otherwise ``-w``.
    """
    return w if int(t) == 1 else -w
def content_hash(t):
    """Collapse an accumulated vote into a single fingerprint digit.

    Args:
        t: The summed vote for one bit position (anything ``float()`` accepts).

    Returns:
        '1' when the vote is strictly positive, otherwise '0'.
    """
    return '1' if float(t) > 0 else '0'


def simhash(content):
    """Return the simhash fingerprint of *content* as a string of '0'/'1'.

    Keywords and their weights are extracted with
    ``jieba.analyse.extract_tags`` (top 100).  Each keyword is hashed with
    MD5, the digest is rendered as ``cts`` binary digits, and every digit
    votes ``+weight`` (digit 1) or ``-weight`` (digit 0) for its position.
    Positions whose summed vote is positive become '1', all others '0'.

    Args:
        content: The text to fingerprint.

    Returns:
        A string of ``cts`` characters, each '0' or '1'.
    """
    tags = jieba.analyse.extract_tags(content, topK=100, withWeight=True)

    hash_init = np.zeros(cts)

    for t, w in tags:
        digest = hashlib.md5(t.encode('utf-8')).hexdigest()

        # Left-pad the binary form to ``cts`` digits, then keep the lowest
        # ``cts`` digits so the string always has exactly that length.
        hash_str = bin(int(digest, 16)).replace('0b', '').zfill(cts)[-cts:]

        # Build a concrete array before adding: on Python 3 ``map`` returns a
        # lazy iterator, which cannot be added to an ndarray.
        hash_init = hash_init + np.array([word_hash(bit, w) for bit in hash_str])

    # Collapse the totals only after every keyword has voted.
    return ''.join(content_hash(total) for total in hash_init)

======================================================================================================================================================================
#-*- coding: utf-8 -*-

from simhash import simhash
import Levenshtein


if __name__ == '__main__':
    # Demo: fingerprint two sentences and print how far apart they are.
    # content1 is double-quoted because the text itself contains an apostrophe.
    content1 = "After the upsurge of feeling, the discussion about Renren and the value of Renren's live broadcasting began to surface. "
    content2 = 'Chen Yizhou also believes that compared with young entrepreneurs, Laojiang Lake is more suitable for industrial Internet. '

    hash1 = simhash(content1)
    hash2 = simhash(content2)

    # Hamming distance between the two fingerprints, divided by the
    # fingerprint length.  float() keeps the division fractional on Python 2,
    # and the single-argument print() call works on Python 2 and 3 alike.
    # (A Levenshtein-free variant that counts matching positions instead:
    #  sum(a == b for a, b in zip(hash1, hash2)) / float(len(hash1)).)
    print(Levenshtein.hamming(hash1, hash2) / float(len(hash1)))
Link to this article: simhash

Leave a Reply

Your email address will not be published. Required fields are marked *