Fix buggy Jaccard algorithm (used for repeat and mass-repeat detection)

This commit is contained in:
Nicolas Coevoet 2013-11-25 10:16:43 +01:00
parent df3118838f
commit 548e432274

View File

@ -2574,8 +2574,11 @@ class ChanTracker(callbacks.Plugin,plugins.ChannelDBHandler):
def _strcompare(self, a, b):
    """Return the Jaccard similarity of two strings as a float in [0, 1].

    Each string is reduced to the set of its distinct characters and the
    ratio ``|A & B| / |A | B|`` is returned, so repeated characters and
    character order do not affect the score.

    a, b -- the strings (or any iterables of hashable items) to compare.

    Returns 1.0 when both inputs contain the same set of characters
    (including when both are empty) and 0.0 when they share none.
    """
    sa, sb = set(a), set(b)
    union = len(sa | sb)
    if not union:
        # Both inputs are empty: they are identical, and the ratio below
        # would otherwise raise ZeroDivisionError.
        return 1.0
    # float() keeps this a true division under Python 2 as well.
    return len(sa & sb) / float(union)
# Module-level alias for the ChanTracker plugin class (presumably the name
# the plugin loader looks up in this module -- TODO confirm).
Class = ChanTracker