From 38ab2e449f93e8a0dddc41d615da5b70eebc710d Mon Sep 17 00:00:00 2001 From: Nicolas Coevoet Date: Wed, 16 Jun 2021 22:55:23 +0200 Subject: [PATCH] Complete modification or repetition detection, mostly for pattern creation --- config.py | 41 ++++---- plugin.py | 281 +++++++++++++++++++++++++++++------------------------- 2 files changed, 169 insertions(+), 153 deletions(-) diff --git a/config.py b/config.py index f3a7f4a..d65d4f6 100644 --- a/config.py +++ b/config.py @@ -239,39 +239,30 @@ registry.PositiveInteger(180,"""punishment duration in seconds""")) conf.registerChannelValue(ChanTracker, 'lowFloodComment', registry.String('low flood detected',"""comment added on mode changes database, empty for no comment""")) -# repeat detection +# repeat abuse conf.registerChannelValue(ChanTracker, 'repeatPermit', -registry.Integer(-1,"""Number of repeated text allowed, -1 to disable""")) +registry.Integer(-1,"""number of triggers allowed, -1 to disable""")) conf.registerChannelValue(ChanTracker, 'repeatLife', -registry.PositiveInteger(12,"""Duration of messages's life in repeatPermit counter in seconds""")) -conf.registerChannelValue(ChanTracker, 'repeatPercent', -registry.Probability(0.85,"""percent of similarity needed between previous and current message to trigger a repeat count""")) +registry.PositiveInteger(12,"""life duration of triggers in seconds""")) conf.registerChannelValue(ChanTracker, 'repeatMode', -registry.String('q',"""mode used by the bot when repeat detection is triggered""")) +registry.String('q',"""action when repeatPermit is triggered""")) conf.registerChannelValue(ChanTracker, 'repeatDuration', -registry.PositiveInteger(180,"""punishment duration in seconds""")) +registry.PositiveInteger(180,"""punishment duration in seconds""")) conf.registerChannelValue(ChanTracker, 'repeatComment', registry.String('repeat detected',"""comment added on mode changes database, empty for no comment""")) -# mass repeat detection -conf.registerChannelValue(ChanTracker, 'massRepeatChars', -registry.PositiveInteger(40,"""number of chars needed to enter massRepeat detection""")) -conf.registerChannelValue(ChanTracker, 'massRepeatPermit', -registry.Integer(-1,"""Number of repeated text allowed, -1 to disable, tracks message repetition from various sources on the given channel""")) -conf.registerChannelValue(ChanTracker, 'massRepeatLife', -registry.PositiveInteger(12,"""Duration of messages's life in massRepeat counter, in seconds""")) -conf.registerChannelValue(ChanTracker, 'massRepeatPercent', -registry.Probability(0.85,"""percentage similarity between previous and current message to trigger a repeat count""")) -conf.registerChannelValue(ChanTracker, 'massRepeatMode', -registry.String('b',"""mode used by the bot when repeat detection is triggered""")) -conf.registerChannelValue(ChanTracker, 'massRepeatDuration', -registry.PositiveInteger(1800,"""punition in seconds""")) -conf.registerChannelValue(ChanTracker, 'massRepeatComment', -registry.String('mass repeat detected',"""comment added on mode changes database, empty for no comment""")) -conf.registerChannelValue(ChanTracker, 'massRepeatPatternLife', +# repeat detection +conf.registerChannelValue(ChanTracker, 'repeatMinimum', +registry.PositiveInteger(8,"""minimal candidates patterns length to detect repetitions""")) +conf.registerChannelValue(ChanTracker, 'repeatPercent', +registry.Probability(0.85,"""percent of similarity between messages or pattern candidates""")) +conf.registerChannelValue(ChanTracker, 'repeatCount', +registry.PositiveInteger(5,"""numbers of occurences of candidates patterns""")) +conf.registerChannelValue(ChanTracker, 'repeatPatternMinimum', +registry.Integer(-1,"""minimal length to create automated pattern, if found, triggers same punishment than repeatMode/repeatDuration, -1 to disable""")) +conf.registerChannelValue(ChanTracker, 'repeatPatternLife', registry.PositiveInteger(300,"""duration of pattern life""")) -conf.registerChannelValue(ChanTracker, 'massRepeatPatternLength', -registry.Integer(-1,"""if -1, it uses the default system to compare strings, otherwise, it try to find the longest common message, and use it as a regexp pattern, if found string < length setted, it uses the default string compare""")) + # YES IT'S ANNOYING diff --git a/plugin.py b/plugin.py index fc7232c..bec522f 100644 --- a/plugin.py +++ b/plugin.py @@ -56,10 +56,12 @@ from ipaddress import ip_network as IPNetwork #due to more kind of pattern checked, increase size + ircutils._hostmaskPatternEqualCache = utils.structures.CacheDict(10000) cache = utils.structures.CacheDict(10000) + def applymodes(channel, args=(), prefix='', msg=None): """Returns a MODE that applies changes on channel.""" modes = args @@ -70,6 +72,53 @@ def applymodes(channel, args=(), prefix='', msg=None): mcidr = re.compile(r'^(\d{1,3}\.){0,3}\d{1,3}/\d{1,2}$') m6cidr = re.compile(r'^([0-9a-f]{0,4}:){2,7}[0-9a-f]{0,4}/\d{1,3}$') +def compareString (a,b): + """return 0 to 1 float percent of similarity ( 0.85 seems to be a good average )""" + if a == b: + return 1 + sa, sb = set(a), set(b) + n = len(sa.intersection(sb)) + if float(len(sa) + len(sb) - n) == 0: + return 0 + jacc = n / float(len(sa) + len(sb) - n) + return jacc + +repetr = re.compile(r"(.+?)\1+") + +def repetitions(s): + for match in repetr.finditer(s): + yield (match.group(1), len(match.group(0))/len(match.group(1))) + +def largestString (s1,s2): + """return largest pattern available in 2 strings""" + # From https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Longest_common_substring#Python2 + # License: CC BY-SA + m = [[0] * (1 + len(s2)) for i in range(1 + len(s1))] + longest, x_longest = 0, 0 + for x in range(1, 1 + len(s1)): + for y in range(1, 1 + len(s2)): + if s1[x - 1] == s2[y - 1]: + m[x][y] = m[x - 1][y - 1] + 1 + if m[x][y] > longest: + longest = m[x][y] + x_longest = x + else: + m[x][y] = 0 + return s1[x_longest - longest: x_longest] + +def findPattern(text, minimalCount, minimalLength, minimalPercent): + items = list(repetitions(text)) + size = len(text) + candidates = [] + for item in items: + (pattern, count) = item + percent = ((len(pattern) * count) / size * 100) + if len(pattern) > minimalLength: + if count > minimalCount or percent > minimalPercent: + candidates.append(pattern) + candidates.sort(key=len, reverse=True) + return None if len(candidates) == 0 else candidates[0] + def matchHostmask (pattern,n,resolve): # return the matched pattern for Nick if n.prefix == None or not ircutils.isUserHostmask(n.prefix): @@ -2114,6 +2163,33 @@ class ChanTracker(callbacks.Plugin,plugins.ChannelDBHandler): results.append(id) irc.reply('%s' % ', '.join(results)) rmmode = wrap(rmmode,['owner',commalist('int')]) + + # def getprotection (self,irc,msg,args,channel,protection): + # """[] + + # returns channel's protections settings""" + # s = '' + # if protection == 'clone': + # permit = self.registryValue('clonePermit',channel=channel) + # if permit < 0: + # irc.reply('%s is disabled in %s' % (protected, channel)) + # return + # irc.reply('%s clients in %s triggers +%s during %ss' % (permit,channel,self.registryValue('cloneMode',channel=channel),self.registryValue('cloneDuration',channel=channel))) + # elif protection == 'flood' or protection == 'lowFlood': + # permit = self.registryValue('%sPermit' % protection,channel=channel) + # if permit < 0: + # irc.reply('%s is disabled in %s' % (protection, channel)) + # return + # irc.reply('%s messages in %ss triggers +%s during %ss' % (permit,self.registryValue('%sLife' % protection,channel=channel),self.registryValue('%sMode' % protection,channel=channel),self.registryValue('%sDuration' % protection,channel=channel))) + # elif protection == 'repeat': + # permit = self.registryValue('%sPermit' % protection,channel=channel) + # if permit < 0: + # irc.reply('%s is disabled in %s' % (protection, channel)) + # return + # irc.reply('%s messages in %ss triggers +%s during %ss (%s similarity)' % (permit,self.registryValue('%sLife' % protection,channel=channel),self.registryValue('%sMode' % protection,channel=channel),self.registryValue('%sDuration' % protection,channel=channel),self.registryValue('repeatPercent',channel=channel))) + + + #protect = wrap(protect,['op','text']) def getIrcdMode (self,irc,mode,pattern): # here we try to know which kind of mode and pattern should be computed : @@ -3245,19 +3321,6 @@ class ChanTracker(callbacks.Plugin,plugins.ChannelDBHandler): return if targets == irc.nick: b = False - # todo keep this code commented until request to implement it - #b = False - #if text == 'You are not authorized to perform this operation.': - #b = True - #if b: - #i = self.getIrc(irc) - #for nick in i.nicks: - #n = i.getNick(irc,nick) - #if n.prefix and ircdb.checkCapability(n.prefix, 'owner') and n.prefix != irc.prefix: - #irc.queueMsg(ircmsgs.privmsg(n.prefix.split('!')[0],'Warning got %s notice: %s' % (msg.prefix,text))) - #break - #if text.startswith('*** Message to ') and text.endswith(' throttled due to flooding'): - # as bot floods, todo schedule info to owner else: if msg.nick == irc.nick: return @@ -3288,32 +3351,21 @@ class ChanTracker(callbacks.Plugin,plugins.ChannelDBHandler): isVip = self._isVip(irc,channel,n) if not isVip: isNotice = self._isSomething(irc,channel,best,'notice') - isMass = self._isMassRepeat(irc,channel,text) isBad = False - if isMass: - kind = 'massRepeat' + if isNotice: + isBad = self._isSomething(irc,channel,best,'bad') + if isNotice or isBad: + kind = None + if isBad: + kind = 'bad' + else: + kind = 'notice' mode = self.registryValue('%sMode' % kind,channel=channel) duration = self.registryValue('%sDuration' % kind,channel=channel) comment = self.registryValue('%sComment' % kind,channel=channel) r = self.getIrcdMode(irc,mode,best) self._act(irc,channel,r[0],r[1],duration,comment) - self._isBad(irc,channel,best) self.forceTickle = True - if isNotice: - isBad = self._isSomething(irc,channel,best,'bad') - if not isMass: - if isNotice or isBad: - kind = None - if isBad: - kind = 'bad' - else: - kind = 'notice' - mode = self.registryValue('%sMode' % kind,channel=channel) - duration = self.registryValue('%sDuration' % kind,channel=channel) - comment = self.registryValue('%sComment' % kind,channel=channel) - r = self.getIrcdMode(irc,mode,best) - self._act(irc,channel,r[0],r[1],duration,comment) - self.forceTickle = True if self.registryValue('announceNotice',channel=channel): if not chan.isWrong(best): if self.registryValue('useColorForAnnounces',channel=channel): @@ -3395,7 +3447,6 @@ class ChanTracker(callbacks.Plugin,plugins.ChannelDBHandler): n.addLog(channel,message) # protection features isVip = self._isVip(irc,channel,n) - # checking if message matchs living massRepeatPattern if not isVip: isCtcp = False if isCtcpMsg and not isAction: @@ -3420,10 +3471,6 @@ class ChanTracker(callbacks.Plugin,plugins.ChannelDBHandler): isCap = False if ircdb.checkCapability(msg.prefix,flag): isCap = self._isCap(irc,channel,best,text) - flag = ircdb.makeChannelCapability(channel,'massrepeat') - isMass = False - if ircdb.checkCapability(msg.prefix,flag): - isMass = self._isMassRepeat(irc,channel,text) flag = ircdb.makeChannelCapability(channel,'pattern') isPattern = False if ircdb.checkCapability(msg.prefix,flag): @@ -3451,16 +3498,22 @@ class ChanTracker(callbacks.Plugin,plugins.ChannelDBHandler): isBad = self._isBad(irc,channel,best) self.forceTickle = True chan.countpattern(isPattern.uid,self.getDb(irc.network)) - elif not isPattern and isMass: - kind = 'massRepeat' - mode = self.registryValue('%sMode' % kind,channel=channel) - duration = self.registryValue('%sDuration' % kind,channel=channel) - comment = self.registryValue('%sComment' % kind,channel=channel) - r = self.getIrcdMode(irc,mode,best) - self._act(irc,channel,r[0],r[1],duration,comment) - self._isBad(irc,channel,best) - self.forceTickle = True - elif not isPattern and not isMass: + isTemporaryPattern = False + if not isPattern and not isRepeat: + key = 'pattern%s' % channel + if key in chan.repeatLogs: + patterns = chan.repeatLogs[key] + for pattern in patterns: + if pattern in text: + isTemporaryPattern = pattern + break + if isTemporaryPattern: + chan.repeatLogs[key].enqueue(isTemporaryPattern) + r = self.getIrcdMode(irc,self.registryValue('repeatMode',channel=channel),best) + self._act(irc,channel,r[0],r[1],self.registryValue('repeatDuration',channel=channel),'') # hidden reason matches "%s"' % isTemporaryPattern + isBad = self._isBad(irc,channel,best) + self.forceTickle = True + elif not isPattern and not isTemporaryPattern: if isFlood or isHilight or isRepeat or isCap or isCtcp or isLowFlood: isBad = self._isBad(irc,channel,best) kind = None @@ -4016,7 +4069,37 @@ class ChanTracker(callbacks.Plugin,plugins.ChannelDBHandler): if user in msg: count = count + 1 return count > limit - + + def _addTemporaryPattern(self,irc,channel,pattern,level): + patternLength = self.registryValue('repeatPatternMinimum',channel=channel) + key = 'pattern%s' % channel + if patternLength < 0: + return + if len(pattern) < patternLength: + return + life = self.registryValue('repeatPatternLife',channel=channel) + chan = self.getChan(irc,channel) + if not key in chan.repeatLogs or chan.repeatLogs[key].timeout != life: + chan.repeatLogs[key] = utils.structures.TimeoutQueue(life) + self._logChan(irc,channel,'[%s] pattern created "%s" (%s)' % (channel,pattern,level)) + chan.repeatLogs[key].enqueue(pattern) + + def _computePattern(self,message,logs,probability,patternLength): + candidate = None + bad = False + for msg in logs: + if compareString(message,msg) >= probability: + bad = True + if patternLength > -1: + found = largestString(message,msg) + if found and len(found) > patternLength: + if candidate: + if len(candidate) < len(found): + candidate = found + else: + candidate = found + return (bad,candidate) + def _isRepeat(self,irc,channel,key,message): if self.registryValue('repeatPermit',channel=channel) < 0: return False @@ -4024,59 +4107,36 @@ class ChanTracker(callbacks.Plugin,plugins.ChannelDBHandler): timeout = self.registryValue('repeatLife',channel=channel) if not key in chan.repeatLogs or chan.repeatLogs[key].timeout != timeout: chan.repeatLogs[key] = utils.structures.TimeoutQueue(timeout) + count = self.registryValue('repeatCount',channel=channel) + probability = self.registryValue('repeatPercent',channel=channel) + minimum = self.registryValue('repeatMinimum',channel=channel) + pattern = findPattern(message,count,minimum,100 * probability) + if pattern: + self._addTemporaryPattern(irc,channel,pattern,'single msg') + if self._isSomething(irc,channel,key,'repeat'): + return True + patternLength = self.registryValue('repeatPatternMinimum',channel=channel) logs = chan.repeatLogs[key] - trigger = self.registryValue('repeatPercent',channel=channel) + (flag, pattern) = self._computePattern(message,logs,probability,patternLength) result = False - flag = False - for msg in logs: - if self._strcompare(message,msg) >= trigger: - flag = True - break if flag: result = self._isSomething(irc,channel,key,'repeat') chan.repeatLogs[key].enqueue(message) + if result: + if pattern: + self._addTemporaryPattern(irc,channel,pattern,'single src') return result - - def _isMassRepeat(self,irc,channel,message): - if self.registryValue('massRepeatPermit',channel=channel) < 0 or len(message) < self.registryValue('massRepeatChars',channel=channel): - return False - message = message.lower() - chan = self.getChan(irc,channel) - life = self.registryValue('massRepeatLife',channel=channel) - if not channel in chan.repeatLogs or chan.repeatLogs[channel].timeout != life: - chan.repeatLogs[channel] = utils.structures.TimeoutQueue(life) - patchan = 'pattern%s' % channel - if self.registryValue('massRepeatPatternLength',channel=channel) > 0: - if not patchan in chan.repeatLogs or chan.repeatLogs[patchan].timeout != self.registryValue('massRepeatPatternLife',channel=channel): - chan.repeatLogs[patchan] = utils.structures.TimeoutQueue(self.registryValue('massRepeatPatternLife',channel=channel)) - logs = chan.repeatLogs[patchan] - for msg in logs: - if msg in message: -# self.log.debug('mass repeat "%s" is found in "%s"' % (msg,message)) - #self._isSomething(irc,channel,channel,'massRepeat') - return True + if not channel in chan.repeatLogs or chan.repeatLogs[channel].timeout != timeout: + chan.repeatLogs[channel] = utils.structures.TimeoutQueue(timeout) logs = chan.repeatLogs[channel] - trigger = self.registryValue('massRepeatPercent',channel=channel) - result = False - flag = False - pattern = None - for msg in logs: - if self._strcompare(message,msg) >= trigger: - if self.registryValue('massRepeatPatternLength',channel=channel) > 0: - pattern = self._largestpattern(message,msg) - if pattern and len(pattern) > self.registryValue('massRepeatPatternLength',channel=channel): - pattern = pattern - else: - pattern = None - flag = True - break - if flag: - result = self._isSomething(irc,channel,channel,'massRepeat') - if result and pattern: - if not patchan in chan.repeatLogs or chan.repeatLogs[patchan].timeout != self.registryValue('massRepeatPatternLife',channel=channel): - chan.repeatLogs[patchan] = utils.structures.TimeoutQueue(self.registryValue('massRepeatPatternLife',channel=channel)) - chan.repeatLogs[patchan].enqueue(pattern) + (flag, pattern) = self._computePattern(message,logs,probability,patternLength) chan.repeatLogs[channel].enqueue(message) + result = False + if flag: + result = self._isSomething(irc,channel,channel,'repeat') + if result: + if pattern: + self._addTemporaryPattern(irc,channel,pattern,'all src') return result def _isCap(self,irc,channel,key,message): @@ -4091,40 +4151,5 @@ class ChanTracker(callbacks.Plugin,plugins.ChannelDBHandler): return self._isSomething(irc,channel,key,'cap') return False - def _strcompare (self,a,b): - # return [0 - 1] ratio between two string - # jaccard algo - sa, sb = set(a.lower()), set(b.lower()) - n = len(sa.intersection(sb)) - if float(len(sa) + len(sb) - n) == 0: - return 0 - jacc = n / float(len(sa) + len(sb) - n) - return jacc - - def _largestpattern (self,s1,s2): - s1 = s1.lower() - s2 = s2.lower() - m = [[0] * (1 + len(s2)) for i in range(1 + len(s1))] - longest, x_longest = 0, 0 - for x in range(1, 1 + len(s1)): - for y in range(1, 1 + len(s2)): - if s1[x - 1] == s2[y - 1]: - m[x][y] = m[x - 1][y - 1] + 1 - if m[x][y] > longest: - longest = m[x][y] - x_longest = x - else: - m[x][y] = 0 - return s1[x_longest - longest: x_longest] - - def reset(self): - self._ircs = ircutils.IrcDict() - - def die(self): - self._ircs = ircutils.IrcDict() - - def doError (self,irc,msg): - self._ircs = ircutils.IrcDict() - Class = ChanTracker