Skip to content

Commit

Permalink
Merge pull request #13765 from makyen/Mak-chunk-BL-WL-to-reduce-regex…
Browse files Browse the repository at this point in the history
…-recompile

Chunk the blacklists and watchlist to reduce regex recompiles upon reloading the blacklists and watchlist

autopull
  • Loading branch information
makyen authored Oct 28, 2024
2 parents 1ab92c3 + 69a48f7 commit 69b9a70
Show file tree
Hide file tree
Showing 3 changed files with 94 additions and 75 deletions.
8 changes: 7 additions & 1 deletion chatcommands.py
Original file line number Diff line number Diff line change
Expand Up @@ -2075,7 +2075,7 @@ def invite(msg, room_id, roles):
# --- Post Responses --- #
# noinspection PyIncorrectDocstring
@command(str, whole_msg=True, privileged=False, give_name=True,
aliases=["scan", "scan-force", "report-force", "report-direct"])
aliases=["scan", "scan-force", "report-force", "report-direct", "scan-time", "scan-force-time"])
def report(msg, args, alias_used="report"):
"""
Report a post (or posts)
Expand All @@ -2095,6 +2095,10 @@ def report(msg, args, alias_used="report"):

alias_used = alias_used or "report"

is_timed = "-time" in alias_used
alias_used = alias_used.replace("-time", "")
start_time = time.time()

argsraw = args.split(' "', 1)
urls = argsraw[0].split(' ')

Expand Down Expand Up @@ -2122,6 +2126,8 @@ def report(msg, args, alias_used="report"):
if output:
if 1 < len(urls) > output.count("\n") + 1:
add_or_update_multiple_reporter(msg.owner.id, msg._client.host, time.time())
if is_timed:
output += "\nScanning took {} seconds.".format(round(time.time() - start_time, 3))
return output


Expand Down
129 changes: 71 additions & 58 deletions findspam.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

from helpers import log, regex_compile_no_cache, strip_pre_and_code_elements, strip_code_elements, \
get_bookended_keyword_regex_text_from_entries, keyword_bookend_regex_text, KEYWORD_BOOKENDING_START, \
get_non_bookended_keyword_regex_text_from_entries
get_non_bookended_keyword_regex_text_from_entries, chunk_list
import metasmoke_cache
from globalvars import GlobalVars
import blacklists
Expand Down Expand Up @@ -582,45 +582,46 @@ class FindSpam:
('warning', '**Very High** ', 30), # > 30 s: Log a "warning" and output to chat as bold "Very High"
]

@staticmethod
def _update_a_blacklist_dual_rule(rule_list, regex_text_generator, entries):
    """
    Refresh the two rules in rule_list from entries, touching only rules whose regex text changed.

    The entries are split into two chunks: a large first chunk whose length is a multiple of 100, and a
    small second chunk holding the remainder. As long as the first chunk's content is unchanged, only the
    second rule's regex text differs after a reload.

    :param rule_list: Sequence of (at least) two rule objects. Each must expose a writable ``regex``
                      attribute and a ``sanity_check()`` method, and may carry a ``compiled_regex``
                      attribute.
    :param regex_text_generator: Callable which takes a list of entries and returns the regex text for them.
    :param entries: Iterable of blacklist/watchlist entries. It's copied into a list, so dict views and
                    other one-shot iterables are acceptable.
    :return: None. The rules in rule_list are updated in place.
    """
    entries = list(entries)
    entries_length = len(entries)
    if entries_length > 100:
        # Round the length down to the nearest multiple of 100. The remainder is always shorter than
        # that, so this yields at most two chunks.
        chunk_length = (entries_length // 100) * 100
        entries_lists = chunk_list(entries, chunk_length)
    else:
        # With <= 100 entries, use a single chunk holding all of the entries.
        entries_lists = [entries]
    if len(entries_lists) == 1:
        # Only one chunk (<= 100 entries, or an exact multiple of 100): give the second rule a single
        # entry which is a regex that can never match (a "q" which isn't preceded by itself).
        entries_lists.append([r'q(?<!q)'])
    for index in range(2):
        rule = rule_list[index]
        new_regex_text = regex_text_generator(entries_lists[index])
        if new_regex_text == rule.regex:
            # Unchanged text: leave the rule, and anything it has cached, alone.
            continue
        rule.regex = new_regex_text
        try:
            # Discard any stale compiled form; presumably the rule recompiles on demand (confirm in the
            # rule class). A rule which was never compiled has nothing to discard.
            del rule.compiled_regex
        except AttributeError:
            pass
        rule.sanity_check()

@classmethod
def reload_blacklists(cls):
global bad_keywords_nwb

blacklists.load_blacklists()
new_bad_keywords_regex = get_bookended_keyword_regex_text_from_entries(GlobalVars.bad_keywords)
if new_bad_keywords_regex != cls.rule_bad_keywords.regex:
cls.rule_bad_keywords.regex = new_bad_keywords_regex
try:
del cls.rule_bad_keywords.compiled_regex
except AttributeError:
pass
cls.rule_bad_keywords.sanity_check()
new_watched_keywords_regex = get_bookended_keyword_regex_text_from_entries(GlobalVars.watched_keywords.keys())
if new_watched_keywords_regex != cls.rule_watched_keywords.regex:
cls.rule_watched_keywords.regex = new_watched_keywords_regex
try:
del cls.rule_watched_keywords.compiled_regex
except AttributeError:
pass
cls.rule_watched_keywords.sanity_check()
new_blacklisted_websites_regex = \
get_non_bookended_keyword_regex_text_from_entries(GlobalVars.blacklisted_websites)
if new_blacklisted_websites_regex != cls.rule_blacklisted_websites.regex:
cls.rule_blacklisted_websites.regex = new_blacklisted_websites_regex
try:
del cls.rule_blacklisted_websites.compiled_regex
except AttributeError:
pass
cls.rule_blacklisted_websites.sanity_check()
new_blacklisted_usernames_regex = \
get_non_bookended_keyword_regex_text_from_entries(GlobalVars.blacklisted_usernames)
if new_blacklisted_usernames_regex != cls.rule_blacklisted_usernames.regex:
cls.rule_blacklisted_usernames.regex = new_blacklisted_usernames_regex
try:
del cls.rule_blacklisted_usernames.compiled_regex
except AttributeError:
pass
cls.rule_blacklisted_usernames.sanity_check()
cls._update_a_blacklist_dual_rule(cls.rule_bad_keywords,
get_bookended_keyword_regex_text_from_entries,
GlobalVars.bad_keywords)
cls._update_a_blacklist_dual_rule(cls.rule_watched_keywords,
get_bookended_keyword_regex_text_from_entries,
GlobalVars.watched_keywords.keys())
cls._update_a_blacklist_dual_rule(cls.rule_blacklisted_websites,
get_non_bookended_keyword_regex_text_from_entries,
GlobalVars.blacklisted_websites)
cls._update_a_blacklist_dual_rule(cls.rule_blacklisted_usernames,
get_non_bookended_keyword_regex_text_from_entries,
GlobalVars.blacklisted_usernames)
GlobalVars.blacklisted_numbers_full, GlobalVars.blacklisted_numbers, \
GlobalVars.blacklisted_numbers_normalized = \
phone_numbers.process_numlist(GlobalVars.blacklisted_numbers_raw)
Expand Down Expand Up @@ -769,6 +770,14 @@ def decorator(func):
return decorator


def create_multiple_rules(*args, rule_quantity=1, rule_id=None, **kwargs):
    """
    Create rule_quantity rules through create_rule() and return them as a list.

    Every rule gets the same positional and keyword arguments. Only the rule ID differs: each one is
    rule_id followed by ' index:' and the rule's zero-based position in the returned list.

    :param rule_quantity: How many rules to create.
    :param rule_id: Base text for the rule IDs. It's concatenated with a str, so a value which can't be
                    (including the default of None) raises TypeError when rule_quantity is at least 1.
    :return: List of whatever create_rule() returned, in index order.
    """
    # Work out all of the IDs up front; they depend only on rule_id and the position.
    numbered_ids = [rule_id + ' index:' + str(position) for position in range(rule_quantity)]
    return [create_rule(*args, rule_id=numbered_id, **kwargs) for numbered_id in numbered_ids]


def is_whitelisted_website(url):
    """
    Check whether url is whitelisted, either by WHITELISTED_WEBSITES_REGEX or by the metasmoke cache.

    :param url: The text which is searched with WHITELISTED_WEBSITES_REGEX and handed to
                metasmoke_cache.is_website_whitelisted().
    :return: True when the regex finds a match; otherwise, whatever
             metasmoke_cache.is_website_whitelisted() returns for url.
    """
    # Imported from method link_at_end
    if WHITELISTED_WEBSITES_REGEX.search(url):
        # The regex already settles it, so the metasmoke cache isn't consulted.
        return True
    return metasmoke_cache.is_website_whitelisted(url)
Expand Down Expand Up @@ -2407,29 +2416,33 @@ def religion_troll(s, site):


# General blacklists, regex will be filled in by the FindSpam.reload_blacklists() call at the bottom
FindSpam.rule_bad_keywords = create_rule("bad keyword in {}", regex="",
username=True, body_summary=True,
max_rep=32, max_score=1, skip_creation_sanity_check=True,
rule_id="main blacklisted keywords")
FindSpam.rule_watched_keywords = create_rule("potentially bad keyword in {}", regex="",
username=True, body_summary=True,
max_rep=32, max_score=1, skip_creation_sanity_check=True,
rule_id="main watchlist",
elapsed_time_reporting={
'draw_attention_min': 20,
'levels': [
('debug', '', 10),
('info', 'High ', 20),
('warning', '**Very High** ', 45),
],
})
FindSpam.rule_blacklisted_websites = create_rule("blacklisted website in {}", regex="", body_summary=True,
max_rep=52, max_score=5, skip_creation_sanity_check=True,
username=True, rule_id="main blacklisted websites")
FindSpam.rule_blacklisted_usernames = create_rule("blacklisted username", regex="",
title=False, body=False, username=True,
skip_creation_sanity_check=True,
rule_id="main blacklisted usernames")
FindSpam.rule_bad_keywords = create_multiple_rules("bad keyword in {}", regex="",
username=True, body_summary=True,
max_rep=32, max_score=1, skip_creation_sanity_check=True,
rule_id="main blacklisted keywords",
rule_quantity=2)
FindSpam.rule_watched_keywords = create_multiple_rules("potentially bad keyword in {}", regex="",
username=True, body_summary=True,
max_rep=32, max_score=1, skip_creation_sanity_check=True,
rule_id="main watchlist",
elapsed_time_reporting={
'draw_attention_min': 20,
'levels': [
('debug', '', 10),
('info', 'High ', 20),
('warning', '**Very High** ', 45),
],
},
rule_quantity=2)
FindSpam.rule_blacklisted_websites = create_multiple_rules("blacklisted website in {}", regex="", body_summary=True,
max_rep=52, max_score=5, skip_creation_sanity_check=True,
username=True, rule_id="main blacklisted websites",
rule_quantity=2)
FindSpam.rule_blacklisted_usernames = create_multiple_rules("blacklisted username", regex="",
title=False, body=False, username=True,
skip_creation_sanity_check=True,
rule_id="main blacklisted usernames",
rule_quantity=2)

# Hardcoded bad keywords without a word boundary (from bad_keywords_nwb list above).
create_rule("bad keyword in {}", regex=r"(?is){}".format("|".join(bad_keywords_nwb)),
Expand Down
32 changes: 16 additions & 16 deletions test/test_findspam.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,22 +102,22 @@
('IDNA misleading link', '<a href="http://www.h%c3%a5nd.no">http://www.h\u00E5nd.no</a>', '', '', False, False, False),
('Mostly punctuation', ';[].[.[.&_$)_\\*&_@$.[;*/-!#*&)(_.\'].1\\)!#_', '', '', False, False, True),
('Few unique', 'asdss, dadasssaadadda, daaaadadsss, ssa,,,addadas,ss\nsdadadsssadadas, sss\ndaaasdddsaaa, asd', '', '', False, False, True),
('ketones', 'ketones', 'ketones', 'chemistry.stackexchange.com', False, False, False),
('ketones', 'ketones', 'ketones', 'chemistry.stackexchange.com', False, True, False),
('ketones', 'ketones', 'ketones', 'chemistry.stackexchange.com', True, False, False),
('ketones', 'ketones', 'ketones', 'chemistry.stackexchange.com', True, True, False),
('keytones', '<p>Some body</p>', 'a username', 'superuser.com', False, False, True),
('A title', 'keytones', 'a username', 'superuser.com', False, False, True),
('A title', '<p>Some body</p>', 'keytones', 'superuser.com', False, False, True),
('keytones', '<p>Some body</p>', 'a username', 'superuser.com', False, True, False),
('A title', 'keytones', 'a username', 'superuser.com', False, True, True),
('A title', '<p>Some body</p>', 'keytones', 'superuser.com', False, True, True),
('keytones', '<p>Some body</p>', 'a username', 'superuser.com', True, False, True),
('A title', 'keytones', 'a username', 'superuser.com', True, False, True),
('A title', '<p>Some body</p>', 'keytones', 'superuser.com', True, False, True),
('keytones', '<p>Some body</p>', 'a username', 'superuser.com', True, True, False),
('A title', 'keytones', 'a username', 'superuser.com', True, True, True),
('A title', '<p>Some body</p>', 'keytones', 'superuser.com', True, True, True),
('ketones on Chemistry', 'ketones', 'ketones', 'chemistry.stackexchange.com', False, False, False),
('ketones on Chemistry as answer', 'ketones', 'ketones', 'chemistry.stackexchange.com', False, True, False),
('ketones on Chemistry as body_summary', 'ketones', 'ketones', 'chemistry.stackexchange.com', True, False, False),
('ketones on Chemistry as body_summary and answer', 'ketones', 'ketones', 'chemistry.stackexchange.com', True, True, False),
('keytones on SuperUser', '<p>Some body</p>', 'a username', 'superuser.com', False, False, True),
('keytones on SuperUser as answer', '<p>Some body</p>', 'a username', 'superuser.com', False, True, False),
('A title with KyT in body', 'keytones', 'a username', 'superuser.com', False, False, True),
('A title with KyT in username', '<p>Some body</p>', 'keytones', 'superuser.com', False, False, True),
('A title with KyT in body as answer', 'keytones', 'a username', 'superuser.com', False, True, True),
('A title with KyT in username as answer', '<p>Some body</p>', 'keytones', 'superuser.com', False, True, True),
('keytones on SuperUser as body_summary', '<p>Some body</p>', 'a username', 'superuser.com', True, False, True),
('A title with KyT in body as body_summary on SuperUser', 'keytones', 'a username', 'superuser.com', True, False, True),
('A title with KyT in username as body_summary on SuperUser', '<p>Some body</p>', 'keytones', 'superuser.com', True, False, True),
('keytones on SuperUser as body_summary and answer', '<p>Some body</p>', 'a username', 'superuser.com', True, True, False),
('A title with KyT in body as body summary and answer', 'keytones', 'a username', 'superuser.com', True, True, True),
('A title with KyT in username as body summary and answer', '<p>Some body</p>', 'keytones', 'superuser.com', True, True, True),
('C01nb4s3 support number', 'obfuscated_word in title', 'spammer', 'stackoverflow.com', False, False, True),
('obfuscated_word in body', 'C01nb4$3 support number', 'spammer', 'stackoverflow.com', False, False, True),
('''airline's responsibilities''', 'test case for "not obfuscated after all" (#7345)', 'good guy', 'stackoverflow.com', False, False, False),
Expand Down

0 comments on commit 69b9a70

Please sign in to comment.