Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Chunk the blacklists and watchlist to reduce regex recompiles upon reloading the blacklists and watchlist #13765

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion chatcommands.py
Original file line number Diff line number Diff line change
Expand Up @@ -2075,7 +2075,7 @@ def invite(msg, room_id, roles):
# --- Post Responses --- #
# noinspection PyIncorrectDocstring
@command(str, whole_msg=True, privileged=False, give_name=True,
aliases=["scan", "scan-force", "report-force", "report-direct"])
aliases=["scan", "scan-force", "report-force", "report-direct", "scan-time", "scan-force-time"])
def report(msg, args, alias_used="report"):
"""
Report a post (or posts)
Expand All @@ -2095,6 +2095,10 @@ def report(msg, args, alias_used="report"):

alias_used = alias_used or "report"

is_timed = "-time" in alias_used
alias_used = alias_used.replace("-time", "")
start_time = time.time()

argsraw = args.split(' "', 1)
urls = argsraw[0].split(' ')

Expand Down Expand Up @@ -2122,6 +2126,8 @@ def report(msg, args, alias_used="report"):
if output:
if 1 < len(urls) > output.count("\n") + 1:
add_or_update_multiple_reporter(msg.owner.id, msg._client.host, time.time())
if is_timed:
output += "\nScanning took {} seconds.".format(round(time.time() - start_time, 3))
return output


Expand Down
129 changes: 71 additions & 58 deletions findspam.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

from helpers import log, regex_compile_no_cache, strip_pre_and_code_elements, strip_code_elements, \
get_bookended_keyword_regex_text_from_entries, keyword_bookend_regex_text, KEYWORD_BOOKENDING_START, \
get_non_bookended_keyword_regex_text_from_entries
get_non_bookended_keyword_regex_text_from_entries, chunk_list
import metasmoke_cache
from globalvars import GlobalVars
import blacklists
Expand Down Expand Up @@ -582,45 +582,46 @@ class FindSpam:
('warning', '**Very High** ', 30), # > 30 s: Log a "warning" and output to chat as bold "Very High"
]

@staticmethod
def _update_a_blacklist_dual_rule(rule_list, regex_text_generator, entries):
    """
    Refresh the regexes of a pair of rules from a single list of entries.

    The entries are divided into at most two chunks, one per rule. A rule's
    regex text is only replaced (and its cached compiled regex discarded and
    its sanity check re-run) when the newly generated text differs from the
    text the rule already holds.

    :param rule_list: Sequence holding at least two rule objects. Each must
                      expose a ``regex`` attribute and a ``sanity_check()``
                      method; a ``compiled_regex`` attribute is deleted when
                      present.
    :param regex_text_generator: Callable which accepts a list of entries and
                                 returns the regex text for them.
    :param entries: Iterable of blacklist/watchlist entries.
    :return: None. The rules in ``rule_list`` are updated in place.
    """
    entries = list(entries)
    entries_length = len(entries)
    if entries_length > 100:
        # Round the length down to a multiple of 100 and use that as the size of the first chunk.
        # The remainder (always fewer than 100 entries) forms the second chunk, so entries added
        # at the end of the list leave the text generated for the first chunk unchanged until the
        # length crosses the next multiple of 100.
        chunk_length = int(str(entries_length)[:-2] + '00')
        entries_lists = list(chunk_list(entries, chunk_length))
    else:
        # With <= 100 entries, all of the entries go into the first chunk.
        entries_lists = [entries]
    if len(entries_lists) == 1:
        # Only one chunk exists (<= 100 entries, or a length which is an exact multiple of 100).
        # Give the second rule a single entry which can never match. This must be a call to
        # append(); assigning to the "append" attribute of a list raises AttributeError.
        entries_lists.append([r'q(?<!q)'])
    for index in range(2):
        rule = rule_list[index]
        new_regex_text = regex_text_generator(entries_lists[index])
        if new_regex_text != rule.regex:
            rule.regex = new_regex_text
            try:
                # Drop any previously compiled regex, as it was built from the old text.
                del rule.compiled_regex
            except AttributeError:
                # The rule had no compiled regex yet, so there is nothing to discard.
                pass
            rule.sanity_check()

@classmethod
def reload_blacklists(cls):
global bad_keywords_nwb

blacklists.load_blacklists()
new_bad_keywords_regex = get_bookended_keyword_regex_text_from_entries(GlobalVars.bad_keywords)
if new_bad_keywords_regex != cls.rule_bad_keywords.regex:
cls.rule_bad_keywords.regex = new_bad_keywords_regex
try:
del cls.rule_bad_keywords.compiled_regex
except AttributeError:
pass
cls.rule_bad_keywords.sanity_check()
new_watched_keywords_regex = get_bookended_keyword_regex_text_from_entries(GlobalVars.watched_keywords.keys())
if new_watched_keywords_regex != cls.rule_watched_keywords.regex:
cls.rule_watched_keywords.regex = new_watched_keywords_regex
try:
del cls.rule_watched_keywords.compiled_regex
except AttributeError:
pass
cls.rule_watched_keywords.sanity_check()
new_blacklisted_websites_regex = \
get_non_bookended_keyword_regex_text_from_entries(GlobalVars.blacklisted_websites)
if new_blacklisted_websites_regex != cls.rule_blacklisted_websites.regex:
cls.rule_blacklisted_websites.regex = new_blacklisted_websites_regex
try:
del cls.rule_blacklisted_websites.compiled_regex
except AttributeError:
pass
cls.rule_blacklisted_websites.sanity_check()
new_blacklisted_usernames_regex = \
get_non_bookended_keyword_regex_text_from_entries(GlobalVars.blacklisted_usernames)
if new_blacklisted_usernames_regex != cls.rule_blacklisted_usernames.regex:
cls.rule_blacklisted_usernames.regex = new_blacklisted_usernames_regex
try:
del cls.rule_blacklisted_usernames.compiled_regex
except AttributeError:
pass
cls.rule_blacklisted_usernames.sanity_check()
cls._update_a_blacklist_dual_rule(cls.rule_bad_keywords,
get_bookended_keyword_regex_text_from_entries,
GlobalVars.bad_keywords)
cls._update_a_blacklist_dual_rule(cls.rule_watched_keywords,
get_bookended_keyword_regex_text_from_entries,
GlobalVars.watched_keywords.keys())
cls._update_a_blacklist_dual_rule(cls.rule_blacklisted_websites,
get_non_bookended_keyword_regex_text_from_entries,
GlobalVars.blacklisted_websites)
cls._update_a_blacklist_dual_rule(cls.rule_blacklisted_usernames,
get_non_bookended_keyword_regex_text_from_entries,
GlobalVars.blacklisted_usernames)
GlobalVars.blacklisted_numbers_full, GlobalVars.blacklisted_numbers, \
GlobalVars.blacklisted_numbers_normalized = \
phone_numbers.process_numlist(GlobalVars.blacklisted_numbers_raw)
Expand Down Expand Up @@ -769,6 +770,14 @@ def decorator(func):
return decorator


def create_multiple_rules(*args, rule_quantity=1, rule_id=None, **kwargs):
    """
    Create several rules which share the same arguments.

    ``create_rule`` is called ``rule_quantity`` times with the supplied
    positional and keyword arguments. Only the rule ID differs between the
    calls: each one is ``rule_id`` with ``' index:<n>'`` appended, where
    ``<n>`` counts up from 0.

    Note that ``rule_id`` is concatenated with a string, so it has to be a
    string whenever at least one rule is created.

    :return: List of the values returned by ``create_rule``, in index order.
    """
    return [
        create_rule(*args, rule_id=rule_id + ' index:' + str(position), **kwargs)
        for position in range(rule_quantity)
    ]


def is_whitelisted_website(url):
# Imported from method link_at_end
return bool(WHITELISTED_WEBSITES_REGEX.search(url)) or metasmoke_cache.is_website_whitelisted(url)
Expand Down Expand Up @@ -2407,29 +2416,33 @@ def religion_troll(s, site):


# General blacklists; the regexes will be filled in by the reload_blacklists() call at the bottom
FindSpam.rule_bad_keywords = create_rule("bad keyword in {}", regex="",
username=True, body_summary=True,
max_rep=32, max_score=1, skip_creation_sanity_check=True,
rule_id="main blacklisted keywords")
FindSpam.rule_watched_keywords = create_rule("potentially bad keyword in {}", regex="",
username=True, body_summary=True,
max_rep=32, max_score=1, skip_creation_sanity_check=True,
rule_id="main watchlist",
elapsed_time_reporting={
'draw_attention_min': 20,
'levels': [
('debug', '', 10),
('info', 'High ', 20),
('warning', '**Very High** ', 45),
],
})
FindSpam.rule_blacklisted_websites = create_rule("blacklisted website in {}", regex="", body_summary=True,
max_rep=52, max_score=5, skip_creation_sanity_check=True,
username=True, rule_id="main blacklisted websites")
FindSpam.rule_blacklisted_usernames = create_rule("blacklisted username", regex="",
title=False, body=False, username=True,
skip_creation_sanity_check=True,
rule_id="main blacklisted usernames")
FindSpam.rule_bad_keywords = create_multiple_rules("bad keyword in {}", regex="",
username=True, body_summary=True,
max_rep=32, max_score=1, skip_creation_sanity_check=True,
rule_id="main blacklisted keywords",
rule_quantity=2)
FindSpam.rule_watched_keywords = create_multiple_rules("potentially bad keyword in {}", regex="",
username=True, body_summary=True,
max_rep=32, max_score=1, skip_creation_sanity_check=True,
rule_id="main watchlist",
elapsed_time_reporting={
'draw_attention_min': 20,
'levels': [
('debug', '', 10),
('info', 'High ', 20),
('warning', '**Very High** ', 45),
],
},
rule_quantity=2)
FindSpam.rule_blacklisted_websites = create_multiple_rules("blacklisted website in {}", regex="", body_summary=True,
max_rep=52, max_score=5, skip_creation_sanity_check=True,
username=True, rule_id="main blacklisted websites",
rule_quantity=2)
FindSpam.rule_blacklisted_usernames = create_multiple_rules("blacklisted username", regex="",
title=False, body=False, username=True,
skip_creation_sanity_check=True,
rule_id="main blacklisted usernames",
rule_quantity=2)

# Hardcoded bad keywords without a word boundary (from bad_keywords_nwb list above).
create_rule("bad keyword in {}", regex=r"(?is){}".format("|".join(bad_keywords_nwb)),
Expand Down
32 changes: 16 additions & 16 deletions test/test_findspam.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,22 +102,22 @@
('IDNA misleading link', '<a href="http://www.h%c3%a5nd.no">http://www.h\u00E5nd.no</a>', '', '', False, False, False),
('Mostly punctuation', ';[].[.[.&_$)_\\*&_@$.[;*/-!#*&)(_.\'].1\\)!#_', '', '', False, False, True),
('Few unique', 'asdss, dadasssaadadda, daaaadadsss, ssa,,,addadas,ss\nsdadadsssadadas, sss\ndaaasdddsaaa, asd', '', '', False, False, True),
('ketones', 'ketones', 'ketones', 'chemistry.stackexchange.com', False, False, False),
('ketones', 'ketones', 'ketones', 'chemistry.stackexchange.com', False, True, False),
('ketones', 'ketones', 'ketones', 'chemistry.stackexchange.com', True, False, False),
('ketones', 'ketones', 'ketones', 'chemistry.stackexchange.com', True, True, False),
('keytones', '<p>Some body</p>', 'a username', 'superuser.com', False, False, True),
('A title', 'keytones', 'a username', 'superuser.com', False, False, True),
('A title', '<p>Some body</p>', 'keytones', 'superuser.com', False, False, True),
('keytones', '<p>Some body</p>', 'a username', 'superuser.com', False, True, False),
('A title', 'keytones', 'a username', 'superuser.com', False, True, True),
('A title', '<p>Some body</p>', 'keytones', 'superuser.com', False, True, True),
('keytones', '<p>Some body</p>', 'a username', 'superuser.com', True, False, True),
('A title', 'keytones', 'a username', 'superuser.com', True, False, True),
('A title', '<p>Some body</p>', 'keytones', 'superuser.com', True, False, True),
('keytones', '<p>Some body</p>', 'a username', 'superuser.com', True, True, False),
('A title', 'keytones', 'a username', 'superuser.com', True, True, True),
('A title', '<p>Some body</p>', 'keytones', 'superuser.com', True, True, True),
('ketones on Chemistry', 'ketones', 'ketones', 'chemistry.stackexchange.com', False, False, False),
('ketones on Chemistry as answer', 'ketones', 'ketones', 'chemistry.stackexchange.com', False, True, False),
('ketones on Chemistry as body_summary', 'ketones', 'ketones', 'chemistry.stackexchange.com', True, False, False),
('ketones on Chemistry as body_summary and answer', 'ketones', 'ketones', 'chemistry.stackexchange.com', True, True, False),
('keytones on SuperUser', '<p>Some body</p>', 'a username', 'superuser.com', False, False, True),
('keytones on SuperUser as answer', '<p>Some body</p>', 'a username', 'superuser.com', False, True, False),
('A title with KyT in body', 'keytones', 'a username', 'superuser.com', False, False, True),
('A title with KyT in username', '<p>Some body</p>', 'keytones', 'superuser.com', False, False, True),
('A title with KyT in body as answer', 'keytones', 'a username', 'superuser.com', False, True, True),
('A title with KyT in username as answer', '<p>Some body</p>', 'keytones', 'superuser.com', False, True, True),
('keytones on SuperUser as body_summary', '<p>Some body</p>', 'a username', 'superuser.com', True, False, True),
('A title with KyT in body as body_summary on SuperUser', 'keytones', 'a username', 'superuser.com', True, False, True),
('A title with KyT in username as body_summary on SuperUser', '<p>Some body</p>', 'keytones', 'superuser.com', True, False, True),
('keytones on SuperUser as body_summary and answer', '<p>Some body</p>', 'a username', 'superuser.com', True, True, False),
('A title with KyT in body as body summary and answer', 'keytones', 'a username', 'superuser.com', True, True, True),
('A title with KyT in username as body summary and answer', '<p>Some body</p>', 'keytones', 'superuser.com', True, True, True),
('C01nb4s3 support number', 'obfuscated_word in title', 'spammer', 'stackoverflow.com', False, False, True),
('obfuscated_word in body', 'C01nb4$3 support number', 'spammer', 'stackoverflow.com', False, False, True),
('''airline's responsibilities''', 'test case for "not obfuscated after all" (#7345)', 'good guy', 'stackoverflow.com', False, False, False),
Expand Down