Merge pull request #13765 from makyen/Mak-chunk-BL-WL-to-reduce-regex-recompile

Chunk the blacklists and watchlist to reduce regex recompiles upon reloading the blacklists and watchlist autopull
Charcoal-SE · Oct 28, 2024 · 69b9a70 · 69b9a70
2 parents 1ab92c3 + 69a48f7
commit 69b9a70
Show file tree

Hide file tree

Showing 3 changed files with 94 additions and 75 deletions.
diff --git a/chatcommands.py b/chatcommands.py
@@ -2075,7 +2075,7 @@ def invite(msg, room_id, roles):
 # --- Post Responses --- #
 # noinspection PyIncorrectDocstring
 @command(str, whole_msg=True, privileged=False, give_name=True,
-         aliases=["scan", "scan-force", "report-force", "report-direct"])
+         aliases=["scan", "scan-force", "report-force", "report-direct", "scan-time", "scan-force-time"])
 def report(msg, args, alias_used="report"):
     """
     Report a post (or posts)
@@ -2095,6 +2095,10 @@ def report(msg, args, alias_used="report"):
 
     alias_used = alias_used or "report"
 
+    is_timed = "-time" in alias_used
+    alias_used = alias_used.replace("-time", "")
+    start_time = time.time()
+
     argsraw = args.split(' "', 1)
     urls = argsraw[0].split(' ')
 
@@ -2122,6 +2126,8 @@ def report(msg, args, alias_used="report"):
     if output:
         if 1 < len(urls) > output.count("\n") + 1:
             add_or_update_multiple_reporter(msg.owner.id, msg._client.host, time.time())
+        if is_timed:
+            output += "\nScanning took {} seconds.".format(round(time.time() - start_time, 3))
         return output
 
 

diff --git a/findspam.py b/findspam.py
@@ -26,7 +26,7 @@
 
 from helpers import log, regex_compile_no_cache, strip_pre_and_code_elements, strip_code_elements, \
     get_bookended_keyword_regex_text_from_entries, keyword_bookend_regex_text, KEYWORD_BOOKENDING_START, \
-    get_non_bookended_keyword_regex_text_from_entries
+    get_non_bookended_keyword_regex_text_from_entries, chunk_list
 import metasmoke_cache
 from globalvars import GlobalVars
 import blacklists
@@ -582,45 +582,46 @@ class FindSpam:
         ('warning', '**Very High** ', 30),  # > 30 s: Log a "warning" and output to chat as bold "Very High"
     ]
 
+    @staticmethod
+    def _update_a_blacklist_dual_rule(rule_list, regex_text_generator, entries):
+        """Regenerate the regex text of rule_list[0] and rule_list[1] from entries; update rules whose text changed."""
+        entries = list(entries)
+        entries_length = len(entries)
+        if entries_length > 100:
+            # The chunk size is the largest multiple of 100 which is <= the number of entries
+            entries_lists = chunk_list(entries, entries_length // 100 * 100)
+        else:
+            # With <= 100 entries, use an entries_lists with all the entries first, then a regex that can never match
+            entries_lists = [entries]
+        if len(entries_lists) == 1:
+            entries_lists.append([r'q(?<!q)'])  # must call append(); assigning to it raises AttributeError
+        for index in range(2):
+            new_regex_text = regex_text_generator(entries_lists[index])
+            if new_regex_text != rule_list[index].regex:
+                rule_list[index].regex = new_regex_text
+                try:
+                    del rule_list[index].compiled_regex
+                except AttributeError:  # compiled_regex was not set on this rule
+                    pass
+                rule_list[index].sanity_check()
+
     @classmethod
     def reload_blacklists(cls):
         global bad_keywords_nwb
 
         blacklists.load_blacklists()
-        new_bad_keywords_regex = get_bookended_keyword_regex_text_from_entries(GlobalVars.bad_keywords)
-        if new_bad_keywords_regex != cls.rule_bad_keywords.regex:
-            cls.rule_bad_keywords.regex = new_bad_keywords_regex
-            try:
-                del cls.rule_bad_keywords.compiled_regex
-            except AttributeError:
-                pass
-            cls.rule_bad_keywords.sanity_check()
-        new_watched_keywords_regex = get_bookended_keyword_regex_text_from_entries(GlobalVars.watched_keywords.keys())
-        if new_watched_keywords_regex != cls.rule_watched_keywords.regex:
-            cls.rule_watched_keywords.regex = new_watched_keywords_regex
-            try:
-                del cls.rule_watched_keywords.compiled_regex
-            except AttributeError:
-                pass
-            cls.rule_watched_keywords.sanity_check()
-        new_blacklisted_websites_regex = \
-            get_non_bookended_keyword_regex_text_from_entries(GlobalVars.blacklisted_websites)
-        if new_blacklisted_websites_regex != cls.rule_blacklisted_websites.regex:
-            cls.rule_blacklisted_websites.regex = new_blacklisted_websites_regex
-            try:
-                del cls.rule_blacklisted_websites.compiled_regex
-            except AttributeError:
-                pass
-            cls.rule_blacklisted_websites.sanity_check()
-        new_blacklisted_usernames_regex = \
-            get_non_bookended_keyword_regex_text_from_entries(GlobalVars.blacklisted_usernames)
-        if new_blacklisted_usernames_regex != cls.rule_blacklisted_usernames.regex:
-            cls.rule_blacklisted_usernames.regex = new_blacklisted_usernames_regex
-            try:
-                del cls.rule_blacklisted_usernames.compiled_regex
-            except AttributeError:
-                pass
-            cls.rule_blacklisted_usernames.sanity_check()
+        cls._update_a_blacklist_dual_rule(cls.rule_bad_keywords,
+                                          get_bookended_keyword_regex_text_from_entries,
+                                          GlobalVars.bad_keywords)
+        cls._update_a_blacklist_dual_rule(cls.rule_watched_keywords,
+                                          get_bookended_keyword_regex_text_from_entries,
+                                          GlobalVars.watched_keywords.keys())
+        cls._update_a_blacklist_dual_rule(cls.rule_blacklisted_websites,
+                                          get_non_bookended_keyword_regex_text_from_entries,
+                                          GlobalVars.blacklisted_websites)
+        cls._update_a_blacklist_dual_rule(cls.rule_blacklisted_usernames,
+                                          get_non_bookended_keyword_regex_text_from_entries,
+                                          GlobalVars.blacklisted_usernames)
         GlobalVars.blacklisted_numbers_full, GlobalVars.blacklisted_numbers, \
             GlobalVars.blacklisted_numbers_normalized = \
             phone_numbers.process_numlist(GlobalVars.blacklisted_numbers_raw)
@@ -769,6 +770,14 @@ def decorator(func):
             return decorator
 
 
+def create_multiple_rules(*args, rule_quantity=1, rule_id=None, **kwargs):
+    """Create rule_quantity rules via create_rule(); each gets rule_id (a str) suffixed with ' index:<n>'."""
+    return [
+        create_rule(*args, rule_id=rule_id + ' index:' + str(position), **kwargs)
+        for position in range(rule_quantity)
+    ]
+
+
 def is_whitelisted_website(url):
     # Imported from method link_at_end
     return bool(WHITELISTED_WEBSITES_REGEX.search(url)) or metasmoke_cache.is_website_whitelisted(url)
@@ -2407,29 +2416,33 @@ def religion_troll(s, site):
 
 
 # General blacklists, regex will be filled at the reload_blacklist() call at the bottom
-FindSpam.rule_bad_keywords = create_rule("bad keyword in {}", regex="",
-                                         username=True, body_summary=True,
-                                         max_rep=32, max_score=1, skip_creation_sanity_check=True,
-                                         rule_id="main blacklisted keywords")
-FindSpam.rule_watched_keywords = create_rule("potentially bad keyword in {}", regex="",
-                                             username=True, body_summary=True,
-                                             max_rep=32, max_score=1, skip_creation_sanity_check=True,
-                                             rule_id="main watchlist",
-                                             elapsed_time_reporting={
-                                                 'draw_attention_min': 20,
-                                                 'levels': [
-                                                     ('debug', '', 10),
-                                                     ('info', 'High ', 20),
-                                                     ('warning', '**Very High** ', 45),
-                                                 ],
-                                             })
-FindSpam.rule_blacklisted_websites = create_rule("blacklisted website in {}", regex="", body_summary=True,
-                                                 max_rep=52, max_score=5, skip_creation_sanity_check=True,
-                                                 username=True, rule_id="main blacklisted websites")
-FindSpam.rule_blacklisted_usernames = create_rule("blacklisted username", regex="",
-                                                  title=False, body=False, username=True,
-                                                  skip_creation_sanity_check=True,
-                                                  rule_id="main blacklisted usernames")
+FindSpam.rule_bad_keywords = create_multiple_rules("bad keyword in {}", regex="",
+                                                   username=True, body_summary=True,
+                                                   max_rep=32, max_score=1, skip_creation_sanity_check=True,
+                                                   rule_id="main blacklisted keywords",
+                                                   rule_quantity=2)  # list of 2 rules; reload_blacklists() sets each regex
+FindSpam.rule_watched_keywords = create_multiple_rules("potentially bad keyword in {}", regex="",
+                                                       username=True, body_summary=True,
+                                                       max_rep=32, max_score=1, skip_creation_sanity_check=True,
+                                                       rule_id="main watchlist",
+                                                       elapsed_time_reporting={
+                                                           'draw_attention_min': 20,
+                                                           'levels': [
+                                                               ('debug', '', 10),
+                                                               ('info', 'High ', 20),
+                                                               ('warning', '**Very High** ', 45),
+                                                           ],
+                                                       },
+                                                       rule_quantity=2)  # list of 2 rules; reload_blacklists() sets each regex
+FindSpam.rule_blacklisted_websites = create_multiple_rules("blacklisted website in {}", regex="", body_summary=True,
+                                                           max_rep=52, max_score=5, skip_creation_sanity_check=True,
+                                                           username=True, rule_id="main blacklisted websites",
+                                                           rule_quantity=2)  # list of 2 rules; reload_blacklists() sets each regex
+FindSpam.rule_blacklisted_usernames = create_multiple_rules("blacklisted username", regex="",
+                                                            title=False, body=False, username=True,
+                                                            skip_creation_sanity_check=True,
+                                                            rule_id="main blacklisted usernames",
+                                                            rule_quantity=2)  # list of 2 rules; reload_blacklists() sets each regex
 
 # Hardcoded bad keywords without a word boundary (from bad_keywords_nwb list above).
 create_rule("bad keyword in {}", regex=r"(?is){}".format("|".join(bad_keywords_nwb)),

diff --git a/test/test_findspam.py b/test/test_findspam.py
@@ -102,22 +102,22 @@
     ('IDNA misleading link', '<a href="http://www.h%c3%a5nd.no">http://www.h\u00E5nd.no</a>', '', '', False, False, False),
     ('Mostly punctuation', ';[].[.[.&_$)_\\*&_@$.[;*/-!#*&)(_.\'].1\\)!#_', '', '', False, False, True),
     ('Few unique', 'asdss, dadasssaadadda, daaaadadsss, ssa,,,addadas,ss\nsdadadsssadadas, sss\ndaaasdddsaaa, asd', '', '', False, False, True),
-    ('ketones', 'ketones', 'ketones', 'chemistry.stackexchange.com', False, False, False),
-    ('ketones', 'ketones', 'ketones', 'chemistry.stackexchange.com', False, True, False),
-    ('ketones', 'ketones', 'ketones', 'chemistry.stackexchange.com', True, False, False),
-    ('ketones', 'ketones', 'ketones', 'chemistry.stackexchange.com', True, True, False),
-    ('keytones', '<p>Some body</p>', 'a username', 'superuser.com', False, False, True),
-    ('A title', 'keytones', 'a username', 'superuser.com', False, False, True),
-    ('A title', '<p>Some body</p>', 'keytones', 'superuser.com', False, False, True),
-    ('keytones', '<p>Some body</p>', 'a username', 'superuser.com', False, True, False),
-    ('A title', 'keytones', 'a username', 'superuser.com', False, True, True),
-    ('A title', '<p>Some body</p>', 'keytones', 'superuser.com', False, True, True),
-    ('keytones', '<p>Some body</p>', 'a username', 'superuser.com', True, False, True),
-    ('A title', 'keytones', 'a username', 'superuser.com', True, False, True),
-    ('A title', '<p>Some body</p>', 'keytones', 'superuser.com', True, False, True),
-    ('keytones', '<p>Some body</p>', 'a username', 'superuser.com', True, True, False),
-    ('A title', 'keytones', 'a username', 'superuser.com', True, True, True),
-    ('A title', '<p>Some body</p>', 'keytones', 'superuser.com', True, True, True),
+    ('ketones on Chemistry', 'ketones', 'ketones', 'chemistry.stackexchange.com', False, False, False),
+    ('ketones on Chemistry as answer', 'ketones', 'ketones', 'chemistry.stackexchange.com', False, True, False),
+    ('ketones on Chemistry as body_summary', 'ketones', 'ketones', 'chemistry.stackexchange.com', True, False, False),
+    ('ketones on Chemistry as body_summary and answer', 'ketones', 'ketones', 'chemistry.stackexchange.com', True, True, False),
+    ('keytones on SuperUser', '<p>Some body</p>', 'a username', 'superuser.com', False, False, True),
+    ('keytones on SuperUser as answer', '<p>Some body</p>', 'a username', 'superuser.com', False, True, False),
+    ('A title with KyT in body', 'keytones', 'a username', 'superuser.com', False, False, True),
+    ('A title with KyT in username', '<p>Some body</p>', 'keytones', 'superuser.com', False, False, True),
+    ('A title with KyT in body as answer', 'keytones', 'a username', 'superuser.com', False, True, True),
+    ('A title with KyT in username as answer', '<p>Some body</p>', 'keytones', 'superuser.com', False, True, True),
+    ('keytones on SuperUser as body_summary', '<p>Some body</p>', 'a username', 'superuser.com', True, False, True),
+    ('A title with KyT in body as body_summary on SuperUser', 'keytones', 'a username', 'superuser.com', True, False, True),
+    ('A title with KyT in username as body_summary on SuperUser', '<p>Some body</p>', 'keytones', 'superuser.com', True, False, True),
+    ('keytones on SuperUser as body_summary and answer', '<p>Some body</p>', 'a username', 'superuser.com', True, True, False),
+    ('A title with KyT in body as body summary and answer', 'keytones', 'a username', 'superuser.com', True, True, True),
+    ('A title with KyT in username as body summary and answer', '<p>Some body</p>', 'keytones', 'superuser.com', True, True, True),
     ('C01nb4s3 support number', 'obfuscated_word in title', 'spammer', 'stackoverflow.com', False, False, True),
     ('obfuscated_word in body', 'C01nb4$3 support number', 'spammer', 'stackoverflow.com', False, False, True),
     ('''airline's responsibilities''', 'test case for "not obfuscated after all" (#7345)', 'good guy', 'stackoverflow.com', False, False, False),