From 75fe0bd29c9a93076b6d8b3db887ce6d11f58fe5 Mon Sep 17 00:00:00 2001 From: user12986714 <65436504+user12986714@users.noreply.github.com> Date: Thu, 16 Jul 2020 23:31:21 -0400 Subject: [PATCH] Strip code block in mostly-img reason This should be reworked once https://chat.stackexchange.com/transcript/message/54842978 get implemented. --- findspam.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/findspam.py b/findspam.py index 7420ac0c21..5cfabc9b1f 100644 --- a/findspam.py +++ b/findspam.py @@ -608,12 +608,19 @@ def len_img_block(string): # max_score=2 to prevent voting fraud @create_rule("post is mostly images", title=False, max_rep=201, max_score=2) def mostly_img(s, site): - if len(s) == 0: + s_len_orig = len(s) + if s_len_orig == 0: return False, "" + # Strip code blocks manually. This should be removed once feature + # https://chat.stackexchange.com/transcript/message/54842978 + # get implemented. + s = regex.sub("(?s)<pre>.*?</pre>", "\nstripped pre\n", s) + s = regex.sub("(?s)<code>.*?</code>", "\nstripped code\n", s) + s_len_img = len_img_block(s) - if s_len_img / len(s) > IMG_TXT_R_THRES: - return True, "{:.4f} of the post is html image blocks".format(s_len_img / len(s)) + if s_len_img / s_len_orig > IMG_TXT_R_THRES: + return True, "{:.4f} of the post is html image blocks".format(s_len_img / s_len_orig) return False, ""