forked from Charcoal-SE/SmokeDetector
-
Notifications
You must be signed in to change notification settings - Fork 0
/
recently_scanned_posts.py
172 lines (149 loc) · 7.05 KB
/
recently_scanned_posts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
# coding=utf-8
import time
from globalvars import GlobalVars
from helpers import log
from tasks import Tasks
# Keys whose values are copied unchanged from an incoming post dict into its
# "recently scanned" representation (see get_recently_scanned_post_from_post).
POST_STRAIGHT_COPY_KEYS = [
    'response_timestamp',
    'last_edit_date',
    'title',
    'body_markdown',
]
# Interval handed to Tasks.periodic for running expire_posts. Note that the
# age at which an entry actually expires is a separate setting,
# GlobalVars.recently_scanned_posts_retention_time.
POSTS_EXPIRE_INTERVAL = 10 * 60  # 10 minutes
def get_key_for_post(post):
    """Return the "<site>/<post_id>" key identifying a post, or None.

    A post that is already in recently-scanned form (it has the
    'is_recently_scanned_post' marker) simply reports its stored 'post_key'.
    For any other post the ID is the 'answer_id' if present, otherwise the
    'question_id'. When either the site or the ID cannot be determined, a
    warning is logged and None is returned.
    """
    if 'is_recently_scanned_post' in post:
        return post.get('post_key', None)

    site = post.get('site', None)
    # Prefer the answer ID; fall back to the question ID.
    post_id = post.get('answer_id', None)
    if post_id is None:
        post_id = post.get('question_id', None)

    if site is not None and post_id is not None:
        return "{}/{}".format(site, post_id)

    log('warn', 'Unable to determine site or post_id for recently scanned post:'
                ' site:{}:: post_id: {}:: post:{}'.format(site, post_id, post))
    return None
def add_post(post, is_spam=None, reasons=None, why=None, scan_time=None, have_lock=None):
    """Store (or overwrite) the recently-scanned record for a post.

    The post is converted to recently-scanned form if it is not already in
    that form. The record holds the post, the current time as
    'scan_timestamp', and the supplied scan results.

    Pass a truthy have_lock when the caller already holds
    GlobalVars.recently_scanned_posts_lock; otherwise the lock is taken here
    for the duration of the write.

    Raises KeyError if the post has no usable key.
    """
    rs_post = post if 'is_recently_scanned_post' in post else get_recently_scanned_post_from_post(post)
    post_key = rs_post['post_key']
    if post_key is None:
        raise KeyError('post key is None')

    record = {
        'post': rs_post,
        'scan_timestamp': time.time(),
        'is_spam': is_spam,
        'reasons': reasons,
        'why': why,
        'scan_time': scan_time,
    }

    if not have_lock:
        with GlobalVars.recently_scanned_posts_lock:
            GlobalVars.recently_scanned_posts[post_key] = record
        return
    # The caller holds the lock already, so write directly.
    GlobalVars.recently_scanned_posts[post_key] = record
def apply_timestamps_to_entry_from_post_and_time_if_newer(post, scanned_entry):
    """Refresh a stored entry's timestamps when `post` is from a newer response.

    If the 'response_timestamp' of `post` is greater than the one on the
    entry's stored post, the entry's 'scan_timestamp' is set to the current
    time and the stored post's 'response_timestamp' is replaced with the one
    from `post`. Otherwise the entry is left untouched. `scanned_entry` is
    modified in place; nothing is returned.

    A missing or None timestamp on either side is treated as 0.
    """
    scanned_post = scanned_entry['post']
    # `or 0`: posts in recently-scanned form always carry the key, but its value
    # is None when the source post had no timestamp. Comparing None with a
    # number would raise TypeError.
    scanned_post_response_timestamp = scanned_post.get('response_timestamp', 0) or 0
    post_response_timestamp = post.get('response_timestamp', 0) or 0
    if post_response_timestamp > scanned_post_response_timestamp:
        scanned_entry['scan_timestamp'] = time.time()
        scanned_post['response_timestamp'] = post.get('response_timestamp', None)
def update_entry_timestamp_if_newer(post, have_lock=None):
    """Refresh the stored entry for `post`, creating the entry if it is missing.

    Looks up the recently-scanned entry for the post's key and applies
    apply_timestamps_to_entry_from_post_and_time_if_newer to it. If a KeyError
    is raised while doing so (normally because no entry exists), the post is
    added via add_post instead.

    Pass a truthy have_lock when the caller already holds
    GlobalVars.recently_scanned_posts_lock.

    Raises KeyError if the post has no usable key.
    """
    post_key = get_key_for_post(post)
    if post_key is None:
        raise KeyError('post key is None')

    def refresh_existing_entry():
        existing_entry = GlobalVars.recently_scanned_posts[post_key]
        apply_timestamps_to_entry_from_post_and_time_if_newer(post, existing_entry)

    try:
        if not have_lock:
            with GlobalVars.recently_scanned_posts_lock:
                refresh_existing_entry()
        else:
            refresh_existing_entry()
    except KeyError:
        # If the record doesn't exist, we add it.
        add_post(post, have_lock=have_lock)
def get_check_equality_data(post):
    """Return the tuple of post fields compared to detect a content change.

    The order is (last_edit_date, title, owner_name, body_markdown). A field
    that is absent from `post` is reported as None.
    """
    compared_fields = ('last_edit_date', 'title', 'owner_name', 'body_markdown')
    return tuple(post.get(field, None) for field in compared_fields)
def compare_posts(post, scanned_post):
    """Compare a post against the previously scanned version of it.

    Both arguments are expected to be in recently-scanned form. Returns a dict:

    * 'is_older': True when the 'response_timestamp' of `post` is lower than
      that of `scanned_post`.
    * 'is_older_or_unchanged': True when `post` is older, or when its
      equality data (see get_check_equality_data) matches `scanned_post`.

    When `post` is older, only the two keys above are present. Otherwise the
    dict also contains:

    * 'is_unchanged': whether the equality data of both posts is identical.
    * 'is_grace_edit': True when the content differs although
      'last_edit_date' is the same on both; this is also logged at debug level.

    A missing or None 'response_timestamp' is treated as 0.
    """
    # `or 0`: posts in recently-scanned form always carry the key, but its value
    # is None when the source post had no timestamp. Comparing None with a
    # number would raise TypeError.
    post_response_timestamp = post.get('response_timestamp', 0) or 0
    scanned_post_response_timestamp = scanned_post.get('response_timestamp', 0) or 0
    post_is_older = post_response_timestamp < scanned_post_response_timestamp
    result = {'is_older': post_is_older}
    if post_is_older:
        result['is_older_or_unchanged'] = True
        return result

    post_equality_data = get_check_equality_data(post)
    scanned_equality_data = get_check_equality_data(scanned_post)
    is_unchanged = post_equality_data == scanned_equality_data
    result['is_unchanged'] = is_unchanged
    # post_is_older is known to be False here, so this reduces to is_unchanged.
    result['is_older_or_unchanged'] = is_unchanged
    result['is_grace_edit'] = False
    # Index 0 of the equality data is last_edit_date.
    if not is_unchanged and post_equality_data[0] == scanned_equality_data[0]:
        # This should be a grace period edit
        fields_matching = [new_value == old_value
                           for new_value, old_value in zip(post_equality_data, scanned_equality_data)]
        post_key = post.get('post_key', None)
        log('debug', 'GRACE period edit: {}:: matching(ED,T,U,MD):{}:: '.format(post_key, fields_matching))
        result['is_grace_edit'] = True
    return result
def get_recently_scanned_post_from_post(post):
    """Return the recently-scanned form of a post.

    A post that already carries the 'is_recently_scanned_post' marker is
    returned as-is (the same object). Otherwise a new dict is built holding
    the POST_STRAIGHT_COPY_KEYS values (None where absent), the marker, the
    owner's display name as 'owner_name', and the post's key as 'post_key'.
    """
    if 'is_recently_scanned_post' in post:
        # It's already a RS post
        return post

    rs_post = {}
    for copied_key in POST_STRAIGHT_COPY_KEYS:
        rs_post[copied_key] = post.get(copied_key, None)
    rs_post['is_recently_scanned_post'] = True
    rs_post['owner_name'] = post.get('owner', {}).get('display_name', None)
    rs_post['post_key'] = get_key_for_post(post)
    return rs_post
def atomic_compare_update_and_get_spam_data(post, have_lock=False, update=True):
    """Compare `post` with its stored scan and return the stored spam data.

    Unless have_lock is truthy, GlobalVars.recently_scanned_posts_lock is
    acquired for the whole operation and released on exit, including when an
    exception propagates.

    If there is no stored entry for the post, or the stored entry has no
    'is_spam' value, the result is
    {'is_older_or_unchanged': False, 'no_scanned_entry': True} and, when
    `update` is truthy, the post is stored via add_post.

    Otherwise the result is the compare_posts() dict extended with the stored
    entry's 'is_spam', 'reasons' and 'why'. When `update` is truthy the stored
    entry's timestamps are refreshed if `post` is newer.

    Raises KeyError if the post has no usable key.
    """
    acquired_here = False
    try:
        if not have_lock:
            acquired_here = GlobalVars.recently_scanned_posts_lock.acquire()

        if 'is_recently_scanned_post' in post:
            rs_post = post
        else:
            rs_post = get_recently_scanned_post_from_post(post)

        post_key = rs_post.get('post_key', None)
        if post_key is None:
            # Without a post_key, we can't check or store.
            raise KeyError('post key is None')

        entry = GlobalVars.recently_scanned_posts.get(post_key, None)
        has_scan_result = entry is not None and entry.get('is_spam', None) is not None
        if not has_scan_result:
            if update:
                # The lock is held at this point, either by us or by the caller.
                add_post(rs_post, have_lock=True)
            return {'is_older_or_unchanged': False, 'no_scanned_entry': True}

        comparison = compare_posts(rs_post, entry['post'])
        if update:
            apply_timestamps_to_entry_from_post_and_time_if_newer(rs_post, entry)
        comparison.update((field, entry.get(field, None)) for field in ('is_spam', 'reasons', 'why'))
        return comparison
    finally:
        # Only release a lock that this call acquired.
        if acquired_here:
            GlobalVars.recently_scanned_posts_lock.release()
def expire_posts():
    """Drop recently-scanned entries older than the configured retention time.

    An entry is removed when its 'scan_timestamp' is earlier than the current
    time minus GlobalVars.recently_scanned_posts_retention_time. The removal
    is done while holding GlobalVars.recently_scanned_posts_lock, and the
    entry counts before and after are logged at debug level.
    """
    cutoff_timestamp = time.time() - GlobalVars.recently_scanned_posts_retention_time
    with GlobalVars.recently_scanned_posts_lock:
        # Rebuilding the dict with a comprehension that keeps only the unexpired
        # entries would also work, but has a notably higher memory requirement
        # than deleting entries in place. Where the right trade-off between memory
        # use and time spent in del/pop lies isn't clear; it depends on the size of
        # the dict and the memory/CPU available to the particular SD instance.
        scanned_posts = GlobalVars.recently_scanned_posts
        count_before = len(scanned_posts)
        # Collect the keys first; the dict must not change size while iterating it.
        stale_keys = []
        for post_key, entry in scanned_posts.items():
            if entry['scan_timestamp'] < cutoff_timestamp:
                stale_keys.append(post_key)
        for post_key in stale_keys:
            scanned_posts.pop(post_key, None)
        count_after = len(scanned_posts)
    log('debug', 'Expire recently scanned posts: start: '
                 '{}:: now: {}:: expired: {}'.format(count_before, count_after, count_before - count_after))
# Register expire_posts at import time with interval=POSTS_EXPIRE_INTERVAL.
# NOTE(review): presumably Tasks.periodic re-runs the function at that interval
# (in seconds) -- confirm against tasks.py.
Tasks.periodic(expire_posts, interval=POSTS_EXPIRE_INTERVAL)