fix: [Mail module] replace signal by multiprocessing (https://docs.py…
Terrtia committed May 12, 2020
1 parent ae5c7cd commit 3f6efad
Showing 1 changed file with 29 additions and 29 deletions.
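Why the swap helps: a handler installed for SIGALRM only runs between Python bytecode instructions, so it cannot interrupt a re.findall() call that is backtracking inside the C matching engine; the alarm fires, but the TimeoutException is deferred until the match returns. Moving the match into a child process makes the timeout enforceable, because the parent can terminate the worker from outside regardless of what it is executing. A minimal, self-contained sketch of the pattern the commit adopts (illustrative names, not the module's own code):

# Standalone sketch (illustrative names, not AIL code): run a regex in a
# worker process and kill the worker if it exceeds a deadline.
import re
from multiprocessing import Process, Queue

EMAIL_REGEX = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}"

def find_emails(queue, text):
    # Worker: mirrors extract_all_emails() in the diff below.
    queue.put(re.findall(EMAIL_REGEX, text))

if __name__ == "__main__":
    q = Queue()
    proc = Process(target=find_emails, args=(q, "mail admin@example.com"))
    proc.start()
    proc.join(30)             # wait at most 30 seconds for the worker
    if proc.is_alive():       # still matching: a runaway regex, kill it
        proc.terminate()
        proc.join()
        print("regex timed out")
    else:
        print(q.get())        # worker exited normally: collect its result

Here Process.join(timeout) returns when the worker exits or the deadline passes, and is_alive() distinguishes the two cases, which is exactly the shape of the try block in the diff below.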
58 changes: 29 additions & 29 deletions bin/Mail.py
@@ -21,25 +21,14 @@
 import dns.resolver
 import dns.exception

+from multiprocessing import Process as Proc
+from multiprocessing import Queue

 from pubsublogger import publisher
 from Helper import Process

 from pyfaup.faup import Faup

-## REGEX TIMEOUT ##
-import signal
-
-def timeout_handler(signum, frame):
-    raise TimeoutException()
-
-class TimeoutException(Exception):
-    pass
-
-
-signal.signal(signal.SIGALRM, timeout_handler)
-max_execution_time = 20
-## -- ##
-
 sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages'))
 import Item

@@ -55,6 +44,7 @@ class TimeoutException(Exception):

 config_loader = None
 ## -- ##
+
 def is_mxdomain_in_cache(mxdomain):
     return r_serv_cache.exists('mxdomain:{}'.format(mxdomain))

@@ -120,6 +110,9 @@ def check_mx_record(set_mxdomains, dns_server):
         print(e)
     return valid_mxdomain

+def extract_all_emails(queue, item_content):
+    queue.put(re.findall(email_regex, item_content))
+
 if __name__ == "__main__":
     publisher.port = 6380
     publisher.channel = "Script"
@@ -135,32 +128,37 @@ def check_mx_record(set_mxdomains, dns_server):
     # Numbers of Mails needed to Tags
     mail_threshold = 10

+    max_execution_time = 30
+
     email_regex = "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}"

+    q = Queue()
+
     while True:
         message = p.get_from_set()

         if message is not None:
             item_id, score = message.split()

             item_content = Item.get_item_content(item_id)
-            item_date = Item.get_item_date(item_id)

             #print(item_id)

             # Get all emails address
-            signal.alarm(30)
+            proc = Proc(target=extract_all_emails, args=(q, item_content))
+            proc.start()
             try:
-                all_emails = re.findall(email_regex, item_content)
-            except TimeoutException:
-                p.incr_module_timeout_statistic()
-                err_mess = "Mails: processing timeout: {}".format(item_id)
-                print(err_mess)
-                publisher.info(err_mess)
-                signal.signal(signal.SIGALRM, timeout_handler)
-                continue
-            finally:
-                signal.alarm(0)
+                proc.join(max_execution_time)
+                if proc.is_alive():
+                    proc.terminate()
+                    p.incr_module_timeout_statistic()
+                    err_mess = "Mails: processing timeout: {}".format(item_id)
+                    print(err_mess)
+                    publisher.info(err_mess)
+                    continue
+                else:
+                    all_emails = q.get()
+            except KeyboardInterrupt:
+                print("Caught KeyboardInterrupt, terminating workers")
+                proc.terminate()
+                sys.exit(0)

             # filtering duplicate
             all_emails = set(all_emails)
@@ -179,6 +177,8 @@ def check_mx_record(set_mxdomains, dns_server):

             valid_mx = check_mx_record(set_mxdomains, dns_server)

+            item_date = Item.get_item_date(item_id)
+
             num_valid_email = 0
             for domain_mx in valid_mx:
                 num_valid_email += len(dict_mxdomains_email[domain_mx])
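One caveat with this shape, noted in the multiprocessing documentation: a child that has put items on a Queue may not exit until the buffered data has been flushed to the underlying pipe and consumed, so joining the worker before calling q.get() can, for very large result lists, make the join appear to time out even though the match finished. Draining the queue first, e.g. with q.get(timeout=max_execution_time), is the usual way around that edge case.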
