
Address issue #8
FBroy committed Aug 18, 2021
1 parent 3c422b3 commit 67485c1
Showing 2 changed files with 35 additions and 27 deletions.
14 changes: 12 additions & 2 deletions bin/accountCreator.py
@@ -92,7 +92,12 @@
             logging.error("Approval required, skipping...\n")
             continue
         time.sleep(2)
-        browser.find_elements_by_name('button')[0].click()
+        buttons = browser.find_elements_by_name('button')
+        if len(buttons) != 0:
+            buttons[0].click()
+        else:
+            logging.error("No button found, skipping...")
+            continue
         time.sleep(5)
         if args.verbose:
             logging.info("Registered successfully!\n")
@@ -129,7 +134,12 @@
         response2 = response2.json()
         try:
             # Search for the email verification code
-            confirmationLink = re.findall('https://.*/auth/.*', response2['textBody'])[0]
+            links = re.findall('https://.*/auth/.*', response2['textBody'])
+            if len(links) != 0:
+                confirmationLink = links[0]
+            else:
+                logging.error("No link found, skipping...")
+                continue
             if args.verbose:
                 logging.info(confirmationLink)
             # Open the email verification link to verify the email
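re.findall returns a plain (possibly empty) list, so the old [0] index could raise an IndexError whenever the verification mail contained no matching link; the new code checks the list before indexing. A small sketch of the same guard in isolation, assuming the message body is already a string; extract_confirmation_link is a hypothetical name:

import re

# Hypothetical helper, not part of the commit: return the first verification
# link or None so the caller can log and skip, as the commit does with `continue`.
def extract_confirmation_link(text_body):
    links = re.findall('https://.*/auth/.*', text_body)
    if not links:
        return None
    return links[0]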
48 changes: 23 additions & 25 deletions bin/feeder.py
@@ -151,8 +151,7 @@
         urls = extractor.find_urls(account['note'])
         for url in urls:
             # If the url is not valid, drop it and continue
-            surl = url.split()[0]
-            if not validators.url(surl):
+            if not validators.url(url):
                 continue
 
             output = {}
@@ -163,37 +162,37 @@
             output['meta'] = {}
             output['meta']['activitypub:account_id'] = account['id']
 
-            output['meta']['activitypub:url-extracted'] = surl
+            output['meta']['activitypub:url-extracted'] = url
 
             signal.alarm(10)
             try:
-                article = newspaper.Article(surl)
+                article = newspaper.Article(url)
             except TimeoutError:
                 if args.verbose:
-                    logging.error(f"Timeout reached for {surl}")
+                    logging.error(f"Timeout reached for {url}")
                 continue
             else:
                 signal.alarm(0)
 
             # Caching
-            if r.exists(f"cu:{base64.b64encode(surl.encode())}"):
+            if r.exists(f"cu:{base64.b64encode(url.encode())}"):
                 if args.verbose:
-                    logging.info(f"URL {surl} already processed")
+                    logging.info(f"URL {url} already processed")
                 if not args.nocache:
                     continue
             else:
-                r.set(f"cu:{base64.b64encode(surl.encode())}", account['note'])
-                r.expire(f"cu:{base64.b64encode(surl.encode())}", cache_expire)
+                r.set(f"cu:{base64.b64encode(url.encode())}", account['note'])
+                r.expire(f"cu:{base64.b64encode(url.encode())}", cache_expire)
 
             if args.verbose:
-                logging.info(f"Downloading and parsing {surl}")
+                logging.info(f"Downloading and parsing {url}")
 
             try:
                 article.download()
                 article.parse()
             except ArticleException:
                 if args.verbose:
-                    logging.error(f"Unable to download/parse {surl}")
+                    logging.error(f"Unable to download/parse {url}")
                 continue
 
             output['data'] = article.html
@@ -204,7 +203,7 @@
                 article.nlp()
             except:
                 if args.verbose:
-                    logging.error(f"Unable to nlp {surl}")
+                    logging.error(f"Unable to nlp {url}")
                 nlpFailed = True
 
             obj = json.dumps(output['data'], indent=4, sort_keys=True)
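The feeder.py change drops the intermediate surl = url.split()[0]: urlextract's find_urls already yields individual URL strings, so splitting on whitespace is redundant and each candidate can be validated and used directly. A minimal, standalone sketch of that extract-then-validate flow, assuming the urlextract and validators packages that feeder.py already uses (the sample note text is made up):

from urlextract import URLExtract
import validators

extractor = URLExtract()
note = "See https://example.com/post and also not-a-url"
for url in extractor.find_urls(note):
    # Drop invalid candidates and continue, as feeder.py does
    if not validators.url(url):
        continue
    print(f"Would process {url}")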
@@ -275,10 +274,9 @@
         urls = extractor.find_urls(status['content'])
         for url in urls:
             # If the url is not valid, drop it and continue
-            surl = url.split()[0]
-            if not validators.url(surl):
+            if not validators.url(url):
                 continue
 
             output = {}
             output['source'] = ailurlextract
             output['source-uuid'] = uuid
@@ -287,37 +285,37 @@
             output['meta'] = {}
             output['meta']['activitypub:status_id'] = status['id']
 
-            output['meta']['activitypub:url-extracted'] = surl
+            output['meta']['activitypub:url-extracted'] = url
 
             signal.alarm(10)
             try:
-                article = newspaper.Article(surl)
+                article = newspaper.Article(url)
             except TimeoutError:
                 if args.verbose:
-                    logging.error(f"Timeout reached for {surl}")
+                    logging.error(f"Timeout reached for {url}")
                 continue
             else:
                 signal.alarm(0)
 
             # Caching
-            if r.exists(f"cu:{base64.b64encode(surl.encode())}"):
+            if r.exists(f"cu:{base64.b64encode(url.encode())}"):
                 if args.verbose:
-                    logging.info(f"URL {surl} already processed")
+                    logging.info(f"URL {url} already processed")
                 if not args.nocache:
                     continue
             else:
-                r.set(f"cu:{base64.b64encode(surl.encode())}", status['content'])
-                r.expire(f"cu:{base64.b64encode(surl.encode())}", cache_expire)
+                r.set(f"cu:{base64.b64encode(url.encode())}", status['content'])
+                r.expire(f"cu:{base64.b64encode(url.encode())}", cache_expire)
 
             if args.verbose:
-                logging.info(f"Downloading and parsing {surl}")
+                logging.info(f"Downloading and parsing {url}")
 
             try:
                 article.download()
                 article.parse()
             except ArticleException:
                 if args.verbose:
-                    logging.error(f"Unable to download/parse {surl}")
+                    logging.error(f"Unable to download/parse {url}")
                 continue
 
             output['data'] = article.html
@@ -328,7 +326,7 @@
                 article.nlp()
             except:
                 if args.verbose:
-                    logging.error(f"Unable to nlp {surl}")
+                    logging.error(f"Unable to nlp {url}")
                 nlpFailed = True
 
             obj = json.dumps(output['data'], indent=4, sort_keys=True)
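For context on the unchanged caching lines above: each validated URL is keyed in Redis as "cu:" plus its base64-encoded form and given an expiry, so a URL seen within the cache window is skipped unless --nocache is set. A hedged sketch of that pattern, with the Redis connection details, expiry value and sample URL as assumptions for illustration:

import base64
import redis

r = redis.Redis(host='localhost', port=6379)   # assumed connection settings
cache_expire = 86400                           # assumed expiry in seconds

url = "https://example.com/article"
key = f"cu:{base64.b64encode(url.encode())}"   # same key format as feeder.py

if r.exists(key):
    print(f"URL {url} already processed")      # feeder.py logs this and may skip
else:
    r.set(key, "note or status content")       # feeder.py stores the source text
    r.expire(key, cache_expire)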
