-
Notifications
You must be signed in to change notification settings - Fork 0
/
scribd_notifier.py
138 lines (121 loc) · 4.31 KB
/
scribd_notifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
from selenium import webdriver
from time import sleep
from bs4 import BeautifulSoup
import argparse
import requests
from urllib.parse import urlparse, parse_qs
#############
# Argument Handling
parser = argparse.ArgumentParser()
parser.add_argument('scribd_search_url', help='URL from Scribd Search')
parser.add_argument('-d', '--discord', help='Discord Webhook URL')
parser.add_argument('-s', '--slack', help='Slack Webhook URL')
parser.add_argument('-v', '--verbose', action='store_true', help='Increase output verbosity')
args = parser.parse_args()
##############
# Helper Functions
def post_to_discord(title, link, author):
try:
payload = {
'content': f"New Document: [{title}]({link}) from {author}"
}
response = requests.post(args.discord, json=payload)
except Exception as e:
print(f"Error {e} posting to Discord")
def post_to_slack(title, link, author):
try:
payload = {
'text': f"New Document: <{link}|{title}> from {author}"
}
response = requests.post(args.slack, json=payload)
except Exception as e:
print(f"Error {e} posting to Slack")
#############
# Selenium Handling
options = webdriver.ChromeOptions()
options.add_argument("--headless=new")
driver = webdriver.Chrome(options=options)
#############
# Get HTML Page
scribd_url = args.scribd_search_url
try:
driver.get(scribd_url)
sleep(5)
soup = BeautifulSoup(driver.page_source, 'html.parser')
if args.verbose:
print(f"Got HTML source from Scribd search")
except Exception as e:
print(f"Error {e} occured getting HTML source")
#############
# Get All Title In Search Result
new_titles = []
elements = soup.find('ul', class_="_1kBhLK _3S_oce _1gyGKg _3MyJWQ _3qn-sP _3NAnxI _2q7Jiq _3MqOhW")
if elements == None:
elements = soup.find('ul', class_="_1kBhLK _1gyGKg _3MyJWQ _3qn-sP _3NAnxI _2q7Jiq _3MqOhW")
# XPATH = //*[@id="root"]/div/main/section/div[2]/div/div/ul
# TODO:Switch to XPATH Parsing instead finding a guess to class names
if elements != None:
for li_tag in elements.find_all('li'):
# Get data
curr_title = li_tag.find(class_="visually_hidden").text
curr_title_link = li_tag.find('a', class_='FluidCell-module_linkOverlay__v8dDs')['href']
curr_title_author = li_tag.find('span', {'data-e2e': 'author'}).text
# Build dict and add to list
curr_dict = {
'title' : curr_title,
'link' : curr_title_link,
'author' : curr_title_author
}
new_titles.append(curr_dict)
if args.verbose:
print(f"Found: {curr_title}")
else:
print("No Documents Found In Query!")
#############
# Read in from file for previous results
old_titles = []
query_params = parse_qs((urlparse(args.scribd_search_url).query))
search_query = query_params.get('query', [''])[0].replace(" ", "_")
try:
with open(search_query + ".txt", "r") as f:
old_titles = f.readlines()
if args.verbose:
print("File existed, sucessfully opened.")
except FileNotFoundError:
if args.verbose:
print(f"File doesn't exist, creating now.")
try:
open(search_query + ".txt", "x")
except Exception as e:
print(f"{e} occurred creating file!")
exit(-1)
except Exception as e:
print(f"{e} occured reading file!")
exit(-1)
old_titles = [x.upper().strip() for x in old_titles]
#############
# Compare and Ping
if len(old_titles) != 0:
for dict in new_titles:
if dict['title'].upper().strip() not in old_titles:
if args.discord != None:
post_to_discord(dict['title'], dict['link'], dict['author'])
elif args.slack != None:
post_to_slack(dict['title'], dict['link'], dict['author'])
else:
print(f"Found new title: {dict['title']}")
else:
for dict in new_titles:
if args.discord != None:
post_to_discord(dict['title'], dict['link'], dict['author'])
elif args.slack != None:
post_to_slack(dict['title'], dict['link'], dict['author'])
else:
print(f"Found new title: {dict['title']}")
#############
# Save New Array over file
with open(search_query + ".txt", "w") as f:
f.writelines("%s\n" % l['title'] for l in new_titles)
#############
# Exit and cleanup
driver.quit()