forked from jpyamamoto/Issuu-PDF-Downloader
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathframework.py
53 lines (43 loc) · 1.52 KB
/
framework.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import re
import urllib
# ----------
# CONSTANTS
# ----------
# NOTE(review): none of these constants are referenced by the code visible in
# this file; they are presumably consumed by another module (e.g. the
# downloader) -- confirm before changing or removing them.

# Width and height associated with the page images -- presumably pixels;
# TODO confirm against the code that uses them.
WIDTH_IMAGE = 1600
HEIGHT_IMAGE = 750
# Width and height associated with the generated PDF -- units are not
# established here; TODO confirm.
WIDTH_PDF = 1100
HEIGHT_PDF = 900
# Presumably the file name of the PDF that gets written -- verify against
# its consumer.
NAME_PDF = 'output.pdf'
def main():
    """Prompt for a document URL and start the image download.

    Steps performed:

    1. Read the document URL from standard input.
    2. If the URL ends in ``/<digits>`` (optionally followed by ``/``), i.e.
       it points to a specific page such as
       ``https://issuu.com/user/docs/doc/18``, drop that page segment so the
       URL refers to the whole document.
    3. Fetch the page and search it for the first absolute http(s) URL that
       ends in ``png`` or ``jpg`` (case-insensitive).
    4. Pass that image URL to ``core.downloader.downloader``; if no image
       URL is found, print an error message instead.

    Exceptions raised while opening the URL (for example
    ``urllib.error.URLError``) are not caught here and propagate to the
    caller.
    """
    # ``import urllib`` on its own does not load the ``request`` submodule,
    # so ``urllib.request.urlopen`` is only guaranteed to exist after an
    # explicit ``import urllib.request``.
    import urllib.request

    import core.downloader as program

    print("Starting...\n")
    # Strip surrounding whitespace: a stray trailing space would otherwise
    # defeat the ``$``-anchored page-number check below.
    url = input("Enter the url of the PDF:").strip()

    # Check that the URL provided by the user points to the entire document
    # and not to a specific page (e.g. https://issuu.com/user/docs/doc
    # instead of https://issuu.com/user/docs/doc/18)
    page_match = re.search(r'(.+)/\d+/?$', url)
    if page_match:
        # There is a page number at the end of the URL: use the part
        # before it.
        print('The URL provided points to a specific page in the document.')
        url = page_match.group(1)
        print('Using the following URL instead:')
        print(url)

    # Close the HTTP response deterministically once the body is read.
    with urllib.request.urlopen(url) as response:
        # Only plain URLs are extracted from the page, so undecodable bytes
        # are replaced rather than aborting the whole run.
        page_html = response.read().decode("utf-8", errors="replace")

    # Credits to https://txt2re.com/ for the regex (Almost all of it)
    # No leading '.*?' prefix is needed: ``search`` already scans for the
    # leftmost match, and the prefix made the no-match case rescan the page
    # from every start offset.
    image_pattern = re.compile(
        '((?:http|https)(?::\\/{2}[\\w]+)(?:[\\/|\\.]?)(?:[^\\s"]*)(?:png|jpg))',
        re.IGNORECASE,
    )
    image_match = image_pattern.search(page_html)
    if image_match:
        image_url = image_match.group(1)
        print('Starting from URI: ' + image_url)
        program.downloader(image_url)
    else:
        print("Error! No image was found")
# Script entry point: run the interactive flow only when this file is executed
# directly, not when it is imported as a module.
if "__main__" == __name__:
    main()