-
Notifications
You must be signed in to change notification settings - Fork 1
/
BookDownloaderArchiveOrg.py
103 lines (101 loc) · 3.4 KB
/
BookDownloaderArchiveOrg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
from selenium import webdriver
import time
from selenium.webdriver.chrome.options import Options
import requests
import os
import threading
import shutil
chrome_options = Options()
email = input("The email for your Archive.org account: ")
password = input("And the password: ")
url = input("Url of book you want to download: ")
dir = input("Where do you want to save the books pages (relative to current directory) : ")
pages = input("How many pages do you want to download: ")
quality = input("What scale do you want the images 1-10 (lower is better res): ")
print("Starting.. btw because of multi threading dont exit the program until it says PRESS ANY KEY TO CONTINUE that means all threads are completed")
quality = int(quality)
pages = int(pages)
chrome_options.add_experimental_option("detach", True)
chrome_options.add_argument("--headless")
driver = webdriver.Chrome("./chromedriver.exe",chrome_options=chrome_options)
driver.get("https://archive.org/account/login")
user = driver.find_element_by_name("username")
pwd = driver.find_element_by_name("password")
cookies_dict = {}
user.send_keys(email)
pwd.send_keys(password+"\n")
time.sleep(0.5)
s = requests.Session()
def Download(url,directory,bookname,pagenum):
if(os.path.exists(directory)):
pass
else:
os.mkdir(directory)
print("Downloading Page: " + str(pagenum))
r = requests.get(url, stream=True,cookies=cookies_dict)
r.raw.decode_content = True
with open(directory+"/"+str(pagenum)+".jpg", 'wb') as out_file:
shutil.copyfileobj(r.raw, out_file)
def split(string):
splitstring = []
for i in string:
splitstring.append(i)
return splitstring
def join(array):
result = ""
for i in array:
result+=str(i)
return result
def FloatToString(float,digits):
float = split(str(float))[1:digits]
float = "".join(float)
float = float.replace(".","")
if(len(split(float)) != 4):
float = "0"+float
return float
def GetPagesArray(srcurl,maxpages):
pages = []
srcurl = srcurl.split("&scale=4")
tmp = split(srcurl[0])
tmp[len(tmp)-1]="2&scale="+str(quality)
tmp = "".join(tmp)
srcurl = str("".join(tmp)+"".join(srcurl[1]))
modurl = srcurl
i=0.0001
while(len(split(str(maxpages))) != 4):
maxpages = "0"+str(maxpages)
if(len(split(maxpages)) == 4):
maxpages = "0."+maxpages
maxpages = float(maxpages)
while(i < maxpages):
modurl = modurl.split(".jp2")
spliturl = split(modurl[0])
spliturl = spliturl[:len(spliturl)-4]
value = split(FloatToString(i,6))
spliturl+=value
modurl = "".join(spliturl)+".jp2&"+modurl[1]
modurl = "".join(modurl)
pages.append(modurl)
modurl=srcurl
i+=0.0001
return pages
def GetBaseUrl(bookurl,num = 0):
success = 0
while(success != 1):
if(num == 0):
driver.get(bookurl)
try:
base = driver.find_elements_by_class_name("BRpageimage")[1].get_attribute("src")
success = 1
except Exception as e:
num=num+1
pass
cookies = driver.get_cookies()
for cookie in cookies:
cookies_dict[cookie['name']] = cookie['value']
return base
print("Finding pages....")
download = GetPagesArray(GetBaseUrl(url),pages)
print("Downloading pages....")
for i in range(0,len(download)):
threading.Thread(target=Download,args=(download[i],dir,"",i)).start()