-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathiascraper.py.broken
51 lines (41 loc) · 2.22 KB
/
iascraper.py.broken
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# Python implementation of the archive.org booklet scraper for Bash (on GNU/Linux)
# Fully cross-platform
# Less gibberish in the terminal window (only the current page being downloaded and the total number of pages to download; no HTTP headers and the like)
# https://github.com/phoemur/wgetter
import os
from http.cookiejar import MozillaCookieJar

import requests
# --- Page range ---------------------------------------------------------
fraum = 1    # first page to fetch
tou = 500    # last page to fetch; overshooting the real page count is not a problem

# --- Item URL -----------------------------------------------------------
# Everything up to and including the underscore '_' that precedes the
# four-digit page number.
part1 = (
    "https://ia801305.us.archive.org/BookReader/BookReaderImages.php"
    "?zip=/6/items/KurtSaxonThePoorMansJamesBondVol5.pdf/"
    "Kurt%20Saxon%20-%20The%20Poor%20Mans%20James%20Bond%20-%20Vol%201_jp2.zip"
    "&file=Kurt%20Saxon%20-%20The%20Poor%20Mans%20James%20Bond%20-%20Vol%201_jp2/"
    "Kurt%20Saxon%20-%20The%20Poor%20Mans%20James%20Bond%20-%20Vol%201_"
)
# Everything after the page number. A higher scale value gives smaller
# images (less space, lower quality): 0 or 1 is best (500kB/1MB per page),
# 4 is worst (50-100kB per page).
part2 = ".jp2&scale=0&rotate=0"

# User agent string; change it if needed. Cookies come from cookies.txt,
# which is named in the download call further down.
useragent = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)'

# Extension setting; change it if the images are not JPEG.
ext = '.jpg'

# Zero padding that goes in front of a one-digit page number
# (0000 0001 0002 0003 0004). Keep the default for any item with fewer
# than 9999 pages; raise a GitHub issue for items with 10k pages or more.
zer = '000'

# Name of the folder that receives the downloaded pages.
# TODO: derive it from the item URL with a regex; shell equivalent:
#   "$(echo $ura | sed 's/.*items\/\(.*\)/\1/' | awk -F'/' '{print $1}')"
name = 'my_archive_book'
# Create the output folder inside the current working directory.
pwd = os.getcwd()
# os.path.join picks the platform's separator; a hard-coded '\\' only
# produces a real sub-directory path on Windows.
folder = os.path.join(pwd, name)
try:
    os.mkdir(folder)
except FileExistsError:
    # Re-running the script for the same item is fine: reuse the folder.
    print ("Directory %s already exists, reusing it" % name)
except OSError:
    # Best effort, as before: report the problem and carry on.
    print ("Creation of the directory %s failed" % name)
else:
    print ("Successfully created the directory %s " % name)
# Download every page image from `fraum` through `tou` (inclusive) into
# `folder`, one file per page.
#
# The URL needs a fixed-width, zero-padded page number. `zer` is the padding
# for a one-digit page, so the full width is len(zer) + 1 (4 by default).
page_width = len(zer) + 1

# One session for all pages: connection reuse plus shared headers/cookies.
session = requests.Session()
session.headers["User-Agent"] = useragent
# Send the cookies from a Netscape-format cookies.txt in the current working
# directory, when such a file is present.
if os.path.isfile("cookies.txt"):
    cookie_jar = MozillaCookieJar("cookies.txt")
    cookie_jar.load(ignore_discard=True, ignore_expires=True)
    session.cookies.update(cookie_jar)

for page in range(fraum, tou + 1):
    page_id = str(page).zfill(page_width)
    # Padded file names keep the pages in order when sorted by name.
    target = os.path.join(folder, page_id + ext)
    image_url = part1 + page_id + part2
    print('\r\npage ', page, '/', tou)
    try:
        response = session.get(image_url, timeout=60)
        response.raise_for_status()
    except requests.RequestException as err:
        # A page that cannot be fetched (for instance when `tou` is past the
        # end of the item) is reported and skipped; the run continues.
        print("Download of page %s failed: %s" % (page, err))
        continue
    with open(target, "wb") as image_file:
        image_file.write(response.content)