forked from rat-nick/aniRec
-
Notifications
You must be signed in to change notification settings - Fork 0
/
fetch_items.py
46 lines (40 loc) · 1.16 KB
/
fetch_items.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import glob
import os
import time
from multiprocessing import Pool
from urllib.error import HTTPError
from urllib.request import urlopen
# Base URL under which item pages are requested; the item id is appended.
URL_BASE = "https://myanimelist.net/anime/"
# Directory where each downloaded page is stored as "<id>.html".
LAKE_PATH = "data/lake/items"


def fetched_item_ids(lake_path):
    """Return the ids that already have an ``<id>.html`` file in *lake_path*.

    Args:
        lake_path: Directory to scan for ``*.html`` files.

    Returns:
        A list of ints, one per file whose stem parses as an integer.  Files
        with any other stem (for example a stray ``index.html``) are skipped
        instead of aborting the script with a ``ValueError``.  A missing
        directory yields an empty list.
    """
    found = []
    for path in glob.glob(os.path.join(lake_path, "*.html")):
        stem = os.path.splitext(os.path.basename(path))[0]
        try:
            found.append(int(stem))
        except ValueError:
            # Not an "<id>.html" file; it says nothing about fetched items.
            continue
    return found


# Ids already on disk; they are excluded below so a rerun only requests
# the pages that are still missing.
fetched = fetched_item_ids(LAKE_PATH)
# Candidate ids 0..100000 minus the ones already fetched, in ascending order
# so every run processes the remaining ids in a deterministic sequence.
ids = sorted(set(range(100001)) - set(fetched))
def get_item_page(id, max_retries=10, retry_delay=20):
    """Download the page for item *id* and save it under ``LAKE_PATH``.

    The page is requested from ``URL_BASE`` followed by the id, decoded as
    UTF-8 and written to ``<LAKE_PATH>/<id>.html``.

    Args:
        id: Item id used both in the request URL and in the output file
            name.  The name shadows the builtin but is kept so existing
            callers keep working.
        max_retries: Maximum number of retries after an HTTP 429 response
            before the item is given up on.
        retry_delay: Seconds to sleep before each retry.

    Returns:
        None.  HTTP errors are printed together with the id and the item is
        skipped; only status 429 triggers a retry.

    Raises:
        Exception: A non-HTTP failure (network, decoding or file error) on
            the first attempt propagates to the caller.  The same failure
            during a retry is printed and the item is skipped.
    """
    # URL_BASE already ends with "/"; strip it so the request path does not
    # contain an empty "//" segment.
    url = f"{URL_BASE.rstrip('/')}/{id}"
    target = f"{LAKE_PATH}/{id}.html"
    attempts = max(max_retries, 0) + 1

    for attempt in range(attempts):
        try:
            # Close the HTTP response as soon as the body has been read.
            with urlopen(url) as page:
                html = page.read().decode("utf-8")
            # Explicit encoding: the platform default may be unable to
            # encode characters that were just decoded from UTF-8.
            with open(target, "w", encoding="utf-8") as f:
                f.write(html)
            return
        except HTTPError as e:
            print(f"{e.code}\t-\t{id}")
            if e.code != 429 or attempt == attempts - 1:
                return
        except Exception as e:
            # First attempt: surface the failure.  Retries stay best-effort,
            # but the failure is reported instead of silently dropped.
            if attempt == 0:
                raise
            print(f"{type(e).__name__}\t-\t{id}")
            return
        # Only reached after a 429: back off before asking again.
        time.sleep(retry_delay)
# for id in ids:
# try:
# get_item_page(id)
# except HTTPError as e:
# print(f"{e.code}\t-\t{id}")
# if e.code == 429:
# input()
# try:
# get_item_page(id)
# except:
# pass
if __name__ == "__main__":
    # Script entry point: hand every pending id to a pool of worker
    # processes (created with the default worker count).  Leaving the
    # `with` block releases the pool once the map call has returned.
    workers = Pool()
    with workers:
        workers.map(get_item_page, ids)