get_data.py
import wikipediaapi
import os
import re

# Maximum link-following depth from the seed article.
MAX_DEPTH = 2


def get_wiki_page(title):
    """Fetch an English Wikipedia page by title."""
    wiki_wiki = wikipediaapi.Wikipedia("TechNova ([email protected])", "en")
    page = wiki_wiki.page(title)
    return page


articles = {}  # collected page text, keyed by article title


def sanitize_filename(filename):
    """Replace characters that are invalid in filenames with underscores."""
    return re.sub(r'[<>:"/\\|?*]', "_", filename)


def gather_pages_recursively(page, depth=0):
    """Save the page text and recurse into linked pages up to MAX_DEPTH."""
    if depth > MAX_DEPTH:
        return
    if page.title in articles:
        return
    articles[page.title] = page.text
    # Create a file for the page in the data folder.
    sanitized_title = sanitize_filename(page.title)
    with open(f"data/{sanitized_title}.txt", "w", encoding="utf-8") as f:
        f.write(page.text)
    for link in page.links:
        gather_pages_recursively(page.links[link], depth + 1)


def prepare_data_folder():
    """Create the data folder if it does not already exist."""
    if not os.path.exists("data"):
        os.makedirs("data")


def main():
    prepare_data_folder()
    title = "Python (programming language)"
    page = get_wiki_page(title)
    gather_pages_recursively(page)
    print(articles.keys())


if __name__ == "__main__":
    main()