step-1A-scrape-urls.py
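"""Scrape the full list of Star Wars canon article URLs from Wookieepedia.

Walks the paginated Category:Canon_articles listing on
https://starwars.fandom.com, collects every article link, and pickles the
resulting {article_name: url} dictionary for the next step in the pipeline.
"""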
import os
import pickle

import requests
from bs4 import BeautifulSoup
print("""
_______. ______ .______ ___ .______ _______
/ | / || _ \ / \ | _ \ | ____|
| (----`| ,----'| |_) | / ^ \ | |_) | | |__
\ \ | | | / / /_\ \ | ___/ | __|
.----) | | `----.| |\ \----./ _____ \ | | | |____
|_______/ \______|| _| `._____/__/ \__\ | _| |_______|
""")
# Start at the category index listing every canon article, then follow
# the pagination links until the listing is exhausted.
page_url = 'https://starwars.fandom.com/wiki/Category:Canon_articles'  # all canon articles
base_url = 'https://starwars.fandom.com'
pages = {}  # article name -> full article URL
page_num = 1
while page_url is not None:
    result = requests.get(page_url, timeout=30)
    soup = BeautifulSoup(result.content, "html.parser")

    # Extract the article links listed on this category page.
    links = soup.find_all('a', class_='category-page__member-link')
    links_before = len(pages)
    for link in links:
        href = link.get('href')
        url = base_url + href
        key = href.split('/')[-1]
        # Skip sub-category links; keep only article pages.
        if 'Category:' not in key:
            pages[key] = url
    new_links = len(pages) - links_before
    print(f'Page {page_num} - {new_links} new links ({page_url})')
    page_num += 1

    # Follow the "next page" pagination button, if there is one.
    next_urls = soup.find_all('a', class_='category-page__pagination-next')
    if next_urls:
        new_url = next_urls[0].get('href')
        if new_url == page_url:
            break  # safety stop: next link points back at the current page
        page_url = new_url
    else:
        page_url = None
print(f'Number of pages: {len(pages)}')

# Save the URL dictionary to disk for the next step in the pipeline.
os.makedirs('./Dataset', exist_ok=True)  # make sure the output directory exists
with open('./Dataset/starwars_all_canon_dict.pickle', 'wb') as f:
    pickle.dump(pages, f, protocol=pickle.HIGHEST_PROTOCOL)
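
# Optional sanity check (illustrative addition, not part of the original
# script): reload the pickle and confirm the dictionary round-trips intact.
with open('./Dataset/starwars_all_canon_dict.pickle', 'rb') as f:
    reloaded = pickle.load(f)
assert reloaded == pages, 'reloaded dictionary does not match scraped pages'
print(f'Sanity check passed: {len(reloaded)} URLs round-tripped')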