-
Notifications
You must be signed in to change notification settings - Fork 0
/
sitemap_scraper.py
30 lines (25 loc) · 1.18 KB
/
sitemap_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
'''
Scrape the winemag.com sitemaps and pickle the collected wine-review links for later use.
'''
import logging
import os
import pickle

from samssimplescraper import LinksRetriever
# Make sure the log directory exists first: basicConfig opens the log file
# right away and raises FileNotFoundError when its parent folder is missing.
os.makedirs('./logs', exist_ok=True)

# Route INFO-and-above records to a log file that is truncated on every run
# (filemode='w'); timestamps are rendered as YYYY-MM-DD HH:MM:SS.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s- %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    filename='./logs/sitemap_scraper.log',
    filemode='w',
)
def _require_all_str(items, label):
    """Raise TypeError unless every element of *items* is a str.

    Used instead of ``assert`` so the check still runs under ``python -O``,
    which strips assert statements. An empty iterable passes.
    """
    for item in items:
        if not isinstance(item, str):
            raise TypeError(
                '%s must contain only str, got %s' % (label, type(item).__name__)
            )


# Create the output folder up front so a missing directory is reported
# before the (potentially long) scrape instead of after it.
os.makedirs('./data/pickled_lists', exist_ok=True)

# Instantiate LinksRetriever with the winemag.com sitemap index.
# NOTE(review): the meaning of folders=True is defined by samssimplescraper
# and is not visible here -- confirm against the library.
links_retriever = LinksRetriever(
    url='https://www.winemag.com/sitemap_index.xml',
    folders=True,
)

# Collect the <loc> entries of the index, keeping only those matching
# 'wine_review-sitemap' (presumably the per-page wine review sitemaps).
sitemap_links = links_retriever.get_sitemap_links(
    tag='loc',
    link_filter='wine_review-sitemap',
)
_require_all_str(sitemap_links, 'sitemap_links')
logging.info('the sitemap link list is %s links long', len(sitemap_links))

# Follow every review sitemap and gather the <loc> links it lists.
total_links = links_retriever.get_next_links(links=sitemap_links, tag='loc')
_require_all_str(total_links, 'total_links')
logging.info('The final web links list is %s links long', len(total_links))

# Persist the list so it can be reused on multiple servers.
with open('./data/pickled_lists/total_links_list.pkl', 'wb') as fp:
    pickle.dump(total_links, fp)