Commit b18fad6

count subpages in progress
dale-wahl committed Sep 12, 2023
1 parent efadeb0
Showing 2 changed files with 8 additions and 3 deletions.
datasources/url_scraper/search_webpages.py (5 changes: 4 additions & 1 deletion)
@@ -74,12 +74,14 @@ def get_items(self, query):
         """
         self.dataset.log('Query: %s' % str(query))
         self.dataset.log('Parameters: %s' % str(self.parameters))
-        scrape_additional_subpages = self.parameters.get("subpages")
+        scrape_additional_subpages = self.parameters.get("subpages", 0)
         urls_to_scrape = [{'url':url, 'base_url':url, 'num_additional_subpages': scrape_additional_subpages, 'subpage_links':[]} for url in query.get('urls')]

         # Do not scrape the same site twice
         scraped_urls = set()
         num_urls = len(urls_to_scrape)
+        if scrape_additional_subpages:
+            num_urls = num_urls * scrape_additional_subpages
         done = 0

         while urls_to_scrape:
@@ -185,6 +187,7 @@ def get_items(self, query):
             while iframe_links:
                 link = iframe_links.pop(0)
                 if self.check_exclude_link(link, scraped_urls):
+                    num_urls += 1
                     # Add it to be scraped next
                     urls_to_scrape.insert(0, {
                         'url': link,
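The change above builds a running denominator for progress reporting: when subpage scraping is enabled, the base URL count is multiplied by the requested number of subpages, and the total is bumped again each time a discovered iframe link is queued. A minimal sketch of that bookkeeping, mirroring the commit's arithmetic; the names base_urls, subpages_per_url, and the standalone function are illustrative, not the datasource's own:

    def estimate_total_urls(base_urls, subpages_per_url):
        # Mirrors the commit: with subpage scraping on, the progress
        # denominator becomes base URLs x subpages per URL.
        num_urls = len(base_urls)
        if subpages_per_url:
            num_urls = num_urls * subpages_per_url
        return num_urls

    num_urls = estimate_total_urls(["https://example.com"], 3)  # -> 3

    # During scraping, each newly discovered page (e.g. an iframe source)
    # grows the denominator so the reported progress stays meaningful:
    num_urls += 1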
datasources/web_archive_scraper/search_web_archive.py (6 changes: 4 additions & 2 deletions)
@@ -102,18 +102,20 @@ def get_items(self, query):
         :return:
         """
         self.dataset.log('query: ' + str(query))
-        http_request = self.parameters.get("http_request") == 'both'
+        http_request = self.parameters.get("http_request", "selenium_only") == 'both'
         if http_request:
             self.dataset.update_status('Scraping Web Archives with Selenium %s and HTTP Requests' % config.get('selenium.browser'))
         else:
             self.dataset.update_status('Scraping Web Archives with Selenium %s' % config.get('selenium.browser'))
-        scrape_additional_subpages = self.parameters.get("subpages")
+        scrape_additional_subpages = self.parameters.get("subpages", 0)

         urls_to_scrape = [{'url':url['url'], 'base_url':url['base_url'], 'year':url['year'], 'num_additional_subpages': scrape_additional_subpages, 'subpage_links':[]} for url in query.get('preprocessed_urls')]

         # Do not scrape the same site twice
         scraped_urls = set()
         num_urls = len(urls_to_scrape)
+        if scrape_additional_subpages:
+            num_urls = num_urls * scrape_additional_subpages
         done = 0

         while urls_to_scrape:
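Both files also switch their parameter reads to dict.get(key, default) with an explicit fallback. A short sketch of why that matters, assuming a query submitted without these form fields (the bare parameters dict here is illustrative):

    parameters = {}  # e.g. a query with no "subpages" or "http_request" set

    # Without a default, .get("subpages") returns None, and the later
    # `num_urls * None` would raise TypeError; 0 keeps the arithmetic safe.
    subpages = parameters.get("subpages", 0)

    # "selenium_only" is the fallback the commit adds; only the literal
    # value 'both' enables the extra HTTP requests alongside Selenium.
    http_request = parameters.get("http_request", "selenium_only") == "both"

    print(subpages, http_request)  # -> 0 False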
