Skip to content

Commit

Permalink
Process URLs after start (keep original query parameters)
Browse files · Browse the repository at this point in the history
  • Loading branch information
dale-wahl committed Sep 13, 2023
1 parent 8b098b1 commit fb6ea11
Showing 1 changed file with 8 additions and 8 deletions.
16 changes: 8 additions & 8 deletions datasources/web_archive_scraper/search_web_archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,15 +103,20 @@ def get_items(self, query):
:param query:
:return:
"""
self.dataset.log('query: ' + str(query))

http_request = self.parameters.get("http_request", "selenium_only") == 'both'
if http_request:
self.dataset.update_status('Scraping Web Archives with Selenium %s and HTTP Requests' % config.get('selenium.browser'))
else:
self.dataset.update_status('Scraping Web Archives with Selenium %s' % config.get('selenium.browser'))
scrape_additional_subpages = self.parameters.get("subpages", 0)

urls_to_scrape = [{'url':url['url'], 'base_url':url['base_url'], 'year':url['year'], 'num_additional_subpages': scrape_additional_subpages, 'subpage_links':[]} for url in query.get('preprocessed_urls')]
preprocessed_urls = []
for url in query.get('preprocessed_urls'):
url_group = SearchWebArchiveWithSelenium.create_web_archive_urls(url, query["min_date"], query["max_date"],
query.get('frequency'))
[preprocessed_urls.append(new_url) for new_url in url_group]
urls_to_scrape = [{'url':url['url'], 'base_url':url['base_url'], 'year':url['year'], 'num_additional_subpages': scrape_additional_subpages, 'subpage_links':[]} for url in preprocessed_urls]

# Do not scrape the same site twice
scraped_urls = set()
Expand Down Expand Up @@ -425,16 +430,11 @@ def validate_query(query, request, user):
if query["max_date"] < query["min_date"]:
raise QueryParametersException("End date must be after start date.")

preprocessed_urls = []
for url in validated_urls:
url_group = SearchWebArchiveWithSelenium.create_web_archive_urls(url, query["min_date"], query["max_date"], query.get('frequency'))
[preprocessed_urls.append(new_url) for new_url in url_group]

return {
"query": query.get("query"),
"min_date": query.get("min_date"),
"max_date": query.get("max_date"),
"preprocessed_urls": preprocessed_urls,
"validated_urls": validated_urls,
"subpages": query.get("subpages", 0),
'http_request': query.get("http_request", "selenium_only"),
}

0 comments on commit fb6ea11

Please sign in to comment.