Skip to content

Commit

Permalink
add setting to toggle display advanced options
Browse files Browse the repository at this point in the history
  • Loading branch information
dale-wahl committed Sep 12, 2023
1 parent f2ce1c3 commit 3e93182
Show file tree
Hide file tree
Showing 3 changed files with 110 additions and 80 deletions.
6 changes: 6 additions & 0 deletions common/lib/config_definition.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,6 +414,12 @@
"help": "Firefox Extensions",
"tooltip": "Can be used by certain processors and datasources",
},
"selenium.display_advanced_options": {
"type": UserInput.OPTION_TOGGLE,
"default": True,
"help": "Show advanced options",
"tooltip": "Show advanced options for Selenium processors",
},
"selenium.installed": {
"type": UserInput.OPTION_TOGGLE,
"default": False,
Expand Down
71 changes: 44 additions & 27 deletions datasources/url_scraper/search_webpages.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import datetime
import random

from common.config_manager import config
from backend.lib.selenium_scraper import SeleniumScraper
from common.lib.exceptions import QueryParametersException, ProcessorInterruptedException
from common.lib.helpers import validate_url
Expand All @@ -20,33 +21,49 @@ class SearchWithSelenium(SeleniumScraper):
extension = "ndjson"
max_workers = 1

options = {
"intro-1": {
"type": UserInput.OPTION_INFO,
"help": "This data source uses [Selenium](https://selenium-python.readthedocs.io/) in combination with "
"a [Firefox webdriver](https://github.com/mozilla/geckodriver/releases) and Firefox for linux "
"to scrape the HTML source code. "
"\n"
"By mimicing a person using an actual browser, this method results in source code that closer "
"resembles the source code an actual user receives when compared with simple HTML requests. It "
"will also render JavaScript that starts as soon as a url is retrieved by a browser. "
},
"query-info": {
"type": UserInput.OPTION_INFO,
"help": "Please enter a list of urls one per line."
},
"query": {
"type": UserInput.OPTION_TEXT_LARGE,
"help": "List of urls"
},
"subpages": {
"type": UserInput.OPTION_TEXT,
"help": "Crawl additional host links/subpages",
"min": 0,
"max": 5,
"default": 0
},
}
@classmethod
def get_options(cls, parent_dataset=None, user=None):
    """
    Build the option definitions shown for this datasource.

    Advanced options (currently subpage crawling) are only included when
    the ``selenium.display_advanced_options`` setting is enabled for the
    given user; otherwise only the basic URL-list options are shown.

    :param parent_dataset:  Dataset the options are displayed for (unused here)
    :param user:  User the options are displayed to; used for the config lookup
    :return dict:  Option definitions, keyed by option name
    """
    options = {
        "intro-1": {
            "type": UserInput.OPTION_INFO,
            "help": "This data source uses [Selenium](https://selenium-python.readthedocs.io/) in combination with "
                    "a [Firefox webdriver](https://github.com/mozilla/geckodriver/releases) and Firefox for linux "
                    "to scrape the HTML source code. "
                    "\n"
                    "By mimicking a person using an actual browser, this method results in source code that more "
                    "closely resembles the source code an actual user receives when compared with simple HTML "
                    "requests. It will also render JavaScript that starts as soon as a url is retrieved by a "
                    "browser. "
        },
        "query-info": {
            "type": UserInput.OPTION_INFO,
            "help": "Please enter a list of urls one per line."
        },
        "query": {
            "type": UserInput.OPTION_TEXT_LARGE,
            "help": "List of urls"
        },
    }

    # Subpage crawling is an advanced option; only expose it when the
    # administrator has enabled advanced options for Selenium processors
    if config.get("selenium.display_advanced_options", False, user=user):
        options["subpages"] = {
            "type": UserInput.OPTION_TEXT,
            "help": "Crawl additional links/subpages",
            "min": 0,
            "max": 5,
            "default": 0,
            "tooltip": "If enabled, the scraper will also crawl and collect random links found on the provided page."
        }

    return options

@classmethod
def is_compatible_with(cls, module=None, user=None):
    """
    Determine whether this processor is available.

    Only usable when Selenium is installed on the 4CAT instance, as
    recorded in the ``selenium.installed`` configuration setting.

    :param module: Module to determine compatibility with
    :param user: User for whom the setting is looked up
    """
    selenium_installed = config.get('selenium.installed', False, user=user)
    return selenium_installed

def get_items(self, query):
"""
Expand Down
113 changes: 60 additions & 53 deletions datasources/web_archive_scraper/search_web_archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,61 +31,68 @@ class SearchWebArchiveWithSelenium(SeleniumScraper):
# Web Archive will load and then redirect after a few seconds; check for new page to load
redirect_text = ['Got an HTTP 302 response at crawl time', 'Got an HTTP 301 response at crawl time']

options = {
"intro-1": {
"type": UserInput.OPTION_INFO,
"help": "This data source uses [Selenium](https://selenium-python.readthedocs.io/) in combination with "
"a [Firefox webdriver](https://github.com/mozilla/geckodriver/releases) and Firefox for linux "
"to scrape the HTML source code. "
"\n"
"By mimicing a person using an actual browser, this method results in source code that closer "
"resembles the source code an actual user receives when compared with simple HTML requests. It "
"will also render JavaScript that starts as soon as a url is retrieved by a browser. "
},
"query-info": {
"type": UserInput.OPTION_INFO,
"help": "Please enter a list of urls one per line."
},
"query": {
"type": UserInput.OPTION_TEXT_LARGE,
"help": "List of urls"
},
"frequency": {
"type": UserInput.OPTION_CHOICE,
"help": "Frequency over time period",
"tooltip": "Default 'First Available' scrapes the first available result after start date",
"options": {
"first": "First Available",
"monthly": "Monthly",
"weekly": "Weekly",
"daily": "Daily",
"yearly": "Yearly"
@classmethod
def get_options(cls, parent_dataset=None, user=None):
options = {
"intro-1": {
"type": UserInput.OPTION_INFO,
"help": "This data source uses [Selenium](https://selenium-python.readthedocs.io/) in combination with "
"a [Firefox webdriver](https://github.com/mozilla/geckodriver/releases) and Firefox for linux "
"to scrape the HTML source code. "
"\n"
"By mimicing a person using an actual browser, this method results in source code that closer "
"resembles the source code an actual user receives when compared with simple HTML requests. It "
"will also render JavaScript that starts as soon as a url is retrieved by a browser. "
},
"default": "first"
},
"daterange": {
"type": UserInput.OPTION_DATERANGE,
"tooltip": "Scrapes first available page after start date; Uses start and end date for frequency",
"help": "Date range"
},
"subpages": {
"type": UserInput.OPTION_TEXT,
"help": "Crawl additional host links/subpages",
"min": 0,
"max": 5,
"default": 0
},
"http_request": {
"type": UserInput.OPTION_CHOICE,
"help": "HTTP or Selenium request",
"tooltip": "HTTP request added to body field; HTTP request not parsed for text",
"options": {
"both": "Both HTTP request and Selenium WebDriver",
"selenium_only": "Only use Selenium WebDriver",
"query-info": {
"type": UserInput.OPTION_INFO,
"help": "Please enter a list of urls one per line."
},
"default": "selenium_only"
},
}
"query": {
"type": UserInput.OPTION_TEXT_LARGE,
"help": "List of urls"
},
"frequency": {
"type": UserInput.OPTION_CHOICE,
"help": "Frequency over time period",
"tooltip": "Default 'First Available' scrapes the first available result after start date",
"options": {
"first": "First Available",
"monthly": "Monthly",
"weekly": "Weekly",
"daily": "Daily",
"yearly": "Yearly"
},
"default": "first"
},
"daterange": {
"type": UserInput.OPTION_DATERANGE,
"tooltip": "Scrapes first available page after start date; Uses start and end date for frequency",
"help": "Date range"
},
}

if config.get("selenium.display_advanced_options", False, user=user):
options["subpages"] = {
"type": UserInput.OPTION_TEXT,
"help": "Crawl additional links/subpages",
"min": 0,
"max": 5,
"default": 0,
"tooltip": "If enabled, the scraper will also crawl and collect random links found on the provided page."
}
options["http_request"] = {
"type": UserInput.OPTION_CHOICE,
"help": "HTTP or Selenium request",
"tooltip": "Scrape data with HTTP (python request library) and/or Selenium (automated browser to better imitate a real user); HTTP response is added to body field, but not currently parsed to extract text",
"options": {
"both": "Both HTTP request and Selenium WebDriver",
"selenium_only": "Only use Selenium WebDriver",
},
"default": "selenium_only"
}

return options

def get_items(self, query):
"""
Expand Down

0 comments on commit 3e93182

Please sign in to comment.