Skip to content

Commit

Permalink
periodic repo update
Browse files Browse the repository at this point in the history
  • Loading branch information
scottgdaniel committed Sep 21, 2022
1 parent ba5bde8 commit 87ab207
Show file tree
Hide file tree
Showing 13 changed files with 649 additions and 384 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
data
data/*
**/node_modules/*

Expand Down
4 changes: 2 additions & 2 deletions code/downloader/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ For example, the case `1:16-cv-00001` in the Northern District of Illinois would
*Note: In some districts it is common to include judge initials at the end of a case id e.g. `2:15-cr-11112-ABC-DE` . These initials are always excluded from a UCID*.

## Runtime
The scraper is designed to run at night to reduce its impact on server load. By default it will only run between 8pm and 4am (CDT). These parameters can be altered and overridden through the `-rts`, `-rte` and `--override-time` options; see below for details.
The scraper is designed to run at night to reduce its impact on server load. By default it will only run between 6pm and 6am (CDT). These parameters can be altered and overridden through the `-rts`, `-rte` and `--override-time` options; see below for details.

## $$$
Pacer fees can rack up quickly! Running this scraper will incur costs to your own Pacer account. There are a number of options for the scraper that exist to limit the potential for accidentally incurring large charges:
Expand Down Expand Up @@ -150,7 +150,7 @@ Give slightly more verbose logging output
*Query Scraper*

- `-qc, --query-conf TEXT`
Configuration file (.json) for the query that will be used to populate the query form on Pacer. If none is specified the query builder will run in the terminal.
Configuration file (.json) for the query that will be used to populate the query form on Pacer. If none is specified, the query builder will run in the terminal. (The query config format is fully described in the `TEMPLATE_QUERY` object in [forms.py](./forms.py); the most used fields are "filed_from", "filed_to", "nature_suit" and "case_status".)

- `--query-prefix TEXT`
A prefix for the filenames of output query HTMLs. If the date range of the query is greater than 180 days, the query will be split into chunks of 31 days to prevent PACER from crashing while serving a large query results page. Multiple files will be created that follow the pattern `{query_prefix}__i.html` where `i` enumerates over the date range chunks.
Expand Down
279 changes: 142 additions & 137 deletions code/downloader/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,119 +9,6 @@
sys.path.append(str(Path(__file__).resolve().parents[1]))
from support.core import std_path

class FormField:
    ''' Object that represents a single field as part of a form'''

    def __init__(self, name, kind, value, selector, browser):
        '''
        Inputs:
            - name (str): name of the field (its key in the form template)
            - kind (str): one of 'text', 'radio', 'checkbox', 'select', 'multiselect'
            - value: the value to fill in (interpretation depends on kind)
            - selector (str): css selector handed to locator
            - browser: selenium browser driver
        '''
        self.name = name
        self.kind = kind
        self.value = value
        self.selector = selector
        self.browser = browser

    def locate(self, get_many=False):
        ''' Returns the web element (get_many is forwarded to locator)'''
        return locator(self.selector, self.browser, get_many)

    def _multiselect_values(self):
        '''
        Normalise self.value into the set of option labels to select

        Accepts a comma-delimited string, a list/tuple/set of values or a
        single scalar; every entry is stringified and stripped.
        '''
        if self.value is None:
            return set()
        if isinstance(self.value, str):
            raw = self.value.split(',')
        elif isinstance(self.value, (list, tuple, set, frozenset)):
            raw = self.value
        else:
            raw = [self.value]
        return {str(v).strip() for v in raw}

    def fill(self):
        ''' Enter self.value into the field on the page, according to self.kind'''
        if self.kind == 'text':
            el = self.locate()
            try:
                el.clear()
            except Exception:
                # First attempt failed (e.g. element not available yet):
                # wait, re-locate and retry once, letting a second failure raise
                time.sleep(2)
                el = self.locate()
                el.clear()
            fill_text(self.selector, self.value, self.browser)

        elif self.kind == 'radio':
            # Pick the radio input whose value attribute matches;
            # raises IndexError if no input matches
            chosen = [el for el in self.locate(get_many=True)
                      if el.get_attribute('value') == self.value][0]
            chosen.click()

        elif self.kind == 'checkbox':
            # Only click when the current state differs from the wanted one
            box = self.locate()
            if self.value != bool(box.get_property('checked')):
                box.click()

        # Selector for select should be for the select tag (not the options)
        elif self.kind == 'select':
            # Lookup kept from the original flow although its result is unused
            # NOTE(review): presumably harmless to drop - confirm against locator
            self.locate()
            # Set the value manually which is same as what happens in fill_text
            fill_text(self.selector, self.value, self.browser)

        # Locator fn for select should return array of elements
        elif self.kind == 'multiselect':
            wanted = self._multiselect_values()
            options = self.locate(get_many=True)
            # Decide on the full subset first, then click
            subset = [el for el in options if _clean_options_(el) in wanted]
            for el in subset:
                el.click()

    def __repr__(self):
        return f'''<FormField: "{self.name}" (kind:{self.kind})>'''

class FormButton:
    ''' Object that represents a button as part of a form'''

    def __init__(self, selector, browser):
        '''
        Inputs:
            - selector (str): css selector handed to locator
            - browser: selenium browser driver
        '''
        self.browser = browser
        self.selector = selector

    def locate(self, get_many=False):
        ''' Returns the web element (get_many is forwarded to locator)'''
        # get_many used to be accepted but dropped; forward it so this
        # behaves the same way as FormField.locate
        return locator(self.selector, self.browser, get_many)

class FormFiller:
    ''' Object used to fill out a webpage form '''

    def __init__(self, browser, template, fill_values):
        '''
        Inputs:
            - browser: selenium browser driver
            - template ('query', 'login', 'docket' or dict): gets template from get_template, or else manual input as dict
            - fill_values (dict): key-value pairs of (field name, value to fill)
        '''
        self.fields, self.buttons = {}, {}
        self.browser = browser

        # A string is a template name to look up; anything else is used as-is
        self.template = get_template(template) if isinstance(template, str) else template
        self.build(fill_values)

    def build(self, fill_values):
        ''' Combine the template with the fill_values to create the FormFields'''
        # Only template fields that were given a value become FormFields
        for field_name, props in self.template['fields'].items():
            if field_name in fill_values:
                self.fields[field_name] = FormField(
                    field_name, props['kind'], fill_values[field_name],
                    props['selector'], self.browser)

        for button_name, selector in self.template['buttons'].items():
            self.buttons[button_name] = FormButton(selector, self.browser)

    def fill(self):
        ''' Fill all field values in the form'''
        for field in self.fields.values():
            field.fill()

        # If the form has a pre-submit method, execute it
        if 'pre_submit' in self.template:
            self.template['pre_submit'](self)

    def submit(self):
        ''' Click the form submit button '''
        self.buttons['submit'].locate().click()

def _clean_options_(el, first_only=True):
    '''
    Clean the text of an Option WebElement

    Inputs:
        - el: object exposing a .text attribute (e.g. an option WebElement)
        - first_only (bool): if True, return only the first whitespace-delimited
            token of the text, otherwise the whole stripped text
    Output:
        (str) the cleaned text, or '' if the text is blank or cannot be read
    '''
    try:
        text = el.text.strip()
        return text.split()[0] if first_only else text
    except Exception:
        # Best effort: a blank option (IndexError) or unreadable element yields ''.
        # Exception rather than a bare except lets Ctrl-C / SystemExit through.
        return ''

TEMPLATE_LOGIN = {
'fields': {
'username': {
Expand All @@ -144,7 +31,6 @@ def _clean_options_(el, first_only=True):
}
}


TEMPLATE_DOCKET_SHEET = {
'fields': {
'case_no': {
Expand Down Expand Up @@ -247,29 +133,6 @@ def _clean_options_(el, first_only=True):
},
'pre_submit': lambda form: case_no_pre_submit(form)
}
def case_no_pre_submit(form):
    '''
    Hit enter on case number field to start the lookup, then wait for the
    submit button to become enabled

    Inputs:
        - form (FormFiller): the form being filled; reads form.fields,
            form.buttons['submit'] and form.browser
    '''
    if 'case_no' in form.fields:
        case_num = form.fields['case_no'].locate()
        case_num.send_keys(Keys.RETURN)

    run_button = form.buttons['submit'].locate()

    # Wait for the case lookup to run before you can 'Run Report'
    # (at most 5 checks, pausing 1 second after each unsuccessful one)
    for _ in range(5):
        if run_button.is_enabled():
            break

        # If the case selector appears, choose the first case (the main case).
        # Locator strategies are passed as plain strings ('id', 'css selector')
        # because the find_element_by_* helpers were removed in Selenium 4.3
        pick_area = form.browser.find_element('id', 'case_number_pick_area_0')
        if pick_area.is_displayed():
            # Check if any checkbox ticked
            docket_checkboxes = form.browser.find_elements(
                'css selector',
                '#case_number_pick_area_0 input[type="checkbox"]',
            )
            # Click the first if none pre-selected (default to main); skip
            # when there are no checkboxes at all instead of raising IndexError
            if docket_checkboxes and not any(box.is_selected() for box in docket_checkboxes):
                docket_checkboxes[0].click()

        time.sleep(1)

TEMPLATE_QUERY = {
'fields': {
Expand Down Expand Up @@ -380,6 +243,18 @@ def case_no_pre_submit(form):
'pre_submit': lambda form: case_no_pre_submit(form)
}



def _clean_options_(el, first_only=True):
    '''
    Clean the text of an Option WebElement

    Inputs:
        - el: object exposing a .text attribute (e.g. an option WebElement)
        - first_only (bool): if True, return only the first whitespace-delimited
            token of the text, otherwise the whole stripped text
    Output:
        (str) the cleaned text, or '' if the text is blank or cannot be read
    '''
    try:
        text = el.text.strip()
        return text.split()[0] if first_only else text
    except Exception:
        # Best effort: a blank option (IndexError) or unreadable element yields ''.
        # Exception rather than a bare except lets Ctrl-C / SystemExit through.
        return ''

def get_template(s):
if s=='login':
return TEMPLATE_LOGIN
Expand Down Expand Up @@ -420,6 +295,136 @@ def locator(css_selector, browser, get_many=False):
except:
return

def case_no_pre_submit(form):
    '''
    Hit enter on case number field to start the lookup, then wait for the
    submit button to become enabled

    Inputs:
        - form (FormFiller): the form being filled; reads form.fields,
            form.buttons['submit'] and form.browser
    '''
    if 'case_no' in form.fields:
        case_num = form.fields['case_no'].locate()
        case_num.send_keys(Keys.RETURN)

    run_button = form.buttons['submit'].locate()

    # Wait for the case lookup to run before you can 'Run Report'
    # (at most 5 checks, pausing 1 second after each unsuccessful one)
    for _ in range(5):
        if run_button.is_enabled():
            break

        # If the case selector appears, choose the first case (the main case).
        # Locator strategies are passed as plain strings ('id', 'css selector')
        # because the find_element_by_* helpers were removed in Selenium 4.3
        pick_area = form.browser.find_element('id', 'case_number_pick_area_0')
        if pick_area.is_displayed():
            # Check if any checkbox ticked
            docket_checkboxes = form.browser.find_elements(
                'css selector',
                '#case_number_pick_area_0 input[type="checkbox"]',
            )
            # Click the first if none pre-selected (default to main); skip
            # when there are no checkboxes at all instead of raising IndexError
            if docket_checkboxes and not any(box.is_selected() for box in docket_checkboxes):
                docket_checkboxes[0].click()

        time.sleep(1)



class FormField:
    ''' Object that represents a single field as part of a form'''

    def __init__(self, name, kind, value, selector, browser):
        '''
        Inputs:
            - name (str): name of the field (its key in the form template)
            - kind (str): one of 'text', 'radio', 'checkbox', 'select', 'multiselect'
            - value: the value to fill in (interpretation depends on kind)
            - selector (str): css selector handed to locator
            - browser: selenium browser driver
        '''
        self.name = name
        self.kind = kind
        self.value = value
        self.selector = selector
        self.browser = browser

    def locate(self, get_many=False):
        ''' Returns the web element (get_many is forwarded to locator)'''
        return locator(self.selector, self.browser, get_many)

    def _multiselect_values(self):
        '''
        Normalise self.value into the set of option labels to select

        Accepts a comma-delimited string, a list/tuple/set of values or a
        single scalar; every entry is stringified and stripped.
        '''
        if self.value is None:
            return set()
        if isinstance(self.value, str):
            raw = self.value.split(',')
        elif isinstance(self.value, (list, tuple, set, frozenset)):
            raw = self.value
        else:
            raw = [self.value]
        return {str(v).strip() for v in raw}

    def fill(self):
        ''' Enter self.value into the field on the page, according to self.kind'''
        if self.kind == 'text':
            el = self.locate()
            try:
                el.clear()
            except Exception:
                # First attempt failed (e.g. element not available yet):
                # wait, re-locate and retry once, letting a second failure raise
                time.sleep(2)
                el = self.locate()
                el.clear()
            fill_text(self.selector, self.value, self.browser)

        elif self.kind == 'radio':
            # Pick the radio input whose value attribute matches;
            # raises IndexError if no input matches
            chosen = [el for el in self.locate(get_many=True)
                      if el.get_attribute('value') == self.value][0]
            chosen.click()

        elif self.kind == 'checkbox':
            # Only click when the current state differs from the wanted one
            box = self.locate()
            if self.value != bool(box.get_property('checked')):
                box.click()

        # Selector for select should be for the select tag (not the options)
        elif self.kind == 'select':
            # Lookup kept from the original flow although its result is unused
            # NOTE(review): presumably harmless to drop - confirm against locator
            self.locate()
            # Set the value manually which is same as what happens in fill_text
            fill_text(self.selector, self.value, self.browser)

        # Locator fn for select should return array of elements
        elif self.kind == 'multiselect':
            wanted = self._multiselect_values()
            options = self.locate(get_many=True)
            # Decide on the full subset first, then click
            subset = [el for el in options if _clean_options_(el) in wanted]
            for el in subset:
                el.click()

    def __repr__(self):
        return f'''<FormField: "{self.name}" (kind:{self.kind})>'''

class FormButton:
    ''' Object that represents a button as part of a form'''

    def __init__(self, selector, browser):
        '''
        Inputs:
            - selector (str): css selector handed to locator
            - browser: selenium browser driver
        '''
        self.browser = browser
        self.selector = selector

    def locate(self, get_many=False):
        ''' Returns the web element (get_many is forwarded to locator)'''
        # get_many used to be accepted but dropped; forward it so this
        # behaves the same way as FormField.locate
        return locator(self.selector, self.browser, get_many)

class FormFiller:
    ''' Object used to fill out a webpage form '''

    def __init__(self, browser, template, fill_values):
        '''
        Inputs:
            - browser: selenium browser driver
            - template ('query', 'login', 'docket' or dict): gets template from get_template, or else manual input as dict
            - fill_values (dict): key-value pairs of (field name, value to fill)
        '''
        self.fields, self.buttons = {}, {}
        self.browser = browser

        # A string is a template name to look up; anything else is used as-is
        self.template = get_template(template) if isinstance(template, str) else template
        self.build(fill_values)

    def build(self, fill_values):
        ''' Combine the template with the fill_values to create the FormFields'''
        # Only template fields that were given a value become FormFields
        for field_name, props in self.template['fields'].items():
            if field_name in fill_values:
                self.fields[field_name] = FormField(
                    field_name, props['kind'], fill_values[field_name],
                    props['selector'], self.browser)

        for button_name, selector in self.template['buttons'].items():
            self.buttons[button_name] = FormButton(selector, self.browser)

    def fill(self):
        ''' Fill all field values in the form'''
        for field in self.fields.values():
            field.fill()

        # If the form has a pre-submit method, execute it
        if 'pre_submit' in self.template:
            self.template['pre_submit'](self)

    def submit(self):
        ''' Click the form submit button '''
        self.buttons['submit'].locate().click()



def config_builder(tmp):
'''
CLI to generate a config file/dict
Expand Down
Loading

0 comments on commit 87ab207

Please sign in to comment.