Skip to content

Commit

Permalink
minor tweaks to scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
nadahlberg committed Sep 29, 2022
1 parent 4e1ae07 commit 36b0eb4
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 10 deletions.
2 changes: 1 addition & 1 deletion code/downloader/scrapers.py
Original file line number Diff line number Diff line change
Expand Up @@ -612,7 +612,7 @@ def pull_case(self, case, new_member_list_seen):
download_url = self.browser.current_url
page_source = self.browser.page_source
if self.court == 'psc':
cost = 0
cost = 0
else:
cost = float(ftools.parse_transaction_history(page_source)['cost'])

Expand Down
11 changes: 5 additions & 6 deletions code/parsers/parse_pacer.py
Original file line number Diff line number Diff line change
Expand Up @@ -877,11 +877,9 @@ def case_runner(case, output_dir, court, debug, force_rerun, count, member_df, l
'''
Case parser management
'''
# Get the output path (??? seems like it's trying to make sure to use the new get_expected_path)
# Get the output path
case_fname = Path(case['docket_paths'][0]).stem
pacer_dir = Path(output_dir).resolve().parent.parent
outname = ftools.get_expected_path(ucid=case['ucid'], pacer_path=pacer_dir)
# outname = Path(output_dir) / f"{case_fname}.json"
outname = ftools.get_expected_path(ucid=case['ucid'], manual_subdir_path=output_dir)

if force_rerun or not outname.exists(): # Check whether the output file exists already
case_data = process_html_file(case, member_df, court = court)
Expand Down Expand Up @@ -999,7 +997,7 @@ def parse(input_dir, output_dir, summaries_dir, court=None, all_courts=False, de
@click.command()
@click.argument('input-dir')
@click.option('--output-dir', '-o', default=None, type=click.Path(exists=True, file_okay=False),
help="Directory to place parsed output, if none provided defaults to INPUT_DIR/json ")
help="Directory to place parsed output, if none provided defaults to INPUT_DIR/../json ")
@click.option('--summaries-dir', '-s', default=None, type=click.Path(exists=True, file_okay=False),
help="Directory to place parsed output, if none provided defaults to INPUT_DIR/summaries ")
@click.option('--court', '-c', default=None,
Expand All @@ -1023,4 +1021,5 @@ def parser(**kwargs ):
parse(**kwargs)

if __name__ == '__main__':
parser()
parser()

11 changes: 8 additions & 3 deletions code/support/fhandle_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -573,13 +573,14 @@ def _get_expected_path_old_(ucid, subdir='json', pacer_path=settings.PACER_PATH,

return pacer_path / court / subdir / fname

def get_expected_path(ucid, subdir='json', pacer_path=settings.PACER_PATH, def_no=None, update_ind=None):
def get_expected_path(ucid, subdir='json', manual_subdir_path=None, pacer_path=settings.PACER_PATH, def_no=None, update_ind=None):
'''
Find the expected path of case-level data files
Inputs:
- ucid (str): case ucid
- subdir (str): the subdirectory to look in (see scrapers.PacerCourtDir), one of 'html', 'json', 'docs', 'summaries', 'members'
- manual_subdir_path (str): (mainly for the parser) if provided, the expected path is built under this directory (as manual_subdir_path/year_part/fname) instead of under pacer_path/court/subdir
- pacer_path (Path): path to pacer data directory
- def_no (str or int): the defendant number, if specifying a defendant-specific docket
- update_ind (int): update index (for html files), passed through to generate_docket_filename
Expand All @@ -591,11 +592,15 @@ def get_expected_path(ucid, subdir='json', pacer_path=settings.PACER_PATH, def_n
court, case_no = ucid_data['court'], ucid_data['case_no']
year_part = decompose_caseno(case_no)['year']

# Build the filepath
# Build the filename
ext = SUBDIR_EXTENSIONS[subdir]
fname = generate_docket_filename(case_no, ext=ext, def_no=def_no, ind=update_ind)

return pacer_path / court / subdir / year_part / fname
# Build the full filepath
if manual_subdir_path:
return Path(manual_subdir_path).resolve() / year_part / fname
else:
return pacer_path / court / subdir / year_part / fname

def filename_to_ucid(fname, court):
fpath = Path(fname)
Expand Down

0 comments on commit 36b0eb4

Please sign in to comment.