Skip to content

Commit

Permalink
fix(sd): update extract_from_text
Browse files Browse the repository at this point in the history
Solves #1292

Now parsing: disposition, docket_number and judges
  • Loading branch information
grossir committed Jan 10, 2025
1 parent bb6a4e0 commit 466718d
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 15 deletions.
64 changes: 51 additions & 13 deletions juriscraper/opinions/united_states/state/sd.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,21 @@

class Site(OpinionSiteLinear):
start_year = 1996
# Maps the three-letter judge initials that close the opinion header
# (e.g. the "MES" in "#30018-a-MES") to the judge's full name.
# `extract_from_text` looks initials up here and logs an error for any
# initials that are missing, so new judges must be added to this table.
initials_to_judges = {
"MES": "Mark E. Salter",
"SPM": "Scott P. Myren",
"SRJ": "Steven R. Jensen",
"PJD": "Patricia J. DeVaney",
"JMK": "Janine M. Kern",
}
# Maps the raw, lowercase disposition abbreviation found between the docket
# number and the judge initials in the opinion header (e.g. the "a" in
# "#30018-a-MES") to a human-readable disposition. `extract_from_text` does
# an exact-match lookup, so abbreviations absent from this table yield no
# disposition.
# NOTE(review): the "aff in pt & vacate" entry maps to "Affirm and vacate",
# dropping the "in part" – confirm this is intended.
disposition_mapper = {
"dismiss": "Dismiss",
"a": "Affirmed",
"r": "Reverse and remand",
"aff in pt & rev in pt": "Affirm in part and reverse in part",
"aff in pt, vacate, & rem in pt": "Affirm in part, vacate and remand in part",
"aff in pt & vacate": "Affirm and vacate", # https://www.courtlistener.com/opinion/9502826/state-v-scott/pdf/
}

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
Expand All @@ -41,10 +56,6 @@ def _process_html(self) -> None:
if not cite:
continue

# https://ujs.sd.gov/uploads/sc/opinions/2928369ef9a6.pdf
# We abstract out the first part of the docket number here
# And process the full docket number in the `extract_from_text` method
# Called after the file has been downloaded.
url = row.xpath(".//td[2]/a/@href")[0]
docket = url.split("/")[-1][:5]
self.cases.append(
Expand Down Expand Up @@ -137,17 +148,44 @@ def make_backscrape_iterable(self, kwargs: Dict) -> None:
def extract_from_text(self, scraped_text: str) -> Dict[str, Any]:
    """Extract docket number(s), judges and disposition from the header.

    The opinion text starts with a header such as
    ``#29901-aff in pt & rev in pt-PJD & SRJ``: one or more ``#``-prefixed
    docket numbers, an optional lowercase disposition abbreviation, and
    the initials of one or more judges.

    Some edge cases:
    - case with 2 judges https://www.courtlistener.com/opinion/9456271/mcgee-v-spencer-quarries-inc/pdf/
    - case without disposition: https://www.courtlistener.com/opinion/10121701/discipline-of-ravnsborg/pdf/
    - case without a judge string https://www.courtlistener.com/opinion/9474051/in-the-matter-of-the-interpretation-of-south-dakota-constitution-and-state/pdf/

    :param scraped_text: The content of the document downloaded
    :return: Metadata to be added to the case. Only the pieces that could
        be parsed are present; an unparseable header yields an empty dict
        instead of raising.
    """
    metadata: Dict[str, Any] = {}

    # The header is the first text on the page, so only the start of the
    # document is inspected to avoid false matches in the opinion body.
    target_text = scraped_text[:100]

    # One or more docket numbers, each prefixed by "#".
    dockets = re.findall(r"(?<=#)\d+", target_text)
    if dockets:
        metadata["Docket"] = {"docket_number": ", ".join(dockets)}

    # Judge initials: "-XXX", optionally followed by ", XXX" / "& XXX".
    judge_regex = r"-[A-Z]{3}(\s*[,&]\s+[A-Z]{3})*"
    if judges_match := re.search(judge_regex, target_text):
        judges = []
        for initial in re.findall(r"[A-Z]{3}", judges_match.group(0)):
            if judge := self.initials_to_judges.get(initial):
                judges.append(judge)
            else:
                # Catch updates
                logger.error(
                    "Judge initials not mapped to full name %s", initial
                )

        if judges:
            metadata["OpinionCluster"] = {"judges": ", ".join(judges)}

    # Disposition: the lowercase abbreviation enclosed by two hyphens.
    disposition_regex = r"(?<=-)[a-z,&\s]+(?=-)"
    if disposition_match := re.search(disposition_regex, target_text):
        raw_disposition = disposition_match.group(0)
        if disp := self.disposition_mapper.get(raw_disposition):
            # Merge with the judges entry when there is one.
            metadata.setdefault("OpinionCluster", {})["disposition"] = disp

    return metadata
25 changes: 23 additions & 2 deletions tests/local/test_ScraperExtractFromTextTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,29 @@ class ScraperExtractFromText(unittest.TestCase):
],
"juriscraper.opinions.united_states.state.sd": [
(
"""#30018-a-MES\n2023 S.D. 4""",
{"Docket": {"docket_number": "#30018-a-MES"}},
# https://www.courtlistener.com/opinion/9456271/mcgee-v-spencer-quarries-inc/pdf/
"""#29901-aff in pt & rev in pt-PJD & SRJ\n2023 S.D. 66\nIN THE SUPREME COURT""",
{
"Docket": {"docket_number": "29901"},
"OpinionCluster": {
"disposition": "Affirm in part and reverse in part",
"judges": "Patricia J. DeVaney, Steven R. Jensen",
},
},
"""#30354-SRJ\n2024 S.D. 58\nIN THE SUPREME COURT\nOF THE""",
{
"Docket": {"docket_number": "30354"},
"OpinionCluster": {"judges": "Steven R. Jensen"},
},
# https://www.courtlistener.com/opinion/9406747/estate-of-beadle/?q=court_id%3Asd&page=8
"""#30086, #30094-r-SPM\n2023 S.D. 26\nIN THE SUPREME COURT\nOF THE\nSTATE OF SOUTH DAKOTA""",
{
"Docket": {"docket_number": "30086, 30094"},
"OpinionCluster": {
"judges": "Scott P. Myren",
"disposition": "Reverse and remand",
},
},
),
],
"juriscraper.opinions.united_states.territories.nmariana": [
Expand Down

0 comments on commit 466718d

Please sign in to comment.