feat(scrapers.update_from_text): new command
Helps solve: freelawproject/juriscraper#858

- New command to re-run Site.extract_from_text over downloaded opinions
- Can filter by Docket.court_id, OpinionCluster.date_filed and OpinionCluster.precedential_status
- Updates tasks.update_document_from_text to return information for logging purposes
- Updates test_opinion_scraper to add a Site.extract_from_text method
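
A minimal invocation sketch (the module path is the example from the command's own help text; the dates and status are assumed values). The same flags work from the shell via `manage.py update_from_text`:

from django.core.management import call_command

# Re-run Site.extract_from_text over a sample window of ca1 opinions
call_command(
    "update_from_text",
    juriscraper_module="juriscraper.opinions.united_states.federal_appellate.ca1",
    date_filed_gte="2020/06/01",
    date_filed_lte="2020/12/31",
    cluster_status="Published",
)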
grossir committed Oct 1, 2024
1 parent 1830fb0 commit d871b4a
Showing 4 changed files with 297 additions and 5 deletions.
159 changes: 159 additions & 0 deletions cl/scrapers/management/commands/update_from_text.py
@@ -0,0 +1,159 @@
from datetime import datetime

from django.db import transaction

from cl.lib.command_utils import VerboseCommand, logger
from cl.scrapers.tasks import update_document_from_text
from cl.search.models import PRECEDENTIAL_STATUS, Opinion, OpinionCluster


def update_from_text(
    opinion: Opinion, juriscraper_module: str, stats: dict[str, int]
):
    """Calls `update_document_from_text` as used in the scraper flow,
    then calls .save() on each model that changed

    :param opinion: the Opinion on which to apply extract_from_text
    :param juriscraper_module: the scraper module path
    :param stats: dict to accumulate counts for reporting. Modified in place
    :return: None
    """
    with transaction.atomic():
        changes = update_document_from_text(opinion, juriscraper_module)
        if not changes:
            logger.info("Did not get any metadata for opinion %s", opinion.id)
            return

        logger.info("Processing opinion %s", opinion.id)

        # Check if changes exist before saving, to prevent unnecessary DB queries
        if changes.get("Docket"):
            opinion.cluster.docket.save()
            logger.debug(
                "Docket %s updated with data %s",
                opinion.cluster.docket.id,
                changes["Docket"],
            )
            stats["Docket"] += 1

        if changes.get("OpinionCluster"):
            opinion.cluster.save()
            logger.debug(
                "OpinionCluster %s updated with data %s",
                opinion.cluster.id,
                changes["OpinionCluster"],
            )
            stats["OpinionCluster"] += 1

        if changes.get("Opinion"):
            opinion.save()
            logger.debug("Opinion updated with data %s", changes["Opinion"])
            stats["Opinion"] += 1

        if changes.get("Citation"):
            if changes["Citation"].get("citation_created"):
                logger.info(
                    "Citation created with data %s", changes["Citation"]
                )
                stats["Citation"] += 1
            else:
                logger.debug(
                    "Citation not created. Data %s", changes["Citation"]
                )


class Command(VerboseCommand):
    help = """Updates objects by running Site.extract_from_text
    over extracted content found on Opinion.plain_text or Opinion.html.
    If `--opinion-ids` is used, other filters will be ignored.
    If it is not, both date filters are required, to prevent triggering
    unwanted reprocessing of a whole court's dataset.
    Recommended use is to run over a sample of the target time period
    and check that the updates to Docket, OpinionCluster, Opinion and
    Citation are as expected.
    """
    stats = {}  # assigned at the end of a command run, for testing

    def add_arguments(self, parser):
        parser.add_argument(
            "--juriscraper-module",
            help="""The Juriscraper file which contains the
            `extract_from_text` method to be used. The `court_id`
            will be deduced from this. Example:
            juriscraper.opinions.united_states.federal_appellate.ca1
            """,
            required=True,
        )
        parser.add_argument(
            "--opinion-ids",
            nargs="+",
            type=int,
            help="""The Opinion ids to re-process.
            May be more than one. If this argument is used,
            other filters will be ignored""",
        )
        parser.add_argument(
            "--date-filed-gte",
            default="",
            help="""A filter value in %%Y/%%m/%%d format.
            OpinionCluster.date_filed must be greater than or equal""",
        )
        parser.add_argument(
            "--date-filed-lte",
            default="",
            help="""A filter value in %%Y/%%m/%%d format.
            OpinionCluster.date_filed must be less than or equal""",
        )
        parser.add_argument(
            "--cluster-status",
            default="",
            choices=[value for value, name in PRECEDENTIAL_STATUS.NAMES],
            help="""A value of OpinionCluster.precedential_status. To be
            used for filtering the Opinions to be processed
            """,
        )

    def handle(self, *args, **options):
        super().handle(*args, **options)
        juriscraper_module = options["juriscraper_module"]
        # For aggregate reporting
        stats = {"Docket": 0, "OpinionCluster": 0, "Opinion": 0, "Citation": 0}

        if options["opinion_ids"]:
            opinions = Opinion.objects.filter(id__in=options["opinion_ids"])
            for op in opinions:
                update_from_text(op, juriscraper_module, stats)

            logger.info("Modified objects counts: %s", stats)
            self.stats = stats
            return

        if not (options["date_filed_gte"] and options["date_filed_lte"]):
            raise ValueError(
                "Both `--date-filed-gte` and `--date-filed-lte` arguments should have values"
            )

        court_id = juriscraper_module.split(".")[-1].split("_")[0]
        gte_date = datetime.strptime(options["date_filed_gte"], "%Y/%m/%d")
        lte_date = datetime.strptime(options["date_filed_lte"], "%Y/%m/%d")
        query = {
            "docket__court_id": court_id,
            "date_filed__gte": gte_date,
            "date_filed__lte": lte_date,
        }

        if options["cluster_status"]:
            query["precedential_status"] = options["cluster_status"]

        qs = OpinionCluster.objects.filter(**query).prefetch_related(
            "sub_opinions"
        )
        for cluster in qs:
            opinions = cluster.sub_opinions.all()
            for op in opinions:
                update_from_text(op, juriscraper_module, stats)

        logger.info("Modified objects counts: %s", stats)
        self.stats = stats
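
For reference, a worked example of the court-id deduction and date parsing done in handle() above, using the module path from the help text and assumed dates:

from datetime import datetime

juriscraper_module = "juriscraper.opinions.united_states.federal_appellate.ca1"
court_id = juriscraper_module.split(".")[-1].split("_")[0]  # -> "ca1"
gte_date = datetime.strptime("2020/06/01", "%Y/%m/%d")  # -> datetime(2020, 6, 1)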
12 changes: 8 additions & 4 deletions cl/scrapers/tasks.py
@@ -39,7 +39,7 @@

def update_document_from_text(
    opinion: Opinion, juriscraper_module: str = ""
) -> dict:
    """Extract additional metadata from document text
    We use this code with BIA decisions. Previously Tax.
@@ -54,12 +54,13 @@ def update_document_from_text(
    :param opinion: Opinion object
    :param juriscraper_module: full module to get Site object
    :return: the extracted data dictionary
    """
    court = opinion.cluster.docket.court.pk
    site = get_scraper_object_by_name(court, juriscraper_module)
    if site is None:
        logger.debug("No site found %s", juriscraper_module)
        return {}

    metadata_dict = site.extract_from_text(opinion.plain_text or opinion.html)
    for model_name, data in metadata_dict.items():
@@ -70,14 +71,17 @@
            opinion.cluster.__dict__.update(data)
        elif model_name == "Citation":
            data["cluster_id"] = opinion.cluster_id
            _, citation_created = ModelClass.objects.get_or_create(**data)
            metadata_dict["Citation"]["citation_created"] = citation_created
        elif model_name == "Opinion":
            opinion.__dict__.update(data)
        else:
            raise NotImplementedError(
                f"Object type of {model_name} not yet supported."
            )

    return metadata_dict


@app.task(
    bind=True,
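A sketch of the dictionary update_document_from_text now returns, with illustrative values only. The keys follow the model names handled in the loop above; cluster_id is injected into the Citation data before get_or_create, and citation_created records whether a new row was created:

# Illustrative only: a possible return value of update_document_from_text
changes = {
    "Docket": {"docket_number": "2020-12"},
    "OpinionCluster": {"disposition": "Affirmed"},
    "Citation": {
        "volume": "2020",
        "reporter": "VT",
        "page": "11",
        "type": 8,
        "cluster_id": 1,  # injected before get_or_create
        "citation_created": True,  # outcome of get_or_create
    },
}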
21 changes: 21 additions & 0 deletions cl/scrapers/test_assets/test_opinion_scraper.py
@@ -1,3 +1,4 @@
import re
from datetime import datetime
from os.path import join

@@ -53,3 +54,23 @@ def _get_nature_of_suit(self):
    def _get_judges(self):
        path = "//judge/text()"
        return list(self.html.xpath(path))

    def extract_from_text(self, scraped_text):
        metadata = {}
        docket_regex = r"Docket Number: (?P<docket>\d+-\d+)"
        disposition_regex = r"Disposition: (?P<disposition>\w+)"
        citation_regex = r"(?P<volume>20\d{2}) (?P<reporter>VT) (?P<page>\d+)"
        if docket_match := re.search(docket_regex, scraped_text):
            metadata["Docket"] = {
                "docket_number": docket_match.group("docket")
            }

        if disposition_match := re.search(disposition_regex, scraped_text):
            metadata["OpinionCluster"] = {
                "disposition": disposition_match.group("disposition")
            }

        if citation_match := re.search(citation_regex, scraped_text):
            metadata["Citation"] = {**citation_match.groupdict(), "type": 8}

        return metadata
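
For instance, running this method over the sample text used in the tests yields (the output follows from the regexes above):

site = Site()
text = "Docket Number: 2020-12\nDisposition: Affirmed\n2020 VT 11"
print(site.extract_from_text(text))
# {'Docket': {'docket_number': '2020-12'},
#  'OpinionCluster': {'disposition': 'Affirmed'},
#  'Citation': {'volume': '2020', 'reporter': 'VT', 'page': '11', 'type': 8}}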
110 changes: 109 additions & 1 deletion cl/scrapers/tests.py
@@ -1,5 +1,5 @@
import os
from datetime import date, datetime, timedelta
from http import HTTPStatus
from pathlib import Path
from unittest import TestCase, mock
@@ -30,6 +30,7 @@
    cl_back_scrape_citations,
    cl_scrape_opinions,
    cl_scrape_oral_arguments,
    update_from_text,
)
from cl.scrapers.models import UrlHash
from cl.scrapers.tasks import extract_doc_content, process_audio_file
@@ -867,3 +868,110 @@ def test_federal_jurisdictions(self):
        self.assertEqual(
            docket, self.ca2_docket, "Should match using docket number core"
        )


class UpdateFromTextCommandTest(TestCase):
    """Test the input processing and DB querying for the command"""

    def setUp(self):
        self.vt = CourtFactory(id="vt")
        self.sc = CourtFactory(id="sc")
        self.docket_sc = DocketFactory(court=self.sc, docket_number="20")

        # Different dates, statuses and courts to test command behaviour
        self.opinion_2020 = OpinionFactory(
            cluster=OpinionClusterFactory(
                docket=DocketFactory(court=self.vt, docket_number="12"),
                date_filed=date(2020, 6, 1),
                precedential_status="Published",
            ),
            plain_text="""Docket Number: 2020-12
            Disposition: Affirmed
            2020 VT 11""",
        )
        self.opinion_2020_unpub = OpinionFactory(
            cluster=OpinionClusterFactory(
                docket=DocketFactory(court=self.vt, docket_number="13"),
                date_filed=date(2020, 7, 1),
                precedential_status="Unpublished",
            ),
            plain_text="Docket Number: 2020-13\nDisposition: Affirmed",
        )

        self.opinion_sc = OpinionFactory(
            cluster=OpinionClusterFactory(
                docket=self.docket_sc,
                date_filed=date(2021, 6, 1),
                precedential_status="Published",
            ),
            plain_text="Some text with no matches",
            id=101,
        )

        self.opinion_2022 = OpinionFactory(
            cluster=OpinionClusterFactory(
                docket=DocketFactory(court=self.vt, docket_number="13"),
                date_filed=date(2022, 6, 1),
                precedential_status="Unpublished",
            ),
            id=100,
            plain_text="Docket Number: 2022-13\n2022 VT 11",
        )

    def test_inputs(self):
        """Do all command inputs work properly?"""

        # will target a single opinion, for which extract_from_text
        # extracts no metadata. No object should be updated
        cmd = update_from_text.Command()
        with mock.patch(
            "cl.scrapers.tasks.get_scraper_object_by_name",
            return_value=test_opinion_scraper.Site(),
        ):
            cmd.handle(juriscraper_module="somepath.sc", opinion_ids=[101])

        self.assertFalse(
            any(cmd.stats.values()), "No object should be modified"
        )

        # will target 1 opinion; there are 2 in the time period
        # and 3 for the court
        with mock.patch(
            "cl.scrapers.tasks.get_scraper_object_by_name",
            return_value=test_opinion_scraper.Site(),
        ):
            update_from_text.Command().handle(
                juriscraper_module="somepath.vt",
                opinion_ids=[],
                date_filed_gte="2020/06/01",
                date_filed_lte="2021/06/01",
                cluster_status="Published",
            )

        # Test that objects were actually updated / created
        self.assertEqual(
            Citation.objects.filter(cluster=self.opinion_2020.cluster).count(),
            1,
            "There should be a single citation for this cluster",
        )
        self.opinion_2020.refresh_from_db()
        self.opinion_2020.cluster.refresh_from_db()
        self.opinion_2020.cluster.docket.refresh_from_db()
        self.assertEqual(
            self.opinion_2020.cluster.disposition,
            "Affirmed",
            "OpinionCluster.disposition was not updated",
        )
        self.assertEqual(
            self.opinion_2020.cluster.docket.docket_number,
            "2020-12",
            "Docket.docket_number was not updated",
        )

        # Check that other objects in the time period and court
        # were not modified, meaning the filters worked
        self.assertEqual(
            self.opinion_2020_unpub.cluster.docket.docket_number,
            "13",
            "Unpublished docket should not be modified",
        )
