feat(scrapers.update_from_text): new command
Helps solve: freelawproject/juriscraper#858

- New command to re-run Site.extract_from_text over downloaded opinions
- Can filter by Docket.court_id, OpinionCluster.date_filed and OpinionCluster.precedential_status
- Updates tasks.update_document_from_text to return information for logging purposes
- Updates test_opinion_scraper to add a Site.extract_from_text method
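
A minimal invocation sketch (the module path is the example from the command's own help text; the dates and status are assumed values). The same flags work from the shell via `manage.py update_from_text`:

from django.core.management import call_command

# Re-run Site.extract_from_text over a sample window of ca1 opinions
call_command(
    "update_from_text",
    juriscraper_module="juriscraper.opinions.united_states.federal_appellate.ca1",
    date_filed_gte="2020/06/01",
    date_filed_lte="2020/12/31",
    cluster_status="Published",
)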
grossir committed Oct 1, 2024
1 parent 1830fb0 commit d871b4a
Showing 4 changed files with 297 additions and 5 deletions.
159 changes: 159 additions & 0 deletions cl/scrapers/management/commands/update_from_text.py
@@ -0,0 +1,159 @@
from datetime import datetime

from django.db import transaction

from cl.lib.command_utils import VerboseCommand, logger
from cl.scrapers.tasks import update_document_from_text
from cl.search.models import PRECEDENTIAL_STATUS, Opinion, OpinionCluster


def update_from_text(
    opinion: Opinion, juriscraper_module: str, stats: dict[str, int]
):
    """Calls `update_document_from_text` as used in the scraper flow,
    then calls .save() on each model that changed

    :param opinion: the Opinion on which to apply extract_from_text
    :param juriscraper_module: the scraper module path
    :param stats: dict to accumulate counts for reporting. Modified in place
    :return: None
    """
    with transaction.atomic():
        changes = update_document_from_text(opinion, juriscraper_module)
        if not changes:
            logger.info("Did not get any metadata for opinion %s", opinion.id)
            return

        logger.info("Processing opinion %s", opinion.id)

        # Check if changes exist before saving, to prevent unnecessary DB queries
        if changes.get("Docket"):
            opinion.cluster.docket.save()
            logger.debug(
                "Docket %s updated with data %s",
                opinion.cluster.docket.id,
                changes["Docket"],
            )
            stats["Docket"] += 1

        if changes.get("OpinionCluster"):
            opinion.cluster.save()
            logger.debug(
                "OpinionCluster %s updated with data %s",
                opinion.cluster.id,
                changes["OpinionCluster"],
            )
            stats["OpinionCluster"] += 1

        if changes.get("Opinion"):
            opinion.save()
            logger.debug("Opinion updated with data %s", changes["Opinion"])
            stats["Opinion"] += 1

        if changes.get("Citation"):
            if changes["Citation"].get("citation_created"):
                logger.info(
                    "Citation created with data %s", changes["Citation"]
                )
                stats["Citation"] += 1
            else:
                logger.debug(
                    "Citation not created. Data %s", changes["Citation"]
                )


class Command(VerboseCommand):
    help = """Updates objects by running Site.extract_from_text
    over extracted content found on Opinion.plain_text or Opinion.html.
    If `--opinion-ids` is used, other filters will be ignored.
    If it is not, both date filters are required, to prevent triggering
    unwanted reprocessing of a whole court's dataset.
    Recommended use is to run over a sample of the target time period
    and check that the updates to Docket, OpinionCluster, Opinion and
    Citation are as expected.
    """
    stats = {}  # assigned at the end of a command run, for testing

    def add_arguments(self, parser):
        parser.add_argument(
            "--juriscraper-module",
            help="""The Juriscraper file which contains the
            `extract_from_text` method to be used. The `court_id`
            will be deduced from this. Example:
            juriscraper.opinions.united_states.federal_appellate.ca1
            """,
            required=True,
        )
        parser.add_argument(
            "--opinion-ids",
            nargs="+",
            type=int,
            help="""The Opinion ids to re-process.
            May be more than one. If this argument is used,
            other filters will be ignored""",
        )
        parser.add_argument(
            "--date-filed-gte",
            default="",
            help="""A filter value in %%Y/%%m/%%d format.
            OpinionCluster.date_filed must be greater than or equal""",
        )
        parser.add_argument(
            "--date-filed-lte",
            default="",
            help="""A filter value in %%Y/%%m/%%d format.
            OpinionCluster.date_filed must be less than or equal""",
        )
        parser.add_argument(
            "--cluster-status",
            default="",
            choices=[value for value, name in PRECEDENTIAL_STATUS.NAMES],
            help="""A value of OpinionCluster.precedential_status. To be
            used for filtering the Opinions to be processed
            """,
        )

    def handle(self, *args, **options):
        super().handle(*args, **options)
        juriscraper_module = options["juriscraper_module"]
        # For aggregate reporting
        stats = {"Docket": 0, "OpinionCluster": 0, "Opinion": 0, "Citation": 0}

        if options["opinion_ids"]:
            opinions = Opinion.objects.filter(id__in=options["opinion_ids"])
            for op in opinions:
                update_from_text(op, juriscraper_module, stats)

            logger.info("Modified objects counts: %s", stats)
            self.stats = stats
            return

        if not (options["date_filed_gte"] and options["date_filed_lte"]):
            raise ValueError(
                "Both `--date-filed-gte` and `--date-filed-lte` arguments should have values"
            )

        court_id = juriscraper_module.split(".")[-1].split("_")[0]
        gte_date = datetime.strptime(options["date_filed_gte"], "%Y/%m/%d")
        lte_date = datetime.strptime(options["date_filed_lte"], "%Y/%m/%d")
        query = {
            "docket__court_id": court_id,
            "date_filed__gte": gte_date,
            "date_filed__lte": lte_date,
        }

        if options["cluster_status"]:
            query["precedential_status"] = options["cluster_status"]

        qs = OpinionCluster.objects.filter(**query).prefetch_related(
            "sub_opinions"
        )
        for cluster in qs:
            opinions = cluster.sub_opinions.all()
            for op in opinions:
                update_from_text(op, juriscraper_module, stats)

        logger.info("Modified objects counts: %s", stats)
        self.stats = stats
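
For reference, a worked example of the court-id deduction and date parsing done in handle() above, using the module path from the help text and assumed dates:

from datetime import datetime

juriscraper_module = "juriscraper.opinions.united_states.federal_appellate.ca1"
court_id = juriscraper_module.split(".")[-1].split("_")[0]  # -> "ca1"
gte_date = datetime.strptime("2020/06/01", "%Y/%m/%d")  # -> datetime(2020, 6, 1)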
12 changes: 8 additions & 4 deletions cl/scrapers/tasks.py
@@ -39,7 +39,7 @@

def update_document_from_text(
    opinion: Opinion, juriscraper_module: str = ""
) -> dict:
    """Extract additional metadata from document text
    We use this code with BIA decisions. Previously Tax.
@@ -54,12 +54,13 @@ def update_document_from_text(
    :param opinion: Opinion object
    :param juriscraper_module: full module to get Site object
    :return: the extracted data dictionary
    """
    court = opinion.cluster.docket.court.pk
    site = get_scraper_object_by_name(court, juriscraper_module)
    if site is None:
        logger.debug("No site found %s", juriscraper_module)
        return {}

    metadata_dict = site.extract_from_text(opinion.plain_text or opinion.html)
    for model_name, data in metadata_dict.items():
@@ -70,14 +71,17 @@
            opinion.cluster.__dict__.update(data)
        elif model_name == "Citation":
            data["cluster_id"] = opinion.cluster_id
            _, citation_created = ModelClass.objects.get_or_create(**data)
            metadata_dict["Citation"]["citation_created"] = citation_created
        elif model_name == "Opinion":
            opinion.__dict__.update(data)
        else:
            raise NotImplementedError(
                f"Object type of {model_name} not yet supported."
            )

    return metadata_dict


@app.task(
    bind=True,
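A sketch of the dictionary update_document_from_text now returns, with illustrative values only. The keys follow the model names handled in the loop above; cluster_id is injected into the Citation data before get_or_create, and citation_created records whether a new row was created:

# Illustrative only: a possible return value of update_document_from_text
changes = {
    "Docket": {"docket_number": "2020-12"},
    "OpinionCluster": {"disposition": "Affirmed"},
    "Citation": {
        "volume": "2020",
        "reporter": "VT",
        "page": "11",
        "type": 8,
        "cluster_id": 1,  # injected before get_or_create
        "citation_created": True,  # outcome of get_or_create
    },
}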
21 changes: 21 additions & 0 deletions cl/scrapers/test_assets/test_opinion_scraper.py
@@ -1,3 +1,4 @@
import re
from datetime import datetime
from os.path import join

@@ -53,3 +54,23 @@ def _get_nature_of_suit(self):
    def _get_judges(self):
        path = "//judge/text()"
        return list(self.html.xpath(path))

    def extract_from_text(self, scraped_text):
        metadata = {}
        docket_regex = r"Docket Number: (?P<docket>\d+-\d+)"
        disposition_regex = r"Disposition: (?P<disposition>\w+)"
        citation_regex = r"(?P<volume>20\d{2}) (?P<reporter>VT) (?P<page>\d+)"
        if docket_match := re.search(docket_regex, scraped_text):
            metadata["Docket"] = {
                "docket_number": docket_match.group("docket")
            }

        if disposition_match := re.search(disposition_regex, scraped_text):
            metadata["OpinionCluster"] = {
                "disposition": disposition_match.group("disposition")
            }

        if citation_match := re.search(citation_regex, scraped_text):
            metadata["Citation"] = {**citation_match.groupdict(), "type": 8}

        return metadata
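
For instance, running this method over the sample text used in the tests yields (the output follows from the regexes above):

site = Site()
text = "Docket Number: 2020-12\nDisposition: Affirmed\n2020 VT 11"
print(site.extract_from_text(text))
# {'Docket': {'docket_number': '2020-12'},
#  'OpinionCluster': {'disposition': 'Affirmed'},
#  'Citation': {'volume': '2020', 'reporter': 'VT', 'page': '11', 'type': 8}}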
110 changes: 109 additions & 1 deletion cl/scrapers/tests.py
@@ -1,5 +1,5 @@
import os
from datetime import date, datetime, timedelta
from http import HTTPStatus
from pathlib import Path
from unittest import TestCase, mock
@@ -30,6 +30,7 @@
    cl_back_scrape_citations,
    cl_scrape_opinions,
    cl_scrape_oral_arguments,
    update_from_text,
)
from cl.scrapers.models import UrlHash
from cl.scrapers.tasks import extract_doc_content, process_audio_file
@@ -867,3 +868,110 @@ def test_federal_jurisdictions(self):
        self.assertEqual(
            docket, self.ca2_docket, "Should match using docket number core"
        )


class UpdateFromTextCommandTest(TestCase):
    """Test the input processing and DB querying for the command"""

    def setUp(self):
        self.vt = CourtFactory(id="vt")
        self.sc = CourtFactory(id="sc")
        self.docket_sc = DocketFactory(court=self.sc, docket_number="20")

        # Different dates, statuses and courts to test command behaviour
        self.opinion_2020 = OpinionFactory(
            cluster=OpinionClusterFactory(
                docket=DocketFactory(court=self.vt, docket_number="12"),
                date_filed=date(2020, 6, 1),
                precedential_status="Published",
            ),
            plain_text="""Docket Number: 2020-12
            Disposition: Affirmed
            2020 VT 11""",
        )
        self.opinion_2020_unpub = OpinionFactory(
            cluster=OpinionClusterFactory(
                docket=DocketFactory(court=self.vt, docket_number="13"),
                date_filed=date(2020, 7, 1),
                precedential_status="Unpublished",
            ),
            plain_text="Docket Number: 2020-13\nDisposition: Affirmed",
        )

        self.opinion_sc = OpinionFactory(
            cluster=OpinionClusterFactory(
                docket=self.docket_sc,
                date_filed=date(2021, 6, 1),
                precedential_status="Published",
            ),
            plain_text="Some text with no matches",
            id=101,
        )

        self.opinion_2022 = OpinionFactory(
            cluster=OpinionClusterFactory(
                docket=DocketFactory(court=self.vt, docket_number="13"),
                date_filed=date(2022, 6, 1),
                precedential_status="Unpublished",
            ),
            id=100,
            plain_text="Docket Number: 2022-13\n2022 VT 11",
        )

    def test_inputs(self):
        """Do all command inputs work properly?"""

        # will target a single opinion, for which extract_from_text
        # extracts no metadata. No object should be updated
        cmd = update_from_text.Command()
        with mock.patch(
            "cl.scrapers.tasks.get_scraper_object_by_name",
            return_value=test_opinion_scraper.Site(),
        ):
            cmd.handle(juriscraper_module="somepath.sc", opinion_ids=[101])

        self.assertFalse(
            any(cmd.stats.values()), "No object should be modified"
        )

        # will target 1 opinion; there are 2 in the time period
        # and 3 for the court
        with mock.patch(
            "cl.scrapers.tasks.get_scraper_object_by_name",
            return_value=test_opinion_scraper.Site(),
        ):
            update_from_text.Command().handle(
                juriscraper_module="somepath.vt",
                opinion_ids=[],
                date_filed_gte="2020/06/01",
                date_filed_lte="2021/06/01",
                cluster_status="Published",
            )

        # Test that objects were actually updated / created
        self.assertEqual(
            Citation.objects.filter(cluster=self.opinion_2020.cluster).count(),
            1,
            "There should be a single citation for this cluster",
        )
        self.opinion_2020.refresh_from_db()
        self.opinion_2020.cluster.refresh_from_db()
        self.opinion_2020.cluster.docket.refresh_from_db()
        self.assertEqual(
            self.opinion_2020.cluster.disposition,
            "Affirmed",
            "OpinionCluster.disposition was not updated",
        )
        self.assertEqual(
            self.opinion_2020.cluster.docket.docket_number,
            "2020-12",
            "Docket.docket_number was not updated",
        )

        # Check that other objects in the time period and court
        # were not modified, meaning the filters worked
        self.assertEqual(
            self.opinion_2020_unpub.cluster.docket.docket_number,
            "13",
            "Unpublished docket should not be modified",
        )
