Merge pull request #4303 from grossir/scrape_citations_command

feat(cl_back_scrape_citations): command to scrape citations
freelawproject · Aug 22, 2024 · e70bdb7 · e70bdb7
2 parents 0ecfca5 + b91fbf5
commit e70bdb7
Show file tree

Hide file tree

Showing 8 changed files with 594 additions and 333 deletions.
diff --git a/cl/scrapers/DupChecker.py b/cl/scrapers/DupChecker.py
@@ -1,5 +1,9 @@
 from juriscraper.AbstractSite import logger
 
+from cl.scrapers.exceptions import (
+    ConsecutiveDuplicatesError,
+    SingleDuplicateError,
+)
 from cl.scrapers.models import UrlHash
 from cl.search.models import Court
 
@@ -19,7 +23,6 @@ def __init__(
         self.url_hash = None
         self.dup_count = 0
         self.last_found_date = None
-        self.emulate_break = False
         super().__init__(*args, **kwargs)
 
     def _increment(self, current_date):
@@ -83,29 +86,29 @@ def press_on(
         lookup_by="sha1",
     ):
         """Checks if a we have an `object_type` with identical content in the CL
-        corpus by looking up `lookup_value` in the `lookup_by` field. Depending
-        on the result of that, we either return True or False. True represents
-        the fact that the next item should be processed. False means that either
-        the item was a duplicate or that we've hit so many duplicates that we've
-        stopped checking (we hit a duplicate threshold). Either way, the caller
-        should move to the next item and try it.
+        corpus by looking up `lookup_value` in the `lookup_by` field.
 
-        The effect of this is that this emulates for loop constructs for
-        continue (False), break (False), return (True).
+        If the item is not a duplicate, we will return None, and the caller
+        will proceed normally
+
+        If the item is a duplicate, we will raise SingleDuplicateError
+
+        If the item is a duplicate following a series of duplicates greater than
+        our tolerance threshold, we will raise ConsecutiveDuplicatesError
+
+        If the item is a duplicate and the next item is from an already scraped
+        date, we will raise ConsecutiveDuplicatesError
 
         Following logic applies:
+         - if we do not have the item
+            - early return
          - if we have the item already
             - and if the next date is before this date
             - or if this is our duplicate threshold is exceeded
                 - break
             - otherwise
                 - continue
-         - if not
-            - carry on
         """
-        if self.emulate_break:
-            return False
-
         # check for a duplicate in the db.
         if lookup_by == "sha1":
             exists = object_type.objects.filter(sha1=lookup_value).exists()
@@ -116,41 +119,35 @@ def press_on(
         else:
             raise NotImplementedError("Unknown lookup_by parameter.")
 
-        if exists:
-            logger.info(
-                f"Duplicate found on date: {current_date}, with lookup value: {lookup_value}"
-            )
-            self._increment(current_date)
-
-            # If the next date in the Site object is less than (before) the
-            # current date, we needn't continue because we should already have
-            # that item.
-            if next_date:
-                already_scraped_next_date = next_date < current_date
-            else:
-                already_scraped_next_date = True
-            if not self.full_crawl:
-                if already_scraped_next_date:
-                    if self.court.pk == "mich":
-                        # Michigan sometimes has multiple occurrences of the
-                        # same case with different dates on a page.
-                        return False
-                    else:
-                        logger.info(
-                            "Next case occurs prior to when we found a "
-                            "duplicate. Court is up to date."
-                        )
-                        self.emulate_break = True
-                        return False
-                elif self.dup_count >= self.dup_threshold:
-                    logger.info(
-                        f"Found {self.dup_count} duplicates in a row. Court is up to date."
-                    )
-                    self.emulate_break = True
-                    return False
-            else:
-                # This is a full crawl. Do not emulate a break, BUT be sure to
-                # say that we shouldn't press on, since the item already exists.
-                return False
+        if not exists:
+            return
+
+        logger.info(
+            f"Duplicate found on date: {current_date}, with lookup value: {lookup_value}"
+        )
+        self._increment(current_date)
+
+        # If the next date in the Site object is less than (before) the
+        # current date, we needn't continue because we should already have
+        # that item.
+        if next_date:
+            already_scraped_next_date = next_date < current_date
+        else:
+            already_scraped_next_date = True
+
+        if not self.full_crawl:
+            if already_scraped_next_date:
+                if self.court.pk == "mich":
+                    # Michigan sometimes has multiple occurrences of the
+                    # same case with different dates on a page.
+                    raise SingleDuplicateError(logger=logger)
+                else:
+                    message = "Next case occurs prior to when we found a duplicate. Court is up to date."
+                    raise ConsecutiveDuplicatesError(message, logger=logger)
+            elif self.dup_count >= self.dup_threshold:
+                message = f"Found {self.dup_count} duplicates in a row. Court is up to date."
+                raise ConsecutiveDuplicatesError(message, logger=logger)
         else:
-            return True
+            # This is a full crawl. Do not raise a loop breaking `ConsecutiveDuplicatesError`,
+            # but say that we shouldn't press on, since the item already exists.
+            raise SingleDuplicateError(logger=logger)
diff --git a/cl/scrapers/exceptions.py b/cl/scrapers/exceptions.py
@@ -0,0 +1,82 @@
+import logging
+from typing import Optional
+
+from cl.lib.command_utils import logger
+
+
+class AutoLoggingException(Exception):
+    """Base class for exceptions that log themselves when instantiated
+
+    We log expected exceptions to better understand what went wrong.
+    Logger calls with level `logging.ERROR` are sent to Sentry, and
+    it's useful to send a `fingerprint` to force a specific grouping by court
+
+    Other `logger` calls are just printed on the console when using a
+    VerboseCommand with proper verbosity levels
+
+    Subclasses customize the defaults through the class attributes
+    `logging_level`, `message` and `logger`; any falsy constructor
+    argument falls back to the corresponding class attribute
+    """
+
+    logging_level = logging.DEBUG
+    message = ""
+    logger = logger
+
+    def __init__(
+        self,
+        message: str = "",
+        logger: Optional[logging.Logger] = None,
+        logging_level: Optional[int] = None,
+        fingerprint: Optional[list[str]] = None,
+    ):
+        """Log the message, then initialize the exception with it
+
+        :param message: text to log and to use as the exception message
+        :param logger: logger that emits the record
+        :param logging_level: level of the emitted record
+        :param fingerprint: if truthy, passed to the logger call as
+            `extra={"fingerprint": fingerprint}`
+        """
+        # Falsy arguments fall back to the class level defaults
+        message = message or self.message
+        logger = logger or self.logger
+        logging_level = logging_level or self.logging_level
+
+        log_kwargs = (
+            {"extra": {"fingerprint": fingerprint}} if fingerprint else {}
+        )
+        logger.log(logging_level, message, **log_kwargs)
+        super().__init__(message)
+
+
+class ConsecutiveDuplicatesError(AutoLoggingException):
+    """Occurs when consecutive `SingleDuplicateError` are found,
+    which may be used as a signal to break the scraping loop
+
+    `DupChecker.press_on` raises it on non-full crawls, either when the
+    duplicate count reaches the duplicate threshold or when the next
+    item's date was already scraped (except for court `mich`)
+    """
+
+    # Default log message, used when no explicit message is passed
+    message = "DupChecker emulate break triggered."
+
+
+class SingleDuplicateError(AutoLoggingException):
+    """Occurs when an opinion or audio file already exists
+    in our database
+
+    Unlike `ConsecutiveDuplicatesError`, it is not meant to break the
+    scraping loop; it only says the current item should not be processed
+    """
+
+    # Default log message, used when no explicit message is passed
+    message = "Skipping opinion due to duplicated content hash"
+
+
+class BadContentError(AutoLoggingException):
+    """Parent class for errors raised when downloading binary content
+
+    Catching this class handles any of its subclasses at once
+    """
+
+
+class UnexpectedContentTypeError(BadContentError):
+    """Occurs when the content received from the server has
+    a different content type than the ones listed on
+    site.expected_content_types
+    """
+
+    # Logged at ERROR instead of the DEBUG default of AutoLoggingException
+    logging_level = logging.ERROR
+
+
+class NoDownloadUrlError(BadContentError):
+    """Occurs when a DeferredList fetcher fails."""
+
+    # Logged at ERROR instead of the DEBUG default of AutoLoggingException
+    logging_level = logging.ERROR
+
+
+class EmptyFileError(BadContentError):
+    """Occurs when the content of the response has length 0"""
+
+    # Logged at ERROR instead of the DEBUG default of AutoLoggingException
+    logging_level = logging.ERROR
diff --git a/cl/scrapers/management/commands/cl_back_scrape_citations.py b/cl/scrapers/management/commands/cl_back_scrape_citations.py
@@ -0,0 +1,150 @@
+"""
+When opinions are first published on the courts' sites, they won't have
+all their citations assigned. Some courts will publish the citations
+in the same pages we scrape, but months later
+
+This command re-uses the (back)scraper we use to get opinions, to get
+the lagged citations and associate them with the Opinions we first
+downloaded. If we find an Opinion we don't have in the database,
+we ingest it as in a regular scrape
+"""
+
+from django.db import IntegrityError
+from django.utils.encoding import force_bytes
+
+from cl.lib.command_utils import logger
+from cl.lib.crypto import sha1
+from cl.scrapers.DupChecker import DupChecker
+from cl.scrapers.exceptions import BadContentError
+from cl.scrapers.management.commands import cl_back_scrape_opinions
+from cl.scrapers.management.commands.cl_scrape_opinions import make_citation
+from cl.scrapers.utils import get_binary_content
+from cl.search.models import Citation, Court, Opinion
+
+
+class Command(cl_back_scrape_opinions.Command):
+    """Backscraper that attaches lagged citations to ingested opinions"""
+
+    scrape_target_descr = "citations"
+
+    def scrape_court(
+        self,
+        site,
+        full_crawl: bool = False,
+        ocr_available: bool = True,
+        backscrape: bool = False,
+    ):
+        """Add the citations found in the scraped cases to their clusters
+
+        For every scraped case that has citation data:
+            Check for Opinion existence via the hash of the downloaded content
+            If we have the Opinion
+                save each citation that is not duplicated for its cluster
+            If we don't have the Opinion
+                ingest the case through `ingest_a_case`, reusing the
+                content that was already downloaded
+
+        Cases without citation data, and cases whose download raises
+        `BadContentError`, are skipped
+
+        :param site: scraper object that has already downloaded
+            its case data
+        :param full_crawl: not used by this implementation; the DupChecker
+            is always created with `full_crawl=True`
+        :param ocr_available: not used by this implementation
+        :param backscrape: not used by this implementation
+        """
+        court_id = site.court_id.split(".")[-1].split("_")[0]
+        court = Court.objects.get(id=court_id)
+        dup_checker = DupChecker(court, full_crawl=True)
+
+        for case in site:
+            main_cite = case.get("citations")
+            parallel_cite = case.get("parallel_citations")
+            if not (main_cite or parallel_cite):
+                logger.debug(
+                    "No citation, skipping row for case %s",
+                    case.get("case_names"),
+                )
+                continue
+
+            try:
+                content = get_binary_content(case["download_urls"], site)
+            except BadContentError:
+                continue
+
+            content_hash = sha1(force_bytes(content))
+
+            try:
+                cluster = Opinion.objects.get(sha1=content_hash).cluster
+            except Opinion.DoesNotExist:
+                # populate special key to avoid downloading the file again
+                case["content"] = content
+
+                logger.info(
+                    "Case '%s', opinion '%s' has no matching hash in the DB. "
+                    "Has a citation '%s'. Will try to ingest all objects",
+                    case["case_names"],
+                    case["download_urls"],
+                    main_cite or parallel_cite,
+                )
+
+                self.ingest_a_case(case, None, True, site, dup_checker, court)
+                continue
+
+            # `filter(None, ...)` drops whichever citation string is empty
+            for cite in filter(None, (main_cite, parallel_cite)):
+                candidate = make_citation(cite, cluster, court_id)
+                if not candidate or self.citation_is_duplicated(
+                    candidate, cite
+                ):
+                    continue
+
+                try:
+                    candidate.save()
+                    logger.info(
+                        "Saved citation %s for cluster %s", cite, cluster
+                    )
+                except IntegrityError:
+                    logger.warning(
+                        "Error when saving citation %s for cluster %s",
+                        cite,
+                        cluster,
+                    )
+
+    def citation_is_duplicated(
+        self, citation_candidate: Citation, cite: str
+    ) -> bool:
+        """Tell whether the cluster already has an equivalent citation
+
+        Following corpus_importer.utils.add_citations_to_cluster we
+        identify 2 types of duplication:
+        - exact: a citation with the same fields already exists for the cluster
+        - duplication in the same reporter: the cluster already has a citation
+            in that reporter
+
+        :param citation_candidate: the citation object
+        :param cite: citation string, only used for logging
+
+        :return: True if citation is duplicated, False if not
+        """
+        # Every instance attribute except the model state and the primary
+        # key is used as a lookup for the exact match
+        ignored_attrs = ("_state", "id")
+        lookup = {
+            name: value
+            for name, value in citation_candidate.__dict__.items()
+            if name not in ignored_attrs
+        }
+        cluster_pk = citation_candidate.cluster.id
+
+        # Exact duplication
+        exact_matches = Citation.objects.filter(**lookup)
+        if exact_matches.exists():
+            logger.info(
+                "Citation '%s' already exists for cluster %s",
+                cite,
+                cluster_pk,
+            )
+            return True
+
+        # Duplication in the same reporter
+        same_reporter = Citation.objects.filter(
+            cluster_id=cluster_pk, reporter=citation_candidate.reporter
+        )
+        if same_reporter.exists():
+            logger.info(
+                "Another citation in the same reporter '%s' exists for cluster %s",
+                citation_candidate.reporter,
+                cluster_pk,
+            )
+            return True
+
+        return False
diff --git a/cl/scrapers/management/commands/cl_back_scrape_oral_arguments.py b/cl/scrapers/management/commands/cl_back_scrape_oral_arguments.py
@@ -5,7 +5,7 @@
 
 
 class Command(cl_scrape_oral_arguments.Command):
-    def parse_and_scrape_site(self, mod, full_crawl):
+    def parse_and_scrape_site(self, mod, options: dict):
         court_str = mod.__name__.split(".")[-1].split("_")[0]
         logger.info(f'Using court_str: "{court_str}"')