From 49aa2b7095dda0614423302c4dcc259c5dc2e1db Mon Sep 17 00:00:00 2001
From: Gianfranco Rossi <gianfranco.rossi.r@gmail.com>
Date: Mon, 12 Aug 2024 22:36:08 -0500
Subject: [PATCH 1/8] refactor(scrapers.utils.get_binary_content)

- Remove "method" argument
- Log errors instead of returning the error message
- Return the cleaned up content using site.cleanup_content
- Update tests
- Update opinions and oral arguments scraper caller to reflect changes
---
 .../management/commands/cl_scrape_opinions.py | 12 ++-----
 .../commands/cl_scrape_oral_arguments.py      | 12 ++-----
 cl/scrapers/tests.py                          | 24 +++++++-------
 cl/scrapers/utils.py                          | 33 +++++++++++--------
 4 files changed, 36 insertions(+), 45 deletions(-)

diff --git a/cl/scrapers/management/commands/cl_scrape_opinions.py b/cl/scrapers/management/commands/cl_scrape_opinions.py
index 59b4626a15..a147da1182 100644
--- a/cl/scrapers/management/commands/cl_scrape_opinions.py
+++ b/cl/scrapers/management/commands/cl_scrape_opinions.py
@@ -277,18 +277,10 @@ def scrape_court(self, site, full_crawl=False, ocr_available=True):
         logger.debug(f"#{len(site)} opinions found.")
         added = 0
         for i, item in enumerate(site):
-            msg, r = get_binary_content(
-                item["download_urls"],
-                site,
-                method=site.method,
-            )
-            if msg:
-                fingerprint = [f"{court_str}-unexpected-content-type"]
-                logger.error(msg, extra={"fingerprint": fingerprint})
+            content = get_binary_content(item["download_urls"], site)
+            if not content:
                 continue
 
-            content = site.cleanup_content(r.content)
-
             current_date = item["case_dates"]
             try:
                 next_date = site[i + 1]["case_dates"]
diff --git a/cl/scrapers/management/commands/cl_scrape_oral_arguments.py b/cl/scrapers/management/commands/cl_scrape_oral_arguments.py
index ab19cac0f5..a3d36ed7bb 100644
--- a/cl/scrapers/management/commands/cl_scrape_oral_arguments.py
+++ b/cl/scrapers/management/commands/cl_scrape_oral_arguments.py
@@ -125,18 +125,10 @@ def scrape_court(
         if site.cookies:
             logger.info(f"Using cookies: {site.cookies}")
         for i, item in enumerate(site):
-            msg, r = get_binary_content(
-                item["download_urls"],
-                site,
-                method=site.method,
-            )
-            if msg:
-                fingerprint = [f"{court_str}-unexpected-content-type"]
-                logger.error(msg, extra={"fingerprint": fingerprint})
+            content = get_binary_content(item["download_urls"], site)
+            if not content:
                 continue
 
-            content = site.cleanup_content(r.content)
-
             current_date = item["case_dates"]
             try:
                 next_date = site[i + 1]["case_dates"]
diff --git a/cl/scrapers/tests.py b/cl/scrapers/tests.py
index dba2479e45..497a491d4d 100644
--- a/cl/scrapers/tests.py
+++ b/cl/scrapers/tests.py
@@ -633,8 +633,9 @@ def test_unexpected_content_type(self, mock_get):
         mock_get.return_value = self.mock_response
         self.site.expected_content_types = ["text/html"]
 
-        msg, _ = get_binary_content("/dummy/url/", self.site)
-        self.assertIn("UnexpectedContentTypeError:", msg)
+        with self.assertLogs(level="ERROR") as cm:
+            get_binary_content("/dummy/url/", self.site)
+        self.assertIn("UnexpectedContentTypeError:", cm.output[0])
 
     @mock.patch("requests.Session.get")
     def test_correct_content_type(self, mock_get):
@@ -642,15 +643,14 @@ def test_correct_content_type(self, mock_get):
         mock_get.return_value = self.mock_response
         self.site.expected_content_types = ["application/pdf"]
 
-        msg, _ = get_binary_content("/dummy/url/", self.site)
-        self.assertEqual("", msg)
+        with self.assertNoLogs(level="ERROR"):
+            _ = get_binary_content("/dummy/url/", self.site)
 
-        self.mock_response.headers = {
-            "Content-Type": "application/pdf;charset=utf-8"
-        }
-        mock_get.return_value = self.mock_response
-        msg, _ = get_binary_content("/dummy/url/", self.site)
-        self.assertEqual("", msg)
+            self.mock_response.headers = {
+                "Content-Type": "application/pdf;charset=utf-8"
+            }
+            mock_get.return_value = self.mock_response
+            _ = get_binary_content("/dummy/url/", self.site)
 
     @mock.patch("requests.Session.get")
     def test_no_content_type(self, mock_get):
@@ -658,5 +658,5 @@ def test_no_content_type(self, mock_get):
         mock_get.return_value = self.mock_response
         self.site.expected_content_types = None
 
-        msg, _ = get_binary_content("/dummy/url/", self.site)
-        self.assertEqual("", msg)
+        with self.assertNoLogs(level="ERROR"):
+            _ = get_binary_content("/dummy/url/", self.site)
diff --git a/cl/scrapers/utils.py b/cl/scrapers/utils.py
index b5e9fe51c1..15ef2bda7e 100644
--- a/cl/scrapers/utils.py
+++ b/cl/scrapers/utils.py
@@ -155,25 +155,29 @@ def get_extension(content: bytes) -> str:
 def get_binary_content(
     download_url: str,
     site: AbstractSite,
-    method: str = "GET",
-) -> Tuple[str, Optional[Response]]:
+) -> Optional[bytes | str]:
     """Downloads the file, covering a few special cases such as invalid SSL
     certificates and empty file errors.
 
     :param download_url: The URL for the item you wish to download.
     :param site: Site object used to download data
-    :param method: The HTTP method used to get the item, or "LOCAL" to get an
-    item during testing
+
     :return: Two values. The first is a msg indicating any errors encountered.
     If blank, that indicates success. The second value is the response object
     containing the downloaded file.
     """
+    court_str = site.court_id.split(".")[-1].split("_")[0]
+    fingerprint = [f"{court_str}-unexpected-content-type"]
+
     if not download_url:
         # Occurs when a DeferredList fetcher fails.
-        msg = f"NoDownloadUrlError: {download_url}\n{traceback.format_exc()}"
-        return msg, None
+        error = f"NoDownloadUrlError: {download_url}\n{traceback.format_exc()}"
+        logger.error(error, extra={"fingerprint": fingerprint})
+        return
+
     # noinspection PyBroadException
-    if method == "LOCAL":
+    if site.method == "LOCAL":
+        # "LOCAL" is the method when testing
         url = os.path.join(settings.MEDIA_ROOT, download_url)
         mr = MockRequest(url=url)
         r = mr.get()
@@ -203,8 +207,9 @@ def get_binary_content(
 
         # test for empty files (thank you CA1)
         if len(r.content) == 0:
-            msg = f"EmptyFileError: {download_url}\n{traceback.format_exc()}"
-            return msg, None
+            error = f"EmptyFileError: {download_url}\n{traceback.format_exc()}"
+            logger.error(error, extra={"fingerprint": fingerprint})
+            return
 
         # test for expected content type (thanks mont for nil)
         if site.expected_content_types:
@@ -218,18 +223,20 @@ def get_binary_content(
                 for mime in site.expected_content_types
             )
             if not m:
-                msg = (
+                error = (
                     f"UnexpectedContentTypeError: {download_url}\n"
                     f'\'"{content_type}" not in {site.expected_content_types}'
                 )
-                return msg, None
+                logger.error(error, extra={"fingerprint": fingerprint})
+                return
 
         # test for and follow meta redirects
         r = follow_redirections(r, s)
         r.raise_for_status()
 
-    # Success!
-    return "", r
+    content = site.cleanup_content(r.content)
+
+    return content
 
 
 def signal_handler(signal, frame):

From 5fb7ba9ab2c4ab813fc8c062dec812286a5d0e51 Mon Sep 17 00:00:00 2001
From: Gianfranco Rossi <gianfranco.rossi.r@gmail.com>
Date: Mon, 12 Aug 2024 22:36:38 -0500
Subject: [PATCH 2/8] feat(cl_back_scrape_citations): command to scrape
 citations

---
 .../commands/cl_back_scrape_citations.py      | 115 ++++++++++++++++++
 1 file changed, 115 insertions(+)
 create mode 100644 cl/scrapers/management/commands/cl_back_scrape_citations.py

diff --git a/cl/scrapers/management/commands/cl_back_scrape_citations.py b/cl/scrapers/management/commands/cl_back_scrape_citations.py
new file mode 100644
index 0000000000..43fa7a3f34
--- /dev/null
+++ b/cl/scrapers/management/commands/cl_back_scrape_citations.py
@@ -0,0 +1,115 @@
+"""
+When opinions are first published on the courts' sites, they won't have
+all their citations assigned. Some courts will publish the citations
+in the same pages we scrape, but months later
+
+This command re-uses the (back)scraper we use to get opinions, to get
+the lagged citations and associate them with the Opinions we first
+downloaded. If we find an Opinion we don't have in the database,
+we ingest it as in a regular scrape
+"""
+
+from django.utils.encoding import force_bytes
+
+from cl.lib.crypto import sha1
+from cl.scrapers.management.commands import cl_back_scrape_opinions
+from cl.scrapers.management.commands.cl_scrape_opinions import make_citation
+from cl.scrapers.utils import get_binary_content
+from cl.search.models import Opinion, Citation, OpinionCluster
+from cl.lib.command_utils import logger
+
+
+class Command(cl_back_scrape_opinions.Command):
+    def scrape_court(self, site, full_crawl=False, ocr_available=True):
+        """
+        If the scraped case has citation data
+            Check for Opinion existance via content hash
+            If we have the Opinion
+                if we don't have the citation -> ingest
+                if we already have the citation -> pass
+            If we don't have the Opinion
+                ingest the opinion with it's citation, that is to say,
+                use the regular scraping process!
+
+        :param site: scraper object that has already downloaded
+            it's case data
+        """
+        missing_opinions = []
+        court_str = site.court_id.split(".")[-1].split("_")[0]
+
+        for case in site:
+            citation = case.get("citation")
+            parallel_citation = case.get("parallel_citation")
+            if not citation and not parallel_citation:
+                continue
+
+            content = get_binary_content(case["download_urls"], site)
+            if not content:
+                # Errors are logged by get_binary_content itself
+                continue
+            sha1_hash = sha1(force_bytes(content))
+
+            try:
+                cluster = Opinion.objects.get(sha1=sha1_hash).cluster
+            except Opinion.DoesNotExist:
+                missing_opinions.append(case)
+                logger.info(
+                    "Opinion with URL '%s' does not exist. Has citation '%s'. Will try to ingest all objects",
+                    case["download_urls"],
+                    citation or parallel_citation,
+                )
+                continue
+
+            for cite in [citation, parallel_citation]:
+                if not cite:
+                    continue
+
+                citation_candidate = make_citation(cite, cluster, court_str)
+                if not citation_candidate:
+                    continue
+
+                if self.citation_is_duplicated(
+                    citation_candidate, cluster, cite
+                ):
+                    continue
+
+                logger.info("Saving citation %s for cluster %s", cite, cluster)
+                citation_candidate.save()
+
+        # We don't have these opinions. Since we are backscraping, if the citation
+        # exists, it will be in the case dictionary, and will be saved in a
+        # regular ingestion process
+        if missing_opinions:
+            site.cases = missing_opinions
+            super().scrape_court(site, full_crawl=True)
+
+    def citation_is_duplicated(
+        self, citation_candidate: Citation, cluster: OpinionCluster, cite: str
+    ) -> bool:
+        """Checks for exact or reporter duplication of citation in the cluster
+        """
+        citation_params = citation_candidate.__dict__
+        citation_params.pop("_state", "")
+        citation_params.pop("id", "")
+
+        # Exact duplication
+        if Citation.objects.filter(**citation_params).exists():
+            logger.info(
+                "Citation '%s' already exists for cluster %s",
+                cite,
+                cluster.id,
+            )
+            return True
+
+        # Duplication in the same reporter
+        if Citation.objects.filter(
+            cluster_id=cluster.id, reporter=citation_candidate.reporter
+        ).exists():
+            logger.info(
+                "Another citation in the same reporter '%s' exists for cluster %s",
+                citation_candidate.reporter,
+                cluster.id,
+            )
+            return True
+
+        return False

From 22945159f325391ac773ebf5afdb1541076265c8 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 13 Aug 2024 03:39:02 +0000
Subject: [PATCH 3/8] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../management/commands/cl_back_scrape_citations.py        | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/cl/scrapers/management/commands/cl_back_scrape_citations.py b/cl/scrapers/management/commands/cl_back_scrape_citations.py
index 43fa7a3f34..8fd59843c8 100644
--- a/cl/scrapers/management/commands/cl_back_scrape_citations.py
+++ b/cl/scrapers/management/commands/cl_back_scrape_citations.py
@@ -11,12 +11,12 @@
 
 from django.utils.encoding import force_bytes
 
+from cl.lib.command_utils import logger
 from cl.lib.crypto import sha1
 from cl.scrapers.management.commands import cl_back_scrape_opinions
 from cl.scrapers.management.commands.cl_scrape_opinions import make_citation
 from cl.scrapers.utils import get_binary_content
-from cl.search.models import Opinion, Citation, OpinionCluster
-from cl.lib.command_utils import logger
+from cl.search.models import Citation, Opinion, OpinionCluster
 
 
 class Command(cl_back_scrape_opinions.Command):
@@ -86,8 +86,7 @@ def scrape_court(self, site, full_crawl=False, ocr_available=True):
     def citation_is_duplicated(
         self, citation_candidate: Citation, cluster: OpinionCluster, cite: str
     ) -> bool:
-        """Checks for exact or reporter duplication of citation in the cluster
-        """
+        """Checks for exact or reporter duplication of citation in the cluster"""
         citation_params = citation_candidate.__dict__
         citation_params.pop("_state", "")
         citation_params.pop("id", "")

From fac3f134c60ccb87e7fe904550a19da50376a300 Mon Sep 17 00:00:00 2001
From: Gianfranco Rossi <gianfranco.rossi.r@gmail.com>
Date: Mon, 12 Aug 2024 23:32:08 -0500
Subject: [PATCH 4/8] fix(scrapers.tests): test new logger calls inside
 get_binary_content

---
 cl/scrapers/tests.py | 16 +++++++++++-----
 cl/scrapers/utils.py |  4 ++--
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/cl/scrapers/tests.py b/cl/scrapers/tests.py
index 497a491d4d..9749042ca0 100644
--- a/cl/scrapers/tests.py
+++ b/cl/scrapers/tests.py
@@ -8,6 +8,7 @@
 from django.conf import settings
 from django.core.files.base import ContentFile
 from django.utils.timezone import now
+from juriscraper.AbstractSite import logger
 
 from cl.alerts.factories import AlertFactory
 from cl.alerts.models import Alert
@@ -626,16 +627,19 @@ def setUp(self):
         self.mock_response.content = b"not empty"
         self.mock_response.headers = {"Content-Type": "application/pdf"}
         self.site = test_opinion_scraper.Site()
+        self.site.method = "GET"
+        self.logger = logger
 
     @mock.patch("requests.Session.get")
     def test_unexpected_content_type(self, mock_get):
         """Test when content type doesn't match scraper expectation."""
         mock_get.return_value = self.mock_response
         self.site.expected_content_types = ["text/html"]
-
-        with self.assertLogs(level="ERROR") as cm:
+        with mock.patch.object(self.logger, "error") as error_mock:
             get_binary_content("/dummy/url/", self.site)
-        self.assertIn("UnexpectedContentTypeError:", cm.output[0])
+        self.assertIn(
+            "UnexpectedContentTypeError:", error_mock.call_args_list[0][0][0]
+        )
 
     @mock.patch("requests.Session.get")
     def test_correct_content_type(self, mock_get):
@@ -643,7 +647,7 @@ def test_correct_content_type(self, mock_get):
         mock_get.return_value = self.mock_response
         self.site.expected_content_types = ["application/pdf"]
 
-        with self.assertNoLogs(level="ERROR"):
+        with mock.patch.object(self.logger, "error") as error_mock:
             _ = get_binary_content("/dummy/url/", self.site)
 
             self.mock_response.headers = {
@@ -651,6 +655,7 @@ def test_correct_content_type(self, mock_get):
             }
             mock_get.return_value = self.mock_response
             _ = get_binary_content("/dummy/url/", self.site)
+            error_mock.assert_not_called()
 
     @mock.patch("requests.Session.get")
     def test_no_content_type(self, mock_get):
@@ -658,5 +663,6 @@ def test_no_content_type(self, mock_get):
         mock_get.return_value = self.mock_response
         self.site.expected_content_types = None
 
-        with self.assertNoLogs(level="ERROR"):
+        with mock.patch.object(self.logger, "error") as error_mock:
             _ = get_binary_content("/dummy/url/", self.site)
+            error_mock.assert_not_called()
diff --git a/cl/scrapers/utils.py b/cl/scrapers/utils.py
index 15ef2bda7e..a1a2e6eab2 100644
--- a/cl/scrapers/utils.py
+++ b/cl/scrapers/utils.py
@@ -181,8 +181,7 @@ def get_binary_content(
         url = os.path.join(settings.MEDIA_ROOT, download_url)
         mr = MockRequest(url=url)
         r = mr.get()
-        r = follow_redirections(r, requests.Session())
-        r.raise_for_status()
+        s = requests.Session()
     else:
         # some sites require a custom ssl_context, contained in the Site's
         # session. However, we can't send a request with both a
@@ -222,6 +221,7 @@ def get_binary_content(
                 content_type in mime.lower()
                 for mime in site.expected_content_types
             )
+
             if not m:
                 error = (
                     f"UnexpectedContentTypeError: {download_url}\n"

From ad975d99fe62e00465072fdd4b610f0db708fe43 Mon Sep 17 00:00:00 2001
From: Gianfranco Rossi <gianfranco.rossi.r@gmail.com>
Date: Thu, 15 Aug 2024 17:07:23 -0500
Subject: [PATCH 5/8] feat(scrapers.tests): add tests for
 cl_back_scrape_citations

Related to freelawproject/juriscraper#858
---
 .../commands/cl_back_scrape_citations.py      | 27 +++++---
 cl/scrapers/tests.py                          | 62 ++++++++++++++++++-
 2 files changed, 80 insertions(+), 9 deletions(-)

diff --git a/cl/scrapers/management/commands/cl_back_scrape_citations.py b/cl/scrapers/management/commands/cl_back_scrape_citations.py
index 8fd59843c8..11f269c14b 100644
--- a/cl/scrapers/management/commands/cl_back_scrape_citations.py
+++ b/cl/scrapers/management/commands/cl_back_scrape_citations.py
@@ -38,9 +38,13 @@ def scrape_court(self, site, full_crawl=False, ocr_available=True):
         court_str = site.court_id.split(".")[-1].split("_")[0]
 
         for case in site:
-            citation = case.get("citation")
-            parallel_citation = case.get("parallel_citation")
+            citation = case.get("citations")
+            parallel_citation = case.get("parallel_citations")
             if not citation and not parallel_citation:
+                logger.debug(
+                    "No citation, skipping row for case %s",
+                    case.get("case_names"),
+                )
                 continue
 
             content = get_binary_content(case["download_urls"], site)
@@ -54,7 +58,9 @@ def scrape_court(self, site, full_crawl=False, ocr_available=True):
             except Opinion.DoesNotExist:
                 missing_opinions.append(case)
                 logger.info(
-                    "Opinion with URL '%s' does not exist. Has citation '%s'. Will try to ingest all objects",
+                    "Case '%s', opinion '%s' has no matching hash in the DB. "
+                    "Has a citation '%s'. Will try to ingest all objects",
+                    case["case_names"],
                     case["download_urls"],
                     citation or parallel_citation,
                 )
@@ -80,14 +86,21 @@ def scrape_court(self, site, full_crawl=False, ocr_available=True):
         # exists, it will be in the case dictionary, and will be saved in a
         # regular ingestion process
         if missing_opinions:
-            site.cases = missing_opinions
-            super().scrape_court(site, full_crawl=True)
+            # It is easy to ingest a filtered list of cases for OpinionSiteLinear
+            # but not for plain OpinionSite
+            if hasattr(site, "cases"):
+                site.cases = missing_opinions
+                super().scrape_court(site, full_crawl=True)
+            else:
+                logger.info("Run the backscraper to collect missing opinions")
 
     def citation_is_duplicated(
         self, citation_candidate: Citation, cluster: OpinionCluster, cite: str
     ) -> bool:
-        """Checks for exact or reporter duplication of citation in the cluster"""
-        citation_params = citation_candidate.__dict__
+        """Checks for exact or reporter duplication of citation in the cluster
+        Inspired on corpus_importer.utils.add_citations_to_cluster
+        """
+        citation_params = {**citation_candidate.__dict__}
         citation_params.pop("_state", "")
         citation_params.pop("id", "")
 
diff --git a/cl/scrapers/tests.py b/cl/scrapers/tests.py
index 9749042ca0..0d9382615e 100644
--- a/cl/scrapers/tests.py
+++ b/cl/scrapers/tests.py
@@ -22,6 +22,7 @@
 from cl.lib.test_helpers import generate_docket_target_sources
 from cl.scrapers.DupChecker import DupChecker
 from cl.scrapers.management.commands import (
+    cl_back_scrape_citations,
     cl_scrape_opinions,
     cl_scrape_oral_arguments,
 )
@@ -29,8 +30,13 @@
 from cl.scrapers.tasks import extract_doc_content, process_audio_file
 from cl.scrapers.test_assets import test_opinion_scraper, test_oral_arg_scraper
 from cl.scrapers.utils import get_binary_content, get_extension
-from cl.search.factories import CourtFactory, DocketFactory
-from cl.search.models import Court, Docket, Opinion
+from cl.search.factories import (
+    CourtFactory,
+    DocketFactory,
+    OpinionClusterFactory,
+    OpinionFactory,
+)
+from cl.search.models import Citation, Court, Docket, Opinion
 from cl.settings import MEDIA_ROOT
 from cl.tests.cases import ESIndexTestCase, SimpleTestCase, TestCase
 from cl.tests.fixtures import ONE_SECOND_MP3_BYTES, SMALL_WAV_BYTES
@@ -666,3 +672,55 @@ def test_no_content_type(self, mock_get):
         with mock.patch.object(self.logger, "error") as error_mock:
             _ = get_binary_content("/dummy/url/", self.site)
             error_mock.assert_not_called()
+
+
+class ScrapeCitationsTestCase(TestCase):
+    """This class only tests the update of existing clusters
+    Since the ingestion of new clusters and their citations call
+    super().scrape_court(), it should be tested in the superclass
+    """
+
+    def setUp(self):
+        keys = [
+            "download_urls",
+            "case_names",
+            "citations",
+            "parallel_citations",
+        ]
+        self.mock_site = mock.MagicMock()
+        self.mock_site.__iter__.return_value = [
+            # update
+            dict(zip(keys, ["", "something", "482 Md. 342", ""])),
+            # exact duplicate
+            dict(zip(keys, ["", "something", "", "482 Md. 342"])),
+            # reporter duplicate
+            dict(zip(keys, ["", "something", "485 Md. 111", ""])),
+            # no citation, ignore
+            dict(zip(keys, ["", "something", "", ""])),
+        ]
+        self.mock_site.court_id = "juriscraper.md"
+        self.hash = "1234" * 10
+        self.hashes = [self.hash, self.hash, self.hash, "111"]
+
+        court = CourtFactory(id="md")
+        docket = DocketFactory(
+            case_name="Attorney Grievance v. Taniform",
+            docket_number="40ag/21",
+            court_id="md",
+            source=Docket.SCRAPER,
+            pacer_case_id=None,
+        )
+        self.cluster = OpinionClusterFactory(docket=docket)
+        opinion = OpinionFactory(sha1=self.hash, cluster=self.cluster)
+
+    def test_citation_scraper(self):
+        """Test if citation scraper creates a citation or ignores duplicates"""
+        cmd = "cl.scrapers.management.commands.cl_back_scrape_citations"
+        with mock.patch(f"{cmd}.sha1", side_effect=self.hashes):
+            with mock.patch(
+                f"{cmd}.get_binary_content", return_value="placeholder"
+            ):
+                cl_back_scrape_citations.Command().scrape_court(self.mock_site)
+
+        citations = Citation.objects.filter(cluster=self.cluster).count()
+        self.assertEqual(citations, 1, "Exactly 1 citation was expected")

From fc3e1a1563309ac2e7c16f27664329afe7221124 Mon Sep 17 00:00:00 2001
From: Gianfranco Rossi <gianfranco.rossi.r@gmail.com>
Date: Fri, 16 Aug 2024 14:04:31 -0500
Subject: [PATCH 6/8] refactor(cl_back_scrape_citations)

Reword docstrings, catch exceptions and refactor code following code review
---
 .../commands/cl_back_scrape_citations.py      | 43 +++++++++++++------
 cl/scrapers/tests.py                          | 11 +++--
 cl/scrapers/utils.py                          |  7 +--
 3 files changed, 40 insertions(+), 21 deletions(-)

diff --git a/cl/scrapers/management/commands/cl_back_scrape_citations.py b/cl/scrapers/management/commands/cl_back_scrape_citations.py
index 11f269c14b..982abfca1a 100644
--- a/cl/scrapers/management/commands/cl_back_scrape_citations.py
+++ b/cl/scrapers/management/commands/cl_back_scrape_citations.py
@@ -9,6 +9,7 @@
 we ingest it as in a regular scrape
 """
 
+from django.db import IntegrityError
 from django.utils.encoding import force_bytes
 
 from cl.lib.command_utils import logger
@@ -16,7 +17,7 @@
 from cl.scrapers.management.commands import cl_back_scrape_opinions
 from cl.scrapers.management.commands.cl_scrape_opinions import make_citation
 from cl.scrapers.utils import get_binary_content
-from cl.search.models import Citation, Opinion, OpinionCluster
+from cl.search.models import Citation, Opinion
 
 
 class Command(cl_back_scrape_opinions.Command):
@@ -74,13 +75,20 @@ def scrape_court(self, site, full_crawl=False, ocr_available=True):
                 if not citation_candidate:
                     continue
 
-                if self.citation_is_duplicated(
-                    citation_candidate, cluster, cite
-                ):
+                if self.citation_is_duplicated(citation_candidate, cite):
                     continue
 
-                logger.info("Saving citation %s for cluster %s", cite, cluster)
-                citation_candidate.save()
+                try:
+                    citation_candidate.save()
+                    logger.info(
+                        "Saved citation %s for cluster %s", cite, cluster
+                    )
+                except IntegrityError:
+                    logger.warning(
+                        "Error when saving citation %s for cluster %s",
+                        cite,
+                        cluster,
+                    )
 
         # We don't have these opinions. Since we are backscraping, if the citation
         # exists, it will be in the case dictionary, and will be saved in a
@@ -95,32 +103,43 @@ def scrape_court(self, site, full_crawl=False, ocr_available=True):
                 logger.info("Run the backscraper to collect missing opinions")
 
     def citation_is_duplicated(
-        self, citation_candidate: Citation, cluster: OpinionCluster, cite: str
+        self, citation_candidate: Citation, cite: str
     ) -> bool:
-        """Checks for exact or reporter duplication of citation in the cluster
-        Inspired on corpus_importer.utils.add_citations_to_cluster
+        """Checks if the citation is duplicated for the cluster
+
+        Following corpus_importer.utils.add_citations_to_cluster we
+        identify 2 types of duplication:
+        - exact: a citation with the same fields already exists for the cluster
+        - duplication in the same reporter: the cluster already has a citation
+            in that reporter
+
+        :param citation_candidate: the citation object
+        :param cite: citation string
+
+        :return: True if citation is duplicated, False if not
         """
         citation_params = {**citation_candidate.__dict__}
         citation_params.pop("_state", "")
         citation_params.pop("id", "")
+        cluster_id = citation_candidate.cluster.id
 
         # Exact duplication
         if Citation.objects.filter(**citation_params).exists():
             logger.info(
                 "Citation '%s' already exists for cluster %s",
                 cite,
-                cluster.id,
+                cluster_id,
             )
             return True
 
         # Duplication in the same reporter
         if Citation.objects.filter(
-            cluster_id=cluster.id, reporter=citation_candidate.reporter
+            cluster_id=cluster_id, reporter=citation_candidate.reporter
         ).exists():
             logger.info(
                 "Another citation in the same reporter '%s' exists for cluster %s",
                 citation_candidate.reporter,
-                cluster.id,
+                cluster_id,
             )
             return True
 
diff --git a/cl/scrapers/tests.py b/cl/scrapers/tests.py
index 0d9382615e..ac6e5de0f7 100644
--- a/cl/scrapers/tests.py
+++ b/cl/scrapers/tests.py
@@ -674,7 +674,7 @@ def test_no_content_type(self, mock_get):
             error_mock.assert_not_called()
 
 
-class ScrapeCitationsTestCase(TestCase):
+class ScrapeCitationsTest(TestCase):
     """This class only tests the update of existing clusters
     Since the ingestion of new clusters and their citations call
     super().scrape_court(), it should be tested in the superclass
@@ -716,11 +716,10 @@ def setUp(self):
     def test_citation_scraper(self):
         """Test if citation scraper creates a citation or ignores duplicates"""
         cmd = "cl.scrapers.management.commands.cl_back_scrape_citations"
-        with mock.patch(f"{cmd}.sha1", side_effect=self.hashes):
-            with mock.patch(
-                f"{cmd}.get_binary_content", return_value="placeholder"
-            ):
-                cl_back_scrape_citations.Command().scrape_court(self.mock_site)
+        with mock.patch(f"{cmd}.sha1", side_effect=self.hashes), mock.patch(
+            f"{cmd}.get_binary_content", return_value="placeholder"
+        ):
+            cl_back_scrape_citations.Command().scrape_court(self.mock_site)
 
         citations = Citation.objects.filter(cluster=self.cluster).count()
         self.assertEqual(citations, 1, "Exactly 1 citation was expected")
diff --git a/cl/scrapers/utils.py b/cl/scrapers/utils.py
index a1a2e6eab2..445797ce45 100644
--- a/cl/scrapers/utils.py
+++ b/cl/scrapers/utils.py
@@ -162,9 +162,10 @@ def get_binary_content(
     :param download_url: The URL for the item you wish to download.
     :param site: Site object used to download data
 
-    :return: Two values. The first is a msg indicating any errors encountered.
-    If blank, that indicates success. The second value is the response object
-    containing the downloaded file.
+    :return: One of:
+        - None if there was no URL, if the downloaded file was empty or if
+            the content type did not match the expected types
+        - The downloaded and cleaned content
     """
     court_str = site.court_id.split(".")[-1].split("_")[0]
     fingerprint = [f"{court_str}-unexpected-content-type"]

From 5a738b34f33e6cfae7a21e7f3b5933af0974bbb5 Mon Sep 17 00:00:00 2001
From: Gianfranco Rossi <gianfranco.rossi.r@gmail.com>
Date: Tue, 20 Aug 2024 11:35:35 -0500
Subject: [PATCH 7/8] refactor(cl_scrape_opinions): function for ingesting a
 single case

- create cl.scrapers.exceptions file to hold exceptions raised when ingesting a single case
- use the exceptions to bubble errors to the main loop, to avoid returning break / continue flags
- refactor DupChecker to raise errors
- refactor get_binary_content to raise errors
- refactor cl_scrape_oral_arguments to new paradigm
- cl_back_scrape_citations can now re-scrape a single case without re-downloading the binary content or manipulating the site object
- adapted DupChecker and ContentType tests to changes
- refactores logger.calls to use lazy formatting
---
 cl/scrapers/DupChecker.py                     |  99 +++++-----
 cl/scrapers/exceptions.py                     |  82 ++++++++
 .../commands/cl_back_scrape_citations.py      |  40 ++--
 .../management/commands/cl_scrape_opinions.py | 185 +++++++++++-------
 .../commands/cl_scrape_oral_arguments.py      | 110 ++++-------
 cl/scrapers/tests.py                          | 151 ++++++--------
 cl/scrapers/utils.py                          |  35 ++--
 7 files changed, 379 insertions(+), 323 deletions(-)
 create mode 100644 cl/scrapers/exceptions.py

diff --git a/cl/scrapers/DupChecker.py b/cl/scrapers/DupChecker.py
index 7d5581955e..10cac56334 100644
--- a/cl/scrapers/DupChecker.py
+++ b/cl/scrapers/DupChecker.py
@@ -1,5 +1,9 @@
 from juriscraper.AbstractSite import logger
 
+from cl.scrapers.exceptions import (
+    ConsecutiveDuplicatesError,
+    SingleDuplicateError,
+)
 from cl.scrapers.models import UrlHash
 from cl.search.models import Court
 
@@ -19,7 +23,6 @@ def __init__(
         self.url_hash = None
         self.dup_count = 0
         self.last_found_date = None
-        self.emulate_break = False
         super().__init__(*args, **kwargs)
 
     def _increment(self, current_date):
@@ -83,29 +86,29 @@ def press_on(
         lookup_by="sha1",
     ):
         """Checks if a we have an `object_type` with identical content in the CL
-        corpus by looking up `lookup_value` in the `lookup_by` field. Depending
-        on the result of that, we either return True or False. True represents
-        the fact that the next item should be processed. False means that either
-        the item was a duplicate or that we've hit so many duplicates that we've
-        stopped checking (we hit a duplicate threshold). Either way, the caller
-        should move to the next item and try it.
+        corpus by looking up `lookup_value` in the `lookup_by` field.
 
-        The effect of this is that this emulates for loop constructs for
-        continue (False), break (False), return (True).
+        If the item is not a duplicate, we will return None, and the caller
+        will proceed normally
+
+        If the item is a duplicate, we will raise SingleDuplicateError
+
+        If the item is a duplicate following a series of duplicates greater than
+        our tolerance threshold, we will raise ConsecutiveDuplicatesError
+
+        If the item is a duplicate and the next item is from an already scraped
+        date, we will raise ConsecutiveDuplicatesError
 
         Following logic applies:
+         - if we do not have the item
+            - early return
          - if we have the item already
             - and if the next date is before this date
             - or if this is our duplicate threshold is exceeded
                 - break
             - otherwise
                 - continue
-         - if not
-            - carry on
         """
-        if self.emulate_break:
-            return False
-
         # check for a duplicate in the db.
         if lookup_by == "sha1":
             exists = object_type.objects.filter(sha1=lookup_value).exists()
@@ -116,41 +119,35 @@ def press_on(
         else:
             raise NotImplementedError("Unknown lookup_by parameter.")
 
-        if exists:
-            logger.info(
-                f"Duplicate found on date: {current_date}, with lookup value: {lookup_value}"
-            )
-            self._increment(current_date)
-
-            # If the next date in the Site object is less than (before) the
-            # current date, we needn't continue because we should already have
-            # that item.
-            if next_date:
-                already_scraped_next_date = next_date < current_date
-            else:
-                already_scraped_next_date = True
-            if not self.full_crawl:
-                if already_scraped_next_date:
-                    if self.court.pk == "mich":
-                        # Michigan sometimes has multiple occurrences of the
-                        # same case with different dates on a page.
-                        return False
-                    else:
-                        logger.info(
-                            "Next case occurs prior to when we found a "
-                            "duplicate. Court is up to date."
-                        )
-                        self.emulate_break = True
-                        return False
-                elif self.dup_count >= self.dup_threshold:
-                    logger.info(
-                        f"Found {self.dup_count} duplicates in a row. Court is up to date."
-                    )
-                    self.emulate_break = True
-                    return False
-            else:
-                # This is a full crawl. Do not emulate a break, BUT be sure to
-                # say that we shouldn't press on, since the item already exists.
-                return False
+        if not exists:
+            return
+
+        logger.info(
+            f"Duplicate found on date: {current_date}, with lookup value: {lookup_value}"
+        )
+        self._increment(current_date)
+
+        # If the next date in the Site object is less than (before) the
+        # current date, we needn't continue because we should already have
+        # that item.
+        if next_date:
+            already_scraped_next_date = next_date < current_date
+        else:
+            already_scraped_next_date = True
+
+        if not self.full_crawl:
+            if already_scraped_next_date:
+                if self.court.pk == "mich":
+                    # Michigan sometimes has multiple occurrences of the
+                    # same case with different dates on a page.
+                    raise SingleDuplicateError(logger=logger)
+                else:
+                    message = "Next case occurs prior to when we found a duplicate. Court is up to date."
+                    raise ConsecutiveDuplicatesError(message, logger=logger)
+            elif self.dup_count >= self.dup_threshold:
+                message = f"Found {self.dup_count} duplicates in a row. Court is up to date."
+                raise ConsecutiveDuplicatesError(message, logger=logger)
         else:
-            return True
+            # This is a full crawl. Do not raise a loop breaking `ConsecutiveDuplicatesError`,
+            # but say that we shouldn't press on, since the item already exists.
+            raise SingleDuplicateError(logger=logger)
diff --git a/cl/scrapers/exceptions.py b/cl/scrapers/exceptions.py
new file mode 100644
index 0000000000..dcefbb80d1
--- /dev/null
+++ b/cl/scrapers/exceptions.py
@@ -0,0 +1,82 @@
+import logging
+from typing import Optional
+
+from cl.lib.command_utils import logger
+
+
+class AutoLoggingException(Exception):
+    """Exception with defaults for logging, to be subclassed
+
+    We log expected exceptions to better understand what went wrong
+    Logger calls with level `logging.ERROR` are sent to Sentry, and
+    it's useful to send a `fingerprint` to force a specific grouping by court
+
+    Other `logger` calls are just printed on the console when using a
+    VerboseCommand with proper verbosity levels
+    """
+
+    logging_level = logging.DEBUG
+    message = ""
+    logger = logger
+
+    def __init__(
+        self,
+        message: str = "",
+        logger: Optional[logging.Logger] = None,
+        logging_level: Optional[int] = None,
+        fingerprint: Optional[list[str]] = None,
+    ):
+        if not message:
+            message = self.message
+        if not logger:
+            logger = self.logger
+        if not logging_level:
+            logging_level = self.logging_level
+
+        log_kwargs = {}
+        if fingerprint:
+            log_kwargs["extra"] = {"fingerprint": fingerprint}
+
+        logger.log(logging_level, message, **log_kwargs)
+        super().__init__(message)
+
+
+class ConsecutiveDuplicatesError(AutoLoggingException):
+    """Occurs when consecutive `SingleDuplicateError` are found,
+    which may be used as a signal to break the scraping loop
+    """
+
+    message = "DupChecker emulate break triggered."
+
+
+class SingleDuplicateError(AutoLoggingException):
+    """Occurs when an opinion or audio file already exists
+    in our database
+    """
+
+    message = "Skipping opinion due to duplicated content hash"
+
+
+class BadContentError(AutoLoggingException):
+    """Parent class for errors raised when downloading binary content"""
+
+
+class UnexpectedContentTypeError(BadContentError):
+    """Occurs when the content received from the server has
+    a different content type than the ones listed on
+    site.expected_content_types
+    """
+
+    logging_level = logging.ERROR
+
+
+class NoDownloadUrlError(BadContentError):
+    """Occurs when a DeferredList fetcher fails."""
+
+    logging_level = logging.ERROR
+
+
+class EmptyFileError(BadContentError):
+    """Occurs when the content of the response has lenght 0"""
+
+    logging_level = logging.ERROR
diff --git a/cl/scrapers/management/commands/cl_back_scrape_citations.py b/cl/scrapers/management/commands/cl_back_scrape_citations.py
index 982abfca1a..8975461217 100644
--- a/cl/scrapers/management/commands/cl_back_scrape_citations.py
+++ b/cl/scrapers/management/commands/cl_back_scrape_citations.py
@@ -14,14 +14,22 @@
 
 from cl.lib.command_utils import logger
 from cl.lib.crypto import sha1
+from cl.scrapers.DupChecker import DupChecker
+from cl.scrapers.exceptions import BadContentError
 from cl.scrapers.management.commands import cl_back_scrape_opinions
 from cl.scrapers.management.commands.cl_scrape_opinions import make_citation
 from cl.scrapers.utils import get_binary_content
-from cl.search.models import Citation, Opinion
+from cl.search.models import Citation, Court, Opinion
 
 
 class Command(cl_back_scrape_opinions.Command):
-    def scrape_court(self, site, full_crawl=False, ocr_available=True):
+    def scrape_court(
+        self,
+        site,
+        full_crawl: bool = False,
+        ocr_available: bool = True,
+        backscrape: bool = False,
+    ):
         """
         If the scraped case has citation data
             Check for Opinion existance via content hash
@@ -35,8 +43,9 @@ def scrape_court(self, site, full_crawl=False, ocr_available=True):
         :param site: scraper object that has already downloaded
             it's case data
         """
-        missing_opinions = []
         court_str = site.court_id.split(".")[-1].split("_")[0]
+        court = Court.objects.get(id=court_str)
+        dup_checker = DupChecker(court, full_crawl=True)
 
         for case in site:
             citation = case.get("citations")
@@ -48,16 +57,19 @@ def scrape_court(self, site, full_crawl=False, ocr_available=True):
                 )
                 continue
 
-            content = get_binary_content(case["download_urls"], site)
-            if not content:
-                # Errors are logged by get_binary_content itself
+            try:
+                content = get_binary_content(case["download_urls"], site)
+            except BadContentError:
                 continue
+
             sha1_hash = sha1(force_bytes(content))
 
             try:
                 cluster = Opinion.objects.get(sha1=sha1_hash).cluster
             except Opinion.DoesNotExist:
-                missing_opinions.append(case)
+                # populate special key to avoid downloading the file again
+                case["content"] = content
+
                 logger.info(
                     "Case '%s', opinion '%s' has no matching hash in the DB. "
                     "Has a citation '%s'. Will try to ingest all objects",
@@ -65,6 +77,8 @@ def scrape_court(self, site, full_crawl=False, ocr_available=True):
                     case["download_urls"],
                     citation or parallel_citation,
                 )
+
+                self.ingest_a_case(case, None, True, site, dup_checker, court)
                 continue
 
             for cite in [citation, parallel_citation]:
@@ -90,18 +104,6 @@ def scrape_court(self, site, full_crawl=False, ocr_available=True):
                         cluster,
                     )
 
-        # We don't have these opinions. Since we are backscraping, if the citation
-        # exists, it will be in the case dictionary, and will be saved in a
-        # regular ingestion process
-        if missing_opinions:
-            # It is easy to ingest a filtered list of cases for OpinionSiteLinear
-            # but not for plain OpinionSite
-            if hasattr(site, "cases"):
-                site.cases = missing_opinions
-                super().scrape_court(site, full_crawl=True)
-            else:
-                logger.info("Run the backscraper to collect missing opinions")
-
     def citation_is_duplicated(
         self, citation_candidate: Citation, cite: str
     ) -> bool:
diff --git a/cl/scrapers/management/commands/cl_scrape_opinions.py b/cl/scrapers/management/commands/cl_scrape_opinions.py
index a147da1182..f1547d8f7d 100644
--- a/cl/scrapers/management/commands/cl_scrape_opinions.py
+++ b/cl/scrapers/management/commands/cl_scrape_opinions.py
@@ -1,6 +1,7 @@
 import signal
 import sys
 import time
+import traceback
 from datetime import date
 from typing import Any, Dict, List, Optional, Tuple, Union
 
@@ -22,6 +23,11 @@
 from cl.lib.string_utils import trunc
 from cl.people_db.lookup_utils import lookup_judges_by_messy_str
 from cl.scrapers.DupChecker import DupChecker
+from cl.scrapers.exceptions import (
+    BadContentError,
+    ConsecutiveDuplicatesError,
+    SingleDuplicateError,
+)
 from cl.scrapers.tasks import extract_doc_content
 from cl.scrapers.utils import (
     get_binary_content,
@@ -213,6 +219,7 @@ def save_everything(
 
 class Command(VerboseCommand):
     help = "Runs the Juriscraper toolkit against one or many jurisdictions."
+    object_type = "opinions"  # for logging purposes
 
     def __init__(self, stdout=None, stderr=None, no_color=False):
         super().__init__(stdout=None, stderr=None, no_color=False)
@@ -261,7 +268,13 @@ def add_arguments(self, parser):
             help="Disable duplicate aborting.",
         )
 
-    def scrape_court(self, site, full_crawl=False, ocr_available=True):
+    def scrape_court(
+        self,
+        site,
+        full_crawl: bool = False,
+        ocr_available: bool = True,
+        backscrape: bool = False,
+    ):
         # Get the court object early for logging
         # opinions.united_states.federal.ca9_u --> ca9
         court_str = site.court_id.split(".")[-1].split("_")[0]
@@ -273,97 +286,116 @@ def scrape_court(self, site, full_crawl=False, ocr_available=True):
             return
 
         if site.cookies:
-            logger.info(f"Using cookies: {site.cookies}")
-        logger.debug(f"#{len(site)} opinions found.")
+            logger.info("Using cookies: %s", site.cookies)
+
+        logger.debug("#%s opinions found.", len(site))
+
         added = 0
         for i, item in enumerate(site):
-            content = get_binary_content(item["download_urls"], site)
-            if not content:
-                continue
-
-            current_date = item["case_dates"]
             try:
                 next_date = site[i + 1]["case_dates"]
             except IndexError:
                 next_date = None
 
-            # request.content is sometimes a str, sometimes unicode, so
-            # force it all to be bytes, pleasing hashlib.
-            sha1_hash = sha1(force_bytes(content))
-            if (
-                court_str == "nev"
-                and item["precedential_statuses"] == "Unpublished"
-            ) or court_str in ["neb"]:
-                # Nevada's non-precedential cases have different SHA1 sums
-                # every time.
-
-                # Nebraska updates the pdf causing the SHA1 to not match
-                # the opinions in CL causing duplicates. See CL issue #1452
-
-                lookup_params = {
-                    "lookup_value": item["download_urls"],
-                    "lookup_by": "download_url",
-                }
-            else:
-                lookup_params = {
-                    "lookup_value": sha1_hash,
-                    "lookup_by": "sha1",
-                }
-
-            proceed = dup_checker.press_on(
-                Opinion, current_date, next_date, **lookup_params
-            )
-            if dup_checker.emulate_break:
-                logger.debug("Emulate break triggered.")
+            try:
+                self.ingest_a_case(
+                    item, next_date, ocr_available, site, dup_checker, court
+                )
+                added += 1
+            except ConsecutiveDuplicatesError:
                 break
-            if not proceed:
-                logger.debug("Skipping opinion.")
-                continue
-
-            # Not a duplicate, carry on
-            logger.info(
-                f"Adding new document found at: {item['download_urls'].encode()}"
-            )
-            dup_checker.reset()
-
-            child_court = get_child_court(
-                item.get("child_courts", ""), court.id
-            )
-
-            docket, opinion, cluster, citations = make_objects(
-                item, child_court or court, sha1_hash, content
-            )
-
-            save_everything(
-                items={
-                    "docket": docket,
-                    "opinion": opinion,
-                    "cluster": cluster,
-                    "citations": citations,
-                },
-                index=False,
-            )
-            extract_doc_content.delay(
-                opinion.pk,
-                ocr_available=ocr_available,
-                citation_jitter=True,
-                juriscraper_module=site.court_id,
-            )
-
-            logger.info(
-                f"Successfully added opinion {opinion.pk}: "
-                f"{item['case_names'].encode()}"
-            )
-            added += 1
+            except (SingleDuplicateError, BadContentError):
+                pass
 
         # Update the hash if everything finishes properly.
         logger.debug(
-            f"{site.court_id}: Successfully crawled {added}/{len(site)} opinions."
+            "%s: Successfully crawled %s/%s %s.",
+            site.court_id,
+            added,
+            len(site),
+            self.object_type,
         )
         if not full_crawl:
             # Only update the hash if no errors occurred.
             dup_checker.update_site_hash(site.hash)
 
+    def ingest_a_case(
+        self,
+        item,
+        next_case_date: date | None,
+        ocr_available: bool,
+        site,
+        dup_checker: DupChecker,
+        court: Court,
+    ):
+        if item.get("content"):
+            content = item.pop("content")
+        else:
+            content = get_binary_content(item["download_urls"], site)
+
+        # request.content is sometimes a str, sometimes unicode, so
+        # force it all to be bytes, pleasing hashlib.
+        sha1_hash = sha1(force_bytes(content))
+
+        if (
+            court.pk == "nev"
+            and item["precedential_statuses"] == "Unpublished"
+        ) or court.pk in ["neb"]:
+            # Nevada's non-precedential cases have different SHA1 sums
+            # every time.
+
+            # Nebraska updates the pdf causing the SHA1 to not match
+            # the opinions in CL causing duplicates. See CL issue #1452
+
+            lookup_params = {
+                "lookup_value": item["download_urls"],
+                "lookup_by": "download_url",
+            }
+        else:
+            lookup_params = {
+                "lookup_value": sha1_hash,
+                "lookup_by": "sha1",
+            }
+
+        # Duplicates will raise errors
+        dup_checker.press_on(
+            Opinion, item["case_dates"], next_case_date, **lookup_params
+        )
+
+        # Not a duplicate, carry on
+        logger.info(
+            "Adding new document found at: %s", item["download_urls"].encode()
+        )
+        dup_checker.reset()
+
+        child_court = get_child_court(item.get("child_courts", ""), court.id)
+
+        docket, opinion, cluster, citations = make_objects(
+            item, child_court or court, sha1_hash, content
+        )
+
+        save_everything(
+            items={
+                "docket": docket,
+                "opinion": opinion,
+                "cluster": cluster,
+                "citations": citations,
+            },
+            index=False,
+        )
+        extract_doc_content.delay(
+            opinion.pk,
+            ocr_available=ocr_available,
+            citation_jitter=True,
+            juriscraper_module=site.court_id,
+        )
+
+        logger.info(
+            "Successfully added opinion %s: %s",
+            opinion.pk,
+            item["case_names"].encode(),
+        )
+
     def parse_and_scrape_site(self, mod, options: dict):
         site = mod.Site().parse()
         self.scrape_court(site, options["full_crawl"])
@@ -407,6 +439,7 @@ def handle(self, *args, **options):
                 capture_exception(
                     e, fingerprint=[module_string, "{{ default }}"]
                 )
+                logger.debug(traceback.format_exc())
             last_court_in_list = i == (num_courts - 1)
             daemon_mode = options["daemon"]
             if last_court_in_list:
diff --git a/cl/scrapers/management/commands/cl_scrape_oral_arguments.py b/cl/scrapers/management/commands/cl_scrape_oral_arguments.py
index a3d36ed7bb..a2f09dbae4 100644
--- a/cl/scrapers/management/commands/cl_scrape_oral_arguments.py
+++ b/cl/scrapers/management/commands/cl_scrape_oral_arguments.py
@@ -106,75 +106,47 @@ def make_objects(
 
 
 class Command(cl_scrape_opinions.Command):
-    def scrape_court(
+    object_type = "oral arguments"
+
+    def ingest_a_case(
         self,
+        item,
+        next_case_date: date | None,
+        ocr_available: bool,
         site,
-        full_crawl: bool = False,
+        dup_checker: DupChecker,
+        court: Court,
         backscrape: bool = False,
-    ) -> None:
-        # Get the court object early for logging
-        # opinions.united_states.federal.ca9_u --> ca9
-        court_str = site.court_id.split(".")[-1].split("_")[0]
-        court = Court.objects.get(pk=court_str)
-
-        dup_checker = DupChecker(court, full_crawl=full_crawl)
-        abort = dup_checker.abort_by_url_hash(site.url, site.hash)
-        if abort:
-            return
-
-        if site.cookies:
-            logger.info(f"Using cookies: {site.cookies}")
-        for i, item in enumerate(site):
-            content = get_binary_content(item["download_urls"], site)
-            if not content:
-                continue
-
-            current_date = item["case_dates"]
-            try:
-                next_date = site[i + 1]["case_dates"]
-            except IndexError:
-                next_date = None
-
-            # request.content is sometimes a str, sometimes unicode, so
-            # force it all to be bytes, pleasing hashlib.
-            sha1_hash = sha1(force_bytes(content))
-            onwards = dup_checker.press_on(
-                Audio,
-                current_date,
-                next_date,
-                lookup_value=sha1_hash,
-                lookup_by="sha1",
-            )
-            if dup_checker.emulate_break:
-                break
-
-            if onwards:
-                # Not a duplicate, carry on
-                logger.info(
-                    f"Adding new document found at: {item['download_urls'].encode()}"
-                )
-                dup_checker.reset()
-
-                docket, audio_file = make_objects(
-                    item, court, sha1_hash, content
-                )
-
-                save_everything(
-                    items={"docket": docket, "audio_file": audio_file},
-                    index=False,
-                    backscrape=backscrape,
-                )
-                process_audio_file.delay(audio_file.pk)
-
-                logger.info(
-                    "Successfully added audio file {pk}: {name}".format(
-                        pk=audio_file.pk,
-                        name=item["case_names"].encode(),
-                    )
-                )
-
-        # Update the hash if everything finishes properly.
-        logger.info(f"{site.court_id}: Successfully crawled oral arguments.")
-        if not full_crawl:
-            # Only update the hash if no errors occurred.
-            dup_checker.update_site_hash(site.hash)
+    ):
+        content = get_binary_content(item["download_urls"], site)
+        # request.content is sometimes a str, sometimes unicode, so
+        # force it all to be bytes, pleasing hashlib.
+        sha1_hash = sha1(force_bytes(content))
+
+        dup_checker.press_on(
+            Audio,
+            item["case_dates"],
+            next_case_date,
+            lookup_value=sha1_hash,
+            lookup_by="sha1",
+        )
+
+        logger.info(
+            "Adding new document found at: %s", item["download_urls"].encode()
+        )
+        dup_checker.reset()
+
+        docket, audio_file = make_objects(item, court, sha1_hash, content)
+
+        save_everything(
+            items={"docket": docket, "audio_file": audio_file},
+            index=False,
+            backscrape=backscrape,
+        )
+        process_audio_file.delay(audio_file.pk)
+
+        logger.info(
+            "Successfully added audio file %s: %s",
+            audio_file.pk,
+            item["case_names"].encode(),
+        )
diff --git a/cl/scrapers/tests.py b/cl/scrapers/tests.py
index ac6e5de0f7..5f3136701a 100644
--- a/cl/scrapers/tests.py
+++ b/cl/scrapers/tests.py
@@ -21,6 +21,11 @@
 from cl.lib.microservice_utils import microservice
 from cl.lib.test_helpers import generate_docket_target_sources
 from cl.scrapers.DupChecker import DupChecker
+from cl.scrapers.exceptions import (
+    ConsecutiveDuplicatesError,
+    SingleDuplicateError,
+    UnexpectedContentTypeError,
+)
 from cl.scrapers.management.commands import (
     cl_back_scrape_citations,
     cl_scrape_opinions,
@@ -448,26 +453,21 @@ def test_press_on_with_an_empty_database(self) -> None:
         site = test_opinion_scraper.Site()
         site.hash = "this is a dummy hash code string"
         for dup_checker in self.dup_checkers:
-            onwards = dup_checker.press_on(
-                Opinion,
-                now(),
-                now() - timedelta(days=1),
-                lookup_value="content",
-                lookup_by="sha1",
-            )
-            if dup_checker.full_crawl:
-                self.assertTrue(
-                    onwards,
-                    "DupChecker says to abort during a full crawl. This should "
-                    "never happen.",
-                )
-            elif dup_checker.full_crawl is False:
-                count = Opinion.objects.all().count()
-                self.assertTrue(
-                    onwards,
-                    "DupChecker says to abort on dups when the database has %s "
-                    "Documents." % count,
+            try:
+                dup_checker.press_on(
+                    Opinion,
+                    now(),
+                    now() - timedelta(days=1),
+                    lookup_value="content",
+                    lookup_by="sha1",
                 )
+            except (SingleDuplicateError, ConsecutiveDuplicatesError):
+                if dup_checker.full_crawl:
+                    failure = "DupChecker says to abort during a full crawl. This should never happen."
+                else:
+                    count = Opinion.objects.all().count()
+                    failure = f"DupChecker says to abort on dups when the database has {count} Documents."
+                self.fail(failure)
 
 
 class DupcheckerWithFixturesTest(TestCase):
@@ -492,78 +492,54 @@ def setUp(self) -> None:
 
     def test_press_on_with_a_dup_found(self) -> None:
         for dup_checker in self.dup_checkers:
-            onwards = dup_checker.press_on(
-                Opinion,
-                now(),
-                now(),
-                lookup_value=self.content_hash,
-                lookup_by="sha1",
-            )
-            if dup_checker.full_crawl:
-                self.assertFalse(
-                    onwards,
-                    "DupChecker returned True during a full crawl, but there "
-                    "should be duplicates in the database.",
-                )
-                self.assertFalse(
-                    dup_checker.emulate_break,
-                    "DupChecker said to emulate a break during a full crawl. "
-                    "Nothing should stop a full crawl!",
-                )
-
-            elif dup_checker.full_crawl is False:
-                self.assertFalse(
-                    onwards,
-                    "DupChecker returned %s but there should be a duplicate in "
-                    "the database. dup_count is %s, and dup_threshold is %s"
-                    % (
-                        onwards,
-                        dup_checker.dup_count,
-                        dup_checker.dup_threshold,
-                    ),
-                )
-                self.assertTrue(
-                    dup_checker.emulate_break,
-                    "We should have hit a break but didn't.",
+            try:
+                dup_checker.press_on(
+                    Opinion,
+                    now(),
+                    now(),
+                    lookup_value=self.content_hash,
+                    lookup_by="sha1",
                 )
+                if not dup_checker.full_crawl:
+                    self.fail("Did not raise ConsecutiveDuplicatesError.")
+            except ConsecutiveDuplicatesError:
+                if dup_checker.full_crawl:
+                    self.fail(
+                        "DupChecker raised ConsecutiveDuplicatesError breaking"
+                        " the outer loop. Nothing should stop a full crawl!"
+                    )
+            except SingleDuplicateError:
+                # Full crawl or not, a SingleDuplicateError is
+                # expected when a duplicate is found
+                pass
 
     def test_press_on_with_dup_found_and_older_date(self) -> None:
         for dup_checker in self.dup_checkers:
             # Note that the next case occurs prior to the current one
-            onwards = dup_checker.press_on(
-                Opinion,
-                now(),
-                now() - timedelta(days=1),
-                lookup_value=self.content_hash,
-                lookup_by="sha1",
-            )
-            if dup_checker.full_crawl:
-                self.assertFalse(
-                    onwards,
-                    "DupChecker returned True during a full crawl, but there "
-                    "should be duplicates in the database.",
+            try:
+                dup_checker.press_on(
+                    Opinion,
+                    now(),
+                    now() - timedelta(days=1),
+                    lookup_value=self.content_hash,
+                    lookup_by="sha1",
                 )
-                self.assertFalse(
-                    dup_checker.emulate_break,
-                    "DupChecker said to emulate a break during a full crawl. "
-                    "Nothing should stop a full crawl!",
-                )
-            else:
-                self.assertFalse(
-                    onwards,
-                    "DupChecker returned %s but there should be a duplicate in "
-                    "the database. dup_count is %s, and dup_threshold is %s"
-                    % (
-                        onwards,
-                        dup_checker.dup_count,
-                        dup_checker.dup_threshold,
-                    ),
-                )
-                self.assertTrue(
-                    dup_checker.emulate_break,
-                    "We should have hit a break but didn't.",
+                self.fail(
+                    "Expected raising SingleDuplicateError, there was a duplicate in the DB"
                 )
 
+            except SingleDuplicateError:
+                # Full crawl or not, a SingleDuplicateError is
+                # expected when a duplicate is found
+                pass
+            except ConsecutiveDuplicatesError:
+                if dup_checker.full_crawl:
+                    self.fail(
+                        "DupChecker raised ConsecutiveDuplicatesError during a "
+                        "full crawl, breaking the outer loop. Nothing should "
+                        "stop a full crawl!"
+                    )
+
 
 class AudioFileTaskTest(TestCase):
     @classmethod
@@ -641,10 +617,11 @@ def test_unexpected_content_type(self, mock_get):
         """Test when content type doesn't match scraper expectation."""
         mock_get.return_value = self.mock_response
         self.site.expected_content_types = ["text/html"]
-        with mock.patch.object(self.logger, "error") as error_mock:
-            get_binary_content("/dummy/url/", self.site)
-        self.assertIn(
-            "UnexpectedContentTypeError:", error_mock.call_args_list[0][0][0]
+        self.assertRaises(
+            UnexpectedContentTypeError,
+            get_binary_content,
+            "/dummy/url/",
+            self.site,
         )
 
     @mock.patch("requests.Session.get")
diff --git a/cl/scrapers/utils.py b/cl/scrapers/utils.py
index 445797ce45..311ccaf221 100644
--- a/cl/scrapers/utils.py
+++ b/cl/scrapers/utils.py
@@ -21,6 +21,11 @@
 from cl.lib.decorators import retry
 from cl.lib.microservice_utils import microservice
 from cl.recap.mergers import find_docket_object
+from cl.scrapers.exceptions import (
+    EmptyFileError,
+    NoDownloadUrlError,
+    UnexpectedContentTypeError,
+)
 from cl.scrapers.tasks import extract_recap_pdf
 from cl.search.models import Court, Docket, RECAPDocument
 
@@ -155,26 +160,18 @@ def get_extension(content: bytes) -> str:
 def get_binary_content(
     download_url: str,
     site: AbstractSite,
-) -> Optional[bytes | str]:
+) -> bytes | str:
     """Downloads the file, covering a few special cases such as invalid SSL
     certificates and empty file errors.
 
     :param download_url: The URL for the item you wish to download.
     :param site: Site object used to download data
 
-    :return: One of:
-        - None if there was no URL, if the downloaded file was empty or if
-            the content type did not match the expected types
-        - The downloaded and cleaned content
+    :return: The downloaded and cleaned content
+    :raises: NoDownloadUrlError, UnexpectedContentTypeError, EmptyFileError
     """
-    court_str = site.court_id.split(".")[-1].split("_")[0]
-    fingerprint = [f"{court_str}-unexpected-content-type"]
-
     if not download_url:
-        # Occurs when a DeferredList fetcher fails.
-        error = f"NoDownloadUrlError: {download_url}\n{traceback.format_exc()}"
-        logger.error(error, extra={"fingerprint": fingerprint})
-        return
+        raise NoDownloadUrlError(download_url)
 
     # noinspection PyBroadException
     if site.method == "LOCAL":
@@ -207,9 +204,7 @@ def get_binary_content(
 
         # test for empty files (thank you CA1)
         if len(r.content) == 0:
-            error = f"EmptyFileError: {download_url}\n{traceback.format_exc()}"
-            logger.error(error, extra={"fingerprint": fingerprint})
-            return
+            raise EmptyFileError(f"EmptyFileError: '{download_url}'")
 
         # test for expected content type (thanks mont for nil)
         if site.expected_content_types:
@@ -224,12 +219,10 @@ def get_binary_content(
             )
 
             if not m:
-                error = (
-                    f"UnexpectedContentTypeError: {download_url}\n"
-                    f'\'"{content_type}" not in {site.expected_content_types}'
-                )
-                logger.error(error, extra={"fingerprint": fingerprint})
-                return
+                court_str = site.court_id.split(".")[-1].split("_")[0]
+                fingerprint = [f"{court_str}-unexpected-content-type"]
+                msg = f"'{download_url}' '{content_type}' not in {site.expected_content_types}"
+                raise UnexpectedContentTypeError(msg, fingerprint=fingerprint)
 
         # test for and follow meta redirects
         r = follow_redirections(r, s)

From ace253e1017a2800a4925e96fa53afd05cdc781e Mon Sep 17 00:00:00 2001
From: Gianfranco Rossi <gianfranco.rossi.r@gmail.com>
Date: Wed, 21 Aug 2024 12:27:59 -0500
Subject: [PATCH 8/8] refactor(cl_scrape_opinions): renamed variable used in
 logging

---
 cl/scrapers/management/commands/cl_back_scrape_citations.py | 2 ++
 .../management/commands/cl_back_scrape_oral_arguments.py    | 2 +-
 cl/scrapers/management/commands/cl_scrape_opinions.py       | 6 +++---
 cl/scrapers/management/commands/cl_scrape_oral_arguments.py | 6 ++++--
 4 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/cl/scrapers/management/commands/cl_back_scrape_citations.py b/cl/scrapers/management/commands/cl_back_scrape_citations.py
index 8975461217..b2da0a4581 100644
--- a/cl/scrapers/management/commands/cl_back_scrape_citations.py
+++ b/cl/scrapers/management/commands/cl_back_scrape_citations.py
@@ -23,6 +23,8 @@
 
 
 class Command(cl_back_scrape_opinions.Command):
+    scrape_target_descr = "citations"
+
     def scrape_court(
         self,
         site,
diff --git a/cl/scrapers/management/commands/cl_back_scrape_oral_arguments.py b/cl/scrapers/management/commands/cl_back_scrape_oral_arguments.py
index b1105f01e0..299a091597 100644
--- a/cl/scrapers/management/commands/cl_back_scrape_oral_arguments.py
+++ b/cl/scrapers/management/commands/cl_back_scrape_oral_arguments.py
@@ -5,7 +5,7 @@
 
 
 class Command(cl_scrape_oral_arguments.Command):
-    def parse_and_scrape_site(self, mod, full_crawl):
+    def parse_and_scrape_site(self, mod, options: dict):
         court_str = mod.__name__.split(".")[-1].split("_")[0]
         logger.info(f'Using court_str: "{court_str}"')
 
diff --git a/cl/scrapers/management/commands/cl_scrape_opinions.py b/cl/scrapers/management/commands/cl_scrape_opinions.py
index f1547d8f7d..9e7854824b 100644
--- a/cl/scrapers/management/commands/cl_scrape_opinions.py
+++ b/cl/scrapers/management/commands/cl_scrape_opinions.py
@@ -219,7 +219,7 @@ def save_everything(
 
 class Command(VerboseCommand):
     help = "Runs the Juriscraper toolkit against one or many jurisdictions."
-    object_type = "opinions"  # for logging purposes
+    scrape_target_descr = "opinions"  # for logging purposes
 
     def __init__(self, stdout=None, stderr=None, no_color=False):
         super().__init__(stdout=None, stderr=None, no_color=False)
@@ -288,7 +288,7 @@ def scrape_court(
         if site.cookies:
             logger.info("Using cookies: %s", site.cookies)
 
-        logger.debug("#%s opinions found.", len(site))
+        logger.debug("#%s %s found.", len(site), self.scrape_target_descr)
 
         added = 0
         for i, item in enumerate(site):
@@ -313,7 +313,7 @@ def scrape_court(
             site.court_id,
             added,
             len(site),
-            self.object_type,
+            self.scrape_target_descr,
         )
         if not full_crawl:
             # Only update the hash if no errors occurred.
diff --git a/cl/scrapers/management/commands/cl_scrape_oral_arguments.py b/cl/scrapers/management/commands/cl_scrape_oral_arguments.py
index a2f09dbae4..33da02925d 100644
--- a/cl/scrapers/management/commands/cl_scrape_oral_arguments.py
+++ b/cl/scrapers/management/commands/cl_scrape_oral_arguments.py
@@ -106,7 +106,7 @@ def make_objects(
 
 
 class Command(cl_scrape_opinions.Command):
-    object_type = "oral arguments"
+    scrape_target_descr = "oral arguments"
 
     def ingest_a_case(
         self,
@@ -132,7 +132,9 @@ def ingest_a_case(
         )
 
         logger.info(
-            "Adding new document found at: %s", item["download_urls"].encode()
+            "Adding new %s found at: %s",
+            self.scrape_target_descr,
+            item["download_urls"].encode(),
         )
         dup_checker.reset()