From 49aa2b7095dda0614423302c4dcc259c5dc2e1db Mon Sep 17 00:00:00 2001 From: Gianfranco Rossi Date: Mon, 12 Aug 2024 22:36:08 -0500 Subject: [PATCH 1/8] refactor(scrapers.utils.get_binary_content) - Remove "method" argument - Log errors instead of returning the error message - Return the cleaned up content using site.cleanup_content - Update tests - Update opinions and oral arguments scraper caller to reflect changes --- .../management/commands/cl_scrape_opinions.py | 12 ++----- .../commands/cl_scrape_oral_arguments.py | 12 ++----- cl/scrapers/tests.py | 24 +++++++------- cl/scrapers/utils.py | 33 +++++++++++-------- 4 files changed, 36 insertions(+), 45 deletions(-) diff --git a/cl/scrapers/management/commands/cl_scrape_opinions.py b/cl/scrapers/management/commands/cl_scrape_opinions.py index 59b4626a15..a147da1182 100644 --- a/cl/scrapers/management/commands/cl_scrape_opinions.py +++ b/cl/scrapers/management/commands/cl_scrape_opinions.py @@ -277,18 +277,10 @@ def scrape_court(self, site, full_crawl=False, ocr_available=True): logger.debug(f"#{len(site)} opinions found.") added = 0 for i, item in enumerate(site): - msg, r = get_binary_content( - item["download_urls"], - site, - method=site.method, - ) - if msg: - fingerprint = [f"{court_str}-unexpected-content-type"] - logger.error(msg, extra={"fingerprint": fingerprint}) + content = get_binary_content(item["download_urls"], site) + if not content: continue - content = site.cleanup_content(r.content) - current_date = item["case_dates"] try: next_date = site[i + 1]["case_dates"] diff --git a/cl/scrapers/management/commands/cl_scrape_oral_arguments.py b/cl/scrapers/management/commands/cl_scrape_oral_arguments.py index ab19cac0f5..a3d36ed7bb 100644 --- a/cl/scrapers/management/commands/cl_scrape_oral_arguments.py +++ b/cl/scrapers/management/commands/cl_scrape_oral_arguments.py @@ -125,18 +125,10 @@ def scrape_court( if site.cookies: logger.info(f"Using cookies: {site.cookies}") for i, item in enumerate(site): - msg, r = get_binary_content( - item["download_urls"], - site, - method=site.method, - ) - if msg: - fingerprint = [f"{court_str}-unexpected-content-type"] - logger.error(msg, extra={"fingerprint": fingerprint}) + content = get_binary_content(item["download_urls"], site) + if not content: continue - content = site.cleanup_content(r.content) - current_date = item["case_dates"] try: next_date = site[i + 1]["case_dates"] diff --git a/cl/scrapers/tests.py b/cl/scrapers/tests.py index dba2479e45..497a491d4d 100644 --- a/cl/scrapers/tests.py +++ b/cl/scrapers/tests.py @@ -633,8 +633,9 @@ def test_unexpected_content_type(self, mock_get): mock_get.return_value = self.mock_response self.site.expected_content_types = ["text/html"] - msg, _ = get_binary_content("/dummy/url/", self.site) - self.assertIn("UnexpectedContentTypeError:", msg) + with self.assertLogs(level="ERROR") as cm: + get_binary_content("/dummy/url/", self.site) + self.assertIn("UnexpectedContentTypeError:", cm.output[0]) @mock.patch("requests.Session.get") def test_correct_content_type(self, mock_get): @@ -642,15 +643,14 @@ def test_correct_content_type(self, mock_get): mock_get.return_value = self.mock_response self.site.expected_content_types = ["application/pdf"] - msg, _ = get_binary_content("/dummy/url/", self.site) - self.assertEqual("", msg) + with self.assertNoLogs(level="ERROR"): + _ = get_binary_content("/dummy/url/", self.site) - self.mock_response.headers = { - "Content-Type": "application/pdf;charset=utf-8" - } - mock_get.return_value = self.mock_response - msg, _ = get_binary_content("/dummy/url/", self.site) - self.assertEqual("", msg) + self.mock_response.headers = { + "Content-Type": "application/pdf;charset=utf-8" + } + mock_get.return_value = self.mock_response + _ = get_binary_content("/dummy/url/", self.site) @mock.patch("requests.Session.get") def test_no_content_type(self, mock_get): @@ -658,5 +658,5 @@ def test_no_content_type(self, mock_get): mock_get.return_value = self.mock_response self.site.expected_content_types = None - msg, _ = get_binary_content("/dummy/url/", self.site) - self.assertEqual("", msg) + with self.assertNoLogs(level="ERROR"): + _ = get_binary_content("/dummy/url/", self.site) diff --git a/cl/scrapers/utils.py b/cl/scrapers/utils.py index b5e9fe51c1..15ef2bda7e 100644 --- a/cl/scrapers/utils.py +++ b/cl/scrapers/utils.py @@ -155,25 +155,29 @@ def get_extension(content: bytes) -> str: def get_binary_content( download_url: str, site: AbstractSite, - method: str = "GET", -) -> Tuple[str, Optional[Response]]: +) -> Optional[bytes | str]: """Downloads the file, covering a few special cases such as invalid SSL certificates and empty file errors. :param download_url: The URL for the item you wish to download. :param site: Site object used to download data - :param method: The HTTP method used to get the item, or "LOCAL" to get an - item during testing + :return: Two values. The first is a msg indicating any errors encountered. If blank, that indicates success. The second value is the response object containing the downloaded file. """ + court_str = site.court_id.split(".")[-1].split("_")[0] + fingerprint = [f"{court_str}-unexpected-content-type"] + if not download_url: # Occurs when a DeferredList fetcher fails. - msg = f"NoDownloadUrlError: {download_url}\n{traceback.format_exc()}" - return msg, None + error = f"NoDownloadUrlError: {download_url}\n{traceback.format_exc()}" + logger.error(error, extra={"fingerprint": fingerprint}) + return + # noinspection PyBroadException - if method == "LOCAL": + if site.method == "LOCAL": + # "LOCAL" is the method when testing url = os.path.join(settings.MEDIA_ROOT, download_url) mr = MockRequest(url=url) r = mr.get() @@ -203,8 +207,9 @@ def get_binary_content( # test for empty files (thank you CA1) if len(r.content) == 0: - msg = f"EmptyFileError: {download_url}\n{traceback.format_exc()}" - return msg, None + error = f"EmptyFileError: {download_url}\n{traceback.format_exc()}" + logger.error(error, extra={"fingerprint": fingerprint}) + return # test for expected content type (thanks mont for nil) if site.expected_content_types: @@ -218,18 +223,20 @@ def get_binary_content( for mime in site.expected_content_types ) if not m: - msg = ( + error = ( f"UnexpectedContentTypeError: {download_url}\n" f'\'"{content_type}" not in {site.expected_content_types}' ) - return msg, None + logger.error(error, extra={"fingerprint": fingerprint}) + return # test for and follow meta redirects r = follow_redirections(r, s) r.raise_for_status() - # Success! - return "", r + content = site.cleanup_content(r.content) + + return content def signal_handler(signal, frame): From 5fb7ba9ab2c4ab813fc8c062dec812286a5d0e51 Mon Sep 17 00:00:00 2001 From: Gianfranco Rossi Date: Mon, 12 Aug 2024 22:36:38 -0500 Subject: [PATCH 2/8] feat(cl_back_scrape_citations): command to scrape citations --- .../commands/cl_back_scrape_citations.py | 115 ++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 cl/scrapers/management/commands/cl_back_scrape_citations.py diff --git a/cl/scrapers/management/commands/cl_back_scrape_citations.py b/cl/scrapers/management/commands/cl_back_scrape_citations.py new file mode 100644 index 0000000000..43fa7a3f34 --- /dev/null +++ b/cl/scrapers/management/commands/cl_back_scrape_citations.py @@ -0,0 +1,115 @@ +""" +When opinions are first published on the courts' sites, they won't have +all their citations assigned. Some courts will publish the citations +in the same pages we scrape, but months later + +This command re-uses the (back)scraper we use to get opinions, to get +the lagged citations and associate them with the Opinions we first +downloaded. If we find an Opinion we don't have in the database, +we ingest it as in a regular scrape +""" + +from django.utils.encoding import force_bytes + +from cl.lib.crypto import sha1 +from cl.scrapers.management.commands import cl_back_scrape_opinions +from cl.scrapers.management.commands.cl_scrape_opinions import make_citation +from cl.scrapers.utils import get_binary_content +from cl.search.models import Opinion, Citation, OpinionCluster +from cl.lib.command_utils import logger + + +class Command(cl_back_scrape_opinions.Command): + def scrape_court(self, site, full_crawl=False, ocr_available=True): + """ + If the scraped case has citation data + Check for Opinion existance via content hash + If we have the Opinion + if we don't have the citation -> ingest + if we already have the citation -> pass + If we don't have the Opinion + ingest the opinion with it's citation, that is to say, + use the regular scraping process! + + :param site: scraper object that has already downloaded + it's case data + """ + missing_opinions = [] + court_str = site.court_id.split(".")[-1].split("_")[0] + + for case in site: + citation = case.get("citation") + parallel_citation = case.get("parallel_citation") + if not citation and not parallel_citation: + continue + + content = get_binary_content(case["download_urls"], site) + if not content: + # Errors are logged by get_binary_content itself + continue + sha1_hash = sha1(force_bytes(content)) + + try: + cluster = Opinion.objects.get(sha1=sha1_hash).cluster + except Opinion.DoesNotExist: + missing_opinions.append(case) + logger.info( + "Opinion with URL '%s' does not exist. Has citation '%s'. Will try to ingest all objects", + case["download_urls"], + citation or parallel_citation, + ) + continue + + for cite in [citation, parallel_citation]: + if not cite: + continue + + citation_candidate = make_citation(cite, cluster, court_str) + if not citation_candidate: + continue + + if self.citation_is_duplicated( + citation_candidate, cluster, cite + ): + continue + + logger.info("Saving citation %s for cluster %s", cite, cluster) + citation_candidate.save() + + # We don't have these opinions. Since we are backscraping, if the citation + # exists, it will be in the case dictionary, and will be saved in a + # regular ingestion process + if missing_opinions: + site.cases = missing_opinions + super().scrape_court(site, full_crawl=True) + + def citation_is_duplicated( + self, citation_candidate: Citation, cluster: OpinionCluster, cite: str + ) -> bool: + """Checks for exact or reporter duplication of citation in the cluster + """ + citation_params = citation_candidate.__dict__ + citation_params.pop("_state", "") + citation_params.pop("id", "") + + # Exact duplication + if Citation.objects.filter(**citation_params).exists(): + logger.info( + "Citation '%s' already exists for cluster %s", + cite, + cluster.id, + ) + return True + + # Duplication in the same reporter + if Citation.objects.filter( + cluster_id=cluster.id, reporter=citation_candidate.reporter + ).exists(): + logger.info( + "Another citation in the same reporter '%s' exists for cluster %s", + citation_candidate.reporter, + cluster.id, + ) + return True + + return False From 22945159f325391ac773ebf5afdb1541076265c8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 13 Aug 2024 03:39:02 +0000 Subject: [PATCH 3/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../management/commands/cl_back_scrape_citations.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cl/scrapers/management/commands/cl_back_scrape_citations.py b/cl/scrapers/management/commands/cl_back_scrape_citations.py index 43fa7a3f34..8fd59843c8 100644 --- a/cl/scrapers/management/commands/cl_back_scrape_citations.py +++ b/cl/scrapers/management/commands/cl_back_scrape_citations.py @@ -11,12 +11,12 @@ from django.utils.encoding import force_bytes +from cl.lib.command_utils import logger from cl.lib.crypto import sha1 from cl.scrapers.management.commands import cl_back_scrape_opinions from cl.scrapers.management.commands.cl_scrape_opinions import make_citation from cl.scrapers.utils import get_binary_content -from cl.search.models import Opinion, Citation, OpinionCluster -from cl.lib.command_utils import logger +from cl.search.models import Citation, Opinion, OpinionCluster class Command(cl_back_scrape_opinions.Command): @@ -86,8 +86,7 @@ def scrape_court(self, site, full_crawl=False, ocr_available=True): def citation_is_duplicated( self, citation_candidate: Citation, cluster: OpinionCluster, cite: str ) -> bool: - """Checks for exact or reporter duplication of citation in the cluster - """ + """Checks for exact or reporter duplication of citation in the cluster""" citation_params = citation_candidate.__dict__ citation_params.pop("_state", "") citation_params.pop("id", "") From fac3f134c60ccb87e7fe904550a19da50376a300 Mon Sep 17 00:00:00 2001 From: Gianfranco Rossi Date: Mon, 12 Aug 2024 23:32:08 -0500 Subject: [PATCH 4/8] fix(scrapers.tests): test new logger calls inside get_binary_content --- cl/scrapers/tests.py | 16 +++++++++++----- cl/scrapers/utils.py | 4 ++-- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/cl/scrapers/tests.py b/cl/scrapers/tests.py index 497a491d4d..9749042ca0 100644 --- a/cl/scrapers/tests.py +++ b/cl/scrapers/tests.py @@ -8,6 +8,7 @@ from django.conf import settings from django.core.files.base import ContentFile from django.utils.timezone import now +from juriscraper.AbstractSite import logger from cl.alerts.factories import AlertFactory from cl.alerts.models import Alert @@ -626,16 +627,19 @@ def setUp(self): self.mock_response.content = b"not empty" self.mock_response.headers = {"Content-Type": "application/pdf"} self.site = test_opinion_scraper.Site() + self.site.method = "GET" + self.logger = logger @mock.patch("requests.Session.get") def test_unexpected_content_type(self, mock_get): """Test when content type doesn't match scraper expectation.""" mock_get.return_value = self.mock_response self.site.expected_content_types = ["text/html"] - - with self.assertLogs(level="ERROR") as cm: + with mock.patch.object(self.logger, "error") as error_mock: get_binary_content("/dummy/url/", self.site) - self.assertIn("UnexpectedContentTypeError:", cm.output[0]) + self.assertIn( + "UnexpectedContentTypeError:", error_mock.call_args_list[0][0][0] + ) @mock.patch("requests.Session.get") def test_correct_content_type(self, mock_get): @@ -643,7 +647,7 @@ def test_correct_content_type(self, mock_get): mock_get.return_value = self.mock_response self.site.expected_content_types = ["application/pdf"] - with self.assertNoLogs(level="ERROR"): + with mock.patch.object(self.logger, "error") as error_mock: _ = get_binary_content("/dummy/url/", self.site) self.mock_response.headers = { @@ -651,6 +655,7 @@ def test_correct_content_type(self, mock_get): } mock_get.return_value = self.mock_response _ = get_binary_content("/dummy/url/", self.site) + error_mock.assert_not_called() @mock.patch("requests.Session.get") def test_no_content_type(self, mock_get): @@ -658,5 +663,6 @@ def test_no_content_type(self, mock_get): mock_get.return_value = self.mock_response self.site.expected_content_types = None - with self.assertNoLogs(level="ERROR"): + with mock.patch.object(self.logger, "error") as error_mock: _ = get_binary_content("/dummy/url/", self.site) + error_mock.assert_not_called() diff --git a/cl/scrapers/utils.py b/cl/scrapers/utils.py index 15ef2bda7e..a1a2e6eab2 100644 --- a/cl/scrapers/utils.py +++ b/cl/scrapers/utils.py @@ -181,8 +181,7 @@ def get_binary_content( url = os.path.join(settings.MEDIA_ROOT, download_url) mr = MockRequest(url=url) r = mr.get() - r = follow_redirections(r, requests.Session()) - r.raise_for_status() + s = requests.Session() else: # some sites require a custom ssl_context, contained in the Site's # session. However, we can't send a request with both a @@ -222,6 +221,7 @@ def get_binary_content( content_type in mime.lower() for mime in site.expected_content_types ) + if not m: error = ( f"UnexpectedContentTypeError: {download_url}\n" From ad975d99fe62e00465072fdd4b610f0db708fe43 Mon Sep 17 00:00:00 2001 From: Gianfranco Rossi Date: Thu, 15 Aug 2024 17:07:23 -0500 Subject: [PATCH 5/8] feat(scrapers.tests): add tests for cl_back_scrape_citations Related to freelawproject/juriscraper#858 --- .../commands/cl_back_scrape_citations.py | 27 +++++--- cl/scrapers/tests.py | 62 ++++++++++++++++++- 2 files changed, 80 insertions(+), 9 deletions(-) diff --git a/cl/scrapers/management/commands/cl_back_scrape_citations.py b/cl/scrapers/management/commands/cl_back_scrape_citations.py index 8fd59843c8..11f269c14b 100644 --- a/cl/scrapers/management/commands/cl_back_scrape_citations.py +++ b/cl/scrapers/management/commands/cl_back_scrape_citations.py @@ -38,9 +38,13 @@ def scrape_court(self, site, full_crawl=False, ocr_available=True): court_str = site.court_id.split(".")[-1].split("_")[0] for case in site: - citation = case.get("citation") - parallel_citation = case.get("parallel_citation") + citation = case.get("citations") + parallel_citation = case.get("parallel_citations") if not citation and not parallel_citation: + logger.debug( + "No citation, skipping row for case %s", + case.get("case_names"), + ) continue content = get_binary_content(case["download_urls"], site) @@ -54,7 +58,9 @@ def scrape_court(self, site, full_crawl=False, ocr_available=True): except Opinion.DoesNotExist: missing_opinions.append(case) logger.info( - "Opinion with URL '%s' does not exist. Has citation '%s'. Will try to ingest all objects", + "Case '%s', opinion '%s' has no matching hash in the DB. " + "Has a citation '%s'. Will try to ingest all objects", + case["case_names"], case["download_urls"], citation or parallel_citation, ) @@ -80,14 +86,21 @@ def scrape_court(self, site, full_crawl=False, ocr_available=True): # exists, it will be in the case dictionary, and will be saved in a # regular ingestion process if missing_opinions: - site.cases = missing_opinions - super().scrape_court(site, full_crawl=True) + # It is easy to ingest a filtered list of cases for OpinionSiteLinear + # but not for plain OpinionSite + if hasattr(site, "cases"): + site.cases = missing_opinions + super().scrape_court(site, full_crawl=True) + else: + logger.info("Run the backscraper to collect missing opinions") def citation_is_duplicated( self, citation_candidate: Citation, cluster: OpinionCluster, cite: str ) -> bool: - """Checks for exact or reporter duplication of citation in the cluster""" - citation_params = citation_candidate.__dict__ + """Checks for exact or reporter duplication of citation in the cluster + Inspired on corpus_importer.utils.add_citations_to_cluster + """ + citation_params = {**citation_candidate.__dict__} citation_params.pop("_state", "") citation_params.pop("id", "") diff --git a/cl/scrapers/tests.py b/cl/scrapers/tests.py index 9749042ca0..0d9382615e 100644 --- a/cl/scrapers/tests.py +++ b/cl/scrapers/tests.py @@ -22,6 +22,7 @@ from cl.lib.test_helpers import generate_docket_target_sources from cl.scrapers.DupChecker import DupChecker from cl.scrapers.management.commands import ( + cl_back_scrape_citations, cl_scrape_opinions, cl_scrape_oral_arguments, ) @@ -29,8 +30,13 @@ from cl.scrapers.tasks import extract_doc_content, process_audio_file from cl.scrapers.test_assets import test_opinion_scraper, test_oral_arg_scraper from cl.scrapers.utils import get_binary_content, get_extension -from cl.search.factories import CourtFactory, DocketFactory -from cl.search.models import Court, Docket, Opinion +from cl.search.factories import ( + CourtFactory, + DocketFactory, + OpinionClusterFactory, + OpinionFactory, +) +from cl.search.models import Citation, Court, Docket, Opinion from cl.settings import MEDIA_ROOT from cl.tests.cases import ESIndexTestCase, SimpleTestCase, TestCase from cl.tests.fixtures import ONE_SECOND_MP3_BYTES, SMALL_WAV_BYTES @@ -666,3 +672,55 @@ def test_no_content_type(self, mock_get): with mock.patch.object(self.logger, "error") as error_mock: _ = get_binary_content("/dummy/url/", self.site) error_mock.assert_not_called() + + +class ScrapeCitationsTestCase(TestCase): + """This class only tests the update of existing clusters + Since the ingestion of new clusters and their citations call + super().scrape_court(), it should be tested in the superclass + """ + + def setUp(self): + keys = [ + "download_urls", + "case_names", + "citations", + "parallel_citations", + ] + self.mock_site = mock.MagicMock() + self.mock_site.__iter__.return_value = [ + # update + dict(zip(keys, ["", "something", "482 Md. 342", ""])), + # exact duplicate + dict(zip(keys, ["", "something", "", "482 Md. 342"])), + # reporter duplicate + dict(zip(keys, ["", "something", "485 Md. 111", ""])), + # no citation, ignore + dict(zip(keys, ["", "something", "", ""])), + ] + self.mock_site.court_id = "juriscraper.md" + self.hash = "1234" * 10 + self.hashes = [self.hash, self.hash, self.hash, "111"] + + court = CourtFactory(id="md") + docket = DocketFactory( + case_name="Attorney Grievance v. Taniform", + docket_number="40ag/21", + court_id="md", + source=Docket.SCRAPER, + pacer_case_id=None, + ) + self.cluster = OpinionClusterFactory(docket=docket) + opinion = OpinionFactory(sha1=self.hash, cluster=self.cluster) + + def test_citation_scraper(self): + """Test if citation scraper creates a citation or ignores duplicates""" + cmd = "cl.scrapers.management.commands.cl_back_scrape_citations" + with mock.patch(f"{cmd}.sha1", side_effect=self.hashes): + with mock.patch( + f"{cmd}.get_binary_content", return_value="placeholder" + ): + cl_back_scrape_citations.Command().scrape_court(self.mock_site) + + citations = Citation.objects.filter(cluster=self.cluster).count() + self.assertEqual(citations, 1, "Exactly 1 citation was expected") From fc3e1a1563309ac2e7c16f27664329afe7221124 Mon Sep 17 00:00:00 2001 From: Gianfranco Rossi Date: Fri, 16 Aug 2024 14:04:31 -0500 Subject: [PATCH 6/8] refactor(cl_back_scrape_citations) Reword docstrings, catch exceptions and refactor code following code review --- .../commands/cl_back_scrape_citations.py | 43 +++++++++++++------ cl/scrapers/tests.py | 11 +++-- cl/scrapers/utils.py | 7 +-- 3 files changed, 40 insertions(+), 21 deletions(-) diff --git a/cl/scrapers/management/commands/cl_back_scrape_citations.py b/cl/scrapers/management/commands/cl_back_scrape_citations.py index 11f269c14b..982abfca1a 100644 --- a/cl/scrapers/management/commands/cl_back_scrape_citations.py +++ b/cl/scrapers/management/commands/cl_back_scrape_citations.py @@ -9,6 +9,7 @@ we ingest it as in a regular scrape """ +from django.db import IntegrityError from django.utils.encoding import force_bytes from cl.lib.command_utils import logger @@ -16,7 +17,7 @@ from cl.scrapers.management.commands import cl_back_scrape_opinions from cl.scrapers.management.commands.cl_scrape_opinions import make_citation from cl.scrapers.utils import get_binary_content -from cl.search.models import Citation, Opinion, OpinionCluster +from cl.search.models import Citation, Opinion class Command(cl_back_scrape_opinions.Command): @@ -74,13 +75,20 @@ def scrape_court(self, site, full_crawl=False, ocr_available=True): if not citation_candidate: continue - if self.citation_is_duplicated( - citation_candidate, cluster, cite - ): + if self.citation_is_duplicated(citation_candidate, cite): continue - logger.info("Saving citation %s for cluster %s", cite, cluster) - citation_candidate.save() + try: + citation_candidate.save() + logger.info( + "Saved citation %s for cluster %s", cite, cluster + ) + except IntegrityError: + logger.warning( + "Error when saving citation %s for cluster %s", + cite, + cluster, + ) # We don't have these opinions. Since we are backscraping, if the citation # exists, it will be in the case dictionary, and will be saved in a @@ -95,32 +103,43 @@ def scrape_court(self, site, full_crawl=False, ocr_available=True): logger.info("Run the backscraper to collect missing opinions") def citation_is_duplicated( - self, citation_candidate: Citation, cluster: OpinionCluster, cite: str + self, citation_candidate: Citation, cite: str ) -> bool: - """Checks for exact or reporter duplication of citation in the cluster - Inspired on corpus_importer.utils.add_citations_to_cluster + """Checks if the citation is duplicated for the cluster + + Following corpus_importer.utils.add_citations_to_cluster we + identify 2 types of duplication: + - exact: a citation with the same fields already exists for the cluster + - duplication in the same reporter: the cluster already has a citation + in that reporter + + :param citation_candidate: the citation object + :param cite: citation string + + :return: True if citation is duplicated, False if not """ citation_params = {**citation_candidate.__dict__} citation_params.pop("_state", "") citation_params.pop("id", "") + cluster_id = citation_candidate.cluster.id # Exact duplication if Citation.objects.filter(**citation_params).exists(): logger.info( "Citation '%s' already exists for cluster %s", cite, - cluster.id, + cluster_id, ) return True # Duplication in the same reporter if Citation.objects.filter( - cluster_id=cluster.id, reporter=citation_candidate.reporter + cluster_id=cluster_id, reporter=citation_candidate.reporter ).exists(): logger.info( "Another citation in the same reporter '%s' exists for cluster %s", citation_candidate.reporter, - cluster.id, + cluster_id, ) return True diff --git a/cl/scrapers/tests.py b/cl/scrapers/tests.py index 0d9382615e..ac6e5de0f7 100644 --- a/cl/scrapers/tests.py +++ b/cl/scrapers/tests.py @@ -674,7 +674,7 @@ def test_no_content_type(self, mock_get): error_mock.assert_not_called() -class ScrapeCitationsTestCase(TestCase): +class ScrapeCitationsTest(TestCase): """This class only tests the update of existing clusters Since the ingestion of new clusters and their citations call super().scrape_court(), it should be tested in the superclass @@ -716,11 +716,10 @@ def setUp(self): def test_citation_scraper(self): """Test if citation scraper creates a citation or ignores duplicates""" cmd = "cl.scrapers.management.commands.cl_back_scrape_citations" - with mock.patch(f"{cmd}.sha1", side_effect=self.hashes): - with mock.patch( - f"{cmd}.get_binary_content", return_value="placeholder" - ): - cl_back_scrape_citations.Command().scrape_court(self.mock_site) + with mock.patch(f"{cmd}.sha1", side_effect=self.hashes), mock.patch( + f"{cmd}.get_binary_content", return_value="placeholder" + ): + cl_back_scrape_citations.Command().scrape_court(self.mock_site) citations = Citation.objects.filter(cluster=self.cluster).count() self.assertEqual(citations, 1, "Exactly 1 citation was expected") diff --git a/cl/scrapers/utils.py b/cl/scrapers/utils.py index a1a2e6eab2..445797ce45 100644 --- a/cl/scrapers/utils.py +++ b/cl/scrapers/utils.py @@ -162,9 +162,10 @@ def get_binary_content( :param download_url: The URL for the item you wish to download. :param site: Site object used to download data - :return: Two values. The first is a msg indicating any errors encountered. - If blank, that indicates success. The second value is the response object - containing the downloaded file. + :return: One of: + - None if there was no URL, if the downloaded file was empty or if + the content type did not match the expected types + - The downloaded and cleaned content """ court_str = site.court_id.split(".")[-1].split("_")[0] fingerprint = [f"{court_str}-unexpected-content-type"] From 5a738b34f33e6cfae7a21e7f3b5933af0974bbb5 Mon Sep 17 00:00:00 2001 From: Gianfranco Rossi Date: Tue, 20 Aug 2024 11:35:35 -0500 Subject: [PATCH 7/8] refactor(cl_scrape_opinions): function for ingesting a single case - create cl.scrapers.exceptions file to hold exceptions raised when ingesting a single case - use the exceptions to bubble errors to the main loop, to avoid returning break / continue flags - refactor DupChecker to raise errors - refactor get_binary_content to raise errors - refactor cl_scrape_oral_arguments to new paradigm - cl_back_scrape_citations can now re-scrape a single case without re-downloading the binary content or manipulating the site object - adapted DupChecker and ContentType tests to changes - refactores logger.calls to use lazy formatting --- cl/scrapers/DupChecker.py | 99 +++++----- cl/scrapers/exceptions.py | 82 ++++++++ .../commands/cl_back_scrape_citations.py | 40 ++-- .../management/commands/cl_scrape_opinions.py | 185 +++++++++++------- .../commands/cl_scrape_oral_arguments.py | 110 ++++------- cl/scrapers/tests.py | 151 ++++++-------- cl/scrapers/utils.py | 35 ++-- 7 files changed, 379 insertions(+), 323 deletions(-) create mode 100644 cl/scrapers/exceptions.py diff --git a/cl/scrapers/DupChecker.py b/cl/scrapers/DupChecker.py index 7d5581955e..10cac56334 100644 --- a/cl/scrapers/DupChecker.py +++ b/cl/scrapers/DupChecker.py @@ -1,5 +1,9 @@ from juriscraper.AbstractSite import logger +from cl.scrapers.exceptions import ( + ConsecutiveDuplicatesError, + SingleDuplicateError, +) from cl.scrapers.models import UrlHash from cl.search.models import Court @@ -19,7 +23,6 @@ def __init__( self.url_hash = None self.dup_count = 0 self.last_found_date = None - self.emulate_break = False super().__init__(*args, **kwargs) def _increment(self, current_date): @@ -83,29 +86,29 @@ def press_on( lookup_by="sha1", ): """Checks if a we have an `object_type` with identical content in the CL - corpus by looking up `lookup_value` in the `lookup_by` field. Depending - on the result of that, we either return True or False. True represents - the fact that the next item should be processed. False means that either - the item was a duplicate or that we've hit so many duplicates that we've - stopped checking (we hit a duplicate threshold). Either way, the caller - should move to the next item and try it. + corpus by looking up `lookup_value` in the `lookup_by` field. - The effect of this is that this emulates for loop constructs for - continue (False), break (False), return (True). + If the item is not a duplicate, we will return None, and the caller + will proceed normally + + If the item is a duplicate, we will raise SingleDuplicateError + + If the item is a duplicate following a series of duplicates greater than + our tolerance threshold, we will raise ConsecutiveDuplicatesError + + If the item is a duplicate and the next item is from an already scraped + date, we will raise ConsecutiveDuplicatesError Following logic applies: + - if we do not have the item + - early return - if we have the item already - and if the next date is before this date - or if this is our duplicate threshold is exceeded - break - otherwise - continue - - if not - - carry on """ - if self.emulate_break: - return False - # check for a duplicate in the db. if lookup_by == "sha1": exists = object_type.objects.filter(sha1=lookup_value).exists() @@ -116,41 +119,35 @@ def press_on( else: raise NotImplementedError("Unknown lookup_by parameter.") - if exists: - logger.info( - f"Duplicate found on date: {current_date}, with lookup value: {lookup_value}" - ) - self._increment(current_date) - - # If the next date in the Site object is less than (before) the - # current date, we needn't continue because we should already have - # that item. - if next_date: - already_scraped_next_date = next_date < current_date - else: - already_scraped_next_date = True - if not self.full_crawl: - if already_scraped_next_date: - if self.court.pk == "mich": - # Michigan sometimes has multiple occurrences of the - # same case with different dates on a page. - return False - else: - logger.info( - "Next case occurs prior to when we found a " - "duplicate. Court is up to date." - ) - self.emulate_break = True - return False - elif self.dup_count >= self.dup_threshold: - logger.info( - f"Found {self.dup_count} duplicates in a row. Court is up to date." - ) - self.emulate_break = True - return False - else: - # This is a full crawl. Do not emulate a break, BUT be sure to - # say that we shouldn't press on, since the item already exists. - return False + if not exists: + return + + logger.info( + f"Duplicate found on date: {current_date}, with lookup value: {lookup_value}" + ) + self._increment(current_date) + + # If the next date in the Site object is less than (before) the + # current date, we needn't continue because we should already have + # that item. + if next_date: + already_scraped_next_date = next_date < current_date + else: + already_scraped_next_date = True + + if not self.full_crawl: + if already_scraped_next_date: + if self.court.pk == "mich": + # Michigan sometimes has multiple occurrences of the + # same case with different dates on a page. + raise SingleDuplicateError(logger=logger) + else: + message = "Next case occurs prior to when we found a duplicate. Court is up to date." + raise ConsecutiveDuplicatesError(message, logger=logger) + elif self.dup_count >= self.dup_threshold: + message = f"Found {self.dup_count} duplicates in a row. Court is up to date." + raise ConsecutiveDuplicatesError(message, logger=logger) else: - return True + # This is a full crawl. Do not raise a loop breaking `ConsecutiveDuplicatesError`, + # but say that we shouldn't press on, since the item already exists. + raise SingleDuplicateError(logger=logger) diff --git a/cl/scrapers/exceptions.py b/cl/scrapers/exceptions.py new file mode 100644 index 0000000000..dcefbb80d1 --- /dev/null +++ b/cl/scrapers/exceptions.py @@ -0,0 +1,82 @@ +import logging +from typing import Optional + +from cl.lib.command_utils import logger + + +class AutoLoggingException(Exception): + """Exception with defaults for logging, to be subclassed + + We log expected exceptions to better understand what went wrong + Logger calls with level `logging.ERROR` are sent to Sentry, and + it's useful to send a `fingerprint` to force a specific grouping by court + + Other `logger` calls are just printed on the console when using a + VerboseCommand with proper verbosity levels + """ + + logging_level = logging.DEBUG + message = "" + logger = logger + + def __init__( + self, + message: str = "", + logger: Optional[logging.Logger] = None, + logging_level: Optional[int] = None, + fingerprint: Optional[list[str]] = None, + ): + if not message: + message = self.message + if not logger: + logger = self.logger + if not logging_level: + logging_level = self.logging_level + + log_kwargs = {} + if fingerprint: + log_kwargs["extra"] = {"fingerprint": fingerprint} + + logger.log(logging_level, message, **log_kwargs) + super().__init__(message) + + +class ConsecutiveDuplicatesError(AutoLoggingException): + """Occurs when consecutive `SingleDuplicateError` are found, + which may be used as a signal to break the scraping loop + """ + + message = "DupChecker emulate break triggered." + + +class SingleDuplicateError(AutoLoggingException): + """Occurs when an opinion or audio file already exists + in our database + """ + + message = "Skipping opinion due to duplicated content hash" + + +class BadContentError(AutoLoggingException): + """Parent class for errors raised when downloading binary content""" + + +class UnexpectedContentTypeError(BadContentError): + """Occurs when the content received from the server has + a different content type than the ones listed on + site.expected_content_types + """ + + logging_level = logging.ERROR + + +class NoDownloadUrlError(BadContentError): + """Occurs when a DeferredList fetcher fails.""" + + logging_level = logging.ERROR + + +class EmptyFileError(BadContentError): + """Occurs when the content of the response has lenght 0""" + + logging_level = logging.ERROR diff --git a/cl/scrapers/management/commands/cl_back_scrape_citations.py b/cl/scrapers/management/commands/cl_back_scrape_citations.py index 982abfca1a..8975461217 100644 --- a/cl/scrapers/management/commands/cl_back_scrape_citations.py +++ b/cl/scrapers/management/commands/cl_back_scrape_citations.py @@ -14,14 +14,22 @@ from cl.lib.command_utils import logger from cl.lib.crypto import sha1 +from cl.scrapers.DupChecker import DupChecker +from cl.scrapers.exceptions import BadContentError from cl.scrapers.management.commands import cl_back_scrape_opinions from cl.scrapers.management.commands.cl_scrape_opinions import make_citation from cl.scrapers.utils import get_binary_content -from cl.search.models import Citation, Opinion +from cl.search.models import Citation, Court, Opinion class Command(cl_back_scrape_opinions.Command): - def scrape_court(self, site, full_crawl=False, ocr_available=True): + def scrape_court( + self, + site, + full_crawl: bool = False, + ocr_available: bool = True, + backscrape: bool = False, + ): """ If the scraped case has citation data Check for Opinion existance via content hash @@ -35,8 +43,9 @@ def scrape_court(self, site, full_crawl=False, ocr_available=True): :param site: scraper object that has already downloaded it's case data """ - missing_opinions = [] court_str = site.court_id.split(".")[-1].split("_")[0] + court = Court.objects.get(id=court_str) + dup_checker = DupChecker(court, full_crawl=True) for case in site: citation = case.get("citations") @@ -48,16 +57,19 @@ def scrape_court(self, site, full_crawl=False, ocr_available=True): ) continue - content = get_binary_content(case["download_urls"], site) - if not content: - # Errors are logged by get_binary_content itself + try: + content = get_binary_content(case["download_urls"], site) + except BadContentError: continue + sha1_hash = sha1(force_bytes(content)) try: cluster = Opinion.objects.get(sha1=sha1_hash).cluster except Opinion.DoesNotExist: - missing_opinions.append(case) + # populate special key to avoid downloading the file again + case["content"] = content + logger.info( "Case '%s', opinion '%s' has no matching hash in the DB. " "Has a citation '%s'. Will try to ingest all objects", @@ -65,6 +77,8 @@ def scrape_court(self, site, full_crawl=False, ocr_available=True): case["download_urls"], citation or parallel_citation, ) + + self.ingest_a_case(case, None, True, site, dup_checker, court) continue for cite in [citation, parallel_citation]: @@ -90,18 +104,6 @@ def scrape_court(self, site, full_crawl=False, ocr_available=True): cluster, ) - # We don't have these opinions. Since we are backscraping, if the citation - # exists, it will be in the case dictionary, and will be saved in a - # regular ingestion process - if missing_opinions: - # It is easy to ingest a filtered list of cases for OpinionSiteLinear - # but not for plain OpinionSite - if hasattr(site, "cases"): - site.cases = missing_opinions - super().scrape_court(site, full_crawl=True) - else: - logger.info("Run the backscraper to collect missing opinions") - def citation_is_duplicated( self, citation_candidate: Citation, cite: str ) -> bool: diff --git a/cl/scrapers/management/commands/cl_scrape_opinions.py b/cl/scrapers/management/commands/cl_scrape_opinions.py index a147da1182..f1547d8f7d 100644 --- a/cl/scrapers/management/commands/cl_scrape_opinions.py +++ b/cl/scrapers/management/commands/cl_scrape_opinions.py @@ -1,6 +1,7 @@ import signal import sys import time +import traceback from datetime import date from typing import Any, Dict, List, Optional, Tuple, Union @@ -22,6 +23,11 @@ from cl.lib.string_utils import trunc from cl.people_db.lookup_utils import lookup_judges_by_messy_str from cl.scrapers.DupChecker import DupChecker +from cl.scrapers.exceptions import ( + BadContentError, + ConsecutiveDuplicatesError, + SingleDuplicateError, +) from cl.scrapers.tasks import extract_doc_content from cl.scrapers.utils import ( get_binary_content, @@ -213,6 +219,7 @@ def save_everything( class Command(VerboseCommand): help = "Runs the Juriscraper toolkit against one or many jurisdictions." + object_type = "opinions" # for logging purposes def __init__(self, stdout=None, stderr=None, no_color=False): super().__init__(stdout=None, stderr=None, no_color=False) @@ -261,7 +268,13 @@ def add_arguments(self, parser): help="Disable duplicate aborting.", ) - def scrape_court(self, site, full_crawl=False, ocr_available=True): + def scrape_court( + self, + site, + full_crawl: bool = False, + ocr_available: bool = True, + backscrape: bool = False, + ): # Get the court object early for logging # opinions.united_states.federal.ca9_u --> ca9 court_str = site.court_id.split(".")[-1].split("_")[0] @@ -273,97 +286,116 @@ def scrape_court(self, site, full_crawl=False, ocr_available=True): return if site.cookies: - logger.info(f"Using cookies: {site.cookies}") - logger.debug(f"#{len(site)} opinions found.") + logger.info("Using cookies: %s", site.cookies) + + logger.debug("#%s opinions found.", len(site)) + added = 0 for i, item in enumerate(site): - content = get_binary_content(item["download_urls"], site) - if not content: - continue - - current_date = item["case_dates"] try: next_date = site[i + 1]["case_dates"] except IndexError: next_date = None - # request.content is sometimes a str, sometimes unicode, so - # force it all to be bytes, pleasing hashlib. - sha1_hash = sha1(force_bytes(content)) - if ( - court_str == "nev" - and item["precedential_statuses"] == "Unpublished" - ) or court_str in ["neb"]: - # Nevada's non-precedential cases have different SHA1 sums - # every time. - - # Nebraska updates the pdf causing the SHA1 to not match - # the opinions in CL causing duplicates. See CL issue #1452 - - lookup_params = { - "lookup_value": item["download_urls"], - "lookup_by": "download_url", - } - else: - lookup_params = { - "lookup_value": sha1_hash, - "lookup_by": "sha1", - } - - proceed = dup_checker.press_on( - Opinion, current_date, next_date, **lookup_params - ) - if dup_checker.emulate_break: - logger.debug("Emulate break triggered.") + try: + self.ingest_a_case( + item, next_date, ocr_available, site, dup_checker, court + ) + added += 1 + except ConsecutiveDuplicatesError: break - if not proceed: - logger.debug("Skipping opinion.") - continue - - # Not a duplicate, carry on - logger.info( - f"Adding new document found at: {item['download_urls'].encode()}" - ) - dup_checker.reset() - - child_court = get_child_court( - item.get("child_courts", ""), court.id - ) - - docket, opinion, cluster, citations = make_objects( - item, child_court or court, sha1_hash, content - ) - - save_everything( - items={ - "docket": docket, - "opinion": opinion, - "cluster": cluster, - "citations": citations, - }, - index=False, - ) - extract_doc_content.delay( - opinion.pk, - ocr_available=ocr_available, - citation_jitter=True, - juriscraper_module=site.court_id, - ) - - logger.info( - f"Successfully added opinion {opinion.pk}: " - f"{item['case_names'].encode()}" - ) - added += 1 + except (SingleDuplicateError, BadContentError): + pass # Update the hash if everything finishes properly. logger.debug( - f"{site.court_id}: Successfully crawled {added}/{len(site)} opinions." + "%s: Successfully crawled %s/%s %s.", + site.court_id, + added, + len(site), + self.object_type, ) if not full_crawl: # Only update the hash if no errors occurred. dup_checker.update_site_hash(site.hash) + def ingest_a_case( + self, + item, + next_case_date: date | None, + ocr_available: bool, + site, + dup_checker: DupChecker, + court: Court, + ): + if item.get("content"): + content = item.pop("content") + else: + content = get_binary_content(item["download_urls"], site) + + # request.content is sometimes a str, sometimes unicode, so + # force it all to be bytes, pleasing hashlib. + sha1_hash = sha1(force_bytes(content)) + + if ( + court.pk == "nev" + and item["precedential_statuses"] == "Unpublished" + ) or court.pk in ["neb"]: + # Nevada's non-precedential cases have different SHA1 sums + # every time. + + # Nebraska updates the pdf causing the SHA1 to not match + # the opinions in CL causing duplicates. See CL issue #1452 + + lookup_params = { + "lookup_value": item["download_urls"], + "lookup_by": "download_url", + } + else: + lookup_params = { + "lookup_value": sha1_hash, + "lookup_by": "sha1", + } + + # Duplicates will raise errors + dup_checker.press_on( + Opinion, item["case_dates"], next_case_date, **lookup_params + ) + + # Not a duplicate, carry on + logger.info( + "Adding new document found at: %s", item["download_urls"].encode() + ) + dup_checker.reset() + + child_court = get_child_court(item.get("child_courts", ""), court.id) + + docket, opinion, cluster, citations = make_objects( + item, child_court or court, sha1_hash, content + ) + + save_everything( + items={ + "docket": docket, + "opinion": opinion, + "cluster": cluster, + "citations": citations, + }, + index=False, + ) + extract_doc_content.delay( + opinion.pk, + ocr_available=ocr_available, + citation_jitter=True, + juriscraper_module=site.court_id, + ) + + logger.info( + "Successfully added opinion %s: %s", + opinion.pk, + item["case_names"].encode(), + ) + def parse_and_scrape_site(self, mod, options: dict): site = mod.Site().parse() self.scrape_court(site, options["full_crawl"]) @@ -407,6 +439,7 @@ def handle(self, *args, **options): capture_exception( e, fingerprint=[module_string, "{{ default }}"] ) + logger.debug(traceback.format_exc()) last_court_in_list = i == (num_courts - 1) daemon_mode = options["daemon"] if last_court_in_list: diff --git a/cl/scrapers/management/commands/cl_scrape_oral_arguments.py b/cl/scrapers/management/commands/cl_scrape_oral_arguments.py index a3d36ed7bb..a2f09dbae4 100644 --- a/cl/scrapers/management/commands/cl_scrape_oral_arguments.py +++ b/cl/scrapers/management/commands/cl_scrape_oral_arguments.py @@ -106,75 +106,47 @@ def make_objects( class Command(cl_scrape_opinions.Command): - def scrape_court( + object_type = "oral arguments" + + def ingest_a_case( self, + item, + next_case_date: date | None, + ocr_available: bool, site, - full_crawl: bool = False, + dup_checker: DupChecker, + court: Court, backscrape: bool = False, - ) -> None: - # Get the court object early for logging - # opinions.united_states.federal.ca9_u --> ca9 - court_str = site.court_id.split(".")[-1].split("_")[0] - court = Court.objects.get(pk=court_str) - - dup_checker = DupChecker(court, full_crawl=full_crawl) - abort = dup_checker.abort_by_url_hash(site.url, site.hash) - if abort: - return - - if site.cookies: - logger.info(f"Using cookies: {site.cookies}") - for i, item in enumerate(site): - content = get_binary_content(item["download_urls"], site) - if not content: - continue - - current_date = item["case_dates"] - try: - next_date = site[i + 1]["case_dates"] - except IndexError: - next_date = None - - # request.content is sometimes a str, sometimes unicode, so - # force it all to be bytes, pleasing hashlib. - sha1_hash = sha1(force_bytes(content)) - onwards = dup_checker.press_on( - Audio, - current_date, - next_date, - lookup_value=sha1_hash, - lookup_by="sha1", - ) - if dup_checker.emulate_break: - break - - if onwards: - # Not a duplicate, carry on - logger.info( - f"Adding new document found at: {item['download_urls'].encode()}" - ) - dup_checker.reset() - - docket, audio_file = make_objects( - item, court, sha1_hash, content - ) - - save_everything( - items={"docket": docket, "audio_file": audio_file}, - index=False, - backscrape=backscrape, - ) - process_audio_file.delay(audio_file.pk) - - logger.info( - "Successfully added audio file {pk}: {name}".format( - pk=audio_file.pk, - name=item["case_names"].encode(), - ) - ) - - # Update the hash if everything finishes properly. - logger.info(f"{site.court_id}: Successfully crawled oral arguments.") - if not full_crawl: - # Only update the hash if no errors occurred. - dup_checker.update_site_hash(site.hash) + ): + content = get_binary_content(item["download_urls"], site) + # request.content is sometimes a str, sometimes unicode, so + # force it all to be bytes, pleasing hashlib. + sha1_hash = sha1(force_bytes(content)) + + dup_checker.press_on( + Audio, + item["case_dates"], + next_case_date, + lookup_value=sha1_hash, + lookup_by="sha1", + ) + + logger.info( + "Adding new document found at: %s", item["download_urls"].encode() + ) + dup_checker.reset() + + docket, audio_file = make_objects(item, court, sha1_hash, content) + + save_everything( + items={"docket": docket, "audio_file": audio_file}, + index=False, + backscrape=backscrape, + ) + process_audio_file.delay(audio_file.pk) + + logger.info( + "Successfully added audio file %s: %s", + audio_file.pk, + item["case_names"].encode(), + ) diff --git a/cl/scrapers/tests.py b/cl/scrapers/tests.py index ac6e5de0f7..5f3136701a 100644 --- a/cl/scrapers/tests.py +++ b/cl/scrapers/tests.py @@ -21,6 +21,11 @@ from cl.lib.microservice_utils import microservice from cl.lib.test_helpers import generate_docket_target_sources from cl.scrapers.DupChecker import DupChecker +from cl.scrapers.exceptions import ( + ConsecutiveDuplicatesError, + SingleDuplicateError, + UnexpectedContentTypeError, +) from cl.scrapers.management.commands import ( cl_back_scrape_citations, cl_scrape_opinions, @@ -448,26 +453,21 @@ def test_press_on_with_an_empty_database(self) -> None: site = test_opinion_scraper.Site() site.hash = "this is a dummy hash code string" for dup_checker in self.dup_checkers: - onwards = dup_checker.press_on( - Opinion, - now(), - now() - timedelta(days=1), - lookup_value="content", - lookup_by="sha1", - ) - if dup_checker.full_crawl: - self.assertTrue( - onwards, - "DupChecker says to abort during a full crawl. This should " - "never happen.", - ) - elif dup_checker.full_crawl is False: - count = Opinion.objects.all().count() - self.assertTrue( - onwards, - "DupChecker says to abort on dups when the database has %s " - "Documents." % count, + try: + dup_checker.press_on( + Opinion, + now(), + now() - timedelta(days=1), + lookup_value="content", + lookup_by="sha1", ) + except (SingleDuplicateError, ConsecutiveDuplicatesError): + if dup_checker.full_crawl: + failure = "DupChecker says to abort during a full crawl. This should never happen." + else: + count = Opinion.objects.all().count() + failure = f"DupChecker says to abort on dups when the database has {count} Documents." + self.fail(failure) class DupcheckerWithFixturesTest(TestCase): @@ -492,78 +492,54 @@ def setUp(self) -> None: def test_press_on_with_a_dup_found(self) -> None: for dup_checker in self.dup_checkers: - onwards = dup_checker.press_on( - Opinion, - now(), - now(), - lookup_value=self.content_hash, - lookup_by="sha1", - ) - if dup_checker.full_crawl: - self.assertFalse( - onwards, - "DupChecker returned True during a full crawl, but there " - "should be duplicates in the database.", - ) - self.assertFalse( - dup_checker.emulate_break, - "DupChecker said to emulate a break during a full crawl. " - "Nothing should stop a full crawl!", - ) - - elif dup_checker.full_crawl is False: - self.assertFalse( - onwards, - "DupChecker returned %s but there should be a duplicate in " - "the database. dup_count is %s, and dup_threshold is %s" - % ( - onwards, - dup_checker.dup_count, - dup_checker.dup_threshold, - ), - ) - self.assertTrue( - dup_checker.emulate_break, - "We should have hit a break but didn't.", + try: + dup_checker.press_on( + Opinion, + now(), + now(), + lookup_value=self.content_hash, + lookup_by="sha1", ) + if not dup_checker.full_crawl: + self.fail("Did not raise ConsecutiveDuplicatesError.") + except ConsecutiveDuplicatesError: + if dup_checker.full_crawl: + self.fail( + "DupChecker raised ConsecutiveDuplicatesError breaking" + " the outer loop. Nothing should stop a full crawl!" + ) + except SingleDuplicateError: + # Full crawl or not, a SingleDuplicateError is + # expected when a duplicate is found + pass def test_press_on_with_dup_found_and_older_date(self) -> None: for dup_checker in self.dup_checkers: # Note that the next case occurs prior to the current one - onwards = dup_checker.press_on( - Opinion, - now(), - now() - timedelta(days=1), - lookup_value=self.content_hash, - lookup_by="sha1", - ) - if dup_checker.full_crawl: - self.assertFalse( - onwards, - "DupChecker returned True during a full crawl, but there " - "should be duplicates in the database.", + try: + dup_checker.press_on( + Opinion, + now(), + now() - timedelta(days=1), + lookup_value=self.content_hash, + lookup_by="sha1", ) - self.assertFalse( - dup_checker.emulate_break, - "DupChecker said to emulate a break during a full crawl. " - "Nothing should stop a full crawl!", - ) - else: - self.assertFalse( - onwards, - "DupChecker returned %s but there should be a duplicate in " - "the database. dup_count is %s, and dup_threshold is %s" - % ( - onwards, - dup_checker.dup_count, - dup_checker.dup_threshold, - ), - ) - self.assertTrue( - dup_checker.emulate_break, - "We should have hit a break but didn't.", + self.fail( + "Expected raising SingleDuplicateError, there was a duplicate in the DB" ) + except SingleDuplicateError: + # Full crawl or not, a SingleDuplicateError is + # expected when a duplicate is found + pass + except ConsecutiveDuplicatesError: + if dup_checker.full_crawl: + self.fail( + "DupChecker raised ConsecutiveDuplicatesError during a " + "full crawl, breaking the outer loop. Nothing should " + "stop a full crawl!" + ) + class AudioFileTaskTest(TestCase): @classmethod @@ -641,10 +617,11 @@ def test_unexpected_content_type(self, mock_get): """Test when content type doesn't match scraper expectation.""" mock_get.return_value = self.mock_response self.site.expected_content_types = ["text/html"] - with mock.patch.object(self.logger, "error") as error_mock: - get_binary_content("/dummy/url/", self.site) - self.assertIn( - "UnexpectedContentTypeError:", error_mock.call_args_list[0][0][0] + self.assertRaises( + UnexpectedContentTypeError, + get_binary_content, + "/dummy/url/", + self.site, ) @mock.patch("requests.Session.get") diff --git a/cl/scrapers/utils.py b/cl/scrapers/utils.py index 445797ce45..311ccaf221 100644 --- a/cl/scrapers/utils.py +++ b/cl/scrapers/utils.py @@ -21,6 +21,11 @@ from cl.lib.decorators import retry from cl.lib.microservice_utils import microservice from cl.recap.mergers import find_docket_object +from cl.scrapers.exceptions import ( + EmptyFileError, + NoDownloadUrlError, + UnexpectedContentTypeError, +) from cl.scrapers.tasks import extract_recap_pdf from cl.search.models import Court, Docket, RECAPDocument @@ -155,26 +160,18 @@ def get_extension(content: bytes) -> str: def get_binary_content( download_url: str, site: AbstractSite, -) -> Optional[bytes | str]: +) -> bytes | str: """Downloads the file, covering a few special cases such as invalid SSL certificates and empty file errors. :param download_url: The URL for the item you wish to download. :param site: Site object used to download data - :return: One of: - - None if there was no URL, if the downloaded file was empty or if - the content type did not match the expected types - - The downloaded and cleaned content + :return: The downloaded and cleaned content + :raises: NoDownloadUrlError, UnexpectedContentTypeError, EmptyFileError """ - court_str = site.court_id.split(".")[-1].split("_")[0] - fingerprint = [f"{court_str}-unexpected-content-type"] - if not download_url: - # Occurs when a DeferredList fetcher fails. - error = f"NoDownloadUrlError: {download_url}\n{traceback.format_exc()}" - logger.error(error, extra={"fingerprint": fingerprint}) - return + raise NoDownloadUrlError(download_url) # noinspection PyBroadException if site.method == "LOCAL": @@ -207,9 +204,7 @@ def get_binary_content( # test for empty files (thank you CA1) if len(r.content) == 0: - error = f"EmptyFileError: {download_url}\n{traceback.format_exc()}" - logger.error(error, extra={"fingerprint": fingerprint}) - return + raise EmptyFileError(f"EmptyFileError: '{download_url}'") # test for expected content type (thanks mont for nil) if site.expected_content_types: @@ -224,12 +219,10 @@ def get_binary_content( ) if not m: - error = ( - f"UnexpectedContentTypeError: {download_url}\n" - f'\'"{content_type}" not in {site.expected_content_types}' - ) - logger.error(error, extra={"fingerprint": fingerprint}) - return + court_str = site.court_id.split(".")[-1].split("_")[0] + fingerprint = [f"{court_str}-unexpected-content-type"] + msg = f"'{download_url}' '{content_type}' not in {site.expected_content_types}" + raise UnexpectedContentTypeError(msg, fingerprint=fingerprint) # test for and follow meta redirects r = follow_redirections(r, s) From ace253e1017a2800a4925e96fa53afd05cdc781e Mon Sep 17 00:00:00 2001 From: Gianfranco Rossi Date: Wed, 21 Aug 2024 12:27:59 -0500 Subject: [PATCH 8/8] refactor(cl_scrape_opinions): renamed variable used in logging --- cl/scrapers/management/commands/cl_back_scrape_citations.py | 2 ++ .../management/commands/cl_back_scrape_oral_arguments.py | 2 +- cl/scrapers/management/commands/cl_scrape_opinions.py | 6 +++--- cl/scrapers/management/commands/cl_scrape_oral_arguments.py | 6 ++++-- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/cl/scrapers/management/commands/cl_back_scrape_citations.py b/cl/scrapers/management/commands/cl_back_scrape_citations.py index 8975461217..b2da0a4581 100644 --- a/cl/scrapers/management/commands/cl_back_scrape_citations.py +++ b/cl/scrapers/management/commands/cl_back_scrape_citations.py @@ -23,6 +23,8 @@ class Command(cl_back_scrape_opinions.Command): + scrape_target_descr = "citations" + def scrape_court( self, site, diff --git a/cl/scrapers/management/commands/cl_back_scrape_oral_arguments.py b/cl/scrapers/management/commands/cl_back_scrape_oral_arguments.py index b1105f01e0..299a091597 100644 --- a/cl/scrapers/management/commands/cl_back_scrape_oral_arguments.py +++ b/cl/scrapers/management/commands/cl_back_scrape_oral_arguments.py @@ -5,7 +5,7 @@ class Command(cl_scrape_oral_arguments.Command): - def parse_and_scrape_site(self, mod, full_crawl): + def parse_and_scrape_site(self, mod, options: dict): court_str = mod.__name__.split(".")[-1].split("_")[0] logger.info(f'Using court_str: "{court_str}"') diff --git a/cl/scrapers/management/commands/cl_scrape_opinions.py b/cl/scrapers/management/commands/cl_scrape_opinions.py index f1547d8f7d..9e7854824b 100644 --- a/cl/scrapers/management/commands/cl_scrape_opinions.py +++ b/cl/scrapers/management/commands/cl_scrape_opinions.py @@ -219,7 +219,7 @@ def save_everything( class Command(VerboseCommand): help = "Runs the Juriscraper toolkit against one or many jurisdictions." - object_type = "opinions" # for logging purposes + scrape_target_descr = "opinions" # for logging purposes def __init__(self, stdout=None, stderr=None, no_color=False): super().__init__(stdout=None, stderr=None, no_color=False) @@ -288,7 +288,7 @@ def scrape_court( if site.cookies: logger.info("Using cookies: %s", site.cookies) - logger.debug("#%s opinions found.", len(site)) + logger.debug("#%s %s found.", len(site), self.scrape_target_descr) added = 0 for i, item in enumerate(site): @@ -313,7 +313,7 @@ def scrape_court( site.court_id, added, len(site), - self.object_type, + self.scrape_target_descr, ) if not full_crawl: # Only update the hash if no errors occurred. diff --git a/cl/scrapers/management/commands/cl_scrape_oral_arguments.py b/cl/scrapers/management/commands/cl_scrape_oral_arguments.py index a2f09dbae4..33da02925d 100644 --- a/cl/scrapers/management/commands/cl_scrape_oral_arguments.py +++ b/cl/scrapers/management/commands/cl_scrape_oral_arguments.py @@ -106,7 +106,7 @@ def make_objects( class Command(cl_scrape_opinions.Command): - object_type = "oral arguments" + scrape_target_descr = "oral arguments" def ingest_a_case( self, @@ -132,7 +132,9 @@ def ingest_a_case( ) logger.info( - "Adding new document found at: %s", item["download_urls"].encode() + "Adding new %s found at: %s", + self.scrape_target_descr, + item["download_urls"].encode(), ) dup_checker.reset()