Skip to content

Commit

Permalink
Merge pull request #473 from nationalarchives/chore/clean-up-parser-e…
Browse files Browse the repository at this point in the history
…rror-handling
  • Loading branch information
jacksonj04 authored Nov 29, 2023
2 parents 11230cf + 6b157d6 commit b8a53cc
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 41 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@ The format is based on [Keep a Changelog 1.0.0].

## [Unreleased]

- **Breaking:**: Fully remove the deprecated `caselawclient.api_client` instance.
- **Breaking:** Fully remove the deprecated `caselawclient.api_client` instance.
- **Feature:** New `Document.xml_root_element` function to replace `get_judgment_root`
- **Feature:** Documents which are not valid XML are now identified by the raising of a new `Document.NonXMLDocumentError` exception

## [Release 17.3.0]

Expand Down
25 changes: 21 additions & 4 deletions src/caselawclient/models/documents.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import datetime
import warnings
import xml.etree.ElementTree as ET
from functools import cached_property
from typing import TYPE_CHECKING, Any, Dict, NewType, Optional

Expand All @@ -18,7 +19,7 @@
OnlySupportedOnVersion,
)
from ..xml_helpers import get_xpath_match_string, get_xpath_match_strings
from .utilities import VersionsDict, get_judgment_root, render_versions
from .utilities import VersionsDict, render_versions
from .utilities.aws import (
ParserInstructionsDict,
announce_document_event,
Expand Down Expand Up @@ -76,6 +77,12 @@ class DocumentNotSafeForDeletion(Exception):
pass


class NonXMLDocumentError(Exception):
"""A document cannot be parsed as XML."""

pass


class Document:
"""
A base class from which all other document types are extensions. This class includes the essential methods for
Expand Down Expand Up @@ -393,12 +400,22 @@ def failed_to_parse(self) -> bool:
:return: `True` if there was a complete parser failure, otherwise `False`
"""
if "error" in self._get_root():
if "error" in self.xml_root_element:
return True
return False

def _get_root(self) -> str:
return get_judgment_root(self.content_as_xml_bytestring)
@property
def xml_root_element(self) -> str:
"""
:return: The name of the root tag in the XML
:raises NonXMLDocumentError: This document is not valid XML
"""
try:
parsed_xml = ET.XML(self.content_as_xml_bytestring)
return parsed_xml.tag
except ET.ParseError:
raise NonXMLDocumentError

@cached_property
def has_name(self) -> bool:
Expand Down
9 changes: 0 additions & 9 deletions src/caselawclient/models/utilities/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import re
import xml.etree.ElementTree as ET
from typing import TypedDict

from requests_toolbelt.multipart.decoder import BodyPart
Expand All @@ -12,14 +11,6 @@
uk_namespace = {"uk": "https://caselaw.nationalarchives.gov.uk/akn"}


def get_judgment_root(judgment_xml: bytes) -> str:
try:
parsed_xml = ET.XML(judgment_xml)
return parsed_xml.tag
except ET.ParseError:
return "error"


class VersionsDict(TypedDict):
uri: str
version: int
Expand Down
32 changes: 32 additions & 0 deletions tests/models/test_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
CannotPublishUnpublishableDocument,
Document,
DocumentNotSafeForDeletion,
NonXMLDocumentError,
UnparsableDate,
)
from caselawclient.models.judgments import Judgment
Expand Down Expand Up @@ -61,6 +62,37 @@ def test_document_failed_to_parse(self, mock_api_client):

assert document.failed_to_parse is True

def test_xml_root_element_akomantoso(self, mock_api_client):
mock_api_client.get_judgment_xml_bytestring.return_value = "<akomaNtoso xmlns:uk='https://caselaw.nationalarchives.gov.uk/akn' xmlns='http://docs.oasis-open.org/legaldocml/ns/akn/3.0'>judgment</akomaNtoso>".encode(
"utf-8"
)

document = Document("test/1234", mock_api_client)

assert (
document.xml_root_element
== "{http://docs.oasis-open.org/legaldocml/ns/akn/3.0}akomaNtoso"
)

def test_xml_root_element_error(self, mock_api_client):
mock_api_client.get_judgment_xml_bytestring.return_value = (
"<error>parser.log contents</error>".encode("utf-8")
)

document = Document("test/1234", mock_api_client)

assert document.xml_root_element == "error"

def test_xml_root_element_malformed(self, mock_api_client):
mock_api_client.get_judgment_xml_bytestring.return_value = (
"<error>malformed xml".encode("utf-8")
)

document = Document("test/1234", mock_api_client)

with pytest.raises(NonXMLDocumentError):
document.xml_root_element

def test_document_parsed(self, mock_api_client):
mock_api_client.get_judgment_xml_bytestring.return_value = """
<akomaNtoso>Parsing succeeded</akomaNtoso>
Expand Down
28 changes: 1 addition & 27 deletions tests/models/utilities/test_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,38 +4,12 @@
import ds_caselaw_utils
import pytest

from caselawclient.models.utilities import (
extract_version,
get_judgment_root,
move,
render_versions,
)
from caselawclient.models.utilities import extract_version, move, render_versions
from caselawclient.models.utilities.aws import build_new_key, copy_assets

from ...factories import JudgmentFactory


class TestUtils:
def test_get_judgment_root_error(self):
xml = "<error>parser.log contents</error>"
assert get_judgment_root(xml) == "error"

def test_get_judgment_root_akomantoso(self):
xml = (
"<akomaNtoso xmlns:uk='https://caselaw.nationalarchives.gov.uk/akn' "
"xmlns='http://docs.oasis-open.org/legaldocml/ns/akn/3.0'>judgment</akomaNtoso>"
)
assert (
get_judgment_root(xml)
== "{http://docs.oasis-open.org/legaldocml/ns/akn/3.0}akomaNtoso"
)

def test_get_judgment_root_malformed_xml(self):
# Should theoretically never happen but test anyway
xml = "<error>malformed xml"
assert get_judgment_root(xml) == "error"


class TestVersionUtils:
def test_extract_version_uri(self):
uri = "/ewhc/ch/2022/1178_xml_versions/2-1178.xml"
Expand Down

0 comments on commit b8a53cc

Please sign in to comment.