Skip to content

Commit

Permalink
Merge pull request #670 from nationalarchives/use-xml-parsing-for-first-stage-replacements
Browse files Browse the repository at this point in the history

[FCL-490] Use XML parsing for first stage replacements
  • Loading branch information
dragon-dxw authored Jan 8, 2025
2 parents f8f5a56 + a0cb118 commit 850aaeb
Show file tree
Hide file tree
Showing 22 changed files with 2,234 additions and 1,995 deletions.
4 changes: 4 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,12 @@ repos:
types-requests,
types-lxml,
types-beautifulsoup4,
types-psycopg2,
pandas-stubs,
pytest-stub,
"boto3-stubs[essential, secretsmanager]",
aws_lambda_powertools,
moto,
]

- repo: https://github.com/pre-commit/mirrors-prettier
Expand Down
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Changelog

## v7.0.0 (2024-11-28)

- Ensure that documents with matching replacements don't replace inside XML strings
- Fix type linting

## v6.0.2 (2024-10-03)

- Enrich press summaries by fixing patch url to be press-summary, not press/summary
Expand Down
7 changes: 7 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
[mypy]
python_version = 3.12
mypy_path = src
check_untyped_defs = True
ignore_missing_imports = True
warn_unused_ignores = True
warn_redundant_casts = True
warn_unused_configs = True

[mypy-tests.*]
ignore_errors = True

[mypy-utils.tests.*]
ignore_errors = True
2 changes: 1 addition & 1 deletion src/lambdas/determine_legislation_provisions/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def add_timestamp_and_engine_version(
"uk:tna-enrichment-engine",
attrs={"xmlns:uk": "https://caselaw.nationalarchives.gov.uk/akn"},
)
enrichment_version.string = "6.0.2"
enrichment_version.string = "7.0.0"

if not soup.proprietary:
msg = "This document does not have a <proprietary> element."
Expand Down
2 changes: 1 addition & 1 deletion src/lambdas/extract_judgement_contents/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def extract_text_content(file_content: DocumentAsXMLString) -> str:
return parse_file(file_content)


def upload_contents(source_key: str, text_content: DocumentAsXMLString):
def upload_contents(source_key: str, text_content: DocumentAsXMLString | str):
"""
Uploads text to S3 bucket
"""
Expand Down
2 changes: 1 addition & 1 deletion src/lambdas/update_rules_processor/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class MismatchedIdShapeError(Exception):
pass


def write_patterns_file(patterns_list: str) -> str:
def write_patterns_file(patterns_list: list[str]) -> str:
"""
Write patterns to separate lines
"""
Expand Down
6 changes: 3 additions & 3 deletions src/lambdas/vlex_upload/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from distutils.util import strtobool

from utils.environment_helpers import validate_env_variable
from utils.types import DocumentAsXMLString
from utils.types import DocumentAsXMLBytes

LOGGER = logging.getLogger()
LOGGER.setLevel(logging.DEBUG)
Expand All @@ -25,14 +25,14 @@ def process_event(sqs_rec):
print("Input bucket name:", source_bucket)
print("Input S3 key:", source_key)

file_content = s3_client.get_object(Bucket=source_bucket, Key=source_key)["Body"].read()
file_content = DocumentAsXMLBytes(s3_client.get_object(Bucket=source_bucket, Key=source_key)["Body"].read())

upload_contents(source_key, file_content)
LOGGER.debug("content uploaded")
return True


def upload_contents(source_key: str, text_content: DocumentAsXMLString):
def upload_contents(source_key: str, text_content: DocumentAsXMLBytes):
"""
Upload judgment XML to destination S3 bucket
"""
Expand Down
11 changes: 7 additions & 4 deletions src/legislation_extraction/legislation_matcher_hybrid.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def mergedict(x, b):

def resolve_overlap(results_dict):
"""
Resolves references that have been detected as legislation but overlap in the body of judgement to the most accurate legislation.
Resolves references that have been detected as legislation but overlap in the body of judgment to the most accurate legislation.
This might occur due to the nature of the fuzzy matching, where it matches two closely worded pieces of legislation to the same text in a judgment.
This function ensures a 1-to-1 linkage between a legislation title and a detected reference.
Parameters
Expand All @@ -72,9 +72,9 @@ def resolve_overlap(results_dict):
dictionary containing the detected references with overlapped references removed.
"""
qq = pd.DataFrame([results_dict])
qq = qq.T.explode(0)[0].apply(pd.Series)
qq = qq.T.explode([0])[0].apply(pd.Series)

qq.columns = keys
qq.columns = pd.Index(keys)

# get refs that overlap in the text
mask = (qq.start.values[:, None] >= qq.start.values) & (qq.end.values[:, None] <= qq.end.values)
Expand All @@ -89,8 +89,11 @@ def resolve_overlap(results_dict):

# for every detected pair of refs that overlap
for ol_index in overlaps:
# mypy was complaining about ol_index being a `list[signedinteger[_32Bit | _64Bit]]`
# so just force it to be a regular list of ints
int_ol_index = list(map(int, ol_index))
# get those two rows
overlap_rows = qq.iloc[list(ol_index)]
overlap_rows = qq.iloc[int_ol_index]
# get the worst of the two (or first, if they're equal)
worst_match_index = overlap_rows.confidence.idxmin()
# and mark its index for deletion
Expand Down
30 changes: 16 additions & 14 deletions src/oblique_references/oblique_references.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

from bs4 import BeautifulSoup

from replacer.second_stage_replacer import LegislationReferenceReplacement
from utils.proper_xml import create_tag_string

LegislationReference = tuple[tuple[int, int], str]
Expand All @@ -38,9 +39,6 @@ class LegislationDict(TypedDict):
href: str


LegislationReferenceReplacements = list[dict[str, str | int]]


class NotExactlyOneRefTag(RuntimeError):
"""soup.get() can return None if there is no <ref> tag to find, or
a list of hits if there are multiple tags. These are not handled
Expand Down Expand Up @@ -201,9 +199,9 @@ def get_replacements(
detected_acts: list[LegislationReference],
legislation_dicts: list[LegislationDict],
numbered_act: bool,
replacements: list[dict],
replacements: list[LegislationReferenceReplacement],
paragraph_number: int,
) -> LegislationReferenceReplacements:
) -> list[LegislationReferenceReplacement]:
"""
Create replacement string for detected oblique reference
:param detected_acts: detected oblique references
Expand All @@ -214,25 +212,29 @@ def get_replacements(
:returns: list of replacements
"""
for detected_act in detected_acts:
replacement_dict: dict[str, str | int] = {}
match = detected_act[1]
if numbered_act:
matched_replacement = match_numbered_act(detected_act, legislation_dicts)
else:
matched_replacement = match_act(detected_act, legislation_dicts, paragraph_number)
replacement_dict["detected_ref"] = match
replacement_dict["ref_position"] = detected_act[0][0]
replacement_dict["ref_para"] = paragraph_number
if matched_replacement:
replacement_dict["ref_tag"] = create_section_ref_tag(matched_replacement, match)
replacements.append(replacement_dict)
ref_tag = create_section_ref_tag(matched_replacement, match)

replacements.append(
LegislationReferenceReplacement(
detected_ref=match,
ref_position=detected_act[0][0],
ref_para=paragraph_number,
ref_tag=ref_tag,
),
)

return replacements


def get_oblique_reference_replacements_by_paragraph(
file_content: str,
) -> LegislationReferenceReplacements:
) -> list[LegislationReferenceReplacement]:
"""
Determines oblique references and replacement strings grouped by paragraph
:param file_content: original judgment file content
Expand All @@ -241,11 +243,11 @@ def get_oblique_reference_replacements_by_paragraph(
"""
soup = BeautifulSoup(file_content, "xml")
paragraphs = soup.find_all("p")
all_replacements: list[dict] = []
all_replacements: list[LegislationReferenceReplacement] = []
all_legislation_dicts = []

for paragraph_number, paragraph in enumerate(paragraphs):
replacements: list[dict] = []
replacements: list[LegislationReferenceReplacement] = []
detected_legislation = detect_reference(str(paragraph), "legislation")
legislation_dicts = create_legislation_dict(detected_legislation, paragraph_number)
all_legislation_dicts.extend(legislation_dicts)
Expand Down
6 changes: 5 additions & 1 deletion src/replacer/make_replacments.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,17 @@ def make_post_header_replacements(
str: The modified legal document content with the replacement applied.
"""
cleaned_file_content = sanitize_judgment(original_content)

pre_header, end_header_tag, post_header = split_text_by_closing_header_tag(cleaned_file_content)

replaced_post_header_content = apply_replacements(post_header, replacement_patterns)
LOGGER.info("Got post-header replacement text content")

full_replaced_text_content = pre_header + end_header_tag + replaced_post_header_content

# raises an lxml.etree.XMLSyntaxError if the output is not valid XML
lxml.etree.fromstring(full_replaced_text_content.encode("utf-8"))

return DocumentAsXMLString(full_replaced_text_content)


Expand All @@ -89,7 +93,7 @@ def apply_replacements(content: XMLFragmentAsString, replacement_patterns: str)
replacement_pattern_dict = json.loads(replacement_pattern_json)

replacement_type, replacement_pattern_list = list(replacement_pattern_dict.items())[0]
replacement_pattern = tuple(replacement_pattern_list)
replacement_pattern = Replacement(tuple(replacement_pattern_list))

if replacement_type == "case":
case_replacement_patterns.append(replacement_pattern)
Expand Down
52 changes: 45 additions & 7 deletions src/replacer/replacer_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,33 @@
import html
import re

from utils.proper_xml import create_tag_string
from utils.proper_xml import create_tag_string, replace_string_with_tag
from utils.types import Replacement, XMLFragmentAsString

JUNK_REGEX = r"</judgment>\s*</akomaNtoso>\s*$"
BAD = '="<'


def assert_not_bad(s):
    """Raise RuntimeError if the malformed-attribute sentinel BAD appears in *s*.

    Used as a guard between replacement passes to detect a tag having been
    injected inside an attribute value.
    """
    if BAD not in s:
        return
    raise RuntimeError(f"{BAD!r} found in XML")


def _replace_string_with_tag_handling_junk(file_data, string, tag):
    """The XML might contain </judgment></akomaNtoso> at the end; remove and replace if so."""

    trailer_match = re.search(JUNK_REGEX, file_data)
    if trailer_match is None:
        # No closing trailer present: replace within the document as-is.
        body, trailer = file_data, ""
    else:
        # Strip the trailer, do the replacement, then re-attach it afterwards.
        body = re.sub(JUNK_REGEX, "", file_data)
        trailer = trailer_match.group()

    replaced = replace_string_with_tag(XMLFragmentAsString(body), string, tag)
    return replaced + trailer


def fixed_year(year: str) -> str | None:
"""For some reason, years can be returned as "No Year", despite not being present in the code (outside tests) or the database
Expand Down Expand Up @@ -40,9 +64,12 @@ def replacer_caselaw(file_data: XMLFragmentAsString, replacement: Replacement) -
if year:
attribs["uk:year"] = year
attribs["uk:origin"] = "TNA"
replacement_string = create_tag_string("ref", html.escape(replacement[0]), attribs)

return XMLFragmentAsString(str(file_data).replace(replacement[0], replacement_string))
replacement_tag = create_tag_string("ref", html.escape(replacement[0]), attribs)
output = _replace_string_with_tag_handling_junk(file_data, replacement[0], replacement_tag)

assert_not_bad(file_data)
return output


def replacer_leg(file_data: XMLFragmentAsString, replacement: Replacement) -> XMLFragmentAsString:
Expand All @@ -58,8 +85,10 @@ def replacer_leg(file_data: XMLFragmentAsString, replacement: Replacement) -> XM
"uk:canonical": replacement[2],
"uk:origin": "TNA",
}
replacement_string = create_tag_string("ref", html.escape(replacement[0]), attribs)
return XMLFragmentAsString(str(file_data).replace(replacement[0], replacement_string))
replacement_tag = create_tag_string("ref", html.escape(replacement[0]), attribs)
output = _replace_string_with_tag_handling_junk(file_data, replacement[0], replacement_tag)
assert_not_bad(file_data)
return output


def replacer_abbr(file_data: XMLFragmentAsString, replacement: Replacement) -> XMLFragmentAsString:
Expand All @@ -69,8 +98,11 @@ def replacer_abbr(file_data: XMLFragmentAsString, replacement: Replacement) -> X
:param replacement: tuple of citation match and corrected citation
:return: enriched XML file data
"""
replacement_string = f'<abbr title="{replacement[1]}" uk:origin="TNA">{replacement[0]}</abbr>'
return XMLFragmentAsString(str(file_data).replace(str(replacement[0]), replacement_string))
replacement_tag = f'<abbr title="{replacement[1]}" uk:origin="TNA">{replacement[0]}</abbr>'

output = _replace_string_with_tag_handling_junk(file_data, replacement[0], replacement_tag)
assert_not_bad(file_data)
return output


def replacer_pipeline(
Expand All @@ -87,13 +119,19 @@ def replacer_pipeline(
:param REPLACEMENTS_ABBR: list of unique tuples of citation match and corrected citation
:return: enriched XML file data
"""

assert_not_bad(file_data)

for replacement in list(set(REPLACEMENTS_CASELAW)):
file_data = replacer_caselaw(file_data, replacement)
assert_not_bad(file_data)

for replacement in list(set(REPLACEMENTS_LEG)):
file_data = replacer_leg(file_data, replacement)
assert_not_bad(file_data)

for replacement in list(set(REPLACEMENTS_ABBR)):
file_data = replacer_abbr(file_data, replacement)
assert_not_bad(file_data)

return file_data
Loading

0 comments on commit 850aaeb

Please sign in to comment.