Skip to content

Commit

Permalink
Merge pull request #670 from nationalarchives/use-xml-parsing-for-first-stage-replacements
Browse files Browse the repository at this point in the history

[FCL-490] Use XML parsing for first stage replacements
  • Loading branch information
dragon-dxw authored Jan 8, 2025
2 parents f8f5a56 + a0cb118 commit 850aaeb
Show file tree
Hide file tree
Showing 22 changed files with 2,234 additions and 1,995 deletions.
4 changes: 4 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,12 @@ repos:
types-requests,
types-lxml,
types-beautifulsoup4,
types-psycopg2,
pandas-stubs,
pytest-stub,
"boto3-stubs[essential, secretsmanager]",
aws_lambda_powertools,
moto,
]

- repo: https://github.com/pre-commit/mirrors-prettier
Expand Down
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Changelog

## v7.0.0 (2024-11-28)

- Ensure that documents with matching replacements don't replace inside XML strings
- Fix type linting

## v6.0.2 (2024-10-03)

- Enrich press summaries by fixing patch url to be press-summary, not press/summary
Expand Down
7 changes: 7 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
[mypy]
python_version = 3.12
mypy_path = src
check_untyped_defs = True
ignore_missing_imports = True
warn_unused_ignores = True
warn_redundant_casts = True
warn_unused_configs = True

[mypy-tests.*]
ignore_errors = True

[mypy-utils.tests.*]
ignore_errors = True
2 changes: 1 addition & 1 deletion src/lambdas/determine_legislation_provisions/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def add_timestamp_and_engine_version(
"uk:tna-enrichment-engine",
attrs={"xmlns:uk": "https://caselaw.nationalarchives.gov.uk/akn"},
)
enrichment_version.string = "6.0.2"
enrichment_version.string = "7.0.0"

if not soup.proprietary:
msg = "This document does not have a <proprietary> element."
Expand Down
2 changes: 1 addition & 1 deletion src/lambdas/extract_judgement_contents/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def extract_text_content(file_content: DocumentAsXMLString) -> str:
return parse_file(file_content)


def upload_contents(source_key: str, text_content: DocumentAsXMLString):
def upload_contents(source_key: str, text_content: DocumentAsXMLString | str):
"""
Uploads text to S3 bucket
"""
Expand Down
2 changes: 1 addition & 1 deletion src/lambdas/update_rules_processor/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class MismatchedIdShapeError(Exception):
pass


def write_patterns_file(patterns_list: str) -> str:
def write_patterns_file(patterns_list: list[str]) -> str:
"""
Write patterns to separate lines
"""
Expand Down
6 changes: 3 additions & 3 deletions src/lambdas/vlex_upload/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from distutils.util import strtobool

from utils.environment_helpers import validate_env_variable
from utils.types import DocumentAsXMLString
from utils.types import DocumentAsXMLBytes

LOGGER = logging.getLogger()
LOGGER.setLevel(logging.DEBUG)
Expand All @@ -25,14 +25,14 @@ def process_event(sqs_rec):
print("Input bucket name:", source_bucket)
print("Input S3 key:", source_key)

file_content = s3_client.get_object(Bucket=source_bucket, Key=source_key)["Body"].read()
file_content = DocumentAsXMLBytes(s3_client.get_object(Bucket=source_bucket, Key=source_key)["Body"].read())

upload_contents(source_key, file_content)
LOGGER.debug("content uploaded")
return True


def upload_contents(source_key: str, text_content: DocumentAsXMLString):
def upload_contents(source_key: str, text_content: DocumentAsXMLBytes):
"""
Upload judgment XML to destination S3 bucket
"""
Expand Down
11 changes: 7 additions & 4 deletions src/legislation_extraction/legislation_matcher_hybrid.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def mergedict(x, b):

def resolve_overlap(results_dict):
"""
Resolves references that have been detected as legislation but overlap in the body of judgement to the most accurate legislation.
Resolves references that have been detected as legislation but overlap in the body of judgment to the most accurate legislation.
This might occur due to the nature of the fuzzy matching, where it matches two closely worded pieces of legislation to the same text in a judgment.
This function ensures a 1-to-1 linkage between a legislation title and a detected reference.
Parameters
Expand All @@ -72,9 +72,9 @@ def resolve_overlap(results_dict):
dictionary containing the detected references with overlapped references removed.
"""
qq = pd.DataFrame([results_dict])
qq = qq.T.explode(0)[0].apply(pd.Series)
qq = qq.T.explode([0])[0].apply(pd.Series)

qq.columns = keys
qq.columns = pd.Index(keys)

# get refs that overlap in the text
mask = (qq.start.values[:, None] >= qq.start.values) & (qq.end.values[:, None] <= qq.end.values)
Expand All @@ -89,8 +89,11 @@ def resolve_overlap(results_dict):

# for every detected pair of refs that overlap
for ol_index in overlaps:
# mypy was complaining about ol_index being a `list[signedinteger[_32Bit | _64Bit]]`
# so just force it to be a regular list of ints
int_ol_index = list(map(int, ol_index))
# get those two rows
overlap_rows = qq.iloc[list(ol_index)]
overlap_rows = qq.iloc[int_ol_index]
# get the worst of the two (or first, if they're equal)
worst_match_index = overlap_rows.confidence.idxmin()
# and mark its index for deletion
Expand Down
30 changes: 16 additions & 14 deletions src/oblique_references/oblique_references.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

from bs4 import BeautifulSoup

from replacer.second_stage_replacer import LegislationReferenceReplacement
from utils.proper_xml import create_tag_string

LegislationReference = tuple[tuple[int, int], str]
Expand All @@ -38,9 +39,6 @@ class LegislationDict(TypedDict):
href: str


LegislationReferenceReplacements = list[dict[str, str | int]]


class NotExactlyOneRefTag(RuntimeError):
"""soup.get() can return None if there is no <ref> tag to find, or
a list of hits if there are multiple tags. These are not handled
Expand Down Expand Up @@ -201,9 +199,9 @@ def get_replacements(
detected_acts: list[LegislationReference],
legislation_dicts: list[LegislationDict],
numbered_act: bool,
replacements: list[dict],
replacements: list[LegislationReferenceReplacement],
paragraph_number: int,
) -> LegislationReferenceReplacements:
) -> list[LegislationReferenceReplacement]:
"""
Create replacement string for detected oblique reference
:param detected_acts: detected oblique references
Expand All @@ -214,25 +212,29 @@ def get_replacements(
:returns: list of replacements
"""
for detected_act in detected_acts:
replacement_dict: dict[str, str | int] = {}
match = detected_act[1]
if numbered_act:
matched_replacement = match_numbered_act(detected_act, legislation_dicts)
else:
matched_replacement = match_act(detected_act, legislation_dicts, paragraph_number)
replacement_dict["detected_ref"] = match
replacement_dict["ref_position"] = detected_act[0][0]
replacement_dict["ref_para"] = paragraph_number
if matched_replacement:
replacement_dict["ref_tag"] = create_section_ref_tag(matched_replacement, match)
replacements.append(replacement_dict)
ref_tag = create_section_ref_tag(matched_replacement, match)

replacements.append(
LegislationReferenceReplacement(
detected_ref=match,
ref_position=detected_act[0][0],
ref_para=paragraph_number,
ref_tag=ref_tag,
),
)

return replacements


def get_oblique_reference_replacements_by_paragraph(
file_content: str,
) -> LegislationReferenceReplacements:
) -> list[LegislationReferenceReplacement]:
"""
Determines oblique references and replacement strings grouped by paragraph
:param file_content: original judgment file content
Expand All @@ -241,11 +243,11 @@ def get_oblique_reference_replacements_by_paragraph(
"""
soup = BeautifulSoup(file_content, "xml")
paragraphs = soup.find_all("p")
all_replacements: list[dict] = []
all_replacements: list[LegislationReferenceReplacement] = []
all_legislation_dicts = []

for paragraph_number, paragraph in enumerate(paragraphs):
replacements: list[dict] = []
replacements: list[LegislationReferenceReplacement] = []
detected_legislation = detect_reference(str(paragraph), "legislation")
legislation_dicts = create_legislation_dict(detected_legislation, paragraph_number)
all_legislation_dicts.extend(legislation_dicts)
Expand Down
6 changes: 5 additions & 1 deletion src/replacer/make_replacments.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,17 @@ def make_post_header_replacements(
str: The modified legal document content with the replacement applied.
"""
cleaned_file_content = sanitize_judgment(original_content)

pre_header, end_header_tag, post_header = split_text_by_closing_header_tag(cleaned_file_content)

replaced_post_header_content = apply_replacements(post_header, replacement_patterns)
LOGGER.info("Got post-header replacement text content")

full_replaced_text_content = pre_header + end_header_tag + replaced_post_header_content

# raises an lxml.etree.XMLSyntaxError if the output is not valid XML
lxml.etree.fromstring(full_replaced_text_content.encode("utf-8"))

return DocumentAsXMLString(full_replaced_text_content)


Expand All @@ -89,7 +93,7 @@ def apply_replacements(content: XMLFragmentAsString, replacement_patterns: str)
replacement_pattern_dict = json.loads(replacement_pattern_json)

replacement_type, replacement_pattern_list = list(replacement_pattern_dict.items())[0]
replacement_pattern = tuple(replacement_pattern_list)
replacement_pattern = Replacement(tuple(replacement_pattern_list))

if replacement_type == "case":
case_replacement_patterns.append(replacement_pattern)
Expand Down
52 changes: 45 additions & 7 deletions src/replacer/replacer_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,33 @@
import html
import re

from utils.proper_xml import create_tag_string
from utils.proper_xml import create_tag_string, replace_string_with_tag
from utils.types import Replacement, XMLFragmentAsString

JUNK_REGEX = r"</judgment>\s*</akomaNtoso>\s*$"
BAD = '="<'


def assert_not_bad(s):
    """Raise RuntimeError if the malformed-attribute sentinel BAD appears in *s*.

    Used as a guard between replacement passes to detect a tag having been
    injected inside an attribute value.
    """
    if BAD not in s:
        return
    raise RuntimeError(f"{BAD!r} found in XML")


def _replace_string_with_tag_handling_junk(file_data, string, tag):
    """The XML might contain </judgment></akomaNtoso> at the end; remove and replace if so."""

    trailer_match = re.search(JUNK_REGEX, file_data)
    if trailer_match is None:
        # No closing trailer present: replace within the document as-is.
        body, trailer = file_data, ""
    else:
        # Strip the trailer, do the replacement, then re-attach it afterwards.
        body = re.sub(JUNK_REGEX, "", file_data)
        trailer = trailer_match.group()

    replaced = replace_string_with_tag(XMLFragmentAsString(body), string, tag)
    return replaced + trailer


def fixed_year(year: str) -> str | None:
"""For some reason, years can be returned as "No Year", despite not being present in the code (outside tests) or the database
Expand Down Expand Up @@ -40,9 +64,12 @@ def replacer_caselaw(file_data: XMLFragmentAsString, replacement: Replacement) -
if year:
attribs["uk:year"] = year
attribs["uk:origin"] = "TNA"
replacement_string = create_tag_string("ref", html.escape(replacement[0]), attribs)

return XMLFragmentAsString(str(file_data).replace(replacement[0], replacement_string))
replacement_tag = create_tag_string("ref", html.escape(replacement[0]), attribs)
output = _replace_string_with_tag_handling_junk(file_data, replacement[0], replacement_tag)

assert_not_bad(file_data)
return output


def replacer_leg(file_data: XMLFragmentAsString, replacement: Replacement) -> XMLFragmentAsString:
Expand All @@ -58,8 +85,10 @@ def replacer_leg(file_data: XMLFragmentAsString, replacement: Replacement) -> XM
"uk:canonical": replacement[2],
"uk:origin": "TNA",
}
replacement_string = create_tag_string("ref", html.escape(replacement[0]), attribs)
return XMLFragmentAsString(str(file_data).replace(replacement[0], replacement_string))
replacement_tag = create_tag_string("ref", html.escape(replacement[0]), attribs)
output = _replace_string_with_tag_handling_junk(file_data, replacement[0], replacement_tag)
assert_not_bad(file_data)
return output


def replacer_abbr(file_data: XMLFragmentAsString, replacement: Replacement) -> XMLFragmentAsString:
Expand All @@ -69,8 +98,11 @@ def replacer_abbr(file_data: XMLFragmentAsString, replacement: Replacement) -> X
:param replacement: tuple of citation match and corrected citation
:return: enriched XML file data
"""
replacement_string = f'<abbr title="{replacement[1]}" uk:origin="TNA">{replacement[0]}</abbr>'
return XMLFragmentAsString(str(file_data).replace(str(replacement[0]), replacement_string))
replacement_tag = f'<abbr title="{replacement[1]}" uk:origin="TNA">{replacement[0]}</abbr>'

output = _replace_string_with_tag_handling_junk(file_data, replacement[0], replacement_tag)
assert_not_bad(file_data)
return output


def replacer_pipeline(
Expand All @@ -87,13 +119,19 @@ def replacer_pipeline(
:param REPLACEMENTS_ABBR: list of unique tuples of citation match and corrected citation
:return: enriched XML file data
"""

assert_not_bad(file_data)

for replacement in list(set(REPLACEMENTS_CASELAW)):
file_data = replacer_caselaw(file_data, replacement)
assert_not_bad(file_data)

for replacement in list(set(REPLACEMENTS_LEG)):
file_data = replacer_leg(file_data, replacement)
assert_not_bad(file_data)

for replacement in list(set(REPLACEMENTS_ABBR)):
file_data = replacer_abbr(file_data, replacement)
assert_not_bad(file_data)

return file_data
Loading

0 comments on commit 850aaeb

Please sign in to comment.