Skip to content

Commit

Permalink
Add check that collection ID matches filename
Browse files Browse the repository at this point in the history
  • Loading branch information
mbollmann committed Feb 3, 2025
1 parent c2fcfd6 commit f950af8
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 1 deletion.
8 changes: 7 additions & 1 deletion python/acl_anthology/collections/collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,8 @@ def load(self) -> None:
log.debug(f"Parsing XML data file: {self.path}")
current_volume = cast(Volume, None) # noqa: F841
for _, element in etree.iterparse(
self.path, tag=("meta", "frontmatter", "paper", "volume", "event")
self.path,
tag=("meta", "frontmatter", "paper", "volume", "event", "collection"),
):
discard_element = True
if (
Expand All @@ -150,6 +151,11 @@ def load(self) -> None:
elif element.tag == "event":
self._set_event_from_xml(element)
element.clear()
elif element.tag == "collection":
if element.get("id") != self.id:
raise ValueError(
f"File {self.path} contains Collection '{element.get('id')}'"
)
else:
# Keep element around; should only apply to <event><meta> ...
discard_element = False
Expand Down
8 changes: 8 additions & 0 deletions python/tests/collections/collection_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,14 @@ def test_collection_load(
assert collection.get_event() is None


def test_collection_load_id_mismatch(collection_index, datadir):
    """Check that load() rejects a data file belonging to another collection.

    A Collection is created with the ID "2019.emnlp" but pointed at the XML
    file "2022.acl.xml"; calling load() on it is expected to raise ValueError.
    """
    xml_file = datadir / "xml" / "2022.acl.xml"
    mismatched = Collection("2019.emnlp", parent=collection_index, path=xml_file)
    with pytest.raises(ValueError):
        mismatched.load()


@pytest.mark.parametrize("filename", test_cases_xml_roundtrip)
def test_collection_validate_schema(collection_index, datadir, filename):
infile = datadir / "xml" / filename
Expand Down

0 comments on commit f950af8

Please sign in to comment.