Skip to content

Commit

Permalink
Refactor BibkeyIndex: Paper objects now refresh index on __setattr__
Browse files Browse the repository at this point in the history
  • Loading branch information
mbollmann committed Jan 8, 2025
1 parent 3ede52d commit 546e91f
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 54 deletions.
38 changes: 7 additions & 31 deletions python/acl_anthology/collections/bibkeys.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,46 +103,22 @@ def generate_bibkey(self, paper: Paper) -> str:

return bibkey

def index_paper(self, paper: Paper) -> None:
"""Add a new paper to the index.
def _index_paper(self, bibkey: str, paper: Paper) -> None:
"""Add a paper to the index.
This function is not used when building the index, but only when adding newly created papers. If the paper's bibkey is None, this will automatically generate a bibkey for the paper.
Parameters:
paper: The paper to be indexed.
Warning:
This function should not be called manually. It is invoked automatically when a paper's bibkey is changed.
Raises:
ValueError: If the paper's bibkey is not None and is already in the index.
"""
if not self.is_data_loaded:
self.load()

Check warning on line 116 in python/acl_anthology/collections/bibkeys.py

View check run for this annotation

Codecov / codecov/patch

python/acl_anthology/collections/bibkeys.py#L116

Added line #L116 was not covered by tests
if paper.bibkey is None:
paper.bibkey = self.generate_bibkey(paper)
elif paper.bibkey in self.data:
if bibkey in self.data:
raise ValueError(
f"Cannot index bibkey '{paper.bibkey}' for paper {paper.full_id}; already assigned to {self.data[paper.bibkey].full_id}"
f"Cannot index bibkey '{bibkey}' for paper {paper.full_id}; already assigned to {self.data[bibkey].full_id}"
)

self.data[paper.bibkey] = paper

def refresh_bibkey(self, paper: Paper) -> None:
"""Refresh the paper's bibkey and update the index.
This function can be used to make a paper's bibkey reflect changes in its metadata (such as author list or title), or to update bibkeys after their generation logic has changed. This will modify both the paper and the index.
Parameters:
paper: The paper whose bibkey should be refreshed.
Raises:
KeyError: If the paper wasn't indexed with its current bibkey.
"""
if not self.is_data_loaded:
self.load()
if paper.bibkey is None:
return self.index_paper(paper)
del self.data[paper.bibkey]
paper.bibkey = self.generate_bibkey(paper)
self.data[paper.bibkey] = paper
self.data[bibkey] = paper

def load(self) -> None:
"""Load an index of bibkeys."""
Expand Down
28 changes: 26 additions & 2 deletions python/acl_anthology/collections/paper.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
from __future__ import annotations

import attrs
import datetime
from attrs import define, field, validators as v
import datetime
from functools import cached_property
import langcodes
from lxml import etree
Expand Down Expand Up @@ -203,6 +203,28 @@ def _attachment_validator(instance: Paper, _: Any, value: Any) -> None:
)


def _update_bibkey_index(
paper: Paper, attr: attrs.Attribute[Any], value: Optional[str]
) -> str:
# Should run on __setattr__ of Paper.bibkey
bibkeyindex = paper.parent.parent.parent.bibkeys
if not bibkeyindex.is_data_loaded:
if value is None:
# Need to load the index to generate new bibkeys
bibkeyindex.load()
else:
return value

old_bibkey = cast(str, paper.bibkey)
if old_bibkey in bibkeyindex:
del bibkeyindex[old_bibkey]
if value is None:
value = bibkeyindex.generate_bibkey(paper)

bibkeyindex._index_paper(value, paper)
return value


@define(field_transformer=auto_validate_types)
class Paper:
"""A paper entry.
Expand Down Expand Up @@ -238,7 +260,9 @@ class Paper:

id: str = field(converter=int_to_str)
parent: Volume = field(repr=False, eq=False)
bibkey: Optional[str] = field()
bibkey: Optional[str] = field(
on_setattr=attrs.setters.pipe(attrs.setters.validate, _update_bibkey_index),
)
title: MarkupText = field()

attachments: list[tuple[str, AttachmentReference]] = field(
Expand Down
3 changes: 2 additions & 1 deletion python/acl_anthology/collections/volume.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,8 @@ def create_paper(

kwargs["parent"] = self
paper = Paper(id=id, bibkey=bibkey, title=title, **kwargs)
self.parent.parent.bibkeys.index_paper(paper)
paper.bibkey = bibkey # triggers indexing --- this is a bit obscure?
# self.parent.parent.bibkeys._index_paper(bibkey, paper)
self.data[id] = paper
# TODO: How to solve registration in different indices? Not all indices might be loaded, nor might it be desirable to load them.
# - Papers can be linked to the Person objects of its authors/editors
Expand Down
40 changes: 20 additions & 20 deletions python/tests/collections/bibkeys_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@


def test_bibkeys_indexing(anthology):
index = BibkeyIndex(anthology.collections)
index = anthology.collections.bibkeys
index.load()
assert index.is_data_loaded
assert len(index) > 850
Expand All @@ -35,33 +35,26 @@ def test_bibkeys_indexing(anthology):


def test_bibkeys_index_paper(anthology):
# We manually instance BibKeyIndex and set is_data_loaded=True to prevent automatic indexing
index = BibkeyIndex(anthology.collections)
# We manually set is_data_loaded=True to prevent automatic indexing of bibkeys
index.is_data_loaded = True

paper = anthology.get_paper("2022.acl-long.93")
assert paper.bibkey == "kamal-eddine-etal-2022-frugalscore"
assert "kamal-eddine-etal-2022-frugalscore" not in index

# Indexing the paper should add it to the index
index.index_paper(paper)
index._index_paper(paper.bibkey, paper)
assert "kamal-eddine-etal-2022-frugalscore" in index
assert index["kamal-eddine-etal-2022-frugalscore"] is paper

# Indexing the paper again should raise ValueError
with pytest.raises(ValueError):
index.index_paper(paper)

# Setting the paper's bibkey to None should index the paper again, auto-generating a new & unique bibkey
paper.bibkey = None
index.index_paper(paper)
assert paper.bibkey == "kamal-eddine-etal-2022-frugalscore-learning"
assert "kamal-eddine-etal-2022-frugalscore-learning" in index
assert index["kamal-eddine-etal-2022-frugalscore-learning"] is paper
index._index_paper(paper.bibkey, paper)


def test_bibkeys_generate_bibkey_should_add_title_words(anthology):
index = BibkeyIndex(anthology.collections)
index = anthology.collections.bibkeys
index.load()

# Generating a bibkey for an existing paper should result in a bibkey with another title words added
Expand All @@ -76,7 +69,7 @@ def test_bibkeys_generate_bibkey_should_add_title_words(anthology):


def test_bibkeys_generate_bibkey_should_increment_counter(anthology):
index = BibkeyIndex(anthology.collections)
index = anthology.collections.bibkeys
index.load()

# Generating a bibkey when there are not enough title words should increment the counter
Expand All @@ -86,19 +79,19 @@ def test_bibkeys_generate_bibkey_should_increment_counter(anthology):


def test_bibkeys_generate_bibkey_should_match_existing_bibkeys(anthology):
# We manually instance BibKeyIndex and set is_data_loaded=True to prevent automatic indexing
index = BibkeyIndex(anthology.collections)
# We manually set is_data_loaded=True to prevent automatic indexing of bibkeys
index.is_data_loaded = True

# Now, we can check if the auto-generated bibkeys match the ones in our XML
for paper in anthology.papers("2022.acl"):
assert index.generate_bibkey(paper) == paper.bibkey
# We need to index the paper so it is taken into account for future clashes
index.index_paper(paper)
index._index_paper(paper.bibkey, paper)


def test_bibkeys_refresh_bibkey_should_update(anthology):
index = BibkeyIndex(anthology.collections)
index = anthology.collections.bibkeys
index.load()

# We pick a paper (here, frontmatter) with a bibkey not conforming to what
Expand All @@ -108,16 +101,23 @@ def test_bibkeys_refresh_bibkey_should_update(anthology):
assert "naloma-2022-natural" in index
assert index["naloma-2022-natural"] is paper

# Refreshing the bibkey should replace it with the new version
index.refresh_bibkey(paper)
# Setting the bibkey to None should replace it with an automatically-generated version
paper.bibkey = None # NOTE: use paper.refresh_bibkey() in practice
assert paper.bibkey == "naloma-2022-1"
assert "naloma-2022-natural" not in index
assert "naloma-2022-1" in index
assert index["naloma-2022-1"] is paper

# Setting the bibkey to a custom value should also replace it in the index
paper.bibkey = "naloma-2022-frontmatter"
assert paper.bibkey == "naloma-2022-frontmatter"
assert "naloma-2022-1" not in index
assert "naloma-2022-frontmatter" in index
assert index["naloma-2022-frontmatter"] is paper


def test_bibkeys_refresh_bibkey_should_leave_unchanged(anthology):
index = BibkeyIndex(anthology.collections)
index = anthology.collections.bibkeys
index.load()

# We pick a paper with a bibkey that doesn't need changing
Expand All @@ -127,7 +127,7 @@ def test_bibkeys_refresh_bibkey_should_leave_unchanged(anthology):
assert index["roark-etal-2006-sparseval"] is paper

# Refreshing the bibkey should not change it
index.refresh_bibkey(paper)
paper.bibkey = None # NOTE: use paper.refresh_bibkey() in practice
assert paper.bibkey == "roark-etal-2006-sparseval"
assert "roark-etal-2006-sparseval" in index
assert index["roark-etal-2006-sparseval"] is paper

0 comments on commit 546e91f

Please sign in to comment.