Skip to content

Commit

Permalink
Explicit bibkeys in the XML (acl-org#1431)
Browse files Browse the repository at this point in the history
* Add <bibkey> entry to all XML files

* Make Anthology read bibkey from XML files

* Add script to generate bibkeys

* Reintroduce bibkey on paper page (with copy-to-clipboard button)

Fixes acl-org#1236.
  • Loading branch information
Marcel Bollmann authored Jul 30, 2021
1 parent 64c18f9 commit 9c9d9dd
Show file tree
Hide file tree
Showing 618 changed files with 69,297 additions and 277 deletions.
13 changes: 11 additions & 2 deletions bin/anthology/anthology.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,19 @@ class Anthology:
sigs = None
formatter = None

def __init__(self, importdir=None):
def __init__(self, importdir=None, require_bibkeys=True):
"""Instantiates the Anthology.
:param importdir: Data directory to import from; if not given, you'll
need to call `import_directory` explicitly to load the Anthology data.
:param require_bibkeys: If True (default), will log errors if papers
don't have a bibkey; can be set to False in order to create bibkeys
for newly added papers.
"""
self.formatter = MarkupFormatter()
self.volumes = {} # maps volume IDs to Volume objects
self.papers = {} # maps paper IDs to Paper objects
self._require_bibkeys = require_bibkeys
if importdir is not None:
self.import_directory(importdir)

Expand All @@ -51,7 +60,7 @@ def people(self):

def import_directory(self, importdir):
assert os.path.isdir(importdir), "Directory not found: {}".format(importdir)
self.pindex = AnthologyIndex(self, importdir)
self.pindex = AnthologyIndex(importdir, require_bibkeys=self._require_bibkeys)
self.venues = VenueIndex(importdir)
self.sigs = SIGIndex(importdir)
for xmlfile in glob(importdir + "/xml/*.xml"):
Expand Down
56 changes: 46 additions & 10 deletions bin/anthology/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,28 @@ def score_variant(name):


class AnthologyIndex:
"""Keeps an index of persons, their associated papers, paper bibliography
keys, etc.."""

def __init__(self, parent, srcdir=None):
self._parent = parent
"""Keeps an index of people and papers.
This class provides:
- An index of people (authors/editors) with their internal IDs, canonical
names, and name variants.
- A mapping of people to all papers associated with them.
- A set of all bibliography keys used within the Anthology and a method to
create new ones, guaranteeing uniqueness.
The index is NOT automatically populated when instantiating this class, but
rather gets its data from papers being registered in it as they are loaded
from the XML by the main `Anthology` class.
:param srcdir: Path to the Anthology data directory. Only used for loading
the list of name variants.
:param require_bibkeys: Whether to log an error when a paper being added
does not have a bibkey. Should only be set to False during the ingestion of
new papers, when this class is being used to generate new, unique bibkeys.
"""

def __init__(self, srcdir=None, require_bibkeys=True):
self._require_bibkeys = require_bibkeys
self.bibkeys = set()
self.stopwords = load_stopwords("en")
self.id_to_canonical = {} # maps ids to canonical names
Expand Down Expand Up @@ -189,11 +206,11 @@ def _is_stopword(self, word, paper):
return True
return False

def create_bibkey(self, paper):
def create_bibkey(self, paper, vidx=None):
"""Create a unique bibliography key for the given paper."""
if paper.is_volume:
# Proceedings volumes use venue acronym instead of authors/editors
bibnames = slugify(self._parent.venues.get_main_venue(paper.full_id))
bibnames = slugify(vidx.get_main_venue(paper.full_id))
else:
# Regular papers use author/editor names
names = paper.get("author")
Expand Down Expand Up @@ -227,11 +244,26 @@ def create_bibkey(self, paper):
bibkey
)
)
self.bibkeys.add(bibkey)
paper.bibkey = bibkey
self.register_bibkey(paper)
return bibkey

def register_bibkey(self, paper):
    """Record the bibkey of `paper` in the index-wide set of known bibkeys.

    Nothing is added when the paper has no bibkey or when its bibkey is
    already known; these cases are reported through the error log instead
    (a missing bibkey only if this index was created with
    `require_bibkeys` enabled).

    :param paper: Object exposing `bibkey` and `full_id` attributes.
    """
    bibkey = paper.bibkey
    if bibkey is not None and bibkey not in self.bibkeys:
        # Regular case: a fresh key that has not been seen before.
        self.bibkeys.add(bibkey)
    elif bibkey is not None:
        # Key collides with one that was registered earlier.
        log.error(
            "Paper {} has bibkey that is not unique ({})!".format(
                paper.full_id, bibkey
            )
        )
    elif self._require_bibkeys:
        # Missing key; only an error when bibkeys are mandatory.
        log.error("Paper {} has no bibkey!".format(paper.full_id))

def register(self, paper, dummy=False):
"""Register all names associated with the given paper.
"""Register bibkey and names associated with the given paper.
:param dummy: If True, will only resolve the author/editor names without
actually linking them to the given paper. This is used for volumes
Expand All @@ -243,7 +275,11 @@ def register(self, paper, dummy=False):
assert isinstance(paper, Paper), "Expected Paper, got {} ({})".format(
type(paper), repr(paper)
)
paper.bibkey = self.create_bibkey(paper)
# Make sure paper has a bibkey and it is unique (except for dummy
# frontmatter, as it is not an actual paper)
if not dummy:
self.register_bibkey(paper)
# Resolve and register authors/editors for this paper
for role in ("author", "editor"):
for name, id_ in paper.get(role, []):
if id_ is None:
Expand Down
9 changes: 5 additions & 4 deletions bin/anthology/papers.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def __init__(self, paper_id, ingest_date, volume, formatter):
self.formatter = formatter
self._id = paper_id
self._ingest_date = ingest_date
self._bibkey = False
self._bibkey = None
self.is_volume = paper_id == "0"

# initialize metadata with keys inherited from volume
Expand Down Expand Up @@ -74,7 +74,10 @@ def from_xml(xml_element, *args):
for key, value in parse_element(xml_element).items():
if key == "author" and "editor" in paper.attrib:
del paper.attrib["editor"]
paper.attrib[key] = value
if key == "bibkey":
paper.bibkey = value
else:
paper.attrib[key] = value

# Frontmatter title is the volume 'booktitle'
if paper.is_volume:
Expand Down Expand Up @@ -185,8 +188,6 @@ def anthology_id(self):

@property
def bibkey(self):
if not self._bibkey:
self._bibkey = self.full_id # fallback
return self._bibkey

@bibkey.setter
Expand Down
13 changes: 12 additions & 1 deletion bin/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@

from collections import defaultdict, OrderedDict
from datetime import datetime
from glob import glob

from normalize_anth import normalize
from anthology.bibtex import read_bibtex
Expand All @@ -68,6 +69,15 @@ def log(text: str, fake: bool = False):
print(f"{message}{text}", file=sys.stderr)


def load_bibkeys(anthology_datadir):
    """Collect all bibkeys already present in the Anthology XML files.

    Parses every ``*.xml`` file inside the ``xml`` subdirectory of
    `anthology_datadir` and gathers the text of all ``<bibkey>`` elements.

    :param anthology_datadir: Path to the Anthology data directory.
    :return: Set with the text of every non-empty ``<bibkey>`` element.
    """
    bibkeys = set()
    for xmlfile in glob(os.path.join(anthology_datadir, "xml", "*.xml")):
        root = etree.parse(xmlfile).getroot()
        # An empty <bibkey/> element has `text` set to None; converting that
        # with str() would register the bogus key "None", so skip it.
        bibkeys.update(
            str(elem.text) for elem in root.iterfind(".//bibkey") if elem.text
        )
    return bibkeys


def read_meta(path: str) -> Dict[str, Any]:
meta = {"chairs": []}
with open(path) as instream:
Expand Down Expand Up @@ -189,7 +199,8 @@ def main(args):

sig_index = SIGIndex(srcdir=anthology_datadir)

people = AnthologyIndex(None, srcdir=anthology_datadir)
people = AnthologyIndex(srcdir=anthology_datadir)
people.bibkeys = load_bibkeys(anthology_datadir)

def correct_caps(name):
"""
Expand Down
116 changes: 116 additions & 0 deletions bin/write_bibkeys_to_xml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright 2021 Marcel Bollmann <[email protected]>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Usage: write_bibkeys_to_xml.py [--importdir=DIR] [-c | --commit] [--debug]
Generates BibTeX keys for papers that lack them, and writes them to the XML
(if -c|--commit is given).
Options:
--importdir=DIR Directory to import XML files from.
[default: {scriptdir}/../data/]
-c, --commit Commit (=write) the changes to the XML files;
will only do a dry run otherwise.
--debug Output debug-level log messages.
-h, --help Display this helpful text.
"""

from docopt import docopt
from lxml import etree
import logging as log
import os

from anthology import Anthology
from anthology.utils import SeverityTracker, make_simple_element, indent

import lxml.etree as ET


def write_bibkeys(anthology, srcdir, commit=False):
    """Report papers lacking a bibkey and, optionally, write new keys to XML.

    For every volume, the number of papers whose bibkey is missing (or equal
    to the paper's full ID) is logged. Unless `commit` is set, nothing else
    happens. Otherwise a key is created through `anthology.pindex` for each
    such paper, and a ``<bibkey>`` element is added under the matching node of
    the collection's XML file, which is then re-indented and written back.

    :param anthology: Anthology object providing `volumes`, `pindex` and
        `venues`.
    :param srcdir: Data directory containing the ``xml`` subdirectory.
    :param commit: If False (default), only log what would be changed.
    """

    def lacks_bibkey(paper):
        key = paper.bibkey
        return key is None or key == paper.full_id

    for volume_id, volume in anthology.volumes.items():
        pending = [paper for paper in volume if lacks_bibkey(paper)]
        if not pending:
            continue

        log.info(
            f"Found {len(pending):4d} papers without bibkeys in volume {volume_id}"
        )
        if not commit:
            # Dry run: reporting is all we do
            continue

        # We got some new bibkeys and need to write them to the XML
        xml_file = os.path.join(srcdir, "xml", f"{volume.collection_id}.xml")
        tree = ET.parse(xml_file)
        root = tree.getroot()

        for paper in pending:
            is_frontmatter = paper.paper_id == "0"
            if is_frontmatter:
                node = root.find(f"./volume[@id='{paper.volume_id}']/frontmatter")
            else:
                node = root.find(
                    f"./volume[@id='{paper.volume_id}']/paper[@id='{paper.paper_id}']"
                )

            if node is None:
                # Missing frontmatter is silently skipped (dummy frontmatter);
                # a missing regular paper is reported as an error.
                if not is_frontmatter:
                    log.error(f"Paper {paper.full_id} not found in {xml_file}")
                continue

            # Generate unique bibkey
            new_key = anthology.pindex.create_bibkey(paper, vidx=anthology.venues)
            make_simple_element("bibkey", new_key, parent=node)

        indent(root)
        tree.write(xml_file, encoding="UTF-8", xml_declaration=True)


if __name__ == "__main__":
    # Command-line options are parsed from the module docstring.
    args = docopt(__doc__)
    scriptdir = os.path.dirname(os.path.abspath(__file__))
    # The default --importdir contains a "{scriptdir}" placeholder that has
    # to be expanded relative to the location of this script.
    if "{scriptdir}" in args["--importdir"]:
        args["--importdir"] = os.path.abspath(
            args["--importdir"].format(scriptdir=scriptdir)
        )

    log_level = log.DEBUG if args["--debug"] else log.INFO
    log.basicConfig(format="%(levelname)-8s %(message)s", level=log_level)
    # NOTE(review): SeverityTracker presumably records the highest severity
    # of all messages logged so far -- confirm in anthology.utils.
    tracker = SeverityTracker()
    log.getLogger().addHandler(tracker)

    log.info("Instantiating the Anthology...")
    # Papers without bibkeys are expected here, so don't require them.
    anthology = Anthology(importdir=args["--importdir"], require_bibkeys=False)
    log.info("Scanning for papers without <bibkey> tags...")
    write_bibkeys(anthology, args["--importdir"], commit=bool(args["--commit"]))

    if not args["--commit"]:
        if tracker.highest >= log.ERROR:
            log.warning(
                "There were errors! Please check them carefully before re-running this script with -c/--commit."
            )
        else:
            log.warning(
                "Re-run this script with -c/--commit to save these changes to the XML files."
            )

    if tracker.highest >= log.ERROR:
        # Raise SystemExit directly: the exit() builtin is injected by the
        # `site` module for interactive use and is unavailable under -S.
        raise SystemExit(1)
Loading

0 comments on commit 9c9d9dd

Please sign in to comment.