Ingesting EAMT archives (acl-org#968)
* added ingest_mtarchive.py script
* small modifications to anthology/utils.py
* add dummy frontmatter to volumes to ensure editors get added to people
* added EAMT 1993–2015
* split EAMT 2016 volume 2 into individual papers
* miscellaneous name fixes
* added EAMT to the front page

Co-authored-by: Daniel Gildea <gildea>
mjpost authored Nov 3, 2020
1 parent 277294a commit b392b8f
Showing 30 changed files with 4,279 additions and 73 deletions.
5 changes: 5 additions & 0 deletions bin/anthology/utils.py
@@ -129,6 +129,10 @@ def retrieve_url(remote_url: str, local_path: str):
:param remote_url: The URL to download from. Currently supports http only.
:param local_path: Where to save the file to.
"""
outdir = os.path.dirname(local_path)
if not os.path.exists(outdir):
os.makedirs(outdir)

if remote_url.startswith("http"):
import ssl
import urllib.request
@@ -138,6 +142,7 @@ def retrieve_url(remote_url: str, local_path: str):
request = urllib.request.Request(
remote_url, headers={'User-Agent': 'Mozilla/5.0'}
)

with opener.open(request, timeout=1000) as url, open(
local_path, mode="wb"
) as input_file_fh:
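With the new guard in place, callers no longer need to create the destination directory before calling retrieve_url. A minimal usage sketch (URL and paths are hypothetical):

from anthology.utils import retrieve_url

# The parent directory ("downloads/eamt" here) is created on demand
# by the guard added above; previously this was the caller's job.
retrieve_url(
    "https://example.org/proceedings.pdf",  # hypothetical source URL
    "downloads/eamt/proceedings.pdf",       # hypothetical local path
)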
6 changes: 5 additions & 1 deletion bin/anthology/volumes.py
@@ -83,7 +83,11 @@ def from_xml(
front_matter_xml = volume_xml.find("frontmatter")
if front_matter_xml is not None:
front_matter = Paper.from_xml(front_matter_xml, volume, formatter)
volume.add_frontmatter(front_matter)
else:
# dummy front matter to make sure that editors of
# volume get registered as people in author database
front_matter = Paper("0", ingest_date, volume, formatter)
volume.add_frontmatter(front_matter)

return volume

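The branch condition is volume_xml.find("frontmatter") is not None; a small standalone check of the two volume shapes involved (element names follow the Anthology XML schema, the rest is illustrative):

import lxml.etree as etree

# A volume with an explicit <frontmatter> element takes the original
# branch ...
with_fm = etree.fromstring("<volume id='1'><frontmatter/></volume>")
print(with_fm.find("frontmatter") is not None)   # True

# ... while one without it now receives the dummy Paper("0", ...) as
# front matter, so its editors still reach the author database.
without_fm = etree.fromstring("<volume id='2'><meta/></volume>")
print(without_fm.find("frontmatter") is not None)  # False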
87 changes: 30 additions & 57 deletions bin/ingest_mtarchive.py
@@ -32,43 +32,16 @@
import sys
import urllib.request

from anthology.utils import make_simple_element, indent, compute_hash
from anthology.utils import (
make_simple_element,
indent,
compute_hash_from_file,
retrieve_url,
)
from datetime import datetime
from normalize_anth import normalize


def download(remote_path, local_path):
if os.path.exists(local_path):
print(f"{local_path} already exists, not re-downloading", file=sys.stderr)
return True
local_dir = os.path.dirname(local_path)
if not os.path.exists(local_dir):
print(f"Creating directory {local_dir}", file=sys.stderr)
os.makedirs(local_dir)

if remote_path.startswith("http"):
try:
print(
f"-> Downloading file from {remote_path} to {local_path}", file=sys.stderr
)
with urllib.request.urlopen(remote_path) as url, open(
local_path, mode="wb"
) as input_file_fh:
input_file_fh.write(url.read())
except ssl.SSLError:
raise Exception(f"Could not download {remote_path} to {local_path}")
except urllib.error.HTTPError:
print(f"-> FAILED to download {remote_path}", file=sys.stderr)
return False
except urllib.error.URLError:
print(f"-> FAILED to download {remote_path}", file=sys.stderr)
return False
else:
shutil.copyfile(remote_path, local_path)

return True


def extract_pages(source_path, page_range, local_path):
if os.path.exists(local_path):
print(f"{local_path} already exists, not re-extracting", file=sys.stderr)
@@ -150,9 +123,8 @@ def main(args):
pdf_local_path = os.path.join(
args.anthology_files_path, venue, f"{volume_anth_id}.pdf"
)
download(proceedings_pdf, pdf_local_path)
with open(pdf_local_path, "rb") as f:
checksum = compute_hash(f.read())
retrieve_url(proceedings_pdf, pdf_local_path)
checksum = compute_hash_from_file(pdf_local_path)
make_simple_element(
"url", volume_anth_id, attrib={"hash": checksum}, parent=meta
)
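compute_hash_from_file replaces the read-then-compute_hash pattern of the old code with a single call. Its internals are not shown in this diff; a plausible shape, with the digest algorithm as an assumption:

import hashlib

def compute_hash_from_file_sketch(path: str) -> str:
    # Assumed stand-in for the helper used above; the Anthology's
    # actual choice of hash function may differ.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 16), b""):
            h.update(chunk)
    return h.hexdigest()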
@@ -179,10 +151,6 @@
)
sys.exit(1)

if not os.path.exists(collection_id):
print(f"Creating {collection_id}", file=sys.stderr)
os.makedirs(collection_id)

paperid = 0
# Create entries for all the papers
for row in csv.DictReader(args.tsv_file, delimiter=args.delimiter):
@@ -202,18 +170,20 @@
# Only make the title for not-the-frontmatter
make_simple_element("title", title_text, parent=paper)

author_list = row["Authors"].split(" and ")

for author_name in author_list:
if author_name == "":
continue
author = make_simple_element("author", parent=paper)
if ", " in author_name:
last, first = author_name.split(", ")
else:
first, last = ' '.join(author_name.split()[:-1]), author_name.split()[-1]
make_simple_element("first", first, parent=author)
make_simple_element("last", last, parent=author)
author_list = row["Authors"].split(" and ")
for author_name in author_list:
if author_name == "":
continue
author = make_simple_element("author", parent=paper)
if ", " in author_name:
last, first = author_name.split(", ")
else:
first, last = (
' '.join(author_name.split()[:-1]),
author_name.split()[-1],
)
make_simple_element("first", first, parent=author)
make_simple_element("last", last, parent=author)

if pages is not None:
make_simple_element("pages", pages, parent=paper)
@@ -223,7 +193,7 @@
pdf_local_path = os.path.join(args.anthology_files_path, venue, f"{anth_id}.pdf")
url = None
if "Pdf" in row and row["Pdf"] != "":
if download(row["Pdf"], pdf_local_path):
if retrieve_url(row["Pdf"], pdf_local_path):
url = anth_id

elif "pages in pdf" in row:
@@ -232,8 +202,7 @@
url = anth_id

if url is not None:
with open(pdf_local_path, "rb") as f:
checksum = compute_hash(f.read())
checksum = compute_hash_from_file(pdf_local_path)

make_simple_element("url", url, attrib={"hash": checksum}, parent=paper)

@@ -252,9 +221,13 @@
venue,
name,
)
if download(row["Presentation"], local_path):
if retrieve_url(row["Presentation"], local_path):
checksum = compute_hash_from_file(local_path)
make_simple_element(
"attachment", name, attrib={"type": "presentation"}, parent=paper
"attachment",
name,
attrib={"type": "presentation", "hash": checksum},
parent=paper,
)

# Normalize
[Diffs for the remaining 27 changed files are not shown.]
