forked from nblomqvist/easy2acl
-
Notifications
You must be signed in to change notification settings - Fork 8
/
easy2acl.py
executable file
·229 lines (187 loc) · 7.39 KB
/
easy2acl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
#!/usr/bin/env python3
#
# easy2acl.py - Convert data from EasyChair for use with ACLPUB
#
# Original Author: Nils Blomqvist
# Forked/modified by: Asad Sayeed
# Further modifications and docs (for 2019 Anthology): Matt Post
# Index for LaTeX book proceedings: Mehdi Ghanimifard and Simon Dobnik
#
# Please see the documentation in the README file at http://github.com/acl-org/easy2acl.
import os
import re
import sys
from csv import DictReader
from glob import glob
from shutil import copy, rmtree
from unicode_tex import unicode_to_tex
from pybtex.database import BibliographyData, Entry
from PyPDF2 import PdfFileReader
def texify(string):
"""Return a modified version of the argument string where non-ASCII symbols have
been converted into LaTeX escape codes.
"""
return ' '.join(map(unicode_to_tex, string.split())).replace(r'\textquotesingle', "'")
#,----
#| Metadata
#`----
metadata = { 'chairs': [] }
with open('meta') as metadata_file:
for line in metadata_file:
key, value = line.rstrip().split(maxsplit=1)
if key == 'chairs':
metadata[key].append(value)
else:
metadata[key] = value
for key in 'abbrev volume title shortbooktitle booktitle month year location publisher chairs'.split():
if key not in metadata:
print('Fatal: missing key "{}" from "meta" file'.format(key))
print("Please see the documentation at https://acl-org.github.io/ACLPUB/anthology.html.")
sys.exit(1)
for key in "bib_url volume_name short_booktitle type".split():
if key in metadata:
print('Fatal: bad key "{}" in the "meta" file'.format(key))
print("Please see the documentation at https://acl-org.github.io/ACLPUB/anthology.html.")
sys.exit(1)
venue = metadata["abbrev"]
volume_name = metadata["volume"]
year = metadata["year"]
#
# Build a dictionary of submissions (which has author information).
#
submissions = {}
with open('submissions') as submissions_file:
for line in submissions_file:
entry = line.rstrip().split("\t")
submission_id = entry[0]
authors = entry[1].replace(' and', ',').split(', ')
title = entry[2]
submissions[submission_id] = (title, authors)
print("Found ", len(submissions), " submitted files")
#
# Append each accepted submission, as a tuple, to the 'accepted' list.
# Order in this file is used to determine program order.
#
accepted = []
with open('accepted') as accepted_file:
for line in accepted_file:
entry = line.rstrip().split("\t")
# modified here to filter out the rejected files rather than doing
# that by hand
if entry[-1] == 'ACCEPT':
submission_id = entry[0]
title = entry[1]
authors = submissions[submission_id][1]
accepted.append((submission_id, title, authors))
print("Found ", len(accepted), " accepted files")
# Read abstracts
abstracts = {}
if os.path.exists('submission.csv'):
with open('submission.csv') as csv_file:
d = DictReader(csv_file)
for row in d:
abstracts[row['#']] = row['abstract']
print('Found ', len(abstracts), 'abstracts')
else:
print('No abstracts available.')
#
# Find all relevant PDFs
#
# The PDF of the full proceedings
full_pdf_file = 'pdf/{}_{}.pdf'.format(venue, year)
if not os.path.exists(full_pdf_file):
print("Fatal: could not find full volume PDF '{}'".format(full_pdf_file))
sys.exit(1)
# The PDF of the frontmatter
frontmatter_pdf_file = 'pdf/{}_{}_frontmatter.pdf'.format(venue, year)
if not os.path.exists(frontmatter_pdf_file):
print("Fatal: could not find frontmatter PDF file '{}'".format(frontmatter_pdf_file))
sys.exit(1)
# File locations of all PDFs (seeded with PDF for frontmatter)
pdfs = { '0': frontmatter_pdf_file }
for pdf_file in glob('pdf/{}_{}_paper_*.pdf'.format(venue, year)):
submission_id = pdf_file.split('_')[-1].replace('.pdf', '')
pdfs[submission_id] = pdf_file
# List of accepted papers (seeded with frontmatter)
accepted.insert(0, ('0', metadata['booktitle'], metadata['chairs']))
#
# Create Anthology tarball
#
# Create destination directories
for dir in ['bib', 'pdf']:
dest_dir = os.path.join('proceedings/cdrom', dir)
if not os.path.exists(dest_dir):
os.makedirs(dest_dir)
# Copy over "meta" file
print('COPYING meta -> proceedings/meta', file=sys.stderr)
copy('meta', 'proceedings/meta')
final_bibs = []
start_page = 1
for paper_id, entry in enumerate(accepted):
submission_id, paper_title, authors = entry
authors = ' and '.join(authors)
if not submission_id in pdfs:
print('Fatal: no PDF found for paper', paper_id, file=sys.stderr)
sys.exit(1)
pdf_path = pdfs[submission_id]
dest_path = 'proceedings/cdrom/pdf/{}.{}-{}.{}.pdf'.format(year, venue, volume_name, paper_id)
copy(pdf_path, dest_path)
print('COPYING', pdf_path, '->', dest_path, file=sys.stderr)
bib_path = dest_path.replace('pdf', 'bib')
if not os.path.exists(os.path.dirname(bib_path)):
os.makedirs(os.path.dirname(bib_path))
anthology_id = os.path.basename(dest_path).replace('.pdf', '')
bib_type = 'inproceedings' if submission_id != '0' else 'proceedings'
bib_entry = Entry(bib_type, [
('author', authors),
('title', paper_title),
('year', metadata['year']),
('month', metadata['month']),
('address', metadata['location']),
('publisher', metadata['publisher']),
])
# Add page range if not frontmatter
if paper_id > 0:
with open(pdf_path, 'rb') as in_:
file = PdfFileReader(in_)
last_page = start_page + file.getNumPages() - 1
bib_entry.fields['pages'] = '{}--{}'.format(start_page, last_page)
start_page = last_page + 1
# Add the abstract if present
if submission_id in abstracts:
bib_entry.fields['abstract'] = abstracts.get(submission_id)
# Add booktitle for non-proceedings entries
if bib_type == 'inproceedings':
bib_entry.fields['booktitle'] = metadata['booktitle']
try:
bib_string = BibliographyData({ anthology_id: bib_entry }).to_string('bibtex')
except TypeError as e:
print('Fatal: Error in BibTeX-encoding paper', submission_id, file=sys.stderr)
sys.exit(1)
final_bibs.append(bib_string)
with open(bib_path, 'w') as out_bib:
print(bib_string, file=out_bib)
print('CREATED', bib_path)
# Create an index for LaTeX book proceedings
if not os.path.exists('book-proceedings'):
os.makedirs('book-proceedings')
with open('book-proceedings/all_papers.tex', 'w') as book_file:
for entry in accepted:
submission_id, paper_title, authors = entry
if submission_id == '0':
continue
if len(authors) > 1:
authors = ', '.join(authors[:-1]) + ' and ' + authors[-1]
else:
authors = authors[0]
print("""\goodpaper{{../{pdf_file}}}{{{title}}}%
{{{authors}}}\n""".format(authors=texify(authors), pdf_file=pdfs[submission_id], title=texify(paper_title)), file=book_file)
# Write the volume-level bib with all the entries
dest_bib = 'proceedings/cdrom/{}-{}.bib'.format(venue, year)
with open(dest_bib, 'w') as whole_bib:
print('\n'.join(final_bibs), file=whole_bib)
print('CREATED', dest_bib)
# Copy over the volume-level PDF
dest_pdf = dest_bib.replace('bib', 'pdf')
print('COPYING', full_pdf_file, '->', dest_pdf, file=sys.stderr)
copy(full_pdf_file, dest_pdf)