### This script extracts pairs of sentences from the SweLL-gold corpus, pairing sources (originals) with targets (correction hypotheses) for essays where the two have the same number of sentences.
import xml.etree.ElementTree as ET
import argparse
import csv
import conllu
#######################################################################
### This is used to replace "pseudonyms" like A-stad or B-land with something more readable (but not necessarily logically coherent).
placeholder_map = { # baseline "pseudonymization"
    'kurs': ['kurs'],
    'kursen': ['kursen'],
    'skola': ['Buroskola', 'Andeskola', 'Storeskola', 'Bungahjulet'],
    'region': ['Sydlunda', 'Undered', 'Hanskim', 'Bungalarna'],
    'svensk-stad': ['Sydden', 'Norrebock', 'Rosaborg', 'Ögglestad'],
    'institution': ['Volvodrömen', 'Linsbiblioteket', 'Forkecentralen', 'Bungavård'],
    'geoplats': ['Fafjällen', 'Undberget', 'Baraön', 'Lokomitt'],
    'linjen': ['buss'],
    'stad-gen': ['Syddens', 'Norrebocks', 'Rosaborgs', 'Ögglestads'],
    'stad': ['Oslo', 'Paris', 'Bagdad', 'Caracas'],
    'land': ['Danmark', 'Mongoliet', 'Sudan', 'Peru'],
    'land-gen': ['Danmarks', 'Mongoliets', 'Sudans', 'Perus'],
    'hemland': ['Brasil', 'Spanien', 'Irak', 'Kina'],
    'plats': ['Burocentrum', 'Andeplats', 'Storetorg', 'Bungafors']
}
#######################################################################
def pseudonymize(word):
    '''Replace a placeholder like A-stad or B-land with a readable pseudonym from placeholder_map.'''
    if 'A-' in word:
        i = 0
    elif 'B-' in word:
        i = 1
    elif 'C-' in word:
        i = 2
    elif 'D-' in word:
        i = 3
    else:
        return word
    try:
        pseudo_candidates = placeholder_map[word[2:]]
        pseudo = pseudo_candidates[i % len(pseudo_candidates)]
    except KeyError:
        return word
    return pseudo
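# A minimal sketch of the expected behavior (assuming the map above; the
# A-/B-/C-/D- prefix selects among the candidate replacements):
#
#   >>> pseudonymize('B-stad')
#   'Paris'
#   >>> pseudonymize('D-land')
#   'Peru'
#   >>> pseudonymize('okänd')  # no recognized prefix: returned unchanged
#   'okänd'
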
def get_essays(file: str):
    '''A function that retrieves the data from a SweLL XML file and splits it into essays.

    Args:
        file (str): Path and name of the file.
    Returns:
        A list of essays.
    '''
    tree = ET.parse(file)
    root = tree.getroot()
    essays = []
    essay = []
    for elem in root.iter():
        if elem.tag == 'text': # a new essay starts here
            if len(essay) > 1: # exclude meaningless (metadata-only) essays
                essays.append(essay)
            essay = []
        essay.append((elem.tag, elem.text, elem.attrib))
    essays.append(essay)
    return essays
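# Each essay is a list of (tag, text, attrib) triples in document order; a
# sketch of the expected shape (tag and attribute names other than 'text',
# 'sentence', 'link', 'essay_id' and 'correction_label' are hypothetical):
#
#   [('text', None, {'essay_id': 'A1001', ...}),
#    ('sentence', None, {}),
#    ('w', 'Jag', {'correction_label': '_'}),
#    ...]
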
def get_sent_dict(essays: list, replace_nls: bool=False):
    '''A function that retrieves the sentences of every essay.

    Args:
        essays (list): A list of essays retrieved from the XML file.
        replace_nls (bool): A flag specifying whether newline placeholder characters are to be replaced with \n. If set to False, no \n characters appear in the final output.
    Returns:
        A dictionary mapping essay IDs to the corresponding metadata and sentences.
    '''
    essay_dict = {}
    essay_sents = []
    for essay in essays:
        metadata = essay[0][2]
        essay_id = metadata['essay_id']
        sent = []
        for (cat, word, info) in essay[1:]: # skip the metadata element
            if replace_nls:
                # swap the corpus's newline placeholder for an actual newline
                word = word.replace("", "\n")
            if cat == 'link':
                continue
            elif cat == 'sentence': # sentence boundary: flush the current sentence
                if len(sent) > 0:
                    essay_sents.append(sent)
                sent = []
            else:
                # only pseudonymize words containing neither the newline
                # placeholder nor the essay ID
                if '' not in word and essay_id not in word:
                    word = pseudonymize(word)
                sent.append((word, info["correction_label"] if "correction_label" in info else "_"))
        essay_sents.append(sent)
        essay_dict[essay_id] = {"metadata": metadata, "sentences": essay_sents}
        essay_sents = []
    return essay_dict
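# A sketch of the resulting structure (IDs, tokens and labels are made up):
#
#   {'A1001': {'metadata': {'essay_id': 'A1001', ...},
#              'sentences': [[('Jag', '_'), ('bor', '_'), ('i', '_'), ('Paris', '_')],
#                            ...]}}
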
def pair_up(source_dict: dict, target_dict: dict):
    '''A function that selects the essays with an equal number of sentences in both source and target and aligns them naively.

    Args:
        source_dict (dict): A dictionary with source sentences.
        target_dict (dict): A dictionary with target sentences.
    Returns:
        A list of dicts, each containing an original sentence, the corresponding target sentence and the metadata of the essay they belong to.
    '''
    paired_up = []
    for essay_id, val in source_dict.items():
        metadata = val["metadata"]
        assert metadata == target_dict[essay_id]["metadata"]
        sentences = val["sentences"]
        if len(sentences) == len(target_dict[essay_id]["sentences"]):
            for i, sent in enumerate(sentences):
                paired_up.append({
                    "original": sent,
                    "target": target_dict[essay_id]["sentences"][i],
                    "metadata": metadata})
    return paired_up
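# Because only essays whose source and target have the same number of
# sentences are kept, alignment is a simple 1:1 mapping by index; each pair
# then looks like (content hypothetical):
#
#   {'original': [('Jag', '_'), ('bur', 'O')],
#    'target':   [('Jag', '_'), ('bor', '_')],
#    'metadata': {'essay_id': 'A1001', ...}}
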
def tokenlist(word_label_pairs, metadata):
    '''
    Build a conllu.TokenList out of the sentence-level information extracted
    from SweLL.

    Args:
        word_label_pairs (list): a list of word-correction label pairs.
        metadata (dict): metadata of the essay the sentence belongs to, directly in SweLL format.
    Returns:
        a conllu.TokenList representing the sentence.
    '''
    tokens = []
    for i, (word, label) in enumerate(word_label_pairs):
        tokens.append(conllu.Token(
            id=i + 1,
            form=word,
            lemma="_",
            upos="_",
            xpos="_",
            feats="_",
            head="_",
            deprel="_",
            deps="_",
            misc=label))
    return conllu.TokenList(tokens=tokens, metadata=metadata)
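# When serialized, each sentence becomes a standard 10-column CoNLL-U block
# with the correction label in the MISC column and the essay metadata as
# comment lines, e.g. (hypothetical sentence):
#
#   # essay_id = A1001
#   1   Jag   _   _   _   _   _   _   _   _
#   2   bur   _   _   _   _   _   _   _   O
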
if __name__ == "__main__":
    formats = ["tsv", "conllu"]
    parser = argparse.ArgumentParser()
    parser.add_argument('source', help='The path to the sourceSweLL.xml file')
    parser.add_argument('target', help='The path to the targetSweLL.xml file')
    parser.add_argument('--format', required=False, default="tsv", help='output format (tab-separated values or CoNLL-U)', type=str, choices=formats)
    parser.add_argument('--outfile', required=False, default="swell_sent_pairs.tsv", type=str, help='The name for the output file. If outputting CoNLL-U, "org-" and "trg-" prefixes are added to the corresponding filenames')
    args = parser.parse_args()

    source_essays = get_essays(args.source)
    target_essays = get_essays(args.target)

    output_format = args.format
    if output_format not in formats: # redundant with argparse choices, kept as a safeguard
        print("invalid format, please select 'conllu' or 'tsv'")
        exit(-1)

    source_dict = get_sent_dict(source_essays)
    target_dict = get_sent_dict(target_essays)
    pairs = pair_up(source_dict, target_dict)

    if output_format == "tsv":
        with open(args.outfile, "w") as outfile:
            for (i, pair) in enumerate(pairs):
                org_sent = " ".join([token for (token, _) in pair["original"]])
                trg_sent = " ".join([token for (token, _) in pair["target"]])
                labels = ",".join([label for (_, label) in pair["original"] if label != "_"])
                csv_row = pair["metadata"] | {"original sentence": org_sent, "corrected sentence": trg_sent, "correction labels": labels}
                writer = csv.DictWriter(outfile, csv_row.keys(), delimiter="\t")
                if i == 0:
                    writer.writeheader()
                writer.writerow(csv_row)
    else: # conllu
        with open("org-" + args.outfile, "w") as org_outfile, open("trg-" + args.outfile, "w") as trg_outfile:
            for pair in pairs:
                metadata = pair["metadata"]
                org_sent = tokenlist(pair["original"], metadata)
                trg_sent = tokenlist(pair["target"], metadata)
                org_outfile.write(org_sent.serialize())
                trg_outfile.write(trg_sent.serialize())
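
# Example invocations (file names are placeholders):
#
#   python extract_sentence_pairs.py sourceSweLL.xml targetSweLL.xml
#   python extract_sentence_pairs.py sourceSweLL.xml targetSweLL.xml \
#       --format conllu --outfile swell_sent_pairs.conllu
#
# The first writes swell_sent_pairs.tsv; the second writes
# org-swell_sent_pairs.conllu and trg-swell_sent_pairs.conllu.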