# Script to convert M2 files to CoNLL tab-delimited vertical format, with tokens and error types in two columns, by Chris Bryant, [email protected]
# Minor modifications by Andrew Caines, [email protected], to add the hypothesised corrections as a third column
# Requires arguments: /path/to/inputfile.m2 and -out /path/to/outputfile.conll
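# Illustrative example (the sentence and edit below are invented for demonstration, not taken
# from any corpus): given an M2 block such as
#   S This are a sentence .
#   A 1 2|||R:VERB:SVA|||is|||REQUIRED|||-NONE-|||0
# the script writes one tab-delimited line per token, holding the token, its error type and
# its correction ("-" in both columns for unedited tokens), with a blank line after each sentence:
#   This      -            -
#   are       R:VERB:SVA   is
#   a         -            -
#   sentence  -            -
#   .         -            -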
import argparse

def main():
    # Parse command line args
    args = parse_args()
    # Open the M2 file and output file
    with open(args.m2_file) as m2, open(args.out, "w") as out:
        # Collect a complete M2 block here
        m2_block = []
        # Loop through M2 lines
        for line in m2:
            line = line.strip()
            # Add non-empty lines to the block
            if line: m2_block.append(line)
            # Empty lines signal the end of the block, so process it
            else:
                # Get the original text and edits
                orig = m2_block[0].split()[1:]  # Ignore leading "S"
                edits = simplify_edits(m2_block[1:])
                # Create a dict of edited token ids and labels
                tok_dict = create_token_labels(orig, edits)
                # Loop through orig tokens by id (AC 2022-07-19: add a 3rd column for hypothesised corrections)
                for i in range(0, len(orig)):
                    # Erroneous token: write token, error type and correction
                    if i in tok_dict: out.write("\t".join([orig[i], tok_dict[i]])+"\n")
                    # Correct token: write token and "-" placeholders
                    else: out.write("\t".join([orig[i], "-", "-"])+"\n")
                # Blank line marks the end of the sentence in the output
                out.write("\n")
                # Reset the block
                m2_block = []
# Parse command line arguments
def parse_args():
    # Define and parse program input
    parser = argparse.ArgumentParser()
    parser.add_argument("m2_file", help="The path to an M2 file.")
    parser.add_argument("-out", help="The path to the output conll file", required=True)
    args = parser.parse_args()
    return args
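# Example invocation (the filenames below are illustrative, not from the original repository):
#   python m2_to_conll_conversion-script.py corpus.m2 -out corpus.conll
# The positional argument is the input M2 file and -out names the CoNLL output file.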
# Input: A list of M2 edit lines
# Output: A list of simplified edits; [[o_start, o_end, cat, cor], ...]
# AC 2022-07-19: add the correction too (e[3])
def simplify_edits(edits_in):
    edits_out = []
    for e in edits_in:
        e = e.split("|||")
        span = e[0].split()
        # Ignore noop and UNK edits
        if e[1] == "noop" or e[1] == "UNK": continue
        # Only process annotator 0 for now
        if e[-1] != "0": continue
        # Save the simplified edit: start offset, end offset, error category, correction
        edits_out.append([int(span[1]), int(span[2]), e[1], e[2]])
    return edits_out
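# Illustrative example (invented, not from any corpus): the M2 edit line
#   "A 1 2|||R:VERB:SVA|||is|||REQUIRED|||-NONE-|||0"
# is simplified to [1, 2, "R:VERB:SVA", "is"].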
# Input 1: A list of original tokens
# Input 2: A list of edits: [[o_start, o_end, cat, cor], ...]
# Output: A dictionary; key is token id, value is the error type and correction (tab-separated)
# AC 2022-07-19: add correction too (e[3])
def create_token_labels(orig, edits):
    tok_dict = {}
    # Loop through edits
    for e in edits:
        # Adjust missing word spans to have a non-zero range
        if e[0] == e[1]: e[1] += 1
        # Loop through the range of ids
        for i in range(e[0], e[1]):
            # Save the label on this token
            tok_dict[i] = e[2] + "\t" + e[3]
    # NOTE: Tokens that have multiple labels will end up with the last one
    return tok_dict
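# Illustrative example (continuing the invented edit above): with
#   orig = ["This", "are", "a", "sentence", "."]
#   edits = [[1, 2, "R:VERB:SVA", "is"]]
# create_token_labels returns {1: "R:VERB:SVA\tis"}, i.e. only token 1 carries a label.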
# Run the main function
if __name__ == "__main__":
    main()