-
Notifications
You must be signed in to change notification settings - Fork 2
/
evaluate.py
139 lines (121 loc) · 5.5 KB
/
evaluate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import argparse, evaluator, pyconll
POS = evaluator.POS_KEY
def load_predictions(args):
    """Load the prediction file through pyconll.

    When the three ``pred_*_index`` options have the regular CoNLL-U
    positions (UPOS=3, XPOS=4, FEATS=5) the file is passed to pyconll
    unchanged.  Otherwise every token line is rewritten into a minimal
    ten-column line that carries only the UPOS, XPOS and FEATS values found
    at the configured columns; blank lines and lines starting with ``#`` are
    copied through verbatim.

    Args:
        args: Parsed command-line namespace.  Reads ``prediction``,
            ``pred_upos_index``, ``pred_xpos_index`` and ``pred_feats_index``.
            An index that is negative or beyond the end of a line yields the
            placeholder ``"_"`` for that column.

    Returns:
        Whatever ``pyconll.load_from_file`` / ``pyconll.load_from_string``
        returns for the (possibly rewritten) prediction data.
    """
    column_indices = (
        args.pred_upos_index,
        args.pred_xpos_index,
        args.pred_feats_index,
    )

    # Regular CoNLL-U format: nothing to rewrite.
    if column_indices == (3, 4, 5):
        return pyconll.load_from_file(args.prediction)

    # Other format: collect the rewritten lines and join once at the end
    # instead of growing a string with repeated ``+=``.
    rebuilt = []
    # CoNLL-U is specified as UTF-8, so do not depend on the platform's
    # default text encoding when reading the file ourselves.
    with open(args.prediction, "r", encoding="utf-8") as pred_file:
        for line in pred_file:
            if line.strip() == "" or line.startswith("#"):
                rebuilt.append(line)
                continue
            elements = line.split("\t")
            upos, xpos, feats = (
                elements[index].strip() if 0 <= index < len(elements) else "_"
                for index in column_indices
            )
            rebuilt.append(
                "0\t_\t_\t{}\t{}\t{}\t0\t_\t_\t_\n".format(upos, xpos, feats)
            )
    return pyconll.load_from_string("".join(rebuilt))
def extract_inconsistencies(training_file):
    """Find, per UPOS tag, the features never seen with it in a training file.

    Only feature names contained in ``evaluator.UNIV_FEATURES`` are
    considered.  For every UPOS tag occurring in ``training_file`` the result
    holds the set of such features that occur somewhere in the file but never
    on a token carrying that tag.

    Args:
        training_file: Path of a file readable by ``pyconll.load_from_file``.

    Returns:
        dict mapping each observed UPOS tag to a set of feature names.
    """
    seen_by_pos = {}
    every_feature = set()
    for sentence in pyconll.load_from_file(training_file):
        for token in sentence:
            seen = seen_by_pos.setdefault(token.upos, set())
            tracked = [
                name for name in token.feats if name in evaluator.UNIV_FEATURES
            ]
            seen.update(tracked)
            every_feature.update(tracked)

    # Whatever exists globally but was never observed with a tag counts as
    # inconsistent for that tag.
    return {pos: every_feature - seen for pos, seen in seen_by_pos.items()}
def evaluate_file(args, inconsistencies):
    """Compare a prediction file against a gold file and print the scores.

    Depending on the flags in ``args`` the function reports UPOS accuracy,
    XPOS accuracy, FEATS micro-F1, UPOS+FEATS micro-F1, UFEATS accuracy and
    the share of tokens whose predicted features are inconsistent with the
    predicted UPOS tag.  A metric is printed only if at least one instance
    was added to its evaluator (or, for inconsistencies, at least one token
    was counted).

    If the two files differ in their number of sentences a message is
    printed and nothing is evaluated.  Sentence pairs that differ in their
    number of tokens are reported and skipped.

    Args:
        args: Parsed command-line namespace.  Reads ``prediction``, ``gold``,
            ``upos``, ``xpos``, ``feats`` and ``incons`` (plus whatever
            ``load_predictions`` reads).
        inconsistencies: dict mapping a UPOS tag to the set of feature names
            regarded as inconsistent with it.  A predicted tag missing from
            the dict is treated as having no known inconsistencies.

    Returns:
        None.  All results are written to standard output.
    """
    print("Prediction file: ", args.prediction)
    print("Gold file: ", args.gold)

    pred_file = load_predictions(args)
    gold_file = pyconll.load_from_file(args.gold)

    if len(pred_file) != len(gold_file):
        print("Number of sentences does not match!")
        print("Prediction: {} Gold: {}".format(len(pred_file), len(gold_file)))
        return

    upos_evaluator = evaluator.Evaluator(mode="exact")
    xpos_evaluator = evaluator.Evaluator(mode="exact")
    feats_evaluator = evaluator.Evaluator(mode="by_feats")
    ufeats_evaluator = evaluator.Evaluator(mode="exact", only_univ=True)
    upos_feats_evaluator = evaluator.Evaluator(mode="by_feats")

    incons_count = 0
    token_count = 0

    for pred_sent, gold_sent in zip(pred_file, gold_file):
        if len(pred_sent) != len(gold_sent):
            print("Number of words in sentence does not match!")
            print("Prediction: {} Gold: {}".format(len(pred_sent), len(gold_sent)))
            print("Prediction:", pred_sent._meta)
            print("Gold:", gold_sent._meta)
            continue

        for pred_token, gold_token in zip(pred_sent, gold_sent):
            if args.upos:
                upos_evaluator.add_instance(
                    {POS: gold_token.upos}, {POS: pred_token.upos}
                )
            if args.xpos:
                xpos_evaluator.add_instance(
                    {POS: gold_token.xpos}, {POS: pred_token.xpos}
                )
            if not args.feats:
                continue

            gold_feats = _join_feature_values(gold_token.feats)
            pred_feats = _join_feature_values(pred_token.feats)
            feats_evaluator.add_instance(gold_feats, pred_feats)
            ufeats_evaluator.add_instance(gold_feats, pred_feats)

            if not args.upos:
                continue

            if args.incons:
                token_count += 1
                # A predicted tag that never occurred in the training file
                # has no entry; fall back to "nothing known" instead of
                # aborting the whole evaluation with a KeyError.
                forbidden = inconsistencies.get(pred_token.upos, set())
                if set(pred_feats) & forbidden:
                    incons_count += 1

            # Work on copies so the dicts already handed to the FEATS and
            # UFEATS evaluators are not modified behind their back.
            gold_with_pos = dict(gold_feats)
            gold_with_pos[POS] = gold_token.upos
            pred_with_pos = dict(pred_feats)
            pred_with_pos[POS] = pred_token.upos
            upos_feats_evaluator.add_instance(gold_with_pos, pred_with_pos)

    if upos_evaluator.instance_count > 0:
        print("UPOS accuracy {:.2f}%".format(100*upos_evaluator.acc()))
    if xpos_evaluator.instance_count > 0:
        print("XPOS accuracy {:.2f}%".format(100*xpos_evaluator.acc()))
    if feats_evaluator.instance_count > 0:
        print("FEATS micro-F1 {:.2f}%".format(100*feats_evaluator.micro_f1()))
    if upos_feats_evaluator.instance_count > 0:
        print("UPOS+FEATS micro-F1 {:.2f}%".format(100*upos_feats_evaluator.micro_f1()))
    if ufeats_evaluator.instance_count > 0:
        print("UFEATS accuracy {:.2f}%".format(100*ufeats_evaluator.acc()))
    if token_count > 0:
        print("UFEATS inconsistencies {:.2f}%".format(100*incons_count / token_count))
    print()


def _join_feature_values(feats):
    """Turn a ``{name: values}`` feats mapping into ``{name: "v1,v2"}``.

    The values are sorted before joining so that the resulting string does
    not depend on the iteration order of the underlying collection.
    """
    return {name: ",".join(sorted(feats[name])) for name in feats}
def main():
    """Parse the command line and run the stand-alone evaluation.

    Builds the argument parser, optionally extracts the UPOS/feature
    inconsistency table from the file given with ``--incons`` and then calls
    ``evaluate_file``.
    """
    parser = argparse.ArgumentParser(description='Stand-alone evaluation script.')
    # Required positionals: argparse never falls back to a default for these,
    # so none is given.
    parser.add_argument("prediction", type=str, help="File with the predicted labels")
    parser.add_argument("gold", type=str, help="File with the gold labels")
    parser.add_argument("--upos", action="store_true", help="Whether to evaluate the UPOS column")
    parser.add_argument("--xpos", action="store_true", help="Whether to evaluate the XPOS column")
    parser.add_argument("--feats", action="store_true", help="Whether to evaluate the FEATS column")
    parser.add_argument("--incons", type=str, default="", help="Whether to evaluate inconsistent features and from which file they are extracted (typically training data)")
    parser.add_argument("--pred-upos-index", type=int, default=3, help="Zero-based column index for the UPOS label in the prediction file")
    parser.add_argument("--pred-xpos-index", type=int, default=4, help="Zero-based column index for the XPOS label in the prediction file")
    parser.add_argument("--pred-feats-index", type=int, default=5, help="Zero-based column index for the FEATS labels in the prediction file")
    args = parser.parse_args()

    # An empty --incons means "no inconsistency evaluation".
    if args.incons != "":
        inconsistencies = extract_inconsistencies(args.incons)
    else:
        inconsistencies = {}

    evaluate_file(args, inconsistencies)


if __name__ == "__main__":
    main()