-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathconllu-analyser.py
70 lines (60 loc) · 1.61 KB
/
conllu-analyser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import sys
# Minimum number of times a (form, analysis) pair must occur in the training
# corpus to be written to the model. Raise this if the corpus is really big.
MIN_ANALYSIS_FREQ = 1

# form -> {analysis: count}. Module-level, so counts persist across
# successive calls to train().
dico = {}


def train(inn, out):
    """Build a form -> analysis model from a CoNLL-U corpus.

    Reads tab-separated token lines from *inn*, counts every combination of
    surface form (column 2) and analysis (columns 3, 4 and 6, i.e. LEMMA,
    UPOS and FEATS in CoNLL-U) in the module-level ``dico``, then writes one
    ``form<TAB>lemma<TAB>upos<TAB>feats`` line to *out* for each pair seen
    at least ``MIN_ANALYSIS_FREQ`` times.

    Blank lines, ``#`` comment lines and rows whose ID contains ``.`` or
    ``-`` (empty nodes and multiword-token ranges) are ignored. Rows with
    fewer than six columns are skipped with a warning on stderr.

    Args:
        inn: iterable of text lines (e.g. an open file or ``sys.stdin``).
        out: writable text stream that receives the model.
    """
    # Iterate lazily instead of readlines() so large corpora are streamed.
    for line in inn:
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        row = line.split('\t')
        if '.' in row[0] or '-' in row[0]:
            continue
        if len(row) < 6:
            # Not enough columns to extract lemma/upos/feats.
            print('Skipping malformed line: %r' % line, file=sys.stderr)
            continue
        form = row[1]
        analysis = '\t'.join((row[2], row[3], row[5]))
        counts = dico.setdefault(form, {})
        counts[analysis] = counts.get(analysis, 0) + 1

    for form, counts in dico.items():
        for analysis, freq in counts.items():
            if freq >= MIN_ANALYSIS_FREQ:
                print('%s\t%s' % (form, analysis), file=out)
def analyse(inn, out, model):
    """Tokenise text from *inn* and write a reading for every token to *out*.

    *model* holds lines of ``form<TAB>lemma<TAB>upos<TAB>feats``. In the
    feats column underscores are removed and ``|`` becomes a space. Model
    lines with fewer than four columns are skipped with a warning on stderr.

    Each input line is split on single spaces after the characters
    ``!,.?:;`` have been detached from the preceding text. For every
    non-empty token a ``"<token>"`` line is written, followed by one
    tab-indented ``"lemma" tags`` line per known analysis, or ``"*token"``
    when the form is not in the model.

    Args:
        inn: iterable of text lines to analyse.
        out: writable text stream for the output.
        model: iterable of model lines.
    """
    # form -> list of [lemma, upos, feats] readings, in model order.
    lexicon = {}
    for line in model:
        line = line.strip()
        if not line:
            continue
        row = line.split('\t')
        if len(row) < 4:
            print('Skipping malformed model line: %r' % line, file=sys.stderr)
            continue
        row[3] = row[3].replace('_', '').replace('|', ' ')
        lexicon.setdefault(row[0], []).append(row[1:])

    # One translate pass puts a space in front of each punctuation mark.
    detach = str.maketrans({punct: ' ' + punct for punct in '!,.?:;'})
    for line in inn:
        line = line.strip('\n').translate(detach)
        for token in line.split(' '):
            if not token:
                # Blank lines, repeated spaces and leading punctuation
                # yield empty strings; they are not tokens.
                continue
            print('"<%s>"' % (token), file=out)
            if token in lexicon:
                for analysis in lexicon[token]:
                    print('\t"%s" %s' % (analysis[0], ' '.join(analysis[1:])), file=out)
            else:
                print('\t"*%s"' % (token), file=out)
def main(argv):
    """Command-line entry point.

    ``prog -t model.dat`` trains on stdin and writes the model to
    ``model.dat``; ``prog model.dat`` analyses stdin against that model and
    writes to stdout. Any other argument list prints the usage line and
    exits with status -1.

    Args:
        argv: full argument vector, including the program name.
    """
    if len(argv) == 3 and argv[1] == '-t':
        # The context manager flushes and closes the model file.
        with open(argv[2], 'w') as out:
            train(sys.stdin, out)
    elif len(argv) == 2:
        with open(argv[1]) as model:
            analyse(sys.stdin, sys.stdout, model)
    else:
        print('conllu-analyser.py [-t] model.dat')
        sys.exit(-1)


if __name__ == '__main__':
    main(sys.argv)