-
Notifications
You must be signed in to change notification settings - Fork 1
/
trees.py
250 lines (192 loc) · 6.66 KB
/
trees.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
import sys
from dataclasses import dataclass
from typing import Iterable
@dataclass
class WordLine:
    """UD word line with 10 named string fields.

    The fields are kept in column order, so that joining them with tabs
    reproduces the tab-separated line they were read from.
    """
    ID: str
    FORM: str
    LEMMA: str
    POS: str
    XPOS: str
    FEATS: str
    HEAD: str
    DEPREL: str
    DEPS: str
    MISC: str

    def as_dict(self):
        """Return the ten fields as a new dict, keyed by field name, in column order."""
        return {
            'ID': self.ID, 'FORM': self.FORM, 'LEMMA': self.LEMMA,
            'POS': self.POS, 'XPOS': self.XPOS,
            'FEATS': self.FEATS, 'HEAD': self.HEAD, 'DEPREL': self.DEPREL,
            'DEPS': self.DEPS, 'MISC': self.MISC
        }

    def __str__(self):
        """Return the word line as one tab-separated string."""
        return '\t'.join(self.as_dict().values())

    def feats(self) -> dict:
        """Parse FEATS ('Name=Value|Name=Value') into a dict.

        An unspecified FEATS field ('_' or the empty string) gives an empty
        dict. An item without '=' still raises IndexError.
        """
        # '_' is the placeholder for "no value"; it has no '=' to split on.
        if self.FEATS in ('', '_'):
            return {}
        # Split on the first '=' only, so the value is kept whole.
        featvals = [fv.split('=', 1) for fv in self.FEATS.split('|')]
        return {fv[0]: fv[1] for fv in featvals}
# The ten field names of a word line, as an unordered set.
WORDLINE_FIELDS = {
    'ID', 'FORM', 'LEMMA', 'POS', 'XPOS',
    'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC',
}

# Label of the root relation (presumably the DEPREL value of a tree's root).
ROOT_LABEL = 'root'
def ifint(id: str) -> int:
    """Return the integer part of a word line ID.

    Plain numeric ids are converted directly; decimal ids like '7.1' are
    truncated to the number before the dot.
    """
    return int(id) if id.isdigit() else int(float(id))
class NotValidWordLine(Exception):
    """Signals that a string could not be read as a word line."""
class NotValidTree(Exception):
    """Signals that a list of word lines could not be built into a tree."""
def read_wordline(s: str) -> WordLine:
    """Parse one tab-separated line into a WordLine.

    The line is stripped of surrounding whitespace first. It is accepted
    only if it has exactly 10 tab-separated columns and the first column
    starts with a digit; otherwise NotValidWordLine is raised.
    """
    columns = s.strip().split('\t')
    # The length test comes first so that columns[0][0] is only looked at
    # for lines that really have ten columns.
    if len(columns) != 10 or not columns[0][0].isdigit():
        raise NotValidWordLine
    return WordLine(*columns)
def read_wordlines(lines):
    """Yield a WordLine for every string in `lines` that is a valid word line.

    Strings rejected by read_wordline (NotValidWordLine) are skipped
    silently. Any other exception propagates to the caller.
    """
    for line in lines:
        try:
            word = read_wordline(line)
        except NotValidWordLine:
            continue
        # The yield is kept outside the try block: a handler around it would
        # also intercept exceptions thrown into the generator at this point,
        # such as GeneratorExit when the consumer closes it early.
        yield word
def ngrams(n, trees):
    """Yield all n-grams of word lines inside each tree, never across tree boundaries.

    `trees` is an iterable of objects with a `wordlines()` method returning a
    list. Each n-gram is a list of n consecutive items. A tree with fewer
    than n word lines contributes nothing.
    """
    for tree in trees:
        wordlines = tree.wordlines()
        # A list of length k has k - n + 1 windows of size n; the "+ 1" keeps
        # the final window that ends at the last word.
        for i in range(len(wordlines) - n + 1):
            yield wordlines[i:i+n]
def wordline_ngrams(n, wordliness):
    """Yield all n-grams inside each list of word lines, never across list boundaries.

    `wordliness` is an iterable of lists (one per stanza). Each n-gram is a
    list of n consecutive items. A list shorter than n contributes nothing.
    """
    for wordlines in wordliness:
        # A list of length k has k - n + 1 windows of size n; the "+ 1" keeps
        # the final window that ends at the last word.
        for i in range(len(wordlines) - n + 1):
            yield wordlines[i:i+n]
def replace_by_underscores(fields, wordline):
    """Return a new WordLine equal to `wordline` with each field in `fields` set to '_'.

    The given word line is not modified.
    """
    blanked = wordline.as_dict()
    blanked.update(dict.fromkeys(fields, '_'))
    return WordLine(**blanked)
def wordline_statistics(fields, wordlines):
    """Count how often each combination of field values occurs.

    Returns a dict whose keys are tuples of the values of `fields` (in the
    given order) and whose values are occurrence counts.
    """
    counts = {}
    for wordline in wordlines:
        row = wordline.as_dict()
        combination = tuple(row[name] for name in fields)
        if combination in counts:
            counts[combination] += 1
        else:
            counts[combination] = 1
    return counts
def sorted_statistics(stats, key=lambda x: x):
    """Return the items of a frequency dict as a list in descending order.

    `key` maps each dict value to the number it is ranked by; items with
    equal rank keep the order they have in `stats`.
    """
    return sorted(stats.items(), key=lambda entry: -key(entry[1]))
def cosine_similarity(stats1, stats2):
    """Return the cosine similarity between two frequency dictionaries.

    Keys missing from one dictionary count as frequency 0 there. If either
    dictionary is empty or contains only zeros, its vector has no direction
    and 0.0 is returned instead of dividing by zero.
    """
    dot = sum(count * stats2.get(k, 0) for k, count in stats1.items())
    norm1 = sum(v * v for v in stats1.values()) ** 0.5
    norm2 = sum(v * v for v in stats2.values()) ** 0.5
    if not norm1 or not norm2:
        return 0.0
    return dot / (norm1 * norm2)
def wordline_ngram_statistics(fields, wordlinengrams):
    """Count how often each n-gram of field-value combinations occurs.

    Every word of an n-gram is reduced to the tuple of its values for
    `fields`; the n-gram's key is the tuple of those tuples. Returns a dict
    from such keys to occurrence counts.
    """
    counts = {}
    for ngram in wordlinengrams:
        signature = []
        for word in ngram:
            row = word.as_dict()
            signature.append(tuple(row[name] for name in fields))
        signature = tuple(signature)
        if signature in counts:
            counts[signature] += 1
        else:
            counts[signature] = 1
    return counts
@dataclass
class Tree:
    """Rose tree: a root value with a list of subtrees, each of them a Tree."""
    root: object
    subtrees: list

    def prettyprint(self, level=0, indent=2):
        """Return the tree as a list of lines, one node per line.

        Each node is printed with str() and indented by `indent` spaces per
        nesting level, starting from `level` for the root.
        """
        lines = [level * indent * ' ' + str(self.root)]
        for tree in self.subtrees:
            # Pass indent along, so that a custom width applies at every
            # level and not only to the root.
            lines.extend(tree.prettyprint(level + 1, indent))
        return lines

    def __len__(self):
        """Return the number of nodes in the tree, the root included."""
        return 1 + sum(map(len, self.subtrees))

    def depth(self):
        """Return the number of levels in the tree; a lone root has depth 1."""
        return 1 + max((tree.depth() for tree in self.subtrees), default=0)
def prune_subtrees_below(tree: Tree, depth: int) -> Tree:
    """Cut off everything below the given depth; depth 1 keeps the root only.

    The tree is changed in place, and the same tree object is returned.
    """
    if depth > 1:
        tree.subtrees = [
            prune_subtrees_below(child, depth - 1) for child in tree.subtrees
        ]
    else:
        tree.subtrees = []
    return tree
@dataclass
class DepTree(Tree):
    """Dependency trees: rose trees with word lines as nodes, plus comment lines."""
    comments: list[str]

    def __str__(self):
        """Return the comment lines followed by the pretty-printed tree."""
        # Work on a copy: extending self.comments itself would append the
        # tree lines to the stored comments on every call.
        lines = list(self.comments)
        lines.extend(self.prettyprint())
        return '\n'.join(lines)

    def wordlines(self):
        """Return all word lines of the tree, sorted by the integer part of their ID."""
        words = [self.root]
        for tree in self.subtrees:
            words.extend(tree.wordlines())
        words.sort(key=lambda w: ifint(w.ID))
        return words

    def sentence(self):
        """Return the FORMs of all word lines in ID order, joined by spaces."""
        return ' '.join([word.FORM for word in self.wordlines()])

    def prefix_comments(self, ss):
        """Put the list of comment lines `ss` in front of the existing comments."""
        self.comments = ss + self.comments

    def add_misc(self, s):
        """Append '+' followed by `s` to the MISC field of the root word line."""
        self.root.MISC += '+' + s
def build_deptree(ns: list[WordLine]) -> DepTree:
    """Build a dependency tree from a list of word lines.

    The root is the first word line whose HEAD is '0'; every other word line
    is attached under the word line whose ID equals its HEAD, in the order
    of `ns`. Word lines not reachable from the root are left out. The
    resulting trees have empty comment lists.

    Raises NotValidTree (carrying str(ns)) if no tree can be built, e.g.
    when there is no word line with HEAD '0'.
    """
    def build_subtree(children, root):
        subtrees = [build_subtree(children, n) for n in children.get(root.ID, ())]
        return DepTree(root, subtrees, [])

    try:
        # Index the word lines by HEAD once, instead of scanning the whole
        # list again for every node.
        children = {}
        for n in ns:
            children.setdefault(n.HEAD, []).append(n)
        root = children['0'][0]
        # NOTE(review): the result is not checked against len(ns); presumably
        # because word lines with ids like '7.1' are not attached to any head.
        return build_subtree(children, root)
    except Exception as err:
        # Only ordinary errors are turned into NotValidTree; KeyboardInterrupt
        # and SystemExit are left alone.
        raise NotValidTree(str(ns)) from err
def relabel_deptree(tree: DepTree) -> DepTree:
    """Turn a (sub)tree into a tree of its own, numbered 1, 2, ...

    The root's old DEPREL is appended in parentheses to its MISC, its DEPREL
    becomes 'root' and its HEAD becomes '0'. Word lines with a purely numeric
    ID get their ID and HEAD renumbered by position in ID order. The word
    lines are changed in place, and the same tree object is returned.
    """
    head = tree.root
    head.MISC += '(' + head.DEPREL + ')'
    head.DEPREL = 'root'

    ordered = tree.wordlines()  # sorted by ID
    numbers = {w.ID: str(position) for position, w in enumerate(ordered, start=1)}
    # The old head of the root lies outside this tree; it is mapped to '0'.
    numbers[head.HEAD] = '0'

    def renumber(t):
        node = t.root
        if node.ID.isdigit():
            node.ID = numbers[node.ID]
            node.HEAD = numbers[node.HEAD]
        for child in t.subtrees:
            renumber(child)
        return t

    return renumber(tree)
def nonprojective(tree: DepTree) -> bool:
    """Tell whether the tree fails to span a continuous sequence of ids.

    Only purely numeric ids are considered. The tree counts as nonprojective
    when it has fewer such ids than the range from its smallest to its
    largest id contains.
    """
    ids = [int(w.ID) for w in tree.wordlines() if w.ID.isdigit()]
    span = 1 + max(ids) - min(ids)
    return len(ids) < span
def echo_conllu_file(file: Iterable[str]):
    """Read a stream of lines, interpret them as word lines, and print them back.

    Valid word lines are printed in their tab-separated form. Blank lines and
    lines starting with '#' are printed stripped. Any other line is printed
    after the marker 'INVALID'.
    """
    for line in file:
        try:
            t = read_wordline(line)
        except NotValidWordLine:
            if not line.strip() or line.startswith('#'):
                print(line.strip())
            else:
                print('INVALID', line)
        else:
            # Printing happens outside the try block, so that an output error
            # is not mistaken for an invalid line.
            print(t)
# Script entry point: echo the CoNLL-U data given on standard input.
# The guard compares against '__main__', the name Python gives to the module
# that is run as a script.
if __name__ == '__main__':
    echo_conllu_file(sys.stdin)