#!/usr/bin/env python3
"""
This script counts adjective-noun, noun-noun, verb-object, verb-subject,
determiner-noun, and negated verb phrases in a corpus and produces plain-text
vocabulary files with counts (one file per phrase type). It takes as input a
corpus and a word vocabulary (which must have been precomputed). If working on
lemmas, the word vocabulary must also have been built from lemmas. The corpus
must be a list in msgpack.gz format, with each list element representing a
sentence with the following structure:

    [
        [word_1, ..., word_N],
        [lemma_1, ..., lemma_N],
        [pos_tag_1, ..., pos_tag_N],
        [(basic_gr_1, head_1, dep_1), ..., (basic_gr_M, head_M, dep_M)],
        [(enhanced_gr_1, head_1, dep_1), ..., (enhanced_gr_K, head_K, dep_K)]
    ]

Usage:
    count_phrases.py [--lemmas] [--min-count N] <corpus> <vocab> <output-dir>

Options:
    -h --help        Show this screen
    --lemmas         Work with lemmas instead of raw words
    --min-count N    Minimum phrase count threshold [default: 2]
"""
import gzip
from time import time
from collections import Counter

import docopt
import msgpack

from utils import cleanup_token, find_phrases

args = docopt.docopt(__doc__)
min_count = int(args["--min-count"])
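
# Load the precomputed word vocabulary; only the first tab-separated field of
# each line is used.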
vocab = set()
with open(args["<vocab>"]) as fin:
    for line in fin:
        vocab.add(line.split("\t")[0].strip())

if args["--lemmas"]:
    print("Counting lemmatised phrases...")
else:
    print("Counting phrases...")

# Iterate through the corpus, extracting phrases and accumulating their counts.
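# Counter abbreviations: an = adjective-noun, nn = noun-noun, vs = verb-subject,
# vo = verb-object, vps/vpo = verb-subject/verb-object with particle,
# nvs/nvo = negated verb-subject/verb-object, dn = determiner-noun.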
nn_counts = Counter()
an_counts = Counter()
vs_counts = Counter()
vo_counts = Counter()
vps_counts = Counter()
vpo_counts = Counter()
nvs_counts = Counter()
nvo_counts = Counter()
dn_counts = Counter()
sentences_seen = 0
report_interval = 1000000
start_time = time()

with gzip.open(args["<corpus>"], "rb") as fin:
    # raw=False decodes msgpack strings to str (the encoding= parameter used in
    # older versions of this script was removed in msgpack 1.0).
    unp = msgpack.Unpacker(fin, raw=False)
    for words, lemmas, tags, _, extended_grs in unp:
        if args["--lemmas"]:
            words = lemmas
        words = [cleanup_token(word) for word in words]
        phrases = find_phrases(tags, extended_grs, words)
        # adjective-noun
        for dep, head in phrases.an:
            if words[head] in vocab and words[dep] in vocab:
                an_counts[(words[dep], words[head])] += 1
        # verb-subject
        for dep, head in phrases.vs:
            if words[head] in vocab and words[dep] in vocab:
                vs_counts[(words[dep], words[head])] += 1
        # verb-object
        for head, dep in phrases.vo:
            if words[head] in vocab and words[dep] in vocab:
                vo_counts[(words[head], words[dep])] += 1
        # verb-subject with particle
        for vps in phrases.vps:
            if words[vps[0]] in vocab and words[vps[1]] in vocab and words[vps[2]] in vocab:
                if len(vps) == 3:
                    vps_counts[(words[vps[0]], words[vps[1]], words[vps[2]])] += 1
                elif len(vps) == 4 and words[vps[3]] in vocab:
                    vps_counts[(words[vps[0]], words[vps[1]], words[vps[2]], words[vps[3]])] += 1
        # verb-object with particle
        for vpo in phrases.vpo:
            if words[vpo[0]] in vocab and words[vpo[1]] in vocab and words[vpo[2]] in vocab:
                if len(vpo) == 3:
                    vpo_counts[(words[vpo[0]], words[vpo[1]], words[vpo[2]])] += 1
                elif len(vpo) == 4 and words[vpo[3]] in vocab:
                    vpo_counts[(words[vpo[0]], words[vpo[1]], words[vpo[2]], words[vpo[3]])] += 1
        # determiner-noun
        for dep, head in phrases.dn:
            if words[head] in vocab and words[dep] in vocab:
                dn_counts[(words[dep], words[head])] += 1
        # negated verb-object
        for neg, head, dep in phrases.nvo:
            if words[neg] in vocab and words[head] in vocab and words[dep] in vocab:
                nvo_counts[(words[neg], words[head], words[dep])] += 1
        # negated verb-subject
        for dep, neg, head in phrases.nvs:
            if words[neg] in vocab and words[head] in vocab and words[dep] in vocab:
                nvs_counts[(words[dep], words[neg], words[head])] += 1
        # noun-noun
        for left, right in phrases.nn:
            if words[left] in vocab and words[right] in vocab:
                nn_counts[(words[left], words[right])] += 1

        sentences_seen += 1
        if sentences_seen % report_interval == 0:
            elapsed_mins = (time() - start_time) // 60
            print("Processed "+str(sentences_seen)+" sentences in "+str(elapsed_mins)+" minutes")
with open(args["<output-dir>"]+"/an_vocab.txt", "w") as fout:
    for phrase, count in an_counts.most_common():
        if count < min_count:
            break
        fout.write(" ".join(phrase)+"\t"+str(count)+"\n")

with open(args["<output-dir>"]+"/nn_vocab.txt", "w") as fout:
    for phrase, count in nn_counts.most_common():
        if count < min_count:
            break
        fout.write(" ".join(phrase)+"\t"+str(count)+"\n")

with open(args["<output-dir>"]+"/vs_vocab.txt", "w") as fout:
    for phrase, count in vs_counts.most_common():
        if count < min_count:
            break
        fout.write(" ".join(phrase)+"\t"+str(count)+"\n")

with open(args["<output-dir>"]+"/vo_vocab.txt", "w") as fout:
    for phrase, count in vo_counts.most_common():
        if count < min_count:
            break
        fout.write(" ".join(phrase)+"\t"+str(count)+"\n")

with open(args["<output-dir>"]+"/dn_vocab.txt", "w") as fout:
    for phrase, count in dn_counts.most_common():
        if count < min_count:
            break
        fout.write(" ".join(phrase)+"\t"+str(count)+"\n")

with open(args["<output-dir>"]+"/nvo_vocab.txt", "w") as fout:
    for phrase, count in nvo_counts.most_common():
        if count < min_count:
            break
        fout.write(" ".join(phrase)+"\t"+str(count)+"\n")

with open(args["<output-dir>"]+"/nvs_vocab.txt", "w") as fout:
    for phrase, count in nvs_counts.most_common():
        if count < min_count:
            break
        fout.write(" ".join(phrase)+"\t"+str(count)+"\n")

with open(args["<output-dir>"]+"/vpo_vocab.txt", "w") as fout:
    for phrase, count in vpo_counts.most_common():
        if count < min_count:
            break
        fout.write(" ".join(phrase)+"\t"+str(count)+"\n")

with open(args["<output-dir>"]+"/vps_vocab.txt", "w") as fout:
    for phrase, count in vps_counts.most_common():
        if count < min_count:
            break
        fout.write(" ".join(phrase)+"\t"+str(count)+"\n")