#!/usr/bin/env python3
"""
Implements several of the word-segmentation algorithms from Hafer and
Weiss (1974), based on letter successor and predecessor variety counts.
"""
import re

from trie import Trie  # trie class used for the variety counts
from counts import partition, get_peaks
from stats import compute_stats

TRIE = Trie()                # forward trie, for successor counts
RTRIE = Trie(reverse=True)   # reversed trie, for predecessor counts
WORDS = {}                   # gold-standard segmentations: word -> segmented form
TESTFILE = '/data/cs65/morphology/segments.eng'
#TESTFILE = 'newWords.txt'
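
# A quick illustration of the underlying idea (hypothetical corpus, not the
# real test data): the successor variety of a prefix is the number of
# distinct letters that can follow it in the corpus. Given {"read", "reads",
# "real", "red"}, the prefix "rea" can be followed by "d" or "l", so its
# successor count is 2; a spike in these counts suggests a morpheme boundary.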
def numCorrect(word, seg, totCorrect, totBounds):
    """
    Walks the gold segmentation of word and the predicted segmentation
    seg in parallel, adding the number of gold boundaries to totBounds
    and the number of correctly predicted boundaries to totCorrect.
    """
    corWord = WORDS[word]
    j = 0  # index into the predicted segmentation
    length = max(len(seg), len(corWord))
    for i in range(length):
        if j >= length - 2:
            break
        if corWord[i] == ' ':       # gold boundary here
            totBounds += 1
            if seg[j] == ' ':       # the prediction agrees
                totCorrect += 1
                j += 1
        elif seg[j] == ' ':         # predicted boundary with no gold boundary
            j += 2
            if j >= length - 1:
                break
        else:                       # plain character in both strings
            j += 1
            if j >= length - 1:
                break
    return totCorrect, totBounds
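
# Hypothetical walk-through (assumes WORDS['walking'] == 'walk ing'):
#   numCorrect('walking', 'walk ing', 0, 0) -> (1, 1)
# one gold boundary, and the prediction places it correctly. Note the early
# break means boundaries very close to the end of short words go uncounted.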
def cutoff_accuracy(segments, cut=5, rcut=17, rev=False, duo=False,
                    sumCut=False, duoPeaks=False, sumPeaks=False,
                    negFreq=False, hybrid=False):
    """
    Segments every word in WORDS with the strategy selected by the
    keyword flags, scores the result against the gold segmentations,
    and returns a formatted precision/recall/f-measure string.
    (segments is unused here; WORDS already holds the gold data.)
    """
    totCorr = 0
    totCuts = 0
    totBounds = 0
    for word in WORDS:
        if re.search(r'-', word):
            # hyphenated word: segment each piece separately, keep the hyphen
            words = word.split('-')
            seg = words[0]
            for i in range(1, len(words)):
                seg1, totCuts = cutoff_segment(words[i], cut, rcut, totCuts,
                                               rev, duo, sumCut, duoPeaks,
                                               sumPeaks, negFreq, hybrid)
                if seg1[0] == ' ':               # strip a leading boundary
                    seg = seg + " - " + seg1[1:]
                elif seg1[-1] == ' ':            # strip a trailing boundary
                    seg = seg + " - " + seg1[:-1]
                else:
                    seg = seg + " - " + seg1
                totCuts += 2                     # the " - " adds two boundaries
        else:
            seg, totCuts = cutoff_segment(word, cut, rcut, totCuts, rev, duo,
                                          sumCut, duoPeaks, sumPeaks, negFreq,
                                          hybrid)
        totCorr, totBounds = numCorrect(word, seg, totCorr, totBounds)
    precision, recall, f = compute_stats(totCuts, totCorr, totBounds)
    return ("precision: " + str(precision) + "\trecall: " + str(recall) +
            "\tf: " + str(f))
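
# compute_stats lives in stats.py; from the argument order it presumably
# computes precision = totCorr / totCuts, recall = totCorr / totBounds, and
# f as their harmonic mean (an assumption, not verified here).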
def cutoff_segment(word, cutoff, rcut, totCuts, reverse, duo,
                   sumCut, duoPeaks, sumPeaks, negFreq, hybrid):
    """
    Segments a word based on its variety counts, dispatching on the
    boolean flags to the selected cut-point strategy.
    """
    if reverse:
        counts = RTRIE.predecessorCount(word)
    else:
        counts = TRIE.successorCount(word)
    if duo:
        rcounts = RTRIE.predecessorCount(word)
        cutList = get_duoCutoffs(counts, rcounts, cutoff, rcut)
    elif sumCut:
        rcounts = RTRIE.predecessorCount(word)
        cutList = get_sumCutoffs(counts, rcounts, cutoff)
    elif duoPeaks:
        rcounts = RTRIE.predecessorCount(word)
        cutList = get_duoPeaks(counts, rcounts)
    elif sumPeaks:
        rcounts = RTRIE.predecessorCount(word)
        cutList = get_sumPeaks(counts, rcounts)
    elif negFreq:
        cutList = get_negFreq(word)
    elif hybrid:
        rcounts = RTRIE.predecessorCount(word)
        cutList = get_hybrid(word, counts, rcounts)
    else:
        cutList = get_cutoffs(counts, cutoff, reverse)
    totCuts += len(cutList)
    parts = partition(word, cutList)
    return ' '.join(parts), totCuts
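
# Illustrative call (the output depends on the tries built from the corpus):
#   cutoff_segment('walking', 5, 17, 0, False, False, False,
#                  False, False, False, False)
# would return something like ('walk ing', 1) if the successor counts
# spike right after 'walk'.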
def get_duoCutoffs(counts, rcounts, cutoff, rcut):
    """
    Given successor and predecessor count lists, returns a sorted list of
    the points where both counts are above their respective cutoffs.
    """
    seg1 = get_cutoffs(counts, cutoff)
    segRev = get_cutoffs(rcounts, rcut, True)
    # predecessor cut points mark the position before the boundary
    segRev = [p - 1 for p in segRev]
    segList = list(set(seg1) & set(segRev))
    segList.sort()
    return segList
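
# Worked example with made-up counts (cutoff=5, rcut=8):
#   get_cutoffs([2, 6, 1, 7], 5)        -> [2, 4]
#   get_cutoffs([5, 2, 12, 9], 8, True) -> [3, 4], shifted to [2, 3]
# so get_duoCutoffs([2, 6, 1, 7], [5, 2, 12, 9], 5, 8) -> [2].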
def get_sumCutoffs(counts, rcounts, cutoff):
    """
    Given successor and predecessor count lists, returns a list of the
    points where the sum of the two counts is above the cutoff.
    """
    thresh = []
    start = 2
    if len(counts) < 2:
        return thresh
    for i in range(start, len(counts) - 1):
        if counts[i] + rcounts[i] >= cutoff:
            thresh.append(i)
    return thresh
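
# Worked example with made-up counts (cutoff=8): only indices 2 and 3 are
# scanned, and 6 + 4 = 10 passes at index 3, so
#   get_sumCutoffs([1, 4, 2, 6, 1], [2, 3, 5, 4, 2], 8) -> [3]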
def get_cutoffs(counts, cutoff, predecessor=False):
    """
    Given a list of counts, returns another list identifying the points
    where the counts are above the cutoff.
    """
    thresholds = []
    start = 0
    if len(counts) < 2:
        return thresholds
    if not predecessor:
        start = 1  # one-character prefixes are tricky, so skip them
    for i in range(start, len(counts)):
        if counts[i] >= cutoff:
            thresholds.append(i + 1)
    # (an earlier version appended i rather than i + 1 for predecessor
    # counts, since their segment point falls one position earlier; that
    # shift now happens in get_duoCutoffs instead)
    return thresholds
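
# Worked examples with made-up counts:
#   get_cutoffs([3, 1, 5, 2], 4)                   -> [3]
#   get_cutoffs([3, 1, 5, 2], 3, predecessor=True) -> [1, 3]
# the second scan starts at index 0, so the initial 3 also clears the cutoff.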
def get_duoPeaks(counts, rcounts):
    """
    Given two lists of counts, returns a sorted list identifying the
    points where both counts are at a peak or plateau.
    """
    seg1 = get_peaks(counts)
    segRev = get_peaks(rcounts, True)
    segList = list(set(seg1) & set(segRev))
    segList.sort()
    return segList
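
# No concrete trace here, since the output depends on get_peaks in counts.py.
# Shape of the idea: if the successor counts peak at indices {3, 5} and the
# predecessor counts peak at {3}, only the shared index 3 becomes a cut.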
def get_sumPeaks(counts, rcounts):
    """
    Given two lists of counts, returns another list identifying the
    points where the sum of the counts is at a peak or plateau.
    """
    return get_peaks([counts[i] + rcounts[i + 1] for i in range(len(counts) - 1)])
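
# The i + 1 offset appears to align predecessor counts with successor
# counts before summing, mirroring the -1 shift applied in get_duoCutoffs.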
def get_negFreq(word):
    """
    Given a word, returns a list identifying the proper prefixes of length
    4 or more that appear as separate full words in the corpus.
    """
    segList = []
    for i in range(4, len(word)):
        if TRIE.isMember(word[:i]):
            segList.append(i)
    return segList
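
# Hypothetical example (depends on what TRIE was built from): if 'work'
# and 'worker' are full words in the corpus but 'worke' is not, then
#   get_negFreq('workers') -> [4, 6]
# note that i stops at len(word) - 1, so the whole word is never tested.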
def get_hybrid(word, counts, rcounts):
    """
    Given a word, returns a list identifying the points where successor
    counts, predecessor counts, and full-word prefixes, weighted to
    differing degrees, indicate a segment.
    """
    segList = []
    negHits = get_negFreq(word)
    for i in range(len(counts) - 1):
        if i in negHits and rcounts[i] > 3:
            # a known full-word prefix plus mild predecessor support
            segList.append(i)
        elif rcounts[i] > 18 and (counts[i] > 1 or i in negHits):
            # strong predecessor support plus weak corroborating evidence
            segList.append(i)
    return segList
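
# The thresholds 3 and 18 are hand-tuned magic numbers. Hypothetical case:
# with negHits == [4] and rcounts[4] == 7, the first rule fires at index 4;
# an index with rcounts == 20 and counts == 3 would fire the second rule.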
def main():
    # build the list of test words
    with open(TESTFILE, 'r') as f:
        segments = [line.strip('\n') for line in f]
    # build the dictionary mapping each word to its gold segmentation
    for word in segments:
        tmp = word.split('\t')
        WORDS[tmp[0]] = tmp[1]
    #TRIE.pretty_print()
    print("rev: " + cutoff_accuracy(segments, 14, rev=True))
    print("duo: " + cutoff_accuracy(segments, 2, 10, duo=True))
    print("sumCut: " + cutoff_accuracy(segments, 22, sumCut=True))
    print("duoPeaks: " + cutoff_accuracy(segments, duoPeaks=True))
    print("sumPeaks: " + cutoff_accuracy(segments, sumPeaks=True))
    print("negFreq: " + cutoff_accuracy(segments, negFreq=True))
    print("hybrid: " + cutoff_accuracy(segments, hybrid=True))


if __name__ == '__main__':
    main()