-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcounts.py
145 lines (119 loc) · 3.6 KB
/
counts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#! /usr/bin/python
from trie import Trie #for the trie class code
from stats import metrics, compute_stats
"""
Implement a couple of algorithms from the paper by Harris
'From Phoneme to Morpheme'. Of particular interest are the 3 algorithms
1. Successor Counts
2. Predecessor Counts
Which we will attempt to work with here
"""
TRIE=Trie() #instantiate the trie
RTRIE=Trie(reverse=True) #instantiate trie for predecessor counts
WORDS={} #segmentation data
TESTFILE='/data/cs65/morphology/segments.eng'
#stuff for the stats
CUTS=0
CORRECT_CUTS=0
TRUE_BOUNDARIES=0
def successor_segmentor():
"""
User our global list of test words and segment them basing on the successor
counts that exist within the trie TRIE that we have globally
instantiated.
"""
global CUTS
global CORRECT_CUTS
global TRUE_BOUNDARIES
#initialize
CUTS, CORRECT_CUTS, TRUE_BOUNDARIES = (0,0,0)
correct = 0 #number of correct annotations
for word in WORDS:
a,b,c = metrics(segment_word(word), WORDS[word])
CUTS += b
CORRECT_CUTS += a
TRUE_BOUNDARIES += c
#print word, segment_word(word), WORDS[word]
def predecessor_segmentor():
"""
User our global list of test words and segment them basing on predecessor
counts that exist within the trie RTRIE that we have globally
instantiated.
"""
global CUTS
global CORRECT_CUTS
global TRUE_BOUNDARIES
#initialize
CUTS, CORRECT_CUTS, TRUE_BOUNDARIES = (0,0,0)
correct = 0 #number of correct annotations
for word in WORDS:
a,b,c = metrics(segment_word(word, reverse=True), WORDS[word])
CUTS += b
CORRECT_CUTS += a
TRUE_BOUNDARIES += c
#print word, segment_word(word, reverse=True), WORDS[word]
def segment_word(word, reverse=False):
"""
Given a word, return a segmentation of the word based on successor counts
for the word. If words has not been seen before return the word unsegmented
"""
counts = []
if reverse:
counts = RTRIE.predecessorCount(word)
else:
counts = TRIE.successorCount(word)
parts = partition(word, get_peaks(counts, reverse)) #segment word on the peaks
return ' '.join(parts)
def partition(alist, indices):
"""
Given a list and another list of indices, return the original list
split up at the said indices.
"""
return [alist[i:j] for i, j in zip([0]+indices, indices+[None])]
def get_peaks(counts, predecessor=False):
"""
Given a list of counts, return another list identifying where the peaks in
the list are
"""
peaks = []
start = 0
if len(counts) < 2:
return peaks
#differentiate starting point
if not predecessor:
start = 2 #one character prefixes are a bit tricky
for i in range(start, len(counts)-1):
if counts[i-1] < counts[i] > counts[i+1]:
if predecessor:
peaks.append(i)
else:
peaks.append(i+1)
return peaks
def statistics(tp, tn, fp, fn):
"""
Compute statistics of Accuracy, Recall and F-score based on the values
of True Positives/Negatives and False Positives/Negatives.
"""
pass
def main():
#build the list of test words
f = open(TESTFILE, 'r')
segments = map(lambda f: f.strip('\n'), f.readlines())
f.close()
#build dictionary of words vs segments
for word in segments:
tmp = word.split('\t')
WORDS[tmp[0]] = tmp[1] #put the kv pairing
#RTRIE.pretty_print()
successor_segmentor()
print "Successor Stats"
print "****************"
print compute_stats(CUTS, CORRECT_CUTS, TRUE_BOUNDARIES)
print
predecessor_segmentor()
print "Predecessor Stats"
print "****************"
print compute_stats(CUTS, CORRECT_CUTS, TRUE_BOUNDARIES)
print
if __name__=='__main__':
main()